webpeel 0.12.0 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +82 -9
- package/dist/cli.js +97 -6
- package/dist/cli.js.map +1 -1
- package/dist/core/actions.d.ts +28 -0
- package/dist/core/actions.d.ts.map +1 -1
- package/dist/core/actions.js +60 -0
- package/dist/core/actions.js.map +1 -1
- package/dist/core/bm25-filter.d.ts +10 -0
- package/dist/core/bm25-filter.d.ts.map +1 -1
- package/dist/core/bm25-filter.js +40 -0
- package/dist/core/bm25-filter.js.map +1 -1
- package/dist/core/content-pruner.d.ts +12 -5
- package/dist/core/content-pruner.d.ts.map +1 -1
- package/dist/core/content-pruner.js +247 -190
- package/dist/core/content-pruner.js.map +1 -1
- package/dist/core/research.d.ts +67 -0
- package/dist/core/research.d.ts.map +1 -0
- package/dist/core/research.js +254 -0
- package/dist/core/research.js.map +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +37 -3
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +107 -2
- package/dist/mcp/server.js.map +1 -1
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.d.ts.map +1 -0
- package/dist/server/app.js +189 -0
- package/dist/server/app.js.map +1 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.d.ts.map +1 -0
- package/dist/server/auth-store.js +89 -0
- package/dist/server/auth-store.js.map +1 -0
- package/dist/server/job-queue.d.ts +93 -0
- package/dist/server/job-queue.d.ts.map +1 -0
- package/dist/server/job-queue.js +144 -0
- package/dist/server/job-queue.js.map +1 -0
- package/dist/server/middleware/auth.d.ts +28 -0
- package/dist/server/middleware/auth.d.ts.map +1 -0
- package/dist/server/middleware/auth.js +183 -0
- package/dist/server/middleware/auth.js.map +1 -0
- package/dist/server/middleware/rate-limit.d.ts +23 -0
- package/dist/server/middleware/rate-limit.d.ts.map +1 -0
- package/dist/server/middleware/rate-limit.js +126 -0
- package/dist/server/middleware/rate-limit.js.map +1 -0
- package/dist/server/middleware/url-validator.d.ts +16 -0
- package/dist/server/middleware/url-validator.d.ts.map +1 -0
- package/dist/server/middleware/url-validator.js +187 -0
- package/dist/server/middleware/url-validator.js.map +1 -0
- package/dist/server/pg-auth-store.d.ts +129 -0
- package/dist/server/pg-auth-store.d.ts.map +1 -0
- package/dist/server/pg-auth-store.js +457 -0
- package/dist/server/pg-auth-store.js.map +1 -0
- package/dist/server/pg-job-queue.d.ts +60 -0
- package/dist/server/pg-job-queue.d.ts.map +1 -0
- package/dist/server/pg-job-queue.js +365 -0
- package/dist/server/pg-job-queue.js.map +1 -0
- package/dist/server/premium/domain-intel.d.ts +17 -0
- package/dist/server/premium/domain-intel.d.ts.map +1 -0
- package/dist/server/premium/domain-intel.js +134 -0
- package/dist/server/premium/domain-intel.js.map +1 -0
- package/dist/server/premium/index.d.ts +18 -0
- package/dist/server/premium/index.d.ts.map +1 -0
- package/dist/server/premium/index.js +36 -0
- package/dist/server/premium/index.js.map +1 -0
- package/dist/server/premium/swr-cache.d.ts +15 -0
- package/dist/server/premium/swr-cache.d.ts.map +1 -0
- package/dist/server/premium/swr-cache.js +35 -0
- package/dist/server/premium/swr-cache.js.map +1 -0
- package/dist/server/routes/activity.d.ts +7 -0
- package/dist/server/routes/activity.d.ts.map +1 -0
- package/dist/server/routes/activity.js +66 -0
- package/dist/server/routes/activity.js.map +1 -0
- package/dist/server/routes/agent.d.ts +12 -0
- package/dist/server/routes/agent.d.ts.map +1 -0
- package/dist/server/routes/agent.js +356 -0
- package/dist/server/routes/agent.js.map +1 -0
- package/dist/server/routes/answer.d.ts +6 -0
- package/dist/server/routes/answer.d.ts.map +1 -0
- package/dist/server/routes/answer.js +124 -0
- package/dist/server/routes/answer.js.map +1 -0
- package/dist/server/routes/batch.d.ts +7 -0
- package/dist/server/routes/batch.d.ts.map +1 -0
- package/dist/server/routes/batch.js +287 -0
- package/dist/server/routes/batch.js.map +1 -0
- package/dist/server/routes/cli-usage.d.ts +7 -0
- package/dist/server/routes/cli-usage.d.ts.map +1 -0
- package/dist/server/routes/cli-usage.js +121 -0
- package/dist/server/routes/cli-usage.js.map +1 -0
- package/dist/server/routes/compat.d.ts +24 -0
- package/dist/server/routes/compat.d.ts.map +1 -0
- package/dist/server/routes/compat.js +651 -0
- package/dist/server/routes/compat.js.map +1 -0
- package/dist/server/routes/extract.d.ts +9 -0
- package/dist/server/routes/extract.d.ts.map +1 -0
- package/dist/server/routes/extract.js +121 -0
- package/dist/server/routes/extract.js.map +1 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.d.ts.map +1 -0
- package/dist/server/routes/fetch.js +537 -0
- package/dist/server/routes/fetch.js.map +1 -0
- package/dist/server/routes/health.d.ts +8 -0
- package/dist/server/routes/health.d.ts.map +1 -0
- package/dist/server/routes/health.js +36 -0
- package/dist/server/routes/health.js.map +1 -0
- package/dist/server/routes/jobs.d.ts +8 -0
- package/dist/server/routes/jobs.d.ts.map +1 -0
- package/dist/server/routes/jobs.js +374 -0
- package/dist/server/routes/jobs.js.map +1 -0
- package/dist/server/routes/mcp.d.ts +16 -0
- package/dist/server/routes/mcp.d.ts.map +1 -0
- package/dist/server/routes/mcp.js +475 -0
- package/dist/server/routes/mcp.js.map +1 -0
- package/dist/server/routes/oauth.d.ts +10 -0
- package/dist/server/routes/oauth.d.ts.map +1 -0
- package/dist/server/routes/oauth.js +296 -0
- package/dist/server/routes/oauth.js.map +1 -0
- package/dist/server/routes/screenshot.d.ts +10 -0
- package/dist/server/routes/screenshot.d.ts.map +1 -0
- package/dist/server/routes/screenshot.js +217 -0
- package/dist/server/routes/screenshot.js.map +1 -0
- package/dist/server/routes/search.d.ts +7 -0
- package/dist/server/routes/search.d.ts.map +1 -0
- package/dist/server/routes/search.js +287 -0
- package/dist/server/routes/search.js.map +1 -0
- package/dist/server/routes/stats.d.ts +7 -0
- package/dist/server/routes/stats.d.ts.map +1 -0
- package/dist/server/routes/stats.js +65 -0
- package/dist/server/routes/stats.js.map +1 -0
- package/dist/server/routes/stripe.d.ts +9 -0
- package/dist/server/routes/stripe.d.ts.map +1 -0
- package/dist/server/routes/stripe.js +233 -0
- package/dist/server/routes/stripe.js.map +1 -0
- package/dist/server/routes/users.d.ts +9 -0
- package/dist/server/routes/users.d.ts.map +1 -0
- package/dist/server/routes/users.js +954 -0
- package/dist/server/routes/users.js.map +1 -0
- package/dist/server/routes/webhooks.d.ts +15 -0
- package/dist/server/routes/webhooks.d.ts.map +1 -0
- package/dist/server/routes/webhooks.js +73 -0
- package/dist/server/routes/webhooks.js.map +1 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.d.ts.map +1 -0
- package/dist/server/sentry.js +39 -0
- package/dist/server/sentry.js.map +1 -0
- package/dist/types.d.ts +13 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +3 -2
|
@@ -1,240 +1,297 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Content Density Pruner
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* (sidebars, footers, navigation, ads) that CSS selectors miss.
|
|
4
|
+
* Two-pass pruning to reduce HTML before markdown conversion:
|
|
6
5
|
*
|
|
7
|
-
*
|
|
6
|
+
* Pass 1 — Semantic removal: strip elements whose tag or class/id clearly
|
|
7
|
+
* mark them as page chrome (nav, footer, sidebar, cookie banners, ads).
|
|
8
|
+
*
|
|
9
|
+
* Pass 2 — Density scoring: score remaining block elements by text density,
|
|
10
|
+
* link density, tag importance, and word count. Remove low-scorers.
|
|
11
|
+
*
|
|
12
|
+
* Inspired by Crawl4AI's PruningContentFilter — targets 40-60% token savings.
|
|
8
13
|
*/
|
|
9
14
|
import * as cheerio from 'cheerio';
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
15
|
+
// -----------------------------------------------------------------------
|
|
16
|
+
// Pass 1 — Semantic removal: tags and class/id patterns
|
|
17
|
+
// -----------------------------------------------------------------------
|
|
18
|
+
/** Tags that are almost always page chrome, not article content. */
|
|
19
|
+
const CHROME_TAGS = new Set([
|
|
20
|
+
'nav', 'footer', 'aside', 'noscript',
|
|
14
21
|
]);
|
|
15
22
|
/**
|
|
16
|
-
*
|
|
17
|
-
*
|
|
23
|
+
* Class/id patterns that indicate page chrome.
|
|
24
|
+
* Tested against lowercased class/id strings.
|
|
18
25
|
*/
|
|
19
|
-
const
|
|
26
|
+
const CHROME_PATTERNS = [
|
|
27
|
+
/\bsidebar\b/,
|
|
28
|
+
/\bcookie/,
|
|
29
|
+
/\bbanner\b/,
|
|
30
|
+
/\b(ad|ads|advert)\b/,
|
|
31
|
+
/\bpopup\b/,
|
|
32
|
+
/\bmodal\b/,
|
|
33
|
+
/\boverlay\b/,
|
|
34
|
+
/\bsocial/,
|
|
35
|
+
/\bshare\b/,
|
|
36
|
+
/\bbreadcrumb/,
|
|
37
|
+
/\bskip-?link/,
|
|
38
|
+
/\bfootnote/,
|
|
39
|
+
/\brelated-?(post|article)/,
|
|
40
|
+
/\bnewsletter/,
|
|
41
|
+
/\bsubscri/,
|
|
42
|
+
/\bcomment/,
|
|
43
|
+
/\b(sign-?up|sign-?in|log-?in)\b/,
|
|
44
|
+
/\btoc\b/,
|
|
45
|
+
/\btable-?of-?contents\b/,
|
|
46
|
+
/\bgdpr\b/,
|
|
47
|
+
/\bconsent\b/,
|
|
48
|
+
];
|
|
20
49
|
/**
|
|
21
|
-
*
|
|
22
|
-
*
|
|
50
|
+
* Tags we never remove (they likely wrap main content).
|
|
51
|
+
* We recurse into them but never strip the element itself.
|
|
23
52
|
*/
|
|
24
|
-
const
|
|
25
|
-
article: 3,
|
|
26
|
-
main: 3,
|
|
27
|
-
p: 2,
|
|
28
|
-
h1: 2, h2: 2, h3: 2, h4: 2, h5: 2, h6: 2,
|
|
29
|
-
blockquote: 2,
|
|
30
|
-
pre: 2,
|
|
31
|
-
code: 2,
|
|
32
|
-
figure: 2,
|
|
33
|
-
figcaption: 2,
|
|
34
|
-
section: 1,
|
|
35
|
-
td: 1,
|
|
36
|
-
th: 1,
|
|
37
|
-
li: 1,
|
|
38
|
-
dd: 1,
|
|
39
|
-
dt: 1,
|
|
40
|
-
div: 0,
|
|
41
|
-
span: 0,
|
|
42
|
-
aside: -1,
|
|
43
|
-
header: -1,
|
|
44
|
-
form: -1,
|
|
45
|
-
nav: -2,
|
|
46
|
-
footer: -2,
|
|
47
|
-
};
|
|
48
|
-
/** Normalize tag importance (-2..+3) to 0..1 range */
|
|
49
|
-
function normalizeTagScore(rawScore) {
|
|
50
|
-
// Range is 5 units (-2 to +3), shift by +2 and divide
|
|
51
|
-
return (rawScore + 2) / 5;
|
|
52
|
-
}
|
|
53
|
-
function getTagImportance(tagName) {
|
|
54
|
-
return TAG_IMPORTANCE[tagName.toLowerCase()] ?? 0;
|
|
55
|
-
}
|
|
56
|
-
/** Word count bonus using log scale (0-1) */
|
|
57
|
-
function wordCountBonus(text) {
|
|
58
|
-
const words = text.trim().split(/\s+/).filter((w) => w.length > 0);
|
|
59
|
-
if (words.length === 0)
|
|
60
|
-
return 0;
|
|
61
|
-
return Math.min(Math.log(words.length + 1) / Math.log(1000), 1.0);
|
|
62
|
-
}
|
|
53
|
+
const PROTECTED_TAGS = new Set(['main', 'article', 'body']);
|
|
63
54
|
/**
|
|
64
|
-
*
|
|
65
|
-
*
|
|
66
|
-
*
|
|
55
|
+
* Tags we never remove during density scoring (Pass 2).
|
|
56
|
+
* Headings, paragraphs, and semantic content elements should survive
|
|
57
|
+
* even if they're small — they carry essential meaning.
|
|
67
58
|
*/
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
// normalizedPos > 0.8
|
|
74
|
-
return (1.0 - normalizedPos) / 0.2;
|
|
75
|
-
}
|
|
76
|
-
/** Max HTML length for a "leaf" block — blocks larger than this are recursed into */
|
|
77
|
-
const MAX_LEAF_BLOCK_HTML = 5000;
|
|
59
|
+
const DENSITY_SAFE_TAGS = new Set([
|
|
60
|
+
'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
|
61
|
+
'p', 'pre', 'code', 'blockquote', 'figcaption',
|
|
62
|
+
'main', 'article', 'body',
|
|
63
|
+
]);
|
|
78
64
|
/**
|
|
79
|
-
*
|
|
65
|
+
* Class/id patterns that protect an element from removal.
|
|
80
66
|
*/
|
|
81
|
-
|
|
67
|
+
const CONTENT_PATTERNS = [
|
|
68
|
+
/\barticle/,
|
|
69
|
+
/\bpost-?content/,
|
|
70
|
+
/\bentry-?content/,
|
|
71
|
+
/\bmain-?content/,
|
|
72
|
+
/\bstory/,
|
|
73
|
+
/\bblog/,
|
|
74
|
+
/\bpage-?content/,
|
|
75
|
+
/\bcontent-?area/,
|
|
76
|
+
];
|
|
77
|
+
function isChromeBySemantic(el, $) {
|
|
82
78
|
const tagName = el.tagName?.toLowerCase() ?? '';
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
const
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
const
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
const
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
79
|
+
if (CHROME_TAGS.has(tagName))
|
|
80
|
+
return true;
|
|
81
|
+
const cls = ($(el).attr('class') ?? '').toLowerCase();
|
|
82
|
+
const id = ($(el).attr('id') ?? '').toLowerCase();
|
|
83
|
+
const combined = cls + ' ' + id;
|
|
84
|
+
// Don't remove if it matches a content pattern
|
|
85
|
+
for (const p of CONTENT_PATTERNS) {
|
|
86
|
+
if (p.test(combined))
|
|
87
|
+
return false;
|
|
88
|
+
}
|
|
89
|
+
for (const p of CHROME_PATTERNS) {
|
|
90
|
+
if (p.test(combined))
|
|
91
|
+
return true;
|
|
92
|
+
}
|
|
93
|
+
// Role attribute
|
|
94
|
+
const role = ($(el).attr('role') ?? '').toLowerCase();
|
|
95
|
+
if (['navigation', 'banner', 'complementary', 'contentinfo', 'search'].includes(role)) {
|
|
96
|
+
return true;
|
|
97
|
+
}
|
|
98
|
+
return false;
|
|
99
|
+
}
|
|
100
|
+
// -----------------------------------------------------------------------
|
|
101
|
+
// Pass 2 — Density scoring
|
|
102
|
+
// -----------------------------------------------------------------------
|
|
103
|
+
/** Tag importance scores for density scoring (-2 to +3) */
|
|
104
|
+
const TAG_IMPORTANCE = {
|
|
105
|
+
article: 3, main: 3,
|
|
106
|
+
p: 2, h1: 2, h2: 2, h3: 2, h4: 2, h5: 2, h6: 2,
|
|
107
|
+
blockquote: 2, pre: 2, code: 2, figure: 2, figcaption: 2,
|
|
108
|
+
section: 1, td: 1, th: 1, li: 1, dd: 1, dt: 1,
|
|
109
|
+
div: 0, span: 0, table: 0, ul: 0, ol: 0, dl: 0,
|
|
110
|
+
aside: -1, header: -1, form: -1,
|
|
111
|
+
nav: -2, footer: -2,
|
|
112
|
+
};
|
|
113
|
+
function normalizeTagScore(rawScore) {
|
|
114
|
+
return (rawScore + 2) / 5; // -2..+3 → 0..1
|
|
112
115
|
}
|
|
113
116
|
/**
|
|
114
|
-
*
|
|
117
|
+
* Collect scoreable blocks from a DOM tree.
|
|
115
118
|
*
|
|
116
|
-
*
|
|
117
|
-
*
|
|
118
|
-
*
|
|
119
|
+
* Strategy: walk the tree top-down. For each element:
|
|
120
|
+
* - If it's a "leaf-ish" block (< threshold size), score it as one unit.
|
|
121
|
+
* - If it's large and a wrapper (div/section/table), recurse into children.
|
|
122
|
+
* - Protected elements are always recursed.
|
|
119
123
|
*
|
|
120
|
-
*
|
|
124
|
+
* This finds the right granularity: not scoring a 200KB wrapper div,
|
|
125
|
+
* but scoring the divs/sections/p's nested 3-4 levels deep that carry
|
|
126
|
+
* actual content or chrome.
|
|
121
127
|
*/
|
|
122
|
-
function collectBlocks($, parent, blocks,
|
|
128
|
+
function collectBlocks($, parent, blocks, maxLeafSize) {
|
|
123
129
|
const children = 'children' in parent ? parent.children : [];
|
|
124
130
|
for (const child of children) {
|
|
125
131
|
if (child.type !== 'tag')
|
|
126
132
|
continue;
|
|
127
133
|
const el = child;
|
|
128
134
|
const tagName = el.tagName?.toLowerCase() ?? '';
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
}
|
|
145
|
-
else if (tagName === 'tr' || tagName === 'td' || tagName === 'th' || tagName === 'tbody' || tagName === 'thead') {
|
|
146
|
-
// Table layout elements — recurse through them to find block content
|
|
147
|
-
collectBlocks($, el, blocks, totalHtmlLength, depth + 1);
|
|
135
|
+
// Skip non-content tags (script, style, link, meta)
|
|
136
|
+
if (tagName === 'script' || tagName === 'style' || tagName === 'link' || tagName === 'meta')
|
|
137
|
+
continue;
|
|
138
|
+
const $el = $(el);
|
|
139
|
+
const outerHtml = $.html($el) ?? '';
|
|
140
|
+
const htmlLen = outerHtml.length;
|
|
141
|
+
// Skip extremely tiny elements (bare tags like <br>)
|
|
142
|
+
if (htmlLen < 10)
|
|
143
|
+
continue;
|
|
144
|
+
const isProtected = PROTECTED_TAGS.has(tagName);
|
|
145
|
+
const isWrapper = ['div', 'section', 'table', 'tbody', 'thead', 'tr',
|
|
146
|
+
'center', 'details', 'summary'].includes(tagName);
|
|
147
|
+
if (isProtected || (isWrapper && htmlLen > maxLeafSize)) {
|
|
148
|
+
// Too large or protected — recurse deeper
|
|
149
|
+
collectBlocks($, el, blocks, maxLeafSize);
|
|
148
150
|
}
|
|
149
|
-
else {
|
|
150
|
-
//
|
|
151
|
-
|
|
151
|
+
else if (htmlLen > 0) {
|
|
152
|
+
// Score this element
|
|
153
|
+
const clone = $el.clone();
|
|
154
|
+
clone.find('script, style, noscript, svg, path').remove();
|
|
155
|
+
const visibleText = clone.text() ?? '';
|
|
156
|
+
const visibleTextLen = visibleText.trim().length;
|
|
157
|
+
const textDensity = Math.min(visibleTextLen / Math.max(htmlLen, 1), 1.0);
|
|
158
|
+
let linkTextLen = 0;
|
|
159
|
+
$el.find('a').each((_i, a) => {
|
|
160
|
+
linkTextLen += ($(a).text() ?? '').trim().length;
|
|
161
|
+
});
|
|
162
|
+
const linkDensity = visibleTextLen > 0
|
|
163
|
+
? Math.min(linkTextLen / visibleTextLen, 1.0)
|
|
164
|
+
: 0;
|
|
165
|
+
const rawTagScore = TAG_IMPORTANCE[tagName] ?? 0;
|
|
166
|
+
const normalizedTag = normalizeTagScore(rawTagScore);
|
|
167
|
+
const words = visibleText.trim().split(/\s+/).filter(w => w.length > 0);
|
|
168
|
+
const wordBonus = words.length > 0
|
|
169
|
+
? Math.min(Math.log(words.length + 1) / Math.log(1000), 1.0)
|
|
170
|
+
: 0;
|
|
171
|
+
const score = (textDensity * 0.35 +
|
|
172
|
+
(1 - linkDensity) * 0.25 +
|
|
173
|
+
normalizedTag * 0.2 +
|
|
174
|
+
wordBonus * 0.1 +
|
|
175
|
+
0.1 // constant baseline in place of the former position weight (position bias removed — not useful for deep nesting)
|
|
176
|
+
);
|
|
177
|
+
blocks.push({
|
|
178
|
+
element: el,
|
|
179
|
+
tagName,
|
|
180
|
+
htmlLength: htmlLen,
|
|
181
|
+
visibleText,
|
|
182
|
+
score,
|
|
183
|
+
});
|
|
152
184
|
}
|
|
153
185
|
}
|
|
154
186
|
}
|
|
187
|
+
// -----------------------------------------------------------------------
|
|
188
|
+
// Main export
|
|
189
|
+
// -----------------------------------------------------------------------
|
|
155
190
|
/**
|
|
156
|
-
*
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
if (values.length === 0)
|
|
160
|
-
return 0;
|
|
161
|
-
return Math.max(...values);
|
|
162
|
-
}
|
|
163
|
-
/**
|
|
164
|
-
* Prune low-value HTML blocks using content density scoring.
|
|
191
|
+
* Prune low-value HTML blocks using a two-pass approach:
|
|
192
|
+
* 1. Semantic tag/class removal
|
|
193
|
+
* 2. Density scoring of remaining blocks
|
|
165
194
|
*
|
|
166
195
|
* @param html - Raw HTML to prune
|
|
167
196
|
* @param options - Pruning configuration
|
|
168
197
|
* @returns Pruned HTML with stats
|
|
169
198
|
*/
|
|
170
199
|
export function pruneContent(html, options = {}) {
|
|
171
|
-
const { threshold = 0.
|
|
200
|
+
const { threshold = 0.3, minWords = 3, dynamic = true, } = options;
|
|
172
201
|
const originalLength = html.length;
|
|
173
202
|
if (!html.trim()) {
|
|
174
203
|
return { html, nodesRemoved: 0, reductionPercent: 0 };
|
|
175
204
|
}
|
|
176
205
|
const $ = cheerio.load(html);
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
//
|
|
184
|
-
|
|
185
|
-
|
|
206
|
+
let nodesRemoved = 0;
|
|
207
|
+
// =====================================================================
|
|
208
|
+
// Pass 1: Semantic removal
|
|
209
|
+
// =====================================================================
|
|
210
|
+
// Walk top-down; remove entire subtrees that are clearly chrome.
|
|
211
|
+
// We start at the direct children of body and recurse several levels deeper, to catch
|
|
212
|
+
// both <body> <nav> and <body> <div> <nav> patterns.
|
|
213
|
+
const toRemoveSemantic = [];
|
|
214
|
+
function walkForChrome(parent, depth) {
|
|
215
|
+
const children = 'children' in parent ? parent.children : [];
|
|
216
|
+
for (const child of children) {
|
|
217
|
+
if (child.type !== 'tag')
|
|
218
|
+
continue;
|
|
219
|
+
const el = child;
|
|
220
|
+
const tagName = el.tagName?.toLowerCase() ?? '';
|
|
221
|
+
if (tagName === 'script' || tagName === 'style')
|
|
222
|
+
continue;
|
|
223
|
+
if (PROTECTED_TAGS.has(tagName)) {
|
|
224
|
+
// Recurse into protected — there might be chrome inside <article>
|
|
225
|
+
walkForChrome(el, depth + 1);
|
|
226
|
+
continue;
|
|
227
|
+
}
|
|
228
|
+
if (isChromeBySemantic(el, $)) {
|
|
229
|
+
toRemoveSemantic.push(el);
|
|
230
|
+
continue; // don't recurse into something we'll remove
|
|
231
|
+
}
|
|
232
|
+
// Recurse up to a reasonable depth
|
|
233
|
+
if (depth < 6) {
|
|
234
|
+
walkForChrome(el, depth + 1);
|
|
235
|
+
}
|
|
236
|
+
}
|
|
186
237
|
}
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
const block = blocks[i];
|
|
191
|
-
const normalizedPos = n > 1 ? i / (n - 1) : 0.5;
|
|
192
|
-
const posWeight = positionWeight(normalizedPos);
|
|
193
|
-
block.score = (block.textDensity * 0.35 +
|
|
194
|
-
(1 - block.linkDensity) * 0.25 +
|
|
195
|
-
block.normalizedTagScore * 0.2 +
|
|
196
|
-
block.wordBonus * 0.1 +
|
|
197
|
-
posWeight * 0.1);
|
|
238
|
+
const body = $('body').get(0);
|
|
239
|
+
if (body) {
|
|
240
|
+
walkForChrome(body, 0);
|
|
198
241
|
}
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
// Use the best-block score as the reference: remove blocks that score below
|
|
203
|
-
// 40% of the highest-quality block. This handles the common bimodal case
|
|
204
|
-
// (one great article block + several low-quality nav/sidebar blocks) much
|
|
205
|
-
// better than median/mean approaches.
|
|
206
|
-
const scores = blocks.map((b) => b.score);
|
|
207
|
-
const best = maxValue(scores);
|
|
208
|
-
effectiveThreshold = best * 0.4;
|
|
242
|
+
for (const el of toRemoveSemantic) {
|
|
243
|
+
$(el).remove();
|
|
244
|
+
nodesRemoved++;
|
|
209
245
|
}
|
|
210
|
-
//
|
|
211
|
-
|
|
212
|
-
//
|
|
213
|
-
const
|
|
214
|
-
const
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
const
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
246
|
+
// =====================================================================
|
|
247
|
+
// Pass 2: Density scoring (on the remaining HTML)
|
|
248
|
+
// =====================================================================
|
|
249
|
+
const postPass1Html = $.html();
|
|
250
|
+
const postPass1Length = postPass1Html.length;
|
|
251
|
+
// Run density scoring on remaining content
|
|
252
|
+
if (postPass1Length > 100 && body) {
|
|
253
|
+
const blocks = [];
|
|
254
|
+
// Max leaf size: ~5KB or 30% of remaining content (whichever is smaller)
|
|
255
|
+
// This ensures we find leaf blocks even in small documents.
|
|
256
|
+
const maxLeafSize = Math.min(5000, Math.ceil(postPass1Length * 0.3));
|
|
257
|
+
collectBlocks($, body, blocks, maxLeafSize);
|
|
258
|
+
if (blocks.length >= 2) {
|
|
259
|
+
const scores = blocks.map(b => b.score);
|
|
260
|
+
const bestScore = Math.max(...scores);
|
|
261
|
+
let effectiveThreshold = threshold;
|
|
262
|
+
if (dynamic) {
|
|
263
|
+
// Blocks scoring below 50% of the best block are candidates for removal
|
|
264
|
+
effectiveThreshold = bestScore * 0.5;
|
|
265
|
+
}
|
|
266
|
+
// Safety: retain at least 40% of post-pass1 content
|
|
267
|
+
const minRetainLength = Math.ceil(postPass1Length * 0.4);
|
|
268
|
+
// Sort ascending by score — remove worst first
|
|
269
|
+
const sorted = blocks
|
|
270
|
+
.map((b, i) => ({ b, i, score: b.score }))
|
|
271
|
+
.sort((a, b) => a.score - b.score);
|
|
272
|
+
const toRemoveDensity = new Set();
|
|
273
|
+
let removedLength = 0;
|
|
274
|
+
for (const { b } of sorted) {
|
|
275
|
+
if (PROTECTED_TAGS.has(b.tagName) || DENSITY_SAFE_TAGS.has(b.tagName))
|
|
276
|
+
continue;
|
|
277
|
+
const words = b.visibleText.trim().split(/\s+/).filter(w => w.length > 0);
|
|
278
|
+
const isTiny = words.length < minWords;
|
|
279
|
+
const isLow = b.score < effectiveThreshold;
|
|
280
|
+
if (!isTiny && !isLow)
|
|
281
|
+
continue;
|
|
282
|
+
// Check safety floor
|
|
283
|
+
const remaining = postPass1Length - (removedLength + b.htmlLength);
|
|
284
|
+
if (remaining < minRetainLength)
|
|
285
|
+
continue;
|
|
286
|
+
toRemoveDensity.add(b.element);
|
|
287
|
+
removedLength += b.htmlLength;
|
|
288
|
+
}
|
|
289
|
+
for (const el of toRemoveDensity) {
|
|
290
|
+
$(el).remove();
|
|
291
|
+
nodesRemoved++;
|
|
292
|
+
}
|
|
232
293
|
}
|
|
233
294
|
}
|
|
234
|
-
// Remove selected elements from the DOM
|
|
235
|
-
for (const el of toRemove) {
|
|
236
|
-
$(el).remove();
|
|
237
|
-
}
|
|
238
295
|
const resultHtml = $.html() ?? html;
|
|
239
296
|
const resultLength = resultHtml.length;
|
|
240
297
|
const reductionPercent = originalLength > 0
|
|
@@ -242,7 +299,7 @@ export function pruneContent(html, options = {}) {
|
|
|
242
299
|
: 0;
|
|
243
300
|
return {
|
|
244
301
|
html: resultHtml,
|
|
245
|
-
nodesRemoved
|
|
302
|
+
nodesRemoved,
|
|
246
303
|
reductionPercent,
|
|
247
304
|
};
|
|
248
305
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"content-pruner.js","sourceRoot":"","sources":["../../src/core/content-pruner.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"content-pruner.js","sourceRoot":"","sources":["../../src/core/content-pruner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAqBnC,0EAA0E;AAC1E,wDAAwD;AACxD,0EAA0E;AAE1E,oEAAoE;AACpE,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC;IAC1B,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,UAAU;CACrC,CAAC,CAAC;AAEH;;;GAGG;AACH,MAAM,eAAe,GAAG;IACtB,aAAa;IACb,UAAU;IACV,YAAY;IACZ,qBAAqB;IACrB,WAAW;IACX,WAAW;IACX,aAAa;IACb,UAAU;IACV,WAAW;IACX,cAAc;IACd,cAAc;IACd,YAAY;IACZ,2BAA2B;IAC3B,cAAc;IACd,WAAW;IACX,WAAW;IACX,iCAAiC;IACjC,SAAS;IACT,yBAAyB;IACzB,UAAU;IACV,aAAa;CACd,CAAC;AAEF;;;GAGG;AACH,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC,CAAC;AAE5D;;;;GAIG;AACH,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC;IAChC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI;IAClC,GAAG,EAAE,KAAK,EAAE,MAAM,EAAE,YAAY,EAAE,YAAY;IAC9C,MAAM,EAAE,SAAS,EAAE,MAAM;CAC1B,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,gBAAgB,GAAG;IACvB,WAAW;IACX,iBAAiB;IACjB,kBAAkB;IAClB,iBAAiB;IACjB,SAAS;IACT,QAAQ;IACR,iBAAiB;IACjB,iBAAiB;CAClB,CAAC;AAEF,SAAS,kBAAkB,CAAC,EAAW,EAAE,CAAqB;IAC5D,MAAM,OAAO,GAAG,EAAE,CAAC,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;IAChD,IAAI,WAAW,CAAC,GAAG,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC;IAE1C,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IACtD,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IAClD,MAAM,QAAQ,GAAG,GAAG,GAAG,GAAG,GAAG,EAAE,CAAC;IAEhC,+CAA+C;IAC/C,KAAK,MAAM,CAAC,IAAI,gBAAgB,EAAE,CAAC;QACjC,IAAI,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC;YAAE,OAAO,KAAK,CAAC;IACrC,CAAC;IAED,KAAK,MAAM,CAAC,IAAI,eAAe,EAAE,CAAC;QAChC,IAAI,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC;YAAE,OAAO,IAAI,CAAC;IACpC,CAAC;IAED,iBAAiB;IACjB,MAAM,IAAI,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IACtD,IAAI,CAAC,YAAY,EAAE,QAAQ,EAAE,eAAe,EAAE,aAAa,EAAE,QAAQ,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACtF,OAAO,IAAI,CAAC;IACd,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,0EAA0E;AAC1E,
2BAA2B;AAC3B,0EAA0E;AAE1E,2DAA2D;AAC3D,MAAM,cAAc,GAA2B;IAC7C,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC;IACnB,CAAC,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC;IAC9C,UAAU,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC;IACxD,OAAO,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC;IAC7C,GAAG,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC;IAC9C,KAAK,EAAE,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC;IAC/B,GAAG,EAAE,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;CACpB,CAAC;AAEF,SAAS,iBAAiB,CAAC,QAAgB;IACzC,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,gBAAgB;AAC7C,CAAC;AAUD;;;;;;;;;;;GAWG;AACH,SAAS,aAAa,CACpB,CAAqB,EACrB,MAAe,EACf,MAAqB,EACrB,WAAmB;IAEnB,MAAM,QAAQ,GAAG,UAAU,IAAI,MAAM,CAAC,CAAC,CAAE,MAAM,CAAC,QAAsB,CAAC,CAAC,CAAC,EAAE,CAAC;IAE5E,KAAK,MAAM,KAAK,IAAI,QAAQ,EAAE,CAAC;QAC7B,IAAI,KAAK,CAAC,IAAI,KAAK,KAAK;YAAE,SAAS;QACnC,MAAM,EAAE,GAAG,KAAgB,CAAC;QAC5B,MAAM,OAAO,GAAG,EAAE,CAAC,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QAEhD,oBAAoB;QACpB,IAAI,OAAO,KAAK,QAAQ,IAAI,OAAO,KAAK,OAAO,IAAI,OAAO,KAAK,MAAM,IAAI,OAAO,KAAK,MAAM;YAAE,SAAS;QAEtG,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,SAAS,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;QACpC,MAAM,OAAO,GAAG,SAAS,CAAC,MAAM,CAAC;QAEjC,qDAAqD;QACrD,IAAI,OAAO,GAAG,EAAE;YAAE,SAAS;QAE3B,MAAM,WAAW,GAAG,cAAc,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QAChD,MAAM,SAAS,GAAG,CAAC,KAAK,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI;YACjD,QAAQ,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;QAErE,IAAI,WAAW,IAAI,CAAC,SAAS,IAAI,OAAO,GAAG,WAAW,CAAC,EAAE,CAAC;YACxD,0CAA0C;YAC1C,aAAa,CAAC,CAAC,EAAE,EAAE,EAAE,MAAM,EAAE,WAAW,CAAC,CAAC;QAC5C,CAAC;aAAM,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;YACvB,qBAAqB;YACrB,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,EAAE,CAAC;YAC1B,KAAK,CAAC,IAAI,CAAC,oCAAoC,CAAC,CAAC,MAAM,EAAE,CAAC;YAC1D
,MAAM,WAAW,GAAG,KAAK,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;YACvC,MAAM,cAAc,GAAG,WAAW,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC;YAEjD,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YAEzE,IAAI,WAAW,GAAG,CAAC,CAAC;YACpB,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,EAAE;gBAC3B,WAAW,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC;YACnD,CAAC,CAAC,CAAC;YACH,MAAM,WAAW,GAAG,cAAc,GAAG,CAAC;gBACpC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,WAAW,GAAG,cAAc,EAAE,GAAG,CAAC;gBAC7C,CAAC,CAAC,CAAC,CAAC;YAEN,MAAM,WAAW,GAAG,cAAc,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YACjD,MAAM,aAAa,GAAG,iBAAiB,CAAC,WAAW,CAAC,CAAC;YAErD,MAAM,KAAK,GAAG,WAAW,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YACxE,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC;gBAChC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,GAAG,CAAC;gBAC5D,CAAC,CAAC,CAAC,CAAC;YAEN,MAAM,KAAK,GAAG,CACZ,WAAW,GAAG,IAAI;gBAClB,CAAC,CAAC,GAAG,WAAW,CAAC,GAAG,IAAI;gBACxB,aAAa,GAAG,GAAG;gBACnB,SAAS,GAAG,GAAG;gBACf,GAAG,CAAC,gFAAgF;aACrF,CAAC;YAEF,MAAM,CAAC,IAAI,CAAC;gBACV,OAAO,EAAE,EAAE;gBACX,OAAO;gBACP,UAAU,EAAE,OAAO;gBACnB,WAAW;gBACX,KAAK;aACN,CAAC,CAAC;QACL,CAAC;IACH,CAAC;AACH,CAAC;AAED,0EAA0E;AAC1E,cAAc;AACd,0EAA0E;AAE1E;;;;;;;;GAQG;AACH,MAAM,UAAU,YAAY,CAAC,IAAY,EAAE,UAAwB,EAAE;IACnE,MAAM,EACJ,SAAS,GAAG,GAAG,EACf,QAAQ,GAAG,CAAC,EACZ,OAAO,GAAG,IAAI,GACf,GAAG,OAAO,CAAC;IAEZ,MAAM,cAAc,GAAG,IAAI,CAAC,MAAM,CAAC;IACnC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;QACjB,OAAO,EAAE,IAAI,EAAE,YAAY,EAAE,CAAC,EAAE,gBAAgB,EAAE,CAAC,EAAE,CAAC;IACxD,CAAC;IAED,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7B,IAAI,YAAY,GAAG,CAAC,CAAC;IAErB,wEAAwE;IACxE,2BAA2B;IAC3B,wEAAwE;IACxE,iEAAiE;IACjE,qEAAqE;IACrE,qDAAqD;IACrD,MAAM,gBAAgB,GAAc,EAAE,CAAC;IAEvC,SAAS,aAAa,CAAC,MAAe,EAAE,KAAa;QACnD,MAAM,QAAQ,GAAG,UAAU,IAAI,MAAM,CAAC,CAAC,CAAE,MAAM,CAAC,QAAsB,CAA
C,CAAC,CAAC,EAAE,CAAC;QAC5E,KAAK,MAAM,KAAK,IAAI,QAAQ,EAAE,CAAC;YAC7B,IAAI,KAAK,CAAC,IAAI,KAAK,KAAK;gBAAE,SAAS;YACnC,MAAM,EAAE,GAAG,KAAgB,CAAC;YAC5B,MAAM,OAAO,GAAG,EAAE,CAAC,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;YAChD,IAAI,OAAO,KAAK,QAAQ,IAAI,OAAO,KAAK,OAAO;gBAAE,SAAS;YAE1D,IAAI,cAAc,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;gBAChC,kEAAkE;gBAClE,aAAa,CAAC,EAAE,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;gBAC7B,SAAS;YACX,CAAC;YAED,IAAI,kBAAkB,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC;gBAC9B,gBAAgB,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBAC1B,SAAS,CAAC,4CAA4C;YACxD,CAAC;YAED,mCAAmC;YACnC,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;gBACd,aAAa,CAAC,EAAE,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;YAC/B,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;IAC9B,IAAI,IAAI,EAAE,CAAC;QACT,aAAa,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IACzB,CAAC;IAED,KAAK,MAAM,EAAE,IAAI,gBAAgB,EAAE,CAAC;QAClC,CAAC,CAAC,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC;QACf,YAAY,EAAE,CAAC;IACjB,CAAC;IAED,wEAAwE;IACxE,kDAAkD;IAClD,wEAAwE;IACxE,MAAM,aAAa,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAC/B,MAAM,eAAe,GAAG,aAAa,CAAC,MAAM,CAAC;IAE7C,2CAA2C;IAC3C,IAAI,eAAe,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC;QAClC,MAAM,MAAM,GAAkB,EAAE,CAAC;QACjC,yEAAyE;QACzE,4DAA4D;QAC5D,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,eAAe,GAAG,GAAG,CAAC,CAAC,CAAC;QACrE,aAAa,CAAC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,WAAW,CAAC,CAAC;QAE5C,IAAI,MAAM,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;YACvB,MAAM,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;YACxC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC;YAEtC,IAAI,kBAAkB,GAAG,SAAS,CAAC;YACnC,IAAI,OAAO,EAAE,CAAC;gBACZ,wEAAwE;gBACxE,kBAAkB,GAAG,SAAS,GAAG,GAAG,CAAC;YACvC,CAAC;YAED,oDAAoD;YACpD,MAAM,eAAe,GAAG,IAAI,CAAC,IAAI,CAAC,eAAe,GAAG,GAAG,CAAC,CAAC;YAEzD,+CAA+C;YAC/C,MAAM,MAAM,GAAG,MAAM;iBAClB,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC;iBACzC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;YAErC,MAAM,eAA
e,GAAG,IAAI,GAAG,EAAW,CAAC;YAC3C,IAAI,aAAa,GAAG,CAAC,CAAC;YAEtB,KAAK,MAAM,EAAE,CAAC,EAAE,IAAI,MAAM,EAAE,CAAC;gBAC3B,IAAI,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC;oBAAE,SAAS;gBAEhF,MAAM,KAAK,GAAG,CAAC,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;gBAC1E,MAAM,MAAM,GAAG,KAAK,CAAC,MAAM,GAAG,QAAQ,CAAC;gBACvC,MAAM,KAAK,GAAG,CAAC,CAAC,KAAK,GAAG,kBAAkB,CAAC;gBAE3C,IAAI,CAAC,MAAM,IAAI,CAAC,KAAK;oBAAE,SAAS;gBAEhC,qBAAqB;gBACrB,MAAM,SAAS,GAAG,eAAe,GAAG,CAAC,aAAa,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC;gBACnE,IAAI,SAAS,GAAG,eAAe;oBAAE,SAAS;gBAE1C,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;gBAC/B,aAAa,IAAI,CAAC,CAAC,UAAU,CAAC;YAChC,CAAC;YAED,KAAK,MAAM,EAAE,IAAI,eAAe,EAAE,CAAC;gBACjC,CAAC,CAAC,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC;gBACf,YAAY,EAAE,CAAC;YACjB,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;IACpC,MAAM,YAAY,GAAG,UAAU,CAAC,MAAM,CAAC;IACvC,MAAM,gBAAgB,GAAG,cAAc,GAAG,CAAC;QACzC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,cAAc,GAAG,YAAY,CAAC,GAAG,cAAc,CAAC,GAAG,GAAG,CAAC,CAAC;QACnF,CAAC,CAAC,CAAC,CAAC;IAEN,OAAO;QACL,IAAI,EAAE,UAAU;QAChB,YAAY;QACZ,gBAAgB;KACjB,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebPeel Deep Research Agent
|
|
3
|
+
*
|
|
4
|
+
* Autonomously searches the web, fetches top sources, filters content with
|
|
5
|
+
* BM25, optionally follows promising links, and synthesizes a comprehensive
|
|
6
|
+
* report using an LLM.
|
|
7
|
+
*
|
|
8
|
+
* Design principle: orchestrate existing modules (peel, bm25-filter,
|
|
9
|
+
* llm-extract) — don't reinvent anything.
|
|
10
|
+
*/
|
|
11
|
+
/** Options accepted by `research()`. Only `query` is required. */
export interface ResearchOptions {
|
|
12
|
+
/** Research question or topic to investigate */
|
|
13
|
+
query: string;
|
|
14
|
+
/** Maximum number of sources to consult. Default: 5 */
|
|
15
|
+
maxSources?: number;
|
|
16
|
+
/** Maximum link-following depth. Default: 1 (search results only; 2 or more also follows links) */
|
|
17
|
+
maxDepth?: number;
|
|
18
|
+
/** LLM API key used for synthesis */
|
|
19
|
+
apiKey?: string;
|
|
20
|
+
/** LLM model used for synthesis. Default: gpt-4o-mini */
|
|
21
|
+
model?: string;
|
|
22
|
+
/** Base URL of the LLM API. Default: https://api.openai.com/v1 */
|
|
23
|
+
baseUrl?: string;
|
|
24
|
+
/** Maximum total time in milliseconds. Default: 60000 (1 minute) */
|
|
25
|
+
timeout?: number;
|
|
26
|
+
/** Output format: 'report' (markdown synthesis) or 'sources' (raw extracted data). Default: 'report' */
|
|
27
|
+
outputFormat?: 'report' | 'sources';
|
|
28
|
+
/** Optional callback that receives a ResearchStep for each progress update */
|
|
29
|
+
onProgress?: (step: ResearchStep) => void;
|
|
30
|
+
}
|
|
31
|
+
/** Progress update delivered to the `onProgress` callback of ResearchOptions. */
export interface ResearchStep {
|
|
32
|
+
/** Phase this update belongs to */
phase: 'searching' | 'fetching' | 'extracting' | 'following' | 'synthesizing';
|
|
33
|
+
/** Progress message text */
message: string;
|
|
34
|
+
/** NOTE(review): presumably the number of sources found so far — confirm against the implementation */
sourcesFound?: number;
|
|
35
|
+
/** NOTE(review): presumably the number of sources fetched so far — confirm against the implementation */
sourcesFetched?: number;
|
|
36
|
+
}
|
|
37
|
+
/** One entry of `ResearchResult.sources`. */
export interface ResearchSource {
|
|
38
|
+
/** URL of the source */
url: string;
|
|
39
|
+
/** Title of the source */
title: string;
|
|
40
|
+
/** Key findings from this source */
|
|
41
|
+
findings: string;
|
|
42
|
+
/** Relevance score in the range 0-1 */
|
|
43
|
+
relevance: number;
|
|
44
|
+
}
|
|
45
|
+
/** Value the promise returned by `research()` resolves to. */
export interface ResearchResult {
|
|
46
|
+
/** Synthesized research report (markdown) */
|
|
47
|
+
report: string;
|
|
48
|
+
/** Sources consulted */
|
|
49
|
+
sources: ResearchSource[];
|
|
50
|
+
/** Total number of sources found (compare with sourcesConsulted) */
|
|
51
|
+
totalSourcesFound: number;
|
|
52
|
+
/** Number of sources actually consulted */
sourcesConsulted: number;
|
|
53
|
+
/** Time taken in milliseconds */
|
|
54
|
+
elapsed: number;
|
|
55
|
+
/** Tokens used for synthesis */
|
|
56
|
+
tokensUsed?: {
|
|
57
|
+
input: number;
|
|
58
|
+
output: number;
|
|
59
|
+
};
|
|
60
|
+
/** Estimated cost in USD */
|
|
61
|
+
cost?: number;
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* Conduct autonomous multi-step web research on a topic.
*
* @param options - Research settings; `query` is the only required field.
* @returns A promise that resolves to a ResearchResult.
|
|
65
|
+
*/
|
|
66
|
+
export declare function research(options: ResearchOptions): Promise<ResearchResult>;
|
|
67
|
+
//# sourceMappingURL=research.d.ts.map
|