@j0hanz/fetch-url-mcp 1.8.4 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/content.d.ts.map +1 -1
- package/dist/lib/content.js +98 -11
- package/dist/lib/fetch-pipeline.d.ts +1 -2
- package/dist/lib/fetch-pipeline.d.ts.map +1 -1
- package/dist/lib/fetch-pipeline.js +6 -16
- package/dist/resources/instructions.d.ts.map +1 -1
- package/dist/resources/instructions.js +1 -2
- package/dist/schemas.d.ts +0 -2
- package/dist/schemas.d.ts.map +1 -1
- package/dist/schemas.js +0 -11
- package/dist/tools/fetch-url.d.ts.map +1 -1
- package/dist/tools/fetch-url.js +5 -7
- package/dist/transform/html-translators.d.ts.map +1 -1
- package/dist/transform/html-translators.js +1 -23
- package/dist/transform/metadata.d.ts +1 -0
- package/dist/transform/metadata.d.ts.map +1 -1
- package/dist/transform/metadata.js +25 -0
- package/dist/transform/shared.d.ts.map +1 -1
- package/dist/transform/shared.js +2 -4
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +90 -23
- package/dist/transform/types.d.ts +0 -2
- package/dist/transform/types.d.ts.map +1 -1
- package/dist/transform/worker-pool.d.ts +0 -3
- package/dist/transform/worker-pool.d.ts.map +1 -1
- package/dist/transform/worker-pool.js +0 -2
- package/package.json +2 -2
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;
|
|
1
|
+
{"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAiiB3D,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAQR;AAuCD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAiBN;AA0BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR;AAkVD,wBAAgB,6BAA6B,CAC3C,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,MAAM,GAAG,SAAS,CAKpB;AACD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CA6BvE;AA+CD,UAAU,cAAc;IACtB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAyRD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,CA6DR;AA2GD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CAOpB;AACD,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAmCxE;AAcD,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAczD;AAaD,wBAAgB,mBAAmB,CACjC,QAAQ,CAAC,EAAE,aAAa,EACxB,WAAW,CAAC,EAAE,MAAM,GACnB,MAAM,CAmBR"}
|
package/dist/lib/content.js
CHANGED
|
@@ -6,6 +6,7 @@ const NOISE_SCAN_LIMIT = 50_000;
|
|
|
6
6
|
const MIN_BODY_CONTENT_LENGTH = 100;
|
|
7
7
|
const DIALOG_MIN_CHARS_FOR_PRESERVATION = 500;
|
|
8
8
|
const NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION = 500;
|
|
9
|
+
const ABORT_CHECK_INTERVAL = 500;
|
|
9
10
|
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
10
11
|
const HTML_FRAGMENT_MARKERS = /<\s*(?:article|main|section|div|nav|footer|header|aside|table|ul|ol)\b/i;
|
|
11
12
|
const NOISE_PATTERNS = [
|
|
@@ -30,6 +31,7 @@ const BASE_STRUCTURAL_TAGS = new Set([
|
|
|
30
31
|
'style',
|
|
31
32
|
'noscript',
|
|
32
33
|
'iframe',
|
|
34
|
+
'template',
|
|
33
35
|
'form',
|
|
34
36
|
'button',
|
|
35
37
|
'input',
|
|
@@ -76,6 +78,10 @@ const PROMO_TOKENS_ALWAYS = [
|
|
|
76
78
|
'pagination',
|
|
77
79
|
'pager',
|
|
78
80
|
'taglist',
|
|
81
|
+
'twitter-tweet',
|
|
82
|
+
'fb-post',
|
|
83
|
+
'instagram-media',
|
|
84
|
+
'social-embed',
|
|
79
85
|
];
|
|
80
86
|
const PROMO_TOKENS_AGGRESSIVE = ['ad', 'related', 'comment'];
|
|
81
87
|
const PROMO_TOKENS_BY_CATEGORY = {
|
|
@@ -86,7 +92,7 @@ const PROMO_TOKENS_BY_CATEGORY = {
|
|
|
86
92
|
const BASE_NOISE_SELECTORS = {
|
|
87
93
|
navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"]',
|
|
88
94
|
cookieBanners: '[role="dialog"]',
|
|
89
|
-
hidden: '[style*="display: none"],[style*="display:none"],[hidden],[aria-hidden="true"]',
|
|
95
|
+
hidden: '[style*="display: none"],[style*="display:none"],[style*="visibility: hidden"],[style*="visibility:hidden"],[hidden],[aria-hidden="true"]',
|
|
90
96
|
};
|
|
91
97
|
const NO_MATCH_REGEX = /a^/i;
|
|
92
98
|
let cachedContext;
|
|
@@ -188,8 +194,9 @@ function getContext() {
|
|
|
188
194
|
function isInteractive(element, role) {
|
|
189
195
|
if (role && INTERACTIVE_CONTENT_ROLES.has(role))
|
|
190
196
|
return true;
|
|
197
|
+
const tag = element.tagName.toLowerCase();
|
|
191
198
|
const ds = element.getAttribute('data-state');
|
|
192
|
-
if (ds === 'inactive' || ds === 'closed')
|
|
199
|
+
if ((ds === 'inactive' || ds === 'closed') && !BASE_STRUCTURAL_TAGS.has(tag))
|
|
193
200
|
return true;
|
|
194
201
|
const dataOrientation = element.getAttribute('data-orientation');
|
|
195
202
|
if (dataOrientation === 'horizontal' || dataOrientation === 'vertical')
|
|
@@ -209,6 +216,19 @@ function isWithinPrimaryContent(element) {
|
|
|
209
216
|
}
|
|
210
217
|
return false;
|
|
211
218
|
}
|
|
219
|
+
const ASIDE_NAV_LINK_DENSITY_THRESHOLD = 0.5;
|
|
220
|
+
const ASIDE_NAV_MIN_LINKS = 10;
|
|
221
|
+
function isNavigationAside(element) {
|
|
222
|
+
if (element.querySelector('nav'))
|
|
223
|
+
return true;
|
|
224
|
+
const links = element.querySelectorAll('a[href]');
|
|
225
|
+
if (links.length < ASIDE_NAV_MIN_LINKS)
|
|
226
|
+
return false;
|
|
227
|
+
const textLen = (element.textContent || '').trim().length;
|
|
228
|
+
if (textLen === 0)
|
|
229
|
+
return true;
|
|
230
|
+
return links.length / (textLen / 100) >= ASIDE_NAV_LINK_DENSITY_THRESHOLD;
|
|
231
|
+
}
|
|
212
232
|
function shouldPreserve(element, tagName) {
|
|
213
233
|
// Check Dialog
|
|
214
234
|
const role = element.getAttribute('role');
|
|
@@ -227,6 +247,12 @@ function shouldPreserve(element, tagName) {
|
|
|
227
247
|
return ((element.textContent || '').trim().length >=
|
|
228
248
|
NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION);
|
|
229
249
|
}
|
|
250
|
+
// Check Aside — preserve only if it looks like article content, not navigation
|
|
251
|
+
if (tagName === 'aside') {
|
|
252
|
+
if (!isWithinPrimaryContent(element))
|
|
253
|
+
return false;
|
|
254
|
+
return !isNavigationAside(element);
|
|
255
|
+
}
|
|
230
256
|
return false;
|
|
231
257
|
}
|
|
232
258
|
function removeNodes(nodes) {
|
|
@@ -237,20 +263,24 @@ function removeNodes(nodes) {
|
|
|
237
263
|
}
|
|
238
264
|
}
|
|
239
265
|
}
|
|
240
|
-
function scoreNavFooter(
|
|
266
|
+
function scoreNavFooter(meta, weights) {
|
|
241
267
|
let score = 0;
|
|
242
|
-
if (ALWAYS_NOISE_TAGS.has(tagName))
|
|
268
|
+
if (ALWAYS_NOISE_TAGS.has(meta.tagName))
|
|
243
269
|
score += weights.structural;
|
|
244
270
|
// Header Boilerplate
|
|
245
|
-
if (tagName === 'header') {
|
|
246
|
-
if ((role && NAVIGATION_ROLES.has(role)) ||
|
|
247
|
-
HEADER_NOISE_PATTERN.test(`${className} ${id}`)) {
|
|
271
|
+
if (meta.tagName === 'header') {
|
|
272
|
+
if ((meta.role && NAVIGATION_ROLES.has(meta.role)) ||
|
|
273
|
+
HEADER_NOISE_PATTERN.test(`${meta.className} ${meta.id}`)) {
|
|
248
274
|
score += weights.structural;
|
|
249
275
|
}
|
|
250
276
|
}
|
|
277
|
+
// Aside (sidebar/complementary) — noise unless inside primary content
|
|
278
|
+
if (meta.tagName === 'aside') {
|
|
279
|
+
score += weights.structural;
|
|
280
|
+
}
|
|
251
281
|
// Role Noise
|
|
252
|
-
if (role && NAVIGATION_ROLES.has(role)) {
|
|
253
|
-
if (tagName !== 'aside' || role !== 'complementary') {
|
|
282
|
+
if (meta.role && NAVIGATION_ROLES.has(meta.role)) {
|
|
283
|
+
if (meta.tagName !== 'aside' || meta.role !== 'complementary') {
|
|
254
284
|
score += weights.structural;
|
|
255
285
|
}
|
|
256
286
|
}
|
|
@@ -287,7 +317,7 @@ function isNoiseElement(element, context) {
|
|
|
287
317
|
}
|
|
288
318
|
// Nav/Footer Scoring
|
|
289
319
|
if (context.flags.navFooter) {
|
|
290
|
-
score += scoreNavFooter(meta
|
|
320
|
+
score += scoreNavFooter(meta, weights);
|
|
291
321
|
}
|
|
292
322
|
// Hidden
|
|
293
323
|
if (meta.isHidden && !meta.isInteractive) {
|
|
@@ -375,7 +405,7 @@ function stripNoise(document, context, signal) {
|
|
|
375
405
|
// Candidates
|
|
376
406
|
const candidates = document.querySelectorAll(context.candidateSelector);
|
|
377
407
|
for (let i = candidates.length - 1; i >= 0; i--) {
|
|
378
|
-
if (i %
|
|
408
|
+
if (i % ABORT_CHECK_INTERVAL === 0 && signal?.aborted) {
|
|
379
409
|
throw new Error('Noise removal aborted');
|
|
380
410
|
}
|
|
381
411
|
const node = candidates[i];
|
|
@@ -463,6 +493,29 @@ function mayContainNoise(html) {
|
|
|
463
493
|
: `${html.substring(0, NOISE_SCAN_LIMIT)}\n${html.substring(html.length - NOISE_SCAN_LIMIT)}`;
|
|
464
494
|
return NOISE_PATTERNS.some((re) => re.test(sample));
|
|
465
495
|
}
|
|
496
|
+
function stripTabTriggers(document) {
|
|
497
|
+
const tabs = document.querySelectorAll('button[role="tab"]');
|
|
498
|
+
for (let i = tabs.length - 1; i >= 0; i--) {
|
|
499
|
+
tabs[i]?.remove();
|
|
500
|
+
}
|
|
501
|
+
}
|
|
502
|
+
function escapeTableCellPipes(document) {
|
|
503
|
+
const codes = document.querySelectorAll('td code, th code');
|
|
504
|
+
for (const code of codes) {
|
|
505
|
+
if (code.textContent.includes('|')) {
|
|
506
|
+
code.textContent = code.textContent.replace(/\|/g, '\\|');
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
function separateAdjacentInlineElements(document) {
|
|
511
|
+
const badges = document.querySelectorAll('span.chakra-badge, [data-scope="badge"], [class*="badge"]');
|
|
512
|
+
for (const badge of badges) {
|
|
513
|
+
const next = badge.nextSibling;
|
|
514
|
+
if (next?.nodeType === 1) {
|
|
515
|
+
badge.after(document.createTextNode(' '));
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
}
|
|
466
519
|
export function prepareDocumentForMarkdown(document, baseUrl, signal) {
|
|
467
520
|
const context = getContext();
|
|
468
521
|
if (config.noiseRemoval.debug) {
|
|
@@ -471,9 +524,37 @@ export function prepareDocumentForMarkdown(document, baseUrl, signal) {
|
|
|
471
524
|
});
|
|
472
525
|
}
|
|
473
526
|
stripNoise(document, context, signal);
|
|
527
|
+
stripTabTriggers(document);
|
|
528
|
+
separateAdjacentInlineElements(document);
|
|
529
|
+
flattenTableCellBreaks(document);
|
|
530
|
+
escapeTableCellPipes(document);
|
|
531
|
+
normalizeTableStructure(document);
|
|
474
532
|
if (baseUrl)
|
|
475
533
|
resolveUrls(document, baseUrl);
|
|
476
534
|
}
|
|
535
|
+
// Some sites put tbody/thead/tfoot inside td/th, which breaks markdown tables.
|
|
536
|
+
function normalizeTableStructure(document) {
|
|
537
|
+
for (const table of document.querySelectorAll('table')) {
|
|
538
|
+
for (const cell of table.querySelectorAll('th, td')) {
|
|
539
|
+
for (const tag of ['tbody', 'thead', 'tfoot']) {
|
|
540
|
+
let nested = cell.querySelector(tag);
|
|
541
|
+
while (nested) {
|
|
542
|
+
table.appendChild(nested);
|
|
543
|
+
nested = cell.querySelector(tag);
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
}
|
|
548
|
+
}
|
|
549
|
+
function flattenTableCellBreaks(document) {
|
|
550
|
+
const cells = document.querySelectorAll('td, th');
|
|
551
|
+
for (const cell of cells) {
|
|
552
|
+
const brs = cell.querySelectorAll('br');
|
|
553
|
+
for (const br of brs) {
|
|
554
|
+
br.replaceWith(' ');
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
}
|
|
477
558
|
export function removeNoiseFromHtml(html, document, baseUrl, signal) {
|
|
478
559
|
const shouldParse = isFullDocumentHtml(html) ||
|
|
479
560
|
mayContainNoise(html) ||
|
|
@@ -1070,6 +1151,8 @@ function processTextBuffer(lines, options) {
|
|
|
1070
1151
|
function applyGlobalRegexes(text, options) {
|
|
1071
1152
|
let result = text;
|
|
1072
1153
|
const checkAbort = createAbortChecker(options);
|
|
1154
|
+
// Normalize non-breaking spaces to regular spaces
|
|
1155
|
+
result = result.replace(/\u00A0/g, ' ');
|
|
1073
1156
|
checkAbort('markdown:cleanup:headings');
|
|
1074
1157
|
// fixAndSpaceHeadings
|
|
1075
1158
|
result = result
|
|
@@ -1099,6 +1182,10 @@ function applyGlobalRegexes(text, options) {
|
|
|
1099
1182
|
.replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
|
|
1100
1183
|
.replace(REGEX.PUNCT_ONLY_LIST_ARTIFACT, '')
|
|
1101
1184
|
.replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
|
|
1185
|
+
// Trim leading whitespace inside inline code spans
|
|
1186
|
+
result = result.replace(/(?<=\s|^)`\s+([^`]+)`/gm, '`$1`');
|
|
1187
|
+
// Unescape backticks inside markdown link text
|
|
1188
|
+
result = result.replace(/\[([^\]]*\\`[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/\\`/g, '`')}](${url})`);
|
|
1102
1189
|
result = normalizeNestedListIndentation(result);
|
|
1103
1190
|
checkAbort('markdown:cleanup:properties');
|
|
1104
1191
|
// fixProperties
|
|
@@ -45,14 +45,13 @@ export declare const markdownTransform: (input: {
|
|
|
45
45
|
buffer: Uint8Array;
|
|
46
46
|
encoding: string;
|
|
47
47
|
truncated?: boolean;
|
|
48
|
-
}, url: string, signal?: AbortSignal
|
|
48
|
+
}, url: string, signal?: AbortSignal) => Promise<MarkdownPipelineResult>;
|
|
49
49
|
export declare function serializeMarkdownResult(result: MarkdownPipelineResult): string;
|
|
50
50
|
interface SharedFetchOptions {
|
|
51
51
|
readonly url: string;
|
|
52
52
|
readonly signal?: AbortSignal;
|
|
53
53
|
readonly cacheVary?: Record<string, unknown> | string;
|
|
54
54
|
readonly forceRefresh?: boolean;
|
|
55
|
-
readonly maxInlineChars?: number;
|
|
56
55
|
readonly onStage?: (stage: SharedFetchStage) => void;
|
|
57
56
|
readonly transform: (input: {
|
|
58
57
|
buffer: Uint8Array;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fetch-pipeline.d.ts","sourceRoot":"","sources":["../../src/lib/fetch-pipeline.ts"],"names":[],"mappings":"AAMA,OAAO,EAAE,KAAK,uBAAuB,EAAE,MAAM,uBAAuB,CAAC;AAqBrE,KAAK,UAAU,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;AAY1C,wBAAgB,gBAAgB,CAC9B,GAAG,EAAE,OAAO,EACZ,IAAI,EAAE,SAAS,MAAM,EAAE,GACtB,UAAU,GAAG,SAAS,CAOxB;AACD,wBAAgB,UAAU,CACxB,MAAM,CAAC,EAAE,WAAW,GACnB;IAAE,MAAM,EAAE,WAAW,CAAA;CAAE,GAAG,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,CAEjD;AAMD,eAAO,MAAM,iBAAiB,mBAAmB,CAAC;AAClD,MAAM,WAAW,mBAAmB;IAClC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB;AAuED,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,MAAM,GACb,MAAM,CAkBR;
|
|
1
|
+
{"version":3,"file":"fetch-pipeline.d.ts","sourceRoot":"","sources":["../../src/lib/fetch-pipeline.ts"],"names":[],"mappings":"AAMA,OAAO,EAAE,KAAK,uBAAuB,EAAE,MAAM,uBAAuB,CAAC;AAqBrE,KAAK,UAAU,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;AAY1C,wBAAgB,gBAAgB,CAC9B,GAAG,EAAE,OAAO,EACZ,IAAI,EAAE,SAAS,MAAM,EAAE,GACtB,UAAU,GAAG,SAAS,CAOxB;AACD,wBAAgB,UAAU,CACxB,MAAM,CAAC,EAAE,WAAW,GACnB;IAAE,MAAM,EAAE,WAAW,CAAA;CAAE,GAAG,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,CAEjD;AAMD,eAAO,MAAM,iBAAiB,mBAAmB,CAAC;AAClD,MAAM,WAAW,mBAAmB;IAClC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB;AAuED,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,MAAM,GACb,MAAM,CAkBR;AAsCD,UAAU,oBAAoB,CAAC,CAAC;IAC9B,GAAG,EAAE,MAAM,CAAC;IACZ,cAAc,EAAE,MAAM,CAAC;IACvB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC;IAC7C,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,OAAO,CAAC,EAAE,CAAC,KAAK,EAAE,gBAAgB,KAAK,IAAI,CAAC;IAC5C,SAAS,EAAE,CACT,KAAK,EAAE;QAAE,MAAM,EAAE,UAAU,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,SAAS,CAAC,EAAE,OAAO,CAAA;KAAE,EACpE,GAAG,EAAE,MAAM,KACR,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;IACpB,SAAS,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAK,MAAM,CAAC;IAClC,WAAW,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,CAAC,GAAG,SAAS,CAAC;CACjD;AACD,MAAM,WAAW,cAAc,CAAC,CAAC;IAC/B,IAAI,EAAE,CAAC,CAAC;IACR,SAAS,EAAE,OAAO,CAAC;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC1B;AACD,MAAM,MAAM,gBAAgB,GACxB,aAAa,GACb,aAAa,GACb,WAAW,GACX,eAAe,GACf,cAAc,GACd,gBAAgB,GAChB,iBAAiB,GACjB,gBAAgB,GAChB,iBAAiB,CAAC;AAmMtB,wBAAsB,oBAAoB,CAAC,CAAC,EAC1C,OAAO,EAAE,oBAAoB,CAAC,CAAC,CAAC,GAC/B,OAAO,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAmE5B;AAMD,MAAM,MAAM,sBAAsB,GAAG,uBAAuB,GAAG;IAC7D,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;AACF,wBAAgB,yBAAyB,CACvC,MAAM,EAAE,MAAM,GACb,sBAAsB,GAAG,SAAS,CAqBpC;AACD,eAAO,MAAM,iBAAiB,GAC5B,OAAO;IAAE,MAAM,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAC;IAAC,SAAS,CAAC,EAAE,OAAO,C
AAA;CAAE,EACpE,KAAK,MAAM,EACX,SAAS,WAAW,KACnB,OAAO,CAAC,sBAAsB,CAShC,CAAC;AACF,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,sBAAsB,GAC7B,MAAM,CAaR;AAMD,UAAU,kBAAkB;IAC1B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,MAAM,CAAC,EAAE,WAAW,CAAC;IAC9B,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC;IACtD,QAAQ,CAAC,YAAY,CAAC,EAAE,OAAO,CAAC;IAChC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC,KAAK,EAAE,gBAAgB,KAAK,IAAI,CAAC;IACrD,QAAQ,CAAC,SAAS,EAAE,CAClB,KAAK,EAAE;QAAE,MAAM,EAAE,UAAU,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,SAAS,CAAC,EAAE,OAAO,CAAA;KAAE,EACpE,aAAa,EAAE,MAAM,KAClB,sBAAsB,GAAG,OAAO,CAAC,sBAAsB,CAAC,CAAC;IAC9D,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC,MAAM,EAAE,sBAAsB,KAAK,MAAM,CAAC;IAChE,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,sBAAsB,GAAG,SAAS,CAAC;CAC/E;AACD,UAAU,eAAe;IACvB,QAAQ,CAAC,oBAAoB,CAAC,EAAE,OAAO,oBAAoB,CAAC;CAC7D;AAgBD,wBAAsB,kBAAkB,CACtC,OAAO,EAAE,kBAAkB,EAC3B,IAAI,GAAE,eAAoB,GACzB,OAAO,CAAC;IACT,QAAQ,EAAE,cAAc,CAAC,sBAAsB,CAAC,CAAC;IACjD,YAAY,EAAE,mBAAmB,CAAC;CACnC,CAAC,CAUD"}
|
|
@@ -107,9 +107,9 @@ export function appendTruncationMarker(content, marker) {
|
|
|
107
107
|
return `${contentWithFence}${marker}`;
|
|
108
108
|
}
|
|
109
109
|
class InlineContentLimiter {
|
|
110
|
-
apply(content
|
|
110
|
+
apply(content) {
|
|
111
111
|
const contentSize = content.length;
|
|
112
|
-
const inlineLimit =
|
|
112
|
+
const inlineLimit = config.constants.maxInlineContentChars;
|
|
113
113
|
if (isWithinInlineLimit(contentSize, inlineLimit)) {
|
|
114
114
|
return { content, contentSize };
|
|
115
115
|
}
|
|
@@ -120,22 +120,13 @@ class InlineContentLimiter {
|
|
|
120
120
|
truncated: true,
|
|
121
121
|
};
|
|
122
122
|
}
|
|
123
|
-
resolveInlineLimit(inlineLimitOverride) {
|
|
124
|
-
const globalLimit = config.constants.maxInlineContentChars;
|
|
125
|
-
if (inlineLimitOverride === undefined)
|
|
126
|
-
return globalLimit;
|
|
127
|
-
if (globalLimit > 0 && inlineLimitOverride > 0) {
|
|
128
|
-
return Math.min(inlineLimitOverride, globalLimit);
|
|
129
|
-
}
|
|
130
|
-
return inlineLimitOverride;
|
|
131
|
-
}
|
|
132
123
|
}
|
|
133
124
|
function isWithinInlineLimit(contentSize, inlineLimit) {
|
|
134
125
|
return inlineLimit <= 0 || contentSize <= inlineLimit;
|
|
135
126
|
}
|
|
136
127
|
const inlineLimiter = new InlineContentLimiter();
|
|
137
|
-
function applyInlineContentLimit(content
|
|
138
|
-
return inlineLimiter.apply(content
|
|
128
|
+
function applyInlineContentLimit(content) {
|
|
129
|
+
return inlineLimiter.apply(content);
|
|
139
130
|
}
|
|
140
131
|
function resolveNormalizedUrl(url) {
|
|
141
132
|
const { normalizedUrl: validatedUrl } = normalizeUrl(url);
|
|
@@ -341,12 +332,11 @@ export function parseCachedMarkdownResult(cached) {
|
|
|
341
332
|
truncated,
|
|
342
333
|
};
|
|
343
334
|
}
|
|
344
|
-
export const markdownTransform = async (input, url, signal
|
|
335
|
+
export const markdownTransform = async (input, url, signal) => {
|
|
345
336
|
const result = await transformBufferToMarkdown(input.buffer, url, {
|
|
346
337
|
includeMetadata: true,
|
|
347
338
|
encoding: input.encoding,
|
|
348
339
|
...withSignal(signal),
|
|
349
|
-
...(skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
350
340
|
...(input.truncated ? { inputTruncated: true } : {}),
|
|
351
341
|
});
|
|
352
342
|
const truncated = Boolean(result.truncated || input.truncated);
|
|
@@ -382,6 +372,6 @@ export async function performSharedFetch(options, deps = {}) {
|
|
|
382
372
|
const pipeline = await executePipeline(buildSharedFetchPipelineOptions(options));
|
|
383
373
|
options.onStage?.('prepare_output');
|
|
384
374
|
options.onStage?.('finalize_output');
|
|
385
|
-
const inlineResult = applyInlineContentLimit(pipeline.data.content
|
|
375
|
+
const inlineResult = applyInlineContentLimit(pipeline.data.content);
|
|
386
376
|
return { pipeline, inlineResult };
|
|
387
377
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"instructions.d.ts","sourceRoot":"","sources":["../../src/resources/instructions.ts"],"names":[],"mappings":"AAIA,wBAAgB,uBAAuB,IAAI,MAAM,
|
|
1
|
+
{"version":3,"file":"instructions.d.ts","sourceRoot":"","sources":["../../src/resources/instructions.ts"],"names":[],"mappings":"AAIA,wBAAgB,uBAAuB,IAAI,MAAM,CA0ChD"}
|
|
@@ -18,8 +18,7 @@ export function buildServerInstructions() {
|
|
|
18
18
|
<workflows>
|
|
19
19
|
1. Standard: Call \`${FETCH_URL_TOOL_NAME}\` -> Read \`markdown\`. If \`truncated: true\`, retry with \`forceRefresh: true\`.
|
|
20
20
|
2. Fresh: Set \`forceRefresh: true\` to bypass cache.
|
|
21
|
-
3.
|
|
22
|
-
4. Async: Add \`task: { ttl: <ms> }\` to \`tools/call\` -> Poll \`tasks/get\` -> Call \`tasks/result\`.
|
|
21
|
+
3. Async: Add \`task: { ttl: <ms> }\` to \`tools/call\` -> Poll \`tasks/get\` -> Call \`tasks/result\`.
|
|
23
22
|
</workflows>
|
|
24
23
|
|
|
25
24
|
<constraints>
|
package/dist/schemas.d.ts
CHANGED
|
@@ -30,9 +30,7 @@ export declare const cachedPayloadSchema: z.ZodObject<{
|
|
|
30
30
|
export type CachedPayload = z.infer<typeof cachedPayloadSchema>;
|
|
31
31
|
export declare const fetchUrlInputSchema: z.ZodObject<{
|
|
32
32
|
url: z.ZodURL;
|
|
33
|
-
skipNoiseRemoval: z.ZodOptional<z.ZodBoolean>;
|
|
34
33
|
forceRefresh: z.ZodOptional<z.ZodBoolean>;
|
|
35
|
-
maxInlineChars: z.ZodOptional<z.ZodNumber>;
|
|
36
34
|
}, z.core.$strict>;
|
|
37
35
|
export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
38
36
|
url: z.ZodURL;
|
package/dist/schemas.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"schemas.d.ts","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAC;AAI9D,eAAO,MAAM,eAAe;;;;;;;;CAQlB,CAAC;AAiCX,eAAO,MAAM,uBAAuB;;;;;;;;kBAQlC,CAAC;AAgBH,wBAAgB,0BAA0B,CACxC,KAAK,EAAE,OAAO,GACb,iBAAiB,GAAG,SAAS,CAQ/B;AAED,wBAAgB,kBAAkB,CAAC,KAAK,EAAE,OAAO,GAAG,MAAM,GAAG,SAAS,CAErE;AAUD,eAAO,MAAM,mBAAmB;;;;;;iBA2B7B,CAAC;AAEJ,MAAM,MAAM,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAEhE,eAAO,MAAM,mBAAmB
|
|
1
|
+
{"version":3,"file":"schemas.d.ts","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAC;AAI9D,eAAO,MAAM,eAAe;;;;;;;;CAQlB,CAAC;AAiCX,eAAO,MAAM,uBAAuB;;;;;;;;kBAQlC,CAAC;AAgBH,wBAAgB,0BAA0B,CACxC,KAAK,EAAE,OAAO,GACb,iBAAiB,GAAG,SAAS,CAQ/B;AAED,wBAAgB,kBAAkB,CAAC,KAAK,EAAE,OAAO,GAAG,MAAM,GAAG,SAAS,CAErE;AAUD,eAAO,MAAM,mBAAmB;;;;;;iBA2B7B,CAAC;AAEJ,MAAM,MAAM,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAEhE,eAAO,MAAM,mBAAmB;;;kBAU9B,CAAC;AAEH,eAAO,MAAM,oBAAoB;;;;;;;;;;;;;;;;;;;;kBAqC/B,CAAC;AAEH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,MAAM,GAAG,aAAa,GAAG,IAAI,CAkBpE;AAED,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,aAAa,GACrB,MAAM,GAAG,IAAI,CAEf"}
|
package/dist/schemas.js
CHANGED
|
@@ -95,21 +95,10 @@ export const fetchUrlInputSchema = z.strictObject({
|
|
|
95
95
|
.min(1)
|
|
96
96
|
.max(config.constants.maxUrlLength)
|
|
97
97
|
.describe(`Target URL. Max ${config.constants.maxUrlLength} chars.`),
|
|
98
|
-
skipNoiseRemoval: z
|
|
99
|
-
.boolean()
|
|
100
|
-
.optional()
|
|
101
|
-
.describe('Preserve navigation/footers (disable noise filtering).'),
|
|
102
98
|
forceRefresh: z
|
|
103
99
|
.boolean()
|
|
104
100
|
.optional()
|
|
105
101
|
.describe('Bypass cache and fetch fresh content.'),
|
|
106
|
-
maxInlineChars: z
|
|
107
|
-
.number()
|
|
108
|
-
.int()
|
|
109
|
-
.min(0)
|
|
110
|
-
.max(config.constants.maxHtmlSize)
|
|
111
|
-
.optional()
|
|
112
|
-
.describe(`Inline markdown limit (0-${config.constants.maxHtmlSize}, 0=unlimited). Lower of this or global limit applies.`),
|
|
113
102
|
});
|
|
114
103
|
export const fetchUrlOutputSchema = z.strictObject({
|
|
115
104
|
url: z.httpUrl().max(config.constants.maxUrlLength).describe('Fetched URL.'),
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fetch-url.d.ts","sourceRoot":"","sources":["../../src/tools/fetch-url.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACV,SAAS,EAEV,MAAM,yCAAyC,CAAC;AACjD,OAAO,KAAK,EACV,YAAY,EAEb,MAAM,oCAAoC,CAAC;AAE5C,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAyBxB,OAAO,EAGL,KAAK,gBAAgB,EACtB,MAAM,qBAAqB,CAAC;AAI7B,OAAO,EACL,mBAAmB,EAIpB,MAAM,eAAe,CAAC;AAMvB,KAAK,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAEzD,KAAK,qBAAqB,GAAG,YAAY,CAAC;AAE1C,UAAU,gBAAgB;IACxB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;IACvB,OAAO,EAAE,qBAAqB,EAAE,CAAC;IACjC,iBAAiB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,SAAS,CAAC;IACxD,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAED,eAAO,MAAM,mBAAmB,cAAc,CAAC;
|
|
1
|
+
{"version":3,"file":"fetch-url.d.ts","sourceRoot":"","sources":["../../src/tools/fetch-url.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACV,SAAS,EAEV,MAAM,yCAAyC,CAAC;AACjD,OAAO,KAAK,EACV,YAAY,EAEb,MAAM,oCAAoC,CAAC;AAE5C,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAyBxB,OAAO,EAGL,KAAK,gBAAgB,EACtB,MAAM,qBAAqB,CAAC;AAI7B,OAAO,EACL,mBAAmB,EAIpB,MAAM,eAAe,CAAC;AAMvB,KAAK,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAEzD,KAAK,qBAAqB,GAAG,YAAY,CAAC;AAE1C,UAAU,gBAAgB;IACxB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;IACvB,OAAO,EAAE,qBAAqB,EAAE,CAAC;IACjC,iBAAiB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,SAAS,CAAC;IACxD,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAED,eAAO,MAAM,mBAAmB,cAAc,CAAC;AAwT/C,wBAAsB,mBAAmB,CACvC,KAAK,EAAE,aAAa,EACpB,KAAK,CAAC,EAAE,gBAAgB,GACvB,OAAO,CAAC,gBAAgB,CAAC,CAK3B;AAgDD;;;;;;GAMG;AACH,wBAAgB,2BAA2B,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,GAAG,OAAO,EAC5E,OAAO,EAAE,CAAC,MAAM,EAAE,OAAO,EAAE,KAAK,CAAC,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,GAC7D,CAAC,MAAM,EAAE,OAAO,EAAE,KAAK,CAAC,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,CAmBvD;AAwBD,wBAAgB,aAAa,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI,CAwCrD"}
|
package/dist/tools/fetch-url.js
CHANGED
|
@@ -179,26 +179,24 @@ function mapFetchStageToProgress(stage, context) {
|
|
|
179
179
|
return { step: 7, message: 'Finalizing output' };
|
|
180
180
|
}
|
|
181
181
|
}
|
|
182
|
-
function buildFetchOptions(url, context, signal, progress,
|
|
182
|
+
function buildFetchOptions(url, context, signal, progress, forceRefresh) {
|
|
183
183
|
return {
|
|
184
184
|
url,
|
|
185
185
|
...withSignal(signal),
|
|
186
|
-
...(skipNoiseRemoval ? { cacheVary: { skipNoiseRemoval: true } } : {}),
|
|
187
186
|
...(forceRefresh ? { forceRefresh: true } : {}),
|
|
188
|
-
...(maxInlineChars !== undefined ? { maxInlineChars } : {}),
|
|
189
187
|
onStage: (stage) => {
|
|
190
188
|
const update = mapFetchStageToProgress(stage, context);
|
|
191
189
|
reportProgress(progress, update.step, update.message);
|
|
192
190
|
},
|
|
193
191
|
transform: async ({ buffer, encoding, truncated }, normalizedUrl) => {
|
|
194
|
-
return markdownTransform({ buffer, encoding, ...(truncated ? { truncated } : {}) }, normalizedUrl, signal
|
|
192
|
+
return markdownTransform({ buffer, encoding, ...(truncated ? { truncated } : {}) }, normalizedUrl, signal);
|
|
195
193
|
},
|
|
196
194
|
serialize: serializeMarkdownResult,
|
|
197
195
|
deserialize: parseCachedMarkdownResult,
|
|
198
196
|
};
|
|
199
197
|
}
|
|
200
|
-
async function fetchPipeline(url, context, signal, progress,
|
|
201
|
-
return performSharedFetch(buildFetchOptions(url, context, signal, progress,
|
|
198
|
+
async function fetchPipeline(url, context, signal, progress, forceRefresh) {
|
|
199
|
+
return performSharedFetch(buildFetchOptions(url, context, signal, progress, forceRefresh));
|
|
202
200
|
}
|
|
203
201
|
function formatContentSize(chars) {
|
|
204
202
|
if (chars < 1000)
|
|
@@ -220,7 +218,7 @@ async function executeFetch(input, extra) {
|
|
|
220
218
|
logDebug('Fetching URL', { url });
|
|
221
219
|
try {
|
|
222
220
|
reportProgress(progress, 1, 'Preparing request');
|
|
223
|
-
const { pipeline, inlineResult } = await fetchPipeline(url, context, signal, progress, input.
|
|
221
|
+
const { pipeline, inlineResult } = await fetchPipeline(url, context, signal, progress, input.forceRefresh);
|
|
224
222
|
const size = formatContentSize(inlineResult.contentSize);
|
|
225
223
|
reportProgress(progress, 8, `Done — ${size}`);
|
|
226
224
|
return buildResponse(pipeline, inlineResult, url);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AA4fA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
|
|
@@ -346,14 +346,8 @@ function buildSpanTranslator(ctx) {
|
|
|
346
346
|
return {};
|
|
347
347
|
}
|
|
348
348
|
// ---------------------------------------------------------------------------
|
|
349
|
-
//
|
|
349
|
+
// DL helpers
|
|
350
350
|
// ---------------------------------------------------------------------------
|
|
351
|
-
function hasComplexTableLayout(node) {
|
|
352
|
-
if (!isLikeNode(node))
|
|
353
|
-
return false;
|
|
354
|
-
const innerHTML = typeof node.innerHTML === 'string' ? node.innerHTML : '';
|
|
355
|
-
return /(?:colspan|rowspan)=["']?[2-9]/i.test(innerHTML);
|
|
356
|
-
}
|
|
357
351
|
function resolveDlNodeName(child) {
|
|
358
352
|
if (!isLikeNode(child))
|
|
359
353
|
return '';
|
|
@@ -381,22 +375,6 @@ function createCustomTranslators() {
|
|
|
381
375
|
return {
|
|
382
376
|
code: (ctx) => buildCodeTranslator(ctx),
|
|
383
377
|
img: (ctx) => buildImageTranslator(ctx),
|
|
384
|
-
table: (ctx) => {
|
|
385
|
-
if (!isObject(ctx))
|
|
386
|
-
return {};
|
|
387
|
-
const { node } = ctx;
|
|
388
|
-
if (hasComplexTableLayout(node)) {
|
|
389
|
-
return {
|
|
390
|
-
postprocess: ({ content }) => {
|
|
391
|
-
const trimmed = content.trim();
|
|
392
|
-
if (!trimmed)
|
|
393
|
-
return '';
|
|
394
|
-
return `\n\n${trimmed}\n\n`;
|
|
395
|
-
},
|
|
396
|
-
};
|
|
397
|
-
}
|
|
398
|
-
return {};
|
|
399
|
-
},
|
|
400
378
|
dl: (ctx) => {
|
|
401
379
|
if (!isObject(ctx))
|
|
402
380
|
return { content: '' };
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { ExtractedMetadata } from './types.js';
|
|
2
|
+
export declare function normalizeDocumentTitle(title: string, baseUrl?: string): string;
|
|
2
3
|
export declare function extractMetadata(document: Document, baseUrl?: string): ExtractedMetadata;
|
|
3
4
|
export declare function extractMetadataFromHead(html: string, baseUrl?: string): ExtractedMetadata | null;
|
|
4
5
|
export declare function mergeMetadata(early: ExtractedMetadata | null, late: ExtractedMetadata): ExtractedMetadata;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"metadata.d.ts","sourceRoot":"","sources":["../../src/transform/metadata.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;
|
|
1
|
+
{"version":3,"file":"metadata.d.ts","sourceRoot":"","sources":["../../src/transform/metadata.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAoCpD,wBAAgB,sBAAsB,CACpC,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE,MAAM,GACf,MAAM,CAsBR;AAuID,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,GACf,iBAAiB,CAkBnB;AAED,wBAAgB,uBAAuB,CACrC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,MAAM,GACf,iBAAiB,GAAG,IAAI,CAY1B;AAED,wBAAgB,aAAa,CAC3B,KAAK,EAAE,iBAAiB,GAAG,IAAI,EAC/B,IAAI,EAAE,iBAAiB,GACtB,iBAAiB,CAmBnB"}
|
|
@@ -15,6 +15,28 @@ function extractHeadSection(html) {
|
|
|
15
15
|
return null;
|
|
16
16
|
return html.substring(0, match.index);
|
|
17
17
|
}
|
|
18
|
+
/**
 * Shortens a GitHub repository page title to a plain `owner/repo` label.
 *
 * The title is rewritten only when every condition holds:
 *   - `baseUrl` is provided and parses as a URL,
 *   - `title` begins with the literal prefix `GitHub - `,
 *   - the URL host is `github.com` or `www.github.com`,
 *   - the URL path has exactly two non-empty segments.
 * In every other case the title is returned untouched.
 *
 * @param {string} title - Document title as extracted from the page.
 * @param {string} [baseUrl] - URL the document was fetched from.
 * @returns {string} `owner/repo` built from the URL path, or the original title.
 */
export function normalizeDocumentTitle(title, baseUrl) {
    if (!baseUrl) {
        return title;
    }
    if (!title.startsWith('GitHub - ')) {
        return title;
    }
    let location;
    try {
        location = new URL(baseUrl);
    }
    catch {
        // An unparseable base URL gives us nothing to derive a label from.
        return title;
    }
    const host = location.hostname.toLowerCase();
    const isGithubHost = host === 'github.com' || host === 'www.github.com';
    if (!isGithubHost) {
        return title;
    }
    // Empty segments (leading, trailing or doubled slashes) are discarded, so
    // exactly two remaining segments means "/<owner>/<repo>".
    const parts = location.pathname.split('/').filter((part) => part.length > 0);
    return parts.length === 2 ? parts.join('/') : title;
}
|
|
18
40
|
const META_PROPERTY_HANDLERS = new Map([
|
|
19
41
|
[
|
|
20
42
|
'og:title',
|
|
@@ -139,6 +161,9 @@ function resolveFaviconUrl(href, baseUrl) {
|
|
|
139
161
|
export function extractMetadata(document, baseUrl) {
|
|
140
162
|
const ctx = buildMetaContext(document);
|
|
141
163
|
const metadata = resolveMetadataFromContext(ctx);
|
|
164
|
+
if (metadata.title) {
|
|
165
|
+
metadata.title = normalizeDocumentTitle(metadata.title, baseUrl);
|
|
166
|
+
}
|
|
142
167
|
if (baseUrl) {
|
|
143
168
|
const icon32 = document.querySelector('link[rel="icon"][sizes="32x32"]');
|
|
144
169
|
const href = icon32?.getAttribute('href');
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"shared.d.ts","sourceRoot":"","sources":["../../src/transform/shared.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACV,uBAAuB,EACvB,gBAAgB,EAChB,8BAA8B,EAE/B,MAAM,YAAY,CAAC;AAEpB,UAAU,2BAA2B;IACnC,WAAW,EAAE,CAAC,OAAO,EAAE,8BAA8B,KAAK,IAAI,CAAC;IAC/D,YAAY,EAAE,CACZ,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,KACtB,uBAAuB,CAAC;CAC9B;
|
|
1
|
+
{"version":3,"file":"shared.d.ts","sourceRoot":"","sources":["../../src/transform/shared.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACV,uBAAuB,EACvB,gBAAgB,EAChB,8BAA8B,EAE/B,MAAM,YAAY,CAAC;AAEpB,UAAU,2BAA2B;IACnC,WAAW,EAAE,CAAC,OAAO,EAAE,8BAA8B,KAAK,IAAI,CAAC;IAC/D,YAAY,EAAE,CACZ,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,KACtB,uBAAuB,CAAC;CAC9B;AAgGD,wBAAgB,6BAA6B,CAC3C,OAAO,EAAE,2BAA2B,GACnC,CAAC,GAAG,EAAE,OAAO,KAAK,IAAI,CA8ExB"}
|
package/dist/transform/shared.js
CHANGED
|
@@ -3,14 +3,13 @@ function isTransformMessage(message) {
|
|
|
3
3
|
if (!message || typeof message !== 'object')
|
|
4
4
|
return false;
|
|
5
5
|
const value = message;
|
|
6
|
-
const { id, url, html, htmlBuffer, encoding, includeMetadata,
|
|
6
|
+
const { id, url, html, htmlBuffer, encoding, includeMetadata, inputTruncated, } = value;
|
|
7
7
|
return (typeof id === 'string' &&
|
|
8
8
|
typeof url === 'string' &&
|
|
9
9
|
typeof includeMetadata === 'boolean' &&
|
|
10
10
|
(html === undefined || typeof html === 'string') &&
|
|
11
11
|
(htmlBuffer === undefined || htmlBuffer instanceof Uint8Array) &&
|
|
12
12
|
(encoding === undefined || typeof encoding === 'string') &&
|
|
13
|
-
(skipNoiseRemoval === undefined || typeof skipNoiseRemoval === 'boolean') &&
|
|
14
13
|
(inputTruncated === undefined || typeof inputTruncated === 'boolean'));
|
|
15
14
|
}
|
|
16
15
|
function decodeHtml(html, htmlBuffer, encoding, decoder) {
|
|
@@ -83,7 +82,7 @@ export function createTransformMessageHandler(options) {
|
|
|
83
82
|
}
|
|
84
83
|
if (messageType !== 'transform' || !isTransformMessage(message))
|
|
85
84
|
return;
|
|
86
|
-
const { id, url, html, htmlBuffer, encoding, includeMetadata,
|
|
85
|
+
const { id, url, html, htmlBuffer, encoding, includeMetadata, inputTruncated, } = message;
|
|
87
86
|
if (!id.trim()) {
|
|
88
87
|
sendMessage({
|
|
89
88
|
type: 'error',
|
|
@@ -115,7 +114,6 @@ export function createTransformMessageHandler(options) {
|
|
|
115
114
|
const result = runTransform(content, url, {
|
|
116
115
|
includeMetadata,
|
|
117
116
|
signal: controller.signal,
|
|
118
|
-
...(skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
119
117
|
...(inputTruncated ? { inputTruncated: true } : {}),
|
|
120
118
|
});
|
|
121
119
|
sendMessage(createResultMessage(id, result));
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAuCA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA4ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AA6UD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AAqPD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAuKD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AA6DD,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAyUD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CA6CzB;AAED,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAkI1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
|
|
@@ -11,7 +11,7 @@ import { createAbortError, throwIfAborted } from '../lib/utils.js';
|
|
|
11
11
|
import { FetchError, getErrorMessage, toError } from '../lib/utils.js';
|
|
12
12
|
import { isObject } from '../lib/utils.js';
|
|
13
13
|
import { translateHtmlFragmentToMarkdown } from './html-translators.js';
|
|
14
|
-
import { extractMetadata, extractMetadataFromHead, mergeMetadata, } from './metadata.js';
|
|
14
|
+
import { extractMetadata, extractMetadataFromHead, mergeMetadata, normalizeDocumentTitle, } from './metadata.js';
|
|
15
15
|
import { getOrCreateWorkerPool, getWorkerPoolStats, shutdownWorkerPool, } from './worker-pool.js';
|
|
16
16
|
function decodeInput(input, encoding) {
|
|
17
17
|
if (typeof input === 'string')
|
|
@@ -270,6 +270,14 @@ function resolveCollapsedTextLengthUpTo(text, max) {
|
|
|
270
270
|
}
|
|
271
271
|
return length;
|
|
272
272
|
}
|
|
273
|
+
/**
 * Replaces every element matching `[role="alert"]`, `.admonition` or
 * `.callout` with a `<blockquote>` that holds the same child nodes.
 *
 * NOTE(review): presumably this keeps call-out content from being discarded
 * by the article extractor that runs afterwards — confirm against the caller.
 *
 * @param {Document} doc - Document to mutate in place; it must provide
 *   `querySelectorAll` and `createElement`.
 * @returns {void}
 */
function preserveAlertElements(doc) {
    const alerts = doc.querySelectorAll('[role="alert"], .admonition, .callout');
    for (const el of alerts) {
        const bq = doc.createElement('blockquote');
        // Move the existing child nodes instead of copying `innerHTML`.
        // A serialise/re-parse round trip clones the descendants, so a matched
        // element nested inside another matched element would be left behind
        // in the detached original and its clone in the live tree would never
        // be converted. Moving nodes keeps nested matches attached.
        while (el.firstChild) {
            bq.appendChild(el.firstChild);
        }
        el.replaceWith(bq);
    }
}
|
|
273
281
|
function extractArticle(document, url, signal) {
|
|
274
282
|
if (!isReadabilityCompatible(document)) {
|
|
275
283
|
logWarn('Document not compatible with Readability');
|
|
@@ -298,6 +306,7 @@ function extractArticle(document, url, signal) {
|
|
|
298
306
|
const readabilityDoc = typeof doc.cloneNode === 'function'
|
|
299
307
|
? doc.cloneNode(true)
|
|
300
308
|
: doc;
|
|
309
|
+
preserveAlertElements(readabilityDoc);
|
|
301
310
|
// F1: Check abort before heavy Readability parse
|
|
302
311
|
abortPolicy.throwIfAborted(signal, url, 'extract:article:parse');
|
|
303
312
|
const reader = new Readability(readabilityDoc, {
|
|
@@ -797,8 +806,9 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
797
806
|
fetchedAt: new Date().toISOString(),
|
|
798
807
|
};
|
|
799
808
|
if (shouldExtractFromArticle && article) {
|
|
800
|
-
if (article.title !== undefined)
|
|
801
|
-
metadata.title = article.title;
|
|
809
|
+
if (article.title !== undefined) {
|
|
810
|
+
metadata.title = normalizeDocumentTitle(article.title, url);
|
|
811
|
+
}
|
|
802
812
|
if (article.byline !== undefined)
|
|
803
813
|
metadata.author = article.byline;
|
|
804
814
|
}
|
|
@@ -828,6 +838,12 @@ const CONTENT_ROOT_SELECTORS = [
|
|
|
828
838
|
'.post-body',
|
|
829
839
|
'.article-body',
|
|
830
840
|
];
|
|
841
|
+
const PRIMARY_HEADING_ROOT_SELECTORS = [
|
|
842
|
+
...CONTENT_ROOT_SELECTORS,
|
|
843
|
+
'.markdown-body',
|
|
844
|
+
'.entry-content',
|
|
845
|
+
'[itemprop="text"]',
|
|
846
|
+
];
|
|
831
847
|
function findContentRoot(document) {
|
|
832
848
|
for (const selector of CONTENT_ROOT_SELECTORS) {
|
|
833
849
|
const element = document.querySelector(selector);
|
|
@@ -841,6 +857,34 @@ function findContentRoot(document) {
|
|
|
841
857
|
}
|
|
842
858
|
return undefined;
|
|
843
859
|
}
|
|
860
|
+
/**
 * Looks up the text of the first `h1`/`h2` found under a known content root.
 *
 * Each selector in `PRIMARY_HEADING_ROOT_SELECTORS` is tried in order. A
 * selector is skipped when it matches no element, when the matched root has
 * no `h1`/`h2` descendant, or when that heading's trimmed text is empty.
 *
 * @param {Document} document - Document to search.
 * @returns {string | undefined} Trimmed heading text, or `undefined` when no
 *   selector yields a non-empty heading.
 */
function findPrimaryHeading(document) {
    for (const rootSelector of PRIMARY_HEADING_ROOT_SELECTORS) {
        const headingText = document
            .querySelector(rootSelector)
            ?.querySelector('h1, h2')
            ?.textContent.trim();
        if (headingText) {
            return headingText;
        }
    }
    return undefined;
}
|
|
874
|
+
/**
 * Tells whether `url` points at the root page of a GitHub repository.
 *
 * True only when the string parses as a URL, its host is `github.com` or
 * `www.github.com`, and its path has exactly two non-empty segments.
 *
 * @param {string} url - URL to classify.
 * @returns {boolean} `true` for `/<owner>/<repo>` on a GitHub host, otherwise `false`.
 */
function isGithubRepositoryRootUrl(url) {
    let target;
    try {
        target = new URL(url);
    }
    catch {
        // Anything that is not a parseable URL cannot be a repository root.
        return false;
    }
    const host = target.hostname.toLowerCase();
    const onGithub = host === 'github.com' || host === 'www.github.com';
    if (!onGithub) {
        return false;
    }
    const pathSegments = target.pathname.split('/').filter(Boolean);
    return pathSegments.length === 2;
}
|
|
844
888
|
function shouldUseArticleContent(article, originalHtmlOrDocument) {
|
|
845
889
|
const articleLength = article.textContent.length;
|
|
846
890
|
const originalLength = getVisibleTextLength(originalHtmlOrDocument);
|
|
@@ -868,35 +912,30 @@ function shouldUseArticleContent(article, originalHtmlOrDocument) {
|
|
|
868
912
|
return !hasTruncatedSentences(article.textContent);
|
|
869
913
|
}
|
|
870
914
|
function buildContentSource(params) {
|
|
871
|
-
const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, truncated,
|
|
915
|
+
const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, truncated, signal, } = params;
|
|
872
916
|
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
873
917
|
const base = {
|
|
874
918
|
favicon: extractedMeta.favicon,
|
|
875
919
|
metadata,
|
|
876
920
|
extractedMetadata: extractedMeta,
|
|
877
921
|
truncated,
|
|
922
|
+
primaryHeading: document ? findPrimaryHeading(document) : undefined,
|
|
878
923
|
};
|
|
879
924
|
if (useArticleContent && article) {
|
|
880
|
-
const
|
|
881
|
-
|
|
882
|
-
|
|
925
|
+
const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
|
|
926
|
+
prepareDocumentForMarkdown(articleDoc, url, signal);
|
|
927
|
+
const preferPrimaryHeading = isGithubRepositoryRootUrl(url);
|
|
883
928
|
return {
|
|
884
929
|
...base,
|
|
885
|
-
sourceHtml:
|
|
886
|
-
title:
|
|
930
|
+
sourceHtml: articleDoc.body.innerHTML,
|
|
931
|
+
title: (preferPrimaryHeading ? base.primaryHeading : undefined) ??
|
|
932
|
+
(article.title !== undefined
|
|
933
|
+
? normalizeDocumentTitle(article.title, url)
|
|
934
|
+
: undefined),
|
|
887
935
|
skipNoiseRemoval: true,
|
|
888
936
|
};
|
|
889
937
|
}
|
|
890
938
|
if (document) {
|
|
891
|
-
if (skipNoiseRemoval) {
|
|
892
|
-
return {
|
|
893
|
-
...base,
|
|
894
|
-
sourceHtml: html,
|
|
895
|
-
title: extractedMeta.title,
|
|
896
|
-
skipNoiseRemoval: true,
|
|
897
|
-
document,
|
|
898
|
-
};
|
|
899
|
-
}
|
|
900
939
|
prepareDocumentForMarkdown(document, url, signal);
|
|
901
940
|
const contentRoot = findContentRoot(document);
|
|
902
941
|
return {
|
|
@@ -919,7 +958,7 @@ function resolveContentSource(params) {
|
|
|
919
958
|
...(params.signal ? { signal: params.signal } : {}),
|
|
920
959
|
...(params.inputTruncated ? { inputTruncated: true } : {}),
|
|
921
960
|
});
|
|
922
|
-
const useArticleContent =
|
|
961
|
+
const useArticleContent = article
|
|
923
962
|
? shouldUseArticleContent(article, document)
|
|
924
963
|
: false;
|
|
925
964
|
return buildContentSource({
|
|
@@ -931,7 +970,6 @@ function resolveContentSource(params) {
|
|
|
931
970
|
useArticleContent,
|
|
932
971
|
document,
|
|
933
972
|
truncated: truncated ?? false,
|
|
934
|
-
...(params.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
935
973
|
...(params.signal ? { signal: params.signal } : {}),
|
|
936
974
|
});
|
|
937
975
|
}
|
|
@@ -942,7 +980,10 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
942
980
|
...(context.document ? { document: context.document } : {}),
|
|
943
981
|
...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
944
982
|
}));
|
|
945
|
-
if (context.
|
|
983
|
+
if (context.primaryHeading && isGithubRepositoryRootUrl(url)) {
|
|
984
|
+
content = stripLeadingHeading(content, context.primaryHeading);
|
|
985
|
+
}
|
|
986
|
+
if (context.title && !/^(#{1,6})\s/.test(content.trimStart())) {
|
|
946
987
|
const icon = context.favicon;
|
|
947
988
|
let prefix = ' ';
|
|
948
989
|
if (icon) {
|
|
@@ -964,6 +1005,34 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
964
1005
|
metadata: context.extractedMetadata,
|
|
965
1006
|
};
|
|
966
1007
|
}
|
|
1008
|
+
/**
 * Produces a comparison key for heading text: each run of whitespace becomes
 * one space, surrounding whitespace is removed, and the result is lower-cased.
 *
 * @param {string} value - Raw heading text.
 * @returns {string} Canonical form for case- and spacing-insensitive equality checks.
 */
function normalizeHeadingText(value) {
    const words = value.trim().split(/\s+/);
    return words.join(' ').toLowerCase();
}
|
|
1011
|
+
/**
 * Removes the first Markdown ATX heading from `markdown` when its text equals
 * `headingText` (compared through `normalizeHeadingText`).
 *
 * Only the first 12 non-empty lines are inspected. The first heading line
 * found decides the outcome: on a mismatch the input is returned unchanged,
 * on a match that line is dropped together with one directly following blank
 * line. Input with no heading in the scanned window is returned unchanged.
 *
 * @param {string} markdown - Markdown document.
 * @param {string} headingText - Heading text to look for.
 * @returns {string} Markdown without the matching leading heading, or the original string.
 */
function stripLeadingHeading(markdown, headingText) {
    if (!markdown) {
        return markdown;
    }
    const rows = markdown.split('\n');
    const wanted = normalizeHeadingText(headingText);
    let inspected = 0;
    let index = 0;
    while (index < rows.length && inspected < 12) {
        const row = (rows[index] ?? '').trim();
        if (row !== '') {
            inspected += 1;
            const heading = /^(#{1,6})\s+(.+?)\s*$/.exec(row);
            if (heading) {
                if (normalizeHeadingText(heading[2] ?? '') !== wanted) {
                    // The first heading is a different one: leave the document alone.
                    return markdown;
                }
                // Drop the heading, plus the line after it when that line is
                // blank (or absent, in which case splice just removes less).
                const followedByBlank = (rows[index + 1] ?? '').trim() === '';
                rows.splice(index, followedByBlank ? 2 : 1);
                return rows.join('\n');
            }
        }
        index += 1;
    }
    return markdown;
}
|
|
967
1036
|
const REPLACEMENT_CHAR = '\ufffd';
|
|
968
1037
|
const BINARY_INDICATOR_THRESHOLD = 0.1;
|
|
969
1038
|
function hasBinaryIndicators(content) {
|
|
@@ -1004,7 +1073,6 @@ export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
|
1004
1073
|
url,
|
|
1005
1074
|
includeMetadata: options.includeMetadata,
|
|
1006
1075
|
...(signal ? { signal } : {}),
|
|
1007
|
-
...(options.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
1008
1076
|
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1009
1077
|
}));
|
|
1010
1078
|
const result = buildMarkdownFromContext(context, url, signal);
|
|
@@ -1035,7 +1103,6 @@ function buildWorkerTransformOptions(options) {
|
|
|
1035
1103
|
return {
|
|
1036
1104
|
includeMetadata: options.includeMetadata,
|
|
1037
1105
|
...(options.signal ? { signal: options.signal } : {}),
|
|
1038
|
-
...(options.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
1039
1106
|
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1040
1107
|
};
|
|
1041
1108
|
}
|
|
@@ -61,7 +61,6 @@ export interface MarkdownTransformResult extends MarkdownPayload {
|
|
|
61
61
|
export interface TransformOptions {
|
|
62
62
|
includeMetadata: boolean;
|
|
63
63
|
signal?: AbortSignal;
|
|
64
|
-
skipNoiseRemoval?: boolean;
|
|
65
64
|
inputTruncated?: boolean;
|
|
66
65
|
}
|
|
67
66
|
/**
|
|
@@ -98,7 +97,6 @@ export interface TransformWorkerTransformMessage {
|
|
|
98
97
|
encoding?: string | undefined;
|
|
99
98
|
url: string;
|
|
100
99
|
includeMetadata: boolean;
|
|
101
|
-
skipNoiseRemoval?: boolean | undefined;
|
|
102
100
|
inputTruncated?: boolean | undefined;
|
|
103
101
|
}
|
|
104
102
|
export interface TransformWorkerCancelledMessage {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/transform/types.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,UAAU,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,GAAG,EAAE,MAAM,CAAC;IACZ,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,gBAAgB,GAAG,IAAI,CAAC;IACjC,QAAQ,EAAE,iBAAiB,CAAC;CAC7B;AAED,UAAU,eAAe;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC3B,SAAS,EAAE,OAAO,CAAC;IACnB,QAAQ,CAAC,EAAE,iBAAiB,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,uBAAwB,SAAQ,eAAe;IAC9D,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,eAAe,EAAE,OAAO,CAAC;IACzB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/transform/types.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,UAAU,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,GAAG,EAAE,MAAM,CAAC;IACZ,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,gBAAgB,GAAG,IAAI,CAAC;IACjC,QAAQ,EAAE,iBAAiB,CAAC;CAC7B;AAED,UAAU,eAAe;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC3B,SAAS,EAAE,OAAO,CAAC;IACnB,QAAQ,CAAC,EAAE,iBAAiB,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,uBAAwB,SAAQ,eAAe;IAC9D,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,eAAe,EAAE,OAAO,CAAC;IACzB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,cAAc,CAAC,EAAE,OAAO,CAAC;CAC1B;AAED;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,CAAC,EAAE,CAAC,CAAC;IACL,IAAI,EAAE,OAAO,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,aAAa,CAAC,EAAE,MAAM,CAAC;CACjC;AAED;;GAEG;AACH,MAAM,WAAW,+BAA+B;IAC9C,IAAI,EAAE,WAAW,CAAC;IAClB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC1B,UAAU,CAAC,EAAE,UAAU,GAAG,SAAS,CAAC;IACpC,QAAQ,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC9B,GAAG,EAAE,MAAM,CAAC;IACZ,eAAe,EAAE,
OAAO,CAAC;IACzB,cAAc,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;CACtC;AAED,MAAM,WAAW,+BAA+B;IAC9C,IAAI,EAAE,WAAW,CAAC;IAClB,EAAE,EAAE,MAAM,CAAC;CACZ;AAED,MAAM,WAAW,4BAA4B;IAC3C,IAAI,EAAE,QAAQ,CAAC;IACf,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,eAAe,CAAC;CACzB;AAED,MAAM,WAAW,2BAA2B;IAC1C,IAAI,EAAE,OAAO,CAAC;IACd,EAAE,EAAE,MAAM,CAAC;IACX,KAAK,EAAE;QACL,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,EAAE,MAAM,CAAC;QAChB,GAAG,EAAE,MAAM,CAAC;QACZ,UAAU,CAAC,EAAE,MAAM,CAAC;QACpB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KACnC,CAAC;CACH;AAED,MAAM,MAAM,8BAA8B,GACtC,4BAA4B,GAC5B,2BAA2B,GAC3B,+BAA+B,CAAC"}
|
|
@@ -3,7 +3,6 @@ interface TransformWorkerPool {
|
|
|
3
3
|
transform(html: string, url: string, options: {
|
|
4
4
|
includeMetadata: boolean;
|
|
5
5
|
signal?: AbortSignal;
|
|
6
|
-
skipNoiseRemoval?: boolean;
|
|
7
6
|
inputTruncated?: boolean;
|
|
8
7
|
}): Promise<MarkdownTransformResult>;
|
|
9
8
|
close(): Promise<void>;
|
|
@@ -29,13 +28,11 @@ declare class WorkerPool implements TransformWorkerPool {
|
|
|
29
28
|
transform(html: string, url: string, options: {
|
|
30
29
|
includeMetadata: boolean;
|
|
31
30
|
signal?: AbortSignal;
|
|
32
|
-
skipNoiseRemoval?: boolean;
|
|
33
31
|
inputTruncated?: boolean;
|
|
34
32
|
}): Promise<MarkdownTransformResult>;
|
|
35
33
|
transform(htmlBuffer: Uint8Array, url: string, options: {
|
|
36
34
|
includeMetadata: boolean;
|
|
37
35
|
signal?: AbortSignal;
|
|
38
|
-
skipNoiseRemoval?: boolean;
|
|
39
36
|
inputTruncated?: boolean;
|
|
40
37
|
encoding?: string;
|
|
41
38
|
}): Promise<MarkdownTransformResult>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"worker-pool.d.ts","sourceRoot":"","sources":["../../src/transform/worker-pool.ts"],"names":[],"mappings":"AAuBA,OAAO,KAAK,EACV,uBAAuB,EAGxB,MAAM,YAAY,CAAC;
|
|
1
|
+
{"version":3,"file":"worker-pool.d.ts","sourceRoot":"","sources":["../../src/transform/worker-pool.ts"],"names":[],"mappings":"AAuBA,OAAO,KAAK,EACV,uBAAuB,EAGxB,MAAM,YAAY,CAAC;AAqJpB,UAAU,mBAAmB;IAC3B,SAAS,CACP,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,eAAe,EAAE,OAAO,CAAC;QACzB,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,GACA,OAAO,CAAC,uBAAuB,CAAC,CAAC;IACpC,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IACvB,aAAa,IAAI,MAAM,CAAC;IACxB,gBAAgB,IAAI,MAAM,CAAC;IAC3B,WAAW,IAAI,MAAM,CAAC;CACvB;AAkBD,cAAM,UAAW,YAAW,mBAAmB;IAC7C,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAAkC;IAExE,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAkC;IAC1D,OAAO,CAAC,QAAQ,CAAS;IACzB,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAoB;IAChD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAoB;IAEhD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAqB;IAC3C,OAAO,CAAC,SAAS,CAAK;IACtB,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAmC;IAC5D,OAAO,CAAC,QAAQ,CAAC,UAAU,CAOvB;IAEJ,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAClC,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,SAAS,CAAK;gBAEV,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM;IASrC,SAAS,CACb,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,eAAe,EAAE,OAAO,CAAC;QACzB,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,GACA,OAAO,CAAC,uBAAuB,CAAC;IAC7B,SAAS,CACb,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,eAAe,EAAE,OAAO,CAAC;QACzB,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;QACzB,QAAQ,CAAC,EAAE,MAAM,CAAC;KACnB,GACA,OAAO,CAAC,uBAAuB,CAAC;IAmCnC,aAAa,IAAI,MAAM;IAKvB,gBAAgB,IAAI,MAAM;IAI1B,WAAW,IAAI,MAAM;IAIrB,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI;IAWpB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAiC5B,OAAO,CAAC,UAAU;IAIlB,OAAO,CAAC,iBAAiB;IAkDzB,OAAO,CAAC,aAAa;IAsCrB,OAAO,CAAC,gBAAgB;IAOxB,OAAO,CAAC,gBAAgB;YAyBV,aAAa;IA2B3B,OAAO,CAAC,kBAAkB;IAY1B,OAAO,CAAC,WAAW;IAmCnB,OAAO,CAAC,cAAc;IAuBtB,OAAO,CAAC,aAAa;IAYrB,OAAO,CAAC,eAAe;IAsDvB,OAAO,CAAC,YAAY;IAWpB,OAAO,CAAC,QAAQ;IAOhB,OAAO,CAAC,QAAQ;IAWhB,OAAO,CAAC,YAAY;IASpB,OAAO,CAAC,UAAU;IA2BlB,OAAO,CAAC,kBAAkB;IAe1B,OAAO,CAAC,iBAAiB;IAiFzB,OAAO,CAAC,YAAY;IAQpB,OAAO,CAAC,e
AAe;IAQvB,OAAO,CAAC,iBAAiB;CAW1B;AAMD,wBAAgB,qBAAqB,IAAI,UAAU,CAIlD;AAED,wBAAgB,kBAAkB,IAAI;IACpC,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB,GAAG,IAAI,CAOP;AAED,wBAAsB,kBAAkB,IAAI,OAAO,CAAC,IAAI,CAAC,CAIxD"}
|
|
@@ -83,7 +83,6 @@ function buildWorkerDispatchPayload(task) {
|
|
|
83
83
|
id: task.id,
|
|
84
84
|
url: task.url,
|
|
85
85
|
includeMetadata: task.includeMetadata,
|
|
86
|
-
...(task.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
87
86
|
...(task.inputTruncated ? { inputTruncated: true } : {}),
|
|
88
87
|
};
|
|
89
88
|
if (!task.htmlBuffer) {
|
|
@@ -214,7 +213,6 @@ class WorkerPool {
|
|
|
214
213
|
id,
|
|
215
214
|
url,
|
|
216
215
|
includeMetadata: options.includeMetadata,
|
|
217
|
-
...(options.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
218
216
|
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
219
217
|
signal: options.signal,
|
|
220
218
|
abortListener,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@j0hanz/fetch-url-mcp",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.9.1",
|
|
4
4
|
"mcpName": "io.github.j0hanz/fetch-url-mcp",
|
|
5
5
|
"description": "A web content fetcher MCP server that converts HTML to clean, AI and human readable markdown.",
|
|
6
6
|
"type": "module",
|
|
@@ -74,7 +74,7 @@
|
|
|
74
74
|
"@mozilla/readability": "^0.6.0",
|
|
75
75
|
"linkedom": "^0.18.12",
|
|
76
76
|
"node-html-markdown": "^2.0.0",
|
|
77
|
-
"undici": "^7.
|
|
77
|
+
"undici": "^7.24.1",
|
|
78
78
|
"zod": "^4.3.6"
|
|
79
79
|
},
|
|
80
80
|
"devDependencies": {
|