@j0hanz/fetch-url-mcp 1.8.4 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/content.d.ts.map +1 -1
- package/dist/lib/content.js +16 -10
- package/dist/lib/fetch-pipeline.d.ts +1 -2
- package/dist/lib/fetch-pipeline.d.ts.map +1 -1
- package/dist/lib/fetch-pipeline.js +6 -16
- package/dist/resources/instructions.d.ts.map +1 -1
- package/dist/resources/instructions.js +1 -2
- package/dist/schemas.d.ts +0 -2
- package/dist/schemas.d.ts.map +1 -1
- package/dist/schemas.js +0 -11
- package/dist/tools/fetch-url.d.ts.map +1 -1
- package/dist/tools/fetch-url.js +5 -7
- package/dist/transform/shared.d.ts.map +1 -1
- package/dist/transform/shared.js +2 -4
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +5 -18
- package/dist/transform/types.d.ts +0 -2
- package/dist/transform/types.d.ts.map +1 -1
- package/dist/transform/worker-pool.d.ts +0 -3
- package/dist/transform/worker-pool.d.ts.map +1 -1
- package/dist/transform/worker-pool.js +0 -2
- package/package.json +1 -1
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;
|
|
1
|
+
{"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;AA0gB3D,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAQR;AAWD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAYN;AACD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR;AAkVD,wBAAgB,6BAA6B,CAC3C,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,MAAM,GAAG,SAAS,CAKpB;AACD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CA6BvE;AA+CD,UAAU,cAAc;IACtB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AA4QD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,CA6DR;AA2GD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CAOpB;AACD,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAmCxE;AAcD,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAczD;AAaD,wBAAgB,mBAAmB,CACjC,QAAQ,CAAC,EAAE,aAAa,EACxB,WAAW,CAAC,EAAE,MAAM,GACnB,MAAM,CAmBR"}
|
package/dist/lib/content.js
CHANGED
|
@@ -6,6 +6,7 @@ const NOISE_SCAN_LIMIT = 50_000;
|
|
|
6
6
|
const MIN_BODY_CONTENT_LENGTH = 100;
|
|
7
7
|
const DIALOG_MIN_CHARS_FOR_PRESERVATION = 500;
|
|
8
8
|
const NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION = 500;
|
|
9
|
+
const ABORT_CHECK_INTERVAL = 500;
|
|
9
10
|
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
10
11
|
const HTML_FRAGMENT_MARKERS = /<\s*(?:article|main|section|div|nav|footer|header|aside|table|ul|ol)\b/i;
|
|
11
12
|
const NOISE_PATTERNS = [
|
|
@@ -30,6 +31,7 @@ const BASE_STRUCTURAL_TAGS = new Set([
|
|
|
30
31
|
'style',
|
|
31
32
|
'noscript',
|
|
32
33
|
'iframe',
|
|
34
|
+
'template',
|
|
33
35
|
'form',
|
|
34
36
|
'button',
|
|
35
37
|
'input',
|
|
@@ -76,6 +78,10 @@ const PROMO_TOKENS_ALWAYS = [
|
|
|
76
78
|
'pagination',
|
|
77
79
|
'pager',
|
|
78
80
|
'taglist',
|
|
81
|
+
'twitter-tweet',
|
|
82
|
+
'fb-post',
|
|
83
|
+
'instagram-media',
|
|
84
|
+
'social-embed',
|
|
79
85
|
];
|
|
80
86
|
const PROMO_TOKENS_AGGRESSIVE = ['ad', 'related', 'comment'];
|
|
81
87
|
const PROMO_TOKENS_BY_CATEGORY = {
|
|
@@ -86,7 +92,7 @@ const PROMO_TOKENS_BY_CATEGORY = {
|
|
|
86
92
|
const BASE_NOISE_SELECTORS = {
|
|
87
93
|
navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"]',
|
|
88
94
|
cookieBanners: '[role="dialog"]',
|
|
89
|
-
hidden: '[style*="display: none"],[style*="display:none"],[hidden],[aria-hidden="true"]',
|
|
95
|
+
hidden: '[style*="display: none"],[style*="display:none"],[style*="visibility: hidden"],[style*="visibility:hidden"],[hidden],[aria-hidden="true"]',
|
|
90
96
|
};
|
|
91
97
|
const NO_MATCH_REGEX = /a^/i;
|
|
92
98
|
let cachedContext;
|
|
@@ -237,20 +243,20 @@ function removeNodes(nodes) {
|
|
|
237
243
|
}
|
|
238
244
|
}
|
|
239
245
|
}
|
|
240
|
-
function scoreNavFooter(
|
|
246
|
+
function scoreNavFooter(meta, weights) {
|
|
241
247
|
let score = 0;
|
|
242
|
-
if (ALWAYS_NOISE_TAGS.has(tagName))
|
|
248
|
+
if (ALWAYS_NOISE_TAGS.has(meta.tagName))
|
|
243
249
|
score += weights.structural;
|
|
244
250
|
// Header Boilerplate
|
|
245
|
-
if (tagName === 'header') {
|
|
246
|
-
if ((role && NAVIGATION_ROLES.has(role)) ||
|
|
247
|
-
HEADER_NOISE_PATTERN.test(`${className} ${id}`)) {
|
|
251
|
+
if (meta.tagName === 'header') {
|
|
252
|
+
if ((meta.role && NAVIGATION_ROLES.has(meta.role)) ||
|
|
253
|
+
HEADER_NOISE_PATTERN.test(`${meta.className} ${meta.id}`)) {
|
|
248
254
|
score += weights.structural;
|
|
249
255
|
}
|
|
250
256
|
}
|
|
251
257
|
// Role Noise
|
|
252
|
-
if (role && NAVIGATION_ROLES.has(role)) {
|
|
253
|
-
if (tagName !== 'aside' || role !== 'complementary') {
|
|
258
|
+
if (meta.role && NAVIGATION_ROLES.has(meta.role)) {
|
|
259
|
+
if (meta.tagName !== 'aside' || meta.role !== 'complementary') {
|
|
254
260
|
score += weights.structural;
|
|
255
261
|
}
|
|
256
262
|
}
|
|
@@ -287,7 +293,7 @@ function isNoiseElement(element, context) {
|
|
|
287
293
|
}
|
|
288
294
|
// Nav/Footer Scoring
|
|
289
295
|
if (context.flags.navFooter) {
|
|
290
|
-
score += scoreNavFooter(meta
|
|
296
|
+
score += scoreNavFooter(meta, weights);
|
|
291
297
|
}
|
|
292
298
|
// Hidden
|
|
293
299
|
if (meta.isHidden && !meta.isInteractive) {
|
|
@@ -375,7 +381,7 @@ function stripNoise(document, context, signal) {
|
|
|
375
381
|
// Candidates
|
|
376
382
|
const candidates = document.querySelectorAll(context.candidateSelector);
|
|
377
383
|
for (let i = candidates.length - 1; i >= 0; i--) {
|
|
378
|
-
if (i %
|
|
384
|
+
if (i % ABORT_CHECK_INTERVAL === 0 && signal?.aborted) {
|
|
379
385
|
throw new Error('Noise removal aborted');
|
|
380
386
|
}
|
|
381
387
|
const node = candidates[i];
|
|
@@ -45,14 +45,13 @@ export declare const markdownTransform: (input: {
|
|
|
45
45
|
buffer: Uint8Array;
|
|
46
46
|
encoding: string;
|
|
47
47
|
truncated?: boolean;
|
|
48
|
-
}, url: string, signal?: AbortSignal
|
|
48
|
+
}, url: string, signal?: AbortSignal) => Promise<MarkdownPipelineResult>;
|
|
49
49
|
export declare function serializeMarkdownResult(result: MarkdownPipelineResult): string;
|
|
50
50
|
interface SharedFetchOptions {
|
|
51
51
|
readonly url: string;
|
|
52
52
|
readonly signal?: AbortSignal;
|
|
53
53
|
readonly cacheVary?: Record<string, unknown> | string;
|
|
54
54
|
readonly forceRefresh?: boolean;
|
|
55
|
-
readonly maxInlineChars?: number;
|
|
56
55
|
readonly onStage?: (stage: SharedFetchStage) => void;
|
|
57
56
|
readonly transform: (input: {
|
|
58
57
|
buffer: Uint8Array;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fetch-pipeline.d.ts","sourceRoot":"","sources":["../../src/lib/fetch-pipeline.ts"],"names":[],"mappings":"AAMA,OAAO,EAAE,KAAK,uBAAuB,EAAE,MAAM,uBAAuB,CAAC;AAqBrE,KAAK,UAAU,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;AAY1C,wBAAgB,gBAAgB,CAC9B,GAAG,EAAE,OAAO,EACZ,IAAI,EAAE,SAAS,MAAM,EAAE,GACtB,UAAU,GAAG,SAAS,CAOxB;AACD,wBAAgB,UAAU,CACxB,MAAM,CAAC,EAAE,WAAW,GACnB;IAAE,MAAM,EAAE,WAAW,CAAA;CAAE,GAAG,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,CAEjD;AAMD,eAAO,MAAM,iBAAiB,mBAAmB,CAAC;AAClD,MAAM,WAAW,mBAAmB;IAClC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB;AAuED,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,MAAM,GACb,MAAM,CAkBR;
|
|
1
|
+
{"version":3,"file":"fetch-pipeline.d.ts","sourceRoot":"","sources":["../../src/lib/fetch-pipeline.ts"],"names":[],"mappings":"AAMA,OAAO,EAAE,KAAK,uBAAuB,EAAE,MAAM,uBAAuB,CAAC;AAqBrE,KAAK,UAAU,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;AAY1C,wBAAgB,gBAAgB,CAC9B,GAAG,EAAE,OAAO,EACZ,IAAI,EAAE,SAAS,MAAM,EAAE,GACtB,UAAU,GAAG,SAAS,CAOxB;AACD,wBAAgB,UAAU,CACxB,MAAM,CAAC,EAAE,WAAW,GACnB;IAAE,MAAM,EAAE,WAAW,CAAA;CAAE,GAAG,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,CAEjD;AAMD,eAAO,MAAM,iBAAiB,mBAAmB,CAAC;AAClD,MAAM,WAAW,mBAAmB;IAClC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB;AAuED,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,MAAM,GACb,MAAM,CAkBR;AAsCD,UAAU,oBAAoB,CAAC,CAAC;IAC9B,GAAG,EAAE,MAAM,CAAC;IACZ,cAAc,EAAE,MAAM,CAAC;IACvB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC;IAC7C,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,OAAO,CAAC,EAAE,CAAC,KAAK,EAAE,gBAAgB,KAAK,IAAI,CAAC;IAC5C,SAAS,EAAE,CACT,KAAK,EAAE;QAAE,MAAM,EAAE,UAAU,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,SAAS,CAAC,EAAE,OAAO,CAAA;KAAE,EACpE,GAAG,EAAE,MAAM,KACR,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;IACpB,SAAS,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAK,MAAM,CAAC;IAClC,WAAW,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,CAAC,GAAG,SAAS,CAAC;CACjD;AACD,MAAM,WAAW,cAAc,CAAC,CAAC;IAC/B,IAAI,EAAE,CAAC,CAAC;IACR,SAAS,EAAE,OAAO,CAAC;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC1B;AACD,MAAM,MAAM,gBAAgB,GACxB,aAAa,GACb,aAAa,GACb,WAAW,GACX,eAAe,GACf,cAAc,GACd,gBAAgB,GAChB,iBAAiB,GACjB,gBAAgB,GAChB,iBAAiB,CAAC;AAmMtB,wBAAsB,oBAAoB,CAAC,CAAC,EAC1C,OAAO,EAAE,oBAAoB,CAAC,CAAC,CAAC,GAC/B,OAAO,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAmE5B;AAMD,MAAM,MAAM,sBAAsB,GAAG,uBAAuB,GAAG;IAC7D,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;AACF,wBAAgB,yBAAyB,CACvC,MAAM,EAAE,MAAM,GACb,sBAAsB,GAAG,SAAS,CAqBpC;AACD,eAAO,MAAM,iBAAiB,GAC5B,OAAO;IAAE,MAAM,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAC;IAAC,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,EACpE,KAAK,MAAM,EACX,SAAS,WAAW,KACnB,OAAO,CAAC,sBAAsB,CAShC,CAAC;AACF,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,sBAAsB,GAC7B,MAAM,CAaR;AAMD,UAAU,kBAAkB;IAC1B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,MAAM,CAAC,EAAE,WAAW,CAAC;IAC9B,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC;IACtD,QAAQ,CAAC,YAAY,CAAC,EAAE,OAAO,CAAC;IAChC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC,KAAK,EAAE,gBAAgB,KAAK,IAAI,CAAC;IACrD,QAAQ,CAAC,SAAS,EAAE,CAClB,KAAK,EAAE;QAAE,MAAM,EAAE,UAAU,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,SAAS,CAAC,EAAE,OAAO,CAAA;KAAE,EACpE,aAAa,EAAE,MAAM,KAClB,sBAAsB,GAAG,OAAO,CAAC,sBAAsB,CAAC,CAAC;IAC9D,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC,MAAM,EAAE,sBAAsB,KAAK,MAAM,CAAC;IAChE,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,sBAAsB,GAAG,SAAS,CAAC;CAC/E;AACD,UAAU,eAAe;IACvB,QAAQ,CAAC,oBAAoB,CAAC,EAAE,OAAO,oBAAoB,CAAC;CAC7D;AAgBD,wBAAsB,kBAAkB,CACtC,OAAO,EAAE,kBAAkB,EAC3B,IAAI,GAAE,eAAoB,GACzB,OAAO,CAAC;IACT,QAAQ,EAAE,cAAc,CAAC,sBAAsB,CAAC,CAAC;IACjD,YAAY,EAAE,mBAAmB,CAAC;CACnC,CAAC,CAUD"}
|
|
@@ -107,9 +107,9 @@ export function appendTruncationMarker(content, marker) {
|
|
|
107
107
|
return `${contentWithFence}${marker}`;
|
|
108
108
|
}
|
|
109
109
|
class InlineContentLimiter {
|
|
110
|
-
apply(content
|
|
110
|
+
apply(content) {
|
|
111
111
|
const contentSize = content.length;
|
|
112
|
-
const inlineLimit =
|
|
112
|
+
const inlineLimit = config.constants.maxInlineContentChars;
|
|
113
113
|
if (isWithinInlineLimit(contentSize, inlineLimit)) {
|
|
114
114
|
return { content, contentSize };
|
|
115
115
|
}
|
|
@@ -120,22 +120,13 @@ class InlineContentLimiter {
|
|
|
120
120
|
truncated: true,
|
|
121
121
|
};
|
|
122
122
|
}
|
|
123
|
-
resolveInlineLimit(inlineLimitOverride) {
|
|
124
|
-
const globalLimit = config.constants.maxInlineContentChars;
|
|
125
|
-
if (inlineLimitOverride === undefined)
|
|
126
|
-
return globalLimit;
|
|
127
|
-
if (globalLimit > 0 && inlineLimitOverride > 0) {
|
|
128
|
-
return Math.min(inlineLimitOverride, globalLimit);
|
|
129
|
-
}
|
|
130
|
-
return inlineLimitOverride;
|
|
131
|
-
}
|
|
132
123
|
}
|
|
133
124
|
function isWithinInlineLimit(contentSize, inlineLimit) {
|
|
134
125
|
return inlineLimit <= 0 || contentSize <= inlineLimit;
|
|
135
126
|
}
|
|
136
127
|
const inlineLimiter = new InlineContentLimiter();
|
|
137
|
-
function applyInlineContentLimit(content
|
|
138
|
-
return inlineLimiter.apply(content
|
|
128
|
+
function applyInlineContentLimit(content) {
|
|
129
|
+
return inlineLimiter.apply(content);
|
|
139
130
|
}
|
|
140
131
|
function resolveNormalizedUrl(url) {
|
|
141
132
|
const { normalizedUrl: validatedUrl } = normalizeUrl(url);
|
|
@@ -341,12 +332,11 @@ export function parseCachedMarkdownResult(cached) {
|
|
|
341
332
|
truncated,
|
|
342
333
|
};
|
|
343
334
|
}
|
|
344
|
-
export const markdownTransform = async (input, url, signal
|
|
335
|
+
export const markdownTransform = async (input, url, signal) => {
|
|
345
336
|
const result = await transformBufferToMarkdown(input.buffer, url, {
|
|
346
337
|
includeMetadata: true,
|
|
347
338
|
encoding: input.encoding,
|
|
348
339
|
...withSignal(signal),
|
|
349
|
-
...(skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
350
340
|
...(input.truncated ? { inputTruncated: true } : {}),
|
|
351
341
|
});
|
|
352
342
|
const truncated = Boolean(result.truncated || input.truncated);
|
|
@@ -382,6 +372,6 @@ export async function performSharedFetch(options, deps = {}) {
|
|
|
382
372
|
const pipeline = await executePipeline(buildSharedFetchPipelineOptions(options));
|
|
383
373
|
options.onStage?.('prepare_output');
|
|
384
374
|
options.onStage?.('finalize_output');
|
|
385
|
-
const inlineResult = applyInlineContentLimit(pipeline.data.content
|
|
375
|
+
const inlineResult = applyInlineContentLimit(pipeline.data.content);
|
|
386
376
|
return { pipeline, inlineResult };
|
|
387
377
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"instructions.d.ts","sourceRoot":"","sources":["../../src/resources/instructions.ts"],"names":[],"mappings":"AAIA,wBAAgB,uBAAuB,IAAI,MAAM,
|
|
1
|
+
{"version":3,"file":"instructions.d.ts","sourceRoot":"","sources":["../../src/resources/instructions.ts"],"names":[],"mappings":"AAIA,wBAAgB,uBAAuB,IAAI,MAAM,CA0ChD"}
|
|
@@ -18,8 +18,7 @@ export function buildServerInstructions() {
|
|
|
18
18
|
<workflows>
|
|
19
19
|
1. Standard: Call \`${FETCH_URL_TOOL_NAME}\` -> Read \`markdown\`. If \`truncated: true\`, retry with \`forceRefresh: true\`.
|
|
20
20
|
2. Fresh: Set \`forceRefresh: true\` to bypass cache.
|
|
21
|
-
3.
|
|
22
|
-
4. Async: Add \`task: { ttl: <ms> }\` to \`tools/call\` -> Poll \`tasks/get\` -> Call \`tasks/result\`.
|
|
21
|
+
3. Async: Add \`task: { ttl: <ms> }\` to \`tools/call\` -> Poll \`tasks/get\` -> Call \`tasks/result\`.
|
|
23
22
|
</workflows>
|
|
24
23
|
|
|
25
24
|
<constraints>
|
package/dist/schemas.d.ts
CHANGED
|
@@ -30,9 +30,7 @@ export declare const cachedPayloadSchema: z.ZodObject<{
|
|
|
30
30
|
export type CachedPayload = z.infer<typeof cachedPayloadSchema>;
|
|
31
31
|
export declare const fetchUrlInputSchema: z.ZodObject<{
|
|
32
32
|
url: z.ZodURL;
|
|
33
|
-
skipNoiseRemoval: z.ZodOptional<z.ZodBoolean>;
|
|
34
33
|
forceRefresh: z.ZodOptional<z.ZodBoolean>;
|
|
35
|
-
maxInlineChars: z.ZodOptional<z.ZodNumber>;
|
|
36
34
|
}, z.core.$strict>;
|
|
37
35
|
export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
38
36
|
url: z.ZodURL;
|
package/dist/schemas.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"schemas.d.ts","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAC;AAI9D,eAAO,MAAM,eAAe;;;;;;;;CAQlB,CAAC;AAiCX,eAAO,MAAM,uBAAuB;;;;;;;;kBAQlC,CAAC;AAgBH,wBAAgB,0BAA0B,CACxC,KAAK,EAAE,OAAO,GACb,iBAAiB,GAAG,SAAS,CAQ/B;AAED,wBAAgB,kBAAkB,CAAC,KAAK,EAAE,OAAO,GAAG,MAAM,GAAG,SAAS,CAErE;AAUD,eAAO,MAAM,mBAAmB;;;;;;iBA2B7B,CAAC;AAEJ,MAAM,MAAM,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAEhE,eAAO,MAAM,mBAAmB
|
|
1
|
+
{"version":3,"file":"schemas.d.ts","sourceRoot":"","sources":["../src/schemas.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAC;AAI9D,eAAO,MAAM,eAAe;;;;;;;;CAQlB,CAAC;AAiCX,eAAO,MAAM,uBAAuB;;;;;;;;kBAQlC,CAAC;AAgBH,wBAAgB,0BAA0B,CACxC,KAAK,EAAE,OAAO,GACb,iBAAiB,GAAG,SAAS,CAQ/B;AAED,wBAAgB,kBAAkB,CAAC,KAAK,EAAE,OAAO,GAAG,MAAM,GAAG,SAAS,CAErE;AAUD,eAAO,MAAM,mBAAmB;;;;;;iBA2B7B,CAAC;AAEJ,MAAM,MAAM,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAEhE,eAAO,MAAM,mBAAmB;;;kBAU9B,CAAC;AAEH,eAAO,MAAM,oBAAoB;;;;;;;;;;;;;;;;;;;;kBAqC/B,CAAC;AAEH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,MAAM,GAAG,aAAa,GAAG,IAAI,CAkBpE;AAED,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,aAAa,GACrB,MAAM,GAAG,IAAI,CAEf"}
|
package/dist/schemas.js
CHANGED
|
@@ -95,21 +95,10 @@ export const fetchUrlInputSchema = z.strictObject({
|
|
|
95
95
|
.min(1)
|
|
96
96
|
.max(config.constants.maxUrlLength)
|
|
97
97
|
.describe(`Target URL. Max ${config.constants.maxUrlLength} chars.`),
|
|
98
|
-
skipNoiseRemoval: z
|
|
99
|
-
.boolean()
|
|
100
|
-
.optional()
|
|
101
|
-
.describe('Preserve navigation/footers (disable noise filtering).'),
|
|
102
98
|
forceRefresh: z
|
|
103
99
|
.boolean()
|
|
104
100
|
.optional()
|
|
105
101
|
.describe('Bypass cache and fetch fresh content.'),
|
|
106
|
-
maxInlineChars: z
|
|
107
|
-
.number()
|
|
108
|
-
.int()
|
|
109
|
-
.min(0)
|
|
110
|
-
.max(config.constants.maxHtmlSize)
|
|
111
|
-
.optional()
|
|
112
|
-
.describe(`Inline markdown limit (0-${config.constants.maxHtmlSize}, 0=unlimited). Lower of this or global limit applies.`),
|
|
113
102
|
});
|
|
114
103
|
export const fetchUrlOutputSchema = z.strictObject({
|
|
115
104
|
url: z.httpUrl().max(config.constants.maxUrlLength).describe('Fetched URL.'),
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fetch-url.d.ts","sourceRoot":"","sources":["../../src/tools/fetch-url.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACV,SAAS,EAEV,MAAM,yCAAyC,CAAC;AACjD,OAAO,KAAK,EACV,YAAY,EAEb,MAAM,oCAAoC,CAAC;AAE5C,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAyBxB,OAAO,EAGL,KAAK,gBAAgB,EACtB,MAAM,qBAAqB,CAAC;AAI7B,OAAO,EACL,mBAAmB,EAIpB,MAAM,eAAe,CAAC;AAMvB,KAAK,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAEzD,KAAK,qBAAqB,GAAG,YAAY,CAAC;AAE1C,UAAU,gBAAgB;IACxB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;IACvB,OAAO,EAAE,qBAAqB,EAAE,CAAC;IACjC,iBAAiB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,SAAS,CAAC;IACxD,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAED,eAAO,MAAM,mBAAmB,cAAc,CAAC;
|
|
1
|
+
{"version":3,"file":"fetch-url.d.ts","sourceRoot":"","sources":["../../src/tools/fetch-url.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACV,SAAS,EAEV,MAAM,yCAAyC,CAAC;AACjD,OAAO,KAAK,EACV,YAAY,EAEb,MAAM,oCAAoC,CAAC;AAE5C,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAyBxB,OAAO,EAGL,KAAK,gBAAgB,EACtB,MAAM,qBAAqB,CAAC;AAI7B,OAAO,EACL,mBAAmB,EAIpB,MAAM,eAAe,CAAC;AAMvB,KAAK,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAEzD,KAAK,qBAAqB,GAAG,YAAY,CAAC;AAE1C,UAAU,gBAAgB;IACxB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;IACvB,OAAO,EAAE,qBAAqB,EAAE,CAAC;IACjC,iBAAiB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,SAAS,CAAC;IACxD,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAED,eAAO,MAAM,mBAAmB,cAAc,CAAC;AAwT/C,wBAAsB,mBAAmB,CACvC,KAAK,EAAE,aAAa,EACpB,KAAK,CAAC,EAAE,gBAAgB,GACvB,OAAO,CAAC,gBAAgB,CAAC,CAK3B;AAgDD;;;;;;GAMG;AACH,wBAAgB,2BAA2B,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,GAAG,OAAO,EAC5E,OAAO,EAAE,CAAC,MAAM,EAAE,OAAO,EAAE,KAAK,CAAC,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,GAC7D,CAAC,MAAM,EAAE,OAAO,EAAE,KAAK,CAAC,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,CAmBvD;AAwBD,wBAAgB,aAAa,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI,CAwCrD"}
|
package/dist/tools/fetch-url.js
CHANGED
|
@@ -179,26 +179,24 @@ function mapFetchStageToProgress(stage, context) {
|
|
|
179
179
|
return { step: 7, message: 'Finalizing output' };
|
|
180
180
|
}
|
|
181
181
|
}
|
|
182
|
-
function buildFetchOptions(url, context, signal, progress,
|
|
182
|
+
function buildFetchOptions(url, context, signal, progress, forceRefresh) {
|
|
183
183
|
return {
|
|
184
184
|
url,
|
|
185
185
|
...withSignal(signal),
|
|
186
|
-
...(skipNoiseRemoval ? { cacheVary: { skipNoiseRemoval: true } } : {}),
|
|
187
186
|
...(forceRefresh ? { forceRefresh: true } : {}),
|
|
188
|
-
...(maxInlineChars !== undefined ? { maxInlineChars } : {}),
|
|
189
187
|
onStage: (stage) => {
|
|
190
188
|
const update = mapFetchStageToProgress(stage, context);
|
|
191
189
|
reportProgress(progress, update.step, update.message);
|
|
192
190
|
},
|
|
193
191
|
transform: async ({ buffer, encoding, truncated }, normalizedUrl) => {
|
|
194
|
-
return markdownTransform({ buffer, encoding, ...(truncated ? { truncated } : {}) }, normalizedUrl, signal
|
|
192
|
+
return markdownTransform({ buffer, encoding, ...(truncated ? { truncated } : {}) }, normalizedUrl, signal);
|
|
195
193
|
},
|
|
196
194
|
serialize: serializeMarkdownResult,
|
|
197
195
|
deserialize: parseCachedMarkdownResult,
|
|
198
196
|
};
|
|
199
197
|
}
|
|
200
|
-
async function fetchPipeline(url, context, signal, progress,
|
|
201
|
-
return performSharedFetch(buildFetchOptions(url, context, signal, progress,
|
|
198
|
+
async function fetchPipeline(url, context, signal, progress, forceRefresh) {
|
|
199
|
+
return performSharedFetch(buildFetchOptions(url, context, signal, progress, forceRefresh));
|
|
202
200
|
}
|
|
203
201
|
function formatContentSize(chars) {
|
|
204
202
|
if (chars < 1000)
|
|
@@ -220,7 +218,7 @@ async function executeFetch(input, extra) {
|
|
|
220
218
|
logDebug('Fetching URL', { url });
|
|
221
219
|
try {
|
|
222
220
|
reportProgress(progress, 1, 'Preparing request');
|
|
223
|
-
const { pipeline, inlineResult } = await fetchPipeline(url, context, signal, progress, input.
|
|
221
|
+
const { pipeline, inlineResult } = await fetchPipeline(url, context, signal, progress, input.forceRefresh);
|
|
224
222
|
const size = formatContentSize(inlineResult.contentSize);
|
|
225
223
|
reportProgress(progress, 8, `Done — ${size}`);
|
|
226
224
|
return buildResponse(pipeline, inlineResult, url);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"shared.d.ts","sourceRoot":"","sources":["../../src/transform/shared.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACV,uBAAuB,EACvB,gBAAgB,EAChB,8BAA8B,EAE/B,MAAM,YAAY,CAAC;AAEpB,UAAU,2BAA2B;IACnC,WAAW,EAAE,CAAC,OAAO,EAAE,8BAA8B,KAAK,IAAI,CAAC;IAC/D,YAAY,EAAE,CACZ,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,KACtB,uBAAuB,CAAC;CAC9B;
|
|
1
|
+
{"version":3,"file":"shared.d.ts","sourceRoot":"","sources":["../../src/transform/shared.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACV,uBAAuB,EACvB,gBAAgB,EAChB,8BAA8B,EAE/B,MAAM,YAAY,CAAC;AAEpB,UAAU,2BAA2B;IACnC,WAAW,EAAE,CAAC,OAAO,EAAE,8BAA8B,KAAK,IAAI,CAAC;IAC/D,YAAY,EAAE,CACZ,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,KACtB,uBAAuB,CAAC;CAC9B;AAgGD,wBAAgB,6BAA6B,CAC3C,OAAO,EAAE,2BAA2B,GACnC,CAAC,GAAG,EAAE,OAAO,KAAK,IAAI,CA8ExB"}
|
package/dist/transform/shared.js
CHANGED
|
@@ -3,14 +3,13 @@ function isTransformMessage(message) {
|
|
|
3
3
|
if (!message || typeof message !== 'object')
|
|
4
4
|
return false;
|
|
5
5
|
const value = message;
|
|
6
|
-
const { id, url, html, htmlBuffer, encoding, includeMetadata,
|
|
6
|
+
const { id, url, html, htmlBuffer, encoding, includeMetadata, inputTruncated, } = value;
|
|
7
7
|
return (typeof id === 'string' &&
|
|
8
8
|
typeof url === 'string' &&
|
|
9
9
|
typeof includeMetadata === 'boolean' &&
|
|
10
10
|
(html === undefined || typeof html === 'string') &&
|
|
11
11
|
(htmlBuffer === undefined || htmlBuffer instanceof Uint8Array) &&
|
|
12
12
|
(encoding === undefined || typeof encoding === 'string') &&
|
|
13
|
-
(skipNoiseRemoval === undefined || typeof skipNoiseRemoval === 'boolean') &&
|
|
14
13
|
(inputTruncated === undefined || typeof inputTruncated === 'boolean'));
|
|
15
14
|
}
|
|
16
15
|
function decodeHtml(html, htmlBuffer, encoding, decoder) {
|
|
@@ -83,7 +82,7 @@ export function createTransformMessageHandler(options) {
|
|
|
83
82
|
}
|
|
84
83
|
if (messageType !== 'transform' || !isTransformMessage(message))
|
|
85
84
|
return;
|
|
86
|
-
const { id, url, html, htmlBuffer, encoding, includeMetadata,
|
|
85
|
+
const { id, url, html, htmlBuffer, encoding, includeMetadata, inputTruncated, } = message;
|
|
87
86
|
if (!id.trim()) {
|
|
88
87
|
sendMessage({
|
|
89
88
|
type: 'error',
|
|
@@ -115,7 +114,6 @@ export function createTransformMessageHandler(options) {
|
|
|
115
114
|
const result = runTransform(content, url, {
|
|
116
115
|
includeMetadata,
|
|
117
116
|
signal: controller.signal,
|
|
118
|
-
...(skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
119
117
|
...(inputTruncated ? { inputTruncated: true } : {}),
|
|
120
118
|
});
|
|
121
119
|
sendMessage(createResultMessage(id, result));
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAsCA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA4ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAkUD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AAqPD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAuKD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AA6DD,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAqB3B;
|
|
1
|
+
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAsCA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA4ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAkUD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AAqPD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAuKD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AA6DD,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAqB3B;AAqPD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CA6CzB;AAED,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAkI1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
|
|
@@ -868,7 +868,7 @@ function shouldUseArticleContent(article, originalHtmlOrDocument) {
|
|
|
868
868
|
return !hasTruncatedSentences(article.textContent);
|
|
869
869
|
}
|
|
870
870
|
function buildContentSource(params) {
|
|
871
|
-
const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, truncated,
|
|
871
|
+
const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, truncated, signal, } = params;
|
|
872
872
|
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
873
873
|
const base = {
|
|
874
874
|
favicon: extractedMeta.favicon,
|
|
@@ -877,26 +877,16 @@ function buildContentSource(params) {
|
|
|
877
877
|
truncated,
|
|
878
878
|
};
|
|
879
879
|
if (useArticleContent && article) {
|
|
880
|
-
const
|
|
881
|
-
|
|
882
|
-
: removeNoiseFromHtml(article.content, undefined, url, signal);
|
|
880
|
+
const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
|
|
881
|
+
prepareDocumentForMarkdown(articleDoc, url, signal);
|
|
883
882
|
return {
|
|
884
883
|
...base,
|
|
885
|
-
sourceHtml:
|
|
884
|
+
sourceHtml: articleDoc.body.innerHTML,
|
|
886
885
|
title: article.title,
|
|
887
886
|
skipNoiseRemoval: true,
|
|
888
887
|
};
|
|
889
888
|
}
|
|
890
889
|
if (document) {
|
|
891
|
-
if (skipNoiseRemoval) {
|
|
892
|
-
return {
|
|
893
|
-
...base,
|
|
894
|
-
sourceHtml: html,
|
|
895
|
-
title: extractedMeta.title,
|
|
896
|
-
skipNoiseRemoval: true,
|
|
897
|
-
document,
|
|
898
|
-
};
|
|
899
|
-
}
|
|
900
890
|
prepareDocumentForMarkdown(document, url, signal);
|
|
901
891
|
const contentRoot = findContentRoot(document);
|
|
902
892
|
return {
|
|
@@ -919,7 +909,7 @@ function resolveContentSource(params) {
|
|
|
919
909
|
...(params.signal ? { signal: params.signal } : {}),
|
|
920
910
|
...(params.inputTruncated ? { inputTruncated: true } : {}),
|
|
921
911
|
});
|
|
922
|
-
const useArticleContent =
|
|
912
|
+
const useArticleContent = article
|
|
923
913
|
? shouldUseArticleContent(article, document)
|
|
924
914
|
: false;
|
|
925
915
|
return buildContentSource({
|
|
@@ -931,7 +921,6 @@ function resolveContentSource(params) {
|
|
|
931
921
|
useArticleContent,
|
|
932
922
|
document,
|
|
933
923
|
truncated: truncated ?? false,
|
|
934
|
-
...(params.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
935
924
|
...(params.signal ? { signal: params.signal } : {}),
|
|
936
925
|
});
|
|
937
926
|
}
|
|
@@ -1004,7 +993,6 @@ export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
|
1004
993
|
url,
|
|
1005
994
|
includeMetadata: options.includeMetadata,
|
|
1006
995
|
...(signal ? { signal } : {}),
|
|
1007
|
-
...(options.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
1008
996
|
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1009
997
|
}));
|
|
1010
998
|
const result = buildMarkdownFromContext(context, url, signal);
|
|
@@ -1035,7 +1023,6 @@ function buildWorkerTransformOptions(options) {
|
|
|
1035
1023
|
return {
|
|
1036
1024
|
includeMetadata: options.includeMetadata,
|
|
1037
1025
|
...(options.signal ? { signal: options.signal } : {}),
|
|
1038
|
-
...(options.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
1039
1026
|
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1040
1027
|
};
|
|
1041
1028
|
}
|
|
@@ -61,7 +61,6 @@ export interface MarkdownTransformResult extends MarkdownPayload {
|
|
|
61
61
|
export interface TransformOptions {
|
|
62
62
|
includeMetadata: boolean;
|
|
63
63
|
signal?: AbortSignal;
|
|
64
|
-
skipNoiseRemoval?: boolean;
|
|
65
64
|
inputTruncated?: boolean;
|
|
66
65
|
}
|
|
67
66
|
/**
|
|
@@ -98,7 +97,6 @@ export interface TransformWorkerTransformMessage {
|
|
|
98
97
|
encoding?: string | undefined;
|
|
99
98
|
url: string;
|
|
100
99
|
includeMetadata: boolean;
|
|
101
|
-
skipNoiseRemoval?: boolean | undefined;
|
|
102
100
|
inputTruncated?: boolean | undefined;
|
|
103
101
|
}
|
|
104
102
|
export interface TransformWorkerCancelledMessage {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/transform/types.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,UAAU,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,GAAG,EAAE,MAAM,CAAC;IACZ,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,gBAAgB,GAAG,IAAI,CAAC;IACjC,QAAQ,EAAE,iBAAiB,CAAC;CAC7B;AAED,UAAU,eAAe;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC3B,SAAS,EAAE,OAAO,CAAC;IACnB,QAAQ,CAAC,EAAE,iBAAiB,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,uBAAwB,SAAQ,eAAe;IAC9D,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,eAAe,EAAE,OAAO,CAAC;IACzB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/transform/types.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,UAAU,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,GAAG,EAAE,MAAM,CAAC;IACZ,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,gBAAgB,GAAG,IAAI,CAAC;IACjC,QAAQ,EAAE,iBAAiB,CAAC;CAC7B;AAED,UAAU,eAAe;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC3B,SAAS,EAAE,OAAO,CAAC;IACnB,QAAQ,CAAC,EAAE,iBAAiB,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,uBAAwB,SAAQ,eAAe;IAC9D,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,eAAe,EAAE,OAAO,CAAC;IACzB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,cAAc,CAAC,EAAE,OAAO,CAAC;CAC1B;AAED;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,CAAC,EAAE,CAAC,CAAC;IACL,IAAI,EAAE,OAAO,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,aAAa,CAAC,EAAE,MAAM,CAAC;CACjC;AAED;;GAEG;AACH,MAAM,WAAW,+BAA+B;IAC9C,IAAI,EAAE,WAAW,CAAC;IAClB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC1B,UAAU,CAAC,EAAE,UAAU,GAAG,SAAS,CAAC;IACpC,QAAQ,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC9B,GAAG,EAAE,MAAM,CAAC;IACZ,eAAe,EAAE,OAAO,CAAC;IACzB,cAAc,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;CACtC;AAED,MAAM,WAAW,+BAA+B;IAC9C,IAAI,EAAE,WAAW,CAAC;IAClB,EAAE,EAAE,MAAM,CAAC;CACZ;AAED,MAAM,WAAW,4BAA4B;IAC3C,IAAI,EAAE,QAAQ,CAAC;IACf,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,eAAe,CAAC;CACzB;AAED,MAAM,WAAW,2BAA2B;IAC1C,IAAI,EAAE,OAAO,CAAC;IACd,EAAE,EAAE,MAAM,CAAC;IACX,KAAK,EAAE;QACL,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,EAAE,MAAM,CAAC;QAChB,GAAG,EAAE,MAAM,CAAC;QACZ,UAAU,CAAC,EAAE,MAAM,CAAC;QACpB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KACnC,CAAC;CACH;AAED,MAAM,MAAM,8BAA8B,GACtC,4BAA4B,GAC5B,2BAA2B,GAC3B,+BAA+B,CAAC"}
|
|
@@ -3,7 +3,6 @@ interface TransformWorkerPool {
|
|
|
3
3
|
transform(html: string, url: string, options: {
|
|
4
4
|
includeMetadata: boolean;
|
|
5
5
|
signal?: AbortSignal;
|
|
6
|
-
skipNoiseRemoval?: boolean;
|
|
7
6
|
inputTruncated?: boolean;
|
|
8
7
|
}): Promise<MarkdownTransformResult>;
|
|
9
8
|
close(): Promise<void>;
|
|
@@ -29,13 +28,11 @@ declare class WorkerPool implements TransformWorkerPool {
|
|
|
29
28
|
transform(html: string, url: string, options: {
|
|
30
29
|
includeMetadata: boolean;
|
|
31
30
|
signal?: AbortSignal;
|
|
32
|
-
skipNoiseRemoval?: boolean;
|
|
33
31
|
inputTruncated?: boolean;
|
|
34
32
|
}): Promise<MarkdownTransformResult>;
|
|
35
33
|
transform(htmlBuffer: Uint8Array, url: string, options: {
|
|
36
34
|
includeMetadata: boolean;
|
|
37
35
|
signal?: AbortSignal;
|
|
38
|
-
skipNoiseRemoval?: boolean;
|
|
39
36
|
inputTruncated?: boolean;
|
|
40
37
|
encoding?: string;
|
|
41
38
|
}): Promise<MarkdownTransformResult>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"worker-pool.d.ts","sourceRoot":"","sources":["../../src/transform/worker-pool.ts"],"names":[],"mappings":"AAuBA,OAAO,KAAK,EACV,uBAAuB,EAGxB,MAAM,YAAY,CAAC;
|
|
1
|
+
{"version":3,"file":"worker-pool.d.ts","sourceRoot":"","sources":["../../src/transform/worker-pool.ts"],"names":[],"mappings":"AAuBA,OAAO,KAAK,EACV,uBAAuB,EAGxB,MAAM,YAAY,CAAC;AAqJpB,UAAU,mBAAmB;IAC3B,SAAS,CACP,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,eAAe,EAAE,OAAO,CAAC;QACzB,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,GACA,OAAO,CAAC,uBAAuB,CAAC,CAAC;IACpC,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IACvB,aAAa,IAAI,MAAM,CAAC;IACxB,gBAAgB,IAAI,MAAM,CAAC;IAC3B,WAAW,IAAI,MAAM,CAAC;CACvB;AAkBD,cAAM,UAAW,YAAW,mBAAmB;IAC7C,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAAkC;IAExE,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAkC;IAC1D,OAAO,CAAC,QAAQ,CAAS;IACzB,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAoB;IAChD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAoB;IAEhD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAqB;IAC3C,OAAO,CAAC,SAAS,CAAK;IACtB,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAmC;IAC5D,OAAO,CAAC,QAAQ,CAAC,UAAU,CAOvB;IAEJ,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAClC,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,SAAS,CAAK;gBAEV,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM;IASrC,SAAS,CACb,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,eAAe,EAAE,OAAO,CAAC;QACzB,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,GACA,OAAO,CAAC,uBAAuB,CAAC;IAC7B,SAAS,CACb,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,eAAe,EAAE,OAAO,CAAC;QACzB,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;QACzB,QAAQ,CAAC,EAAE,MAAM,CAAC;KACnB,GACA,OAAO,CAAC,uBAAuB,CAAC;IAmCnC,aAAa,IAAI,MAAM;IAKvB,gBAAgB,IAAI,MAAM;IAI1B,WAAW,IAAI,MAAM;IAIrB,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI;IAWpB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAiC5B,OAAO,CAAC,UAAU;IAIlB,OAAO,CAAC,iBAAiB;IAkDzB,OAAO,CAAC,aAAa;IAsCrB,OAAO,CAAC,gBAAgB;IAOxB,OAAO,CAAC,gBAAgB;YAyBV,aAAa;IA2B3B,OAAO,CAAC,kBAAkB;IAY1B,OAAO,CAAC,WAAW;IAmCnB,OAAO,CAAC,cAAc;IAuBtB,OAAO,CAAC,aAAa;IAYrB,OAAO,CAAC,eAAe;IAsDvB,OAAO,CAAC,YAAY;IAWpB,OAAO,CAAC,QAAQ;IAOhB,OAAO,CAAC,QAAQ;IAWhB,OAAO,CAAC,YAAY;IASpB,OAAO,CAAC,UAAU;IA2BlB,OAAO,CAAC,kBAAkB;IAe1B,OAAO,CAAC,iBAAiB;IAiFzB,OAAO,CAAC,YAAY;IAQpB,OAAO,CAAC,eAAe;IAQvB,OAAO,CAAC,iBAAiB;CAW1B;AAMD,wBAAgB,qBAAqB,IAAI,UAAU,CAIlD;AAED,wBAAgB,kBAAkB,IAAI;IACpC,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB,GAAG,IAAI,CAOP;AAED,wBAAsB,kBAAkB,IAAI,OAAO,CAAC,IAAI,CAAC,CAIxD"}
|
|
@@ -83,7 +83,6 @@ function buildWorkerDispatchPayload(task) {
|
|
|
83
83
|
id: task.id,
|
|
84
84
|
url: task.url,
|
|
85
85
|
includeMetadata: task.includeMetadata,
|
|
86
|
-
...(task.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
87
86
|
...(task.inputTruncated ? { inputTruncated: true } : {}),
|
|
88
87
|
};
|
|
89
88
|
if (!task.htmlBuffer) {
|
|
@@ -214,7 +213,6 @@ class WorkerPool {
|
|
|
214
213
|
id,
|
|
215
214
|
url,
|
|
216
215
|
includeMetadata: options.includeMetadata,
|
|
217
|
-
...(options.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
218
216
|
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
219
217
|
signal: options.signal,
|
|
220
218
|
abortListener,
|
package/package.json
CHANGED