@j0hanz/fetch-url-mcp 1.12.4 → 1.12.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/http/auth.d.ts.map +1 -1
- package/dist/http/auth.js +44 -29
- package/dist/http/helpers.d.ts.map +1 -1
- package/dist/http/helpers.js +22 -12
- package/dist/http/native.d.ts.map +1 -1
- package/dist/http/native.js +30 -29
- package/dist/http/rate-limit.d.ts.map +1 -1
- package/dist/http/rate-limit.js +5 -3
- package/dist/index.js +3 -2
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js +11 -7
- package/dist/lib/core.d.ts.map +1 -1
- package/dist/lib/core.js +12 -9
- package/dist/lib/error-codes.d.ts +11 -0
- package/dist/lib/error-codes.d.ts.map +1 -0
- package/dist/lib/error-codes.js +15 -0
- package/dist/lib/error-messages.d.ts +13 -0
- package/dist/lib/error-messages.d.ts.map +1 -0
- package/dist/lib/error-messages.js +51 -0
- package/dist/lib/fetch-pipeline.d.ts.map +1 -1
- package/dist/lib/fetch-pipeline.js +5 -4
- package/dist/lib/http.d.ts.map +1 -1
- package/dist/lib/http.js +74 -41
- package/dist/lib/logger-names.d.ts +14 -0
- package/dist/lib/logger-names.d.ts.map +1 -0
- package/dist/lib/logger-names.js +13 -0
- package/dist/lib/mcp-interop.d.ts +1 -11
- package/dist/lib/mcp-interop.d.ts.map +1 -1
- package/dist/lib/mcp-interop.js +10 -73
- package/dist/lib/session.d.ts.map +1 -1
- package/dist/lib/session.js +2 -1
- package/dist/lib/tool-errors.d.ts +39 -0
- package/dist/lib/tool-errors.d.ts.map +1 -0
- package/dist/lib/tool-errors.js +252 -0
- package/dist/lib/url.d.ts.map +1 -1
- package/dist/lib/url.js +18 -15
- package/dist/lib/utils.d.ts +4 -1
- package/dist/lib/utils.d.ts.map +1 -1
- package/dist/lib/utils.js +18 -9
- package/dist/schemas.d.ts.map +1 -1
- package/dist/schemas.js +3 -3
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +7 -6
- package/dist/tasks/call-contract.d.ts.map +1 -1
- package/dist/tasks/call-contract.js +8 -10
- package/dist/tasks/execution.d.ts.map +1 -1
- package/dist/tasks/execution.js +17 -14
- package/dist/tasks/handlers.d.ts.map +1 -1
- package/dist/tasks/handlers.js +9 -8
- package/dist/tasks/manager.d.ts.map +1 -1
- package/dist/tasks/manager.js +14 -13
- package/dist/tasks/owner.d.ts +0 -1
- package/dist/tasks/owner.d.ts.map +1 -1
- package/dist/tasks/owner.js +0 -25
- package/dist/tools/fetch-url.d.ts.map +1 -1
- package/dist/tools/fetch-url.js +14 -26
- package/dist/transform/dom-prep.d.ts.map +1 -1
- package/dist/transform/dom-prep.js +10 -8
- package/dist/transform/shared.d.ts.map +1 -1
- package/dist/transform/shared.js +2 -1
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +29 -21
- package/dist/transform/worker-pool.d.ts.map +1 -1
- package/dist/transform/worker-pool.js +16 -12
- package/package.json +1 -1
package/dist/tools/fetch-url.js
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
import { ErrorCode
|
|
2
|
-
import { config,
|
|
1
|
+
import { ErrorCode } from '@modelcontextprotocol/sdk/types.js';
|
|
2
|
+
import { config, logInfo, logWarn } from '../lib/core.js';
|
|
3
3
|
import { finalizeInlineMarkdown, markdownTransform, performSharedFetch, withSignal, } from '../lib/fetch-pipeline.js';
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
4
|
+
import { LOG_FETCH_URL } from '../lib/logger-names.js';
|
|
5
|
+
import { createMcpError, createProgressReporter, registerToolPresentation, } from '../lib/mcp-interop.js';
|
|
6
|
+
import { classifyAndLogToolError } from '../lib/tool-errors.js';
|
|
7
|
+
import { composeAbortSignal, isAbortError, isObject, parseUrlOrNull, } from '../lib/utils.js';
|
|
6
8
|
import { formatZodError } from '../lib/zod.js';
|
|
7
9
|
import { fetchUrlInputSchema, fetchUrlOutputSchema, normalizeExtractedMetadata, normalizePageTitle, } from '../schemas.js';
|
|
8
10
|
import { withRequestContextIfMissing } from '../tasks/owner.js';
|
|
@@ -70,8 +72,10 @@ function validateStructuredContent(structuredContent, inputUrl) {
|
|
|
70
72
|
logWarn('Tool output schema validation failed', {
|
|
71
73
|
url: inputUrl,
|
|
72
74
|
issues,
|
|
73
|
-
},
|
|
74
|
-
throw createMcpError(ErrorCode.InternalError, '
|
|
75
|
+
}, LOG_FETCH_URL);
|
|
76
|
+
throw createMcpError(ErrorCode.InternalError, 'Output validation failed', {
|
|
77
|
+
issues,
|
|
78
|
+
});
|
|
75
79
|
}
|
|
76
80
|
export function buildFetchUrlContentBlocks(structuredContent) {
|
|
77
81
|
const markdown = typeof structuredContent['markdown'] === 'string'
|
|
@@ -182,7 +186,7 @@ function buildToolAbortSignal(extraSignal) {
|
|
|
182
186
|
const timeout = config.tools.timeoutMs > 0 ? config.tools.timeoutMs : HARD_TOOL_TIMEOUT_MS;
|
|
183
187
|
const signal = composeAbortSignal(extraSignal, timeout);
|
|
184
188
|
if (!signal) {
|
|
185
|
-
throw createMcpError(ErrorCode.InternalError, '
|
|
189
|
+
throw createMcpError(ErrorCode.InternalError, 'Failed to create timeout signal');
|
|
186
190
|
}
|
|
187
191
|
return signal;
|
|
188
192
|
}
|
|
@@ -212,7 +216,7 @@ async function executeFetch(input, extra) {
|
|
|
212
216
|
...(isObject(relatedTask) && typeof relatedTask['taskId'] === 'string'
|
|
213
217
|
? { taskId: relatedTask['taskId'] }
|
|
214
218
|
: {}),
|
|
215
|
-
},
|
|
219
|
+
}, LOG_FETCH_URL);
|
|
216
220
|
progressPlan.reportStart();
|
|
217
221
|
const { pipeline, inlineResult } = await performSharedFetch(buildFetchOptions(url, signal, progressPlan));
|
|
218
222
|
const truncated = inlineResult.truncated ?? pipeline.data.truncated;
|
|
@@ -223,7 +227,7 @@ async function executeFetch(input, extra) {
|
|
|
223
227
|
contentSize: inlineResult.contentSize,
|
|
224
228
|
durationMs: Math.round(performance.now() - startedAt),
|
|
225
229
|
...(truncated ? { truncated: true } : {}),
|
|
226
|
-
},
|
|
230
|
+
}, LOG_FETCH_URL);
|
|
227
231
|
const response = buildResponse(pipeline, inlineResult, url);
|
|
228
232
|
progressPlan.reportSuccess(inlineResult.contentSize);
|
|
229
233
|
return response;
|
|
@@ -237,23 +241,7 @@ export async function fetchUrlToolHandler(input, extra) {
|
|
|
237
241
|
const startedAt = performance.now();
|
|
238
242
|
return executeFetch(input, extra).catch((error) => {
|
|
239
243
|
const durationMs = Math.round(performance.now() - startedAt);
|
|
240
|
-
|
|
241
|
-
logError('fetch-url tool failed', { url: input.url, durationMs, error: toError(error) }, 'fetch-url');
|
|
242
|
-
}
|
|
243
|
-
else if (error instanceof FetchError || isAbortError(error)) {
|
|
244
|
-
logWarn('fetch-url request failed', {
|
|
245
|
-
url: input.url,
|
|
246
|
-
error: toError(error).message,
|
|
247
|
-
durationMs,
|
|
248
|
-
}, 'fetch-url');
|
|
249
|
-
}
|
|
250
|
-
else {
|
|
251
|
-
logError('fetch-url request failed unexpectedly', { url: input.url, error: toError(error).message, durationMs }, 'fetch-url');
|
|
252
|
-
}
|
|
253
|
-
if (error instanceof McpError) {
|
|
254
|
-
throw error;
|
|
255
|
-
}
|
|
256
|
-
return handleToolError(error, input.url, 'Failed to fetch URL');
|
|
244
|
+
return classifyAndLogToolError(error, { url: input.url, durationMs }, LOG_FETCH_URL, 'fetch-url', 'Failed to fetch URL');
|
|
257
245
|
});
|
|
258
246
|
}
|
|
259
247
|
/* -------------------------------------------------------------------------------------------------
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/transform/dom-prep.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/transform/dom-prep.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AA+mBnD,eAAO,MAAM,sBAAsB,QAAmB,CAAC;AAyCvD,wBAAgB,qBAAqB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CA6B9D;AAuBD,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAY/D;AAED,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CASR;AA0CD,qEAAqE;AACrE,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAG5D;AA0RD,wBAAgB,wBAAwB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CA2BjE;AAED,wBAAgB,iBAAiB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAE1D;AAED,wBAAgB,qBAAqB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAY9D;AAWD,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAQ3D;AAuDD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAON;AA4BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR;AA0ED,wBAAgB,oBAAoB,CAClC,cAAc,EAAE,MAAM,GAAG,QAAQ,GAChC,MAAM,CAaR;AAiMD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,EACzB,QAAQ,EAAE,QAAQ,GACjB,QAAQ,GAAG,IAAI,CAsCjB"}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { parseHTML } from 'linkedom';
|
|
2
2
|
import { config, logDebug } from '../lib/core.js';
|
|
3
|
+
import { LOG_TRANSFORM } from '../lib/logger-names.js';
|
|
3
4
|
import { CharCode, isWhitespaceChar } from '../lib/utils.js';
|
|
4
5
|
// ── Thresholds ──────────────────────────────────────────────────────
|
|
5
6
|
const NOISE_SCAN_LIMIT = 50_000;
|
|
@@ -156,7 +157,8 @@ function escapeRegexLiteral(value) {
|
|
|
156
157
|
function buildTokenRegex(tokens) {
|
|
157
158
|
if (tokens.size === 0)
|
|
158
159
|
return NO_MATCH_REGEX;
|
|
159
|
-
|
|
160
|
+
const pattern = new RegExp(`(?:^|[^a-z0-9])(?:${[...tokens].map(escapeRegexLiteral).join('|')})(?:$|[^a-z0-9])`, 'i');
|
|
161
|
+
return pattern;
|
|
160
162
|
}
|
|
161
163
|
function getPromoMatchers(currentConfig, enabledCategories) {
|
|
162
164
|
const baseTokens = new Set(PROMO_TOKENS_ALWAYS);
|
|
@@ -457,7 +459,7 @@ function stripNoise(document, signal) {
|
|
|
457
459
|
if (config.noiseRemoval.debug) {
|
|
458
460
|
logDebug('Noise removal audit enabled', {
|
|
459
461
|
categories: [...(context.flags.navFooter ? ['nav-footer'] : [])],
|
|
460
|
-
},
|
|
462
|
+
}, LOG_TRANSFORM);
|
|
461
463
|
}
|
|
462
464
|
// Structural Removal
|
|
463
465
|
removeNodes(document.querySelectorAll(context.noiseSelector));
|
|
@@ -469,7 +471,7 @@ function stripNoise(document, signal) {
|
|
|
469
471
|
const candidates = document.querySelectorAll(context.candidateSelector);
|
|
470
472
|
for (let i = candidates.length - 1; i >= 0; i--) {
|
|
471
473
|
if (i % ABORT_CHECK_INTERVAL === 0 && signal?.aborted) {
|
|
472
|
-
throw
|
|
474
|
+
throw Error('Noise removal aborted');
|
|
473
475
|
}
|
|
474
476
|
const node = candidates[i];
|
|
475
477
|
if (!node)
|
|
@@ -1261,23 +1263,23 @@ function passesEmptySectionRatio(articleDoc) {
|
|
|
1261
1263
|
}
|
|
1262
1264
|
export function evaluateArticleContent(article, document) {
|
|
1263
1265
|
if (!passesContentRatioGate(article.textContent.length, document)) {
|
|
1264
|
-
logDebug('FAILED passesContentRatioGate', undefined,
|
|
1266
|
+
logDebug('FAILED passesContentRatioGate', undefined, LOG_TRANSFORM);
|
|
1265
1267
|
return null;
|
|
1266
1268
|
}
|
|
1267
1269
|
if (!passesRetentionRulesFromHtml(document, article.content)) {
|
|
1268
|
-
logDebug('FAILED passesRetentionRulesFromHtml', undefined,
|
|
1270
|
+
logDebug('FAILED passesRetentionRulesFromHtml', undefined, LOG_TRANSFORM);
|
|
1269
1271
|
return null;
|
|
1270
1272
|
}
|
|
1271
1273
|
if (hasTruncatedSentences(article.textContent)) {
|
|
1272
|
-
logDebug('FAILED hasTruncatedSentences', undefined,
|
|
1274
|
+
logDebug('FAILED hasTruncatedSentences', undefined, LOG_TRANSFORM);
|
|
1273
1275
|
return null;
|
|
1274
1276
|
}
|
|
1275
1277
|
const articleDoc = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`).document;
|
|
1276
1278
|
if (!passesEmptySectionRatio(articleDoc)) {
|
|
1277
1279
|
const headings = articleDoc.querySelectorAll('h1,h2,h3,h4,h5,h6');
|
|
1278
|
-
logDebug(`FAILED passesEmptySectionRatio: ${headings.length} headings`, undefined,
|
|
1280
|
+
logDebug(`FAILED passesEmptySectionRatio: ${headings.length} headings`, undefined, LOG_TRANSFORM);
|
|
1279
1281
|
for (const h of headings) {
|
|
1280
|
-
logDebug(`H: ${h.textContent} ${String(hasSectionContent(h))}`, undefined,
|
|
1282
|
+
logDebug(`H: ${h.textContent} ${String(hasSectionContent(h))}`, undefined, LOG_TRANSFORM);
|
|
1281
1283
|
}
|
|
1282
1284
|
return null;
|
|
1283
1285
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"shared.d.ts","sourceRoot":"","sources":["../../src/transform/shared.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACV,uBAAuB,EACvB,gBAAgB,EAChB,8BAA8B,EAE/B,MAAM,YAAY,CAAC;AAEpB,UAAU,2BAA2B;IACnC,WAAW,EAAE,CAAC,OAAO,EAAE,8BAA8B,KAAK,IAAI,CAAC;IAC/D,YAAY,EAAE,CACZ,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,KACtB,uBAAuB,CAAC;CAC9B;
|
|
1
|
+
{"version":3,"file":"shared.d.ts","sourceRoot":"","sources":["../../src/transform/shared.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACV,uBAAuB,EACvB,gBAAgB,EAChB,8BAA8B,EAE/B,MAAM,YAAY,CAAC;AAEpB,UAAU,2BAA2B;IACnC,WAAW,EAAE,CAAC,OAAO,EAAE,8BAA8B,KAAK,IAAI,CAAC;IAC/D,YAAY,EAAE,CACZ,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,KACtB,uBAAuB,CAAC;CAC9B;AAkLD,wBAAgB,6BAA6B,CAC3C,OAAO,EAAE,2BAA2B,GACnC,CAAC,GAAG,EAAE,OAAO,KAAK,IAAI,CA2BxB"}
|
package/dist/transform/shared.js
CHANGED
|
@@ -19,7 +19,8 @@ function decodeHtml(html, htmlBuffer, encoding, decoder) {
|
|
|
19
19
|
return decoder.decode(htmlBuffer);
|
|
20
20
|
}
|
|
21
21
|
try {
|
|
22
|
-
|
|
22
|
+
const decoded = new TextDecoder(encoding).decode(htmlBuffer);
|
|
23
|
+
return decoded;
|
|
23
24
|
}
|
|
24
25
|
catch {
|
|
25
26
|
return decoder.decode(htmlBuffer);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AA6CA,OAAO,EACL,wBAAwB,EACxB,wBAAwB,EACxB,oBAAoB,EACrB,MAAM,uBAAuB,CAAC;AAkB/B,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAoCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AAqMD,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AA8bD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AAuKD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,GAAG,SAAS,CAAC;IACjC,QAAQ,CAAC,EAAE,QAAQ,GAAG,SAAS,CAAC;IAChC,gBAAgB,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;CACxC,GACA,MAAM,CAgCR;AA+DD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAKD,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,qBAAqB,EAAE,OAAO,GAC7B,aAAa,GAAG,SAAS,CAuB3B;AA6bD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAMzB;AAcD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAkH1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,OAAO,EACL,wBAAwB,EACxB,wBAAwB,EACxB,oBAAoB,GACrB,CAAC"}
|
|
@@ -2,7 +2,9 @@ import diagnosticsChannel from 'node:diagnostics_channel';
|
|
|
2
2
|
import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
3
3
|
import { parseHTML } from 'linkedom';
|
|
4
4
|
import { config, getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
|
|
5
|
+
import { QUEUE_FULL } from '../lib/error-codes.js';
|
|
5
6
|
import { isRawTextContentUrl } from '../lib/http.js';
|
|
7
|
+
import { LOG_TRANSFORM } from '../lib/logger-names.js';
|
|
6
8
|
import { composeAbortSignal, FetchError, getErrorMessage, getUtf8ByteLength, isAsciiOnly, isObject, throwIfAborted, toError, trimDanglingTagFragment, truncateToUtf8Boundary, } from '../lib/utils.js';
|
|
7
9
|
import { evaluateArticleContent, extractNoscriptImages, getVisibleTextLength, normalizeTabContent, prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, stripDocsControls, stripScreenReaderText, surfaceCodeEditorContent, } from './dom-prep.js';
|
|
8
10
|
import { extractLanguageFromClassName } from './html-translators.js';
|
|
@@ -19,13 +21,16 @@ function decodeInput(input, encoding) {
|
|
|
19
21
|
if (!normalizedEncoding ||
|
|
20
22
|
normalizedEncoding === 'utf-8' ||
|
|
21
23
|
normalizedEncoding === 'utf8') {
|
|
22
|
-
|
|
24
|
+
const decoded = new TextDecoder('utf-8').decode(input);
|
|
25
|
+
return decoded;
|
|
23
26
|
}
|
|
24
27
|
try {
|
|
25
|
-
|
|
28
|
+
const decoded = new TextDecoder(normalizedEncoding, { fatal: true }).decode(input);
|
|
29
|
+
return decoded;
|
|
26
30
|
}
|
|
27
31
|
catch {
|
|
28
|
-
|
|
32
|
+
const decoded = new TextDecoder('utf-8').decode(input);
|
|
33
|
+
return decoded;
|
|
29
34
|
}
|
|
30
35
|
}
|
|
31
36
|
function buildTransformSignal(signal) {
|
|
@@ -67,7 +72,7 @@ class StageTracker {
|
|
|
67
72
|
durationMs: Math.round(durationMs),
|
|
68
73
|
thresholdMs: Math.round(warnThresholdMs),
|
|
69
74
|
url: context.url,
|
|
70
|
-
},
|
|
75
|
+
}, LOG_TRANSFORM);
|
|
71
76
|
}
|
|
72
77
|
}
|
|
73
78
|
const event = {
|
|
@@ -87,12 +92,13 @@ class StageTracker {
|
|
|
87
92
|
}
|
|
88
93
|
checkBudget(url, stage, budget) {
|
|
89
94
|
if (budget && budget.elapsedMs >= budget.totalBudgetMs) {
|
|
90
|
-
|
|
95
|
+
const error = new FetchError('Transform budget exhausted', url, 504, {
|
|
91
96
|
reason: 'timeout',
|
|
92
97
|
stage: `${stage}:budget_exhausted`,
|
|
93
98
|
elapsedMs: budget.elapsedMs,
|
|
94
99
|
totalBudgetMs: budget.totalBudgetMs,
|
|
95
100
|
});
|
|
101
|
+
throw error;
|
|
96
102
|
}
|
|
97
103
|
}
|
|
98
104
|
run(url, stage, fn, budget) {
|
|
@@ -134,7 +140,7 @@ class StageTracker {
|
|
|
134
140
|
logDebug('Diagnostic channel publish failed', {
|
|
135
141
|
stage: event.stage,
|
|
136
142
|
error: getErrorMessage(error),
|
|
137
|
-
},
|
|
143
|
+
}, LOG_TRANSFORM);
|
|
138
144
|
}
|
|
139
145
|
}
|
|
140
146
|
runTrackedSync(url, signal, fn) {
|
|
@@ -195,7 +201,7 @@ function truncateHtml(html, inputTruncated = false) {
|
|
|
195
201
|
size: getUtf8ByteLength(html),
|
|
196
202
|
maxSize,
|
|
197
203
|
truncatedSize: getUtf8ByteLength(content),
|
|
198
|
-
},
|
|
204
|
+
}, LOG_TRANSFORM);
|
|
199
205
|
return { html: content, truncated: true };
|
|
200
206
|
}
|
|
201
207
|
const MIN_SPA_CONTENT_LENGTH = 100;
|
|
@@ -334,7 +340,7 @@ function validateReaderability(doc, url, signal) {
|
|
|
334
340
|
if (textLength < MIN_SPA_CONTENT_LENGTH) {
|
|
335
341
|
logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
|
|
336
342
|
'This might be a client-side rendered (SPA) application. ' +
|
|
337
|
-
'Content extraction may be incomplete.', { textLength },
|
|
343
|
+
'Content extraction may be incomplete.', { textLength }, LOG_TRANSFORM);
|
|
338
344
|
}
|
|
339
345
|
throwIfAborted(signal, url, 'extract:article:readabilityCheck');
|
|
340
346
|
if (textLength >= MIN_READERABLE_TEXT_LENGTH && !isProbablyReaderable(doc)) {
|
|
@@ -385,7 +391,7 @@ function mapReadabilityResult(parsed) {
|
|
|
385
391
|
// runs later in buildContentSource), so this clone starts from raw HTML.
|
|
386
392
|
function extractArticle(document, url, signal) {
|
|
387
393
|
if (!isReadabilityCompatible(document)) {
|
|
388
|
-
logWarn('Document not compatible with Readability', undefined,
|
|
394
|
+
logWarn('Document not compatible with Readability', undefined, LOG_TRANSFORM);
|
|
389
395
|
return null;
|
|
390
396
|
}
|
|
391
397
|
try {
|
|
@@ -398,17 +404,17 @@ function extractArticle(document, url, signal) {
|
|
|
398
404
|
return mapReadabilityResult(parsed);
|
|
399
405
|
}
|
|
400
406
|
catch (error) {
|
|
401
|
-
logError('Failed to extract article with Readability', error instanceof Error ? error : undefined,
|
|
407
|
+
logError('Failed to extract article with Readability', error instanceof Error ? error : undefined, LOG_TRANSFORM);
|
|
402
408
|
return null;
|
|
403
409
|
}
|
|
404
410
|
}
|
|
405
411
|
function isValidInput(html, url) {
|
|
406
412
|
if (typeof html !== 'string' || html.length === 0) {
|
|
407
|
-
logWarn('extractContent called with invalid HTML input', undefined,
|
|
413
|
+
logWarn('extractContent called with invalid HTML input', undefined, LOG_TRANSFORM);
|
|
408
414
|
return false;
|
|
409
415
|
}
|
|
410
416
|
if (typeof url !== 'string' || url.length === 0) {
|
|
411
|
-
logWarn('extractContent called with invalid URL', undefined,
|
|
417
|
+
logWarn('extractContent called with invalid URL', undefined, LOG_TRANSFORM);
|
|
412
418
|
return false;
|
|
413
419
|
}
|
|
414
420
|
return true;
|
|
@@ -421,7 +427,7 @@ function applyBaseUri(document, url) {
|
|
|
421
427
|
logInfo('Failed to set baseURI (non-critical)', {
|
|
422
428
|
url: url.substring(0, 100),
|
|
423
429
|
error: getErrorMessage(error),
|
|
424
|
-
},
|
|
430
|
+
}, LOG_TRANSFORM);
|
|
425
431
|
}
|
|
426
432
|
}
|
|
427
433
|
function createEmptyExtractionContext() {
|
|
@@ -477,7 +483,7 @@ function extractContentContext(html, url, options) {
|
|
|
477
483
|
if (error instanceof FetchError)
|
|
478
484
|
throw error;
|
|
479
485
|
throwIfAborted(options.signal, url, 'extract:error');
|
|
480
|
-
logError('Failed to extract content', error instanceof Error ? error : undefined,
|
|
486
|
+
logError('Failed to extract content', error instanceof Error ? error : undefined, LOG_TRANSFORM);
|
|
481
487
|
return createEmptyExtractionContext();
|
|
482
488
|
}
|
|
483
489
|
}
|
|
@@ -613,10 +619,11 @@ export function htmlToMarkdown(html, metadata, options) {
|
|
|
613
619
|
catch (error) {
|
|
614
620
|
if (error instanceof FetchError)
|
|
615
621
|
throw error;
|
|
616
|
-
logError('Failed to convert HTML to markdown', error instanceof Error ? error : undefined,
|
|
617
|
-
|
|
622
|
+
logError('Failed to convert HTML to markdown', error instanceof Error ? error : undefined, LOG_TRANSFORM);
|
|
623
|
+
const fetchError = new FetchError('Failed to convert HTML to markdown', url, 500, {
|
|
618
624
|
reason: 'markdown_convert_failed',
|
|
619
625
|
});
|
|
626
|
+
throw fetchError;
|
|
620
627
|
}
|
|
621
628
|
}
|
|
622
629
|
const HTML_DOCUMENT_START = /^\s*<(?:!doctype|html|head|body)\b/i;
|
|
@@ -644,7 +651,7 @@ function tryTransformRawContent(params) {
|
|
|
644
651
|
return null;
|
|
645
652
|
logDebug('Preserving raw markdown content', {
|
|
646
653
|
url: params.url.substring(0, 80),
|
|
647
|
-
},
|
|
654
|
+
}, LOG_TRANSFORM);
|
|
648
655
|
const { content, title } = buildRawMarkdownPayload({
|
|
649
656
|
rawContent: params.html,
|
|
650
657
|
url: params.url,
|
|
@@ -954,7 +961,8 @@ export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
|
954
961
|
}
|
|
955
962
|
function validateBinaryContent(html, url) {
|
|
956
963
|
if (hasBinaryIndicators(html)) {
|
|
957
|
-
|
|
964
|
+
const error = new FetchError('Content appears to be binary data (high replacement character ratio or null bytes)', url, 415, { reason: 'binary_content_detected', stage: 'transform:validate' });
|
|
965
|
+
throw error;
|
|
958
966
|
}
|
|
959
967
|
}
|
|
960
968
|
export function getTransformPoolStats() {
|
|
@@ -990,12 +998,12 @@ async function transformWithWorkerPool(htmlOrBuffer, url, options) {
|
|
|
990
998
|
}
|
|
991
999
|
function resolveWorkerFallback(error, htmlOrBuffer, url, options) {
|
|
992
1000
|
const poolStats = getWorkerPoolStats();
|
|
993
|
-
const isQueueFull = error instanceof FetchError && error.details['reason'] ===
|
|
1001
|
+
const isQueueFull = error instanceof FetchError && error.details['reason'] === QUEUE_FULL;
|
|
994
1002
|
if (isQueueFull) {
|
|
995
1003
|
logWarn('Transform worker queue full; falling back to in-process', {
|
|
996
1004
|
url: redactUrl(url),
|
|
997
1005
|
...(poolStats ?? {}),
|
|
998
|
-
},
|
|
1006
|
+
}, LOG_TRANSFORM);
|
|
999
1007
|
return transformInputInProcess(htmlOrBuffer, url, options);
|
|
1000
1008
|
}
|
|
1001
1009
|
throwIfAborted(options.signal, url, 'transform:worker-fallback');
|
|
@@ -1008,7 +1016,7 @@ function resolveWorkerFallback(error, htmlOrBuffer, url, options) {
|
|
|
1008
1016
|
url: redactUrl(url),
|
|
1009
1017
|
error: message,
|
|
1010
1018
|
...(poolStats ?? {}),
|
|
1011
|
-
},
|
|
1019
|
+
}, LOG_TRANSFORM);
|
|
1012
1020
|
return transformInputInProcess(htmlOrBuffer, url, options);
|
|
1013
1021
|
}
|
|
1014
1022
|
async function runWorkerTransformWithFallback(htmlOrBuffer, url, options) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"worker-pool.d.ts","sourceRoot":"","sources":["../../src/transform/worker-pool.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"worker-pool.d.ts","sourceRoot":"","sources":["../../src/transform/worker-pool.ts"],"names":[],"mappings":"AA6BA,OAAO,KAAK,EACV,uBAAuB,EAKxB,MAAM,YAAY,CAAC;AAsIpB,UAAU,mBAAmB;IAC3B,SAAS,CACP,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,qBAAqB,EAAE,OAAO,CAAC;QAC/B,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,GACA,OAAO,CAAC,uBAAuB,CAAC,CAAC;IACpC,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IACvB,aAAa,IAAI,MAAM,CAAC;IACxB,gBAAgB,IAAI,MAAM,CAAC;IAC3B,WAAW,IAAI,MAAM,CAAC;CACvB;AA6JD,cAAM,UAAW,YAAW,mBAAmB;IAC7C,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAAkC;IAExE,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAkC;IAC1D,OAAO,CAAC,QAAQ,CAAS;IACzB,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAoB;IAChD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAoB;IAEhD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAgC;IACtD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAmC;IAC5D,OAAO,CAAC,QAAQ,CAAC,UAAU,CAA0B;IAErD,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAClC,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,SAAS,CAAK;IACtB,OAAO,CAAC,SAAS,CAAK;IACtB,OAAO,CAAC,QAAQ,CAAS;IACzB,OAAO,CAAC,QAAQ,CAAC,cAAc,CAA6B;gBAEhD,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM;IASrC,SAAS,CACb,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,qBAAqB,EAAE,OAAO,CAAC;QAC/B,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,GACA,OAAO,CAAC,uBAAuB,CAAC;IAC7B,SAAS,CACb,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,qBAAqB,EAAE,OAAO,CAAC;QAC/B,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;QACzB,QAAQ,CAAC,EAAE,MAAM,CAAC;KACnB,GACA,OAAO,CAAC,uBAAuB,CAAC;IAqDnC,aAAa,IAAI,MAAM;IAIvB,gBAAgB,IAAI,MAAM;IAI1B,WAAW,IAAI,MAAM;IAIrB,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI;IAWpB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAqC5B,OAAO,CAAC,UAAU;IAIlB,OAAO,CAAC,iBAAiB;IAoDzB,OAAO,CAAC,aAAa;YA4BP,aAAa;IA2B3B,OAAO,CAAC,kBAAkB;IAY1B,OAAO,CAAC,WAAW;IA4CnB,OAAO,CAAC,cAAc;IAiCtB,OAAO,CAAC,aAAa;IAkCrB,OAAO,CAAC,eAAe;IAgCvB,OAAO,CAAC,mBAAmB;IA8B3B,OAAO,CAAC,YAAY;IAWpB,OAAO,CAAC,QAAQ;IAUhB,OAAO,CAAC,QAAQ;IAUhB,OAAO,CAAC,QAAQ;IAShB,OAAO,CAAC,YAAY;IAmBpB,OAAO,CAAC,UAAU;IAgClB,OAAO,CAAC,iBAAiB;IAgCzB,OAAO,CAAC,gBAAgB;IAyDxB,OAAO,CAAC,YAAY;IAwBpB,OAAO,CAAC,YAAY;IAIpB,OAAO,CAAC,iBAAiB;CAS1B;AAMD,wBAAgB,qBAAqB,IAAI,UAAU,CAclD;AAED,wBAAgB,kBAAkB,IAAI;IACpC,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB,GAAG,IAAI,CAOP;AAED,wBAAsB,kBAAkB,IAAI,OAAO,CAAC,IAAI,CAAC,CAIxD"}
|
|
@@ -5,6 +5,8 @@ import { isSharedArrayBuffer } from 'node:util/types';
|
|
|
5
5
|
import { isMainThread, isMarkedAsUntransferable, parentPort, Worker, } from 'node:worker_threads';
|
|
6
6
|
import { z } from 'zod';
|
|
7
7
|
import { config, logDebug, logInfo, logWarn } from '../lib/core.js';
|
|
8
|
+
import { QUEUE_FULL } from '../lib/error-codes.js';
|
|
9
|
+
import { LOG_TRANSFORM } from '../lib/logger-names.js';
|
|
8
10
|
import { createAbortError, createUnrefTimeout, FetchError, getErrorMessage, } from '../lib/utils.js';
|
|
9
11
|
import { formatZodError } from '../lib/zod.js';
|
|
10
12
|
import { extractedMetadataSchema } from '../schemas.js';
|
|
@@ -53,7 +55,8 @@ function ensureTightBuffer(buffer) {
|
|
|
53
55
|
buffer.byteLength === buffer.buffer.byteLength) {
|
|
54
56
|
return buffer;
|
|
55
57
|
}
|
|
56
|
-
|
|
58
|
+
const copy = new Uint8Array(buffer);
|
|
59
|
+
return copy;
|
|
57
60
|
}
|
|
58
61
|
function getTransferableBuffer(buffer) {
|
|
59
62
|
const backingBuffer = buffer.buffer;
|
|
@@ -238,11 +241,12 @@ class WorkerPool {
|
|
|
238
241
|
activeWorkers: this.busyCount,
|
|
239
242
|
capacity: this.capacity,
|
|
240
243
|
url,
|
|
241
|
-
},
|
|
242
|
-
|
|
243
|
-
reason:
|
|
244
|
+
}, LOG_TRANSFORM);
|
|
245
|
+
const error = new FetchError('Transform worker queue is full', url, HTTP_SERVICE_UNAVAILABLE, {
|
|
246
|
+
reason: QUEUE_FULL,
|
|
244
247
|
stage: 'transform:enqueue',
|
|
245
248
|
});
|
|
249
|
+
throw error;
|
|
246
250
|
}
|
|
247
251
|
const { promise, resolve, reject } = Promise.withResolvers();
|
|
248
252
|
const task = this.createPendingTask(htmlOrBuffer, url, options, resolve, reject);
|
|
@@ -275,7 +279,7 @@ class WorkerPool {
|
|
|
275
279
|
activeWorkers: this.busyCount,
|
|
276
280
|
queueDepth: this.queue.depth,
|
|
277
281
|
inflight: this.inflight.size,
|
|
278
|
-
},
|
|
282
|
+
}, LOG_TRANSFORM);
|
|
279
283
|
const terminations = this.workers
|
|
280
284
|
.map((slot) => slot?.worker.terminate().catch(() => undefined))
|
|
281
285
|
.filter((p) => p !== undefined);
|
|
@@ -296,7 +300,7 @@ class WorkerPool {
|
|
|
296
300
|
}
|
|
297
301
|
ensureOpen() {
|
|
298
302
|
if (this.closed)
|
|
299
|
-
throw
|
|
303
|
+
throw Error(WorkerPool.CLOSED_MESSAGE);
|
|
300
304
|
}
|
|
301
305
|
createPendingTask(htmlOrBuffer, url, options, resolve, reject) {
|
|
302
306
|
const id = (this.taskIdSeq++).toString(36);
|
|
@@ -390,7 +394,7 @@ class WorkerPool {
|
|
|
390
394
|
logDebug('Spawned transform worker', {
|
|
391
395
|
workerIndex,
|
|
392
396
|
workerName: name,
|
|
393
|
-
},
|
|
397
|
+
}, LOG_TRANSFORM);
|
|
394
398
|
worker.unref();
|
|
395
399
|
worker.on('message', (raw) => {
|
|
396
400
|
this.onWorkerMessage(workerIndex, raw);
|
|
@@ -417,7 +421,7 @@ class WorkerPool {
|
|
|
417
421
|
workerIndex,
|
|
418
422
|
workerName: slot.name,
|
|
419
423
|
threadId: slot.worker.threadId,
|
|
420
|
-
},
|
|
424
|
+
}, LOG_TRANSFORM);
|
|
421
425
|
if (slot.busy && slot.currentTaskId) {
|
|
422
426
|
try {
|
|
423
427
|
this.failTask(slot.currentTaskId, new FetchError(message, '', HTTP_SERVICE_UNAVAILABLE, {
|
|
@@ -445,7 +449,7 @@ class WorkerPool {
|
|
|
445
449
|
workerIndex,
|
|
446
450
|
delayMs,
|
|
447
451
|
attempt: attempts + 1,
|
|
448
|
-
},
|
|
452
|
+
}, LOG_TRANSFORM);
|
|
449
453
|
setTimeout(() => {
|
|
450
454
|
if (this.closed)
|
|
451
455
|
return;
|
|
@@ -547,7 +551,7 @@ class WorkerPool {
|
|
|
547
551
|
fromCapacity: previousCapacity,
|
|
548
552
|
toCapacity: this.capacity,
|
|
549
553
|
queueDepth: this.getQueueDepth(),
|
|
550
|
-
},
|
|
554
|
+
}, LOG_TRANSFORM);
|
|
551
555
|
}
|
|
552
556
|
}
|
|
553
557
|
drainQueue() {
|
|
@@ -619,7 +623,7 @@ class WorkerPool {
|
|
|
619
623
|
url: task.url,
|
|
620
624
|
workerIndex,
|
|
621
625
|
timeoutMs: this.timeoutMs,
|
|
622
|
-
},
|
|
626
|
+
}, LOG_TRANSFORM);
|
|
623
627
|
this.abortAndCleanTask(inflight, new FetchError('Request timeout', task.url, HTTP_GATEWAY_TIMEOUT, {
|
|
624
628
|
reason: 'timeout',
|
|
625
629
|
stage: 'transform:worker-timeout',
|
|
@@ -676,7 +680,7 @@ export function getOrCreateWorkerPool() {
|
|
|
676
680
|
logInfo('Initialized transform worker pool', {
|
|
677
681
|
initialCapacity: workerPool.getCapacity(),
|
|
678
682
|
timeoutMs: DEFAULT_TIMEOUT_MS,
|
|
679
|
-
},
|
|
683
|
+
}, LOG_TRANSFORM);
|
|
680
684
|
}
|
|
681
685
|
return workerPool;
|
|
682
686
|
}
|
package/package.json
CHANGED