@j0hanz/fetch-url-mcp 1.9.0 → 1.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/http/auth.d.ts +0 -1
- package/dist/http/auth.d.ts.map +1 -1
- package/dist/http/auth.js +1 -13
- package/dist/http/native.d.ts.map +1 -1
- package/dist/http/native.js +2 -5
- package/dist/lib/content.d.ts.map +1 -1
- package/dist/lib/content.js +378 -346
- package/dist/lib/core.d.ts +78 -71
- package/dist/lib/core.d.ts.map +1 -1
- package/dist/lib/core.js +308 -372
- package/dist/lib/fetch-pipeline.d.ts +2 -6
- package/dist/lib/fetch-pipeline.d.ts.map +1 -1
- package/dist/lib/fetch-pipeline.js +51 -137
- package/dist/lib/http.d.ts.map +1 -1
- package/dist/lib/http.js +188 -130
- package/dist/lib/mcp-tools.d.ts +3 -5
- package/dist/lib/mcp-tools.d.ts.map +1 -1
- package/dist/lib/mcp-tools.js +22 -58
- package/dist/lib/task-handlers.js +4 -4
- package/dist/lib/utils.d.ts +6 -0
- package/dist/lib/utils.d.ts.map +1 -1
- package/dist/lib/utils.js +23 -0
- package/dist/resources/index.js +1 -1
- package/dist/schemas.d.ts +0 -1
- package/dist/schemas.d.ts.map +1 -1
- package/dist/schemas.js +4 -6
- package/dist/server.js +1 -1
- package/dist/tasks/owner.d.ts +1 -1
- package/dist/tasks/owner.d.ts.map +1 -1
- package/dist/tasks/tool-registry.d.ts +1 -1
- package/dist/tasks/tool-registry.d.ts.map +1 -1
- package/dist/tools/fetch-url.d.ts +2 -3
- package/dist/tools/fetch-url.d.ts.map +1 -1
- package/dist/tools/fetch-url.js +89 -152
- package/dist/transform/html-translators.d.ts.map +1 -1
- package/dist/transform/html-translators.js +1 -23
- package/dist/transform/metadata.d.ts +1 -0
- package/dist/transform/metadata.d.ts.map +1 -1
- package/dist/transform/metadata.js +25 -0
- package/dist/transform/transform.d.ts +8 -0
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +190 -109
- package/dist/transform/worker-pool.d.ts +3 -6
- package/dist/transform/worker-pool.d.ts.map +1 -1
- package/dist/transform/worker-pool.js +148 -118
- package/package.json +3 -2
|
@@ -20,6 +20,14 @@ export declare function htmlToMarkdown(html: string, metadata?: MetadataBlock, o
|
|
|
20
20
|
export declare function isExtractionSufficient(article: ExtractedArticle | null, originalHtmlOrDocument: string | Document): boolean;
|
|
21
21
|
export declare function determineContentExtractionSource(article: ExtractedArticle | null): article is ExtractedArticle;
|
|
22
22
|
export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
|
|
23
|
+
declare function findContentRoot(document: Document): string | undefined;
|
|
24
|
+
declare function findPrimaryHeading(document: Document): string | undefined;
|
|
25
|
+
declare function isGithubRepositoryRootUrl(url: string): boolean;
|
|
26
|
+
export declare const TransformHeuristics: {
|
|
27
|
+
readonly findContentRoot: typeof findContentRoot;
|
|
28
|
+
readonly findPrimaryHeading: typeof findPrimaryHeading;
|
|
29
|
+
readonly isGithubRepositoryRootUrl: typeof isGithubRepositoryRootUrl;
|
|
30
|
+
};
|
|
23
31
|
export declare function transformHtmlToMarkdownInProcess(html: string, url: string, options: TransformOptions): MarkdownTransformResult;
|
|
24
32
|
interface TransformPoolStats {
|
|
25
33
|
queueDepth: number;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAuCA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AAuJD,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAuVD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA+OD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAwJD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAiED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAsCD,iBAAS,eAAe,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAc/D;AAED,iBAAS,kBAAkB,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAYlE;AAED,iBAAS,yBAAyB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAcvD;AAED,eAAO,MAAM,mBAAmB;;;;CAItB,CAAC;AAiQX,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAmBzB;AA+CD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAsH1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
|
|
@@ -11,7 +11,7 @@ import { createAbortError, throwIfAborted } from '../lib/utils.js';
|
|
|
11
11
|
import { FetchError, getErrorMessage, toError } from '../lib/utils.js';
|
|
12
12
|
import { isObject } from '../lib/utils.js';
|
|
13
13
|
import { translateHtmlFragmentToMarkdown } from './html-translators.js';
|
|
14
|
-
import { extractMetadata, extractMetadataFromHead, mergeMetadata, } from './metadata.js';
|
|
14
|
+
import { extractMetadata, extractMetadataFromHead, mergeMetadata, normalizeDocumentTitle, } from './metadata.js';
|
|
15
15
|
import { getOrCreateWorkerPool, getWorkerPoolStats, shutdownWorkerPool, } from './worker-pool.js';
|
|
16
16
|
function decodeInput(input, encoding) {
|
|
17
17
|
if (typeof input === 'string')
|
|
@@ -35,6 +35,16 @@ function asError(value) {
|
|
|
35
35
|
return value instanceof Error ? value : undefined;
|
|
36
36
|
}
|
|
37
37
|
const abortPolicy = { throwIfAborted, createAbortError };
|
|
38
|
+
function isWhitespaceChar(code) {
|
|
39
|
+
return code === 9 || code === 10 || code === 12 || code === 13 || code === 32;
|
|
40
|
+
}
|
|
41
|
+
function containsWhitespace(value) {
|
|
42
|
+
for (let i = 0; i < value.length; i += 1) {
|
|
43
|
+
if (isWhitespaceChar(value.charCodeAt(i)))
|
|
44
|
+
return true;
|
|
45
|
+
}
|
|
46
|
+
return false;
|
|
47
|
+
}
|
|
38
48
|
function buildTransformSignal(signal) {
|
|
39
49
|
const { timeoutMs } = config.transform;
|
|
40
50
|
if (timeoutMs <= 0)
|
|
@@ -232,6 +242,9 @@ function willTruncate(html) {
|
|
|
232
242
|
const maxSize = config.constants.maxHtmlSize;
|
|
233
243
|
return (maxSize > 0 && (html.length > maxSize || getUtf8ByteLength(html) > maxSize));
|
|
234
244
|
}
|
|
245
|
+
const MIN_SPA_CONTENT_LENGTH = 100;
|
|
246
|
+
const MIN_READERABLE_TEXT_LENGTH = 400;
|
|
247
|
+
const MAX_READABILITY_ELEMENTS = 20_000;
|
|
235
248
|
function isReadabilityCompatible(doc) {
|
|
236
249
|
if (!isObject(doc))
|
|
237
250
|
return false;
|
|
@@ -270,38 +283,47 @@ function resolveCollapsedTextLengthUpTo(text, max) {
|
|
|
270
283
|
}
|
|
271
284
|
return length;
|
|
272
285
|
}
|
|
286
|
+
function preserveAlertElements(doc) {
|
|
287
|
+
const alerts = doc.querySelectorAll('[role="alert"], .admonition, .callout');
|
|
288
|
+
for (const el of alerts) {
|
|
289
|
+
const bq = doc.createElement('blockquote');
|
|
290
|
+
bq.innerHTML = el.innerHTML;
|
|
291
|
+
el.replaceWith(bq);
|
|
292
|
+
}
|
|
293
|
+
}
|
|
273
294
|
function extractArticle(document, url, signal) {
|
|
274
295
|
if (!isReadabilityCompatible(document)) {
|
|
275
296
|
logWarn('Document not compatible with Readability');
|
|
276
297
|
return null;
|
|
277
298
|
}
|
|
299
|
+
const checkAbort = (stage) => {
|
|
300
|
+
abortPolicy.throwIfAborted(signal, url, stage);
|
|
301
|
+
};
|
|
278
302
|
try {
|
|
279
303
|
const doc = document;
|
|
280
|
-
|
|
281
|
-
abortPolicy.throwIfAborted(signal, url, 'extract:article:textCheck');
|
|
304
|
+
checkAbort('extract:article:textCheck');
|
|
282
305
|
const rawText = doc.querySelector('body')?.textContent ??
|
|
283
306
|
doc.documentElement.textContent ??
|
|
284
307
|
'';
|
|
285
|
-
const textLength = resolveCollapsedTextLengthUpTo(rawText,
|
|
286
|
-
if (textLength <
|
|
308
|
+
const textLength = resolveCollapsedTextLengthUpTo(rawText, MIN_READERABLE_TEXT_LENGTH + 1);
|
|
309
|
+
if (textLength < MIN_SPA_CONTENT_LENGTH) {
|
|
287
310
|
logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
|
|
288
311
|
'This might be a client-side rendered (SPA) application. ' +
|
|
289
312
|
'Content extraction may be incomplete.', { textLength });
|
|
290
313
|
}
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
314
|
+
checkAbort('extract:article:readabilityCheck');
|
|
315
|
+
if (textLength >= MIN_READERABLE_TEXT_LENGTH &&
|
|
316
|
+
!isProbablyReaderable(doc)) {
|
|
294
317
|
return null;
|
|
295
318
|
}
|
|
296
|
-
|
|
297
|
-
abortPolicy.throwIfAborted(signal, url, 'extract:article:clone');
|
|
319
|
+
checkAbort('extract:article:clone');
|
|
298
320
|
const readabilityDoc = typeof doc.cloneNode === 'function'
|
|
299
321
|
? doc.cloneNode(true)
|
|
300
322
|
: doc;
|
|
301
|
-
|
|
302
|
-
|
|
323
|
+
preserveAlertElements(readabilityDoc);
|
|
324
|
+
checkAbort('extract:article:parse');
|
|
303
325
|
const reader = new Readability(readabilityDoc, {
|
|
304
|
-
maxElemsToParse:
|
|
326
|
+
maxElemsToParse: MAX_READABILITY_ELEMENTS,
|
|
305
327
|
classesToPreserve: [
|
|
306
328
|
'admonition',
|
|
307
329
|
'callout',
|
|
@@ -401,16 +423,8 @@ export function extractContent(html, url, options = {
|
|
|
401
423
|
const result = extractContentContext(html, url, options);
|
|
402
424
|
return { article: result.article, metadata: result.metadata };
|
|
403
425
|
}
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
}
|
|
407
|
-
function containsWhitespace(value) {
|
|
408
|
-
for (let i = 0; i < value.length; i += 1) {
|
|
409
|
-
if (isWhitespaceChar(value.charCodeAt(i)))
|
|
410
|
-
return true;
|
|
411
|
-
}
|
|
412
|
-
return false;
|
|
413
|
-
}
|
|
426
|
+
const ABORT_CHECK_LINE_INTERVAL = 500;
|
|
427
|
+
const CR_CHAR_CODE = 13;
|
|
414
428
|
function resolveRelativeHref(href, baseUrl, origin) {
|
|
415
429
|
const trimmedHref = href.trim();
|
|
416
430
|
if (!trimmedHref || containsWhitespace(trimmedHref))
|
|
@@ -503,39 +517,31 @@ function resolveRelativeUrls(markdown, baseUrl, signal) {
|
|
|
503
517
|
let output = '';
|
|
504
518
|
let buffer = '';
|
|
505
519
|
let fenceMarker = null;
|
|
506
|
-
const flushBuffer = () => {
|
|
507
|
-
if (!buffer)
|
|
508
|
-
return;
|
|
509
|
-
output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
|
|
510
|
-
buffer = '';
|
|
511
|
-
};
|
|
512
520
|
const len = markdown.length;
|
|
513
521
|
let lastIndex = 0;
|
|
514
522
|
let lineCount = 0;
|
|
515
523
|
while (lastIndex < len) {
|
|
516
|
-
if (++lineCount %
|
|
524
|
+
if (++lineCount % ABORT_CHECK_LINE_INTERVAL === 0) {
|
|
517
525
|
abortPolicy.throwIfAborted(signal, baseUrl, 'markdown:resolve-urls');
|
|
518
526
|
}
|
|
527
|
+
// Extract next line (handling CR+LF)
|
|
519
528
|
let nextIndex = markdown.indexOf('\n', lastIndex);
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
if (nextIndex === -1) {
|
|
523
|
-
line = markdown.slice(lastIndex);
|
|
524
|
-
lineWithNewline = line;
|
|
529
|
+
const isLastLine = nextIndex === -1;
|
|
530
|
+
if (isLastLine)
|
|
525
531
|
nextIndex = len;
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
const trimmed = line.trimStart();
|
|
532
|
+
const lineWithNewline = isLastLine
|
|
533
|
+
? markdown.slice(lastIndex)
|
|
534
|
+
: markdown.slice(lastIndex, nextIndex + 1);
|
|
535
|
+
const lineEnd = !isLastLine &&
|
|
536
|
+
nextIndex > lastIndex &&
|
|
537
|
+
markdown.charCodeAt(nextIndex - 1) === CR_CHAR_CODE
|
|
538
|
+
? nextIndex - 1
|
|
539
|
+
: isLastLine
|
|
540
|
+
? len
|
|
541
|
+
: nextIndex;
|
|
542
|
+
const trimmed = markdown.slice(lastIndex, lineEnd).trimStart();
|
|
538
543
|
if (fenceMarker) {
|
|
544
|
+
// Inside a code fence — pass through without URL resolution
|
|
539
545
|
output += lineWithNewline;
|
|
540
546
|
if (trimmed.startsWith(fenceMarker) &&
|
|
541
547
|
trimmed.slice(fenceMarker.length).trim() === '') {
|
|
@@ -543,9 +549,13 @@ function resolveRelativeUrls(markdown, baseUrl, signal) {
|
|
|
543
549
|
}
|
|
544
550
|
}
|
|
545
551
|
else {
|
|
546
|
-
const fenceMatch = FENCE_LINE_PATTERN.exec(
|
|
552
|
+
const fenceMatch = FENCE_LINE_PATTERN.exec(markdown.slice(lastIndex, lineEnd));
|
|
547
553
|
if (fenceMatch?.[1]) {
|
|
548
|
-
|
|
554
|
+
// Entering a code fence — flush buffered content first
|
|
555
|
+
if (buffer) {
|
|
556
|
+
output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
|
|
557
|
+
buffer = '';
|
|
558
|
+
}
|
|
549
559
|
output += lineWithNewline;
|
|
550
560
|
fenceMarker = fenceMatch[1];
|
|
551
561
|
}
|
|
@@ -553,9 +563,11 @@ function resolveRelativeUrls(markdown, baseUrl, signal) {
|
|
|
553
563
|
buffer += lineWithNewline;
|
|
554
564
|
}
|
|
555
565
|
}
|
|
556
|
-
lastIndex = nextIndex;
|
|
566
|
+
lastIndex = isLastLine ? len : nextIndex + 1;
|
|
567
|
+
}
|
|
568
|
+
if (buffer) {
|
|
569
|
+
output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
|
|
557
570
|
}
|
|
558
|
-
flushBuffer();
|
|
559
571
|
return output;
|
|
560
572
|
}
|
|
561
573
|
function translateHtmlToMarkdown(params) {
|
|
@@ -667,9 +679,6 @@ function resolveHtmlDocument(htmlOrDocument) {
|
|
|
667
679
|
return parseHTML('<!DOCTYPE html><html><body></body></html>').document;
|
|
668
680
|
}
|
|
669
681
|
}
|
|
670
|
-
function countDomSelector(htmlOrDocument, selector) {
|
|
671
|
-
return resolveHtmlDocument(htmlOrDocument).querySelectorAll(selector).length;
|
|
672
|
-
}
|
|
673
682
|
function countTagsInString(html, regex) {
|
|
674
683
|
let count = 0;
|
|
675
684
|
while (regex.exec(html) !== null) {
|
|
@@ -677,12 +686,6 @@ function countTagsInString(html, regex) {
|
|
|
677
686
|
}
|
|
678
687
|
return count;
|
|
679
688
|
}
|
|
680
|
-
function countHeadingsDom(htmlOrDocument) {
|
|
681
|
-
return countDomSelector(htmlOrDocument, 'h1,h2,h3,h4,h5,h6');
|
|
682
|
-
}
|
|
683
|
-
function countCodeBlocksDom(htmlOrDocument) {
|
|
684
|
-
return countDomSelector(htmlOrDocument, 'pre');
|
|
685
|
-
}
|
|
686
689
|
function stripNonVisibleNodes(root) {
|
|
687
690
|
for (const el of root.querySelectorAll('script,style,noscript')) {
|
|
688
691
|
el.remove();
|
|
@@ -785,6 +788,9 @@ function hasTruncatedSentences(text) {
|
|
|
785
788
|
return false;
|
|
786
789
|
return incompleteFound / linesFound > MAX_TRUNCATED_LINE_RATIO;
|
|
787
790
|
}
|
|
791
|
+
const MIN_CONTENT_ROOT_LENGTH = 100;
|
|
792
|
+
const HEADING_SCAN_LIMIT = 12;
|
|
793
|
+
const BINARY_SAMPLE_SIZE = 2000;
|
|
788
794
|
export function determineContentExtractionSource(article) {
|
|
789
795
|
return article !== null;
|
|
790
796
|
}
|
|
@@ -797,8 +803,9 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
797
803
|
fetchedAt: new Date().toISOString(),
|
|
798
804
|
};
|
|
799
805
|
if (shouldExtractFromArticle && article) {
|
|
800
|
-
if (article.title !== undefined)
|
|
801
|
-
metadata.title = article.title;
|
|
806
|
+
if (article.title !== undefined) {
|
|
807
|
+
metadata.title = normalizeDocumentTitle(article.title, url);
|
|
808
|
+
}
|
|
802
809
|
if (article.byline !== undefined)
|
|
803
810
|
metadata.author = article.byline;
|
|
804
811
|
}
|
|
@@ -828,6 +835,12 @@ const CONTENT_ROOT_SELECTORS = [
|
|
|
828
835
|
'.post-body',
|
|
829
836
|
'.article-body',
|
|
830
837
|
];
|
|
838
|
+
const PRIMARY_HEADING_ROOT_SELECTORS = [
|
|
839
|
+
...CONTENT_ROOT_SELECTORS,
|
|
840
|
+
'.markdown-body',
|
|
841
|
+
'.entry-content',
|
|
842
|
+
'[itemprop="text"]',
|
|
843
|
+
];
|
|
831
844
|
function findContentRoot(document) {
|
|
832
845
|
for (const selector of CONTENT_ROOT_SELECTORS) {
|
|
833
846
|
const element = document.querySelector(selector);
|
|
@@ -836,30 +849,61 @@ function findContentRoot(document) {
|
|
|
836
849
|
const innerHTML = typeof element.innerHTML === 'string'
|
|
837
850
|
? element.innerHTML
|
|
838
851
|
: undefined;
|
|
839
|
-
if (innerHTML && innerHTML.trim().length >
|
|
852
|
+
if (innerHTML && innerHTML.trim().length > MIN_CONTENT_ROOT_LENGTH)
|
|
840
853
|
return innerHTML;
|
|
841
854
|
}
|
|
842
855
|
return undefined;
|
|
843
856
|
}
|
|
844
|
-
function
|
|
857
|
+
function findPrimaryHeading(document) {
|
|
858
|
+
for (const selector of PRIMARY_HEADING_ROOT_SELECTORS) {
|
|
859
|
+
const root = document.querySelector(selector);
|
|
860
|
+
if (!root)
|
|
861
|
+
continue;
|
|
862
|
+
const heading = root.querySelector('h1, h2');
|
|
863
|
+
if (!heading)
|
|
864
|
+
continue;
|
|
865
|
+
const text = heading.textContent.trim();
|
|
866
|
+
if (text)
|
|
867
|
+
return text;
|
|
868
|
+
}
|
|
869
|
+
return undefined;
|
|
870
|
+
}
|
|
871
|
+
function isGithubRepositoryRootUrl(url) {
|
|
872
|
+
let parsed;
|
|
873
|
+
try {
|
|
874
|
+
parsed = new URL(url);
|
|
875
|
+
}
|
|
876
|
+
catch {
|
|
877
|
+
return false;
|
|
878
|
+
}
|
|
879
|
+
const hostname = parsed.hostname.toLowerCase();
|
|
880
|
+
if (hostname !== 'github.com' && hostname !== 'www.github.com') {
|
|
881
|
+
return false;
|
|
882
|
+
}
|
|
883
|
+
return parsed.pathname.split('/').filter(Boolean).length === 2;
|
|
884
|
+
}
|
|
885
|
+
export const TransformHeuristics = {
|
|
886
|
+
findContentRoot,
|
|
887
|
+
findPrimaryHeading,
|
|
888
|
+
isGithubRepositoryRootUrl,
|
|
889
|
+
};
|
|
890
|
+
function shouldUseArticleContent(article, document) {
|
|
845
891
|
const articleLength = article.textContent.length;
|
|
846
|
-
const originalLength = getVisibleTextLength(
|
|
892
|
+
const originalLength = getVisibleTextLength(document);
|
|
847
893
|
if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
|
|
848
894
|
const ratio = articleLength / originalLength;
|
|
849
895
|
if (ratio < MIN_CONTENT_RATIO)
|
|
850
896
|
return false;
|
|
851
897
|
}
|
|
852
|
-
const originalHeadings =
|
|
898
|
+
const originalHeadings = document.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
|
|
853
899
|
if (originalHeadings > 0) {
|
|
854
|
-
// Optimization: Use regex on article content string instead of parsing it to DOM
|
|
855
900
|
const articleHeadings = countTagsInString(article.content, /<h[1-6]\b/gi);
|
|
856
901
|
const retentionRatio = articleHeadings / originalHeadings;
|
|
857
902
|
if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
|
|
858
903
|
return false;
|
|
859
904
|
}
|
|
860
|
-
const originalCodeBlocks =
|
|
905
|
+
const originalCodeBlocks = document.querySelectorAll('pre').length;
|
|
861
906
|
if (originalCodeBlocks > 0) {
|
|
862
|
-
// Optimization: Use regex on article content string
|
|
863
907
|
const articleCodeBlocks = countTagsInString(article.content, /<pre\b/gi);
|
|
864
908
|
const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
|
|
865
909
|
if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO)
|
|
@@ -875,20 +919,27 @@ function buildContentSource(params) {
|
|
|
875
919
|
metadata,
|
|
876
920
|
extractedMetadata: extractedMeta,
|
|
877
921
|
truncated,
|
|
922
|
+
primaryHeading: document
|
|
923
|
+
? TransformHeuristics.findPrimaryHeading(document)
|
|
924
|
+
: undefined,
|
|
878
925
|
};
|
|
879
926
|
if (useArticleContent && article) {
|
|
880
927
|
const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
|
|
881
928
|
prepareDocumentForMarkdown(articleDoc, url, signal);
|
|
929
|
+
const preferPrimaryHeading = TransformHeuristics.isGithubRepositoryRootUrl(url);
|
|
882
930
|
return {
|
|
883
931
|
...base,
|
|
884
932
|
sourceHtml: articleDoc.body.innerHTML,
|
|
885
|
-
title:
|
|
933
|
+
title: (preferPrimaryHeading ? base.primaryHeading : undefined) ??
|
|
934
|
+
(article.title !== undefined
|
|
935
|
+
? normalizeDocumentTitle(article.title, url)
|
|
936
|
+
: undefined),
|
|
886
937
|
skipNoiseRemoval: true,
|
|
887
938
|
};
|
|
888
939
|
}
|
|
889
940
|
if (document) {
|
|
890
941
|
prepareDocumentForMarkdown(document, url, signal);
|
|
891
|
-
const contentRoot = findContentRoot(document);
|
|
942
|
+
const contentRoot = TransformHeuristics.findContentRoot(document);
|
|
892
943
|
return {
|
|
893
944
|
...base,
|
|
894
945
|
sourceHtml: contentRoot ?? serializeDocumentForMarkdown(document, html),
|
|
@@ -931,7 +982,11 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
931
982
|
...(context.document ? { document: context.document } : {}),
|
|
932
983
|
...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
933
984
|
}));
|
|
934
|
-
if (context.
|
|
985
|
+
if (context.primaryHeading &&
|
|
986
|
+
TransformHeuristics.isGithubRepositoryRootUrl(url)) {
|
|
987
|
+
content = stripLeadingHeading(content, context.primaryHeading);
|
|
988
|
+
}
|
|
989
|
+
if (context.title && !/^(#{1,6})\s/.test(content.trimStart())) {
|
|
935
990
|
const icon = context.favicon;
|
|
936
991
|
let prefix = ' ';
|
|
937
992
|
if (icon) {
|
|
@@ -953,6 +1008,34 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
953
1008
|
metadata: context.extractedMetadata,
|
|
954
1009
|
};
|
|
955
1010
|
}
|
|
1011
|
+
function normalizeHeadingText(value) {
|
|
1012
|
+
return value.replace(/\s+/g, ' ').trim().toLowerCase();
|
|
1013
|
+
}
|
|
1014
|
+
function stripLeadingHeading(markdown, headingText) {
|
|
1015
|
+
if (!markdown)
|
|
1016
|
+
return markdown;
|
|
1017
|
+
const lines = markdown.split('\n');
|
|
1018
|
+
const target = normalizeHeadingText(headingText);
|
|
1019
|
+
let nonEmptySeen = 0;
|
|
1020
|
+
for (let i = 0; i < lines.length && nonEmptySeen < HEADING_SCAN_LIMIT; i += 1) {
|
|
1021
|
+
const trimmed = lines[i]?.trim() ?? '';
|
|
1022
|
+
if (!trimmed)
|
|
1023
|
+
continue;
|
|
1024
|
+
nonEmptySeen += 1;
|
|
1025
|
+
const match = /^(#{1,6})\s+(.+?)\s*$/.exec(trimmed);
|
|
1026
|
+
if (!match)
|
|
1027
|
+
continue;
|
|
1028
|
+
const current = normalizeHeadingText(match[2] ?? '');
|
|
1029
|
+
if (current !== target)
|
|
1030
|
+
return markdown;
|
|
1031
|
+
lines.splice(i, 1);
|
|
1032
|
+
if ((lines[i] ?? '').trim() === '') {
|
|
1033
|
+
lines.splice(i, 1);
|
|
1034
|
+
}
|
|
1035
|
+
return lines.join('\n');
|
|
1036
|
+
}
|
|
1037
|
+
return markdown;
|
|
1038
|
+
}
|
|
956
1039
|
const REPLACEMENT_CHAR = '\ufffd';
|
|
957
1040
|
const BINARY_INDICATOR_THRESHOLD = 0.1;
|
|
958
1041
|
function hasBinaryIndicators(content) {
|
|
@@ -960,7 +1043,7 @@ function hasBinaryIndicators(content) {
|
|
|
960
1043
|
return false;
|
|
961
1044
|
if (content.includes('\x00'))
|
|
962
1045
|
return true;
|
|
963
|
-
const sampleSize = Math.min(content.length,
|
|
1046
|
+
const sampleSize = Math.min(content.length, BINARY_SAMPLE_SIZE);
|
|
964
1047
|
let replacementCount = 0;
|
|
965
1048
|
let i = -1;
|
|
966
1049
|
while ((i = content.indexOf(REPLACEMENT_CHAR, i + 1)) !== -1 &&
|
|
@@ -972,37 +1055,42 @@ function hasBinaryIndicators(content) {
|
|
|
972
1055
|
export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
973
1056
|
const signal = buildTransformSignal(options.signal);
|
|
974
1057
|
const totalStage = stageTracker.start(url, 'transform:total');
|
|
975
|
-
let completed = null;
|
|
976
1058
|
try {
|
|
977
1059
|
abortPolicy.throwIfAborted(signal, url, 'transform:begin');
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
html,
|
|
983
|
-
url,
|
|
984
|
-
includeMetadata: options.includeMetadata,
|
|
985
|
-
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
986
|
-
}));
|
|
987
|
-
if (raw) {
|
|
988
|
-
completed = raw;
|
|
989
|
-
return raw;
|
|
990
|
-
}
|
|
991
|
-
const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
|
|
992
|
-
html,
|
|
993
|
-
url,
|
|
994
|
-
includeMetadata: options.includeMetadata,
|
|
995
|
-
...(signal ? { signal } : {}),
|
|
996
|
-
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
997
|
-
}));
|
|
998
|
-
const result = buildMarkdownFromContext(context, url, signal);
|
|
999
|
-
completed = result;
|
|
1060
|
+
validateBinaryContent(html, url);
|
|
1061
|
+
const result = tryRawContentPipeline(html, url, options) ??
|
|
1062
|
+
tryHtmlContentPipeline(html, url, options, signal);
|
|
1063
|
+
stageTracker.end(totalStage, { truncated: result.truncated });
|
|
1000
1064
|
return result;
|
|
1001
1065
|
}
|
|
1002
|
-
|
|
1003
|
-
|
|
1066
|
+
catch (error) {
|
|
1067
|
+
stageTracker.end(totalStage);
|
|
1068
|
+
throw error;
|
|
1004
1069
|
}
|
|
1005
1070
|
}
|
|
1071
|
+
function validateBinaryContent(html, url) {
|
|
1072
|
+
if (hasBinaryIndicators(html)) {
|
|
1073
|
+
throw new FetchError('Content appears to be binary data (high replacement character ratio or null bytes)', url, 415, { reason: 'binary_content_detected', stage: 'transform:validate' });
|
|
1074
|
+
}
|
|
1075
|
+
}
|
|
1076
|
+
function tryRawContentPipeline(html, url, options) {
|
|
1077
|
+
return stageTracker.run(url, 'transform:raw', () => tryTransformRawContent({
|
|
1078
|
+
html,
|
|
1079
|
+
url,
|
|
1080
|
+
includeMetadata: options.includeMetadata,
|
|
1081
|
+
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1082
|
+
}));
|
|
1083
|
+
}
|
|
1084
|
+
function tryHtmlContentPipeline(html, url, options, signal) {
|
|
1085
|
+
const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
|
|
1086
|
+
html,
|
|
1087
|
+
url,
|
|
1088
|
+
includeMetadata: options.includeMetadata,
|
|
1089
|
+
...(signal ? { signal } : {}),
|
|
1090
|
+
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1091
|
+
}));
|
|
1092
|
+
return buildMarkdownFromContext(context, url, signal);
|
|
1093
|
+
}
|
|
1006
1094
|
export function getTransformPoolStats() {
|
|
1007
1095
|
return getWorkerPoolStats();
|
|
1008
1096
|
}
|
|
@@ -1012,13 +1100,6 @@ export async function shutdownTransformWorkerPool() {
|
|
|
1012
1100
|
function transformInputInProcess(htmlOrBuffer, url, options) {
|
|
1013
1101
|
return transformHtmlToMarkdownInProcess(decodeInput(htmlOrBuffer, options.encoding), url, options);
|
|
1014
1102
|
}
|
|
1015
|
-
function endTotalTransformStage(context, result) {
|
|
1016
|
-
if (!result) {
|
|
1017
|
-
stageTracker.end(context);
|
|
1018
|
-
return;
|
|
1019
|
-
}
|
|
1020
|
-
stageTracker.end(context, { truncated: result.truncated });
|
|
1021
|
-
}
|
|
1022
1103
|
function buildWorkerTransformOptions(options) {
|
|
1023
1104
|
return {
|
|
1024
1105
|
includeMetadata: options.includeMetadata,
|
|
@@ -1073,15 +1154,15 @@ async function runWorkerTransformWithFallback(htmlOrBuffer, url, options) {
|
|
|
1073
1154
|
}
|
|
1074
1155
|
async function transformInputToMarkdown(htmlOrBuffer, url, options) {
|
|
1075
1156
|
const totalStage = stageTracker.start(url, 'transform:total');
|
|
1076
|
-
let completed = null;
|
|
1077
1157
|
try {
|
|
1078
1158
|
abortPolicy.throwIfAborted(options.signal, url, 'transform:begin');
|
|
1079
1159
|
const result = await runWorkerTransformWithFallback(htmlOrBuffer, url, options);
|
|
1080
|
-
|
|
1160
|
+
stageTracker.end(totalStage, { truncated: result.truncated });
|
|
1081
1161
|
return result;
|
|
1082
1162
|
}
|
|
1083
|
-
|
|
1084
|
-
|
|
1163
|
+
catch (error) {
|
|
1164
|
+
stageTracker.end(totalStage);
|
|
1165
|
+
throw error;
|
|
1085
1166
|
}
|
|
1086
1167
|
}
|
|
1087
1168
|
export async function transformHtmlToMarkdown(html, url, options) {
|
|
@@ -17,13 +17,13 @@ declare class WorkerPool implements TransformWorkerPool {
|
|
|
17
17
|
private readonly minCapacity;
|
|
18
18
|
private readonly maxCapacity;
|
|
19
19
|
private readonly queue;
|
|
20
|
-
private queueHead;
|
|
21
20
|
private readonly inflight;
|
|
22
21
|
private readonly cancelAcks;
|
|
23
22
|
private readonly timeoutMs;
|
|
24
23
|
private readonly queueMax;
|
|
25
24
|
private closed;
|
|
26
25
|
private taskIdSeq;
|
|
26
|
+
private busyCount;
|
|
27
27
|
constructor(size: number, timeoutMs: number);
|
|
28
28
|
transform(html: string, url: string, options: {
|
|
29
29
|
includeMetadata: boolean;
|
|
@@ -44,8 +44,6 @@ declare class WorkerPool implements TransformWorkerPool {
|
|
|
44
44
|
private ensureOpen;
|
|
45
45
|
private createPendingTask;
|
|
46
46
|
private onAbortSignal;
|
|
47
|
-
private resolveCancelAck;
|
|
48
|
-
private waitForCancelAck;
|
|
49
47
|
private abortInflight;
|
|
50
48
|
private clearAbortListener;
|
|
51
49
|
private spawnWorker;
|
|
@@ -57,11 +55,10 @@ declare class WorkerPool implements TransformWorkerPool {
|
|
|
57
55
|
private failTask;
|
|
58
56
|
private maybeScaleUp;
|
|
59
57
|
private drainQueue;
|
|
60
|
-
private takeNextQueuedTask;
|
|
61
58
|
private dispatchFromQueue;
|
|
59
|
+
private registerInflight;
|
|
60
|
+
private sendToWorker;
|
|
62
61
|
private finalizeTask;
|
|
63
|
-
private findQueuedIndex;
|
|
64
|
-
private maybeCompactQueue;
|
|
65
62
|
}
|
|
66
63
|
export declare function getOrCreateWorkerPool(): WorkerPool;
|
|
67
64
|
export declare function getWorkerPoolStats(): {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"worker-pool.d.ts","sourceRoot":"","sources":["../../src/transform/worker-pool.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"worker-pool.d.ts","sourceRoot":"","sources":["../../src/transform/worker-pool.ts"],"names":[],"mappings":"AA0BA,OAAO,KAAK,EACV,uBAAuB,EAGxB,MAAM,YAAY,CAAC;AAqJpB,UAAU,mBAAmB;IAC3B,SAAS,CACP,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,eAAe,EAAE,OAAO,CAAC;QACzB,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,GACA,OAAO,CAAC,uBAAuB,CAAC,CAAC;IACpC,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IACvB,aAAa,IAAI,MAAM,CAAC;IACxB,gBAAgB,IAAI,MAAM,CAAC;IAC3B,WAAW,IAAI,MAAM,CAAC;CACvB;AAiID,cAAM,UAAW,YAAW,mBAAmB;IAC7C,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAAkC;IAExE,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAkC;IAC1D,OAAO,CAAC,QAAQ,CAAS;IACzB,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAoB;IAChD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAoB;IAEhD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAgC;IACtD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAmC;IAC5D,OAAO,CAAC,QAAQ,CAAC,UAAU,CAA0B;IAErD,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAClC,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,SAAS,CAAK;IACtB,OAAO,CAAC,SAAS,CAAK;gBAEV,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM;IASrC,SAAS,CACb,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,eAAe,EAAE,OAAO,CAAC;QACzB,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,GACA,OAAO,CAAC,uBAAuB,CAAC;IAC7B,SAAS,CACb,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,eAAe,EAAE,OAAO,CAAC;QACzB,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;QACzB,QAAQ,CAAC,EAAE,MAAM,CAAC;KACnB,GACA,OAAO,CAAC,uBAAuB,CAAC;IAmCnC,aAAa,IAAI,MAAM;IAIvB,gBAAgB,IAAI,MAAM;IAI1B,WAAW,IAAI,MAAM;IAIrB,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI;IAWpB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IA+B5B,OAAO,CAAC,UAAU;IAIlB,OAAO,CAAC,iBAAiB;IAkDzB,OAAO,CAAC,aAAa;YA4BP,aAAa;IA2B3B,OAAO,CAAC,kBAAkB;IAY1B,OAAO,CAAC,WAAW;IAmCnB,OAAO,CAAC,cAAc;IAuBtB,OAAO,CAAC,aAAa;IAYrB,OAAO,CAAC,eAAe;IAsDvB,OAAO,CAAC,YAAY;IAWpB,OAAO,CAAC,QAAQ;IAUhB,OAAO,CAAC,QAAQ;IAWhB,OAAO,CAAC,YAAY;IASpB,OAAO,CAAC,UAAU;IA2BlB,OAAO,CAAC,iBAAiB;IA4BzB,OAAO,CAAC,gBAAgB;IA8CxB,OAAO,CAAC,YAAY;IA0BpB,OAAO,CAAC,YAAY;CAOrB;AAMD,wBAAgB,qBAAqB,IAAI,UAAU,CAIlD;AAED,wBAAgB,kBAAkB,IAAI;IACpC,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB,GAAG,IAAI,CAOP;AAED,wBAAsB,kBAAkB,IAAI,OAAO,CAAC,IAAI,CAAC,CAIxD"}
|