@j0hanz/fetch-url-mcp 1.9.4 → 1.9.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/code-lang.d.ts.map +1 -1
- package/dist/lib/code-lang.js +4 -2
- package/dist/lib/dom-prep.d.ts.map +1 -1
- package/dist/lib/dom-prep.js +173 -4
- package/dist/lib/md-cleanup.d.ts +1 -0
- package/dist/lib/md-cleanup.d.ts.map +1 -1
- package/dist/lib/md-cleanup.js +108 -13
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +71 -22
- package/package.json +2 -2
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"code-lang.d.ts","sourceRoot":"","sources":["../../src/lib/code-lang.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"code-lang.d.ts","sourceRoot":"","sources":["../../src/lib/code-lang.ts"],"names":[],"mappings":"AAgRA,wBAAgB,4BAA4B,CAC1C,SAAS,EAAE,MAAM,GAChB,MAAM,GAAG,SAAS,CAuBpB;AAqBD,wBAAgB,6BAA6B,CAC3C,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,MAAM,GAAG,SAAS,CAKpB;AACD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAqBvE"}
|
package/dist/lib/code-lang.js
CHANGED
|
@@ -82,7 +82,7 @@ const RUST_REGEX = /\b(?:fn|impl|struct|enum)\b/;
|
|
|
82
82
|
const JS_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
|
|
83
83
|
const PYTHON_UNIQUE_REGEX = /\b(?:def |elif |except |finally:|yield |lambda |raise |pass$)/m;
|
|
84
84
|
const JS_SIGNAL_REGEX = /\b(?:const |let |var |function |require\(|=>|===|!==|console\.)/;
|
|
85
|
-
const CSS_REGEX = /@media|@import|@keyframes/;
|
|
85
|
+
const CSS_REGEX = /@media|@import|@keyframes|@theme\b|@utility\b|@layer\b|@apply\b|@variant\b|@custom-variant\b|@reference\b|@source\b/;
|
|
86
86
|
const CSS_PROPERTY_REGEX = /^\s*[a-z][\w-]*\s*:/;
|
|
87
87
|
function containsJsxTag(code) {
|
|
88
88
|
const len = code.length;
|
|
@@ -195,6 +195,8 @@ function hasJsSignals(lowerCode) {
|
|
|
195
195
|
lowerCode.includes("from '"));
|
|
196
196
|
}
|
|
197
197
|
function matchPython(ctx) {
|
|
198
|
+
if (matchHtml(ctx))
|
|
199
|
+
return false;
|
|
198
200
|
const l = ctx.lower;
|
|
199
201
|
if (l.includes('print(') || l.includes('__name__'))
|
|
200
202
|
return true;
|
|
@@ -229,6 +231,7 @@ const LANGUAGES = [
|
|
|
229
231
|
{ lang: 'jsx', weight: 22, match: matchJsx },
|
|
230
232
|
{ lang: 'typescript', weight: 20, match: matchTypeScript },
|
|
231
233
|
{ lang: 'sql', weight: 20, match: matchSql },
|
|
234
|
+
{ lang: 'html', weight: 19, match: matchHtml },
|
|
232
235
|
{ lang: 'python', weight: 18, match: matchPython },
|
|
233
236
|
{
|
|
234
237
|
lang: 'css',
|
|
@@ -238,7 +241,6 @@ const LANGUAGES = [
|
|
|
238
241
|
{ lang: 'bash', weight: 15, match: (ctx) => detectBashIndicators(ctx.lines) },
|
|
239
242
|
{ lang: 'yaml', weight: 15, match: (ctx) => detectYamlStructure(ctx.lines) },
|
|
240
243
|
{ lang: 'javascript', weight: 15, match: (ctx) => JS_REGEX.test(ctx.lower) },
|
|
241
|
-
{ lang: 'html', weight: 12, match: matchHtml },
|
|
242
244
|
{
|
|
243
245
|
lang: 'json',
|
|
244
246
|
weight: 10,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"AAwlBA,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAQR;AAmPD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAkBN;AAiED,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR"}
|
package/dist/lib/dom-prep.js
CHANGED
|
@@ -17,6 +17,8 @@ const NOISE_PATTERNS = [
|
|
|
17
17
|
];
|
|
18
18
|
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
|
|
19
19
|
const FIXED_OR_HIGH_Z_PATTERN = /\b(?:fixed|sticky|z-(?:4\d|50)|isolate)\b/;
|
|
20
|
+
const HEADING_PERMALINK_TEXT_PATTERN = /^(?:#|¶|§|¤|🔗)$/u;
|
|
21
|
+
const HEADING_PERMALINK_CLASS_PATTERN = /\b(?:mark|permalink|hash-link|anchor(?:js)?-?link|header-?link|heading-anchor|deep-link)\b/i;
|
|
20
22
|
const SKIP_URL_PREFIXES = [
|
|
21
23
|
'#',
|
|
22
24
|
'javascript:',
|
|
@@ -50,6 +52,9 @@ const NAVIGATION_ROLES = new Set([
|
|
|
50
52
|
'alertdialog',
|
|
51
53
|
'search',
|
|
52
54
|
]);
|
|
55
|
+
const INLINE_DEMO_INSTRUCTION_MAX_CHARS = 160;
|
|
56
|
+
const REDUNDANT_PREVIEW_SEGMENT_MAX_CHARS = 60;
|
|
57
|
+
const REDUNDANT_PREVIEW_MAX_SEGMENTS = 12;
|
|
53
58
|
const INTERACTIVE_CONTENT_ROLES = new Set([
|
|
54
59
|
'tabpanel',
|
|
55
60
|
'tab',
|
|
@@ -362,11 +367,8 @@ function cleanHeadings(document) {
|
|
|
362
367
|
const a = anchors[j];
|
|
363
368
|
if (!a?.parentNode)
|
|
364
369
|
continue;
|
|
365
|
-
|
|
366
|
-
const txt = (a.textContent || '').replace(/[\u200B\s]/g, '');
|
|
367
|
-
if (href.startsWith('#') && txt.length === 0) {
|
|
370
|
+
if (isHeadingPermalinkAnchor(a))
|
|
368
371
|
a.remove();
|
|
369
|
-
}
|
|
370
372
|
}
|
|
371
373
|
// Strip zero-width spaces from text nodes
|
|
372
374
|
const walker = document.createTreeWalker(h, NODE_FILTER_SHOW_TEXT);
|
|
@@ -378,6 +380,47 @@ function cleanHeadings(document) {
|
|
|
378
380
|
}
|
|
379
381
|
}
|
|
380
382
|
}
|
|
383
|
+
/**
 * Returns the anchor's text content with every whitespace character and
 * zero-width space (U+200B) removed. A missing text content yields ''.
 */
function getCollapsedHeadingAnchorText(anchor) {
    const rawText = anchor.textContent || '';
    return rawText.replace(/[\u200B\s]/g, '');
}
|
|
386
|
+
/**
 * Decides whether an anchor is a decorative permalink.
 *
 * Only anchors whose href starts with '#' qualify. Such an anchor is treated
 * as a permalink when its collapsed text is empty or matches
 * HEADING_PERMALINK_TEXT_PATTERN, or when the text is at most two UTF-16 units
 * long and the anchor either carries a class matching
 * HEADING_PERMALINK_CLASS_PATTERN, has aria-hidden="true", or has
 * tabindex="-1".
 *
 * @param anchor Element exposing getAttribute() and textContent.
 * @returns {boolean} true when the anchor should be treated as a permalink.
 */
function isHeadingPermalinkAnchor(anchor) {
    const target = anchor.getAttribute('href') ?? '';
    if (!target.startsWith('#')) {
        return false;
    }
    const label = getCollapsedHeadingAnchorText(anchor);
    const isSymbolOnly = label.length === 0 || HEADING_PERMALINK_TEXT_PATTERN.test(label);
    if (isSymbolOnly) {
        return true;
    }
    // Every remaining rule requires a very short label.
    if (label.length > 2) {
        return false;
    }
    const classes = anchor.getAttribute('class') ?? '';
    if (HEADING_PERMALINK_CLASS_PATTERN.test(classes)) {
        return true;
    }
    return (anchor.getAttribute('aria-hidden') === 'true' ||
        anchor.getAttribute('tabindex') === '-1');
}
|
|
402
|
+
/**
 * Lists the direct children of `section` whose tagName is 'TR'.
 */
function getDirectRows(section) {
    const rows = [];
    for (const child of Array.from(section.children)) {
        if (child.tagName === 'TR') {
            rows.push(child);
        }
    }
    return rows;
}
|
|
405
|
+
/**
 * Lists the direct children of `row` that are table cells (TH or TD).
 */
function getDirectCells(row) {
    const cells = [];
    for (const child of Array.from(row.children)) {
        const tag = child.tagName;
        if (tag === 'TH' || tag === 'TD') {
            cells.push(child);
        }
    }
    return cells;
}
|
|
408
|
+
/**
 * Moves <tr> elements found inside the cells of a row out to become siblings
 * that follow that row, keeping their relative order.
 *
 * Only rows whose closest <table> ancestor is `table` itself are moved, so
 * rows that belong to a nested table are left in place.
 */
function hoistNestedRows(table) {
    const ownedByTable = (candidate) => candidate.closest('table') === table;
    for (const section of Array.from(table.querySelectorAll('thead,tbody,tfoot'))) {
        for (const row of getDirectRows(section)) {
            // Each hoisted row is inserted after the previously hoisted one.
            let tail = row;
            for (const cell of getDirectCells(row)) {
                Array.from(cell.querySelectorAll('tr'))
                    .filter((candidate) => ownedByTable(candidate))
                    .forEach((strayRow) => {
                        tail.after(strayRow);
                        tail = strayRow;
                    });
            }
        }
    }
}
|
|
381
424
|
function stripNoise(document, context, signal) {
|
|
382
425
|
cleanHeadings(document);
|
|
383
426
|
// Structural Removal
|
|
@@ -506,6 +549,130 @@ function escapeTableCellPipes(document) {
|
|
|
506
549
|
}
|
|
507
550
|
}
|
|
508
551
|
}
|
|
552
|
+
/**
 * Collapses every run of whitespace to a single space and trims both ends.
 */
function normalizeWhitespace(value) {
    const collapsed = value.replace(/\s+/g, ' ');
    return collapsed.trim();
}
|
|
555
|
+
/**
 * Reports whether `element` is a <pre>, or has a direct child that either is
 * a <pre> or contains one.
 */
function hasDirectPreDescendant(element) {
    if (element.tagName === 'PRE') {
        return true;
    }
    for (const child of Array.from(element.children)) {
        if (child.tagName === 'PRE' || child.querySelector('pre') !== null) {
            return true;
        }
    }
    return false;
}
|
|
559
|
+
/**
 * Collects the distinct, whitespace-normalized texts of leaf p/li/div/span
 * elements under `element`, in document order.
 *
 * A candidate is skipped when it has element children, when its text is
 * empty, or when the text is longer than REDUNDANT_PREVIEW_SEGMENT_MAX_CHARS.
 * If nothing qualifies, the element's own normalized text is returned as a
 * single segment (or an empty array when that text is empty).
 *
 * @returns {string[]} unique text segments.
 */
function collectLeafTextSegments(element) {
    // A Set keeps first-insertion order, so it doubles as the ordered result.
    const unique = new Set();
    for (const node of element.querySelectorAll('p,li,div,span')) {
        const isLeaf = node.children.length === 0 &&
            node.querySelector('pre,code,table,ul,ol,blockquote,figure') === null;
        if (!isLeaf) {
            continue;
        }
        const text = normalizeWhitespace(node.textContent || '');
        const usable = text.length > 0 && text.length <= REDUNDANT_PREVIEW_SEGMENT_MAX_CHARS;
        if (usable) {
            unique.add(text);
        }
    }
    if (unique.size > 0) {
        return [...unique];
    }
    const wholeText = normalizeWhitespace(element.textContent || '');
    return wholeText ? [wholeText] : [];
}
|
|
582
|
+
/**
 * Tests whether `value` consists solely of letters, digits, dots and hyphens
 * and ends with a dot followed by at least two letters (case-insensitive).
 */
function isHostnameLike(value) {
    const hostnamePattern = /^[a-z0-9.-]+\.[a-z]{2,}$/i;
    return hostnamePattern.test(value);
}
|
|
585
|
+
/**
 * Reports whether `element` contains an <svg> or <canvas> descendant.
 */
function hasPreviewMedia(element) {
    const media = element.querySelector('svg,canvas');
    return media !== null;
}
|
|
588
|
+
/**
 * Heuristically decides whether `preview` merely repeats what the sibling
 * `codeContainer` already shows.
 *
 * Never redundant: a FIGCAPTION, or a preview containing links, form
 * controls, media elements, tables, lists or blockquotes. Otherwise the
 * preview's leaf text segments are compared with the normalized code text:
 * it is redundant when every segment occurs in the code, or when at least one
 * segment occurs and the preview contains svg/canvas or a hostname-like
 * segment.
 *
 * @returns {boolean} true when the preview can be dropped.
 */
function isRedundantCodePreview(preview, codeContainer) {
    if (preview.tagName === 'FIGCAPTION') {
        return false;
    }
    const richContent = preview.querySelector('a[href],button,input,select,textarea,form,video,audio,iframe,table,ul,ol,blockquote');
    if (richContent !== null) {
        return false;
    }
    const segments = collectLeafTextSegments(preview);
    const withinLimit = (segment) => segment.length <= REDUNDANT_PREVIEW_SEGMENT_MAX_CHARS;
    const acceptable = segments.length > 0 &&
        segments.length <= REDUNDANT_PREVIEW_MAX_SEGMENTS &&
        segments.every(withinLimit);
    if (!acceptable) {
        return false;
    }
    const codeText = normalizeWhitespace(codeContainer.textContent || '');
    if (!codeText) {
        return false;
    }
    const matchCount = segments.filter((segment) => codeText.includes(segment)).length;
    if (matchCount === segments.length) {
        return true;
    }
    if (matchCount === 0) {
        return false;
    }
    const looksLikeRenderedDemo = hasPreviewMedia(preview) ||
        segments.some((segment) => isHostnameLike(segment));
    return looksLikeRenderedDemo && segments.every(withinLimit);
}
|
|
610
|
+
/**
 * For each <figure> that has a direct child holding a <pre>, removes the
 * other direct children (except FIGCAPTION) that isRedundantCodePreview()
 * judges redundant with respect to that code child.
 */
function pruneFigurePreviewPanes(document) {
    for (const figure of document.querySelectorAll('figure')) {
        const panes = Array.from(figure.children);
        const codePane = panes.find((pane) => hasDirectPreDescendant(pane));
        if (!codePane) {
            continue;
        }
        panes
            .filter((pane) => pane !== codePane && pane.tagName !== 'FIGCAPTION')
            .forEach((pane) => {
                if (isRedundantCodePreview(pane, codePane)) {
                    pane.remove();
                }
            });
    }
}
|
|
624
|
+
/**
 * Heuristically identifies a short instruction blurb.
 *
 * Returns false when the element contains links, code, tables, lists,
 * blockquotes, figures or headings; when its normalized text is empty, longer
 * than INLINE_DEMO_INSTRUCTION_MAX_CHARS, or ends with '.', '!' or '?'.
 * Otherwise returns true when it has at most three leaf text segments.
 */
function isDemoInstructionBlock(element) {
    const richContent = element.querySelector('a[href],pre,code,table,ul,ol,blockquote,figure,h1,h2,h3,h4,h5,h6');
    if (richContent !== null) {
        return false;
    }
    const text = normalizeWhitespace(element.textContent || '');
    const badLength = text.length === 0 || text.length > INLINE_DEMO_INSTRUCTION_MAX_CHARS;
    if (badLength || /[.!?]$/.test(text)) {
        return false;
    }
    const segmentCount = collectLeafTextSegments(element).length;
    return segmentCount <= 3;
}
|
|
636
|
+
/**
 * In every div/section/article, finds the first direct child that is a
 * <figure> containing a <pre>, and removes the direct children located before
 * it that isDemoInstructionBlock() accepts. Containers where that figure is
 * missing or is the first child are left untouched.
 */
function pruneDemoInstructionBlocks(document) {
    const isCodeFigure = (node) => node.tagName === 'FIGURE' && node.querySelector('pre') !== null;
    for (const container of document.querySelectorAll('div,section,article')) {
        const children = Array.from(container.children);
        const figureIndex = children.findIndex((node) => isCodeFigure(node));
        if (figureIndex <= 0) {
            continue;
        }
        for (const child of children.slice(0, figureIndex)) {
            if (child && isDemoInstructionBlock(child)) {
                child.remove();
            }
        }
    }
}
|
|
649
|
+
/**
 * Inserts a '\n' text node after each direct <span> child (except the last)
 * of a `pre > code` element, so that per-line spans end up on separate lines.
 *
 * A <code> element is processed only when it has at least two element
 * children, all of them are SPANs, at least one has the class token 'line',
 * and no direct text node child already contains a line break.
 */
function normalizeHighlightedCodeLines(document) {
    const hasLineToken = (element) => (element.getAttribute('class') ?? '').split(/\s+/).includes('line');
    const isLineBreakText = (node) => node.nodeType === 3 && /[\r\n]/.test(node.textContent ?? '');
    for (const code of document.querySelectorAll('pre > code')) {
        const children = Array.from(code.children);
        if (children.length < 2) {
            continue;
        }
        if (!children.every((child) => child.tagName === 'SPAN')) {
            continue;
        }
        const alreadyHasBreaks = Array.from(code.childNodes).some((node) => isLineBreakText(node));
        if (alreadyHasBreaks || !children.some((child) => hasLineToken(child))) {
            continue;
        }
        for (const span of children.slice(0, -1)) {
            const follower = span.nextSibling;
            const followedByBreak = follower?.nodeType === 3 &&
                (follower.textContent ?? '').startsWith('\n');
            if (!followedByBreak) {
                span.after(document.createTextNode('\n'));
            }
        }
    }
}
|
|
671
|
+
/**
 * Runs the code-example clean-up passes over `document`, in order:
 * figure preview pruning, instruction-block pruning, then line-break
 * normalization for highlighted code.
 */
function cleanCodeExamples(document) {
    const passes = [
        pruneFigurePreviewPanes,
        pruneDemoInstructionBlocks,
        normalizeHighlightedCodeLines,
    ];
    for (const pass of passes) {
        pass(document);
    }
}
|
|
509
676
|
function separateAdjacentInlineElements(document) {
|
|
510
677
|
const badges = document.querySelectorAll('span.chakra-badge, [data-scope="badge"], [class*="badge"]');
|
|
511
678
|
for (const badge of badges) {
|
|
@@ -524,6 +691,7 @@ export function prepareDocumentForMarkdown(document, baseUrl, signal) {
|
|
|
524
691
|
}
|
|
525
692
|
stripNoise(document, context, signal);
|
|
526
693
|
stripTabTriggers(document);
|
|
694
|
+
cleanCodeExamples(document);
|
|
527
695
|
separateAdjacentInlineElements(document);
|
|
528
696
|
flattenTableCellBreaks(document);
|
|
529
697
|
escapeTableCellPipes(document);
|
|
@@ -552,6 +720,7 @@ function normalizeTableStructure(document) {
|
|
|
552
720
|
}
|
|
553
721
|
}
|
|
554
722
|
}
|
|
723
|
+
hoistNestedRows(table);
|
|
555
724
|
}
|
|
556
725
|
}
|
|
557
726
|
function flattenTableCellBreaks(document) {
|
package/dist/lib/md-cleanup.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"md-cleanup.d.ts","sourceRoot":"","sources":["../../src/lib/md-cleanup.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"md-cleanup.d.ts","sourceRoot":"","sources":["../../src/lib/md-cleanup.ts"],"names":[],"mappings":"AAwEA,UAAU,cAAc;IACtB,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAChC,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AA6cD;;;;GAIG;AACH,wBAAgB,oBAAoB,CAClC,OAAO,EAAE,MAAM,EACf,kBAAkB,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,GAC3C,MAAM,CAuCR;AAWD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,CAmBR"}
|
package/dist/lib/md-cleanup.js
CHANGED
|
@@ -22,6 +22,7 @@ const REGEX = {
|
|
|
22
22
|
HEADING_STRICT: /^#{1,6}\s+/m,
|
|
23
23
|
EMPTY_HEADING_LINE: /^#{1,6}[ \t\u00A0]*$/,
|
|
24
24
|
ANCHOR_ONLY_HEADING: /^#{1,6}\s+\[[^\]]+\]\(#[^)]+\)\s*$/,
|
|
25
|
+
HEADING_TRAILING_PERMALINK: /^(#{1,6}\s+.+?)\s*\[(?:#|¶|§|¤|🔗)\]\(#[^)]+\)\s*$/gmu,
|
|
25
26
|
FENCE_START: FENCE_PATTERN,
|
|
26
27
|
LIST_MARKER: /^(?:[-*+])\s/m,
|
|
27
28
|
TOC_LINK: /^- \[[^\]]+\]\(#[^)]+\)\s*$/,
|
|
@@ -36,7 +37,10 @@ const REGEX = {
|
|
|
36
37
|
HEADING_CODE_BLOCK: /(^#{1,6}\s+\w+)```/gm,
|
|
37
38
|
SPACING_LINK_FIX: /\]\(([^)]+)\)\[/g,
|
|
38
39
|
SPACING_ADJ_COMBINED: /(?:\]\([^)]+\)|`[^`]+`)(?=[A-Za-z0-9])/g,
|
|
40
|
+
SPACING_CODE_PAD_BEFORE: /(\S)[ \t]{2,}(?=`[^`\n]+`)/g,
|
|
41
|
+
SPACING_CODE_PAD_AFTER: /(`[^`\n]+`)[ \t]{2,}(?=\S)/g,
|
|
39
42
|
SPACING_CODE_DASH: /(`[^`]+`)\s*\\-\s*/g,
|
|
43
|
+
SPACING_ESCAPED_DASH: /(?<=[\w)\]`])\s*\\-\s*(?=[A-Za-z0-9([])/g,
|
|
40
44
|
SPACING_ESCAPES: /\\([[\].])/g,
|
|
41
45
|
SPACING_LIST_NUM_COMBINED: /^((?![-*+] |\d+\. |[ \t]).+)\n((?:[-*+]|\d+\.) )/gm,
|
|
42
46
|
PUNCT_ONLY_LIST_ARTIFACT: /^(?:[-*+]|\d+\.)\s*(?:\\[-*+|/]|[-*+|/])(?:\s+(?:\\[-*+|/]|[-*+|/]))*\s*$/gm,
|
|
@@ -72,6 +76,14 @@ function hasFollowingContent(lines, startIndex) {
|
|
|
72
76
|
}
|
|
73
77
|
return false;
|
|
74
78
|
}
|
|
79
|
+
/**
 * Returns the trimmed text of the first non-blank line after `startIndex`,
 * looking no further than index `startIndex + HAS_FOLLOWING_LOOKAHEAD`
 * (exclusive). Returns undefined when no such line is found in that window.
 */
function findNextNonBlankLine(lines, startIndex) {
    const limit = Math.min(lines.length, startIndex + HAS_FOLLOWING_LOOKAHEAD);
    let cursor = startIndex + 1;
    while (cursor < limit) {
        const candidate = lines[cursor];
        if (!isBlank(candidate)) {
            return candidate?.trim();
        }
        cursor += 1;
    }
    return undefined;
}
|
|
75
87
|
/**
 * Rewrites a heading whose only content is an in-page link, e.g.
 * `## [Title](#title)`, to plain `## Title`. Other lines are returned as-is.
 */
function stripAnchorOnlyHeading(line) {
    const anchorOnlyHeading = /^(#{1,6})\s+\[([^\]]+)\]\(#[^)]+\)\s*$/;
    return line.replace(anchorOnlyHeading, '$1 $2');
}
|
|
@@ -191,6 +203,11 @@ function tryPromoteOrphan(lines, i, trimmed) {
|
|
|
191
203
|
const isSpecialPrefix = SPECIAL_PREFIXES.test(trimmed);
|
|
192
204
|
if (!isSpecialPrefix && !hasFollowingContent(lines, i))
|
|
193
205
|
return null;
|
|
206
|
+
if (!isSpecialPrefix) {
|
|
207
|
+
const nextLine = findNextNonBlankLine(lines, i);
|
|
208
|
+
if (nextLine && REGEX.HEADING_MARKER.test(nextLine))
|
|
209
|
+
return null;
|
|
210
|
+
}
|
|
194
211
|
return `${prefix}${trimmed}`;
|
|
195
212
|
}
|
|
196
213
|
function shouldSkipAsToc(lines, i, trimmed, removeToc, options) {
|
|
@@ -205,13 +222,16 @@ function shouldSkipAsToc(lines, i, trimmed, removeToc, options) {
|
|
|
205
222
|
throwIfAborted(options?.signal, options?.url ?? '', 'markdown:cleanup:toc');
|
|
206
223
|
return skipTocLines(lines, i + 1);
|
|
207
224
|
}
|
|
208
|
-
/**
 * Normalizes one line during preprocessing.
 *
 * - An empty heading line yields null.
 * - A line that is not an anchor-only heading is returned unchanged.
 * - An anchor-only heading is reduced to a plain heading when content
 *   follows it or when `options.preserveEmptyHeadings` is truthy; otherwise
 *   null is returned.
 *
 * @returns {string|null} the line to keep, or null.
 */
function normalizePreprocessLine(lines, i, trimmed, line, options) {
    if (REGEX.EMPTY_HEADING_LINE.test(trimmed)) {
        return null;
    }
    const isAnchorOnly = REGEX.ANCHOR_ONLY_HEADING.test(trimmed);
    if (!isAnchorOnly) {
        return line;
    }
    const keepHeading = hasFollowingContent(lines, i) || Boolean(options?.preserveEmptyHeadings);
    return keepHeading ? stripAnchorOnlyHeading(trimmed) : null;
}
|
|
217
237
|
function maybeSkipTocBlock(lines, i, trimmed, options) {
|
|
@@ -235,7 +255,7 @@ function preprocessLines(lines, options) {
|
|
|
235
255
|
if (currentLine === undefined)
|
|
236
256
|
continue;
|
|
237
257
|
const trimmed = currentLine.trim();
|
|
238
|
-
const normalizedLine = normalizePreprocessLine(lines, i, trimmed, currentLine);
|
|
258
|
+
const normalizedLine = normalizePreprocessLine(lines, i, trimmed, currentLine, options);
|
|
239
259
|
if (normalizedLine === null)
|
|
240
260
|
continue;
|
|
241
261
|
const tocSkip = maybeSkipTocBlock(lines, i, trimmed, options);
|
|
@@ -269,21 +289,91 @@ function removeSkipLinks(text) {
|
|
|
269
289
|
/**
 * Tidies single-line inline code spans that contain at least one ASCII
 * letter or digit: leading/trailing whitespace is moved outside the
 * backticks and the inner text is passed through
 * collapseQualifiedIdentifierSpacing(). Spans that need no change, or that
 * contain no alphanumeric character, are returned untouched.
 */
function normalizeInlineCodeTokens(text) {
    const rewriteSpan = (match, inner) => {
        const trimmed = inner.trim();
        if (!/[A-Za-z0-9]/.test(trimmed)) {
            return match;
        }
        const parts = /^(\s*)(.*?)(\s*)$/.exec(inner);
        if (!parts) {
            return match;
        }
        const [, leading = '', body = '', trailing = ''] = parts;
        const normalized = collapseQualifiedIdentifierSpacing(body);
        const unchanged = trimmed === inner && normalized === inner;
        return unchanged ? match : `${leading}\`${normalized}\`${trailing}`;
    };
    return text.replace(/`([^`\n]+)`/g, rewriteSpan);
}
|
|
303
|
+
/**
 * Removes whitespace that follows the dot in `identifier. next` sequences
 * (where the next character is a letter, '_', '$' or '<'), repeating until
 * the text stops changing or PROPERTY_FIX_MAX_PASSES passes have run.
 */
function collapseQualifiedIdentifierSpacing(text) {
    let current = text;
    let passes = 0;
    while (passes < PROPERTY_FIX_MAX_PASSES) {
        const collapsed = current.replace(/\b([A-Za-z_$][\w$]*)\.\s+(?=[A-Za-z_$<])/g, '$1.');
        if (collapsed === current) {
            return current;
        }
        current = collapsed;
        passes += 1;
    }
    return current;
}
|
|
313
|
+
/**
 * Normalizes a link label: un-escapes \`, \< and \>, collapses spacing in
 * qualified identifiers, then escapes every '<' and '>' again.
 */
function normalizeMarkdownLinkText(text) {
    const unescaped = text
        .replace(/\\`/g, '`')
        .replace(/\\</g, '<')
        .replace(/\\>/g, '>');
    const collapsed = collapseQualifiedIdentifierSpacing(unescaped);
    const openEscaped = collapsed.replace(/</g, '\\<');
    return openEscaped.replace(/>/g, '\\>');
}
|
|
317
|
+
/**
 * Applies normalizeMarkdownLinkText() to the label of every `[label](target)`
 * occurrence in `text`, leaving the target untouched.
 */
function normalizeMarkdownLinkLabels(text) {
    const inlineLink = /\[([^\]]+)\]\(([^)]+)\)/g;
    return text.replace(inlineLink, (_whole, label, target) => {
        const cleanLabel = normalizeMarkdownLinkText(label);
        return `[${cleanLabel}](${target})`;
    });
}
|
|
320
|
+
/**
 * Reduces runs of two or more spaces/tabs that sit directly before or after
 * a single-line inline code span to one space.
 */
function collapseInlineCodePadding(text) {
    const paddingBeforeCode = /(\S)[ \t]{2,}(?=`[^`\n]+`)/g;
    const paddingAfterCode = /(`[^`\n]+`)[ \t]{2,}(?=\S)/g;
    const leadingFixed = text.replace(paddingBeforeCode, '$1 ');
    return leadingFixed.replace(paddingAfterCode, '$1 ');
}
|
|
325
|
+
/**
 * Backslash-escapes the angle brackets of simple tags (`<name>` and
 * `</name>`) on lines that start and end with '|', skipping lines that begin
 * like a table delimiter row.
 */
function escapeAngleBracketsInMarkdownTables(text) {
    const escapeTags = (row) => {
        const closingEscaped = row.replace(/<\/([A-Za-z][A-Za-z0-9-]*)>/g, '\\</$1\\>');
        return closingEscaped.replace(/<([A-Za-z][A-Za-z0-9-]*)>/g, '\\<$1\\>');
    };
    return text.replace(/^(?!\|\s*[-: ]+\|)(\|.*\|)\s*$/gm, (row) => escapeTags(row));
}
|
|
330
|
+
/**
 * Cleans heading lines in three steps: drops a trailing permalink matched by
 * REGEX.HEADING_TRAILING_PERMALINK, collapses multiple whitespace characters
 * after the '#' markers to one space, and trims trailing spaces/tabs.
 */
function stripTrailingHeadingPermalinks(text) {
    const withoutPermalinks = text.replace(REGEX.HEADING_TRAILING_PERMALINK, '$1');
    const singleSpaced = withoutPermalinks.replace(/^(#{1,6})\s{2,}/gm, '$1 ');
    return singleSpaced.replace(/^(#{1,6}\s+.*?)[ \t]+$/gm, '$1');
}
|
|
336
|
+
/**
 * Parses an ATX heading marker from the trimmed line.
 *
 * @returns {{level: number}|null} the number of '#' characters (1-6) when the
 *   trimmed line starts with them followed by whitespace, otherwise null.
 */
function getHeadingInfo(line) {
    const marker = /^(#{1,6})\s+/.exec(line.trim());
    return marker ? { level: marker[1]?.length ?? 0 } : null;
}
|
|
342
|
+
/**
 * Drops headings that have no content of their own, i.e. headings whose next
 * non-blank line is another heading of the same or a higher level (same or
 * fewer '#' characters). A heading followed by a deeper heading, by any other
 * content, or by nothing at all is kept.
 *
 * Lines inside fenced code blocks (``` or ~~~) are passed through untouched.
 * Without this, '#'-prefixed lines in code samples (shell or Python comments,
 * for instance) would be mistaken for headings and silently deleted whenever
 * two of them appear in a row.
 *
 * @param {string} text Markdown text.
 * @returns {string} the text without empty heading sections, after
 *   REGEX.DOUBLE_NEWLINE_REDUCER has been applied to the joined result.
 */
function removeEmptyHeadingSections(text) {
    const lines = text.split('\n');
    const kept = [];
    // Non-null while inside a fenced code block: the fence character and the
    // length of the opening run, both needed to recognise the closing fence.
    let openFence = null;
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i] ?? '';
        const fence = /^ {0,3}(`{3,}|~{3,})(.*)$/.exec(line);
        if (openFence !== null) {
            kept.push(line);
            const closesFence = fence !== null &&
                fence[1][0] === openFence.char &&
                fence[1].length >= openFence.length &&
                fence[2].trim() === '';
            if (closesFence) {
                openFence = null;
            }
            continue;
        }
        if (fence !== null) {
            openFence = { char: fence[1][0], length: fence[1].length };
            kept.push(line);
            continue;
        }
        const heading = getHeadingInfo(line);
        if (!heading) {
            kept.push(line);
            continue;
        }
        let nextIndex = i + 1;
        while (nextIndex < lines.length && isBlank(lines[nextIndex])) {
            nextIndex += 1;
        }
        const nextLine = lines[nextIndex];
        if (nextLine === undefined) {
            // Trailing heading: nothing follows, keep it.
            kept.push(line);
            continue;
        }
        const nextHeading = getHeadingInfo(nextLine);
        if (nextHeading && nextHeading.level <= heading.level) {
            // Immediately superseded by a sibling or parent heading: drop it.
            continue;
        }
        kept.push(line);
    }
    return kept.join('\n').replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
}
|
|
282
369
|
function normalizeMarkdownSpacing(text) {
|
|
283
370
|
let result = text
|
|
284
|
-
.replace(REGEX.SPACING_LINK_FIX, ']($1)
|
|
371
|
+
.replace(REGEX.SPACING_LINK_FIX, ']($1) [')
|
|
285
372
|
.replace(REGEX.SPACING_ADJ_COMBINED, '$& ')
|
|
373
|
+
.replace(REGEX.SPACING_CODE_PAD_BEFORE, '$1 ')
|
|
374
|
+
.replace(REGEX.SPACING_CODE_PAD_AFTER, '$1 ')
|
|
286
375
|
.replace(REGEX.SPACING_CODE_DASH, '$1 - ')
|
|
376
|
+
.replace(REGEX.SPACING_ESCAPED_DASH, ' - ')
|
|
287
377
|
.replace(REGEX.SPACING_ESCAPES, '$1')
|
|
288
378
|
.replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
|
|
289
379
|
.replace(REGEX.PUNCT_ONLY_LIST_ARTIFACT, '')
|
|
@@ -292,9 +382,9 @@ function normalizeMarkdownSpacing(text) {
|
|
|
292
382
|
result = result.replace(/([.!?:;])([A-Z])/g, '$1 $2');
|
|
293
383
|
// Trim whitespace around token-like inline code spans.
|
|
294
384
|
result = normalizeInlineCodeTokens(result);
|
|
295
|
-
|
|
296
|
-
result = result
|
|
297
|
-
result = result
|
|
385
|
+
result = collapseInlineCodePadding(result);
|
|
386
|
+
result = normalizeMarkdownLinkLabels(result);
|
|
387
|
+
result = escapeAngleBracketsInMarkdownTables(result);
|
|
298
388
|
return normalizeNestedListIndentation(result);
|
|
299
389
|
}
|
|
300
390
|
function fixConcatenatedProperties(text) {
|
|
@@ -325,7 +415,8 @@ function applyGlobalRegexes(text, options) {
|
|
|
325
415
|
checkAbort('markdown:cleanup:spacing');
|
|
326
416
|
result = normalizeMarkdownSpacing(result);
|
|
327
417
|
checkAbort('markdown:cleanup:properties');
|
|
328
|
-
|
|
418
|
+
result = fixConcatenatedProperties(result);
|
|
419
|
+
return stripTrailingHeadingPermalinks(result);
|
|
329
420
|
}
|
|
330
421
|
function normalizeNestedListIndentation(text) {
|
|
331
422
|
return text.replace(REGEX.NESTED_LIST_INDENT, (match, spaces, marker) => {
|
|
@@ -386,6 +477,10 @@ export function cleanupMarkdownArtifacts(content, options) {
|
|
|
386
477
|
if (!content)
|
|
387
478
|
return '';
|
|
388
479
|
throwIfAborted(options?.signal, options?.url ?? '', 'markdown:cleanup:begin');
|
|
389
|
-
|
|
480
|
+
let result = processFencedContent(content, (text) => processTextBuffer(text.split('\n'), options)).trim();
|
|
481
|
+
if (!options?.preserveEmptyHeadings) {
|
|
482
|
+
throwIfAborted(options?.signal, options?.url ?? '', 'markdown:cleanup:empty-headings');
|
|
483
|
+
result = removeEmptyHeadingSections(result);
|
|
484
|
+
}
|
|
390
485
|
return stripLeadingBreadcrumbNoise(result);
|
|
391
486
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAgDA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AAqJD,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAwYD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;
|
|
1
|
+
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAgDA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AAqJD,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAwYD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AAgLD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAuJD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAiED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AA4DD,iBAAS,eAAe,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAc/D;AAED,iBAAS,kBAAkB,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAyBlE;AA6CD,iBAAS,yBAAyB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAcvD;AAED,eAAO,MAAM,mBAAmB;;;;CAItB,CAAC;AA8xBX,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAqCzB;AAaD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAkH1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAA
uB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
|
|
@@ -553,7 +553,9 @@ function translateHtmlToMarkdown(params) {
|
|
|
553
553
|
throwIfAborted(signal, url, 'markdown:cleaned');
|
|
554
554
|
const content = stageTracker.run(url, 'markdown:translate', () => translateHtmlFragmentToMarkdown(cleanedHtml));
|
|
555
555
|
throwIfAborted(signal, url, 'markdown:translated');
|
|
556
|
-
const cleaned = cleanupMarkdownArtifacts(content, signal
|
|
556
|
+
const cleaned = cleanupMarkdownArtifacts(content, signal
|
|
557
|
+
? { preserveEmptyHeadings: true, signal, url }
|
|
558
|
+
: { preserveEmptyHeadings: true, url });
|
|
557
559
|
return url ? resolveRelativeUrls(cleaned, url, signal) : cleaned;
|
|
558
560
|
}
|
|
559
561
|
function appendMetadataFooter(content, metadata, url) {
|
|
@@ -815,6 +817,22 @@ const PRIMARY_HEADING_ROOT_SELECTORS = [
|
|
|
815
817
|
'.entry-content',
|
|
816
818
|
'[itemprop="text"]',
|
|
817
819
|
];
|
|
820
|
+
/**
 * Produces a comparison key for a title: nullish becomes '', whitespace runs
 * collapse to one space, the ends are trimmed and the result is lower-cased.
 */
function normalizeSyntheticTitleToken(value) {
    const text = value ?? '';
    const collapsed = text.replace(/\s+/g, ' ');
    return collapsed.trim().toLowerCase();
}
|
|
823
|
+
/**
 * Decides whether the page's primary heading should be used as the title.
 *
 * Returns false when the normalized heading is empty. Returns true when the
 * normalized title is empty, equals the heading, or contains the heading as
 * one of its parts after splitting on '-', '|', ':', '•', '·', '–' or '—'.
 */
function shouldPreferPrimaryHeadingTitle(primaryHeading, title) {
    const heading = normalizeSyntheticTitleToken(primaryHeading);
    if (heading === '') {
        return false;
    }
    const pageTitle = normalizeSyntheticTitleToken(title);
    if (pageTitle === '' || pageTitle === heading) {
        return true;
    }
    const titleParts = pageTitle.split(/\s*(?:[-|:•·]|–|—)\s*/u);
    return titleParts.includes(heading);
}
|
|
818
836
|
function findContentRoot(document) {
|
|
819
837
|
for (const selector of CONTENT_ROOT_SELECTORS) {
|
|
820
838
|
const element = document.querySelector(selector);
|
|
@@ -829,17 +847,31 @@ function findContentRoot(document) {
|
|
|
829
847
|
return undefined;
|
|
830
848
|
}
|
|
831
849
|
/**
 * Finds the text of the page's primary heading.
 *
 * First looks document-wide for `[data-title="true"]`, then `h1`. If neither
 * yields non-empty trimmed text, each root matched by
 * PRIMARY_HEADING_ROOT_SELECTORS is searched in order for
 * `[data-title="true"]`, `h1`, then `h2`.
 *
 * @returns {string|undefined} the first non-empty trimmed heading text.
 */
function findPrimaryHeading(document) {
    // Returns the first non-empty trimmed text among the selectors, in order.
    const firstHeadingText = (scope, selectors) => {
        for (const selector of selectors) {
            const heading = scope.querySelector(selector);
            if (!heading) {
                continue;
            }
            const text = heading.textContent.trim();
            if (text) {
                return text;
            }
        }
        return undefined;
    };
    const documentLevel = firstHeadingText(document, ['[data-title="true"]', 'h1']);
    if (documentLevel !== undefined) {
        return documentLevel;
    }
    for (const rootSelector of PRIMARY_HEADING_ROOT_SELECTORS) {
        const root = document.querySelector(rootSelector);
        if (!root) {
            continue;
        }
        const scoped = firstHeadingText(root, ['[data-title="true"]', 'h1', 'h2']);
        if (scoped !== undefined) {
            return scoped;
        }
    }
    return undefined;
}
|
|
845
877
|
function countMatchingElements(root, selector) {
|
|
@@ -971,39 +1003,56 @@ function shouldUseArticleContent(article, document) {
|
|
|
971
1003
|
function buildContentSource(params) {
|
|
972
1004
|
const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, truncated, signal, } = params;
|
|
973
1005
|
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
1006
|
+
const preparedDocument = document;
|
|
1007
|
+
let primaryHeading = document
|
|
1008
|
+
? TransformHeuristics.findPrimaryHeading(document)
|
|
1009
|
+
: undefined;
|
|
1010
|
+
if (preparedDocument) {
|
|
1011
|
+
prepareDocumentForMarkdown(preparedDocument, url, signal);
|
|
1012
|
+
primaryHeading =
|
|
1013
|
+
TransformHeuristics.findPrimaryHeading(preparedDocument) ??
|
|
1014
|
+
primaryHeading;
|
|
1015
|
+
}
|
|
974
1016
|
const base = {
|
|
975
1017
|
favicon: extractedMeta.favicon,
|
|
976
1018
|
metadata,
|
|
977
1019
|
extractedMetadata: extractedMeta,
|
|
978
1020
|
truncated,
|
|
979
|
-
primaryHeading: document
|
|
980
|
-
? TransformHeuristics.findPrimaryHeading(document)
|
|
981
|
-
: undefined,
|
|
1021
|
+
primaryHeading,
|
|
982
1022
|
originalHtml: html,
|
|
983
1023
|
};
|
|
984
1024
|
if (useArticleContent && article) {
|
|
985
1025
|
const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
|
|
986
1026
|
prepareDocumentForMarkdown(articleDoc, url, signal);
|
|
987
|
-
const
|
|
1027
|
+
const articleTitle = article.title !== undefined
|
|
1028
|
+
? normalizeDocumentTitle(article.title, url)
|
|
1029
|
+
: extractedMeta.title;
|
|
1030
|
+
const preferPrimaryHeading = TransformHeuristics.isGithubRepositoryRootUrl(url) ||
|
|
1031
|
+
shouldPreferPrimaryHeadingTitle(base.primaryHeading, articleTitle);
|
|
1032
|
+
const resolvedTitle = (preferPrimaryHeading ? base.primaryHeading : undefined) ?? articleTitle;
|
|
988
1033
|
return {
|
|
989
1034
|
...base,
|
|
990
1035
|
sourceHtml: articleDoc.body.innerHTML,
|
|
991
|
-
title:
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
: undefined),
|
|
1036
|
+
title: resolvedTitle,
|
|
1037
|
+
suppressSyntheticFavicon: normalizeSyntheticTitleToken(resolvedTitle) ===
|
|
1038
|
+
normalizeSyntheticTitleToken(base.primaryHeading),
|
|
995
1039
|
skipNoiseRemoval: true,
|
|
996
1040
|
};
|
|
997
1041
|
}
|
|
998
1042
|
if (document) {
|
|
999
|
-
|
|
1000
|
-
const contentRoot = TransformHeuristics.findContentRoot(document);
|
|
1043
|
+
const resolvedDocument = preparedDocument ?? document;
|
|
1044
|
+
const contentRoot = TransformHeuristics.findContentRoot(resolvedDocument);
|
|
1045
|
+
const preferPrimaryHeading = shouldPreferPrimaryHeadingTitle(base.primaryHeading, extractedMeta.title);
|
|
1046
|
+
const resolvedTitle = (preferPrimaryHeading ? base.primaryHeading : undefined) ??
|
|
1047
|
+
extractedMeta.title;
|
|
1001
1048
|
return {
|
|
1002
1049
|
...base,
|
|
1003
|
-
sourceHtml: contentRoot ?? serializeDocumentForMarkdown(document, html),
|
|
1004
|
-
title:
|
|
1050
|
+
sourceHtml: contentRoot ?? serializeDocumentForMarkdown(resolvedDocument, html),
|
|
1051
|
+
title: resolvedTitle,
|
|
1052
|
+
suppressSyntheticFavicon: normalizeSyntheticTitleToken(resolvedTitle) ===
|
|
1053
|
+
normalizeSyntheticTitleToken(base.primaryHeading),
|
|
1005
1054
|
skipNoiseRemoval: true,
|
|
1006
|
-
document,
|
|
1055
|
+
document: resolvedDocument,
|
|
1007
1056
|
};
|
|
1008
1057
|
}
|
|
1009
1058
|
return {
|
|
@@ -1326,8 +1375,8 @@ function maybeStripGithubPrimaryHeading(markdown, context, url) {
|
|
|
1326
1375
|
return markdown;
|
|
1327
1376
|
return stripLeadingHeading(markdown, context.primaryHeading ?? '');
|
|
1328
1377
|
}
|
|
1329
|
-
function buildSyntheticTitlePrefix(url, favicon) {
|
|
1330
|
-
if (!favicon)
|
|
1378
|
+
function buildSyntheticTitlePrefix(url, favicon, suppressFavicon) {
|
|
1379
|
+
if (!favicon || suppressFavicon)
|
|
1331
1380
|
return ' ';
|
|
1332
1381
|
let alt = '';
|
|
1333
1382
|
try {
|
|
@@ -1342,7 +1391,7 @@ function maybePrependSyntheticTitle(markdown, context, url) {
|
|
|
1342
1391
|
if (!context.title || /^(#{1,6})\s/.test(markdown.trimStart())) {
|
|
1343
1392
|
return markdown;
|
|
1344
1393
|
}
|
|
1345
|
-
return `#${buildSyntheticTitlePrefix(url, context.favicon)}${context.title}\n\n${markdown}`;
|
|
1394
|
+
return `#${buildSyntheticTitlePrefix(url, context.favicon, context.suppressSyntheticFavicon)}${context.title}\n\n${markdown}`;
|
|
1346
1395
|
}
|
|
1347
1396
|
function buildMarkdownFromContext(context, url, signal) {
|
|
1348
1397
|
let content = stageTracker.run(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@j0hanz/fetch-url-mcp",
|
|
3
|
-
"version": "1.9.4",
|
|
3
|
+
"version": "1.9.5",
|
|
4
4
|
"mcpName": "io.github.j0hanz/fetch-url-mcp",
|
|
5
5
|
"description": "A web content fetcher MCP server that converts HTML to clean, AI and human readable markdown.",
|
|
6
6
|
"type": "module",
|
|
@@ -75,7 +75,7 @@
|
|
|
75
75
|
"linkedom": "^0.18.12",
|
|
76
76
|
"node-html-markdown": "^2.0.0",
|
|
77
77
|
"ts-morph": "^27.0.2",
|
|
78
|
-
"undici": "^7.24.
|
|
78
|
+
"undici": "^7.24.3",
|
|
79
79
|
"zod": "^4.3.6"
|
|
80
80
|
},
|
|
81
81
|
"devDependencies": {
|