@j0hanz/fetch-url-mcp 1.9.0 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/content.d.ts.map +1 -1
- package/dist/lib/content.js +82 -1
- package/dist/transform/html-translators.d.ts.map +1 -1
- package/dist/transform/html-translators.js +1 -23
- package/dist/transform/metadata.d.ts +1 -0
- package/dist/transform/metadata.d.ts.map +1 -1
- package/dist/transform/metadata.js +25 -0
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +85 -5
- package/package.json +2 -2
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;
|
|
1
|
+
{"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAiiB3D,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAQR;AAuCD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAiBN;AA0BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR;AAkVD,wBAAgB,6BAA6B,CAC3C,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,MAAM,GAAG,SAAS,CAKpB;AACD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CA6BvE;AA+CD,UAAU,cAAc;IACtB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAyRD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,CA6DR;AA2GD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CAOpB;AACD,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAmCxE;AAcD,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAczD;AAaD,wBAAgB,mBAAmB,CACjC,QAAQ,CAAC,EAAE,aAAa,EACxB,WAAW,CAAC,EAAE,MAAM,GACnB,MAAM,CAmBR"}
|
package/dist/lib/content.js
CHANGED
|
@@ -194,8 +194,9 @@ function getContext() {
|
|
|
194
194
|
function isInteractive(element, role) {
|
|
195
195
|
if (role && INTERACTIVE_CONTENT_ROLES.has(role))
|
|
196
196
|
return true;
|
|
197
|
+
const tag = element.tagName.toLowerCase();
|
|
197
198
|
const ds = element.getAttribute('data-state');
|
|
198
|
-
if (ds === 'inactive' || ds === 'closed')
|
|
199
|
+
if ((ds === 'inactive' || ds === 'closed') && !BASE_STRUCTURAL_TAGS.has(tag))
|
|
199
200
|
return true;
|
|
200
201
|
const dataOrientation = element.getAttribute('data-orientation');
|
|
201
202
|
if (dataOrientation === 'horizontal' || dataOrientation === 'vertical')
|
|
@@ -215,6 +216,19 @@ function isWithinPrimaryContent(element) {
|
|
|
215
216
|
}
|
|
216
217
|
return false;
|
|
217
218
|
}
|
|
219
|
+
/** Minimum number of links per 100 characters of text for an aside to count as navigation. */
const ASIDE_NAV_LINK_DENSITY_THRESHOLD = 0.5;
/** Asides with fewer `a[href]` links than this are never classified by link density. */
const ASIDE_NAV_MIN_LINKS = 10;
/**
 * Decides whether an `<aside>` looks like navigation rather than content.
 *
 * An aside is treated as navigation when it contains a `<nav>` descendant, or
 * when it holds at least `ASIDE_NAV_MIN_LINKS` links and either has no text at
 * all or reaches the link-density threshold.
 *
 * @param {Element} element - The aside element to inspect.
 * @returns {boolean} `true` when the aside looks like navigation.
 */
function isNavigationAside(element) {
    const containsNav = Boolean(element.querySelector('nav'));
    if (containsNav) {
        return true;
    }
    const linkCount = element.querySelectorAll('a[href]').length;
    if (linkCount < ASIDE_NAV_MIN_LINKS) {
        return false;
    }
    const trimmedText = (element.textContent || '').trim();
    if (trimmedText.length === 0) {
        // Many links and no text at all: nothing but navigation.
        return true;
    }
    const linksPerHundredChars = linkCount / (trimmedText.length / 100);
    return linksPerHundredChars >= ASIDE_NAV_LINK_DENSITY_THRESHOLD;
}
|
|
218
232
|
function shouldPreserve(element, tagName) {
|
|
219
233
|
// Check Dialog
|
|
220
234
|
const role = element.getAttribute('role');
|
|
@@ -233,6 +247,12 @@ function shouldPreserve(element, tagName) {
|
|
|
233
247
|
return ((element.textContent || '').trim().length >=
|
|
234
248
|
NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION);
|
|
235
249
|
}
|
|
250
|
+
// Check Aside — preserve only if it looks like article content, not navigation
|
|
251
|
+
if (tagName === 'aside') {
|
|
252
|
+
if (!isWithinPrimaryContent(element))
|
|
253
|
+
return false;
|
|
254
|
+
return !isNavigationAside(element);
|
|
255
|
+
}
|
|
236
256
|
return false;
|
|
237
257
|
}
|
|
238
258
|
function removeNodes(nodes) {
|
|
@@ -254,6 +274,10 @@ function scoreNavFooter(meta, weights) {
|
|
|
254
274
|
score += weights.structural;
|
|
255
275
|
}
|
|
256
276
|
}
|
|
277
|
+
// Aside (sidebar/complementary) — noise unless inside primary content
|
|
278
|
+
if (meta.tagName === 'aside') {
|
|
279
|
+
score += weights.structural;
|
|
280
|
+
}
|
|
257
281
|
// Role Noise
|
|
258
282
|
if (meta.role && NAVIGATION_ROLES.has(meta.role)) {
|
|
259
283
|
if (meta.tagName !== 'aside' || meta.role !== 'complementary') {
|
|
@@ -469,6 +493,29 @@ function mayContainNoise(html) {
|
|
|
469
493
|
: `${html.substring(0, NOISE_SCAN_LIMIT)}\n${html.substring(html.length - NOISE_SCAN_LIMIT)}`;
|
|
470
494
|
return NOISE_PATTERNS.some((re) => re.test(sample));
|
|
471
495
|
}
|
|
496
|
+
/**
 * Removes every `<button role="tab">` from the document.
 *
 * Elements are removed from the end of the match list towards the start.
 *
 * @param {Document} document - Document to mutate in place.
 * @returns {void}
 */
function stripTabTriggers(document) {
    const triggers = document.querySelectorAll('button[role="tab"]');
    let remaining = triggers.length;
    while (remaining > 0) {
        remaining -= 1;
        triggers[remaining]?.remove();
    }
}
|
|
502
|
+
/**
 * Backslash-escapes `|` characters inside `<code>` elements that sit within
 * table cells (`td` / `th`).
 *
 * The element's text is rewritten through `textContent`, so only `<code>`
 * elements that actually contain a pipe are touched.
 *
 * @param {Document} document - Document to mutate in place.
 * @returns {void}
 */
function escapeTableCellPipes(document) {
    document.querySelectorAll('td code, th code').forEach((codeElement) => {
        const rawText = codeElement.textContent;
        if (!rawText.includes('|')) {
            return;
        }
        codeElement.textContent = rawText.replace(/\|/g, '\\|');
    });
}
|
|
510
|
+
/**
 * Inserts a single space after every badge-like element whose next sibling is
 * another element, so the two do not run together as text.
 *
 * Badges are matched by `span.chakra-badge`, `[data-scope="badge"]` and any
 * class attribute containing `badge`.
 *
 * @param {Document} document - Document to mutate in place.
 * @returns {void}
 */
function separateAdjacentInlineElements(document) {
    const ELEMENT_NODE = 1;
    const badgeSelector = 'span.chakra-badge, [data-scope="badge"], [class*="badge"]';
    document.querySelectorAll(badgeSelector).forEach((badge) => {
        const sibling = badge.nextSibling;
        if (sibling == null || sibling.nodeType !== ELEMENT_NODE) {
            return;
        }
        badge.after(document.createTextNode(' '));
    });
}
|
|
472
519
|
export function prepareDocumentForMarkdown(document, baseUrl, signal) {
|
|
473
520
|
const context = getContext();
|
|
474
521
|
if (config.noiseRemoval.debug) {
|
|
@@ -477,9 +524,37 @@ export function prepareDocumentForMarkdown(document, baseUrl, signal) {
|
|
|
477
524
|
});
|
|
478
525
|
}
|
|
479
526
|
stripNoise(document, context, signal);
|
|
527
|
+
stripTabTriggers(document);
|
|
528
|
+
separateAdjacentInlineElements(document);
|
|
529
|
+
flattenTableCellBreaks(document);
|
|
530
|
+
escapeTableCellPipes(document);
|
|
531
|
+
normalizeTableStructure(document);
|
|
480
532
|
if (baseUrl)
|
|
481
533
|
resolveUrls(document, baseUrl);
|
|
482
534
|
}
|
|
535
|
+
/**
 * Hoists `tbody` / `thead` / `tfoot` elements that were wrongly placed inside a
 * `td` / `th` up to the owning `<table>`.
 *
 * Some sites put tbody/thead/tfoot inside td/th, which breaks markdown tables.
 * Only sections whose nearest `<table>` ancestor is the table being processed
 * are moved: a section that belongs to a well-formed table nested inside a
 * cell is left alone, otherwise the nested table would lose its rows to the
 * outer one.
 *
 * @param {Document} document - Document to mutate in place.
 * @returns {void}
 */
function normalizeTableStructure(document) {
    const sectionTags = ['tbody', 'thead', 'tfoot'];
    for (const table of document.querySelectorAll('table')) {
        for (const cell of table.querySelectorAll('th, td')) {
            for (const tag of sectionTags) {
                // Snapshot first: appendChild below moves nodes out of the cell.
                for (const section of [...cell.querySelectorAll(tag)]) {
                    if (section.closest('table') === table) {
                        table.appendChild(section);
                    }
                }
            }
        }
    }
}
|
|
549
|
+
/**
 * Replaces every `<br>` found inside a table cell (`td` / `th`) with a single
 * space.
 *
 * @param {Document} document - Document to mutate in place.
 * @returns {void}
 */
function flattenTableCellBreaks(document) {
    document.querySelectorAll('td, th').forEach((cell) => {
        cell.querySelectorAll('br').forEach((lineBreak) => {
            lineBreak.replaceWith(' ');
        });
    });
}
|
|
483
558
|
export function removeNoiseFromHtml(html, document, baseUrl, signal) {
|
|
484
559
|
const shouldParse = isFullDocumentHtml(html) ||
|
|
485
560
|
mayContainNoise(html) ||
|
|
@@ -1076,6 +1151,8 @@ function processTextBuffer(lines, options) {
|
|
|
1076
1151
|
function applyGlobalRegexes(text, options) {
|
|
1077
1152
|
let result = text;
|
|
1078
1153
|
const checkAbort = createAbortChecker(options);
|
|
1154
|
+
// Normalize non-breaking spaces to regular spaces
|
|
1155
|
+
result = result.replace(/\u00A0/g, ' ');
|
|
1079
1156
|
checkAbort('markdown:cleanup:headings');
|
|
1080
1157
|
// fixAndSpaceHeadings
|
|
1081
1158
|
result = result
|
|
@@ -1105,6 +1182,10 @@ function applyGlobalRegexes(text, options) {
|
|
|
1105
1182
|
.replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
|
|
1106
1183
|
.replace(REGEX.PUNCT_ONLY_LIST_ARTIFACT, '')
|
|
1107
1184
|
.replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
|
|
1185
|
+
// Trim leading whitespace inside inline code spans
|
|
1186
|
+
result = result.replace(/(?<=\s|^)`\s+([^`]+)`/gm, '`$1`');
|
|
1187
|
+
// Unescape backticks inside markdown link text
|
|
1188
|
+
result = result.replace(/\[([^\]]*\\`[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/\\`/g, '`')}](${url})`);
|
|
1108
1189
|
result = normalizeNestedListIndentation(result);
|
|
1109
1190
|
checkAbort('markdown:cleanup:properties');
|
|
1110
1191
|
// fixProperties
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AA4fA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
|
|
@@ -346,14 +346,8 @@ function buildSpanTranslator(ctx) {
|
|
|
346
346
|
return {};
|
|
347
347
|
}
|
|
348
348
|
// ---------------------------------------------------------------------------
|
|
349
|
-
//
|
|
349
|
+
// DL helpers
|
|
350
350
|
// ---------------------------------------------------------------------------
|
|
351
|
-
function hasComplexTableLayout(node) {
|
|
352
|
-
if (!isLikeNode(node))
|
|
353
|
-
return false;
|
|
354
|
-
const innerHTML = typeof node.innerHTML === 'string' ? node.innerHTML : '';
|
|
355
|
-
return /(?:colspan|rowspan)=["']?[2-9]/i.test(innerHTML);
|
|
356
|
-
}
|
|
357
351
|
function resolveDlNodeName(child) {
|
|
358
352
|
if (!isLikeNode(child))
|
|
359
353
|
return '';
|
|
@@ -381,22 +375,6 @@ function createCustomTranslators() {
|
|
|
381
375
|
return {
|
|
382
376
|
code: (ctx) => buildCodeTranslator(ctx),
|
|
383
377
|
img: (ctx) => buildImageTranslator(ctx),
|
|
384
|
-
table: (ctx) => {
|
|
385
|
-
if (!isObject(ctx))
|
|
386
|
-
return {};
|
|
387
|
-
const { node } = ctx;
|
|
388
|
-
if (hasComplexTableLayout(node)) {
|
|
389
|
-
return {
|
|
390
|
-
postprocess: ({ content }) => {
|
|
391
|
-
const trimmed = content.trim();
|
|
392
|
-
if (!trimmed)
|
|
393
|
-
return '';
|
|
394
|
-
return `\n\n${trimmed}\n\n`;
|
|
395
|
-
},
|
|
396
|
-
};
|
|
397
|
-
}
|
|
398
|
-
return {};
|
|
399
|
-
},
|
|
400
378
|
dl: (ctx) => {
|
|
401
379
|
if (!isObject(ctx))
|
|
402
380
|
return { content: '' };
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { ExtractedMetadata } from './types.js';
|
|
2
|
+
export declare function normalizeDocumentTitle(title: string, baseUrl?: string): string;
|
|
2
3
|
export declare function extractMetadata(document: Document, baseUrl?: string): ExtractedMetadata;
|
|
3
4
|
export declare function extractMetadataFromHead(html: string, baseUrl?: string): ExtractedMetadata | null;
|
|
4
5
|
export declare function mergeMetadata(early: ExtractedMetadata | null, late: ExtractedMetadata): ExtractedMetadata;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"metadata.d.ts","sourceRoot":"","sources":["../../src/transform/metadata.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;
|
|
1
|
+
{"version":3,"file":"metadata.d.ts","sourceRoot":"","sources":["../../src/transform/metadata.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAoCpD,wBAAgB,sBAAsB,CACpC,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE,MAAM,GACf,MAAM,CAsBR;AAuID,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,GACf,iBAAiB,CAkBnB;AAED,wBAAgB,uBAAuB,CACrC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,MAAM,GACf,iBAAiB,GAAG,IAAI,CAY1B;AAED,wBAAgB,aAAa,CAC3B,KAAK,EAAE,iBAAiB,GAAG,IAAI,EAC/B,IAAI,EAAE,iBAAiB,GACtB,iBAAiB,CAmBnB"}
|
|
@@ -15,6 +15,28 @@ function extractHeadSection(html) {
|
|
|
15
15
|
return null;
|
|
16
16
|
return html.substring(0, match.index);
|
|
17
17
|
}
|
|
18
|
+
/**
 * Shortens GitHub repository page titles to `owner/repo`.
 *
 * The title is rewritten only when all of the following hold: it starts with
 * `GitHub - `, `baseUrl` parses as a URL on `github.com` / `www.github.com`,
 * and the URL path has exactly two non-empty segments. In every other case the
 * title is returned unchanged.
 *
 * Non-string titles are returned as-is instead of throwing.
 * NOTE(review): callers in this package only guard against `undefined`, so a
 * `null` title from the article extractor would otherwise reach `startsWith`;
 * confirm against the extractor's typings.
 *
 * @param {string} title - Raw document title.
 * @param {string} [baseUrl] - URL the document was fetched from.
 * @returns {string} `owner/repo` for GitHub repository roots, else `title`.
 */
export function normalizeDocumentTitle(title, baseUrl) {
    if (typeof title !== 'string') {
        return title;
    }
    if (!baseUrl || !title.startsWith('GitHub - ')) {
        return title;
    }
    let parsed;
    try {
        parsed = new URL(baseUrl);
    }
    catch {
        // Unparseable base URL: nothing to derive the repository name from.
        return title;
    }
    const hostname = parsed.hostname.toLowerCase();
    if (hostname !== 'github.com' && hostname !== 'www.github.com') {
        return title;
    }
    const segments = parsed.pathname.split('/').filter(Boolean);
    if (segments.length !== 2) {
        return title;
    }
    const [owner, repo] = segments;
    if (!owner || !repo) {
        return title;
    }
    return `${owner}/${repo}`;
}
|
|
18
40
|
const META_PROPERTY_HANDLERS = new Map([
|
|
19
41
|
[
|
|
20
42
|
'og:title',
|
|
@@ -139,6 +161,9 @@ function resolveFaviconUrl(href, baseUrl) {
|
|
|
139
161
|
export function extractMetadata(document, baseUrl) {
|
|
140
162
|
const ctx = buildMetaContext(document);
|
|
141
163
|
const metadata = resolveMetadataFromContext(ctx);
|
|
164
|
+
if (metadata.title) {
|
|
165
|
+
metadata.title = normalizeDocumentTitle(metadata.title, baseUrl);
|
|
166
|
+
}
|
|
142
167
|
if (baseUrl) {
|
|
143
168
|
const icon32 = document.querySelector('link[rel="icon"][sizes="32x32"]');
|
|
144
169
|
const href = icon32?.getAttribute('href');
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAuCA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA4ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AA6UD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AAqPD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAuKD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AA6DD,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAyUD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CA6CzB;AAED,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAkI1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
|
|
@@ -11,7 +11,7 @@ import { createAbortError, throwIfAborted } from '../lib/utils.js';
|
|
|
11
11
|
import { FetchError, getErrorMessage, toError } from '../lib/utils.js';
|
|
12
12
|
import { isObject } from '../lib/utils.js';
|
|
13
13
|
import { translateHtmlFragmentToMarkdown } from './html-translators.js';
|
|
14
|
-
import { extractMetadata, extractMetadataFromHead, mergeMetadata, } from './metadata.js';
|
|
14
|
+
import { extractMetadata, extractMetadataFromHead, mergeMetadata, normalizeDocumentTitle, } from './metadata.js';
|
|
15
15
|
import { getOrCreateWorkerPool, getWorkerPoolStats, shutdownWorkerPool, } from './worker-pool.js';
|
|
16
16
|
function decodeInput(input, encoding) {
|
|
17
17
|
if (typeof input === 'string')
|
|
@@ -270,6 +270,14 @@ function resolveCollapsedTextLengthUpTo(text, max) {
|
|
|
270
270
|
}
|
|
271
271
|
return length;
|
|
272
272
|
}
|
|
273
|
+
/**
 * Rewrites alert-like elements (`[role="alert"]`, `.admonition`, `.callout`)
 * as plain `<blockquote>` elements holding the same child nodes.
 *
 * NOTE(review): presumably done so the article extractor does not discard
 * these elements — confirm against the extractor's role/class heuristics.
 *
 * Matches are processed last-to-first so a nested alert is converted before
 * its ancestor, and children are moved rather than re-parsed from `innerHTML`.
 * Rebuilding the outer element from `innerHTML` would leave a nested alert as
 * an unconverted copy in the document.
 *
 * @param {Document} doc - Document to mutate in place.
 * @returns {void}
 */
function preserveAlertElements(doc) {
    const alerts = [...doc.querySelectorAll('[role="alert"], .admonition, .callout')];
    for (let i = alerts.length - 1; i >= 0; i -= 1) {
        const el = alerts[i];
        const bq = doc.createElement('blockquote');
        // Move (not copy) the children so already-converted descendants are kept.
        while (el.firstChild) {
            bq.appendChild(el.firstChild);
        }
        el.replaceWith(bq);
    }
}
|
|
273
281
|
function extractArticle(document, url, signal) {
|
|
274
282
|
if (!isReadabilityCompatible(document)) {
|
|
275
283
|
logWarn('Document not compatible with Readability');
|
|
@@ -298,6 +306,7 @@ function extractArticle(document, url, signal) {
|
|
|
298
306
|
const readabilityDoc = typeof doc.cloneNode === 'function'
|
|
299
307
|
? doc.cloneNode(true)
|
|
300
308
|
: doc;
|
|
309
|
+
preserveAlertElements(readabilityDoc);
|
|
301
310
|
// F1: Check abort before heavy Readability parse
|
|
302
311
|
abortPolicy.throwIfAborted(signal, url, 'extract:article:parse');
|
|
303
312
|
const reader = new Readability(readabilityDoc, {
|
|
@@ -797,8 +806,9 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
797
806
|
fetchedAt: new Date().toISOString(),
|
|
798
807
|
};
|
|
799
808
|
if (shouldExtractFromArticle && article) {
|
|
800
|
-
if (article.title !== undefined)
|
|
801
|
-
metadata.title = article.title;
|
|
809
|
+
if (article.title !== undefined) {
|
|
810
|
+
metadata.title = normalizeDocumentTitle(article.title, url);
|
|
811
|
+
}
|
|
802
812
|
if (article.byline !== undefined)
|
|
803
813
|
metadata.author = article.byline;
|
|
804
814
|
}
|
|
@@ -828,6 +838,12 @@ const CONTENT_ROOT_SELECTORS = [
|
|
|
828
838
|
'.post-body',
|
|
829
839
|
'.article-body',
|
|
830
840
|
];
|
|
841
|
+
// Selectors tried in order by findPrimaryHeading to locate the container whose
// first h1/h2 is used as the primary heading: every CONTENT_ROOT_SELECTORS
// entry, followed by three additional content-container selectors.
const PRIMARY_HEADING_ROOT_SELECTORS = [
|
|
842
|
+
...CONTENT_ROOT_SELECTORS,
|
|
843
|
+
'.markdown-body',
|
|
844
|
+
'.entry-content',
|
|
845
|
+
'[itemprop="text"]',
|
|
846
|
+
];
|
|
831
847
|
function findContentRoot(document) {
|
|
832
848
|
for (const selector of CONTENT_ROOT_SELECTORS) {
|
|
833
849
|
const element = document.querySelector(selector);
|
|
@@ -841,6 +857,34 @@ function findContentRoot(document) {
|
|
|
841
857
|
}
|
|
842
858
|
return undefined;
|
|
843
859
|
}
|
|
860
|
+
/**
 * Returns the trimmed text of the first `h1`/`h2` found under the first
 * matching root selector that yields a non-empty heading.
 *
 * Each selector in `PRIMARY_HEADING_ROOT_SELECTORS` is tried in order; a
 * selector is skipped when it matches nothing, when its root has no `h1`/`h2`,
 * or when that heading's text is empty after trimming.
 *
 * @param {Document} document - Document to search.
 * @returns {string | undefined} Heading text, or `undefined` when none is found.
 */
function findPrimaryHeading(document) {
    for (const rootSelector of PRIMARY_HEADING_ROOT_SELECTORS) {
        const headingText = document
            .querySelector(rootSelector)
            ?.querySelector('h1, h2')
            ?.textContent.trim();
        if (headingText) {
            return headingText;
        }
    }
    return undefined;
}
|
|
874
|
+
/**
 * First path segments that github.com uses for site sections rather than for
 * user or organization accounts, so `/<segment>/<x>` is not a repository.
 * NOTE(review): deliberately conservative; extend if further false positives
 * show up.
 */
const GITHUB_NON_ACCOUNT_PATH_SEGMENTS = new Set([
    'apps',
    'collections',
    'features',
    'marketplace',
    'orgs',
    'settings',
    'sponsors',
    'topics',
    'users',
]);
/**
 * Tells whether `url` points at the root page of a GitHub repository, i.e.
 * `https://github.com/<owner>/<repo>` (optionally `www.` and a trailing slash).
 *
 * Returns `false` for unparseable URLs, other hosts, paths that do not have
 * exactly two non-empty segments, and two-segment paths whose first segment is
 * a known site section (for example `/topics/javascript`).
 *
 * @param {string} url - URL to classify.
 * @returns {boolean} `true` for a repository root URL.
 */
function isGithubRepositoryRootUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return false;
    }
    const hostname = parsed.hostname.toLowerCase();
    if (hostname !== 'github.com' && hostname !== 'www.github.com') {
        return false;
    }
    const segments = parsed.pathname.split('/').filter(Boolean);
    if (segments.length !== 2) {
        return false;
    }
    return !GITHUB_NON_ACCOUNT_PATH_SEGMENTS.has(segments[0].toLowerCase());
}
|
|
844
888
|
function shouldUseArticleContent(article, originalHtmlOrDocument) {
|
|
845
889
|
const articleLength = article.textContent.length;
|
|
846
890
|
const originalLength = getVisibleTextLength(originalHtmlOrDocument);
|
|
@@ -875,14 +919,19 @@ function buildContentSource(params) {
|
|
|
875
919
|
metadata,
|
|
876
920
|
extractedMetadata: extractedMeta,
|
|
877
921
|
truncated,
|
|
922
|
+
primaryHeading: document ? findPrimaryHeading(document) : undefined,
|
|
878
923
|
};
|
|
879
924
|
if (useArticleContent && article) {
|
|
880
925
|
const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
|
|
881
926
|
prepareDocumentForMarkdown(articleDoc, url, signal);
|
|
927
|
+
const preferPrimaryHeading = isGithubRepositoryRootUrl(url);
|
|
882
928
|
return {
|
|
883
929
|
...base,
|
|
884
930
|
sourceHtml: articleDoc.body.innerHTML,
|
|
885
|
-
title:
|
|
931
|
+
title: (preferPrimaryHeading ? base.primaryHeading : undefined) ??
|
|
932
|
+
(article.title !== undefined
|
|
933
|
+
? normalizeDocumentTitle(article.title, url)
|
|
934
|
+
: undefined),
|
|
886
935
|
skipNoiseRemoval: true,
|
|
887
936
|
};
|
|
888
937
|
}
|
|
@@ -931,7 +980,10 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
931
980
|
...(context.document ? { document: context.document } : {}),
|
|
932
981
|
...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
933
982
|
}));
|
|
934
|
-
if (context.
|
|
983
|
+
if (context.primaryHeading && isGithubRepositoryRootUrl(url)) {
|
|
984
|
+
content = stripLeadingHeading(content, context.primaryHeading);
|
|
985
|
+
}
|
|
986
|
+
if (context.title && !/^(#{1,6})\s/.test(content.trimStart())) {
|
|
935
987
|
const icon = context.favicon;
|
|
936
988
|
let prefix = ' ';
|
|
937
989
|
if (icon) {
|
|
@@ -953,6 +1005,34 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
953
1005
|
metadata: context.extractedMetadata,
|
|
954
1006
|
};
|
|
955
1007
|
}
|
|
1008
|
+
/**
 * Canonical form used to compare heading texts: whitespace runs collapsed to a
 * single space, trimmed, lower-cased.
 *
 * @param {string} value - Heading text.
 * @returns {string} Normalized text.
 */
function normalizeHeadingText(value) {
    return value.replace(/\s+/g, ' ').trim().toLowerCase();
}
/**
 * Removes the first ATX heading (`#` … `######`) from `markdown` when its text
 * equals `headingText`, together with one blank line that follows it.
 *
 * Only the first 12 non-empty lines are scanned. The first heading found
 * decides the outcome: if it does not match, the markdown is returned
 * unchanged. Lines inside fenced code blocks (``` or ~~~) are never treated as
 * headings, and backslash escapes in the markdown heading (e.g. `my\_repo`)
 * are removed before comparing, since `headingText` is plain text.
 *
 * @param {string} markdown - Markdown document.
 * @param {string} headingText - Plain text of the heading to strip.
 * @returns {string} Markdown without the matching leading heading.
 */
function stripLeadingHeading(markdown, headingText) {
    if (!markdown) {
        return markdown;
    }
    const maxNonEmptyLines = 12;
    const lines = markdown.split('\n');
    const target = normalizeHeadingText(headingText);
    let insideFence = false;
    let nonEmptySeen = 0;
    for (let i = 0; i < lines.length && nonEmptySeen < maxNonEmptyLines; i += 1) {
        const trimmed = lines[i].trim();
        if (!trimmed) {
            continue;
        }
        nonEmptySeen += 1;
        if (/^(?:`{3,}|~{3,})/.test(trimmed)) {
            // Fence delimiter: toggle and never interpret it as content.
            insideFence = !insideFence;
            continue;
        }
        if (insideFence) {
            // e.g. a "# comment" line in a shell snippet is not a heading.
            continue;
        }
        const match = /^(#{1,6})\s+(.+?)\s*$/.exec(trimmed);
        if (!match) {
            continue;
        }
        // Drop backslash escapes of ASCII punctuation before comparing.
        const unescaped = (match[2] ?? '').replace(/\\([!-\/:-@[-`{-~])/g, '$1');
        const current = normalizeHeadingText(unescaped);
        if (current !== target) {
            return markdown;
        }
        lines.splice(i, 1);
        // Also drop the blank line that separated the heading from the body.
        if ((lines[i] ?? '').trim() === '') {
            lines.splice(i, 1);
        }
        return lines.join('\n');
    }
    return markdown;
}
|
|
956
1036
|
const REPLACEMENT_CHAR = '\ufffd';
|
|
957
1037
|
const BINARY_INDICATOR_THRESHOLD = 0.1;
|
|
958
1038
|
function hasBinaryIndicators(content) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@j0hanz/fetch-url-mcp",
|
|
3
|
-
"version": "1.9.0",
|
|
3
|
+
"version": "1.9.1",
|
|
4
4
|
"mcpName": "io.github.j0hanz/fetch-url-mcp",
|
|
5
5
|
"description": "A web content fetcher MCP server that converts HTML to clean, AI and human readable markdown.",
|
|
6
6
|
"type": "module",
|
|
@@ -74,7 +74,7 @@
|
|
|
74
74
|
"@mozilla/readability": "^0.6.0",
|
|
75
75
|
"linkedom": "^0.18.12",
|
|
76
76
|
"node-html-markdown": "^2.0.0",
|
|
77
|
-
"undici": "^7.
|
|
77
|
+
"undici": "^7.24.1",
|
|
78
78
|
"zod": "^4.3.6"
|
|
79
79
|
},
|
|
80
80
|
"devDependencies": {
|