@j0hanz/fetch-url-mcp 1.9.0 → 1.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- {"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;AA0gB3D,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAQR;AAWD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAYN;AACD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR;AAkVD,wBAAgB,6BAA6B,CAC3C,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,MAAM,GAAG,SAAS,CAKpB;AACD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CA6BvE;AA+CD,UAAU,cAAc;IACtB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AA4QD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,CA6DR;AA2GD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CAOpB;AACD,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAmCxE;AAcD,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAczD;AAaD,wBAAgB,mBAAmB,CACjC,QAAQ,CAAC,EAAE,aAAa,EACxB,WAAW,CAAC,EAAE,MAAM,GACnB,MAAM,CAmBR"}
1
+ {"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAiiB3D,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAQR;AAuCD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAiBN;AA0BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR;AAkVD,wBAAgB,6BAA6B,CAC3C,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,MAAM,GAAG,SAAS,CAKpB;AACD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CA6BvE;AA+CD,UAAU,cAAc;IACtB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAyRD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,CA6DR;AA2GD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CAOpB;AACD,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAmCxE;AAcD,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAczD;AAaD,wBAAgB,mBAAmB,CACjC,QAAQ,CAAC,EAAE,aAAa,EACxB,WAAW,CAAC,EAAE,MAAM,GACnB,MAAM,CAmBR"}
@@ -194,8 +194,9 @@ function getContext() {
194
194
  function isInteractive(element, role) {
195
195
  if (role && INTERACTIVE_CONTENT_ROLES.has(role))
196
196
  return true;
197
+ const tag = element.tagName.toLowerCase();
197
198
  const ds = element.getAttribute('data-state');
198
- if (ds === 'inactive' || ds === 'closed')
199
+ if ((ds === 'inactive' || ds === 'closed') && !BASE_STRUCTURAL_TAGS.has(tag))
199
200
  return true;
200
201
  const dataOrientation = element.getAttribute('data-orientation');
201
202
  if (dataOrientation === 'horizontal' || dataOrientation === 'vertical')
@@ -215,6 +216,19 @@ function isWithinPrimaryContent(element) {
215
216
  }
216
217
  return false;
217
218
  }
219
// Minimum number of links per 100 characters of text for an <aside> to count as navigation.
const ASIDE_NAV_LINK_DENSITY_THRESHOLD = 0.5;
// Asides with fewer links than this are never classified by link density.
const ASIDE_NAV_MIN_LINKS = 10;
/**
 * Decides whether an element looks like a navigation sidebar.
 *
 * The element counts as navigation when it contains a <nav> descendant, or
 * when it holds at least ASIDE_NAV_MIN_LINKS `a[href]` links and either has
 * no (trimmed) text at all or reaches the links-per-100-characters threshold.
 *
 * @param {Element} element - Element to inspect.
 * @returns {boolean} True when the element looks like navigation.
 */
function isNavigationAside(element) {
    const containsNav = Boolean(element.querySelector('nav'));
    if (containsNav) {
        return true;
    }
    const linkCount = element.querySelectorAll('a[href]').length;
    if (linkCount < ASIDE_NAV_MIN_LINKS) {
        return false;
    }
    const trimmedText = (element.textContent || '').trim();
    if (trimmedText.length === 0) {
        // Links only, no text at all: treat as pure navigation.
        return true;
    }
    const linksPerHundredChars = linkCount / (trimmedText.length / 100);
    return linksPerHundredChars >= ASIDE_NAV_LINK_DENSITY_THRESHOLD;
}
218
232
  function shouldPreserve(element, tagName) {
219
233
  // Check Dialog
220
234
  const role = element.getAttribute('role');
@@ -233,6 +247,12 @@ function shouldPreserve(element, tagName) {
233
247
  return ((element.textContent || '').trim().length >=
234
248
  NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION);
235
249
  }
250
+ // Check Aside — preserve only if it looks like article content, not navigation
251
+ if (tagName === 'aside') {
252
+ if (!isWithinPrimaryContent(element))
253
+ return false;
254
+ return !isNavigationAside(element);
255
+ }
236
256
  return false;
237
257
  }
238
258
  function removeNodes(nodes) {
@@ -254,6 +274,10 @@ function scoreNavFooter(meta, weights) {
254
274
  score += weights.structural;
255
275
  }
256
276
  }
277
+ // Aside (sidebar/complementary) — noise unless inside primary content
278
+ if (meta.tagName === 'aside') {
279
+ score += weights.structural;
280
+ }
257
281
  // Role Noise
258
282
  if (meta.role && NAVIGATION_ROLES.has(meta.role)) {
259
283
  if (meta.tagName !== 'aside' || meta.role !== 'complementary') {
@@ -469,6 +493,29 @@ function mayContainNoise(html) {
469
493
  : `${html.substring(0, NOISE_SCAN_LIMIT)}\n${html.substring(html.length - NOISE_SCAN_LIMIT)}`;
470
494
  return NOISE_PATTERNS.some((re) => re.test(sample));
471
495
  }
496
/**
 * Removes every `button[role="tab"]` element from the document.
 *
 * Matches are removed last-to-first.
 *
 * @param {Document} document - Document to mutate in place.
 * @returns {void}
 */
function stripTabTriggers(document) {
    const triggers = Array.from(document.querySelectorAll('button[role="tab"]'));
    triggers.reverse().forEach((trigger) => {
        trigger?.remove();
    });
}
502
/**
 * Backslash-escapes every `|` in the text of <code> elements that sit inside
 * table cells (`td code, th code`).
 *
 * Only elements whose text actually contains a pipe are rewritten; the
 * rewrite goes through `textContent`.
 *
 * @param {Document} document - Document to mutate in place.
 * @returns {void}
 */
function escapeTableCellPipes(document) {
    for (const codeEl of document.querySelectorAll('td code, th code')) {
        if (!codeEl.textContent.includes('|')) {
            continue;
        }
        codeEl.textContent = codeEl.textContent.split('|').join('\\|');
    }
}
510
/**
 * Inserts a single-space text node after each badge-like element whose next
 * sibling is an element node.
 *
 * Badges are matched by `span.chakra-badge`, `[data-scope="badge"]` and any
 * class attribute containing "badge". Badges followed by a text node, or by
 * nothing, are left untouched.
 *
 * @param {Document} document - Document to mutate in place.
 * @returns {void}
 */
function separateAdjacentInlineElements(document) {
    const ELEMENT_NODE = 1;
    const badgeSelector = 'span.chakra-badge, [data-scope="badge"], [class*="badge"]';
    for (const badgeEl of document.querySelectorAll(badgeSelector)) {
        const follower = badgeEl.nextSibling;
        if (follower?.nodeType !== ELEMENT_NODE) {
            continue;
        }
        badgeEl.after(document.createTextNode(' '));
    }
}
472
519
  export function prepareDocumentForMarkdown(document, baseUrl, signal) {
473
520
  const context = getContext();
474
521
  if (config.noiseRemoval.debug) {
@@ -477,9 +524,37 @@ export function prepareDocumentForMarkdown(document, baseUrl, signal) {
477
524
  });
478
525
  }
479
526
  stripNoise(document, context, signal);
527
+ stripTabTriggers(document);
528
+ separateAdjacentInlineElements(document);
529
+ flattenTableCellBreaks(document);
530
+ escapeTableCellPipes(document);
531
+ normalizeTableStructure(document);
480
532
  if (baseUrl)
481
533
  resolveUrls(document, baseUrl);
482
534
  }
535
/**
 * Returns the first `tag` section inside `cell` that belongs to `table`
 * itself, skipping sections owned by a table nested inside the cell.
 *
 * @param {Element} cell - Table cell to search.
 * @param {string} tag - Section tag name ('tbody', 'thead' or 'tfoot').
 * @param {Element} table - Table currently being normalized.
 * @returns {Element | undefined} The misplaced section, if any.
 */
function findMisplacedTableSection(cell, tag, table) {
    for (const section of cell.querySelectorAll(tag)) {
        if (section.closest('table') === table) {
            return section;
        }
    }
    return undefined;
}
/**
 * Some sites put tbody/thead/tfoot inside td/th, which breaks markdown tables.
 * Such misplaced sections are moved to the end of their owning table.
 *
 * Sections and cells that belong to a nested <table> are not hoisted into the
 * outer table: a nested table's own tbody/thead/tfoot is valid markup, and
 * moving it would tear the nested table apart. Nested tables are still
 * normalized on their own turn, since the outer loop visits every table.
 *
 * @param {Document} document - Document to mutate in place.
 * @returns {void}
 */
function normalizeTableStructure(document) {
    const sectionTags = ['tbody', 'thead', 'tfoot'];
    for (const table of document.querySelectorAll('table')) {
        for (const cell of table.querySelectorAll('th, td')) {
            // Cells of nested tables are handled when that table is visited.
            if (cell.closest('table') !== table) {
                continue;
            }
            for (const tag of sectionTags) {
                let misplaced = findMisplacedTableSection(cell, tag, table);
                while (misplaced) {
                    table.appendChild(misplaced);
                    misplaced = findMisplacedTableSection(cell, tag, table);
                }
            }
        }
    }
}
549
/**
 * Replaces every <br> found inside a table cell (`td` or `th`) with a single
 * space.
 *
 * @param {Document} document - Document to mutate in place.
 * @returns {void}
 */
function flattenTableCellBreaks(document) {
    Array.from(document.querySelectorAll('td, th')).forEach((tableCell) => {
        Array.from(tableCell.querySelectorAll('br')).forEach((lineBreak) => {
            lineBreak.replaceWith(' ');
        });
    });
}
483
558
  export function removeNoiseFromHtml(html, document, baseUrl, signal) {
484
559
  const shouldParse = isFullDocumentHtml(html) ||
485
560
  mayContainNoise(html) ||
@@ -1076,6 +1151,8 @@ function processTextBuffer(lines, options) {
1076
1151
  function applyGlobalRegexes(text, options) {
1077
1152
  let result = text;
1078
1153
  const checkAbort = createAbortChecker(options);
1154
+ // Normalize non-breaking spaces to regular spaces
1155
+ result = result.replace(/\u00A0/g, ' ');
1079
1156
  checkAbort('markdown:cleanup:headings');
1080
1157
  // fixAndSpaceHeadings
1081
1158
  result = result
@@ -1105,6 +1182,10 @@ function applyGlobalRegexes(text, options) {
1105
1182
  .replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
1106
1183
  .replace(REGEX.PUNCT_ONLY_LIST_ARTIFACT, '')
1107
1184
  .replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
1185
+ // Trim leading whitespace inside inline code spans
1186
+ result = result.replace(/(?<=\s|^)`\s+([^`]+)`/gm, '`$1`');
1187
+ // Unescape backticks inside markdown link text
1188
+ result = result.replace(/\[([^\]]*\\`[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/\\`/g, '`')}](${url})`);
1108
1189
  result = normalizeNestedListIndentation(result);
1109
1190
  checkAbort('markdown:cleanup:properties');
1110
1191
  // fixProperties
@@ -1 +1 @@
1
- {"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AAghBA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
1
+ {"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AA4fA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
@@ -346,14 +346,8 @@ function buildSpanTranslator(ctx) {
346
346
  return {};
347
347
  }
348
348
  // ---------------------------------------------------------------------------
349
- // Table / DL helpers
349
+ // DL helpers
350
350
  // ---------------------------------------------------------------------------
351
/**
 * Reports whether a table node contains a cell that spans more than one
 * column or row, judged by scanning its serialized `innerHTML` for a
 * `colspan`/`rowspan` attribute of 2 or more.
 *
 * Multi-digit spans are matched as well (e.g. `colspan="12"`); checking only
 * the first digit against 2-9 would miss every span from 10 to 19.
 *
 * @param {unknown} node - Candidate table node.
 * @returns {boolean} True when a spanning cell is found; false for non-nodes
 *   or nodes without a string `innerHTML`.
 */
function hasComplexTableLayout(node) {
    if (!isLikeNode(node))
        return false;
    const innerHTML = typeof node.innerHTML === 'string' ? node.innerHTML : '';
    // Either a first digit of 2-9, or any two-digit number starting 1-9.
    return /(?:colspan|rowspan)=["']?(?:[2-9]|[1-9]\d)/i.test(innerHTML);
}
357
351
  function resolveDlNodeName(child) {
358
352
  if (!isLikeNode(child))
359
353
  return '';
@@ -381,22 +375,6 @@ function createCustomTranslators() {
381
375
  return {
382
376
  code: (ctx) => buildCodeTranslator(ctx),
383
377
  img: (ctx) => buildImageTranslator(ctx),
384
- table: (ctx) => {
385
- if (!isObject(ctx))
386
- return {};
387
- const { node } = ctx;
388
- if (hasComplexTableLayout(node)) {
389
- return {
390
- postprocess: ({ content }) => {
391
- const trimmed = content.trim();
392
- if (!trimmed)
393
- return '';
394
- return `\n\n${trimmed}\n\n`;
395
- },
396
- };
397
- }
398
- return {};
399
- },
400
378
  dl: (ctx) => {
401
379
  if (!isObject(ctx))
402
380
  return { content: '' };
@@ -1,4 +1,5 @@
1
1
  import type { ExtractedMetadata } from './types.js';
2
+ export declare function normalizeDocumentTitle(title: string, baseUrl?: string): string;
2
3
  export declare function extractMetadata(document: Document, baseUrl?: string): ExtractedMetadata;
3
4
  export declare function extractMetadataFromHead(html: string, baseUrl?: string): ExtractedMetadata | null;
4
5
  export declare function mergeMetadata(early: ExtractedMetadata | null, late: ExtractedMetadata): ExtractedMetadata;
@@ -1 +1 @@
1
- {"version":3,"file":"metadata.d.ts","sourceRoot":"","sources":["../../src/transform/metadata.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAyKpD,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,GACf,iBAAiB,CAenB;AAED,wBAAgB,uBAAuB,CACrC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,MAAM,GACf,iBAAiB,GAAG,IAAI,CAY1B;AAED,wBAAgB,aAAa,CAC3B,KAAK,EAAE,iBAAiB,GAAG,IAAI,EAC/B,IAAI,EAAE,iBAAiB,GACtB,iBAAiB,CAmBnB"}
1
+ {"version":3,"file":"metadata.d.ts","sourceRoot":"","sources":["../../src/transform/metadata.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAoCpD,wBAAgB,sBAAsB,CACpC,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE,MAAM,GACf,MAAM,CAsBR;AAuID,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,GACf,iBAAiB,CAkBnB;AAED,wBAAgB,uBAAuB,CACrC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,MAAM,GACf,iBAAiB,GAAG,IAAI,CAY1B;AAED,wBAAgB,aAAa,CAC3B,KAAK,EAAE,iBAAiB,GAAG,IAAI,EAC/B,IAAI,EAAE,iBAAiB,GACtB,iBAAiB,CAmBnB"}
@@ -15,6 +15,28 @@ function extractHeadSection(html) {
15
15
  return null;
16
16
  return html.substring(0, match.index);
17
17
  }
18
/**
 * Shortens GitHub repository page titles to `owner/repo`.
 *
 * The title is rewritten only when all of the following hold: it starts with
 * "GitHub - ", `baseUrl` parses as a URL, the host is github.com (optionally
 * with `www.`), and the path has exactly two non-empty segments. In every
 * other case the title is returned unchanged.
 *
 * @param {string} title - Document title to normalize.
 * @param {string} [baseUrl] - URL the document was fetched from.
 * @returns {string} `owner/repo` for GitHub repository roots, else `title`.
 */
export function normalizeDocumentTitle(title, baseUrl) {
    const githubHosts = ['github.com', 'www.github.com'];
    const isCandidate = Boolean(baseUrl) && title.startsWith('GitHub - ');
    if (!isCandidate) {
        return title;
    }
    let location;
    try {
        location = new URL(baseUrl);
    }
    catch {
        // An unparseable base URL means there is nothing to derive a name from.
        return title;
    }
    if (!githubHosts.includes(location.hostname.toLowerCase())) {
        return title;
    }
    const pathParts = location.pathname.split('/').filter((part) => part !== '');
    if (pathParts.length !== 2) {
        return title;
    }
    const owner = pathParts[0];
    const repo = pathParts[1];
    return owner && repo ? `${owner}/${repo}` : title;
}
18
40
  const META_PROPERTY_HANDLERS = new Map([
19
41
  [
20
42
  'og:title',
@@ -139,6 +161,9 @@ function resolveFaviconUrl(href, baseUrl) {
139
161
  export function extractMetadata(document, baseUrl) {
140
162
  const ctx = buildMetaContext(document);
141
163
  const metadata = resolveMetadataFromContext(ctx);
164
+ if (metadata.title) {
165
+ metadata.title = normalizeDocumentTitle(metadata.title, baseUrl);
166
+ }
142
167
  if (baseUrl) {
143
168
  const icon32 = document.querySelector('link[rel="icon"][sizes="32x32"]');
144
169
  const href = icon32?.getAttribute('href');
@@ -1 +1 @@
1
- {"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAsCA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA4ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAkUD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AAqPD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAuKD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AA6DD,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAqB3B;AAqPD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CA6CzB;AAED,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAkI1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
1
+ {"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAuCA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA4ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AA6UD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AAqPD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAuKD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AA6DD,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAyUD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CA6CzB;AAED,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAkI1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
@@ -11,7 +11,7 @@ import { createAbortError, throwIfAborted } from '../lib/utils.js';
11
11
  import { FetchError, getErrorMessage, toError } from '../lib/utils.js';
12
12
  import { isObject } from '../lib/utils.js';
13
13
  import { translateHtmlFragmentToMarkdown } from './html-translators.js';
14
- import { extractMetadata, extractMetadataFromHead, mergeMetadata, } from './metadata.js';
14
+ import { extractMetadata, extractMetadataFromHead, mergeMetadata, normalizeDocumentTitle, } from './metadata.js';
15
15
  import { getOrCreateWorkerPool, getWorkerPoolStats, shutdownWorkerPool, } from './worker-pool.js';
16
16
  function decodeInput(input, encoding) {
17
17
  if (typeof input === 'string')
@@ -270,6 +270,14 @@ function resolveCollapsedTextLengthUpTo(text, max) {
270
270
  }
271
271
  return length;
272
272
  }
273
/**
 * Replaces alert-like elements (`[role="alert"]`, `.admonition`, `.callout`)
 * with <blockquote> elements that hold the same child nodes.
 *
 * Children are moved rather than copied through `innerHTML`. Moving keeps the
 * original nodes attached, so an alert nested inside another alert is still
 * in the tree when its own turn comes and gets converted too. With an
 * `innerHTML` copy, the nested original is already detached and its
 * `replaceWith` does nothing. Moving also avoids a serialize/parse round-trip.
 *
 * @param {Document} doc - Document to mutate in place.
 * @returns {void}
 */
function preserveAlertElements(doc) {
    const alerts = doc.querySelectorAll('[role="alert"], .admonition, .callout');
    for (const el of alerts) {
        const bq = doc.createElement('blockquote');
        // appendChild re-parents the node, so firstChild advances each pass.
        while (el.firstChild) {
            bq.appendChild(el.firstChild);
        }
        el.replaceWith(bq);
    }
}
273
281
  function extractArticle(document, url, signal) {
274
282
  if (!isReadabilityCompatible(document)) {
275
283
  logWarn('Document not compatible with Readability');
@@ -298,6 +306,7 @@ function extractArticle(document, url, signal) {
298
306
  const readabilityDoc = typeof doc.cloneNode === 'function'
299
307
  ? doc.cloneNode(true)
300
308
  : doc;
309
+ preserveAlertElements(readabilityDoc);
301
310
  // F1: Check abort before heavy Readability parse
302
311
  abortPolicy.throwIfAborted(signal, url, 'extract:article:parse');
303
312
  const reader = new Readability(readabilityDoc, {
@@ -797,8 +806,9 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
797
806
  fetchedAt: new Date().toISOString(),
798
807
  };
799
808
  if (shouldExtractFromArticle && article) {
800
- if (article.title !== undefined)
801
- metadata.title = article.title;
809
+ if (article.title !== undefined) {
810
+ metadata.title = normalizeDocumentTitle(article.title, url);
811
+ }
802
812
  if (article.byline !== undefined)
803
813
  metadata.author = article.byline;
804
814
  }
@@ -828,6 +838,12 @@ const CONTENT_ROOT_SELECTORS = [
828
838
  '.post-body',
829
839
  '.article-body',
830
840
  ];
841
// Selectors searched, in order, for the container holding a page's primary
// heading: every content-root selector first, then three extra selectors.
const PRIMARY_HEADING_ROOT_SELECTORS = CONTENT_ROOT_SELECTORS.concat([
    '.markdown-body',
    '.entry-content',
    '[itemprop="text"]',
]);
831
847
  function findContentRoot(document) {
832
848
  for (const selector of CONTENT_ROOT_SELECTORS) {
833
849
  const element = document.querySelector(selector);
@@ -841,6 +857,34 @@ function findContentRoot(document) {
841
857
  }
842
858
  return undefined;
843
859
  }
860
/**
 * Finds the text of the page's primary heading.
 *
 * Each selector in PRIMARY_HEADING_ROOT_SELECTORS is tried in order. For the
 * first matching root that contains an <h1> or <h2> with non-empty trimmed
 * text, that text is returned.
 *
 * @param {Document} document - Document to search.
 * @returns {string | undefined} Trimmed heading text, or undefined if none.
 */
function findPrimaryHeading(document) {
    for (const rootSelector of PRIMARY_HEADING_ROOT_SELECTORS) {
        const headingText = document
            .querySelector(rootSelector)
            ?.querySelector('h1, h2')
            ?.textContent.trim();
        if (headingText) {
            return headingText;
        }
    }
    return undefined;
}
874
/**
 * Tests whether a URL points at the root of a GitHub repository, i.e. the
 * host is github.com (optionally with `www.`) and the path has exactly two
 * non-empty segments.
 *
 * @param {string} url - URL to test.
 * @returns {boolean} False for unparseable URLs, other hosts or other paths.
 */
function isGithubRepositoryRootUrl(url) {
    let target;
    try {
        target = new URL(url);
    }
    catch {
        return false;
    }
    const { hostname, pathname } = target;
    const host = hostname.toLowerCase();
    const isGithubHost = host === 'github.com' || host === 'www.github.com';
    if (!isGithubHost) {
        return false;
    }
    const segmentCount = pathname.split('/').filter(Boolean).length;
    return segmentCount === 2;
}
844
888
  function shouldUseArticleContent(article, originalHtmlOrDocument) {
845
889
  const articleLength = article.textContent.length;
846
890
  const originalLength = getVisibleTextLength(originalHtmlOrDocument);
@@ -875,14 +919,19 @@ function buildContentSource(params) {
875
919
  metadata,
876
920
  extractedMetadata: extractedMeta,
877
921
  truncated,
922
+ primaryHeading: document ? findPrimaryHeading(document) : undefined,
878
923
  };
879
924
  if (useArticleContent && article) {
880
925
  const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
881
926
  prepareDocumentForMarkdown(articleDoc, url, signal);
927
+ const preferPrimaryHeading = isGithubRepositoryRootUrl(url);
882
928
  return {
883
929
  ...base,
884
930
  sourceHtml: articleDoc.body.innerHTML,
885
- title: article.title,
931
+ title: (preferPrimaryHeading ? base.primaryHeading : undefined) ??
932
+ (article.title !== undefined
933
+ ? normalizeDocumentTitle(article.title, url)
934
+ : undefined),
886
935
  skipNoiseRemoval: true,
887
936
  };
888
937
  }
@@ -931,7 +980,10 @@ function buildMarkdownFromContext(context, url, signal) {
931
980
  ...(context.document ? { document: context.document } : {}),
932
981
  ...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
933
982
  }));
934
- if (context.title && !content.trim().startsWith('# ')) {
983
+ if (context.primaryHeading && isGithubRepositoryRootUrl(url)) {
984
+ content = stripLeadingHeading(content, context.primaryHeading);
985
+ }
986
+ if (context.title && !/^(#{1,6})\s/.test(content.trimStart())) {
935
987
  const icon = context.favicon;
936
988
  let prefix = ' ';
937
989
  if (icon) {
@@ -953,6 +1005,34 @@ function buildMarkdownFromContext(context, url, signal) {
953
1005
  metadata: context.extractedMetadata,
954
1006
  };
955
1007
  }
1008
/**
 * Canonicalizes heading text for comparison: collapses whitespace runs to a
 * single space, trims, and lower-cases.
 *
 * @param {string} value - Raw heading text.
 * @returns {string} Normalized text.
 */
function normalizeHeadingText(value) {
    return value.replace(/\s+/g, ' ').trim().toLowerCase();
}
/**
 * Removes the first markdown heading from `markdown` when its text matches
 * `headingText` (compared after normalization), together with one blank line
 * directly after it.
 *
 * Only the first 12 non-empty lines are scanned. The first heading found
 * decides the outcome: if it does not match, the markdown is returned
 * unchanged. Lines inside a fenced code block (``` or ~~~) are never treated
 * as headings, so a shell comment such as `# install` in a leading snippet
 * can neither be stripped nor block stripping of the real heading. Fenced
 * lines still count towards the 12-line scan limit.
 *
 * @param {string} markdown - Markdown document.
 * @param {string} headingText - Heading text to remove.
 * @returns {string} Markdown without the matching leading heading.
 */
function stripLeadingHeading(markdown, headingText) {
    if (!markdown)
        return markdown;
    const lines = markdown.split('\n');
    const target = normalizeHeadingText(headingText);
    let nonEmptySeen = 0;
    // Marker character ('`' or '~') of the fence currently open, if any.
    let openFence;
    for (let i = 0; i < lines.length && nonEmptySeen < 12; i += 1) {
        const trimmed = lines[i]?.trim() ?? '';
        if (!trimmed)
            continue;
        nonEmptySeen += 1;
        const fenceMatch = /^(`{3,}|~{3,})/.exec(trimmed);
        if (fenceMatch) {
            const marker = fenceMatch[1][0];
            if (openFence === undefined) {
                openFence = marker;
            }
            else if (openFence === marker) {
                openFence = undefined;
            }
            continue;
        }
        if (openFence !== undefined)
            continue;
        const match = /^(#{1,6})\s+(.+?)\s*$/.exec(trimmed);
        if (!match)
            continue;
        const current = normalizeHeadingText(match[2] ?? '');
        if (current !== target)
            return markdown;
        lines.splice(i, 1);
        // Drop the blank separator line that followed the heading, if any.
        if ((lines[i] ?? '').trim() === '') {
            lines.splice(i, 1);
        }
        return lines.join('\n');
    }
    return markdown;
}
956
1036
  const REPLACEMENT_CHAR = '\ufffd';
957
1037
  const BINARY_INDICATOR_THRESHOLD = 0.1;
958
1038
  function hasBinaryIndicators(content) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@j0hanz/fetch-url-mcp",
3
- "version": "1.9.0",
3
+ "version": "1.9.1",
4
4
  "mcpName": "io.github.j0hanz/fetch-url-mcp",
5
5
  "description": "A web content fetcher MCP server that converts HTML to clean, AI and human readable markdown.",
6
6
  "type": "module",
@@ -74,7 +74,7 @@
74
74
  "@mozilla/readability": "^0.6.0",
75
75
  "linkedom": "^0.18.12",
76
76
  "node-html-markdown": "^2.0.0",
77
- "undici": "^7.22.0",
77
+ "undici": "^7.24.1",
78
78
  "zod": "^4.3.6"
79
79
  },
80
80
  "devDependencies": {