@j0hanz/superfetch 2.6.0 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/transform.js CHANGED
@@ -325,26 +325,32 @@ const META_NAME_HANDLERS = new Map([
325
325
  },
326
326
  ],
327
327
  ]);
328
- function extractMetadata(document) {
328
/**
 * Feeds a single `<meta>` element into the metadata context.
 *
 * The element's trimmed `content` is passed to the handler registered for
 * its `property` attribute and then to the handler registered for its
 * `name` attribute. Elements without non-empty content are ignored.
 *
 * @param {object} ctx - Mutable metadata context handed to the handlers.
 * @param {{ getAttribute(name: string): string | null }} tag - Meta element.
 * @returns {void}
 */
function processMetaTag(ctx, tag) {
    const value = tag.getAttribute('content')?.trim();
    if (value) {
        const propertyKey = tag.getAttribute('property');
        if (propertyKey) {
            const handleProperty = META_PROPERTY_HANDLERS.get(propertyKey);
            handleProperty?.(ctx, value);
        }
        const nameKey = tag.getAttribute('name');
        if (nameKey) {
            const handleName = META_NAME_HANDLERS.get(nameKey);
            handleName?.(ctx, value);
        }
    }
}
339
+ function buildMetaContext(document) {
329
340
  const ctx = { title: {}, description: {} };
330
341
  for (const tag of document.querySelectorAll('meta')) {
331
- const content = tag.getAttribute('content')?.trim();
332
- if (!content)
333
- continue;
334
- const property = tag.getAttribute('property');
335
- if (property)
336
- META_PROPERTY_HANDLERS.get(property)?.(ctx, content);
337
- const name = tag.getAttribute('name');
338
- if (name)
339
- META_NAME_HANDLERS.get(name)?.(ctx, content);
342
+ processMetaTag(ctx, tag);
340
343
  }
341
344
  const titleEl = document.querySelector('title');
342
345
  if (!ctx.title.standard && titleEl?.textContent) {
343
346
  ctx.title.standard = titleEl.textContent.trim();
344
347
  }
348
+ return ctx;
349
+ }
350
+ function resolveMetadataFromContext(ctx) {
351
+ const metadata = {};
345
352
  const resolvedTitle = ctx.title.og ?? ctx.title.twitter ?? ctx.title.standard;
346
353
  const resolvedDesc = ctx.description.og ?? ctx.description.twitter ?? ctx.description.standard;
347
- const metadata = {};
348
354
  if (resolvedTitle)
349
355
  metadata.title = resolvedTitle;
350
356
  if (resolvedDesc)
@@ -359,6 +365,37 @@ function extractMetadata(document) {
359
365
  metadata.modifiedAt = ctx.modifiedAt;
360
366
  return metadata;
361
367
  }
368
/**
 * Builds the metadata object for a parsed document.
 *
 * Meta-tag derived fields come from `buildMetaContext` followed by
 * `resolveMetadataFromContext`. When `baseUrl` is truthy and the document
 * declares a `link[rel="icon"][sizes="32x32"]` whose `href` resolves via
 * `resolveFaviconUrl`, the result is stored as `metadata.favicon`.
 *
 * @param {Document} document - Parsed HTML document.
 * @param {string} [baseUrl] - Base URL used to resolve the favicon href.
 * @returns {object} The resolved metadata.
 */
function extractMetadata(document, baseUrl) {
    const metadata = resolveMetadataFromContext(buildMetaContext(document));
    if (!baseUrl) {
        return metadata;
    }
    const iconHref = document
        .querySelector('link[rel="icon"][sizes="32x32"]')
        ?.getAttribute('href');
    if (!iconHref) {
        return metadata;
    }
    const faviconUrl = resolveFaviconUrl(iconHref, baseUrl);
    if (faviconUrl) {
        metadata.favicon = faviconUrl;
    }
    return metadata;
}
382
/**
 * Resolves a favicon `href` against `baseUrl` and returns it as an absolute
 * http(s) URL that is safe to embed in a Markdown image destination.
 *
 * Returns `undefined` when the href is not a string, is blank, is a `data:`
 * URI, cannot be parsed, or resolves to a scheme other than http/https.
 *
 * @param {string} href - Raw `href` attribute value (page-controlled input).
 * @param {string | URL} baseUrl - Base used for relative resolution.
 * @returns {string | undefined} Absolute URL, or `undefined` if unusable.
 */
function resolveFaviconUrl(href, baseUrl) {
    if (typeof href !== 'string') {
        return undefined;
    }
    const trimmed = href.trim();
    // Reject data URIs up front so potentially huge payloads are never parsed.
    if (!trimmed || /^data:/i.test(trimmed)) {
        return undefined;
    }
    let resolved;
    try {
        resolved = new URL(trimmed, baseUrl);
    }
    catch {
        // An unparsable href or base simply means there is no usable favicon.
        return undefined;
    }
    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
        return undefined;
    }
    // The URL serializer leaves parentheses raw; an unbalanced ")" would end a
    // Markdown `![alt](url)` destination early, so percent-encode both.
    return resolved.href.replace(/[()]/g, (ch) => (ch === '(' ? '%28' : '%29'));
}
362
399
  function isReadabilityCompatible(doc) {
363
400
  if (!isObject(doc))
364
401
  return false;
@@ -469,7 +506,7 @@ function extractContentContext(html, url, options) {
469
506
  const { document } = stageTracker.run(url, 'extract:parse', () => parseHTML(limitedHtml));
470
507
  abortPolicy.throwIfAborted(options.signal, url, 'extract:parsed');
471
508
  applyBaseUri(document, url);
472
- const lateMetadata = stageTracker.run(url, 'extract:metadata', () => extractMetadata(document));
509
+ const lateMetadata = stageTracker.run(url, 'extract:metadata', () => extractMetadata(document, url));
473
510
  abortPolicy.throwIfAborted(options.signal, url, 'extract:metadata');
474
511
  // Merge early (pre-truncation) with late (post-truncation) metadata
475
512
  const metadata = mergeMetadata(earlyMetadata, lateMetadata);
@@ -615,6 +652,59 @@ function buildCodeTranslator(ctx) {
615
652
  return buildInlineCodeTranslator();
616
653
  return { noEscape: true, preserveWhitespace: true };
617
654
  }
655
/**
 * Returns the URL of the first image candidate in a `srcset` value.
 *
 * Follows the HTML srcset tokenisation: leading whitespace and commas are
 * skipped, the URL is the first run of non-whitespace characters, and any
 * trailing commas on that run are candidate separators rather than part of
 * the URL. Commas inside the URL (CDN transform paths such as
 * `w_300,h_200`, or data URIs) are therefore preserved instead of
 * truncating the URL at the first comma.
 *
 * @param {string} srcset - Raw `srcset` attribute value.
 * @returns {string} The first candidate URL, or `''` when there is none.
 */
function extractFirstSrcsetUrl(srcset) {
    const match = /^[\s,]*([^\s,]\S*)/.exec(srcset);
    if (!match) {
        return '';
    }
    const token = match[1];
    // Strip trailing separator commas with a linear scan.
    let end = token.length;
    while (end > 0 && token[end - 1] === ',') {
        end -= 1;
    }
    return token.slice(0, end);
}
661
/**
 * Lazy-loading attributes probed, in priority order, when an image has no
 * usable `src`. `data-srcset` holds a srcset list rather than a single URL.
 * Frozen so the shared lookup order cannot be mutated at runtime.
 */
const LAZY_SRC_ATTRIBUTES = Object.freeze([
    'data-src',
    'data-lazy-src',
    'data-original',
    'data-srcset',
]);
667
/**
 * Returns the first candidate URL of a srcset value unless it is empty or a
 * `data:` URI.
 *
 * The scheme comparison is case-insensitive because URL schemes are
 * case-insensitive (`DATA:` is still a data URI).
 *
 * @param {string} value - Raw srcset-style attribute value.
 * @returns {string | undefined} A non-data URL, or `undefined`.
 */
function extractNonDataSrcsetUrl(value) {
    const url = extractFirstSrcsetUrl(value);
    if (!url || /^data:/i.test(url)) {
        return undefined;
    }
    return url;
}
671
/**
 * Looks for a usable image URL in the lazy-loading attributes listed in
 * `LAZY_SRC_ATTRIBUTES`, in that order.
 *
 * Values are trimmed before inspection so surrounding whitespace can neither
 * hide a `data:` URI from the check nor leak into the returned URL. The
 * `data:` check is case-insensitive. `data-srcset` is treated as a srcset
 * list and contributes its first non-data candidate.
 *
 * @param {(name: string) => string | null | undefined} getAttribute -
 *   Attribute reader bound to the image element.
 * @returns {string | undefined} The first usable URL, or `undefined`.
 */
function resolveLazySrc(getAttribute) {
    for (const attr of LAZY_SRC_ATTRIBUTES) {
        const lazy = getAttribute(attr)?.trim();
        if (!lazy || /^data:/i.test(lazy)) {
            continue;
        }
        if (attr !== 'data-srcset') {
            return lazy;
        }
        const url = extractNonDataSrcsetUrl(lazy);
        if (url) {
            return url;
        }
    }
    return undefined;
}
686
/**
 * Picks the URL to use for an image in Markdown output.
 *
 * Resolution order:
 *   1. `src`, when present and not a `data:` URI;
 *   2. the first usable lazy-loading attribute (see `resolveLazySrc`);
 *   3. the first non-data candidate of the native `srcset`;
 *   4. the literal `'[data URI removed]'` when `src` was a data URI;
 *   5. `''` otherwise.
 *
 * `src` is trimmed and its scheme is compared case-insensitively, so values
 * such as `" DATA:image/..."` are still recognised as data URIs and padded
 * URLs are returned without the padding.
 *
 * @param {((name: string) => string | null | undefined) | undefined} getAttribute -
 *   Attribute reader bound to the image element, if the node provides one.
 * @returns {string} The resolved source, a placeholder, or `''`.
 */
function resolveImageSrc(getAttribute) {
    if (!getAttribute) {
        return '';
    }
    const srcRaw = getAttribute('src')?.trim() ?? '';
    const srcIsDataUri = /^data:/i.test(srcRaw);
    if (srcRaw && !srcIsDataUri) {
        return srcRaw;
    }
    // `src` is missing or an inline placeholder: lazy-loading attributes are
    // consulted before the native srcset.
    const lazySrc = resolveLazySrc(getAttribute);
    if (lazySrc) {
        return lazySrc;
    }
    // Some pages keep a data-URI placeholder in `src` and the real URLs in
    // `srcset`.
    const srcset = getAttribute('srcset');
    if (srcset) {
        const url = extractNonDataSrcsetUrl(srcset);
        if (url) {
            return url;
        }
    }
    // Never emit the raw data URI: it can be very long and is of no use in
    // Markdown output.
    return srcIsDataUri ? '[data URI removed]' : '';
}
618
708
  function buildImageTranslator(ctx) {
619
709
  if (!isObject(ctx))
620
710
  return { content: '' };
@@ -622,8 +712,7 @@ function buildImageTranslator(ctx) {
622
712
  const getAttribute = hasGetAttribute(node)
623
713
  ? node.getAttribute.bind(node)
624
714
  : undefined;
625
- const srcRaw = getAttribute?.('src') ?? '';
626
- const src = srcRaw.startsWith('data:') ? '[data URI removed]' : srcRaw;
715
+ const src = resolveImageSrc(getAttribute);
627
716
  const existingAlt = getAttribute?.('alt') ?? '';
628
717
  const alt = existingAlt.trim() || deriveAltFromImageUrl(src);
629
718
  const markdown = `![${alt}](${src})`;
@@ -794,11 +883,11 @@ function createCustomTranslators() {
794
883
  const trimmed = content.trim();
795
884
  if (!trimmed)
796
885
  return '';
797
- return `\n\n<details>\n${trimmed}\n</details>\n\n`;
886
+ return `\n\n${trimmed}\n\n`;
798
887
  },
799
888
  }),
800
889
  summary: () => ({
801
- postprocess: ({ content }) => `<summary>${content.trim()}</summary>\n\n`,
890
+ postprocess: ({ content }) => `${content.trim()}\n\n`,
802
891
  }),
803
892
  span: (ctx) => {
804
893
  if (!isObject(ctx) || !isObject(ctx.node))
@@ -1315,6 +1404,7 @@ function buildContentSource(params) {
1315
1404
  return {
1316
1405
  sourceHtml: cleanedArticleHtml,
1317
1406
  title: article.title,
1407
+ favicon: extractedMeta.favicon,
1318
1408
  metadata,
1319
1409
  skipNoiseRemoval: true,
1320
1410
  truncated,
@@ -1329,6 +1419,7 @@ function buildContentSource(params) {
1329
1419
  return {
1330
1420
  sourceHtml: contentRoot,
1331
1421
  title: extractedMeta.title,
1422
+ favicon: extractedMeta.favicon,
1332
1423
  metadata,
1333
1424
  skipNoiseRemoval: true,
1334
1425
  document,
@@ -1338,6 +1429,7 @@ function buildContentSource(params) {
1338
1429
  return {
1339
1430
  sourceHtml: cleanedHtml,
1340
1431
  title: extractedMeta.title,
1432
+ favicon: extractedMeta.favicon,
1341
1433
  metadata,
1342
1434
  skipNoiseRemoval: true,
1343
1435
  document,
@@ -1347,6 +1439,7 @@ function buildContentSource(params) {
1347
1439
  return {
1348
1440
  sourceHtml: html,
1349
1441
  title: extractedMeta.title,
1442
+ favicon: extractedMeta.favicon,
1350
1443
  metadata,
1351
1444
  truncated,
1352
1445
  };
@@ -1379,7 +1472,19 @@ function buildMarkdownFromContext(context, url, signal) {
1379
1472
  ...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
1380
1473
  }));
1381
1474
  if (context.title && !content.trim().startsWith('# ')) {
1382
- content = `# ${context.title}\n\n${content}`;
1475
+ const icon = context.favicon;
1476
+ let prefix = ' ';
1477
+ if (icon) {
1478
+ let alt = '';
1479
+ try {
1480
+ alt = new URL(url).hostname;
1481
+ }
1482
+ catch {
1483
+ /* skip */
1484
+ }
1485
+ prefix = ` ![${alt}](${icon}) `;
1486
+ }
1487
+ content = `#${prefix}${context.title}\n\n${content}`;
1383
1488
  }
1384
1489
  return {
1385
1490
  markdown: content,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@j0hanz/superfetch",
3
- "version": "2.6.0",
3
+ "version": "2.7.0",
4
4
  "mcpName": "io.github.j0hanz/superfetch",
5
5
  "description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable Markdown",
6
6
  "type": "module",