@j0hanz/superfetch 2.6.0 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cache.js +14 -12
- package/dist/config.js +51 -39
- package/dist/dom-noise-removal.js +4 -1
- package/dist/fetch.d.ts +1 -0
- package/dist/fetch.js +160 -97
- package/dist/http-native.js +31 -14
- package/dist/language-detection.js +28 -4
- package/dist/mcp.js +7 -1
- package/dist/tasks.d.ts +1 -0
- package/dist/tasks.js +129 -95
- package/dist/tools.d.ts +2 -0
- package/dist/tools.js +4 -3
- package/dist/transform-types.d.ts +1 -0
- package/dist/transform.js +122 -17
- package/package.json +1 -1
package/dist/transform.js
CHANGED
|
@@ -325,26 +325,32 @@ const META_NAME_HANDLERS = new Map([
|
|
|
325
325
|
},
|
|
326
326
|
],
|
|
327
327
|
]);
|
|
328
|
-
function
|
|
328
|
+
/**
 * Feeds a single <meta> element into the metadata context.
 *
 * Tags whose `content` attribute is missing or blank after trimming are
 * ignored. Otherwise the trimmed content is passed to the handler registered
 * for the tag's `property` attribute (in META_PROPERTY_HANDLERS) and then to
 * the handler registered for its `name` attribute (in META_NAME_HANDLERS).
 * A tag carrying both attributes triggers both lookups; unknown keys are
 * silently skipped.
 *
 * @param {object} ctx - Metadata context handed to the handlers.
 * @param {{ getAttribute(name: string): string | null }} tag - The <meta> element.
 * @returns {void}
 */
function processMetaTag(ctx, tag) {
    const value = tag.getAttribute('content')?.trim();
    if (value) {
        const propertyKey = tag.getAttribute('property');
        if (propertyKey) {
            const onProperty = META_PROPERTY_HANDLERS.get(propertyKey);
            onProperty?.(ctx, value);
        }
        const nameKey = tag.getAttribute('name');
        if (nameKey) {
            const onName = META_NAME_HANDLERS.get(nameKey);
            onName?.(ctx, value);
        }
    }
}
|
|
339
|
+
function buildMetaContext(document) {
|
|
329
340
|
const ctx = { title: {}, description: {} };
|
|
330
341
|
for (const tag of document.querySelectorAll('meta')) {
|
|
331
|
-
|
|
332
|
-
if (!content)
|
|
333
|
-
continue;
|
|
334
|
-
const property = tag.getAttribute('property');
|
|
335
|
-
if (property)
|
|
336
|
-
META_PROPERTY_HANDLERS.get(property)?.(ctx, content);
|
|
337
|
-
const name = tag.getAttribute('name');
|
|
338
|
-
if (name)
|
|
339
|
-
META_NAME_HANDLERS.get(name)?.(ctx, content);
|
|
342
|
+
processMetaTag(ctx, tag);
|
|
340
343
|
}
|
|
341
344
|
const titleEl = document.querySelector('title');
|
|
342
345
|
if (!ctx.title.standard && titleEl?.textContent) {
|
|
343
346
|
ctx.title.standard = titleEl.textContent.trim();
|
|
344
347
|
}
|
|
348
|
+
return ctx;
|
|
349
|
+
}
|
|
350
|
+
function resolveMetadataFromContext(ctx) {
|
|
351
|
+
const metadata = {};
|
|
345
352
|
const resolvedTitle = ctx.title.og ?? ctx.title.twitter ?? ctx.title.standard;
|
|
346
353
|
const resolvedDesc = ctx.description.og ?? ctx.description.twitter ?? ctx.description.standard;
|
|
347
|
-
const metadata = {};
|
|
348
354
|
if (resolvedTitle)
|
|
349
355
|
metadata.title = resolvedTitle;
|
|
350
356
|
if (resolvedDesc)
|
|
@@ -359,6 +365,37 @@ function extractMetadata(document) {
|
|
|
359
365
|
metadata.modifiedAt = ctx.modifiedAt;
|
|
360
366
|
return metadata;
|
|
361
367
|
}
|
|
368
|
+
/**
 * Builds the metadata object for a parsed document and, when a base URL is
 * supplied, attaches a favicon URL.
 *
 * Only a `<link rel="icon" sizes="32x32">` element is considered for the
 * favicon; its `href` is resolved against `baseUrl` via resolveFaviconUrl and
 * stored as `metadata.favicon` when that yields a value.
 *
 * @param {Document} document - Parsed document to inspect.
 * @param {string} [baseUrl] - URL used to resolve the favicon href; when
 *   falsy, no favicon lookup is performed.
 * @returns {object} Metadata produced by resolveMetadataFromContext, plus an
 *   optional `favicon` string.
 */
function extractMetadata(document, baseUrl) {
    const metadata = resolveMetadataFromContext(buildMetaContext(document));
    if (!baseUrl) {
        return metadata;
    }
    const iconHref = document
        .querySelector('link[rel="icon"][sizes="32x32"]')
        ?.getAttribute('href');
    if (!iconHref) {
        return metadata;
    }
    const faviconUrl = resolveFaviconUrl(iconHref, baseUrl);
    if (faviconUrl) {
        metadata.favicon = faviconUrl;
    }
    return metadata;
}
|
|
382
|
+
/**
 * Resolves a favicon `href` against the page URL.
 *
 * @param {string} href - Raw `href` attribute value of the icon link.
 * @param {string} baseUrl - URL the href is resolved against.
 * @returns {string | undefined} The absolute http(s) URL, or `undefined` when
 *   the href is blank, is a `data:` URI (any letter case), cannot be parsed,
 *   or resolves to a protocol other than http/https.
 */
function resolveFaviconUrl(href, baseUrl) {
    const candidate = href.trim();
    const isUnusable = candidate === '' || candidate.toLowerCase().startsWith('data:');
    if (isUnusable) {
        return undefined;
    }
    let parsed;
    try {
        parsed = new URL(candidate, baseUrl);
    }
    catch {
        // Unparseable href/base combination: treat as "no favicon".
        return undefined;
    }
    const isWebProtocol = parsed.protocol === 'http:' || parsed.protocol === 'https:';
    return isWebProtocol ? parsed.toString() : undefined;
}
|
|
362
399
|
function isReadabilityCompatible(doc) {
|
|
363
400
|
if (!isObject(doc))
|
|
364
401
|
return false;
|
|
@@ -469,7 +506,7 @@ function extractContentContext(html, url, options) {
|
|
|
469
506
|
const { document } = stageTracker.run(url, 'extract:parse', () => parseHTML(limitedHtml));
|
|
470
507
|
abortPolicy.throwIfAborted(options.signal, url, 'extract:parsed');
|
|
471
508
|
applyBaseUri(document, url);
|
|
472
|
-
const lateMetadata = stageTracker.run(url, 'extract:metadata', () => extractMetadata(document));
|
|
509
|
+
const lateMetadata = stageTracker.run(url, 'extract:metadata', () => extractMetadata(document, url));
|
|
473
510
|
abortPolicy.throwIfAborted(options.signal, url, 'extract:metadata');
|
|
474
511
|
// Merge early (pre-truncation) with late (post-truncation) metadata
|
|
475
512
|
const metadata = mergeMetadata(earlyMetadata, lateMetadata);
|
|
@@ -615,6 +652,59 @@ function buildCodeTranslator(ctx) {
|
|
|
615
652
|
return buildInlineCodeTranslator();
|
|
616
653
|
return { noEscape: true, preserveWhitespace: true };
|
|
617
654
|
}
|
|
655
|
+
/**
 * Returns the URL of the first image candidate in a `srcset` value.
 *
 * Candidates cannot be found by splitting on commas, because a comma may
 * legally appear inside a URL (e.g. CDN transform paths such as
 * `/w_300,h_200/a.jpg`). Following the HTML srcset parsing rules, the URL is
 * the first run of non-whitespace characters after any leading whitespace or
 * commas, with trailing commas (the candidate separator) removed.
 *
 * @param {string} srcset - Raw `srcset` (or `data-srcset`) attribute value.
 * @returns {string} The first candidate URL, or `''` when there is none.
 */
function extractFirstSrcsetUrl(srcset) {
    // Leading whitespace and stray commas precede the first candidate.
    const candidates = srcset.replace(/^[\s,]+/, '');
    const token = candidates.split(/\s+/, 1)[0] ?? '';
    // A trailing comma belongs to the separator, not to the URL
    // ("a.jpg, b.jpg 2x"). Trimmed with a loop to stay linear on hostile input.
    let end = token.length;
    while (end > 0 && token[end - 1] === ',') {
        end -= 1;
    }
    return token.slice(0, end);
}
|
|
661
|
+
// Attributes lazy-loading scripts commonly use to hold the real image source.
// They are probed in this order; 'data-srcset' holds a srcset list, not a URL.
const LAZY_SRC_ATTRIBUTES = ['data-src', 'data-lazy-src', 'data-original', 'data-srcset'];
|
|
667
|
+
/**
 * Returns the first candidate URL of a srcset value unless it is a data URI.
 *
 * The `data:` scheme is matched case-insensitively (URL schemes are not
 * case-sensitive), so `DATA:image/...` is rejected as well.
 *
 * @param {string} value - Raw `srcset`-style attribute value.
 * @returns {string | undefined} The first candidate URL, or `undefined` when
 *   there is none or it is a data URI.
 */
function extractNonDataSrcsetUrl(value) {
    const url = extractFirstSrcsetUrl(value);
    if (!url || /^data:/i.test(url)) {
        return undefined;
    }
    return url;
}
|
|
671
|
+
/**
 * Looks for a usable image URL in the lazy-loading attributes listed in
 * LAZY_SRC_ATTRIBUTES, in order.
 *
 * Attribute values are trimmed; missing, blank, and `data:` values (scheme
 * matched case-insensitively) are skipped. `data-srcset` is treated as a
 * srcset list and contributes its first non-data candidate URL; every other
 * attribute is returned as-is (trimmed).
 *
 * @param {(name: string) => string | null | undefined} getAttribute -
 *   Attribute accessor for the image element.
 * @returns {string | undefined} The first usable URL, or `undefined`.
 */
function resolveLazySrc(getAttribute) {
    for (const attr of LAZY_SRC_ATTRIBUTES) {
        const lazy = getAttribute(attr)?.trim();
        if (!lazy || /^data:/i.test(lazy)) {
            continue;
        }
        if (attr !== 'data-srcset') {
            return lazy;
        }
        // A srcset list: only its first candidate is considered.
        const url = extractNonDataSrcsetUrl(lazy);
        if (url) {
            return url;
        }
    }
    return undefined;
}
|
|
686
|
+
/**
 * Picks the URL to use for an image in the Markdown output.
 *
 * Resolution order:
 *   1. `src`, when it is non-blank and not a data URI;
 *   2. the lazy-loading attributes (via resolveLazySrc);
 *   3. the first non-data candidate of the native `srcset`;
 *   4. the placeholder `'[data URI removed]'` when `src` was a data URI,
 *      so the raw payload never reaches the output;
 *   5. `''` otherwise.
 *
 * `src` is trimmed before use, and the `data:` scheme is matched
 * case-insensitively, so padded values and `DATA:` URIs are handled the same
 * way as their canonical forms.
 *
 * @param {((name: string) => string | null | undefined) | undefined} getAttribute -
 *   Attribute accessor for the image element, or `undefined` when the node
 *   exposes none.
 * @returns {string} The resolved URL, the data-URI placeholder, or `''`.
 */
function resolveImageSrc(getAttribute) {
    if (!getAttribute) {
        return '';
    }
    const srcRaw = getAttribute('src')?.trim() ?? '';
    const srcIsDataUri = /^data:/i.test(srcRaw);
    if (srcRaw && !srcIsDataUri) {
        return srcRaw;
    }
    // src is missing or an inline placeholder: lazy-loading attributes are
    // consulted before the native srcset.
    const lazySrc = resolveLazySrc(getAttribute);
    if (lazySrc) {
        return lazySrc;
    }
    const srcset = getAttribute('srcset');
    if (srcset) {
        const url = extractNonDataSrcsetUrl(srcset);
        if (url) {
            return url;
        }
    }
    // Data URIs can be very long and are useless in Markdown; emit a marker.
    return srcIsDataUri ? '[data URI removed]' : '';
}
|
|
618
708
|
function buildImageTranslator(ctx) {
|
|
619
709
|
if (!isObject(ctx))
|
|
620
710
|
return { content: '' };
|
|
@@ -622,8 +712,7 @@ function buildImageTranslator(ctx) {
|
|
|
622
712
|
const getAttribute = hasGetAttribute(node)
|
|
623
713
|
? node.getAttribute.bind(node)
|
|
624
714
|
: undefined;
|
|
625
|
-
const
|
|
626
|
-
const src = srcRaw.startsWith('data:') ? '[data URI removed]' : srcRaw;
|
|
715
|
+
const src = resolveImageSrc(getAttribute);
|
|
627
716
|
const existingAlt = getAttribute?.('alt') ?? '';
|
|
628
717
|
const alt = existingAlt.trim() || deriveAltFromImageUrl(src);
|
|
629
718
|
const markdown = ``;
|
|
@@ -794,11 +883,11 @@ function createCustomTranslators() {
|
|
|
794
883
|
const trimmed = content.trim();
|
|
795
884
|
if (!trimmed)
|
|
796
885
|
return '';
|
|
797
|
-
return `\n\n
|
|
886
|
+
return `\n\n${trimmed}\n\n`;
|
|
798
887
|
},
|
|
799
888
|
}),
|
|
800
889
|
summary: () => ({
|
|
801
|
-
postprocess: ({ content }) =>
|
|
890
|
+
postprocess: ({ content }) => `${content.trim()}\n\n`,
|
|
802
891
|
}),
|
|
803
892
|
span: (ctx) => {
|
|
804
893
|
if (!isObject(ctx) || !isObject(ctx.node))
|
|
@@ -1315,6 +1404,7 @@ function buildContentSource(params) {
|
|
|
1315
1404
|
return {
|
|
1316
1405
|
sourceHtml: cleanedArticleHtml,
|
|
1317
1406
|
title: article.title,
|
|
1407
|
+
favicon: extractedMeta.favicon,
|
|
1318
1408
|
metadata,
|
|
1319
1409
|
skipNoiseRemoval: true,
|
|
1320
1410
|
truncated,
|
|
@@ -1329,6 +1419,7 @@ function buildContentSource(params) {
|
|
|
1329
1419
|
return {
|
|
1330
1420
|
sourceHtml: contentRoot,
|
|
1331
1421
|
title: extractedMeta.title,
|
|
1422
|
+
favicon: extractedMeta.favicon,
|
|
1332
1423
|
metadata,
|
|
1333
1424
|
skipNoiseRemoval: true,
|
|
1334
1425
|
document,
|
|
@@ -1338,6 +1429,7 @@ function buildContentSource(params) {
|
|
|
1338
1429
|
return {
|
|
1339
1430
|
sourceHtml: cleanedHtml,
|
|
1340
1431
|
title: extractedMeta.title,
|
|
1432
|
+
favicon: extractedMeta.favicon,
|
|
1341
1433
|
metadata,
|
|
1342
1434
|
skipNoiseRemoval: true,
|
|
1343
1435
|
document,
|
|
@@ -1347,6 +1439,7 @@ function buildContentSource(params) {
|
|
|
1347
1439
|
return {
|
|
1348
1440
|
sourceHtml: html,
|
|
1349
1441
|
title: extractedMeta.title,
|
|
1442
|
+
favicon: extractedMeta.favicon,
|
|
1350
1443
|
metadata,
|
|
1351
1444
|
truncated,
|
|
1352
1445
|
};
|
|
@@ -1379,7 +1472,19 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
1379
1472
|
...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
1380
1473
|
}));
|
|
1381
1474
|
if (context.title && !content.trim().startsWith('# ')) {
|
|
1382
|
-
|
|
1475
|
+
const icon = context.favicon;
|
|
1476
|
+
let prefix = ' ';
|
|
1477
|
+
if (icon) {
|
|
1478
|
+
let alt = '';
|
|
1479
|
+
try {
|
|
1480
|
+
alt = new URL(url).hostname;
|
|
1481
|
+
}
|
|
1482
|
+
catch {
|
|
1483
|
+
/* skip */
|
|
1484
|
+
}
|
|
1485
|
+
prefix = `  `;
|
|
1486
|
+
}
|
|
1487
|
+
content = `#${prefix}${context.title}\n\n${content}`;
|
|
1383
1488
|
}
|
|
1384
1489
|
return {
|
|
1385
1490
|
markdown: content,
|
package/package.json
CHANGED