metanova 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +49 -0
- package/README.md +4 -0
- package/dist/index.cjs +993 -59
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +32 -1
- package/dist/index.d.ts +32 -1
- package/dist/index.js +992 -59
- package/dist/index.js.map +1 -1
- package/package.json +2 -1
package/dist/index.js
CHANGED
|
@@ -756,8 +756,24 @@ function isRecord2(value) {
|
|
|
756
756
|
}
|
|
757
757
|
|
|
758
758
|
// src/extractors/media.ts
|
|
759
|
-
var LAZY_IMAGE_ATTRIBUTES = [
|
|
760
|
-
|
|
759
|
+
var LAZY_IMAGE_ATTRIBUTES = [
|
|
760
|
+
"data-src",
|
|
761
|
+
"data-original",
|
|
762
|
+
"data-lazy-src",
|
|
763
|
+
"data-image",
|
|
764
|
+
"data-image-url",
|
|
765
|
+
"data-og-image",
|
|
766
|
+
"data-thumbnail",
|
|
767
|
+
"data-thumb",
|
|
768
|
+
"data-media",
|
|
769
|
+
"data-full-src",
|
|
770
|
+
"data-hi-res-src",
|
|
771
|
+
"data-zoom-src",
|
|
772
|
+
"data-poster",
|
|
773
|
+
"data-bg"
|
|
774
|
+
];
|
|
775
|
+
var LAZY_IMAGE_SRCSET_ATTRIBUTES = ["data-srcset", "data-lazy-srcset", "data-original-srcset"];
|
|
776
|
+
var LAZY_MEDIA_ATTRIBUTES = ["data-src", "data-original", "data-lazy-src", "data-video", "data-video-url", "data-media", "data-playback-url"];
|
|
761
777
|
function extractImages(html, baseUrl) {
|
|
762
778
|
const $ = loadDocument(html);
|
|
763
779
|
const images = [];
|
|
@@ -783,6 +799,15 @@ function extractImages(html, baseUrl) {
|
|
|
783
799
|
type,
|
|
784
800
|
metadata: { discoveredFrom: "link.preload" }
|
|
785
801
|
}, baseUrl);
|
|
802
|
+
for (const candidate of parseSrcset($(element).attr("imagesrcset"))) {
|
|
803
|
+
pushResolved(images, {
|
|
804
|
+
url: candidate,
|
|
805
|
+
kind: "image",
|
|
806
|
+
source: "html",
|
|
807
|
+
type,
|
|
808
|
+
metadata: { discoveredFrom: "link.imagesrcset" }
|
|
809
|
+
}, baseUrl);
|
|
810
|
+
}
|
|
786
811
|
}
|
|
787
812
|
});
|
|
788
813
|
collectDocumentImages($, images, baseUrl, "html");
|
|
@@ -907,7 +932,8 @@ function collectDocumentImages($, images, baseUrl, source) {
|
|
|
907
932
|
const candidates = [
|
|
908
933
|
normalizeWhitespace($(element).attr("src")),
|
|
909
934
|
...LAZY_IMAGE_ATTRIBUTES.map((attribute) => normalizeWhitespace($(element).attr(attribute))),
|
|
910
|
-
...parseSrcset($(element).attr("srcset"))
|
|
935
|
+
...parseSrcset($(element).attr("srcset")),
|
|
936
|
+
...LAZY_IMAGE_SRCSET_ATTRIBUTES.flatMap((attribute) => parseSrcset($(element).attr(attribute)))
|
|
911
937
|
];
|
|
912
938
|
for (const candidate of candidates) {
|
|
913
939
|
pushResolved(images, {
|
|
@@ -920,7 +946,10 @@ function collectDocumentImages($, images, baseUrl, source) {
|
|
|
920
946
|
}
|
|
921
947
|
});
|
|
922
948
|
$("picture source[srcset], source[type^='image/'][srcset]").each((_, element) => {
|
|
923
|
-
for (const candidate of
|
|
949
|
+
for (const candidate of [
|
|
950
|
+
...parseSrcset($(element).attr("srcset")),
|
|
951
|
+
...LAZY_IMAGE_SRCSET_ATTRIBUTES.flatMap((attribute) => parseSrcset($(element).attr(attribute)))
|
|
952
|
+
]) {
|
|
924
953
|
pushResolved(images, {
|
|
925
954
|
url: candidate,
|
|
926
955
|
kind: "image",
|
|
@@ -1212,11 +1241,11 @@ function uniqueStrings(values) {
|
|
|
1212
1241
|
|
|
1213
1242
|
// src/scorers/image.ts
|
|
1214
1243
|
var SOURCE_WEIGHT = {
|
|
1215
|
-
adapter:
|
|
1216
|
-
openGraph:
|
|
1244
|
+
adapter: 98,
|
|
1245
|
+
openGraph: 94,
|
|
1217
1246
|
oEmbed: 88,
|
|
1218
1247
|
jsonLd: 82,
|
|
1219
|
-
twitter:
|
|
1248
|
+
twitter: 86,
|
|
1220
1249
|
nextData: 76,
|
|
1221
1250
|
nuxt: 74,
|
|
1222
1251
|
initialState: 73,
|
|
@@ -1244,7 +1273,9 @@ function scoreImages(images, customScorers = []) {
|
|
|
1244
1273
|
scoreReasons: reasons
|
|
1245
1274
|
}
|
|
1246
1275
|
};
|
|
1247
|
-
}).sort(
|
|
1276
|
+
}).sort(
|
|
1277
|
+
(left, right) => (right.score ?? 0) - (left.score ?? 0) || sourceSortWeight(right) - sourceSortWeight(left) || imageArea(right) - imageArea(left)
|
|
1278
|
+
);
|
|
1248
1279
|
}
|
|
1249
1280
|
function selectBestImage(images, customScorers = []) {
|
|
1250
1281
|
const scored = scoreImages(images, customScorers);
|
|
@@ -1343,17 +1374,32 @@ function scoreFormat(image) {
|
|
|
1343
1374
|
}
|
|
1344
1375
|
function scoreUrlSignal(image) {
|
|
1345
1376
|
const url = image.url.toLowerCase();
|
|
1346
|
-
const matches = url.match(/cover|preview|thumbnail|thumb|og|card|media|hero|share|social/g) ?? [];
|
|
1347
|
-
|
|
1377
|
+
const matches = url.match(/cover|preview|thumbnail|thumb|og|card|media|hero|share|social|maxres|highres|large|original/g) ?? [];
|
|
1378
|
+
const platformScore = platformThumbnailScore(url);
|
|
1379
|
+
if (matches.length === 0 && platformScore.score === 0) {
|
|
1348
1380
|
return { score: 0, reasons: [] };
|
|
1349
1381
|
}
|
|
1350
1382
|
const uniqueMatches = [...new Set(matches)];
|
|
1351
|
-
const score = Math.min(uniqueMatches.length * 4,
|
|
1383
|
+
const score = Math.min(uniqueMatches.length * 4, 14) + platformScore.score;
|
|
1384
|
+
const reasons = uniqueMatches.length > 0 ? [`URL matched preview hints (${uniqueMatches.join(", ")}) and added ${Math.min(uniqueMatches.length * 4, 14)} points`] : [];
|
|
1385
|
+
reasons.push(...platformScore.reasons);
|
|
1352
1386
|
return {
|
|
1353
1387
|
score,
|
|
1354
|
-
reasons
|
|
1388
|
+
reasons
|
|
1355
1389
|
};
|
|
1356
1390
|
}
|
|
1391
|
+
function platformThumbnailScore(url) {
|
|
1392
|
+
if (/ytimg\.com\/vi\/[^/]+\/(?:maxresdefault|sddefault|hqdefault)/i.test(url)) {
|
|
1393
|
+
return { score: 12, reasons: ["YouTube platform thumbnail added 12 points"] };
|
|
1394
|
+
}
|
|
1395
|
+
if (/(?:i|preview|external-preview)\.redd\.it|v\.redd\.it/i.test(url)) {
|
|
1396
|
+
return { score: 10, reasons: ["Reddit media host added 10 points"] };
|
|
1397
|
+
}
|
|
1398
|
+
if (/pbs\.twimg\.com\/media|pinimg\.com|cdninstagram\.com|fbcdn\.net|tiktokcdn\.com|mir-s3-cdn-cf\.behance\.net/i.test(url)) {
|
|
1399
|
+
return { score: 8, reasons: ["social platform media host added 8 points"] };
|
|
1400
|
+
}
|
|
1401
|
+
return { score: 0, reasons: [] };
|
|
1402
|
+
}
|
|
1357
1403
|
function scoreUrlPenalty(image) {
|
|
1358
1404
|
const url = image.url.toLowerCase();
|
|
1359
1405
|
let penalty = 0;
|
|
@@ -1426,6 +1472,12 @@ function countDuplicates(images) {
|
|
|
1426
1472
|
}
|
|
1427
1473
|
return counts;
|
|
1428
1474
|
}
|
|
1475
|
+
function imageArea(image) {
|
|
1476
|
+
return (image.width ?? 0) * (image.height ?? 0);
|
|
1477
|
+
}
|
|
1478
|
+
function sourceSortWeight(image) {
|
|
1479
|
+
return SOURCE_WEIGHT[image.source] ?? 50;
|
|
1480
|
+
}
|
|
1429
1481
|
function mediaSignature(url) {
|
|
1430
1482
|
try {
|
|
1431
1483
|
const parsed = new URL(url);
|
|
@@ -1443,16 +1495,45 @@ var IMAGE_KEYS = [
|
|
|
1443
1495
|
"thumbnailUrl",
|
|
1444
1496
|
"thumbnail_url",
|
|
1445
1497
|
"thumbnailSrc",
|
|
1498
|
+
"thumbnail_src",
|
|
1446
1499
|
"previewImage",
|
|
1447
1500
|
"preview_image",
|
|
1501
|
+
"preview",
|
|
1448
1502
|
"ogImage",
|
|
1503
|
+
"og_image",
|
|
1449
1504
|
"cardImage",
|
|
1505
|
+
"displayUrl",
|
|
1506
|
+
"display_url",
|
|
1507
|
+
"mediaUrl",
|
|
1508
|
+
"media_url",
|
|
1509
|
+
"media_url_https",
|
|
1510
|
+
"fullPicture",
|
|
1511
|
+
"full_picture",
|
|
1450
1512
|
"cover",
|
|
1451
1513
|
"coverImage",
|
|
1514
|
+
"cover_image",
|
|
1515
|
+
"original",
|
|
1516
|
+
"source",
|
|
1452
1517
|
"poster",
|
|
1518
|
+
"posterImage",
|
|
1519
|
+
"media"
|
|
1520
|
+
];
|
|
1521
|
+
var VIDEO_KEYS = [
|
|
1522
|
+
"video",
|
|
1523
|
+
"videos",
|
|
1524
|
+
"videoUrl",
|
|
1525
|
+
"video_url",
|
|
1526
|
+
"contentUrl",
|
|
1527
|
+
"content_url",
|
|
1528
|
+
"embedUrl",
|
|
1529
|
+
"embed_url",
|
|
1530
|
+
"playbackUrl",
|
|
1531
|
+
"playback_url",
|
|
1532
|
+
"fallback_url",
|
|
1533
|
+
"hls_url",
|
|
1534
|
+
"dash_url",
|
|
1453
1535
|
"media"
|
|
1454
1536
|
];
|
|
1455
|
-
var VIDEO_KEYS = ["video", "videos", "videoUrl", "video_url", "contentUrl", "embedUrl", "playbackUrl"];
|
|
1456
1537
|
var AUDIO_KEYS = ["audio", "audios", "audioUrl", "audio_url", "podcastUrl"];
|
|
1457
1538
|
function discoverMedia(rawSources, finalUrl) {
|
|
1458
1539
|
const trace = [];
|
|
@@ -1586,25 +1667,87 @@ function mediaFromJsonValue(value, kind, source) {
|
|
|
1586
1667
|
return value.flatMap((item) => mediaFromJsonValue(item, kind, source));
|
|
1587
1668
|
}
|
|
1588
1669
|
if (isRecord3(value)) {
|
|
1589
|
-
const
|
|
1670
|
+
const srcset = stringFromUnknown(value.srcset) ?? stringFromUnknown(value.srcSet);
|
|
1671
|
+
const srcsetAssets = parseSrcset(srcset).flatMap((url2) => mediaFromJsonValue(url2, kind, source));
|
|
1672
|
+
const url = mediaUrlFromRecord(value, kind);
|
|
1673
|
+
const nestedDetails = nestedMediaDetailsRecord(value, kind);
|
|
1590
1674
|
if (!url || !looksLikeMediaUrl(url, kind)) {
|
|
1591
|
-
return
|
|
1675
|
+
return srcsetAssets;
|
|
1592
1676
|
}
|
|
1593
1677
|
return [
|
|
1594
1678
|
{
|
|
1595
1679
|
url,
|
|
1596
1680
|
kind,
|
|
1597
1681
|
source,
|
|
1598
|
-
width: parseNumber(stringFromUnknown(value.width)),
|
|
1599
|
-
height: parseNumber(stringFromUnknown(value.height)),
|
|
1600
|
-
alt: stringFromUnknown(value.alt) ?? stringFromUnknown(value.caption) ?? stringFromUnknown(value.name),
|
|
1601
|
-
title: stringFromUnknown(value.title),
|
|
1602
|
-
type: stringFromUnknown(value.type) ?? stringFromUnknown(value.mimeType) ?? stringFromUnknown(value.encodingFormat)
|
|
1603
|
-
}
|
|
1682
|
+
width: parseNumber(stringFromUnknown(value.width)) ?? parseNumber(stringFromUnknown(nestedDetails?.width)),
|
|
1683
|
+
height: parseNumber(stringFromUnknown(value.height)) ?? parseNumber(stringFromUnknown(nestedDetails?.height)),
|
|
1684
|
+
alt: stringFromUnknown(value.alt) ?? stringFromUnknown(value.caption) ?? stringFromUnknown(value.name) ?? stringFromUnknown(nestedDetails?.alt),
|
|
1685
|
+
title: stringFromUnknown(value.title) ?? stringFromUnknown(nestedDetails?.title),
|
|
1686
|
+
type: stringFromUnknown(value.type) ?? stringFromUnknown(value.mimeType) ?? stringFromUnknown(value.encodingFormat) ?? stringFromUnknown(nestedDetails?.type)
|
|
1687
|
+
},
|
|
1688
|
+
...srcsetAssets
|
|
1604
1689
|
];
|
|
1605
1690
|
}
|
|
1606
1691
|
return [];
|
|
1607
1692
|
}
|
|
1693
|
+
function nestedMediaDetailsRecord(value, kind) {
|
|
1694
|
+
const candidates = [
|
|
1695
|
+
value.source,
|
|
1696
|
+
value.original,
|
|
1697
|
+
value.image,
|
|
1698
|
+
value.thumbnail,
|
|
1699
|
+
value.thumbnailUrl,
|
|
1700
|
+
value.thumbnail_url,
|
|
1701
|
+
value.previewImage,
|
|
1702
|
+
value.preview_image,
|
|
1703
|
+
value.video,
|
|
1704
|
+
value.reddit_video
|
|
1705
|
+
];
|
|
1706
|
+
return candidates.find((candidate) => isRecord3(candidate) && Boolean(mediaUrlFromRecord(candidate, kind)));
|
|
1707
|
+
}
|
|
1708
|
+
function mediaUrlFromRecord(value, kind) {
|
|
1709
|
+
const commonCandidates = [
|
|
1710
|
+
value.url,
|
|
1711
|
+
value.src,
|
|
1712
|
+
value.secure_url,
|
|
1713
|
+
value.secureUrl,
|
|
1714
|
+
value.contentUrl,
|
|
1715
|
+
value.content_url,
|
|
1716
|
+
value.embedUrl,
|
|
1717
|
+
value.embed_url,
|
|
1718
|
+
value.thumbnailUrl,
|
|
1719
|
+
value.thumbnail_url,
|
|
1720
|
+
value.thumbnailSrc,
|
|
1721
|
+
value.thumbnail_src,
|
|
1722
|
+
value.mediaUrl,
|
|
1723
|
+
value.media_url,
|
|
1724
|
+
value.media_url_https,
|
|
1725
|
+
value.displayUrl,
|
|
1726
|
+
value.display_url,
|
|
1727
|
+
value.fullPicture,
|
|
1728
|
+
value.full_picture,
|
|
1729
|
+
value.previewImage,
|
|
1730
|
+
value.preview_image,
|
|
1731
|
+
value.poster,
|
|
1732
|
+
value.posterUrl,
|
|
1733
|
+
value.poster_url,
|
|
1734
|
+
value.coverImage,
|
|
1735
|
+
value.cover_image,
|
|
1736
|
+
value.original,
|
|
1737
|
+
value.source
|
|
1738
|
+
];
|
|
1739
|
+
const videoCandidates = [
|
|
1740
|
+
value.videoUrl,
|
|
1741
|
+
value.video_url,
|
|
1742
|
+
value.playbackUrl,
|
|
1743
|
+
value.playback_url,
|
|
1744
|
+
value.fallback_url,
|
|
1745
|
+
value.hls_url,
|
|
1746
|
+
value.dash_url
|
|
1747
|
+
];
|
|
1748
|
+
const candidates = kind === "video" ? [...videoCandidates, ...commonCandidates] : commonCandidates;
|
|
1749
|
+
return candidates.map(stringFromUnknown).find((candidate) => candidate && looksLikeMediaUrl(candidate, kind));
|
|
1750
|
+
}
|
|
1608
1751
|
function assetFromEmbedded(value, kind, item, parent) {
|
|
1609
1752
|
return {
|
|
1610
1753
|
url: value,
|
|
@@ -1684,7 +1827,7 @@ function sourceRank(source) {
|
|
|
1684
1827
|
}
|
|
1685
1828
|
function shouldIgnoreMediaUrl2(url) {
|
|
1686
1829
|
const normalized = url.toLowerCase();
|
|
1687
|
-
return normalized.startsWith("data:") || normalized.startsWith("blob:") || normalized.startsWith("javascript:") || /(?:sprite|spacer|blank|transparent|placeholder|tracking|beacon|pixel|emoji)(?:[._/-]|$|\?)/i.test(normalized) || /(?:^|[/?_-])1x1(?:[._/-]|$|\?)/i.test(normalized);
|
|
1830
|
+
return normalized.startsWith("data:") || normalized.startsWith("blob:") || normalized.startsWith("javascript:") || /(?:sprite|spacer|blank|transparent|placeholder|tracking|beacon|pixel|emoji|favicon|apple-touch-icon)(?:[._/-]|$|\?)/i.test(normalized) || /(?:^|[/?_-])1x1(?:[._/-]|$|\?)/i.test(normalized);
|
|
1688
1831
|
}
|
|
1689
1832
|
function looksLikeMediaUrl(value, kind) {
|
|
1690
1833
|
if (shouldIgnoreMediaUrl2(value)) {
|
|
@@ -1692,10 +1835,10 @@ function looksLikeMediaUrl(value, kind) {
|
|
|
1692
1835
|
}
|
|
1693
1836
|
if (/^https?:\/\//i.test(value) || value.startsWith("/") || value.startsWith("./") || value.startsWith("../")) {
|
|
1694
1837
|
if (kind === "image") {
|
|
1695
|
-
return /\.(?:avif|webp|png|jpe?g|gif)(?:[?#].*)?$/i.test(value) || /(?:image|thumb|thumbnail|cover|poster|preview|media|og|card|photo)/i.test(value);
|
|
1838
|
+
return /\.(?:avif|webp|png|jpe?g|gif)(?:[?#].*)?$/i.test(value) || /(?:image|thumb|thumbnail|cover|poster|preview|media|og|card|photo|format=(?:jpg|jpeg|png|webp))/i.test(value) || /(?:ytimg\.com|i\.redd\.it|preview\.redd\.it|external-preview\.redd\.it|pbs\.twimg\.com|pinimg\.com|cdninstagram\.com|fbcdn\.net|tiktokcdn\.com|behance\.net)/i.test(value);
|
|
1696
1839
|
}
|
|
1697
1840
|
if (kind === "video") {
|
|
1698
|
-
return /\.(?:mp4|webm|m3u8|mov)(?:[?#].*)?$/i.test(value) || /(?:video|embed|player|watch|reel|shorts)/i.test(value);
|
|
1841
|
+
return /\.(?:mp4|webm|m3u8|mov)(?:[?#].*)?$/i.test(value) || /(?:video|embed|player|watch|reel|shorts|v\.redd\.it)/i.test(value);
|
|
1699
1842
|
}
|
|
1700
1843
|
if (kind === "audio") {
|
|
1701
1844
|
return /\.(?:mp3|m4a|wav|ogg|aac)(?:[?#].*)?$/i.test(value) || /(?:audio|podcast)/i.test(value);
|
|
@@ -1770,6 +1913,20 @@ function calculateConfidence(input) {
|
|
|
1770
1913
|
score -= Math.min(input.warnings.length * 3, 18);
|
|
1771
1914
|
return Math.round(clamp2(score, 0, 100));
|
|
1772
1915
|
}
|
|
1916
|
+
function calculateConfidenceBreakdown(input) {
|
|
1917
|
+
const title = qualityPoints(input.title, 100, 6, 120);
|
|
1918
|
+
const description = qualityPoints(input.description, 100, 24, 300);
|
|
1919
|
+
const image = input.bestImage ? clamp2(58 + Math.min(input.bestImage.score ?? 0, 100) * 0.27 + sourceConfidenceBonus(input.bestImage.source), 0, 100) : 0;
|
|
1920
|
+
const structuredData = input.hasStructuredData ? 100 : input.rawSources.embeddedData.items.length > 0 ? 55 : 0;
|
|
1921
|
+
const adapter = adapterSucceeded(input.rawSources.adapters) ? adapterConfidence(input.rawSources.adapters[0]) : 0;
|
|
1922
|
+
return {
|
|
1923
|
+
title: Math.round(title),
|
|
1924
|
+
description: Math.round(description),
|
|
1925
|
+
image: Math.round(image),
|
|
1926
|
+
structuredData: Math.round(structuredData),
|
|
1927
|
+
adapter: Math.round(adapter)
|
|
1928
|
+
};
|
|
1929
|
+
}
|
|
1773
1930
|
function calculateCompleteness(input) {
|
|
1774
1931
|
const weights = [
|
|
1775
1932
|
input.title ? 20 : 0,
|
|
@@ -1839,6 +1996,25 @@ function sourceConfidenceBonus(source) {
|
|
|
1839
1996
|
function adapterSucceeded(adapters) {
|
|
1840
1997
|
return adapters.some((adapter) => Boolean(adapter.title || adapter.description || adapter.images?.length || adapter.videos?.length));
|
|
1841
1998
|
}
|
|
1999
|
+
function adapterConfidence(adapter) {
|
|
2000
|
+
if (!adapter) {
|
|
2001
|
+
return 0;
|
|
2002
|
+
}
|
|
2003
|
+
let score = 45;
|
|
2004
|
+
if (adapter.title) {
|
|
2005
|
+
score += 22;
|
|
2006
|
+
}
|
|
2007
|
+
if (adapter.description) {
|
|
2008
|
+
score += 14;
|
|
2009
|
+
}
|
|
2010
|
+
if ((adapter.images?.length ?? 0) > 0 || (adapter.videos?.length ?? 0) > 0) {
|
|
2011
|
+
score += 14;
|
|
2012
|
+
}
|
|
2013
|
+
if (adapter.author) {
|
|
2014
|
+
score += 6;
|
|
2015
|
+
}
|
|
2016
|
+
return clamp2(score, 0, 100);
|
|
2017
|
+
}
|
|
1842
2018
|
function clamp2(value, min, max) {
|
|
1843
2019
|
return Math.max(min, Math.min(max, value));
|
|
1844
2020
|
}
|
|
@@ -1902,6 +2078,7 @@ function normalizeMetadata(rawSources, context = {}) {
|
|
|
1902
2078
|
const type = inferType(rawSources, externalResults, jsonLdNodes, article, product, app, playlist, videos, audio);
|
|
1903
2079
|
const author = firstResultValue(externalResults, (result) => result.author) ?? firstEntity(article?.authors) ?? entityFromEmbedded(embeddedNodes, ["author", "creator", "owner", "user"]);
|
|
1904
2080
|
const publisher = article?.publisher ?? firstResultValue(externalResults, (result) => result.publisher) ?? entityFromJsonLd(organizationNode) ?? entityFromEmbedded(embeddedNodes, ["publisher", "provider", "organization"]);
|
|
2081
|
+
const publishDate = firstDefined(article?.publishedTime, video?.publishedTime);
|
|
1905
2082
|
const sourcesUsed = detectSourcesUsed(rawSources);
|
|
1906
2083
|
const warnings = diagnosticsWarnings(rawSources, externalResults, context.diagnostics);
|
|
1907
2084
|
const fieldSources = {
|
|
@@ -1910,7 +2087,7 @@ function normalizeMetadata(rawSources, context = {}) {
|
|
|
1910
2087
|
author: fieldSource(rawSources, externalResults, embeddedNodes, "author", selectedImage.best),
|
|
1911
2088
|
image: fieldSource(rawSources, externalResults, embeddedNodes, "image", selectedImage.best)
|
|
1912
2089
|
};
|
|
1913
|
-
const
|
|
2090
|
+
const confidenceInput = {
|
|
1914
2091
|
title,
|
|
1915
2092
|
description,
|
|
1916
2093
|
bestImage: selectedImage.best,
|
|
@@ -1919,7 +2096,9 @@ function normalizeMetadata(rawSources, context = {}) {
|
|
|
1919
2096
|
rawSources,
|
|
1920
2097
|
sourcesUsed,
|
|
1921
2098
|
warnings
|
|
1922
|
-
}
|
|
2099
|
+
};
|
|
2100
|
+
const confidence = calculateConfidence(confidenceInput);
|
|
2101
|
+
const confidenceBreakdown = calculateConfidenceBreakdown(confidenceInput);
|
|
1923
2102
|
const completeness = calculateCompleteness({
|
|
1924
2103
|
title,
|
|
1925
2104
|
description,
|
|
@@ -1929,7 +2108,7 @@ function normalizeMetadata(rawSources, context = {}) {
|
|
|
1929
2108
|
author,
|
|
1930
2109
|
publisher,
|
|
1931
2110
|
type,
|
|
1932
|
-
publishedTime:
|
|
2111
|
+
publishedTime: publishDate,
|
|
1933
2112
|
mediaCount: images.length + videos.length + audio.length
|
|
1934
2113
|
});
|
|
1935
2114
|
const reliability = calculateReliability({
|
|
@@ -1948,7 +2127,19 @@ function normalizeMetadata(rawSources, context = {}) {
|
|
|
1948
2127
|
};
|
|
1949
2128
|
diagnostics.sourcesUsed = uniqueStrings2([...diagnostics.sourcesUsed, ...sourcesUsed]);
|
|
1950
2129
|
diagnostics.warnings = uniqueStrings2([...diagnostics.warnings, ...rawSources.jsonLd.warnings, ...externalResults.flatMap((result) => result.warnings ?? [])]);
|
|
2130
|
+
diagnostics.adapterUsed = diagnostics.adapterUsed ?? rawSources.adapters[0]?.source;
|
|
2131
|
+
diagnostics.extractionMethod = diagnostics.extractionMethod ?? adapterRawString(rawSources.adapters[0], "extractionMethod") ?? fieldSources.title;
|
|
2132
|
+
diagnostics.sourcePriority = uniqueStrings2([
|
|
2133
|
+
...diagnostics.sourcePriority ?? [],
|
|
2134
|
+
...arrayOfStrings(rawSources.adapters[0]?.raw?.sourcePriority) ?? []
|
|
2135
|
+
]);
|
|
2136
|
+
diagnostics.fallbacksAttempted = mergeFallbackAttempts(
|
|
2137
|
+
diagnostics.fallbacksAttempted,
|
|
2138
|
+
fallbackAttemptsFromUnknown(rawSources.adapters[0]?.raw?.fallbacksAttempted)
|
|
2139
|
+
);
|
|
2140
|
+
diagnostics.retryInfo = diagnostics.retryInfo ?? retryInfoFromUnknown(rawSources.adapters[0]?.raw?.retryInfo);
|
|
1951
2141
|
diagnostics.selectedImageReason = selectedImage.reason;
|
|
2142
|
+
diagnostics.confidenceBreakdown = confidenceBreakdown;
|
|
1952
2143
|
diagnostics.originalUrl = diagnostics.originalUrl ?? url;
|
|
1953
2144
|
diagnostics.finalUrl = diagnostics.finalUrl ?? finalUrl;
|
|
1954
2145
|
diagnostics.canonicalUrl = canonicalUrl;
|
|
@@ -1965,6 +2156,7 @@ function normalizeMetadata(rawSources, context = {}) {
|
|
|
1965
2156
|
type,
|
|
1966
2157
|
title,
|
|
1967
2158
|
description,
|
|
2159
|
+
publishDate,
|
|
1968
2160
|
siteName,
|
|
1969
2161
|
canonicalUrl,
|
|
1970
2162
|
confidence,
|
|
@@ -2254,6 +2446,61 @@ function adapterDiagnostics(adapters) {
|
|
|
2254
2446
|
confidence: Math.min(confidence, 100)
|
|
2255
2447
|
};
|
|
2256
2448
|
}
|
|
2449
|
+
function adapterRawString(adapter, key) {
|
|
2450
|
+
const value = adapter?.raw?.[key];
|
|
2451
|
+
return typeof value === "string" && value.trim() ? value.trim() : void 0;
|
|
2452
|
+
}
|
|
2453
|
+
function fallbackAttemptsFromUnknown(value) {
|
|
2454
|
+
if (!Array.isArray(value)) {
|
|
2455
|
+
return void 0;
|
|
2456
|
+
}
|
|
2457
|
+
const attempts = value.flatMap((item) => {
|
|
2458
|
+
if (!isJsonLdNode(item) || typeof item.method !== "string") {
|
|
2459
|
+
return [];
|
|
2460
|
+
}
|
|
2461
|
+
return [{
|
|
2462
|
+
method: item.method,
|
|
2463
|
+
url: typeof item.url === "string" ? item.url : void 0,
|
|
2464
|
+
ok: typeof item.ok === "boolean" ? item.ok : false,
|
|
2465
|
+
statusCode: typeof item.statusCode === "number" ? item.statusCode : void 0,
|
|
2466
|
+
blocked: typeof item.blocked === "boolean" ? item.blocked : void 0,
|
|
2467
|
+
error: typeof item.error === "string" ? item.error : void 0,
|
|
2468
|
+
retryAfter: typeof item.retryAfter === "string" ? item.retryAfter : void 0
|
|
2469
|
+
}];
|
|
2470
|
+
});
|
|
2471
|
+
return attempts.length > 0 ? attempts : void 0;
|
|
2472
|
+
}
|
|
2473
|
+
function mergeFallbackAttempts(existing, incoming) {
|
|
2474
|
+
const attempts = [...existing ?? [], ...incoming ?? []];
|
|
2475
|
+
if (attempts.length === 0) {
|
|
2476
|
+
return void 0;
|
|
2477
|
+
}
|
|
2478
|
+
const seen = /* @__PURE__ */ new Set();
|
|
2479
|
+
return attempts.filter((attempt) => {
|
|
2480
|
+
const key = `${attempt.method}:${attempt.url ?? ""}:${attempt.statusCode ?? ""}:${attempt.error ?? ""}`;
|
|
2481
|
+
if (seen.has(key)) {
|
|
2482
|
+
return false;
|
|
2483
|
+
}
|
|
2484
|
+
seen.add(key);
|
|
2485
|
+
return true;
|
|
2486
|
+
});
|
|
2487
|
+
}
|
|
2488
|
+
function retryInfoFromUnknown(value) {
|
|
2489
|
+
if (!isJsonLdNode(value)) {
|
|
2490
|
+
return void 0;
|
|
2491
|
+
}
|
|
2492
|
+
const retryable = typeof value.retryable === "boolean" ? value.retryable : void 0;
|
|
2493
|
+
if (retryable === void 0) {
|
|
2494
|
+
return void 0;
|
|
2495
|
+
}
|
|
2496
|
+
return {
|
|
2497
|
+
retryable,
|
|
2498
|
+
reason: typeof value.reason === "string" ? value.reason : void 0,
|
|
2499
|
+
retryAfter: typeof value.retryAfter === "string" ? value.retryAfter : void 0,
|
|
2500
|
+
retryAfterMs: typeof value.retryAfterMs === "number" ? value.retryAfterMs : void 0,
|
|
2501
|
+
attempts: typeof value.attempts === "number" ? value.attempts : void 0
|
|
2502
|
+
};
|
|
2503
|
+
}
|
|
2257
2504
|
function fieldSource(rawSources, externalResults, embeddedNodes, field, bestImage) {
|
|
2258
2505
|
if (field === "image") {
|
|
2259
2506
|
return bestImage ? sourceLabel2(bestImage) : void 0;
|
|
@@ -2502,23 +2749,26 @@ var youtubeAdapter = {
|
|
|
2502
2749
|
const videoId = getYouTubeVideoId(url);
|
|
2503
2750
|
const playlistId = getYouTubePlaylistId(url);
|
|
2504
2751
|
const communityPostId = getYouTubeCommunityPostId(url);
|
|
2752
|
+
const titleSelection = youtubeTitleFromContext(context, { videoId, playlistId, communityPostId });
|
|
2753
|
+
const descriptionSelection = youtubeDescriptionFromContext(context);
|
|
2505
2754
|
const channel = entityFromContext(context, ["author", "ownerChannelName", "channel", "owner"]);
|
|
2506
2755
|
const playlistVideos = playlistId ? extractPlaylistVideos(context) : [];
|
|
2756
|
+
const sourcePriority = youtubeSourcePriority();
|
|
2507
2757
|
return compactAdapterResult({
|
|
2508
2758
|
source: "youtubeAdapter",
|
|
2509
2759
|
platform: "YouTube",
|
|
2510
2760
|
type: playlistId ? "playlist" : communityPostId ? "social_post" : "video",
|
|
2511
2761
|
siteName: "YouTube",
|
|
2512
2762
|
canonicalUrl: videoId ? `https://www.youtube.com/watch?v=${videoId}` : context.raw.openGraph.url,
|
|
2513
|
-
title:
|
|
2514
|
-
description:
|
|
2763
|
+
title: titleSelection.value,
|
|
2764
|
+
description: descriptionSelection.value,
|
|
2515
2765
|
videos: markAdapterMedia(mediaFromContext(context).videos, "youtubeAdapter"),
|
|
2516
2766
|
images: markAdapterMedia(mediaFromContext(context).images, "youtubeAdapter"),
|
|
2517
2767
|
author: channel,
|
|
2518
2768
|
article: { publishedTime: publishedTimeFromContext(context) },
|
|
2519
2769
|
video: videoId ? {
|
|
2520
2770
|
id: videoId,
|
|
2521
|
-
title:
|
|
2771
|
+
title: titleSelection.value,
|
|
2522
2772
|
channel,
|
|
2523
2773
|
publishedTime: publishedTimeFromContext(context),
|
|
2524
2774
|
duration: findEmbeddedString(context, ["duration", "lengthSeconds", "approxDurationMs"]),
|
|
@@ -2528,11 +2778,15 @@ var youtubeAdapter = {
|
|
|
2528
2778
|
} : void 0,
|
|
2529
2779
|
playlist: playlistId ? {
|
|
2530
2780
|
id: playlistId,
|
|
2531
|
-
title:
|
|
2781
|
+
title: youtubePlaylistTitleFromContext(context) ?? context.raw.openGraph.title,
|
|
2532
2782
|
channel,
|
|
2533
2783
|
videos: playlistVideos
|
|
2534
2784
|
} : void 0,
|
|
2535
|
-
identifiers: { videoId, playlistId, communityPostId }
|
|
2785
|
+
identifiers: { videoId, playlistId, communityPostId },
|
|
2786
|
+
raw: {
|
|
2787
|
+
sourcePriority,
|
|
2788
|
+
extractionMethod: titleSelection.method ?? descriptionSelection.method ?? "youtube:htmlFallback"
|
|
2789
|
+
}
|
|
2536
2790
|
});
|
|
2537
2791
|
},
|
|
2538
2792
|
normalize(rawData) {
|
|
@@ -2551,20 +2805,27 @@ var redditAdapter = {
|
|
|
2551
2805
|
const url = new URL(context.finalUrl);
|
|
2552
2806
|
const reddit = parseRedditUrl(url);
|
|
2553
2807
|
const username = typeof reddit.username === "string" ? reddit.username : void 0;
|
|
2808
|
+
const titleSelection = redditTitleFromContext(context);
|
|
2809
|
+
const descriptionSelection = redditDescriptionFromContext(context);
|
|
2810
|
+
const sourcePriority = redditSourcePriority();
|
|
2554
2811
|
return compactAdapterResult({
|
|
2555
2812
|
source: "redditAdapter",
|
|
2556
2813
|
platform: "Reddit",
|
|
2557
2814
|
type: reddit.isPost ? "social_post" : "website",
|
|
2558
2815
|
siteName: "Reddit",
|
|
2559
|
-
canonicalUrl: context.raw.openGraph.url,
|
|
2560
|
-
title: cleanSocialTitle(
|
|
2561
|
-
description:
|
|
2816
|
+
canonicalUrl: context.raw.openGraph.url ?? context.raw.html.canonicalUrl,
|
|
2817
|
+
title: cleanSocialTitle(titleSelection.value),
|
|
2818
|
+
description: descriptionSelection.value,
|
|
2562
2819
|
images: markAdapterMedia(mediaFromContext(context).images, "redditAdapter"),
|
|
2563
2820
|
videos: markAdapterMedia(mediaFromContext(context).videos, "redditAdapter"),
|
|
2564
2821
|
author: username ? { name: username } : entityFromContext(context, ["author", "submitter", "user"]),
|
|
2565
2822
|
article: { publishedTime: publishedTimeFromContext(context) },
|
|
2566
2823
|
identifiers: { subreddit: reddit.subreddit, postId: reddit.postId, username: reddit.username },
|
|
2567
|
-
raw: {
|
|
2824
|
+
raw: {
|
|
2825
|
+
...reddit,
|
|
2826
|
+
sourcePriority,
|
|
2827
|
+
extractionMethod: titleSelection.method ?? descriptionSelection.method ?? "reddit:htmlFallback"
|
|
2828
|
+
}
|
|
2568
2829
|
});
|
|
2569
2830
|
},
|
|
2570
2831
|
normalize(rawData) {
|
|
@@ -2665,6 +2926,7 @@ var facebookAdapter = {
|
|
|
2665
2926
|
platform: "Facebook",
|
|
2666
2927
|
type: isPhoto ? "image" : isPost || media.images.length > 0 || media.videos.length > 0 ? "social_post" : "website",
|
|
2667
2928
|
siteName: "Facebook",
|
|
2929
|
+
canonicalUrl: context.raw.openGraph.url,
|
|
2668
2930
|
title: titleFromContext(context, ["title", "headline", "name"]),
|
|
2669
2931
|
description: descriptionFromContext(context),
|
|
2670
2932
|
images: markAdapterMedia(media.images, "facebookAdapter"),
|
|
@@ -2748,6 +3010,116 @@ var defaultAdapters = [
|
|
|
2748
3010
|
twitterAdapter,
|
|
2749
3011
|
instagramAdapter
|
|
2750
3012
|
];
|
|
3013
|
+
function youtubeSourcePriority() {
|
|
3014
|
+
return [
|
|
3015
|
+
"structuredData:VideoObject",
|
|
3016
|
+
"embeddedData:ytInitialPlayerResponse",
|
|
3017
|
+
"embeddedData:ytInitialData",
|
|
3018
|
+
"openGraph",
|
|
3019
|
+
"twitter",
|
|
3020
|
+
"html"
|
|
3021
|
+
];
|
|
3022
|
+
}
|
|
3023
|
+
function youtubeTitleFromContext(context, ids) {
|
|
3024
|
+
const videoObjectTitle = jsonLdVideoObjectString(context, ["name", "headline"]);
|
|
3025
|
+
if (videoObjectTitle) {
|
|
3026
|
+
return { value: videoObjectTitle, method: "youtube:structuredData.VideoObject" };
|
|
3027
|
+
}
|
|
3028
|
+
const playerTitle = youtubePlayerString(context, ["videoDetails.title", "microformat.playerMicroformatRenderer.title"]);
|
|
3029
|
+
if (playerTitle) {
|
|
3030
|
+
return { value: playerTitle, method: "youtube:ytInitialPlayerResponse" };
|
|
3031
|
+
}
|
|
3032
|
+
const initialDataTitle = youtubeInitialDataTitle(context, ids);
|
|
3033
|
+
if (initialDataTitle) {
|
|
3034
|
+
return { value: initialDataTitle, method: "youtube:ytInitialData" };
|
|
3035
|
+
}
|
|
3036
|
+
if (context.raw.openGraph.title) {
|
|
3037
|
+
return { value: context.raw.openGraph.title, method: "youtube:openGraph" };
|
|
3038
|
+
}
|
|
3039
|
+
if (context.raw.twitter.title) {
|
|
3040
|
+
return { value: context.raw.twitter.title, method: "youtube:twitter" };
|
|
3041
|
+
}
|
|
3042
|
+
return { value: cleanYouTubeHtmlTitle(context.raw.html.title), method: context.raw.html.title ? "youtube:html" : void 0 };
|
|
3043
|
+
}
|
|
3044
|
+
function youtubeDescriptionFromContext(context) {
|
|
3045
|
+
const videoObjectDescription = jsonLdVideoObjectString(context, ["description"]);
|
|
3046
|
+
if (videoObjectDescription) {
|
|
3047
|
+
return { value: videoObjectDescription, method: "youtube:structuredData.VideoObject" };
|
|
3048
|
+
}
|
|
3049
|
+
const playerDescription = youtubePlayerString(context, [
|
|
3050
|
+
"videoDetails.shortDescription",
|
|
3051
|
+
"microformat.playerMicroformatRenderer.description",
|
|
3052
|
+
"microformat.playerMicroformatRenderer.shortDescription"
|
|
3053
|
+
]);
|
|
3054
|
+
if (playerDescription) {
|
|
3055
|
+
return { value: playerDescription, method: "youtube:ytInitialPlayerResponse" };
|
|
3056
|
+
}
|
|
3057
|
+
const initialDataDescription = youtubeInitialDataDescription(context);
|
|
3058
|
+
if (initialDataDescription) {
|
|
3059
|
+
return { value: initialDataDescription, method: "youtube:ytInitialData" };
|
|
3060
|
+
}
|
|
3061
|
+
if (context.raw.openGraph.description) {
|
|
3062
|
+
return { value: context.raw.openGraph.description, method: "youtube:openGraph" };
|
|
3063
|
+
}
|
|
3064
|
+
if (context.raw.twitter.description) {
|
|
3065
|
+
return { value: context.raw.twitter.description, method: "youtube:twitter" };
|
|
3066
|
+
}
|
|
3067
|
+
return { value: context.raw.html.description, method: context.raw.html.description ? "youtube:html" : void 0 };
|
|
3068
|
+
}
|
|
3069
|
+
/**
 * List the Reddit metadata sources in the order they are preferred.
 *
 * @returns {string[]} A fresh array of source labels, most preferred first.
 */
function redditSourcePriority() {
  return [
    "redditJsonEndpoint",
    "oldReddit",
    "embeddedStructuredData",
    "openGraph",
    "twitter",
    "html"
  ];
}
|
|
3079
|
+
/**
 * Resolve the title for a Reddit page from the first source that yields a value.
 *
 * Order: embedded JSON payloads, JSON-LD posting/article nodes, Open Graph,
 * Twitter card, then the plain HTML title.
 *
 * @param {object} context - Extraction context; `context.raw.jsonLd.nodes`,
 *   `context.raw.openGraph`, `context.raw.twitter` and `context.raw.html` are read here.
 * @returns {{ value: (string|undefined), method: (string|undefined) }} The chosen
 *   text plus a label naming its source; `method` is undefined when nothing was found.
 */
function redditTitleFromContext(context) {
  const embedded = findEmbeddedStringBySources(context, ["applicationJson", "jsonScript", "initialState", "preloadedState", "nextData"], [
    "postTitle",
    "title",
    "headline"
  ]);
  if (embedded) {
    // An embedded hit is credited to the JSON endpoint whenever the context carries
    // the "metanova-reddit-json" payload, otherwise to generic embedded data.
    return { value: embedded, method: hasRedditJsonEndpointPayload(context) ? "reddit:jsonEndpoint" : "reddit:embeddedStructuredData" };
  }
  const structured = jsonLdStringByType(context.raw.jsonLd.nodes, ["SocialMediaPosting", "DiscussionForumPosting", "Article"], ["headline", "name"]);
  if (structured) {
    return { value: structured, method: "reddit:structuredData" };
  }
  if (context.raw.openGraph.title) {
    return { value: context.raw.openGraph.title, method: "reddit:openGraph" };
  }
  if (context.raw.twitter.title) {
    return { value: context.raw.twitter.title, method: "reddit:twitter" };
  }
  return { value: context.raw.html.title, method: context.raw.html.title ? "reddit:html" : void 0 };
}
|
|
3100
|
+
/**
 * Resolve the description for a Reddit page from the first source that yields a value.
 *
 * Order: embedded JSON payloads, JSON-LD posting/article nodes, Open Graph,
 * Twitter card, then the plain HTML description.
 *
 * @param {object} context - Extraction context; `context.raw.jsonLd.nodes`,
 *   `context.raw.openGraph`, `context.raw.twitter` and `context.raw.html` are read here.
 * @returns {{ value: (string|undefined), method: (string|undefined) }} The chosen
 *   text plus a label naming its source; `method` is undefined when nothing was found.
 */
function redditDescriptionFromContext(context) {
  const embedded = findEmbeddedStringBySources(context, ["applicationJson", "jsonScript", "initialState", "preloadedState", "nextData"], [
    "description",
    "selftext",
    "excerpt",
    "summary",
    "body"
  ]);
  if (embedded) {
    // An embedded hit is credited to the JSON endpoint whenever the context carries
    // the "metanova-reddit-json" payload, otherwise to generic embedded data.
    return { value: embedded, method: hasRedditJsonEndpointPayload(context) ? "reddit:jsonEndpoint" : "reddit:embeddedStructuredData" };
  }
  const structured = jsonLdStringByType(context.raw.jsonLd.nodes, ["SocialMediaPosting", "DiscussionForumPosting", "Article"], ["description", "articleBody"]);
  if (structured) {
    return { value: structured, method: "reddit:structuredData" };
  }
  if (context.raw.openGraph.description) {
    return { value: context.raw.openGraph.description, method: "reddit:openGraph" };
  }
  if (context.raw.twitter.description) {
    return { value: context.raw.twitter.description, method: "reddit:twitter" };
  }
  return { value: context.raw.html.description, method: context.raw.html.description ? "reddit:html" : void 0 };
}
|
|
2751
3123
|
function socialVideoResult(source, platform, context) {
|
|
2752
3124
|
const url = new URL(context.finalUrl);
|
|
2753
3125
|
const username = url.pathname.match(/@([^/]+)/)?.[1];
|
|
@@ -2806,6 +3178,143 @@ function markAdapterMedia(assets, adapterName) {
|
|
|
2806
3178
|
}
|
|
2807
3179
|
}));
|
|
2808
3180
|
}
|
|
3181
|
+
/**
 * Look up the first non-empty string among `keys` on a JSON-LD VideoObject node.
 *
 * @param {object} context - Extraction context; reads `context.raw.jsonLd.nodes`.
 * @param {string[]} keys - Property names to try, in order, on each matching node.
 * @returns {string|undefined} The first value found, or undefined.
 */
function jsonLdVideoObjectString(context, keys) {
  return jsonLdStringByType(context.raw.jsonLd.nodes, ["VideoObject"], keys);
}
|
|
3184
|
+
/**
 * Scan JSON-LD nodes for the first one of an accepted type that has a usable string
 * under one of `keys`.
 *
 * Nodes are visited in order; within a node, keys are tried in order.
 *
 * @param {Iterable<object>} nodes - JSON-LD nodes to scan.
 * @param {string[]} types - Accepted `@type` names (matched via `hasJsonLdType2`).
 * @param {string[]} keys - Property names to read from a matching node.
 * @returns {string|undefined} The first string found, or undefined.
 */
function jsonLdStringByType(nodes, types, keys) {
  for (const node of nodes) {
    if (!hasJsonLdType2(node, types)) {
      continue;
    }
    for (const key of keys) {
      const value = stringFromUnknown3(node[key]);
      if (value) {
        return value;
      }
    }
  }
  return void 0;
}
|
|
3198
|
+
/**
 * Report whether a JSON-LD node's `@type` matches any of the given type names.
 *
 * `@type` may be a single value or an array. Matching is case-insensitive and uses
 * a suffix comparison, so a prefixed value such as "https://schema.org/VideoObject"
 * matches "VideoObject". Non-string type entries never match.
 *
 * @param {object} node - JSON-LD node.
 * @param {string[]} types - Candidate type names.
 * @returns {boolean} True when at least one node type ends with a candidate.
 */
function hasJsonLdType2(node, types) {
  const nodeTypes = Array.isArray(node["@type"]) ? node["@type"] : [node["@type"]];
  return nodeTypes.some((type) => typeof type === "string" && types.some((candidate) => type.toLowerCase().endsWith(candidate.toLowerCase())));
}
|
|
3202
|
+
/**
 * Read the first non-empty string at any of the given dotted paths from embedded
 * items whose source is "youtubePlayerResponse".
 *
 * Items are visited in order; within an item, paths are tried in order.
 *
 * @param {object} context - Extraction context; reads `context.raw.embeddedData.items`.
 * @param {string[]} paths - Dotted property paths resolved with `valueAtPath`.
 * @returns {string|undefined} The first string found, or undefined.
 */
function youtubePlayerString(context, paths) {
  for (const item of context.raw.embeddedData.items) {
    if (item.source !== "youtubePlayerResponse") {
      continue;
    }
    for (const path of paths) {
      const value = stringFromUnknown3(valueAtPath(item.data, path));
      if (value) {
        return value;
      }
    }
  }
  return void 0;
}
|
|
3216
|
+
/**
 * Derive a title from embedded "youtubeInitialData" items.
 *
 * Lookup order:
 *   1. `title` on a videoPrimaryInfoRenderer / watchMetadata record;
 *   2. when `ids.videoId` is set, `title` on any record whose `videoId` matches;
 *   3. when `ids.communityPostId` is set, community post text;
 *   4. when `ids.playlistId` is set and there is no `videoId`, the playlist title.
 *
 * @param {object} context - Extraction context; reads `context.raw.embeddedData.items`.
 * @param {{ videoId?: string, communityPostId?: string, playlistId?: string }} ids -
 *   Identifiers associated with the page.
 * @returns {string|undefined} The resolved title, or undefined.
 */
function youtubeInitialDataTitle(context, ids) {
  const items = context.raw.embeddedData.items.filter((item) => item.source === "youtubeInitialData");
  const primary = findRendererText(items, ["videoPrimaryInfoRenderer", "watchMetadata"], ["title"]);
  if (primary) {
    return primary;
  }
  if (ids.videoId) {
    const matchingVideo = findYouTubeRendererForVideoId(items, ids.videoId, ["title"]);
    if (matchingVideo) {
      return matchingVideo;
    }
  }
  if (ids.communityPostId) {
    const communityPost = findEmbeddedStringBySources(context, ["youtubeInitialData"], ["contentText"]) ?? findRendererText(items, ["backstagePostRenderer", "postRenderer"], ["contentText", "title"]);
    if (communityPost) {
      return communityPost;
    }
  }
  if (ids.playlistId && !ids.videoId) {
    return findRendererText(items, ["playlistMetadataRenderer", "playlistHeaderRenderer"], ["title", "playlistTitle", "name"]);
  }
  return void 0;
}
|
|
3239
|
+
/**
 * Derive a description from embedded "youtubeInitialData" items by reading
 * `description`, `attributedDescription` or `content` from the description-related
 * renderer records.
 *
 * @param {object} context - Extraction context; reads `context.raw.embeddedData.items`.
 * @returns {string|undefined} The first description text found, or undefined.
 */
function youtubeInitialDataDescription(context) {
  const items = context.raw.embeddedData.items.filter((item) => item.source === "youtubeInitialData");
  return findRendererText(items, ["expandableVideoDescriptionBodyRenderer", "videoSecondaryInfoRenderer", "watchMetadata"], [
    "description",
    "attributedDescription",
    "content"
  ]);
}
|
|
3247
|
+
/**
 * Derive a playlist title from embedded "youtubeInitialData" items by reading
 * `title`, `playlistTitle` or `name` from the playlist metadata/header renderers.
 *
 * @param {object} context - Extraction context; reads `context.raw.embeddedData.items`.
 * @returns {string|undefined} The playlist title, or undefined.
 */
function youtubePlaylistTitleFromContext(context) {
  const items = context.raw.embeddedData.items.filter((item) => item.source === "youtubeInitialData");
  return findRendererText(items, ["playlistMetadataRenderer", "playlistHeaderRenderer"], ["title", "playlistTitle", "name"]);
}
|
|
3251
|
+
/**
 * Find text on the first renderer record that has a usable string.
 *
 * Each item's data is traversed with `walkData`, whose callback receives
 * `(value, key)`. A visited value qualifies when its key is listed in
 * `rendererKeys` and it is a record; `textKeys` are then tried in order. A
 * qualifying record with no usable text does not stop the search.
 *
 * @param {Array<{ data: unknown }>} items - Embedded data items to search.
 * @param {string[]} rendererKeys - Property names that identify renderer records.
 * @param {string[]} textKeys - Property names to read from a renderer record.
 * @returns {string|undefined} The first text found, or undefined.
 */
function findRendererText(items, rendererKeys, textKeys) {
  for (const item of items) {
    let found;
    walkData(item.data, (value, key) => {
      // Once a result exists, remaining callbacks are no-ops.
      if (found || !key || !rendererKeys.includes(key) || !isRecord4(value)) {
        return;
      }
      for (const textKey of textKeys) {
        found = stringFromUnknown3(value[textKey]);
        if (found) {
          return;
        }
      }
    });
    if (found) {
      return found;
    }
  }
  return void 0;
}
|
|
3271
|
+
/**
 * Find text on the first record whose `videoId` equals the requested id.
 *
 * Each item's data is traversed with `walkData`; for every record whose
 * `videoId` (as a string) matches, `textKeys` are tried in order. A matching
 * record with no usable text does not stop the search.
 *
 * @param {Array<{ data: unknown }>} items - Embedded data items to search.
 * @param {string} videoId - Video id to match.
 * @param {string[]} textKeys - Property names to read from a matching record.
 * @returns {string|undefined} The first text found, or undefined.
 */
function findYouTubeRendererForVideoId(items, videoId, textKeys) {
  for (const item of items) {
    let found;
    walkData(item.data, (value) => {
      // Once a result exists, remaining callbacks are no-ops.
      if (found || !isRecord4(value) || stringFromUnknown3(value.videoId) !== videoId) {
        return;
      }
      for (const textKey of textKeys) {
        found = stringFromUnknown3(value[textKey]);
        if (found) {
          return;
        }
      }
    });
    if (found) {
      return found;
    }
  }
  return void 0;
}
|
|
3291
|
+
/**
 * Collect every string stored under a matching key in embedded data from the
 * given sources, then let `bestTextCandidate` pick the winner.
 *
 * @param {object} context - Extraction context; reads `context.raw.embeddedData.items`.
 * @param {string[]} sources - Accepted `item.source` values.
 * @param {string[]} keys - Key names passed to `matchesKey` for each visited key.
 * @returns {string|undefined} Whatever `bestTextCandidate` returns for the collected
 *   candidates (presumably undefined when there are none — confirm against that helper).
 */
function findEmbeddedStringBySources(context, sources, keys) {
  const candidates = [];
  for (const item of context.raw.embeddedData.items) {
    if (!sources.includes(item.source)) {
      continue;
    }
    walkData(item.data, (value, key) => {
      if (!key || !matchesKey(key, keys)) {
        return;
      }
      const text = stringFromUnknown3(value);
      if (text) {
        candidates.push(text);
      }
    });
  }
  return bestTextCandidate(candidates);
}
|
|
3309
|
+
/**
 * Report whether the context contains the embedded payload that marks a page
 * synthesized from the Reddit JSON endpoint: an "applicationJson" item whose
 * `path` is "metanova-reddit-json".
 *
 * @param {object} context - Extraction context; reads `context.raw.embeddedData.items`.
 * @returns {boolean} True when such an item exists.
 */
function hasRedditJsonEndpointPayload(context) {
  return context.raw.embeddedData.items.some((item) => item.source === "applicationJson" && item.path === "metanova-reddit-json");
}
|
|
3312
|
+
/**
 * Resolve a dotted property path (e.g. "a.b.c") against a nested value.
 *
 * Traversal stops with undefined as soon as an intermediate value is not a
 * record according to `isRecord4`.
 *
 * @param {unknown} node - Root value.
 * @param {string} path - Dot-separated property names.
 * @returns {unknown} The value at the path, or undefined.
 */
function valueAtPath(node, path) {
  return path.split(".").reduce((current, key) => isRecord4(current) ? current[key] : void 0, node);
}
|
|
3315
|
+
/**
 * Strip a trailing "- YouTube" suffix (case-insensitive, surrounding whitespace
 * tolerated) from an HTML title and trim the result.
 *
 * @param {string|undefined|null} title - Raw HTML title.
 * @returns {string|undefined} The cleaned title, or undefined when `title` is nullish.
 */
function cleanYouTubeHtmlTitle(title) {
  return title?.replace(/\s*-\s*YouTube\s*$/i, "").trim();
}
|
|
2809
3318
|
function titleFromContext(context, embeddedKeys) {
|
|
2810
3319
|
return firstText(
|
|
2811
3320
|
context.raw.openGraph.title,
|
|
@@ -3373,9 +3882,8 @@ async function fetchMetadata(url, options = {}) {
|
|
|
3373
3882
|
const startedAt = Date.now();
|
|
3374
3883
|
try {
|
|
3375
3884
|
const requestedUrl = normalizeUrl(url);
|
|
3376
|
-
const
|
|
3377
|
-
const
|
|
3378
|
-
const page = fallback.page;
|
|
3885
|
+
const fetchResult = await fetchPageWithStrategies(requestedUrl, options);
|
|
3886
|
+
const page = fetchResult.page;
|
|
3379
3887
|
const directMedia = createDirectMediaMetadata(page, requestedUrl, Date.now() - startedAt);
|
|
3380
3888
|
if (directMedia) {
|
|
3381
3889
|
return directMedia;
|
|
@@ -3396,12 +3904,17 @@ async function fetchMetadata(url, options = {}) {
|
|
|
3396
3904
|
metadata.diagnostics.trace = [
|
|
3397
3905
|
...page.isShortUrl ? [`detected short URL provider: ${page.shortUrlProvider ?? "unknown"}`] : [],
|
|
3398
3906
|
...page.redirects.length > 0 ? [`resolved ${page.redirects.length} redirect${page.redirects.length === 1 ? "" : "s"}`] : [],
|
|
3399
|
-
...
|
|
3907
|
+
...fetchResult.trace,
|
|
3400
3908
|
"downloaded page",
|
|
3401
3909
|
...metadata.diagnostics.trace,
|
|
3402
3910
|
...metadata.canonicalUrl ? ["resolved canonical URL"] : []
|
|
3403
3911
|
];
|
|
3912
|
+
metadata.diagnostics.fallbacksAttempted = mergeFallbackAttempts2(metadata.diagnostics.fallbacksAttempted, fetchResult.fallbacksAttempted);
|
|
3913
|
+
metadata.diagnostics.sourcePriority = uniqueStrings3([...metadata.diagnostics.sourcePriority ?? [], ...fetchResult.sourcePriority ?? []]);
|
|
3914
|
+
metadata.diagnostics.extractionMethod = metadata.diagnostics.extractionMethod ?? fetchResult.extractionMethod;
|
|
3915
|
+
metadata.diagnostics.retryInfo = metadata.diagnostics.retryInfo ?? fetchResult.retryInfo;
|
|
3404
3916
|
metadata.trace = metadata.diagnostics.trace;
|
|
3917
|
+
metadata.diagnostics.warnings.push(...fetchResult.warnings);
|
|
3405
3918
|
if (!metadata.ok) {
|
|
3406
3919
|
metadata.diagnostics.warnings.push(`Fetch completed with non-success status code ${page.statusCode}.`);
|
|
3407
3920
|
}
|
|
@@ -3439,30 +3952,441 @@ async function fetchMetadata(url, options = {}) {
|
|
|
3439
3952
|
};
|
|
3440
3953
|
}
|
|
3441
3954
|
}
|
|
3442
|
-
async function
|
|
3443
|
-
|
|
3955
|
+
/**
 * Fetch a page, routing Reddit URLs through the Reddit-specific fallback chain.
 *
 * Non-Reddit URLs are fetched once with `fetchPage` and returned with empty
 * fallback, warning and trace lists.
 *
 * @param {string} requestedUrl - URL to fetch.
 * @param {object} options - Options forwarded to the fetch helpers.
 * @returns {Promise<object>} An object with `page`, `fallbacksAttempted`, `warnings`
 *   and `trace`; the Reddit path may add further fields.
 */
async function fetchPageWithStrategies(requestedUrl, options) {
  if (isRedditUrl(requestedUrl)) {
    return fetchRedditPageWithStrategy(requestedUrl, options);
  }
  return {
    page: await fetchPage(requestedUrl, options),
    fallbacksAttempted: [],
    warnings: [],
    trace: []
  };
}
|
|
3966
|
+
/**
 * Fetch a Reddit page using a chain of strategies, returning the first usable one.
 *
 * Strategies, in order:
 *   1. the `.json` endpoint, accepted only when a post with a title can be parsed,
 *      in which case an HTML page is synthesized from the payload;
 *   2. the same URL on old.reddit.com (skipped when it equals the requested URL);
 *   3. the requested URL itself, accepted whenever any page came back, even if it
 *      looks blocked (a warning is recorded).
 *
 * Every attempt is recorded in `fallbacksAttempted`.
 *
 * @param {string} requestedUrl - Reddit URL to fetch.
 * @param {object} options - Options forwarded to `fetchPage` via `attemptFetch`.
 * @returns {Promise<object>} `page`, `fallbacksAttempted`, `warnings`, `trace`,
 *   `sourcePriority`, `extractionMethod` and `retryInfo`.
 * @throws {Error} When the final HTML attempt produced no page; the message is the
 *   last recorded fetch error when there is one.
 */
async function fetchRedditPageWithStrategy(requestedUrl, options) {
  const attempts = [];
  const warnings = [];
  // NOTE(review): redditSourcePriority() elsewhere in this file also lists "twitter";
  // confirm whether the omission here is deliberate.
  const sourcePriority = ["redditJsonEndpoint", "oldReddit", "embeddedStructuredData", "openGraph", "html"];
  let lastError;
  const jsonUrl = redditJsonEndpoint(requestedUrl);
  if (jsonUrl) {
    const attempt = await attemptFetch("redditJsonEndpoint", jsonUrl, {
      ...options,
      accept: "application/json,text/html;q=0.8,*/*;q=0.5"
    });
    attempts.push(attempt);
    lastError = attempt.error;
    if (attempt.page && attempt.ok && !attempt.blocked) {
      const redditPost = parseRedditJsonPayload(attempt.page.html);
      if (redditPost?.title) {
        return {
          page: synthesizeRedditJsonPage(attempt.page, requestedUrl, redditPost),
          fallbacksAttempted: attempts,
          warnings,
          trace: ["used Reddit JSON endpoint"],
          sourcePriority,
          extractionMethod: "reddit:jsonEndpoint",
          retryInfo: redditRetryInfo(attempts)
        };
      }
      warnings.push("Reddit JSON endpoint responded, but no post payload could be extracted.");
    } else if (attempt.blocked) {
      warnings.push("Reddit JSON endpoint appears to have blocked access.");
    }
  }
  const oldRedditUrl = redditOldUrl(requestedUrl);
  if (oldRedditUrl && oldRedditUrl !== requestedUrl) {
    const attempt = await attemptFetch("oldReddit", oldRedditUrl, options);
    attempts.push(attempt);
    lastError = attempt.error;
    if (attempt.page && attempt.ok && !attempt.blocked) {
      return {
        page: attempt.page,
        fallbacksAttempted: attempts,
        warnings,
        trace: ["retried Reddit page through old.reddit"],
        sourcePriority,
        extractionMethod: "reddit:oldReddit",
        retryInfo: redditRetryInfo(attempts)
      };
    }
    if (attempt.blocked) {
      warnings.push("old.reddit fallback appears to have been blocked.");
    }
  }
  const htmlAttempt = await attemptFetch("redditHtmlFallback", requestedUrl, options);
  attempts.push(htmlAttempt);
  lastError = htmlAttempt.error;
  if (htmlAttempt.page) {
    if (htmlAttempt.blocked) {
      warnings.push("Reddit HTML fallback appears to have been blocked; metadata may be incomplete.");
    }
    return {
      page: htmlAttempt.page,
      fallbacksAttempted: attempts,
      warnings,
      trace: ["used Reddit HTML fallback"],
      sourcePriority,
      extractionMethod: "reddit:htmlFallback",
      retryInfo: redditRetryInfo(attempts)
    };
  }
  // attemptFetch stores only the error *message*, so wrap it: throwing the bare
  // string would lose the stack and defeat `instanceof Error` checks upstream.
  throw new Error(lastError ?? "All Reddit extraction fetch attempts failed.");
}
|
|
4036
|
+
/**
 * Run one fetch attempt and describe its outcome without throwing.
 *
 * On success the record carries the page, its status code, the `retry-after`
 * header and whether the response looks blocked per `isRedditBlocked`; `ok`
 * requires a 2xx status and no block. On failure only the error message is kept.
 *
 * @param {string} method - Label for this attempt (e.g. "oldReddit").
 * @param {string} url - URL to fetch.
 * @param {object} options - Options forwarded to `fetchPage`.
 * @returns {Promise<object>} The attempt record.
 */
async function attemptFetch(method, url, options) {
  try {
    const page = await fetchPage(url, options);
    const retryAfter = page.headers["retry-after"];
    const blocked = isRedditBlocked(page);
    return {
      method,
      url,
      ok: page.statusCode >= 200 && page.statusCode < 300 && !blocked,
      statusCode: page.statusCode,
      blocked,
      retryAfter,
      page
    };
  } catch (error) {
    // Failures are reported as data so the caller can move on to the next strategy.
    return {
      method,
      url,
      ok: false,
      error: error instanceof Error ? error.message : String(error)
    };
  }
}
|
|
4059
|
+
/**
 * Report whether a URL points at Reddit.
 *
 * The host is lower-cased and a leading "www." is dropped; "reddit.com",
 * "redd.it" and any "*.reddit.com" subdomain count as Reddit.
 *
 * @param {string} url - URL to inspect.
 * @returns {boolean} True for Reddit hosts; false otherwise, including unparsable input.
 */
function isRedditUrl(url) {
  try {
    const host = new URL(url).hostname.toLowerCase().replace(/^www\./, "");
    return host === "reddit.com" || host === "redd.it" || host.endsWith(".reddit.com");
  } catch {
    // Not a parsable URL, so it cannot be a Reddit URL.
    return false;
  }
}
|
|
4067
|
+
/**
 * Build the `.json` endpoint URL on www.reddit.com for a Reddit URL.
 *
 * The result always uses https, drops the original query string and sets
 * `raw_json=1`. For redd.it short links the first path segment is treated as the
 * post id and mapped to `/comments/<id>.json`. For other hosts `.json` is appended
 * after a trailing slash unless the path already ends with `.json`.
 *
 * @param {string} url - Reddit URL.
 * @returns {string|undefined} The endpoint URL, or undefined when the input cannot
 *   be parsed or a redd.it link carries no post id.
 */
function redditJsonEndpoint(url) {
  try {
    const parsed = new URL(url);
    const host = parsed.hostname.toLowerCase().replace(/^www\./, "");
    const endpoint = new URL(url);
    endpoint.protocol = "https:";
    endpoint.hostname = "www.reddit.com";
    endpoint.search = "";
    if (host === "redd.it") {
      const postId = parsed.pathname.split("/").filter(Boolean)[0];
      if (!postId) {
        return void 0;
      }
      endpoint.pathname = `/comments/${postId}.json`;
    } else {
      endpoint.pathname = parsed.pathname.endsWith(".json") ? parsed.pathname : `${parsed.pathname.endsWith("/") ? parsed.pathname : `${parsed.pathname}/`}.json`;
    }
    endpoint.searchParams.set("raw_json", "1");
    return endpoint.toString();
  } catch {
    return void 0;
  }
}
|
|
4090
|
+
/**
 * Rewrite a Reddit URL to its old.reddit.com equivalent.
 *
 * The result always uses https and drops the query string. For redd.it short
 * links the first path segment is treated as the post id and mapped to
 * `/comments/<id>/`.
 *
 * @param {string} url - Reddit URL.
 * @returns {string|undefined} The old.reddit.com URL, or undefined when the input
 *   cannot be parsed or a redd.it link carries no post id.
 */
function redditOldUrl(url) {
  try {
    const parsed = new URL(url);
    // Capture the original host before the hostname is overwritten below.
    const host = parsed.hostname.toLowerCase().replace(/^www\./, "");
    parsed.protocol = "https:";
    parsed.hostname = "old.reddit.com";
    parsed.search = "";
    if (host === "redd.it") {
      const postId = parsed.pathname.split("/").filter(Boolean)[0];
      if (!postId) {
        return void 0;
      }
      parsed.pathname = `/comments/${postId}/`;
    }
    return parsed.toString();
  } catch {
    return void 0;
  }
}
|
|
4109
|
+
/**
 * Parse a Reddit `.json` response body into a normalized post summary.
 *
 * The post record is located with `findRedditPostRecord`. The description is
 * the first non-empty value among `selftext`, `selftext_html` and
 * `url_overridden_by_dest`. `created_utc` is multiplied by 1000 before being
 * turned into an ISO timestamp.
 *
 * @param {string} source - Raw response body.
 * @returns {object|undefined} The summary (`title`, `description`, `author`,
 *   `createdAt`, `canonicalUrl`, `url`, `images`, `videos`, `subreddit`, `postId`),
 *   or undefined when no post is found or anything throws (invalid JSON included).
 */
function parseRedditJsonPayload(source) {
  try {
    const parsed = JSON.parse(source);
    const post = findRedditPostRecord(parsed);
    if (!post) {
      return void 0;
    }
    const createdUtc = numberFromUnknown2(post.created_utc);
    const permalink = stringFromUnknown4(post.permalink);
    const canonicalUrl = permalink ? `https://www.reddit.com${permalink.startsWith("/") ? permalink : `/${permalink}`}` : void 0;
    const images = redditImagesFromPost(post);
    const videos = redditVideosFromPost(post);
    const description = firstText2(
      stringFromUnknown4(post.selftext),
      stringFromUnknown4(post.selftext_html),
      stringFromUnknown4(post.url_overridden_by_dest)
    );
    return {
      title: stringFromUnknown4(post.title),
      description,
      author: stringFromUnknown4(post.author) ?? stringFromUnknown4(post.author_fullname),
      createdAt: createdUtc ? new Date(createdUtc * 1e3).toISOString() : void 0,
      canonicalUrl,
      url: stringFromUnknown4(post.url_overridden_by_dest) ?? stringFromUnknown4(post.url),
      images,
      videos,
      subreddit: stringFromUnknown4(post.subreddit_name_prefixed) ?? stringFromUnknown4(post.subreddit),
      postId: stringFromUnknown4(post.id)
    };
  } catch {
    // Deliberate best-effort: a malformed payload means "no post", not a failure.
    return void 0;
  }
}
|
|
4142
|
+
/**
 * Depth-first search of a parsed Reddit payload for the record describing a post.
 *
 * A record qualifies when it has a string `title` plus a string `id` or `name`.
 * For listing-shaped records (`data.children` array), the first child whose
 * `kind` is "t3" or whose `data.title` is a string is returned. Otherwise the
 * search recurses into at most the first 100 property values.
 *
 * @param {unknown} value - Parsed JSON value (array, record or primitive).
 * @returns {object|undefined} The post record, or undefined when none is found.
 */
function findRedditPostRecord(value) {
  if (Array.isArray(value)) {
    for (const item of value) {
      const found = findRedditPostRecord(item);
      if (found) {
        return found;
      }
    }
    return void 0;
  }
  if (!isRecord5(value)) {
    return void 0;
  }
  if (typeof value.title === "string" && (typeof value.id === "string" || typeof value.name === "string")) {
    return value;
  }
  const children = isRecord5(value.data) && Array.isArray(value.data.children) ? value.data.children : void 0;
  if (children) {
    for (const child of children) {
      if (isRecord5(child) && isRecord5(child.data) && (child.kind === "t3" || typeof child.data.title === "string")) {
        return child.data;
      }
    }
  }
  // Cap the fan-out per record so very wide objects are not scanned in full.
  for (const childValue of Object.values(value).slice(0, 100)) {
    const found = findRedditPostRecord(childValue);
    if (found) {
      return found;
    }
  }
  return void 0;
}
|
|
4174
|
+
/**
 * Collect image assets from a Reddit post record.
 *
 * For each entry of `post.preview.images`, the `source` and every `resolutions`
 * candidate with a URL becomes an asset with its width and height. The post
 * `thumbnail` is appended when it is an absolute http(s) URL.
 *
 * @param {object} post - Reddit post record.
 * @returns {object[]} Image assets tagged with the "redditJsonEndpoint" adapter.
 */
function redditImagesFromPost(post) {
  const images = [];
  const preview = isRecord5(post.preview) && Array.isArray(post.preview.images) ? post.preview.images : [];
  for (const image of preview) {
    if (!isRecord5(image)) {
      continue;
    }
    for (const candidate of [image.source, ...Array.isArray(image.resolutions) ? image.resolutions : []]) {
      if (!isRecord5(candidate)) {
        continue;
      }
      const url = redditMediaUrl(stringFromUnknown4(candidate.url));
      if (!url) {
        continue;
      }
      images.push({
        url,
        kind: "image",
        source: "adapter",
        width: numberFromUnknown2(candidate.width),
        height: numberFromUnknown2(candidate.height),
        metadata: {
          adapter: "redditJsonEndpoint",
          originalSource: "redditJsonEndpoint"
        }
      });
    }
  }
  const thumbnail = redditMediaUrl(stringFromUnknown4(post.thumbnail));
  // The scheme check rejects any thumbnail value that is not an absolute URL.
  if (thumbnail && /^https?:\/\//i.test(thumbnail)) {
    images.push({
      url: thumbnail,
      kind: "image",
      source: "adapter",
      metadata: {
        adapter: "redditJsonEndpoint",
        originalSource: "redditJsonEndpoint"
      }
    });
  }
  return images;
}
|
|
4216
|
+
/**
 * Collect video assets from a Reddit post record.
 *
 * Both `post.media` and `post.secure_media` are inspected for a `reddit_video`
 * record; the URL is the first available of `fallback_url`, `hls_url`, `dash_url`.
 * The two fields can describe the same video, so assets are de-duplicated by URL
 * with the first occurrence kept.
 *
 * @param {object} post - Reddit post record.
 * @returns {object[]} Video assets tagged with the "redditJsonEndpoint" adapter.
 */
function redditVideosFromPost(post) {
  const videos = [];
  const seenUrls = new Set();
  const media = [post.media, post.secure_media].filter(isRecord5);
  for (const item of media) {
    const redditVideo = isRecord5(item.reddit_video) ? item.reddit_video : void 0;
    const url = redditMediaUrl(stringFromUnknown4(redditVideo?.fallback_url) ?? stringFromUnknown4(redditVideo?.hls_url) ?? stringFromUnknown4(redditVideo?.dash_url));
    if (!url || seenUrls.has(url)) {
      continue;
    }
    seenUrls.add(url);
    videos.push({
      url,
      kind: "video",
      source: "adapter",
      width: numberFromUnknown2(redditVideo?.width),
      height: numberFromUnknown2(redditVideo?.height),
      metadata: {
        adapter: "redditJsonEndpoint",
        originalSource: "redditJsonEndpoint"
      }
    });
  }
  return videos;
}
|
|
4239
|
+
/**
 * Build a synthetic HTML page from a parsed Reddit JSON post.
 *
 * The generated document carries a title, Open Graph tags, a canonical link, a
 * JSON-LD SocialMediaPosting script and an `application/json` script with id
 * "metanova-reddit-json". The returned page copies `jsonPage` and overrides the
 * URL fields, `html`, `bytes` and `contentType`.
 *
 * @param {object} jsonPage - Page object returned by the JSON endpoint fetch.
 * @param {string} requestedUrl - URL originally requested by the caller.
 * @param {object} post - Summary produced by `parseRedditJsonPayload`.
 * @returns {object} Page object whose `html` is the synthesized document.
 */
function synthesizeRedditJsonPage(jsonPage, requestedUrl, post) {
  const finalUrl = post.canonicalUrl ?? requestedUrl;
  // Sort a copy, largest pixel area first, so the caller's array is left untouched;
  // the sorted order is still what gets embedded below.
  const imagesByArea = [...post.images].sort((left, right) => (right.width ?? 0) * (right.height ?? 0) - (left.width ?? 0) * (left.height ?? 0));
  const bestImage = imagesByArea[0];
  const video = post.videos[0];
  const structuredData = {
    "@context": "https://schema.org",
    "@type": "SocialMediaPosting",
    headline: post.title,
    description: post.description,
    author: post.author ? { "@type": "Person", name: post.author } : void 0,
    datePublished: post.createdAt,
    url: finalUrl,
    image: bestImage ? { "@type": "ImageObject", url: bestImage.url, width: bestImage.width, height: bestImage.height } : void 0,
    video: video ? { "@type": "VideoObject", contentUrl: video.url, width: video.width, height: video.height } : void 0
  };
  const embeddedPayload = {
    post: {
      postTitle: post.title,
      description: post.description,
      author: post.author ? { name: post.author } : void 0,
      createdAt: post.createdAt,
      canonicalUrl: finalUrl,
      previewImage: bestImage,
      media: {
        videoUrl: video?.url
      },
      images: imagesByArea,
      videos: post.videos,
      subreddit: post.subreddit,
      postId: post.postId
    }
  };
  const html = [
    "<!doctype html><html><head>",
    `<title>${escapeHtml(post.title ?? "Reddit post")}</title>`,
    post.title ? `<meta property="og:title" content="${escapeHtml(post.title)}">` : "",
    post.description ? `<meta property="og:description" content="${escapeHtml(post.description)}">` : "",
    `<meta property="og:site_name" content="Reddit">`,
    `<meta property="og:url" content="${escapeHtml(finalUrl)}">`,
    bestImage ? `<meta property="og:image" content="${escapeHtml(bestImage.url)}">` : "",
    bestImage?.width ? `<meta property="og:image:width" content="${bestImage.width}">` : "",
    bestImage?.height ? `<meta property="og:image:height" content="${bestImage.height}">` : "",
    `<link rel="canonical" href="${escapeHtml(finalUrl)}">`,
    `<script type="application/ld+json">${safeJson(structuredData)}</script>`,
    `<script type="application/json" id="metanova-reddit-json">${safeJson(embeddedPayload)}</script>`,
    "</head><body></body></html>"
  ].join("");
  return {
    ...jsonPage,
    url: requestedUrl,
    originalUrl: requestedUrl,
    finalUrl,
    html,
    bytes: new TextEncoder().encode(html),
    contentType: "text/html; charset=utf-8",
    statusCode: jsonPage.statusCode
  };
}
|
|
4297
|
+
/**
 * Heuristically decide whether a fetched page represents a blocked request.
 *
 * A page counts as blocked when its status is 403 or 429, or when its body
 * contains one of several block/verification phrases (case-insensitive).
 *
 * NOTE(review): the body check matches generic words such as "blocked" and
 * "forbidden" anywhere in the document, so ordinary content containing them is
 * also reported as blocked — confirm whether that is acceptable.
 *
 * @param {{ statusCode: number, html: string }} page - Fetched page.
 * @returns {boolean} True when the page looks blocked.
 */
function isRedditBlocked(page) {
  return page.statusCode === 403 || page.statusCode === 429 || /please wait for verification|whoa there, pardner|blocked|forbidden|too many requests|request has been blocked/i.test(page.html);
}
|
|
4300
|
+
/**
 * Summarize retry guidance from the attempts that were blocked or returned 403/429.
 *
 * @param {object[]} attempts - Attempt records produced by `attemptFetch`.
 * @returns {object|undefined} Undefined when no attempt was blocked; otherwise
 *   `retryable` (true if any blocked attempt was a 429 or carried a retry-after
 *   value), a human-readable `reason`, the first `retryAfter` value found, its
 *   millisecond equivalent, and the total number of attempts.
 */
function redditRetryInfo(attempts) {
  const blockedAttempts = attempts.filter((attempt) => attempt.blocked || attempt.statusCode === 429 || attempt.statusCode === 403);
  if (blockedAttempts.length === 0) {
    return void 0;
  }
  const retryAfter = blockedAttempts.map((attempt) => attempt.retryAfter).find((value) => Boolean(value));
  return {
    retryable: blockedAttempts.some((attempt) => attempt.statusCode === 429 || Boolean(attempt.retryAfter)),
    reason: blockedAttempts.map((attempt) => `${attempt.method}${attempt.statusCode ? ` returned ${attempt.statusCode}` : " failed"}`).join("; "),
    retryAfter,
    retryAfterMs: retryAfterToMs(retryAfter),
    // Counts every attempt, not only the blocked ones.
    attempts: attempts.length
  };
}
|
|
4314
|
+
/**
 * Convert a `Retry-After` header value into a delay in milliseconds.
 *
 * The header is either a non-negative integer number of seconds or a date. Only
 * an all-digit value is treated as seconds; anything else is parsed as a date
 * and converted to the time remaining from now, clamped at 0. Parsing leading
 * digits loosely would misread a date such as "2015-10-21T07:28:00Z" as 2015 seconds.
 *
 * @param {string|undefined|null} value - Raw header value.
 * @returns {number|undefined} Delay in milliseconds, or undefined when the value is
 *   missing or cannot be interpreted.
 */
function retryAfterToMs(value) {
  if (!value) {
    return void 0;
  }
  const text = String(value).trim();
  if (/^\d+$/.test(text)) {
    return Number.parseInt(text, 10) * 1e3;
  }
  const dateMs = Date.parse(text);
  return Number.isFinite(dateMs) ? Math.max(dateMs - Date.now(), 0) : void 0;
}
|
|
4325
|
+
/**
 * Merge two lists of fetch attempts for diagnostics.
 *
 * The `page` property is dropped from every attempt, and attempts are
 * de-duplicated on method, URL, status code and error, keeping the first.
 *
 * @param {object[]|undefined|null} existing - Attempts already recorded.
 * @param {object[]} incoming - Newly recorded attempts.
 * @returns {object[]|undefined} The merged list, or undefined when both are empty.
 */
function mergeFallbackAttempts2(existing, incoming) {
  const attempts = [...existing ?? [], ...incoming];
  if (attempts.length === 0) {
    return void 0;
  }
  const seen = /* @__PURE__ */ new Set();
  return attempts.map((value) => {
    // Leave the fetched page out of the diagnostics record.
    const { page: _page, ...attempt } = value;
    return attempt;
  }).filter((attempt) => {
    const key = `${attempt.method}:${attempt.url ?? ""}:${attempt.statusCode ?? ""}:${attempt.error ?? ""}`;
    if (seen.has(key)) {
      return false;
    }
    seen.add(key);
    return true;
  });
}
|
|
4343
|
+
/**
 * Remove falsy entries and duplicates from a list, preserving first-seen order.
 *
 * @param {Array<string|undefined|null>} values - Input values.
 * @returns {string[]} Unique truthy values.
 */
function uniqueStrings3(values) {
  return [...new Set(values.filter((value) => Boolean(value)))];
}
|
|
4346
|
+
/**
 * Decode HTML-escaped ampersands (`&amp;`) in a Reddit media URL.
 *
 * Replacing a bare "&" with "&" would leave the URL unchanged, so the pattern
 * targets the escaped form.
 *
 * @param {string|undefined|null} value - Possibly escaped URL.
 * @returns {string|undefined} The decoded URL, or undefined when `value` is nullish.
 */
function redditMediaUrl(value) {
  return value?.replace(/&amp;/g, "&");
}
|
|
4349
|
+
/**
 * Return the first argument that is non-empty after whitespace normalization
 * (runs of whitespace collapsed to one space, ends trimmed).
 *
 * @param {...(string|undefined|null)} values - Candidate strings, in priority order.
 * @returns {string|undefined} The first usable normalized string, or undefined.
 */
function firstText2(...values) {
  return values.map((value) => value?.replace(/\s+/g, " ").trim()).find((value) => Boolean(value));
}
|
|
4352
|
+
/**
 * Coerce an unknown value to a usable string.
 *
 * Non-blank strings are returned trimmed and finite numbers are stringified;
 * everything else yields undefined.
 *
 * @param {unknown} value - Value to coerce.
 * @returns {string|undefined} The string form, or undefined.
 */
function stringFromUnknown4(value) {
  if (typeof value === "string" && value.trim()) {
    return value.trim();
  }
  if (typeof value === "number" && Number.isFinite(value)) {
    return String(value);
  }
  return void 0;
}
|
|
4361
|
+
/**
 * Coerce an unknown value to a finite number.
 *
 * Finite numbers pass through, strings are parsed with `Number.parseFloat`
 * (so a numeric prefix such as "12px" yields 12), and everything else yields
 * undefined.
 *
 * @param {unknown} value - Value to coerce.
 * @returns {number|undefined} The number, or undefined.
 */
function numberFromUnknown2(value) {
  if (typeof value === "number" && Number.isFinite(value)) {
    return value;
  }
  if (typeof value !== "string") {
    return void 0;
  }
  const parsed = Number.parseFloat(value);
  return Number.isFinite(parsed) ? parsed : void 0;
}
|
|
4371
|
+
/**
 * Serialize a value to JSON for embedding inside an HTML `<script>` element.
 *
 * Undefined/null/empty-array members are removed first via `stripUndefinedDeep`,
 * and every "<" is emitted as the escape `\u003c` so the payload cannot contain a
 * literal `</script>` sequence.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} JSON text with "<" escaped.
 */
function safeJson(value) {
  return JSON.stringify(stripUndefinedDeep(value)).replace(/</g, "\\u003c");
}
|
|
4374
|
+
/**
 * Escape a string for use in HTML text or a double-quoted attribute value.
 *
 * `&` is replaced first so the entities introduced by the later replacements
 * are not escaped a second time.
 *
 * @param {string} value - Raw text.
 * @returns {string} Text with `&`, `"`, `<` and `>` replaced by their entities.
 */
function escapeHtml(value) {
  return value.replace(/&/g, "&amp;").replace(/"/g, "&quot;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
}
|
|
4377
|
+
/**
 * Recursively prune empty members from a value before serialization.
 *
 * Arrays: each element is processed and undefined results are dropped.
 * Records: each property is processed and dropped when the result is undefined,
 * null or an empty array. Any other value is returned unchanged.
 *
 * @param {unknown} value - Value to prune.
 * @returns {unknown} A pruned copy for arrays and records; the input otherwise.
 */
function stripUndefinedDeep(value) {
  if (Array.isArray(value)) {
    return value.map(stripUndefinedDeep).filter((item) => item !== void 0);
  }
  if (isRecord5(value)) {
    return Object.fromEntries(
      Object.entries(value).map(([key, item]) => [key, stripUndefinedDeep(item)]).filter(([, item]) => item !== void 0 && item !== null && (!Array.isArray(item) || item.length > 0))
    );
  }
  return value;
}
|
|
4388
|
+
/**
 * Report whether a value is a non-null, non-array object.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} True for plain-object-like values.
 */
function isRecord5(value) {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}
|
|
3467
4391
|
function createDirectMediaMetadata(page, requestedUrl, fetchDurationMs) {
|
|
3468
4392
|
const contentType = page.contentType?.toLowerCase() ?? "";
|
|
@@ -3519,7 +4443,15 @@ function createDirectMediaMetadata(page, requestedUrl, fetchDurationMs) {
|
|
|
3519
4443
|
sourcesUsed: ["direct"],
|
|
3520
4444
|
warnings: [],
|
|
3521
4445
|
trace,
|
|
4446
|
+
extractionMethod: `direct:${kind}`,
|
|
3522
4447
|
selectedImageReason: kind === "image" ? "Selected direct image URL because the response content type is an image." : void 0,
|
|
4448
|
+
confidenceBreakdown: {
|
|
4449
|
+
title: 0,
|
|
4450
|
+
description: 0,
|
|
4451
|
+
image: kind === "image" ? 100 : 0,
|
|
4452
|
+
structuredData: 0,
|
|
4453
|
+
adapter: 0
|
|
4454
|
+
},
|
|
3523
4455
|
fetchDurationMs,
|
|
3524
4456
|
extractedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
3525
4457
|
}
|
|
@@ -3634,6 +4566,7 @@ export {
|
|
|
3634
4566
|
behanceAdapter,
|
|
3635
4567
|
calculateCompleteness,
|
|
3636
4568
|
calculateConfidence,
|
|
4569
|
+
calculateConfidenceBreakdown,
|
|
3637
4570
|
calculateReliability,
|
|
3638
4571
|
createDiagnostics,
|
|
3639
4572
|
createPreviewCard,
|