metanova 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +49 -0
- package/README.md +4 -0
- package/dist/index.cjs +993 -59
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +32 -1
- package/dist/index.d.ts +32 -1
- package/dist/index.js +992 -59
- package/dist/index.js.map +1 -1
- package/package.json +2 -1
package/dist/index.cjs
CHANGED
|
@@ -41,6 +41,7 @@ __export(index_exports, {
|
|
|
41
41
|
behanceAdapter: () => behanceAdapter,
|
|
42
42
|
calculateCompleteness: () => calculateCompleteness,
|
|
43
43
|
calculateConfidence: () => calculateConfidence,
|
|
44
|
+
calculateConfidenceBreakdown: () => calculateConfidenceBreakdown,
|
|
44
45
|
calculateReliability: () => calculateReliability,
|
|
45
46
|
createDiagnostics: () => createDiagnostics,
|
|
46
47
|
createPreviewCard: () => createPreviewCard,
|
|
@@ -838,8 +839,24 @@ function isRecord2(value) {
|
|
|
838
839
|
}
|
|
839
840
|
|
|
840
841
|
// src/extractors/media.ts
|
|
841
|
-
var LAZY_IMAGE_ATTRIBUTES = [
|
|
842
|
-
|
|
842
|
+
var LAZY_IMAGE_ATTRIBUTES = [
|
|
843
|
+
"data-src",
|
|
844
|
+
"data-original",
|
|
845
|
+
"data-lazy-src",
|
|
846
|
+
"data-image",
|
|
847
|
+
"data-image-url",
|
|
848
|
+
"data-og-image",
|
|
849
|
+
"data-thumbnail",
|
|
850
|
+
"data-thumb",
|
|
851
|
+
"data-media",
|
|
852
|
+
"data-full-src",
|
|
853
|
+
"data-hi-res-src",
|
|
854
|
+
"data-zoom-src",
|
|
855
|
+
"data-poster",
|
|
856
|
+
"data-bg"
|
|
857
|
+
];
|
|
858
|
+
var LAZY_IMAGE_SRCSET_ATTRIBUTES = ["data-srcset", "data-lazy-srcset", "data-original-srcset"];
|
|
859
|
+
var LAZY_MEDIA_ATTRIBUTES = ["data-src", "data-original", "data-lazy-src", "data-video", "data-video-url", "data-media", "data-playback-url"];
|
|
843
860
|
function extractImages(html, baseUrl) {
|
|
844
861
|
const $ = loadDocument(html);
|
|
845
862
|
const images = [];
|
|
@@ -865,6 +882,15 @@ function extractImages(html, baseUrl) {
|
|
|
865
882
|
type,
|
|
866
883
|
metadata: { discoveredFrom: "link.preload" }
|
|
867
884
|
}, baseUrl);
|
|
885
|
+
for (const candidate of parseSrcset($(element).attr("imagesrcset"))) {
|
|
886
|
+
pushResolved(images, {
|
|
887
|
+
url: candidate,
|
|
888
|
+
kind: "image",
|
|
889
|
+
source: "html",
|
|
890
|
+
type,
|
|
891
|
+
metadata: { discoveredFrom: "link.imagesrcset" }
|
|
892
|
+
}, baseUrl);
|
|
893
|
+
}
|
|
868
894
|
}
|
|
869
895
|
});
|
|
870
896
|
collectDocumentImages($, images, baseUrl, "html");
|
|
@@ -989,7 +1015,8 @@ function collectDocumentImages($, images, baseUrl, source) {
|
|
|
989
1015
|
const candidates = [
|
|
990
1016
|
normalizeWhitespace($(element).attr("src")),
|
|
991
1017
|
...LAZY_IMAGE_ATTRIBUTES.map((attribute) => normalizeWhitespace($(element).attr(attribute))),
|
|
992
|
-
...parseSrcset($(element).attr("srcset"))
|
|
1018
|
+
...parseSrcset($(element).attr("srcset")),
|
|
1019
|
+
...LAZY_IMAGE_SRCSET_ATTRIBUTES.flatMap((attribute) => parseSrcset($(element).attr(attribute)))
|
|
993
1020
|
];
|
|
994
1021
|
for (const candidate of candidates) {
|
|
995
1022
|
pushResolved(images, {
|
|
@@ -1002,7 +1029,10 @@ function collectDocumentImages($, images, baseUrl, source) {
|
|
|
1002
1029
|
}
|
|
1003
1030
|
});
|
|
1004
1031
|
$("picture source[srcset], source[type^='image/'][srcset]").each((_, element) => {
|
|
1005
|
-
for (const candidate of
|
|
1032
|
+
for (const candidate of [
|
|
1033
|
+
...parseSrcset($(element).attr("srcset")),
|
|
1034
|
+
...LAZY_IMAGE_SRCSET_ATTRIBUTES.flatMap((attribute) => parseSrcset($(element).attr(attribute)))
|
|
1035
|
+
]) {
|
|
1006
1036
|
pushResolved(images, {
|
|
1007
1037
|
url: candidate,
|
|
1008
1038
|
kind: "image",
|
|
@@ -1294,11 +1324,11 @@ function uniqueStrings(values) {
|
|
|
1294
1324
|
|
|
1295
1325
|
// src/scorers/image.ts
|
|
1296
1326
|
var SOURCE_WEIGHT = {
|
|
1297
|
-
adapter:
|
|
1298
|
-
openGraph:
|
|
1327
|
+
adapter: 98,
|
|
1328
|
+
openGraph: 94,
|
|
1299
1329
|
oEmbed: 88,
|
|
1300
1330
|
jsonLd: 82,
|
|
1301
|
-
twitter:
|
|
1331
|
+
twitter: 86,
|
|
1302
1332
|
nextData: 76,
|
|
1303
1333
|
nuxt: 74,
|
|
1304
1334
|
initialState: 73,
|
|
@@ -1326,7 +1356,9 @@ function scoreImages(images, customScorers = []) {
|
|
|
1326
1356
|
scoreReasons: reasons
|
|
1327
1357
|
}
|
|
1328
1358
|
};
|
|
1329
|
-
}).sort(
|
|
1359
|
+
}).sort(
|
|
1360
|
+
(left, right) => (right.score ?? 0) - (left.score ?? 0) || sourceSortWeight(right) - sourceSortWeight(left) || imageArea(right) - imageArea(left)
|
|
1361
|
+
);
|
|
1330
1362
|
}
|
|
1331
1363
|
function selectBestImage(images, customScorers = []) {
|
|
1332
1364
|
const scored = scoreImages(images, customScorers);
|
|
@@ -1425,17 +1457,32 @@ function scoreFormat(image) {
|
|
|
1425
1457
|
}
|
|
1426
1458
|
function scoreUrlSignal(image) {
|
|
1427
1459
|
const url = image.url.toLowerCase();
|
|
1428
|
-
const matches = url.match(/cover|preview|thumbnail|thumb|og|card|media|hero|share|social/g) ?? [];
|
|
1429
|
-
|
|
1460
|
+
const matches = url.match(/cover|preview|thumbnail|thumb|og|card|media|hero|share|social|maxres|highres|large|original/g) ?? [];
|
|
1461
|
+
const platformScore = platformThumbnailScore(url);
|
|
1462
|
+
if (matches.length === 0 && platformScore.score === 0) {
|
|
1430
1463
|
return { score: 0, reasons: [] };
|
|
1431
1464
|
}
|
|
1432
1465
|
const uniqueMatches = [...new Set(matches)];
|
|
1433
|
-
const score = Math.min(uniqueMatches.length * 4,
|
|
1466
|
+
const score = Math.min(uniqueMatches.length * 4, 14) + platformScore.score;
|
|
1467
|
+
const reasons = uniqueMatches.length > 0 ? [`URL matched preview hints (${uniqueMatches.join(", ")}) and added ${Math.min(uniqueMatches.length * 4, 14)} points`] : [];
|
|
1468
|
+
reasons.push(...platformScore.reasons);
|
|
1434
1469
|
return {
|
|
1435
1470
|
score,
|
|
1436
|
-
reasons
|
|
1471
|
+
reasons
|
|
1437
1472
|
};
|
|
1438
1473
|
}
|
|
1474
|
+
function platformThumbnailScore(url) {
|
|
1475
|
+
if (/ytimg\.com\/vi\/[^/]+\/(?:maxresdefault|sddefault|hqdefault)/i.test(url)) {
|
|
1476
|
+
return { score: 12, reasons: ["YouTube platform thumbnail added 12 points"] };
|
|
1477
|
+
}
|
|
1478
|
+
if (/(?:i|preview|external-preview)\.redd\.it|v\.redd\.it/i.test(url)) {
|
|
1479
|
+
return { score: 10, reasons: ["Reddit media host added 10 points"] };
|
|
1480
|
+
}
|
|
1481
|
+
if (/pbs\.twimg\.com\/media|pinimg\.com|cdninstagram\.com|fbcdn\.net|tiktokcdn\.com|mir-s3-cdn-cf\.behance\.net/i.test(url)) {
|
|
1482
|
+
return { score: 8, reasons: ["social platform media host added 8 points"] };
|
|
1483
|
+
}
|
|
1484
|
+
return { score: 0, reasons: [] };
|
|
1485
|
+
}
|
|
1439
1486
|
function scoreUrlPenalty(image) {
|
|
1440
1487
|
const url = image.url.toLowerCase();
|
|
1441
1488
|
let penalty = 0;
|
|
@@ -1508,6 +1555,12 @@ function countDuplicates(images) {
|
|
|
1508
1555
|
}
|
|
1509
1556
|
return counts;
|
|
1510
1557
|
}
|
|
1558
|
+
function imageArea(image) {
|
|
1559
|
+
return (image.width ?? 0) * (image.height ?? 0);
|
|
1560
|
+
}
|
|
1561
|
+
function sourceSortWeight(image) {
|
|
1562
|
+
return SOURCE_WEIGHT[image.source] ?? 50;
|
|
1563
|
+
}
|
|
1511
1564
|
function mediaSignature(url) {
|
|
1512
1565
|
try {
|
|
1513
1566
|
const parsed = new URL(url);
|
|
@@ -1525,16 +1578,45 @@ var IMAGE_KEYS = [
|
|
|
1525
1578
|
"thumbnailUrl",
|
|
1526
1579
|
"thumbnail_url",
|
|
1527
1580
|
"thumbnailSrc",
|
|
1581
|
+
"thumbnail_src",
|
|
1528
1582
|
"previewImage",
|
|
1529
1583
|
"preview_image",
|
|
1584
|
+
"preview",
|
|
1530
1585
|
"ogImage",
|
|
1586
|
+
"og_image",
|
|
1531
1587
|
"cardImage",
|
|
1588
|
+
"displayUrl",
|
|
1589
|
+
"display_url",
|
|
1590
|
+
"mediaUrl",
|
|
1591
|
+
"media_url",
|
|
1592
|
+
"media_url_https",
|
|
1593
|
+
"fullPicture",
|
|
1594
|
+
"full_picture",
|
|
1532
1595
|
"cover",
|
|
1533
1596
|
"coverImage",
|
|
1597
|
+
"cover_image",
|
|
1598
|
+
"original",
|
|
1599
|
+
"source",
|
|
1534
1600
|
"poster",
|
|
1601
|
+
"posterImage",
|
|
1602
|
+
"media"
|
|
1603
|
+
];
|
|
1604
|
+
var VIDEO_KEYS = [
|
|
1605
|
+
"video",
|
|
1606
|
+
"videos",
|
|
1607
|
+
"videoUrl",
|
|
1608
|
+
"video_url",
|
|
1609
|
+
"contentUrl",
|
|
1610
|
+
"content_url",
|
|
1611
|
+
"embedUrl",
|
|
1612
|
+
"embed_url",
|
|
1613
|
+
"playbackUrl",
|
|
1614
|
+
"playback_url",
|
|
1615
|
+
"fallback_url",
|
|
1616
|
+
"hls_url",
|
|
1617
|
+
"dash_url",
|
|
1535
1618
|
"media"
|
|
1536
1619
|
];
|
|
1537
|
-
var VIDEO_KEYS = ["video", "videos", "videoUrl", "video_url", "contentUrl", "embedUrl", "playbackUrl"];
|
|
1538
1620
|
var AUDIO_KEYS = ["audio", "audios", "audioUrl", "audio_url", "podcastUrl"];
|
|
1539
1621
|
function discoverMedia(rawSources, finalUrl) {
|
|
1540
1622
|
const trace = [];
|
|
@@ -1668,25 +1750,87 @@ function mediaFromJsonValue(value, kind, source) {
|
|
|
1668
1750
|
return value.flatMap((item) => mediaFromJsonValue(item, kind, source));
|
|
1669
1751
|
}
|
|
1670
1752
|
if (isRecord3(value)) {
|
|
1671
|
-
const
|
|
1753
|
+
const srcset = stringFromUnknown(value.srcset) ?? stringFromUnknown(value.srcSet);
|
|
1754
|
+
const srcsetAssets = parseSrcset(srcset).flatMap((url2) => mediaFromJsonValue(url2, kind, source));
|
|
1755
|
+
const url = mediaUrlFromRecord(value, kind);
|
|
1756
|
+
const nestedDetails = nestedMediaDetailsRecord(value, kind);
|
|
1672
1757
|
if (!url || !looksLikeMediaUrl(url, kind)) {
|
|
1673
|
-
return
|
|
1758
|
+
return srcsetAssets;
|
|
1674
1759
|
}
|
|
1675
1760
|
return [
|
|
1676
1761
|
{
|
|
1677
1762
|
url,
|
|
1678
1763
|
kind,
|
|
1679
1764
|
source,
|
|
1680
|
-
width: parseNumber(stringFromUnknown(value.width)),
|
|
1681
|
-
height: parseNumber(stringFromUnknown(value.height)),
|
|
1682
|
-
alt: stringFromUnknown(value.alt) ?? stringFromUnknown(value.caption) ?? stringFromUnknown(value.name),
|
|
1683
|
-
title: stringFromUnknown(value.title),
|
|
1684
|
-
type: stringFromUnknown(value.type) ?? stringFromUnknown(value.mimeType) ?? stringFromUnknown(value.encodingFormat)
|
|
1685
|
-
}
|
|
1765
|
+
width: parseNumber(stringFromUnknown(value.width)) ?? parseNumber(stringFromUnknown(nestedDetails?.width)),
|
|
1766
|
+
height: parseNumber(stringFromUnknown(value.height)) ?? parseNumber(stringFromUnknown(nestedDetails?.height)),
|
|
1767
|
+
alt: stringFromUnknown(value.alt) ?? stringFromUnknown(value.caption) ?? stringFromUnknown(value.name) ?? stringFromUnknown(nestedDetails?.alt),
|
|
1768
|
+
title: stringFromUnknown(value.title) ?? stringFromUnknown(nestedDetails?.title),
|
|
1769
|
+
type: stringFromUnknown(value.type) ?? stringFromUnknown(value.mimeType) ?? stringFromUnknown(value.encodingFormat) ?? stringFromUnknown(nestedDetails?.type)
|
|
1770
|
+
},
|
|
1771
|
+
...srcsetAssets
|
|
1686
1772
|
];
|
|
1687
1773
|
}
|
|
1688
1774
|
return [];
|
|
1689
1775
|
}
|
|
1776
|
+
function nestedMediaDetailsRecord(value, kind) {
|
|
1777
|
+
const candidates = [
|
|
1778
|
+
value.source,
|
|
1779
|
+
value.original,
|
|
1780
|
+
value.image,
|
|
1781
|
+
value.thumbnail,
|
|
1782
|
+
value.thumbnailUrl,
|
|
1783
|
+
value.thumbnail_url,
|
|
1784
|
+
value.previewImage,
|
|
1785
|
+
value.preview_image,
|
|
1786
|
+
value.video,
|
|
1787
|
+
value.reddit_video
|
|
1788
|
+
];
|
|
1789
|
+
return candidates.find((candidate) => isRecord3(candidate) && Boolean(mediaUrlFromRecord(candidate, kind)));
|
|
1790
|
+
}
|
|
1791
|
+
function mediaUrlFromRecord(value, kind) {
|
|
1792
|
+
const commonCandidates = [
|
|
1793
|
+
value.url,
|
|
1794
|
+
value.src,
|
|
1795
|
+
value.secure_url,
|
|
1796
|
+
value.secureUrl,
|
|
1797
|
+
value.contentUrl,
|
|
1798
|
+
value.content_url,
|
|
1799
|
+
value.embedUrl,
|
|
1800
|
+
value.embed_url,
|
|
1801
|
+
value.thumbnailUrl,
|
|
1802
|
+
value.thumbnail_url,
|
|
1803
|
+
value.thumbnailSrc,
|
|
1804
|
+
value.thumbnail_src,
|
|
1805
|
+
value.mediaUrl,
|
|
1806
|
+
value.media_url,
|
|
1807
|
+
value.media_url_https,
|
|
1808
|
+
value.displayUrl,
|
|
1809
|
+
value.display_url,
|
|
1810
|
+
value.fullPicture,
|
|
1811
|
+
value.full_picture,
|
|
1812
|
+
value.previewImage,
|
|
1813
|
+
value.preview_image,
|
|
1814
|
+
value.poster,
|
|
1815
|
+
value.posterUrl,
|
|
1816
|
+
value.poster_url,
|
|
1817
|
+
value.coverImage,
|
|
1818
|
+
value.cover_image,
|
|
1819
|
+
value.original,
|
|
1820
|
+
value.source
|
|
1821
|
+
];
|
|
1822
|
+
const videoCandidates = [
|
|
1823
|
+
value.videoUrl,
|
|
1824
|
+
value.video_url,
|
|
1825
|
+
value.playbackUrl,
|
|
1826
|
+
value.playback_url,
|
|
1827
|
+
value.fallback_url,
|
|
1828
|
+
value.hls_url,
|
|
1829
|
+
value.dash_url
|
|
1830
|
+
];
|
|
1831
|
+
const candidates = kind === "video" ? [...videoCandidates, ...commonCandidates] : commonCandidates;
|
|
1832
|
+
return candidates.map(stringFromUnknown).find((candidate) => candidate && looksLikeMediaUrl(candidate, kind));
|
|
1833
|
+
}
|
|
1690
1834
|
function assetFromEmbedded(value, kind, item, parent) {
|
|
1691
1835
|
return {
|
|
1692
1836
|
url: value,
|
|
@@ -1766,7 +1910,7 @@ function sourceRank(source) {
|
|
|
1766
1910
|
}
|
|
1767
1911
|
function shouldIgnoreMediaUrl2(url) {
|
|
1768
1912
|
const normalized = url.toLowerCase();
|
|
1769
|
-
return normalized.startsWith("data:") || normalized.startsWith("blob:") || normalized.startsWith("javascript:") || /(?:sprite|spacer|blank|transparent|placeholder|tracking|beacon|pixel|emoji)(?:[._/-]|$|\?)/i.test(normalized) || /(?:^|[/?_-])1x1(?:[._/-]|$|\?)/i.test(normalized);
|
|
1913
|
+
return normalized.startsWith("data:") || normalized.startsWith("blob:") || normalized.startsWith("javascript:") || /(?:sprite|spacer|blank|transparent|placeholder|tracking|beacon|pixel|emoji|favicon|apple-touch-icon)(?:[._/-]|$|\?)/i.test(normalized) || /(?:^|[/?_-])1x1(?:[._/-]|$|\?)/i.test(normalized);
|
|
1770
1914
|
}
|
|
1771
1915
|
function looksLikeMediaUrl(value, kind) {
|
|
1772
1916
|
if (shouldIgnoreMediaUrl2(value)) {
|
|
@@ -1774,10 +1918,10 @@ function looksLikeMediaUrl(value, kind) {
|
|
|
1774
1918
|
}
|
|
1775
1919
|
if (/^https?:\/\//i.test(value) || value.startsWith("/") || value.startsWith("./") || value.startsWith("../")) {
|
|
1776
1920
|
if (kind === "image") {
|
|
1777
|
-
return /\.(?:avif|webp|png|jpe?g|gif)(?:[?#].*)?$/i.test(value) || /(?:image|thumb|thumbnail|cover|poster|preview|media|og|card|photo)/i.test(value);
|
|
1921
|
+
return /\.(?:avif|webp|png|jpe?g|gif)(?:[?#].*)?$/i.test(value) || /(?:image|thumb|thumbnail|cover|poster|preview|media|og|card|photo|format=(?:jpg|jpeg|png|webp))/i.test(value) || /(?:ytimg\.com|i\.redd\.it|preview\.redd\.it|external-preview\.redd\.it|pbs\.twimg\.com|pinimg\.com|cdninstagram\.com|fbcdn\.net|tiktokcdn\.com|behance\.net)/i.test(value);
|
|
1778
1922
|
}
|
|
1779
1923
|
if (kind === "video") {
|
|
1780
|
-
return /\.(?:mp4|webm|m3u8|mov)(?:[?#].*)?$/i.test(value) || /(?:video|embed|player|watch|reel|shorts)/i.test(value);
|
|
1924
|
+
return /\.(?:mp4|webm|m3u8|mov)(?:[?#].*)?$/i.test(value) || /(?:video|embed|player|watch|reel|shorts|v\.redd\.it)/i.test(value);
|
|
1781
1925
|
}
|
|
1782
1926
|
if (kind === "audio") {
|
|
1783
1927
|
return /\.(?:mp3|m4a|wav|ogg|aac)(?:[?#].*)?$/i.test(value) || /(?:audio|podcast)/i.test(value);
|
|
@@ -1852,6 +1996,20 @@ function calculateConfidence(input) {
|
|
|
1852
1996
|
score -= Math.min(input.warnings.length * 3, 18);
|
|
1853
1997
|
return Math.round(clamp2(score, 0, 100));
|
|
1854
1998
|
}
|
|
1999
|
+
function calculateConfidenceBreakdown(input) {
|
|
2000
|
+
const title = qualityPoints(input.title, 100, 6, 120);
|
|
2001
|
+
const description = qualityPoints(input.description, 100, 24, 300);
|
|
2002
|
+
const image = input.bestImage ? clamp2(58 + Math.min(input.bestImage.score ?? 0, 100) * 0.27 + sourceConfidenceBonus(input.bestImage.source), 0, 100) : 0;
|
|
2003
|
+
const structuredData = input.hasStructuredData ? 100 : input.rawSources.embeddedData.items.length > 0 ? 55 : 0;
|
|
2004
|
+
const adapter = adapterSucceeded(input.rawSources.adapters) ? adapterConfidence(input.rawSources.adapters[0]) : 0;
|
|
2005
|
+
return {
|
|
2006
|
+
title: Math.round(title),
|
|
2007
|
+
description: Math.round(description),
|
|
2008
|
+
image: Math.round(image),
|
|
2009
|
+
structuredData: Math.round(structuredData),
|
|
2010
|
+
adapter: Math.round(adapter)
|
|
2011
|
+
};
|
|
2012
|
+
}
|
|
1855
2013
|
function calculateCompleteness(input) {
|
|
1856
2014
|
const weights = [
|
|
1857
2015
|
input.title ? 20 : 0,
|
|
@@ -1921,6 +2079,25 @@ function sourceConfidenceBonus(source) {
|
|
|
1921
2079
|
function adapterSucceeded(adapters) {
|
|
1922
2080
|
return adapters.some((adapter) => Boolean(adapter.title || adapter.description || adapter.images?.length || adapter.videos?.length));
|
|
1923
2081
|
}
|
|
2082
|
+
function adapterConfidence(adapter) {
|
|
2083
|
+
if (!adapter) {
|
|
2084
|
+
return 0;
|
|
2085
|
+
}
|
|
2086
|
+
let score = 45;
|
|
2087
|
+
if (adapter.title) {
|
|
2088
|
+
score += 22;
|
|
2089
|
+
}
|
|
2090
|
+
if (adapter.description) {
|
|
2091
|
+
score += 14;
|
|
2092
|
+
}
|
|
2093
|
+
if ((adapter.images?.length ?? 0) > 0 || (adapter.videos?.length ?? 0) > 0) {
|
|
2094
|
+
score += 14;
|
|
2095
|
+
}
|
|
2096
|
+
if (adapter.author) {
|
|
2097
|
+
score += 6;
|
|
2098
|
+
}
|
|
2099
|
+
return clamp2(score, 0, 100);
|
|
2100
|
+
}
|
|
1924
2101
|
function clamp2(value, min, max) {
|
|
1925
2102
|
return Math.max(min, Math.min(max, value));
|
|
1926
2103
|
}
|
|
@@ -1984,6 +2161,7 @@ function normalizeMetadata(rawSources, context = {}) {
|
|
|
1984
2161
|
const type = inferType(rawSources, externalResults, jsonLdNodes, article, product, app, playlist, videos, audio);
|
|
1985
2162
|
const author = firstResultValue(externalResults, (result) => result.author) ?? firstEntity(article?.authors) ?? entityFromEmbedded(embeddedNodes, ["author", "creator", "owner", "user"]);
|
|
1986
2163
|
const publisher = article?.publisher ?? firstResultValue(externalResults, (result) => result.publisher) ?? entityFromJsonLd(organizationNode) ?? entityFromEmbedded(embeddedNodes, ["publisher", "provider", "organization"]);
|
|
2164
|
+
const publishDate = firstDefined(article?.publishedTime, video?.publishedTime);
|
|
1987
2165
|
const sourcesUsed = detectSourcesUsed(rawSources);
|
|
1988
2166
|
const warnings = diagnosticsWarnings(rawSources, externalResults, context.diagnostics);
|
|
1989
2167
|
const fieldSources = {
|
|
@@ -1992,7 +2170,7 @@ function normalizeMetadata(rawSources, context = {}) {
|
|
|
1992
2170
|
author: fieldSource(rawSources, externalResults, embeddedNodes, "author", selectedImage.best),
|
|
1993
2171
|
image: fieldSource(rawSources, externalResults, embeddedNodes, "image", selectedImage.best)
|
|
1994
2172
|
};
|
|
1995
|
-
const
|
|
2173
|
+
const confidenceInput = {
|
|
1996
2174
|
title,
|
|
1997
2175
|
description,
|
|
1998
2176
|
bestImage: selectedImage.best,
|
|
@@ -2001,7 +2179,9 @@ function normalizeMetadata(rawSources, context = {}) {
|
|
|
2001
2179
|
rawSources,
|
|
2002
2180
|
sourcesUsed,
|
|
2003
2181
|
warnings
|
|
2004
|
-
}
|
|
2182
|
+
};
|
|
2183
|
+
const confidence = calculateConfidence(confidenceInput);
|
|
2184
|
+
const confidenceBreakdown = calculateConfidenceBreakdown(confidenceInput);
|
|
2005
2185
|
const completeness = calculateCompleteness({
|
|
2006
2186
|
title,
|
|
2007
2187
|
description,
|
|
@@ -2011,7 +2191,7 @@ function normalizeMetadata(rawSources, context = {}) {
|
|
|
2011
2191
|
author,
|
|
2012
2192
|
publisher,
|
|
2013
2193
|
type,
|
|
2014
|
-
publishedTime:
|
|
2194
|
+
publishedTime: publishDate,
|
|
2015
2195
|
mediaCount: images.length + videos.length + audio.length
|
|
2016
2196
|
});
|
|
2017
2197
|
const reliability = calculateReliability({
|
|
@@ -2030,7 +2210,19 @@ function normalizeMetadata(rawSources, context = {}) {
|
|
|
2030
2210
|
};
|
|
2031
2211
|
diagnostics.sourcesUsed = uniqueStrings2([...diagnostics.sourcesUsed, ...sourcesUsed]);
|
|
2032
2212
|
diagnostics.warnings = uniqueStrings2([...diagnostics.warnings, ...rawSources.jsonLd.warnings, ...externalResults.flatMap((result) => result.warnings ?? [])]);
|
|
2213
|
+
diagnostics.adapterUsed = diagnostics.adapterUsed ?? rawSources.adapters[0]?.source;
|
|
2214
|
+
diagnostics.extractionMethod = diagnostics.extractionMethod ?? adapterRawString(rawSources.adapters[0], "extractionMethod") ?? fieldSources.title;
|
|
2215
|
+
diagnostics.sourcePriority = uniqueStrings2([
|
|
2216
|
+
...diagnostics.sourcePriority ?? [],
|
|
2217
|
+
...arrayOfStrings(rawSources.adapters[0]?.raw?.sourcePriority) ?? []
|
|
2218
|
+
]);
|
|
2219
|
+
diagnostics.fallbacksAttempted = mergeFallbackAttempts(
|
|
2220
|
+
diagnostics.fallbacksAttempted,
|
|
2221
|
+
fallbackAttemptsFromUnknown(rawSources.adapters[0]?.raw?.fallbacksAttempted)
|
|
2222
|
+
);
|
|
2223
|
+
diagnostics.retryInfo = diagnostics.retryInfo ?? retryInfoFromUnknown(rawSources.adapters[0]?.raw?.retryInfo);
|
|
2033
2224
|
diagnostics.selectedImageReason = selectedImage.reason;
|
|
2225
|
+
diagnostics.confidenceBreakdown = confidenceBreakdown;
|
|
2034
2226
|
diagnostics.originalUrl = diagnostics.originalUrl ?? url;
|
|
2035
2227
|
diagnostics.finalUrl = diagnostics.finalUrl ?? finalUrl;
|
|
2036
2228
|
diagnostics.canonicalUrl = canonicalUrl;
|
|
@@ -2047,6 +2239,7 @@ function normalizeMetadata(rawSources, context = {}) {
|
|
|
2047
2239
|
type,
|
|
2048
2240
|
title,
|
|
2049
2241
|
description,
|
|
2242
|
+
publishDate,
|
|
2050
2243
|
siteName,
|
|
2051
2244
|
canonicalUrl,
|
|
2052
2245
|
confidence,
|
|
@@ -2336,6 +2529,61 @@ function adapterDiagnostics(adapters) {
|
|
|
2336
2529
|
confidence: Math.min(confidence, 100)
|
|
2337
2530
|
};
|
|
2338
2531
|
}
|
|
2532
|
+
function adapterRawString(adapter, key) {
|
|
2533
|
+
const value = adapter?.raw?.[key];
|
|
2534
|
+
return typeof value === "string" && value.trim() ? value.trim() : void 0;
|
|
2535
|
+
}
|
|
2536
|
+
function fallbackAttemptsFromUnknown(value) {
|
|
2537
|
+
if (!Array.isArray(value)) {
|
|
2538
|
+
return void 0;
|
|
2539
|
+
}
|
|
2540
|
+
const attempts = value.flatMap((item) => {
|
|
2541
|
+
if (!isJsonLdNode(item) || typeof item.method !== "string") {
|
|
2542
|
+
return [];
|
|
2543
|
+
}
|
|
2544
|
+
return [{
|
|
2545
|
+
method: item.method,
|
|
2546
|
+
url: typeof item.url === "string" ? item.url : void 0,
|
|
2547
|
+
ok: typeof item.ok === "boolean" ? item.ok : false,
|
|
2548
|
+
statusCode: typeof item.statusCode === "number" ? item.statusCode : void 0,
|
|
2549
|
+
blocked: typeof item.blocked === "boolean" ? item.blocked : void 0,
|
|
2550
|
+
error: typeof item.error === "string" ? item.error : void 0,
|
|
2551
|
+
retryAfter: typeof item.retryAfter === "string" ? item.retryAfter : void 0
|
|
2552
|
+
}];
|
|
2553
|
+
});
|
|
2554
|
+
return attempts.length > 0 ? attempts : void 0;
|
|
2555
|
+
}
|
|
2556
|
+
function mergeFallbackAttempts(existing, incoming) {
|
|
2557
|
+
const attempts = [...existing ?? [], ...incoming ?? []];
|
|
2558
|
+
if (attempts.length === 0) {
|
|
2559
|
+
return void 0;
|
|
2560
|
+
}
|
|
2561
|
+
const seen = /* @__PURE__ */ new Set();
|
|
2562
|
+
return attempts.filter((attempt) => {
|
|
2563
|
+
const key = `${attempt.method}:${attempt.url ?? ""}:${attempt.statusCode ?? ""}:${attempt.error ?? ""}`;
|
|
2564
|
+
if (seen.has(key)) {
|
|
2565
|
+
return false;
|
|
2566
|
+
}
|
|
2567
|
+
seen.add(key);
|
|
2568
|
+
return true;
|
|
2569
|
+
});
|
|
2570
|
+
}
|
|
2571
|
+
function retryInfoFromUnknown(value) {
|
|
2572
|
+
if (!isJsonLdNode(value)) {
|
|
2573
|
+
return void 0;
|
|
2574
|
+
}
|
|
2575
|
+
const retryable = typeof value.retryable === "boolean" ? value.retryable : void 0;
|
|
2576
|
+
if (retryable === void 0) {
|
|
2577
|
+
return void 0;
|
|
2578
|
+
}
|
|
2579
|
+
return {
|
|
2580
|
+
retryable,
|
|
2581
|
+
reason: typeof value.reason === "string" ? value.reason : void 0,
|
|
2582
|
+
retryAfter: typeof value.retryAfter === "string" ? value.retryAfter : void 0,
|
|
2583
|
+
retryAfterMs: typeof value.retryAfterMs === "number" ? value.retryAfterMs : void 0,
|
|
2584
|
+
attempts: typeof value.attempts === "number" ? value.attempts : void 0
|
|
2585
|
+
};
|
|
2586
|
+
}
|
|
2339
2587
|
function fieldSource(rawSources, externalResults, embeddedNodes, field, bestImage) {
|
|
2340
2588
|
if (field === "image") {
|
|
2341
2589
|
return bestImage ? sourceLabel2(bestImage) : void 0;
|
|
@@ -2584,23 +2832,26 @@ var youtubeAdapter = {
|
|
|
2584
2832
|
const videoId = getYouTubeVideoId(url);
|
|
2585
2833
|
const playlistId = getYouTubePlaylistId(url);
|
|
2586
2834
|
const communityPostId = getYouTubeCommunityPostId(url);
|
|
2835
|
+
const titleSelection = youtubeTitleFromContext(context, { videoId, playlistId, communityPostId });
|
|
2836
|
+
const descriptionSelection = youtubeDescriptionFromContext(context);
|
|
2587
2837
|
const channel = entityFromContext(context, ["author", "ownerChannelName", "channel", "owner"]);
|
|
2588
2838
|
const playlistVideos = playlistId ? extractPlaylistVideos(context) : [];
|
|
2839
|
+
const sourcePriority = youtubeSourcePriority();
|
|
2589
2840
|
return compactAdapterResult({
|
|
2590
2841
|
source: "youtubeAdapter",
|
|
2591
2842
|
platform: "YouTube",
|
|
2592
2843
|
type: playlistId ? "playlist" : communityPostId ? "social_post" : "video",
|
|
2593
2844
|
siteName: "YouTube",
|
|
2594
2845
|
canonicalUrl: videoId ? `https://www.youtube.com/watch?v=${videoId}` : context.raw.openGraph.url,
|
|
2595
|
-
title:
|
|
2596
|
-
description:
|
|
2846
|
+
title: titleSelection.value,
|
|
2847
|
+
description: descriptionSelection.value,
|
|
2597
2848
|
videos: markAdapterMedia(mediaFromContext(context).videos, "youtubeAdapter"),
|
|
2598
2849
|
images: markAdapterMedia(mediaFromContext(context).images, "youtubeAdapter"),
|
|
2599
2850
|
author: channel,
|
|
2600
2851
|
article: { publishedTime: publishedTimeFromContext(context) },
|
|
2601
2852
|
video: videoId ? {
|
|
2602
2853
|
id: videoId,
|
|
2603
|
-
title:
|
|
2854
|
+
title: titleSelection.value,
|
|
2604
2855
|
channel,
|
|
2605
2856
|
publishedTime: publishedTimeFromContext(context),
|
|
2606
2857
|
duration: findEmbeddedString(context, ["duration", "lengthSeconds", "approxDurationMs"]),
|
|
@@ -2610,11 +2861,15 @@ var youtubeAdapter = {
|
|
|
2610
2861
|
} : void 0,
|
|
2611
2862
|
playlist: playlistId ? {
|
|
2612
2863
|
id: playlistId,
|
|
2613
|
-
title:
|
|
2864
|
+
title: youtubePlaylistTitleFromContext(context) ?? context.raw.openGraph.title,
|
|
2614
2865
|
channel,
|
|
2615
2866
|
videos: playlistVideos
|
|
2616
2867
|
} : void 0,
|
|
2617
|
-
identifiers: { videoId, playlistId, communityPostId }
|
|
2868
|
+
identifiers: { videoId, playlistId, communityPostId },
|
|
2869
|
+
raw: {
|
|
2870
|
+
sourcePriority,
|
|
2871
|
+
extractionMethod: titleSelection.method ?? descriptionSelection.method ?? "youtube:htmlFallback"
|
|
2872
|
+
}
|
|
2618
2873
|
});
|
|
2619
2874
|
},
|
|
2620
2875
|
normalize(rawData) {
|
|
@@ -2633,20 +2888,27 @@ var redditAdapter = {
|
|
|
2633
2888
|
const url = new URL(context.finalUrl);
|
|
2634
2889
|
const reddit = parseRedditUrl(url);
|
|
2635
2890
|
const username = typeof reddit.username === "string" ? reddit.username : void 0;
|
|
2891
|
+
const titleSelection = redditTitleFromContext(context);
|
|
2892
|
+
const descriptionSelection = redditDescriptionFromContext(context);
|
|
2893
|
+
const sourcePriority = redditSourcePriority();
|
|
2636
2894
|
return compactAdapterResult({
|
|
2637
2895
|
source: "redditAdapter",
|
|
2638
2896
|
platform: "Reddit",
|
|
2639
2897
|
type: reddit.isPost ? "social_post" : "website",
|
|
2640
2898
|
siteName: "Reddit",
|
|
2641
|
-
canonicalUrl: context.raw.openGraph.url,
|
|
2642
|
-
title: cleanSocialTitle(
|
|
2643
|
-
description:
|
|
2899
|
+
canonicalUrl: context.raw.openGraph.url ?? context.raw.html.canonicalUrl,
|
|
2900
|
+
title: cleanSocialTitle(titleSelection.value),
|
|
2901
|
+
description: descriptionSelection.value,
|
|
2644
2902
|
images: markAdapterMedia(mediaFromContext(context).images, "redditAdapter"),
|
|
2645
2903
|
videos: markAdapterMedia(mediaFromContext(context).videos, "redditAdapter"),
|
|
2646
2904
|
author: username ? { name: username } : entityFromContext(context, ["author", "submitter", "user"]),
|
|
2647
2905
|
article: { publishedTime: publishedTimeFromContext(context) },
|
|
2648
2906
|
identifiers: { subreddit: reddit.subreddit, postId: reddit.postId, username: reddit.username },
|
|
2649
|
-
raw: {
|
|
2907
|
+
raw: {
|
|
2908
|
+
...reddit,
|
|
2909
|
+
sourcePriority,
|
|
2910
|
+
extractionMethod: titleSelection.method ?? descriptionSelection.method ?? "reddit:htmlFallback"
|
|
2911
|
+
}
|
|
2650
2912
|
});
|
|
2651
2913
|
},
|
|
2652
2914
|
normalize(rawData) {
|
|
@@ -2747,6 +3009,7 @@ var facebookAdapter = {
|
|
|
2747
3009
|
platform: "Facebook",
|
|
2748
3010
|
type: isPhoto ? "image" : isPost || media.images.length > 0 || media.videos.length > 0 ? "social_post" : "website",
|
|
2749
3011
|
siteName: "Facebook",
|
|
3012
|
+
canonicalUrl: context.raw.openGraph.url,
|
|
2750
3013
|
title: titleFromContext(context, ["title", "headline", "name"]),
|
|
2751
3014
|
description: descriptionFromContext(context),
|
|
2752
3015
|
images: markAdapterMedia(media.images, "facebookAdapter"),
|
|
@@ -2830,6 +3093,116 @@ var defaultAdapters = [
|
|
|
2830
3093
|
twitterAdapter,
|
|
2831
3094
|
instagramAdapter
|
|
2832
3095
|
];
|
|
3096
|
+
function youtubeSourcePriority() {
|
|
3097
|
+
return [
|
|
3098
|
+
"structuredData:VideoObject",
|
|
3099
|
+
"embeddedData:ytInitialPlayerResponse",
|
|
3100
|
+
"embeddedData:ytInitialData",
|
|
3101
|
+
"openGraph",
|
|
3102
|
+
"twitter",
|
|
3103
|
+
"html"
|
|
3104
|
+
];
|
|
3105
|
+
}
|
|
3106
|
+
function youtubeTitleFromContext(context, ids) {
|
|
3107
|
+
const videoObjectTitle = jsonLdVideoObjectString(context, ["name", "headline"]);
|
|
3108
|
+
if (videoObjectTitle) {
|
|
3109
|
+
return { value: videoObjectTitle, method: "youtube:structuredData.VideoObject" };
|
|
3110
|
+
}
|
|
3111
|
+
const playerTitle = youtubePlayerString(context, ["videoDetails.title", "microformat.playerMicroformatRenderer.title"]);
|
|
3112
|
+
if (playerTitle) {
|
|
3113
|
+
return { value: playerTitle, method: "youtube:ytInitialPlayerResponse" };
|
|
3114
|
+
}
|
|
3115
|
+
const initialDataTitle = youtubeInitialDataTitle(context, ids);
|
|
3116
|
+
if (initialDataTitle) {
|
|
3117
|
+
return { value: initialDataTitle, method: "youtube:ytInitialData" };
|
|
3118
|
+
}
|
|
3119
|
+
if (context.raw.openGraph.title) {
|
|
3120
|
+
return { value: context.raw.openGraph.title, method: "youtube:openGraph" };
|
|
3121
|
+
}
|
|
3122
|
+
if (context.raw.twitter.title) {
|
|
3123
|
+
return { value: context.raw.twitter.title, method: "youtube:twitter" };
|
|
3124
|
+
}
|
|
3125
|
+
return { value: cleanYouTubeHtmlTitle(context.raw.html.title), method: context.raw.html.title ? "youtube:html" : void 0 };
|
|
3126
|
+
}
|
|
3127
|
+
/**
 * Picks a description for a YouTube page, trying sources in a fixed order
 * and reporting which one supplied the value.
 *
 * Order: JSON-LD VideoObject, player response, initial data, Open Graph,
 * Twitter card, then the HTML description.
 *
 * @param context Extraction context; `context.raw` is read.
 * @returns `{ value, method }`; `method` is undefined when nothing was found.
 */
function youtubeDescriptionFromContext(context) {
  // Thunks keep the lookups lazy: a later source is read only if the earlier ones were empty.
  const readers = [
    ["youtube:structuredData.VideoObject", () => jsonLdVideoObjectString(context, ["description"])],
    [
      "youtube:ytInitialPlayerResponse",
      () => youtubePlayerString(context, [
        "videoDetails.shortDescription",
        "microformat.playerMicroformatRenderer.description",
        "microformat.playerMicroformatRenderer.shortDescription"
      ])
    ],
    ["youtube:ytInitialData", () => youtubeInitialDataDescription(context)],
    ["youtube:openGraph", () => context.raw.openGraph.description],
    ["youtube:twitter", () => context.raw.twitter.description]
  ];
  for (const [method, read] of readers) {
    const value = read();
    if (value) {
      return { value, method };
    }
  }
  const htmlDescription = context.raw.html.description;
  return { value: htmlDescription, method: htmlDescription ? "youtube:html" : void 0 };
}
|
|
3152
|
+
/**
 * Returns the ordered list of source names reported for Reddit extraction.
 * A new array is built on every call, so callers may mutate the result.
 *
 * @returns {string[]} Source names, highest priority first.
 */
function redditSourcePriority() {
  const fetchStrategies = ["redditJsonEndpoint", "oldReddit"];
  const pageSources = ["embeddedStructuredData", "openGraph", "twitter", "html"];
  return [...fetchStrategies, ...pageSources];
}
|
|
3162
|
+
/**
 * Picks a title for a Reddit page and reports which source supplied it.
 *
 * Embedded JSON is consulted first; its method label depends on whether the
 * synthesized "metanova-reddit-json" payload is present. After that come
 * JSON-LD posting/article nodes, Open Graph, Twitter card, and the HTML title.
 *
 * @param context Extraction context; `context.raw` is read.
 * @returns `{ value, method }`; `method` is undefined when nothing was found.
 */
function redditTitleFromContext(context) {
  const embeddedTitle = findEmbeddedStringBySources(
    context,
    ["applicationJson", "jsonScript", "initialState", "preloadedState", "nextData"],
    ["postTitle", "title", "headline"]
  );
  if (embeddedTitle) {
    const method = hasRedditJsonEndpointPayload(context) ? "reddit:jsonEndpoint" : "reddit:embeddedStructuredData";
    return { value: embeddedTitle, method };
  }
  // Remaining sources are read lazily, in priority order.
  const fallbacks = [
    ["reddit:structuredData", () => jsonLdStringByType(context.raw.jsonLd.nodes, ["SocialMediaPosting", "DiscussionForumPosting", "Article"], ["headline", "name"])],
    ["reddit:openGraph", () => context.raw.openGraph.title],
    ["reddit:twitter", () => context.raw.twitter.title]
  ];
  for (const [method, read] of fallbacks) {
    const value = read();
    if (value) {
      return { value, method };
    }
  }
  const htmlTitle = context.raw.html.title;
  return { value: htmlTitle, method: htmlTitle ? "reddit:html" : void 0 };
}
|
|
3183
|
+
/**
 * Picks a description for a Reddit page and reports which source supplied it.
 *
 * Embedded JSON is consulted first; its method label depends on whether the
 * synthesized "metanova-reddit-json" payload is present. After that come
 * JSON-LD posting/article nodes, Open Graph, Twitter card, and the HTML
 * description.
 *
 * @param context Extraction context; `context.raw` is read.
 * @returns `{ value, method }`; `method` is undefined when nothing was found.
 */
function redditDescriptionFromContext(context) {
  const embeddedText = findEmbeddedStringBySources(
    context,
    ["applicationJson", "jsonScript", "initialState", "preloadedState", "nextData"],
    ["description", "selftext", "excerpt", "summary", "body"]
  );
  if (embeddedText) {
    const method = hasRedditJsonEndpointPayload(context) ? "reddit:jsonEndpoint" : "reddit:embeddedStructuredData";
    return { value: embeddedText, method };
  }
  // Remaining sources are read lazily, in priority order.
  const fallbacks = [
    ["reddit:structuredData", () => jsonLdStringByType(context.raw.jsonLd.nodes, ["SocialMediaPosting", "DiscussionForumPosting", "Article"], ["description", "articleBody"])],
    ["reddit:openGraph", () => context.raw.openGraph.description],
    ["reddit:twitter", () => context.raw.twitter.description]
  ];
  for (const [method, read] of fallbacks) {
    const value = read();
    if (value) {
      return { value, method };
    }
  }
  const htmlDescription = context.raw.html.description;
  return { value: htmlDescription, method: htmlDescription ? "reddit:html" : void 0 };
}
|
|
2833
3206
|
function socialVideoResult(source, platform, context) {
|
|
2834
3207
|
const url = new URL(context.finalUrl);
|
|
2835
3208
|
const username = url.pathname.match(/@([^/]+)/)?.[1];
|
|
@@ -2888,6 +3261,143 @@ function markAdapterMedia(assets, adapterName) {
|
|
|
2888
3261
|
}
|
|
2889
3262
|
}));
|
|
2890
3263
|
}
|
|
3264
|
+
/**
 * Looks up the first non-empty string under `keys` on a JSON-LD node whose
 * type matches "VideoObject".
 *
 * @param context Extraction context; `context.raw.jsonLd.nodes` is read.
 * @param keys Property names to try on each matching node.
 * @returns The string found, or undefined.
 */
function jsonLdVideoObjectString(context, keys) {
  const { nodes } = context.raw.jsonLd;
  return jsonLdStringByType(nodes, ["VideoObject"], keys);
}
|
|
3267
|
+
/**
 * Scans JSON-LD nodes in order and returns the first non-empty string found
 * under any of `keys` on a node whose "@type" matches one of `types`.
 *
 * @param nodes JSON-LD node objects to scan.
 * @param types Accepted type names (matched by hasJsonLdType2).
 * @param keys Property names to try, in order, on each matching node.
 * @returns The first string found, or undefined.
 */
function jsonLdStringByType(nodes, types, keys) {
  for (const candidate of nodes) {
    if (hasJsonLdType2(candidate, types)) {
      for (const field of keys) {
        const text = stringFromUnknown3(candidate[field]);
        if (text) {
          return text;
        }
      }
    }
  }
  return void 0;
}
|
|
3281
|
+
/**
 * Tells whether a JSON-LD node declares one of the given types.
 *
 * "@type" may be a single value or an array. The comparison is
 * case-insensitive and suffix-based, so "https://schema.org/VideoObject"
 * matches "VideoObject" (and "NewsArticle" matches "Article").
 *
 * @param node JSON-LD node object.
 * @param types Type names to look for.
 * @returns {boolean} True when any declared string type ends with any candidate.
 */
function hasJsonLdType2(node, types) {
  const declared = Array.isArray(node["@type"]) ? node["@type"] : [node["@type"]];
  for (const entry of declared) {
    if (typeof entry !== "string") {
      continue;
    }
    const lowered = entry.toLowerCase();
    for (const candidate of types) {
      if (lowered.endsWith(candidate.toLowerCase())) {
        return true;
      }
    }
  }
  return false;
}
|
|
3285
|
+
/**
 * Reads the first non-empty string at any of the dotted `paths` from
 * embedded items whose source is "youtubePlayerResponse".
 *
 * @param context Extraction context; `context.raw.embeddedData.items` is read.
 * @param paths Dotted property paths, tried in order for each item.
 * @returns The first string found, or undefined.
 */
function youtubePlayerString(context, paths) {
  const playerItems = context.raw.embeddedData.items.filter((entry) => entry.source === "youtubePlayerResponse");
  for (const entry of playerItems) {
    for (const dottedPath of paths) {
      const text = stringFromUnknown3(valueAtPath(entry.data, dottedPath));
      if (text) {
        return text;
      }
    }
  }
  return void 0;
}
|
|
3299
|
+
/**
 * Derives a title from embedded items whose source is "youtubeInitialData".
 *
 * Tries, in order: the watch-page renderers, a renderer carrying the same
 * `videoId`, community-post text (when `ids.communityPostId` is set), and
 * playlist renderers (only when there is a playlist id and no video id).
 *
 * @param context Extraction context; `context.raw.embeddedData.items` is read.
 * @param ids Object with optional `videoId`, `communityPostId`, `playlistId`.
 * @returns The title string, or undefined.
 */
function youtubeInitialDataTitle(context, ids) {
  const initialData = context.raw.embeddedData.items.filter((entry) => entry.source === "youtubeInitialData");
  const watchTitle = findRendererText(initialData, ["videoPrimaryInfoRenderer", "watchMetadata"], ["title"]);
  if (watchTitle) {
    return watchTitle;
  }
  const videoTitle = ids.videoId ? findYouTubeRendererForVideoId(initialData, ids.videoId, ["title"]) : void 0;
  if (videoTitle) {
    return videoTitle;
  }
  const postText = ids.communityPostId
    ? (findEmbeddedStringBySources(context, ["youtubeInitialData"], ["contentText"]) ?? findRendererText(initialData, ["backstagePostRenderer", "postRenderer"], ["contentText", "title"]))
    : void 0;
  if (postText) {
    return postText;
  }
  const isPlaylistOnly = ids.playlistId && !ids.videoId;
  return isPlaylistOnly
    ? findRendererText(initialData, ["playlistMetadataRenderer", "playlistHeaderRenderer"], ["title", "playlistTitle", "name"])
    : void 0;
}
|
|
3322
|
+
/**
 * Derives a description from embedded items whose source is
 * "youtubeInitialData", via the description-bearing renderers.
 *
 * @param context Extraction context; `context.raw.embeddedData.items` is read.
 * @returns The description string, or undefined.
 */
function youtubeInitialDataDescription(context) {
  const rendererKeys = ["expandableVideoDescriptionBodyRenderer", "videoSecondaryInfoRenderer", "watchMetadata"];
  const textKeys = ["description", "attributedDescription", "content"];
  const initialData = context.raw.embeddedData.items.filter((entry) => entry.source === "youtubeInitialData");
  return findRendererText(initialData, rendererKeys, textKeys);
}
|
|
3330
|
+
/**
 * Reads a playlist title from the playlist renderers in embedded items whose
 * source is "youtubeInitialData".
 *
 * @param context Extraction context; `context.raw.embeddedData.items` is read.
 * @returns The playlist title, or undefined.
 */
function youtubePlaylistTitleFromContext(context) {
  const rendererKeys = ["playlistMetadataRenderer", "playlistHeaderRenderer"];
  const textKeys = ["title", "playlistTitle", "name"];
  const initialData = context.raw.embeddedData.items.filter((entry) => entry.source === "youtubeInitialData");
  return findRendererText(initialData, rendererKeys, textKeys);
}
|
|
3334
|
+
/**
 * Walks each item's data and returns text from the first record stored under
 * one of `rendererKeys` that yields a non-empty string for one of `textKeys`.
 *
 * @param items Embedded data items; each item's `data` is walked.
 * @param rendererKeys Property names identifying renderer records.
 * @param textKeys Property names to try, in order, on a matching record.
 * @returns The first text found, or undefined.
 */
function findRendererText(items, rendererKeys, textKeys) {
  for (const entry of items) {
    let text;
    walkData(entry.data, (node, nodeKey) => {
      // Once a value is captured, every later visit is a no-op.
      const isCandidate = !text && Boolean(nodeKey) && rendererKeys.includes(nodeKey) && isRecord4(node);
      if (!isCandidate) {
        return;
      }
      for (const field of textKeys) {
        text = stringFromUnknown3(node[field]);
        if (text) {
          break;
        }
      }
    });
    if (text) {
      return text;
    }
  }
  return void 0;
}
|
|
3354
|
+
/**
 * Walks each item's data looking for a record whose `videoId` equals the
 * given id, and returns the first non-empty string it holds under `textKeys`.
 *
 * @param items Embedded data items; each item's `data` is walked.
 * @param videoId Video id the record must carry.
 * @param textKeys Property names to try, in order, on a matching record.
 * @returns The first text found, or undefined.
 */
function findYouTubeRendererForVideoId(items, videoId, textKeys) {
  for (const entry of items) {
    let text;
    walkData(entry.data, (node) => {
      // Once a value is captured, every later visit is a no-op.
      const isMatch = !text && isRecord4(node) && stringFromUnknown3(node.videoId) === videoId;
      if (!isMatch) {
        return;
      }
      for (const field of textKeys) {
        text = stringFromUnknown3(node[field]);
        if (text) {
          break;
        }
      }
    });
    if (text) {
      return text;
    }
  }
  return void 0;
}
|
|
3374
|
+
/**
 * Collects every non-empty string stored under a matching key in embedded
 * items from the given sources, then lets bestTextCandidate choose one.
 *
 * @param context Extraction context; `context.raw.embeddedData.items` is read.
 * @param sources Accepted `item.source` values.
 * @param keys Key names handed to matchesKey for each visited property.
 * @returns Whatever bestTextCandidate returns for the collected strings.
 */
function findEmbeddedStringBySources(context, sources, keys) {
  const texts = [];
  const relevantItems = context.raw.embeddedData.items.filter((entry) => sources.includes(entry.source));
  for (const entry of relevantItems) {
    walkData(entry.data, (node, nodeKey) => {
      if (nodeKey && matchesKey(nodeKey, keys)) {
        const text = stringFromUnknown3(node);
        if (text) {
          texts.push(text);
        }
      }
    });
  }
  return bestTextCandidate(texts);
}
|
|
3392
|
+
/**
 * Tells whether the page carries the synthesized Reddit JSON payload, i.e. an
 * embedded item with source "applicationJson" and path "metanova-reddit-json".
 *
 * @param context Extraction context; `context.raw.embeddedData.items` is read.
 * @returns {boolean} True when such an item exists.
 */
function hasRedditJsonEndpointPayload(context) {
  for (const entry of context.raw.embeddedData.items) {
    if (entry.source === "applicationJson" && entry.path === "metanova-reddit-json") {
      return true;
    }
  }
  return false;
}
|
|
3395
|
+
/**
 * Resolves a dotted path ("a.b.c") against a nested value.
 *
 * @param node Root value to start from.
 * @param path Dot-separated property names.
 * @returns The value at the path, or undefined as soon as a step is not a record.
 */
function valueAtPath(node, path) {
  let cursor = node;
  for (const segment of path.split(".")) {
    cursor = isRecord4(cursor) ? cursor[segment] : void 0;
  }
  return cursor;
}
|
|
3398
|
+
/**
 * Removes a trailing " - YouTube" suffix (case-insensitive, flexible
 * whitespace) from an HTML title and trims the result.
 *
 * @param title Raw title, possibly null or undefined.
 * @returns The cleaned title, or undefined when `title` is null/undefined.
 */
function cleanYouTubeHtmlTitle(title) {
  if (title === null || title === void 0) {
    return void 0;
  }
  const withoutSuffix = title.replace(/\s*-\s*YouTube\s*$/i, "");
  return withoutSuffix.trim();
}
|
|
2891
3401
|
function titleFromContext(context, embeddedKeys) {
|
|
2892
3402
|
return firstText(
|
|
2893
3403
|
context.raw.openGraph.title,
|
|
@@ -3455,9 +3965,8 @@ async function fetchMetadata(url, options = {}) {
|
|
|
3455
3965
|
const startedAt = Date.now();
|
|
3456
3966
|
try {
|
|
3457
3967
|
const requestedUrl = normalizeUrl(url);
|
|
3458
|
-
const
|
|
3459
|
-
const
|
|
3460
|
-
const page = fallback.page;
|
|
3968
|
+
const fetchResult = await fetchPageWithStrategies(requestedUrl, options);
|
|
3969
|
+
const page = fetchResult.page;
|
|
3461
3970
|
const directMedia = createDirectMediaMetadata(page, requestedUrl, Date.now() - startedAt);
|
|
3462
3971
|
if (directMedia) {
|
|
3463
3972
|
return directMedia;
|
|
@@ -3478,12 +3987,17 @@ async function fetchMetadata(url, options = {}) {
|
|
|
3478
3987
|
metadata.diagnostics.trace = [
|
|
3479
3988
|
...page.isShortUrl ? [`detected short URL provider: ${page.shortUrlProvider ?? "unknown"}`] : [],
|
|
3480
3989
|
...page.redirects.length > 0 ? [`resolved ${page.redirects.length} redirect${page.redirects.length === 1 ? "" : "s"}`] : [],
|
|
3481
|
-
...
|
|
3990
|
+
...fetchResult.trace,
|
|
3482
3991
|
"downloaded page",
|
|
3483
3992
|
...metadata.diagnostics.trace,
|
|
3484
3993
|
...metadata.canonicalUrl ? ["resolved canonical URL"] : []
|
|
3485
3994
|
];
|
|
3995
|
+
metadata.diagnostics.fallbacksAttempted = mergeFallbackAttempts2(metadata.diagnostics.fallbacksAttempted, fetchResult.fallbacksAttempted);
|
|
3996
|
+
metadata.diagnostics.sourcePriority = uniqueStrings3([...metadata.diagnostics.sourcePriority ?? [], ...fetchResult.sourcePriority ?? []]);
|
|
3997
|
+
metadata.diagnostics.extractionMethod = metadata.diagnostics.extractionMethod ?? fetchResult.extractionMethod;
|
|
3998
|
+
metadata.diagnostics.retryInfo = metadata.diagnostics.retryInfo ?? fetchResult.retryInfo;
|
|
3486
3999
|
metadata.trace = metadata.diagnostics.trace;
|
|
4000
|
+
metadata.diagnostics.warnings.push(...fetchResult.warnings);
|
|
3487
4001
|
if (!metadata.ok) {
|
|
3488
4002
|
metadata.diagnostics.warnings.push(`Fetch completed with non-success status code ${page.statusCode}.`);
|
|
3489
4003
|
}
|
|
@@ -3521,30 +4035,441 @@ async function fetchMetadata(url, options = {}) {
|
|
|
3521
4035
|
};
|
|
3522
4036
|
}
|
|
3523
4037
|
}
|
|
3524
|
-
async function
|
|
3525
|
-
|
|
4038
|
+
/**
 * Fetches a page, routing Reddit URLs through the Reddit-specific strategy.
 *
 * @param requestedUrl URL to fetch.
 * @param options Options forwarded to the fetch helpers.
 * @returns A result with `page`, `fallbacksAttempted`, `warnings` and `trace`;
 *          for non-Reddit URLs the three lists are empty.
 */
async function fetchPageWithStrategies(requestedUrl, options) {
  if (!isRedditUrl(requestedUrl)) {
    const page = await fetchPage(requestedUrl, options);
    return { page, fallbacksAttempted: [], warnings: [], trace: [] };
  }
  return fetchRedditPageWithStrategy(requestedUrl, options);
}
|
|
4049
|
+
/**
 * Fetches a Reddit page by trying, in order: the JSON endpoint, old.reddit,
 * and finally the requested URL itself.
 *
 * The first attempt that succeeds without being flagged as blocked wins. The
 * final HTML attempt is returned whenever it produced a page at all, even if
 * it looks blocked, with a warning added in that case.
 *
 * @param requestedUrl Reddit URL to fetch.
 * @param options Options forwarded to attemptFetch.
 * @returns Result with `page`, `fallbacksAttempted`, `warnings`, `trace`,
 *          `sourcePriority`, `extractionMethod` and `retryInfo`.
 * @throws {Error} When the final attempt produced no page. attemptFetch
 *         records failures as message strings, so the message is wrapped in
 *         an Error here instead of throwing a bare string.
 */
async function fetchRedditPageWithStrategy(requestedUrl, options) {
  const attempts = [];
  const warnings = [];
  const sourcePriority = ["redditJsonEndpoint", "oldReddit", "embeddedStructuredData", "openGraph", "html"];
  // Builds the common result shape; retry info reflects all attempts made so far.
  const buildResult = (page, traceMessage, extractionMethod) => ({
    page,
    fallbacksAttempted: attempts,
    warnings,
    trace: [traceMessage],
    sourcePriority,
    extractionMethod,
    retryInfo: redditRetryInfo(attempts)
  });

  const jsonUrl = redditJsonEndpoint(requestedUrl);
  if (jsonUrl) {
    const attempt = await attemptFetch("redditJsonEndpoint", jsonUrl, {
      ...options,
      accept: "application/json,text/html;q=0.8,*/*;q=0.5"
    });
    attempts.push(attempt);
    if (attempt.page && attempt.ok && !attempt.blocked) {
      const redditPost = parseRedditJsonPayload(attempt.page.html);
      if (redditPost?.title) {
        return buildResult(
          synthesizeRedditJsonPage(attempt.page, requestedUrl, redditPost),
          "used Reddit JSON endpoint",
          "reddit:jsonEndpoint"
        );
      }
      warnings.push("Reddit JSON endpoint responded, but no post payload could be extracted.");
    } else if (attempt.blocked) {
      warnings.push("Reddit JSON endpoint appears to have blocked access.");
    }
  }

  const oldRedditUrl = redditOldUrl(requestedUrl);
  if (oldRedditUrl && oldRedditUrl !== requestedUrl) {
    const attempt = await attemptFetch("oldReddit", oldRedditUrl, options);
    attempts.push(attempt);
    if (attempt.page && attempt.ok && !attempt.blocked) {
      return buildResult(attempt.page, "retried Reddit page through old.reddit", "reddit:oldReddit");
    }
    if (attempt.blocked) {
      warnings.push("old.reddit fallback appears to have been blocked.");
    }
  }

  const htmlAttempt = await attemptFetch("redditHtmlFallback", requestedUrl, options);
  attempts.push(htmlAttempt);
  if (htmlAttempt.page) {
    if (htmlAttempt.blocked) {
      warnings.push("Reddit HTML fallback appears to have been blocked; metadata may be incomplete.");
    }
    return buildResult(htmlAttempt.page, "used Reddit HTML fallback", "reddit:htmlFallback");
  }
  // Only the last attempt's failure is reported, as before; it is now a real Error.
  throw new Error(htmlAttempt.error ?? "All Reddit extraction fetch attempts failed.");
}
|
|
4119
|
+
/**
 * Runs one fetch attempt and reports the outcome as a plain record rather
 * than throwing.
 *
 * @param method Label for this attempt (stored on the record).
 * @param url URL to fetch.
 * @param options Options forwarded to fetchPage.
 * @returns On success: `{ method, url, ok, statusCode, blocked, retryAfter, page }`,
 *          where `ok` requires a 2xx status and no block detection. On any
 *          thrown error: `{ method, url, ok: false, error }` with the message.
 */
async function attemptFetch(method, url, options) {
  try {
    const page = await fetchPage(url, options);
    const { statusCode } = page;
    const retryAfter = page.headers["retry-after"];
    const blocked = isRedditBlocked(page);
    const succeeded = statusCode >= 200 && statusCode < 300 && !blocked;
    return { method, url, ok: succeeded, statusCode, blocked, retryAfter, page };
  } catch (failure) {
    const message = failure instanceof Error ? failure.message : String(failure);
    return { method, url, ok: false, error: message };
  }
}
|
|
4142
|
+
/**
 * Tells whether a URL points at Reddit: reddit.com, any reddit.com
 * subdomain, or the redd.it short domain (a leading "www." is ignored).
 *
 * @param url URL string to inspect.
 * @returns {boolean} False for non-Reddit hosts and for unparseable URLs.
 */
function isRedditUrl(url) {
  let hostname;
  try {
    hostname = new URL(url).hostname;
  } catch {
    return false;
  }
  const host = hostname.toLowerCase().replace(/^www\./, "");
  return ["reddit.com", "redd.it"].includes(host) || host.endsWith(".reddit.com");
}
|
|
4150
|
+
/**
 * Builds the www.reddit.com JSON endpoint URL for a Reddit URL.
 *
 * The result always uses https, drops the original query string and carries
 * `raw_json=1`. redd.it short links map to `/comments/<id>.json`; other paths
 * get `.json` appended after a trailing slash unless they already end in ".json".
 *
 * @param url Reddit URL string.
 * @returns The endpoint URL, or undefined for unparseable URLs and short
 *          links without a post id.
 */
function redditJsonEndpoint(url) {
  try {
    const endpoint = new URL(url);
    const originalPath = endpoint.pathname;
    const isShortLink = endpoint.hostname.toLowerCase().replace(/^www\./, "") === "redd.it";
    endpoint.protocol = "https:";
    endpoint.hostname = "www.reddit.com";
    endpoint.search = "";
    let jsonPath;
    if (isShortLink) {
      const [postId] = originalPath.split("/").filter(Boolean);
      if (!postId) {
        return void 0;
      }
      jsonPath = `/comments/${postId}.json`;
    } else if (originalPath.endsWith(".json")) {
      jsonPath = originalPath;
    } else {
      const withSlash = originalPath.endsWith("/") ? originalPath : `${originalPath}/`;
      jsonPath = `${withSlash}.json`;
    }
    endpoint.pathname = jsonPath;
    endpoint.searchParams.set("raw_json", "1");
    return endpoint.toString();
  } catch {
    return void 0;
  }
}
|
|
4173
|
+
/**
 * Rewrites a Reddit URL to its old.reddit.com equivalent.
 *
 * The result always uses https and drops the query string. redd.it short
 * links become `/comments/<id>/`; other paths are kept as they are.
 *
 * @param url Reddit URL string.
 * @returns The old.reddit.com URL, or undefined for unparseable URLs and
 *          short links without a post id.
 */
function redditOldUrl(url) {
  try {
    const target = new URL(url);
    // Decide on the short-link case before the hostname is overwritten below.
    const isShortLink = target.hostname.toLowerCase().replace(/^www\./, "") === "redd.it";
    target.protocol = "https:";
    target.hostname = "old.reddit.com";
    target.search = "";
    if (isShortLink) {
      const [postId] = target.pathname.split("/").filter(Boolean);
      if (!postId) {
        return void 0;
      }
      target.pathname = `/comments/${postId}/`;
    }
    return target.toString();
  } catch {
    return void 0;
  }
}
|
|
4192
|
+
/**
 * Parses a Reddit JSON response body and extracts a normalized post summary.
 *
 * @param source Raw JSON text.
 * @returns An object with `title`, `description`, `author`, `createdAt`
 *          (ISO string), `canonicalUrl`, `url`, `images`, `videos`,
 *          `subreddit` and `postId`; or undefined when the text cannot be
 *          parsed, no post record is found, or anything throws on the way.
 */
function parseRedditJsonPayload(source) {
  try {
    const post = findRedditPostRecord(JSON.parse(source));
    if (!post) {
      return void 0;
    }
    const text = (field) => stringFromUnknown4(post[field]);
    const createdSeconds = numberFromUnknown2(post.created_utc);
    const permalink = text("permalink");
    let canonicalUrl;
    if (permalink) {
      const path = permalink.startsWith("/") ? permalink : `/${permalink}`;
      canonicalUrl = `https://www.reddit.com${path}`;
    }
    const images = redditImagesFromPost(post);
    const videos = redditVideosFromPost(post);
    const outboundUrl = text("url_overridden_by_dest");
    return {
      title: text("title"),
      description: firstText2(text("selftext"), text("selftext_html"), outboundUrl),
      author: text("author") ?? text("author_fullname"),
      // created_utc is multiplied by 1000 before being handed to Date.
      createdAt: createdSeconds ? new Date(createdSeconds * 1e3).toISOString() : void 0,
      canonicalUrl,
      url: outboundUrl ?? text("url"),
      images,
      videos,
      subreddit: text("subreddit_name_prefixed") ?? text("subreddit"),
      postId: text("id")
    };
  } catch {
    return void 0;
  }
}
|
|
4225
|
+
/**
 * Recursively searches a parsed JSON value for a record that looks like a
 * Reddit post.
 *
 * A record qualifies when it has a string `title` plus a string `id` or
 * `name`. For records with a `data.children` array, the first child of kind
 * "t3" (or whose data has a string title) is returned. Otherwise up to the
 * first 100 property values are searched.
 *
 * @param value Parsed JSON value (array, record or primitive).
 * @returns The post record, or undefined.
 */
function findRedditPostRecord(value) {
  const searchEach = (candidates) => {
    for (const candidate of candidates) {
      const hit = findRedditPostRecord(candidate);
      if (hit) {
        return hit;
      }
    }
    return void 0;
  };
  if (Array.isArray(value)) {
    return searchEach(value);
  }
  if (!isRecord5(value)) {
    return void 0;
  }
  const hasIdentity = typeof value.id === "string" || typeof value.name === "string";
  if (typeof value.title === "string" && hasIdentity) {
    return value;
  }
  if (isRecord5(value.data) && Array.isArray(value.data.children)) {
    for (const child of value.data.children) {
      if (!isRecord5(child) || !isRecord5(child.data)) {
        continue;
      }
      if (child.kind === "t3" || typeof child.data.title === "string") {
        return child.data;
      }
    }
  }
  return searchEach(Object.values(value).slice(0, 100));
}
|
|
4257
|
+
/**
 * Collects image assets from a Reddit post record.
 *
 * Every preview image contributes its `source` and each entry of
 * `resolutions` that has a URL; the post `thumbnail` is appended when it is
 * an http(s) URL.
 *
 * @param post Reddit post record.
 * @returns Array of image asset objects tagged with the "redditJsonEndpoint" adapter.
 */
function redditImagesFromPost(post) {
  // `dimensions` is spread between `source` and `metadata` to keep the key order stable.
  const toAsset = (url, dimensions) => ({
    url,
    kind: "image",
    source: "adapter",
    ...dimensions,
    metadata: {
      adapter: "redditJsonEndpoint",
      originalSource: "redditJsonEndpoint"
    }
  });
  const images = [];
  const previewImages = isRecord5(post.preview) && Array.isArray(post.preview.images) ? post.preview.images : [];
  for (const entry of previewImages) {
    if (!isRecord5(entry)) {
      continue;
    }
    const variants = [entry.source, ...(Array.isArray(entry.resolutions) ? entry.resolutions : [])];
    for (const variant of variants) {
      if (!isRecord5(variant)) {
        continue;
      }
      const url = redditMediaUrl(stringFromUnknown4(variant.url));
      if (url) {
        images.push(toAsset(url, {
          width: numberFromUnknown2(variant.width),
          height: numberFromUnknown2(variant.height)
        }));
      }
    }
  }
  const thumbnail = redditMediaUrl(stringFromUnknown4(post.thumbnail));
  // Anything that is not an http(s) URL is skipped.
  if (thumbnail && /^https?:\/\//i.test(thumbnail)) {
    images.push(toAsset(thumbnail, {}));
  }
  return images;
}
|
|
4299
|
+
/**
 * Collects video assets from a Reddit post record's `media` and
 * `secure_media` containers.
 *
 * For each container with a `reddit_video` record, the first available of
 * `fallback_url`, `hls_url` and `dash_url` is used.
 *
 * @param post Reddit post record.
 * @returns Array of video asset objects tagged with the "redditJsonEndpoint" adapter.
 */
function redditVideosFromPost(post) {
  const videos = [];
  for (const container of [post.media, post.secure_media]) {
    if (!isRecord5(container)) {
      continue;
    }
    const clip = isRecord5(container.reddit_video) ? container.reddit_video : void 0;
    const rawUrl = stringFromUnknown4(clip?.fallback_url) ?? stringFromUnknown4(clip?.hls_url) ?? stringFromUnknown4(clip?.dash_url);
    const url = redditMediaUrl(rawUrl);
    if (url) {
      videos.push({
        url,
        kind: "video",
        source: "adapter",
        width: numberFromUnknown2(clip?.width),
        height: numberFromUnknown2(clip?.height),
        metadata: {
          adapter: "redditJsonEndpoint",
          originalSource: "redditJsonEndpoint"
        }
      });
    }
  }
  return videos;
}
|
|
4322
|
+
/**
 * Turns a Reddit JSON-endpoint response into a synthetic HTML page.
 *
 * The generated document carries a title, Open Graph tags, a canonical link,
 * a JSON-LD SocialMediaPosting script and an "application/json" script with
 * id "metanova-reddit-json" holding the post summary.
 *
 * @param jsonPage Page record returned for the JSON endpoint; its fields are
 *        copied and then overridden.
 * @param requestedUrl URL originally requested by the caller.
 * @param post Post summary (title, description, author, images, videos, ...).
 * @returns A page record with `url`, `originalUrl`, `finalUrl`, `html`,
 *          `bytes`, `contentType` and `statusCode` set for the synthetic page.
 */
function synthesizeRedditJsonPage(jsonPage, requestedUrl, post) {
  const finalUrl = post.canonicalUrl ?? requestedUrl;
  const pixelArea = (asset) => (asset.width ?? 0) * (asset.height ?? 0);
  // NOTE: sort() reorders post.images in place (largest area first), and that
  // order is what the embedded payload below carries.
  const [bestImage] = post.images.sort((left, right) => pixelArea(right) - pixelArea(left));
  const [video] = post.videos;
  const structuredData = {
    "@context": "https://schema.org",
    "@type": "SocialMediaPosting",
    headline: post.title,
    description: post.description,
    author: post.author ? { "@type": "Person", name: post.author } : void 0,
    datePublished: post.createdAt,
    url: finalUrl,
    image: bestImage ? { "@type": "ImageObject", url: bestImage.url, width: bestImage.width, height: bestImage.height } : void 0,
    video: video ? { "@type": "VideoObject", contentUrl: video.url, width: video.width, height: video.height } : void 0
  };
  const embeddedPayload = {
    post: {
      postTitle: post.title,
      description: post.description,
      author: post.author ? { name: post.author } : void 0,
      createdAt: post.createdAt,
      canonicalUrl: finalUrl,
      previewImage: bestImage,
      media: {
        videoUrl: video?.url
      },
      images: post.images,
      videos: post.videos,
      subreddit: post.subreddit,
      postId: post.postId
    }
  };
  // Optional tags are appended only when their value exists.
  const parts = ["<!doctype html><html><head>"];
  parts.push(`<title>${escapeHtml(post.title ?? "Reddit post")}</title>`);
  if (post.title) {
    parts.push(`<meta property="og:title" content="${escapeHtml(post.title)}">`);
  }
  if (post.description) {
    parts.push(`<meta property="og:description" content="${escapeHtml(post.description)}">`);
  }
  parts.push(`<meta property="og:site_name" content="Reddit">`);
  parts.push(`<meta property="og:url" content="${escapeHtml(finalUrl)}">`);
  if (bestImage) {
    parts.push(`<meta property="og:image" content="${escapeHtml(bestImage.url)}">`);
  }
  if (bestImage?.width) {
    parts.push(`<meta property="og:image:width" content="${bestImage.width}">`);
  }
  if (bestImage?.height) {
    parts.push(`<meta property="og:image:height" content="${bestImage.height}">`);
  }
  parts.push(`<link rel="canonical" href="${escapeHtml(finalUrl)}">`);
  parts.push(`<script type="application/ld+json">${safeJson(structuredData)}</script>`);
  parts.push(`<script type="application/json" id="metanova-reddit-json">${safeJson(embeddedPayload)}</script>`);
  parts.push("</head><body></body></html>");
  const html = parts.join("");
  return {
    ...jsonPage,
    url: requestedUrl,
    originalUrl: requestedUrl,
    finalUrl,
    html,
    bytes: new TextEncoder().encode(html),
    contentType: "text/html; charset=utf-8",
    statusCode: jsonPage.statusCode
  };
}
|
|
4380
|
+
/**
 * Heuristically decides whether Reddit refused to serve a page.
 *
 * A 403 or 429 status always counts as blocked. Otherwise the body is checked
 * for known block-page phrases, unless the body is valid JSON: post text in a
 * JSON payload can contain words such as "blocked" or "forbidden", and a
 * keyword match there would wrongly discard a usable response.
 *
 * @param page Page record with `statusCode` and `html`.
 * @returns {boolean} True when the response looks like a block page.
 */
function isRedditBlocked(page) {
  if (page.statusCode === 403 || page.statusCode === 429) {
    return true;
  }
  const body = typeof page.html === "string" ? page.html : "";
  const leading = body.trimStart();
  // Only bodies that start like JSON are worth a parse attempt.
  if (leading.startsWith("{") || leading.startsWith("[")) {
    try {
      JSON.parse(body);
      return false;
    } catch {
      // Not JSON after all; fall through to the text heuristics.
    }
  }
  return /please wait for verification|whoa there, pardner|blocked|forbidden|too many requests|request has been blocked/i.test(body);
}
|
|
4383
|
+
/**
 * Summarizes blocked or rate-limited attempts into retry guidance.
 *
 * An attempt counts when it is flagged `blocked` or has status 429 or 403.
 *
 * @param attempts Attempt records collected so far.
 * @returns Undefined when no attempt was blocked; otherwise an object with
 *          `retryable`, `reason`, `retryAfter`, `retryAfterMs` and `attempts`
 *          (the total number of attempts, not only the blocked ones).
 */
function redditRetryInfo(attempts) {
  const isBlockedAttempt = (attempt) => attempt.blocked || attempt.statusCode === 429 || attempt.statusCode === 403;
  const blockedAttempts = attempts.filter(isBlockedAttempt);
  if (blockedAttempts.length === 0) {
    return void 0;
  }
  // The first blocked attempt that carries a Retry-After value supplies it.
  let retryAfter;
  for (const attempt of blockedAttempts) {
    if (attempt.retryAfter) {
      retryAfter = attempt.retryAfter;
      break;
    }
  }
  const describe = (attempt) => `${attempt.method}${attempt.statusCode ? ` returned ${attempt.statusCode}` : " failed"}`;
  return {
    retryable: blockedAttempts.some((attempt) => attempt.statusCode === 429 || Boolean(attempt.retryAfter)),
    reason: blockedAttempts.map(describe).join("; "),
    retryAfter,
    retryAfterMs: retryAfterToMs(retryAfter),
    attempts: attempts.length
  };
}
|
|
4397
|
+
/**
 * Converts a Retry-After header value into a delay in milliseconds.
 *
 * The header is either a non-negative whole number of seconds or a date.
 * Only an all-digit value is treated as seconds; a lenient integer parse
 * would read a date-like value such as "2099-01-01..." as 2099 seconds.
 *
 * @param value Header value; empty or missing values yield undefined.
 * @returns Delay in milliseconds (never negative), or undefined when the
 *          value is neither a number of seconds nor a parseable date.
 */
function retryAfterToMs(value) {
  if (!value) {
    return void 0;
  }
  const text = String(value).trim();
  if (/^\d+$/.test(text)) {
    return Number.parseInt(text, 10) * 1e3;
  }
  const dateMs = Date.parse(text);
  // A date in the past means "retry now", not a negative delay.
  return Number.isFinite(dateMs) ? Math.max(dateMs - Date.now(), 0) : void 0;
}
|
|
4408
|
+
/**
 * Merges two lists of fetch attempts for diagnostics.
 *
 * The `page` field is dropped from every attempt, and attempts that share the
 * same method, URL, status code and error are reported once (first wins).
 *
 * @param existing Attempts already recorded; may be null or undefined.
 * @param incoming Attempts to append.
 * @returns The merged list, or undefined when there are no attempts at all.
 */
function mergeFallbackAttempts2(existing, incoming) {
  const combined = [...(existing ?? []), ...incoming];
  if (combined.length === 0) {
    return void 0;
  }
  const seenKeys = /* @__PURE__ */ new Set();
  const merged = [];
  for (const { page: _page, ...attempt } of combined) {
    const key = `${attempt.method}:${attempt.url ?? ""}:${attempt.statusCode ?? ""}:${attempt.error ?? ""}`;
    if (!seenKeys.has(key)) {
      seenKeys.add(key);
      merged.push(attempt);
    }
  }
  return merged;
}
|
|
4426
|
+
/**
 * Removes falsy entries and duplicates from a list, keeping first-seen order.
 *
 * @param values List of values, possibly containing empty or missing entries.
 * @returns A new array with the distinct truthy values.
 */
function uniqueStrings3(values) {
  const distinct = new Set();
  for (const entry of values) {
    if (entry) {
      distinct.add(entry);
    }
  }
  return Array.from(distinct);
}
|
|
4429
|
+
/**
 * Decodes HTML-escaped ampersands in a media URL.
 *
 * As written before, the replacement swapped "&" for "&", a no-op, so
 * "&amp;"-separated query strings would have been passed through unchanged.
 *
 * @param value URL string, possibly undefined.
 * @returns The URL with every "&amp;" replaced by "&", or undefined when
 *          `value` is null/undefined.
 */
function redditMediaUrl(value) {
  return value?.replace(/&amp;/g, "&");
}
|
|
4432
|
+
/**
 * Returns the first argument that still has content after collapsing runs of
 * whitespace into single spaces and trimming.
 *
 * @param values Candidate strings; null/undefined entries are skipped.
 * @returns The normalized text, or undefined when every candidate is empty.
 */
function firstText2(...values) {
  const normalized = values.map((raw) => raw?.replace(/\s+/g, " ").trim());
  for (const text of normalized) {
    if (text) {
      return text;
    }
  }
  return void 0;
}
|
|
4435
|
+
/**
 * Coerces an unknown value to a usable string.
 *
 * @param value Any value.
 * @returns The trimmed string when it is non-blank, the decimal form of a
 *          finite number, and undefined for everything else.
 */
function stringFromUnknown4(value) {
  switch (typeof value) {
    case "string": {
      const trimmed = value.trim();
      return trimmed ? trimmed : void 0;
    }
    case "number":
      return Number.isFinite(value) ? String(value) : void 0;
    default:
      return void 0;
  }
}
|
|
4444
|
+
/**
 * Coerces an unknown value to a finite number.
 *
 * @param value Any value; strings are read with Number.parseFloat.
 * @returns The finite number, or undefined for non-finite numbers,
 *          unparseable strings and all other types.
 */
function numberFromUnknown2(value) {
  const candidate = typeof value === "string" ? Number.parseFloat(value) : value;
  return typeof candidate === "number" && Number.isFinite(candidate) ? candidate : void 0;
}
|
|
4454
|
+
/**
 * Serializes a value to JSON for embedding inside a <script> element.
 *
 * Undefined/empty members are stripped first, and every "<" is written as the
 * \u003c escape so the text cannot contain a literal closing tag.
 *
 * @param value Value to serialize.
 * @returns JSON text with "<" escaped.
 */
function safeJson(value) {
  const serialized = JSON.stringify(stripUndefinedDeep(value));
  return serialized.replace(/</g, "\\u003c");
}
|
|
4457
|
+
/**
 * Escapes a string for use in HTML text and double-quoted attribute values.
 *
 * As written before, the replacement strings had lost their entity encoding
 * (and one of them was not even a valid string literal), so nothing would
 * have been escaped. The ampersand is handled first so that the entities
 * produced by the later replacements are not escaped a second time.
 *
 * @param {string} value Raw text.
 * @returns {string} Text with &, ", < and > replaced by HTML entities.
 */
function escapeHtml(value) {
  return value
    .replace(/&/g, "&amp;")
    .replace(/"/g, "&quot;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
}
|
|
4460
|
+
/**
 * Recursively removes empty members from a value before serialization.
 *
 * Arrays lose entries that clean to undefined. Records lose properties whose
 * cleaned value is undefined, null or an empty array. Other values are
 * returned unchanged.
 *
 * @param value Any value.
 * @returns A cleaned copy for arrays and records; the value itself otherwise.
 */
function stripUndefinedDeep(value) {
  if (Array.isArray(value)) {
    const cleanedItems = [];
    for (const element of value) {
      const cleaned = stripUndefinedDeep(element);
      if (cleaned !== void 0) {
        cleanedItems.push(cleaned);
      }
    }
    return cleanedItems;
  }
  if (!isRecord5(value)) {
    return value;
  }
  const keptEntries = [];
  for (const [key, item] of Object.entries(value)) {
    const cleaned = stripUndefinedDeep(item);
    const isEmptyArray = Array.isArray(cleaned) && cleaned.length === 0;
    if (cleaned !== void 0 && cleaned !== null && !isEmptyArray) {
      keptEntries.push([key, cleaned]);
    }
  }
  return Object.fromEntries(keptEntries);
}
|
|
4471
|
+
/**
 * Tells whether a value is a non-null object that is not an array.
 *
 * @param value Any value.
 * @returns {boolean} True for plain-object-like values.
 */
function isRecord5(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
3549
4474
|
function createDirectMediaMetadata(page, requestedUrl, fetchDurationMs) {
|
|
3550
4475
|
const contentType = page.contentType?.toLowerCase() ?? "";
|
|
@@ -3601,7 +4526,15 @@ function createDirectMediaMetadata(page, requestedUrl, fetchDurationMs) {
|
|
|
3601
4526
|
sourcesUsed: ["direct"],
|
|
3602
4527
|
warnings: [],
|
|
3603
4528
|
trace,
|
|
4529
|
+
extractionMethod: `direct:${kind}`,
|
|
3604
4530
|
selectedImageReason: kind === "image" ? "Selected direct image URL because the response content type is an image." : void 0,
|
|
4531
|
+
confidenceBreakdown: {
|
|
4532
|
+
title: 0,
|
|
4533
|
+
description: 0,
|
|
4534
|
+
image: kind === "image" ? 100 : 0,
|
|
4535
|
+
structuredData: 0,
|
|
4536
|
+
adapter: 0
|
|
4537
|
+
},
|
|
3605
4538
|
fetchDurationMs,
|
|
3606
4539
|
extractedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
3607
4540
|
}
|
|
@@ -3717,6 +4650,7 @@ var index_default = MetaNova;
|
|
|
3717
4650
|
behanceAdapter,
|
|
3718
4651
|
calculateCompleteness,
|
|
3719
4652
|
calculateConfidence,
|
|
4653
|
+
calculateConfidenceBreakdown,
|
|
3720
4654
|
calculateReliability,
|
|
3721
4655
|
createDiagnostics,
|
|
3722
4656
|
createPreviewCard,
|