metanova 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -756,8 +756,24 @@ function isRecord2(value) {
756
756
  }
757
757
 
758
758
  // src/extractors/media.ts
759
- var LAZY_IMAGE_ATTRIBUTES = ["data-src", "data-original", "data-lazy-src", "data-image", "data-thumbnail"];
760
- var LAZY_MEDIA_ATTRIBUTES = ["data-src", "data-original", "data-lazy-src", "data-video", "data-media"];
759
// Attribute names read from image elements in addition to `src`; each value is
// treated as a single candidate URL. Order is preserved because candidates are
// collected in this order.
var LAZY_IMAGE_ATTRIBUTES = [
  // generic lazy-loading attributes
  "data-src",
  "data-original",
  "data-lazy-src",
  // explicit image / preview attributes
  "data-image",
  "data-image-url",
  "data-og-image",
  "data-thumbnail",
  "data-thumb",
  "data-media",
  // higher-resolution variants
  "data-full-src",
  "data-hi-res-src",
  "data-zoom-src",
  // poster / background attributes
  "data-poster",
  "data-bg"
];

// Attribute names whose values are run through parseSrcset (srcset-style lists
// rather than single URLs).
var LAZY_IMAGE_SRCSET_ATTRIBUTES = [
  "data-srcset",
  "data-lazy-srcset",
  "data-original-srcset"
];

// Lazy-loading attribute names for non-image media.
// NOTE(review): the consumer of this list is outside the visible diff — confirm usage.
var LAZY_MEDIA_ATTRIBUTES = [
  "data-src",
  "data-original",
  "data-lazy-src",
  "data-video",
  "data-video-url",
  "data-media",
  "data-playback-url"
];
761
777
  function extractImages(html, baseUrl) {
762
778
  const $ = loadDocument(html);
763
779
  const images = [];
@@ -783,6 +799,15 @@ function extractImages(html, baseUrl) {
783
799
  type,
784
800
  metadata: { discoveredFrom: "link.preload" }
785
801
  }, baseUrl);
802
+ for (const candidate of parseSrcset($(element).attr("imagesrcset"))) {
803
+ pushResolved(images, {
804
+ url: candidate,
805
+ kind: "image",
806
+ source: "html",
807
+ type,
808
+ metadata: { discoveredFrom: "link.imagesrcset" }
809
+ }, baseUrl);
810
+ }
786
811
  }
787
812
  });
788
813
  collectDocumentImages($, images, baseUrl, "html");
@@ -907,7 +932,8 @@ function collectDocumentImages($, images, baseUrl, source) {
907
932
  const candidates = [
908
933
  normalizeWhitespace($(element).attr("src")),
909
934
  ...LAZY_IMAGE_ATTRIBUTES.map((attribute) => normalizeWhitespace($(element).attr(attribute))),
910
- ...parseSrcset($(element).attr("srcset"))
935
+ ...parseSrcset($(element).attr("srcset")),
936
+ ...LAZY_IMAGE_SRCSET_ATTRIBUTES.flatMap((attribute) => parseSrcset($(element).attr(attribute)))
911
937
  ];
912
938
  for (const candidate of candidates) {
913
939
  pushResolved(images, {
@@ -920,7 +946,10 @@ function collectDocumentImages($, images, baseUrl, source) {
920
946
  }
921
947
  });
922
948
  $("picture source[srcset], source[type^='image/'][srcset]").each((_, element) => {
923
- for (const candidate of parseSrcset($(element).attr("srcset"))) {
949
+ for (const candidate of [
950
+ ...parseSrcset($(element).attr("srcset")),
951
+ ...LAZY_IMAGE_SRCSET_ATTRIBUTES.flatMap((attribute) => parseSrcset($(element).attr(attribute)))
952
+ ]) {
924
953
  pushResolved(images, {
925
954
  url: candidate,
926
955
  kind: "image",
@@ -1212,11 +1241,11 @@ function uniqueStrings(values) {
1212
1241
 
1213
1242
  // src/scorers/image.ts
1214
1243
  var SOURCE_WEIGHT = {
1215
- adapter: 96,
1216
- openGraph: 92,
1244
+ adapter: 98,
1245
+ openGraph: 94,
1217
1246
  oEmbed: 88,
1218
1247
  jsonLd: 82,
1219
- twitter: 78,
1248
+ twitter: 86,
1220
1249
  nextData: 76,
1221
1250
  nuxt: 74,
1222
1251
  initialState: 73,
@@ -1244,7 +1273,9 @@ function scoreImages(images, customScorers = []) {
1244
1273
  scoreReasons: reasons
1245
1274
  }
1246
1275
  };
1247
- }).sort((left, right) => (right.score ?? 0) - (left.score ?? 0));
1276
+ }).sort(
1277
+ (left, right) => (right.score ?? 0) - (left.score ?? 0) || sourceSortWeight(right) - sourceSortWeight(left) || imageArea(right) - imageArea(left)
1278
+ );
1248
1279
  }
1249
1280
  function selectBestImage(images, customScorers = []) {
1250
1281
  const scored = scoreImages(images, customScorers);
@@ -1343,17 +1374,32 @@ function scoreFormat(image) {
1343
1374
  }
1344
1375
  function scoreUrlSignal(image) {
1345
1376
  const url = image.url.toLowerCase();
1346
- const matches = url.match(/cover|preview|thumbnail|thumb|og|card|media|hero|share|social/g) ?? [];
1347
- if (matches.length === 0) {
1377
+ const matches = url.match(/cover|preview|thumbnail|thumb|og|card|media|hero|share|social|maxres|highres|large|original/g) ?? [];
1378
+ const platformScore = platformThumbnailScore(url);
1379
+ if (matches.length === 0 && platformScore.score === 0) {
1348
1380
  return { score: 0, reasons: [] };
1349
1381
  }
1350
1382
  const uniqueMatches = [...new Set(matches)];
1351
- const score = Math.min(uniqueMatches.length * 4, 12);
1383
+ const score = Math.min(uniqueMatches.length * 4, 14) + platformScore.score;
1384
+ const reasons = uniqueMatches.length > 0 ? [`URL matched preview hints (${uniqueMatches.join(", ")}) and added ${Math.min(uniqueMatches.length * 4, 14)} points`] : [];
1385
+ reasons.push(...platformScore.reasons);
1352
1386
  return {
1353
1387
  score,
1354
- reasons: [`URL matched preview hints (${uniqueMatches.join(", ")}) and added ${score} points`]
1388
+ reasons
1355
1389
  };
1356
1390
  }
1391
/**
 * Returns a bonus for URLs that match known platform media hosts.
 *
 * Rules are checked in order and the first match wins.
 *
 * @param {string} url - URL to test.
 * @returns {{ score: number, reasons: string[] }} Bonus and reason, or a zero score with no reasons.
 */
function platformThumbnailScore(url) {
  const rules = [
    {
      pattern: /ytimg\.com\/vi\/[^/]+\/(?:maxresdefault|sddefault|hqdefault)/i,
      points: 12,
      reason: "YouTube platform thumbnail added 12 points"
    },
    {
      pattern: /(?:i|preview|external-preview)\.redd\.it|v\.redd\.it/i,
      points: 10,
      reason: "Reddit media host added 10 points"
    },
    {
      pattern: /pbs\.twimg\.com\/media|pinimg\.com|cdninstagram\.com|fbcdn\.net|tiktokcdn\.com|mir-s3-cdn-cf\.behance\.net/i,
      points: 8,
      reason: "social platform media host added 8 points"
    }
  ];
  const matched = rules.find(({ pattern }) => pattern.test(url));
  if (!matched) {
    return { score: 0, reasons: [] };
  }
  return { score: matched.points, reasons: [matched.reason] };
}
1357
1403
  function scoreUrlPenalty(image) {
1358
1404
  const url = image.url.toLowerCase();
1359
1405
  let penalty = 0;
@@ -1426,6 +1472,12 @@ function countDuplicates(images) {
1426
1472
  }
1427
1473
  return counts;
1428
1474
  }
1475
/**
 * Pixel area of an image; a missing (null/undefined) width or height counts as 0.
 *
 * @param {{ width?: number, height?: number }} image
 * @returns {number} width × height.
 */
function imageArea(image) {
  const { width, height } = image;
  return (width ?? 0) * (height ?? 0);
}
1478
/**
 * Looks up the SOURCE_WEIGHT entry for an image's source, for use as a sort tie-breaker.
 *
 * @param {{ source: string }} image
 * @returns {number} The configured weight, or 50 when the source has no entry.
 */
function sourceSortWeight(image) {
  const configured = SOURCE_WEIGHT[image.source];
  return configured ?? 50;
}
1429
1481
  function mediaSignature(url) {
1430
1482
  try {
1431
1483
  const parsed = new URL(url);
@@ -1443,16 +1495,45 @@ var IMAGE_KEYS = [
1443
1495
  "thumbnailUrl",
1444
1496
  "thumbnail_url",
1445
1497
  "thumbnailSrc",
1498
+ "thumbnail_src",
1446
1499
  "previewImage",
1447
1500
  "preview_image",
1501
+ "preview",
1448
1502
  "ogImage",
1503
+ "og_image",
1449
1504
  "cardImage",
1505
+ "displayUrl",
1506
+ "display_url",
1507
+ "mediaUrl",
1508
+ "media_url",
1509
+ "media_url_https",
1510
+ "fullPicture",
1511
+ "full_picture",
1450
1512
  "cover",
1451
1513
  "coverImage",
1514
+ "cover_image",
1515
+ "original",
1516
+ "source",
1452
1517
  "poster",
1518
+ "posterImage",
1519
+ "media"
1520
+ ];
1521
// Property names that identify video values, in camelCase and snake_case
// spellings. NOTE(review): the consumer is outside the visible diff — confirm usage.
var VIDEO_KEYS = [
  "video", "videos",
  "videoUrl", "video_url",
  "contentUrl", "content_url",
  "embedUrl", "embed_url",
  "playbackUrl", "playback_url",
  "fallback_url",
  "hls_url",
  "dash_url",
  "media"
];
1455
- var VIDEO_KEYS = ["video", "videos", "videoUrl", "video_url", "contentUrl", "embedUrl", "playbackUrl"];
1456
1537
  var AUDIO_KEYS = ["audio", "audios", "audioUrl", "audio_url", "podcastUrl"];
1457
1538
  function discoverMedia(rawSources, finalUrl) {
1458
1539
  const trace = [];
@@ -1586,25 +1667,87 @@ function mediaFromJsonValue(value, kind, source) {
1586
1667
  return value.flatMap((item) => mediaFromJsonValue(item, kind, source));
1587
1668
  }
1588
1669
  if (isRecord3(value)) {
1589
- const url = stringFromUnknown(value.url) ?? stringFromUnknown(value.src) ?? stringFromUnknown(value.contentUrl) ?? stringFromUnknown(value.thumbnailUrl);
1670
+ const srcset = stringFromUnknown(value.srcset) ?? stringFromUnknown(value.srcSet);
1671
+ const srcsetAssets = parseSrcset(srcset).flatMap((url2) => mediaFromJsonValue(url2, kind, source));
1672
+ const url = mediaUrlFromRecord(value, kind);
1673
+ const nestedDetails = nestedMediaDetailsRecord(value, kind);
1590
1674
  if (!url || !looksLikeMediaUrl(url, kind)) {
1591
- return [];
1675
+ return srcsetAssets;
1592
1676
  }
1593
1677
  return [
1594
1678
  {
1595
1679
  url,
1596
1680
  kind,
1597
1681
  source,
1598
- width: parseNumber(stringFromUnknown(value.width)),
1599
- height: parseNumber(stringFromUnknown(value.height)),
1600
- alt: stringFromUnknown(value.alt) ?? stringFromUnknown(value.caption) ?? stringFromUnknown(value.name),
1601
- title: stringFromUnknown(value.title),
1602
- type: stringFromUnknown(value.type) ?? stringFromUnknown(value.mimeType) ?? stringFromUnknown(value.encodingFormat)
1603
- }
1682
+ width: parseNumber(stringFromUnknown(value.width)) ?? parseNumber(stringFromUnknown(nestedDetails?.width)),
1683
+ height: parseNumber(stringFromUnknown(value.height)) ?? parseNumber(stringFromUnknown(nestedDetails?.height)),
1684
+ alt: stringFromUnknown(value.alt) ?? stringFromUnknown(value.caption) ?? stringFromUnknown(value.name) ?? stringFromUnknown(nestedDetails?.alt),
1685
+ title: stringFromUnknown(value.title) ?? stringFromUnknown(nestedDetails?.title),
1686
+ type: stringFromUnknown(value.type) ?? stringFromUnknown(value.mimeType) ?? stringFromUnknown(value.encodingFormat) ?? stringFromUnknown(nestedDetails?.type)
1687
+ },
1688
+ ...srcsetAssets
1604
1689
  ];
1605
1690
  }
1606
1691
  return [];
1607
1692
  }
1693
/**
 * Finds the first nested record on `value` that itself yields a media URL.
 *
 * Properties are checked in a fixed order; a property qualifies when it is a
 * record (isRecord3) and mediaUrlFromRecord returns a URL for it.
 *
 * @param {object} value - Parent record to inspect.
 * @param {string} kind - Media kind forwarded to mediaUrlFromRecord.
 * @returns {object|undefined} The qualifying nested record, if any.
 */
function nestedMediaDetailsRecord(value, kind) {
  const nestedKeys = [
    "source",
    "original",
    "image",
    "thumbnail",
    "thumbnailUrl",
    "thumbnail_url",
    "previewImage",
    "preview_image",
    "video",
    "reddit_video"
  ];
  for (const key of nestedKeys) {
    const nested = value[key];
    if (isRecord3(nested) && mediaUrlFromRecord(nested, kind)) {
      return nested;
    }
  }
  return void 0;
}
1708
/**
 * Picks the first property of `value` that holds a plausible media URL.
 *
 * For `kind === "video"` the video-specific properties are checked before the
 * common ones. Each property is converted with stringFromUnknown and accepted
 * when looksLikeMediaUrl approves it for `kind`.
 *
 * The helper is called with exactly one argument and the scan stops at the
 * first hit. Passing the helper straight to `Array.prototype.map` would also
 * hand it the index and array, and would convert every candidate up front.
 *
 * @param {object} value - Record to inspect.
 * @param {string} kind - Media kind ("image", "video", ...).
 * @returns {string|undefined} The first accepted URL, or undefined.
 */
function mediaUrlFromRecord(value, kind) {
  const commonKeys = [
    "url",
    "src",
    "secure_url",
    "secureUrl",
    "contentUrl",
    "content_url",
    "embedUrl",
    "embed_url",
    "thumbnailUrl",
    "thumbnail_url",
    "thumbnailSrc",
    "thumbnail_src",
    "mediaUrl",
    "media_url",
    "media_url_https",
    "displayUrl",
    "display_url",
    "fullPicture",
    "full_picture",
    "previewImage",
    "preview_image",
    "poster",
    "posterUrl",
    "poster_url",
    "coverImage",
    "cover_image",
    "original",
    "source"
  ];
  const videoKeys = [
    "videoUrl",
    "video_url",
    "playbackUrl",
    "playback_url",
    "fallback_url",
    "hls_url",
    "dash_url"
  ];
  const keys = kind === "video" ? [...videoKeys, ...commonKeys] : commonKeys;
  for (const key of keys) {
    const candidate = stringFromUnknown(value[key]);
    if (candidate && looksLikeMediaUrl(candidate, kind)) {
      return candidate;
    }
  }
  return void 0;
}
1608
1751
  function assetFromEmbedded(value, kind, item, parent) {
1609
1752
  return {
1610
1753
  url: value,
@@ -1684,7 +1827,7 @@ function sourceRank(source) {
1684
1827
  }
1685
1828
  function shouldIgnoreMediaUrl2(url) {
1686
1829
  const normalized = url.toLowerCase();
1687
- return normalized.startsWith("data:") || normalized.startsWith("blob:") || normalized.startsWith("javascript:") || /(?:sprite|spacer|blank|transparent|placeholder|tracking|beacon|pixel|emoji)(?:[._/-]|$|\?)/i.test(normalized) || /(?:^|[/?_-])1x1(?:[._/-]|$|\?)/i.test(normalized);
1830
+ return normalized.startsWith("data:") || normalized.startsWith("blob:") || normalized.startsWith("javascript:") || /(?:sprite|spacer|blank|transparent|placeholder|tracking|beacon|pixel|emoji|favicon|apple-touch-icon)(?:[._/-]|$|\?)/i.test(normalized) || /(?:^|[/?_-])1x1(?:[._/-]|$|\?)/i.test(normalized);
1688
1831
  }
1689
1832
  function looksLikeMediaUrl(value, kind) {
1690
1833
  if (shouldIgnoreMediaUrl2(value)) {
@@ -1692,10 +1835,10 @@ function looksLikeMediaUrl(value, kind) {
1692
1835
  }
1693
1836
  if (/^https?:\/\//i.test(value) || value.startsWith("/") || value.startsWith("./") || value.startsWith("../")) {
1694
1837
  if (kind === "image") {
1695
- return /\.(?:avif|webp|png|jpe?g|gif)(?:[?#].*)?$/i.test(value) || /(?:image|thumb|thumbnail|cover|poster|preview|media|og|card|photo)/i.test(value);
1838
+ return /\.(?:avif|webp|png|jpe?g|gif)(?:[?#].*)?$/i.test(value) || /(?:image|thumb|thumbnail|cover|poster|preview|media|og|card|photo|format=(?:jpg|jpeg|png|webp))/i.test(value) || /(?:ytimg\.com|i\.redd\.it|preview\.redd\.it|external-preview\.redd\.it|pbs\.twimg\.com|pinimg\.com|cdninstagram\.com|fbcdn\.net|tiktokcdn\.com|behance\.net)/i.test(value);
1696
1839
  }
1697
1840
  if (kind === "video") {
1698
- return /\.(?:mp4|webm|m3u8|mov)(?:[?#].*)?$/i.test(value) || /(?:video|embed|player|watch|reel|shorts)/i.test(value);
1841
+ return /\.(?:mp4|webm|m3u8|mov)(?:[?#].*)?$/i.test(value) || /(?:video|embed|player|watch|reel|shorts|v\.redd\.it)/i.test(value);
1699
1842
  }
1700
1843
  if (kind === "audio") {
1701
1844
  return /\.(?:mp3|m4a|wav|ogg|aac)(?:[?#].*)?$/i.test(value) || /(?:audio|podcast)/i.test(value);
@@ -1770,6 +1913,20 @@ function calculateConfidence(input) {
1770
1913
  score -= Math.min(input.warnings.length * 3, 18);
1771
1914
  return Math.round(clamp2(score, 0, 100));
1772
1915
  }
1916
/**
 * Produces rounded per-field confidence figures for diagnostics.
 *
 * - title / description: delegated to qualityPoints.
 * - image: 0 without a best image; otherwise 58 + 0.27 × min(score, 100) plus the
 *   source bonus, clamped to 0..100.
 * - structuredData: 100 when structured data is present, 55 when only embedded
 *   data items exist, otherwise 0.
 * - adapter: confidence of the first adapter that actually produced content.
 *   Success is decided per adapter, so an empty leading adapter is not scored in
 *   place of a later successful one.
 *
 * @param {object} input - Same input object that is given to calculateConfidence.
 * @returns {{ title: number, description: number, image: number, structuredData: number, adapter: number }}
 */
function calculateConfidenceBreakdown(input) {
  const { adapters, embeddedData } = input.rawSources;
  const title = qualityPoints(input.title, 100, 6, 120);
  const description = qualityPoints(input.description, 100, 24, 300);

  let image = 0;
  if (input.bestImage) {
    const scorePart = Math.min(input.bestImage.score ?? 0, 100) * 0.27;
    image = clamp2(58 + scorePart + sourceConfidenceBonus(input.bestImage.source), 0, 100);
  }

  let structuredData = 0;
  if (input.hasStructuredData) {
    structuredData = 100;
  } else if (embeddedData.items.length > 0) {
    structuredData = 55;
  }

  // Reuse adapterSucceeded so "success" has a single definition; adapterConfidence
  // returns 0 when no adapter qualifies (undefined argument).
  const successfulAdapter = adapters.find((candidate) => adapterSucceeded([candidate]));
  const adapter = adapterConfidence(successfulAdapter);

  return {
    title: Math.round(title),
    description: Math.round(description),
    image: Math.round(image),
    structuredData: Math.round(structuredData),
    adapter: Math.round(adapter)
  };
}
1773
1930
  function calculateCompleteness(input) {
1774
1931
  const weights = [
1775
1932
  input.title ? 20 : 0,
@@ -1839,6 +1996,25 @@ function sourceConfidenceBonus(source) {
1839
1996
  function adapterSucceeded(adapters) {
1840
1997
  return adapters.some((adapter) => Boolean(adapter.title || adapter.description || adapter.images?.length || adapter.videos?.length));
1841
1998
  }
1999
/**
 * Scores a single adapter result from the fields it filled in.
 *
 * Starts at 45 and adds 22 for a title, 14 for a description, 14 for any
 * images or videos, and 6 for an author; the total is limited to 0..100.
 *
 * @param {object|undefined} adapter - Adapter result, or a falsy value.
 * @returns {number} 0 when no adapter is given, otherwise the limited score.
 */
function adapterConfidence(adapter) {
  if (!adapter) {
    return 0;
  }
  const hasMedia = (adapter.images?.length ?? 0) > 0 || (adapter.videos?.length ?? 0) > 0;
  const bonuses = [
    [adapter.title, 22],
    [adapter.description, 14],
    [hasMedia, 14],
    [adapter.author, 6]
  ];
  const total = bonuses.reduce((sum, [present, points]) => (present ? sum + points : sum), 45);
  // Same bounds as clamp2(total, 0, 100).
  return Math.max(0, Math.min(100, total));
}
1842
2018
  function clamp2(value, min, max) {
1843
2019
  return Math.max(min, Math.min(max, value));
1844
2020
  }
@@ -1902,6 +2078,7 @@ function normalizeMetadata(rawSources, context = {}) {
1902
2078
  const type = inferType(rawSources, externalResults, jsonLdNodes, article, product, app, playlist, videos, audio);
1903
2079
  const author = firstResultValue(externalResults, (result) => result.author) ?? firstEntity(article?.authors) ?? entityFromEmbedded(embeddedNodes, ["author", "creator", "owner", "user"]);
1904
2080
  const publisher = article?.publisher ?? firstResultValue(externalResults, (result) => result.publisher) ?? entityFromJsonLd(organizationNode) ?? entityFromEmbedded(embeddedNodes, ["publisher", "provider", "organization"]);
2081
+ const publishDate = firstDefined(article?.publishedTime, video?.publishedTime);
1905
2082
  const sourcesUsed = detectSourcesUsed(rawSources);
1906
2083
  const warnings = diagnosticsWarnings(rawSources, externalResults, context.diagnostics);
1907
2084
  const fieldSources = {
@@ -1910,7 +2087,7 @@ function normalizeMetadata(rawSources, context = {}) {
1910
2087
  author: fieldSource(rawSources, externalResults, embeddedNodes, "author", selectedImage.best),
1911
2088
  image: fieldSource(rawSources, externalResults, embeddedNodes, "image", selectedImage.best)
1912
2089
  };
1913
- const confidence = calculateConfidence({
2090
+ const confidenceInput = {
1914
2091
  title,
1915
2092
  description,
1916
2093
  bestImage: selectedImage.best,
@@ -1919,7 +2096,9 @@ function normalizeMetadata(rawSources, context = {}) {
1919
2096
  rawSources,
1920
2097
  sourcesUsed,
1921
2098
  warnings
1922
- });
2099
+ };
2100
+ const confidence = calculateConfidence(confidenceInput);
2101
+ const confidenceBreakdown = calculateConfidenceBreakdown(confidenceInput);
1923
2102
  const completeness = calculateCompleteness({
1924
2103
  title,
1925
2104
  description,
@@ -1929,7 +2108,7 @@ function normalizeMetadata(rawSources, context = {}) {
1929
2108
  author,
1930
2109
  publisher,
1931
2110
  type,
1932
- publishedTime: article?.publishedTime,
2111
+ publishedTime: publishDate,
1933
2112
  mediaCount: images.length + videos.length + audio.length
1934
2113
  });
1935
2114
  const reliability = calculateReliability({
@@ -1948,7 +2127,19 @@ function normalizeMetadata(rawSources, context = {}) {
1948
2127
  };
1949
2128
  diagnostics.sourcesUsed = uniqueStrings2([...diagnostics.sourcesUsed, ...sourcesUsed]);
1950
2129
  diagnostics.warnings = uniqueStrings2([...diagnostics.warnings, ...rawSources.jsonLd.warnings, ...externalResults.flatMap((result) => result.warnings ?? [])]);
2130
+ diagnostics.adapterUsed = diagnostics.adapterUsed ?? rawSources.adapters[0]?.source;
2131
+ diagnostics.extractionMethod = diagnostics.extractionMethod ?? adapterRawString(rawSources.adapters[0], "extractionMethod") ?? fieldSources.title;
2132
+ diagnostics.sourcePriority = uniqueStrings2([
2133
+ ...diagnostics.sourcePriority ?? [],
2134
+ ...arrayOfStrings(rawSources.adapters[0]?.raw?.sourcePriority) ?? []
2135
+ ]);
2136
+ diagnostics.fallbacksAttempted = mergeFallbackAttempts(
2137
+ diagnostics.fallbacksAttempted,
2138
+ fallbackAttemptsFromUnknown(rawSources.adapters[0]?.raw?.fallbacksAttempted)
2139
+ );
2140
+ diagnostics.retryInfo = diagnostics.retryInfo ?? retryInfoFromUnknown(rawSources.adapters[0]?.raw?.retryInfo);
1951
2141
  diagnostics.selectedImageReason = selectedImage.reason;
2142
+ diagnostics.confidenceBreakdown = confidenceBreakdown;
1952
2143
  diagnostics.originalUrl = diagnostics.originalUrl ?? url;
1953
2144
  diagnostics.finalUrl = diagnostics.finalUrl ?? finalUrl;
1954
2145
  diagnostics.canonicalUrl = canonicalUrl;
@@ -1965,6 +2156,7 @@ function normalizeMetadata(rawSources, context = {}) {
1965
2156
  type,
1966
2157
  title,
1967
2158
  description,
2159
+ publishDate,
1968
2160
  siteName,
1969
2161
  canonicalUrl,
1970
2162
  confidence,
@@ -2254,6 +2446,61 @@ function adapterDiagnostics(adapters) {
2254
2446
  confidence: Math.min(confidence, 100)
2255
2447
  };
2256
2448
  }
2449
/**
 * Reads `adapter.raw[key]` and returns it trimmed when it is a non-blank string.
 *
 * @param {object|undefined} adapter - Adapter result; may be missing or lack `raw`.
 * @param {string} key - Property name inside `raw`.
 * @returns {string|undefined} Trimmed string, or undefined for anything else.
 */
function adapterRawString(adapter, key) {
  const raw = adapter?.raw?.[key];
  if (typeof raw !== "string") {
    return void 0;
  }
  const trimmed = raw.trim();
  return trimmed ? trimmed : void 0;
}
2453
/**
 * Converts an untyped value into a list of fallback-attempt records.
 *
 * Only entries that pass isJsonLdNode and have a string `method` are kept.
 * Each optional field is copied only when it has the expected primitive type;
 * `ok` defaults to false.
 *
 * @param {unknown} value - Expected to be an array of attempt-like objects.
 * @returns {Array<object>|undefined} Sanitised attempts, or undefined when the
 *   input is not an array or no entry qualifies.
 */
function fallbackAttemptsFromUnknown(value) {
  if (!Array.isArray(value)) {
    return void 0;
  }
  const ofType = (candidate, expected) => (typeof candidate === expected ? candidate : void 0);
  const attempts = value
    .filter((entry) => isJsonLdNode(entry) && typeof entry.method === "string")
    .map((entry) => ({
      method: entry.method,
      url: ofType(entry.url, "string"),
      ok: ofType(entry.ok, "boolean") ?? false,
      statusCode: ofType(entry.statusCode, "number"),
      blocked: ofType(entry.blocked, "boolean"),
      error: ofType(entry.error, "string"),
      retryAfter: ofType(entry.retryAfter, "string")
    }));
  return attempts.length > 0 ? attempts : void 0;
}
2473
/**
 * Concatenates two fallback-attempt lists and removes duplicates, keeping the
 * first occurrence.
 *
 * Two attempts are duplicates when their method, url, statusCode and error are
 * all equal after string conversion (missing values count as ""). The key is a
 * JSON-encoded tuple, not a ":"-joined string, because URLs and error messages
 * routinely contain ":" and a joined key can make distinct attempts collide.
 *
 * @param {Array<object>|undefined} existing - Attempts already recorded.
 * @param {Array<object>|undefined} incoming - Attempts to merge in.
 * @returns {Array<object>|undefined} De-duplicated attempts, or undefined when both inputs are empty.
 */
function mergeFallbackAttempts(existing, incoming) {
  const attempts = [...existing ?? [], ...incoming ?? []];
  if (attempts.length === 0) {
    return void 0;
  }
  const seen = /* @__PURE__ */ new Set();
  return attempts.filter((attempt) => {
    const key = JSON.stringify([
      `${attempt.method}`,
      `${attempt.url ?? ""}`,
      `${attempt.statusCode ?? ""}`,
      `${attempt.error ?? ""}`
    ]);
    if (seen.has(key)) {
      return false;
    }
    seen.add(key);
    return true;
  });
}
2488
/**
 * Converts an untyped value into a retry-info record.
 *
 * The value must pass isJsonLdNode and carry a boolean `retryable`; the other
 * fields are copied only when they have the expected primitive type.
 *
 * @param {unknown} value
 * @returns {object|undefined} Retry info, or undefined when the value does not qualify.
 */
function retryInfoFromUnknown(value) {
  if (!isJsonLdNode(value) || typeof value.retryable !== "boolean") {
    return void 0;
  }
  const { retryable, reason, retryAfter, retryAfterMs, attempts } = value;
  return {
    retryable,
    reason: typeof reason === "string" ? reason : void 0,
    retryAfter: typeof retryAfter === "string" ? retryAfter : void 0,
    retryAfterMs: typeof retryAfterMs === "number" ? retryAfterMs : void 0,
    attempts: typeof attempts === "number" ? attempts : void 0
  };
}
2257
2504
  function fieldSource(rawSources, externalResults, embeddedNodes, field, bestImage) {
2258
2505
  if (field === "image") {
2259
2506
  return bestImage ? sourceLabel2(bestImage) : void 0;
@@ -2502,23 +2749,26 @@ var youtubeAdapter = {
2502
2749
  const videoId = getYouTubeVideoId(url);
2503
2750
  const playlistId = getYouTubePlaylistId(url);
2504
2751
  const communityPostId = getYouTubeCommunityPostId(url);
2752
+ const titleSelection = youtubeTitleFromContext(context, { videoId, playlistId, communityPostId });
2753
+ const descriptionSelection = youtubeDescriptionFromContext(context);
2505
2754
  const channel = entityFromContext(context, ["author", "ownerChannelName", "channel", "owner"]);
2506
2755
  const playlistVideos = playlistId ? extractPlaylistVideos(context) : [];
2756
+ const sourcePriority = youtubeSourcePriority();
2507
2757
  return compactAdapterResult({
2508
2758
  source: "youtubeAdapter",
2509
2759
  platform: "YouTube",
2510
2760
  type: playlistId ? "playlist" : communityPostId ? "social_post" : "video",
2511
2761
  siteName: "YouTube",
2512
2762
  canonicalUrl: videoId ? `https://www.youtube.com/watch?v=${videoId}` : context.raw.openGraph.url,
2513
- title: titleFromContext(context, ["videoDetails", "title", "headline", "name", "contentText"]),
2514
- description: descriptionFromContext(context),
2763
+ title: titleSelection.value,
2764
+ description: descriptionSelection.value,
2515
2765
  videos: markAdapterMedia(mediaFromContext(context).videos, "youtubeAdapter"),
2516
2766
  images: markAdapterMedia(mediaFromContext(context).images, "youtubeAdapter"),
2517
2767
  author: channel,
2518
2768
  article: { publishedTime: publishedTimeFromContext(context) },
2519
2769
  video: videoId ? {
2520
2770
  id: videoId,
2521
- title: titleFromContext(context, ["videoDetails", "title"]),
2771
+ title: titleSelection.value,
2522
2772
  channel,
2523
2773
  publishedTime: publishedTimeFromContext(context),
2524
2774
  duration: findEmbeddedString(context, ["duration", "lengthSeconds", "approxDurationMs"]),
@@ -2528,11 +2778,15 @@ var youtubeAdapter = {
2528
2778
  } : void 0,
2529
2779
  playlist: playlistId ? {
2530
2780
  id: playlistId,
2531
- title: findEmbeddedString(context, ["playlistTitle", "playlistName", "title"]) ?? context.raw.openGraph.title,
2781
+ title: youtubePlaylistTitleFromContext(context) ?? context.raw.openGraph.title,
2532
2782
  channel,
2533
2783
  videos: playlistVideos
2534
2784
  } : void 0,
2535
- identifiers: { videoId, playlistId, communityPostId }
2785
+ identifiers: { videoId, playlistId, communityPostId },
2786
+ raw: {
2787
+ sourcePriority,
2788
+ extractionMethod: titleSelection.method ?? descriptionSelection.method ?? "youtube:htmlFallback"
2789
+ }
2536
2790
  });
2537
2791
  },
2538
2792
  normalize(rawData) {
@@ -2551,20 +2805,27 @@ var redditAdapter = {
2551
2805
  const url = new URL(context.finalUrl);
2552
2806
  const reddit = parseRedditUrl(url);
2553
2807
  const username = typeof reddit.username === "string" ? reddit.username : void 0;
2808
+ const titleSelection = redditTitleFromContext(context);
2809
+ const descriptionSelection = redditDescriptionFromContext(context);
2810
+ const sourcePriority = redditSourcePriority();
2554
2811
  return compactAdapterResult({
2555
2812
  source: "redditAdapter",
2556
2813
  platform: "Reddit",
2557
2814
  type: reddit.isPost ? "social_post" : "website",
2558
2815
  siteName: "Reddit",
2559
- canonicalUrl: context.raw.openGraph.url,
2560
- title: cleanSocialTitle(titleFromContext(context, ["title", "postTitle", "headline"])),
2561
- description: descriptionFromContext(context),
2816
+ canonicalUrl: context.raw.openGraph.url ?? context.raw.html.canonicalUrl,
2817
+ title: cleanSocialTitle(titleSelection.value),
2818
+ description: descriptionSelection.value,
2562
2819
  images: markAdapterMedia(mediaFromContext(context).images, "redditAdapter"),
2563
2820
  videos: markAdapterMedia(mediaFromContext(context).videos, "redditAdapter"),
2564
2821
  author: username ? { name: username } : entityFromContext(context, ["author", "submitter", "user"]),
2565
2822
  article: { publishedTime: publishedTimeFromContext(context) },
2566
2823
  identifiers: { subreddit: reddit.subreddit, postId: reddit.postId, username: reddit.username },
2567
- raw: { ...reddit }
2824
+ raw: {
2825
+ ...reddit,
2826
+ sourcePriority,
2827
+ extractionMethod: titleSelection.method ?? descriptionSelection.method ?? "reddit:htmlFallback"
2828
+ }
2568
2829
  });
2569
2830
  },
2570
2831
  normalize(rawData) {
@@ -2665,6 +2926,7 @@ var facebookAdapter = {
2665
2926
  platform: "Facebook",
2666
2927
  type: isPhoto ? "image" : isPost || media.images.length > 0 || media.videos.length > 0 ? "social_post" : "website",
2667
2928
  siteName: "Facebook",
2929
+ canonicalUrl: context.raw.openGraph.url,
2668
2930
  title: titleFromContext(context, ["title", "headline", "name"]),
2669
2931
  description: descriptionFromContext(context),
2670
2932
  images: markAdapterMedia(media.images, "facebookAdapter"),
@@ -2748,6 +3010,116 @@ var defaultAdapters = [
2748
3010
  twitterAdapter,
2749
3011
  instagramAdapter
2750
3012
  ];
3013
/**
 * Lists the sources the YouTube adapter reports as its priority order.
 *
 * @returns {string[]} A fresh array on every call.
 */
function youtubeSourcePriority() {
  const priority = [];
  priority.push("structuredData:VideoObject");
  priority.push("embeddedData:ytInitialPlayerResponse", "embeddedData:ytInitialData");
  priority.push("openGraph", "twitter", "html");
  return priority;
}
3023
/**
 * Resolves a YouTube title and records which source supplied it.
 *
 * Sources are tried in order — JSON-LD VideoObject, player response, initial
 * data, Open Graph, Twitter card — and the first truthy value wins. If none
 * match, the cleaned HTML title is returned; its method is set only when an
 * HTML title exists.
 *
 * @param {object} context - Adapter context (reads `context.raw`).
 * @param {object} ids - Identifiers forwarded to youtubeInitialDataTitle.
 * @returns {{ value: string|undefined, method: string|undefined }}
 */
function youtubeTitleFromContext(context, ids) {
  // Lookups are thunks so a later source is only consulted when earlier ones miss.
  const lookups = [
    ["youtube:structuredData.VideoObject", () => jsonLdVideoObjectString(context, ["name", "headline"])],
    ["youtube:ytInitialPlayerResponse", () => youtubePlayerString(context, ["videoDetails.title", "microformat.playerMicroformatRenderer.title"])],
    ["youtube:ytInitialData", () => youtubeInitialDataTitle(context, ids)],
    ["youtube:openGraph", () => context.raw.openGraph.title],
    ["youtube:twitter", () => context.raw.twitter.title]
  ];
  for (const [method, resolve] of lookups) {
    const value = resolve();
    if (value) {
      return { value, method };
    }
  }
  const htmlTitle = context.raw.html.title;
  return { value: cleanYouTubeHtmlTitle(htmlTitle), method: htmlTitle ? "youtube:html" : void 0 };
}
3044
/**
 * Resolves a YouTube description and records which source supplied it.
 *
 * Sources are tried in order — JSON-LD VideoObject, player response, initial
 * data, Open Graph, Twitter card — and the first truthy value wins. If none
 * match, the HTML description is returned; its method is set only when present.
 *
 * @param {object} context - Adapter context (reads `context.raw`).
 * @returns {{ value: string|undefined, method: string|undefined }}
 */
function youtubeDescriptionFromContext(context) {
  // Lookups are thunks so a later source is only consulted when earlier ones miss.
  const lookups = [
    ["youtube:structuredData.VideoObject", () => jsonLdVideoObjectString(context, ["description"])],
    ["youtube:ytInitialPlayerResponse", () => youtubePlayerString(context, [
      "videoDetails.shortDescription",
      "microformat.playerMicroformatRenderer.description",
      "microformat.playerMicroformatRenderer.shortDescription"
    ])],
    ["youtube:ytInitialData", () => youtubeInitialDataDescription(context)],
    ["youtube:openGraph", () => context.raw.openGraph.description],
    ["youtube:twitter", () => context.raw.twitter.description]
  ];
  for (const [method, resolve] of lookups) {
    const value = resolve();
    if (value) {
      return { value, method };
    }
  }
  const htmlDescription = context.raw.html.description;
  return { value: htmlDescription, method: htmlDescription ? "youtube:html" : void 0 };
}
3069
/**
 * Lists the sources the Reddit adapter reports as its priority order.
 *
 * @returns {string[]} A fresh array on every call.
 */
function redditSourcePriority() {
  const priority = [];
  priority.push("redditJsonEndpoint", "oldReddit");
  priority.push("embeddedStructuredData");
  priority.push("openGraph", "twitter", "html");
  return priority;
}
3079
/**
 * Resolves a Reddit title and records which source supplied it.
 *
 * Sources are tried in order — embedded JSON data, JSON-LD posting/article
 * nodes, Open Graph, Twitter card — and the first truthy value wins. An embedded
 * hit is labelled "reddit:jsonEndpoint" when hasRedditJsonEndpointPayload
 * reports such a payload, otherwise "reddit:embeddedStructuredData". If nothing
 * matches, the HTML title is returned; its method is set only when present.
 *
 * @param {object} context - Adapter context (reads `context.raw`).
 * @returns {{ value: string|undefined, method: string|undefined }}
 */
function redditTitleFromContext(context) {
  // Both the value and its label are thunks so work happens only when a step is reached.
  const steps = [
    {
      read: () => findEmbeddedStringBySources(context, ["applicationJson", "jsonScript", "initialState", "preloadedState", "nextData"], [
        "postTitle",
        "title",
        "headline"
      ]),
      label: () => (hasRedditJsonEndpointPayload(context) ? "reddit:jsonEndpoint" : "reddit:embeddedStructuredData")
    },
    {
      read: () => jsonLdStringByType(context.raw.jsonLd.nodes, ["SocialMediaPosting", "DiscussionForumPosting", "Article"], ["headline", "name"]),
      label: () => "reddit:structuredData"
    },
    { read: () => context.raw.openGraph.title, label: () => "reddit:openGraph" },
    { read: () => context.raw.twitter.title, label: () => "reddit:twitter" }
  ];
  for (const { read, label } of steps) {
    const value = read();
    if (value) {
      return { value, method: label() };
    }
  }
  const htmlTitle = context.raw.html.title;
  return { value: htmlTitle, method: htmlTitle ? "reddit:html" : void 0 };
}
3100
/**
 * Resolves a Reddit description and records which source supplied it.
 *
 * Sources are tried in order — embedded JSON data, JSON-LD posting/article
 * nodes, Open Graph, Twitter card — and the first truthy value wins. An embedded
 * hit is labelled "reddit:jsonEndpoint" when hasRedditJsonEndpointPayload
 * reports such a payload, otherwise "reddit:embeddedStructuredData". If nothing
 * matches, the HTML description is returned; its method is set only when present.
 *
 * @param {object} context - Adapter context (reads `context.raw`).
 * @returns {{ value: string|undefined, method: string|undefined }}
 */
function redditDescriptionFromContext(context) {
  // Both the value and its label are thunks so work happens only when a step is reached.
  const steps = [
    {
      read: () => findEmbeddedStringBySources(context, ["applicationJson", "jsonScript", "initialState", "preloadedState", "nextData"], [
        "description",
        "selftext",
        "excerpt",
        "summary",
        "body"
      ]),
      label: () => (hasRedditJsonEndpointPayload(context) ? "reddit:jsonEndpoint" : "reddit:embeddedStructuredData")
    },
    {
      read: () => jsonLdStringByType(context.raw.jsonLd.nodes, ["SocialMediaPosting", "DiscussionForumPosting", "Article"], ["description", "articleBody"]),
      label: () => "reddit:structuredData"
    },
    { read: () => context.raw.openGraph.description, label: () => "reddit:openGraph" },
    { read: () => context.raw.twitter.description, label: () => "reddit:twitter" }
  ];
  for (const { read, label } of steps) {
    const value = read();
    if (value) {
      return { value, method: label() };
    }
  }
  const htmlDescription = context.raw.html.description;
  return { value: htmlDescription, method: htmlDescription ? "reddit:html" : void 0 };
}
2751
3123
  function socialVideoResult(source, platform, context) {
2752
3124
  const url = new URL(context.finalUrl);
2753
3125
  const username = url.pathname.match(/@([^/]+)/)?.[1];
@@ -2806,6 +3178,143 @@ function markAdapterMedia(assets, adapterName) {
2806
3178
  }
2807
3179
  }));
2808
3180
  }
3181
/**
 * Reads the first non-empty string among `keys` from a JSON-LD `VideoObject` node.
 *
 * @param {object} context Extraction context; `context.raw.jsonLd.nodes` is read.
 * @param {string[]} keys Property names to try, in priority order.
 * @returns {string|undefined}
 */
function jsonLdVideoObjectString(context, keys) {
  const { nodes } = context.raw.jsonLd;
  return jsonLdStringByType(nodes, ["VideoObject"], keys);
}
3184
/**
 * Scans JSON-LD nodes in order and returns the first non-empty string found
 * under any of `keys` on a node whose `@type` matches one of `types`.
 *
 * @param {Iterable<object>} nodes JSON-LD nodes.
 * @param {string[]} types Accepted type names (matched by `hasJsonLdType2`).
 * @param {string[]} keys Property names to try on each matching node, in order.
 * @returns {string|undefined}
 */
function jsonLdStringByType(nodes, types, keys) {
  for (const node of nodes) {
    if (hasJsonLdType2(node, types)) {
      for (const key of keys) {
        const text = stringFromUnknown3(node[key]);
        if (text) {
          return text;
        }
      }
    }
  }
  return void 0;
}
3198
/**
 * Tells whether a JSON-LD node declares one of the given types.
 *
 * Matching is a case-insensitive suffix test, so a prefixed or URL-form type
 * (anything ending in the candidate name) also matches.
 *
 * @param {object} node JSON-LD node; `node["@type"]` may be a value or an array.
 * @param {string[]} types Candidate type names.
 * @returns {boolean}
 */
function hasJsonLdType2(node, types) {
  const declared = node["@type"];
  const declaredTypes = Array.isArray(declared) ? declared : [declared];
  for (const entry of declaredTypes) {
    if (typeof entry !== "string") {
      continue;
    }
    const lowered = entry.toLowerCase();
    for (const candidate of types) {
      if (lowered.endsWith(candidate.toLowerCase())) {
        return true;
      }
    }
  }
  return false;
}
3202
/**
 * Looks up the first non-empty string at any of the dotted `paths` inside
 * embedded items whose source is `youtubePlayerResponse`.
 *
 * @param {object} context Extraction context; `context.raw.embeddedData.items` is read.
 * @param {string[]} paths Dotted property paths, tried in order for each item.
 * @returns {string|undefined}
 */
function youtubePlayerString(context, paths) {
  for (const item of context.raw.embeddedData.items) {
    if (item.source === "youtubePlayerResponse") {
      for (const path of paths) {
        const text = stringFromUnknown3(valueAtPath(item.data, path));
        if (text) {
          return text;
        }
      }
    }
  }
  return void 0;
}
3216
/**
 * Resolves a title from `youtubeInitialData` embedded items.
 *
 * Tried in order: the primary watch-page renderers, a renderer carrying the
 * requested video id, community-post text, and (only when no video id is
 * present) playlist metadata.
 *
 * @param {object} context Extraction context; `context.raw.embeddedData.items` is read.
 * @param {{ videoId?: string, communityPostId?: string, playlistId?: string }} ids
 * @returns {string|undefined}
 */
function youtubeInitialDataTitle(context, ids) {
  const initialData = context.raw.embeddedData.items.filter((item) => item.source === "youtubeInitialData");
  const watchTitle = findRendererText(initialData, ["videoPrimaryInfoRenderer", "watchMetadata"], ["title"]);
  if (watchTitle) {
    return watchTitle;
  }
  const { videoId, communityPostId, playlistId } = ids;
  if (videoId) {
    const videoTitle = findYouTubeRendererForVideoId(initialData, videoId, ["title"]);
    if (videoTitle) {
      return videoTitle;
    }
  }
  if (communityPostId) {
    const postText =
      findEmbeddedStringBySources(context, ["youtubeInitialData"], ["contentText"]) ??
      findRendererText(initialData, ["backstagePostRenderer", "postRenderer"], ["contentText", "title"]);
    if (postText) {
      return postText;
    }
  }
  // Playlist metadata is only consulted for URLs that do not name a video.
  return playlistId && !videoId
    ? findRendererText(initialData, ["playlistMetadataRenderer", "playlistHeaderRenderer"], ["title", "playlistTitle", "name"])
    : void 0;
}
3239
/**
 * Resolves a description from `youtubeInitialData` embedded items by looking
 * inside the description-bearing renderers.
 *
 * @param {object} context Extraction context; `context.raw.embeddedData.items` is read.
 * @returns {string|undefined}
 */
function youtubeInitialDataDescription(context) {
  const rendererKeys = ["expandableVideoDescriptionBodyRenderer", "videoSecondaryInfoRenderer", "watchMetadata"];
  const textKeys = ["description", "attributedDescription", "content"];
  const initialData = context.raw.embeddedData.items.filter((item) => item.source === "youtubeInitialData");
  return findRendererText(initialData, rendererKeys, textKeys);
}
3247
/**
 * Resolves a playlist title from `youtubeInitialData` embedded items.
 *
 * @param {object} context Extraction context; `context.raw.embeddedData.items` is read.
 * @returns {string|undefined}
 */
function youtubePlaylistTitleFromContext(context) {
  const rendererKeys = ["playlistMetadataRenderer", "playlistHeaderRenderer"];
  const textKeys = ["title", "playlistTitle", "name"];
  const initialData = context.raw.embeddedData.items.filter((item) => item.source === "youtubeInitialData");
  return findRendererText(initialData, rendererKeys, textKeys);
}
3251
/**
 * Walks each item's data and returns the first non-empty text found under one
 * of `textKeys` on a record stored under one of `rendererKeys`.
 *
 * @param {Iterable<{ data: unknown }>} items Embedded data items to search, in order.
 * @param {string[]} rendererKeys Property names that identify a renderer record.
 * @param {string[]} textKeys Property names tried on each renderer, in order.
 * @returns {string|undefined}
 */
function findRendererText(items, rendererKeys, textKeys) {
  const firstTextIn = (record) => {
    for (const textKey of textKeys) {
      const text = stringFromUnknown3(record[textKey]);
      if (text) {
        return text;
      }
    }
    return void 0;
  };
  for (const item of items) {
    let match;
    walkData(item.data, (value, key) => {
      // Once a match exists the remaining visits are no-ops.
      if (!match && key && rendererKeys.includes(key) && isRecord4(value)) {
        match = firstTextIn(value);
      }
    });
    if (match) {
      return match;
    }
  }
  return void 0;
}
3271
/**
 * Walks each item's data looking for a record whose `videoId` equals the
 * requested id, and returns the first non-empty text under one of `textKeys`.
 *
 * @param {Iterable<{ data: unknown }>} items Embedded data items to search, in order.
 * @param {string} videoId Video id the record must carry.
 * @param {string[]} textKeys Property names tried on each matching record, in order.
 * @returns {string|undefined}
 */
function findYouTubeRendererForVideoId(items, videoId, textKeys) {
  const firstTextIn = (record) => {
    for (const textKey of textKeys) {
      const text = stringFromUnknown3(record[textKey]);
      if (text) {
        return text;
      }
    }
    return void 0;
  };
  for (const item of items) {
    let match;
    walkData(item.data, (value) => {
      // Once a match exists the remaining visits are no-ops.
      if (!match && isRecord4(value) && stringFromUnknown3(value.videoId) === videoId) {
        match = firstTextIn(value);
      }
    });
    if (match) {
      return match;
    }
  }
  return void 0;
}
3291
/**
 * Collects every non-empty string stored under a matching key inside embedded
 * items from the given sources, then lets `bestTextCandidate` choose one.
 *
 * @param {object} context Extraction context; `context.raw.embeddedData.items` is read.
 * @param {string[]} sources Accepted `item.source` values.
 * @param {string[]} keys Key names handed to `matchesKey`.
 * @returns {string|undefined} Whatever `bestTextCandidate` returns for the candidates.
 */
function findEmbeddedStringBySources(context, sources, keys) {
  const candidates = [];
  const collect = (value, key) => {
    if (key && matchesKey(key, keys)) {
      const text = stringFromUnknown3(value);
      if (text) {
        candidates.push(text);
      }
    }
  };
  for (const item of context.raw.embeddedData.items) {
    if (sources.includes(item.source)) {
      walkData(item.data, collect);
    }
  }
  return bestTextCandidate(candidates);
}
3309
/**
 * Tells whether the page carries the `metanova-reddit-json` application/json
 * payload, i.e. an embedded item with that source and path.
 *
 * @param {object} context Extraction context; `context.raw.embeddedData.items` is read.
 * @returns {boolean}
 */
function hasRedditJsonEndpointPayload(context) {
  const { items } = context.raw.embeddedData;
  const isMarker = ({ source, path }) => source === "applicationJson" && path === "metanova-reddit-json";
  return items.some(isMarker);
}
3312
/**
 * Follows a dot-separated property path through nested records.
 *
 * @param {unknown} node Starting value.
 * @param {string} path Dot-separated property names.
 * @returns {unknown} The value at the path, or undefined as soon as a step
 *   lands on something `isRecord4` rejects.
 */
function valueAtPath(node, path) {
  let current = node;
  for (const key of path.split(".")) {
    current = isRecord4(current) ? current[key] : void 0;
  }
  return current;
}
3315
/**
 * Removes a trailing "- YouTube" suffix (any letter case, optional surrounding
 * whitespace) from an HTML title and trims the result.
 *
 * @param {string|null|undefined} title Raw HTML title.
 * @returns {string|undefined} Cleaned title, or undefined when no title was given.
 */
function cleanYouTubeHtmlTitle(title) {
  if (title === null || title === void 0) {
    return void 0;
  }
  const withoutSuffix = title.replace(/\s*-\s*YouTube\s*$/i, "");
  return withoutSuffix.trim();
}
2809
3318
  function titleFromContext(context, embeddedKeys) {
2810
3319
  return firstText(
2811
3320
  context.raw.openGraph.title,
@@ -3373,9 +3882,8 @@ async function fetchMetadata(url, options = {}) {
3373
3882
  const startedAt = Date.now();
3374
3883
  try {
3375
3884
  const requestedUrl = normalizeUrl(url);
3376
- const firstPage = await fetchPage(requestedUrl, options);
3377
- const fallback = await maybeFetchRedditFallback(firstPage, options);
3378
- const page = fallback.page;
3885
+ const fetchResult = await fetchPageWithStrategies(requestedUrl, options);
3886
+ const page = fetchResult.page;
3379
3887
  const directMedia = createDirectMediaMetadata(page, requestedUrl, Date.now() - startedAt);
3380
3888
  if (directMedia) {
3381
3889
  return directMedia;
@@ -3396,12 +3904,17 @@ async function fetchMetadata(url, options = {}) {
3396
3904
  metadata.diagnostics.trace = [
3397
3905
  ...page.isShortUrl ? [`detected short URL provider: ${page.shortUrlProvider ?? "unknown"}`] : [],
3398
3906
  ...page.redirects.length > 0 ? [`resolved ${page.redirects.length} redirect${page.redirects.length === 1 ? "" : "s"}`] : [],
3399
- ...fallback.used ? ["retried Reddit page through old.reddit fallback"] : [],
3907
+ ...fetchResult.trace,
3400
3908
  "downloaded page",
3401
3909
  ...metadata.diagnostics.trace,
3402
3910
  ...metadata.canonicalUrl ? ["resolved canonical URL"] : []
3403
3911
  ];
3912
+ metadata.diagnostics.fallbacksAttempted = mergeFallbackAttempts2(metadata.diagnostics.fallbacksAttempted, fetchResult.fallbacksAttempted);
3913
+ metadata.diagnostics.sourcePriority = uniqueStrings3([...metadata.diagnostics.sourcePriority ?? [], ...fetchResult.sourcePriority ?? []]);
3914
+ metadata.diagnostics.extractionMethod = metadata.diagnostics.extractionMethod ?? fetchResult.extractionMethod;
3915
+ metadata.diagnostics.retryInfo = metadata.diagnostics.retryInfo ?? fetchResult.retryInfo;
3404
3916
  metadata.trace = metadata.diagnostics.trace;
3917
+ metadata.diagnostics.warnings.push(...fetchResult.warnings);
3405
3918
  if (!metadata.ok) {
3406
3919
  metadata.diagnostics.warnings.push(`Fetch completed with non-success status code ${page.statusCode}.`);
3407
3920
  }
@@ -3439,30 +3952,441 @@ async function fetchMetadata(url, options = {}) {
3439
3952
  };
3440
3953
  }
3441
3954
  }
3442
- async function maybeFetchRedditFallback(page, options) {
3443
- let parsed;
3955
/**
 * Fetches a page, routing Reddit URLs through the multi-step Reddit strategy
 * and everything else through a single plain fetch.
 *
 * @param {string} requestedUrl URL to fetch.
 * @param {object} options Fetch options, passed through unchanged.
 * @returns {Promise<object>} Result with `page`, `fallbacksAttempted`, `warnings`
 *   and `trace`; the Reddit strategy adds further diagnostic fields.
 */
async function fetchPageWithStrategies(requestedUrl, options) {
  if (!isRedditUrl(requestedUrl)) {
    const page = await fetchPage(requestedUrl, options);
    return { page, fallbacksAttempted: [], warnings: [], trace: [] };
  }
  return fetchRedditPageWithStrategy(requestedUrl, options);
}
3966
/**
 * Fetches a Reddit page by trying, in order: the JSON endpoint, old.reddit.com,
 * and finally the requested URL itself.
 *
 * The first attempt that yields a usable page wins. The final HTML attempt is
 * accepted even when it looks blocked (a warning is recorded instead), so the
 * caller still gets whatever metadata the page offers.
 *
 * @param {string} requestedUrl Reddit URL to fetch.
 * @param {object} options Fetch options; the JSON attempt adds an `accept` override.
 * @returns {Promise<object>} `{ page, fallbacksAttempted, warnings, trace,
 *   sourcePriority, extractionMethod, retryInfo }`.
 * @throws {Error} When the final HTML attempt produced no page at all.
 */
async function fetchRedditPageWithStrategy(requestedUrl, options) {
  const attempts = [];
  const warnings = [];
  const sourcePriority = ["redditJsonEndpoint", "oldReddit", "embeddedStructuredData", "openGraph", "html"];
  // Shared result shape; retry info reflects the attempts made so far.
  const buildResult = (page, traceEntry, extractionMethod) => ({
    page,
    fallbacksAttempted: attempts,
    warnings,
    trace: [traceEntry],
    sourcePriority,
    extractionMethod,
    retryInfo: redditRetryInfo(attempts)
  });
  let lastError;
  const jsonUrl = redditJsonEndpoint(requestedUrl);
  if (jsonUrl) {
    const attempt = await attemptFetch("redditJsonEndpoint", jsonUrl, {
      ...options,
      accept: "application/json,text/html;q=0.8,*/*;q=0.5"
    });
    attempts.push(attempt);
    lastError = attempt.error;
    if (attempt.page && attempt.ok && !attempt.blocked) {
      const redditPost = parseRedditJsonPayload(attempt.page.html);
      if (redditPost?.title) {
        return buildResult(
          synthesizeRedditJsonPage(attempt.page, requestedUrl, redditPost),
          "used Reddit JSON endpoint",
          "reddit:jsonEndpoint"
        );
      }
      warnings.push("Reddit JSON endpoint responded, but no post payload could be extracted.");
    } else if (attempt.blocked) {
      warnings.push("Reddit JSON endpoint appears to have blocked access.");
    }
  }
  const oldRedditUrl = redditOldUrl(requestedUrl);
  if (oldRedditUrl && oldRedditUrl !== requestedUrl) {
    const attempt = await attemptFetch("oldReddit", oldRedditUrl, options);
    attempts.push(attempt);
    lastError = attempt.error;
    if (attempt.page && attempt.ok && !attempt.blocked) {
      return buildResult(attempt.page, "retried Reddit page through old.reddit", "reddit:oldReddit");
    }
    if (attempt.blocked) {
      warnings.push("old.reddit fallback appears to have been blocked.");
    }
  }
  const htmlAttempt = await attemptFetch("redditHtmlFallback", requestedUrl, options);
  attempts.push(htmlAttempt);
  lastError = htmlAttempt.error;
  if (htmlAttempt.page) {
    if (htmlAttempt.blocked) {
      warnings.push("Reddit HTML fallback appears to have been blocked; metadata may be incomplete.");
    }
    return buildResult(htmlAttempt.page, "used Reddit HTML fallback", "reddit:htmlFallback");
  }
  // attemptFetch records failures as message strings; wrap them so callers
  // always receive a real Error (with a stack) rather than a thrown string.
  throw lastError instanceof Error ? lastError : new Error(lastError ?? "All Reddit extraction fetch attempts failed.");
}
4036
/**
 * Runs one fetch attempt and reports the outcome as a plain record instead of
 * throwing.
 *
 * @param {string} method Label identifying the strategy being tried.
 * @param {string} url URL to fetch.
 * @param {object} options Fetch options, passed through to `fetchPage`.
 * @returns {Promise<object>} On success `{ method, url, ok, statusCode, blocked,
 *   retryAfter, page }`; on failure `{ method, url, ok: false, error }` where
 *   `error` is the failure message.
 */
async function attemptFetch(method, url, options) {
  try {
    const fetched = await fetchPage(url, options);
    const retryAfter = fetched.headers["retry-after"];
    const blocked = isRedditBlocked(fetched);
    const { statusCode } = fetched;
    const isSuccessStatus = statusCode >= 200 && statusCode < 300;
    return {
      method,
      url,
      ok: isSuccessStatus && !blocked,
      statusCode,
      blocked,
      retryAfter,
      page: fetched
    };
  } catch (failure) {
    const error = failure instanceof Error ? failure.message : String(failure);
    return { method, url, ok: false, error };
  }
}
4059
/**
 * Tells whether a URL points at reddit.com, any reddit.com subdomain, or the
 * redd.it short-link host (a leading `www.` is ignored).
 *
 * @param {string} url URL to inspect.
 * @returns {boolean} False for unparseable URLs.
 */
function isRedditUrl(url) {
  let hostname;
  try {
    hostname = new URL(url).hostname;
  } catch {
    return false;
  }
  const host = hostname.toLowerCase().replace(/^www\./, "");
  if (host === "reddit.com" || host === "redd.it") {
    return true;
  }
  return host.endsWith(".reddit.com");
}
4067
/**
 * Builds the `.json` endpoint URL for a Reddit post.
 *
 * Only post permalinks (`/comments/<id>`, optionally under `/r/<sub>/`,
 * `/u/<name>/` or `/user/<name>/`) and `redd.it/<id>` short links are
 * converted. Other Reddit paths (subreddit, user, front page) would return a
 * listing whose first entry could be mistaken for the requested page, so
 * they yield undefined and the caller moves on to its next strategy.
 *
 * @param {string} url Requested Reddit URL.
 * @returns {string|undefined} `https://www.reddit.com/...json?raw_json=1`, or
 *   undefined when the URL is unparseable or is not a post link.
 */
function redditJsonEndpoint(url) {
  const postPermalink = /^\/(?:(?:r|u|user)\/[^/]+\/)?comments\/[a-z0-9]+(?:\/|\.json$|$)/i;
  try {
    const endpoint = new URL(url);
    const host = endpoint.hostname.toLowerCase().replace(/^www\./, "");
    const path = endpoint.pathname;
    endpoint.protocol = "https:";
    endpoint.hostname = "www.reddit.com";
    endpoint.search = "";
    if (host === "redd.it") {
      const [postId] = path.split("/").filter(Boolean);
      if (!postId) {
        return void 0;
      }
      endpoint.pathname = `/comments/${postId}.json`;
    } else if (postPermalink.test(path)) {
      if (!path.endsWith(".json")) {
        endpoint.pathname = `${path.endsWith("/") ? path : `${path}/`}.json`;
      }
    } else {
      return void 0;
    }
    endpoint.searchParams.set("raw_json", "1");
    return endpoint.toString();
  } catch {
    // Unparseable input simply has no JSON endpoint.
    return void 0;
  }
}
4090
/**
 * Rewrites a Reddit URL to its old.reddit.com equivalent over https, dropping
 * the query string. A `redd.it/<id>` short link becomes `/comments/<id>/`.
 *
 * @param {string} url Requested Reddit URL.
 * @returns {string|undefined} Rewritten URL, or undefined when the URL is
 *   unparseable or a short link carries no post id.
 */
function redditOldUrl(url) {
  try {
    const target = new URL(url);
    // Decide on the short-link case before the hostname is overwritten.
    const isShortLink = target.hostname.toLowerCase().replace(/^www\./, "") === "redd.it";
    target.protocol = "https:";
    target.hostname = "old.reddit.com";
    target.search = "";
    if (isShortLink) {
      const [postId] = target.pathname.split("/").filter(Boolean);
      if (!postId) {
        return void 0;
      }
      target.pathname = `/comments/${postId}/`;
    }
    return target.toString();
  } catch {
    return void 0;
  }
}
4109
/**
 * Parses a Reddit JSON response body and extracts a normalized post summary.
 *
 * @param {string} source Raw response body expected to contain JSON.
 * @returns {object|undefined} `{ title, description, author, createdAt,
 *   canonicalUrl, url, images, videos, subreddit, postId }`, or undefined when
 *   the body is not JSON, holds no post record, or extraction throws.
 */
function parseRedditJsonPayload(source) {
  try {
    const post = findRedditPostRecord(JSON.parse(source));
    if (!post) {
      return void 0;
    }
    const text = (key) => stringFromUnknown4(post[key]);
    const permalink = text("permalink");
    let canonicalUrl;
    if (permalink) {
      const path = permalink.startsWith("/") ? permalink : `/${permalink}`;
      canonicalUrl = `https://www.reddit.com${path}`;
    }
    const createdUtc = numberFromUnknown2(post.created_utc);
    const linkedUrl = text("url_overridden_by_dest");
    return {
      title: text("title"),
      // Self text first, then its HTML form, then the linked URL as a last resort.
      description: firstText2(text("selftext"), text("selftext_html"), linkedUrl),
      author: text("author") ?? text("author_fullname"),
      // created_utc is multiplied by 1000 before being handed to Date.
      createdAt: createdUtc ? new Date(createdUtc * 1e3).toISOString() : void 0,
      canonicalUrl,
      url: linkedUrl ?? text("url"),
      images: redditImagesFromPost(post),
      videos: redditVideosFromPost(post),
      subreddit: text("subreddit_name_prefixed") ?? text("subreddit"),
      postId: text("id")
    };
  } catch {
    return void 0;
  }
}
4142
/**
 * Searches a parsed JSON value depth-first for a record that looks like a
 * Reddit post.
 *
 * A record qualifies when it has a string `title` plus a string `id` or
 * `name`, or when it is the `data` of a listing child that is of kind `t3` or
 * carries a string `title`. At most the first 100 property values of any
 * record are descended into.
 *
 * @param {unknown} value Parsed JSON value.
 * @returns {object|undefined} The post record, if one is found.
 */
function findRedditPostRecord(value) {
  if (Array.isArray(value)) {
    for (const entry of value) {
      const hit = findRedditPostRecord(entry);
      if (hit) {
        return hit;
      }
    }
    return void 0;
  }
  if (!isRecord5(value)) {
    return void 0;
  }
  const hasIdentity = typeof value.id === "string" || typeof value.name === "string";
  if (typeof value.title === "string" && hasIdentity) {
    return value;
  }
  if (isRecord5(value.data) && Array.isArray(value.data.children)) {
    const postChild = value.data.children.find(
      (child) => isRecord5(child) && isRecord5(child.data) && (child.kind === "t3" || typeof child.data.title === "string")
    );
    if (postChild) {
      return postChild.data;
    }
  }
  for (const nested of Object.values(value).slice(0, 100)) {
    const hit = findRedditPostRecord(nested);
    if (hit) {
      return hit;
    }
  }
  return void 0;
}
4174
/**
 * Builds image assets from a Reddit post record: every preview image source
 * and resolution that has a URL, followed by the thumbnail when it is an
 * http(s) URL.
 *
 * @param {object} post Reddit post record.
 * @returns {object[]} Image assets tagged with the `redditJsonEndpoint` adapter.
 */
function redditImagesFromPost(post) {
  // A fresh metadata object per asset, so assets never share one instance.
  const adapterMetadata = () => ({
    adapter: "redditJsonEndpoint",
    originalSource: "redditJsonEndpoint"
  });
  const collected = [];
  const previewImages = isRecord5(post.preview) && Array.isArray(post.preview.images) ? post.preview.images : [];
  for (const entry of previewImages) {
    if (!isRecord5(entry)) {
      continue;
    }
    const resolutions = Array.isArray(entry.resolutions) ? entry.resolutions : [];
    for (const variant of [entry.source, ...resolutions]) {
      if (!isRecord5(variant)) {
        continue;
      }
      const url = redditMediaUrl(stringFromUnknown4(variant.url));
      if (!url) {
        continue;
      }
      collected.push({
        url,
        kind: "image",
        source: "adapter",
        width: numberFromUnknown2(variant.width),
        height: numberFromUnknown2(variant.height),
        metadata: adapterMetadata()
      });
    }
  }
  const thumbnail = redditMediaUrl(stringFromUnknown4(post.thumbnail));
  // Thumbnail values that are not http(s) URLs are ignored.
  if (thumbnail && /^https?:\/\//i.test(thumbnail)) {
    collected.push({
      url: thumbnail,
      kind: "image",
      source: "adapter",
      metadata: adapterMetadata()
    });
  }
  return collected;
}
4216
/**
 * Builds video assets from a Reddit post record's `media` and `secure_media`.
 *
 * For each of the two, the `reddit_video` URL is taken from `fallback_url`,
 * then `hls_url`, then `dash_url`. The same URL can be reached through both
 * fields; it is emitted only once so callers do not receive duplicates.
 *
 * @param {object} post Reddit post record.
 * @returns {object[]} Video assets tagged with the `redditJsonEndpoint` adapter.
 */
function redditVideosFromPost(post) {
  const videos = [];
  const seenUrls = new Set();
  for (const item of [post.media, post.secure_media].filter(isRecord5)) {
    const redditVideo = isRecord5(item.reddit_video) ? item.reddit_video : void 0;
    const url = redditMediaUrl(
      stringFromUnknown4(redditVideo?.fallback_url) ??
        stringFromUnknown4(redditVideo?.hls_url) ??
        stringFromUnknown4(redditVideo?.dash_url)
    );
    if (!url || seenUrls.has(url)) {
      continue;
    }
    seenUrls.add(url);
    videos.push({
      url,
      kind: "video",
      source: "adapter",
      width: numberFromUnknown2(redditVideo?.width),
      height: numberFromUnknown2(redditVideo?.height),
      metadata: {
        adapter: "redditJsonEndpoint",
        originalSource: "redditJsonEndpoint"
      }
    });
  }
  return videos;
}
4239
/**
 * Turns a fetched Reddit JSON response into a synthetic HTML page, so the
 * post can be processed like any fetched HTML document.
 *
 * The generated document carries Open Graph tags, a canonical link, a JSON-LD
 * `SocialMediaPosting` script and an `application/json` script with id
 * `metanova-reddit-json` holding the normalized post.
 *
 * @param {object} jsonPage Page record returned for the JSON endpoint; its
 *   remaining fields (e.g. headers, status code) are carried over.
 * @param {string} requestedUrl URL the caller originally asked for.
 * @param {object} post Normalized post as produced by `parseRedditJsonPayload`.
 *   It is not modified.
 * @returns {object} Page record whose `html`, `bytes`, `contentType`, `url`,
 *   `originalUrl` and `finalUrl` describe the synthetic document.
 */
function synthesizeRedditJsonPage(jsonPage, requestedUrl, post) {
  const finalUrl = post.canonicalUrl ?? requestedUrl;
  // Largest area first. Sort a copy so the caller's array keeps its order.
  const rankedImages = [...post.images].sort(
    (left, right) => (right.width ?? 0) * (right.height ?? 0) - (left.width ?? 0) * (left.height ?? 0)
  );
  const bestImage = rankedImages[0];
  const video = post.videos[0];
  const structuredData = {
    "@context": "https://schema.org",
    "@type": "SocialMediaPosting",
    headline: post.title,
    description: post.description,
    author: post.author ? { "@type": "Person", name: post.author } : void 0,
    datePublished: post.createdAt,
    url: finalUrl,
    image: bestImage ? { "@type": "ImageObject", url: bestImage.url, width: bestImage.width, height: bestImage.height } : void 0,
    video: video ? { "@type": "VideoObject", contentUrl: video.url, width: video.width, height: video.height } : void 0
  };
  const embeddedPayload = {
    post: {
      postTitle: post.title,
      description: post.description,
      author: post.author ? { name: post.author } : void 0,
      createdAt: post.createdAt,
      canonicalUrl: finalUrl,
      previewImage: bestImage,
      media: {
        videoUrl: video?.url
      },
      images: rankedImages,
      videos: post.videos,
      subreddit: post.subreddit,
      postId: post.postId
    }
  };
  // Empty strings stand in for optional tags and vanish when joined.
  const html = [
    "<!doctype html><html><head>",
    `<title>${escapeHtml(post.title ?? "Reddit post")}</title>`,
    post.title ? `<meta property="og:title" content="${escapeHtml(post.title)}">` : "",
    post.description ? `<meta property="og:description" content="${escapeHtml(post.description)}">` : "",
    `<meta property="og:site_name" content="Reddit">`,
    `<meta property="og:url" content="${escapeHtml(finalUrl)}">`,
    bestImage ? `<meta property="og:image" content="${escapeHtml(bestImage.url)}">` : "",
    bestImage?.width ? `<meta property="og:image:width" content="${bestImage.width}">` : "",
    bestImage?.height ? `<meta property="og:image:height" content="${bestImage.height}">` : "",
    `<link rel="canonical" href="${escapeHtml(finalUrl)}">`,
    `<script type="application/ld+json">${safeJson(structuredData)}</script>`,
    `<script type="application/json" id="metanova-reddit-json">${safeJson(embeddedPayload)}</script>`,
    "</head><body></body></html>"
  ].join("");
  return {
    ...jsonPage,
    url: requestedUrl,
    originalUrl: requestedUrl,
    finalUrl,
    html,
    bytes: new TextEncoder().encode(html),
    contentType: "text/html; charset=utf-8",
    statusCode: jsonPage.statusCode
  };
}
4297
/**
 * Heuristically decides whether a response is a block / rate-limit page.
 *
 * - Status 403 or 429 always counts as blocked.
 * - The challenge phrases ("please wait for verification", "whoa there,
 *   pardner", "request has been blocked") count at any status.
 * - The generic words "blocked", "forbidden" and "too many requests" count
 *   only on non-2xx responses. A successful body is largely user-written text
 *   (titles, comments, JSON payloads), and matching such common words there
 *   would flag ordinary posts as blocked.
 *
 * @param {{ statusCode: number, html: string }} page Fetched page.
 * @returns {boolean}
 */
function isRedditBlocked(page) {
  const { statusCode } = page;
  if (statusCode === 403 || statusCode === 429) {
    return true;
  }
  const body = typeof page.html === "string" ? page.html : "";
  if (/please wait for verification|whoa there, pardner|request has been blocked/i.test(body)) {
    return true;
  }
  const succeeded = statusCode >= 200 && statusCode < 300;
  return !succeeded && /blocked|forbidden|too many requests/i.test(body);
}
4300
/**
 * Summarizes blocked or rate-limited attempts into retry guidance.
 *
 * @param {object[]} attempts Attempt records as produced by `attemptFetch`.
 * @returns {object|undefined} `{ retryable, reason, retryAfter, retryAfterMs,
 *   attempts }`, or undefined when no attempt was blocked or returned 403/429.
 */
function redditRetryInfo(attempts) {
  const blockedAttempts = attempts.filter(
    (attempt) => attempt.blocked || attempt.statusCode === 429 || attempt.statusCode === 403
  );
  if (blockedAttempts.length === 0) {
    return void 0;
  }
  // First blocked attempt that carried a Retry-After value wins.
  let retryAfter;
  for (const attempt of blockedAttempts) {
    if (attempt.retryAfter) {
      retryAfter = attempt.retryAfter;
      break;
    }
  }
  const reasons = blockedAttempts.map((attempt) => {
    const outcome = attempt.statusCode ? ` returned ${attempt.statusCode}` : " failed";
    return `${attempt.method}${outcome}`;
  });
  return {
    retryable: blockedAttempts.some((attempt) => attempt.statusCode === 429 || Boolean(attempt.retryAfter)),
    reason: reasons.join("; "),
    retryAfter,
    retryAfterMs: retryAfterToMs(retryAfter),
    attempts: attempts.length
  };
}
4314
/**
 * Converts a `Retry-After` header value into a delay in milliseconds.
 *
 * Only an all-digit value is treated as delay-seconds. Anything else is tried
 * as a date, with the delay clamped at zero for dates in the past. Parsing
 * strictly keeps a value that merely starts with digits (for example an ISO
 * timestamp) from being read as a number of seconds.
 *
 * @param {string|number|undefined} value Raw header value.
 * @returns {number|undefined} Delay in milliseconds, or undefined when the
 *   value is missing or unparseable.
 */
function retryAfterToMs(value) {
  if (!value) {
    return void 0;
  }
  const text = String(value).trim();
  if (/^\d+$/.test(text)) {
    return Number.parseInt(text, 10) * 1e3;
  }
  const dateMs = Date.parse(text);
  return Number.isFinite(dateMs) ? Math.max(dateMs - Date.now(), 0) : void 0;
}
4325
/**
 * Concatenates two attempt lists, drops each attempt's `page`, and removes
 * duplicates that share method, URL, status code and error.
 *
 * @param {object[]|undefined|null} existing Attempts already recorded.
 * @param {object[]} incoming Attempts to add.
 * @returns {object[]|undefined} Merged attempts in first-seen order, or
 *   undefined when both lists are empty.
 */
function mergeFallbackAttempts2(existing, incoming) {
  const combined = [...(existing ?? []), ...incoming];
  if (combined.length === 0) {
    return void 0;
  }
  const seen = /* @__PURE__ */ new Set();
  const merged = [];
  for (const entry of combined) {
    // The fetched page is left out of the merged record.
    const { page: _page, ...attempt } = entry;
    const key = `${attempt.method}:${attempt.url ?? ""}:${attempt.statusCode ?? ""}:${attempt.error ?? ""}`;
    if (!seen.has(key)) {
      seen.add(key);
      merged.push(attempt);
    }
  }
  return merged;
}
4343
/**
 * Returns the truthy entries of `values` with duplicates removed, keeping
 * first-seen order.
 *
 * @param {Array<string|undefined|null>} values Candidate strings.
 * @returns {string[]}
 */
function uniqueStrings3(values) {
  const distinct = new Set();
  values.forEach((value) => {
    if (value) {
      distinct.add(value);
    }
  });
  return Array.from(distinct);
}
4346
/**
 * Decodes HTML-escaped ampersands (`&amp;`) in a media URL.
 *
 * @param {string|null|undefined} value Possibly escaped URL.
 * @returns {string|undefined} Decoded URL, or undefined when no value was given.
 */
function redditMediaUrl(value) {
  if (value === null || value === void 0) {
    return void 0;
  }
  return value.split("&amp;").join("&");
}
4349
/**
 * Returns the first argument that is non-empty after whitespace runs are
 * collapsed to single spaces and the ends are trimmed.
 *
 * @param {...(string|null|undefined)} values Candidate texts, in priority order.
 * @returns {string|undefined} The normalized text, or undefined when none qualifies.
 */
function firstText2(...values) {
  // Normalize every candidate up front, then pick the first usable one.
  const normalized = [];
  for (const value of values) {
    normalized.push(value?.replace(/\s+/g, " ").trim());
  }
  for (const candidate of normalized) {
    if (candidate) {
      return candidate;
    }
  }
  return void 0;
}
4352
/**
 * Coerces an unknown value to a usable string.
 *
 * @param {unknown} value Any value.
 * @returns {string|undefined} The trimmed string when it is non-blank, the
 *   decimal form of a finite number, otherwise undefined.
 */
function stringFromUnknown4(value) {
  switch (typeof value) {
    case "string": {
      const trimmed = value.trim();
      return trimmed ? trimmed : void 0;
    }
    case "number":
      return Number.isFinite(value) ? String(value) : void 0;
    default:
      return void 0;
  }
}
4361
/**
 * Coerces an unknown value to a finite number.
 *
 * @param {unknown} value Any value.
 * @returns {number|undefined} The value itself when it is a finite number, the
 *   `parseFloat` result for a string when that is finite, otherwise undefined.
 */
function numberFromUnknown2(value) {
  if (typeof value === "string") {
    const parsed = Number.parseFloat(value);
    return Number.isFinite(parsed) ? parsed : void 0;
  }
  return typeof value === "number" && Number.isFinite(value) ? value : void 0;
}
4371
/**
 * Serializes a value as JSON for embedding in the synthetic page's
 * `<script>` elements.
 *
 * Undefined/null/empty-array members are stripped first, and every `<` is
 * written as `\u003c` so the payload cannot contain a literal closing tag.
 *
 * @param {unknown} value Value to serialize.
 * @returns {string}
 */
function safeJson(value) {
  const serialized = JSON.stringify(stripUndefinedDeep(value));
  return serialized.replace(/</g, "\\u003c");
}
4374
/**
 * Escapes `&`, `"`, `<` and `>` for use in HTML text and double-quoted
 * attribute values. Single quotes are left untouched.
 *
 * @param {string} value Raw text.
 * @returns {string}
 */
function escapeHtml(value) {
  const entities = {
    "&": "&amp;",
    '"': "&quot;",
    "<": "&lt;",
    ">": "&gt;"
  };
  return value.replace(/[&"<>]/g, (character) => entities[character]);
}
4377
/**
 * Recursively prunes empty members from arrays and plain records.
 *
 * Arrays lose their undefined entries; records lose properties whose cleaned
 * value is undefined, null, or an empty array. Other values pass through.
 *
 * @param {unknown} value Value to clean.
 * @returns {unknown} A cleaned copy for arrays/records, otherwise `value` itself.
 */
function stripUndefinedDeep(value) {
  if (Array.isArray(value)) {
    const kept = [];
    for (const item of value) {
      const cleaned = stripUndefinedDeep(item);
      if (cleaned !== void 0) {
        kept.push(cleaned);
      }
    }
    return kept;
  }
  if (!isRecord5(value)) {
    return value;
  }
  const pairs = [];
  for (const [key, item] of Object.entries(value)) {
    const cleaned = stripUndefinedDeep(item);
    const isEmptyArray = Array.isArray(cleaned) && cleaned.length === 0;
    if (cleaned === void 0 || cleaned === null || isEmptyArray) {
      continue;
    }
    pairs.push([key, cleaned]);
  }
  return Object.fromEntries(pairs);
}
4388
/**
 * Tells whether a value is a non-null object that is not an array.
 *
 * @param {unknown} value Any value.
 * @returns {boolean}
 */
function isRecord5(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
3467
4391
  function createDirectMediaMetadata(page, requestedUrl, fetchDurationMs) {
3468
4392
  const contentType = page.contentType?.toLowerCase() ?? "";
@@ -3519,7 +4443,15 @@ function createDirectMediaMetadata(page, requestedUrl, fetchDurationMs) {
3519
4443
  sourcesUsed: ["direct"],
3520
4444
  warnings: [],
3521
4445
  trace,
4446
+ extractionMethod: `direct:${kind}`,
3522
4447
  selectedImageReason: kind === "image" ? "Selected direct image URL because the response content type is an image." : void 0,
4448
+ confidenceBreakdown: {
4449
+ title: 0,
4450
+ description: 0,
4451
+ image: kind === "image" ? 100 : 0,
4452
+ structuredData: 0,
4453
+ adapter: 0
4454
+ },
3523
4455
  fetchDurationMs,
3524
4456
  extractedAt: (/* @__PURE__ */ new Date()).toISOString()
3525
4457
  }
@@ -3634,6 +4566,7 @@ export {
3634
4566
  behanceAdapter,
3635
4567
  calculateCompleteness,
3636
4568
  calculateConfidence,
4569
+ calculateConfidenceBreakdown,
3637
4570
  calculateReliability,
3638
4571
  createDiagnostics,
3639
4572
  createPreviewCard,