metanova 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -41,6 +41,7 @@ __export(index_exports, {
41
41
  behanceAdapter: () => behanceAdapter,
42
42
  calculateCompleteness: () => calculateCompleteness,
43
43
  calculateConfidence: () => calculateConfidence,
44
+ calculateConfidenceBreakdown: () => calculateConfidenceBreakdown,
44
45
  calculateReliability: () => calculateReliability,
45
46
  createDiagnostics: () => createDiagnostics,
46
47
  createPreviewCard: () => createPreviewCard,
@@ -838,8 +839,24 @@ function isRecord2(value) {
838
839
  }
839
840
 
840
841
  // src/extractors/media.ts
841
- var LAZY_IMAGE_ATTRIBUTES = ["data-src", "data-original", "data-lazy-src", "data-image", "data-thumbnail"];
842
- var LAZY_MEDIA_ATTRIBUTES = ["data-src", "data-original", "data-lazy-src", "data-video", "data-media"];
842
+ var LAZY_IMAGE_ATTRIBUTES = [
843
+ "data-src",
844
+ "data-original",
845
+ "data-lazy-src",
846
+ "data-image",
847
+ "data-image-url",
848
+ "data-og-image",
849
+ "data-thumbnail",
850
+ "data-thumb",
851
+ "data-media",
852
+ "data-full-src",
853
+ "data-hi-res-src",
854
+ "data-zoom-src",
855
+ "data-poster",
856
+ "data-bg"
857
+ ];
858
+ var LAZY_IMAGE_SRCSET_ATTRIBUTES = ["data-srcset", "data-lazy-srcset", "data-original-srcset"];
859
+ var LAZY_MEDIA_ATTRIBUTES = ["data-src", "data-original", "data-lazy-src", "data-video", "data-video-url", "data-media", "data-playback-url"];
843
860
  function extractImages(html, baseUrl) {
844
861
  const $ = loadDocument(html);
845
862
  const images = [];
@@ -865,6 +882,15 @@ function extractImages(html, baseUrl) {
865
882
  type,
866
883
  metadata: { discoveredFrom: "link.preload" }
867
884
  }, baseUrl);
885
+ for (const candidate of parseSrcset($(element).attr("imagesrcset"))) {
886
+ pushResolved(images, {
887
+ url: candidate,
888
+ kind: "image",
889
+ source: "html",
890
+ type,
891
+ metadata: { discoveredFrom: "link.imagesrcset" }
892
+ }, baseUrl);
893
+ }
868
894
  }
869
895
  });
870
896
  collectDocumentImages($, images, baseUrl, "html");
@@ -989,7 +1015,8 @@ function collectDocumentImages($, images, baseUrl, source) {
989
1015
  const candidates = [
990
1016
  normalizeWhitespace($(element).attr("src")),
991
1017
  ...LAZY_IMAGE_ATTRIBUTES.map((attribute) => normalizeWhitespace($(element).attr(attribute))),
992
- ...parseSrcset($(element).attr("srcset"))
1018
+ ...parseSrcset($(element).attr("srcset")),
1019
+ ...LAZY_IMAGE_SRCSET_ATTRIBUTES.flatMap((attribute) => parseSrcset($(element).attr(attribute)))
993
1020
  ];
994
1021
  for (const candidate of candidates) {
995
1022
  pushResolved(images, {
@@ -1002,7 +1029,10 @@ function collectDocumentImages($, images, baseUrl, source) {
1002
1029
  }
1003
1030
  });
1004
1031
  $("picture source[srcset], source[type^='image/'][srcset]").each((_, element) => {
1005
- for (const candidate of parseSrcset($(element).attr("srcset"))) {
1032
+ for (const candidate of [
1033
+ ...parseSrcset($(element).attr("srcset")),
1034
+ ...LAZY_IMAGE_SRCSET_ATTRIBUTES.flatMap((attribute) => parseSrcset($(element).attr(attribute)))
1035
+ ]) {
1006
1036
  pushResolved(images, {
1007
1037
  url: candidate,
1008
1038
  kind: "image",
@@ -1294,11 +1324,11 @@ function uniqueStrings(values) {
1294
1324
 
1295
1325
  // src/scorers/image.ts
1296
1326
  var SOURCE_WEIGHT = {
1297
- adapter: 96,
1298
- openGraph: 92,
1327
+ adapter: 98,
1328
+ openGraph: 94,
1299
1329
  oEmbed: 88,
1300
1330
  jsonLd: 82,
1301
- twitter: 78,
1331
+ twitter: 86,
1302
1332
  nextData: 76,
1303
1333
  nuxt: 74,
1304
1334
  initialState: 73,
@@ -1326,7 +1356,9 @@ function scoreImages(images, customScorers = []) {
1326
1356
  scoreReasons: reasons
1327
1357
  }
1328
1358
  };
1329
- }).sort((left, right) => (right.score ?? 0) - (left.score ?? 0));
1359
+ }).sort(
1360
+ (left, right) => (right.score ?? 0) - (left.score ?? 0) || sourceSortWeight(right) - sourceSortWeight(left) || imageArea(right) - imageArea(left)
1361
+ );
1330
1362
  }
1331
1363
  function selectBestImage(images, customScorers = []) {
1332
1364
  const scored = scoreImages(images, customScorers);
@@ -1425,17 +1457,32 @@ function scoreFormat(image) {
1425
1457
  }
1426
1458
  function scoreUrlSignal(image) {
1427
1459
  const url = image.url.toLowerCase();
1428
- const matches = url.match(/cover|preview|thumbnail|thumb|og|card|media|hero|share|social/g) ?? [];
1429
- if (matches.length === 0) {
1460
+ const matches = url.match(/cover|preview|thumbnail|thumb|og|card|media|hero|share|social|maxres|highres|large|original/g) ?? [];
1461
+ const platformScore = platformThumbnailScore(url);
1462
+ if (matches.length === 0 && platformScore.score === 0) {
1430
1463
  return { score: 0, reasons: [] };
1431
1464
  }
1432
1465
  const uniqueMatches = [...new Set(matches)];
1433
- const score = Math.min(uniqueMatches.length * 4, 12);
1466
+ const score = Math.min(uniqueMatches.length * 4, 14) + platformScore.score;
1467
+ const reasons = uniqueMatches.length > 0 ? [`URL matched preview hints (${uniqueMatches.join(", ")}) and added ${Math.min(uniqueMatches.length * 4, 14)} points`] : [];
1468
+ reasons.push(...platformScore.reasons);
1434
1469
  return {
1435
1470
  score,
1436
- reasons: [`URL matched preview hints (${uniqueMatches.join(", ")}) and added ${score} points`]
1471
+ reasons
1437
1472
  };
1438
1473
  }
1474
/**
 * Awards a bonus when an image URL points at a known platform media host.
 *
 * Rules are checked in order and the first matching rule wins.
 *
 * @param {string} url - Image URL to inspect.
 * @returns {{ score: number, reasons: string[] }} Bonus points and the
 *   explanation for them; `{ score: 0, reasons: [] }` when no rule matches.
 */
function platformThumbnailScore(url) {
  const rules = [
    {
      pattern: /ytimg\.com\/vi\/[^/]+\/(?:maxresdefault|sddefault|hqdefault)/i,
      points: 12,
      reason: "YouTube platform thumbnail added 12 points"
    },
    {
      pattern: /(?:i|preview|external-preview)\.redd\.it|v\.redd\.it/i,
      points: 10,
      reason: "Reddit media host added 10 points"
    },
    {
      pattern: /pbs\.twimg\.com\/media|pinimg\.com|cdninstagram\.com|fbcdn\.net|tiktokcdn\.com|mir-s3-cdn-cf\.behance\.net/i,
      points: 8,
      reason: "social platform media host added 8 points"
    }
  ];
  const matched = rules.find((rule) => rule.pattern.test(url));
  // Build a fresh result per call so callers never share the reasons array.
  return matched ? { score: matched.points, reasons: [matched.reason] } : { score: 0, reasons: [] };
}
1439
1486
  function scoreUrlPenalty(image) {
1440
1487
  const url = image.url.toLowerCase();
1441
1488
  let penalty = 0;
@@ -1508,6 +1555,12 @@ function countDuplicates(images) {
1508
1555
  }
1509
1556
  return counts;
1510
1557
  }
1558
/**
 * Pixel area of an image asset (width × height).
 *
 * A missing (null/undefined) dimension counts as 0, so the area is 0 unless
 * both dimensions are present.
 *
 * @param {{ width?: number, height?: number }} image - Image asset.
 * @returns {number} Product of the two dimensions.
 */
function imageArea(image) {
  const { width, height } = image;
  const safeWidth = width ?? 0;
  const safeHeight = height ?? 0;
  return safeWidth * safeHeight;
}
1561
/**
 * Looks up the sort weight for an image's source in SOURCE_WEIGHT.
 *
 * @param {{ source: string }} image - Image asset.
 * @returns {number} The table entry, or 50 when the table has no (nullish)
 *   entry for `image.source`.
 */
function sourceSortWeight(image) {
  const weight = SOURCE_WEIGHT[image.source];
  if (weight === null || weight === void 0) {
    return 50;
  }
  return weight;
}
1511
1564
  function mediaSignature(url) {
1512
1565
  try {
1513
1566
  const parsed = new URL(url);
@@ -1525,16 +1578,45 @@ var IMAGE_KEYS = [
1525
1578
  "thumbnailUrl",
1526
1579
  "thumbnail_url",
1527
1580
  "thumbnailSrc",
1581
+ "thumbnail_src",
1528
1582
  "previewImage",
1529
1583
  "preview_image",
1584
+ "preview",
1530
1585
  "ogImage",
1586
+ "og_image",
1531
1587
  "cardImage",
1588
+ "displayUrl",
1589
+ "display_url",
1590
+ "mediaUrl",
1591
+ "media_url",
1592
+ "media_url_https",
1593
+ "fullPicture",
1594
+ "full_picture",
1532
1595
  "cover",
1533
1596
  "coverImage",
1597
+ "cover_image",
1598
+ "original",
1599
+ "source",
1534
1600
  "poster",
1601
+ "posterImage",
1602
+ "media"
1603
+ ];
1604
+ var VIDEO_KEYS = [
1605
+ "video",
1606
+ "videos",
1607
+ "videoUrl",
1608
+ "video_url",
1609
+ "contentUrl",
1610
+ "content_url",
1611
+ "embedUrl",
1612
+ "embed_url",
1613
+ "playbackUrl",
1614
+ "playback_url",
1615
+ "fallback_url",
1616
+ "hls_url",
1617
+ "dash_url",
1535
1618
  "media"
1536
1619
  ];
1537
- var VIDEO_KEYS = ["video", "videos", "videoUrl", "video_url", "contentUrl", "embedUrl", "playbackUrl"];
1538
1620
  var AUDIO_KEYS = ["audio", "audios", "audioUrl", "audio_url", "podcastUrl"];
1539
1621
  function discoverMedia(rawSources, finalUrl) {
1540
1622
  const trace = [];
@@ -1668,25 +1750,87 @@ function mediaFromJsonValue(value, kind, source) {
1668
1750
  return value.flatMap((item) => mediaFromJsonValue(item, kind, source));
1669
1751
  }
1670
1752
  if (isRecord3(value)) {
1671
- const url = stringFromUnknown(value.url) ?? stringFromUnknown(value.src) ?? stringFromUnknown(value.contentUrl) ?? stringFromUnknown(value.thumbnailUrl);
1753
+ const srcset = stringFromUnknown(value.srcset) ?? stringFromUnknown(value.srcSet);
1754
+ const srcsetAssets = parseSrcset(srcset).flatMap((url2) => mediaFromJsonValue(url2, kind, source));
1755
+ const url = mediaUrlFromRecord(value, kind);
1756
+ const nestedDetails = nestedMediaDetailsRecord(value, kind);
1672
1757
  if (!url || !looksLikeMediaUrl(url, kind)) {
1673
- return [];
1758
+ return srcsetAssets;
1674
1759
  }
1675
1760
  return [
1676
1761
  {
1677
1762
  url,
1678
1763
  kind,
1679
1764
  source,
1680
- width: parseNumber(stringFromUnknown(value.width)),
1681
- height: parseNumber(stringFromUnknown(value.height)),
1682
- alt: stringFromUnknown(value.alt) ?? stringFromUnknown(value.caption) ?? stringFromUnknown(value.name),
1683
- title: stringFromUnknown(value.title),
1684
- type: stringFromUnknown(value.type) ?? stringFromUnknown(value.mimeType) ?? stringFromUnknown(value.encodingFormat)
1685
- }
1765
+ width: parseNumber(stringFromUnknown(value.width)) ?? parseNumber(stringFromUnknown(nestedDetails?.width)),
1766
+ height: parseNumber(stringFromUnknown(value.height)) ?? parseNumber(stringFromUnknown(nestedDetails?.height)),
1767
+ alt: stringFromUnknown(value.alt) ?? stringFromUnknown(value.caption) ?? stringFromUnknown(value.name) ?? stringFromUnknown(nestedDetails?.alt),
1768
+ title: stringFromUnknown(value.title) ?? stringFromUnknown(nestedDetails?.title),
1769
+ type: stringFromUnknown(value.type) ?? stringFromUnknown(value.mimeType) ?? stringFromUnknown(value.encodingFormat) ?? stringFromUnknown(nestedDetails?.type)
1770
+ },
1771
+ ...srcsetAssets
1686
1772
  ];
1687
1773
  }
1688
1774
  return [];
1689
1775
  }
1776
/**
 * Returns the first nested object on `value` that itself yields a media URL
 * for `kind`, so the caller can borrow its width/height/alt/title/type.
 *
 * @param {object} value - Record being converted into a media asset.
 * @param {string} kind - Media kind passed through to mediaUrlFromRecord.
 * @returns {object|undefined} The first qualifying nested record, in the key
 *   order below, or undefined when none qualifies.
 */
function nestedMediaDetailsRecord(value, kind) {
  const nestedKeys = [
    "source",
    "original",
    "image",
    "thumbnail",
    "thumbnailUrl",
    "thumbnail_url",
    "previewImage",
    "preview_image",
    "video",
    "reddit_video"
  ];
  for (const key of nestedKeys) {
    const nested = value[key];
    if (isRecord3(nested) && Boolean(mediaUrlFromRecord(nested, kind))) {
      return nested;
    }
  }
  return void 0;
}
1791
/**
 * Picks the first property of `value` that holds a usable media URL for `kind`.
 *
 * Properties are tried in a fixed priority order. For `kind === "video"` the
 * video-specific keys are tried before the common ones. A property is accepted
 * when stringFromUnknown yields a truthy string and looksLikeMediaUrl accepts
 * it for `kind`.
 *
 * @param {object} value - Record to inspect.
 * @param {string} kind - Media kind ("video" enables the video-specific keys).
 * @returns {string|undefined} The first accepted URL, or undefined.
 */
function mediaUrlFromRecord(value, kind) {
  const commonKeys = [
    "url",
    "src",
    "secure_url",
    "secureUrl",
    "contentUrl",
    "content_url",
    "embedUrl",
    "embed_url",
    "thumbnailUrl",
    "thumbnail_url",
    "thumbnailSrc",
    "thumbnail_src",
    "mediaUrl",
    "media_url",
    "media_url_https",
    "displayUrl",
    "display_url",
    "fullPicture",
    "full_picture",
    "previewImage",
    "preview_image",
    "poster",
    "posterUrl",
    "poster_url",
    "coverImage",
    "cover_image",
    "original",
    "source"
  ];
  const videoKeys = [
    "videoUrl",
    "video_url",
    "playbackUrl",
    "playback_url",
    "fallback_url",
    "hls_url",
    "dash_url"
  ];
  const keys = kind === "video" ? [...videoKeys, ...commonKeys] : commonKeys;
  for (const key of keys) {
    // Call the converter with exactly one argument: handing it to Array#map
    // directly would also pass the index and the array as extra arguments.
    const candidate = stringFromUnknown(value[key]);
    if (candidate && looksLikeMediaUrl(candidate, kind)) {
      // Stop at the first hit instead of converting every property up front.
      return candidate;
    }
  }
  return void 0;
}
1690
1834
  function assetFromEmbedded(value, kind, item, parent) {
1691
1835
  return {
1692
1836
  url: value,
@@ -1766,7 +1910,7 @@ function sourceRank(source) {
1766
1910
  }
1767
1911
  function shouldIgnoreMediaUrl2(url) {
1768
1912
  const normalized = url.toLowerCase();
1769
- return normalized.startsWith("data:") || normalized.startsWith("blob:") || normalized.startsWith("javascript:") || /(?:sprite|spacer|blank|transparent|placeholder|tracking|beacon|pixel|emoji)(?:[._/-]|$|\?)/i.test(normalized) || /(?:^|[/?_-])1x1(?:[._/-]|$|\?)/i.test(normalized);
1913
+ return normalized.startsWith("data:") || normalized.startsWith("blob:") || normalized.startsWith("javascript:") || /(?:sprite|spacer|blank|transparent|placeholder|tracking|beacon|pixel|emoji|favicon|apple-touch-icon)(?:[._/-]|$|\?)/i.test(normalized) || /(?:^|[/?_-])1x1(?:[._/-]|$|\?)/i.test(normalized);
1770
1914
  }
1771
1915
  function looksLikeMediaUrl(value, kind) {
1772
1916
  if (shouldIgnoreMediaUrl2(value)) {
@@ -1774,10 +1918,10 @@ function looksLikeMediaUrl(value, kind) {
1774
1918
  }
1775
1919
  if (/^https?:\/\//i.test(value) || value.startsWith("/") || value.startsWith("./") || value.startsWith("../")) {
1776
1920
  if (kind === "image") {
1777
- return /\.(?:avif|webp|png|jpe?g|gif)(?:[?#].*)?$/i.test(value) || /(?:image|thumb|thumbnail|cover|poster|preview|media|og|card|photo)/i.test(value);
1921
+ return /\.(?:avif|webp|png|jpe?g|gif)(?:[?#].*)?$/i.test(value) || /(?:image|thumb|thumbnail|cover|poster|preview|media|og|card|photo|format=(?:jpg|jpeg|png|webp))/i.test(value) || /(?:ytimg\.com|i\.redd\.it|preview\.redd\.it|external-preview\.redd\.it|pbs\.twimg\.com|pinimg\.com|cdninstagram\.com|fbcdn\.net|tiktokcdn\.com|behance\.net)/i.test(value);
1778
1922
  }
1779
1923
  if (kind === "video") {
1780
- return /\.(?:mp4|webm|m3u8|mov)(?:[?#].*)?$/i.test(value) || /(?:video|embed|player|watch|reel|shorts)/i.test(value);
1924
+ return /\.(?:mp4|webm|m3u8|mov)(?:[?#].*)?$/i.test(value) || /(?:video|embed|player|watch|reel|shorts|v\.redd\.it)/i.test(value);
1781
1925
  }
1782
1926
  if (kind === "audio") {
1783
1927
  return /\.(?:mp3|m4a|wav|ogg|aac)(?:[?#].*)?$/i.test(value) || /(?:audio|podcast)/i.test(value);
@@ -1852,6 +1996,20 @@ function calculateConfidence(input) {
1852
1996
  score -= Math.min(input.warnings.length * 3, 18);
1853
1997
  return Math.round(clamp2(score, 0, 100));
1854
1998
  }
1999
/**
 * Builds per-field confidence scores (each rounded to an integer) for the
 * same input object that calculateConfidence receives.
 *
 * - title / description: delegated to qualityPoints with the bounds below.
 * - image: 0 without a best image; otherwise 58 plus 27% of the image score
 *   (capped at 100) plus the source bonus, clamped to 0..100.
 * - structuredData: 100 when `hasStructuredData`, 55 when only embedded data
 *   items exist, else 0.
 * - adapter: confidence of the first adapter result that actually produced
 *   content, or 0 when none did.
 *
 * @param {object} input - Confidence input (title, description, bestImage,
 *   hasStructuredData, rawSources, ...).
 * @returns {{ title: number, description: number, image: number, structuredData: number, adapter: number }}
 */
function calculateConfidenceBreakdown(input) {
  const title = qualityPoints(input.title, 100, 6, 120);
  const description = qualityPoints(input.description, 100, 24, 300);
  const image = input.bestImage ? clamp2(58 + Math.min(input.bestImage.score ?? 0, 100) * 0.27 + sourceConfidenceBonus(input.bestImage.source), 0, 100) : 0;
  const structuredData = input.hasStructuredData ? 100 : input.rawSources.embeddedData.items.length > 0 ? 55 : 0;
  // adapterSucceeded() is true when ANY adapter produced content, so score the
  // adapter that satisfied it rather than assuming it was the first entry.
  const successfulAdapter = input.rawSources.adapters.find((candidate) => adapterSucceeded([candidate]));
  const adapter = successfulAdapter ? adapterConfidence(successfulAdapter) : 0;
  return {
    title: Math.round(title),
    description: Math.round(description),
    image: Math.round(image),
    structuredData: Math.round(structuredData),
    adapter: Math.round(adapter)
  };
}
1855
2013
  function calculateCompleteness(input) {
1856
2014
  const weights = [
1857
2015
  input.title ? 20 : 0,
@@ -1921,6 +2079,25 @@ function sourceConfidenceBonus(source) {
1921
2079
  function adapterSucceeded(adapters) {
1922
2080
  return adapters.some((adapter) => Boolean(adapter.title || adapter.description || adapter.images?.length || adapter.videos?.length));
1923
2081
  }
2082
/**
 * Scores how complete a single adapter result is.
 *
 * Starts at 45 and adds 22 for a title, 14 for a description, 14 when there
 * is at least one image or video, and 6 for an author; the sum is clamped to
 * 0..100.
 *
 * @param {object|undefined} adapter - Adapter result, possibly missing.
 * @returns {number} 0 for a missing adapter, otherwise the clamped score.
 */
function adapterConfidence(adapter) {
  if (!adapter) {
    return 0;
  }
  const titlePoints = adapter.title ? 22 : 0;
  const descriptionPoints = adapter.description ? 14 : 0;
  const hasMedia = (adapter.images?.length ?? 0) > 0 || (adapter.videos?.length ?? 0) > 0;
  const mediaPoints = hasMedia ? 14 : 0;
  const authorPoints = adapter.author ? 6 : 0;
  const total = 45 + titlePoints + descriptionPoints + mediaPoints + authorPoints;
  return clamp2(total, 0, 100);
}
1924
2101
  function clamp2(value, min, max) {
1925
2102
  return Math.max(min, Math.min(max, value));
1926
2103
  }
@@ -1984,6 +2161,7 @@ function normalizeMetadata(rawSources, context = {}) {
1984
2161
  const type = inferType(rawSources, externalResults, jsonLdNodes, article, product, app, playlist, videos, audio);
1985
2162
  const author = firstResultValue(externalResults, (result) => result.author) ?? firstEntity(article?.authors) ?? entityFromEmbedded(embeddedNodes, ["author", "creator", "owner", "user"]);
1986
2163
  const publisher = article?.publisher ?? firstResultValue(externalResults, (result) => result.publisher) ?? entityFromJsonLd(organizationNode) ?? entityFromEmbedded(embeddedNodes, ["publisher", "provider", "organization"]);
2164
+ const publishDate = firstDefined(article?.publishedTime, video?.publishedTime);
1987
2165
  const sourcesUsed = detectSourcesUsed(rawSources);
1988
2166
  const warnings = diagnosticsWarnings(rawSources, externalResults, context.diagnostics);
1989
2167
  const fieldSources = {
@@ -1992,7 +2170,7 @@ function normalizeMetadata(rawSources, context = {}) {
1992
2170
  author: fieldSource(rawSources, externalResults, embeddedNodes, "author", selectedImage.best),
1993
2171
  image: fieldSource(rawSources, externalResults, embeddedNodes, "image", selectedImage.best)
1994
2172
  };
1995
- const confidence = calculateConfidence({
2173
+ const confidenceInput = {
1996
2174
  title,
1997
2175
  description,
1998
2176
  bestImage: selectedImage.best,
@@ -2001,7 +2179,9 @@ function normalizeMetadata(rawSources, context = {}) {
2001
2179
  rawSources,
2002
2180
  sourcesUsed,
2003
2181
  warnings
2004
- });
2182
+ };
2183
+ const confidence = calculateConfidence(confidenceInput);
2184
+ const confidenceBreakdown = calculateConfidenceBreakdown(confidenceInput);
2005
2185
  const completeness = calculateCompleteness({
2006
2186
  title,
2007
2187
  description,
@@ -2011,7 +2191,7 @@ function normalizeMetadata(rawSources, context = {}) {
2011
2191
  author,
2012
2192
  publisher,
2013
2193
  type,
2014
- publishedTime: article?.publishedTime,
2194
+ publishedTime: publishDate,
2015
2195
  mediaCount: images.length + videos.length + audio.length
2016
2196
  });
2017
2197
  const reliability = calculateReliability({
@@ -2030,7 +2210,19 @@ function normalizeMetadata(rawSources, context = {}) {
2030
2210
  };
2031
2211
  diagnostics.sourcesUsed = uniqueStrings2([...diagnostics.sourcesUsed, ...sourcesUsed]);
2032
2212
  diagnostics.warnings = uniqueStrings2([...diagnostics.warnings, ...rawSources.jsonLd.warnings, ...externalResults.flatMap((result) => result.warnings ?? [])]);
2213
+ diagnostics.adapterUsed = diagnostics.adapterUsed ?? rawSources.adapters[0]?.source;
2214
+ diagnostics.extractionMethod = diagnostics.extractionMethod ?? adapterRawString(rawSources.adapters[0], "extractionMethod") ?? fieldSources.title;
2215
+ diagnostics.sourcePriority = uniqueStrings2([
2216
+ ...diagnostics.sourcePriority ?? [],
2217
+ ...arrayOfStrings(rawSources.adapters[0]?.raw?.sourcePriority) ?? []
2218
+ ]);
2219
+ diagnostics.fallbacksAttempted = mergeFallbackAttempts(
2220
+ diagnostics.fallbacksAttempted,
2221
+ fallbackAttemptsFromUnknown(rawSources.adapters[0]?.raw?.fallbacksAttempted)
2222
+ );
2223
+ diagnostics.retryInfo = diagnostics.retryInfo ?? retryInfoFromUnknown(rawSources.adapters[0]?.raw?.retryInfo);
2033
2224
  diagnostics.selectedImageReason = selectedImage.reason;
2225
+ diagnostics.confidenceBreakdown = confidenceBreakdown;
2034
2226
  diagnostics.originalUrl = diagnostics.originalUrl ?? url;
2035
2227
  diagnostics.finalUrl = diagnostics.finalUrl ?? finalUrl;
2036
2228
  diagnostics.canonicalUrl = canonicalUrl;
@@ -2047,6 +2239,7 @@ function normalizeMetadata(rawSources, context = {}) {
2047
2239
  type,
2048
2240
  title,
2049
2241
  description,
2242
+ publishDate,
2050
2243
  siteName,
2051
2244
  canonicalUrl,
2052
2245
  confidence,
@@ -2336,6 +2529,61 @@ function adapterDiagnostics(adapters) {
2336
2529
  confidence: Math.min(confidence, 100)
2337
2530
  };
2338
2531
  }
2532
/**
 * Reads `adapter.raw[key]` and returns it trimmed when it is a non-blank string.
 *
 * @param {object|undefined} adapter - Adapter result; may be missing or lack `raw`.
 * @param {string} key - Property name to read from `adapter.raw`.
 * @returns {string|undefined} The trimmed string, or undefined when the value
 *   is absent, not a string, or blank after trimming.
 */
function adapterRawString(adapter, key) {
  const candidate = adapter?.raw?.[key];
  if (typeof candidate !== "string") {
    return void 0;
  }
  const trimmed = candidate.trim();
  return trimmed ? trimmed : void 0;
}
2536
/**
 * Converts an untyped value into a list of fallback-attempt records.
 *
 * Only array input is accepted. Entries that fail the isJsonLdNode check or
 * lack a string `method` are dropped. Every other field is copied only when it
 * has the expected primitive type; `ok` defaults to false.
 *
 * @param {unknown} value - Raw value, e.g. taken from an adapter's `raw` data.
 * @returns {Array<object>|undefined} The normalized attempts, or undefined
 *   when the input is not an array or no entry survives.
 */
function fallbackAttemptsFromUnknown(value) {
  if (!Array.isArray(value)) {
    return void 0;
  }
  const whenType = (candidate, typeName) => (typeof candidate === typeName ? candidate : void 0);
  const attempts = value.filter((item) => isJsonLdNode(item) && typeof item.method === "string").map((item) => ({
    method: item.method,
    url: whenType(item.url, "string"),
    ok: whenType(item.ok, "boolean") ?? false,
    statusCode: whenType(item.statusCode, "number"),
    blocked: whenType(item.blocked, "boolean"),
    error: whenType(item.error, "string"),
    retryAfter: whenType(item.retryAfter, "string")
  }));
  if (attempts.length === 0) {
    return void 0;
  }
  return attempts;
}
2556
/**
 * Concatenates two optional lists of fallback attempts and removes duplicates,
 * keeping the first occurrence and the original order.
 *
 * Two attempts are duplicates when their method, url, statusCode and error
 * agree (a missing field is treated like an empty string).
 *
 * @param {Array<object>|undefined} existing - Attempts already recorded.
 * @param {Array<object>|undefined} incoming - Attempts to add.
 * @returns {Array<object>|undefined} The merged list, or undefined when both
 *   inputs are empty or missing.
 */
function mergeFallbackAttempts(existing, incoming) {
  const attempts = [...existing ?? [], ...incoming ?? []];
  if (attempts.length === 0) {
    return void 0;
  }
  const seen = /* @__PURE__ */ new Set();
  return attempts.filter((attempt) => {
    // Methods ("reddit:jsonEndpoint"), URLs and error messages all contain
    // ":", so a colon-joined key lets distinct attempts collide. Encoding the
    // fields as a JSON tuple keeps the field boundaries unambiguous.
    const key = JSON.stringify([
      String(attempt.method),
      String(attempt.url ?? ""),
      String(attempt.statusCode ?? ""),
      String(attempt.error ?? "")
    ]);
    if (seen.has(key)) {
      return false;
    }
    seen.add(key);
    return true;
  });
}
2571
/**
 * Converts an untyped value into a retry-info record.
 *
 * The value must pass the isJsonLdNode check and carry a boolean `retryable`;
 * otherwise nothing is returned. The remaining fields are copied only when
 * they have the expected primitive type.
 *
 * @param {unknown} value - Raw value, e.g. taken from an adapter's `raw` data.
 * @returns {object|undefined} `{ retryable, reason, retryAfter, retryAfterMs, attempts }`
 *   or undefined when the input does not qualify.
 */
function retryInfoFromUnknown(value) {
  if (!isJsonLdNode(value) || typeof value.retryable !== "boolean") {
    return void 0;
  }
  const textOrNothing = (candidate) => (typeof candidate === "string" ? candidate : void 0);
  const numberOrNothing = (candidate) => (typeof candidate === "number" ? candidate : void 0);
  return {
    retryable: value.retryable,
    reason: textOrNothing(value.reason),
    retryAfter: textOrNothing(value.retryAfter),
    retryAfterMs: numberOrNothing(value.retryAfterMs),
    attempts: numberOrNothing(value.attempts)
  };
}
2339
2587
  function fieldSource(rawSources, externalResults, embeddedNodes, field, bestImage) {
2340
2588
  if (field === "image") {
2341
2589
  return bestImage ? sourceLabel2(bestImage) : void 0;
@@ -2584,23 +2832,26 @@ var youtubeAdapter = {
2584
2832
  const videoId = getYouTubeVideoId(url);
2585
2833
  const playlistId = getYouTubePlaylistId(url);
2586
2834
  const communityPostId = getYouTubeCommunityPostId(url);
2835
+ const titleSelection = youtubeTitleFromContext(context, { videoId, playlistId, communityPostId });
2836
+ const descriptionSelection = youtubeDescriptionFromContext(context);
2587
2837
  const channel = entityFromContext(context, ["author", "ownerChannelName", "channel", "owner"]);
2588
2838
  const playlistVideos = playlistId ? extractPlaylistVideos(context) : [];
2839
+ const sourcePriority = youtubeSourcePriority();
2589
2840
  return compactAdapterResult({
2590
2841
  source: "youtubeAdapter",
2591
2842
  platform: "YouTube",
2592
2843
  type: playlistId ? "playlist" : communityPostId ? "social_post" : "video",
2593
2844
  siteName: "YouTube",
2594
2845
  canonicalUrl: videoId ? `https://www.youtube.com/watch?v=${videoId}` : context.raw.openGraph.url,
2595
- title: titleFromContext(context, ["videoDetails", "title", "headline", "name", "contentText"]),
2596
- description: descriptionFromContext(context),
2846
+ title: titleSelection.value,
2847
+ description: descriptionSelection.value,
2597
2848
  videos: markAdapterMedia(mediaFromContext(context).videos, "youtubeAdapter"),
2598
2849
  images: markAdapterMedia(mediaFromContext(context).images, "youtubeAdapter"),
2599
2850
  author: channel,
2600
2851
  article: { publishedTime: publishedTimeFromContext(context) },
2601
2852
  video: videoId ? {
2602
2853
  id: videoId,
2603
- title: titleFromContext(context, ["videoDetails", "title"]),
2854
+ title: titleSelection.value,
2604
2855
  channel,
2605
2856
  publishedTime: publishedTimeFromContext(context),
2606
2857
  duration: findEmbeddedString(context, ["duration", "lengthSeconds", "approxDurationMs"]),
@@ -2610,11 +2861,15 @@ var youtubeAdapter = {
2610
2861
  } : void 0,
2611
2862
  playlist: playlistId ? {
2612
2863
  id: playlistId,
2613
- title: findEmbeddedString(context, ["playlistTitle", "playlistName", "title"]) ?? context.raw.openGraph.title,
2864
+ title: youtubePlaylistTitleFromContext(context) ?? context.raw.openGraph.title,
2614
2865
  channel,
2615
2866
  videos: playlistVideos
2616
2867
  } : void 0,
2617
- identifiers: { videoId, playlistId, communityPostId }
2868
+ identifiers: { videoId, playlistId, communityPostId },
2869
+ raw: {
2870
+ sourcePriority,
2871
+ extractionMethod: titleSelection.method ?? descriptionSelection.method ?? "youtube:htmlFallback"
2872
+ }
2618
2873
  });
2619
2874
  },
2620
2875
  normalize(rawData) {
@@ -2633,20 +2888,27 @@ var redditAdapter = {
2633
2888
  const url = new URL(context.finalUrl);
2634
2889
  const reddit = parseRedditUrl(url);
2635
2890
  const username = typeof reddit.username === "string" ? reddit.username : void 0;
2891
+ const titleSelection = redditTitleFromContext(context);
2892
+ const descriptionSelection = redditDescriptionFromContext(context);
2893
+ const sourcePriority = redditSourcePriority();
2636
2894
  return compactAdapterResult({
2637
2895
  source: "redditAdapter",
2638
2896
  platform: "Reddit",
2639
2897
  type: reddit.isPost ? "social_post" : "website",
2640
2898
  siteName: "Reddit",
2641
- canonicalUrl: context.raw.openGraph.url,
2642
- title: cleanSocialTitle(titleFromContext(context, ["title", "postTitle", "headline"])),
2643
- description: descriptionFromContext(context),
2899
+ canonicalUrl: context.raw.openGraph.url ?? context.raw.html.canonicalUrl,
2900
+ title: cleanRedditTitle(titleSelection.value),
2901
+ description: cleanRedditDescription(descriptionSelection.value),
2644
2902
  images: markAdapterMedia(mediaFromContext(context).images, "redditAdapter"),
2645
2903
  videos: markAdapterMedia(mediaFromContext(context).videos, "redditAdapter"),
2646
2904
  author: username ? { name: username } : entityFromContext(context, ["author", "submitter", "user"]),
2647
2905
  article: { publishedTime: publishedTimeFromContext(context) },
2648
2906
  identifiers: { subreddit: reddit.subreddit, postId: reddit.postId, username: reddit.username },
2649
- raw: { ...reddit }
2907
+ raw: {
2908
+ ...reddit,
2909
+ sourcePriority,
2910
+ extractionMethod: titleSelection.method ?? descriptionSelection.method ?? "reddit:htmlFallback"
2911
+ }
2650
2912
  });
2651
2913
  },
2652
2914
  normalize(rawData) {
@@ -2747,6 +3009,7 @@ var facebookAdapter = {
2747
3009
  platform: "Facebook",
2748
3010
  type: isPhoto ? "image" : isPost || media.images.length > 0 || media.videos.length > 0 ? "social_post" : "website",
2749
3011
  siteName: "Facebook",
3012
+ canonicalUrl: context.raw.openGraph.url,
2750
3013
  title: titleFromContext(context, ["title", "headline", "name"]),
2751
3014
  description: descriptionFromContext(context),
2752
3015
  images: markAdapterMedia(media.images, "facebookAdapter"),
@@ -2830,6 +3093,116 @@ var defaultAdapters = [
2830
3093
  twitterAdapter,
2831
3094
  instagramAdapter
2832
3095
  ];
3096
/**
 * Lists the metadata sources for YouTube pages, highest priority first.
 *
 * @returns {string[]} A new array on every call.
 */
function youtubeSourcePriority() {
  const priority = [];
  priority.push("structuredData:VideoObject");
  priority.push("embeddedData:ytInitialPlayerResponse", "embeddedData:ytInitialData");
  priority.push("openGraph", "twitter", "html");
  return priority;
}
3106
/**
 * Selects the title for a YouTube page and records which source supplied it.
 *
 * Sources are tried in this order: JSON-LD VideoObject (`name`/`headline`),
 * the embedded player response, the embedded initial data, Open Graph,
 * Twitter card, and finally the cleaned HTML <title>.
 *
 * @param {object} context - Adapter context exposing `raw.openGraph`,
 *   `raw.twitter` and `raw.html`.
 * @param {{ videoId?: string, playlistId?: string, communityPostId?: string }} ids -
 *   Identifiers parsed from the URL; forwarded to youtubeInitialDataTitle.
 * @returns {{ value: string|undefined, method: string|undefined }} The title
 *   and a `youtube:*` label for its source; `method` is undefined when no
 *   source produced a title.
 */
function youtubeTitleFromContext(context, ids) {
  const videoObjectTitle = jsonLdVideoObjectString(context, ["name", "headline"]);
  if (videoObjectTitle) {
    return { value: videoObjectTitle, method: "youtube:structuredData.VideoObject" };
  }
  const playerTitle = youtubePlayerString(context, ["videoDetails.title", "microformat.playerMicroformatRenderer.title"]);
  if (playerTitle) {
    return { value: playerTitle, method: "youtube:ytInitialPlayerResponse" };
  }
  const initialDataTitle = youtubeInitialDataTitle(context, ids);
  if (initialDataTitle) {
    return { value: initialDataTitle, method: "youtube:ytInitialData" };
  }
  if (context.raw.openGraph.title) {
    return { value: context.raw.openGraph.title, method: "youtube:openGraph" };
  }
  if (context.raw.twitter.title) {
    return { value: context.raw.twitter.title, method: "youtube:twitter" };
  }
  // Label the method from the CLEANED title: if cleaning leaves nothing, a
  // "youtube:html" label would claim a title that was never extracted and
  // would mask the description's method in the caller's `??` fallback chain.
  const htmlTitle = cleanYouTubeHtmlTitle(context.raw.html.title);
  return { value: htmlTitle, method: htmlTitle ? "youtube:html" : void 0 };
}
3127
/**
 * Selects the description for a YouTube page and records which source
 * supplied it.
 *
 * Sources are tried lazily in this order: JSON-LD VideoObject, the embedded
 * player response, the embedded initial data, Open Graph, Twitter card, and
 * finally the HTML description.
 *
 * @param {object} context - Adapter context exposing `raw.openGraph`,
 *   `raw.twitter` and `raw.html`.
 * @returns {{ value: string|undefined, method: string|undefined }} The
 *   description and a `youtube:*` label for its source; `method` is undefined
 *   when no source produced a description.
 */
function youtubeDescriptionFromContext(context) {
  // Each reader runs only if every earlier one came back empty.
  const readers = [
    ["youtube:structuredData.VideoObject", () => jsonLdVideoObjectString(context, ["description"])],
    ["youtube:ytInitialPlayerResponse", () => youtubePlayerString(context, [
      "videoDetails.shortDescription",
      "microformat.playerMicroformatRenderer.description",
      "microformat.playerMicroformatRenderer.shortDescription"
    ])],
    ["youtube:ytInitialData", () => youtubeInitialDataDescription(context)],
    ["youtube:openGraph", () => context.raw.openGraph.description],
    ["youtube:twitter", () => context.raw.twitter.description]
  ];
  for (const [method, read] of readers) {
    const value = read();
    if (value) {
      return { value, method };
    }
  }
  const htmlDescription = context.raw.html.description;
  return { value: htmlDescription, method: htmlDescription ? "youtube:html" : void 0 };
}
3152
/**
 * Lists the metadata sources for Reddit pages, highest priority first.
 *
 * @returns {string[]} A new array on every call.
 */
function redditSourcePriority() {
  const priority = [];
  priority.push("redditJsonEndpoint", "oldReddit");
  priority.push("embeddedStructuredData");
  priority.push("openGraph", "twitter", "html");
  return priority;
}
3162
/**
 * Selects the title for a Reddit page and records which source supplied it.
 *
 * Order: embedded JSON payloads, JSON-LD posting/article nodes, Open Graph,
 * Twitter card, then the HTML title.
 *
 * @param {object} context - Adapter context exposing `raw.jsonLd.nodes`,
 *   `raw.openGraph`, `raw.twitter` and `raw.html`.
 * @returns {{ value: string|undefined, method: string|undefined }} The title
 *   and a `reddit:*` label for its source; `method` is undefined when nothing
 *   was found.
 */
function redditTitleFromContext(context) {
  const embeddedSources = ["applicationJson", "jsonScript", "initialState", "preloadedState", "nextData"];
  const embeddedTitle = findEmbeddedStringBySources(context, embeddedSources, ["postTitle", "title", "headline"]);
  if (embeddedTitle) {
    // Embedded hits are labelled by whether the JSON-endpoint payload is present.
    const method = hasRedditJsonEndpointPayload(context) ? "reddit:jsonEndpoint" : "reddit:embeddedStructuredData";
    return { value: embeddedTitle, method };
  }
  const postingTypes = ["SocialMediaPosting", "DiscussionForumPosting", "Article"];
  const jsonLdTitle = jsonLdStringByType(context.raw.jsonLd.nodes, postingTypes, ["headline", "name"]);
  if (jsonLdTitle) {
    return { value: jsonLdTitle, method: "reddit:structuredData" };
  }
  const metaReaders = [
    ["reddit:openGraph", () => context.raw.openGraph.title],
    ["reddit:twitter", () => context.raw.twitter.title]
  ];
  for (const [method, read] of metaReaders) {
    const value = read();
    if (value) {
      return { value, method };
    }
  }
  const htmlTitle = context.raw.html.title;
  return { value: htmlTitle, method: htmlTitle ? "reddit:html" : void 0 };
}
3183
/**
 * Selects the description for a Reddit page and records which source
 * supplied it.
 *
 * Order: embedded JSON payloads, JSON-LD posting/article nodes, Open Graph,
 * Twitter card, then the HTML description.
 *
 * @param {object} context - Adapter context exposing `raw.jsonLd.nodes`,
 *   `raw.openGraph`, `raw.twitter` and `raw.html`.
 * @returns {{ value: string|undefined, method: string|undefined }} The
 *   description and a `reddit:*` label for its source; `method` is undefined
 *   when nothing was found.
 */
function redditDescriptionFromContext(context) {
  const embeddedSources = ["applicationJson", "jsonScript", "initialState", "preloadedState", "nextData"];
  const embeddedKeys = ["description", "selftext", "excerpt", "summary", "body"];
  const embeddedText = findEmbeddedStringBySources(context, embeddedSources, embeddedKeys);
  if (embeddedText) {
    // Embedded hits are labelled by whether the JSON-endpoint payload is present.
    const method = hasRedditJsonEndpointPayload(context) ? "reddit:jsonEndpoint" : "reddit:embeddedStructuredData";
    return { value: embeddedText, method };
  }
  const postingTypes = ["SocialMediaPosting", "DiscussionForumPosting", "Article"];
  const jsonLdText = jsonLdStringByType(context.raw.jsonLd.nodes, postingTypes, ["description", "articleBody"]);
  if (jsonLdText) {
    return { value: jsonLdText, method: "reddit:structuredData" };
  }
  const metaReaders = [
    ["reddit:openGraph", () => context.raw.openGraph.description],
    ["reddit:twitter", () => context.raw.twitter.description]
  ];
  for (const [method, read] of metaReaders) {
    const value = read();
    if (value) {
      return { value, method };
    }
  }
  const htmlDescription = context.raw.html.description;
  return { value: htmlDescription, method: htmlDescription ? "reddit:html" : void 0 };
}
2833
3206
  function socialVideoResult(source, platform, context) {
2834
3207
  const url = new URL(context.finalUrl);
2835
3208
  const username = url.pathname.match(/@([^/]+)/)?.[1];
@@ -2888,6 +3261,143 @@ function markAdapterMedia(assets, adapterName) {
2888
3261
  }
2889
3262
  }));
2890
3263
  }
3264
/**
 * Looks up the first non-empty string stored under one of `keys` on a
 * JSON-LD node whose type matches `VideoObject`.
 *
 * @param context Extraction context; reads `context.raw.jsonLd.nodes`.
 * @param {string[]} keys Property names to try, in priority order.
 * @returns {string|undefined}
 */
function jsonLdVideoObjectString(context, keys) {
  const { nodes } = context.raw.jsonLd;
  const videoTypes = ["VideoObject"];
  return jsonLdStringByType(nodes, videoTypes, keys);
}
3267
/**
 * Scans JSON-LD nodes in order and returns the first non-empty string found
 * under any of `keys` on a node whose `@type` matches one of `types`.
 *
 * @param {object[]} nodes JSON-LD nodes to scan.
 * @param {string[]} types Accepted type names (matched by `hasJsonLdType2`).
 * @param {string[]} keys Property names to try on each matching node, in order.
 * @returns {string|undefined}
 */
function jsonLdStringByType(nodes, types, keys) {
  for (const candidate of nodes) {
    if (hasJsonLdType2(candidate, types)) {
      for (const property of keys) {
        const text = stringFromUnknown3(candidate[property]);
        if (text) {
          return text;
        }
      }
    }
  }
  return void 0;
}
3281
/**
 * Tells whether a JSON-LD node declares one of the given types.
 * Matching is case-insensitive and suffix-based, so prefixed or fully
 * qualified type names (and longer names ending in the candidate) match too.
 *
 * @param {object} node JSON-LD node; `@type` may be a string or an array.
 * @param {string[]} types Candidate type names.
 * @returns {boolean}
 */
function hasJsonLdType2(node, types) {
  const declared = node["@type"];
  const declaredList = Array.isArray(declared) ? declared : [declared];
  for (const entry of declaredList) {
    if (typeof entry !== "string") {
      continue;
    }
    const lowered = entry.toLowerCase();
    if (types.some((candidate) => lowered.endsWith(candidate.toLowerCase()))) {
      return true;
    }
  }
  return false;
}
3285
/**
 * Reads a string from embedded `youtubePlayerResponse` payloads.
 * Each payload is probed with every dotted path in order; the first
 * non-empty string wins.
 *
 * @param context Extraction context; reads `context.raw.embeddedData.items`.
 * @param {string[]} paths Dotted property paths to try.
 * @returns {string|undefined}
 */
function youtubePlayerString(context, paths) {
  const playerResponses = context.raw.embeddedData.items.filter((entry) => entry.source === "youtubePlayerResponse");
  for (const response of playerResponses) {
    for (const dottedPath of paths) {
      const text = stringFromUnknown3(valueAtPath(response.data, dottedPath));
      if (text) {
        return text;
      }
    }
  }
  return void 0;
}
3299
/**
 * Derives a title from embedded `youtubeInitialData` payloads.
 * Lookup order: watch-page renderers, a renderer whose `videoId` matches,
 * community post text, then (for playlist URLs without a video) the playlist title.
 *
 * @param context Extraction context; reads `context.raw.embeddedData.items`.
 * @param ids Parsed URL identifiers; reads `videoId`, `communityPostId`, `playlistId`.
 * @returns {string|undefined}
 */
function youtubeInitialDataTitle(context, ids) {
  const initialData = context.raw.embeddedData.items.filter((entry) => entry.source === "youtubeInitialData");
  const watchTitle = findRendererText(initialData, ["videoPrimaryInfoRenderer", "watchMetadata"], ["title"]);
  if (watchTitle) {
    return watchTitle;
  }
  const titleForVideo = ids.videoId ? findYouTubeRendererForVideoId(initialData, ids.videoId, ["title"]) : void 0;
  if (titleForVideo) {
    return titleForVideo;
  }
  if (ids.communityPostId) {
    const postText = findEmbeddedStringBySources(context, ["youtubeInitialData"], ["contentText"]) ?? findRendererText(initialData, ["backstagePostRenderer", "postRenderer"], ["contentText", "title"]);
    if (postText) {
      return postText;
    }
  }
  // The playlist title is only used when the URL is not also a specific video.
  if (!ids.playlistId || ids.videoId) {
    return void 0;
  }
  return findRendererText(initialData, ["playlistMetadataRenderer", "playlistHeaderRenderer"], ["title", "playlistTitle", "name"]);
}
3322
/**
 * Extracts the video description from embedded `youtubeInitialData` payloads
 * by probing the known description renderers.
 *
 * @param context Extraction context; reads `context.raw.embeddedData.items`.
 * @returns {string|undefined}
 */
function youtubeInitialDataDescription(context) {
  const rendererKeys = ["expandableVideoDescriptionBodyRenderer", "videoSecondaryInfoRenderer", "watchMetadata"];
  const textKeys = ["description", "attributedDescription", "content"];
  const initialData = context.raw.embeddedData.items.filter((entry) => entry.source === "youtubeInitialData");
  return findRendererText(initialData, rendererKeys, textKeys);
}
3330
/**
 * Extracts the playlist title from embedded `youtubeInitialData` payloads.
 *
 * @param context Extraction context; reads `context.raw.embeddedData.items`.
 * @returns {string|undefined}
 */
function youtubePlaylistTitleFromContext(context) {
  const rendererKeys = ["playlistMetadataRenderer", "playlistHeaderRenderer"];
  const textKeys = ["title", "playlistTitle", "name"];
  const initialData = context.raw.embeddedData.items.filter((entry) => entry.source === "youtubeInitialData");
  return findRendererText(initialData, rendererKeys, textKeys);
}
3334
/**
 * Walks each item's data looking for an object stored under one of
 * `rendererKeys`, and returns the first non-empty string found under one of
 * `textKeys` on such an object.
 *
 * @param {Array<{data: unknown}>} items Embedded data items to search.
 * @param {string[]} rendererKeys Property names that identify a renderer object.
 * @param {string[]} textKeys Property names to read on the renderer, in order.
 * @returns {string|undefined}
 */
function findRendererText(items, rendererKeys, textKeys) {
  for (const entry of items) {
    let match;
    const visit = (node, nodeKey) => {
      // Once a match exists the remaining visits are no-ops.
      const eligible = !match && nodeKey && rendererKeys.includes(nodeKey) && isRecord4(node);
      if (!eligible) {
        return;
      }
      for (const candidateKey of textKeys) {
        match = stringFromUnknown3(node[candidateKey]);
        if (match) {
          break;
        }
      }
    };
    walkData(entry.data, visit);
    if (match) {
      return match;
    }
  }
  return void 0;
}
3354
/**
 * Walks each item's data for an object whose `videoId` equals the requested
 * id and returns the first non-empty string found under one of `textKeys`.
 *
 * @param {Array<{data: unknown}>} items Embedded data items to search.
 * @param {string} videoId Video id the renderer must carry.
 * @param {string[]} textKeys Property names to read on the match, in order.
 * @returns {string|undefined}
 */
function findYouTubeRendererForVideoId(items, videoId, textKeys) {
  for (const entry of items) {
    let match;
    const visit = (node) => {
      const eligible = !match && isRecord4(node) && stringFromUnknown3(node.videoId) === videoId;
      if (!eligible) {
        return;
      }
      for (const candidateKey of textKeys) {
        match = stringFromUnknown3(node[candidateKey]);
        if (match) {
          break;
        }
      }
    };
    walkData(entry.data, visit);
    if (match) {
      return match;
    }
  }
  return void 0;
}
3374
/**
 * Collects every string stored under a matching key inside embedded data
 * items that come from one of `sources`, then lets `bestTextCandidate`
 * choose among them.
 *
 * @param context Extraction context; reads `context.raw.embeddedData.items`.
 * @param {string[]} sources Accepted `item.source` values.
 * @param {string[]} keys Keys to look for (compared with `matchesKey`).
 * @returns The value returned by `bestTextCandidate` for the collected strings.
 */
function findEmbeddedStringBySources(context, sources, keys) {
  const texts = [];
  const collect = (node, nodeKey) => {
    if (nodeKey && matchesKey(nodeKey, keys)) {
      const text = stringFromUnknown3(node);
      if (text) {
        texts.push(text);
      }
    }
  };
  for (const entry of context.raw.embeddedData.items) {
    if (sources.includes(entry.source)) {
      walkData(entry.data, collect);
    }
  }
  return bestTextCandidate(texts);
}
3392
/**
 * Reports whether the page carries the JSON payload that
 * `synthesizeRedditJsonPage` embeds under the id `metanova-reddit-json`.
 *
 * @param context Extraction context; reads `context.raw.embeddedData.items`.
 * @returns {boolean}
 */
function hasRedditJsonEndpointPayload(context) {
  for (const entry of context.raw.embeddedData.items) {
    if (entry.source === "applicationJson" && entry.path === "metanova-reddit-json") {
      return true;
    }
  }
  return false;
}
3395
/**
 * Resolves a dotted property path (e.g. `"a.b.c"`) against a value.
 * Any segment applied to a non-record yields `undefined`.
 *
 * @param node Root value to read from.
 * @param {string} path Dot-separated property names.
 * @returns The value at the path, or `undefined` when the path cannot be followed.
 */
function valueAtPath(node, path) {
  let cursor = node;
  for (const segment of path.split(".")) {
    cursor = isRecord4(cursor) ? cursor[segment] : void 0;
  }
  return cursor;
}
3398
/**
 * Strips a trailing "- YouTube" suffix (case-insensitive) from an HTML title
 * and trims the result.
 *
 * @param {string|undefined|null} title Raw HTML title.
 * @returns {string|undefined} Cleaned title, or `undefined` for a nullish input.
 */
function cleanYouTubeHtmlTitle(title) {
  if (title === void 0 || title === null) {
    return void 0;
  }
  const withoutSuffix = title.replace(/\s*-\s*YouTube\s*$/i, "");
  return withoutSuffix.trim();
}
2891
3401
  function titleFromContext(context, embeddedKeys) {
2892
3402
  return firstText(
2893
3403
  context.raw.openGraph.title,
@@ -3062,6 +3572,20 @@ function parseRedditUrl(url) {
3062
3572
  function cleanSocialTitle(title) {
3063
3573
  return title?.replace(/\s*:\s*r\/[A-Za-z0-9_]+$/i, "").trim();
3064
3574
  }
3575
/**
 * Cleans a Reddit title and rejects titles that belong to Reddit's
 * verification / rate-limit pages.
 *
 * @param {string|undefined|null} title Raw title.
 * @returns {string|undefined} Cleaned title, or `undefined` when empty or a block-page title.
 */
function cleanRedditTitle(title) {
  const candidate = cleanSocialTitle(title);
  if (!candidate) {
    return void 0;
  }
  const isBlockPageTitle = /reddit\s*-\s*please wait for verification|please wait for verification|whoa there, pardner/i.test(candidate);
  return isBlockPageTitle ? void 0 : candidate;
}
3582
/**
 * Collapses whitespace in a Reddit description and rejects text that comes
 * from Reddit's verification / block pages.
 *
 * @param {string|undefined|null} description Raw description.
 * @returns {string|undefined} Normalized description, or `undefined` when empty or block-page text.
 */
function cleanRedditDescription(description) {
  if (description === void 0 || description === null) {
    return void 0;
  }
  const collapsed = description.replace(/\s+/g, " ").trim();
  if (!collapsed) {
    return void 0;
  }
  const isBlockPageText = /please wait for verification|whoa there, pardner|request has been blocked/i.test(collapsed);
  return isBlockPageText ? void 0 : collapsed;
}
3065
3589
  function hostMatches(url, domains) {
3066
3590
  const host = url.hostname.toLowerCase().replace(/^www\./, "");
3067
3591
  return domains.some((domain) => host === domain || host.endsWith(`.${domain}`));
@@ -3451,13 +3975,17 @@ function ascii(bytes, offset, length) {
3451
3975
  }
3452
3976
 
3453
3977
  // src/fetchMetadata.ts
3978
// Warning text attached to results when Reddit answered with a verification/block page.
+ var REDDIT_BLOCKED_METADATA_WARNING = "Reddit returned a verification/block page; metadata is incomplete.";
3979
// Value reported as `suggestedAction` in the provider diagnostics of a blocked request.
+ var PROVIDER_BLOCKED_SUGGESTED_ACTION = "retry_on_different_host_or_use_supported_proxy";
3454
3980
  async function fetchMetadata(url, options = {}) {
3455
3981
  const startedAt = Date.now();
3456
3982
  try {
3457
3983
  const requestedUrl = normalizeUrl(url);
3458
- const firstPage = await fetchPage(requestedUrl, options);
3459
- const fallback = await maybeFetchRedditFallback(firstPage, options);
3460
- const page = fallback.page;
3984
+ const fetchResult = await fetchPageWithStrategies(requestedUrl, options);
3985
+ const page = fetchResult.page;
3986
+ if (fetchResult.providerDiagnostics?.blocked) {
3987
+ return createBlockedProviderMetadata(requestedUrl, fetchResult, Date.now() - startedAt);
3988
+ }
3461
3989
  const directMedia = createDirectMediaMetadata(page, requestedUrl, Date.now() - startedAt);
3462
3990
  if (directMedia) {
3463
3991
  return directMedia;
@@ -3478,12 +4006,17 @@ async function fetchMetadata(url, options = {}) {
3478
4006
  metadata.diagnostics.trace = [
3479
4007
  ...page.isShortUrl ? [`detected short URL provider: ${page.shortUrlProvider ?? "unknown"}`] : [],
3480
4008
  ...page.redirects.length > 0 ? [`resolved ${page.redirects.length} redirect${page.redirects.length === 1 ? "" : "s"}`] : [],
3481
- ...fallback.used ? ["retried Reddit page through old.reddit fallback"] : [],
4009
+ ...fetchResult.trace,
3482
4010
  "downloaded page",
3483
4011
  ...metadata.diagnostics.trace,
3484
4012
  ...metadata.canonicalUrl ? ["resolved canonical URL"] : []
3485
4013
  ];
4014
+ metadata.diagnostics.fallbacksAttempted = mergeFallbackAttempts2(metadata.diagnostics.fallbacksAttempted, fetchResult.fallbacksAttempted);
4015
+ metadata.diagnostics.sourcePriority = uniqueStrings3([...metadata.diagnostics.sourcePriority ?? [], ...fetchResult.sourcePriority ?? []]);
4016
+ metadata.diagnostics.extractionMethod = metadata.diagnostics.extractionMethod ?? fetchResult.extractionMethod;
4017
+ metadata.diagnostics.retryInfo = metadata.diagnostics.retryInfo ?? fetchResult.retryInfo;
3486
4018
  metadata.trace = metadata.diagnostics.trace;
4019
+ metadata.diagnostics.warnings.push(...fetchResult.warnings);
3487
4020
  if (!metadata.ok) {
3488
4021
  metadata.diagnostics.warnings.push(`Fetch completed with non-success status code ${page.statusCode}.`);
3489
4022
  }
@@ -3521,30 +4054,556 @@ async function fetchMetadata(url, options = {}) {
3521
4054
  };
3522
4055
  }
3523
4056
  }
3524
- async function maybeFetchRedditFallback(page, options) {
3525
- let parsed;
4057
/**
 * Builds the metadata result returned when the provider blocked every fetch
 * attempt: `ok` is false, all scores are zero, no media is listed, and the
 * diagnostics carry the attempt history and provider block details.
 *
 * @param {string} requestedUrl Normalized URL the caller asked for.
 * @param fetchResult Result of the fetch strategy (page, trace, warnings, diagnostics).
 * @param {number} fetchDurationMs Elapsed fetch time in milliseconds.
 * @returns Metadata object describing the blocked response.
 */
function createBlockedProviderMetadata(requestedUrl, fetchResult, fetchDurationMs) {
  const { page, providerDiagnostics } = fetchResult;
  const redirectCount = page.redirects.length;

  const traceEntries = [];
  if (page.isShortUrl) {
    traceEntries.push(`detected short URL provider: ${page.shortUrlProvider ?? "unknown"}`);
  }
  if (redirectCount > 0) {
    traceEntries.push(`resolved ${redirectCount} redirect${redirectCount === 1 ? "" : "s"}`);
  }
  traceEntries.push(...fetchResult.trace, "detected blocked provider response");
  const trace = uniqueStrings3(traceEntries);

  const warningEntries = [...fetchResult.warnings, REDDIT_BLOCKED_METADATA_WARNING];
  if (page.statusCode < 200 || page.statusCode >= 300) {
    warningEntries.push(`Fetch completed with non-success status code ${page.statusCode}.`);
  }
  const warnings = uniqueStrings3(warningEntries);

  const zeroConfidence = {
    title: 0,
    description: 0,
    image: 0,
    structuredData: 0,
    adapter: 0
  };
  return {
    ok: false,
    url: requestedUrl,
    finalUrl: page.finalUrl,
    type: "unknown",
    siteName: providerDiagnostics?.platform === "reddit" ? "Reddit" : void 0,
    confidence: 0,
    completeness: 0,
    reliability: 0,
    images: [],
    videos: [],
    audio: [],
    favicons: [],
    trace,
    diagnostics: {
      originalUrl: requestedUrl,
      finalUrl: page.finalUrl,
      isShortUrl: page.isShortUrl,
      shortUrlProvider: page.shortUrlProvider,
      statusCode: page.statusCode,
      contentType: page.contentType,
      redirects: page.redirects,
      sourcesUsed: [],
      warnings,
      fallbacksAttempted: mergeFallbackAttempts2(void 0, fetchResult.fallbacksAttempted),
      trace,
      sourcePriority: fetchResult.sourcePriority,
      extractionMethod: fetchResult.extractionMethod,
      retryInfo: fetchResult.retryInfo,
      providerDiagnostics,
      confidenceBreakdown: zeroConfidence,
      fetchDurationMs,
      extractedAt: (/* @__PURE__ */ new Date()).toISOString()
    }
  };
}
4113
/**
 * Fetches a page, routing Reddit URLs through the Reddit-specific fallback
 * strategy and everything else through a single plain fetch.
 *
 * @param {string} requestedUrl Normalized URL to fetch.
 * @param options Fetch options forwarded to `fetchPage`.
 * @returns {Promise<object>} Fetch result with `page`, `fallbacksAttempted`, `warnings`, `trace`.
 */
async function fetchPageWithStrategies(requestedUrl, options) {
  if (!isRedditUrl(requestedUrl)) {
    const page = await fetchPage(requestedUrl, options);
    return { page, fallbacksAttempted: [], warnings: [], trace: [] };
  }
  return fetchRedditPageWithStrategy(requestedUrl, options);
}
4124
/**
 * Fetches a Reddit URL by trying, in order: the `.json` endpoint, the
 * old.reddit.com mirror, and the originally requested HTML page.
 * The first usable attempt wins. If nothing succeeded but at least one
 * attempt was blocked, a synthesized "blocked" page plus provider
 * diagnostics is returned instead.
 *
 * @param {string} requestedUrl Normalized Reddit URL.
 * @param options Fetch options forwarded to every attempt.
 * @returns {Promise<object>} Fetch result (`page`, `fallbacksAttempted`, `warnings`,
 *   `trace`, `sourcePriority`, `extractionMethod`, `retryInfo`, and
 *   `providerDiagnostics` when blocked).
 * @throws {Error} When every attempt failed and none was identified as blocked.
 */
async function fetchRedditPageWithStrategy(requestedUrl, options) {
  const attempts = [];
  const warnings = [];
  const sourcePriority = ["redditJsonEndpoint", "oldReddit", "embeddedStructuredData", "openGraph", "html"];
  // Message of the most recent attempt that actually errored. `attemptFetch`
  // reports errors as strings, and an attempt without an error must not erase it.
  let lastError;
  const record = (attempt) => {
    attempts.push(attempt);
    lastError = attempt.error ?? lastError;
    return attempt;
  };
  const success = (page, traceMessage, extractionMethod) => ({
    page,
    fallbacksAttempted: attempts,
    warnings,
    trace: [traceMessage],
    sourcePriority,
    extractionMethod,
    retryInfo: redditRetryInfo(attempts)
  });

  const jsonUrl = redditJsonEndpoint(requestedUrl);
  if (jsonUrl) {
    const attempt = record(await attemptFetch("redditJsonEndpoint", jsonUrl, {
      ...options,
      accept: "application/json,text/html;q=0.8,*/*;q=0.5"
    }));
    if (attempt.page && attempt.ok) {
      const redditPost = parseRedditJsonPayload(attempt.page.html);
      if (redditPost?.title) {
        return success(synthesizeRedditJsonPage(attempt.page, requestedUrl, redditPost), "used Reddit JSON endpoint", "reddit:jsonEndpoint");
      }
      warnings.push("Reddit JSON endpoint responded, but no post payload could be extracted.");
    } else if (attempt.blocked) {
      warnings.push("Reddit JSON endpoint appears to have blocked access.");
    }
  }

  const oldRedditUrl = redditOldUrl(requestedUrl);
  if (oldRedditUrl && oldRedditUrl !== requestedUrl) {
    const attempt = record(await attemptFetch("oldReddit", oldRedditUrl, options));
    if (attempt.page && attempt.ok) {
      return success(attempt.page, "retried Reddit page through old.reddit", "reddit:oldReddit");
    }
    if (attempt.blocked) {
      warnings.push("old.reddit fallback appears to have been blocked.");
    }
  }

  const htmlAttempt = record(await attemptFetch("redditHtmlFallback", requestedUrl, options));
  if (htmlAttempt.page && htmlAttempt.ok) {
    return success(htmlAttempt.page, "used Reddit HTML fallback", "reddit:htmlFallback");
  }
  if (htmlAttempt.blocked) {
    warnings.push("Reddit HTML fallback appears to have been blocked; metadata may be incomplete.");
  }

  const providerDiagnostics = redditProviderDiagnosticsFromAttempts(attempts);
  if (providerDiagnostics) {
    return {
      page: synthesizeRedditBlockedPage(requestedUrl, attempts, providerDiagnostics),
      fallbacksAttempted: attempts,
      warnings: uniqueStrings3([...warnings, REDDIT_BLOCKED_METADATA_WARNING]),
      trace: ["Reddit provider blocked metadata extraction"],
      sourcePriority,
      extractionMethod: "reddit:blockedProvider",
      retryInfo: redditRetryInfo(attempts),
      providerDiagnostics
    };
  }
  // Always throw a real Error (never a bare string) so callers get a stack and `.message`.
  throw new Error(lastError || "All Reddit extraction fetch attempts failed.");
}
4207
/**
 * Runs one fetch attempt and summarizes its outcome without throwing.
 * A response counts as `ok` only when its status is 2xx and
 * `redditBlockReason` did not classify it as a block page.
 *
 * @param {string} method Label for this attempt (stored on the result).
 * @param {string} url URL to fetch.
 * @param options Fetch options forwarded to `fetchPage`.
 * @returns {Promise<object>} Attempt record; on failure it carries `error` (a message string) instead of `page`.
 */
async function attemptFetch(method, url, options) {
  try {
    const page = await fetchPage(url, options);
    const retryAfter = page.headers["retry-after"];
    const blockReason = redditBlockReason(page);
    const blocked = Boolean(blockReason);
    const { statusCode } = page;
    const succeeded = statusCode >= 200 && statusCode < 300 && !blocked;
    return { method, url, ok: succeeded, statusCode, blocked, blockReason, retryAfter, page };
  } catch (failure) {
    const message = failure instanceof Error ? failure.message : String(failure);
    return { method, url, ok: false, error: message };
  }
}
4232
/**
 * Tells whether a URL points at Reddit: `reddit.com`, any `*.reddit.com`
 * subdomain, or the `redd.it` short host (a leading `www.` is ignored).
 *
 * @param {string} url URL to classify.
 * @returns {boolean} `false` for non-Reddit hosts and for unparsable URLs.
 */
function isRedditUrl(url) {
  let hostname;
  try {
    hostname = new URL(url).hostname;
  } catch {
    return false;
  }
  const host = hostname.toLowerCase().replace(/^www\./, "");
  if (host === "reddit.com" || host === "redd.it") {
    return true;
  }
  return host.endsWith(".reddit.com");
}
4240
/**
 * Derives the `www.reddit.com` JSON endpoint for a Reddit URL.
 * The query string is replaced by `raw_json=1`; `redd.it/<id>` short links map
 * to `/comments/<id>.json`, other paths get `.json` appended after a trailing slash.
 *
 * @param {string} url Reddit URL.
 * @returns {string|undefined} Endpoint URL, or `undefined` for an unparsable URL
 *   or a `redd.it` link without a post id.
 */
function redditJsonEndpoint(url) {
  try {
    const source = new URL(url);
    const sourceHost = source.hostname.toLowerCase().replace(/^www\./, "");
    const sourcePath = source.pathname;
    const endpoint = new URL(url);
    endpoint.protocol = "https:";
    endpoint.hostname = "www.reddit.com";
    endpoint.search = "";
    if (sourceHost === "redd.it") {
      const [postId] = sourcePath.split("/").filter(Boolean);
      if (!postId) {
        return void 0;
      }
      endpoint.pathname = `/comments/${postId}.json`;
    } else if (sourcePath.endsWith(".json")) {
      endpoint.pathname = sourcePath;
    } else {
      const withSlash = sourcePath.endsWith("/") ? sourcePath : `${sourcePath}/`;
      endpoint.pathname = `${withSlash}.json`;
    }
    endpoint.searchParams.set("raw_json", "1");
    return endpoint.toString();
  } catch {
    return void 0;
  }
}
4263
/**
 * Rewrites a Reddit URL to its `old.reddit.com` equivalent over HTTPS with
 * the query string removed. `redd.it/<id>` short links map to `/comments/<id>/`.
 *
 * @param {string} url Reddit URL.
 * @returns {string|undefined} Rewritten URL, or `undefined` for an unparsable URL
 *   or a `redd.it` link without a post id.
 */
function redditOldUrl(url) {
  try {
    const target = new URL(url);
    const isShortLink = target.hostname.toLowerCase().replace(/^www\./, "") === "redd.it";
    target.protocol = "https:";
    target.hostname = "old.reddit.com";
    target.search = "";
    if (isShortLink) {
      const [postId] = target.pathname.split("/").filter(Boolean);
      if (!postId) {
        return void 0;
      }
      target.pathname = `/comments/${postId}/`;
    }
    return target.toString();
  } catch {
    return void 0;
  }
}
4282
/**
 * Parses a Reddit JSON response body and extracts the fields of the first
 * post record found in it.
 *
 * @param {string} source Raw JSON text.
 * @returns {object|undefined} Post summary (title, description, author, createdAt,
 *   canonicalUrl, url, images, videos, subreddit, postId), or `undefined` when
 *   the text is not valid JSON, holds no post record, or extraction throws.
 */
function parseRedditJsonPayload(source) {
  try {
    const post = findRedditPostRecord(JSON.parse(source));
    if (!post) {
      return void 0;
    }
    const createdUtc = numberFromUnknown2(post.created_utc);
    const permalink = stringFromUnknown4(post.permalink);
    let canonicalUrl;
    if (permalink) {
      const absolutePath = permalink.startsWith("/") ? permalink : `/${permalink}`;
      canonicalUrl = `https://www.reddit.com${absolutePath}`;
    }
    const images = redditImagesFromPost(post);
    const videos = redditVideosFromPost(post);
    const description = firstText2(
      stringFromUnknown4(post.selftext),
      stringFromUnknown4(post.selftext_html),
      stringFromUnknown4(post.url_overridden_by_dest)
    );
    // `created_utc` is multiplied by 1000 before being handed to Date.
    const createdAt = createdUtc ? new Date(createdUtc * 1e3).toISOString() : void 0;
    return {
      title: stringFromUnknown4(post.title),
      description,
      author: stringFromUnknown4(post.author) ?? stringFromUnknown4(post.author_fullname),
      createdAt,
      canonicalUrl,
      url: stringFromUnknown4(post.url_overridden_by_dest) ?? stringFromUnknown4(post.url),
      images,
      videos,
      subreddit: stringFromUnknown4(post.subreddit_name_prefixed) ?? stringFromUnknown4(post.subreddit),
      postId: stringFromUnknown4(post.id)
    };
  } catch {
    return void 0;
  }
}
4315
/**
 * Depth-first search for the first object that looks like a Reddit post.
 * A record qualifies when it has a string `title` plus a string `id` or
 * `name`; inside a `data.children` listing, the first child of kind `t3`
 * (or with a string `data.title`) qualifies.
 *
 * @param value Parsed JSON value to search.
 * @returns {object|undefined} The post record, or `undefined` when none is found.
 */
function findRedditPostRecord(value) {
  if (Array.isArray(value)) {
    for (const element of value) {
      const hit = findRedditPostRecord(element);
      if (hit) {
        return hit;
      }
    }
    return void 0;
  }
  if (!isRecord5(value)) {
    return void 0;
  }
  const hasIdentity = typeof value.id === "string" || typeof value.name === "string";
  if (typeof value.title === "string" && hasIdentity) {
    return value;
  }
  const listing = value.data;
  if (isRecord5(listing) && Array.isArray(listing.children)) {
    const postChild = listing.children.find(
      (child) => isRecord5(child) && isRecord5(child.data) && (child.kind === "t3" || typeof child.data.title === "string")
    );
    if (postChild) {
      return postChild.data;
    }
  }
  // Only the first 100 property values of each object are searched.
  for (const nested of Object.values(value).slice(0, 100)) {
    const hit = findRedditPostRecord(nested);
    if (hit) {
      return hit;
    }
  }
  return void 0;
}
4347
/**
 * Collects image assets from a Reddit post record: every preview source and
 * resolution that has a URL, followed by the thumbnail when it is an
 * absolute http(s) URL.
 *
 * @param {object} post Reddit post record.
 * @returns {object[]} Image assets tagged with the `redditJsonEndpoint` adapter.
 */
function redditImagesFromPost(post) {
  const adapterMetadata = () => ({
    adapter: "redditJsonEndpoint",
    originalSource: "redditJsonEndpoint"
  });
  const collected = [];
  const previewImages = isRecord5(post.preview) && Array.isArray(post.preview.images) ? post.preview.images : [];
  for (const previewImage of previewImages) {
    if (!isRecord5(previewImage)) {
      continue;
    }
    const resolutions = Array.isArray(previewImage.resolutions) ? previewImage.resolutions : [];
    for (const variant of [previewImage.source, ...resolutions]) {
      if (!isRecord5(variant)) {
        continue;
      }
      const variantUrl = redditMediaUrl(stringFromUnknown4(variant.url));
      if (variantUrl) {
        collected.push({
          url: variantUrl,
          kind: "image",
          source: "adapter",
          width: numberFromUnknown2(variant.width),
          height: numberFromUnknown2(variant.height),
          metadata: adapterMetadata()
        });
      }
    }
  }
  const thumbnailUrl = redditMediaUrl(stringFromUnknown4(post.thumbnail));
  // Thumbnail values that are not absolute http(s) URLs are ignored.
  if (thumbnailUrl && /^https?:\/\//i.test(thumbnailUrl)) {
    collected.push({
      url: thumbnailUrl,
      kind: "image",
      source: "adapter",
      metadata: adapterMetadata()
    });
  }
  return collected;
}
4389
/**
 * Collects Reddit-hosted video assets from a post record.
 * Both `media` and `secure_media` are inspected; when both carry the same
 * video URL, only one asset is emitted.
 *
 * @param {object} post Reddit post record.
 * @returns {object[]} Video assets tagged with the `redditJsonEndpoint` adapter,
 *   unique by URL, in discovery order.
 */
function redditVideosFromPost(post) {
  const videos = [];
  const seenUrls = /* @__PURE__ */ new Set();
  const containers = [post.media, post.secure_media].filter(isRecord5);
  for (const container of containers) {
    const redditVideo = isRecord5(container.reddit_video) ? container.reddit_video : void 0;
    // Prefer the direct fallback URL, then the HLS and DASH URLs.
    const url = redditMediaUrl(
      stringFromUnknown4(redditVideo?.fallback_url) ?? stringFromUnknown4(redditVideo?.hls_url) ?? stringFromUnknown4(redditVideo?.dash_url)
    );
    if (!url || seenUrls.has(url)) {
      continue;
    }
    seenUrls.add(url);
    videos.push({
      url,
      kind: "video",
      source: "adapter",
      width: numberFromUnknown2(redditVideo?.width),
      height: numberFromUnknown2(redditVideo?.height),
      metadata: {
        adapter: "redditJsonEndpoint",
        originalSource: "redditJsonEndpoint"
      }
    });
  }
  return videos;
}
4412
/**
 * Turns a parsed Reddit JSON post into a synthetic HTML page (Open Graph tags,
 * a JSON-LD `SocialMediaPosting`, and an embedded `metanova-reddit-json`
 * payload) so the regular HTML extractors can process it.
 *
 * The image list is sorted on a copy, so the caller's `post.images` array is
 * left untouched; the emitted payload still lists images largest-area first.
 *
 * @param {object} jsonPage Page returned by the JSON endpoint fetch; its fields are the base of the result.
 * @param {string} requestedUrl URL originally requested by the caller.
 * @param {object} post Post summary produced by `parseRedditJsonPayload`.
 * @returns {object} Page object with synthesized `html`, `bytes`, `finalUrl`, and an HTML content type.
 */
function synthesizeRedditJsonPage(jsonPage, requestedUrl, post) {
  const finalUrl = post.canonicalUrl ?? requestedUrl;
  const area = (image) => (image.width ?? 0) * (image.height ?? 0);
  const rankedImages = [...post.images].sort((left, right) => area(right) - area(left));
  const bestImage = rankedImages[0];
  const video = post.videos[0];
  const structuredData = {
    "@context": "https://schema.org",
    "@type": "SocialMediaPosting",
    headline: post.title,
    description: post.description,
    author: post.author ? { "@type": "Person", name: post.author } : void 0,
    datePublished: post.createdAt,
    url: finalUrl,
    image: bestImage ? { "@type": "ImageObject", url: bestImage.url, width: bestImage.width, height: bestImage.height } : void 0,
    video: video ? { "@type": "VideoObject", contentUrl: video.url, width: video.width, height: video.height } : void 0
  };
  const embeddedPayload = {
    post: {
      postTitle: post.title,
      description: post.description,
      author: post.author ? { name: post.author } : void 0,
      createdAt: post.createdAt,
      canonicalUrl: finalUrl,
      previewImage: bestImage,
      media: {
        videoUrl: video?.url
      },
      images: rankedImages,
      videos: post.videos,
      subreddit: post.subreddit,
      postId: post.postId
    }
  };
  const html = [
    "<!doctype html><html><head>",
    `<title>${escapeHtml(post.title ?? "Reddit post")}</title>`,
    post.title ? `<meta property="og:title" content="${escapeHtml(post.title)}">` : "",
    post.description ? `<meta property="og:description" content="${escapeHtml(post.description)}">` : "",
    `<meta property="og:site_name" content="Reddit">`,
    `<meta property="og:url" content="${escapeHtml(finalUrl)}">`,
    bestImage ? `<meta property="og:image" content="${escapeHtml(bestImage.url)}">` : "",
    bestImage?.width ? `<meta property="og:image:width" content="${bestImage.width}">` : "",
    bestImage?.height ? `<meta property="og:image:height" content="${bestImage.height}">` : "",
    `<link rel="canonical" href="${escapeHtml(finalUrl)}">`,
    `<script type="application/ld+json">${safeJson(structuredData)}</script>`,
    `<script type="application/json" id="metanova-reddit-json">${safeJson(embeddedPayload)}</script>`,
    "</head><body></body></html>"
  ].join("");
  return {
    ...jsonPage,
    url: requestedUrl,
    originalUrl: requestedUrl,
    finalUrl,
    html,
    bytes: new TextEncoder().encode(html),
    contentType: "text/html; charset=utf-8",
    statusCode: jsonPage.statusCode
  };
}
4470
/**
 * Summarizes blocked fetch attempts into provider diagnostics.
 * A verification-required attempt is preferred as the representative;
 * otherwise the last blocked attempt is used.
 *
 * @param {object[]} attempts Attempt records from `attemptFetch`.
 * @returns {object|undefined} Diagnostics object, or `undefined` when no attempt was blocked.
 */
function redditProviderDiagnosticsFromAttempts(attempts) {
  const blocked = attempts.filter((attempt) => attempt.blocked);
  if (blocked.length === 0) {
    return void 0;
  }
  const verification = blocked.find((attempt) => attempt.blockReason === "provider_verification_required");
  const representative = verification ?? blocked.at(-1);
  return {
    platform: "reddit",
    blocked: true,
    statusCode: representative?.statusCode,
    reason: representative?.blockReason ?? "provider_blocked_request",
    suggestedAction: PROVIDER_BLOCKED_SUGGESTED_ACTION
  };
}
4484
/**
 * Builds an empty placeholder page for a blocked Reddit request.
 * Short-URL info, content type, redirects and headers are copied from the
 * attempt whose block reason matches the diagnostics, or else from the last
 * attempt that produced a page.
 *
 * @param {string} requestedUrl URL originally requested.
 * @param {object[]} attempts Attempt records from `attemptFetch`.
 * @param {object} providerDiagnostics Diagnostics with `reason` and optional `statusCode`.
 * @returns {object} Page with empty `html`/`bytes`; status falls back to 403.
 */
function synthesizeRedditBlockedPage(requestedUrl, attempts, providerDiagnostics) {
  let referencePage = attempts.find((attempt) => attempt.blockReason === providerDiagnostics.reason)?.page;
  if (referencePage === void 0 || referencePage === null) {
    let lastWithPage;
    for (let index = attempts.length - 1; index >= 0; index -= 1) {
      if (attempts[index].page) {
        lastWithPage = attempts[index];
        break;
      }
    }
    referencePage = lastWithPage?.page;
  }
  return {
    url: requestedUrl,
    originalUrl: requestedUrl,
    finalUrl: requestedUrl,
    isShortUrl: referencePage?.isShortUrl ?? false,
    shortUrlProvider: referencePage?.shortUrlProvider,
    html: "",
    bytes: new Uint8Array(),
    statusCode: providerDiagnostics.statusCode ?? referencePage?.statusCode ?? 403,
    contentType: referencePage?.contentType,
    redirects: referencePage?.redirects ?? [],
    headers: referencePage?.headers ?? {}
  };
}
4500
/**
 * Classifies a fetched page as a Reddit block/verification response.
 *
 * Block phrases such as "forbidden" or "too many requests" also occur in
 * ordinary post and comment text. A successful (2xx) response that is JSON,
 * or HTML that already carries Open Graph / Twitter preview tags, is
 * therefore treated as real content and never scanned for those phrases.
 *
 * @param {object} page Fetched page; reads `html`, `statusCode`, `contentType`.
 * @returns {"provider_verification_required"|"provider_blocked_request"|undefined}
 */
function redditBlockReason(page) {
  const isSuccess = page.statusCode >= 200 && page.statusCode < 300;
  const isJson = /json/i.test(page.contentType ?? "");
  const hasPreviewTags = /og:(?:title|image|description)|twitter:(?:title|image|description)/i.test(page.html);
  if (isSuccess && (isJson || hasPreviewTags)) {
    return void 0;
  }
  const title = htmlTitle(page.html);
  const text = normalizeText(`${title ?? ""} ${page.html}`);
  if (/reddit\s*-\s*please wait for verification/i.test(title ?? "") || /please wait for verification|verification required|verify you are human/i.test(text)) {
    return "provider_verification_required";
  }
  if (page.statusCode === 403 || page.statusCode === 429 || /whoa there, pardner|request has been blocked|too many requests|forbidden|you're blocked|you are blocked|youre blocked|blocked by network security/i.test(text) || /^blocked$/i.test(title ?? "")) {
    return "provider_blocked_request";
  }
  return void 0;
}
4511
/**
 * Extracts the text of the first `<title>` element from raw HTML.
 *
 * @param {string} html Raw HTML.
 * @returns {string} Normalized title text; an empty string when there is no title.
 */
function htmlTitle(html) {
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  const rawTitle = titleMatch?.[1];
  return normalizeText(rawTitle);
}
4514
/**
 * Replaces markup tags with spaces, collapses whitespace runs and trims.
 *
 * @param {string|undefined|null} value Text that may contain HTML tags.
 * @returns {string} Normalized text; an empty string for a nullish input.
 */
function normalizeText(value) {
  if (value === void 0 || value === null) {
    return "";
  }
  const withoutTags = value.replace(/<[^>]*>/g, " ");
  return withoutTags.replace(/\s+/g, " ").trim();
}
4517
/**
 * Summarizes retry hints from attempts that were blocked or answered 403/429.
 *
 * @param {object[]} attempts Attempt records from `attemptFetch`.
 * @returns {object|undefined} `{ retryable, reason, retryAfter, retryAfterMs, attempts }`,
 *   or `undefined` when no attempt was blocked or rate-limited.
 */
function redditRetryInfo(attempts) {
  const troubled = attempts.filter((attempt) => attempt.blocked || attempt.statusCode === 429 || attempt.statusCode === 403);
  if (troubled.length === 0) {
    return void 0;
  }
  const describe = (attempt) => {
    const outcome = attempt.statusCode ? ` returned ${attempt.statusCode}` : " failed";
    return `${attempt.method}${outcome}`;
  };
  // First non-empty Retry-After value among the troubled attempts.
  const retryAfter = troubled.map((attempt) => attempt.retryAfter).find((header) => Boolean(header));
  const retryable = troubled.some((attempt) => attempt.statusCode === 429 || Boolean(attempt.retryAfter));
  return {
    retryable,
    reason: troubled.map((attempt) => describe(attempt)).join("; "),
    retryAfter,
    retryAfterMs: retryAfterToMs(retryAfter),
    attempts: attempts.length
  };
}
4531
/**
 * Converts a `Retry-After` header value to a delay in milliseconds.
 *
 * The header is either a non-negative whole number of seconds or a date.
 * Only an all-digit value is read as seconds; anything else goes through
 * `Date.parse`, so a date-like string that merely starts with digits is not
 * misread as a second count.
 *
 * @param {string|undefined|null} value Raw header value.
 * @returns {number|undefined} Delay in ms (never negative), or `undefined` when absent or unparsable.
 */
function retryAfterToMs(value) {
  if (!value) {
    return void 0;
  }
  const text = String(value).trim();
  if (/^\d+$/.test(text)) {
    return Number.parseInt(text, 10) * 1e3;
  }
  const dateMs = Date.parse(text);
  // Dates already in the past mean "retry now".
  return Number.isFinite(dateMs) ? Math.max(dateMs - Date.now(), 0) : void 0;
}
4542
/**
 * Merges two lists of fetch attempts for diagnostics output.
 * The `page` and `blockReason` fields are dropped from every record, and
 * records with the same method, URL, status code and error are deduplicated
 * (first occurrence wins).
 *
 * @param {object[]|undefined|null} existing Attempts already recorded.
 * @param {object[]|undefined|null} incoming Attempts to append; a nullish value is treated as empty.
 * @returns {object[]|undefined} Merged attempts, or `undefined` when both lists are empty.
 */
function mergeFallbackAttempts2(existing, incoming) {
  const attempts = [...existing ?? [], ...incoming ?? []];
  if (attempts.length === 0) {
    return void 0;
  }
  const seen = /* @__PURE__ */ new Set();
  const merged = [];
  for (const { page: _page, blockReason: _blockReason, ...attempt } of attempts) {
    const key = `${attempt.method}:${attempt.url ?? ""}:${attempt.statusCode ?? ""}:${attempt.error ?? ""}`;
    if (!seen.has(key)) {
      seen.add(key);
      merged.push(attempt);
    }
  }
  return merged;
}
4560
/**
 * Removes falsy entries and duplicates from a list, keeping first-seen order.
 *
 * @param {Array<string|undefined|null>} values Input values.
 * @returns {string[]} Distinct truthy values.
 */
function uniqueStrings3(values) {
  const distinct = /* @__PURE__ */ new Set();
  for (const entry of values) {
    if (entry) {
      distinct.add(entry);
    }
  }
  return Array.from(distinct);
}
4563
/**
 * Decodes `&amp;` entities in a media URL back to `&`.
 *
 * @param {string|undefined|null} value URL that may contain `&amp;`.
 * @returns {string|undefined} Decoded URL, or `undefined` for a nullish input.
 */
function redditMediaUrl(value) {
  if (value === void 0 || value === null) {
    return void 0;
  }
  return value.replace(/&amp;/g, "&");
}
4566
/**
 * Returns the first argument that is non-empty after whitespace is collapsed
 * and trimmed.
 *
 * @param {...(string|undefined|null)} values Candidate strings in priority order.
 * @returns {string|undefined} The normalized first non-empty candidate.
 */
function firstText2(...values) {
  const normalized = values.map((candidate) => candidate?.replace(/\s+/g, " ").trim());
  for (const text of normalized) {
    if (text) {
      return text;
    }
  }
  return void 0;
}
4569
/**
 * Coerces an unknown value to a usable string.
 * Non-blank strings are trimmed, finite numbers are stringified, and
 * everything else yields `undefined`.
 *
 * @param value Any value.
 * @returns {string|undefined}
 */
function stringFromUnknown4(value) {
  switch (typeof value) {
    case "string": {
      const trimmed = value.trim();
      return trimmed ? trimmed : void 0;
    }
    case "number":
      return Number.isFinite(value) ? String(value) : void 0;
    default:
      return void 0;
  }
}
4578
/**
 * Coerces an unknown value to a finite number.
 * Finite numbers pass through, strings are read with `parseFloat`, and
 * anything else (or a non-finite result) yields `undefined`.
 *
 * @param value Any value.
 * @returns {number|undefined}
 */
function numberFromUnknown2(value) {
  if (typeof value === "number") {
    return Number.isFinite(value) ? value : void 0;
  }
  if (typeof value === "string") {
    const parsed = Number.parseFloat(value);
    return Number.isFinite(parsed) ? parsed : void 0;
  }
  return void 0;
}
4588
/**
 * Serializes a value for embedding inside a `<script>` element: empty entries
 * are stripped first, and every `<` is emitted as `\u003c` so the payload
 * cannot close the surrounding tag.
 *
 * @param value Value to serialize.
 * @returns {string} Script-safe JSON text.
 */
function safeJson(value) {
  const compacted = stripUndefinedDeep(value);
  const serialized = JSON.stringify(compacted);
  return serialized.replace(/</g, "\\u003c");
}
4591
/**
 * Escapes `&`, `"`, `<` and `>` for use in HTML text and double-quoted
 * attribute values.
 *
 * @param {string} value Raw text.
 * @returns {string} Escaped text.
 */
function escapeHtml(value) {
  const entities = { "&": "&amp;", '"': "&quot;", "<": "&lt;", ">": "&gt;" };
  return value.replace(/[&"<>]/g, (character) => entities[character]);
}
4594
/**
 * Recursively removes empty entries from a JSON-like value.
 * Arrays drop `undefined` elements; plain objects drop properties whose
 * cleaned value is `undefined`, `null`, or an empty array. Other values are
 * returned unchanged.
 *
 * @param value Value to clean.
 * @returns A cleaned copy for arrays and objects, otherwise the input itself.
 */
function stripUndefinedDeep(value) {
  if (Array.isArray(value)) {
    const cleanedItems = [];
    for (const element of value) {
      const cleaned = stripUndefinedDeep(element);
      if (cleaned !== void 0) {
        cleanedItems.push(cleaned);
      }
    }
    return cleanedItems;
  }
  if (!isRecord5(value)) {
    return value;
  }
  const keptEntries = [];
  for (const [key, item] of Object.entries(value)) {
    const cleaned = stripUndefinedDeep(item);
    const isEmptyArray = Array.isArray(cleaned) && cleaned.length === 0;
    if (cleaned !== void 0 && cleaned !== null && !isEmptyArray) {
      keptEntries.push([key, cleaned]);
    }
  }
  return Object.fromEntries(keptEntries);
}
4605
/**
 * Tells whether a value is a non-null object that is not an array.
 *
 * @param value Any value.
 * @returns {boolean}
 */
function isRecord5(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
3549
4608
  function createDirectMediaMetadata(page, requestedUrl, fetchDurationMs) {
3550
4609
  const contentType = page.contentType?.toLowerCase() ?? "";
@@ -3601,7 +4660,15 @@ function createDirectMediaMetadata(page, requestedUrl, fetchDurationMs) {
3601
4660
  sourcesUsed: ["direct"],
3602
4661
  warnings: [],
3603
4662
  trace,
4663
+ extractionMethod: `direct:${kind}`,
3604
4664
  selectedImageReason: kind === "image" ? "Selected direct image URL because the response content type is an image." : void 0,
4665
+ confidenceBreakdown: {
4666
+ title: 0,
4667
+ description: 0,
4668
+ image: kind === "image" ? 100 : 0,
4669
+ structuredData: 0,
4670
+ adapter: 0
4671
+ },
3605
4672
  fetchDurationMs,
3606
4673
  extractedAt: (/* @__PURE__ */ new Date()).toISOString()
3607
4674
  }
@@ -3717,6 +4784,7 @@ var index_default = MetaNova;
3717
4784
  behanceAdapter,
3718
4785
  calculateCompleteness,
3719
4786
  calculateConfidence,
4787
+ calculateConfidenceBreakdown,
3720
4788
  calculateReliability,
3721
4789
  createDiagnostics,
3722
4790
  createPreviewCard,