metanova 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## v0.2.1
4
+
5
+ Release date: 2026-06-04
6
+
7
+ ### Fixed
8
+
9
+ - Fixed Reddit verification/block pages that returned HTTP 200 being treated as successful metadata.
10
+ - Fixed blocked Reddit fallback handling so provider diagnostics clearly report blocked status, reason, status code, and suggested action.
11
+ - Fixed Reddit verification titles such as `Reddit - Please wait for verification` leaking into `metadata.title`.
12
+
3
13
  ## v0.2.0
4
14
 
5
15
  Release date: 2026-06-04
package/dist/index.cjs CHANGED
@@ -2897,8 +2897,8 @@ var redditAdapter = {
2897
2897
  type: reddit.isPost ? "social_post" : "website",
2898
2898
  siteName: "Reddit",
2899
2899
  canonicalUrl: context.raw.openGraph.url ?? context.raw.html.canonicalUrl,
2900
- title: cleanSocialTitle(titleSelection.value),
2901
- description: descriptionSelection.value,
2900
+ title: cleanRedditTitle(titleSelection.value),
2901
+ description: cleanRedditDescription(descriptionSelection.value),
2902
2902
  images: markAdapterMedia(mediaFromContext(context).images, "redditAdapter"),
2903
2903
  videos: markAdapterMedia(mediaFromContext(context).videos, "redditAdapter"),
2904
2904
  author: username ? { name: username } : entityFromContext(context, ["author", "submitter", "user"]),
@@ -3572,6 +3572,20 @@ function parseRedditUrl(url) {
3572
3572
  function cleanSocialTitle(title) {
3573
3573
  return title?.replace(/\s*:\s*r\/[A-Za-z0-9_]+$/i, "").trim();
3574
3574
  }
3575
+ function cleanRedditTitle(title) {
3576
+ const cleaned = cleanSocialTitle(title);
3577
+ if (!cleaned || /reddit\s*-\s*please wait for verification|please wait for verification|whoa there, pardner/i.test(cleaned)) {
3578
+ return void 0;
3579
+ }
3580
+ return cleaned;
3581
+ }
3582
+ function cleanRedditDescription(description) {
3583
+ const cleaned = description?.replace(/\s+/g, " ").trim();
3584
+ if (!cleaned || /please wait for verification|whoa there, pardner|request has been blocked/i.test(cleaned)) {
3585
+ return void 0;
3586
+ }
3587
+ return cleaned;
3588
+ }
3575
3589
  function hostMatches(url, domains) {
3576
3590
  const host = url.hostname.toLowerCase().replace(/^www\./, "");
3577
3591
  return domains.some((domain) => host === domain || host.endsWith(`.${domain}`));
@@ -3961,12 +3975,17 @@ function ascii(bytes, offset, length) {
3961
3975
  }
3962
3976
 
3963
3977
  // src/fetchMetadata.ts
3978
+ var REDDIT_BLOCKED_METADATA_WARNING = "Reddit returned a verification/block page; metadata is incomplete.";
3979
+ var PROVIDER_BLOCKED_SUGGESTED_ACTION = "retry_on_different_host_or_use_supported_proxy";
3964
3980
  async function fetchMetadata(url, options = {}) {
3965
3981
  const startedAt = Date.now();
3966
3982
  try {
3967
3983
  const requestedUrl = normalizeUrl(url);
3968
3984
  const fetchResult = await fetchPageWithStrategies(requestedUrl, options);
3969
3985
  const page = fetchResult.page;
3986
+ if (fetchResult.providerDiagnostics?.blocked) {
3987
+ return createBlockedProviderMetadata(requestedUrl, fetchResult, Date.now() - startedAt);
3988
+ }
3970
3989
  const directMedia = createDirectMediaMetadata(page, requestedUrl, Date.now() - startedAt);
3971
3990
  if (directMedia) {
3972
3991
  return directMedia;
@@ -4035,6 +4054,62 @@ async function fetchMetadata(url, options = {}) {
4035
4054
  };
4036
4055
  }
4037
4056
  }
4057
+ function createBlockedProviderMetadata(requestedUrl, fetchResult, fetchDurationMs) {
4058
+ const page = fetchResult.page;
4059
+ const providerDiagnostics = fetchResult.providerDiagnostics;
4060
+ const trace = uniqueStrings3([
4061
+ ...page.isShortUrl ? [`detected short URL provider: ${page.shortUrlProvider ?? "unknown"}`] : [],
4062
+ ...page.redirects.length > 0 ? [`resolved ${page.redirects.length} redirect${page.redirects.length === 1 ? "" : "s"}`] : [],
4063
+ ...fetchResult.trace,
4064
+ "detected blocked provider response"
4065
+ ]);
4066
+ const warnings = uniqueStrings3([
4067
+ ...fetchResult.warnings,
4068
+ REDDIT_BLOCKED_METADATA_WARNING,
4069
+ ...page.statusCode < 200 || page.statusCode >= 300 ? [`Fetch completed with non-success status code ${page.statusCode}.`] : []
4070
+ ]);
4071
+ return {
4072
+ ok: false,
4073
+ url: requestedUrl,
4074
+ finalUrl: page.finalUrl,
4075
+ type: "unknown",
4076
+ siteName: providerDiagnostics?.platform === "reddit" ? "Reddit" : void 0,
4077
+ confidence: 0,
4078
+ completeness: 0,
4079
+ reliability: 0,
4080
+ images: [],
4081
+ videos: [],
4082
+ audio: [],
4083
+ favicons: [],
4084
+ trace,
4085
+ diagnostics: {
4086
+ originalUrl: requestedUrl,
4087
+ finalUrl: page.finalUrl,
4088
+ isShortUrl: page.isShortUrl,
4089
+ shortUrlProvider: page.shortUrlProvider,
4090
+ statusCode: page.statusCode,
4091
+ contentType: page.contentType,
4092
+ redirects: page.redirects,
4093
+ sourcesUsed: [],
4094
+ warnings,
4095
+ fallbacksAttempted: mergeFallbackAttempts2(void 0, fetchResult.fallbacksAttempted),
4096
+ trace,
4097
+ sourcePriority: fetchResult.sourcePriority,
4098
+ extractionMethod: fetchResult.extractionMethod,
4099
+ retryInfo: fetchResult.retryInfo,
4100
+ providerDiagnostics,
4101
+ confidenceBreakdown: {
4102
+ title: 0,
4103
+ description: 0,
4104
+ image: 0,
4105
+ structuredData: 0,
4106
+ adapter: 0
4107
+ },
4108
+ fetchDurationMs,
4109
+ extractedAt: (/* @__PURE__ */ new Date()).toISOString()
4110
+ }
4111
+ };
4112
+ }
4038
4113
  async function fetchPageWithStrategies(requestedUrl, options) {
4039
4114
  if (isRedditUrl(requestedUrl)) {
4040
4115
  return fetchRedditPageWithStrategy(requestedUrl, options);
@@ -4059,7 +4134,7 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
4059
4134
  });
4060
4135
  attempts.push(attempt);
4061
4136
  lastError = attempt.error;
4062
- if (attempt.page && attempt.ok && !attempt.blocked) {
4137
+ if (attempt.page && attempt.ok) {
4063
4138
  const redditPost = parseRedditJsonPayload(attempt.page.html);
4064
4139
  if (redditPost?.title) {
4065
4140
  return {
@@ -4082,7 +4157,7 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
4082
4157
  const attempt = await attemptFetch("oldReddit", oldRedditUrl, options);
4083
4158
  attempts.push(attempt);
4084
4159
  lastError = attempt.error;
4085
- if (attempt.page && attempt.ok && !attempt.blocked) {
4160
+ if (attempt.page && attempt.ok) {
4086
4161
  return {
4087
4162
  page: attempt.page,
4088
4163
  fallbacksAttempted: attempts,
@@ -4100,10 +4175,7 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
4100
4175
  const htmlAttempt = await attemptFetch("redditHtmlFallback", requestedUrl, options);
4101
4176
  attempts.push(htmlAttempt);
4102
4177
  lastError = htmlAttempt.error;
4103
- if (htmlAttempt.page) {
4104
- if (htmlAttempt.blocked) {
4105
- warnings.push("Reddit HTML fallback appears to have been blocked; metadata may be incomplete.");
4106
- }
4178
+ if (htmlAttempt.page && htmlAttempt.ok) {
4107
4179
  return {
4108
4180
  page: htmlAttempt.page,
4109
4181
  fallbacksAttempted: attempts,
@@ -4114,19 +4186,37 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
4114
4186
  retryInfo: redditRetryInfo(attempts)
4115
4187
  };
4116
4188
  }
4189
+ if (htmlAttempt.blocked) {
4190
+ warnings.push("Reddit HTML fallback appears to have been blocked; metadata may be incomplete.");
4191
+ }
4192
+ const providerDiagnostics = redditProviderDiagnosticsFromAttempts(attempts);
4193
+ if (providerDiagnostics) {
4194
+ return {
4195
+ page: synthesizeRedditBlockedPage(requestedUrl, attempts, providerDiagnostics),
4196
+ fallbacksAttempted: attempts,
4197
+ warnings: uniqueStrings3([...warnings, REDDIT_BLOCKED_METADATA_WARNING]),
4198
+ trace: ["Reddit provider blocked metadata extraction"],
4199
+ sourcePriority,
4200
+ extractionMethod: "reddit:blockedProvider",
4201
+ retryInfo: redditRetryInfo(attempts),
4202
+ providerDiagnostics
4203
+ };
4204
+ }
4117
4205
  throw lastError ?? new Error("All Reddit extraction fetch attempts failed.");
4118
4206
  }
4119
4207
  async function attemptFetch(method, url, options) {
4120
4208
  try {
4121
4209
  const page = await fetchPage(url, options);
4122
4210
  const retryAfter = page.headers["retry-after"];
4123
- const blocked = isRedditBlocked(page);
4211
+ const blockReason = redditBlockReason(page);
4212
+ const blocked = Boolean(blockReason);
4124
4213
  return {
4125
4214
  method,
4126
4215
  url,
4127
4216
  ok: page.statusCode >= 200 && page.statusCode < 300 && !blocked,
4128
4217
  statusCode: page.statusCode,
4129
4218
  blocked,
4219
+ blockReason,
4130
4220
  retryAfter,
4131
4221
  page
4132
4222
  };
@@ -4377,8 +4467,52 @@ function synthesizeRedditJsonPage(jsonPage, requestedUrl, post) {
4377
4467
  statusCode: jsonPage.statusCode
4378
4468
  };
4379
4469
  }
4380
- function isRedditBlocked(page) {
4381
- return page.statusCode === 403 || page.statusCode === 429 || /please wait for verification|whoa there, pardner|blocked|forbidden|too many requests|request has been blocked/i.test(page.html);
4470
+ function redditProviderDiagnosticsFromAttempts(attempts) {
4471
+ const blockedAttempts = attempts.filter((attempt) => attempt.blocked);
4472
+ if (blockedAttempts.length === 0) {
4473
+ return void 0;
4474
+ }
4475
+ const selectedAttempt = blockedAttempts.find((attempt) => attempt.blockReason === "provider_verification_required") ?? blockedAttempts.at(-1);
4476
+ return {
4477
+ platform: "reddit",
4478
+ blocked: true,
4479
+ statusCode: selectedAttempt?.statusCode,
4480
+ reason: selectedAttempt?.blockReason ?? "provider_blocked_request",
4481
+ suggestedAction: PROVIDER_BLOCKED_SUGGESTED_ACTION
4482
+ };
4483
+ }
4484
+ function synthesizeRedditBlockedPage(requestedUrl, attempts, providerDiagnostics) {
4485
+ const selectedPage = attempts.find((attempt) => attempt.blockReason === providerDiagnostics.reason)?.page ?? attempts.slice().reverse().find((attempt) => attempt.page)?.page;
4486
+ return {
4487
+ url: requestedUrl,
4488
+ originalUrl: requestedUrl,
4489
+ finalUrl: requestedUrl,
4490
+ isShortUrl: selectedPage?.isShortUrl ?? false,
4491
+ shortUrlProvider: selectedPage?.shortUrlProvider,
4492
+ html: "",
4493
+ bytes: new Uint8Array(),
4494
+ statusCode: providerDiagnostics.statusCode ?? selectedPage?.statusCode ?? 403,
4495
+ contentType: selectedPage?.contentType,
4496
+ redirects: selectedPage?.redirects ?? [],
4497
+ headers: selectedPage?.headers ?? {}
4498
+ };
4499
+ }
4500
+ function redditBlockReason(page) {
4501
+ const title = htmlTitle(page.html);
4502
+ const text = normalizeText(`${title ?? ""} ${page.html}`);
4503
+ if (/reddit\s*-\s*please wait for verification/i.test(title ?? "") || /please wait for verification|verification required|verify you are human/i.test(text)) {
4504
+ return "provider_verification_required";
4505
+ }
4506
+ if (page.statusCode === 403 || page.statusCode === 429 || /whoa there, pardner|request has been blocked|too many requests|forbidden|you're blocked|you are blocked|youre blocked|blocked by network security/i.test(text) || /^blocked$/i.test(title ?? "")) {
4507
+ return "provider_blocked_request";
4508
+ }
4509
+ return void 0;
4510
+ }
4511
+ function htmlTitle(html) {
4512
+ return normalizeText(html.match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1]);
4513
+ }
4514
+ function normalizeText(value) {
4515
+ return value?.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ").trim() ?? "";
4382
4516
  }
4383
4517
  function redditRetryInfo(attempts) {
4384
4518
  const blockedAttempts = attempts.filter((attempt) => attempt.blocked || attempt.statusCode === 429 || attempt.statusCode === 403);
@@ -4412,7 +4546,7 @@ function mergeFallbackAttempts2(existing, incoming) {
4412
4546
  }
4413
4547
  const seen = /* @__PURE__ */ new Set();
4414
4548
  return attempts.map((value) => {
4415
- const { page: _page, ...attempt } = value;
4549
+ const { page: _page, blockReason: _blockReason, ...attempt } = value;
4416
4550
  return attempt;
4417
4551
  }).filter((attempt) => {
4418
4552
  const key = `${attempt.method}:${attempt.url ?? ""}:${attempt.statusCode ?? ""}:${attempt.error ?? ""}`;