metanova 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -118,6 +118,7 @@ interface ExtractionDiagnostics {
118
118
  };
119
119
  errors?: string[];
120
120
  retryInfo?: ExtractionRetryInfo;
121
+ providerDiagnostics?: ProviderDiagnostics;
121
122
  selectedImageReason?: string;
122
123
  confidenceBreakdown?: ConfidenceBreakdown;
123
124
  fetchDurationMs?: number;
@@ -139,6 +140,13 @@ interface ExtractionRetryInfo {
139
140
  retryAfterMs?: number;
140
141
  attempts?: number;
141
142
  }
143
+ interface ProviderDiagnostics {
144
+ platform: string;
145
+ blocked: boolean;
146
+ statusCode?: number;
147
+ reason?: "provider_verification_required" | "provider_blocked_request" | string;
148
+ suggestedAction?: "retry_on_different_host_or_use_supported_proxy" | string;
149
+ }
142
150
  interface ConfidenceBreakdown {
143
151
  title: number;
144
152
  description: number;
@@ -521,4 +529,4 @@ interface MetaNovaRegistry {
521
529
  declare function registerGlobalPlugin(plugin: MetaNovaPlugin): void;
522
530
  declare function createRegistry(options?: ParseMetadataOptions): MetaNovaRegistry;
523
531
 
524
- export { type AdapterContext, type AdapterExtractionResult, type AdapterRawData, type ApplicationMetadata, type ArticleMetadata, type CompletenessInput, type ConfidenceBreakdown, type ConfidenceEngineInput, type CustomExtractor, DEFAULT_ACCEPT, DEFAULT_ACCEPT_ENCODING, DEFAULT_ACCEPT_LANGUAGE, DEFAULT_BROWSER_USER_AGENT, type EmbeddedDataItem, type EmbeddedDataMetadata, type Entity, type ExtractionDiagnostics, type ExtractionFallbackAttempt, type ExtractionRetryInfo, type ExtractorContext, type FetchMetadataOptions, type FetchedPage, type HtmlMetadata, type ImageScorer, type ImageScoringContext, type JsonLdMetadata, type JsonLdNode, type MediaAsset, type MediaDiscoveryResult, type MediaKind, MetaNova, type MetaNovaCache, type MetaNovaCacheEntry, type MetaNovaPlugin, type MetaNovaPluginApi, type MetaNovaRegistry, type MetadataSource, type MetadataSourceAttribution, type MetadataType, type OEmbedData, type OEmbedLink, type OEmbedMetadata, type OpenGraphMetadata, type ParseMetadataOptions, type PlaylistMetadata, type PlaylistVideo, type PluginExtractionResult, type PreviewCard, type ProductMetadata, type RawMetadataSources, type RedirectEntry, type RedirectResolution, type ReliabilityInput, SecurityError, type SiteAdapter, type TwitterMetadata, type UnifiedMetadata, type VideoMetadata, addWarning, assertSafeRequestUrl, behanceAdapter, calculateCompleteness, calculateConfidence, calculateConfidenceBreakdown, calculateReliability, createDiagnostics, createPreviewCard, createRegistry, MetaNova as default, defaultAdapters, detectShortUrl, discoverMedia, extractAudio, extractEmbeddedData, extractHtmlMetadata, extractImages, extractJsonLd, extractOEmbed, extractOpenGraph, extractTwitterCards, extractVideos, facebookAdapter, fetchMetadata, fetchPage, instagramAdapter, normalizeMetadata, normalizeUrl, parseMetadata, parseMetadataAsync, pinterestAdapter, redditAdapter, registerGlobalPlugin, resolveCanonicalUrl, resolveRedirects, resolveUrl, scoreImages, tiktokAdapter, twitterAdapter, validateUrl, youtubeAdapter };
532
+ export { type AdapterContext, type AdapterExtractionResult, type AdapterRawData, type ApplicationMetadata, type ArticleMetadata, type CompletenessInput, type ConfidenceBreakdown, type ConfidenceEngineInput, type CustomExtractor, DEFAULT_ACCEPT, DEFAULT_ACCEPT_ENCODING, DEFAULT_ACCEPT_LANGUAGE, DEFAULT_BROWSER_USER_AGENT, type EmbeddedDataItem, type EmbeddedDataMetadata, type Entity, type ExtractionDiagnostics, type ExtractionFallbackAttempt, type ExtractionRetryInfo, type ExtractorContext, type FetchMetadataOptions, type FetchedPage, type HtmlMetadata, type ImageScorer, type ImageScoringContext, type JsonLdMetadata, type JsonLdNode, type MediaAsset, type MediaDiscoveryResult, type MediaKind, MetaNova, type MetaNovaCache, type MetaNovaCacheEntry, type MetaNovaPlugin, type MetaNovaPluginApi, type MetaNovaRegistry, type MetadataSource, type MetadataSourceAttribution, type MetadataType, type OEmbedData, type OEmbedLink, type OEmbedMetadata, type OpenGraphMetadata, type ParseMetadataOptions, type PlaylistMetadata, type PlaylistVideo, type PluginExtractionResult, type PreviewCard, type ProductMetadata, type ProviderDiagnostics, type RawMetadataSources, type RedirectEntry, type RedirectResolution, type ReliabilityInput, SecurityError, type SiteAdapter, type TwitterMetadata, type UnifiedMetadata, type VideoMetadata, addWarning, assertSafeRequestUrl, behanceAdapter, calculateCompleteness, calculateConfidence, calculateConfidenceBreakdown, calculateReliability, createDiagnostics, createPreviewCard, createRegistry, MetaNova as default, defaultAdapters, detectShortUrl, discoverMedia, extractAudio, extractEmbeddedData, extractHtmlMetadata, extractImages, extractJsonLd, extractOEmbed, extractOpenGraph, extractTwitterCards, extractVideos, facebookAdapter, fetchMetadata, fetchPage, instagramAdapter, normalizeMetadata, normalizeUrl, parseMetadata, parseMetadataAsync, pinterestAdapter, redditAdapter, registerGlobalPlugin, resolveCanonicalUrl, resolveRedirects, resolveUrl, scoreImages, tiktokAdapter, twitterAdapter, validateUrl, youtubeAdapter };
package/dist/index.d.ts CHANGED
@@ -118,6 +118,7 @@ interface ExtractionDiagnostics {
118
118
  };
119
119
  errors?: string[];
120
120
  retryInfo?: ExtractionRetryInfo;
121
+ providerDiagnostics?: ProviderDiagnostics;
121
122
  selectedImageReason?: string;
122
123
  confidenceBreakdown?: ConfidenceBreakdown;
123
124
  fetchDurationMs?: number;
@@ -139,6 +140,13 @@ interface ExtractionRetryInfo {
139
140
  retryAfterMs?: number;
140
141
  attempts?: number;
141
142
  }
143
+ interface ProviderDiagnostics {
144
+ platform: string;
145
+ blocked: boolean;
146
+ statusCode?: number;
147
+ reason?: "provider_verification_required" | "provider_blocked_request" | string;
148
+ suggestedAction?: "retry_on_different_host_or_use_supported_proxy" | string;
149
+ }
142
150
  interface ConfidenceBreakdown {
143
151
  title: number;
144
152
  description: number;
@@ -521,4 +529,4 @@ interface MetaNovaRegistry {
521
529
  declare function registerGlobalPlugin(plugin: MetaNovaPlugin): void;
522
530
  declare function createRegistry(options?: ParseMetadataOptions): MetaNovaRegistry;
523
531
 
524
- export { type AdapterContext, type AdapterExtractionResult, type AdapterRawData, type ApplicationMetadata, type ArticleMetadata, type CompletenessInput, type ConfidenceBreakdown, type ConfidenceEngineInput, type CustomExtractor, DEFAULT_ACCEPT, DEFAULT_ACCEPT_ENCODING, DEFAULT_ACCEPT_LANGUAGE, DEFAULT_BROWSER_USER_AGENT, type EmbeddedDataItem, type EmbeddedDataMetadata, type Entity, type ExtractionDiagnostics, type ExtractionFallbackAttempt, type ExtractionRetryInfo, type ExtractorContext, type FetchMetadataOptions, type FetchedPage, type HtmlMetadata, type ImageScorer, type ImageScoringContext, type JsonLdMetadata, type JsonLdNode, type MediaAsset, type MediaDiscoveryResult, type MediaKind, MetaNova, type MetaNovaCache, type MetaNovaCacheEntry, type MetaNovaPlugin, type MetaNovaPluginApi, type MetaNovaRegistry, type MetadataSource, type MetadataSourceAttribution, type MetadataType, type OEmbedData, type OEmbedLink, type OEmbedMetadata, type OpenGraphMetadata, type ParseMetadataOptions, type PlaylistMetadata, type PlaylistVideo, type PluginExtractionResult, type PreviewCard, type ProductMetadata, type RawMetadataSources, type RedirectEntry, type RedirectResolution, type ReliabilityInput, SecurityError, type SiteAdapter, type TwitterMetadata, type UnifiedMetadata, type VideoMetadata, addWarning, assertSafeRequestUrl, behanceAdapter, calculateCompleteness, calculateConfidence, calculateConfidenceBreakdown, calculateReliability, createDiagnostics, createPreviewCard, createRegistry, MetaNova as default, defaultAdapters, detectShortUrl, discoverMedia, extractAudio, extractEmbeddedData, extractHtmlMetadata, extractImages, extractJsonLd, extractOEmbed, extractOpenGraph, extractTwitterCards, extractVideos, facebookAdapter, fetchMetadata, fetchPage, instagramAdapter, normalizeMetadata, normalizeUrl, parseMetadata, parseMetadataAsync, pinterestAdapter, redditAdapter, registerGlobalPlugin, resolveCanonicalUrl, resolveRedirects, resolveUrl, scoreImages, tiktokAdapter, twitterAdapter, validateUrl, youtubeAdapter };
532
+ export { type AdapterContext, type AdapterExtractionResult, type AdapterRawData, type ApplicationMetadata, type ArticleMetadata, type CompletenessInput, type ConfidenceBreakdown, type ConfidenceEngineInput, type CustomExtractor, DEFAULT_ACCEPT, DEFAULT_ACCEPT_ENCODING, DEFAULT_ACCEPT_LANGUAGE, DEFAULT_BROWSER_USER_AGENT, type EmbeddedDataItem, type EmbeddedDataMetadata, type Entity, type ExtractionDiagnostics, type ExtractionFallbackAttempt, type ExtractionRetryInfo, type ExtractorContext, type FetchMetadataOptions, type FetchedPage, type HtmlMetadata, type ImageScorer, type ImageScoringContext, type JsonLdMetadata, type JsonLdNode, type MediaAsset, type MediaDiscoveryResult, type MediaKind, MetaNova, type MetaNovaCache, type MetaNovaCacheEntry, type MetaNovaPlugin, type MetaNovaPluginApi, type MetaNovaRegistry, type MetadataSource, type MetadataSourceAttribution, type MetadataType, type OEmbedData, type OEmbedLink, type OEmbedMetadata, type OpenGraphMetadata, type ParseMetadataOptions, type PlaylistMetadata, type PlaylistVideo, type PluginExtractionResult, type PreviewCard, type ProductMetadata, type ProviderDiagnostics, type RawMetadataSources, type RedirectEntry, type RedirectResolution, type ReliabilityInput, SecurityError, type SiteAdapter, type TwitterMetadata, type UnifiedMetadata, type VideoMetadata, addWarning, assertSafeRequestUrl, behanceAdapter, calculateCompleteness, calculateConfidence, calculateConfidenceBreakdown, calculateReliability, createDiagnostics, createPreviewCard, createRegistry, MetaNova as default, defaultAdapters, detectShortUrl, discoverMedia, extractAudio, extractEmbeddedData, extractHtmlMetadata, extractImages, extractJsonLd, extractOEmbed, extractOpenGraph, extractTwitterCards, extractVideos, facebookAdapter, fetchMetadata, fetchPage, instagramAdapter, normalizeMetadata, normalizeUrl, parseMetadata, parseMetadataAsync, pinterestAdapter, redditAdapter, registerGlobalPlugin, resolveCanonicalUrl, resolveRedirects, resolveUrl, scoreImages, tiktokAdapter, twitterAdapter, validateUrl, youtubeAdapter };
package/dist/index.js CHANGED
@@ -2814,8 +2814,8 @@ var redditAdapter = {
2814
2814
  type: reddit.isPost ? "social_post" : "website",
2815
2815
  siteName: "Reddit",
2816
2816
  canonicalUrl: context.raw.openGraph.url ?? context.raw.html.canonicalUrl,
2817
- title: cleanSocialTitle(titleSelection.value),
2818
- description: descriptionSelection.value,
2817
+ title: cleanRedditTitle(titleSelection.value),
2818
+ description: cleanRedditDescription(descriptionSelection.value),
2819
2819
  images: markAdapterMedia(mediaFromContext(context).images, "redditAdapter"),
2820
2820
  videos: markAdapterMedia(mediaFromContext(context).videos, "redditAdapter"),
2821
2821
  author: username ? { name: username } : entityFromContext(context, ["author", "submitter", "user"]),
@@ -3489,6 +3489,20 @@ function parseRedditUrl(url) {
3489
3489
  function cleanSocialTitle(title) {
3490
3490
  return title?.replace(/\s*:\s*r\/[A-Za-z0-9_]+$/i, "").trim();
3491
3491
  }
3492
+ function cleanRedditTitle(title) {
3493
+ const cleaned = cleanSocialTitle(title);
3494
+ if (!cleaned || /reddit\s*-\s*please wait for verification|please wait for verification|whoa there, pardner/i.test(cleaned)) {
3495
+ return void 0;
3496
+ }
3497
+ return cleaned;
3498
+ }
3499
+ function cleanRedditDescription(description) {
3500
+ const cleaned = description?.replace(/\s+/g, " ").trim();
3501
+ if (!cleaned || /please wait for verification|whoa there, pardner|request has been blocked/i.test(cleaned)) {
3502
+ return void 0;
3503
+ }
3504
+ return cleaned;
3505
+ }
3492
3506
  function hostMatches(url, domains) {
3493
3507
  const host = url.hostname.toLowerCase().replace(/^www\./, "");
3494
3508
  return domains.some((domain) => host === domain || host.endsWith(`.${domain}`));
@@ -3878,12 +3892,17 @@ function ascii(bytes, offset, length) {
3878
3892
  }
3879
3893
 
3880
3894
  // src/fetchMetadata.ts
3895
+ var REDDIT_BLOCKED_METADATA_WARNING = "Reddit returned a verification/block page; metadata is incomplete.";
3896
+ var PROVIDER_BLOCKED_SUGGESTED_ACTION = "retry_on_different_host_or_use_supported_proxy";
3881
3897
  async function fetchMetadata(url, options = {}) {
3882
3898
  const startedAt = Date.now();
3883
3899
  try {
3884
3900
  const requestedUrl = normalizeUrl(url);
3885
3901
  const fetchResult = await fetchPageWithStrategies(requestedUrl, options);
3886
3902
  const page = fetchResult.page;
3903
+ if (fetchResult.providerDiagnostics?.blocked) {
3904
+ return createBlockedProviderMetadata(requestedUrl, fetchResult, Date.now() - startedAt);
3905
+ }
3887
3906
  const directMedia = createDirectMediaMetadata(page, requestedUrl, Date.now() - startedAt);
3888
3907
  if (directMedia) {
3889
3908
  return directMedia;
@@ -3952,6 +3971,62 @@ async function fetchMetadata(url, options = {}) {
3952
3971
  };
3953
3972
  }
3954
3973
  }
3974
+ function createBlockedProviderMetadata(requestedUrl, fetchResult, fetchDurationMs) {
3975
+ const page = fetchResult.page;
3976
+ const providerDiagnostics = fetchResult.providerDiagnostics;
3977
+ const trace = uniqueStrings3([
3978
+ ...page.isShortUrl ? [`detected short URL provider: ${page.shortUrlProvider ?? "unknown"}`] : [],
3979
+ ...page.redirects.length > 0 ? [`resolved ${page.redirects.length} redirect${page.redirects.length === 1 ? "" : "s"}`] : [],
3980
+ ...fetchResult.trace,
3981
+ "detected blocked provider response"
3982
+ ]);
3983
+ const warnings = uniqueStrings3([
3984
+ ...fetchResult.warnings,
3985
+ REDDIT_BLOCKED_METADATA_WARNING,
3986
+ ...page.statusCode < 200 || page.statusCode >= 300 ? [`Fetch completed with non-success status code ${page.statusCode}.`] : []
3987
+ ]);
3988
+ return {
3989
+ ok: false,
3990
+ url: requestedUrl,
3991
+ finalUrl: page.finalUrl,
3992
+ type: "unknown",
3993
+ siteName: providerDiagnostics?.platform === "reddit" ? "Reddit" : void 0,
3994
+ confidence: 0,
3995
+ completeness: 0,
3996
+ reliability: 0,
3997
+ images: [],
3998
+ videos: [],
3999
+ audio: [],
4000
+ favicons: [],
4001
+ trace,
4002
+ diagnostics: {
4003
+ originalUrl: requestedUrl,
4004
+ finalUrl: page.finalUrl,
4005
+ isShortUrl: page.isShortUrl,
4006
+ shortUrlProvider: page.shortUrlProvider,
4007
+ statusCode: page.statusCode,
4008
+ contentType: page.contentType,
4009
+ redirects: page.redirects,
4010
+ sourcesUsed: [],
4011
+ warnings,
4012
+ fallbacksAttempted: mergeFallbackAttempts2(void 0, fetchResult.fallbacksAttempted),
4013
+ trace,
4014
+ sourcePriority: fetchResult.sourcePriority,
4015
+ extractionMethod: fetchResult.extractionMethod,
4016
+ retryInfo: fetchResult.retryInfo,
4017
+ providerDiagnostics,
4018
+ confidenceBreakdown: {
4019
+ title: 0,
4020
+ description: 0,
4021
+ image: 0,
4022
+ structuredData: 0,
4023
+ adapter: 0
4024
+ },
4025
+ fetchDurationMs,
4026
+ extractedAt: (/* @__PURE__ */ new Date()).toISOString()
4027
+ }
4028
+ };
4029
+ }
3955
4030
  async function fetchPageWithStrategies(requestedUrl, options) {
3956
4031
  if (isRedditUrl(requestedUrl)) {
3957
4032
  return fetchRedditPageWithStrategy(requestedUrl, options);
@@ -3976,7 +4051,7 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
3976
4051
  });
3977
4052
  attempts.push(attempt);
3978
4053
  lastError = attempt.error;
3979
- if (attempt.page && attempt.ok && !attempt.blocked) {
4054
+ if (attempt.page && attempt.ok) {
3980
4055
  const redditPost = parseRedditJsonPayload(attempt.page.html);
3981
4056
  if (redditPost?.title) {
3982
4057
  return {
@@ -3999,7 +4074,7 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
3999
4074
  const attempt = await attemptFetch("oldReddit", oldRedditUrl, options);
4000
4075
  attempts.push(attempt);
4001
4076
  lastError = attempt.error;
4002
- if (attempt.page && attempt.ok && !attempt.blocked) {
4077
+ if (attempt.page && attempt.ok) {
4003
4078
  return {
4004
4079
  page: attempt.page,
4005
4080
  fallbacksAttempted: attempts,
@@ -4017,10 +4092,7 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
4017
4092
  const htmlAttempt = await attemptFetch("redditHtmlFallback", requestedUrl, options);
4018
4093
  attempts.push(htmlAttempt);
4019
4094
  lastError = htmlAttempt.error;
4020
- if (htmlAttempt.page) {
4021
- if (htmlAttempt.blocked) {
4022
- warnings.push("Reddit HTML fallback appears to have been blocked; metadata may be incomplete.");
4023
- }
4095
+ if (htmlAttempt.page && htmlAttempt.ok) {
4024
4096
  return {
4025
4097
  page: htmlAttempt.page,
4026
4098
  fallbacksAttempted: attempts,
@@ -4031,19 +4103,37 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
4031
4103
  retryInfo: redditRetryInfo(attempts)
4032
4104
  };
4033
4105
  }
4106
+ if (htmlAttempt.blocked) {
4107
+ warnings.push("Reddit HTML fallback appears to have been blocked; metadata may be incomplete.");
4108
+ }
4109
+ const providerDiagnostics = redditProviderDiagnosticsFromAttempts(attempts);
4110
+ if (providerDiagnostics) {
4111
+ return {
4112
+ page: synthesizeRedditBlockedPage(requestedUrl, attempts, providerDiagnostics),
4113
+ fallbacksAttempted: attempts,
4114
+ warnings: uniqueStrings3([...warnings, REDDIT_BLOCKED_METADATA_WARNING]),
4115
+ trace: ["Reddit provider blocked metadata extraction"],
4116
+ sourcePriority,
4117
+ extractionMethod: "reddit:blockedProvider",
4118
+ retryInfo: redditRetryInfo(attempts),
4119
+ providerDiagnostics
4120
+ };
4121
+ }
4034
4122
  throw lastError ?? new Error("All Reddit extraction fetch attempts failed.");
4035
4123
  }
4036
4124
  async function attemptFetch(method, url, options) {
4037
4125
  try {
4038
4126
  const page = await fetchPage(url, options);
4039
4127
  const retryAfter = page.headers["retry-after"];
4040
- const blocked = isRedditBlocked(page);
4128
+ const blockReason = redditBlockReason(page);
4129
+ const blocked = Boolean(blockReason);
4041
4130
  return {
4042
4131
  method,
4043
4132
  url,
4044
4133
  ok: page.statusCode >= 200 && page.statusCode < 300 && !blocked,
4045
4134
  statusCode: page.statusCode,
4046
4135
  blocked,
4136
+ blockReason,
4047
4137
  retryAfter,
4048
4138
  page
4049
4139
  };
@@ -4294,8 +4384,52 @@ function synthesizeRedditJsonPage(jsonPage, requestedUrl, post) {
4294
4384
  statusCode: jsonPage.statusCode
4295
4385
  };
4296
4386
  }
4297
- function isRedditBlocked(page) {
4298
- return page.statusCode === 403 || page.statusCode === 429 || /please wait for verification|whoa there, pardner|blocked|forbidden|too many requests|request has been blocked/i.test(page.html);
4387
+ function redditProviderDiagnosticsFromAttempts(attempts) {
4388
+ const blockedAttempts = attempts.filter((attempt) => attempt.blocked);
4389
+ if (blockedAttempts.length === 0) {
4390
+ return void 0;
4391
+ }
4392
+ const selectedAttempt = blockedAttempts.find((attempt) => attempt.blockReason === "provider_verification_required") ?? blockedAttempts.at(-1);
4393
+ return {
4394
+ platform: "reddit",
4395
+ blocked: true,
4396
+ statusCode: selectedAttempt?.statusCode,
4397
+ reason: selectedAttempt?.blockReason ?? "provider_blocked_request",
4398
+ suggestedAction: PROVIDER_BLOCKED_SUGGESTED_ACTION
4399
+ };
4400
+ }
4401
+ function synthesizeRedditBlockedPage(requestedUrl, attempts, providerDiagnostics) {
4402
+ const selectedPage = attempts.find((attempt) => attempt.blockReason === providerDiagnostics.reason)?.page ?? attempts.slice().reverse().find((attempt) => attempt.page)?.page;
4403
+ return {
4404
+ url: requestedUrl,
4405
+ originalUrl: requestedUrl,
4406
+ finalUrl: requestedUrl,
4407
+ isShortUrl: selectedPage?.isShortUrl ?? false,
4408
+ shortUrlProvider: selectedPage?.shortUrlProvider,
4409
+ html: "",
4410
+ bytes: new Uint8Array(),
4411
+ statusCode: providerDiagnostics.statusCode ?? selectedPage?.statusCode ?? 403,
4412
+ contentType: selectedPage?.contentType,
4413
+ redirects: selectedPage?.redirects ?? [],
4414
+ headers: selectedPage?.headers ?? {}
4415
+ };
4416
+ }
4417
+ function redditBlockReason(page) {
4418
+ const title = htmlTitle(page.html);
4419
+ const text = normalizeText(`${title ?? ""} ${page.html}`);
4420
+ if (/reddit\s*-\s*please wait for verification/i.test(title ?? "") || /please wait for verification|verification required|verify you are human/i.test(text)) {
4421
+ return "provider_verification_required";
4422
+ }
4423
+ if (page.statusCode === 403 || page.statusCode === 429 || /whoa there, pardner|request has been blocked|too many requests|forbidden|you're blocked|you are blocked|youre blocked|blocked by network security/i.test(text) || /^blocked$/i.test(title ?? "")) {
4424
+ return "provider_blocked_request";
4425
+ }
4426
+ return void 0;
4427
+ }
4428
+ function htmlTitle(html) {
4429
+ return normalizeText(html.match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1]);
4430
+ }
4431
+ function normalizeText(value) {
4432
+ return value?.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ").trim() ?? "";
4299
4433
  }
4300
4434
  function redditRetryInfo(attempts) {
4301
4435
  const blockedAttempts = attempts.filter((attempt) => attempt.blocked || attempt.statusCode === 429 || attempt.statusCode === 403);
@@ -4329,7 +4463,7 @@ function mergeFallbackAttempts2(existing, incoming) {
4329
4463
  }
4330
4464
  const seen = /* @__PURE__ */ new Set();
4331
4465
  return attempts.map((value) => {
4332
- const { page: _page, ...attempt } = value;
4466
+ const { page: _page, blockReason: _blockReason, ...attempt } = value;
4333
4467
  return attempt;
4334
4468
  }).filter((attempt) => {
4335
4469
  const key = `${attempt.method}:${attempt.url ?? ""}:${attempt.statusCode ?? ""}:${attempt.error ?? ""}`;