metanova 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/dist/index.cjs +146 -12
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +9 -1
- package/dist/index.d.ts +9 -1
- package/dist/index.js +146 -12
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -118,6 +118,7 @@ interface ExtractionDiagnostics {
|
|
|
118
118
|
};
|
|
119
119
|
errors?: string[];
|
|
120
120
|
retryInfo?: ExtractionRetryInfo;
|
|
121
|
+
providerDiagnostics?: ProviderDiagnostics;
|
|
121
122
|
selectedImageReason?: string;
|
|
122
123
|
confidenceBreakdown?: ConfidenceBreakdown;
|
|
123
124
|
fetchDurationMs?: number;
|
|
@@ -139,6 +140,13 @@ interface ExtractionRetryInfo {
|
|
|
139
140
|
retryAfterMs?: number;
|
|
140
141
|
attempts?: number;
|
|
141
142
|
}
|
|
143
|
+
interface ProviderDiagnostics {
|
|
144
|
+
platform: string;
|
|
145
|
+
blocked: boolean;
|
|
146
|
+
statusCode?: number;
|
|
147
|
+
reason?: "provider_verification_required" | "provider_blocked_request" | string;
|
|
148
|
+
suggestedAction?: "retry_on_different_host_or_use_supported_proxy" | string;
|
|
149
|
+
}
|
|
142
150
|
interface ConfidenceBreakdown {
|
|
143
151
|
title: number;
|
|
144
152
|
description: number;
|
|
@@ -521,4 +529,4 @@ interface MetaNovaRegistry {
|
|
|
521
529
|
declare function registerGlobalPlugin(plugin: MetaNovaPlugin): void;
|
|
522
530
|
declare function createRegistry(options?: ParseMetadataOptions): MetaNovaRegistry;
|
|
523
531
|
|
|
524
|
-
export { type AdapterContext, type AdapterExtractionResult, type AdapterRawData, type ApplicationMetadata, type ArticleMetadata, type CompletenessInput, type ConfidenceBreakdown, type ConfidenceEngineInput, type CustomExtractor, DEFAULT_ACCEPT, DEFAULT_ACCEPT_ENCODING, DEFAULT_ACCEPT_LANGUAGE, DEFAULT_BROWSER_USER_AGENT, type EmbeddedDataItem, type EmbeddedDataMetadata, type Entity, type ExtractionDiagnostics, type ExtractionFallbackAttempt, type ExtractionRetryInfo, type ExtractorContext, type FetchMetadataOptions, type FetchedPage, type HtmlMetadata, type ImageScorer, type ImageScoringContext, type JsonLdMetadata, type JsonLdNode, type MediaAsset, type MediaDiscoveryResult, type MediaKind, MetaNova, type MetaNovaCache, type MetaNovaCacheEntry, type MetaNovaPlugin, type MetaNovaPluginApi, type MetaNovaRegistry, type MetadataSource, type MetadataSourceAttribution, type MetadataType, type OEmbedData, type OEmbedLink, type OEmbedMetadata, type OpenGraphMetadata, type ParseMetadataOptions, type PlaylistMetadata, type PlaylistVideo, type PluginExtractionResult, type PreviewCard, type ProductMetadata, type RawMetadataSources, type RedirectEntry, type RedirectResolution, type ReliabilityInput, SecurityError, type SiteAdapter, type TwitterMetadata, type UnifiedMetadata, type VideoMetadata, addWarning, assertSafeRequestUrl, behanceAdapter, calculateCompleteness, calculateConfidence, calculateConfidenceBreakdown, calculateReliability, createDiagnostics, createPreviewCard, createRegistry, MetaNova as default, defaultAdapters, detectShortUrl, discoverMedia, extractAudio, extractEmbeddedData, extractHtmlMetadata, extractImages, extractJsonLd, extractOEmbed, extractOpenGraph, extractTwitterCards, extractVideos, facebookAdapter, fetchMetadata, fetchPage, instagramAdapter, normalizeMetadata, normalizeUrl, parseMetadata, parseMetadataAsync, pinterestAdapter, redditAdapter, registerGlobalPlugin, resolveCanonicalUrl, resolveRedirects, resolveUrl, scoreImages, tiktokAdapter, twitterAdapter, validateUrl, youtubeAdapter };
|
|
532
|
+
export { type AdapterContext, type AdapterExtractionResult, type AdapterRawData, type ApplicationMetadata, type ArticleMetadata, type CompletenessInput, type ConfidenceBreakdown, type ConfidenceEngineInput, type CustomExtractor, DEFAULT_ACCEPT, DEFAULT_ACCEPT_ENCODING, DEFAULT_ACCEPT_LANGUAGE, DEFAULT_BROWSER_USER_AGENT, type EmbeddedDataItem, type EmbeddedDataMetadata, type Entity, type ExtractionDiagnostics, type ExtractionFallbackAttempt, type ExtractionRetryInfo, type ExtractorContext, type FetchMetadataOptions, type FetchedPage, type HtmlMetadata, type ImageScorer, type ImageScoringContext, type JsonLdMetadata, type JsonLdNode, type MediaAsset, type MediaDiscoveryResult, type MediaKind, MetaNova, type MetaNovaCache, type MetaNovaCacheEntry, type MetaNovaPlugin, type MetaNovaPluginApi, type MetaNovaRegistry, type MetadataSource, type MetadataSourceAttribution, type MetadataType, type OEmbedData, type OEmbedLink, type OEmbedMetadata, type OpenGraphMetadata, type ParseMetadataOptions, type PlaylistMetadata, type PlaylistVideo, type PluginExtractionResult, type PreviewCard, type ProductMetadata, type ProviderDiagnostics, type RawMetadataSources, type RedirectEntry, type RedirectResolution, type ReliabilityInput, SecurityError, type SiteAdapter, type TwitterMetadata, type UnifiedMetadata, type VideoMetadata, addWarning, assertSafeRequestUrl, behanceAdapter, calculateCompleteness, calculateConfidence, calculateConfidenceBreakdown, calculateReliability, createDiagnostics, createPreviewCard, createRegistry, MetaNova as default, defaultAdapters, detectShortUrl, discoverMedia, extractAudio, extractEmbeddedData, extractHtmlMetadata, extractImages, extractJsonLd, extractOEmbed, extractOpenGraph, extractTwitterCards, extractVideos, facebookAdapter, fetchMetadata, fetchPage, instagramAdapter, normalizeMetadata, normalizeUrl, parseMetadata, parseMetadataAsync, pinterestAdapter, redditAdapter, registerGlobalPlugin, resolveCanonicalUrl, resolveRedirects, resolveUrl, scoreImages, tiktokAdapter, twitterAdapter, validateUrl, youtubeAdapter };
|
package/dist/index.d.ts
CHANGED
|
@@ -118,6 +118,7 @@ interface ExtractionDiagnostics {
|
|
|
118
118
|
};
|
|
119
119
|
errors?: string[];
|
|
120
120
|
retryInfo?: ExtractionRetryInfo;
|
|
121
|
+
providerDiagnostics?: ProviderDiagnostics;
|
|
121
122
|
selectedImageReason?: string;
|
|
122
123
|
confidenceBreakdown?: ConfidenceBreakdown;
|
|
123
124
|
fetchDurationMs?: number;
|
|
@@ -139,6 +140,13 @@ interface ExtractionRetryInfo {
|
|
|
139
140
|
retryAfterMs?: number;
|
|
140
141
|
attempts?: number;
|
|
141
142
|
}
|
|
143
|
+
interface ProviderDiagnostics {
|
|
144
|
+
platform: string;
|
|
145
|
+
blocked: boolean;
|
|
146
|
+
statusCode?: number;
|
|
147
|
+
reason?: "provider_verification_required" | "provider_blocked_request" | string;
|
|
148
|
+
suggestedAction?: "retry_on_different_host_or_use_supported_proxy" | string;
|
|
149
|
+
}
|
|
142
150
|
interface ConfidenceBreakdown {
|
|
143
151
|
title: number;
|
|
144
152
|
description: number;
|
|
@@ -521,4 +529,4 @@ interface MetaNovaRegistry {
|
|
|
521
529
|
declare function registerGlobalPlugin(plugin: MetaNovaPlugin): void;
|
|
522
530
|
declare function createRegistry(options?: ParseMetadataOptions): MetaNovaRegistry;
|
|
523
531
|
|
|
524
|
-
export { type AdapterContext, type AdapterExtractionResult, type AdapterRawData, type ApplicationMetadata, type ArticleMetadata, type CompletenessInput, type ConfidenceBreakdown, type ConfidenceEngineInput, type CustomExtractor, DEFAULT_ACCEPT, DEFAULT_ACCEPT_ENCODING, DEFAULT_ACCEPT_LANGUAGE, DEFAULT_BROWSER_USER_AGENT, type EmbeddedDataItem, type EmbeddedDataMetadata, type Entity, type ExtractionDiagnostics, type ExtractionFallbackAttempt, type ExtractionRetryInfo, type ExtractorContext, type FetchMetadataOptions, type FetchedPage, type HtmlMetadata, type ImageScorer, type ImageScoringContext, type JsonLdMetadata, type JsonLdNode, type MediaAsset, type MediaDiscoveryResult, type MediaKind, MetaNova, type MetaNovaCache, type MetaNovaCacheEntry, type MetaNovaPlugin, type MetaNovaPluginApi, type MetaNovaRegistry, type MetadataSource, type MetadataSourceAttribution, type MetadataType, type OEmbedData, type OEmbedLink, type OEmbedMetadata, type OpenGraphMetadata, type ParseMetadataOptions, type PlaylistMetadata, type PlaylistVideo, type PluginExtractionResult, type PreviewCard, type ProductMetadata, type RawMetadataSources, type RedirectEntry, type RedirectResolution, type ReliabilityInput, SecurityError, type SiteAdapter, type TwitterMetadata, type UnifiedMetadata, type VideoMetadata, addWarning, assertSafeRequestUrl, behanceAdapter, calculateCompleteness, calculateConfidence, calculateConfidenceBreakdown, calculateReliability, createDiagnostics, createPreviewCard, createRegistry, MetaNova as default, defaultAdapters, detectShortUrl, discoverMedia, extractAudio, extractEmbeddedData, extractHtmlMetadata, extractImages, extractJsonLd, extractOEmbed, extractOpenGraph, extractTwitterCards, extractVideos, facebookAdapter, fetchMetadata, fetchPage, instagramAdapter, normalizeMetadata, normalizeUrl, parseMetadata, parseMetadataAsync, pinterestAdapter, redditAdapter, registerGlobalPlugin, resolveCanonicalUrl, resolveRedirects, resolveUrl, scoreImages, tiktokAdapter, twitterAdapter, validateUrl, youtubeAdapter };
|
|
532
|
+
export { type AdapterContext, type AdapterExtractionResult, type AdapterRawData, type ApplicationMetadata, type ArticleMetadata, type CompletenessInput, type ConfidenceBreakdown, type ConfidenceEngineInput, type CustomExtractor, DEFAULT_ACCEPT, DEFAULT_ACCEPT_ENCODING, DEFAULT_ACCEPT_LANGUAGE, DEFAULT_BROWSER_USER_AGENT, type EmbeddedDataItem, type EmbeddedDataMetadata, type Entity, type ExtractionDiagnostics, type ExtractionFallbackAttempt, type ExtractionRetryInfo, type ExtractorContext, type FetchMetadataOptions, type FetchedPage, type HtmlMetadata, type ImageScorer, type ImageScoringContext, type JsonLdMetadata, type JsonLdNode, type MediaAsset, type MediaDiscoveryResult, type MediaKind, MetaNova, type MetaNovaCache, type MetaNovaCacheEntry, type MetaNovaPlugin, type MetaNovaPluginApi, type MetaNovaRegistry, type MetadataSource, type MetadataSourceAttribution, type MetadataType, type OEmbedData, type OEmbedLink, type OEmbedMetadata, type OpenGraphMetadata, type ParseMetadataOptions, type PlaylistMetadata, type PlaylistVideo, type PluginExtractionResult, type PreviewCard, type ProductMetadata, type ProviderDiagnostics, type RawMetadataSources, type RedirectEntry, type RedirectResolution, type ReliabilityInput, SecurityError, type SiteAdapter, type TwitterMetadata, type UnifiedMetadata, type VideoMetadata, addWarning, assertSafeRequestUrl, behanceAdapter, calculateCompleteness, calculateConfidence, calculateConfidenceBreakdown, calculateReliability, createDiagnostics, createPreviewCard, createRegistry, MetaNova as default, defaultAdapters, detectShortUrl, discoverMedia, extractAudio, extractEmbeddedData, extractHtmlMetadata, extractImages, extractJsonLd, extractOEmbed, extractOpenGraph, extractTwitterCards, extractVideos, facebookAdapter, fetchMetadata, fetchPage, instagramAdapter, normalizeMetadata, normalizeUrl, parseMetadata, parseMetadataAsync, pinterestAdapter, redditAdapter, registerGlobalPlugin, resolveCanonicalUrl, resolveRedirects, resolveUrl, scoreImages, tiktokAdapter, twitterAdapter, validateUrl, youtubeAdapter };
|
package/dist/index.js
CHANGED
|
@@ -2814,8 +2814,8 @@ var redditAdapter = {
|
|
|
2814
2814
|
type: reddit.isPost ? "social_post" : "website",
|
|
2815
2815
|
siteName: "Reddit",
|
|
2816
2816
|
canonicalUrl: context.raw.openGraph.url ?? context.raw.html.canonicalUrl,
|
|
2817
|
-
title:
|
|
2818
|
-
description: descriptionSelection.value,
|
|
2817
|
+
title: cleanRedditTitle(titleSelection.value),
|
|
2818
|
+
description: cleanRedditDescription(descriptionSelection.value),
|
|
2819
2819
|
images: markAdapterMedia(mediaFromContext(context).images, "redditAdapter"),
|
|
2820
2820
|
videos: markAdapterMedia(mediaFromContext(context).videos, "redditAdapter"),
|
|
2821
2821
|
author: username ? { name: username } : entityFromContext(context, ["author", "submitter", "user"]),
|
|
@@ -3489,6 +3489,20 @@ function parseRedditUrl(url) {
|
|
|
3489
3489
|
function cleanSocialTitle(title) {
|
|
3490
3490
|
return title?.replace(/\s*:\s*r\/[A-Za-z0-9_]+$/i, "").trim();
|
|
3491
3491
|
}
|
|
3492
|
+
function cleanRedditTitle(title) {
|
|
3493
|
+
const cleaned = cleanSocialTitle(title);
|
|
3494
|
+
if (!cleaned || /reddit\s*-\s*please wait for verification|please wait for verification|whoa there, pardner/i.test(cleaned)) {
|
|
3495
|
+
return void 0;
|
|
3496
|
+
}
|
|
3497
|
+
return cleaned;
|
|
3498
|
+
}
|
|
3499
|
+
function cleanRedditDescription(description) {
|
|
3500
|
+
const cleaned = description?.replace(/\s+/g, " ").trim();
|
|
3501
|
+
if (!cleaned || /please wait for verification|whoa there, pardner|request has been blocked/i.test(cleaned)) {
|
|
3502
|
+
return void 0;
|
|
3503
|
+
}
|
|
3504
|
+
return cleaned;
|
|
3505
|
+
}
|
|
3492
3506
|
function hostMatches(url, domains) {
|
|
3493
3507
|
const host = url.hostname.toLowerCase().replace(/^www\./, "");
|
|
3494
3508
|
return domains.some((domain) => host === domain || host.endsWith(`.${domain}`));
|
|
@@ -3878,12 +3892,17 @@ function ascii(bytes, offset, length) {
|
|
|
3878
3892
|
}
|
|
3879
3893
|
|
|
3880
3894
|
// src/fetchMetadata.ts
|
|
3895
|
+
var REDDIT_BLOCKED_METADATA_WARNING = "Reddit returned a verification/block page; metadata is incomplete.";
|
|
3896
|
+
var PROVIDER_BLOCKED_SUGGESTED_ACTION = "retry_on_different_host_or_use_supported_proxy";
|
|
3881
3897
|
async function fetchMetadata(url, options = {}) {
|
|
3882
3898
|
const startedAt = Date.now();
|
|
3883
3899
|
try {
|
|
3884
3900
|
const requestedUrl = normalizeUrl(url);
|
|
3885
3901
|
const fetchResult = await fetchPageWithStrategies(requestedUrl, options);
|
|
3886
3902
|
const page = fetchResult.page;
|
|
3903
|
+
if (fetchResult.providerDiagnostics?.blocked) {
|
|
3904
|
+
return createBlockedProviderMetadata(requestedUrl, fetchResult, Date.now() - startedAt);
|
|
3905
|
+
}
|
|
3887
3906
|
const directMedia = createDirectMediaMetadata(page, requestedUrl, Date.now() - startedAt);
|
|
3888
3907
|
if (directMedia) {
|
|
3889
3908
|
return directMedia;
|
|
@@ -3952,6 +3971,62 @@ async function fetchMetadata(url, options = {}) {
|
|
|
3952
3971
|
};
|
|
3953
3972
|
}
|
|
3954
3973
|
}
|
|
3974
|
+
function createBlockedProviderMetadata(requestedUrl, fetchResult, fetchDurationMs) {
|
|
3975
|
+
const page = fetchResult.page;
|
|
3976
|
+
const providerDiagnostics = fetchResult.providerDiagnostics;
|
|
3977
|
+
const trace = uniqueStrings3([
|
|
3978
|
+
...page.isShortUrl ? [`detected short URL provider: ${page.shortUrlProvider ?? "unknown"}`] : [],
|
|
3979
|
+
...page.redirects.length > 0 ? [`resolved ${page.redirects.length} redirect${page.redirects.length === 1 ? "" : "s"}`] : [],
|
|
3980
|
+
...fetchResult.trace,
|
|
3981
|
+
"detected blocked provider response"
|
|
3982
|
+
]);
|
|
3983
|
+
const warnings = uniqueStrings3([
|
|
3984
|
+
...fetchResult.warnings,
|
|
3985
|
+
REDDIT_BLOCKED_METADATA_WARNING,
|
|
3986
|
+
...page.statusCode < 200 || page.statusCode >= 300 ? [`Fetch completed with non-success status code ${page.statusCode}.`] : []
|
|
3987
|
+
]);
|
|
3988
|
+
return {
|
|
3989
|
+
ok: false,
|
|
3990
|
+
url: requestedUrl,
|
|
3991
|
+
finalUrl: page.finalUrl,
|
|
3992
|
+
type: "unknown",
|
|
3993
|
+
siteName: providerDiagnostics?.platform === "reddit" ? "Reddit" : void 0,
|
|
3994
|
+
confidence: 0,
|
|
3995
|
+
completeness: 0,
|
|
3996
|
+
reliability: 0,
|
|
3997
|
+
images: [],
|
|
3998
|
+
videos: [],
|
|
3999
|
+
audio: [],
|
|
4000
|
+
favicons: [],
|
|
4001
|
+
trace,
|
|
4002
|
+
diagnostics: {
|
|
4003
|
+
originalUrl: requestedUrl,
|
|
4004
|
+
finalUrl: page.finalUrl,
|
|
4005
|
+
isShortUrl: page.isShortUrl,
|
|
4006
|
+
shortUrlProvider: page.shortUrlProvider,
|
|
4007
|
+
statusCode: page.statusCode,
|
|
4008
|
+
contentType: page.contentType,
|
|
4009
|
+
redirects: page.redirects,
|
|
4010
|
+
sourcesUsed: [],
|
|
4011
|
+
warnings,
|
|
4012
|
+
fallbacksAttempted: mergeFallbackAttempts2(void 0, fetchResult.fallbacksAttempted),
|
|
4013
|
+
trace,
|
|
4014
|
+
sourcePriority: fetchResult.sourcePriority,
|
|
4015
|
+
extractionMethod: fetchResult.extractionMethod,
|
|
4016
|
+
retryInfo: fetchResult.retryInfo,
|
|
4017
|
+
providerDiagnostics,
|
|
4018
|
+
confidenceBreakdown: {
|
|
4019
|
+
title: 0,
|
|
4020
|
+
description: 0,
|
|
4021
|
+
image: 0,
|
|
4022
|
+
structuredData: 0,
|
|
4023
|
+
adapter: 0
|
|
4024
|
+
},
|
|
4025
|
+
fetchDurationMs,
|
|
4026
|
+
extractedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
4027
|
+
}
|
|
4028
|
+
};
|
|
4029
|
+
}
|
|
3955
4030
|
async function fetchPageWithStrategies(requestedUrl, options) {
|
|
3956
4031
|
if (isRedditUrl(requestedUrl)) {
|
|
3957
4032
|
return fetchRedditPageWithStrategy(requestedUrl, options);
|
|
@@ -3976,7 +4051,7 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
|
|
|
3976
4051
|
});
|
|
3977
4052
|
attempts.push(attempt);
|
|
3978
4053
|
lastError = attempt.error;
|
|
3979
|
-
if (attempt.page && attempt.ok
|
|
4054
|
+
if (attempt.page && attempt.ok) {
|
|
3980
4055
|
const redditPost = parseRedditJsonPayload(attempt.page.html);
|
|
3981
4056
|
if (redditPost?.title) {
|
|
3982
4057
|
return {
|
|
@@ -3999,7 +4074,7 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
|
|
|
3999
4074
|
const attempt = await attemptFetch("oldReddit", oldRedditUrl, options);
|
|
4000
4075
|
attempts.push(attempt);
|
|
4001
4076
|
lastError = attempt.error;
|
|
4002
|
-
if (attempt.page && attempt.ok
|
|
4077
|
+
if (attempt.page && attempt.ok) {
|
|
4003
4078
|
return {
|
|
4004
4079
|
page: attempt.page,
|
|
4005
4080
|
fallbacksAttempted: attempts,
|
|
@@ -4017,10 +4092,7 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
|
|
|
4017
4092
|
const htmlAttempt = await attemptFetch("redditHtmlFallback", requestedUrl, options);
|
|
4018
4093
|
attempts.push(htmlAttempt);
|
|
4019
4094
|
lastError = htmlAttempt.error;
|
|
4020
|
-
if (htmlAttempt.page) {
|
|
4021
|
-
if (htmlAttempt.blocked) {
|
|
4022
|
-
warnings.push("Reddit HTML fallback appears to have been blocked; metadata may be incomplete.");
|
|
4023
|
-
}
|
|
4095
|
+
if (htmlAttempt.page && htmlAttempt.ok) {
|
|
4024
4096
|
return {
|
|
4025
4097
|
page: htmlAttempt.page,
|
|
4026
4098
|
fallbacksAttempted: attempts,
|
|
@@ -4031,19 +4103,37 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
|
|
|
4031
4103
|
retryInfo: redditRetryInfo(attempts)
|
|
4032
4104
|
};
|
|
4033
4105
|
}
|
|
4106
|
+
if (htmlAttempt.blocked) {
|
|
4107
|
+
warnings.push("Reddit HTML fallback appears to have been blocked; metadata may be incomplete.");
|
|
4108
|
+
}
|
|
4109
|
+
const providerDiagnostics = redditProviderDiagnosticsFromAttempts(attempts);
|
|
4110
|
+
if (providerDiagnostics) {
|
|
4111
|
+
return {
|
|
4112
|
+
page: synthesizeRedditBlockedPage(requestedUrl, attempts, providerDiagnostics),
|
|
4113
|
+
fallbacksAttempted: attempts,
|
|
4114
|
+
warnings: uniqueStrings3([...warnings, REDDIT_BLOCKED_METADATA_WARNING]),
|
|
4115
|
+
trace: ["Reddit provider blocked metadata extraction"],
|
|
4116
|
+
sourcePriority,
|
|
4117
|
+
extractionMethod: "reddit:blockedProvider",
|
|
4118
|
+
retryInfo: redditRetryInfo(attempts),
|
|
4119
|
+
providerDiagnostics
|
|
4120
|
+
};
|
|
4121
|
+
}
|
|
4034
4122
|
throw lastError ?? new Error("All Reddit extraction fetch attempts failed.");
|
|
4035
4123
|
}
|
|
4036
4124
|
async function attemptFetch(method, url, options) {
|
|
4037
4125
|
try {
|
|
4038
4126
|
const page = await fetchPage(url, options);
|
|
4039
4127
|
const retryAfter = page.headers["retry-after"];
|
|
4040
|
-
const
|
|
4128
|
+
const blockReason = redditBlockReason(page);
|
|
4129
|
+
const blocked = Boolean(blockReason);
|
|
4041
4130
|
return {
|
|
4042
4131
|
method,
|
|
4043
4132
|
url,
|
|
4044
4133
|
ok: page.statusCode >= 200 && page.statusCode < 300 && !blocked,
|
|
4045
4134
|
statusCode: page.statusCode,
|
|
4046
4135
|
blocked,
|
|
4136
|
+
blockReason,
|
|
4047
4137
|
retryAfter,
|
|
4048
4138
|
page
|
|
4049
4139
|
};
|
|
@@ -4294,8 +4384,52 @@ function synthesizeRedditJsonPage(jsonPage, requestedUrl, post) {
|
|
|
4294
4384
|
statusCode: jsonPage.statusCode
|
|
4295
4385
|
};
|
|
4296
4386
|
}
|
|
4297
|
-
function
|
|
4298
|
-
|
|
4387
|
+
function redditProviderDiagnosticsFromAttempts(attempts) {
|
|
4388
|
+
const blockedAttempts = attempts.filter((attempt) => attempt.blocked);
|
|
4389
|
+
if (blockedAttempts.length === 0) {
|
|
4390
|
+
return void 0;
|
|
4391
|
+
}
|
|
4392
|
+
const selectedAttempt = blockedAttempts.find((attempt) => attempt.blockReason === "provider_verification_required") ?? blockedAttempts.at(-1);
|
|
4393
|
+
return {
|
|
4394
|
+
platform: "reddit",
|
|
4395
|
+
blocked: true,
|
|
4396
|
+
statusCode: selectedAttempt?.statusCode,
|
|
4397
|
+
reason: selectedAttempt?.blockReason ?? "provider_blocked_request",
|
|
4398
|
+
suggestedAction: PROVIDER_BLOCKED_SUGGESTED_ACTION
|
|
4399
|
+
};
|
|
4400
|
+
}
|
|
4401
|
+
function synthesizeRedditBlockedPage(requestedUrl, attempts, providerDiagnostics) {
|
|
4402
|
+
const selectedPage = attempts.find((attempt) => attempt.blockReason === providerDiagnostics.reason)?.page ?? attempts.slice().reverse().find((attempt) => attempt.page)?.page;
|
|
4403
|
+
return {
|
|
4404
|
+
url: requestedUrl,
|
|
4405
|
+
originalUrl: requestedUrl,
|
|
4406
|
+
finalUrl: requestedUrl,
|
|
4407
|
+
isShortUrl: selectedPage?.isShortUrl ?? false,
|
|
4408
|
+
shortUrlProvider: selectedPage?.shortUrlProvider,
|
|
4409
|
+
html: "",
|
|
4410
|
+
bytes: new Uint8Array(),
|
|
4411
|
+
statusCode: providerDiagnostics.statusCode ?? selectedPage?.statusCode ?? 403,
|
|
4412
|
+
contentType: selectedPage?.contentType,
|
|
4413
|
+
redirects: selectedPage?.redirects ?? [],
|
|
4414
|
+
headers: selectedPage?.headers ?? {}
|
|
4415
|
+
};
|
|
4416
|
+
}
|
|
4417
|
+
function redditBlockReason(page) {
|
|
4418
|
+
const title = htmlTitle(page.html);
|
|
4419
|
+
const text = normalizeText(`${title ?? ""} ${page.html}`);
|
|
4420
|
+
if (/reddit\s*-\s*please wait for verification/i.test(title ?? "") || /please wait for verification|verification required|verify you are human/i.test(text)) {
|
|
4421
|
+
return "provider_verification_required";
|
|
4422
|
+
}
|
|
4423
|
+
if (page.statusCode === 403 || page.statusCode === 429 || /whoa there, pardner|request has been blocked|too many requests|forbidden|you're blocked|you are blocked|youre blocked|blocked by network security/i.test(text) || /^blocked$/i.test(title ?? "")) {
|
|
4424
|
+
return "provider_blocked_request";
|
|
4425
|
+
}
|
|
4426
|
+
return void 0;
|
|
4427
|
+
}
|
|
4428
|
+
function htmlTitle(html) {
|
|
4429
|
+
return normalizeText(html.match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1]);
|
|
4430
|
+
}
|
|
4431
|
+
function normalizeText(value) {
|
|
4432
|
+
return value?.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ").trim() ?? "";
|
|
4299
4433
|
}
|
|
4300
4434
|
function redditRetryInfo(attempts) {
|
|
4301
4435
|
const blockedAttempts = attempts.filter((attempt) => attempt.blocked || attempt.statusCode === 429 || attempt.statusCode === 403);
|
|
@@ -4329,7 +4463,7 @@ function mergeFallbackAttempts2(existing, incoming) {
|
|
|
4329
4463
|
}
|
|
4330
4464
|
const seen = /* @__PURE__ */ new Set();
|
|
4331
4465
|
return attempts.map((value) => {
|
|
4332
|
-
const { page: _page, ...attempt } = value;
|
|
4466
|
+
const { page: _page, blockReason: _blockReason, ...attempt } = value;
|
|
4333
4467
|
return attempt;
|
|
4334
4468
|
}).filter((attempt) => {
|
|
4335
4469
|
const key = `${attempt.method}:${attempt.url ?? ""}:${attempt.statusCode ?? ""}:${attempt.error ?? ""}`;
|