metanova 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/dist/index.cjs +146 -12
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +9 -1
- package/dist/index.d.ts +9 -1
- package/dist/index.js +146 -12
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,15 @@
|
|
|
1
1
|
# CHANGELOG
|
|
2
2
|
|
|
3
|
+
## v0.2.1
|
|
4
|
+
|
|
5
|
+
Release date: 2026-06-04
|
|
6
|
+
|
|
7
|
+
### Fixed
|
|
8
|
+
|
|
9
|
+
- Fixed Reddit verification/block pages that returned HTTP 200 being treated as successful metadata.
|
|
10
|
+
- Fixed blocked Reddit fallback handling so provider diagnostics clearly report blocked status, reason, status code, and suggested action.
|
|
11
|
+
- Fixed Reddit verification titles such as `Reddit - Please wait for verification` leaking into `metadata.title`.
|
|
12
|
+
|
|
3
13
|
## v0.2.0
|
|
4
14
|
|
|
5
15
|
Release date: 2026-06-04
|
package/dist/index.cjs
CHANGED
|
@@ -2897,8 +2897,8 @@ var redditAdapter = {
|
|
|
2897
2897
|
type: reddit.isPost ? "social_post" : "website",
|
|
2898
2898
|
siteName: "Reddit",
|
|
2899
2899
|
canonicalUrl: context.raw.openGraph.url ?? context.raw.html.canonicalUrl,
|
|
2900
|
-
title:
|
|
2901
|
-
description: descriptionSelection.value,
|
|
2900
|
+
title: cleanRedditTitle(titleSelection.value),
|
|
2901
|
+
description: cleanRedditDescription(descriptionSelection.value),
|
|
2902
2902
|
images: markAdapterMedia(mediaFromContext(context).images, "redditAdapter"),
|
|
2903
2903
|
videos: markAdapterMedia(mediaFromContext(context).videos, "redditAdapter"),
|
|
2904
2904
|
author: username ? { name: username } : entityFromContext(context, ["author", "submitter", "user"]),
|
|
@@ -3572,6 +3572,20 @@ function parseRedditUrl(url) {
|
|
|
3572
3572
|
/**
 * Removes a trailing " : r/<subreddit>" suffix from a social title and trims
 * the surrounding whitespace.
 *
 * @param {string | null | undefined} title - Raw title text.
 * @returns {string | undefined} The cleaned title, or `undefined` when no title was given.
 */
function cleanSocialTitle(title) {
  if (title == null) {
    return void 0;
  }
  const withoutSubredditSuffix = title.replace(/\s*:\s*r\/[A-Za-z0-9_]+$/i, "");
  return withoutSubredditSuffix.trim();
}
|
|
3575
|
+
/**
 * Cleans a Reddit title and discards it when it is empty or matches one of the
 * known verification/block page titles, so that such placeholder text is not
 * reported as the page title.
 *
 * @param {string | null | undefined} title - Raw title candidate.
 * @returns {string | undefined} The cleaned title, or `undefined` when it is unusable.
 */
function cleanRedditTitle(title) {
  const candidate = cleanSocialTitle(title);
  const looksLikeBlockPage = Boolean(candidate) && /reddit\s*-\s*please wait for verification|please wait for verification|whoa there, pardner/i.test(candidate);
  return candidate && !looksLikeBlockPage ? candidate : void 0;
}
|
|
3582
|
+
/**
 * Collapses whitespace in a Reddit description and discards it when it is
 * empty or contains one of the known verification/block page phrases.
 *
 * @param {string | null | undefined} description - Raw description candidate.
 * @returns {string | undefined} The normalized description, or `undefined` when it is unusable.
 */
function cleanRedditDescription(description) {
  const collapsed = description == null ? void 0 : description.replace(/\s+/g, " ").trim();
  if (collapsed && !/please wait for verification|whoa there, pardner|request has been blocked/i.test(collapsed)) {
    return collapsed;
  }
  return void 0;
}
|
|
3575
3589
|
function hostMatches(url, domains) {
|
|
3576
3590
|
const host = url.hostname.toLowerCase().replace(/^www\./, "");
|
|
3577
3591
|
return domains.some((domain) => host === domain || host.endsWith(`.${domain}`));
|
|
@@ -3961,12 +3975,17 @@ function ascii(bytes, offset, length) {
|
|
|
3961
3975
|
}
|
|
3962
3976
|
|
|
3963
3977
|
// src/fetchMetadata.ts
|
|
3978
|
+
var REDDIT_BLOCKED_METADATA_WARNING = "Reddit returned a verification/block page; metadata is incomplete.";
|
|
3979
|
+
var PROVIDER_BLOCKED_SUGGESTED_ACTION = "retry_on_different_host_or_use_supported_proxy";
|
|
3964
3980
|
async function fetchMetadata(url, options = {}) {
|
|
3965
3981
|
const startedAt = Date.now();
|
|
3966
3982
|
try {
|
|
3967
3983
|
const requestedUrl = normalizeUrl(url);
|
|
3968
3984
|
const fetchResult = await fetchPageWithStrategies(requestedUrl, options);
|
|
3969
3985
|
const page = fetchResult.page;
|
|
3986
|
+
if (fetchResult.providerDiagnostics?.blocked) {
|
|
3987
|
+
return createBlockedProviderMetadata(requestedUrl, fetchResult, Date.now() - startedAt);
|
|
3988
|
+
}
|
|
3970
3989
|
const directMedia = createDirectMediaMetadata(page, requestedUrl, Date.now() - startedAt);
|
|
3971
3990
|
if (directMedia) {
|
|
3972
3991
|
return directMedia;
|
|
@@ -4035,6 +4054,62 @@ async function fetchMetadata(url, options = {}) {
|
|
|
4035
4054
|
};
|
|
4036
4055
|
}
|
|
4037
4056
|
}
|
|
4057
|
+
/**
 * Builds the failure (`ok: false`) metadata result returned when the provider
 * answered with a verification/block page instead of real content.
 *
 * All scores are zero and all media lists are empty; the useful information is
 * carried in `diagnostics` (status code, redirects, fallback attempts, retry
 * info and the provider diagnostics themselves).
 *
 * @param requestedUrl - The normalized URL the caller asked for.
 * @param fetchResult - Result of the fetch strategy; its `page`, `trace`,
 *   `warnings`, `fallbacksAttempted`, `sourcePriority`, `extractionMethod`,
 *   `retryInfo` and `providerDiagnostics` fields are read here.
 * @param {number} fetchDurationMs - Elapsed fetch time in milliseconds.
 * @returns The blocked-provider metadata object.
 */
function createBlockedProviderMetadata(requestedUrl, fetchResult, fetchDurationMs) {
  const page = fetchResult.page;
  const providerDiagnostics = fetchResult.providerDiagnostics;
  // Reddit-specific labels and warnings must only be attached to Reddit results.
  const isReddit = providerDiagnostics?.platform === "reddit";
  const trace = uniqueStrings3([
    ...page.isShortUrl ? [`detected short URL provider: ${page.shortUrlProvider ?? "unknown"}`] : [],
    ...page.redirects.length > 0 ? [`resolved ${page.redirects.length} redirect${page.redirects.length === 1 ? "" : "s"}`] : [],
    ...fetchResult.trace,
    "detected blocked provider response"
  ]);
  const warnings = uniqueStrings3([
    ...fetchResult.warnings,
    // Previously added unconditionally, which would mislabel a blocked
    // non-Reddit provider as Reddit. Provider-specific warnings for other
    // platforms are expected to arrive through fetchResult.warnings.
    ...isReddit ? [REDDIT_BLOCKED_METADATA_WARNING] : [],
    ...page.statusCode < 200 || page.statusCode >= 300 ? [`Fetch completed with non-success status code ${page.statusCode}.`] : []
  ]);
  return {
    ok: false,
    url: requestedUrl,
    finalUrl: page.finalUrl,
    type: "unknown",
    siteName: isReddit ? "Reddit" : void 0,
    confidence: 0,
    completeness: 0,
    reliability: 0,
    images: [],
    videos: [],
    audio: [],
    favicons: [],
    trace,
    diagnostics: {
      originalUrl: requestedUrl,
      finalUrl: page.finalUrl,
      isShortUrl: page.isShortUrl,
      shortUrlProvider: page.shortUrlProvider,
      statusCode: page.statusCode,
      contentType: page.contentType,
      redirects: page.redirects,
      sourcesUsed: [],
      warnings,
      fallbacksAttempted: mergeFallbackAttempts2(void 0, fetchResult.fallbacksAttempted),
      trace,
      sourcePriority: fetchResult.sourcePriority,
      extractionMethod: fetchResult.extractionMethod,
      retryInfo: fetchResult.retryInfo,
      providerDiagnostics,
      confidenceBreakdown: {
        title: 0,
        description: 0,
        image: 0,
        structuredData: 0,
        adapter: 0
      },
      fetchDurationMs,
      extractedAt: (/* @__PURE__ */ new Date()).toISOString()
    }
  };
}
|
|
4038
4113
|
async function fetchPageWithStrategies(requestedUrl, options) {
|
|
4039
4114
|
if (isRedditUrl(requestedUrl)) {
|
|
4040
4115
|
return fetchRedditPageWithStrategy(requestedUrl, options);
|
|
@@ -4059,7 +4134,7 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
|
|
|
4059
4134
|
});
|
|
4060
4135
|
attempts.push(attempt);
|
|
4061
4136
|
lastError = attempt.error;
|
|
4062
|
-
if (attempt.page && attempt.ok
|
|
4137
|
+
if (attempt.page && attempt.ok) {
|
|
4063
4138
|
const redditPost = parseRedditJsonPayload(attempt.page.html);
|
|
4064
4139
|
if (redditPost?.title) {
|
|
4065
4140
|
return {
|
|
@@ -4082,7 +4157,7 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
|
|
|
4082
4157
|
const attempt = await attemptFetch("oldReddit", oldRedditUrl, options);
|
|
4083
4158
|
attempts.push(attempt);
|
|
4084
4159
|
lastError = attempt.error;
|
|
4085
|
-
if (attempt.page && attempt.ok
|
|
4160
|
+
if (attempt.page && attempt.ok) {
|
|
4086
4161
|
return {
|
|
4087
4162
|
page: attempt.page,
|
|
4088
4163
|
fallbacksAttempted: attempts,
|
|
@@ -4100,10 +4175,7 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
|
|
|
4100
4175
|
const htmlAttempt = await attemptFetch("redditHtmlFallback", requestedUrl, options);
|
|
4101
4176
|
attempts.push(htmlAttempt);
|
|
4102
4177
|
lastError = htmlAttempt.error;
|
|
4103
|
-
if (htmlAttempt.page) {
|
|
4104
|
-
if (htmlAttempt.blocked) {
|
|
4105
|
-
warnings.push("Reddit HTML fallback appears to have been blocked; metadata may be incomplete.");
|
|
4106
|
-
}
|
|
4178
|
+
if (htmlAttempt.page && htmlAttempt.ok) {
|
|
4107
4179
|
return {
|
|
4108
4180
|
page: htmlAttempt.page,
|
|
4109
4181
|
fallbacksAttempted: attempts,
|
|
@@ -4114,19 +4186,37 @@ async function fetchRedditPageWithStrategy(requestedUrl, options) {
|
|
|
4114
4186
|
retryInfo: redditRetryInfo(attempts)
|
|
4115
4187
|
};
|
|
4116
4188
|
}
|
|
4189
|
+
if (htmlAttempt.blocked) {
|
|
4190
|
+
warnings.push("Reddit HTML fallback appears to have been blocked; metadata may be incomplete.");
|
|
4191
|
+
}
|
|
4192
|
+
const providerDiagnostics = redditProviderDiagnosticsFromAttempts(attempts);
|
|
4193
|
+
if (providerDiagnostics) {
|
|
4194
|
+
return {
|
|
4195
|
+
page: synthesizeRedditBlockedPage(requestedUrl, attempts, providerDiagnostics),
|
|
4196
|
+
fallbacksAttempted: attempts,
|
|
4197
|
+
warnings: uniqueStrings3([...warnings, REDDIT_BLOCKED_METADATA_WARNING]),
|
|
4198
|
+
trace: ["Reddit provider blocked metadata extraction"],
|
|
4199
|
+
sourcePriority,
|
|
4200
|
+
extractionMethod: "reddit:blockedProvider",
|
|
4201
|
+
retryInfo: redditRetryInfo(attempts),
|
|
4202
|
+
providerDiagnostics
|
|
4203
|
+
};
|
|
4204
|
+
}
|
|
4117
4205
|
throw lastError ?? new Error("All Reddit extraction fetch attempts failed.");
|
|
4118
4206
|
}
|
|
4119
4207
|
async function attemptFetch(method, url, options) {
|
|
4120
4208
|
try {
|
|
4121
4209
|
const page = await fetchPage(url, options);
|
|
4122
4210
|
const retryAfter = page.headers["retry-after"];
|
|
4123
|
-
const
|
|
4211
|
+
const blockReason = redditBlockReason(page);
|
|
4212
|
+
const blocked = Boolean(blockReason);
|
|
4124
4213
|
return {
|
|
4125
4214
|
method,
|
|
4126
4215
|
url,
|
|
4127
4216
|
ok: page.statusCode >= 200 && page.statusCode < 300 && !blocked,
|
|
4128
4217
|
statusCode: page.statusCode,
|
|
4129
4218
|
blocked,
|
|
4219
|
+
blockReason,
|
|
4130
4220
|
retryAfter,
|
|
4131
4221
|
page
|
|
4132
4222
|
};
|
|
@@ -4377,8 +4467,52 @@ function synthesizeRedditJsonPage(jsonPage, requestedUrl, post) {
|
|
|
4377
4467
|
statusCode: jsonPage.statusCode
|
|
4378
4468
|
};
|
|
4379
4469
|
}
|
|
4380
|
-
function
|
|
4381
|
-
|
|
4470
|
+
/**
 * Derives Reddit provider diagnostics from the list of fetch attempts.
 *
 * The first blocked attempt whose reason is "provider_verification_required"
 * wins; when there is none, the last blocked attempt is used.
 *
 * @param {Array<object>} attempts - Fetch attempts; `blocked`, `blockReason`
 *   and `statusCode` are read from each entry.
 * @returns {object | undefined} Diagnostics describing the block, or
 *   `undefined` when no attempt was blocked.
 */
function redditProviderDiagnosticsFromAttempts(attempts) {
  let chosenAttempt;
  for (const attempt of attempts) {
    if (!attempt.blocked) {
      continue;
    }
    // Keep overwriting so that, absent a verification page, the last blocked attempt remains.
    chosenAttempt = attempt;
    if (attempt.blockReason === "provider_verification_required") {
      break;
    }
  }
  if (!chosenAttempt) {
    return void 0;
  }
  return {
    platform: "reddit",
    blocked: true,
    statusCode: chosenAttempt.statusCode,
    reason: chosenAttempt.blockReason ?? "provider_blocked_request",
    suggestedAction: PROVIDER_BLOCKED_SUGGESTED_ACTION
  };
}
|
|
4484
|
+
/**
 * Builds an empty placeholder page representing a blocked Reddit response.
 *
 * The placeholder has no HTML or bytes; short-URL info, content type,
 * redirects and headers are copied from one of the attempted pages.
 *
 * Page selection: prefer the attempt that matches BOTH the diagnosed reason and
 * the diagnosed status code, so the copied headers/content type come from the
 * same response that supplied the reported status code. Matching on the reason
 * alone could pair, for example, a 403 status with the headers of an earlier
 * 429 attempt. If no attempt matches both, fall back to the first attempt with
 * the same reason, then to the last attempt that has a page at all.
 *
 * @param requestedUrl - The URL originally requested; used for `url`,
 *   `originalUrl` and `finalUrl`.
 * @param {Array<object>} attempts - Fetch attempts; `blockReason`, `statusCode`
 *   and `page` are read from each entry.
 * @param {object} providerDiagnostics - Diagnostics whose `reason` and
 *   `statusCode` describe the block.
 * @returns {object} The synthetic page object (status code defaults to 403).
 */
function synthesizeRedditBlockedPage(requestedUrl, attempts, providerDiagnostics) {
  const { reason, statusCode } = providerDiagnostics;
  const sameReasonAttempts = attempts.filter((attempt) => attempt.blockReason === reason);
  const selectedPage = sameReasonAttempts.find((attempt) => attempt.page && attempt.statusCode === statusCode)?.page ?? sameReasonAttempts[0]?.page ?? attempts.slice().reverse().find((attempt) => attempt.page)?.page;
  return {
    url: requestedUrl,
    originalUrl: requestedUrl,
    finalUrl: requestedUrl,
    isShortUrl: selectedPage?.isShortUrl ?? false,
    shortUrlProvider: selectedPage?.shortUrlProvider,
    html: "",
    bytes: new Uint8Array(),
    statusCode: statusCode ?? selectedPage?.statusCode ?? 403,
    contentType: selectedPage?.contentType,
    redirects: selectedPage?.redirects ?? [],
    headers: selectedPage?.headers ?? {}
  };
}
|
|
4500
|
+
/**
 * Classifies a fetched Reddit response as a verification page, a block page,
 * or neither.
 *
 * @param {{ html?: string, statusCode: number }} page - Fetched page; only
 *   `html` and `statusCode` are read.
 * @returns {"provider_verification_required" | "provider_blocked_request" | undefined}
 *   The block reason, or `undefined` when the response does not look blocked.
 */
function redditBlockReason(page) {
  const html = page.html ?? "";
  const isSuccessStatus = page.statusCode >= 200 && page.statusCode < 300;
  // A successful body that starts like a JSON document is an API payload whose
  // text is user-authored (post titles, self text, comments). Scanning it for
  // phrases such as "forbidden" or "too many requests" would mark ordinary
  // posts as blocked, so such payloads are never classified by phrase.
  // Non-2xx JSON bodies still go through the checks below.
  if (isSuccessStatus && /^\s*[\[{]/.test(html)) {
    return void 0;
  }
  const title = htmlTitle(html);
  const text = normalizeText(`${title} ${html}`);
  if (/reddit\s*-\s*please wait for verification/i.test(title) || /please wait for verification|verification required|verify you are human/i.test(text)) {
    return "provider_verification_required";
  }
  // NOTE(review): for HTML responses the phrase scan still covers the whole
  // visible body, so a page quoting these phrases can be misclassified.
  if (page.statusCode === 403 || page.statusCode === 429 || /whoa there, pardner|request has been blocked|too many requests|forbidden|you're blocked|you are blocked|youre blocked|blocked by network security/i.test(text) || /^blocked$/i.test(title)) {
    return "provider_blocked_request";
  }
  return void 0;
}
/**
 * Extracts the text of the first `<title>` element from an HTML string.
 *
 * @param {string | null | undefined} html - HTML source.
 * @returns {string} The normalized title text, or "" when there is no title.
 */
function htmlTitle(html) {
  return normalizeText((html ?? "").match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1]);
}
/**
 * Replaces tags with spaces, collapses whitespace runs and trims the result.
 *
 * @param {string | null | undefined} value - Text that may contain markup.
 * @returns {string} The normalized text, or "" when `value` is nullish.
 */
function normalizeText(value) {
  return value?.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ").trim() ?? "";
}
|
|
4383
4517
|
function redditRetryInfo(attempts) {
|
|
4384
4518
|
const blockedAttempts = attempts.filter((attempt) => attempt.blocked || attempt.statusCode === 429 || attempt.statusCode === 403);
|
|
@@ -4412,7 +4546,7 @@ function mergeFallbackAttempts2(existing, incoming) {
|
|
|
4412
4546
|
}
|
|
4413
4547
|
const seen = /* @__PURE__ */ new Set();
|
|
4414
4548
|
return attempts.map((value) => {
|
|
4415
|
-
const { page: _page, ...attempt } = value;
|
|
4549
|
+
const { page: _page, blockReason: _blockReason, ...attempt } = value;
|
|
4416
4550
|
return attempt;
|
|
4417
4551
|
}).filter((attempt) => {
|
|
4418
4552
|
const key = `${attempt.method}:${attempt.url ?? ""}:${attempt.statusCode ?? ""}:${attempt.error ?? ""}`;
|