pi-smart-fetch 0.2.29 → 0.2.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +59 -7
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -9609,6 +9609,12 @@ function buildPlainTextResult(opts, finalUrl, rawBody, format, maxChars, browser
|
|
|
9609
9609
|
os
|
|
9610
9610
|
};
|
|
9611
9611
|
}
|
|
9612
|
+
/**
 * Reports whether a fetched page on x.com / twitter.com carries the
 * "JavaScript is disabled" notice text.
 *
 * The check is purely textual: the URL must point at x.com or twitter.com
 * (scheme and a leading "www." are both optional, and a "/" must follow the
 * host), and the page text must contain both marker phrases.
 *
 * @param {object} document - Parsed DOM-like document. Only `body.textContent`
 *   and, as a fallback, `documentElement.textContent` are read.
 * @param {string} url - URL the document was requested from.
 * @returns {boolean} `true` when the URL matches and both marker phrases are
 *   present in the page text; `false` otherwise.
 */
function isTwitterJsDisabledPage(document, url) {
  const twitterUrlPattern = /^(https?:\/\/)?(www\.)?(x\.com|twitter\.com)\//i;
  const isTwitterUrl = twitterUrlPattern.test(url);
  if (!isTwitterUrl) {
    // Bail out before touching the document for any other site.
    return false;
  }

  // Prefer the <body> text; fall back to the root element, then to "".
  let pageText = document.body?.textContent;
  if (pageText == null) {
    pageText = document.documentElement?.textContent;
  }
  if (pageText == null) {
    pageText = "";
  }

  // Both phrases must be present (case-sensitive substring match).
  const requiredPhrases = ["JavaScript is disabled", "supported browser"];
  return requiredPhrases.every((phrase) => pageText.includes(phrase));
}
|
|
9612
9618
|
function extractDomTextFallback(document) {
|
|
9613
9619
|
const bodyText = document.body?.textContent ?? document.documentElement?.textContent ?? "";
|
|
9614
9620
|
return bodyText.replace(/\r\n/g, "\n").replace(/\n{3,}/g, "\n\n").split("\n").map((line) => line.trim()).join("\n").replace(/[ \t]{2,}/g, " ").trim();
|
|
@@ -10199,15 +10205,61 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10199
10205
|
});
|
|
10200
10206
|
const fallbackDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10201
10207
|
const extractionDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10202
|
-
|
|
10203
|
-
|
|
10204
|
-
|
|
10205
|
-
|
|
10206
|
-
|
|
10207
|
-
|
|
10208
|
-
|
|
10208
|
+
let extracted;
|
|
10209
|
+
const suppressedErrors = [];
|
|
10210
|
+
try {
|
|
10211
|
+
const origConsoleError = console.error;
|
|
10212
|
+
console.error = (...args) => {
|
|
10213
|
+
suppressedErrors.push(args);
|
|
10214
|
+
};
|
|
10215
|
+
try {
|
|
10216
|
+
extracted = await dependencies.defuddle(
|
|
10217
|
+
extractionDocument,
|
|
10218
|
+
finalUrl,
|
|
10219
|
+
{
|
|
10220
|
+
markdown: format !== "html",
|
|
10221
|
+
removeImages,
|
|
10222
|
+
includeReplies
|
|
10223
|
+
}
|
|
10224
|
+
);
|
|
10225
|
+
} finally {
|
|
10226
|
+
console.error = origConsoleError;
|
|
10209
10227
|
}
|
|
10228
|
+
} catch (_error) {
|
|
10229
|
+
extracted = {
|
|
10230
|
+
content: void 0,
|
|
10231
|
+
wordCount: 0
|
|
10232
|
+
};
|
|
10233
|
+
}
|
|
10234
|
+
const isXUrl = /^https?:\/\/(www\.)?(x\.com|twitter\.com)\//i.test(
|
|
10235
|
+
opts.url
|
|
10210
10236
|
);
|
|
10237
|
+
if (isXUrl) {
|
|
10238
|
+
const hasOembed404 = suppressedErrors.some(
|
|
10239
|
+
(args) => args.some(
|
|
10240
|
+
(arg) => typeof arg === "string" && arg.includes("oEmbed request failed: 404")
|
|
10241
|
+
)
|
|
10242
|
+
);
|
|
10243
|
+
const hasJsDisabledShell = isTwitterJsDisabledPage(
|
|
10244
|
+
fallbackDocument,
|
|
10245
|
+
opts.url
|
|
10246
|
+
);
|
|
10247
|
+
if (hasOembed404 || hasJsDisabledShell) {
|
|
10248
|
+
return {
|
|
10249
|
+
error: `Server returned HTTP 404 Not Found for ${opts.url}.`,
|
|
10250
|
+
code: "http_error",
|
|
10251
|
+
phase: "loading",
|
|
10252
|
+
retryable: false,
|
|
10253
|
+
timeoutMs,
|
|
10254
|
+
url: opts.url,
|
|
10255
|
+
finalUrl,
|
|
10256
|
+
statusCode: 404,
|
|
10257
|
+
statusText: "Not Found",
|
|
10258
|
+
mimeType: normalizeContentType(contentType) || void 0,
|
|
10259
|
+
contentLength: errorContext.contentLength
|
|
10260
|
+
};
|
|
10261
|
+
}
|
|
10262
|
+
}
|
|
10211
10263
|
let extractedContent = extracted.content;
|
|
10212
10264
|
let wordCount = extracted.wordCount;
|
|
10213
10265
|
if (!extractedContent || wordCount === 0) {
|