openclaw-smart-fetch 0.2.29 → 0.2.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +59 -7
- package/dist/index.js.map +1 -1
- package/openclaw.plugin.json +1 -1
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -9607,6 +9607,12 @@ function buildPlainTextResult(opts, finalUrl, rawBody, format, maxChars, browser
|
|
|
9607
9607
|
os
|
|
9608
9608
|
};
|
|
9609
9609
|
}
|
|
9610
|
+
/**
 * Reports whether a parsed page looks like the "JavaScript is disabled"
 * placeholder for an x.com / twitter.com URL.
 *
 * The check passes only when BOTH hold:
 *   1. `url` points at x.com or twitter.com (scheme and "www." are optional,
 *      but a "/" must follow the host), and
 *   2. the document text contains both marker phrases
 *      "JavaScript is disabled" and "supported browser".
 *
 * @param {{ body?: { textContent?: string | null } | null,
 *           documentElement?: { textContent?: string | null } | null } | null | undefined} document
 *   Parsed DOM-like object. Only `body.textContent` and, as a fallback,
 *   `documentElement.textContent` are read. A missing document is treated as
 *   empty text instead of throwing.
 * @param {string} url - URL the document was requested for.
 * @returns {boolean} `true` when the URL matches and both marker phrases are present.
 */
function isTwitterJsDisabledPage(document, url) {
  // Cheap host check first so non-Twitter pages never pay for text extraction.
  if (!/^(https?:\/\/)?(www\.)?(x\.com|twitter\.com)\//i.test(url)) {
    return false;
  }
  // `document?.` guards against a null/undefined document, which previously
  // raised a TypeError; absent text falls through to the empty string.
  const text = document?.body?.textContent ?? document?.documentElement?.textContent ?? "";
  return text.includes("JavaScript is disabled") && text.includes("supported browser");
}
|
|
9610
9616
|
function extractDomTextFallback(document) {
|
|
9611
9617
|
const bodyText = document.body?.textContent ?? document.documentElement?.textContent ?? "";
|
|
9612
9618
|
return bodyText.replace(/\r\n/g, "\n").replace(/\n{3,}/g, "\n\n").split("\n").map((line) => line.trim()).join("\n").replace(/[ \t]{2,}/g, " ").trim();
|
|
@@ -10197,15 +10203,61 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10197
10203
|
});
|
|
10198
10204
|
const fallbackDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10199
10205
|
const extractionDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10200
|
-
|
|
10201
|
-
|
|
10202
|
-
|
|
10203
|
-
|
|
10204
|
-
|
|
10205
|
-
|
|
10206
|
-
|
|
10206
|
+
let extracted;
|
|
10207
|
+
const suppressedErrors = [];
|
|
10208
|
+
try {
|
|
10209
|
+
const origConsoleError = console.error;
|
|
10210
|
+
console.error = (...args) => {
|
|
10211
|
+
suppressedErrors.push(args);
|
|
10212
|
+
};
|
|
10213
|
+
try {
|
|
10214
|
+
extracted = await dependencies.defuddle(
|
|
10215
|
+
extractionDocument,
|
|
10216
|
+
finalUrl,
|
|
10217
|
+
{
|
|
10218
|
+
markdown: format !== "html",
|
|
10219
|
+
removeImages,
|
|
10220
|
+
includeReplies
|
|
10221
|
+
}
|
|
10222
|
+
);
|
|
10223
|
+
} finally {
|
|
10224
|
+
console.error = origConsoleError;
|
|
10207
10225
|
}
|
|
10226
|
+
} catch (_error) {
|
|
10227
|
+
extracted = {
|
|
10228
|
+
content: void 0,
|
|
10229
|
+
wordCount: 0
|
|
10230
|
+
};
|
|
10231
|
+
}
|
|
10232
|
+
const isXUrl = /^https?:\/\/(www\.)?(x\.com|twitter\.com)\//i.test(
|
|
10233
|
+
opts.url
|
|
10208
10234
|
);
|
|
10235
|
+
if (isXUrl) {
|
|
10236
|
+
const hasOembed404 = suppressedErrors.some(
|
|
10237
|
+
(args) => args.some(
|
|
10238
|
+
(arg) => typeof arg === "string" && arg.includes("oEmbed request failed: 404")
|
|
10239
|
+
)
|
|
10240
|
+
);
|
|
10241
|
+
const hasJsDisabledShell = isTwitterJsDisabledPage(
|
|
10242
|
+
fallbackDocument,
|
|
10243
|
+
opts.url
|
|
10244
|
+
);
|
|
10245
|
+
if (hasOembed404 || hasJsDisabledShell) {
|
|
10246
|
+
return {
|
|
10247
|
+
error: `Server returned HTTP 404 Not Found for ${opts.url}.`,
|
|
10248
|
+
code: "http_error",
|
|
10249
|
+
phase: "loading",
|
|
10250
|
+
retryable: false,
|
|
10251
|
+
timeoutMs,
|
|
10252
|
+
url: opts.url,
|
|
10253
|
+
finalUrl,
|
|
10254
|
+
statusCode: 404,
|
|
10255
|
+
statusText: "Not Found",
|
|
10256
|
+
mimeType: normalizeContentType(contentType) || void 0,
|
|
10257
|
+
contentLength: errorContext.contentLength
|
|
10258
|
+
};
|
|
10259
|
+
}
|
|
10260
|
+
}
|
|
10209
10261
|
let extractedContent = extracted.content;
|
|
10210
10262
|
let wordCount = extracted.wordCount;
|
|
10211
10263
|
if (!extractedContent || wordCount === 0) {
|