openclaw-smart-fetch 0.2.29 → 0.2.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -9607,6 +9607,12 @@ function buildPlainTextResult(opts, finalUrl, rawBody, format, maxChars, browser
9607
9607
  os
9608
9608
  };
9609
9609
  }
9610
/**
 * Reports whether a fetched page is the X/Twitter "JavaScript is disabled"
 * placeholder rather than real content.
 *
 * @param {Document} document - Parsed DOM of the response. Only
 *   `body.textContent` and, as a fallback, `documentElement.textContent`
 *   are read.
 * @param {string} url - URL the page was requested from. The scheme is
 *   optional (`x.com/user` is accepted).
 * @returns {boolean} `true` when `url` is on x.com or twitter.com (any
 *   subdomain) and the page text contains both marker phrases.
 */
function isTwitterJsDisabledPage(document, url) {
  // Accept any subdomain (www., mobile., ...) and an optional port, and
  // require the host to be followed by a path/query/fragment delimiter or
  // the end of the string. The trailing anchor keeps look-alike hosts such
  // as "x.com.example.org" or "notx.com" from matching.
  const twitterHostPattern = /^(?:https?:\/\/)?(?:[a-z0-9-]+\.)*(?:x|twitter)\.com(?::\d+)?(?:[/?#]|$)/i;
  if (!twitterHostPattern.test(url)) {
    return false;
  }
  const text = document.body?.textContent ?? document.documentElement?.textContent ?? "";
  // Both phrases must be present to avoid flagging ordinary pages that
  // merely mention one of them.
  return text.includes("JavaScript is disabled") && text.includes("supported browser");
}
9610
9616
  function extractDomTextFallback(document) {
9611
9617
  const bodyText = document.body?.textContent ?? document.documentElement?.textContent ?? "";
9612
9618
  return bodyText.replace(/\r\n/g, "\n").replace(/\n{3,}/g, "\n\n").split("\n").map((line) => line.trim()).join("\n").replace(/[ \t]{2,}/g, " ").trim();
@@ -10197,15 +10203,47 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10197
10203
  });
10198
10204
  const fallbackDocument = parseLinkedomHTML(rawBody, finalUrl);
10199
10205
  const extractionDocument = parseLinkedomHTML(rawBody, finalUrl);
10200
- const extracted = await dependencies.defuddle(
10201
- extractionDocument,
10202
- finalUrl,
10203
- {
10204
- markdown: format !== "html",
10205
- removeImages,
10206
- includeReplies
10206
+ if (isTwitterJsDisabledPage(fallbackDocument, opts.url)) {
10207
+ return {
10208
+ error: `Server returned HTTP 404 Not Found for ${opts.url}.`,
10209
+ code: "http_error",
10210
+ phase: "loading",
10211
+ retryable: false,
10212
+ timeoutMs,
10213
+ url: opts.url,
10214
+ finalUrl,
10215
+ statusCode: 404,
10216
+ statusText: "Not Found",
10217
+ mimeType: normalizeContentType(contentType) || void 0,
10218
+ contentLength: errorContext.contentLength
10219
+ };
10220
+ }
10221
+ let extracted;
10222
+ try {
10223
+ const origConsoleError = console.error;
10224
+ const suppressedErrors = [];
10225
+ console.error = (...args) => {
10226
+ suppressedErrors.push(args);
10227
+ };
10228
+ try {
10229
+ extracted = await dependencies.defuddle(
10230
+ extractionDocument,
10231
+ finalUrl,
10232
+ {
10233
+ markdown: format !== "html",
10234
+ removeImages,
10235
+ includeReplies
10236
+ }
10237
+ );
10238
+ } finally {
10239
+ console.error = origConsoleError;
10207
10240
  }
10208
- );
10241
+ } catch (_error) {
10242
+ extracted = {
10243
+ content: void 0,
10244
+ wordCount: 0
10245
+ };
10246
+ }
10209
10247
  let extractedContent = extracted.content;
10210
10248
  let wordCount = extracted.wordCount;
10211
10249
  if (!extractedContent || wordCount === 0) {