pi-smart-fetch 0.2.28 → 0.2.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -8,8 +8,8 @@ import { readFile, mkdir, chmod, writeFile, unlink } from 'fs/promises';
8
8
  import { tmpdir } from 'os';
9
9
  import { join, parse } from 'path';
10
10
  import { pipeline } from 'stream/promises';
11
- import { getProfiles, fetch } from '@thinkscape/wreq-js';
12
11
  import { Defuddle } from 'defuddle/node';
12
+ import { getProfiles, fetch } from 'wreq-js';
13
13
  import { parseHTML } from 'linkedom';
14
14
 
15
15
  var __create = Object.create;
@@ -9609,6 +9609,12 @@ function buildPlainTextResult(opts, finalUrl, rawBody, format, maxChars, browser
9609
9609
  os
9610
9610
  };
9611
9611
  }
9612
/**
 * Reports whether a parsed page looks like the "JavaScript is disabled"
 * placeholder for an x.com / twitter.com URL.
 *
 * The check has two parts: the URL must point at an x.com or twitter.com
 * host, and the document text must contain both marker phrases.
 *
 * @param {Document} document - Parsed DOM; only `body.textContent` (or, when
 *   there is no body, `documentElement.textContent`) is read.
 * @param {string} url - URL to classify. The scheme is optional.
 * @returns {boolean} `true` only when the host matches and both marker
 *   phrases are present in the document text.
 */
function isTwitterJsDisabledPage(document, url) {
  // The host must end at a path, query, fragment, or end of string, so that
  // "https://x.com" and "https://x.com?lang=en" are recognised while
  // look-alikes such as "x.community" are not. "www." and "mobile." are the
  // only accepted subdomains.
  const twitterUrlPattern = /^(?:https?:\/\/)?(?:(?:www|mobile)\.)?(?:x|twitter)\.com(?:[/?#]|$)/i;
  if (!twitterUrlPattern.test(url)) {
    return false;
  }
  const text = document.body?.textContent ?? document.documentElement?.textContent ?? "";
  // Both phrases are required to avoid flagging ordinary pages that merely
  // mention one of them.
  return text.includes("JavaScript is disabled") && text.includes("supported browser");
}
9612
9618
  function extractDomTextFallback(document) {
9613
9619
  const bodyText = document.body?.textContent ?? document.documentElement?.textContent ?? "";
9614
9620
  return bodyText.replace(/\r\n/g, "\n").replace(/\n{3,}/g, "\n\n").split("\n").map((line) => line.trim()).join("\n").replace(/[ \t]{2,}/g, " ").trim();
@@ -10199,15 +10205,47 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10199
10205
  });
10200
10206
  const fallbackDocument = parseLinkedomHTML(rawBody, finalUrl);
10201
10207
  const extractionDocument = parseLinkedomHTML(rawBody, finalUrl);
10202
- const extracted = await dependencies.defuddle(
10203
- extractionDocument,
10204
- finalUrl,
10205
- {
10206
- markdown: format !== "html",
10207
- removeImages,
10208
- includeReplies
10208
+ if (isTwitterJsDisabledPage(fallbackDocument, opts.url)) {
10209
+ return {
10210
+ error: `Server returned HTTP 404 Not Found for ${opts.url}.`,
10211
+ code: "http_error",
10212
+ phase: "loading",
10213
+ retryable: false,
10214
+ timeoutMs,
10215
+ url: opts.url,
10216
+ finalUrl,
10217
+ statusCode: 404,
10218
+ statusText: "Not Found",
10219
+ mimeType: normalizeContentType(contentType) || void 0,
10220
+ contentLength: errorContext.contentLength
10221
+ };
10222
+ }
10223
+ let extracted;
10224
+ try {
10225
+ const origConsoleError = console.error;
10226
+ const suppressedErrors = [];
10227
+ console.error = (...args) => {
10228
+ suppressedErrors.push(args);
10229
+ };
10230
+ try {
10231
+ extracted = await dependencies.defuddle(
10232
+ extractionDocument,
10233
+ finalUrl,
10234
+ {
10235
+ markdown: format !== "html",
10236
+ removeImages,
10237
+ includeReplies
10238
+ }
10239
+ );
10240
+ } finally {
10241
+ console.error = origConsoleError;
10209
10242
  }
10210
- );
10243
+ } catch (_error) {
10244
+ extracted = {
10245
+ content: void 0,
10246
+ wordCount: 0
10247
+ };
10248
+ }
10211
10249
  let extractedContent = extracted.content;
10212
10250
  let wordCount = extracted.wordCount;
10213
10251
  if (!extractedContent || wordCount === 0) {