npm - pi-smart-fetch - Versions diffs - 0.3.2 → 0.3.4 - Mend

pi-smart-fetch 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/README.md CHANGED

@@ -13,7 +13,7 @@
 - 🔁 **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
 - 🔗 **Alternate content fallback** — when extraction produces no/thin content, follows qualified `<link rel="alternate" type="...">` entries in `<head>` that match the requested output format
 - ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
-- 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`
+- 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`, `raw`
 ## Site optimisations
@@ -70,6 +70,7 @@ For `batch_web_fetch`, each item in `requests` accepts the same parameters as `w
 | `html` | Cleaned HTML output |
 | `text` | Plain text with markdown stripped |
 | `json` | Structured JSON for metadata-heavy workflows |
+| `raw` | Full raw server response without extraction or truncation — for further parsing |
 ## Global defaults

package/dist/index.js CHANGED

@@ -9960,6 +9960,7 @@ var DEFAULT_TIMEOUT_MS = 15e3;
 var DEFAULT_BATCH_CONCURRENCY = 8;
 var DEFAULT_INCLUDE_REPLIES = "extractors";
 var DEFAULT_ACCEPT_HEADER = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
+var DEFAULT_RAW_ACCEPT_HEADER = "text/html,application/xhtml+xml,application/json,application/xml;q=0.9,text/markdown;q=0.8,text/plain;q=0.8,*/*;q=0.7";
 var DEFAULT_JSON_ACCEPT_HEADER = "application/json,text/json,application/ld+json;q=0.9,text/plain;q=0.8,*/*;q=0.7";
 var DEFAULT_ACCEPT_LANGUAGE_HEADER = "en-US,en;q=0.9";
@@ -10177,7 +10178,8 @@ function buildCompactMetadataHeader(result) {
     ["URL", result.finalUrl],
     ["Title", result.title],
     ["Author", result.author],
-    ["Published", result.published]
+    ["Published", result.published],
+    ["Content-Type", result.contentType]
   ]);
 }
 function buildMetadataHeader(result) {
@@ -10195,6 +10197,7 @@ function buildMetadataHeader(result) {
     ["Title", result.title],
     ["Author", result.author],
     ["Published", result.published],
+    ["Content-Type", result.contentType],
     ["Site", result.site],
     ["Language", result.language],
     ["Words", result.wordCount],
@@ -10777,7 +10780,9 @@ function mapRequestEventToProgress(event) {
   }
 }
 function resolveAcceptHeader(format) {
-  return format === "json" ? DEFAULT_JSON_ACCEPT_HEADER : DEFAULT_ACCEPT_HEADER;
+  if (format === "json") return DEFAULT_JSON_ACCEPT_HEADER;
+  if (format === "raw") return DEFAULT_RAW_ACCEPT_HEADER;
+  return DEFAULT_ACCEPT_HEADER;
 }
 function isJsonContentType(contentType) {
   const normalized = contentType.split(";")[0]?.trim().toLowerCase() ?? "";
@@ -10798,7 +10803,8 @@ function extractQualifiedAlternateLinks(document, baseUrl, format) {
     markdown: ["text/markdown", "text/x-markdown"],
     text: ["text/plain", "text/markdown", "text/x-markdown"],
     html: ["text/html", "application/xhtml+xml"],
-    json: ["application/json", "text/json"]
+    json: ["application/json", "text/json"],
+    raw: []
   };
   const accepted = acceptedTypes[format];
   const head = document.head;
@@ -11076,6 +11082,81 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
         );
       }
       const jsonResponse = isJsonResponse(contentType, rawBody);
+      if (format === "raw") {
+        const isXUrl2 = /^https?:\/\/(www\.)?(x\.com|twitter\.com)\//i.test(
+          opts.url
+        );
+        if (isXUrl2) {
+          let extractedContent2;
+          const suppressedErrors2 = [];
+          const origConsoleError = console.error;
+          console.error = (...args) => {
+            suppressedErrors2.push(args);
+          };
+          try {
+            const extractionDocument2 = parseLinkedomHTML(rawBody, finalUrl);
+            const extracted2 = await dependencies.defuddle(
+              extractionDocument2,
+              finalUrl,
+              {
+                markdown: true,
+                removeImages,
+                includeReplies
+              }
+            );
+            extractedContent2 = extracted2.content;
+          } finally {
+            console.error = origConsoleError;
+          }
+          const hasOembed404 = suppressedErrors2.some(
+            (args) => args.some(
+              (arg) => typeof arg === "string" && arg.includes("oEmbed request failed: 404")
+            )
+          );
+          const hasJsDisabledShell = isTwitterJsDisabledPage(
+            parseLinkedomHTML(rawBody, finalUrl),
+            opts.url
+          );
+          if ((hasOembed404 || hasJsDisabledShell) && !extractedContent2) {
+            return {
+              error: `Server returned HTTP 404 Not Found for ${opts.url}.`,
+              code: "http_error",
+              phase: "loading",
+              retryable: false,
+              timeoutMs,
+              url: opts.url,
+              finalUrl,
+              statusCode: 404,
+              statusText: "Not Found",
+              mimeType: normalizeContentType(contentType) || void 0,
+              contentLength: errorContext.contentLength
+            };
+          }
+        }
+        const effectiveContent = opts.maxChars !== void 0 ? truncateContent(rawBody, maxChars) : rawBody;
+        const result2 = {
+          kind: "content",
+          url: opts.url,
+          finalUrl,
+          title: "",
+          author: "",
+          published: "",
+          site: new URL(finalUrl).hostname,
+          language: "",
+          wordCount: 0,
+          content: effectiveContent,
+          browser,
+          os,
+          contentType: normalizeContentType(contentType) || void 0
+        };
+        emitStatus(hooks, "done");
+        emitProgress(hooks, {
+          status: "done",
+          progress: 1,
+          phase: "raw_done"
+        });
+        return result2;
+      }
       if (format === "json") {
         if (!jsonResponse) {
           if (HTML_CONTENT_TYPES.some((value) => contentType.includes(value))) {
@@ -11381,10 +11462,11 @@ function createBaseFetchToolParameterProperties(defaults) {
           Type.Literal("markdown"),
           Type.Literal("html"),
           Type.Literal("text"),
-          Type.Literal("json")
+          Type.Literal("json"),
+          Type.Literal("raw")
         ],
         {
-          description: 'Output format. "markdown" (default), "html" (cleaned HTML), "text" (plain text, no formatting), or "json" (pretty-printed JSON)'
+          description: 'Output format. "markdown" (default), "html" (cleaned HTML), "text" (plain text, no formatting), "json" (pretty-printed JSON), or "raw" (full raw server response without extraction or truncation, for further parsing)'
         }
       )
     ),