npm - pi-smart-fetch - Versions diffs - 0.2.33 → 0.2.35 - Mend

pi-smart-fetch 0.2.33 → 0.2.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/README.md — CHANGED

@@ -10,6 +10,8 @@
 - 🧹 **Defuddle extraction** — clean readable content instead of noisy HTML
 - 🧠 **Useful metadata** — title, author, site, language, published date when available
 - 📦 **Downloads + large file support** — stream attachments and binaries to temp files
+- 🔁 **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
+- 🔗 **Alternate content fallback** — when extraction produces no/thin content, follows qualified `<link rel="alternate" type="...">` entries in `<head>` that match the requested output format
 - ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
 - 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`
@@ -28,6 +30,7 @@ This package works on general web pages, but some site types benefit especially
 Notes:
 - Defuddle is the cleanup layer: it strips common page chrome like nav, sidebars, related links, share widgets, and footers
 - It does **not** execute JavaScript or solve interactive anti-bot/login flows
+- If an HTML shell advertises alternate content in `<head>`, smart-fetch can follow matching alternates such as `text/markdown`, `text/plain`, `text/html`, or JSON media types according to the requested `format`
 ## Install

package/dist/index.js — CHANGED

@@ -9406,6 +9406,9 @@ var HTML_CONTENT_TYPES = [
   "text/plain",
   "text/markdown"
 ];
+var MAX_CLIENT_SIDE_REDIRECTS = 5;
+var MAX_ALTERNATE_LINK_FALLBACKS = 3;
+var MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK = 30;
 function normalizeContentType(contentType) {
   return contentType.split(";")[0]?.trim().toLowerCase() ?? "";
 }
@@ -9904,6 +9907,62 @@ function isLikelyJsonBody(body) {
 function isJsonResponse(contentType, body) {
   return isJsonContentType(contentType) || isLikelyJsonBody(body);
 }
+function decodeHtmlAttribute(value) {
+  return value.replace(/&amp;/gi, "&").replace(/&quot;/gi, '"').replace(/&#39;|&apos;/gi, "'").replace(/&lt;/gi, "<").replace(/&gt;/gi, ">");
+}
+function extractQualifiedAlternateLinks(document, baseUrl, format) {
+  const acceptedTypes = {
+    markdown: ["text/markdown", "text/x-markdown"],
+    text: ["text/plain", "text/markdown", "text/x-markdown"],
+    html: ["text/html", "application/xhtml+xml"],
+    json: ["application/json", "text/json"]
+  };
+  const accepted = acceptedTypes[format];
+  const head = document.head;
+  if (!head) return [];
+  const links = Array.from(head.querySelectorAll("link"));
+  const candidates = [];
+  for (const link of links) {
+    const rel = (link.getAttribute("rel") ?? "").toLowerCase().split(/\s+/);
+    if (!rel.includes("alternate")) continue;
+    const type = normalizeContentType(link.getAttribute("type") ?? "");
+    const isAccepted = accepted.some((value) => type === value) || format === "json" && type.endsWith("+json");
+    if (!isAccepted) continue;
+    const href = link.getAttribute("href");
+    if (!href) continue;
+    try {
+      const target = new URL(href, baseUrl).toString();
+      if (target !== baseUrl && !candidates.includes(target)) {
+        candidates.push(target);
+      }
+    } catch {
+    }
+  }
+  return candidates;
+}
+function extractClientSideRedirect(body, baseUrl) {
+  const snippet = body.slice(0, 4096);
+  const metaRefreshMatch = snippet.match(
+    /<meta\b[^>]*http-equiv=["']?refresh["']?[^>]*content=["']?([^"'>]*)["']?[^>]*>/i
+  );
+  const refreshContent = metaRefreshMatch?.[1];
+  if (!refreshContent) {
+    return null;
+  }
+  const [delayPart = "", ...rest] = decodeHtmlAttribute(refreshContent).split(";");
+  const delaySeconds = Number.parseFloat(delayPart.trim());
+  const urlMatch = rest.join(";").match(/\burl\s*=\s*(.+)$/i);
+  const rawTarget = urlMatch?.[1]?.trim().replace(/^['"]|['"]$/g, "");
+  if (!rawTarget || !Number.isFinite(delaySeconds) || delaySeconds < 0 || delaySeconds >= 30) {
+    return null;
+  }
+  try {
+    const targetUrl = new URL(rawTarget, baseUrl).toString();
+    return targetUrl === baseUrl ? null : targetUrl;
+  } catch {
+    return null;
+  }
+}
 function buildJsonResult(opts, finalUrl, rawBody, format, maxChars, browser, os) {
   const parsedJson = parseAndFormatJson(rawBody);
   if ("error" in parsedJson) {
@@ -9983,7 +10042,7 @@ function shouldStripReplies(site) {
   return site === "Hacker News" || site.startsWith("r/") || site.startsWith("GitHub - ");
 }
 function createDefuddleFetch(dependencies = runtimeDependencies) {
-  return async function defuddleFetch2(opts, hooks = {}) {
+  async function fetchWithClientRedirects(opts, hooks, clientSideRedirectCount, alternateLinkFallbackCount) {
     const browser = opts.browser ?? DEFAULT_BROWSER;
     const os = opts.os ?? DEFAULT_OS;
     const format = opts.format ?? "markdown";
@@ -10111,9 +10170,46 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
       }
       errorContext.phase = "loading";
       const rawBody = await response.text();
+      const clientSideRedirect = extractClientSideRedirect(rawBody, finalUrl);
+      if (clientSideRedirect) {
+        if (clientSideRedirectCount >= MAX_CLIENT_SIDE_REDIRECTS) {
+          return {
+            error: `Client-side redirect limit (${MAX_CLIENT_SIDE_REDIRECTS}) exceeded while fetching ${opts.url}.`,
+            code: "too_many_redirects",
+            phase: "loading",
+            retryable: false,
+            timeoutMs,
+            url: opts.url,
+            finalUrl,
+            mimeType: normalizeContentType(contentType) || void 0,
+            contentLength: errorContext.contentLength
+          };
+        }
+        return fetchWithClientRedirects(
+          { ...opts, url: clientSideRedirect },
+          hooks,
+          clientSideRedirectCount + 1,
+          alternateLinkFallbackCount
+        );
+      }
       const jsonResponse = isJsonResponse(contentType, rawBody);
       if (format === "json") {
         if (!jsonResponse) {
+          if (HTML_CONTENT_TYPES.some((value) => contentType.includes(value))) {
+            const alternateLinks2 = extractQualifiedAlternateLinks(
+              parseLinkedomHTML(rawBody, finalUrl),
+              finalUrl,
+              format
+            );
+            if (alternateLinks2.length > 0 && alternateLinkFallbackCount < MAX_ALTERNATE_LINK_FALLBACKS) {
+              return fetchWithClientRedirects(
+                { ...opts, url: alternateLinks2[0] },
+                hooks,
+                clientSideRedirectCount,
+                alternateLinkFallbackCount + 1
+              );
+            }
+          }
           return {
             error: `Not a JSON response (content-type: ${contentType})`,
             code: "unexpected_response",
@@ -10205,6 +10301,22 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
       });
       const fallbackDocument = parseLinkedomHTML(rawBody, finalUrl);
       const extractionDocument = parseLinkedomHTML(rawBody, finalUrl);
+      const alternateLinks = extractQualifiedAlternateLinks(
+        fallbackDocument,
+        finalUrl,
+        format
+      );
+      const tryAlternateLinkFallback = async () => {
+        if (alternateLinks.length === 0 || alternateLinkFallbackCount >= MAX_ALTERNATE_LINK_FALLBACKS) {
+          return null;
+        }
+        return fetchWithClientRedirects(
+          { ...opts, url: alternateLinks[0] },
+          hooks,
+          clientSideRedirectCount,
+          alternateLinkFallbackCount + 1
+        );
+      };
       let extracted;
       const suppressedErrors = [];
       try {
@@ -10265,6 +10377,8 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
       if (!extractedContent || wordCount === 0) {
         const fallbackText = extractDomTextFallback(fallbackDocument);
         if (!fallbackText) {
+          const alternateResult = await tryAlternateLinkFallback();
+          if (alternateResult) return alternateResult;
           return {
             error: `No content extracted from ${opts.url}. May need JS rendering or is blocked.`,
             code: "no_content",
@@ -10280,6 +10394,13 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
         extractedContent = format === "html" ? rawBody : format === "markdown" ? extractDomMarkdownFallback(fallbackDocument) || fallbackText : fallbackText;
         wordCount = estimateWordCount(fallbackText);
       }
+      const extractedTextWordCount = estimateWordCount(
+        format === "text" ? extractedContent : markdownToText(extractedContent)
+      );
+      if (Math.min(wordCount, extractedTextWordCount) < MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK && alternateLinks.length > 0) {
+        const alternateResult = await tryAlternateLinkFallback();
+        if (alternateResult) return alternateResult;
+      }
       if (includeReplies === false && shouldStripReplies(extracted.site ?? "")) {
         const strippedContent = stripExtractorComments(
           extractedContent,
@@ -10316,6 +10437,9 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
       emitProgress(hooks, { status: "error", progress: 1, phase: "error" });
       return fetchError;
     }
+  }
+  return function defuddleFetch2(opts, hooks = {}) {
+    return fetchWithClientRedirects(opts, hooks, 0, 0);
   };
 }
 var defuddleFetch = createDefuddleFetch();