pi-smart-fetch 0.2.34 → 0.2.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,6 +11,7 @@
11
11
  - 🧠 **Useful metadata** — title, author, site, language, published date when available
12
12
  - 📦 **Downloads + large file support** — stream attachments and binaries to temp files
13
13
  - 🔁 **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
14
+ - 🔗 **Alternate content fallback** — when extraction produces no/thin content, follows qualified `<link rel="alternate" type="...">` entries in `<head>` that match the requested output format
14
15
  - ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
15
16
  - 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`
16
17
 
@@ -29,6 +30,7 @@ This package works on general web pages, but some site types benefit especially
29
30
  Notes:
30
31
  - Defuddle is the cleanup layer: it strips common page chrome like nav, sidebars, related links, share widgets, and footers
31
32
  - It does **not** execute JavaScript or solve interactive anti-bot/login flows
33
+ - If an HTML shell advertises alternate content in `<head>`, smart-fetch can follow matching alternates such as `text/markdown`, `text/plain`, `text/html`, or JSON media types according to the requested `format`
32
34
 
33
35
  ## Install
34
36
 
package/dist/index.js CHANGED
@@ -9407,6 +9407,8 @@ var HTML_CONTENT_TYPES = [
9407
9407
  "text/markdown"
9408
9408
  ];
9409
9409
  var MAX_CLIENT_SIDE_REDIRECTS = 5;
9410
+ var MAX_ALTERNATE_LINK_FALLBACKS = 3;
9411
+ var MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK = 30;
9410
9412
  function normalizeContentType(contentType) {
9411
9413
  return contentType.split(";")[0]?.trim().toLowerCase() ?? "";
9412
9414
  }
@@ -9908,6 +9910,36 @@ function isJsonResponse(contentType, body) {
9908
9910
  function decodeHtmlAttribute(value) {
9909
9911
  return value.replace(/&amp;/gi, "&").replace(/&quot;/gi, '"').replace(/&#39;|&apos;/gi, "'").replace(/&lt;/gi, "<").replace(/&gt;/gi, ">");
9910
9912
  }
9913
/**
 * Collect the URLs of `<link rel="alternate" type="...">` entries in a
 * document's `<head>` whose media type matches the requested output format.
 *
 * @param {Document} document - Parsed document; only `document.head` is read.
 * @param {string} baseUrl - URL of the page, used to resolve relative hrefs
 *   and to discard links that point back at the page itself.
 * @param {string} format - Requested output format: "markdown", "text",
 *   "html" or "json". Any other value yields an empty list.
 * @returns {string[]} Absolute candidate URLs in document order, de-duplicated.
 */
function extractQualifiedAlternateLinks(document, baseUrl, format) {
  const acceptedTypes = {
    markdown: ["text/markdown", "text/x-markdown"],
    text: ["text/plain", "text/markdown", "text/x-markdown"],
    html: ["text/html", "application/xhtml+xml"],
    json: ["application/json", "text/json"]
  };
  const accepted = acceptedTypes[format];
  // Unknown formats (and inherited keys such as "constructor") have no
  // qualifying media types; return early instead of throwing on `.includes`.
  if (!Array.isArray(accepted)) return [];
  const head = document.head;
  if (!head) return [];

  // Normalise the page URL the same way candidates are normalised, ignoring
  // the fragment, so "https://example.com" vs "https://example.com/" and
  // "#section"-only hrefs are recognised as the page itself.
  let selfUrl;
  try {
    const self = new URL(baseUrl);
    self.hash = "";
    selfUrl = self.toString();
  } catch {
    // Unparseable base: fall back to comparing against the raw string.
    selfUrl = baseUrl;
  }

  const candidates = [];
  for (const link of Array.from(head.querySelectorAll("link"))) {
    const rel = (link.getAttribute("rel") ?? "").toLowerCase().split(/\s+/);
    if (!rel.includes("alternate")) continue;
    const type = normalizeContentType(link.getAttribute("type") ?? "");
    // JSON additionally accepts structured-syntax suffixes such as "application/ld+json".
    const isAccepted = accepted.includes(type) || (format === "json" && type.endsWith("+json"));
    if (!isAccepted) continue;
    const href = link.getAttribute("href");
    if (!href) continue;
    let resolved;
    try {
      resolved = new URL(href, baseUrl);
    } catch {
      // Best effort: a malformed href only disqualifies that one link.
      continue;
    }
    const target = resolved.toString();
    resolved.hash = "";
    if (resolved.toString() === selfUrl || candidates.includes(target)) continue;
    candidates.push(target);
  }
  return candidates;
}
9911
9943
  function extractClientSideRedirect(body, baseUrl) {
9912
9944
  const snippet = body.slice(0, 4096);
9913
9945
  const metaRefreshMatch = snippet.match(
@@ -10010,7 +10042,7 @@ function shouldStripReplies(site) {
10010
10042
  return site === "Hacker News" || site.startsWith("r/") || site.startsWith("GitHub - ");
10011
10043
  }
10012
10044
  function createDefuddleFetch(dependencies = runtimeDependencies) {
10013
- async function fetchWithClientRedirects(opts, hooks, clientSideRedirectCount) {
10045
+ async function fetchWithClientRedirects(opts, hooks, clientSideRedirectCount, alternateLinkFallbackCount) {
10014
10046
  const browser = opts.browser ?? DEFAULT_BROWSER;
10015
10047
  const os = opts.os ?? DEFAULT_OS;
10016
10048
  const format = opts.format ?? "markdown";
@@ -10156,12 +10188,28 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10156
10188
  return fetchWithClientRedirects(
10157
10189
  { ...opts, url: clientSideRedirect },
10158
10190
  hooks,
10159
- clientSideRedirectCount + 1
10191
+ clientSideRedirectCount + 1,
10192
+ alternateLinkFallbackCount
10160
10193
  );
10161
10194
  }
10162
10195
  const jsonResponse = isJsonResponse(contentType, rawBody);
10163
10196
  if (format === "json") {
10164
10197
  if (!jsonResponse) {
10198
+ if (HTML_CONTENT_TYPES.some((value) => contentType.includes(value))) {
10199
+ const alternateLinks2 = extractQualifiedAlternateLinks(
10200
+ parseLinkedomHTML(rawBody, finalUrl),
10201
+ finalUrl,
10202
+ format
10203
+ );
10204
+ if (alternateLinks2.length > 0 && alternateLinkFallbackCount < MAX_ALTERNATE_LINK_FALLBACKS) {
10205
+ return fetchWithClientRedirects(
10206
+ { ...opts, url: alternateLinks2[0] },
10207
+ hooks,
10208
+ clientSideRedirectCount,
10209
+ alternateLinkFallbackCount + 1
10210
+ );
10211
+ }
10212
+ }
10165
10213
  return {
10166
10214
  error: `Not a JSON response (content-type: ${contentType})`,
10167
10215
  code: "unexpected_response",
@@ -10253,6 +10301,22 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10253
10301
  });
10254
10302
  const fallbackDocument = parseLinkedomHTML(rawBody, finalUrl);
10255
10303
  const extractionDocument = parseLinkedomHTML(rawBody, finalUrl);
10304
+ const alternateLinks = extractQualifiedAlternateLinks(
10305
+ fallbackDocument,
10306
+ finalUrl,
10307
+ format
10308
+ );
10309
+ const tryAlternateLinkFallback = async () => {
10310
+ if (alternateLinks.length === 0 || alternateLinkFallbackCount >= MAX_ALTERNATE_LINK_FALLBACKS) {
10311
+ return null;
10312
+ }
10313
+ return fetchWithClientRedirects(
10314
+ { ...opts, url: alternateLinks[0] },
10315
+ hooks,
10316
+ clientSideRedirectCount,
10317
+ alternateLinkFallbackCount + 1
10318
+ );
10319
+ };
10256
10320
  let extracted;
10257
10321
  const suppressedErrors = [];
10258
10322
  try {
@@ -10313,6 +10377,8 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10313
10377
  if (!extractedContent || wordCount === 0) {
10314
10378
  const fallbackText = extractDomTextFallback(fallbackDocument);
10315
10379
  if (!fallbackText) {
10380
+ const alternateResult = await tryAlternateLinkFallback();
10381
+ if (alternateResult) return alternateResult;
10316
10382
  return {
10317
10383
  error: `No content extracted from ${opts.url}. May need JS rendering or is blocked.`,
10318
10384
  code: "no_content",
@@ -10328,6 +10394,13 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10328
10394
  extractedContent = format === "html" ? rawBody : format === "markdown" ? extractDomMarkdownFallback(fallbackDocument) || fallbackText : fallbackText;
10329
10395
  wordCount = estimateWordCount(fallbackText);
10330
10396
  }
10397
+ const extractedTextWordCount = estimateWordCount(
10398
+ format === "text" ? extractedContent : markdownToText(extractedContent)
10399
+ );
10400
+ if (Math.min(wordCount, extractedTextWordCount) < MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK && alternateLinks.length > 0) {
10401
+ const alternateResult = await tryAlternateLinkFallback();
10402
+ if (alternateResult) return alternateResult;
10403
+ }
10331
10404
  if (includeReplies === false && shouldStripReplies(extracted.site ?? "")) {
10332
10405
  const strippedContent = stripExtractorComments(
10333
10406
  extractedContent,
@@ -10366,7 +10439,7 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10366
10439
  }
10367
10440
  }
10368
10441
  return function defuddleFetch2(opts, hooks = {}) {
10369
- return fetchWithClientRedirects(opts, hooks, 0);
10442
+ return fetchWithClientRedirects(opts, hooks, 0, 0);
10370
10443
  };
10371
10444
  }
10372
10445
  var defuddleFetch = createDefuddleFetch();