pi-smart-fetch 0.2.33 → 0.2.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -10,6 +10,8 @@
10
10
  - 🧹 **Defuddle extraction** — clean readable content instead of noisy HTML
11
11
  - 🧠 **Useful metadata** — title, author, site, language, published date when available
12
12
  - 📦 **Downloads + large file support** — stream attachments and binaries to temp files
13
+ - 🔁 **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
14
+ - 🔗 **Alternate content fallback** — when extraction produces no/thin content, follows qualified `<link rel="alternate" type="...">` entries in `<head>` that match the requested output format
13
15
  - ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
14
16
  - 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`
15
17
 
@@ -28,6 +30,7 @@ This package works on general web pages, but some site types benefit especially
28
30
  Notes:
29
31
  - Defuddle is the cleanup layer: it strips common page chrome like nav, sidebars, related links, share widgets, and footers
30
32
  - It does **not** execute JavaScript or solve interactive anti-bot/login flows
33
+ - If an HTML shell advertises alternate content in `<head>`, smart-fetch can follow matching alternates such as `text/markdown`, `text/plain`, `text/html`, or JSON media types according to the requested `format`
31
34
 
32
35
  ## Install
33
36
 
package/dist/index.js CHANGED
@@ -9406,6 +9406,9 @@ var HTML_CONTENT_TYPES = [
9406
9406
  "text/plain",
9407
9407
  "text/markdown"
9408
9408
  ];
9409
+ var MAX_CLIENT_SIDE_REDIRECTS = 5;
9410
+ var MAX_ALTERNATE_LINK_FALLBACKS = 3;
9411
+ var MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK = 30;
9409
9412
  function normalizeContentType(contentType) {
9410
9413
  return contentType.split(";")[0]?.trim().toLowerCase() ?? "";
9411
9414
  }
@@ -9904,6 +9907,62 @@ function isLikelyJsonBody(body) {
9904
9907
  function isJsonResponse(contentType, body) {
9905
9908
  return isJsonContentType(contentType) || isLikelyJsonBody(body);
9906
9909
  }
9910
+ function decodeHtmlAttribute(value) {
9911
+ return value.replace(/&amp;/gi, "&").replace(/&quot;/gi, '"').replace(/&#39;|&apos;/gi, "'").replace(/&lt;/gi, "<").replace(/&gt;/gi, ">");
9912
+ }
9913
+ function extractQualifiedAlternateLinks(document, baseUrl, format) {
9914
+ const acceptedTypes = {
9915
+ markdown: ["text/markdown", "text/x-markdown"],
9916
+ text: ["text/plain", "text/markdown", "text/x-markdown"],
9917
+ html: ["text/html", "application/xhtml+xml"],
9918
+ json: ["application/json", "text/json"]
9919
+ };
9920
+ const accepted = acceptedTypes[format];
9921
+ const head = document.head;
9922
+ if (!head) return [];
9923
+ const links = Array.from(head.querySelectorAll("link"));
9924
+ const candidates = [];
9925
+ for (const link of links) {
9926
+ const rel = (link.getAttribute("rel") ?? "").toLowerCase().split(/\s+/);
9927
+ if (!rel.includes("alternate")) continue;
9928
+ const type = normalizeContentType(link.getAttribute("type") ?? "");
9929
+ const isAccepted = accepted.some((value) => type === value) || format === "json" && type.endsWith("+json");
9930
+ if (!isAccepted) continue;
9931
+ const href = link.getAttribute("href");
9932
+ if (!href) continue;
9933
+ try {
9934
+ const target = new URL(href, baseUrl).toString();
9935
+ if (target !== baseUrl && !candidates.includes(target)) {
9936
+ candidates.push(target);
9937
+ }
9938
+ } catch {
9939
+ }
9940
+ }
9941
+ return candidates;
9942
+ }
9943
+ function extractClientSideRedirect(body, baseUrl) {
9944
+ const snippet = body.slice(0, 4096);
9945
+ const metaRefreshMatch = snippet.match(
9946
+ /<meta\b[^>]*http-equiv=["']?refresh["']?[^>]*content=["']?([^"'>]*)["']?[^>]*>/i
9947
+ );
9948
+ const refreshContent = metaRefreshMatch?.[1];
9949
+ if (!refreshContent) {
9950
+ return null;
9951
+ }
9952
+ const [delayPart = "", ...rest] = decodeHtmlAttribute(refreshContent).split(";");
9953
+ const delaySeconds = Number.parseFloat(delayPart.trim());
9954
+ const urlMatch = rest.join(";").match(/\burl\s*=\s*(.+)$/i);
9955
+ const rawTarget = urlMatch?.[1]?.trim().replace(/^['"]|['"]$/g, "");
9956
+ if (!rawTarget || !Number.isFinite(delaySeconds) || delaySeconds < 0 || delaySeconds >= 30) {
9957
+ return null;
9958
+ }
9959
+ try {
9960
+ const targetUrl = new URL(rawTarget, baseUrl).toString();
9961
+ return targetUrl === baseUrl ? null : targetUrl;
9962
+ } catch {
9963
+ return null;
9964
+ }
9965
+ }
9907
9966
  function buildJsonResult(opts, finalUrl, rawBody, format, maxChars, browser, os) {
9908
9967
  const parsedJson = parseAndFormatJson(rawBody);
9909
9968
  if ("error" in parsedJson) {
@@ -9983,7 +10042,7 @@ function shouldStripReplies(site) {
9983
10042
  return site === "Hacker News" || site.startsWith("r/") || site.startsWith("GitHub - ");
9984
10043
  }
9985
10044
  function createDefuddleFetch(dependencies = runtimeDependencies) {
9986
- return async function defuddleFetch2(opts, hooks = {}) {
10045
+ async function fetchWithClientRedirects(opts, hooks, clientSideRedirectCount, alternateLinkFallbackCount) {
9987
10046
  const browser = opts.browser ?? DEFAULT_BROWSER;
9988
10047
  const os = opts.os ?? DEFAULT_OS;
9989
10048
  const format = opts.format ?? "markdown";
@@ -10111,9 +10170,46 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10111
10170
  }
10112
10171
  errorContext.phase = "loading";
10113
10172
  const rawBody = await response.text();
10173
+ const clientSideRedirect = extractClientSideRedirect(rawBody, finalUrl);
10174
+ if (clientSideRedirect) {
10175
+ if (clientSideRedirectCount >= MAX_CLIENT_SIDE_REDIRECTS) {
10176
+ return {
10177
+ error: `Client-side redirect limit (${MAX_CLIENT_SIDE_REDIRECTS}) exceeded while fetching ${opts.url}.`,
10178
+ code: "too_many_redirects",
10179
+ phase: "loading",
10180
+ retryable: false,
10181
+ timeoutMs,
10182
+ url: opts.url,
10183
+ finalUrl,
10184
+ mimeType: normalizeContentType(contentType) || void 0,
10185
+ contentLength: errorContext.contentLength
10186
+ };
10187
+ }
10188
+ return fetchWithClientRedirects(
10189
+ { ...opts, url: clientSideRedirect },
10190
+ hooks,
10191
+ clientSideRedirectCount + 1,
10192
+ alternateLinkFallbackCount
10193
+ );
10194
+ }
10114
10195
  const jsonResponse = isJsonResponse(contentType, rawBody);
10115
10196
  if (format === "json") {
10116
10197
  if (!jsonResponse) {
10198
+ if (HTML_CONTENT_TYPES.some((value) => contentType.includes(value))) {
10199
+ const alternateLinks2 = extractQualifiedAlternateLinks(
10200
+ parseLinkedomHTML(rawBody, finalUrl),
10201
+ finalUrl,
10202
+ format
10203
+ );
10204
+ if (alternateLinks2.length > 0 && alternateLinkFallbackCount < MAX_ALTERNATE_LINK_FALLBACKS) {
10205
+ return fetchWithClientRedirects(
10206
+ { ...opts, url: alternateLinks2[0] },
10207
+ hooks,
10208
+ clientSideRedirectCount,
10209
+ alternateLinkFallbackCount + 1
10210
+ );
10211
+ }
10212
+ }
10117
10213
  return {
10118
10214
  error: `Not a JSON response (content-type: ${contentType})`,
10119
10215
  code: "unexpected_response",
@@ -10205,6 +10301,22 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10205
10301
  });
10206
10302
  const fallbackDocument = parseLinkedomHTML(rawBody, finalUrl);
10207
10303
  const extractionDocument = parseLinkedomHTML(rawBody, finalUrl);
10304
+ const alternateLinks = extractQualifiedAlternateLinks(
10305
+ fallbackDocument,
10306
+ finalUrl,
10307
+ format
10308
+ );
10309
+ const tryAlternateLinkFallback = async () => {
10310
+ if (alternateLinks.length === 0 || alternateLinkFallbackCount >= MAX_ALTERNATE_LINK_FALLBACKS) {
10311
+ return null;
10312
+ }
10313
+ return fetchWithClientRedirects(
10314
+ { ...opts, url: alternateLinks[0] },
10315
+ hooks,
10316
+ clientSideRedirectCount,
10317
+ alternateLinkFallbackCount + 1
10318
+ );
10319
+ };
10208
10320
  let extracted;
10209
10321
  const suppressedErrors = [];
10210
10322
  try {
@@ -10265,6 +10377,8 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10265
10377
  if (!extractedContent || wordCount === 0) {
10266
10378
  const fallbackText = extractDomTextFallback(fallbackDocument);
10267
10379
  if (!fallbackText) {
10380
+ const alternateResult = await tryAlternateLinkFallback();
10381
+ if (alternateResult) return alternateResult;
10268
10382
  return {
10269
10383
  error: `No content extracted from ${opts.url}. May need JS rendering or is blocked.`,
10270
10384
  code: "no_content",
@@ -10280,6 +10394,13 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10280
10394
  extractedContent = format === "html" ? rawBody : format === "markdown" ? extractDomMarkdownFallback(fallbackDocument) || fallbackText : fallbackText;
10281
10395
  wordCount = estimateWordCount(fallbackText);
10282
10396
  }
10397
+ const extractedTextWordCount = estimateWordCount(
10398
+ format === "text" ? extractedContent : markdownToText(extractedContent)
10399
+ );
10400
+ if (Math.min(wordCount, extractedTextWordCount) < MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK && alternateLinks.length > 0) {
10401
+ const alternateResult = await tryAlternateLinkFallback();
10402
+ if (alternateResult) return alternateResult;
10403
+ }
10283
10404
  if (includeReplies === false && shouldStripReplies(extracted.site ?? "")) {
10284
10405
  const strippedContent = stripExtractorComments(
10285
10406
  extractedContent,
@@ -10316,6 +10437,9 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10316
10437
  emitProgress(hooks, { status: "error", progress: 1, phase: "error" });
10317
10438
  return fetchError;
10318
10439
  }
10440
+ }
10441
+ return function defuddleFetch2(opts, hooks = {}) {
10442
+ return fetchWithClientRedirects(opts, hooks, 0, 0);
10319
10443
  };
10320
10444
  }
10321
10445
  var defuddleFetch = createDefuddleFetch();