pi-smart-fetch 0.2.32 → 0.2.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -10,6 +10,7 @@
10
10
  - 🧹 **Defuddle extraction** — clean readable content instead of noisy HTML
11
11
  - 🧠 **Useful metadata** — title, author, site, language, published date when available
12
12
  - 📦 **Downloads + large file support** — stream attachments and binaries to temp files
13
+ - 🔁 **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
13
14
  - ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
14
15
  - 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`
15
16
 
package/dist/index.js CHANGED
@@ -9406,6 +9406,7 @@ var HTML_CONTENT_TYPES = [
9406
9406
  "text/plain",
9407
9407
  "text/markdown"
9408
9408
  ];
9409
+ var MAX_CLIENT_SIDE_REDIRECTS = 5;
9409
9410
  function normalizeContentType(contentType) {
9410
9411
  return contentType.split(";")[0]?.trim().toLowerCase() ?? "";
9411
9412
  }
@@ -9904,6 +9905,32 @@ function isLikelyJsonBody(body) {
9904
9905
  function isJsonResponse(contentType, body) {
9905
9906
  return isJsonContentType(contentType) || isLikelyJsonBody(body);
9906
9907
  }
9908
+ function decodeHtmlAttribute(value) {
9909
+ return value.replace(/&amp;/gi, "&").replace(/&quot;/gi, '"').replace(/&#39;|&apos;/gi, "'").replace(/&lt;/gi, "<").replace(/&gt;/gi, ">");
9910
+ }
9911
+ function extractClientSideRedirect(body, baseUrl) {
9912
+ const snippet = body.slice(0, 4096);
9913
+ const metaRefreshMatch = snippet.match(
9914
+ /<meta\b[^>]*http-equiv=["']?refresh["']?[^>]*content=["']?([^"'>]*)["']?[^>]*>/i
9915
+ );
9916
+ const refreshContent = metaRefreshMatch?.[1];
9917
+ if (!refreshContent) {
9918
+ return null;
9919
+ }
9920
+ const [delayPart = "", ...rest] = decodeHtmlAttribute(refreshContent).split(";");
9921
+ const delaySeconds = Number.parseFloat(delayPart.trim());
9922
+ const urlMatch = rest.join(";").match(/\burl\s*=\s*(.+)$/i);
9923
+ const rawTarget = urlMatch?.[1]?.trim().replace(/^['"]|['"]$/g, "");
9924
+ if (!rawTarget || !Number.isFinite(delaySeconds) || delaySeconds < 0 || delaySeconds >= 30) {
9925
+ return null;
9926
+ }
9927
+ try {
9928
+ const targetUrl = new URL(rawTarget, baseUrl).toString();
9929
+ return targetUrl === baseUrl ? null : targetUrl;
9930
+ } catch {
9931
+ return null;
9932
+ }
9933
+ }
9907
9934
  function buildJsonResult(opts, finalUrl, rawBody, format, maxChars, browser, os) {
9908
9935
  const parsedJson = parseAndFormatJson(rawBody);
9909
9936
  if ("error" in parsedJson) {
@@ -9983,7 +10010,7 @@ function shouldStripReplies(site) {
9983
10010
  return site === "Hacker News" || site.startsWith("r/") || site.startsWith("GitHub - ");
9984
10011
  }
9985
10012
  function createDefuddleFetch(dependencies = runtimeDependencies) {
9986
- return async function defuddleFetch2(opts, hooks = {}) {
10013
+ async function fetchWithClientRedirects(opts, hooks, clientSideRedirectCount) {
9987
10014
  const browser = opts.browser ?? DEFAULT_BROWSER;
9988
10015
  const os = opts.os ?? DEFAULT_OS;
9989
10016
  const format = opts.format ?? "markdown";
@@ -10111,6 +10138,27 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10111
10138
  }
10112
10139
  errorContext.phase = "loading";
10113
10140
  const rawBody = await response.text();
10141
+ const clientSideRedirect = extractClientSideRedirect(rawBody, finalUrl);
10142
+ if (clientSideRedirect) {
10143
+ if (clientSideRedirectCount >= MAX_CLIENT_SIDE_REDIRECTS) {
10144
+ return {
10145
+ error: `Client-side redirect limit (${MAX_CLIENT_SIDE_REDIRECTS}) exceeded while fetching ${opts.url}.`,
10146
+ code: "too_many_redirects",
10147
+ phase: "loading",
10148
+ retryable: false,
10149
+ timeoutMs,
10150
+ url: opts.url,
10151
+ finalUrl,
10152
+ mimeType: normalizeContentType(contentType) || void 0,
10153
+ contentLength: errorContext.contentLength
10154
+ };
10155
+ }
10156
+ return fetchWithClientRedirects(
10157
+ { ...opts, url: clientSideRedirect },
10158
+ hooks,
10159
+ clientSideRedirectCount + 1
10160
+ );
10161
+ }
10114
10162
  const jsonResponse = isJsonResponse(contentType, rawBody);
10115
10163
  if (format === "json") {
10116
10164
  if (!jsonResponse) {
@@ -10244,7 +10292,7 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10244
10292
  fallbackDocument,
10245
10293
  opts.url
10246
10294
  );
10247
- if (hasOembed404 || hasJsDisabledShell) {
10295
+ if ((hasOembed404 || hasJsDisabledShell) && !extracted.content) {
10248
10296
  return {
10249
10297
  error: `Server returned HTTP 404 Not Found for ${opts.url}.`,
10250
10298
  code: "http_error",
@@ -10316,6 +10364,9 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
10316
10364
  emitProgress(hooks, { status: "error", progress: 1, phase: "error" });
10317
10365
  return fetchError;
10318
10366
  }
10367
+ }
10368
+ return function defuddleFetch2(opts, hooks = {}) {
10369
+ return fetchWithClientRedirects(opts, hooks, 0);
10319
10370
  };
10320
10371
  }
10321
10372
  var defuddleFetch = createDefuddleFetch();