pi-smart-fetch 0.2.33 → 0.2.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/dist/index.js +52 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
- 🧹 **Defuddle extraction** — clean readable content instead of noisy HTML
|
|
11
11
|
- 🧠 **Useful metadata** — title, author, site, language, published date when available
|
|
12
12
|
- 📦 **Downloads + large file support** — stream attachments and binaries to temp files
|
|
13
|
+
- 🔁 **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
|
|
13
14
|
- ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
|
|
14
15
|
- 📄 **Multiple output formats** — `markdown`, `html`, `text`, `json`
|
|
15
16
|
|
package/dist/index.js
CHANGED
|
@@ -9406,6 +9406,7 @@ var HTML_CONTENT_TYPES = [
|
|
|
9406
9406
|
"text/plain",
|
|
9407
9407
|
"text/markdown"
|
|
9408
9408
|
];
|
|
9409
|
+
// Upper bound on how many client-side (meta refresh) redirects a single fetch
// may follow; once the count reaches this value the fetch returns a
// "too_many_redirects" error instead of following another hop.
var MAX_CLIENT_SIDE_REDIRECTS = 5;
|
|
9409
9410
|
/**
 * Reduces a Content-Type header value to its bare, lower-cased media type.
 *
 * Everything from the first ";" onwards (charset, boundary, ...) is dropped
 * and surrounding whitespace is trimmed.
 *
 * @param {string | null | undefined} contentType - Raw header value. A missing
 *   header (`null`/`undefined`) is treated like an empty string instead of
 *   throwing.
 * @returns {string} The media type such as "text/html", or "" when none is present.
 */
function normalizeContentType(contentType) {
  // split() always yields at least one element, so the default only documents intent.
  const [mediaType = ""] = (contentType ?? "").split(";");
  return mediaType.trim().toLowerCase();
}
|
|
@@ -9904,6 +9905,32 @@ function isLikelyJsonBody(body) {
|
|
|
9904
9905
|
/**
 * Decides whether a response should be handled as JSON.
 *
 * The content type is consulted first via `isJsonContentType`; the body is
 * only inspected via `isLikelyJsonBody` when that first check is falsy.
 *
 * @param {string} contentType - Content-Type value of the response.
 * @param {string} body - Response body text.
 * @returns {*} The first truthy check result, otherwise the body check result.
 */
function isJsonResponse(contentType, body) {
  const declaredAsJson = isJsonContentType(contentType);
  if (declaredAsJson) {
    return declaredAsJson;
  }
  return isLikelyJsonBody(body);
}
|
|
9908
|
+
/**
 * Decodes the small set of HTML character references that typically occur in
 * attribute values: `&amp;`, `&quot;`, `&#39;`, `&apos;`, `&lt;` and `&gt;`
 * (matched case-insensitively).
 *
 * All references are replaced in a single pass, so text produced by one
 * replacement is never decoded a second time (`&amp;lt;` becomes `&lt;`,
 * not `<`). Any other character reference is left untouched.
 *
 * @param {string} value - Raw attribute value as it appears in the markup.
 * @returns {string} The value with the supported references decoded.
 */
function decodeHtmlAttribute(value) {
  const replacements = {
    amp: "&",
    quot: '"',
    "#39": "'",
    apos: "'",
    lt: "<",
    gt: ">"
  };
  return value.replace(
    /&(amp|quot|#39|apos|lt|gt);/gi,
    (_reference, name) => replacements[name.toLowerCase()]
  );
}
|
|
9911
|
+
/**
 * Looks for a `<meta http-equiv="refresh">` redirect near the top of an HTML
 * document and resolves its target against the page URL.
 *
 * Only the first 4096 characters of `body` are scanned, and only the first
 * meta tag whose `http-equiv` equals "refresh" is considered. The attributes
 * may appear in either order, and may be double-quoted, single-quoted or
 * unquoted; a quoted target inside the content value
 * (`content="0; url='https://example.org/'"`) is supported.
 *
 * @param {string} body - Response body text.
 * @param {string} baseUrl - URL the body was loaded from; used to resolve
 *   relative targets and to detect a refresh that points back at the same page.
 * @returns {string | null} Absolute redirect URL, or `null` when there is no
 *   usable refresh: no tag, no `url=` part, a delay that is not a finite
 *   number in [0, 30), an unparsable URL, or a target equal to the current
 *   page (ignoring the fragment).
 */
function extractClientSideRedirect(body, baseUrl) {
  // Reads one attribute from a single tag; handles "…", '…' and bare values.
  const readAttribute = (tag, name) => {
    const match = tag.match(
      new RegExp(`[\\s"'/]${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)'|([^\\s"'>]+))`, "i")
    );
    return match?.[1] ?? match?.[2] ?? match?.[3];
  };

  const snippet = body.slice(0, 4096);
  const metaTags = snippet.match(/<meta\b[^>]*>/gi) ?? [];
  const refreshTag = metaTags.find(
    (tag) => readAttribute(tag, "http-equiv")?.trim().toLowerCase() === "refresh"
  );
  const refreshContent = refreshTag ? readAttribute(refreshTag, "content") : undefined;
  if (!refreshContent) {
    return null;
  }

  // Content has the shape "<delay>; url=<target>". The target may itself contain
  // ";", so everything after the first ";" is joined back together.
  const [delayPart = "", ...rest] = decodeHtmlAttribute(refreshContent).split(";");
  const delaySeconds = Number.parseFloat(delayPart.trim());
  const urlMatch = rest.join(";").match(/\burl\s*=\s*(.+)$/i);
  const rawTarget = urlMatch?.[1]?.trim().replace(/^['"]|['"]$/g, "");

  // NOTE(review): the 30-second cap presumably filters out periodic
  // auto-refresh pages rather than genuine redirects — confirm intent.
  if (!rawTarget || !Number.isFinite(delaySeconds) || delaySeconds < 0 || delaySeconds >= 30) {
    return null;
  }

  try {
    const target = new URL(rawTarget, baseUrl);
    const current = new URL(baseUrl);
    const targetUrl = target.toString();
    // Compare normalised URLs without fragments: re-fetching the same resource
    // would return the same refresh tag and only burn through the redirect limit.
    target.hash = "";
    current.hash = "";
    return target.toString() === current.toString() ? null : targetUrl;
  } catch {
    // Unparsable target or base URL: treat as "no redirect".
    return null;
  }
}
|
|
9907
9934
|
function buildJsonResult(opts, finalUrl, rawBody, format, maxChars, browser, os) {
|
|
9908
9935
|
const parsedJson = parseAndFormatJson(rawBody);
|
|
9909
9936
|
if ("error" in parsedJson) {
|
|
@@ -9983,7 +10010,7 @@ function shouldStripReplies(site) {
|
|
|
9983
10010
|
return site === "Hacker News" || site.startsWith("r/") || site.startsWith("GitHub - ");
|
|
9984
10011
|
}
|
|
9985
10012
|
function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
9986
|
-
|
|
10013
|
+
async function fetchWithClientRedirects(opts, hooks, clientSideRedirectCount) {
|
|
9987
10014
|
const browser = opts.browser ?? DEFAULT_BROWSER;
|
|
9988
10015
|
const os = opts.os ?? DEFAULT_OS;
|
|
9989
10016
|
const format = opts.format ?? "markdown";
|
|
@@ -10111,6 +10138,27 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10111
10138
|
}
|
|
10112
10139
|
errorContext.phase = "loading";
|
|
10113
10140
|
const rawBody = await response.text();
|
|
10141
|
+
const clientSideRedirect = extractClientSideRedirect(rawBody, finalUrl);
|
|
10142
|
+
if (clientSideRedirect) {
|
|
10143
|
+
if (clientSideRedirectCount >= MAX_CLIENT_SIDE_REDIRECTS) {
|
|
10144
|
+
return {
|
|
10145
|
+
error: `Client-side redirect limit (${MAX_CLIENT_SIDE_REDIRECTS}) exceeded while fetching ${opts.url}.`,
|
|
10146
|
+
code: "too_many_redirects",
|
|
10147
|
+
phase: "loading",
|
|
10148
|
+
retryable: false,
|
|
10149
|
+
timeoutMs,
|
|
10150
|
+
url: opts.url,
|
|
10151
|
+
finalUrl,
|
|
10152
|
+
mimeType: normalizeContentType(contentType) || void 0,
|
|
10153
|
+
contentLength: errorContext.contentLength
|
|
10154
|
+
};
|
|
10155
|
+
}
|
|
10156
|
+
return fetchWithClientRedirects(
|
|
10157
|
+
{ ...opts, url: clientSideRedirect },
|
|
10158
|
+
hooks,
|
|
10159
|
+
clientSideRedirectCount + 1
|
|
10160
|
+
);
|
|
10161
|
+
}
|
|
10114
10162
|
const jsonResponse = isJsonResponse(contentType, rawBody);
|
|
10115
10163
|
if (format === "json") {
|
|
10116
10164
|
if (!jsonResponse) {
|
|
@@ -10316,6 +10364,9 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10316
10364
|
emitProgress(hooks, { status: "error", progress: 1, phase: "error" });
|
|
10317
10365
|
return fetchError;
|
|
10318
10366
|
}
|
|
10367
|
+
}
|
|
10368
|
+
return function defuddleFetch2(opts, hooks = {}) {
|
|
10369
|
+
return fetchWithClientRedirects(opts, hooks, 0);
|
|
10319
10370
|
};
|
|
10320
10371
|
}
|
|
10321
10372
|
// Fetch function built by createDefuddleFetch() with no arguments, i.e. using
// the factory's default `runtimeDependencies`.
var defuddleFetch = createDefuddleFetch();
|