pi-smart-fetch 0.2.33 → 0.2.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/dist/index.js +125 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -10,6 +10,8 @@
|
|
|
10
10
|
- 🧹 **Defuddle extraction** — clean readable content instead of noisy HTML
|
|
11
11
|
- 🧠 **Useful metadata** — title, author, site, language, published date when available
|
|
12
12
|
- 📦 **Downloads + large file support** — stream attachments and binaries to temp files
|
|
13
|
+
- **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
|
|
14
|
+
- **Alternate content fallback** — when extraction produces no/thin content, follows qualified `<link rel="alternate" type="...">` entries in `<head>` that match the requested output format
|
|
13
15
|
- ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
|
|
14
16
|
- **Multiple output formats** — `markdown`, `html`, `text`, `json`
|
|
15
17
|
|
|
@@ -28,6 +30,7 @@ This package works on general web pages, but some site types benefit especially
|
|
|
28
30
|
Notes:
|
|
29
31
|
- Defuddle is the cleanup layer: it strips common page chrome like nav, sidebars, related links, share widgets, and footers
|
|
30
32
|
- It does **not** execute JavaScript or solve interactive anti-bot/login flows
|
|
33
|
+
- If an HTML shell advertises alternate content in `<head>`, smart-fetch can follow matching alternates such as `text/markdown`, `text/plain`, `text/html`, or JSON media types according to the requested `format`
|
|
31
34
|
|
|
32
35
|
## Install
|
|
33
36
|
|
package/dist/index.js
CHANGED
|
@@ -9406,6 +9406,9 @@ var HTML_CONTENT_TYPES = [
|
|
|
9406
9406
|
"text/plain",
|
|
9407
9407
|
"text/markdown"
|
|
9408
9408
|
];
|
|
9409
|
+
// Cap on chained client-side (<meta> refresh) redirects followed for one fetch; once the running count reaches this value the fetch returns a "too_many_redirects" error instead of following another hop.
var MAX_CLIENT_SIDE_REDIRECTS = 5;
|
|
9410
|
+
// Cap on how many times one fetch may hop to a <link rel="alternate"> target; alternates are only followed while the running fallback count is below this value.
var MAX_ALTERNATE_LINK_FALLBACKS = 3;
|
|
9411
|
+
// Word-count threshold for "thin" extractions: when the extracted content has fewer words than this and the page advertises a qualifying alternate link, the alternate is tried.
var MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK = 30;
|
|
9409
9412
|
/**
 * Reduce a Content-Type style value to its bare media type.
 *
 * Everything from the first ";" onwards (parameters such as "charset=utf-8")
 * is dropped, surrounding whitespace is trimmed and the result is lower-cased.
 *
 * @param {string | null | undefined} contentType - Raw header or attribute value.
 * @returns {string} The normalized media type, or "" when no string was supplied.
 */
function normalizeContentType(contentType) {
  // A caller may have no value at all (null/undefined); report "no type"
  // instead of throwing on .split().
  if (typeof contentType !== "string") {
    return "";
  }
  // split() always yields at least one element for a string input.
  const [mediaType] = contentType.split(";");
  return mediaType.trim().toLowerCase();
}
|
|
@@ -9904,6 +9907,62 @@ function isLikelyJsonBody(body) {
|
|
|
9904
9907
|
/**
 * Decide whether a response should be treated as JSON.
 *
 * The declared content type is consulted first; the body is only inspected
 * when the content type does not already identify the response as JSON.
 *
 * @param {string} contentType - Content-Type value of the response.
 * @param {string} body - Response body text.
 * @returns {*} The truthy result of the content-type check, otherwise the result of the body check.
 */
function isJsonResponse(contentType, body) {
  const declaredAsJson = isJsonContentType(contentType);
  if (declaredAsJson) {
    return declaredAsJson;
  }
  return isLikelyJsonBody(body);
}
|
|
9910
|
+
/**
 * Decode the HTML character references that typically occur inside attribute
 * values: the named references &amp; &quot; &apos; &lt; &gt; and numeric
 * references in decimal (&#39;) or hexadecimal (&#x27;) form.
 *
 * Decoding happens in a single pass, so text produced by one replacement is
 * never decoded a second time ("&amp;lt;" becomes "&lt;", not "<").
 * References that are unknown or do not denote a valid Unicode scalar value
 * are left untouched.
 *
 * @param {string} value - Raw attribute text as it appears in the markup.
 * @returns {string} The attribute text with supported references decoded.
 */
function decodeHtmlAttribute(value) {
  const namedReferences = { amp: "&", quot: '"', apos: "'", lt: "<", gt: ">" };
  return value.replace(
    /&(?:#x([0-9a-f]+)|#(\d+)|(amp|quot|apos|lt|gt));/gi,
    (reference, hexDigits, decimalDigits, name) => {
      if (name !== undefined) {
        return namedReferences[name.toLowerCase()];
      }
      const codePoint = hexDigits !== undefined
        ? Number.parseInt(hexDigits, 16)
        : Number.parseInt(decimalDigits, 10);
      const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
      // String.fromCodePoint throws a RangeError above U+10FFFF; NUL and lone
      // surrogates are not meaningful text either, so keep such references as-is.
      if (!(codePoint > 0 && codePoint <= 0x10ffff) || isSurrogate) {
        return reference;
      }
      return String.fromCodePoint(codePoint);
    }
  );
}
|
|
9913
|
+
/**
 * Collect the targets of `<link rel="alternate">` elements in the document
 * head whose declared media type suits the requested output format.
 *
 * A link qualifies when its `rel` token list contains "alternate", its `type`
 * (normalized via normalizeContentType) is accepted for `format`, and its
 * `href` resolves against `baseUrl` to a URL other than the page itself.
 * For the "json" format any "+json" structured-syntax type is accepted too.
 *
 * @param {{ head?: { querySelectorAll(selector: string): ArrayLike<{ getAttribute(name: string): string | null }> } | null }} document
 *   Parsed document; only `head.querySelectorAll("link")` is used.
 * @param {string} baseUrl - Absolute URL of the page, used to resolve relative hrefs.
 * @param {string} format - Requested output format ("markdown", "text", "html" or "json").
 * @returns {string[]} Absolute, de-duplicated candidate URLs in document order;
 *   empty when nothing qualifies, the format is unsupported, or `baseUrl` is not a valid URL.
 */
function extractQualifiedAlternateLinks(document, baseUrl, format) {
  const acceptedTypes = new Map([
    ["markdown", ["text/markdown", "text/x-markdown"]],
    ["text", ["text/plain", "text/markdown", "text/x-markdown"]],
    ["html", ["text/html", "application/xhtml+xml"]],
    ["json", ["application/json", "text/json"]]
  ]);
  // An unsupported format simply has no acceptable types (instead of throwing).
  const accepted = acceptedTypes.get(format) ?? [];
  const head = document.head;
  if (!head) return [];

  // Serialize the page URL the same way candidates are serialized, and ignore
  // the fragment, so that links back to the page itself are reliably detected.
  let pageUrl;
  try {
    const parsedBase = new URL(baseUrl);
    parsedBase.hash = "";
    pageUrl = parsedBase.toString();
  } catch {
    // Without a valid base no href can be resolved, so there are no candidates.
    return [];
  }

  const candidates = new Set();
  for (const link of Array.from(head.querySelectorAll("link"))) {
    const rel = (link.getAttribute("rel") ?? "").toLowerCase().split(/\s+/);
    if (!rel.includes("alternate")) continue;

    const type = normalizeContentType(link.getAttribute("type") ?? "");
    const isAccepted = accepted.includes(type) || (format === "json" && type.endsWith("+json"));
    if (!isAccepted) continue;

    const href = link.getAttribute("href");
    if (!href) continue;

    let target;
    try {
      target = new URL(href, baseUrl);
    } catch {
      // Best effort: a malformed href only disqualifies this one link.
      continue;
    }
    const targetWithoutFragment = new URL(target);
    targetWithoutFragment.hash = "";
    if (targetWithoutFragment.toString() === pageUrl) continue;

    candidates.add(target.toString());
  }
  return [...candidates];
}
|
|
9943
|
+
/**
 * Detect a `<meta http-equiv="refresh">` redirect near the start of an HTML body.
 *
 * Only the first 4096 characters are scanned. The first `<meta>` tag whose
 * `http-equiv` attribute equals "refresh" (case-insensitive) is used, whatever
 * the order or quoting style of its attributes. Its `content` value is
 * entity-decoded with decodeHtmlAttribute and parsed as "<delay>; url=<target>".
 *
 * @param {string} body - Response body text.
 * @param {string} baseUrl - Absolute URL of the page, used to resolve a relative target.
 * @returns {string | null} The absolute redirect target, or null when there is
 *   no refresh directive, it has no URL, its delay is negative, not a finite
 *   number or at least 30 seconds, the target cannot be resolved, or the
 *   target is the page itself.
 */
function extractClientSideRedirect(body, baseUrl) {
  const maxScanChars = 4096;
  const maxDelaySeconds = 30;
  const snippet = body.slice(0, maxScanChars);

  let refreshContent;
  for (const [tag] of snippet.matchAll(/<meta\b[^>]*>/gi)) {
    // Tokenize the tag's attributes so that attribute order does not matter and
    // a value delimited by one quote style may contain the other quote style.
    const attributes = new Map();
    const attributeSource = tag.slice("<meta".length, -1);
    const attributePattern = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
    for (const [, rawName, doubleQuoted, singleQuoted, unquoted] of attributeSource.matchAll(attributePattern)) {
      const name = rawName.toLowerCase();
      // When an attribute is repeated, the first occurrence wins.
      if (!attributes.has(name)) {
        attributes.set(name, doubleQuoted ?? singleQuoted ?? unquoted ?? "");
      }
    }
    if ((attributes.get("http-equiv") ?? "").trim().toLowerCase() === "refresh") {
      refreshContent = attributes.get("content");
      break;
    }
  }
  if (!refreshContent) {
    return null;
  }

  const [delayPart = "", ...rest] = decodeHtmlAttribute(refreshContent).split(";");
  const delaySeconds = Number.parseFloat(delayPart.trim());
  // The target may itself contain ";", so re-join everything after the delay.
  const urlMatch = rest.join(";").match(/\burl\s*=\s*(.+)$/i);
  const rawTarget = urlMatch?.[1]?.trim().replace(/^['"]|['"]$/g, "");
  if (!rawTarget || !Number.isFinite(delaySeconds) || delaySeconds < 0 || delaySeconds >= maxDelaySeconds) {
    return null;
  }

  try {
    const targetUrl = new URL(rawTarget, baseUrl).toString();
    // Compare against the serialized base so "https://host" and "https://host/"
    // are recognized as the same page.
    return targetUrl === new URL(baseUrl).toString() ? null : targetUrl;
  } catch {
    // Unresolvable target or base: treat as "no redirect".
    return null;
  }
}
|
|
9907
9966
|
function buildJsonResult(opts, finalUrl, rawBody, format, maxChars, browser, os) {
|
|
9908
9967
|
const parsedJson = parseAndFormatJson(rawBody);
|
|
9909
9968
|
if ("error" in parsedJson) {
|
|
@@ -9983,7 +10042,7 @@ function shouldStripReplies(site) {
|
|
|
9983
10042
|
return site === "Hacker News" || site.startsWith("r/") || site.startsWith("GitHub - ");
|
|
9984
10043
|
}
|
|
9985
10044
|
function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
9986
|
-
|
|
10045
|
+
async function fetchWithClientRedirects(opts, hooks, clientSideRedirectCount, alternateLinkFallbackCount) {
|
|
9987
10046
|
const browser = opts.browser ?? DEFAULT_BROWSER;
|
|
9988
10047
|
const os = opts.os ?? DEFAULT_OS;
|
|
9989
10048
|
const format = opts.format ?? "markdown";
|
|
@@ -10111,9 +10170,46 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10111
10170
|
}
|
|
10112
10171
|
errorContext.phase = "loading";
|
|
10113
10172
|
const rawBody = await response.text();
|
|
10173
|
+
const clientSideRedirect = extractClientSideRedirect(rawBody, finalUrl);
|
|
10174
|
+
if (clientSideRedirect) {
|
|
10175
|
+
if (clientSideRedirectCount >= MAX_CLIENT_SIDE_REDIRECTS) {
|
|
10176
|
+
return {
|
|
10177
|
+
error: `Client-side redirect limit (${MAX_CLIENT_SIDE_REDIRECTS}) exceeded while fetching ${opts.url}.`,
|
|
10178
|
+
code: "too_many_redirects",
|
|
10179
|
+
phase: "loading",
|
|
10180
|
+
retryable: false,
|
|
10181
|
+
timeoutMs,
|
|
10182
|
+
url: opts.url,
|
|
10183
|
+
finalUrl,
|
|
10184
|
+
mimeType: normalizeContentType(contentType) || void 0,
|
|
10185
|
+
contentLength: errorContext.contentLength
|
|
10186
|
+
};
|
|
10187
|
+
}
|
|
10188
|
+
return fetchWithClientRedirects(
|
|
10189
|
+
{ ...opts, url: clientSideRedirect },
|
|
10190
|
+
hooks,
|
|
10191
|
+
clientSideRedirectCount + 1,
|
|
10192
|
+
alternateLinkFallbackCount
|
|
10193
|
+
);
|
|
10194
|
+
}
|
|
10114
10195
|
const jsonResponse = isJsonResponse(contentType, rawBody);
|
|
10115
10196
|
if (format === "json") {
|
|
10116
10197
|
if (!jsonResponse) {
|
|
10198
|
+
if (HTML_CONTENT_TYPES.some((value) => contentType.includes(value))) {
|
|
10199
|
+
const alternateLinks2 = extractQualifiedAlternateLinks(
|
|
10200
|
+
parseLinkedomHTML(rawBody, finalUrl),
|
|
10201
|
+
finalUrl,
|
|
10202
|
+
format
|
|
10203
|
+
);
|
|
10204
|
+
if (alternateLinks2.length > 0 && alternateLinkFallbackCount < MAX_ALTERNATE_LINK_FALLBACKS) {
|
|
10205
|
+
return fetchWithClientRedirects(
|
|
10206
|
+
{ ...opts, url: alternateLinks2[0] },
|
|
10207
|
+
hooks,
|
|
10208
|
+
clientSideRedirectCount,
|
|
10209
|
+
alternateLinkFallbackCount + 1
|
|
10210
|
+
);
|
|
10211
|
+
}
|
|
10212
|
+
}
|
|
10117
10213
|
return {
|
|
10118
10214
|
error: `Not a JSON response (content-type: ${contentType})`,
|
|
10119
10215
|
code: "unexpected_response",
|
|
@@ -10205,6 +10301,22 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10205
10301
|
});
|
|
10206
10302
|
const fallbackDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10207
10303
|
const extractionDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10304
|
+
const alternateLinks = extractQualifiedAlternateLinks(
|
|
10305
|
+
fallbackDocument,
|
|
10306
|
+
finalUrl,
|
|
10307
|
+
format
|
|
10308
|
+
);
|
|
10309
|
+
const tryAlternateLinkFallback = async () => {
|
|
10310
|
+
if (alternateLinks.length === 0 || alternateLinkFallbackCount >= MAX_ALTERNATE_LINK_FALLBACKS) {
|
|
10311
|
+
return null;
|
|
10312
|
+
}
|
|
10313
|
+
return fetchWithClientRedirects(
|
|
10314
|
+
{ ...opts, url: alternateLinks[0] },
|
|
10315
|
+
hooks,
|
|
10316
|
+
clientSideRedirectCount,
|
|
10317
|
+
alternateLinkFallbackCount + 1
|
|
10318
|
+
);
|
|
10319
|
+
};
|
|
10208
10320
|
let extracted;
|
|
10209
10321
|
const suppressedErrors = [];
|
|
10210
10322
|
try {
|
|
@@ -10265,6 +10377,8 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10265
10377
|
if (!extractedContent || wordCount === 0) {
|
|
10266
10378
|
const fallbackText = extractDomTextFallback(fallbackDocument);
|
|
10267
10379
|
if (!fallbackText) {
|
|
10380
|
+
const alternateResult = await tryAlternateLinkFallback();
|
|
10381
|
+
if (alternateResult) return alternateResult;
|
|
10268
10382
|
return {
|
|
10269
10383
|
error: `No content extracted from ${opts.url}. May need JS rendering or is blocked.`,
|
|
10270
10384
|
code: "no_content",
|
|
@@ -10280,6 +10394,13 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10280
10394
|
extractedContent = format === "html" ? rawBody : format === "markdown" ? extractDomMarkdownFallback(fallbackDocument) || fallbackText : fallbackText;
|
|
10281
10395
|
wordCount = estimateWordCount(fallbackText);
|
|
10282
10396
|
}
|
|
10397
|
+
const extractedTextWordCount = estimateWordCount(
|
|
10398
|
+
format === "text" ? extractedContent : markdownToText(extractedContent)
|
|
10399
|
+
);
|
|
10400
|
+
if (Math.min(wordCount, extractedTextWordCount) < MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK && alternateLinks.length > 0) {
|
|
10401
|
+
const alternateResult = await tryAlternateLinkFallback();
|
|
10402
|
+
if (alternateResult) return alternateResult;
|
|
10403
|
+
}
|
|
10283
10404
|
if (includeReplies === false && shouldStripReplies(extracted.site ?? "")) {
|
|
10284
10405
|
const strippedContent = stripExtractorComments(
|
|
10285
10406
|
extractedContent,
|
|
@@ -10316,6 +10437,9 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10316
10437
|
emitProgress(hooks, { status: "error", progress: 1, phase: "error" });
|
|
10317
10438
|
return fetchError;
|
|
10318
10439
|
}
|
|
10440
|
+
}
|
|
10441
|
+
return function defuddleFetch2(opts, hooks = {}) {
|
|
10442
|
+
return fetchWithClientRedirects(opts, hooks, 0, 0);
|
|
10319
10443
|
};
|
|
10320
10444
|
}
|
|
10321
10445
|
var defuddleFetch = createDefuddleFetch();
|