pi-smart-fetch 0.2.34 → 0.2.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/index.js +76 -3
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
- 🧠 **Useful metadata** — title, author, site, language, published date when available
|
|
12
12
|
- 📦 **Downloads + large file support** — stream attachments and binaries to temp files
|
|
13
13
|
- 🔁 **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
|
|
14
|
+
- 🔗 **Alternate content fallback** — when extraction produces no/thin content, follows qualified `<link rel="alternate" type="...">` entries in `<head>` that match the requested output format
|
|
14
15
|
- ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
|
|
15
16
|
- 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`
|
|
16
17
|
|
|
@@ -29,6 +30,7 @@ This package works on general web pages, but some site types benefit especially
|
|
|
29
30
|
Notes:
|
|
30
31
|
- Defuddle is the cleanup layer: it strips common page chrome like nav, sidebars, related links, share widgets, and footers
|
|
31
32
|
- It does **not** execute JavaScript or solve interactive anti-bot/login flows
|
|
33
|
+
- If an HTML shell advertises alternate content in `<head>`, smart-fetch can follow matching alternates such as `text/markdown`, `text/plain`, `text/html`, or JSON media types according to the requested `format`
|
|
32
34
|
|
|
33
35
|
## Install
|
|
34
36
|
|
package/dist/index.js
CHANGED
|
@@ -9407,6 +9407,8 @@ var HTML_CONTENT_TYPES = [
|
|
|
9407
9407
|
"text/markdown"
|
|
9408
9408
|
];
|
|
9409
9409
|
var MAX_CLIENT_SIDE_REDIRECTS = 5;
|
|
9410
|
+
var MAX_ALTERNATE_LINK_FALLBACKS = 3;
|
|
9411
|
+
var MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK = 30;
|
|
9410
9412
|
/**
 * Reduce a Content-Type style value to its bare media type: the text before
 * the first ";" with surrounding whitespace removed and letters lower-cased.
 *
 * @param {string | null | undefined} contentType - Header or attribute value,
 *   possibly carrying parameters such as "; charset=utf-8".
 * @returns {string} The media type, or "" when no string value was supplied.
 */
function normalizeContentType(contentType) {
  // split() always returns at least one element, so guarding its first entry
  // never triggers; the input that actually needs a fallback is a missing
  // (null/undefined) value, which would otherwise throw on `.split`.
  if (typeof contentType !== "string") return "";
  return contentType.split(";")[0].trim().toLowerCase();
}
|
|
@@ -9908,6 +9910,36 @@ function isJsonResponse(contentType, body) {
|
|
|
9908
9910
|
/**
 * Decode the small set of HTML character references that commonly appear in
 * attribute values: the named references amp, quot, apos, lt and gt, plus the
 * numeric reference #39 (apostrophe). Matching is case-insensitive.
 *
 * All references are replaced in a single pass, so the output of one
 * replacement is never re-scanned: an escaped ampersand followed by "lt;"
 * yields the literal text of that reference rather than "<".
 *
 * @param {string} value - Raw attribute text.
 * @returns {string} The text with every supported reference replaced by the
 *   character it stands for; anything else is left untouched.
 */
function decodeHtmlAttribute(value) {
  const replacements = {
    amp: "&",
    quot: '"',
    apos: "'",
    "#39": "'",
    lt: "<",
    gt: ">"
  };
  return value.replace(
    /&(amp|quot|apos|#39|lt|gt);/gi,
    (_, name) => replacements[name.toLowerCase()]
  );
}
|
|
9913
|
+
/**
 * Collect the targets of `<link rel="alternate">` elements in the document
 * head whose declared media type matches the requested output format.
 *
 * A link qualifies when:
 *   - its `rel` token list contains "alternate" (case-insensitive),
 *   - its normalized `type` is one of the media types accepted for `format`
 *     (for "json", any type ending in "+json" is accepted as well),
 *   - its `href` resolves against `baseUrl` to an http(s) URL, and
 *   - the resolved URL, ignoring any fragment, is not the page itself.
 *
 * @param {Document} document - Parsed document; only `document.head` is read.
 * @param {string} baseUrl - URL the document came from. Used to resolve
 *   relative hrefs and to discard links that point back at the same page.
 * @param {string} format - One of "markdown", "text", "html" or "json".
 *   Any other value yields an empty result instead of throwing.
 * @returns {string[]} Absolute candidate URLs, de-duplicated, in document order.
 */
function extractQualifiedAlternateLinks(document, baseUrl, format) {
  const acceptedTypes = {
    markdown: ["text/markdown", "text/x-markdown"],
    text: ["text/plain", "text/markdown", "text/x-markdown"],
    html: ["text/html", "application/xhtml+xml"],
    json: ["application/json", "text/json"]
  };
  // Own-property lookup so an unknown format (or an inherited key such as
  // "constructor") produces an empty list rather than a crash on `.some`.
  const accepted = Object.prototype.hasOwnProperty.call(acceptedTypes, format)
    ? acceptedTypes[format]
    : [];
  if (accepted.length === 0) return [];

  const head = document.head;
  if (!head) return [];

  // Compare against the serialized, fragment-free form of the base URL;
  // otherwise "https://host" and "https://host/" (or a "#section" href) would
  // be treated as a different page and fetched again.
  let selfUrl = baseUrl;
  try {
    const parsedBase = new URL(baseUrl);
    parsedBase.hash = "";
    selfUrl = parsedBase.toString();
  } catch {
    // baseUrl is not an absolute URL; keep the raw string for comparison.
    // Absolute hrefs can still resolve below.
  }

  const candidates = [];
  for (const link of Array.from(head.querySelectorAll("link"))) {
    const rel = (link.getAttribute("rel") ?? "").toLowerCase().split(/\s+/);
    if (!rel.includes("alternate")) continue;

    const type = normalizeContentType(link.getAttribute("type") ?? "");
    const isAccepted = accepted.includes(type) || (format === "json" && type.endsWith("+json"));
    if (!isAccepted) continue;

    const href = link.getAttribute("href");
    if (!href) continue;

    let target;
    try {
      target = new URL(href, baseUrl);
    } catch {
      // Unresolvable href: this link cannot be followed, move on to the next.
      continue;
    }
    // The href comes from untrusted markup; only follow web URLs, never
    // javascript:, data:, file: or similar schemes.
    if (target.protocol !== "http:" && target.protocol !== "https:") continue;

    const resolved = target.toString();
    target.hash = "";
    if (target.toString() === selfUrl) continue;
    if (!candidates.includes(resolved)) {
      candidates.push(resolved);
    }
  }
  return candidates;
}
|
|
9911
9943
|
function extractClientSideRedirect(body, baseUrl) {
|
|
9912
9944
|
const snippet = body.slice(0, 4096);
|
|
9913
9945
|
const metaRefreshMatch = snippet.match(
|
|
@@ -10010,7 +10042,7 @@ function shouldStripReplies(site) {
|
|
|
10010
10042
|
return site === "Hacker News" || site.startsWith("r/") || site.startsWith("GitHub - ");
|
|
10011
10043
|
}
|
|
10012
10044
|
function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
10013
|
-
async function fetchWithClientRedirects(opts, hooks, clientSideRedirectCount) {
|
|
10045
|
+
async function fetchWithClientRedirects(opts, hooks, clientSideRedirectCount, alternateLinkFallbackCount) {
|
|
10014
10046
|
const browser = opts.browser ?? DEFAULT_BROWSER;
|
|
10015
10047
|
const os = opts.os ?? DEFAULT_OS;
|
|
10016
10048
|
const format = opts.format ?? "markdown";
|
|
@@ -10156,12 +10188,28 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10156
10188
|
return fetchWithClientRedirects(
|
|
10157
10189
|
{ ...opts, url: clientSideRedirect },
|
|
10158
10190
|
hooks,
|
|
10159
|
-
clientSideRedirectCount + 1
|
|
10191
|
+
clientSideRedirectCount + 1,
|
|
10192
|
+
alternateLinkFallbackCount
|
|
10160
10193
|
);
|
|
10161
10194
|
}
|
|
10162
10195
|
const jsonResponse = isJsonResponse(contentType, rawBody);
|
|
10163
10196
|
if (format === "json") {
|
|
10164
10197
|
if (!jsonResponse) {
|
|
10198
|
+
if (HTML_CONTENT_TYPES.some((value) => contentType.includes(value))) {
|
|
10199
|
+
const alternateLinks2 = extractQualifiedAlternateLinks(
|
|
10200
|
+
parseLinkedomHTML(rawBody, finalUrl),
|
|
10201
|
+
finalUrl,
|
|
10202
|
+
format
|
|
10203
|
+
);
|
|
10204
|
+
if (alternateLinks2.length > 0 && alternateLinkFallbackCount < MAX_ALTERNATE_LINK_FALLBACKS) {
|
|
10205
|
+
return fetchWithClientRedirects(
|
|
10206
|
+
{ ...opts, url: alternateLinks2[0] },
|
|
10207
|
+
hooks,
|
|
10208
|
+
clientSideRedirectCount,
|
|
10209
|
+
alternateLinkFallbackCount + 1
|
|
10210
|
+
);
|
|
10211
|
+
}
|
|
10212
|
+
}
|
|
10165
10213
|
return {
|
|
10166
10214
|
error: `Not a JSON response (content-type: ${contentType})`,
|
|
10167
10215
|
code: "unexpected_response",
|
|
@@ -10253,6 +10301,22 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10253
10301
|
});
|
|
10254
10302
|
const fallbackDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10255
10303
|
const extractionDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10304
|
+
const alternateLinks = extractQualifiedAlternateLinks(
|
|
10305
|
+
fallbackDocument,
|
|
10306
|
+
finalUrl,
|
|
10307
|
+
format
|
|
10308
|
+
);
|
|
10309
|
+
const tryAlternateLinkFallback = async () => {
|
|
10310
|
+
if (alternateLinks.length === 0 || alternateLinkFallbackCount >= MAX_ALTERNATE_LINK_FALLBACKS) {
|
|
10311
|
+
return null;
|
|
10312
|
+
}
|
|
10313
|
+
return fetchWithClientRedirects(
|
|
10314
|
+
{ ...opts, url: alternateLinks[0] },
|
|
10315
|
+
hooks,
|
|
10316
|
+
clientSideRedirectCount,
|
|
10317
|
+
alternateLinkFallbackCount + 1
|
|
10318
|
+
);
|
|
10319
|
+
};
|
|
10256
10320
|
let extracted;
|
|
10257
10321
|
const suppressedErrors = [];
|
|
10258
10322
|
try {
|
|
@@ -10313,6 +10377,8 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10313
10377
|
if (!extractedContent || wordCount === 0) {
|
|
10314
10378
|
const fallbackText = extractDomTextFallback(fallbackDocument);
|
|
10315
10379
|
if (!fallbackText) {
|
|
10380
|
+
const alternateResult = await tryAlternateLinkFallback();
|
|
10381
|
+
if (alternateResult) return alternateResult;
|
|
10316
10382
|
return {
|
|
10317
10383
|
error: `No content extracted from ${opts.url}. May need JS rendering or is blocked.`,
|
|
10318
10384
|
code: "no_content",
|
|
@@ -10328,6 +10394,13 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10328
10394
|
extractedContent = format === "html" ? rawBody : format === "markdown" ? extractDomMarkdownFallback(fallbackDocument) || fallbackText : fallbackText;
|
|
10329
10395
|
wordCount = estimateWordCount(fallbackText);
|
|
10330
10396
|
}
|
|
10397
|
+
const extractedTextWordCount = estimateWordCount(
|
|
10398
|
+
format === "text" ? extractedContent : markdownToText(extractedContent)
|
|
10399
|
+
);
|
|
10400
|
+
if (Math.min(wordCount, extractedTextWordCount) < MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK && alternateLinks.length > 0) {
|
|
10401
|
+
const alternateResult = await tryAlternateLinkFallback();
|
|
10402
|
+
if (alternateResult) return alternateResult;
|
|
10403
|
+
}
|
|
10331
10404
|
if (includeReplies === false && shouldStripReplies(extracted.site ?? "")) {
|
|
10332
10405
|
const strippedContent = stripExtractorComments(
|
|
10333
10406
|
extractedContent,
|
|
@@ -10366,7 +10439,7 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10366
10439
|
}
|
|
10367
10440
|
}
|
|
10368
10441
|
return function defuddleFetch2(opts, hooks = {}) {
|
|
10369
|
-
return fetchWithClientRedirects(opts, hooks, 0);
|
|
10442
|
+
return fetchWithClientRedirects(opts, hooks, 0, 0);
|
|
10370
10443
|
};
|
|
10371
10444
|
}
|
|
10372
10445
|
var defuddleFetch = createDefuddleFetch();
|