openclaw-smart-fetch 0.2.34 → 0.2.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/index.js +76 -3
- package/dist/index.js.map +1 -1
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
- 🧠 **Useful metadata** — title, author, site, language, published date when available
|
|
10
10
|
- 📦 **Downloads + large file support** — stream attachments and binaries to temp files
|
|
11
11
|
- 🔁 **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
|
|
12
|
+
- 🔗 **Alternate content fallback** — when extraction produces no/thin content, follows qualified `<link rel="alternate" type="...">` entries in `<head>` that match the requested output format
|
|
12
13
|
- ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
|
|
13
14
|
- 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`
|
|
14
15
|
- 🔄 **Built-in `web_fetch` fallback** — automatically improves the core web_fetch tool
|
|
@@ -35,6 +36,7 @@ from Defuddle's extractors and cleanup:
|
|
|
35
36
|
Notes:
|
|
36
37
|
- Defuddle is the cleanup layer: it strips common page chrome like nav, sidebars, related links, share widgets, and footers
|
|
37
38
|
- It does **not** execute JavaScript or solve interactive anti-bot/login flows
|
|
39
|
+
- If an HTML shell advertises alternate content in `<head>`, smart-fetch can follow matching alternates such as `text/markdown`, `text/plain`, `text/html`, or JSON media types according to the requested `format`
|
|
38
40
|
|
|
39
41
|
## Install
|
|
40
42
|
|
package/dist/index.js
CHANGED
|
@@ -9405,6 +9405,8 @@ var HTML_CONTENT_TYPES = [
|
|
|
9405
9405
|
"text/markdown"
|
|
9406
9406
|
];
|
|
9407
9407
|
// NOTE(review): presumably caps how many client-side (meta refresh) redirects are followed;
// the comparison against clientSideRedirectCount is outside this excerpt — confirm.
var MAX_CLIENT_SIDE_REDIRECTS = 5;
|
|
9408
|
+
// Upper bound on alternate-link fallback hops: a fallback fetch is only attempted while
// alternateLinkFallbackCount is still below this value.
var MAX_ALTERNATE_LINK_FALLBACKS = 3;
|
|
9409
|
+
// Word-count threshold for "thin" extractions: when the extracted content has fewer words than
// this and qualifying alternate links exist, an alternate-link fallback is attempted.
var MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK = 30;
|
|
9408
9410
|
/**
 * Reduces a Content-Type style value to its bare, lowercase media type.
 *
 * Everything from the first ";" onward (parameters such as `charset`) is dropped and the
 * remainder is trimmed and lowercased, e.g. "Text/HTML; charset=UTF-8" -> "text/html".
 *
 * @param {string | null | undefined} contentType - Raw header or attribute value.
 * @returns {string} The normalized media type, or "" when no usable value was supplied.
 */
function normalizeContentType(contentType) {
  // A missing header/attribute commonly surfaces as null or undefined; treat it as "no type"
  // instead of throwing on `.split`.
  if (typeof contentType !== "string") return "";
  return contentType.split(";")[0].trim().toLowerCase();
}
|
|
@@ -9906,6 +9908,36 @@ function isJsonResponse(contentType, body) {
|
|
|
9906
9908
|
/**
 * Decodes the handful of character references that commonly appear inside HTML attribute
 * values: `&amp;`, `&quot;`, `&apos;`, `&#39;`, `&#x27;`, `&lt;` and `&gt;` (case-insensitive).
 *
 * Decoding happens in a single pass over the input, so text produced by one replacement is
 * never re-scanned: `&amp;lt;` decodes to the literal text `&lt;`, not to `<`.
 * Any other character reference is left untouched.
 *
 * @param {string} value - Raw attribute value as it appears in the markup.
 * @returns {string} The value with the supported references replaced by their characters.
 */
function decodeHtmlAttribute(value) {
  const entities = {
    amp: "&",
    quot: '"',
    apos: "'",
    "#39": "'",
    "#x27": "'",
    lt: "<",
    gt: ">"
  };
  return value.replace(
    /&(amp|quot|apos|#39|#x27|lt|gt);/gi,
    (match, name) => entities[name.toLowerCase()]
  );
}
|
|
9911
|
+
/**
 * Collects the alternate-representation URLs a page advertises in its `<head>` via
 * `<link rel="alternate" type="..." href="...">`, keeping only those whose media type
 * matches the requested output format.
 *
 * Accepted media types per format:
 *   - markdown: text/markdown, text/x-markdown
 *   - text:     text/plain, text/markdown, text/x-markdown
 *   - html:     text/html, application/xhtml+xml
 *   - json:     application/json, text/json, and any type ending in "+json"
 *
 * @param {{ head?: { querySelectorAll(selector: string): Iterable<{ getAttribute(name: string): string | null }> } | null }} document
 *   Parsed document; only `document.head` is inspected.
 * @param {string} baseUrl - URL the document was fetched from; relative hrefs resolve against it.
 * @param {string} format - Requested output format ("markdown", "text", "html" or "json").
 * @returns {string[]} Absolute http(s) URLs in document order, without duplicates and without
 *   links that point back at `baseUrl`. Empty when the format is unknown or there is no head.
 */
function extractQualifiedAlternateLinks(document, baseUrl, format) {
  const acceptedTypes = {
    markdown: ["text/markdown", "text/x-markdown"],
    text: ["text/plain", "text/markdown", "text/x-markdown"],
    html: ["text/html", "application/xhtml+xml"],
    json: ["application/json", "text/json"]
  };
  // Own-property lookup: an unknown format (or a prototype key such as "constructor")
  // must yield "no alternates" rather than a TypeError on `.some`.
  const accepted = Object.prototype.hasOwnProperty.call(acceptedTypes, format) ? acceptedTypes[format] : null;
  if (!accepted) return [];
  const head = document?.head;
  if (!head) return [];

  // Fragments are never sent to the server, so two URLs differing only in "#..." identify
  // the same resource for the purposes of self-link detection and de-duplication.
  const withoutFragment = (url) => {
    const copy = new URL(url.href);
    copy.hash = "";
    return copy.href;
  };
  let baseKey = null;
  try {
    baseKey = withoutFragment(new URL(baseUrl));
  } catch {
    // baseUrl is not absolute: relative hrefs will fail to resolve below and be skipped,
    // absolute hrefs are still usable and simply cannot be compared against the base.
  }

  const seen = new Set();
  const candidates = [];
  for (const link of Array.from(head.querySelectorAll("link"))) {
    const rel = (link.getAttribute("rel") ?? "").toLowerCase().split(/\s+/);
    if (!rel.includes("alternate")) continue;
    const type = normalizeContentType(link.getAttribute("type") ?? "");
    const isAccepted = accepted.includes(type) || (format === "json" && type.endsWith("+json"));
    if (!isAccepted) continue;
    const href = link.getAttribute("href");
    if (!href) continue;

    let target;
    try {
      target = new URL(href, baseUrl);
    } catch {
      // Malformed href in third-party markup: ignore this link, keep scanning the rest.
      continue;
    }
    // The href comes from untrusted HTML; only web URLs are ever worth fetching as a fallback
    // (rejects javascript:, data:, file:, ...).
    if (target.protocol !== "http:" && target.protocol !== "https:") continue;

    const key = withoutFragment(target);
    if (key === baseKey || seen.has(key)) continue;
    seen.add(key);
    candidates.push(target.toString());
  }
  return candidates;
}
|
|
9909
9941
|
function extractClientSideRedirect(body, baseUrl) {
|
|
9910
9942
|
const snippet = body.slice(0, 4096);
|
|
9911
9943
|
const metaRefreshMatch = snippet.match(
|
|
@@ -10008,7 +10040,7 @@ function shouldStripReplies(site) {
|
|
|
10008
10040
|
return site === "Hacker News" || site.startsWith("r/") || site.startsWith("GitHub - ");
|
|
10009
10041
|
}
|
|
10010
10042
|
function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
10011
|
-
async function fetchWithClientRedirects(opts, hooks, clientSideRedirectCount) {
|
|
10043
|
+
async function fetchWithClientRedirects(opts, hooks, clientSideRedirectCount, alternateLinkFallbackCount) {
|
|
10012
10044
|
const browser = opts.browser ?? DEFAULT_BROWSER;
|
|
10013
10045
|
const os = opts.os ?? DEFAULT_OS;
|
|
10014
10046
|
const format = opts.format ?? "markdown";
|
|
@@ -10154,12 +10186,28 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10154
10186
|
return fetchWithClientRedirects(
|
|
10155
10187
|
{ ...opts, url: clientSideRedirect },
|
|
10156
10188
|
hooks,
|
|
10157
|
-
clientSideRedirectCount + 1
|
|
10189
|
+
clientSideRedirectCount + 1,
|
|
10190
|
+
alternateLinkFallbackCount
|
|
10158
10191
|
);
|
|
10159
10192
|
}
|
|
10160
10193
|
const jsonResponse = isJsonResponse(contentType, rawBody);
|
|
10161
10194
|
if (format === "json") {
|
|
10162
10195
|
if (!jsonResponse) {
|
|
10196
|
+
if (HTML_CONTENT_TYPES.some((value) => contentType.includes(value))) {
|
|
10197
|
+
const alternateLinks2 = extractQualifiedAlternateLinks(
|
|
10198
|
+
parseLinkedomHTML(rawBody, finalUrl),
|
|
10199
|
+
finalUrl,
|
|
10200
|
+
format
|
|
10201
|
+
);
|
|
10202
|
+
if (alternateLinks2.length > 0 && alternateLinkFallbackCount < MAX_ALTERNATE_LINK_FALLBACKS) {
|
|
10203
|
+
return fetchWithClientRedirects(
|
|
10204
|
+
{ ...opts, url: alternateLinks2[0] },
|
|
10205
|
+
hooks,
|
|
10206
|
+
clientSideRedirectCount,
|
|
10207
|
+
alternateLinkFallbackCount + 1
|
|
10208
|
+
);
|
|
10209
|
+
}
|
|
10210
|
+
}
|
|
10163
10211
|
return {
|
|
10164
10212
|
error: `Not a JSON response (content-type: ${contentType})`,
|
|
10165
10213
|
code: "unexpected_response",
|
|
@@ -10251,6 +10299,22 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10251
10299
|
});
|
|
10252
10300
|
const fallbackDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10253
10301
|
const extractionDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10302
|
+
const alternateLinks = extractQualifiedAlternateLinks(
|
|
10303
|
+
fallbackDocument,
|
|
10304
|
+
finalUrl,
|
|
10305
|
+
format
|
|
10306
|
+
);
|
|
10307
|
+
const tryAlternateLinkFallback = async () => {
|
|
10308
|
+
if (alternateLinks.length === 0 || alternateLinkFallbackCount >= MAX_ALTERNATE_LINK_FALLBACKS) {
|
|
10309
|
+
return null;
|
|
10310
|
+
}
|
|
10311
|
+
return fetchWithClientRedirects(
|
|
10312
|
+
{ ...opts, url: alternateLinks[0] },
|
|
10313
|
+
hooks,
|
|
10314
|
+
clientSideRedirectCount,
|
|
10315
|
+
alternateLinkFallbackCount + 1
|
|
10316
|
+
);
|
|
10317
|
+
};
|
|
10254
10318
|
let extracted;
|
|
10255
10319
|
const suppressedErrors = [];
|
|
10256
10320
|
try {
|
|
@@ -10311,6 +10375,8 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10311
10375
|
if (!extractedContent || wordCount === 0) {
|
|
10312
10376
|
const fallbackText = extractDomTextFallback(fallbackDocument);
|
|
10313
10377
|
if (!fallbackText) {
|
|
10378
|
+
const alternateResult = await tryAlternateLinkFallback();
|
|
10379
|
+
if (alternateResult) return alternateResult;
|
|
10314
10380
|
return {
|
|
10315
10381
|
error: `No content extracted from ${opts.url}. May need JS rendering or is blocked.`,
|
|
10316
10382
|
code: "no_content",
|
|
@@ -10326,6 +10392,13 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10326
10392
|
extractedContent = format === "html" ? rawBody : format === "markdown" ? extractDomMarkdownFallback(fallbackDocument) || fallbackText : fallbackText;
|
|
10327
10393
|
wordCount = estimateWordCount(fallbackText);
|
|
10328
10394
|
}
|
|
10395
|
+
const extractedTextWordCount = estimateWordCount(
|
|
10396
|
+
format === "text" ? extractedContent : markdownToText(extractedContent)
|
|
10397
|
+
);
|
|
10398
|
+
if (Math.min(wordCount, extractedTextWordCount) < MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK && alternateLinks.length > 0) {
|
|
10399
|
+
const alternateResult = await tryAlternateLinkFallback();
|
|
10400
|
+
if (alternateResult) return alternateResult;
|
|
10401
|
+
}
|
|
10329
10402
|
if (includeReplies === false && shouldStripReplies(extracted.site ?? "")) {
|
|
10330
10403
|
const strippedContent = stripExtractorComments(
|
|
10331
10404
|
extractedContent,
|
|
@@ -10364,7 +10437,7 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10364
10437
|
}
|
|
10365
10438
|
}
|
|
10366
10439
|
return function defuddleFetch2(opts, hooks = {}) {
|
|
10367
|
-
return fetchWithClientRedirects(opts, hooks, 0);
|
|
10440
|
+
return fetchWithClientRedirects(opts, hooks, 0, 0);
|
|
10368
10441
|
};
|
|
10369
10442
|
}
|
|
10370
10443
|
var defuddleFetch = createDefuddleFetch();
|