openclaw-smart-fetch 0.2.33 → 0.2.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/dist/index.js +125 -1
- package/dist/index.js.map +1 -1
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -8,6 +8,8 @@
|
|
|
8
8
|
- 🧹 **Defuddle extraction** — clean readable content instead of noisy HTML
|
|
9
9
|
- 🧠 **Useful metadata** — title, author, site, language, published date when available
|
|
10
10
|
- 📦 **Downloads + large file support** — stream attachments and binaries to temp files
|
|
11
|
+
- 🔁 **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
|
|
12
|
+
- 🔗 **Alternate content fallback** — when extraction produces no/thin content, follows qualified `<link rel="alternate" type="...">` entries in `<head>` that match the requested output format
|
|
11
13
|
- ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
|
|
12
14
|
- 📄 **Multiple output formats** — `markdown`, `html`, `text`, `json`
|
|
13
15
|
- 🔄 **Built-in `web_fetch` fallback** — automatically improves the core web_fetch tool
|
|
@@ -34,6 +36,7 @@ from Defuddle's extractors and cleanup:
|
|
|
34
36
|
Notes:
|
|
35
37
|
- Defuddle is the cleanup layer: it strips common page chrome like nav, sidebars, related links, share widgets, and footers
|
|
36
38
|
- It does **not** execute JavaScript or solve interactive anti-bot/login flows
|
|
39
|
+
- If an HTML shell advertises alternate content in `<head>`, smart-fetch can follow matching alternates such as `text/markdown`, `text/plain`, `text/html`, or JSON media types according to the requested `format`
|
|
37
40
|
|
|
38
41
|
## Install
|
|
39
42
|
|
package/dist/index.js
CHANGED
|
@@ -9404,6 +9404,9 @@ var HTML_CONTENT_TYPES = [
|
|
|
9404
9404
|
"text/plain",
|
|
9405
9405
|
"text/markdown"
|
|
9406
9406
|
];
|
|
9407
|
+
// Upper bound on chained client-side (meta refresh) redirects followed for one fetch;
// once the running count reaches this value the fetch returns a "too_many_redirects" error.
var MAX_CLIENT_SIDE_REDIRECTS = 5;
|
|
9408
|
+
// Upper bound on how many times one fetch may fall back to a <link rel="alternate"> target.
var MAX_ALTERNATE_LINK_FALLBACKS = 3;
|
|
9409
|
+
// Extraction results with fewer words than this are treated as thin content,
// which triggers the alternate-link fallback when a qualifying alternate exists.
var MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK = 30;
|
|
9407
9410
|
/**
 * Reduces a Content-Type style value to its bare, lower-cased media type.
 *
 * Parameters after the first ";" (charset, boundary, ...) are discarded and
 * surrounding whitespace is trimmed.
 *
 * @param {string | null | undefined} contentType - Header or attribute value.
 * @returns {string} The media type, or "" when no usable string was supplied.
 */
function normalizeContentType(contentType) {
  // A missing header is an expected absence, not an error: report "no type".
  if (typeof contentType !== "string") {
    return "";
  }
  // split() always yields at least one element, so no fallback is needed here.
  const [mediaType] = contentType.split(";");
  return mediaType.trim().toLowerCase();
}
|
|
@@ -9902,6 +9905,62 @@ function isLikelyJsonBody(body) {
|
|
|
9902
9905
|
/**
 * Decides whether a response should be handled as JSON.
 *
 * The declared content type is consulted first; the body is only inspected
 * when the content type does not already identify the response as JSON.
 *
 * @param {string} contentType - Content type reported for the response.
 * @param {string} body - Raw response body.
 * @returns The content-type verdict when it is truthy, otherwise the body verdict.
 */
function isJsonResponse(contentType, body) {
  const declaredAsJson = isJsonContentType(contentType);
  if (declaredAsJson) {
    return declaredAsJson;
  }
  return isLikelyJsonBody(body);
}
|
|
9908
|
+
/**
 * Decodes the HTML character references that commonly appear inside attribute
 * values (for example the `content` value of a meta refresh tag).
 *
 * Supported references: the named references amp, quot, apos, lt and gt
 * (case-insensitive) plus decimal and hexadecimal numeric references.
 * All references are replaced in a single pass, so text produced by one
 * replacement is never decoded a second time.
 *
 * @param {string} value - Raw attribute value as it appears in the markup.
 * @returns {string} The value with supported character references decoded;
 *   unknown or invalid references are left untouched.
 */
function decodeHtmlAttribute(value) {
  const namedReferences = { amp: "&", quot: '"', apos: "'", lt: "<", gt: ">" };
  const referencePattern = /&(?:(amp|quot|apos|lt|gt)|#(\d{1,7})|#x([0-9a-f]{1,6}));/gi;
  return value.replace(referencePattern, (reference, name, decimal, hex) => {
    if (name !== undefined) {
      return namedReferences[name.toLowerCase()];
    }
    const codePoint = decimal !== undefined ? Number.parseInt(decimal, 10) : Number.parseInt(hex, 16);
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    // NUL, surrogates and out-of-range values have no valid character: keep the source text.
    if (codePoint === 0 || codePoint > 0x10ffff || isSurrogate) {
      return reference;
    }
    return String.fromCodePoint(codePoint);
  });
}
|
|
9911
|
+
/**
 * Collects the targets of `<link rel="alternate">` elements in the document
 * head whose declared media type matches the requested output format.
 *
 * @param document - Parsed DOM document; only `document.head.querySelectorAll`
 *   and each link's `getAttribute` are used.
 * @param {string} baseUrl - URL the document was loaded from; relative hrefs
 *   are resolved against it and links pointing back at it are skipped.
 * @param {string} format - Requested output format ("markdown", "text",
 *   "html" or "json"). Any other value yields an empty list.
 * @returns {string[]} Absolute, de-duplicated http(s) URLs in document order.
 */
function extractQualifiedAlternateLinks(document, baseUrl, format) {
  // A Map (instead of a plain object) keeps prototype keys such as
  // "constructor" from being mistaken for a known format.
  const acceptedTypes = new Map([
    ["markdown", ["text/markdown", "text/x-markdown"]],
    ["text", ["text/plain", "text/markdown", "text/x-markdown"]],
    ["html", ["text/html", "application/xhtml+xml"]],
    ["json", ["application/json", "text/json"]]
  ]);
  const accepted = acceptedTypes.get(format);
  if (!accepted) return [];
  const head = document.head;
  if (!head) return [];

  // Compare against the normalised, fragment-less base so "https://host" vs
  // "https://host/" or a "#section" link is still recognised as a self-reference.
  const withoutFragment = (url) => {
    const copy = new URL(url);
    copy.hash = "";
    return copy.toString();
  };
  let selfUrl = baseUrl;
  try {
    selfUrl = withoutFragment(baseUrl);
  } catch {
    // Unparseable base: keep the raw string for comparison (best effort).
  }

  const candidates = new Set();
  for (const link of Array.from(head.querySelectorAll("link"))) {
    const relTokens = (link.getAttribute("rel") ?? "").toLowerCase().split(/\s+/);
    if (!relTokens.includes("alternate")) continue;
    const type = normalizeContentType(link.getAttribute("type") ?? "");
    const isAccepted = accepted.includes(type) || (format === "json" && type.endsWith("+json"));
    if (!isAccepted) continue;
    const href = link.getAttribute("href");
    if (!href) continue;
    let target;
    try {
      target = new URL(href, baseUrl);
    } catch {
      // Malformed href in untrusted markup: skip this link, keep scanning.
      continue;
    }
    // The markup is untrusted; only web URLs are worth handing to the fetcher.
    if (target.protocol !== "http:" && target.protocol !== "https:") continue;
    if (withoutFragment(target) === selfUrl) continue;
    candidates.add(target.toString());
  }
  return [...candidates];
}
|
|
9941
|
+
/**
 * Looks for a `<meta http-equiv="refresh">` redirect near the top of an HTML
 * body and returns the absolute URL it points to.
 *
 * Only the first 4096 characters are inspected. Attributes may appear in any
 * order and use double quotes, single quotes or no quotes; the URL inside the
 * `content` value may itself be quoted. Tags inside HTML comments are ignored.
 *
 * @param {string} body - Raw response body.
 * @param {string} baseUrl - URL the body was loaded from; used to resolve
 *   relative targets and to reject redirects back to the same page.
 * @returns {string | null} The absolute http(s) redirect target, or null when
 *   there is no usable redirect (no tag, no URL, delay outside [0, 30),
 *   unparseable or non-web target, or a redirect to the page itself).
 */
function extractClientSideRedirect(body, baseUrl) {
  if (typeof body !== "string" || body.length === 0) {
    return null;
  }
  // A commented-out tag is not a live redirect, so strip comments first.
  const snippet = body.slice(0, 4096).replace(/<!--[\s\S]*?-->/g, "");

  // Parses the attribute section of one tag; the first occurrence of a name wins.
  const readAttributes = (source) => {
    const attributes = new Map();
    const attributePattern = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
    for (const [, name, doubleQuoted, singleQuoted, unquoted] of source.matchAll(attributePattern)) {
      const key = name.toLowerCase();
      if (!attributes.has(key)) {
        attributes.set(key, doubleQuoted ?? singleQuoted ?? unquoted ?? "");
      }
    }
    return attributes;
  };

  // Quoted runs are consumed as a unit so a ">" or the other quote character
  // inside an attribute value does not end the tag early.
  const metaTagPattern = /<meta\b((?:"[^"]*"|'[^']*'|[^'">])*)>/gi;
  let refreshContent = null;
  for (const [, attributeSource] of snippet.matchAll(metaTagPattern)) {
    const attributes = readAttributes(attributeSource);
    if ((attributes.get("http-equiv") ?? "").trim().toLowerCase() !== "refresh") continue;
    const content = attributes.get("content");
    if (content) {
      refreshContent = content;
      break;
    }
  }
  if (!refreshContent) {
    return null;
  }

  // The content value has the form "<delay>; url=<target>"; "," is accepted as a separator too.
  const decoded = decodeHtmlAttribute(refreshContent);
  const separatorIndex = decoded.search(/[;,]/);
  if (separatorIndex === -1) {
    return null;
  }
  const delaySeconds = Number.parseFloat(decoded.slice(0, separatorIndex).trim());
  const urlMatch = decoded.slice(separatorIndex + 1).match(/\burl\s*=\s*(.+)$/i);
  const rawTarget = urlMatch?.[1]?.trim().replace(/^['"]|['"]$/g, "");
  // Long delays are skipped: only redirects that fire within 30 seconds are followed.
  if (!rawTarget || !Number.isFinite(delaySeconds) || delaySeconds < 0 || delaySeconds >= 30) {
    return null;
  }

  let target;
  try {
    target = new URL(rawTarget, baseUrl);
  } catch {
    // Unparseable target in untrusted markup: treat as "no redirect".
    return null;
  }
  // The markup is untrusted; never follow javascript:, data:, file: and similar targets.
  if (target.protocol !== "http:" && target.protocol !== "https:") {
    return null;
  }

  // Compare normalised, fragment-less forms so "https://host" vs "https://host/"
  // or a pure "#fragment" change is recognised as a redirect to the same page.
  const withoutFragment = (url) => {
    const copy = new URL(url);
    copy.hash = "";
    return copy.toString();
  };
  let selfUrl = baseUrl;
  try {
    selfUrl = withoutFragment(baseUrl);
  } catch {
    // Unparseable base: keep the raw string for comparison (best effort).
  }
  return withoutFragment(target) === selfUrl ? null : target.toString();
}
|
|
9905
9964
|
function buildJsonResult(opts, finalUrl, rawBody, format, maxChars, browser, os) {
|
|
9906
9965
|
const parsedJson = parseAndFormatJson(rawBody);
|
|
9907
9966
|
if ("error" in parsedJson) {
|
|
@@ -9981,7 +10040,7 @@ function shouldStripReplies(site) {
|
|
|
9981
10040
|
return site === "Hacker News" || site.startsWith("r/") || site.startsWith("GitHub - ");
|
|
9982
10041
|
}
|
|
9983
10042
|
function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
9984
|
-
|
|
10043
|
+
async function fetchWithClientRedirects(opts, hooks, clientSideRedirectCount, alternateLinkFallbackCount) {
|
|
9985
10044
|
const browser = opts.browser ?? DEFAULT_BROWSER;
|
|
9986
10045
|
const os = opts.os ?? DEFAULT_OS;
|
|
9987
10046
|
const format = opts.format ?? "markdown";
|
|
@@ -10109,9 +10168,46 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10109
10168
|
}
|
|
10110
10169
|
errorContext.phase = "loading";
|
|
10111
10170
|
const rawBody = await response.text();
|
|
10171
|
+
const clientSideRedirect = extractClientSideRedirect(rawBody, finalUrl);
|
|
10172
|
+
if (clientSideRedirect) {
|
|
10173
|
+
if (clientSideRedirectCount >= MAX_CLIENT_SIDE_REDIRECTS) {
|
|
10174
|
+
return {
|
|
10175
|
+
error: `Client-side redirect limit (${MAX_CLIENT_SIDE_REDIRECTS}) exceeded while fetching ${opts.url}.`,
|
|
10176
|
+
code: "too_many_redirects",
|
|
10177
|
+
phase: "loading",
|
|
10178
|
+
retryable: false,
|
|
10179
|
+
timeoutMs,
|
|
10180
|
+
url: opts.url,
|
|
10181
|
+
finalUrl,
|
|
10182
|
+
mimeType: normalizeContentType(contentType) || void 0,
|
|
10183
|
+
contentLength: errorContext.contentLength
|
|
10184
|
+
};
|
|
10185
|
+
}
|
|
10186
|
+
return fetchWithClientRedirects(
|
|
10187
|
+
{ ...opts, url: clientSideRedirect },
|
|
10188
|
+
hooks,
|
|
10189
|
+
clientSideRedirectCount + 1,
|
|
10190
|
+
alternateLinkFallbackCount
|
|
10191
|
+
);
|
|
10192
|
+
}
|
|
10112
10193
|
const jsonResponse = isJsonResponse(contentType, rawBody);
|
|
10113
10194
|
if (format === "json") {
|
|
10114
10195
|
if (!jsonResponse) {
|
|
10196
|
+
if (HTML_CONTENT_TYPES.some((value) => contentType.includes(value))) {
|
|
10197
|
+
const alternateLinks2 = extractQualifiedAlternateLinks(
|
|
10198
|
+
parseLinkedomHTML(rawBody, finalUrl),
|
|
10199
|
+
finalUrl,
|
|
10200
|
+
format
|
|
10201
|
+
);
|
|
10202
|
+
if (alternateLinks2.length > 0 && alternateLinkFallbackCount < MAX_ALTERNATE_LINK_FALLBACKS) {
|
|
10203
|
+
return fetchWithClientRedirects(
|
|
10204
|
+
{ ...opts, url: alternateLinks2[0] },
|
|
10205
|
+
hooks,
|
|
10206
|
+
clientSideRedirectCount,
|
|
10207
|
+
alternateLinkFallbackCount + 1
|
|
10208
|
+
);
|
|
10209
|
+
}
|
|
10210
|
+
}
|
|
10115
10211
|
return {
|
|
10116
10212
|
error: `Not a JSON response (content-type: ${contentType})`,
|
|
10117
10213
|
code: "unexpected_response",
|
|
@@ -10203,6 +10299,22 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10203
10299
|
});
|
|
10204
10300
|
const fallbackDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10205
10301
|
const extractionDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10302
|
+
const alternateLinks = extractQualifiedAlternateLinks(
|
|
10303
|
+
fallbackDocument,
|
|
10304
|
+
finalUrl,
|
|
10305
|
+
format
|
|
10306
|
+
);
|
|
10307
|
+
const tryAlternateLinkFallback = async () => {
|
|
10308
|
+
if (alternateLinks.length === 0 || alternateLinkFallbackCount >= MAX_ALTERNATE_LINK_FALLBACKS) {
|
|
10309
|
+
return null;
|
|
10310
|
+
}
|
|
10311
|
+
return fetchWithClientRedirects(
|
|
10312
|
+
{ ...opts, url: alternateLinks[0] },
|
|
10313
|
+
hooks,
|
|
10314
|
+
clientSideRedirectCount,
|
|
10315
|
+
alternateLinkFallbackCount + 1
|
|
10316
|
+
);
|
|
10317
|
+
};
|
|
10206
10318
|
let extracted;
|
|
10207
10319
|
const suppressedErrors = [];
|
|
10208
10320
|
try {
|
|
@@ -10263,6 +10375,8 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10263
10375
|
if (!extractedContent || wordCount === 0) {
|
|
10264
10376
|
const fallbackText = extractDomTextFallback(fallbackDocument);
|
|
10265
10377
|
if (!fallbackText) {
|
|
10378
|
+
const alternateResult = await tryAlternateLinkFallback();
|
|
10379
|
+
if (alternateResult) return alternateResult;
|
|
10266
10380
|
return {
|
|
10267
10381
|
error: `No content extracted from ${opts.url}. May need JS rendering or is blocked.`,
|
|
10268
10382
|
code: "no_content",
|
|
@@ -10278,6 +10392,13 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10278
10392
|
extractedContent = format === "html" ? rawBody : format === "markdown" ? extractDomMarkdownFallback(fallbackDocument) || fallbackText : fallbackText;
|
|
10279
10393
|
wordCount = estimateWordCount(fallbackText);
|
|
10280
10394
|
}
|
|
10395
|
+
const extractedTextWordCount = estimateWordCount(
|
|
10396
|
+
format === "text" ? extractedContent : markdownToText(extractedContent)
|
|
10397
|
+
);
|
|
10398
|
+
if (Math.min(wordCount, extractedTextWordCount) < MIN_EXTRACTED_WORDS_BEFORE_ALTERNATE_FALLBACK && alternateLinks.length > 0) {
|
|
10399
|
+
const alternateResult = await tryAlternateLinkFallback();
|
|
10400
|
+
if (alternateResult) return alternateResult;
|
|
10401
|
+
}
|
|
10281
10402
|
if (includeReplies === false && shouldStripReplies(extracted.site ?? "")) {
|
|
10282
10403
|
const strippedContent = stripExtractorComments(
|
|
10283
10404
|
extractedContent,
|
|
@@ -10314,6 +10435,9 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10314
10435
|
emitProgress(hooks, { status: "error", progress: 1, phase: "error" });
|
|
10315
10436
|
return fetchError;
|
|
10316
10437
|
}
|
|
10438
|
+
}
|
|
10439
|
+
return function defuddleFetch2(opts, hooks = {}) {
|
|
10440
|
+
return fetchWithClientRedirects(opts, hooks, 0, 0);
|
|
10317
10441
|
};
|
|
10318
10442
|
}
|
|
10319
10443
|
var defuddleFetch = createDefuddleFetch();
|