openclaw-smart-fetch 0.2.32 → 0.2.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/dist/index.js +53 -2
- package/dist/index.js.map +1 -1
- package/openclaw.plugin.json +1 -1
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
- 🧹 **Defuddle extraction** — clean readable content instead of noisy HTML
|
|
9
9
|
- 🧠 **Useful metadata** — title, author, site, language, published date when available
|
|
10
10
|
- 📦 **Downloads + large file support** — stream attachments and binaries to temp files
|
|
11
|
+
- **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
|
|
11
12
|
- ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
|
|
12
13
|
- **Multiple output formats** — `markdown`, `html`, `text`, `json`
|
|
13
14
|
- **Built-in `web_fetch` fallback** — automatically improves the core web_fetch tool
|
package/dist/index.js
CHANGED
|
@@ -9404,6 +9404,7 @@ var HTML_CONTENT_TYPES = [
|
|
|
9404
9404
|
"text/plain",
|
|
9405
9405
|
"text/markdown"
|
|
9406
9406
|
];
|
|
9407
|
+
// Upper bound on chained client-side (meta refresh) redirects followed for one fetch;
// once the running count reaches this value the fetch reports "too_many_redirects".
var MAX_CLIENT_SIDE_REDIRECTS = 5;
|
|
9407
9408
|
/**
 * Reduces a Content-Type header value to its bare media type.
 *
 * Everything from the first `;` onward (parameters such as `charset`) is
 * dropped, surrounding whitespace is trimmed and the result is lower-cased.
 *
 * @param {string} contentType - Raw Content-Type header value.
 * @returns {string} Lower-cased media type, or "" when there is none.
 */
function normalizeContentType(contentType) {
  const [mediaType] = contentType.split(";");
  if (mediaType === undefined || mediaType === null) {
    return "";
  }
  return mediaType.trim().toLowerCase();
}
|
|
@@ -9902,6 +9903,32 @@ function isLikelyJsonBody(body) {
|
|
|
9902
9903
|
/**
 * Decides whether a response should be treated as JSON.
 *
 * The declared content type is consulted first; the body is only sniffed
 * when the content type does not already identify the response as JSON.
 *
 * @param {string} contentType - Content-Type value of the response.
 * @param {string} body - Response body text.
 * @returns Result of the content-type check when truthy, otherwise the
 *   result of the body check.
 */
function isJsonResponse(contentType, body) {
  const declaredAsJson = isJsonContentType(contentType);
  if (declaredAsJson) {
    return declaredAsJson;
  }
  return isLikelyJsonBody(body);
}
|
|
9906
|
+
/**
 * Decodes the HTML character references that commonly appear inside an
 * attribute value: `&amp;`, `&quot;`, `&apos;`, `&lt;`, `&gt;` and numeric
 * references such as `&#39;` or `&#x3D;`.
 *
 * Decoding happens in a single pass so that already-decoded output is never
 * decoded again (`&amp;lt;` becomes `&lt;`, not `<`).
 *
 * @param {string} value - Raw attribute value as it appears in the markup.
 * @returns {string} The value with the supported references replaced;
 *   unknown or invalid references are left untouched.
 */
function decodeHtmlAttribute(value) {
  const namedEntities = { amp: "&", quot: '"', apos: "'", lt: "<", gt: ">" };
  return value.replace(
    /&(?:#(\d+)|#x([0-9a-f]+)|(amp|quot|apos|lt|gt));/gi,
    (entity, decimal, hex, name) => {
      if (name !== undefined) {
        return namedEntities[name.toLowerCase()];
      }
      const codePoint = decimal !== undefined ? Number.parseInt(decimal, 10) : Number.parseInt(hex, 16);
      // NUL, surrogates and out-of-range values are not valid characters; keep the text as written.
      const isInvalid = codePoint === 0 || codePoint > 0x10ffff || (codePoint >= 0xd800 && codePoint <= 0xdfff);
      return isInvalid ? entity : String.fromCodePoint(codePoint);
    }
  );
}
|
|
9909
|
+
/**
 * Looks for an HTML `<meta http-equiv="refresh">` redirect near the top of a
 * document and resolves its target.
 *
 * Only the first 4096 characters of `body` are inspected. The first `<meta>`
 * tag that declares `http-equiv=refresh` and also has a `content` attribute
 * (in either order) decides the outcome. The `content` value may be wrapped in
 * double or single quotes, so a URL quoted with the other quote character
 * inside it (`content="0; url='https://…'"`) is read in full.
 *
 * @param {string} body - Response body text.
 * @param {string} baseUrl - URL the body was fetched from; relative targets
 *   are resolved against it.
 * @returns {string | null} Absolute http(s) URL to follow, or null when there
 *   is no usable redirect: no refresh tag, no `url=` part, a delay that is
 *   not a finite number in [0, 30), an unparsable or non-http(s) target, or a
 *   target that is the page itself.
 */
function extractClientSideRedirect(body, baseUrl) {
  const snippet = body.slice(0, 4096);

  let refreshContent;
  for (const [metaTag] of snippet.matchAll(/<meta\b[^>]*>/gi)) {
    if (!/http-equiv\s*=\s*["']?refresh["']?/i.test(metaTag)) {
      continue;
    }
    // The lookbehind keeps attributes such as `data-content=` from being mistaken for `content=`.
    const contentMatch = metaTag.match(/(?<![\w-])content\s*=\s*(?:"([^"]*)"|'([^']*)'|([^"'>]*))/i);
    if (contentMatch) {
      refreshContent = contentMatch[1] ?? contentMatch[2] ?? contentMatch[3];
      break;
    }
  }
  if (!refreshContent) {
    return null;
  }

  // Refresh content has the shape "<delay>; url=<target>"; the target may itself contain ";".
  const [delayPart = "", ...rest] = decodeHtmlAttribute(refreshContent).split(";");
  const delaySeconds = Number.parseFloat(delayPart.trim());
  const urlMatch = rest.join(";").match(/\burl\s*=\s*(.+)$/i);
  const rawTarget = urlMatch?.[1]?.trim().replace(/^['"]|['"]$/g, "");
  if (!rawTarget || !Number.isFinite(delaySeconds) || delaySeconds < 0 || delaySeconds >= 30) {
    return null;
  }

  try {
    const target = new URL(rawTarget, baseUrl);
    // The markup is untrusted: never hand javascript:, data:, file:, … targets to the fetcher.
    if (target.protocol !== "http:" && target.protocol !== "https:") {
      return null;
    }
    const targetUrl = target.toString();
    // Compare against the normalised base as well, so "https://a" vs "https://a/" counts as a self-refresh.
    const isSelfRefresh = targetUrl === baseUrl || targetUrl === new URL(baseUrl).toString();
    return isSelfRefresh ? null : targetUrl;
  } catch {
    // An unparsable target or base means there is nothing to follow.
    return null;
  }
}
|
|
9905
9932
|
function buildJsonResult(opts, finalUrl, rawBody, format, maxChars, browser, os) {
|
|
9906
9933
|
const parsedJson = parseAndFormatJson(rawBody);
|
|
9907
9934
|
if ("error" in parsedJson) {
|
|
@@ -9981,7 +10008,7 @@ function shouldStripReplies(site) {
|
|
|
9981
10008
|
return site === "Hacker News" || site.startsWith("r/") || site.startsWith("GitHub - ");
|
|
9982
10009
|
}
|
|
9983
10010
|
function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
9984
|
-
|
|
10011
|
+
async function fetchWithClientRedirects(opts, hooks, clientSideRedirectCount) {
|
|
9985
10012
|
const browser = opts.browser ?? DEFAULT_BROWSER;
|
|
9986
10013
|
const os = opts.os ?? DEFAULT_OS;
|
|
9987
10014
|
const format = opts.format ?? "markdown";
|
|
@@ -10109,6 +10136,27 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10109
10136
|
}
|
|
10110
10137
|
errorContext.phase = "loading";
|
|
10111
10138
|
const rawBody = await response.text();
|
|
10139
|
+
const clientSideRedirect = extractClientSideRedirect(rawBody, finalUrl);
|
|
10140
|
+
if (clientSideRedirect) {
|
|
10141
|
+
if (clientSideRedirectCount >= MAX_CLIENT_SIDE_REDIRECTS) {
|
|
10142
|
+
return {
|
|
10143
|
+
error: `Client-side redirect limit (${MAX_CLIENT_SIDE_REDIRECTS}) exceeded while fetching ${opts.url}.`,
|
|
10144
|
+
code: "too_many_redirects",
|
|
10145
|
+
phase: "loading",
|
|
10146
|
+
retryable: false,
|
|
10147
|
+
timeoutMs,
|
|
10148
|
+
url: opts.url,
|
|
10149
|
+
finalUrl,
|
|
10150
|
+
mimeType: normalizeContentType(contentType) || void 0,
|
|
10151
|
+
contentLength: errorContext.contentLength
|
|
10152
|
+
};
|
|
10153
|
+
}
|
|
10154
|
+
return fetchWithClientRedirects(
|
|
10155
|
+
{ ...opts, url: clientSideRedirect },
|
|
10156
|
+
hooks,
|
|
10157
|
+
clientSideRedirectCount + 1
|
|
10158
|
+
);
|
|
10159
|
+
}
|
|
10112
10160
|
const jsonResponse = isJsonResponse(contentType, rawBody);
|
|
10113
10161
|
if (format === "json") {
|
|
10114
10162
|
if (!jsonResponse) {
|
|
@@ -10242,7 +10290,7 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10242
10290
|
fallbackDocument,
|
|
10243
10291
|
opts.url
|
|
10244
10292
|
);
|
|
10245
|
-
if (hasOembed404 || hasJsDisabledShell) {
|
|
10293
|
+
if ((hasOembed404 || hasJsDisabledShell) && !extracted.content) {
|
|
10246
10294
|
return {
|
|
10247
10295
|
error: `Server returned HTTP 404 Not Found for ${opts.url}.`,
|
|
10248
10296
|
code: "http_error",
|
|
@@ -10314,6 +10362,9 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10314
10362
|
emitProgress(hooks, { status: "error", progress: 1, phase: "error" });
|
|
10315
10363
|
return fetchError;
|
|
10316
10364
|
}
|
|
10365
|
+
}
|
|
10366
|
+
return function defuddleFetch2(opts, hooks = {}) {
|
|
10367
|
+
return fetchWithClientRedirects(opts, hooks, 0);
|
|
10317
10368
|
};
|
|
10318
10369
|
}
|
|
10319
10370
|
var defuddleFetch = createDefuddleFetch();
|