openclaw-smart-fetch 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/index.js +87 -5
- package/dist/index.js.map +1 -1
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
- package/skills/smart-fetch/SKILL.md +4 -1
package/README.md
CHANGED
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
- 🔁 **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
|
|
12
12
|
- 🔗 **Alternate content fallback** — when extraction produces no/thin content, follows qualified `<link rel="alternate" type="...">` entries in `<head>` that match the requested output format
|
|
13
13
|
- ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
|
|
14
|
-
- 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`
|
|
14
|
+
- 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`, `raw`
|
|
15
15
|
- 🔄 **Built-in `web_fetch` fallback** — automatically improves the core web_fetch tool
|
|
16
16
|
- 📖 **Bundled skill** — agents get usage guidance injected into their system prompt
|
|
17
17
|
|
|
@@ -139,6 +139,7 @@ Skills are declared in the manifest (`openclaw.plugin.json`) under `"skills":
|
|
|
139
139
|
| `html` | Cleaned HTML output |
|
|
140
140
|
| `text` | Plain text with markdown stripped |
|
|
141
141
|
| `json` | Structured JSON for metadata-heavy workflows |
|
|
142
|
+
| `raw` | Full raw server response without extraction or truncation — for further parsing |
|
|
142
143
|
|
|
143
144
|
## Plugin config
|
|
144
145
|
|
package/dist/index.js
CHANGED
|
@@ -9958,6 +9958,7 @@ var DEFAULT_TIMEOUT_MS = 15e3;
|
|
|
9958
9958
|
var DEFAULT_BATCH_CONCURRENCY = 8;
|
|
9959
9959
|
var DEFAULT_INCLUDE_REPLIES = "extractors";
|
|
9960
9960
|
var DEFAULT_ACCEPT_HEADER = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
|
|
9961
|
+
var DEFAULT_RAW_ACCEPT_HEADER = "text/html,application/xhtml+xml,application/json,application/xml;q=0.9,text/markdown;q=0.8,text/plain;q=0.8,*/*;q=0.7";
|
|
9961
9962
|
var DEFAULT_JSON_ACCEPT_HEADER = "application/json,text/json,application/ld+json;q=0.9,text/plain;q=0.8,*/*;q=0.7";
|
|
9962
9963
|
var DEFAULT_ACCEPT_LANGUAGE_HEADER = "en-US,en;q=0.9";
|
|
9963
9964
|
|
|
@@ -10175,7 +10176,8 @@ function buildCompactMetadataHeader(result) {
|
|
|
10175
10176
|
["URL", result.finalUrl],
|
|
10176
10177
|
["Title", result.title],
|
|
10177
10178
|
["Author", result.author],
|
|
10178
|
-
["Published", result.published]
|
|
10179
|
+
["Published", result.published],
|
|
10180
|
+
["Content-Type", result.contentType]
|
|
10179
10181
|
]);
|
|
10180
10182
|
}
|
|
10181
10183
|
function buildMetadataHeader(result) {
|
|
@@ -10193,6 +10195,7 @@ function buildMetadataHeader(result) {
|
|
|
10193
10195
|
["Title", result.title],
|
|
10194
10196
|
["Author", result.author],
|
|
10195
10197
|
["Published", result.published],
|
|
10198
|
+
["Content-Type", result.contentType],
|
|
10196
10199
|
["Site", result.site],
|
|
10197
10200
|
["Language", result.language],
|
|
10198
10201
|
["Words", result.wordCount],
|
|
@@ -10775,7 +10778,9 @@ function mapRequestEventToProgress(event) {
|
|
|
10775
10778
|
}
|
|
10776
10779
|
}
|
|
10777
10780
|
function resolveAcceptHeader(format) {
|
|
10778
|
-
|
|
10781
|
+
if (format === "json") return DEFAULT_JSON_ACCEPT_HEADER;
|
|
10782
|
+
if (format === "raw") return DEFAULT_RAW_ACCEPT_HEADER;
|
|
10783
|
+
return DEFAULT_ACCEPT_HEADER;
|
|
10779
10784
|
}
|
|
10780
10785
|
function isJsonContentType(contentType) {
|
|
10781
10786
|
const normalized = contentType.split(";")[0]?.trim().toLowerCase() ?? "";
|
|
@@ -10796,7 +10801,8 @@ function extractQualifiedAlternateLinks(document, baseUrl, format) {
|
|
|
10796
10801
|
markdown: ["text/markdown", "text/x-markdown"],
|
|
10797
10802
|
text: ["text/plain", "text/markdown", "text/x-markdown"],
|
|
10798
10803
|
html: ["text/html", "application/xhtml+xml"],
|
|
10799
|
-
json: ["application/json", "text/json"]
|
|
10804
|
+
json: ["application/json", "text/json"],
|
|
10805
|
+
raw: []
|
|
10800
10806
|
};
|
|
10801
10807
|
const accepted = acceptedTypes[format];
|
|
10802
10808
|
const head = document.head;
|
|
@@ -11074,6 +11080,81 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
11074
11080
|
);
|
|
11075
11081
|
}
|
|
11076
11082
|
const jsonResponse = isJsonResponse(contentType, rawBody);
|
|
11083
|
+
if (format === "raw") {
|
|
11084
|
+
const isXUrl2 = /^https?:\/\/(www\.)?(x\.com|twitter\.com)\//i.test(
|
|
11085
|
+
opts.url
|
|
11086
|
+
);
|
|
11087
|
+
if (isXUrl2) {
|
|
11088
|
+
let extractedContent2;
|
|
11089
|
+
const suppressedErrors2 = [];
|
|
11090
|
+
const origConsoleError = console.error;
|
|
11091
|
+
console.error = (...args) => {
|
|
11092
|
+
suppressedErrors2.push(args);
|
|
11093
|
+
};
|
|
11094
|
+
try {
|
|
11095
|
+
const extractionDocument2 = parseLinkedomHTML(rawBody, finalUrl);
|
|
11096
|
+
const extracted2 = await dependencies.defuddle(
|
|
11097
|
+
extractionDocument2,
|
|
11098
|
+
finalUrl,
|
|
11099
|
+
{
|
|
11100
|
+
markdown: true,
|
|
11101
|
+
removeImages,
|
|
11102
|
+
includeReplies
|
|
11103
|
+
}
|
|
11104
|
+
);
|
|
11105
|
+
extractedContent2 = extracted2.content;
|
|
11106
|
+
} finally {
|
|
11107
|
+
console.error = origConsoleError;
|
|
11108
|
+
}
|
|
11109
|
+
const hasOembed404 = suppressedErrors2.some(
|
|
11110
|
+
(args) => args.some(
|
|
11111
|
+
(arg) => typeof arg === "string" && arg.includes("oEmbed request failed: 404")
|
|
11112
|
+
)
|
|
11113
|
+
);
|
|
11114
|
+
const hasJsDisabledShell = isTwitterJsDisabledPage(
|
|
11115
|
+
parseLinkedomHTML(rawBody, finalUrl),
|
|
11116
|
+
opts.url
|
|
11117
|
+
);
|
|
11118
|
+
if ((hasOembed404 || hasJsDisabledShell) && !extractedContent2) {
|
|
11119
|
+
return {
|
|
11120
|
+
error: `Server returned HTTP 404 Not Found for ${opts.url}.`,
|
|
11121
|
+
code: "http_error",
|
|
11122
|
+
phase: "loading",
|
|
11123
|
+
retryable: false,
|
|
11124
|
+
timeoutMs,
|
|
11125
|
+
url: opts.url,
|
|
11126
|
+
finalUrl,
|
|
11127
|
+
statusCode: 404,
|
|
11128
|
+
statusText: "Not Found",
|
|
11129
|
+
mimeType: normalizeContentType(contentType) || void 0,
|
|
11130
|
+
contentLength: errorContext.contentLength
|
|
11131
|
+
};
|
|
11132
|
+
}
|
|
11133
|
+
}
|
|
11134
|
+
const effectiveContent = opts.maxChars !== void 0 ? truncateContent(rawBody, maxChars) : rawBody;
|
|
11135
|
+
const result2 = {
|
|
11136
|
+
kind: "content",
|
|
11137
|
+
url: opts.url,
|
|
11138
|
+
finalUrl,
|
|
11139
|
+
title: "",
|
|
11140
|
+
author: "",
|
|
11141
|
+
published: "",
|
|
11142
|
+
site: new URL(finalUrl).hostname,
|
|
11143
|
+
language: "",
|
|
11144
|
+
wordCount: 0,
|
|
11145
|
+
content: effectiveContent,
|
|
11146
|
+
browser,
|
|
11147
|
+
os,
|
|
11148
|
+
contentType: normalizeContentType(contentType) || void 0
|
|
11149
|
+
};
|
|
11150
|
+
emitStatus(hooks, "done");
|
|
11151
|
+
emitProgress(hooks, {
|
|
11152
|
+
status: "done",
|
|
11153
|
+
progress: 1,
|
|
11154
|
+
phase: "raw_done"
|
|
11155
|
+
});
|
|
11156
|
+
return result2;
|
|
11157
|
+
}
|
|
11077
11158
|
if (format === "json") {
|
|
11078
11159
|
if (!jsonResponse) {
|
|
11079
11160
|
if (HTML_CONTENT_TYPES.some((value) => contentType.includes(value))) {
|
|
@@ -11379,10 +11460,11 @@ function createBaseFetchToolParameterProperties(defaults) {
|
|
|
11379
11460
|
Type.Literal("markdown"),
|
|
11380
11461
|
Type.Literal("html"),
|
|
11381
11462
|
Type.Literal("text"),
|
|
11382
|
-
Type.Literal("json")
|
|
11463
|
+
Type.Literal("json"),
|
|
11464
|
+
Type.Literal("raw")
|
|
11383
11465
|
],
|
|
11384
11466
|
{
|
|
11385
|
-
description: 'Output format. "markdown" (default), "html" (cleaned HTML), "text" (plain text, no formatting),
|
|
11467
|
+
description: 'Output format. "markdown" (default), "html" (cleaned HTML), "text" (plain text, no formatting), "json" (pretty-printed JSON), or "raw" (full raw server response without extraction or truncation, for further parsing)'
|
|
11386
11468
|
}
|
|
11387
11469
|
)
|
|
11388
11470
|
),
|