openclaw-smart-fetch 0.2.30 → 0.2.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +106 -23
- package/dist/index.d.ts +32 -0
- package/dist/index.js +105 -20
- package/dist/index.js.map +1 -1
- package/openclaw.plugin.json +8 -3
- package/package.json +2 -1
- package/skills/smart-fetch/SKILL.md +99 -0
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# openclaw-smart-fetch
|
|
2
2
|
|
|
3
|
-
`openclaw-smart-fetch` adds smarter web fetching tools to OpenClaw.
|
|
3
|
+
`openclaw-smart-fetch` adds smarter web fetching tools to [OpenClaw](https://github.com/nicepkg/openclaw).
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
6
|
|
|
@@ -10,18 +10,26 @@
|
|
|
10
10
|
- 📦 **Downloads + large file support** — stream attachments and binaries to temp files
|
|
11
11
|
- ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
|
|
12
12
|
- 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`
|
|
13
|
+
- 🔄 **Built-in `web_fetch` fallback** — automatically registers as a fallback provider for the core `web_fetch` tool
|
|
14
|
+
- 📖 **Bundled skill** — agents get usage guidance injected into their system prompt
|
|
13
15
|
|
|
14
16
|
## Site optimisations
|
|
15
17
|
|
|
16
|
-
This package works on general web pages, but some site types benefit especially
|
|
18
|
+
This package works on general web pages, but some site types benefit especially
|
|
19
|
+
from Defuddle's extractors and cleanup:
|
|
17
20
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
21
|
+
| Site / page type | What's improved |
|
|
22
|
+
|---|---|
|
|
23
|
+
| **X / Twitter posts** | oEmbed-based tweet extraction; deleted/protected tweet detection |
|
|
24
|
+
| **Reddit posts & threads** | Comment thread extraction with `includeReplies` |
|
|
25
|
+
| **YouTube** | Page metadata and transcript extraction |
|
|
26
|
+
| **GitHub** | READMEs, issues, PRs, discussions — strips chrome, keeps code blocks |
|
|
27
|
+
| **Hacker News** | Thread extraction with comment cleanup |
|
|
28
|
+
| **Substack / Medium** | Article content with author, publish date, paywall bypass on open pages |
|
|
29
|
+
| **Stack Overflow** | Q&A extraction with code blocks and accepted answers |
|
|
30
|
+
| **Wikipedia** | Article content with infobox cleanup |
|
|
31
|
+
| **Documentation sites** | Keeps code blocks, callouts, footnotes, math (MathML/KaTeX/MathJax) |
|
|
32
|
+
| **Blog posts & articles** | Schema.org metadata, clean main-content extraction |
|
|
25
33
|
|
|
26
34
|
Notes:
|
|
27
35
|
- Defuddle is the cleanup layer: it strips common page chrome like nav, sidebars, related links, share widgets, and footers
|
|
@@ -44,8 +52,9 @@ openclaw plugins install -l /absolute/path/to/agent-smart-fetch/packages/opencla
|
|
|
44
52
|
## OpenClaw tools
|
|
45
53
|
|
|
46
54
|
Registers:
|
|
47
|
-
|
|
48
|
-
- `
|
|
55
|
+
|
|
56
|
+
- `smart_fetch` — single URL fetch with TLS fingerprinting and Defuddle extraction
|
|
57
|
+
- `batch_smart_fetch` — multiple URLs with bounded concurrency and per-item results
|
|
49
58
|
|
|
50
59
|
Synopsis:
|
|
51
60
|
|
|
@@ -56,6 +65,69 @@ batch_smart_fetch(requests)
|
|
|
56
65
|
|
|
57
66
|
For `batch_smart_fetch`, each item in `requests` accepts the same parameters as `smart_fetch`.
|
|
58
67
|
|
|
68
|
+
## Built-in `web_fetch` fallback provider
|
|
69
|
+
|
|
70
|
+
When this plugin is installed and enabled, it **automatically registers as a
|
|
71
|
+
WebFetch provider** for OpenClaw's built-in `web_fetch` tool. No extra
|
|
72
|
+
configuration needed.
|
|
73
|
+
|
|
74
|
+
### How it works
|
|
75
|
+
|
|
76
|
+
When `web_fetch`'s built-in HTTP + Readability extraction fails (e.g. the page
|
|
77
|
+
blocks plain HTTP clients or Readability can't find content), OpenClaw calls
|
|
78
|
+
the smart_fetch provider as a fallback. The provider runs the full
|
|
79
|
+
TLS-fingerprinted + Defuddle pipeline and returns clean content.
|
|
80
|
+
|
|
81
|
+
This means you get smart_fetch's better extraction on bot-protected sites
|
|
82
|
+
_without replacing `web_fetch` or changing any agent prompts_.
|
|
83
|
+
|
|
84
|
+
### Provider priority
|
|
85
|
+
|
|
86
|
+
| Provider | `autoDetectOrder` | Credential required |
|
|
87
|
+
|-----------------|:-------------------:|:-------------------|
|
|
88
|
+
| **smart-fetch** | **10** (lowest value = highest priority) | No |
|
|
89
|
+
| firecrawl | 50 | Yes (API key) |
|
|
90
|
+
|
|
91
|
+
Because `smart-fetch` has the highest priority and requires no credentials, it
|
|
92
|
+
is selected first during auto-detection. If the smart_fetch provider itself
|
|
93
|
+
fails (e.g. the page needs full browser automation), OpenClaw falls through to
|
|
94
|
+
the next configured provider.
|
|
95
|
+
|
|
96
|
+
### Explicit provider selection
|
|
97
|
+
|
|
98
|
+
You can force the built-in `web_fetch` to use the `smart-fetch` provider when it needs a fallback:
|
|
99
|
+
|
|
100
|
+
```json5
|
|
101
|
+
{
|
|
102
|
+
"tools": {
|
|
103
|
+
"web": {
|
|
104
|
+
"fetch": {
|
|
105
|
+
"provider": "smart-fetch"
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Note: setting `provider` only affects which provider is selected as the fallback —
|
|
113
|
+
the built-in HTTP fetch still runs first. The provider is only called when
|
|
114
|
+
Readability extraction fails (or when `readability: false` is set for HTML
|
|
115
|
+
responses). The provider then re-fetches the URL with its own TLS-fingerprinted
|
|
116
|
+
client, so there is a double-fetch cost when the fallback kicks in.
|
|
117
|
+
|
|
118
|
+
## Bundled skill
|
|
119
|
+
|
|
120
|
+
The plugin ships a skill (`smart-fetch`) that OpenClaw injects into agent
|
|
121
|
+
system prompts when the plugin is enabled. The skill documents:
|
|
122
|
+
|
|
123
|
+
- When to prefer `smart_fetch` over `web_fetch` or the browser tool
|
|
124
|
+
- Parameter reference for both tools
|
|
125
|
+
- Workflow escalation pattern (smart_fetch → batch → web_fetch → browser)
|
|
126
|
+
- The automatic fallback behavior
|
|
127
|
+
|
|
128
|
+
Skills are declared in the manifest (`openclaw.plugin.json`) under `"skills":
|
|
129
|
+
["./skills"]` and loaded from `skills/smart-fetch/SKILL.md`.
|
|
130
|
+
|
|
59
131
|
## Output formats
|
|
60
132
|
|
|
61
133
|
| Format | What you get |
|
|
@@ -65,20 +137,29 @@ For `batch_smart_fetch`, each item in `requests` accepts the same parameters as
|
|
|
65
137
|
| `text` | Plain text with markdown stripped |
|
|
66
138
|
| `json` | Structured JSON for metadata-heavy workflows |
|
|
67
139
|
|
|
68
|
-
## Plugin
|
|
140
|
+
## Plugin config
|
|
69
141
|
|
|
70
|
-
See `openclaw.plugin.json` for the schema.
|
|
142
|
+
See `openclaw.plugin.json` for the full schema. Configure under
|
|
143
|
+
`plugins.entries.smart-fetch.config`:
|
|
71
144
|
|
|
72
|
-
```
|
|
145
|
+
```json5
|
|
73
146
|
{
|
|
74
|
-
"
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
147
|
+
"plugins": {
|
|
148
|
+
"entries": {
|
|
149
|
+
"smart-fetch": {
|
|
150
|
+
"enabled": true,
|
|
151
|
+
"config": {
|
|
152
|
+
"maxChars": 50000,
|
|
153
|
+
"timeoutMs": 15000,
|
|
154
|
+
"browser": "chrome_145",
|
|
155
|
+
"os": "windows",
|
|
156
|
+
"removeImages": false,
|
|
157
|
+
"includeReplies": "extractors",
|
|
158
|
+
"batchConcurrency": 8
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
}
|
|
82
163
|
}
|
|
83
164
|
```
|
|
84
165
|
|
|
@@ -95,4 +176,6 @@ See `openclaw.plugin.json` for the schema. The effective defaults are:
|
|
|
95
176
|
|
|
96
177
|
## Dev and publishing note
|
|
97
178
|
|
|
98
|
-
This repo uses Bun for local development, tests, and workspace scripts. Package
|
|
179
|
+
This repo uses Bun for local development, tests, and workspace scripts. Package
|
|
180
|
+
publishing still goes through `npm publish` in CI so npm Trusted Publishing can
|
|
181
|
+
be used.
|
package/dist/index.d.ts
CHANGED
|
@@ -21,6 +21,35 @@ interface FetchToolDefaults {
|
|
|
21
21
|
}
|
|
22
22
|
|
|
23
23
|
type PluginConfig = FetchToolConfig;
|
|
24
|
+
/**
|
|
25
|
+
* WebFetch provider plugin shape — subset of OpenClaw's WebFetchProviderPlugin
|
|
26
|
+
* that we need for registration. Defined locally to avoid importing from the
|
|
27
|
+
* openclaw plugin SDK (which may not be installed in all environments).
|
|
28
|
+
*/
|
|
29
|
+
interface WebFetchProvider {
|
|
30
|
+
id: string;
|
|
31
|
+
label: string;
|
|
32
|
+
hint: string;
|
|
33
|
+
requiresCredential?: boolean;
|
|
34
|
+
envVars: string[];
|
|
35
|
+
placeholder: string;
|
|
36
|
+
signupUrl: string;
|
|
37
|
+
docsUrl?: string;
|
|
38
|
+
autoDetectOrder?: number;
|
|
39
|
+
credentialPath: string;
|
|
40
|
+
getCredentialValue: (fetchConfig?: Record<string, unknown>) => unknown;
|
|
41
|
+
setCredentialValue: (fetchConfigTarget: Record<string, unknown>, value: unknown) => void;
|
|
42
|
+
getConfiguredCredentialValue?: (config?: Record<string, unknown>) => unknown;
|
|
43
|
+
setConfiguredCredentialValue?: (configTarget: Record<string, unknown>, value: unknown) => void;
|
|
44
|
+
applySelectionConfig?: (config: Record<string, unknown>) => Record<string, unknown>;
|
|
45
|
+
createTool: (ctx: {
|
|
46
|
+
config?: Record<string, unknown>;
|
|
47
|
+
}) => {
|
|
48
|
+
description: string;
|
|
49
|
+
parameters: Record<string, unknown>;
|
|
50
|
+
execute: (args: Record<string, unknown>) => Promise<Record<string, unknown>>;
|
|
51
|
+
} | null;
|
|
52
|
+
}
|
|
24
53
|
interface ToolRegistrationApi {
|
|
25
54
|
pluginConfig?: PluginConfig;
|
|
26
55
|
registerTool(definition: {
|
|
@@ -35,12 +64,15 @@ interface ToolRegistrationApi {
|
|
|
35
64
|
isError?: boolean;
|
|
36
65
|
}>;
|
|
37
66
|
}): void;
|
|
67
|
+
/** Register a WebFetch provider for the built-in web_fetch fallback pipeline. */
|
|
68
|
+
registerWebFetchProvider?: (provider: WebFetchProvider) => void;
|
|
38
69
|
logger: {
|
|
39
70
|
info(message: string): void;
|
|
40
71
|
};
|
|
41
72
|
}
|
|
42
73
|
|
|
43
74
|
declare const resolvePluginDefaults: (pluginConfig?: PluginConfig) => FetchToolDefaults;
|
|
75
|
+
|
|
44
76
|
declare const plugin: {
|
|
45
77
|
id: string;
|
|
46
78
|
name: string;
|
package/dist/index.js
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
import { tmpdir } from 'os';
|
|
2
|
-
import { join, parse } from 'path';
|
|
3
1
|
import { Type } from '@sinclair/typebox';
|
|
4
2
|
import { randomUUID } from 'crypto';
|
|
5
3
|
import { once } from 'events';
|
|
6
4
|
import { createWriteStream } from 'fs';
|
|
7
5
|
import { mkdir, chmod, writeFile, unlink } from 'fs/promises';
|
|
6
|
+
import { tmpdir } from 'os';
|
|
7
|
+
import { join, parse } from 'path';
|
|
8
8
|
import { pipeline } from 'stream/promises';
|
|
9
9
|
import { Defuddle } from 'defuddle/node';
|
|
10
10
|
import { getProfiles, fetch } from 'wreq-js';
|
|
@@ -10203,25 +10203,10 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10203
10203
|
});
|
|
10204
10204
|
const fallbackDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10205
10205
|
const extractionDocument = parseLinkedomHTML(rawBody, finalUrl);
|
|
10206
|
-
if (isTwitterJsDisabledPage(fallbackDocument, opts.url)) {
|
|
10207
|
-
return {
|
|
10208
|
-
error: `Server returned HTTP 404 Not Found for ${opts.url}.`,
|
|
10209
|
-
code: "http_error",
|
|
10210
|
-
phase: "loading",
|
|
10211
|
-
retryable: false,
|
|
10212
|
-
timeoutMs,
|
|
10213
|
-
url: opts.url,
|
|
10214
|
-
finalUrl,
|
|
10215
|
-
statusCode: 404,
|
|
10216
|
-
statusText: "Not Found",
|
|
10217
|
-
mimeType: normalizeContentType(contentType) || void 0,
|
|
10218
|
-
contentLength: errorContext.contentLength
|
|
10219
|
-
};
|
|
10220
|
-
}
|
|
10221
10206
|
let extracted;
|
|
10207
|
+
const suppressedErrors = [];
|
|
10222
10208
|
try {
|
|
10223
10209
|
const origConsoleError = console.error;
|
|
10224
|
-
const suppressedErrors = [];
|
|
10225
10210
|
console.error = (...args) => {
|
|
10226
10211
|
suppressedErrors.push(args);
|
|
10227
10212
|
};
|
|
@@ -10244,6 +10229,35 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
10244
10229
|
wordCount: 0
|
|
10245
10230
|
};
|
|
10246
10231
|
}
|
|
10232
|
+
const isXUrl = /^https?:\/\/(www\.)?(x\.com|twitter\.com)\//i.test(
|
|
10233
|
+
opts.url
|
|
10234
|
+
);
|
|
10235
|
+
if (isXUrl) {
|
|
10236
|
+
const hasOembed404 = suppressedErrors.some(
|
|
10237
|
+
(args) => args.some(
|
|
10238
|
+
(arg) => typeof arg === "string" && arg.includes("oEmbed request failed: 404")
|
|
10239
|
+
)
|
|
10240
|
+
);
|
|
10241
|
+
const hasJsDisabledShell = isTwitterJsDisabledPage(
|
|
10242
|
+
fallbackDocument,
|
|
10243
|
+
opts.url
|
|
10244
|
+
);
|
|
10245
|
+
if (hasOembed404 || hasJsDisabledShell) {
|
|
10246
|
+
return {
|
|
10247
|
+
error: `Server returned HTTP 404 Not Found for ${opts.url}.`,
|
|
10248
|
+
code: "http_error",
|
|
10249
|
+
phase: "loading",
|
|
10250
|
+
retryable: false,
|
|
10251
|
+
timeoutMs,
|
|
10252
|
+
url: opts.url,
|
|
10253
|
+
finalUrl,
|
|
10254
|
+
statusCode: 404,
|
|
10255
|
+
statusText: "Not Found",
|
|
10256
|
+
mimeType: normalizeContentType(contentType) || void 0,
|
|
10257
|
+
contentLength: errorContext.contentLength
|
|
10258
|
+
};
|
|
10259
|
+
}
|
|
10260
|
+
}
|
|
10247
10261
|
let extractedContent = extracted.content;
|
|
10248
10262
|
let wordCount = extracted.wordCount;
|
|
10249
10263
|
if (!extractedContent || wordCount === 0) {
|
|
@@ -10549,12 +10563,80 @@ async function executeBatchFetchToolCall(params, defaults, options = {}) {
|
|
|
10549
10563
|
batchConcurrency
|
|
10550
10564
|
};
|
|
10551
10565
|
}
|
|
10552
|
-
|
|
10553
|
-
// src/index.ts
|
|
10554
10566
|
var resolvePluginDefaults = (pluginConfig = {}) => resolveFetchToolDefaults({
|
|
10555
10567
|
tempDir: join(tmpdir(), "smart-fetch-openclaw"),
|
|
10556
10568
|
...pluginConfig
|
|
10557
10569
|
});
|
|
10570
|
+
function createSmartFetchWebFetchProvider() {
|
|
10571
|
+
return {
|
|
10572
|
+
id: "smart-fetch",
|
|
10573
|
+
label: "Smart Fetch",
|
|
10574
|
+
hint: "TLS-fingerprinted fetch with Defuddle extraction. Fallback for bot-protected sites.",
|
|
10575
|
+
requiresCredential: false,
|
|
10576
|
+
envVars: [],
|
|
10577
|
+
placeholder: "No API key required",
|
|
10578
|
+
signupUrl: "https://github.com/Thinkscape/agent-smart-fetch",
|
|
10579
|
+
docsUrl: "https://github.com/Thinkscape/agent-smart-fetch#readme",
|
|
10580
|
+
autoDetectOrder: 10,
|
|
10581
|
+
// Lower value = higher priority; 10 is selected ahead of firecrawl (50)
|
|
10582
|
+
credentialPath: "",
|
|
10583
|
+
getCredentialValue: () => void 0,
|
|
10584
|
+
setCredentialValue: () => {
|
|
10585
|
+
},
|
|
10586
|
+
getConfiguredCredentialValue: () => void 0,
|
|
10587
|
+
setConfiguredCredentialValue: () => {
|
|
10588
|
+
},
|
|
10589
|
+
applySelectionConfig: (config) => config,
|
|
10590
|
+
createTool: (ctx) => ({
|
|
10591
|
+
description: "Fetch a URL using Smart Fetch (TLS fingerprinting + Defuddle extraction).",
|
|
10592
|
+
parameters: {},
|
|
10593
|
+
execute: async (args) => {
|
|
10594
|
+
const url = typeof args.url === "string" ? args.url : "";
|
|
10595
|
+
const extractMode = args.extractMode === "text" ? "text" : "markdown";
|
|
10596
|
+
const maxChars = typeof args.maxChars === "number" && Number.isFinite(args.maxChars) ? Math.floor(args.maxChars) : 5e4;
|
|
10597
|
+
if (!url) {
|
|
10598
|
+
return { text: "" };
|
|
10599
|
+
}
|
|
10600
|
+
const pluginConfig = extractPluginConfig(ctx.config);
|
|
10601
|
+
const defaults = resolvePluginDefaults(pluginConfig);
|
|
10602
|
+
try {
|
|
10603
|
+
const result = await executeFetchToolCall(
|
|
10604
|
+
{
|
|
10605
|
+
url,
|
|
10606
|
+
extractMode,
|
|
10607
|
+
maxChars
|
|
10608
|
+
},
|
|
10609
|
+
defaults
|
|
10610
|
+
);
|
|
10611
|
+
if (isError(result)) {
|
|
10612
|
+
return { text: "" };
|
|
10613
|
+
}
|
|
10614
|
+
return {
|
|
10615
|
+
text: buildFetchResponseText(result, { verbose: true }),
|
|
10616
|
+
title: result.title || void 0,
|
|
10617
|
+
finalUrl: result.finalUrl || url,
|
|
10618
|
+
extractor: "smart-fetch"
|
|
10619
|
+
};
|
|
10620
|
+
} catch {
|
|
10621
|
+
return { text: "" };
|
|
10622
|
+
}
|
|
10623
|
+
}
|
|
10624
|
+
})
|
|
10625
|
+
};
|
|
10626
|
+
}
|
|
10627
|
+
function extractPluginConfig(config) {
|
|
10628
|
+
if (!config) return void 0;
|
|
10629
|
+
const plugins = config.plugins;
|
|
10630
|
+
if (!plugins || typeof plugins !== "object") return void 0;
|
|
10631
|
+
const entries = plugins.entries;
|
|
10632
|
+
if (!entries || typeof entries !== "object") return void 0;
|
|
10633
|
+
const smartFetch = entries["smart-fetch"];
|
|
10634
|
+
if (!smartFetch || typeof smartFetch !== "object") return void 0;
|
|
10635
|
+
const pluginConfig = smartFetch.config;
|
|
10636
|
+
return pluginConfig;
|
|
10637
|
+
}
|
|
10638
|
+
|
|
10639
|
+
// src/index.ts
|
|
10558
10640
|
function renderToolResponse(result) {
|
|
10559
10641
|
return {
|
|
10560
10642
|
content: [
|
|
@@ -10570,6 +10652,9 @@ var plugin = {
|
|
|
10570
10652
|
name: "Smart Fetch",
|
|
10571
10653
|
description: "Clean web content extraction with TLS fingerprinting. Uses wreq-js (Rust native bindings) for browser-grade TLS and Defuddle for extraction.",
|
|
10572
10654
|
register(api) {
|
|
10655
|
+
if (api.registerWebFetchProvider) {
|
|
10656
|
+
api.registerWebFetchProvider(createSmartFetchWebFetchProvider());
|
|
10657
|
+
}
|
|
10573
10658
|
const defaults = resolvePluginDefaults(api.pluginConfig);
|
|
10574
10659
|
api.registerTool({
|
|
10575
10660
|
name: "smart_fetch",
|