openclaw-smart-fetch 0.2.31 → 0.2.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +106 -23
- package/dist/index.d.ts +32 -0
- package/dist/index.js +75 -4
- package/dist/index.js.map +1 -1
- package/openclaw.plugin.json +8 -3
- package/package.json +2 -1
- package/skills/smart-fetch/SKILL.md +99 -0
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# openclaw-smart-fetch
|
|
2
2
|
|
|
3
|
-
`openclaw-smart-fetch` adds smarter web fetching tools to OpenClaw.
|
|
3
|
+
`openclaw-smart-fetch` adds smarter web fetching tools to [OpenClaw](https://github.com/nicepkg/openclaw).
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
6
|
|
|
@@ -10,18 +10,26 @@
|
|
|
10
10
|
- 📦 **Downloads + large file support** — stream attachments and binaries to temp files
|
|
11
11
|
- ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
|
|
12
12
|
- 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`
|
|
13
|
+
- 🔄 **Built-in `web_fetch` fallback** — automatically registers as a fallback provider for the core `web_fetch` tool
|
|
14
|
+
- 📖 **Bundled skill** — agents get usage guidance injected into their system prompt
|
|
13
15
|
|
|
14
16
|
## Site optimisations
|
|
15
17
|
|
|
16
|
-
This package works on general web pages, but some site types benefit especially
|
|
18
|
+
This package works on general web pages, but some site types benefit especially
|
|
19
|
+
from Defuddle's extractors and cleanup:
|
|
17
20
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
21
|
+
| Site / page type | What's improved |
|
|
22
|
+
|---|---|
|
|
23
|
+
| **X / Twitter posts** | oEmbed-based tweet extraction; deleted/protected tweet detection |
|
|
24
|
+
| **Reddit posts & threads** | Comment thread extraction with `includeReplies` |
|
|
25
|
+
| **YouTube** | Page metadata and transcript extraction |
|
|
26
|
+
| **GitHub** | READMEs, issues, PRs, discussions — strips chrome, keeps code blocks |
|
|
27
|
+
| **Hacker News** | Thread extraction with comment cleanup |
|
|
28
|
+
| **Substack / Medium** | Article content with author, publish date, paywall bypass on open pages |
|
|
29
|
+
| **Stack Overflow** | Q&A extraction with code blocks and accepted answers |
|
|
30
|
+
| **Wikipedia** | Article content with infobox cleanup |
|
|
31
|
+
| **Documentation sites** | Keeps code blocks, callouts, footnotes, math (MathML/KaTeX/MathJax) |
|
|
32
|
+
| **Blog posts & articles** | Schema.org metadata, clean main-content extraction |
|
|
25
33
|
|
|
26
34
|
Notes:
|
|
27
35
|
- Defuddle is the cleanup layer: it strips common page chrome like nav, sidebars, related links, share widgets, and footers
|
|
@@ -44,8 +52,9 @@ openclaw plugins install -l /absolute/path/to/agent-smart-fetch/packages/opencla
|
|
|
44
52
|
## OpenClaw tools
|
|
45
53
|
|
|
46
54
|
Registers:
|
|
47
|
-
|
|
48
|
-
- `
|
|
55
|
+
|
|
56
|
+
- `smart_fetch` — single URL fetch with TLS fingerprinting and Defuddle extraction
|
|
57
|
+
- `batch_smart_fetch` — multiple URLs with bounded concurrency and per-item results
|
|
49
58
|
|
|
50
59
|
Synopsis:
|
|
51
60
|
|
|
@@ -56,6 +65,69 @@ batch_smart_fetch(requests)
|
|
|
56
65
|
|
|
57
66
|
For `batch_smart_fetch`, each item in `requests` accepts the same parameters as `smart_fetch`.
|
|
58
67
|
|
|
68
|
+
## Built-in `web_fetch` fallback provider
|
|
69
|
+
|
|
70
|
+
When this plugin is installed and enabled, it **automatically registers as a
|
|
71
|
+
WebFetch provider** for OpenClaw's built-in `web_fetch` tool. No extra
|
|
72
|
+
configuration needed.
|
|
73
|
+
|
|
74
|
+
### How it works
|
|
75
|
+
|
|
76
|
+
When `web_fetch`'s built-in HTTP + Readability extraction fails (e.g. the page
|
|
77
|
+
blocks plain HTTP clients or Readability can't find content), OpenClaw calls
|
|
78
|
+
the `smart-fetch` provider as a fallback. The provider runs the full
|
|
79
|
+
TLS-fingerprinted + Defuddle pipeline and returns clean content.
|
|
80
|
+
|
|
81
|
+
This means you get smart_fetch's better extraction on bot-protected sites
|
|
82
|
+
_without replacing `web_fetch` or changing any agent prompts_.
|
|
83
|
+
|
|
84
|
+
### Provider priority
|
|
85
|
+
|
|
86
|
+
| Provider | `autoDetectOrder` | Credential required |
|
|
87
|
+
|-----------------|:-------------------:|:-------------------|
|
|
88
|
+
| **smart-fetch** | **10** (highest priority; lower value wins) | No |
|
|
89
|
+
| firecrawl | 50 | Yes (API key) |
|
|
90
|
+
|
|
91
|
+
Because `smart-fetch` has the highest priority and requires no credentials, it
|
|
92
|
+
is selected first during auto-detection. If the `smart-fetch` provider itself
|
|
93
|
+
fails (e.g. the page needs full browser automation), OpenClaw falls through to
|
|
94
|
+
the next configured provider.
|
|
95
|
+
|
|
96
|
+
### Explicit provider selection
|
|
97
|
+
|
|
98
|
+
You can force the built-in `web_fetch` to use the `smart-fetch` provider when it needs a fallback:
|
|
99
|
+
|
|
100
|
+
```json5
|
|
101
|
+
{
|
|
102
|
+
"tools": {
|
|
103
|
+
"web": {
|
|
104
|
+
"fetch": {
|
|
105
|
+
"provider": "smart-fetch"
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Note: setting `provider` only affects which provider is selected as the fallback —
|
|
113
|
+
the built-in HTTP fetch still runs first. The provider is only called when
|
|
114
|
+
Readability extraction fails (or when `readability: false` is set for HTML
|
|
115
|
+
responses). The provider then re-fetches the URL with its own TLS-fingerprinted
|
|
116
|
+
client, so there is a double-fetch cost when the fallback kicks in.
|
|
117
|
+
|
|
118
|
+
## Bundled skill
|
|
119
|
+
|
|
120
|
+
The plugin ships a skill (`smart-fetch`) that OpenClaw injects into agent
|
|
121
|
+
system prompts when the plugin is enabled. The skill documents:
|
|
122
|
+
|
|
123
|
+
- When to prefer `smart_fetch` over `web_fetch` or the browser tool
|
|
124
|
+
- Parameter reference for both tools
|
|
125
|
+
- Workflow escalation pattern (smart_fetch → batch → web_fetch → browser)
|
|
126
|
+
- The automatic fallback behavior
|
|
127
|
+
|
|
128
|
+
Skills are declared in the manifest (`openclaw.plugin.json`) under `"skills":
|
|
129
|
+
["./skills"]` and loaded from `skills/smart-fetch/SKILL.md`.
|
|
130
|
+
|
|
59
131
|
## Output formats
|
|
60
132
|
|
|
61
133
|
| Format | What you get |
|
|
@@ -65,20 +137,29 @@ For `batch_smart_fetch`, each item in `requests` accepts the same parameters as
|
|
|
65
137
|
| `text` | Plain text with markdown stripped |
|
|
66
138
|
| `json` | Structured JSON for metadata-heavy workflows |
|
|
67
139
|
|
|
68
|
-
## Plugin
|
|
140
|
+
## Plugin config
|
|
69
141
|
|
|
70
|
-
See `openclaw.plugin.json` for the schema.
|
|
142
|
+
See `openclaw.plugin.json` for the full schema. Configure under
|
|
143
|
+
`plugins.entries.smart-fetch.config`:
|
|
71
144
|
|
|
72
|
-
```
|
|
145
|
+
```json5
|
|
73
146
|
{
|
|
74
|
-
"
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
147
|
+
"plugins": {
|
|
148
|
+
"entries": {
|
|
149
|
+
"smart-fetch": {
|
|
150
|
+
"enabled": true,
|
|
151
|
+
"config": {
|
|
152
|
+
"maxChars": 50000,
|
|
153
|
+
"timeoutMs": 15000,
|
|
154
|
+
"browser": "chrome_145",
|
|
155
|
+
"os": "windows",
|
|
156
|
+
"removeImages": false,
|
|
157
|
+
"includeReplies": "extractors",
|
|
158
|
+
"batchConcurrency": 8
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
}
|
|
82
163
|
}
|
|
83
164
|
```
|
|
84
165
|
|
|
@@ -95,4 +176,6 @@ See `openclaw.plugin.json` for the schema. The effective defaults are:
|
|
|
95
176
|
|
|
96
177
|
## Dev and publishing note
|
|
97
178
|
|
|
98
|
-
This repo uses Bun for local development, tests, and workspace scripts. Package
|
|
179
|
+
This repo uses Bun for local development, tests, and workspace scripts. Package
|
|
180
|
+
publishing still goes through `npm publish` in CI so npm Trusted Publishing can
|
|
181
|
+
be used.
|
package/dist/index.d.ts
CHANGED
|
@@ -21,6 +21,35 @@ interface FetchToolDefaults {
|
|
|
21
21
|
}
|
|
22
22
|
|
|
23
23
|
type PluginConfig = FetchToolConfig;
|
|
24
|
+
/**
|
|
25
|
+
* WebFetch provider plugin shape — subset of OpenClaw's WebFetchProviderPlugin
|
|
26
|
+
* that we need for registration. Defined locally to avoid importing from the
|
|
27
|
+
* openclaw plugin SDK (which may not be installed in all environments).
|
|
28
|
+
*/
|
|
29
|
+
interface WebFetchProvider {
|
|
30
|
+
id: string;
|
|
31
|
+
label: string;
|
|
32
|
+
hint: string;
|
|
33
|
+
requiresCredential?: boolean;
|
|
34
|
+
envVars: string[];
|
|
35
|
+
placeholder: string;
|
|
36
|
+
signupUrl: string;
|
|
37
|
+
docsUrl?: string;
|
|
38
|
+
autoDetectOrder?: number;
|
|
39
|
+
credentialPath: string;
|
|
40
|
+
getCredentialValue: (fetchConfig?: Record<string, unknown>) => unknown;
|
|
41
|
+
setCredentialValue: (fetchConfigTarget: Record<string, unknown>, value: unknown) => void;
|
|
42
|
+
getConfiguredCredentialValue?: (config?: Record<string, unknown>) => unknown;
|
|
43
|
+
setConfiguredCredentialValue?: (configTarget: Record<string, unknown>, value: unknown) => void;
|
|
44
|
+
applySelectionConfig?: (config: Record<string, unknown>) => Record<string, unknown>;
|
|
45
|
+
createTool: (ctx: {
|
|
46
|
+
config?: Record<string, unknown>;
|
|
47
|
+
}) => {
|
|
48
|
+
description: string;
|
|
49
|
+
parameters: Record<string, unknown>;
|
|
50
|
+
execute: (args: Record<string, unknown>) => Promise<Record<string, unknown>>;
|
|
51
|
+
} | null;
|
|
52
|
+
}
|
|
24
53
|
interface ToolRegistrationApi {
|
|
25
54
|
pluginConfig?: PluginConfig;
|
|
26
55
|
registerTool(definition: {
|
|
@@ -35,12 +64,15 @@ interface ToolRegistrationApi {
|
|
|
35
64
|
isError?: boolean;
|
|
36
65
|
}>;
|
|
37
66
|
}): void;
|
|
67
|
+
/** Register a WebFetch provider for the built-in web_fetch fallback pipeline. */
|
|
68
|
+
registerWebFetchProvider?: (provider: WebFetchProvider) => void;
|
|
38
69
|
logger: {
|
|
39
70
|
info(message: string): void;
|
|
40
71
|
};
|
|
41
72
|
}
|
|
42
73
|
|
|
43
74
|
declare const resolvePluginDefaults: (pluginConfig?: PluginConfig) => FetchToolDefaults;
|
|
75
|
+
|
|
44
76
|
declare const plugin: {
|
|
45
77
|
id: string;
|
|
46
78
|
name: string;
|
package/dist/index.js
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
import { tmpdir } from 'os';
|
|
2
|
-
import { join, parse } from 'path';
|
|
3
1
|
import { Type } from '@sinclair/typebox';
|
|
4
2
|
import { randomUUID } from 'crypto';
|
|
5
3
|
import { once } from 'events';
|
|
6
4
|
import { createWriteStream } from 'fs';
|
|
7
5
|
import { mkdir, chmod, writeFile, unlink } from 'fs/promises';
|
|
6
|
+
import { tmpdir } from 'os';
|
|
7
|
+
import { join, parse } from 'path';
|
|
8
8
|
import { pipeline } from 'stream/promises';
|
|
9
9
|
import { Defuddle } from 'defuddle/node';
|
|
10
10
|
import { getProfiles, fetch } from 'wreq-js';
|
|
@@ -10563,12 +10563,80 @@ async function executeBatchFetchToolCall(params, defaults, options = {}) {
|
|
|
10563
10563
|
batchConcurrency
|
|
10564
10564
|
};
|
|
10565
10565
|
}
|
|
10566
|
-
|
|
10567
|
-
// src/index.ts
|
|
10568
10566
|
var resolvePluginDefaults = (pluginConfig = {}) => resolveFetchToolDefaults({
|
|
10569
10567
|
tempDir: join(tmpdir(), "smart-fetch-openclaw"),
|
|
10570
10568
|
...pluginConfig
|
|
10571
10569
|
});
|
|
10570
|
+
function createSmartFetchWebFetchProvider() {
|
|
10571
|
+
return {
|
|
10572
|
+
id: "smart-fetch",
|
|
10573
|
+
label: "Smart Fetch",
|
|
10574
|
+
hint: "TLS-fingerprinted fetch with Defuddle extraction. Fallback for bot-protected sites.",
|
|
10575
|
+
requiresCredential: false,
|
|
10576
|
+
envVars: [],
|
|
10577
|
+
placeholder: "No API key required",
|
|
10578
|
+
signupUrl: "https://github.com/Thinkscape/agent-smart-fetch",
|
|
10579
|
+
docsUrl: "https://github.com/Thinkscape/agent-smart-fetch#readme",
|
|
10580
|
+
autoDetectOrder: 10,
|
|
10581
|
+
// Lower = higher priority than firecrawl (50)
|
|
10582
|
+
credentialPath: "",
|
|
10583
|
+
getCredentialValue: () => void 0,
|
|
10584
|
+
setCredentialValue: () => {
|
|
10585
|
+
},
|
|
10586
|
+
getConfiguredCredentialValue: () => void 0,
|
|
10587
|
+
setConfiguredCredentialValue: () => {
|
|
10588
|
+
},
|
|
10589
|
+
applySelectionConfig: (config) => config,
|
|
10590
|
+
createTool: (ctx) => ({
|
|
10591
|
+
description: "Fetch a URL using Smart Fetch (TLS fingerprinting + Defuddle extraction).",
|
|
10592
|
+
parameters: {},
|
|
10593
|
+
execute: async (args) => {
|
|
10594
|
+
const url = typeof args.url === "string" ? args.url : "";
|
|
10595
|
+
const extractMode = args.extractMode === "text" ? "text" : "markdown";
|
|
10596
|
+
const maxChars = typeof args.maxChars === "number" && Number.isFinite(args.maxChars) ? Math.floor(args.maxChars) : 5e4;
|
|
10597
|
+
if (!url) {
|
|
10598
|
+
return { text: "" };
|
|
10599
|
+
}
|
|
10600
|
+
const pluginConfig = extractPluginConfig(ctx.config);
|
|
10601
|
+
const defaults = resolvePluginDefaults(pluginConfig);
|
|
10602
|
+
try {
|
|
10603
|
+
const result = await executeFetchToolCall(
|
|
10604
|
+
{
|
|
10605
|
+
url,
|
|
10606
|
+
extractMode,
|
|
10607
|
+
maxChars
|
|
10608
|
+
},
|
|
10609
|
+
defaults
|
|
10610
|
+
);
|
|
10611
|
+
if (isError(result)) {
|
|
10612
|
+
return { text: "" };
|
|
10613
|
+
}
|
|
10614
|
+
return {
|
|
10615
|
+
text: buildFetchResponseText(result, { verbose: true }),
|
|
10616
|
+
title: result.title || void 0,
|
|
10617
|
+
finalUrl: result.finalUrl || url,
|
|
10618
|
+
extractor: "smart-fetch"
|
|
10619
|
+
};
|
|
10620
|
+
} catch {
|
|
10621
|
+
return { text: "" };
|
|
10622
|
+
}
|
|
10623
|
+
}
|
|
10624
|
+
})
|
|
10625
|
+
};
|
|
10626
|
+
}
|
|
10627
|
+
function extractPluginConfig(config) {
|
|
10628
|
+
if (!config) return void 0;
|
|
10629
|
+
const plugins = config.plugins;
|
|
10630
|
+
if (!plugins || typeof plugins !== "object") return void 0;
|
|
10631
|
+
const entries = plugins.entries;
|
|
10632
|
+
if (!entries || typeof entries !== "object") return void 0;
|
|
10633
|
+
const smartFetch = entries["smart-fetch"];
|
|
10634
|
+
if (!smartFetch || typeof smartFetch !== "object") return void 0;
|
|
10635
|
+
const pluginConfig = smartFetch.config;
|
|
10636
|
+
return pluginConfig;
|
|
10637
|
+
}
|
|
10638
|
+
|
|
10639
|
+
// src/index.ts
|
|
10572
10640
|
function renderToolResponse(result) {
|
|
10573
10641
|
return {
|
|
10574
10642
|
content: [
|
|
@@ -10584,6 +10652,9 @@ var plugin = {
|
|
|
10584
10652
|
name: "Smart Fetch",
|
|
10585
10653
|
description: "Clean web content extraction with TLS fingerprinting. Uses wreq-js (Rust native bindings) for browser-grade TLS and Defuddle for extraction.",
|
|
10586
10654
|
register(api) {
|
|
10655
|
+
if (api.registerWebFetchProvider) {
|
|
10656
|
+
api.registerWebFetchProvider(createSmartFetchWebFetchProvider());
|
|
10657
|
+
}
|
|
10587
10658
|
const defaults = resolvePluginDefaults(api.pluginConfig);
|
|
10588
10659
|
api.registerTool({
|
|
10589
10660
|
name: "smart_fetch",
|