smart-web-mcp 0.11.6 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/LICENSE +68 -21
- package/README.md +275 -78
- package/dist/assessment.d.ts +9 -1
- package/dist/assessment.js +38 -0
- package/dist/assessment.js.map +1 -1
- package/dist/composition.d.ts +5 -0
- package/dist/composition.js +1 -0
- package/dist/composition.js.map +1 -1
- package/dist/shared.d.ts +5 -0
- package/dist/shared.js.map +1 -1
- package/dist/smartcrawl.js +9 -4
- package/dist/smartcrawl.js.map +1 -1
- package/dist/smartfetch/pipeline.js +7 -3
- package/dist/smartfetch/pipeline.js.map +1 -1
- package/dist/smartfetch/providers/article.js +13 -7
- package/dist/smartfetch/providers/article.js.map +1 -1
- package/dist/smartfetch.js +3 -2
- package/dist/smartfetch.js.map +1 -1
- package/package.json +3 -3
package/CHANGELOG.md
CHANGED
|
@@ -68,6 +68,20 @@ The format is based on Keep a Changelog[^1] and this project uses SemVer.
|
|
|
68
68
|
- `smart-web-dev` issue reporting now points at `jojo-labs/smart-web`, and the dev hot-reload runtime writes explicit ESM snapshot metadata before loading rebuilt `dist/*.js` modules
|
|
69
69
|
- `initSettings` now writes `settings.json` through a temp file + rename so crashes during first-run or `--force` initialization cannot leave the runtime config truncated
|
|
70
70
|
|
|
71
|
+
## [0.12.0](https://github.com/jojo-labs/smart-web/compare/v0.11.7...v0.12.0) (2026-04-27)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
### Features
|
|
75
|
+
|
|
76
|
+
* add CAPTCHA type hints to handoff metadata ([#142](https://github.com/jojo-labs/smart-web/issues/142)) ([f7eeec1](https://github.com/jojo-labs/smart-web/commit/f7eeec107d8c9950662e29fafdb8cea6bb016485))
|
|
77
|
+
|
|
78
|
+
## [0.11.7](https://github.com/jojo-labs/smart-web/compare/v0.11.6...v0.11.7) (2026-04-26)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
### Bug Fixes
|
|
82
|
+
|
|
83
|
+
* allow external links on root index pages, reject timestamp/points noise in thread ([#138](https://github.com/jojo-labs/smart-web/issues/138)) ([3e1c00e](https://github.com/jojo-labs/smart-web/commit/3e1c00e5db2d912cf61eb9f914efeac905d581ca))
|
|
84
|
+
|
|
71
85
|
## [0.11.6](https://github.com/jojo-labs/smart-web/compare/v0.11.5...v0.11.6) (2026-04-26)
|
|
72
86
|
|
|
73
87
|
|
package/LICENSE
CHANGED
|
@@ -1,21 +1,68 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
Copyright (c) 2026
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
1
|
+
smart-web MCP Server — Proprietary License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 jojo-labs. All rights reserved.
|
|
4
|
+
|
|
5
|
+
1. Grant of License
|
|
6
|
+
|
|
7
|
+
Subject to the terms and conditions of this License, jojo-labs hereby
|
|
8
|
+
grants you a non-exclusive, non-transferable, limited license to:
|
|
9
|
+
|
|
10
|
+
a) Install and run the smart-web-mcp npm package as a local MCP server;
|
|
11
|
+
b) Use the smartfetch, smartsearch, and smartcrawl tools provided by the
|
|
12
|
+
server through an MCP host.
|
|
13
|
+
|
|
14
|
+
This license does not grant any right to modify, reverse-engineer,
|
|
15
|
+
decompile, disassemble, sublicense, redistribute, or create derivative
|
|
16
|
+
works from the software.
|
|
17
|
+
|
|
18
|
+
2. Restrictions
|
|
19
|
+
|
|
20
|
+
You may not:
|
|
21
|
+
|
|
22
|
+
a) Distribute, sublicense, or make available the software or any portion
|
|
23
|
+
thereof to any third party, except as an integral part of a runtime
|
|
24
|
+
MCP host configuration that invokes the unmodified software via npx;
|
|
25
|
+
b) Use the software in a competing product or service that provides web
|
|
26
|
+
retrieval, search, or crawl capabilities as a primary feature;
|
|
27
|
+
c) Remove, alter, or obscure any proprietary notices on the software;
|
|
28
|
+
d) Use the software in any way that violates applicable law or regulation.
|
|
29
|
+
|
|
30
|
+
3. Intellectual Property
|
|
31
|
+
|
|
32
|
+
The software and all associated content, including but not limited to
|
|
33
|
+
source code, binaries, documentation, and configuration templates, are
|
|
34
|
+
the proprietary property of jojo-labs and are protected by copyright and
|
|
35
|
+
other intellectual property laws. No title to or ownership of the
|
|
36
|
+
software is transferred to you under this License.
|
|
37
|
+
|
|
38
|
+
4. Disclaimer of Warranties
|
|
39
|
+
|
|
40
|
+
THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
41
|
+
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
42
|
+
FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. IN NO EVENT SHALL
|
|
43
|
+
JOJO-LABS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER
|
|
44
|
+
IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR
|
|
45
|
+
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
46
|
+
SOFTWARE.
|
|
47
|
+
|
|
48
|
+
5. Limitation of Liability
|
|
49
|
+
|
|
50
|
+
IN NO EVENT SHALL JOJO-LABS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
51
|
+
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING BUT
|
|
52
|
+
NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
53
|
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
54
|
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
55
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
56
|
+
THE SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
57
|
+
|
|
58
|
+
6. Termination
|
|
59
|
+
|
|
60
|
+
This License and the rights granted hereunder will terminate automatically
|
|
61
|
+
upon any breach by you of the terms of this License. Upon termination, you
|
|
62
|
+
shall destroy all copies of the software in your possession.
|
|
63
|
+
|
|
64
|
+
7. Governing Law
|
|
65
|
+
|
|
66
|
+
This License shall be governed by and construed in accordance with the
|
|
67
|
+
laws of the Republic of Korea, without regard to its conflict of law
|
|
68
|
+
provisions.
|
package/README.md
CHANGED
|
@@ -14,31 +14,29 @@ It is designed for **agents and MCP hosts**, not for manual browser automation.
|
|
|
14
14
|
|
|
15
15
|
If a page is login-gated, keep the login UI, session persistence, and capture flow in a companion runtime instead of moving that stateful behavior into the `smart-web` core.
|
|
16
16
|
|
|
17
|
-
- npm:
|
|
17
|
+
- npm: <https://www.npmjs.com/package/smart-web-mcp>
|
|
18
18
|
- MCP Registry: `io.github.jojo-labs/smart-web`
|
|
19
|
-
-
|
|
19
|
+
- Issues: <https://github.com/jojo-labs/smart-web/issues>
|
|
20
20
|
|
|
21
21
|
If `smart-web` returns reproducible incorrect or misleading output, open a GitHub issue with the exact input, tool arguments, observed output, expected behavior, and version. Skip obvious transient network, auth, or rate-limit failures unless the smart-web classification itself looks wrong.
|
|
22
22
|
|
|
23
23
|
## Highlights
|
|
24
24
|
|
|
25
25
|
- one local MCP instead of separate search, fetch, and crawl servers
|
|
26
|
-
- explicit routing: URL
|
|
27
|
-
- up-front smartfetch acquisition lanes instead of broad direct
|
|
28
|
-
- optional Jina Reader relay for weak generic/article fetches when you explicitly enable it,
|
|
29
|
-
- weak generic pages can
|
|
30
|
-
- paywalled article pages
|
|
31
|
-
- LinkedIn authwall handling
|
|
26
|
+
- explicit routing: URL → `smartfetch`, query → `smartsearch`, known site → `smartcrawl`
|
|
27
|
+
- up-front smartfetch acquisition lanes instead of broad direct → browser retry chains
|
|
28
|
+
- optional Jina Reader relay for weak generic/article fetches when you explicitly enable it, with JSON mode and alternate-link preservation
|
|
29
|
+
- weak generic pages can recover through JSON-LD or Next.js payload extraction and same-origin RSS/Atom feed discovery before falling back to a handoff
|
|
30
|
+
- paywalled article pages prefer lawful archive recovery and search-indexed reference previews before telling the host to escalate elsewhere
|
|
31
|
+
- LinkedIn authwall handling prefers compliant public fallbacks such as archives and search-indexed reference metadata before giving up
|
|
32
32
|
- legal paper fallback for academic URLs: DOI-aware OpenAlex, Unpaywall, Semantic Scholar, CORE discovery, and Europe PMC enrichment plus bioRxiv/medRxiv API fallback when a direct paper page is thin or blocked
|
|
33
|
-
- site-native public search fallbacks
|
|
33
|
+
- site-native public search fallbacks cover Reddit, GitHub repository discovery, npm, Hacker News, Stack Exchange, and Velog when a matching `site:` query is available
|
|
34
34
|
- structured output with `assessment` and `pipeline` fields for host-side decision making
|
|
35
35
|
- local-first defaults with privacy-aware search/fetch behavior
|
|
36
36
|
- useful normalization for common public surfaces instead of raw HTML dumps
|
|
37
37
|
|
|
38
38
|
## Tool routing
|
|
39
39
|
|
|
40
|
-
Use the tools like this:
|
|
41
|
-
|
|
42
40
|
- `smartfetch` first when the user already gave a URL or shortlink
|
|
43
41
|
- `smartsearch` when the user gave a topic, keywords, or a `site:` query
|
|
44
42
|
- `smartcrawl` when you already know the site and need multiple relevant pages
|
|
@@ -53,9 +51,7 @@ Use the tools like this:
|
|
|
53
51
|
|
|
54
52
|
## What the tools return
|
|
55
53
|
|
|
56
|
-
All three tools are shaped for agent consumption.
|
|
57
|
-
|
|
58
|
-
Common high-signal fields include:
|
|
54
|
+
All three tools are shaped for agent consumption. Common high-signal fields include:
|
|
59
55
|
|
|
60
56
|
- `assessment`: confidence, block/auth hints, recommended handoff
|
|
61
57
|
- `pipeline`: resolve/acquire/normalize/assess stages
|
|
@@ -85,9 +81,7 @@ When a source stays blocked or under-specified, `smart-web` prefers partial but
|
|
|
85
81
|
npx -y smart-web-mcp
|
|
86
82
|
```
|
|
87
83
|
|
|
88
|
-
`
|
|
89
|
-
|
|
90
|
-
When `smartfetch` or `smartcrawl` needs Playwright-backed page loading, `smart-web` now checks whether the bundled Chromium revision is available. If the local Playwright browser cache is missing or outdated, `smart-web` warns at startup and will try `npx playwright install chromium` automatically on first browser use before surfacing a structured error.
|
|
84
|
+
When `smartfetch` or `smartcrawl` needs Playwright-backed page loading, `smart-web` checks whether the bundled Chromium revision is available. If the local Playwright browser cache is missing or outdated, `smart-web` warns at startup and will try `npx playwright install chromium` automatically on first browser use before surfacing a structured error.
|
|
91
85
|
|
|
92
86
|
## Host setup
|
|
93
87
|
|
|
@@ -97,8 +91,6 @@ When `smartfetch` or `smartcrawl` needs Playwright-backed page loading, `smart-w
|
|
|
97
91
|
claude mcp add smart-web -- npx -y smart-web-mcp
|
|
98
92
|
```
|
|
99
93
|
|
|
100
|
-
For a checked-out local repo, use [examples/claude.mcp.json](examples/claude.mcp.json).
|
|
101
|
-
|
|
102
94
|
### Codex
|
|
103
95
|
|
|
104
96
|
```bash
|
|
@@ -128,14 +120,40 @@ args = ["-y", "smart-web-mcp"]
|
|
|
128
120
|
}
|
|
129
121
|
```
|
|
130
122
|
|
|
131
|
-
|
|
123
|
+
### Custom settings file
|
|
124
|
+
|
|
125
|
+
To use a non-default settings path, append `--settings-file` to the command:
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
npx -y smart-web-mcp --settings-file /absolute/path/to/smart-web.settings.json
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
For Claude Code:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
claude mcp add smart-web -- npx -y smart-web-mcp --settings-file /absolute/path/to/smart-web.settings.json
|
|
135
|
+
```
|
|
132
136
|
|
|
133
|
-
|
|
137
|
+
For OpenCode, append the flag and its path to the `"command"` array:
|
|
134
138
|
|
|
135
|
-
```
|
|
136
|
-
|
|
139
|
+
```json
|
|
140
|
+
{
|
|
141
|
+
"mcp": {
|
|
142
|
+
"smart-web": {
|
|
143
|
+
"type": "local",
|
|
144
|
+
"command": ["npx", "-y", "smart-web-mcp", "--settings-file", "/absolute/path/to/smart-web.settings.json"],
|
|
145
|
+
"enabled": true
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
}
|
|
137
149
|
```
|
|
138
150
|
|
|
151
|
+
## Configuration
|
|
152
|
+
|
|
153
|
+
### Settings path
|
|
154
|
+
|
|
155
|
+
Default: `~/.config/smart-web/settings.json`
|
|
156
|
+
|
|
139
157
|
Print a template:
|
|
140
158
|
|
|
141
159
|
```bash
|
|
@@ -148,90 +166,269 @@ Initialize the default file:
|
|
|
148
166
|
npx -y smart-web-mcp --init-settings
|
|
149
167
|
```
|
|
150
168
|
|
|
151
|
-
|
|
169
|
+
`smart-web` treats the settings file as the single runtime config surface.
|
|
152
170
|
|
|
153
|
-
|
|
154
|
-
|
|
171
|
+
### Full settings reference
|
|
172
|
+
|
|
173
|
+
```jsonc
|
|
174
|
+
{
|
|
175
|
+
// "balanced" (default) or "private"
|
|
176
|
+
// "private" disables relay-style providers and public search helpers
|
|
177
|
+
"profile": "balanced",
|
|
178
|
+
|
|
179
|
+
"runtime": {
|
|
180
|
+
// Optional override for staging/export temp files
|
|
181
|
+
// Default: platform cache root (e.g. ~/.cache/smart-web/tmp)
|
|
182
|
+
"tempDir": ""
|
|
183
|
+
},
|
|
184
|
+
|
|
185
|
+
"search": {
|
|
186
|
+
// API keys — leave empty to skip that provider
|
|
187
|
+
"exaApiKey": "",
|
|
188
|
+
"braveSearchApiKey": "",
|
|
189
|
+
// Self-hosted SearXNG instance URL
|
|
190
|
+
"searxngBaseUrl": "",
|
|
191
|
+
"enableSearxng": true
|
|
192
|
+
},
|
|
193
|
+
|
|
194
|
+
"fetch": {
|
|
195
|
+
// Browser overrides — most users leave these empty
|
|
196
|
+
"chromeChannel": "",
|
|
197
|
+
"chromePath": "",
|
|
198
|
+
// Auto-install Playwright Chromium when missing (default: true)
|
|
199
|
+
"autoInstallPlaywright": true,
|
|
200
|
+
// Jina Reader relay — opt-in third-party path for weak article pages
|
|
201
|
+
"enableJinaReader": false,
|
|
202
|
+
"jinaReaderBaseUrl": "https://r.jina.ai/",
|
|
203
|
+
// Academic fallback — legal OA enrichment for paper URLs
|
|
204
|
+
"enableAcademicFallback": true,
|
|
205
|
+
"enableOpenAlex": true,
|
|
206
|
+
"enableEuropePmc": true,
|
|
207
|
+
"enableBiorxivApi": true,
|
|
208
|
+
// Unpaywall — requires a contact email
|
|
209
|
+
"enableUnpaywall": true,
|
|
210
|
+
"unpaywallEmail": "",
|
|
211
|
+
// Semantic Scholar — optional API key for higher-rate access
|
|
212
|
+
"enableSemanticScholar": true,
|
|
213
|
+
"semanticScholarApiKey": "",
|
|
214
|
+
// CORE — optional API key for richer search-backed enrichment
|
|
215
|
+
"enableCoreDiscovery": true,
|
|
216
|
+
"coreApiKey": "",
|
|
217
|
+
// FxTwitter — transparent x.com → fxtwitter redirect
|
|
218
|
+
"enableFxTwitter": true,
|
|
219
|
+
// Undetected-chromedriver — optional Python Selenium fallback for tough anti-bot pages
|
|
220
|
+
"enableUndetectedChromedriver": true,
|
|
221
|
+
"undetectedChromedriverPython": "python3",
|
|
222
|
+
// Site-specific fetches
|
|
223
|
+
"enableRedditJson": true,
|
|
224
|
+
"enableYoutubeTranscript": true,
|
|
225
|
+
// Archive fallback — Wayback and archive.md recovery
|
|
226
|
+
"enableArchiveFallback": true,
|
|
227
|
+
"enableWayback": true,
|
|
228
|
+
"enableArchiveMd": true
|
|
229
|
+
},
|
|
230
|
+
|
|
231
|
+
"network": {
|
|
232
|
+
// Allow localhost/private/reserved fetch targets (default: false)
|
|
233
|
+
"allowPrivateHosts": false
|
|
234
|
+
}
|
|
235
|
+
}
|
|
155
236
|
```
|
|
156
237
|
|
|
157
|
-
|
|
238
|
+
### Common setups
|
|
158
239
|
|
|
159
|
-
|
|
240
|
+
#### Default — no config file needed
|
|
160
241
|
|
|
161
|
-
|
|
162
|
-
- `profile: "private"`
|
|
163
|
-
- `search.searxngBaseUrl`
|
|
164
|
-
- `search.exaApiKey`
|
|
165
|
-
- `search.braveSearchApiKey`
|
|
166
|
-
- `fetch.enableJinaReader`
|
|
167
|
-
- `fetch.unpaywallEmail`
|
|
242
|
+
Out of the box `smart-web` works with sensible defaults. Create a settings file only when you need to change something.
|
|
168
243
|
|
|
169
|
-
|
|
244
|
+
#### Minimal: profile only
|
|
170
245
|
|
|
171
|
-
|
|
246
|
+
```json
|
|
247
|
+
{
|
|
248
|
+
"profile": "balanced"
|
|
249
|
+
}
|
|
250
|
+
```
|
|
172
251
|
|
|
173
|
-
|
|
252
|
+
#### Private / local-first
|
|
174
253
|
|
|
175
|
-
|
|
254
|
+
```json
|
|
255
|
+
{
|
|
256
|
+
"profile": "private",
|
|
257
|
+
"search": {
|
|
258
|
+
"searxngBaseUrl": "http://localhost:8080"
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
```
|
|
176
262
|
|
|
177
|
-
|
|
178
|
-
- `smartfetch` on a Medium or other member-only article URL and confirm it returns either archive-backed content or an honest `reference_only` preview
|
|
179
|
-
- `smartfetch` on a Naver Map, Kakao Map, or `naver.me` place-share URL
|
|
180
|
-
- `smartfetch` on an arXiv abstract URL
|
|
181
|
-
- `smartfetch` on a PubMed, PMC, or bioRxiv/medRxiv paper URL
|
|
182
|
-
- `smartfetch` on a known product URL
|
|
183
|
-
- `smartsearch` on a `site:` query
|
|
184
|
-
- `smartcrawl` on a docs site or board you actually use
|
|
263
|
+
In `private` mode, relay-style provider requests are blocked, Exa and Brave default off unless you explicitly re-enable them, public no-key search fallbacks are disabled, and SearXNG bases must resolve to localhost or private addresses.
|
|
185
264
|
|
|
186
|
-
|
|
265
|
+
#### Private with Exa allowed back in
|
|
187
266
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
267
|
+
```json
|
|
268
|
+
{
|
|
269
|
+
"profile": "private",
|
|
270
|
+
"search": {
|
|
271
|
+
"exaApiKey": "...",
|
|
272
|
+
"enableExa": true
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
```
|
|
192
276
|
|
|
193
|
-
|
|
277
|
+
#### With Exa and Brave API keys
|
|
194
278
|
|
|
195
|
-
|
|
279
|
+
```json
|
|
280
|
+
{
|
|
281
|
+
"profile": "balanced",
|
|
282
|
+
"search": {
|
|
283
|
+
"exaApiKey": "exa-...",
|
|
284
|
+
"braveSearchApiKey": "BSA-..."
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
```
|
|
196
288
|
|
|
197
|
-
|
|
198
|
-
- [Configuration](docs/configuration.md)
|
|
199
|
-
- [Crawling](docs/crawling.md)
|
|
200
|
-
- [Docs export](docs/docscrawl.md)
|
|
201
|
-
- [Retrieval lanes](docs/retrieval-lanes.md)
|
|
202
|
-
- [Providers](docs/providers.md)
|
|
203
|
-
- [Architecture](docs/architecture.md)
|
|
204
|
-
- [Long-term design](docs/long-term-design.md)
|
|
205
|
-
- [Contributing](CONTRIBUTING.md)
|
|
206
|
-
- [CHANGELOG](CHANGELOG.md)
|
|
289
|
+
#### Disable Playwright auto-install
|
|
207
290
|
|
|
208
|
-
|
|
291
|
+
```json
|
|
292
|
+
{
|
|
293
|
+
"fetch": {
|
|
294
|
+
"autoInstallPlaywright": false
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
```
|
|
209
298
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
299
|
+
The server will return an actionable `playwright_browsers_missing` or `playwright_browsers_outdated` error instead.
|
|
300
|
+
|
|
301
|
+
#### Opt in to Jina Reader for weak generic article pages
|
|
302
|
+
|
|
303
|
+
```json
|
|
304
|
+
{
|
|
305
|
+
"fetch": {
|
|
306
|
+
"enableJinaReader": true
|
|
307
|
+
}
|
|
308
|
+
}
|
|
214
309
|
```
|
|
215
310
|
|
|
216
|
-
|
|
311
|
+
This stays opt-in because it is a third-party relay path. When enabled, it only applies to weak generic/article direct fetches and prefers Jina JSON mode when that improves the page. `profile: "private"` hard-disables relay-style providers.
|
|
217
312
|
|
|
218
|
-
|
|
313
|
+
#### Force archive fallback off
|
|
219
314
|
|
|
220
|
-
```
|
|
221
|
-
|
|
222
|
-
|
|
315
|
+
```json
|
|
316
|
+
{
|
|
317
|
+
"fetch": {
|
|
318
|
+
"enableArchiveFallback": false
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
#### Legal OA DOI recovery for paywalled paper pages
|
|
324
|
+
|
|
325
|
+
```json
|
|
326
|
+
{
|
|
327
|
+
"fetch": {
|
|
328
|
+
"enableUnpaywall": true,
|
|
329
|
+
"unpaywallEmail": "research@example.com",
|
|
330
|
+
"enableSemanticScholar": true,
|
|
331
|
+
"enableCoreDiscovery": true
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
#### Enable undetected-chromedriver from a dedicated virtualenv
|
|
337
|
+
|
|
338
|
+
```json
|
|
339
|
+
{
|
|
340
|
+
"fetch": {
|
|
341
|
+
"enableUndetectedChromedriver": true,
|
|
342
|
+
"undetectedChromedriverPython": "/absolute/path/to/venv/bin/python"
|
|
343
|
+
}
|
|
344
|
+
}
|
|
223
345
|
```
|
|
224
346
|
|
|
225
|
-
`smart-web
|
|
347
|
+
This helper is optional. `smart-web` still works without it, but when installed and reachable it can act as a second browser engine for stubborn pages where Playwright alone is not enough.
|
|
226
348
|
|
|
227
|
-
|
|
349
|
+
#### Custom temp directory
|
|
350
|
+
|
|
351
|
+
```json
|
|
352
|
+
{
|
|
353
|
+
"runtime": {
|
|
354
|
+
"tempDir": "/absolute/path/to/smart-web-tmp"
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
### Advanced overrides
|
|
360
|
+
|
|
361
|
+
Set these keys in `settings.json` when you need to force an individual provider on or off, or to override its endpoint, credentials, or interpreter path:
|
|
362
|
+
|
|
363
|
+
**Search**
|
|
364
|
+
|
|
365
|
+
- `search.enableExa`
|
|
366
|
+
- `search.enableBrave`
|
|
367
|
+
- `search.enableSearxng`
|
|
368
|
+
- `search.enableDuckDuckGo`
|
|
369
|
+
- `search.enableBraveHtml`
|
|
370
|
+
|
|
371
|
+
**Fetch**
|
|
372
|
+
|
|
373
|
+
- `fetch.enableFxTwitter`
|
|
374
|
+
- `fetch.enableXOembed`
|
|
375
|
+
- `fetch.autoInstallPlaywright`
|
|
376
|
+
- `fetch.enableJinaReader`
|
|
377
|
+
- `fetch.jinaReaderBaseUrl`
|
|
378
|
+
- `fetch.enableAcademicFallback`
|
|
379
|
+
- `fetch.enableOpenAlex`
|
|
380
|
+
- `fetch.enableEuropePmc`
|
|
381
|
+
- `fetch.enableBiorxivApi`
|
|
382
|
+
- `fetch.enableUnpaywall`
|
|
383
|
+
- `fetch.unpaywallEmail`
|
|
384
|
+
- `fetch.enableSemanticScholar`
|
|
385
|
+
- `fetch.semanticScholarApiKey`
|
|
386
|
+
- `fetch.enableCoreDiscovery`
|
|
387
|
+
- `fetch.coreApiKey`
|
|
388
|
+
- `fetch.enableUndetectedChromedriver`
|
|
389
|
+
- `fetch.undetectedChromedriverPython`
|
|
390
|
+
- `fetch.enableRedditJson`
|
|
391
|
+
- `fetch.enableYoutubeTranscript`
|
|
392
|
+
- `fetch.enableArchiveFallback`
|
|
393
|
+
- `fetch.enableWayback`
|
|
394
|
+
- `fetch.enableArchiveMd`
|
|
395
|
+
|
|
396
|
+
**Compatibility**
|
|
397
|
+
|
|
398
|
+
- `network.localOnly`: advanced override that forces local-only behavior regardless of profile
|
|
399
|
+
|
|
400
|
+
## Quick verification
|
|
401
|
+
|
|
402
|
+
Sanity-check the server with a few real calls:
|
|
403
|
+
|
|
404
|
+
- `smartfetch` on a normal article URL
|
|
405
|
+
- `smartfetch` on a Medium or other member-only article URL — confirm it returns either archive-backed content or an honest `reference_only` preview
|
|
406
|
+
- `smartfetch` on a Naver Map, Kakao Map, or `naver.me` place-share URL
|
|
407
|
+
- `smartfetch` on an arXiv abstract URL
|
|
408
|
+
- `smartfetch` on a PubMed, PMC, or bioRxiv/medRxiv paper URL
|
|
409
|
+
- `smartfetch` on a known product URL
|
|
410
|
+
- `smartsearch` on a `site:` query
|
|
411
|
+
- `smartcrawl` on a docs site or board you actually use
|
|
412
|
+
|
|
413
|
+
## Reporting issues
|
|
414
|
+
|
|
415
|
+
If `smart-web` returns reproducible incorrect or misleading output, open a GitHub issue with:
|
|
416
|
+
|
|
417
|
+
- exact input and tool arguments
|
|
418
|
+
- observed output
|
|
419
|
+
- expected behavior
|
|
420
|
+
- `smart-web-mcp` version
|
|
421
|
+
|
|
422
|
+
Skip obvious transient network, auth, or rate-limit failures unless the classification itself looks wrong.
|
|
228
423
|
|
|
229
424
|
## License
|
|
230
425
|
|
|
231
|
-
|
|
426
|
+
All rights reserved. See [LICENSE](LICENSE) for terms.
|
|
427
|
+
|
|
428
|
+
This software is licensed under a proprietary license that permits installation and use as a local MCP server but prohibits modification, redistribution, reverse engineering, or use in competing products.
|
|
232
429
|
|
|
233
430
|
## References
|
|
234
431
|
|
|
235
432
|
[^1]: Model Context Protocol, "Tools" specification — MCP tools are a retrieval and integration surface, not a requirement to emulate full browser-automation flows inside one tool.
|
|
236
433
|
|
|
237
|
-
[^2]: Model Context Protocol Blog, "Tool Annotations as Risk Vocabulary: What Hints Can and Can't Do" (2026-03-16) — structured tool output and accurate hints improve host-side routing and escalation decisions.
|
|
434
|
+
[^2]: Model Context Protocol Blog, "Tool Annotations as Risk Vocabulary: What Hints Can and Can't Do" (2026-03-16) — structured tool output and accurate hints improve host-side routing and escalation decisions.
|
package/dist/assessment.d.ts
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
|
-
import type { RetrievalAssessment, RetrievalConfidence, RetrievalHandoffSuggestion, ToolError } from "./shared.js";
|
|
1
|
+
import type { CaptchaType, RetrievalAssessment, RetrievalConfidence, RetrievalHandoffSuggestion, ToolError } from "./shared.js";
|
|
2
2
|
export declare function createHandoffSuggestion(input: {
|
|
3
3
|
tool: RetrievalHandoffSuggestion["tool"];
|
|
4
4
|
reason: string;
|
|
5
5
|
mode?: RetrievalHandoffSuggestion["mode"];
|
|
6
6
|
url: string;
|
|
7
7
|
goal: string;
|
|
8
|
+
captcha_type?: CaptchaType;
|
|
8
9
|
}): RetrievalHandoffSuggestion;
|
|
9
10
|
export declare function createAssessment(input?: {
|
|
10
11
|
confidence?: RetrievalConfidence;
|
|
@@ -17,3 +18,10 @@ export declare function createAssessment(input?: {
|
|
|
17
18
|
export declare function firstMeaningfulBlockedReason(items: Array<ToolError | string | null | undefined>): string;
|
|
18
19
|
export declare function detectAuthLikeText(value: string): boolean;
|
|
19
20
|
export declare function detectInteractiveLikeText(value: string): boolean;
|
|
21
|
+
/**
|
|
22
|
+
* Detect CAPTCHA widget type from page text/HTML content.
|
|
23
|
+
* smart-web is read-only so it cannot inspect the live DOM — instead it
|
|
24
|
+
* infers the likely CAPTCHA type from known signatures in the fetched HTML
|
|
25
|
+
* (iframe src patterns, class names, data-sitekey attributes).
|
|
26
|
+
*/
|
|
27
|
+
export declare function detectCaptchaTypeFromContent(html: string): CaptchaType | null;
|
package/dist/assessment.js
CHANGED
|
@@ -8,6 +8,7 @@ export function createHandoffSuggestion(input) {
|
|
|
8
8
|
mode: input.mode || "read",
|
|
9
9
|
url: String(input.url || "").trim(),
|
|
10
10
|
goal: String(input.goal || "").trim(),
|
|
11
|
+
...(input.captcha_type ? { captcha_type: input.captcha_type } : {}),
|
|
11
12
|
};
|
|
12
13
|
}
|
|
13
14
|
export function createAssessment(input = {}) {
|
|
@@ -49,4 +50,41 @@ export function detectInteractiveLikeText(value) {
|
|
|
49
50
|
return false;
|
|
50
51
|
return /(enable javascript|checking your browser|just a moment|please wait|open the app|continue in browser|use the search box|select a filter|load more|show more)/i.test(text);
|
|
51
52
|
}
|
|
53
|
+
/**
 * Ordered CAPTCHA signature table. Vendors are checked top to bottom and the
 * first vendor with any matching pattern wins, so the order here is part of
 * the detection contract (Turnstile, then reCAPTCHA, then hCaptcha).
 *
 * Class-attribute patterns accept both double- and single-quoted values.
 * Without the single-quote branch, markup such as `class='h-captcha'` would
 * skip the hCaptcha patterns and be picked up by the generic keyword fallback
 * below (there is a word boundary between "-" and "captcha"), which reports
 * "recaptcha".
 */
const CAPTCHA_SIGNATURES = [
    {
        type: "turnstile",
        patterns: [
            // Cloudflare challenge platform iframe/script URL
            /challenges\.cloudflare\.com\/cdn-cgi\/challenge-platform/i,
            // Widget container class
            /class=(?:"[^"]*cf-turnstile[^"]*"|'[^']*cf-turnstile[^']*')/i,
            // data-sitekey followed by the widget class inside the same tag
            /data-sitekey[^>]*class=(?:"[^"]*cf-turnstile|'[^']*cf-turnstile)/i,
        ],
    },
    {
        type: "recaptcha",
        patterns: [
            // Google anchor iframe (standard and enterprise)
            /recaptcha\/(?:api2|enterprise)\/anchor/i,
            // Widget container class, hyphen or underscore spelling
            /class=(?:"[^"]*g[-_]recaptcha[^"]*"|'[^']*g[-_]recaptcha[^']*')/i,
            // data-sitekey followed by the widget class inside the same tag
            /data-sitekey[^>]*class=(?:"[^"]*g[-_]recaptcha|'[^']*g[-_]recaptcha)/i,
        ],
    },
    {
        type: "hcaptcha",
        patterns: [
            // hCaptcha iframe URL
            /hcaptcha\.com\/captcha/i,
            // Widget container class
            /class=(?:"[^"]*h-captcha[^"]*"|'[^']*h-captcha[^']*')/i,
            // data-sitekey followed by the widget class inside the same tag
            /data-sitekey[^>]*class=(?:"[^"]*h-captcha|'[^']*h-captcha)/i,
        ],
    },
];
/**
 * Detect CAPTCHA widget type from page text/HTML content.
 * smart-web is read-only so it cannot inspect the live DOM — instead it
 * infers the likely CAPTCHA type from known signatures in the fetched HTML
 * (iframe src patterns, class names, data-sitekey attributes).
 *
 * @param html Fetched page text or HTML. Empty or otherwise falsy input
 *   yields `null`.
 * @returns `"turnstile"`, `"recaptcha"`, or `"hcaptcha"` for the first vendor
 *   whose signature matches; `"recaptcha"` when only the standalone word
 *   "captcha" is present; `null` when nothing matches.
 */
export function detectCaptchaTypeFromContent(html) {
    if (!html)
        return null;
    for (const signature of CAPTCHA_SIGNATURES) {
        if (signature.patterns.some((pattern) => pattern.test(html)))
            return signature.type;
    }
    // Fallback: "captcha" keyword without a specific type.
    // NOTE(review): this reports "recaptcha" for any page that merely mentions
    // the word "captcha"; kept as-is for backward compatibility — confirm that
    // consumers of the hint tolerate this guess.
    if (/\bcaptcha\b/i.test(html))
        return "recaptcha";
    return null;
}
|
|
52
90
|
//# sourceMappingURL=assessment.js.map
|
package/dist/assessment.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"assessment.js","sourceRoot":"","sources":["../src/assessment.ts"],"names":[],"mappings":"AAEA,SAAS,gBAAgB,CAAC,OAAiB;IACzC,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;AAC9F,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,
|
|
1
|
+
{"version":3,"file":"assessment.js","sourceRoot":"","sources":["../src/assessment.ts"],"names":[],"mappings":"AAEA,SAAS,gBAAgB,CAAC,OAAiB;IACzC,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;AAC9F,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,KAOvC;IACC,OAAO;QACL,IAAI,EAAE,KAAK,CAAC,IAAI;QAChB,MAAM,EAAE,MAAM,CAAC,KAAK,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QACzC,IAAI,EAAE,KAAK,CAAC,IAAI,IAAI,MAAM;QAC1B,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QACnC,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QACrC,GAAG,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,KAAK,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KACpE,CAAA;AACH,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,QAO7B,EAAE;IACJ,OAAO;QACL,UAAU,EAAE,KAAK,CAAC,UAAU,IAAI,QAAQ;QACxC,cAAc,EAAE,MAAM,CAAC,KAAK,CAAC,cAAc,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QACzD,oBAAoB,EAAE,OAAO,CAAC,KAAK,CAAC,oBAAoB,CAAC;QACzD,uBAAuB,EAAE,OAAO,CAAC,KAAK,CAAC,uBAAuB,CAAC;QAC/D,OAAO,EAAE,gBAAgB,CAAC,KAAK,CAAC,OAAO,IAAI,EAAE,CAAC;QAC9C,mBAAmB,EAAE,KAAK,CAAC,mBAAmB,IAAI,IAAI;KACvD,CAAA;AACH,CAAC;AAED,MAAM,UAAU,4BAA4B,CAAC,KAAmD;IAC9F,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,IAAI;YAAE,SAAQ;QACnB,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC7B,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,CAAA;YACxB,IAAI,kBAAkB,CAAC,IAAI,CAAC,IAAI,yBAAyB,CAAC,IAAI,CAAC,IAAI,6EAA6E,CAAC,IAAI,CAAC,IAAI,CAAC;gBAAE,OAAO,IAAI,CAAA;YACxK,SAAQ;QACV,CAAC;QACD,IAAI,IAAI,CAAC,QAAQ,KAAK,OAAO;YAAE,OAAO,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,IAAI,CAAA;QAC/D,IAAI,yDAAyD,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YAAE,OAAO,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,IAAI,CAAA;IACtI,CAAC;IACD,OAAO,EAAE,CAAA;AACX,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,KAAa;IAC9C,MAAM,IAAI,GAAG,MAAM,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAA;IAC9C,IAAI,CAAC,IAAI;QAAE,OAAO,KAAK,CAAA;IACvB,OAAO,gLAAgL,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AACpM,CAAC;AAED,MAAM,UAAU,yBAAyB,CAAC,KAAa;IACrD,MA
AM,IAAI,GAAG,MAAM,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAA;IAC9C,IAAI,CAAC,IAAI;QAAE,OAAO,KAAK,CAAA;IACvB,OAAO,8JAA8J,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AAClL,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,4BAA4B,CAAC,IAAY;IACvD,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAA;IACtB,kEAAkE;IAClE,IAAI,2DAA2D,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IAC9F,IAAI,iCAAiC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IACpE,IAAI,4CAA4C,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IAC/E,2CAA2C;IAC3C,IAAI,yCAAyC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IAC5E,IAAI,gCAAgC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IACnE,IAAI,gCAAgC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IACnE,IAAI,2CAA2C,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IAC9E,4CAA4C;IAC5C,IAAI,yBAAyB,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,UAAU,CAAA;IAC3D,IAAI,8BAA8B,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,UAAU,CAAA;IAChE,IAAI,yCAAyC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,UAAU,CAAA;IAC3E,sDAAsD;IACtD,IAAI,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IACjD,OAAO,IAAI,CAAA;AACb,CAAC"}
|
package/dist/composition.d.ts
CHANGED
|
@@ -4,6 +4,9 @@ export type WebTaskRequestPlan = {
|
|
|
4
4
|
goal: string;
|
|
5
5
|
startUrl: string;
|
|
6
6
|
mode: "read" | "act";
|
|
7
|
+
/** When the smart-web handoff identified a known CAPTCHA type on the page,
|
|
8
|
+
* this field lets web-task-api prepare the right solving strategy upfront. */
|
|
9
|
+
captchaType?: "turnstile" | "recaptcha" | "hcaptcha";
|
|
7
10
|
};
|
|
8
11
|
export type SmartcrawlRequestPlan = {
|
|
9
12
|
url: string;
|
|
@@ -11,11 +14,13 @@ export type SmartcrawlRequestPlan = {
|
|
|
11
14
|
mode: "auto" | "board" | "docs";
|
|
12
15
|
};
|
|
13
16
|
export declare function planWebTaskFromSmartfetch(output: SmartfetchOutput): {
|
|
17
|
+
captchaType?: import("./shared.js").CaptchaType;
|
|
14
18
|
goal: string;
|
|
15
19
|
startUrl: string;
|
|
16
20
|
mode: "read" | "act";
|
|
17
21
|
} | null;
|
|
18
22
|
export declare function planWebTaskFromSmartcrawl(response: SmartcrawlResponse): {
|
|
23
|
+
captchaType?: import("./shared.js").CaptchaType;
|
|
19
24
|
goal: string;
|
|
20
25
|
startUrl: string;
|
|
21
26
|
mode: "read" | "act";
|