smart-web-mcp 0.11.7 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -68,6 +68,13 @@ The format is based on Keep a Changelog[^1] and this project uses SemVer.
68
68
  - `smart-web-dev` issue reporting now points at `jojo-labs/smart-web`, and the dev hot-reload runtime writes explicit ESM snapshot metadata before loading rebuilt `dist/*.js` modules
69
69
  - `initSettings` now writes `settings.json` through a temp file + rename so crashes during first-run or `--force` initialization cannot leave the runtime config truncated
70
70
 
71
+ ## [0.12.0](https://github.com/jojo-labs/smart-web/compare/v0.11.7...v0.12.0) (2026-04-27)
72
+
73
+
74
+ ### Features
75
+
76
+ * add CAPTCHA type hints to handoff metadata ([#142](https://github.com/jojo-labs/smart-web/issues/142)) ([f7eeec1](https://github.com/jojo-labs/smart-web/commit/f7eeec107d8c9950662e29fafdb8cea6bb016485))
77
+
71
78
  ## [0.11.7](https://github.com/jojo-labs/smart-web/compare/v0.11.6...v0.11.7) (2026-04-26)
72
79
 
73
80
 
package/LICENSE CHANGED
@@ -1,21 +1,68 @@
1
- MIT License
2
-
3
- Copyright (c) 2026 rich-jojo
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
1
+ smart-web MCP Server — Proprietary License
2
+
3
+ Copyright (c) 2026 jojo-labs. All rights reserved.
4
+
5
+ 1. Grant of License
6
+
7
+ Subject to the terms and conditions of this License, jojo-labs hereby
8
+ grants you a non-exclusive, non-transferable, limited license to:
9
+
10
+ a) Install and run the smart-web-mcp npm package as a local MCP server;
11
+ b) Use the smartfetch, smartsearch, and smartcrawl tools provided by the
12
+ server through an MCP host.
13
+
14
+ This license does not grant any right to modify, reverse-engineer,
15
+ decompile, disassemble, sublicense, redistribute, or create derivative
16
+ works from the software.
17
+
18
+ 2. Restrictions
19
+
20
+ You may not:
21
+
22
+ a) Distribute, sublicense, or make available the software or any portion
23
+ thereof to any third party, except as an integral part of a runtime
24
+ MCP host configuration that invokes the unmodified software via npx;
25
+ b) Use the software in a competing product or service that provides web
26
+ retrieval, search, or crawl capabilities as a primary feature;
27
+ c) Remove, alter, or obscure any proprietary notices on the software;
28
+ d) Use the software in any way that violates applicable law or regulation.
29
+
30
+ 3. Intellectual Property
31
+
32
+ The software and all associated content, including but not limited to
33
+ source code, binaries, documentation, and configuration templates, are
34
+ the proprietary property of jojo-labs and are protected by copyright and
35
+ other intellectual property laws. No title to or ownership of the
36
+ software is transferred to you under this License.
37
+
38
+ 4. Disclaimer of Warranties
39
+
40
+ THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, EXPRESS
41
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42
+ FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. IN NO EVENT SHALL
43
+ JOJO-LABS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER
44
+ IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR
45
+ IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
46
+ SOFTWARE.
47
+
48
+ 5. Limitation of Liability
49
+
50
+ IN NO EVENT SHALL JOJO-LABS BE LIABLE FOR ANY DIRECT, INDIRECT,
51
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING BUT
52
+ NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
53
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
54
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
55
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
56
+ THE SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
57
+
58
+ 6. Termination
59
+
60
+ This License and the rights granted hereunder will terminate automatically
61
+ upon any breach by you of the terms of this License. Upon termination, you
62
+ shall destroy all copies of the software in your possession.
63
+
64
+ 7. Governing Law
65
+
66
+ This License shall be governed by and construed in accordance with the
67
+ laws of the Republic of Korea, without regard to its conflict of law
68
+ provisions.
package/README.md CHANGED
@@ -14,31 +14,29 @@ It is designed for **agents and MCP hosts**, not for manual browser automation.
14
14
 
15
15
  If a page is login-gated, keep the login UI, session persistence, and capture flow in a companion runtime instead of moving that stateful behavior into the `smart-web` core.
16
16
 
17
- - npm: `https://www.npmjs.com/package/smart-web-mcp`
17
+ - npm: <https://www.npmjs.com/package/smart-web-mcp>
18
18
  - MCP Registry: `io.github.jojo-labs/smart-web`
19
- - GitHub: `https://github.com/jojo-labs/smart-web`
19
+ - Issues: <https://github.com/jojo-labs/smart-web/issues>
20
20
 
21
21
  If `smart-web` returns reproducible incorrect or misleading output, open a GitHub issue with the exact input, tool arguments, observed output, expected behavior, and version. Skip obvious transient network, auth, or rate-limit failures unless the smart-web classification itself looks wrong.
22
22
 
23
23
  ## Highlights
24
24
 
25
25
  - one local MCP instead of separate search, fetch, and crawl servers
26
- - explicit routing: URL -> `smartfetch`, query -> `smartsearch`, known site -> `smartcrawl`
27
- - up-front smartfetch acquisition lanes instead of broad direct -> browser retry chains
28
- - optional Jina Reader relay for weak generic/article fetches when you explicitly enable it, now with JSON mode and alternate-link preservation
29
- - weak generic pages can now recover through JSON-LD or Next.js payload extraction and same-origin RSS/Atom feed discovery before falling back to a handoff
30
- - paywalled article pages now prefer lawful archive recovery and search-indexed reference previews before telling the host to escalate elsewhere
31
- - LinkedIn authwall handling now prefers compliant public fallbacks such as archives and search-indexed reference metadata before giving up
26
+ - explicit routing: URL → `smartfetch`, query → `smartsearch`, known site → `smartcrawl`
27
+ - up-front smartfetch acquisition lanes instead of broad direct → browser retry chains
28
+ - optional Jina Reader relay for weak generic/article fetches when you explicitly enable it, with JSON mode and alternate-link preservation
29
+ - weak generic pages can recover through JSON-LD or Next.js payload extraction and same-origin RSS/Atom feed discovery before falling back to a handoff
30
+ - paywalled article pages prefer lawful archive recovery and search-indexed reference previews before telling the host to escalate elsewhere
31
+ - LinkedIn authwall handling prefers compliant public fallbacks such as archives and search-indexed reference metadata before giving up
32
32
  - legal paper fallback for academic URLs: DOI-aware OpenAlex, Unpaywall, Semantic Scholar, CORE discovery, and Europe PMC enrichment plus bioRxiv/medRxiv API fallback when a direct paper page is thin or blocked
33
- - site-native public search fallbacks now cover Reddit, GitHub repository discovery, npm, Hacker News, Stack Exchange, and Velog when a matching `site:` query is available
33
+ - site-native public search fallbacks cover Reddit, GitHub repository discovery, npm, Hacker News, Stack Exchange, and Velog when a matching `site:` query is available
34
34
  - structured output with `assessment` and `pipeline` fields for host-side decision making
35
35
  - local-first defaults with privacy-aware search/fetch behavior
36
36
  - useful normalization for common public surfaces instead of raw HTML dumps
37
37
 
38
38
  ## Tool routing
39
39
 
40
- Use the tools like this:
41
-
42
40
  - `smartfetch` first when the user already gave a URL or shortlink
43
41
  - `smartsearch` when the user gave a topic, keywords, or a `site:` query
44
42
  - `smartcrawl` when you already know the site and need multiple relevant pages
@@ -53,9 +51,7 @@ Use the tools like this:
53
51
 
54
52
  ## What the tools return
55
53
 
56
- All three tools are shaped for agent consumption.
57
-
58
- Common high-signal fields include:
54
+ All three tools are shaped for agent consumption. Common high-signal fields include:
59
55
 
60
56
  - `assessment`: confidence, block/auth hints, recommended handoff
61
57
  - `pipeline`: resolve/acquire/normalize/assess stages
@@ -85,9 +81,7 @@ When a source stays blocked or under-specified, `smart-web` prefers partial but
85
81
  npx -y smart-web-mcp
86
82
  ```
87
83
 
88
- `npm` is the canonical package-manager path for this repo and package.
89
-
90
- When `smartfetch` or `smartcrawl` needs Playwright-backed page loading, `smart-web` now checks whether the bundled Chromium revision is available. If the local Playwright browser cache is missing or outdated, `smart-web` warns at startup and will try `npx playwright install chromium` automatically on first browser use before surfacing a structured error.
84
+ When `smartfetch` or `smartcrawl` needs Playwright-backed page loading, `smart-web` checks whether the bundled Chromium revision is available. If the local Playwright browser cache is missing or outdated, `smart-web` warns at startup and will try `npx playwright install chromium` automatically on first browser use before surfacing a structured error.
91
85
 
92
86
  ## Host setup
93
87
 
@@ -97,8 +91,6 @@ When `smartfetch` or `smartcrawl` needs Playwright-backed page loading, `smart-w
97
91
  claude mcp add smart-web -- npx -y smart-web-mcp
98
92
  ```
99
93
 
100
- For a checked-out local repo, use [examples/claude.mcp.json](examples/claude.mcp.json).
101
-
102
94
  ### Codex
103
95
 
104
96
  ```bash
@@ -128,14 +120,40 @@ args = ["-y", "smart-web-mcp"]
128
120
  }
129
121
  ```
130
122
 
131
- ## Configuration
123
+ ### Custom settings file
124
+
125
+ To use a non-default settings path, append `--settings-file` to the command:
126
+
127
+ ```bash
128
+ npx -y smart-web-mcp --settings-file /absolute/path/to/smart-web.settings.json
129
+ ```
130
+
131
+ For Claude Code:
132
+
133
+ ```bash
134
+ claude mcp add smart-web -- npx -y smart-web-mcp --settings-file /absolute/path/to/smart-web.settings.json
135
+ ```
132
136
 
133
- Default settings path:
137
+ For OpenCode, add `"args"` after `"command"`:
134
138
 
135
- ```text
136
- ~/.config/smart-web/settings.json
139
+ ```json
140
+ {
141
+ "mcp": {
142
+ "smart-web": {
143
+ "type": "local",
144
+ "command": ["npx", "-y", "smart-web-mcp", "--settings-file", "/absolute/path/to/smart-web.settings.json"],
145
+ "enabled": true
146
+ }
147
+ }
148
+ }
137
149
  ```
138
150
 
151
+ ## Configuration
152
+
153
+ ### Settings path
154
+
155
+ Default: `~/.config/smart-web/settings.json`
156
+
139
157
  Print a template:
140
158
 
141
159
  ```bash
@@ -148,90 +166,269 @@ Initialize the default file:
148
166
  npx -y smart-web-mcp --init-settings
149
167
  ```
150
168
 
151
- Use a non-default settings file:
169
+ `smart-web` treats the settings file as the single runtime config surface.
152
170
 
153
- ```bash
154
- npx -y smart-web-mcp --settings-file /absolute/path/to/smart-web.settings.json
171
+ ### Full settings reference
172
+
173
+ ```jsonc
174
+ {
175
+ // "balanced" (default) or "private"
176
+ // "private" disables relay-style providers and public search helpers
177
+ "profile": "balanced",
178
+
179
+ "runtime": {
180
+ // Optional override for staging/export temp files
181
+ // Default: platform cache root (e.g. ~/.cache/smart-web/tmp)
182
+ "tempDir": ""
183
+ },
184
+
185
+ "search": {
186
+ // API keys — leave empty to skip that provider
187
+ "exaApiKey": "",
188
+ "braveSearchApiKey": "",
189
+ // Self-hosted SearXNG instance URL
190
+ "searxngBaseUrl": "",
191
+ "enableSearxng": true
192
+ },
193
+
194
+ "fetch": {
195
+ // Browser overrides — most users leave these empty
196
+ "chromeChannel": "",
197
+ "chromePath": "",
198
+ // Auto-install Playwright Chromium when missing (default: true)
199
+ "autoInstallPlaywright": true,
200
+ // Jina Reader relay — opt-in third-party path for weak article pages
201
+ "enableJinaReader": false,
202
+ "jinaReaderBaseUrl": "https://r.jina.ai/",
203
+ // Academic fallback — legal OA enrichment for paper URLs
204
+ "enableAcademicFallback": true,
205
+ "enableOpenAlex": true,
206
+ "enableEuropePmc": true,
207
+ "enableBiorxivApi": true,
208
+ // Unpaywall — requires a contact email
209
+ "enableUnpaywall": true,
210
+ "unpaywallEmail": "",
211
+ // Semantic Scholar — optional API key for higher-rate access
212
+ "enableSemanticScholar": true,
213
+ "semanticScholarApiKey": "",
214
+ // CORE — optional API key for richer search-backed enrichment
215
+ "enableCoreDiscovery": true,
216
+ "coreApiKey": "",
217
+ // FxTwitter — transparent x.com → fxtwitter redirect
218
+ "enableFxTwitter": true,
219
+ // Undetected-chromedriver — optional Python Selenium fallback for tough anti-bot pages
220
+ "enableUndetectedChromedriver": true,
221
+ "undetectedChromedriverPython": "python3",
222
+ // Site-specific fetches
223
+ "enableRedditJson": true,
224
+ "enableYoutubeTranscript": true,
225
+ // Archive fallback — Wayback and archive.md recovery
226
+ "enableArchiveFallback": true,
227
+ "enableWayback": true,
228
+ "enableArchiveMd": true
229
+ },
230
+
231
+ "network": {
232
+ // Allow localhost/private/reserved fetch targets (default: false)
233
+ "allowPrivateHosts": false
234
+ }
235
+ }
155
236
  ```
156
237
 
157
- `smart-web` now treats the settings file as the single runtime config surface. Use the default path or pass `--settings-file` when you need a different file.
238
+ ### Common setups
158
239
 
159
- Most installations only need:
240
+ #### Default — no config file needed
160
241
 
161
- - `profile: "balanced"`
162
- - `profile: "private"`
163
- - `search.searxngBaseUrl`
164
- - `search.exaApiKey`
165
- - `search.braveSearchApiKey`
166
- - `fetch.enableJinaReader`
167
- - `fetch.unpaywallEmail`
242
+ Out of the box `smart-web` works with sensible defaults. Create a settings file only when you need to change something.
168
243
 
169
- If you do not want `smart-web` to auto-install Playwright browsers, set `fetch.autoInstallPlaywright` to `false` and the server will return an actionable `playwright_browsers_missing` or `playwright_browsers_outdated` error instead.
244
+ #### Minimal: profile only
170
245
 
171
- If you want a narrow URL-to-Markdown relay for weak generic/article pages, set `fetch.enableJinaReader` to `true`. This stays opt-in because it is a third-party relay path rather than the default deterministic fetch lane.
246
+ ```json
247
+ {
248
+ "profile": "balanced"
249
+ }
250
+ ```
172
251
 
173
- ## Quick verification
252
+ #### Private / local-first
174
253
 
175
- Sanity-check the server with a few real calls:
254
+ ```json
255
+ {
256
+ "profile": "private",
257
+ "search": {
258
+ "searxngBaseUrl": "http://localhost:8080"
259
+ }
260
+ }
261
+ ```
176
262
 
177
- - `smartfetch` on a normal article URL
178
- - `smartfetch` on a Medium or other member-only article URL and confirm it returns either archive-backed content or an honest `reference_only` preview
179
- - `smartfetch` on a Naver Map, Kakao Map, or `naver.me` place-share URL
180
- - `smartfetch` on an arXiv abstract URL
181
- - `smartfetch` on a PubMed, PMC, or bioRxiv/medRxiv paper URL
182
- - `smartfetch` on a known product URL
183
- - `smartsearch` on a `site:` query
184
- - `smartcrawl` on a docs site or board you actually use
263
+ In `private` mode, relay-style provider requests are blocked, Exa and Brave default off unless you explicitly re-enable them, public no-key search fallbacks are disabled, and SearXNG bases must resolve to localhost or private addresses.
185
264
 
186
- ## Examples
265
+ #### Private with Exa allowed back in
187
266
 
188
- - [Claude Code](examples/claude.mcp.json)
189
- - [OpenCode](examples/opencode.json)
190
- - [OpenCode dev hot-reload](examples/opencode.dev.json)
191
- - [Settings template](examples/smart-web.settings.json)
267
+ ```json
268
+ {
269
+ "profile": "private",
270
+ "search": {
271
+ "exaApiKey": "...",
272
+ "enableExa": true
273
+ }
274
+ }
275
+ ```
192
276
 
193
- These examples use `/absolute/path/to/...` placeholders for local checkouts.
277
+ #### With Exa and Brave API keys
194
278
 
195
- ## Docs
279
+ ```json
280
+ {
281
+ "profile": "balanced",
282
+ "search": {
283
+ "exaApiKey": "exa-...",
284
+ "braveSearchApiKey": "BSA-..."
285
+ }
286
+ }
287
+ ```
196
288
 
197
- - [Getting started](docs/getting-started.md)
198
- - [Configuration](docs/configuration.md)
199
- - [Crawling](docs/crawling.md)
200
- - [Docs export](docs/docscrawl.md)
201
- - [Retrieval lanes](docs/retrieval-lanes.md)
202
- - [Providers](docs/providers.md)
203
- - [Architecture](docs/architecture.md)
204
- - [Long-term design](docs/long-term-design.md)
205
- - [Contributing](CONTRIBUTING.md)
206
- - [CHANGELOG](CHANGELOG.md)
289
+ #### Disable Playwright auto-install
207
290
 
208
- ## Development
291
+ ```json
292
+ {
293
+ "fetch": {
294
+ "autoInstallPlaywright": false
295
+ }
296
+ }
297
+ ```
209
298
 
210
- ```bash
211
- npm install
212
- npm run test:file -- src/mcp.test.ts
213
- npm run check
299
+ The server will return an actionable `playwright_browsers_missing` or `playwright_browsers_outdated` error instead.
300
+
301
+ #### Opt in to Jina Reader for weak generic article pages
302
+
303
+ ```json
304
+ {
305
+ "fetch": {
306
+ "enableJinaReader": true
307
+ }
308
+ }
214
309
  ```
215
310
 
216
- Repo-owned hook entrypoints now live under `.git-hooks/` and are chained from the machine-level git hooks. `npm install` no longer rewrites repo-local git config.
311
+ This stays opt-in because it is a third-party relay path. When enabled, it only applies to weak generic/article direct fetches and prefers Jina JSON mode when that improves the page. `profile: "private"` hard-disables relay-style providers.
217
312
 
218
- ### Dev MCP without restarting the host
313
+ #### Force archive fallback off
219
314
 
220
- ```bash
221
- npm run dev:watch
222
- smart-web-dev
315
+ ```json
316
+ {
317
+ "fetch": {
318
+ "enableArchiveFallback": false
319
+ }
320
+ }
321
+ ```
322
+
323
+ #### Legal OA DOI recovery for paywalled paper pages
324
+
325
+ ```json
326
+ {
327
+ "fetch": {
328
+ "enableUnpaywall": true,
329
+ "unpaywallEmail": "research@example.com",
330
+ "enableSemanticScholar": true,
331
+ "enableCoreDiscovery": true
332
+ }
333
+ }
334
+ ```
335
+
336
+ #### Enable undetected-chromedriver from a dedicated virtualenv
337
+
338
+ ```json
339
+ {
340
+ "fetch": {
341
+ "enableUndetectedChromedriver": true,
342
+ "undetectedChromedriverPython": "/absolute/path/to/venv/bin/python"
343
+ }
344
+ }
223
345
  ```
224
346
 
225
- `smart-web-dev` hot-swaps rebuilt `dist/*.js` modules on each tool call. New implementations update after rebuilds, but brand-new MCP tools or schema changes still require a client reconnect because MCP registration happens at process start.
347
+ This helper is optional. `smart-web` still works without it, but when installed and reachable it can act as a second browser engine for stubborn pages where Playwright alone is not enough.
226
348
 
227
- By default, staging snapshots live under the platform cache root (for example `~/.cache/smart-web/tmp` on Linux). Set `runtime.tempDir` in `settings.json` if you need an explicit override.
349
+ #### Custom temp directory
350
+
351
+ ```json
352
+ {
353
+ "runtime": {
354
+ "tempDir": "/absolute/path/to/smart-web-tmp"
355
+ }
356
+ }
357
+ ```
358
+
359
+ ### Advanced overrides
360
+
361
+ These keys live inside `settings.json` when you need to force one provider on or off:
362
+
363
+ **Search**
364
+
365
+ - `search.enableExa`
366
+ - `search.enableBrave`
367
+ - `search.enableSearxng`
368
+ - `search.enableDuckDuckGo`
369
+ - `search.enableBraveHtml`
370
+
371
+ **Fetch**
372
+
373
+ - `fetch.enableFxTwitter`
374
+ - `fetch.enableXOembed`
375
+ - `fetch.autoInstallPlaywright`
376
+ - `fetch.enableJinaReader`
377
+ - `fetch.jinaReaderBaseUrl`
378
+ - `fetch.enableAcademicFallback`
379
+ - `fetch.enableOpenAlex`
380
+ - `fetch.enableEuropePmc`
381
+ - `fetch.enableBiorxivApi`
382
+ - `fetch.enableUnpaywall`
383
+ - `fetch.unpaywallEmail`
384
+ - `fetch.enableSemanticScholar`
385
+ - `fetch.semanticScholarApiKey`
386
+ - `fetch.enableCoreDiscovery`
387
+ - `fetch.coreApiKey`
388
+ - `fetch.enableUndetectedChromedriver`
389
+ - `fetch.undetectedChromedriverPython`
390
+ - `fetch.enableRedditJson`
391
+ - `fetch.enableYoutubeTranscript`
392
+ - `fetch.enableArchiveFallback`
393
+ - `fetch.enableWayback`
394
+ - `fetch.enableArchiveMd`
395
+
396
+ **Compatibility**
397
+
398
+ - `network.localOnly`: advanced override that forces local-only behavior regardless of profile
399
+
400
+ ## Quick verification
401
+
402
+ Sanity-check the server with a few real calls:
403
+
404
+ - `smartfetch` on a normal article URL
405
+ - `smartfetch` on a Medium or other member-only article URL — confirm it returns either archive-backed content or an honest `reference_only` preview
406
+ - `smartfetch` on a Naver Map, Kakao Map, or `naver.me` place-share URL
407
+ - `smartfetch` on an arXiv abstract URL
408
+ - `smartfetch` on a PubMed, PMC, or bioRxiv/medRxiv paper URL
409
+ - `smartfetch` on a known product URL
410
+ - `smartsearch` on a `site:` query
411
+ - `smartcrawl` on a docs site or board you actually use
412
+
413
+ ## Reporting issues
414
+
415
+ If `smart-web` returns reproducible incorrect or misleading output, open a GitHub issue with:
416
+
417
+ - exact input and tool arguments
418
+ - observed output
419
+ - expected behavior
420
+ - `smart-web-mcp` version
421
+
422
+ Skip obvious transient network, auth, or rate-limit failures unless the classification itself looks wrong.
228
423
 
229
424
  ## License
230
425
 
231
- MIT
426
+ All rights reserved. See [LICENSE](LICENSE) for terms.
427
+
428
+ This software is licensed under a proprietary license that permits installation and use as a local MCP server but prohibits modification, redistribution, reverse engineering, or use in competing products.
232
429
 
233
430
  ## References
234
431
 
235
432
  [^1]: Model Context Protocol, "Tools" specification — MCP tools are a retrieval and integration surface, not a requirement to emulate full browser-automation flows inside one tool.
236
433
 
237
- [^2]: Model Context Protocol Blog, "Tool Annotations as Risk Vocabulary: What Hints Can and Can't Do" (2026-03-16) — structured tool output and accurate hints improve host-side routing and escalation decisions.
434
+ [^2]: Model Context Protocol Blog, "Tool Annotations as Risk Vocabulary: What Hints Can and Can't Do" (2026-03-16) — structured tool output and accurate hints improve host-side routing and escalation decisions.
@@ -1,10 +1,11 @@
1
- import type { RetrievalAssessment, RetrievalConfidence, RetrievalHandoffSuggestion, ToolError } from "./shared.js";
1
+ import type { CaptchaType, RetrievalAssessment, RetrievalConfidence, RetrievalHandoffSuggestion, ToolError } from "./shared.js";
2
2
  export declare function createHandoffSuggestion(input: {
3
3
  tool: RetrievalHandoffSuggestion["tool"];
4
4
  reason: string;
5
5
  mode?: RetrievalHandoffSuggestion["mode"];
6
6
  url: string;
7
7
  goal: string;
8
+ captcha_type?: CaptchaType;
8
9
  }): RetrievalHandoffSuggestion;
9
10
  export declare function createAssessment(input?: {
10
11
  confidence?: RetrievalConfidence;
@@ -17,3 +18,10 @@ export declare function createAssessment(input?: {
17
18
  export declare function firstMeaningfulBlockedReason(items: Array<ToolError | string | null | undefined>): string;
18
19
  export declare function detectAuthLikeText(value: string): boolean;
19
20
  export declare function detectInteractiveLikeText(value: string): boolean;
21
+ /**
22
+ * Detect CAPTCHA widget type from page text/HTML content.
23
+ * smart-web is read-only so it cannot inspect the live DOM — instead it
24
+ * infers the likely CAPTCHA type from known signatures in the fetched HTML
25
+ * (iframe src patterns, class names, data-sitekey attributes).
26
+ */
27
+ export declare function detectCaptchaTypeFromContent(html: string): CaptchaType | null;
@@ -8,6 +8,7 @@ export function createHandoffSuggestion(input) {
8
8
  mode: input.mode || "read",
9
9
  url: String(input.url || "").trim(),
10
10
  goal: String(input.goal || "").trim(),
11
+ ...(input.captcha_type ? { captcha_type: input.captcha_type } : {}),
11
12
  };
12
13
  }
13
14
  export function createAssessment(input = {}) {
@@ -49,4 +50,41 @@ export function detectInteractiveLikeText(value) {
49
50
  return false;
50
51
  return /(enable javascript|checking your browser|just a moment|please wait|open the app|continue in browser|use the search box|select a filter|load more|show more)/i.test(text);
51
52
  }
53
+ /**
54
+ * Detect CAPTCHA widget type from page text/HTML content.
55
+ * smart-web is read-only so it cannot inspect the live DOM — instead it
56
+ * infers the likely CAPTCHA type from known signatures in the fetched HTML
57
+ * (iframe src patterns, class names, data-sitekey attributes).
58
+ */
59
+ export function detectCaptchaTypeFromContent(html) {
60
+ if (!html)
61
+ return null;
62
+ // Turnstile: Cloudflare challenge platform iframe or widget class
63
+ if (/challenges\.cloudflare\.com\/cdn-cgi\/challenge-platform/i.test(html))
64
+ return "turnstile";
65
+ if (/class="[^"]*cf-turnstile[^"]*"/i.test(html))
66
+ return "turnstile";
67
+ if (/data-sitekey[^>]*class="[^"]*cf-turnstile/i.test(html))
68
+ return "turnstile";
69
+ // reCAPTCHA: Google iframe or widget class
70
+ if (/recaptcha\/(?:api2|enterprise)\/anchor/i.test(html))
71
+ return "recaptcha";
72
+ if (/class="[^"]*g-recaptcha[^"]*"/i.test(html))
73
+ return "recaptcha";
74
+ if (/class="[^"]*g_recaptcha[^"]*"/i.test(html))
75
+ return "recaptcha";
76
+ if (/data-sitekey[^>]*class="[^"]*g-recaptcha/i.test(html))
77
+ return "recaptcha";
78
+ // hCaptcha: hCaptcha iframe or widget class
79
+ if (/hcaptcha\.com\/captcha/i.test(html))
80
+ return "hcaptcha";
81
+ if (/class="[^"]*h-captcha[^"]*"/i.test(html))
82
+ return "hcaptcha";
83
+ if (/data-sitekey[^>]*class="[^"]*h-captcha/i.test(html))
84
+ return "hcaptcha";
85
+ // Fallback: "captcha" keyword without a specific type
86
+ if (/\bcaptcha\b/i.test(html))
87
+ return "recaptcha";
88
+ return null;
89
+ }
52
90
  //# sourceMappingURL=assessment.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"assessment.js","sourceRoot":"","sources":["../src/assessment.ts"],"names":[],"mappings":"AAEA,SAAS,gBAAgB,CAAC,OAAiB;IACzC,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;AAC9F,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,KAMvC;IACC,OAAO;QACL,IAAI,EAAE,KAAK,CAAC,IAAI;QAChB,MAAM,EAAE,MAAM,CAAC,KAAK,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QACzC,IAAI,EAAE,KAAK,CAAC,IAAI,IAAI,MAAM;QAC1B,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QACnC,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;KACtC,CAAA;AACH,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,QAO7B,EAAE;IACJ,OAAO;QACL,UAAU,EAAE,KAAK,CAAC,UAAU,IAAI,QAAQ;QACxC,cAAc,EAAE,MAAM,CAAC,KAAK,CAAC,cAAc,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QACzD,oBAAoB,EAAE,OAAO,CAAC,KAAK,CAAC,oBAAoB,CAAC;QACzD,uBAAuB,EAAE,OAAO,CAAC,KAAK,CAAC,uBAAuB,CAAC;QAC/D,OAAO,EAAE,gBAAgB,CAAC,KAAK,CAAC,OAAO,IAAI,EAAE,CAAC;QAC9C,mBAAmB,EAAE,KAAK,CAAC,mBAAmB,IAAI,IAAI;KACvD,CAAA;AACH,CAAC;AAED,MAAM,UAAU,4BAA4B,CAAC,KAAmD;IAC9F,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,IAAI;YAAE,SAAQ;QACnB,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC7B,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,CAAA;YACxB,IAAI,kBAAkB,CAAC,IAAI,CAAC,IAAI,yBAAyB,CAAC,IAAI,CAAC,IAAI,6EAA6E,CAAC,IAAI,CAAC,IAAI,CAAC;gBAAE,OAAO,IAAI,CAAA;YACxK,SAAQ;QACV,CAAC;QACD,IAAI,IAAI,CAAC,QAAQ,KAAK,OAAO;YAAE,OAAO,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,IAAI,CAAA;QAC/D,IAAI,yDAAyD,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YAAE,OAAO,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,IAAI,CAAA;IACtI,CAAC;IACD,OAAO,EAAE,CAAA;AACX,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,KAAa;IAC9C,MAAM,IAAI,GAAG,MAAM,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAA;IAC9C,IAAI,CAAC,IAAI;QAAE,OAAO,KAAK,CAAA;IACvB,OAAO,gLAAgL,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AACpM,CAAC;AAED,MAAM,UAAU,yBAAyB,CAAC,KAAa;IACrD,MAAM,IAAI,GAAG,MAAM,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAA;IAC9C,IAAI,CAAC,IAAI;QAAE,OAAO,KAAK,CAAA;
IACvB,OAAO,8JAA8J,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AAClL,CAAC"}
1
+ {"version":3,"file":"assessment.js","sourceRoot":"","sources":["../src/assessment.ts"],"names":[],"mappings":"AAEA,SAAS,gBAAgB,CAAC,OAAiB;IACzC,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;AAC9F,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,KAOvC;IACC,OAAO;QACL,IAAI,EAAE,KAAK,CAAC,IAAI;QAChB,MAAM,EAAE,MAAM,CAAC,KAAK,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QACzC,IAAI,EAAE,KAAK,CAAC,IAAI,IAAI,MAAM;QAC1B,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QACnC,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QACrC,GAAG,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,KAAK,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KACpE,CAAA;AACH,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,QAO7B,EAAE;IACJ,OAAO;QACL,UAAU,EAAE,KAAK,CAAC,UAAU,IAAI,QAAQ;QACxC,cAAc,EAAE,MAAM,CAAC,KAAK,CAAC,cAAc,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE;QACzD,oBAAoB,EAAE,OAAO,CAAC,KAAK,CAAC,oBAAoB,CAAC;QACzD,uBAAuB,EAAE,OAAO,CAAC,KAAK,CAAC,uBAAuB,CAAC;QAC/D,OAAO,EAAE,gBAAgB,CAAC,KAAK,CAAC,OAAO,IAAI,EAAE,CAAC;QAC9C,mBAAmB,EAAE,KAAK,CAAC,mBAAmB,IAAI,IAAI;KACvD,CAAA;AACH,CAAC;AAED,MAAM,UAAU,4BAA4B,CAAC,KAAmD;IAC9F,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,IAAI;YAAE,SAAQ;QACnB,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC7B,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,CAAA;YACxB,IAAI,kBAAkB,CAAC,IAAI,CAAC,IAAI,yBAAyB,CAAC,IAAI,CAAC,IAAI,6EAA6E,CAAC,IAAI,CAAC,IAAI,CAAC;gBAAE,OAAO,IAAI,CAAA;YACxK,SAAQ;QACV,CAAC;QACD,IAAI,IAAI,CAAC,QAAQ,KAAK,OAAO;YAAE,OAAO,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,IAAI,CAAA;QAC/D,IAAI,yDAAyD,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YAAE,OAAO,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,IAAI,CAAA;IACtI,CAAC;IACD,OAAO,EAAE,CAAA;AACX,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,KAAa;IAC9C,MAAM,IAAI,GAAG,MAAM,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAA;IAC9C,IAAI,CAAC,IAAI;QAAE,OAAO,KAAK,CAAA;IACvB,OAAO,gLAAgL,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AACpM,CAAC;AAED,MAAM,UAAU,yBAAyB,CAAC,KAAa;IACrD,
MAAM,IAAI,GAAG,MAAM,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAA;IAC9C,IAAI,CAAC,IAAI;QAAE,OAAO,KAAK,CAAA;IACvB,OAAO,8JAA8J,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AAClL,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,4BAA4B,CAAC,IAAY;IACvD,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAA;IACtB,kEAAkE;IAClE,IAAI,2DAA2D,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IAC9F,IAAI,iCAAiC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IACpE,IAAI,4CAA4C,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IAC/E,2CAA2C;IAC3C,IAAI,yCAAyC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IAC5E,IAAI,gCAAgC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IACnE,IAAI,gCAAgC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IACnE,IAAI,2CAA2C,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IAC9E,4CAA4C;IAC5C,IAAI,yBAAyB,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,UAAU,CAAA;IAC3D,IAAI,8BAA8B,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,UAAU,CAAA;IAChE,IAAI,yCAAyC,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,UAAU,CAAA;IAC3E,sDAAsD;IACtD,IAAI,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,WAAW,CAAA;IACjD,OAAO,IAAI,CAAA;AACb,CAAC"}
@@ -4,6 +4,9 @@ export type WebTaskRequestPlan = {
4
4
  goal: string;
5
5
  startUrl: string;
6
6
  mode: "read" | "act";
7
+ /** When the smart-web handoff identified a known CAPTCHA type on the page,
8
+ * this field lets web-task-api prepare the right solving strategy upfront. */
9
+ captchaType?: "turnstile" | "recaptcha" | "hcaptcha";
7
10
  };
8
11
  export type SmartcrawlRequestPlan = {
9
12
  url: string;
@@ -11,11 +14,13 @@ export type SmartcrawlRequestPlan = {
11
14
  mode: "auto" | "board" | "docs";
12
15
  };
13
16
  export declare function planWebTaskFromSmartfetch(output: SmartfetchOutput): {
17
+ captchaType?: import("./shared.js").CaptchaType;
14
18
  goal: string;
15
19
  startUrl: string;
16
20
  mode: "read" | "act";
17
21
  } | null;
18
22
  export declare function planWebTaskFromSmartcrawl(response: SmartcrawlResponse): {
23
+ captchaType?: import("./shared.js").CaptchaType;
19
24
  goal: string;
20
25
  startUrl: string;
21
26
  mode: "read" | "act";