webpeel 0.11.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +82 -9
- package/dist/cli.js +138 -6
- package/dist/cli.js.map +1 -1
- package/dist/core/actions.d.ts +28 -0
- package/dist/core/actions.d.ts.map +1 -1
- package/dist/core/actions.js +60 -0
- package/dist/core/actions.js.map +1 -1
- package/dist/core/bm25-filter.d.ts +67 -0
- package/dist/core/bm25-filter.d.ts.map +1 -0
- package/dist/core/bm25-filter.js +289 -0
- package/dist/core/bm25-filter.js.map +1 -0
- package/dist/core/chunking.d.ts +43 -0
- package/dist/core/chunking.d.ts.map +1 -0
- package/dist/core/chunking.js +182 -0
- package/dist/core/chunking.js.map +1 -0
- package/dist/core/content-pruner.d.ts +40 -0
- package/dist/core/content-pruner.d.ts.map +1 -0
- package/dist/core/content-pruner.js +306 -0
- package/dist/core/content-pruner.js.map +1 -0
- package/dist/core/markdown.d.ts +4 -1
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +11 -2
- package/dist/core/markdown.js.map +1 -1
- package/dist/core/research.d.ts +67 -0
- package/dist/core/research.d.ts.map +1 -0
- package/dist/core/research.js +254 -0
- package/dist/core/research.js.map +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +52 -4
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +107 -2
- package/dist/mcp/server.js.map +1 -1
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.d.ts.map +1 -0
- package/dist/server/app.js +189 -0
- package/dist/server/app.js.map +1 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.d.ts.map +1 -0
- package/dist/server/auth-store.js +89 -0
- package/dist/server/auth-store.js.map +1 -0
- package/dist/server/job-queue.d.ts +93 -0
- package/dist/server/job-queue.d.ts.map +1 -0
- package/dist/server/job-queue.js +144 -0
- package/dist/server/job-queue.js.map +1 -0
- package/dist/server/middleware/auth.d.ts +24 -0
- package/dist/server/middleware/auth.d.ts.map +1 -0
- package/dist/server/middleware/auth.js +152 -0
- package/dist/server/middleware/auth.js.map +1 -0
- package/dist/server/middleware/rate-limit.d.ts +23 -0
- package/dist/server/middleware/rate-limit.d.ts.map +1 -0
- package/dist/server/middleware/rate-limit.js +126 -0
- package/dist/server/middleware/rate-limit.js.map +1 -0
- package/dist/server/middleware/url-validator.d.ts +16 -0
- package/dist/server/middleware/url-validator.d.ts.map +1 -0
- package/dist/server/middleware/url-validator.js +187 -0
- package/dist/server/middleware/url-validator.js.map +1 -0
- package/dist/server/pg-auth-store.d.ts +129 -0
- package/dist/server/pg-auth-store.d.ts.map +1 -0
- package/dist/server/pg-auth-store.js +457 -0
- package/dist/server/pg-auth-store.js.map +1 -0
- package/dist/server/pg-job-queue.d.ts +60 -0
- package/dist/server/pg-job-queue.d.ts.map +1 -0
- package/dist/server/pg-job-queue.js +365 -0
- package/dist/server/pg-job-queue.js.map +1 -0
- package/dist/server/premium/domain-intel.d.ts +17 -0
- package/dist/server/premium/domain-intel.d.ts.map +1 -0
- package/dist/server/premium/domain-intel.js +134 -0
- package/dist/server/premium/domain-intel.js.map +1 -0
- package/dist/server/premium/index.d.ts +18 -0
- package/dist/server/premium/index.d.ts.map +1 -0
- package/dist/server/premium/index.js +36 -0
- package/dist/server/premium/index.js.map +1 -0
- package/dist/server/premium/swr-cache.d.ts +15 -0
- package/dist/server/premium/swr-cache.d.ts.map +1 -0
- package/dist/server/premium/swr-cache.js +35 -0
- package/dist/server/premium/swr-cache.js.map +1 -0
- package/dist/server/routes/activity.d.ts +7 -0
- package/dist/server/routes/activity.d.ts.map +1 -0
- package/dist/server/routes/activity.js +66 -0
- package/dist/server/routes/activity.js.map +1 -0
- package/dist/server/routes/agent.d.ts +12 -0
- package/dist/server/routes/agent.d.ts.map +1 -0
- package/dist/server/routes/agent.js +356 -0
- package/dist/server/routes/agent.js.map +1 -0
- package/dist/server/routes/answer.d.ts +6 -0
- package/dist/server/routes/answer.d.ts.map +1 -0
- package/dist/server/routes/answer.js +124 -0
- package/dist/server/routes/answer.js.map +1 -0
- package/dist/server/routes/batch.d.ts +7 -0
- package/dist/server/routes/batch.d.ts.map +1 -0
- package/dist/server/routes/batch.js +287 -0
- package/dist/server/routes/batch.js.map +1 -0
- package/dist/server/routes/cli-usage.d.ts +7 -0
- package/dist/server/routes/cli-usage.d.ts.map +1 -0
- package/dist/server/routes/cli-usage.js +121 -0
- package/dist/server/routes/cli-usage.js.map +1 -0
- package/dist/server/routes/compat.d.ts +24 -0
- package/dist/server/routes/compat.d.ts.map +1 -0
- package/dist/server/routes/compat.js +651 -0
- package/dist/server/routes/compat.js.map +1 -0
- package/dist/server/routes/extract.d.ts +9 -0
- package/dist/server/routes/extract.d.ts.map +1 -0
- package/dist/server/routes/extract.js +121 -0
- package/dist/server/routes/extract.js.map +1 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.d.ts.map +1 -0
- package/dist/server/routes/fetch.js +537 -0
- package/dist/server/routes/fetch.js.map +1 -0
- package/dist/server/routes/health.d.ts +8 -0
- package/dist/server/routes/health.d.ts.map +1 -0
- package/dist/server/routes/health.js +36 -0
- package/dist/server/routes/health.js.map +1 -0
- package/dist/server/routes/jobs.d.ts +8 -0
- package/dist/server/routes/jobs.d.ts.map +1 -0
- package/dist/server/routes/jobs.js +374 -0
- package/dist/server/routes/jobs.js.map +1 -0
- package/dist/server/routes/mcp.d.ts +16 -0
- package/dist/server/routes/mcp.d.ts.map +1 -0
- package/dist/server/routes/mcp.js +475 -0
- package/dist/server/routes/mcp.js.map +1 -0
- package/dist/server/routes/oauth.d.ts +10 -0
- package/dist/server/routes/oauth.d.ts.map +1 -0
- package/dist/server/routes/oauth.js +296 -0
- package/dist/server/routes/oauth.js.map +1 -0
- package/dist/server/routes/screenshot.d.ts +10 -0
- package/dist/server/routes/screenshot.d.ts.map +1 -0
- package/dist/server/routes/screenshot.js +217 -0
- package/dist/server/routes/screenshot.js.map +1 -0
- package/dist/server/routes/search.d.ts +7 -0
- package/dist/server/routes/search.d.ts.map +1 -0
- package/dist/server/routes/search.js +287 -0
- package/dist/server/routes/search.js.map +1 -0
- package/dist/server/routes/stats.d.ts +7 -0
- package/dist/server/routes/stats.d.ts.map +1 -0
- package/dist/server/routes/stats.js +65 -0
- package/dist/server/routes/stats.js.map +1 -0
- package/dist/server/routes/stripe.d.ts +9 -0
- package/dist/server/routes/stripe.d.ts.map +1 -0
- package/dist/server/routes/stripe.js +233 -0
- package/dist/server/routes/stripe.js.map +1 -0
- package/dist/server/routes/users.d.ts +9 -0
- package/dist/server/routes/users.d.ts.map +1 -0
- package/dist/server/routes/users.js +954 -0
- package/dist/server/routes/users.js.map +1 -0
- package/dist/server/routes/webhooks.d.ts +15 -0
- package/dist/server/routes/webhooks.d.ts.map +1 -0
- package/dist/server/routes/webhooks.js +73 -0
- package/dist/server/routes/webhooks.js.map +1 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.d.ts.map +1 -0
- package/dist/server/sentry.js +39 -0
- package/dist/server/sentry.js.map +1 -0
- package/dist/types.d.ts +22 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +3 -2
package/README.md
CHANGED
|
@@ -87,12 +87,16 @@ First 25 fetches work instantly, no signup. After that, [sign up free](https://a
|
|
|
87
87
|
| **Stealth mode** | ✅ v2, all plans | ✅ | ⚠️ Limited | ❌ |
|
|
88
88
|
| **Browser profiles** | ✅ Persistent sessions | ❌ | ❌ | ❌ |
|
|
89
89
|
| **Hotel search** | ✅ Multi-source parallel | ❌ | ❌ | ❌ |
|
|
90
|
-
| **CSS schema extraction** | ✅
|
|
90
|
+
| **CSS schema extraction** | ✅ 7 bundled + auto-detect | ❌ | ❌ | ❌ |
|
|
91
91
|
| **LLM extraction** | ✅ BYOK, cost tracking | ⚠️ Cloud only | ❌ | ❌ |
|
|
92
92
|
| **Firecrawl-compatible** | ✅ Drop-in replacement | ✅ Native | ❌ | ❌ |
|
|
93
93
|
| **Self-hosting** | ✅ Docker compose | ⚠️ Complex | ❌ | N/A |
|
|
94
94
|
| **Autonomous agent** | ✅ BYOK any LLM | ⚠️ Locked | ❌ | ❌ |
|
|
95
|
-
| **
|
|
95
|
+
| **Deep research** | ✅ Multi-source + BM25 | ⚠️ Cloud only | ❌ | ❌ |
|
|
96
|
+
| **Content pruning** | ✅ 2-pass, 15-33% savings | ❌ | ❌ | ❌ |
|
|
97
|
+
| **BM25 filtering** | ✅ Query-focused | ❌ | ❌ | ❌ |
|
|
98
|
+
| **Python SDK** | ✅ `pip install` | ✅ | ❌ | ❌ |
|
|
99
|
+
| **MCP tools** | ✅ 13 tools | ~6 | 0 | 1 |
|
|
96
100
|
| **License** | ✅ AGPL-3.0 | AGPL-3.0 | Proprietary | MIT |
|
|
97
101
|
| **Pricing** | **Free / $9 / $29** | $0 / $16 / $83 | Custom | Free |
|
|
98
102
|
|
|
@@ -179,7 +183,27 @@ Zero dependencies. Pure Python 3.8+. [Full SDK docs →](python-sdk/README.md)
|
|
|
179
183
|
|
|
180
184
|
> **Where to add this config:** Claude Desktop → `~/Library/Application Support/Claude/claude_desktop_config.json` · Cursor → Settings → MCP Servers · VS Code → `~/.vscode/mcp.json` · Windsurf → `~/.codeium/windsurf/mcp_config.json`
|
|
181
185
|
|
|
182
|
-
### Docker
|
|
186
|
+
### Docker
|
|
187
|
+
|
|
188
|
+
**MCP Server (stdio — for Claude Desktop, Cursor, Windsurf):**
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
docker run -i webpeel/mcp
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
**MCP Server (HTTP Streamable transport):**
|
|
195
|
+
|
|
196
|
+
```bash
|
|
197
|
+
docker run -e MCP_HTTP_MODE=true -p 3100:3100 webpeel/mcp
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
**API Server (Firecrawl-compatible REST API):**
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
docker run -p 3000:3000 webpeel/api
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
**Self-Hosted (full stack with database):**
|
|
183
207
|
|
|
184
208
|
```bash
|
|
185
209
|
git clone https://github.com/webpeel/webpeel.git
|
|
@@ -188,6 +212,19 @@ cd webpeel && docker compose up
|
|
|
188
212
|
|
|
189
213
|
Full API at `http://localhost:3000`. AGPL-3.0 licensed. [Commercial licensing available](mailto:support@webpeel.dev).
|
|
190
214
|
|
|
215
|
+
**MCP config for Docker:**
|
|
216
|
+
|
|
217
|
+
```json
|
|
218
|
+
{
|
|
219
|
+
"mcpServers": {
|
|
220
|
+
"webpeel": {
|
|
221
|
+
"command": "docker",
|
|
222
|
+
"args": ["run", "-i", "--rm", "webpeel/mcp"]
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
```
|
|
227
|
+
|
|
191
228
|
## Features
|
|
192
229
|
|
|
193
230
|
### 🎯 Smart Escalation
|
|
@@ -216,6 +253,36 @@ npx webpeel crawl https://docs.example.com --max-pages 100
|
|
|
216
253
|
npx webpeel map https://example.com --max-urls 5000
|
|
217
254
|
```
|
|
218
255
|
|
|
256
|
+
### 🔬 Deep Research
|
|
257
|
+
|
|
258
|
+
Multi-source research with BM25 relevance ranking. No API key needed for sources mode.
|
|
259
|
+
|
|
260
|
+
```bash
|
|
261
|
+
# Get ranked sources with relevance scores
|
|
262
|
+
npx webpeel research "best web scraping tools 2025" --max-sources 5
|
|
263
|
+
|
|
264
|
+
# Full synthesis with LLM (BYOK)
|
|
265
|
+
npx webpeel research "compare Firecrawl vs Crawl4AI" --llm-key sk-...
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
### 🧹 Token Efficiency
|
|
269
|
+
|
|
270
|
+
Save 15-77% on AI tokens automatically.
|
|
271
|
+
|
|
272
|
+
```bash
|
|
273
|
+
# Content pruning (default ON — strips nav/footer/sidebar)
|
|
274
|
+
npx webpeel https://en.wikipedia.org/wiki/Web_scraping
|
|
275
|
+
|
|
276
|
+
# Query-focused filtering (BM25)
|
|
277
|
+
npx webpeel https://en.wikipedia.org/wiki/Web_scraping --focus "legal issues"
|
|
278
|
+
|
|
279
|
+
# Token budget (hard cap)
|
|
280
|
+
npx webpeel https://en.wikipedia.org/wiki/Web_scraping --budget 3000
|
|
281
|
+
|
|
282
|
+
# Combined: prune → focus → budget = 77% savings
|
|
283
|
+
npx webpeel https://en.wikipedia.org/wiki/Web_scraping --focus "legal" --budget 3000
|
|
284
|
+
```
|
|
285
|
+
|
|
219
286
|
### 🤖 Autonomous Agent (BYOK)
|
|
220
287
|
|
|
221
288
|
Give it a prompt, it researches the web using your own LLM key.
|
|
@@ -228,24 +295,30 @@ npx webpeel agent "Compare pricing of Notion vs Coda" --llm-key sk-...
|
|
|
228
295
|
|
|
229
296
|
| Feature | CLI | Node.js | Python | API |
|
|
230
297
|
|---------|:---:|:-------:|:------:|:---:|
|
|
298
|
+
| Web scraping | ✅ | ✅ | ✅ | ✅ |
|
|
299
|
+
| Deep research | ✅ | ✅ | ✅ | ✅ |
|
|
300
|
+
| Content pruning | ✅ | ✅ | ✅ | ✅ |
|
|
301
|
+
| BM25 query filtering | ✅ | ✅ | — | ✅ |
|
|
231
302
|
| Structured extraction | ✅ | ✅ | ✅ | ✅ |
|
|
232
303
|
| CSS schema extraction | ✅ | ✅ | — | ✅ |
|
|
233
|
-
| LLM extraction (BYOK) | ✅ | ✅ |
|
|
304
|
+
| LLM extraction (BYOK) | ✅ | ✅ | ✅ | ✅ |
|
|
305
|
+
| Page actions | ✅ | ✅ | ✅ | ✅ |
|
|
234
306
|
| Browser profiles | ✅ | ✅ | — | — |
|
|
307
|
+
| Screenshots | ✅ | ✅ | ✅ | ✅ |
|
|
308
|
+
| Crawling | ✅ | ✅ | ✅ | ✅ |
|
|
309
|
+
| Batch fetching | ✅ | ✅ | ✅ | ✅ |
|
|
235
310
|
| Hotel search | ✅ | — | — | — |
|
|
236
|
-
|
|
|
311
|
+
| Token budget | ✅ | ✅ | ✅ | ✅ |
|
|
312
|
+
| Smart chunking | ✅ | ✅ | — | — |
|
|
237
313
|
| Branding extraction | ✅ | ✅ | — | — |
|
|
238
314
|
| Change tracking | ✅ | ✅ | — | — |
|
|
239
|
-
| Token budget | ✅ | ✅ | ✅ | ✅ |
|
|
240
|
-
| Tag filtering | ✅ | ✅ | ✅ | ✅ |
|
|
241
|
-
| Image extraction | ✅ | ✅ | — | ✅ |
|
|
242
315
|
| AI summarization | ✅ | ✅ | — | ✅ |
|
|
243
316
|
| Batch processing | — | ✅ | — | ✅ |
|
|
244
317
|
| PDF extraction | ✅ | ✅ | — | — |
|
|
245
318
|
|
|
246
319
|
## Integrations
|
|
247
320
|
|
|
248
|
-
Works with **
|
|
321
|
+
Works with **CrewAI**, **Dify**, and **n8n** via the Firecrawl-compatible API. LangChain & LlamaIndex integrations coming soon. [Integration docs →](https://webpeel.dev/docs)
|
|
249
322
|
|
|
250
323
|
## Hosted API
|
|
251
324
|
|
package/dist/cli.js
CHANGED
|
@@ -146,6 +146,11 @@ program
|
|
|
146
146
|
.option('--include-tags <tags>', 'Comma-separated HTML tags/selectors to include (e.g., "main,article,.content")')
|
|
147
147
|
.option('--exclude-tags <tags>', 'Comma-separated HTML tags/selectors to exclude (e.g., "nav,footer,aside")')
|
|
148
148
|
.option('--only-main-content', 'Shortcut for --include-tags main,article')
|
|
149
|
+
.option('--full-content', 'Return full page content (disable automatic content density pruning)')
|
|
150
|
+
.option('--focus <query>', 'Query-focused filtering — only return content relevant to this query (BM25 ranking)')
|
|
151
|
+
.option('--chunk <size>', 'Split content into N-token chunks for LLM processing (default strategy: semantic)', parseInt)
|
|
152
|
+
.option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 200)', parseInt)
|
|
153
|
+
.option('--chunk-strategy <strategy>', 'Chunking strategy: fixed, semantic (default), paragraph')
|
|
149
154
|
.option('-H, --header <header...>', 'Custom headers (e.g., "Authorization: Bearer token")')
|
|
150
155
|
.option('--cookie <cookie...>', 'Cookies to set (e.g., "session=abc123")')
|
|
151
156
|
.option('--cache <ttl>', 'Cache results locally (e.g., "5m", "1h", "1d") — default: 5m')
|
|
@@ -169,7 +174,8 @@ program
|
|
|
169
174
|
.option('--extract-all', 'Auto-detect and extract repeated listing items (e.g., search results)')
|
|
170
175
|
.option('--schema <name>', 'Force a specific extraction schema by name or domain (e.g., "booking.com", "amazon")')
|
|
171
176
|
.option('--list-schemas', 'List all available extraction schemas and their supported domains')
|
|
172
|
-
.option('--scroll-extract [count]', 'Scroll page N times to load lazy content, then extract (implies --render)', (v) => parseInt(v, 10))
|
|
177
|
+
.option('--scroll-extract [count]', 'Scroll page N times to load lazy content (bare flag = smart auto-scroll until stable), then extract (implies --render)', (v) => parseInt(v, 10))
|
|
178
|
+
.option('--scroll-extract-timeout <ms>', 'Total timeout in ms for auto-scroll (default: 30000, only used with bare --scroll-extract)', parseInt)
|
|
173
179
|
.option('--csv', 'Output extraction results as CSV')
|
|
174
180
|
.option('--table', 'Output extraction results as a formatted table')
|
|
175
181
|
.option('--pages <n>', 'Follow pagination "Next" links for N pages (max 10)', (v) => parseInt(v, 10))
|
|
@@ -453,11 +459,16 @@ program
|
|
|
453
459
|
// --stealth auto-enables --render (stealth requires browser)
|
|
454
460
|
// --action auto-enables --render (actions require browser)
|
|
455
461
|
// --scroll-extract implies --render (needs browser)
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
const
|
|
460
|
-
|
|
462
|
+
//
|
|
463
|
+
// Bare --scroll-extract (no number) → smart autoScroll (detects stable height)
|
|
464
|
+
// --scroll-extract N (with number) → legacy fixed N scrolls via actions
|
|
465
|
+
const scrollExtractRaw = options.scrollExtract;
|
|
466
|
+
const isAutoScroll = scrollExtractRaw !== undefined && typeof scrollExtractRaw !== 'number';
|
|
467
|
+
const scrollExtractCount = isAutoScroll
|
|
468
|
+
? 0
|
|
469
|
+
: (scrollExtractRaw !== undefined ? scrollExtractRaw : 0);
|
|
470
|
+
const useRender = options.render || options.stealth || (actions && actions.length > 0) || scrollExtractCount > 0 || isAutoScroll || false;
|
|
471
|
+
// Inject scroll actions when --scroll-extract N (fixed count) is used
|
|
461
472
|
if (scrollExtractCount > 0) {
|
|
462
473
|
const scrollActions = [];
|
|
463
474
|
for (let i = 0; i < scrollExtractCount; i++) {
|
|
@@ -493,6 +504,11 @@ program
|
|
|
493
504
|
headed: options.headed || false,
|
|
494
505
|
storageState: resolvedStorageState,
|
|
495
506
|
proxy: options.proxy,
|
|
507
|
+
fullPage: options.fullContent || false,
|
|
508
|
+
// Smart auto-scroll (bare --scroll-extract flag)
|
|
509
|
+
autoScroll: isAutoScroll
|
|
510
|
+
? { timeout: options.scrollExtractTimeout }
|
|
511
|
+
: undefined,
|
|
496
512
|
};
|
|
497
513
|
// Add summary option if requested
|
|
498
514
|
if (options.summary) {
|
|
@@ -569,6 +585,41 @@ program
|
|
|
569
585
|
result.tokens = estimateTokens(distilled);
|
|
570
586
|
}
|
|
571
587
|
}
|
|
588
|
+
// --- BM25 Query-Focused Filtering ---
|
|
589
|
+
if (options.focus && result.content) {
|
|
590
|
+
const { filterByRelevance } = await import('./core/bm25-filter.js');
|
|
591
|
+
const focusResult = filterByRelevance(result.content, { query: options.focus });
|
|
592
|
+
result.content = focusResult.content;
|
|
593
|
+
result.tokens = estimateTokens(focusResult.content);
|
|
594
|
+
if (isJson) {
|
|
595
|
+
result.focusQuery = options.focus;
|
|
596
|
+
result.focusReduction = focusResult.reductionPercent;
|
|
597
|
+
}
|
|
598
|
+
}
|
|
599
|
+
// --- Smart Chunking ---
|
|
600
|
+
if (options.chunk && options.chunk > 0 && result.content) {
|
|
601
|
+
const { chunkContent } = await import('./core/chunking.js');
|
|
602
|
+
const chunkResult = chunkContent(result.content, {
|
|
603
|
+
chunkSize: options.chunk,
|
|
604
|
+
overlap: options.chunkOverlap || 200,
|
|
605
|
+
strategy: options.chunkStrategy || 'semantic',
|
|
606
|
+
});
|
|
607
|
+
// Replace content with chunked output
|
|
608
|
+
if (isJson) {
|
|
609
|
+
result.chunks = chunkResult.chunks;
|
|
610
|
+
result.totalChunks = chunkResult.totalChunks;
|
|
611
|
+
result.originalTokens = chunkResult.originalTokens;
|
|
612
|
+
// Keep content as first chunk for non-JSON fallback
|
|
613
|
+
result.content = chunkResult.chunks[0]?.content || '';
|
|
614
|
+
result.tokens = chunkResult.chunks[0]?.tokens || 0;
|
|
615
|
+
}
|
|
616
|
+
else {
|
|
617
|
+
// Plain text mode: output chunks separated by markers
|
|
618
|
+
const chunkOutput = chunkResult.chunks.map((c, i) => `--- Chunk ${i + 1}/${chunkResult.totalChunks} (${c.tokens} tokens) ---\n${c.content}`).join('\n\n');
|
|
619
|
+
result.content = chunkOutput;
|
|
620
|
+
result.tokens = chunkResult.totalTokens;
|
|
621
|
+
}
|
|
622
|
+
}
|
|
572
623
|
// --- #4: Content quality warning ---
|
|
573
624
|
const isHtmlContent = result.contentType ? result.contentType.toLowerCase().includes('html') : true;
|
|
574
625
|
const isRedirect = false; // peel() follows redirects — final result is always 200
|
|
@@ -3118,6 +3169,87 @@ program
|
|
|
3118
3169
|
process.exit(1);
|
|
3119
3170
|
}
|
|
3120
3171
|
});
|
|
3172
|
+
// ============================================================
|
|
3173
|
+
// research command — autonomous multi-step web research
|
|
3174
|
+
// ============================================================
|
|
3175
|
+
program
|
|
3176
|
+
.command('research <query>')
|
|
3177
|
+
.description('Conduct autonomous multi-step web research on a topic and synthesize a report')
|
|
3178
|
+
.option('--max-sources <n>', 'Maximum sources to consult (default: 5)', '5')
|
|
3179
|
+
.option('--max-depth <n>', 'Link-following depth (default: 1)', '1')
|
|
3180
|
+
.option('--format <f>', 'Output format: report (default) or sources', 'report')
|
|
3181
|
+
.option('--llm-key <key>', 'LLM API key for synthesis (or env OPENAI_API_KEY)')
|
|
3182
|
+
.option('--llm-model <model>', 'LLM model for synthesis (default: gpt-4o-mini)')
|
|
3183
|
+
.option('--llm-base-url <url>', 'LLM API base URL (default: https://api.openai.com/v1)')
|
|
3184
|
+
.option('--timeout <ms>', 'Max research time in ms (default: 60000)', '60000')
|
|
3185
|
+
.option('--json', 'Output result as JSON')
|
|
3186
|
+
.option('-s, --silent', 'Suppress progress output')
|
|
3187
|
+
.action(async (query, options) => {
|
|
3188
|
+
const isSilent = !!options.silent;
|
|
3189
|
+
const isJson = !!options.json;
|
|
3190
|
+
const maxSources = parseInt(options.maxSources) || 5;
|
|
3191
|
+
const maxDepth = parseInt(options.maxDepth) || 1;
|
|
3192
|
+
const timeout = parseInt(options.timeout) || 60000;
|
|
3193
|
+
const outputFormat = options.format === 'sources' ? 'sources' : 'report';
|
|
3194
|
+
const apiKey = options.llmKey || process.env.OPENAI_API_KEY;
|
|
3195
|
+
const model = options.llmModel;
|
|
3196
|
+
const baseUrl = options.llmBaseUrl;
|
|
3197
|
+
const phaseIcons = {
|
|
3198
|
+
searching: '🔍',
|
|
3199
|
+
fetching: '📄',
|
|
3200
|
+
extracting: '🧠',
|
|
3201
|
+
following: '🔗',
|
|
3202
|
+
synthesizing: '✍️',
|
|
3203
|
+
};
|
|
3204
|
+
try {
|
|
3205
|
+
const { research } = await import('./core/research.js');
|
|
3206
|
+
const result = await research({
|
|
3207
|
+
query,
|
|
3208
|
+
maxSources,
|
|
3209
|
+
maxDepth,
|
|
3210
|
+
timeout,
|
|
3211
|
+
outputFormat: outputFormat,
|
|
3212
|
+
apiKey,
|
|
3213
|
+
model,
|
|
3214
|
+
baseUrl,
|
|
3215
|
+
onProgress: (step) => {
|
|
3216
|
+
if (!isSilent && !isJson) {
|
|
3217
|
+
const icon = phaseIcons[step.phase] ?? '⚙️';
|
|
3218
|
+
const extra = step.sourcesFound !== undefined
|
|
3219
|
+
? ` (found ${step.sourcesFound})`
|
|
3220
|
+
: step.sourcesFetched !== undefined
|
|
3221
|
+
? ` (${step.sourcesFetched} fetched)`
|
|
3222
|
+
: '';
|
|
3223
|
+
process.stderr.write(`${icon} ${step.message}${extra}...\n`);
|
|
3224
|
+
}
|
|
3225
|
+
},
|
|
3226
|
+
});
|
|
3227
|
+
if (isJson) {
|
|
3228
|
+
await writeStdout(JSON.stringify(result, null, 2) + '\n');
|
|
3229
|
+
}
|
|
3230
|
+
else {
|
|
3231
|
+
await writeStdout(result.report + '\n');
|
|
3232
|
+
if (!isSilent) {
|
|
3233
|
+
const elapsed = (result.elapsed / 1000).toFixed(1);
|
|
3234
|
+
const cost = result.cost !== undefined ? ` | cost: $${result.cost.toFixed(4)}` : '';
|
|
3235
|
+
process.stderr.write(`\n📊 ${result.sourcesConsulted} sources consulted (${result.totalSourcesFound} found) | ${elapsed}s${cost}\n`);
|
|
3236
|
+
}
|
|
3237
|
+
}
|
|
3238
|
+
await cleanup();
|
|
3239
|
+
process.exit(0);
|
|
3240
|
+
}
|
|
3241
|
+
catch (error) {
|
|
3242
|
+
const msg = error instanceof Error ? error.message : 'Unknown error';
|
|
3243
|
+
if (isJson) {
|
|
3244
|
+
await writeStdout(JSON.stringify({ error: msg, code: 'RESEARCH_FAILED' }) + '\n');
|
|
3245
|
+
}
|
|
3246
|
+
else {
|
|
3247
|
+
console.error(`\nError: ${msg}`);
|
|
3248
|
+
}
|
|
3249
|
+
await cleanup();
|
|
3250
|
+
process.exit(1);
|
|
3251
|
+
}
|
|
3252
|
+
});
|
|
3121
3253
|
program.parse();
|
|
3122
3254
|
// ============================================================
|
|
3123
3255
|
// Time formatting helper
|