webpeel 0.12.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/README.md +82 -9
  2. package/dist/cli.js +97 -6
  3. package/dist/cli.js.map +1 -1
  4. package/dist/core/actions.d.ts +28 -0
  5. package/dist/core/actions.d.ts.map +1 -1
  6. package/dist/core/actions.js +60 -0
  7. package/dist/core/actions.js.map +1 -1
  8. package/dist/core/bm25-filter.d.ts +10 -0
  9. package/dist/core/bm25-filter.d.ts.map +1 -1
  10. package/dist/core/bm25-filter.js +40 -0
  11. package/dist/core/bm25-filter.js.map +1 -1
  12. package/dist/core/content-pruner.d.ts +12 -5
  13. package/dist/core/content-pruner.d.ts.map +1 -1
  14. package/dist/core/content-pruner.js +247 -190
  15. package/dist/core/content-pruner.js.map +1 -1
  16. package/dist/core/research.d.ts +67 -0
  17. package/dist/core/research.d.ts.map +1 -0
  18. package/dist/core/research.js +254 -0
  19. package/dist/core/research.js.map +1 -0
  20. package/dist/index.d.ts.map +1 -1
  21. package/dist/index.js +37 -3
  22. package/dist/index.js.map +1 -1
  23. package/dist/mcp/server.js +107 -2
  24. package/dist/mcp/server.js.map +1 -1
  25. package/dist/server/app.d.ts +14 -0
  26. package/dist/server/app.d.ts.map +1 -0
  27. package/dist/server/app.js +189 -0
  28. package/dist/server/app.js.map +1 -0
  29. package/dist/server/auth-store.d.ts +28 -0
  30. package/dist/server/auth-store.d.ts.map +1 -0
  31. package/dist/server/auth-store.js +89 -0
  32. package/dist/server/auth-store.js.map +1 -0
  33. package/dist/server/job-queue.d.ts +93 -0
  34. package/dist/server/job-queue.d.ts.map +1 -0
  35. package/dist/server/job-queue.js +144 -0
  36. package/dist/server/job-queue.js.map +1 -0
  37. package/dist/server/middleware/auth.d.ts +24 -0
  38. package/dist/server/middleware/auth.d.ts.map +1 -0
  39. package/dist/server/middleware/auth.js +152 -0
  40. package/dist/server/middleware/auth.js.map +1 -0
  41. package/dist/server/middleware/rate-limit.d.ts +23 -0
  42. package/dist/server/middleware/rate-limit.d.ts.map +1 -0
  43. package/dist/server/middleware/rate-limit.js +126 -0
  44. package/dist/server/middleware/rate-limit.js.map +1 -0
  45. package/dist/server/middleware/url-validator.d.ts +16 -0
  46. package/dist/server/middleware/url-validator.d.ts.map +1 -0
  47. package/dist/server/middleware/url-validator.js +187 -0
  48. package/dist/server/middleware/url-validator.js.map +1 -0
  49. package/dist/server/pg-auth-store.d.ts +129 -0
  50. package/dist/server/pg-auth-store.d.ts.map +1 -0
  51. package/dist/server/pg-auth-store.js +457 -0
  52. package/dist/server/pg-auth-store.js.map +1 -0
  53. package/dist/server/pg-job-queue.d.ts +60 -0
  54. package/dist/server/pg-job-queue.d.ts.map +1 -0
  55. package/dist/server/pg-job-queue.js +365 -0
  56. package/dist/server/pg-job-queue.js.map +1 -0
  57. package/dist/server/premium/domain-intel.d.ts +17 -0
  58. package/dist/server/premium/domain-intel.d.ts.map +1 -0
  59. package/dist/server/premium/domain-intel.js +134 -0
  60. package/dist/server/premium/domain-intel.js.map +1 -0
  61. package/dist/server/premium/index.d.ts +18 -0
  62. package/dist/server/premium/index.d.ts.map +1 -0
  63. package/dist/server/premium/index.js +36 -0
  64. package/dist/server/premium/index.js.map +1 -0
  65. package/dist/server/premium/swr-cache.d.ts +15 -0
  66. package/dist/server/premium/swr-cache.d.ts.map +1 -0
  67. package/dist/server/premium/swr-cache.js +35 -0
  68. package/dist/server/premium/swr-cache.js.map +1 -0
  69. package/dist/server/routes/activity.d.ts +7 -0
  70. package/dist/server/routes/activity.d.ts.map +1 -0
  71. package/dist/server/routes/activity.js +66 -0
  72. package/dist/server/routes/activity.js.map +1 -0
  73. package/dist/server/routes/agent.d.ts +12 -0
  74. package/dist/server/routes/agent.d.ts.map +1 -0
  75. package/dist/server/routes/agent.js +356 -0
  76. package/dist/server/routes/agent.js.map +1 -0
  77. package/dist/server/routes/answer.d.ts +6 -0
  78. package/dist/server/routes/answer.d.ts.map +1 -0
  79. package/dist/server/routes/answer.js +124 -0
  80. package/dist/server/routes/answer.js.map +1 -0
  81. package/dist/server/routes/batch.d.ts +7 -0
  82. package/dist/server/routes/batch.d.ts.map +1 -0
  83. package/dist/server/routes/batch.js +287 -0
  84. package/dist/server/routes/batch.js.map +1 -0
  85. package/dist/server/routes/cli-usage.d.ts +7 -0
  86. package/dist/server/routes/cli-usage.d.ts.map +1 -0
  87. package/dist/server/routes/cli-usage.js +121 -0
  88. package/dist/server/routes/cli-usage.js.map +1 -0
  89. package/dist/server/routes/compat.d.ts +24 -0
  90. package/dist/server/routes/compat.d.ts.map +1 -0
  91. package/dist/server/routes/compat.js +651 -0
  92. package/dist/server/routes/compat.js.map +1 -0
  93. package/dist/server/routes/extract.d.ts +9 -0
  94. package/dist/server/routes/extract.d.ts.map +1 -0
  95. package/dist/server/routes/extract.js +121 -0
  96. package/dist/server/routes/extract.js.map +1 -0
  97. package/dist/server/routes/fetch.d.ts +7 -0
  98. package/dist/server/routes/fetch.d.ts.map +1 -0
  99. package/dist/server/routes/fetch.js +537 -0
  100. package/dist/server/routes/fetch.js.map +1 -0
  101. package/dist/server/routes/health.d.ts +8 -0
  102. package/dist/server/routes/health.d.ts.map +1 -0
  103. package/dist/server/routes/health.js +36 -0
  104. package/dist/server/routes/health.js.map +1 -0
  105. package/dist/server/routes/jobs.d.ts +8 -0
  106. package/dist/server/routes/jobs.d.ts.map +1 -0
  107. package/dist/server/routes/jobs.js +374 -0
  108. package/dist/server/routes/jobs.js.map +1 -0
  109. package/dist/server/routes/mcp.d.ts +16 -0
  110. package/dist/server/routes/mcp.d.ts.map +1 -0
  111. package/dist/server/routes/mcp.js +475 -0
  112. package/dist/server/routes/mcp.js.map +1 -0
  113. package/dist/server/routes/oauth.d.ts +10 -0
  114. package/dist/server/routes/oauth.d.ts.map +1 -0
  115. package/dist/server/routes/oauth.js +296 -0
  116. package/dist/server/routes/oauth.js.map +1 -0
  117. package/dist/server/routes/screenshot.d.ts +10 -0
  118. package/dist/server/routes/screenshot.d.ts.map +1 -0
  119. package/dist/server/routes/screenshot.js +217 -0
  120. package/dist/server/routes/screenshot.js.map +1 -0
  121. package/dist/server/routes/search.d.ts +7 -0
  122. package/dist/server/routes/search.d.ts.map +1 -0
  123. package/dist/server/routes/search.js +287 -0
  124. package/dist/server/routes/search.js.map +1 -0
  125. package/dist/server/routes/stats.d.ts +7 -0
  126. package/dist/server/routes/stats.d.ts.map +1 -0
  127. package/dist/server/routes/stats.js +65 -0
  128. package/dist/server/routes/stats.js.map +1 -0
  129. package/dist/server/routes/stripe.d.ts +9 -0
  130. package/dist/server/routes/stripe.d.ts.map +1 -0
  131. package/dist/server/routes/stripe.js +233 -0
  132. package/dist/server/routes/stripe.js.map +1 -0
  133. package/dist/server/routes/users.d.ts +9 -0
  134. package/dist/server/routes/users.d.ts.map +1 -0
  135. package/dist/server/routes/users.js +954 -0
  136. package/dist/server/routes/users.js.map +1 -0
  137. package/dist/server/routes/webhooks.d.ts +15 -0
  138. package/dist/server/routes/webhooks.d.ts.map +1 -0
  139. package/dist/server/routes/webhooks.js +73 -0
  140. package/dist/server/routes/webhooks.js.map +1 -0
  141. package/dist/server/sentry.d.ts +14 -0
  142. package/dist/server/sentry.d.ts.map +1 -0
  143. package/dist/server/sentry.js +39 -0
  144. package/dist/server/sentry.js.map +1 -0
  145. package/dist/types.d.ts +13 -0
  146. package/dist/types.d.ts.map +1 -1
  147. package/dist/types.js.map +1 -1
  148. package/package.json +3 -2
package/README.md CHANGED
@@ -87,12 +87,16 @@ First 25 fetches work instantly, no signup. After that, [sign up free](https://a
87
87
  | **Stealth mode** | ✅ v2, all plans | ✅ | ⚠️ Limited | ❌ |
88
88
  | **Browser profiles** | ✅ Persistent sessions | ❌ | ❌ | ❌ |
89
89
  | **Hotel search** | ✅ Multi-source parallel | ❌ | ❌ | ❌ |
90
- | **CSS schema extraction** | ✅ 6 bundled + auto-detect | ❌ | ❌ | ❌ |
90
+ | **CSS schema extraction** | ✅ 7 bundled + auto-detect | ❌ | ❌ | ❌ |
91
91
  | **LLM extraction** | ✅ BYOK, cost tracking | ⚠️ Cloud only | ❌ | ❌ |
92
92
  | **Firecrawl-compatible** | ✅ Drop-in replacement | ✅ Native | ❌ | ❌ |
93
93
  | **Self-hosting** | ✅ Docker compose | ⚠️ Complex | ❌ | N/A |
94
94
  | **Autonomous agent** | ✅ BYOK any LLM | ⚠️ Locked | ❌ | ❌ |
95
- | **MCP tools** | ✅ 11 tools | 3 | 0 | 1 |
95
+ | **Deep research** | ✅ Multi-source + BM25 | ⚠️ Cloud only | | |
96
+ | **Content pruning** | ✅ 2-pass, 15-33% savings | ❌ | ❌ | ❌ |
97
+ | **BM25 filtering** | ✅ Query-focused | ❌ | ❌ | ❌ |
98
+ | **Python SDK** | ✅ `pip install` | ✅ | ❌ | ❌ |
99
+ | **MCP tools** | ✅ 13 tools | ~6 | 0 | 1 |
96
100
  | **License** | ✅ AGPL-3.0 | AGPL-3.0 | Proprietary | MIT |
97
101
  | **Pricing** | **Free / $9 / $29** | $0 / $16 / $83 | Custom | Free |
98
102
 
@@ -179,7 +183,27 @@ Zero dependencies. Pure Python 3.8+. [Full SDK docs →](python-sdk/README.md)
179
183
 
180
184
  > **Where to add this config:** Claude Desktop → `~/Library/Application Support/Claude/claude_desktop_config.json` · Cursor → Settings → MCP Servers · VS Code → `~/.vscode/mcp.json` · Windsurf → `~/.codeium/windsurf/mcp_config.json`
181
185
 
182
- ### Docker (Self-Hosted)
186
+ ### Docker
187
+
188
+ **MCP Server (stdio — for Claude Desktop, Cursor, Windsurf):**
189
+
190
+ ```bash
191
+ docker run -i webpeel/mcp
192
+ ```
193
+
194
+ **MCP Server (HTTP Streamable transport):**
195
+
196
+ ```bash
197
+ docker run -e MCP_HTTP_MODE=true -p 3100:3100 webpeel/mcp
198
+ ```
199
+
200
+ **API Server (Firecrawl-compatible REST API):**
201
+
202
+ ```bash
203
+ docker run -p 3000:3000 webpeel/api
204
+ ```
205
+
206
+ **Self-Hosted (full stack with database):**
183
207
 
184
208
  ```bash
185
209
  git clone https://github.com/webpeel/webpeel.git
@@ -188,6 +212,19 @@ cd webpeel && docker compose up
188
212
 
189
213
  Full API at `http://localhost:3000`. AGPL-3.0 licensed. [Commercial licensing available](mailto:support@webpeel.dev).
190
214
 
215
+ **MCP config for Docker:**
216
+
217
+ ```json
218
+ {
219
+ "mcpServers": {
220
+ "webpeel": {
221
+ "command": "docker",
222
+ "args": ["run", "-i", "--rm", "webpeel/mcp"]
223
+ }
224
+ }
225
+ }
226
+ ```
227
+
191
228
  ## Features
192
229
 
193
230
  ### 🎯 Smart Escalation
@@ -216,6 +253,36 @@ npx webpeel crawl https://docs.example.com --max-pages 100
216
253
  npx webpeel map https://example.com --max-urls 5000
217
254
  ```
218
255
 
256
+ ### 🔬 Deep Research
257
+
258
+ Multi-source research with BM25 relevance ranking. No API key needed for sources mode.
259
+
260
+ ```bash
261
+ # Get ranked sources with relevance scores
262
+ npx webpeel research "best web scraping tools 2025" --max-sources 5
263
+
264
+ # Full synthesis with LLM (BYOK)
265
+ npx webpeel research "compare Firecrawl vs Crawl4AI" --llm-key sk-...
266
+ ```
267
+
268
+ ### 🧹 Token Efficiency
269
+
270
+ Save 15-77% on AI tokens automatically.
271
+
272
+ ```bash
273
+ # Content pruning (default ON — strips nav/footer/sidebar)
274
+ npx webpeel https://en.wikipedia.org/wiki/Web_scraping
275
+
276
+ # Query-focused filtering (BM25)
277
+ npx webpeel https://en.wikipedia.org/wiki/Web_scraping --focus "legal issues"
278
+
279
+ # Token budget (hard cap)
280
+ npx webpeel https://en.wikipedia.org/wiki/Web_scraping --budget 3000
281
+
282
+ # Combined: prune → focus → budget = 77% savings
283
+ npx webpeel https://en.wikipedia.org/wiki/Web_scraping --focus "legal" --budget 3000
284
+ ```
285
+
219
286
  ### 🤖 Autonomous Agent (BYOK)
220
287
 
221
288
  Give it a prompt, it researches the web using your own LLM key.
@@ -228,24 +295,30 @@ npx webpeel agent "Compare pricing of Notion vs Coda" --llm-key sk-...
228
295
 
229
296
  | Feature | CLI | Node.js | Python | API |
230
297
  |---------|:---:|:-------:|:------:|:---:|
298
+ | Web scraping | ✅ | ✅ | ✅ | ✅ |
299
+ | Deep research | ✅ | ✅ | ✅ | ✅ |
300
+ | Content pruning | ✅ | ✅ | ✅ | ✅ |
301
+ | BM25 query filtering | ✅ | ✅ | — | ✅ |
231
302
  | Structured extraction | ✅ | ✅ | ✅ | ✅ |
232
303
  | CSS schema extraction | ✅ | ✅ | — | ✅ |
233
- | LLM extraction (BYOK) | ✅ | ✅ | | ✅ |
304
+ | LLM extraction (BYOK) | ✅ | ✅ | | ✅ |
305
+ | Page actions | ✅ | ✅ | ✅ | ✅ |
234
306
  | Browser profiles | ✅ | ✅ | — | — |
307
+ | Screenshots | ✅ | ✅ | ✅ | ✅ |
308
+ | Crawling | ✅ | ✅ | ✅ | ✅ |
309
+ | Batch fetching | ✅ | ✅ | ✅ | ✅ |
235
310
  | Hotel search | ✅ | — | — | — |
236
- | Screenshots | ✅ | ✅ | | ✅ |
311
+ | Token budget | ✅ | ✅ | | ✅ |
312
+ | Smart chunking | ✅ | ✅ | — | — |
237
313
  | Branding extraction | ✅ | ✅ | — | — |
238
314
  | Change tracking | ✅ | ✅ | — | — |
239
- | Token budget | ✅ | ✅ | ✅ | ✅ |
240
- | Tag filtering | ✅ | ✅ | ✅ | ✅ |
241
- | Image extraction | ✅ | ✅ | — | ✅ |
242
315
  | AI summarization | ✅ | ✅ | — | ✅ |
243
316
  | Batch processing | — | ✅ | — | ✅ |
244
317
  | PDF extraction | ✅ | ✅ | — | — |
245
318
 
246
319
  ## Integrations
247
320
 
248
- Works with **LangChain**, **LlamaIndex**, **CrewAI**, **Dify**, and **n8n**. [Integration docs →](https://webpeel.dev/docs)
321
+ Works with **CrewAI**, **Dify**, and **n8n** via the Firecrawl-compatible API. LangChain & LlamaIndex integrations coming soon. [Integration docs →](https://webpeel.dev/docs)
249
322
 
250
323
  ## Hosted API
251
324
 
package/dist/cli.js CHANGED
@@ -174,7 +174,8 @@ program
174
174
  .option('--extract-all', 'Auto-detect and extract repeated listing items (e.g., search results)')
175
175
  .option('--schema <name>', 'Force a specific extraction schema by name or domain (e.g., "booking.com", "amazon")')
176
176
  .option('--list-schemas', 'List all available extraction schemas and their supported domains')
177
- .option('--scroll-extract [count]', 'Scroll page N times to load lazy content, then extract (implies --render)', (v) => parseInt(v, 10))
177
+ .option('--scroll-extract [count]', 'Scroll page N times to load lazy content (bare flag = smart auto-scroll until stable), then extract (implies --render)', (v) => parseInt(v, 10))
178
+ .option('--scroll-extract-timeout <ms>', 'Total timeout in ms for auto-scroll (default: 30000, only used with bare --scroll-extract)', parseInt)
178
179
  .option('--csv', 'Output extraction results as CSV')
179
180
  .option('--table', 'Output extraction results as a formatted table')
180
181
  .option('--pages <n>', 'Follow pagination "Next" links for N pages (max 10)', (v) => parseInt(v, 10))
@@ -458,11 +459,16 @@ program
458
459
  // --stealth auto-enables --render (stealth requires browser)
459
460
  // --action auto-enables --render (actions require browser)
460
461
  // --scroll-extract implies --render (needs browser)
461
- const scrollExtractCount = options.scrollExtract !== undefined
462
- ? (typeof options.scrollExtract === 'number' ? options.scrollExtract : 3)
463
- : 0;
464
- const useRender = options.render || options.stealth || (actions && actions.length > 0) || scrollExtractCount > 0 || false;
465
- // Inject scroll actions when --scroll-extract is used
462
+ //
463
+ // Bare --scroll-extract (no number) → smart autoScroll (detects stable height)
464
+ // --scroll-extract N (with number) → legacy fixed N scrolls via actions
465
+ const scrollExtractRaw = options.scrollExtract;
466
+ const isAutoScroll = scrollExtractRaw !== undefined && typeof scrollExtractRaw !== 'number';
467
+ const scrollExtractCount = isAutoScroll
468
+ ? 0
469
+ : (scrollExtractRaw !== undefined ? scrollExtractRaw : 0);
470
+ const useRender = options.render || options.stealth || (actions && actions.length > 0) || scrollExtractCount > 0 || isAutoScroll || false;
471
+ // Inject scroll actions when --scroll-extract N (fixed count) is used
466
472
  if (scrollExtractCount > 0) {
467
473
  const scrollActions = [];
468
474
  for (let i = 0; i < scrollExtractCount; i++) {
@@ -499,6 +505,10 @@ program
499
505
  storageState: resolvedStorageState,
500
506
  proxy: options.proxy,
501
507
  fullPage: options.fullContent || false,
508
+ // Smart auto-scroll (bare --scroll-extract flag)
509
+ autoScroll: isAutoScroll
510
+ ? { timeout: options.scrollExtractTimeout }
511
+ : undefined,
502
512
  };
503
513
  // Add summary option if requested
504
514
  if (options.summary) {
@@ -3159,6 +3169,87 @@ program
3159
3169
  process.exit(1);
3160
3170
  }
3161
3171
  });
3172
+ // ============================================================
3173
+ // research command — autonomous multi-step web research
3174
+ // ============================================================
3175
+ program
3176
+ .command('research <query>')
3177
+ .description('Conduct autonomous multi-step web research on a topic and synthesize a report')
3178
+ .option('--max-sources <n>', 'Maximum sources to consult (default: 5)', '5')
3179
+ .option('--max-depth <n>', 'Link-following depth (default: 1)', '1')
3180
+ .option('--format <f>', 'Output format: report (default) or sources', 'report')
3181
+ .option('--llm-key <key>', 'LLM API key for synthesis (or env OPENAI_API_KEY)')
3182
+ .option('--llm-model <model>', 'LLM model for synthesis (default: gpt-4o-mini)')
3183
+ .option('--llm-base-url <url>', 'LLM API base URL (default: https://api.openai.com/v1)')
3184
+ .option('--timeout <ms>', 'Max research time in ms (default: 60000)', '60000')
3185
+ .option('--json', 'Output result as JSON')
3186
+ .option('-s, --silent', 'Suppress progress output')
3187
+ .action(async (query, options) => {
3188
+ const isSilent = !!options.silent;
3189
+ const isJson = !!options.json;
3190
+ const maxSources = parseInt(options.maxSources) || 5;
3191
+ const maxDepth = parseInt(options.maxDepth) || 1;
3192
+ const timeout = parseInt(options.timeout) || 60000;
3193
+ const outputFormat = options.format === 'sources' ? 'sources' : 'report';
3194
+ const apiKey = options.llmKey || process.env.OPENAI_API_KEY;
3195
+ const model = options.llmModel;
3196
+ const baseUrl = options.llmBaseUrl;
3197
+ const phaseIcons = {
3198
+ searching: '🔍',
3199
+ fetching: '📄',
3200
+ extracting: '🧠',
3201
+ following: '🔗',
3202
+ synthesizing: '✍️',
3203
+ };
3204
+ try {
3205
+ const { research } = await import('./core/research.js');
3206
+ const result = await research({
3207
+ query,
3208
+ maxSources,
3209
+ maxDepth,
3210
+ timeout,
3211
+ outputFormat: outputFormat,
3212
+ apiKey,
3213
+ model,
3214
+ baseUrl,
3215
+ onProgress: (step) => {
3216
+ if (!isSilent && !isJson) {
3217
+ const icon = phaseIcons[step.phase] ?? '⚙️';
3218
+ const extra = step.sourcesFound !== undefined
3219
+ ? ` (found ${step.sourcesFound})`
3220
+ : step.sourcesFetched !== undefined
3221
+ ? ` (${step.sourcesFetched} fetched)`
3222
+ : '';
3223
+ process.stderr.write(`${icon} ${step.message}${extra}...\n`);
3224
+ }
3225
+ },
3226
+ });
3227
+ if (isJson) {
3228
+ await writeStdout(JSON.stringify(result, null, 2) + '\n');
3229
+ }
3230
+ else {
3231
+ await writeStdout(result.report + '\n');
3232
+ if (!isSilent) {
3233
+ const elapsed = (result.elapsed / 1000).toFixed(1);
3234
+ const cost = result.cost !== undefined ? ` | cost: $${result.cost.toFixed(4)}` : '';
3235
+ process.stderr.write(`\n📊 ${result.sourcesConsulted} sources consulted (${result.totalSourcesFound} found) | ${elapsed}s${cost}\n`);
3236
+ }
3237
+ }
3238
+ await cleanup();
3239
+ process.exit(0);
3240
+ }
3241
+ catch (error) {
3242
+ const msg = error instanceof Error ? error.message : 'Unknown error';
3243
+ if (isJson) {
3244
+ await writeStdout(JSON.stringify({ error: msg, code: 'RESEARCH_FAILED' }) + '\n');
3245
+ }
3246
+ else {
3247
+ console.error(`\nError: ${msg}`);
3248
+ }
3249
+ await cleanup();
3250
+ process.exit(1);
3251
+ }
3252
+ });
3162
3253
  program.parse();
3163
3254
  // ============================================================
3164
3255
  // Time formatting helper