webcontext-ai 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +583 -0
  3. package/dist/browser/manager.d.ts +47 -0
  4. package/dist/browser/manager.d.ts.map +1 -0
  5. package/dist/browser/manager.js +215 -0
  6. package/dist/browser/manager.js.map +1 -0
  7. package/dist/cache/cache.d.ts +22 -0
  8. package/dist/cache/cache.d.ts.map +1 -0
  9. package/dist/cache/cache.js +150 -0
  10. package/dist/cache/cache.js.map +1 -0
  11. package/dist/chunking/chunker.d.ts +26 -0
  12. package/dist/chunking/chunker.d.ts.map +1 -0
  13. package/dist/chunking/chunker.js +208 -0
  14. package/dist/chunking/chunker.js.map +1 -0
  15. package/dist/cli/index.d.ts +3 -0
  16. package/dist/cli/index.d.ts.map +1 -0
  17. package/dist/cli/index.js +406 -0
  18. package/dist/cli/index.js.map +1 -0
  19. package/dist/core/pipeline.d.ts +35 -0
  20. package/dist/core/pipeline.d.ts.map +1 -0
  21. package/dist/core/pipeline.js +476 -0
  22. package/dist/core/pipeline.js.map +1 -0
  23. package/dist/core/stream.d.ts +48 -0
  24. package/dist/core/stream.d.ts.map +1 -0
  25. package/dist/core/stream.js +72 -0
  26. package/dist/core/stream.js.map +1 -0
  27. package/dist/core/types.d.ts +259 -0
  28. package/dist/core/types.d.ts.map +1 -0
  29. package/dist/core/types.js +4 -0
  30. package/dist/core/types.js.map +1 -0
  31. package/dist/export/index.d.ts +3 -0
  32. package/dist/export/index.d.ts.map +1 -0
  33. package/dist/export/index.js +8 -0
  34. package/dist/export/index.js.map +1 -0
  35. package/dist/export/templates.d.ts +25 -0
  36. package/dist/export/templates.d.ts.map +1 -0
  37. package/dist/export/templates.js +76 -0
  38. package/dist/export/templates.js.map +1 -0
  39. package/dist/export/vectordb.d.ts +21 -0
  40. package/dist/export/vectordb.d.ts.map +1 -0
  41. package/dist/export/vectordb.js +101 -0
  42. package/dist/export/vectordb.js.map +1 -0
  43. package/dist/extractors/content.d.ts +23 -0
  44. package/dist/extractors/content.d.ts.map +1 -0
  45. package/dist/extractors/content.js +328 -0
  46. package/dist/extractors/content.js.map +1 -0
  47. package/dist/extractors/github.d.ts +19 -0
  48. package/dist/extractors/github.d.ts.map +1 -0
  49. package/dist/extractors/github.js +150 -0
  50. package/dist/extractors/github.js.map +1 -0
  51. package/dist/extractors/images.d.ts +20 -0
  52. package/dist/extractors/images.d.ts.map +1 -0
  53. package/dist/extractors/images.js +73 -0
  54. package/dist/extractors/images.js.map +1 -0
  55. package/dist/extractors/pdf.d.ts +11 -0
  56. package/dist/extractors/pdf.d.ts.map +1 -0
  57. package/dist/extractors/pdf.js +107 -0
  58. package/dist/extractors/pdf.js.map +1 -0
  59. package/dist/extractors/screenshot.d.ts +21 -0
  60. package/dist/extractors/screenshot.d.ts.map +1 -0
  61. package/dist/extractors/screenshot.js +85 -0
  62. package/dist/extractors/screenshot.js.map +1 -0
  63. package/dist/index.d.ts +70 -0
  64. package/dist/index.d.ts.map +1 -0
  65. package/dist/index.js +206 -0
  66. package/dist/index.js.map +1 -0
  67. package/dist/mcp-server.d.ts +3 -0
  68. package/dist/mcp-server.d.ts.map +1 -0
  69. package/dist/mcp-server.js +108 -0
  70. package/dist/mcp-server.js.map +1 -0
  71. package/dist/sdk/client.d.ts +48 -0
  72. package/dist/sdk/client.d.ts.map +1 -0
  73. package/dist/sdk/client.js +120 -0
  74. package/dist/sdk/client.js.map +1 -0
  75. package/dist/sdk/mcp.d.ts +12 -0
  76. package/dist/sdk/mcp.d.ts.map +1 -0
  77. package/dist/sdk/mcp.js +146 -0
  78. package/dist/sdk/mcp.js.map +1 -0
  79. package/dist/sdk/server.d.ts +5 -0
  80. package/dist/sdk/server.d.ts.map +1 -0
  81. package/dist/sdk/server.js +158 -0
  82. package/dist/sdk/server.js.map +1 -0
  83. package/dist/search/vector.d.ts +26 -0
  84. package/dist/search/vector.d.ts.map +1 -0
  85. package/dist/search/vector.js +142 -0
  86. package/dist/search/vector.js.map +1 -0
  87. package/dist/transformers/markdown.d.ts +21 -0
  88. package/dist/transformers/markdown.d.ts.map +1 -0
  89. package/dist/transformers/markdown.js +242 -0
  90. package/dist/transformers/markdown.js.map +1 -0
  91. package/dist/utils/dedup.d.ts +20 -0
  92. package/dist/utils/dedup.d.ts.map +1 -0
  93. package/dist/utils/dedup.js +61 -0
  94. package/dist/utils/dedup.js.map +1 -0
  95. package/dist/utils/index.d.ts +6 -0
  96. package/dist/utils/index.d.ts.map +1 -0
  97. package/dist/utils/index.js +15 -0
  98. package/dist/utils/index.js.map +1 -0
  99. package/dist/utils/metrics.d.ts +16 -0
  100. package/dist/utils/metrics.d.ts.map +1 -0
  101. package/dist/utils/metrics.js +28 -0
  102. package/dist/utils/metrics.js.map +1 -0
  103. package/dist/utils/scheduler.d.ts +19 -0
  104. package/dist/utils/scheduler.d.ts.map +1 -0
  105. package/dist/utils/scheduler.js +63 -0
  106. package/dist/utils/scheduler.js.map +1 -0
  107. package/dist/utils/sitemap.d.ts +17 -0
  108. package/dist/utils/sitemap.d.ts.map +1 -0
  109. package/dist/utils/sitemap.js +118 -0
  110. package/dist/utils/sitemap.js.map +1 -0
  111. package/dist/utils/validation.d.ts +142 -0
  112. package/dist/utils/validation.d.ts.map +1 -0
  113. package/dist/utils/validation.js +35 -0
  114. package/dist/utils/validation.js.map +1 -0
  115. package/dist/utils/webhook.d.ts +21 -0
  116. package/dist/utils/webhook.d.ts.map +1 -0
  117. package/dist/utils/webhook.js +108 -0
  118. package/dist/utils/webhook.js.map +1 -0
  119. package/package.json +109 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 sumeethmoolya
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,583 @@
1
+ # WebContext AI
2
+
3
+ > Turn any web content into clean AI-ready context — with crawling, chunking, semantic search, vector DB export, and MCP tools.
4
+
5
+ WebContext is a developer tool that crawls, extracts, cleans, and structures web content for consumption by LLMs, RAG pipelines, and AI agents. Think of it as Firecrawl — but open-source, self-hosted, and optimized for developer documentation.
6
+
7
+ ## Features
8
+
9
+ - **Smart Extraction** — Removes ads, navigation, cookie banners, and noise automatically
10
+ - **Code Preservation** — Keeps code blocks intact with language detection (15+ languages)
11
+ - **Recursive Crawling** — Crawl entire documentation sites with depth control and sitemap support
12
+ - **Token-Aware Chunking** — Semantic, heading-based, paragraph, or fixed-size chunking using tiktoken
13
+ - **Semantic Search** — TF-IDF vector search over extracted content chunks
14
+ - **Vector DB Export** — Export chunks ready for Pinecone, Chroma, Weaviate, Qdrant
15
+ - **PDF Extraction** — Extract text from PDF files and URLs
16
+ - **GitHub Extraction** — Fetch README and /docs from any GitHub repository
17
+ - **Screenshot Capture** — Take full-page screenshots of web pages
18
+ - **Image Extraction** — Extract images with alt text and surrounding context
19
+ - **Streaming** — Real-time event-based output as pages are crawled
20
+ - **Output Templates** — Built-in templates (default, LLM, XML tags, summary, minimal) or define your own
21
+ - **MCP Server** — Model Context Protocol tools for AI agents (Cursor, Claude, Amazon Q)
22
+ - **Browser Rendering** — Optional Playwright-powered JS rendering for SPAs
23
+ - **Rate Limiting** — Token bucket rate limiter with configurable requests/second
24
+ - **Retry with Backoff** — Exponential backoff on 429/5xx responses
25
+ - **robots.txt Compliance** — Respects robots.txt by default
26
+ - **Caching** — Dual-layer (LRU memory + file-based) with TTL and content diff detection
27
+ - **Content Diffing** — Detect what changed between crawls via content hashing
28
+ - **Deduplication** — Automatically skips duplicate content during crawls
29
+ - **Sitemap Auto-Discovery** — Finds and uses sitemaps automatically before crawling
30
+ - **Link Resolution** — Converts relative links to absolute URLs in output
31
+ - **Focus Modes** — Extract only articles, code, API references, or READMEs
32
+ - **Plugin System** — Hook into any phase of the pipeline (pre/post fetch, extract, transform, chunk)
33
+ - **Checkpoint/Resume** — Save crawl state to disk and resume interrupted crawls
34
+ - **Scheduling** — Cron-based recurring crawls for keeping context fresh
35
+ - **Webhooks** — Get notified when crawls complete or content changes
36
+ - **LangChain Compatible** — Document loader adapter included
37
+ - **Metrics** — Track crawl performance, cache hit rates, token usage
38
+ - **Input Validation** — Zod-based validation on all inputs
39
+
40
+ ## Quick Start
41
+
42
+ ```bash
43
+ npm install webcontext-ai
44
+ ```
45
+
46
+ > **Note:** WebContext works out of the box for most server-rendered sites. For JavaScript-heavy SPAs, you also need Playwright:
47
+ > ```bash
48
+ > npm install playwright
49
+ > npx playwright install chromium
50
+ > ```
51
+ > Then pass `{ javascript: true }` to enable browser rendering.
52
+
53
+ > **Optional extras:**
54
+ > ```bash
55
+ > npm install pdf-parse # For PDF extraction
56
+ > npm install playwright # For screenshots & JS rendering
57
+ > ```
58
+
59
+ ## CLI Usage
60
+
61
+ ```bash
62
+ # Extract a single page as markdown
63
+ webcontext extract https://docs.example.com/api --format markdown
64
+
65
+ # Crawl documentation recursively
66
+ webcontext crawl https://docs.example.com --depth 3 --max-pages 100 -o docs.md
67
+
68
+ # Generate LLM-ready context with token budget
69
+ webcontext context https://docs.example.com/quickstart --budget 4000
70
+
71
+ # Semantic search within a page
72
+ webcontext search https://docs.example.com/api "authentication"
73
+
74
+ # Export for vector database
75
+ webcontext export https://docs.example.com --to pinecone -o chunks.json
76
+ webcontext export https://docs.example.com --to chroma --namespace my-docs
77
+
78
+ # Extract GitHub repository
79
+ webcontext github https://github.com/user/repo -o repo-docs.md
80
+
81
+ # Extract PDF
82
+ webcontext pdf https://example.com/paper.pdf -o paper.md
83
+ webcontext pdf ./local-file.pdf -o extracted.md
84
+
85
+ # Take screenshot
86
+ webcontext screenshot https://docs.example.com -o ./screenshots --full-page
87
+
88
+ # Validate a URL
89
+ webcontext validate https://docs.example.com
90
+
91
+ # Schedule recurring crawls
92
+ webcontext schedule https://docs.example.com --cron "0 */6 * * *" -o ./docs-cache
93
+
94
+ # Start API server
95
+ webcontext serve --port 3456
96
+ ```
97
+
98
+ ## SDK Usage
99
+
100
+ ```typescript
101
+ import { WebContext } from 'webcontext-ai';
102
+
103
+ const wc = new WebContext({
104
+ cache: { enabled: true, ttl: 3600, maxSize: 500, contentHashing: true },
105
+ chunking: { maxTokens: 1500, strategy: 'semantic', overlap: 100 },
106
+ concurrency: 5,
107
+ metrics: true,
108
+ });
109
+
110
+ // Extract single page
111
+ const result = await wc.extract('https://docs.example.com/api');
112
+ console.log(result.pages[0].markdown);
113
+
114
+ // Crawl documentation site
115
+ const docs = await wc.crawlDocs('https://docs.example.com', {
116
+ depth: 2,
117
+ maxPages: 50,
118
+ onProgress: (p) => console.log(`${p.pagesProcessed}/${p.totalDiscovered}`),
119
+ });
120
+
121
+ // Get RAG-ready chunks
122
+ const chunks = await wc.toChunks('https://docs.example.com/guide');
123
+
124
+ // Generate token-budgeted context for LLM
125
+ const context = await wc.toContext('https://docs.example.com', { maxTokens: 4000 });
126
+
127
+ // Semantic search
128
+ const results = await wc.search('https://docs.example.com/api', 'authentication', 5);
129
+
130
+ // Extract GitHub repo
131
+ const repo = await wc.extractGitHub('https://github.com/user/repo');
132
+
133
+ // Extract PDF
134
+ const pdf = await wc.extractPdf('https://example.com/paper.pdf');
135
+
136
+ // Export for vector DB
137
+ const pineconeData = await wc.exportForVectorDB('https://docs.example.com', {
138
+ format: 'pinecone',
139
+ namespace: 'my-docs',
140
+ });
141
+
142
+ // Stream results in real-time
143
+ const stream = wc.extractStream('https://docs.example.com');
144
+ stream.onPage((page) => console.log(`Extracted: ${page.title}`));
145
+ stream.onDone((result) => console.log(`Done! ${result.stats.totalTokens} tokens`));
146
+
147
+ // Webhooks
148
+ wc.registerWebhook({
149
+ url: 'https://your-server.com/webhook',
150
+ events: ['crawl.complete', 'content.changed'],
151
+ secret: 'your-secret',
152
+ });
153
+
154
+ // Cleanup
155
+ wc.dispose();
156
+ ```
157
+
158
+ ## Vector DB Export
159
+
160
+ Export chunks in formats ready for direct import into popular vector databases:
161
+
162
+ ```typescript
163
+ import { WebContext } from 'webcontext-ai';
164
+
165
+ const wc = new WebContext();
166
+ const result = await wc.extract('https://docs.example.com');
167
+
168
+ // Export as Pinecone format
169
+ const pinecone = await wc.exportForVectorDB('https://docs.example.com', { format: 'pinecone', namespace: 'docs' });
170
+
171
+ // Export as Chroma format
172
+ const chroma = await wc.exportForVectorDB('https://docs.example.com', { format: 'chroma', collection: 'my-docs' });
173
+
174
+ // Supported formats: pinecone, chroma, weaviate, qdrant, json
175
+ ```
176
+
177
+ CLI:
178
+ ```bash
179
+ webcontext export https://docs.example.com --to pinecone -o pinecone-chunks.json
180
+ webcontext export https://docs.example.com --to chroma --namespace docs -o chroma-chunks.json
181
+ ```
182
+
183
+ ## Output Templates
184
+
185
+ Format extracted content using built-in or custom templates:
186
+
187
+ ```typescript
188
+ import { OutputFormatter } from 'webcontext-ai';
189
+
190
+ const fmt = new OutputFormatter();
191
+
192
+ // Built-in templates: default, llm, xml-tags, summary, minimal
193
+ fmt.formatPage(page, 'llm');
194
+ // Output: <context source="https://..." tokens="1234">...content...</context>
195
+
196
+ fmt.formatPage(page, 'xml-tags');
197
+ // Output: <document><title>...</title><source>...</source><content>...</content></document>
198
+
199
+ // Register custom template
200
+ fmt.register({
201
+ name: 'my-format',
202
+ template: '---\ntitle: {{title}}\nsource: {{url}}\n---\n\n{{markdown}}',
203
+ });
204
+ fmt.formatPage(page, 'my-format');
205
+ ```
206
+
207
+ ## MCP Tools (AI Agent Integration)
208
+
209
+ Use WebContext as a tool inside **Cursor**, **Claude Desktop**, **Amazon Q Developer**, or any MCP-compatible AI agent.
210
+
211
+ ### Setup for Claude Desktop
212
+
213
+ Add to your `claude_desktop_config.json`:
214
+
215
+ ```json
216
+ {
217
+ "mcpServers": {
218
+ "webcontext": {
219
+ "command": "npx",
220
+ "args": ["-y", "webcontext-ai", "webcontext-mcp"]
221
+ }
222
+ }
223
+ }
224
+ ```
225
+
226
+ ### Setup for Cursor
227
+
228
+ Add to `.cursor/mcp.json` in your project:
229
+
230
+ ```json
231
+ {
232
+ "mcpServers": {
233
+ "webcontext": {
234
+ "command": "npx",
235
+ "args": ["-y", "webcontext-ai", "webcontext-mcp"]
236
+ }
237
+ }
238
+ }
239
+ ```
240
+
241
+ ### Setup for Amazon Q Developer / Kiro
242
+
243
+ Add to your MCP configuration:
244
+
245
+ ```json
246
+ {
247
+ "mcpServers": {
248
+ "webcontext": {
249
+ "command": "npx",
250
+ "args": ["-y", "webcontext-ai", "webcontext-mcp"]
251
+ }
252
+ }
253
+ }
254
+ ```
255
+
256
+ ### Available MCP Tools
257
+
258
+ | Tool | Description | Example Prompt |
259
+ |------|-------------|----------------|
260
+ | `webcontext_extract` | Extract clean content from a URL | "Extract the React docs for useState" |
261
+ | `webcontext_crawl` | Crawl a documentation site | "Crawl the Express.js guide, 3 levels deep" |
262
+ | `webcontext_search` | Semantic search within a page | "Search the Next.js docs for 'server components'" |
263
+ | `webcontext_chunk` | Get RAG-ready chunks | "Chunk the TailwindCSS docs for my vector DB" |
264
+ | `webcontext_summarize` | Summarize a web page | "Summarize this API reference page" |
265
+ | `webcontext_github` | Extract GitHub repo docs | "Get the README from TanStack/query" |
266
+ | `webcontext_pdf` | Extract PDF content | "Extract text from this research paper PDF" |
267
+
268
+ ## Streaming
269
+
270
+ Get results in real-time as pages are processed:
271
+
272
+ ```typescript
273
+ const stream = wc.extractStream('https://docs.example.com');
274
+
275
+ stream.onPage((page) => {
276
+ console.log(`✓ ${page.title} (${page.codeBlocks.length} code blocks)`);
277
+ });
278
+
279
+ stream.onProgress((p) => {
280
+ console.log(`${p.pagesProcessed}/${p.totalDiscovered} - ${p.currentUrl}`);
281
+ });
282
+
283
+ stream.onDone((result) => {
284
+ console.log(`Complete: ${result.stats.totalTokens} tokens`);
285
+ });
286
+
287
+ // Or await completion
288
+ const result = await stream.toPromise();
289
+ ```
290
+
291
+ ## GitHub Extraction
292
+
293
+ Extract README and documentation from any public GitHub repository:
294
+
295
+ ```typescript
296
+ // Just the README
297
+ const readme = await wc.extractGitHub('https://github.com/TanStack/query');
298
+
299
+ // README + /docs folder
300
+ const full = await wc.extractGitHub('https://github.com/TanStack/query', { depth: 1 });
301
+ ```
302
+
303
+ CLI:
304
+ ```bash
305
+ webcontext github https://github.com/expressjs/express -o express-docs.md
306
+ ```
307
+
308
+ ## PDF Extraction
309
+
310
+ Extract text from local PDF files or PDF URLs (requires `npm install pdf-parse`):
311
+
312
+ ```typescript
313
+ // From URL
314
+ const paper = await wc.extractPdf('https://example.com/paper.pdf');
315
+
316
+ // From local file
317
+ const local = await wc.extractPdf('./documents/spec.pdf');
318
+ ```
319
+
320
+ CLI:
321
+ ```bash
322
+ webcontext pdf https://arxiv.org/pdf/1706.03762 -o transformer-paper.md
323
+ webcontext pdf ./local-file.pdf --format chunks -o chunks.json
324
+ ```
325
+
326
+ ## Webhooks
327
+
328
+ Get notified when crawls complete or content changes:
329
+
330
+ ```typescript
331
+ wc.registerWebhook({
332
+ url: 'https://your-server.com/webhook',
333
+ secret: 'hmac-secret', // Signs payload with HMAC-SHA256
334
+ events: ['crawl.complete', 'crawl.error', 'content.changed'],
335
+ });
336
+ ```
337
+
338
+ Webhook payload example:
339
+ ```json
340
+ {
341
+ "event": "content.changed",
342
+ "timestamp": "2024-01-15T10:30:00Z",
343
+ "data": {
344
+ "changedPages": 3,
345
+ "diffs": [
346
+ { "url": "https://docs.example.com/api", "addedSections": ["New Endpoint"], "removedSections": [] }
347
+ ]
348
+ }
349
+ }
350
+ ```
351
+
352
+ ## Client SDK (Remote Server)
353
+
354
+ ```typescript
355
+ import { WebContextClient } from 'webcontext-ai/sdk/client';
356
+
357
+ const client = new WebContextClient({ serverUrl: 'http://localhost:3456' });
358
+ const markdown = await client.toMarkdown('https://example.com');
359
+ const results = await client.search('https://example.com', 'pricing', 3);
360
+ ```
361
+
362
+ ## LangChain Integration
363
+
364
+ ```typescript
365
+ import { WebContextLoader } from 'webcontext-ai/sdk/client';
366
+
367
+ const loader = new WebContextLoader();
368
+ const docs = await loader.load('https://docs.example.com/guide');
369
+ // Returns LangChain-compatible Document[] with pageContent + metadata
370
+ ```
371
+
372
+ ## Plugin System
373
+
374
+ ```typescript
375
+ import { WebContext, WebContextPlugin } from 'webcontext-ai';
376
+
377
+ const myPlugin: WebContextPlugin = {
378
+ name: 'custom-cleaner',
379
+ hooks: {
380
+ 'post-extract': async (ctx) => {
381
+ ctx.extracted.markdown = ctx.extracted.markdown.replace(/CONFIDENTIAL/g, '[REDACTED]');
382
+ return ctx;
383
+ },
384
+ 'post-chunk': async (ctx) => {
385
+ ctx.chunks = ctx.chunks.filter(c => c.tokens > 50);
386
+ return ctx;
387
+ },
388
+ },
389
+ };
390
+
391
+ const wc = new WebContext({ plugins: [myPlugin] });
392
+ ```
393
+
394
+ ## API Server
395
+
396
+ ```bash
397
+ webcontext serve --port 3456
398
+ ```
399
+
400
+ | Method | Path | Description |
401
+ |--------|------|-------------|
402
+ | POST | `/extract` | Extract content from a single URL |
403
+ | POST | `/crawl` | Recursively crawl a documentation site |
404
+ | POST | `/context` | Generate LLM-ready context with token budget |
405
+ | POST | `/chunks` | Get RAG-ready content chunks |
406
+ | POST | `/search` | Semantic search within extracted content |
407
+ | GET | `/metrics` | View crawl metrics |
408
+ | POST | `/schedule` | Schedule recurring crawls |
409
+ | DELETE | `/schedule/:id` | Cancel a scheduled job |
410
+ | GET | `/health` | Health check |
411
+
412
+ ## Configuration
413
+
414
+ ```typescript
415
+ const wc = new WebContext({
416
+ browser: {
417
+ headless: true,
418
+ proxy: 'http://proxy:8080',
419
+ userAgent: 'MyBot/1.0',
420
+ viewport: { width: 1280, height: 720 },
421
+ },
422
+ extraction: {
423
+ removeSelectors: ['.sidebar', '.footer'],
424
+ contentSelectors: ['.doc-content'],
425
+ preserveImages: true,
426
+ preserveTables: true,
427
+ },
428
+ chunking: {
429
+ maxTokens: 1500,
430
+ overlap: 100,
431
+ strategy: 'semantic', // 'semantic' | 'heading' | 'fixed' | 'paragraph'
432
+ preserveCodeBlocks: true,
433
+ preserveHeadings: true,
434
+ },
435
+ cache: {
436
+ enabled: true,
437
+ ttl: 3600,
438
+ maxSize: 500,
439
+ directory: './.webcontext-cache',
440
+ contentHashing: true,
441
+ },
442
+ retry: {
443
+ maxRetries: 3,
444
+ backoffMs: 1000,
445
+ backoffMultiplier: 2,
446
+ retryOn: [429, 500, 502, 503, 504],
447
+ },
448
+ rateLimit: {
449
+ requestsPerSecond: 2,
450
+ burstSize: 5,
451
+ },
452
+ concurrency: 3,
453
+ metrics: true,
454
+ plugins: [],
455
+ });
456
+ ```
457
+
458
+ ## Real-World Examples
459
+
460
+ ### Feed documentation into your AI chatbot (RAG)
461
+
462
+ ```typescript
463
+ import { WebContext } from 'webcontext-ai';
464
+
465
+ const wc = new WebContext();
466
+ const result = await wc.crawlDocs('https://your-docs.com', { depth: 3, maxPages: 100 });
467
+
468
+ // Export directly for your vector DB
469
+ const pineconeData = await wc.exportForVectorDB('https://your-docs.com', {
470
+ format: 'pinecone',
471
+ namespace: 'product-docs',
472
+ });
473
+ // Write to file and import via Pinecone CLI/API
474
+ ```
475
+
476
+ ### Keep AI context fresh with scheduled re-crawls
477
+
478
+ ```typescript
479
+ import { WebContext, CrawlScheduler } from 'webcontext-ai';
480
+
481
+ const wc = new WebContext();
482
+ const scheduler = new CrawlScheduler();
483
+
484
+ scheduler.schedule('docs-sync', {
485
+ cron: '0 */6 * * *',
486
+ urls: ['https://your-docs.com'],
487
+ options: { depth: 2 },
488
+ onComplete: (result) => {
489
+ if (result.diffs?.length) {
490
+ console.log(`${result.diffs.length} pages changed — re-indexing`);
491
+ }
492
+ },
493
+ }, (url, opts) => wc.crawlDocs(url, opts));
494
+ ```
495
+
496
+ ### Use in a Cursor/Claude workflow
497
+
498
+ Just ask your AI agent:
499
+ - *"Use webcontext to extract the Next.js App Router docs and explain how layouts work"*
500
+ - *"Crawl the Stripe API reference and summarize the payment intents section"*
501
+ - *"Search the React docs for information about useEffect cleanup"*
502
+
503
+ The agent calls the MCP tools automatically.
504
+
505
+ ## Troubleshooting
506
+
507
+ ### "Executable doesn't exist" / Playwright errors
508
+
509
+ Playwright is only needed for JavaScript rendering (`{ javascript: true }`) and screenshots. Most sites work without it.
510
+
511
+ ```bash
512
+ npm install playwright && npx playwright install chromium
513
+ ```
514
+
515
+ ### "fetch failed" / SSL certificate errors
516
+
517
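+ Common in corporate environments that intercept TLS with a private certificate authority. The preferred fix is to point `NODE_EXTRA_CA_CERTS` at your corporate CA bundle. As a temporary, insecure workaround (it disables certificate verification for the whole process — never use it in production) you can set: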
518
+
519
+ ```bash
520
+ # Windows
521
+ set NODE_TLS_REJECT_UNAUTHORIZED=0
522
+
523
+ # Mac/Linux
524
+ export NODE_TLS_REJECT_UNAUTHORIZED=0
525
+ ```
526
+
527
+ ### Empty extraction / "No pages extracted"
528
+
529
+ 1. **SPA sites** (React/Vue/Angular) need `{ javascript: true }` + Playwright
530
+ 2. **Landing pages** have little content — target specific doc pages
531
+ 3. **Blocked by WAF** — retry with custom request headers
532
+
533
+ ### "pdf-parse is required"
534
+
535
+ ```bash
536
+ npm install pdf-parse
537
+ ```
538
+
539
+ ## Architecture
540
+
541
+ ```
542
+ URL → Sitemap Discovery → URL Queue
543
+
544
+ [PDF?] → PDF Extractor
545
+ [GitHub?] → GitHub Extractor
546
+ [Web?] → Browser Manager (fetch/Playwright)
547
+
548
+ Content Extractor (Cheerio + heuristics)
549
+
550
+ Markdown Transformer (Turndown)
551
+
552
+ Deduplication Check
553
+
554
+ Content Chunker (tiktoken, 4 strategies)
555
+
556
+ ┌─────────────────────────────────────┐
557
+ │ Vector Search │ Vector DB Export │
558
+ │ Streaming │ Output Templates │
559
+ │ Cache + Diff │ Webhooks │
560
+ └─────────────────────────────────────┘
561
+
562
+ CLI │ REST API │ SDK │ MCP Server │ LangChain
563
+ ```
564
+
565
+ ## Tech Stack
566
+
567
+ | Component | Technology |
568
+ |-----------|-----------|
569
+ | Browser rendering | Playwright (optional, lazy-loaded) |
570
+ | HTML parsing | Cheerio |
571
+ | Markdown conversion | Turndown (custom rules) |
572
+ | Token counting | tiktoken (cl100k_base) |
573
+ | Vector search | TF-IDF with cosine similarity |
574
+ | PDF parsing | pdf-parse (optional) |
575
+ | HTTP server | Express |
576
+ | CLI | Commander |
577
+ | Caching | LRU-Cache + File-based |
578
+ | Validation | Zod |
579
+ | Rate limiting | Token bucket algorithm |
580
+
581
+ ## License
582
+
583
+ MIT
@@ -0,0 +1,47 @@
1
+ /// <reference types="node" />
2
+ import { BrowserConfig, RetryConfig, RateLimitConfig } from '../core/types';
3
+ /**
4
+ * Browser manager using Playwright for JS-heavy page rendering.
5
+ * Handles rate limiting, retry with backoff, and robots.txt compliance.
6
+ */
7
+ export declare class BrowserManager {
8
+ private browser;
9
+ private context;
10
+ private config;
11
+ private rateLimitConfig;
12
+ private robotsCache;
13
+ private tokens;
14
+ private lastRefill;
15
+ constructor(config?: BrowserConfig, rateLimitConfig?: RateLimitConfig);
16
+ private refillTokens;
17
+ private waitForToken;
18
+ launch(): Promise<void>;
19
+ checkRobots(url: string): Promise<boolean>;
20
+ fetchWithRetry<T>(fn: () => Promise<T>, retryConfig?: RetryConfig): Promise<T>;
21
+ fetchPage(url: string, options?: {
22
+ respectRobots?: boolean;
23
+ waitForSelector?: string;
24
+ timeout?: number;
25
+ cookies?: Array<{
26
+ name: string;
27
+ value: string;
28
+ domain: string;
29
+ path?: string;
30
+ }>;
31
+ headers?: Record<string, string>;
32
+ retryConfig?: RetryConfig;
33
+ }): Promise<{
34
+ content: string;
35
+ status: number;
36
+ }>;
37
+ fetchStatic(url: string, options?: {
38
+ respectRobots?: boolean;
39
+ headers?: Record<string, string>;
40
+ retryConfig?: RetryConfig;
41
+ }): Promise<{
42
+ body: Buffer;
43
+ status: number;
44
+ }>;
45
+ close(): Promise<void>;
46
+ }
47
+ //# sourceMappingURL=manager.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"manager.d.ts","sourceRoot":"","sources":["../../src/browser/manager.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,aAAa,EAAE,WAAW,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAK5E;;;GAGG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,OAAO,CAAa;IAC5B,OAAO,CAAC,OAAO,CAAa;IAC5B,OAAO,CAAC,MAAM,CAAgB;IAC9B,OAAO,CAAC,eAAe,CAAkB;IACzC,OAAO,CAAC,WAAW,CAA+B;IAClD,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,UAAU,CAAS;gBAEf,MAAM,GAAE,aAAkB,EAAE,eAAe,CAAC,EAAE,eAAe;IAYzE,OAAO,CAAC,YAAY;YAON,YAAY;IAYpB,MAAM,IAAI,OAAO,CAAC,IAAI,CAAC;IAuBvB,WAAW,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAkB1C,cAAc,CAAC,CAAC,EAAE,EAAE,EAAE,MAAM,OAAO,CAAC,CAAC,CAAC,EAAE,WAAW,GAAE,WAA2B,GAAG,OAAO,CAAC,CAAC,CAAC;IAe7F,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE;QACpC,aAAa,CAAC,EAAE,OAAO,CAAC;QACxB,eAAe,CAAC,EAAE,MAAM,CAAC;QACzB,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,OAAO,CAAC,EAAE,KAAK,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,KAAK,EAAE,MAAM,CAAC;YAAC,MAAM,EAAE,MAAM,CAAC;YAAC,IAAI,CAAC,EAAE,MAAM,CAAA;SAAE,CAAC,CAAC;QAChF,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACjC,WAAW,CAAC,EAAE,WAAW,CAAC;KACtB,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;IAyD/C,WAAW,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE;QACtC,aAAa,CAAC,EAAE,OAAO,CAAC;QACxB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACjC,WAAW,CAAC,EAAE,WAAW,CAAC;KACtB,GAAG,OAAO,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;IA2B5C,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAW7B"}