@memvid/maw 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +188 -0
- package/dist/bin/maw.d.ts +6 -0
- package/dist/bin/maw.d.ts.map +1 -0
- package/dist/bin/maw.js +275 -0
- package/dist/bin/maw.js.map +1 -0
- package/dist/src/crawler/index.d.ts +71 -0
- package/dist/src/crawler/index.d.ts.map +1 -0
- package/dist/src/crawler/index.js +249 -0
- package/dist/src/crawler/index.js.map +1 -0
- package/dist/src/crawler/robots.d.ts +26 -0
- package/dist/src/crawler/robots.d.ts.map +1 -0
- package/dist/src/crawler/robots.js +179 -0
- package/dist/src/crawler/robots.js.map +1 -0
- package/dist/src/crawler/sitemap.d.ts +36 -0
- package/dist/src/crawler/sitemap.d.ts.map +1 -0
- package/dist/src/crawler/sitemap.js +209 -0
- package/dist/src/crawler/sitemap.js.map +1 -0
- package/dist/src/engine/detector.d.ts +18 -0
- package/dist/src/engine/detector.d.ts.map +1 -0
- package/dist/src/engine/detector.js +155 -0
- package/dist/src/engine/detector.js.map +1 -0
- package/dist/src/engine/fetch.d.ts +18 -0
- package/dist/src/engine/fetch.d.ts.map +1 -0
- package/dist/src/engine/fetch.js +53 -0
- package/dist/src/engine/fetch.js.map +1 -0
- package/dist/src/engine/index.d.ts +39 -0
- package/dist/src/engine/index.d.ts.map +1 -0
- package/dist/src/engine/index.js +116 -0
- package/dist/src/engine/index.js.map +1 -0
- package/dist/src/engine/playwright.d.ts +23 -0
- package/dist/src/engine/playwright.d.ts.map +1 -0
- package/dist/src/engine/playwright.js +88 -0
- package/dist/src/engine/playwright.js.map +1 -0
- package/dist/src/engine/rebrowser.d.ts +22 -0
- package/dist/src/engine/rebrowser.d.ts.map +1 -0
- package/dist/src/engine/rebrowser.js +142 -0
- package/dist/src/engine/rebrowser.js.map +1 -0
- package/dist/src/extractor/cleaner.d.ts +13 -0
- package/dist/src/extractor/cleaner.d.ts.map +1 -0
- package/dist/src/extractor/cleaner.js +122 -0
- package/dist/src/extractor/cleaner.js.map +1 -0
- package/dist/src/extractor/index.d.ts +29 -0
- package/dist/src/extractor/index.d.ts.map +1 -0
- package/dist/src/extractor/index.js +162 -0
- package/dist/src/extractor/index.js.map +1 -0
- package/dist/src/extractor/links.d.ts +22 -0
- package/dist/src/extractor/links.d.ts.map +1 -0
- package/dist/src/extractor/links.js +92 -0
- package/dist/src/extractor/links.js.map +1 -0
- package/dist/src/extractor/markdown.d.ts +13 -0
- package/dist/src/extractor/markdown.d.ts.map +1 -0
- package/dist/src/extractor/markdown.js +94 -0
- package/dist/src/extractor/markdown.js.map +1 -0
- package/dist/src/git/index.d.ts +40 -0
- package/dist/src/git/index.d.ts.map +1 -0
- package/dist/src/git/index.js +303 -0
- package/dist/src/git/index.js.map +1 -0
- package/dist/src/index.d.ts +103 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +229 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/ingestor/index.d.ts +95 -0
- package/dist/src/ingestor/index.d.ts.map +1 -0
- package/dist/src/ingestor/index.js +471 -0
- package/dist/src/ingestor/index.js.map +1 -0
- package/dist/src/utils/dedup.d.ts +66 -0
- package/dist/src/utils/dedup.d.ts.map +1 -0
- package/dist/src/utils/dedup.js +296 -0
- package/dist/src/utils/dedup.js.map +1 -0
- package/dist/src/utils/index.d.ts +3 -0
- package/dist/src/utils/index.d.ts.map +1 -0
- package/dist/src/utils/index.js +3 -0
- package/dist/src/utils/index.js.map +1 -0
- package/dist/src/utils/logger.d.ts +12 -0
- package/dist/src/utils/logger.d.ts.map +1 -0
- package/dist/src/utils/logger.js +49 -0
- package/dist/src/utils/logger.js.map +1 -0
- package/dist/src/utils/ui.d.ts +126 -0
- package/dist/src/utils/ui.d.ts.map +1 -0
- package/dist/src/utils/ui.js +357 -0
- package/dist/src/utils/ui.js.map +1 -0
- package/dist/src/utils/url.d.ts +21 -0
- package/dist/src/utils/url.d.ts.map +1 -0
- package/dist/src/utils/url.js +107 -0
- package/dist/src/utils/url.js.map +1 -0
- package/package.json +71 -0
package/README.md
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
<img src="s_maw.svg" alt="maw" width="400">
|
|
4
|
+
|
|
5
|
+
**Crawl any site. Search it forever.**
|
|
6
|
+
|
|
7
|
+
[npm package](https://www.npmjs.com/package/@memvid/maw)
|
|
8
|
+
[](https://www.npmjs.com/package/@memvid/maw)
|
|
9
|
+
[License](LICENSE)
|
|
10
|
+
|
|
11
|
+
[Install](#install) · [Commands](#commands) · [Examples](#examples) · [FAQ](#faq)
|
|
12
|
+
|
|
13
|
+
</div>
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
Feed the maw. It never forgets.
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
npx @memvid/maw https://stripe.com/docs
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
That's it. The entire Stripe docs are now in a 40MB file that you can search and ask questions about. Offline. Forever.
|
|
24
|
+
|
|
25
|
+
## Why?
|
|
26
|
+
|
|
27
|
+
Because you shouldn't need to keep 47 browser tabs open or bookmark links you'll never read again. Crawl once, query forever.
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
# later, when you actually need it
|
|
31
|
+
maw ask stripe.mv2 "how do webhooks work?"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
npm i -g @memvid/maw
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Or just use `npx @memvid/maw` without installing.
|
|
41
|
+
|
|
42
|
+
## What it does
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
maw https://react.dev → react.mv2 (312 pages, 18s)
|
|
46
|
+
maw https://docs.python.org → python.mv2 (2,847 pages, 4min)
|
|
47
|
+
maw . → repo.mv2 (your local git repo)
|
|
48
|
+
maw https://news.ycombinator.com/item?id=12345 → just that page
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Smart defaults:
|
|
52
|
+
- **Single page URL?** Fetches just that page
|
|
53
|
+
- **Domain root?** Crawls the whole site
|
|
54
|
+
- **Local path?** Reads your git repo
|
|
55
|
+
- **Protected site?** Auto-switches to stealth browser
|
|
56
|
+
|
|
57
|
+
## Commands
|
|
58
|
+
|
|
59
|
+
### Crawl
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
maw <url> # → maw.mv2
|
|
63
|
+
maw <url> -o docs.mv2 # custom output
|
|
64
|
+
maw <url> docs.mv2 # same thing (appends if exists)
|
|
65
|
+
maw <url> --depth 5 --max-pages 500 # go deeper
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Query
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
maw find docs.mv2 "authentication" # full-text search
|
|
72
|
+
maw ask docs.mv2 "how does X work?" # AI answer (needs OPENAI_API_KEY)
|
|
73
|
+
maw list docs.mv2 # see what's inside
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Preview
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
maw preview stripe.com # shows sitemap, estimated page count
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Export
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
maw export docs.mv2 -f markdown --out docs.md
|
|
86
|
+
maw export docs.mv2 -f json --out docs.json
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Embeddings
|
|
90
|
+
|
|
91
|
+
Want semantic search? Add `--embed`:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
maw https://docs.whatever.com --embed openai
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Uses OpenAI embeddings for semantic search. Costs ~$0.01 per 1000 pages. Without it, you get BM25 keyword search (still good, just different).
|
|
98
|
+
|
|
99
|
+
## How it works
|
|
100
|
+
|
|
101
|
+
Most sites work with a simple fetch. When that fails (Cloudflare, JS-heavy SPAs), maw falls back to a real browser. When *that* fails (aggressive anti-bot), it uses stealth mode.
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
fetch (fast) → playwright (slower) → rebrowser (stealth)
|
|
105
|
+
↓ ↓ ↓
|
|
106
|
+
works? blocked? blocked?
|
|
107
|
+
↓ ↓ ↓
|
|
108
|
+
done retry done
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
90% of sites never need the browser. The 10% that do, just work.
|
|
112
|
+
|
|
113
|
+
## Options
|
|
114
|
+
|
|
115
|
+
| Flag | Description | Default |
|
|
116
|
+
|------|-------------|---------|
|
|
117
|
+
| `-o, --output <file>` | Output file | `maw.mv2` |
|
|
118
|
+
| `-d, --depth <n>` | Crawl depth | auto (`0` for single pages, `2` for domains) |
|
|
119
|
+
| `-m, --max-pages <n>` | Max pages to crawl | `150` |
|
|
120
|
+
| `-c, --concurrency <n>` | Parallel requests | `10` |
|
|
121
|
+
| `-r, --rate-limit <n>` | Requests per second | `10` |
|
|
122
|
+
| `--include <regex>` | Only crawl matching URLs | - |
|
|
123
|
+
| `--exclude <regex>` | Skip matching URLs | - |
|
|
124
|
+
| `--browser` | Force browser mode | - |
|
|
125
|
+
| `--stealth` | Force stealth mode | - |
|
|
126
|
+
| `--embed [model]` | Enable embeddings | - |
|
|
127
|
+
| `--no-robots` | Ignore robots.txt | - |
|
|
128
|
+
| `--no-sitemap` | Skip sitemap discovery | - |
|
|
129
|
+
|
|
130
|
+
## Examples
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
# documentation sites
|
|
134
|
+
maw https://react.dev
|
|
135
|
+
maw https://docs.python.org
|
|
136
|
+
maw https://stripe.com/docs
|
|
137
|
+
|
|
138
|
+
# news/blogs
|
|
139
|
+
maw https://paulgraham.com/articles.html
|
|
140
|
+
maw "https://news.ycombinator.com/item?id=40000000"
|
|
141
|
+
|
|
142
|
+
# your own repos
|
|
143
|
+
maw . -o my-project.mv2
|
|
144
|
+
maw https://github.com/user/repo
|
|
145
|
+
|
|
146
|
+
# combine multiple sources
|
|
147
|
+
maw https://react.dev https://nextjs.org -o frontend.mv2
|
|
148
|
+
|
|
149
|
+
# deep crawl with embeddings
|
|
150
|
+
maw https://kubernetes.io/docs --depth 4 --max-pages 1000 --embed openai
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Limits
|
|
154
|
+
|
|
155
|
+
Files up to **50MB** work without any API key. That's roughly 500-2000 pages depending on content.
|
|
156
|
+
|
|
157
|
+
For bigger crawls, get a key at [memvid.com](https://memvid.com).
|
|
158
|
+
|
|
159
|
+
## FAQ
|
|
160
|
+
|
|
161
|
+
**Is this legal?**
|
|
162
|
+
|
|
163
|
+
Respects robots.txt by default. Use `--no-robots` at your own discretion.
|
|
164
|
+
|
|
165
|
+
**Why .mv2?**
|
|
166
|
+
|
|
167
|
+
It's a [memvid](https://memvid.com) file — single-file database with full-text search, embeddings, and temporal queries baked in. Think SQLite but for documents.
|
|
168
|
+
|
|
169
|
+
**Can I use it programmatically?**
|
|
170
|
+
|
|
171
|
+
```javascript
|
|
172
|
+
import { crawl, query } from '@memvid/maw'
|
|
173
|
+
|
|
174
|
+
await crawl('https://example.com', { output: 'site.mv2' })
|
|
175
|
+
const results = await query('site.mv2', 'search term')
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
**What about rate limiting?**
|
|
179
|
+
|
|
180
|
+
Default is 10 req/sec with automatic backoff. Most sites won't notice you. If you're hitting APIs, consider `--rate-limit 2`.
|
|
181
|
+
|
|
182
|
+
**Does it handle JavaScript-rendered content?**
|
|
183
|
+
|
|
184
|
+
Yes. If fetch fails, it automatically tries Playwright. For heavily protected sites, use `--stealth`.
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
[MIT License](LICENSE) · Built on [memvid](https://memvid.com)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"maw.d.ts","sourceRoot":"","sources":["../../bin/maw.ts"],"names":[],"mappings":";AAEA;;GAEG"}
|
package/dist/bin/maw.js
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* maw CLI - Feed the maw. It never forgets.
|
|
4
|
+
*/
|
|
5
|
+
import { existsSync } from 'fs';
|
|
6
|
+
import { Command } from 'commander';
|
|
7
|
+
import { maw, find, ask, list, preview, exportDocs } from '../src/index.js';
|
|
8
|
+
import { setLogMode } from '../src/utils/logger.js';
|
|
9
|
+
import * as ui from '../src/utils/ui.js';
|
|
10
|
+
const VERSION = '1.0.0';
|
|
11
|
+
const program = new Command();
|
|
12
|
+
program
|
|
13
|
+
.name('maw')
|
|
14
|
+
.description('Feed the maw. It never forgets.')
|
|
15
|
+
.version(VERSION);
|
|
16
|
+
// Main command: maw <urls...> [file.mv2]
|
|
17
|
+
program
|
|
18
|
+
.argument('[urls...]', 'URLs/repos to consume, optionally followed by target.mv2 to append')
|
|
19
|
+
.option('-o, --output <file>', 'Output .mv2 file', 'maw.mv2')
|
|
20
|
+
.option('-d, --depth <n>', 'Crawl depth (auto: 0 for pages, 2 for domains)')
|
|
21
|
+
.option('-c, --concurrency <n>', 'Concurrent requests', '10')
|
|
22
|
+
.option('-m, --max-pages <n>', 'Maximum pages to crawl (default: 150)')
|
|
23
|
+
.option('-r, --rate-limit <n>', 'Requests per second', '10')
|
|
24
|
+
.option('-t, --timeout <ms>', 'Request timeout in ms', '10000')
|
|
25
|
+
.option('--include <pattern>', 'URL pattern to include (regex)')
|
|
26
|
+
.option('--exclude <pattern>', 'URL pattern to exclude (regex)')
|
|
27
|
+
.option('--label <label>', 'Label for ingested documents', 'web')
|
|
28
|
+
.option('--memory <id>', 'Cloud memory ID to bind to (from memvid.com/dashboard)')
|
|
29
|
+
.option('--sitemap', 'Use sitemap.xml for discovery (default: true)')
|
|
30
|
+
.option('--no-sitemap', 'Disable sitemap discovery')
|
|
31
|
+
.option('--no-robots', 'Ignore robots.txt')
|
|
32
|
+
.option('--browser', 'Force browser mode (for JavaScript-heavy sites)')
|
|
33
|
+
.option('--stealth', 'Force stealth mode (bypasses anti-bot)')
|
|
34
|
+
.option('--embed [model]', 'Enable semantic embeddings (models: bge-small, openai, nvidia)')
|
|
35
|
+
.option('-q, --quiet', 'Minimal output')
|
|
36
|
+
.option('-v, --verbose', 'Verbose output')
|
|
37
|
+
.action(async (urls, options) => {
|
|
38
|
+
if (urls.length === 0) {
|
|
39
|
+
// Show banner and help
|
|
40
|
+
console.log(ui.banner());
|
|
41
|
+
program.help();
|
|
42
|
+
return;
|
|
43
|
+
}
|
|
44
|
+
setLogMode(options.quiet, options.verbose);
|
|
45
|
+
// Check if any argument is an .mv2 file (use as output target for appending)
|
|
46
|
+
// e.g., `maw https://example.com knowledge.mv2` or `maw knowledge.mv2 https://example.com`
|
|
47
|
+
const mv2Files = urls.filter((u) => u.endsWith('.mv2'));
|
|
48
|
+
const sources = urls.filter((u) => !u.endsWith('.mv2'));
|
|
49
|
+
// Determine output file: explicit -o flag > .mv2 in args > default
|
|
50
|
+
let outputFile = options.output;
|
|
51
|
+
if (mv2Files.length > 0 && options.output === 'maw.mv2') {
|
|
52
|
+
// Use the .mv2 file from args if no explicit -o was given
|
|
53
|
+
outputFile = mv2Files[0];
|
|
54
|
+
if (mv2Files.length > 1) {
|
|
55
|
+
console.error(ui.errorMessage('Only one .mv2 file can be specified as target'));
|
|
56
|
+
process.exit(1);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
if (sources.length === 0) {
|
|
60
|
+
console.error(ui.errorMessage('No URLs or sources provided'));
|
|
61
|
+
process.exit(1);
|
|
62
|
+
}
|
|
63
|
+
// Show header - detect git repos vs URLs
|
|
64
|
+
if (!options.quiet) {
|
|
65
|
+
const isGit = sources.some((u) => u.startsWith('https://github.com/') ||
|
|
66
|
+
u.startsWith('https://gitlab.com/') ||
|
|
67
|
+
u.includes('.git') ||
|
|
68
|
+
u.startsWith('.') ||
|
|
69
|
+
u.startsWith('/'));
|
|
70
|
+
const label = isGit ? 'maw (git)' : 'maw';
|
|
71
|
+
const urlDisplay = sources.length === 1 ? sources[0] : `${sources.length} sources`;
|
|
72
|
+
console.log(ui.header(label, urlDisplay));
|
|
73
|
+
// Show if appending to existing file
|
|
74
|
+
if (existsSync(outputFile)) {
|
|
75
|
+
console.log(ui.theme.info(` → Adding to ${outputFile}\n`));
|
|
76
|
+
}
|
|
77
|
+
// Show embedding mode if enabled
|
|
78
|
+
if (options.embed) {
|
|
79
|
+
const model = typeof options.embed === 'string' ? options.embed : 'bge-small';
|
|
80
|
+
console.log(ui.theme.info(` Semantic embeddings enabled (${model})`));
|
|
81
|
+
console.log(ui.theme.dim(' This improves search quality but takes longer.\n'));
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
try {
|
|
85
|
+
const result = await maw(sources, {
|
|
86
|
+
output: outputFile,
|
|
87
|
+
depth: options.depth ? parseInt(options.depth, 10) : undefined, // undefined triggers auto-detect
|
|
88
|
+
concurrency: parseInt(options.concurrency, 10),
|
|
89
|
+
maxPages: options.maxPages ? parseInt(options.maxPages, 10) : undefined,
|
|
90
|
+
rateLimit: parseInt(options.rateLimit, 10),
|
|
91
|
+
timeout: parseInt(options.timeout, 10),
|
|
92
|
+
includePattern: options.include ? new RegExp(options.include) : undefined,
|
|
93
|
+
excludePattern: options.exclude ? new RegExp(options.exclude) : undefined,
|
|
94
|
+
label: options.label,
|
|
95
|
+
memoryId: options.memory,
|
|
96
|
+
useSitemap: options.sitemap,
|
|
97
|
+
respectRobots: options.robots,
|
|
98
|
+
forceEngine: options.stealth ? 'rebrowser' : options.browser ? 'playwright' : undefined,
|
|
99
|
+
enableEmbedding: !!options.embed,
|
|
100
|
+
embeddingModel: typeof options.embed === 'string' ? options.embed : 'bge-small',
|
|
101
|
+
quiet: options.quiet,
|
|
102
|
+
verbose: options.verbose,
|
|
103
|
+
});
|
|
104
|
+
// Success output
|
|
105
|
+
console.log(ui.successMessage(result.output, result.size, result.pages, result.duration));
|
|
106
|
+
// Show dedup stats if any skipped
|
|
107
|
+
const dedupStats = result.stats.dedup;
|
|
108
|
+
if (dedupStats && (dedupStats.localeSkipped > 0 || dedupStats.similarSkipped > 0)) {
|
|
109
|
+
console.log(ui.dedupStats(dedupStats));
|
|
110
|
+
}
|
|
111
|
+
// Show engine stats in verbose mode
|
|
112
|
+
if (options.verbose) {
|
|
113
|
+
console.log(ui.engineStats(result.stats));
|
|
114
|
+
}
|
|
115
|
+
// Warnings and cloud sync status
|
|
116
|
+
if (result.stoppedAtLimit) {
|
|
117
|
+
console.log(ui.limitWarning());
|
|
118
|
+
}
|
|
119
|
+
else if (result.memoryId) {
|
|
120
|
+
console.log(ui.cloudSyncMessage(result.memoryId));
|
|
121
|
+
}
|
|
122
|
+
else if (!options.quiet) {
|
|
123
|
+
console.log(ui.theme.dim(' It will never forget.'));
|
|
124
|
+
}
|
|
125
|
+
console.log();
|
|
126
|
+
}
|
|
127
|
+
catch (error) {
|
|
128
|
+
console.error(ui.errorMessage(error.message));
|
|
129
|
+
process.exit(1);
|
|
130
|
+
}
|
|
131
|
+
});
|
|
132
|
+
// find command: maw find <file> <query>
|
|
133
|
+
program
|
|
134
|
+
.command('find <file> <query>')
|
|
135
|
+
.description('Search in an .mv2 file')
|
|
136
|
+
.option('-k, --top <n>', 'Number of results (default: 10)', '10')
|
|
137
|
+
.option('--json', 'Output as JSON')
|
|
138
|
+
.action(async (file, query, options) => {
|
|
139
|
+
try {
|
|
140
|
+
const results = await find(file, query, { k: parseInt(options.top, 10) });
|
|
141
|
+
if (options.json) {
|
|
142
|
+
console.log(JSON.stringify(results, null, 2));
|
|
143
|
+
return;
|
|
144
|
+
}
|
|
145
|
+
console.log();
|
|
146
|
+
console.log(ui.searchResults(results.hits || []));
|
|
147
|
+
}
|
|
148
|
+
catch (error) {
|
|
149
|
+
console.error(ui.errorMessage(error.message));
|
|
150
|
+
process.exit(1);
|
|
151
|
+
}
|
|
152
|
+
});
|
|
153
|
+
// ask command: maw ask <file> <question>
|
|
154
|
+
program
|
|
155
|
+
.command('ask <file> <question>')
|
|
156
|
+
.description('Ask a question using an .mv2 file')
|
|
157
|
+
.option('--model <model>', 'LLM model to use (default: gpt-4o-mini)', 'gpt-4o-mini')
|
|
158
|
+
.option('--api-key <key>', 'API key for the model')
|
|
159
|
+
.option('-k, --context <n>', 'Number of context chunks to retrieve (auto: 15 for overview questions, 8 otherwise)')
|
|
160
|
+
.option('--json', 'Output as JSON')
|
|
161
|
+
.action(async (file, question, options) => {
|
|
162
|
+
try {
|
|
163
|
+
const result = await ask(file, question, {
|
|
164
|
+
model: options.model,
|
|
165
|
+
apiKey: options.apiKey || process.env.OPENAI_API_KEY,
|
|
166
|
+
k: options.context ? parseInt(options.context, 10) : undefined, // Let ingestor decide default
|
|
167
|
+
});
|
|
168
|
+
if (options.json) {
|
|
169
|
+
console.log(JSON.stringify(result, null, 2));
|
|
170
|
+
return;
|
|
171
|
+
}
|
|
172
|
+
console.log(ui.askResult(result.answer, result.sources));
|
|
173
|
+
}
|
|
174
|
+
catch (error) {
|
|
175
|
+
console.error(ui.errorMessage(error.message));
|
|
176
|
+
process.exit(1);
|
|
177
|
+
}
|
|
178
|
+
});
|
|
179
|
+
// list command: maw list <file>
|
|
180
|
+
program
|
|
181
|
+
.command('list <file>')
|
|
182
|
+
.description('List documents in an .mv2 file')
|
|
183
|
+
.option('-l, --limit <n>', 'Number of documents to show (default: 20)', '20')
|
|
184
|
+
.option('--json', 'Output as JSON')
|
|
185
|
+
.action(async (file, options) => {
|
|
186
|
+
try {
|
|
187
|
+
const results = await list(file, { limit: parseInt(options.limit, 10) });
|
|
188
|
+
if (options.json) {
|
|
189
|
+
console.log(JSON.stringify(results, null, 2));
|
|
190
|
+
return;
|
|
191
|
+
}
|
|
192
|
+
const items = results.hits || results.frames || results;
|
|
193
|
+
if (Array.isArray(items) && items.length > 0) {
|
|
194
|
+
console.log(ui.listDocuments(items.map((item) => ({
|
|
195
|
+
title: item.title || item.preview?.slice(0, 60) || `Frame ${item.frame_id}`,
|
|
196
|
+
url: item.metadata?.url || item.uri,
|
|
197
|
+
preview: item.preview,
|
|
198
|
+
}))));
|
|
199
|
+
}
|
|
200
|
+
else {
|
|
201
|
+
console.log(`\n ${ui.theme.muted('No documents found.')}\n`);
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
catch (error) {
|
|
205
|
+
console.error(ui.errorMessage(error.message));
|
|
206
|
+
process.exit(1);
|
|
207
|
+
}
|
|
208
|
+
});
|
|
209
|
+
// preview command: maw preview <url> (alias: maw np <url>)
|
|
210
|
+
program
|
|
211
|
+
.command('preview <url>')
|
|
212
|
+
.alias('np')
|
|
213
|
+
.description('Preview available pages on a site (sitemap discovery)')
|
|
214
|
+
.option('-l, --limit <n>', 'Number of pages to show', '20')
|
|
215
|
+
.option('--json', 'Output as JSON')
|
|
216
|
+
.action(async (url, options) => {
|
|
217
|
+
try {
|
|
218
|
+
const result = await preview(url, { limit: parseInt(options.limit, 10) });
|
|
219
|
+
if (options.json) {
|
|
220
|
+
console.log(JSON.stringify(result, null, 2));
|
|
221
|
+
return;
|
|
222
|
+
}
|
|
223
|
+
console.log(ui.previewResults(result));
|
|
224
|
+
}
|
|
225
|
+
catch (error) {
|
|
226
|
+
console.error(ui.errorMessage(error.message));
|
|
227
|
+
process.exit(1);
|
|
228
|
+
}
|
|
229
|
+
});
|
|
230
|
+
// export command: maw export <file>
|
|
231
|
+
program
|
|
232
|
+
.command('export <file>')
|
|
233
|
+
.description('Export .mv2 file to other formats')
|
|
234
|
+
.option('-f, --format <format>', 'Output format: json, markdown, csv', 'json')
|
|
235
|
+
.option('--out <file>', 'Output file (default: stdout)')
|
|
236
|
+
.action(async (file, options) => {
|
|
237
|
+
try {
|
|
238
|
+
// Get full content for the documents to export (capped at 10000 by the limit passed below)
|
|
239
|
+
const docs = await exportDocs(file, { limit: 10000 });
|
|
240
|
+
let output;
|
|
241
|
+
switch (options.format) {
|
|
242
|
+
case 'markdown':
|
|
243
|
+
output = docs.map((doc) => {
|
|
244
|
+
return `# ${doc.title}\n\n${doc.content}\n\n---\n`;
|
|
245
|
+
}).join('\n');
|
|
246
|
+
break;
|
|
247
|
+
case 'csv':
|
|
248
|
+
const headers = ['title', 'uri'];
|
|
249
|
+
const rows = docs.map((doc) => {
|
|
250
|
+
return [
|
|
251
|
+
`"${(doc.title || '').replace(/"/g, '""')}"`,
|
|
252
|
+
`"${(doc.uri || '').replace(/"/g, '""')}"`,
|
|
253
|
+
].join(',');
|
|
254
|
+
});
|
|
255
|
+
output = [headers.join(','), ...rows].join('\n');
|
|
256
|
+
break;
|
|
257
|
+
default:
|
|
258
|
+
output = JSON.stringify(docs, null, 2);
|
|
259
|
+
}
|
|
260
|
+
if (options.out) {
|
|
261
|
+
const { writeFileSync } = await import('fs');
|
|
262
|
+
writeFileSync(options.out, output);
|
|
263
|
+
console.log(ui.theme.success(`\n Exported ${docs.length} documents to ${options.out}\n`));
|
|
264
|
+
}
|
|
265
|
+
else {
|
|
266
|
+
console.log(output);
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
catch (error) {
|
|
270
|
+
console.error(ui.errorMessage(error.message));
|
|
271
|
+
process.exit(1);
|
|
272
|
+
}
|
|
273
|
+
});
|
|
274
|
+
program.parse();
|
|
275
|
+
//# sourceMappingURL=maw.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"maw.js","sourceRoot":"","sources":["../../bin/maw.ts"],"names":[],"mappings":";AAEA;;GAEG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAChC,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,GAAG,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAC5E,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AACpD,OAAO,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAEzC,MAAM,OAAO,GAAG,OAAO,CAAC;AAExB,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,KAAK,CAAC;KACX,WAAW,CAAC,iCAAiC,CAAC;KAC9C,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,yCAAyC;AACzC,OAAO;KACJ,QAAQ,CAAC,WAAW,EAAE,oEAAoE,CAAC;KAC3F,MAAM,CAAC,qBAAqB,EAAE,kBAAkB,EAAE,SAAS,CAAC;KAC5D,MAAM,CAAC,iBAAiB,EAAE,gDAAgD,CAAC;KAC3E,MAAM,CAAC,uBAAuB,EAAE,qBAAqB,EAAE,IAAI,CAAC;KAC5D,MAAM,CAAC,qBAAqB,EAAE,uCAAuC,CAAC;KACtE,MAAM,CAAC,sBAAsB,EAAE,qBAAqB,EAAE,IAAI,CAAC;KAC3D,MAAM,CAAC,oBAAoB,EAAE,uBAAuB,EAAE,OAAO,CAAC;KAC9D,MAAM,CAAC,qBAAqB,EAAE,gCAAgC,CAAC;KAC/D,MAAM,CAAC,qBAAqB,EAAE,gCAAgC,CAAC;KAC/D,MAAM,CAAC,iBAAiB,EAAE,8BAA8B,EAAE,KAAK,CAAC;KAChE,MAAM,CAAC,eAAe,EAAE,wDAAwD,CAAC;KACjF,MAAM,CAAC,WAAW,EAAE,+CAA+C,CAAC;KACpE,MAAM,CAAC,cAAc,EAAE,2BAA2B,CAAC;KACnD,MAAM,CAAC,aAAa,EAAE,mBAAmB,CAAC;KAC1C,MAAM,CAAC,WAAW,EAAE,iDAAiD,CAAC;KACtE,MAAM,CAAC,WAAW,EAAE,wCAAwC,CAAC;KAC7D,MAAM,CAAC,iBAAiB,EAAE,gEAAgE,CAAC;KAC3F,MAAM,CAAC,aAAa,EAAE,gBAAgB,CAAC;KACvC,MAAM,CAAC,eAAe,EAAE,gBAAgB,CAAC;KACzC,MAAM,CAAC,KAAK,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE;IAC9B,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtB,uBAAuB;QACvB,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;QACzB,OAAO,CAAC,IAAI,EAAE,CAAC;QACf,OAAO;IACT,CAAC;IAED,UAAU,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;IAE3C,6EAA6E;IAC7E,2FAA2F;IAC3F,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC;IAChE,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC;IAEhE,mEAAmE;IACnE,IAAI,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAChC,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,IAAI,OAAO,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;QAC
xD,0DAA0D;QAC1D,UAAU,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;QACzB,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxB,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,+CAA+C,CAAC,CAAC,CAAC;YAChF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,6BAA6B,CAAC,CAAC,CAAC;QAC9D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,yCAAyC;IACzC,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;QACnB,MAAM,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAS,EAAE,EAAE,CACvC,CAAC,CAAC,UAAU,CAAC,qBAAqB,CAAC;YACnC,CAAC,CAAC,UAAU,CAAC,qBAAqB,CAAC;YACnC,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC;YAClB,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC;YACjB,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAClB,CAAC;QACF,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,KAAK,CAAC;QAC1C,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,UAAU,CAAC;QACnF,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,MAAM,CAAC,KAAK,EAAE,UAAU,CAAC,CAAC,CAAC;QAE1C,qCAAqC;QACrC,IAAI,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAC3B,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,iBAAiB,UAAU,IAAI,CAAC,CAAC,CAAC;QAC9D,CAAC;QAED,iCAAiC;QACjC,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YAClB,MAAM,KAAK,GAAG,OAAO,OAAO,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,WAAW,CAAC;YAC9E,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,kCAAkC,KAAK,GAAG,CAAC,CAAC,CAAC;YACvE,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,oDAAoD,CAAC,CAAC,CAAC;QAClF,CAAC;IACH,CAAC;IAED,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,GAAG,CAAC,OAAO,EAAE;YAChC,MAAM,EAAE,UAAU;YAClB,KAAK,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,EAAG,iCAAiC;YAClG,WAAW,EAAE,QAAQ,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC;YAC9C,QAAQ,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;YACvE,SAAS,EAAE,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;YAC1C,OAAO,EAAE,QAAQ,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC;YACtC,cAAc,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC
,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,SAAS;YACzE,cAAc,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,SAAS;YACzE,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,QAAQ,EAAE,OAAO,CAAC,MAAM;YACxB,UAAU,EAAE,OAAO,CAAC,OAAO;YAC3B,aAAa,EAAE,OAAO,CAAC,MAAM;YAC7B,WAAW,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,SAAS;YACvF,eAAe,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK;YAChC,cAAc,EAAE,OAAO,OAAO,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,WAAW;YAC/E,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,OAAO,EAAE,OAAO,CAAC,OAAO;SACzB,CAAC,CAAC;QAEH,iBAAiB;QACjB,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,cAAc,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC;QAE1F,kCAAkC;QAClC,MAAM,UAAU,GAAG,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC;QACtC,IAAI,UAAU,IAAI,CAAC,UAAU,CAAC,aAAa,GAAG,CAAC,IAAI,UAAU,CAAC,cAAc,GAAG,CAAC,CAAC,EAAE,CAAC;YAClF,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC;QACzC,CAAC;QAED,oCAAoC;QACpC,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;YACpB,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;QAC5C,CAAC;QAED,iCAAiC;QACjC,IAAI,MAAM,CAAC,cAAc,EAAE,CAAC;YAC1B,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,CAAC;QACjC,CAAC;aAAM,IAAI,MAAM,CAAC,QAAQ,EAAE,CAAC;YAC3B,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,gBAAgB,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC;QACpD,CAAC;aAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC1B,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,yBAAyB,CAAC,CAAC,CAAC;QACvD,CAAC;QAED,OAAO,CAAC,GAAG,EAAE,CAAC;IAChB,CAAC;IAAC,OAAO,KAAU,EAAE,CAAC;QACpB,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;QAC9C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,wCAAwC;AACxC,OAAO;KACJ,OAAO,CAAC,qBAAqB,CAAC;KAC9B,WAAW,CAAC,wBAAwB,CAAC;KACrC,MAAM,CAAC,eAAe,EAAE,iCAAiC,EAAE,IAAI,CAAC;KAChE,MAAM,CAAC,QAAQ,EAAE,gBAAgB,CAAC;KAClC,MAAM,CAAC,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE;IACrC,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,
EAAE,CAAC,EAAE,QAAQ,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC;QAE1E,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;YAC9C,OAAO;QACT,CAAC;QAED,OAAO,CAAC,GAAG,EAAE,CAAC;QACd,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,CAAC;IACpD,CAAC;IAAC,OAAO,KAAU,EAAE,CAAC;QACpB,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;QAC9C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,yCAAyC;AACzC,OAAO;KACJ,OAAO,CAAC,uBAAuB,CAAC;KAChC,WAAW,CAAC,mCAAmC,CAAC;KAChD,MAAM,CAAC,iBAAiB,EAAE,yCAAyC,EAAE,aAAa,CAAC;KACnF,MAAM,CAAC,iBAAiB,EAAE,uBAAuB,CAAC;KAClD,MAAM,CAAC,mBAAmB,EAAE,qFAAqF,CAAC;KAClH,MAAM,CAAC,QAAQ,EAAE,gBAAgB,CAAC;KAClC,MAAM,CAAC,KAAK,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,EAAE;IACxC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,QAAQ,EAAE;YACvC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,cAAc;YACpD,CAAC,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,EAAE,8BAA8B;SAC/F,CAAC,CAAC;QAEH,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;YAC7C,OAAO;QACT,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC;IAC3D,CAAC;IAAC,OAAO,KAAU,EAAE,CAAC;QACpB,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;QAC9C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,gCAAgC;AAChC,OAAO;KACJ,OAAO,CAAC,aAAa,CAAC;KACtB,WAAW,CAAC,gCAAgC,CAAC;KAC7C,MAAM,CAAC,iBAAiB,EAAE,2CAA2C,EAAE,IAAI,CAAC;KAC5E,MAAM,CAAC,QAAQ,EAAE,gBAAgB,CAAC;KAClC,MAAM,CAAC,KAAK,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE;IAC9B,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,IAAI,EAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC;QAEzE,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,
CAAC;YAC9C,OAAO;QACT,CAAC;QAED,MAAM,KAAK,GAAG,OAAO,CAAC,IAAI,IAAI,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC;QACxD,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC7C,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,aAAa,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAS,EAAE,EAAE,CAAC,CAAC;gBACrD,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,SAAS,IAAI,CAAC,QAAQ,EAAE;gBAC3E,GAAG,EAAE,IAAI,CAAC,QAAQ,EAAE,GAAG,IAAI,IAAI,CAAC,GAAG;gBACnC,OAAO,EAAE,IAAI,CAAC,OAAO;aACtB,CAAC,CAAC,CAAC,CAAC,CAAC;QACR,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,qBAAqB,CAAC,IAAI,CAAC,CAAC;QAChE,CAAC;IACH,CAAC;IAAC,OAAO,KAAU,EAAE,CAAC;QACpB,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;QAC9C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,6CAA6C;AAC7C,OAAO;KACJ,OAAO,CAAC,eAAe,CAAC;KACxB,KAAK,CAAC,IAAI,CAAC;KACX,WAAW,CAAC,uDAAuD,CAAC;KACpE,MAAM,CAAC,iBAAiB,EAAE,yBAAyB,EAAE,IAAI,CAAC;KAC1D,MAAM,CAAC,QAAQ,EAAE,gBAAgB,CAAC;KAClC,MAAM,CAAC,KAAK,EAAE,GAAG,EAAE,OAAO,EAAE,EAAE;IAC7B,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,EAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC;QAE1E,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;YAC7C,OAAO;QACT,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC;IACzC,CAAC;IAAC,OAAO,KAAU,EAAE,CAAC;QACpB,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;QAC9C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,oCAAoC;AACpC,OAAO;KACJ,OAAO,CAAC,eAAe,CAAC;KACxB,WAAW,CAAC,mCAAmC,CAAC;KAChD,MAAM,CAAC,uBAAuB,EAAE,oCAAoC,EAAE,MAAM,CAAC;KAC7E,MAAM,CAAC,cAAc,EAAE,+BAA+B,CAAC;KACvD,MAAM,CAAC,KAAK,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE;IAC9B,IAAI,CAAC;QACH,qCAAqC;QACrC,MAAM,IAAI,GAAG,MAAM,UAAU,CAAC,IAAI,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;QAEtD,IAAI,MAAc,CAAC;QAEnB,QAAQ,OAAO,CAAC,MAAM,EAAE,CAAC;YACvB,KAAK,UAAU;gBACb,MAAM,GAAG,IAAI,CAA
C,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;oBACxB,OAAO,KAAK,GAAG,CAAC,KAAK,OAAO,GAAG,CAAC,OAAO,WAAW,CAAC;gBACrD,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACd,MAAM;YAER,KAAK,KAAK;gBACR,MAAM,OAAO,GAAG,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;gBACjC,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;oBAC5B,OAAO;wBACL,IAAI,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG;wBAC5C,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG;qBAC3C,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACd,CAAC,CAAC,CAAC;gBACH,MAAM,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,GAAG,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACjD,MAAM;YAER;gBACE,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;QAC3C,CAAC;QAED,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;YAChB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC;YAC7C,aAAa,CAAC,OAAO,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC;YACnC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,gBAAgB,IAAI,CAAC,MAAM,iBAAiB,OAAO,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC;QAC7F,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QACtB,CAAC;IACH,CAAC;IAAC,OAAO,KAAU,EAAE,CAAC;QACpB,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;QAC9C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web crawler with rate limiting
|
|
3
|
+
*/
|
|
4
|
+
import { type ExtractResult } from '../extractor/index.js';
|
|
5
|
+
/**
 * Configuration shape for a Crawler.
 *
 * The Crawler constructor takes an optional Partial<CrawlOptions>, so callers
 * may pass any subset of these fields. Seven fields are required by the
 * interface itself (five numbers, two booleans); includePattern,
 * excludePattern and forceEngine are optional. forceEngine is limited to the
 * literals 'fetch', 'playwright' or 'rebrowser'.
 *
 * NOTE(review): this declaration only carries types. Units for rateLimit and
 * timeout, and the defaults applied for omitted fields, are not visible
 * here; confirm against the implementation before documenting them.
 */
export interface CrawlOptions {
|
|
6
|
+
depth: number;
|
|
7
|
+
concurrency: number;
|
|
8
|
+
maxPages: number;
|
|
9
|
+
rateLimit: number;
|
|
10
|
+
timeout: number;
|
|
11
|
+
respectRobots: boolean;
|
|
12
|
+
useSitemap: boolean;
|
|
13
|
+
includePattern?: RegExp;
|
|
14
|
+
excludePattern?: RegExp;
|
|
15
|
+
forceEngine?: 'fetch' | 'playwright' | 'rebrowser';
|
|
16
|
+
}
|
|
17
|
+
/**
 * Element type produced by Crawler.crawl(), which is declared as returning
 * AsyncGenerator<CrawlResult>.
 *
 * `extracted` is an ExtractResult, imported as a type from
 * '../extractor/index.js'. The remaining fields are plain strings/numbers.
 *
 * NOTE(review): presumably `finalUrl` is the URL after any redirects and
 * `engine` names the engine that fetched the page; neither is established
 * by this declaration file, so confirm against the implementation.
 */
export interface CrawlResult {
|
|
18
|
+
url: string;
|
|
19
|
+
finalUrl: string;
|
|
20
|
+
extracted: ExtractResult;
|
|
21
|
+
depth: number;
|
|
22
|
+
engine: string;
|
|
23
|
+
}
|
|
24
|
+
/**
 * Ambient declaration of the Crawler class.
 *
 * Public surface declared here:
 *   - constructor(options?)  accepts an optional Partial<CrawlOptions>
 *   - crawl(startUrls)       takes string[] and returns AsyncGenerator<CrawlResult>
 *   - getStats()             returns a plain object of numbers (see below)
 *   - close()                returns Promise<void>
 *
 * Private members are listed by name only; the declaration output carries no
 * types for them, so nothing about their shape can be read from this file.
 */
export declare class Crawler {
|
|
25
|
+
private options;
|
|
26
|
+
private engine;
|
|
27
|
+
private extractor;
|
|
28
|
+
private robots;
|
|
29
|
+
private sitemap;
|
|
30
|
+
private dedup;
|
|
31
|
+
private visited;
|
|
32
|
+
private queue;
|
|
33
|
+
private baseHosts;
|
|
34
|
+
private results;
|
|
35
|
+
private pending;
|
|
36
|
+
constructor(options?: Partial<CrawlOptions>);
|
|
37
|
+
/**
|
|
38
|
+
* Crawl URLs and yield results as they complete
|
|
39
|
+
*/
|
|
40
|
+
crawl(startUrls: string[]): AsyncGenerator<CrawlResult>;
|
|
41
|
+
private parseSitemaps;
|
|
42
|
+
private addToQueue;
|
|
43
|
+
private processUrl;
|
|
44
|
+
private shouldCrawl;
|
|
45
|
+
/**
|
|
46
|
+
* Get statistics: visited/queued/pending numbers, a nested dedup summary, and fetch/playwright/rebrowser/blocked numbers (NOTE(review): the latter presumably come from the engine; confirm)
|
|
47
|
+
*/
|
|
48
|
+
getStats(): {
|
|
49
|
+
visited: number;
|
|
50
|
+
queued: number;
|
|
51
|
+
pending: number;
|
|
52
|
+
dedup: {
|
|
53
|
+
uniquePaths: number;
|
|
54
|
+
uniqueContent: number;
|
|
55
|
+
localeSkipped: number;
|
|
56
|
+
similarSkipped: number;
|
|
57
|
+
total: number;
|
|
58
|
+
};
|
|
59
|
+
fetch: number;
|
|
60
|
+
playwright: number;
|
|
61
|
+
rebrowser: number;
|
|
62
|
+
blocked: number;
|
|
63
|
+
};
|
|
64
|
+
/**
|
|
65
|
+
* Close all resources
|
|
66
|
+
*/
|
|
67
|
+
close(): Promise<void>;
|
|
68
|
+
}
|
|
69
|
+
export { RobotsParser } from './robots.js';
|
|
70
|
+
export { SitemapParser } from './sitemap.js';
|
|
71
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/crawler/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,OAAO,EAAa,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAOtE,MAAM,WAAW,YAAY;IAC3B,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,aAAa,EAAE,OAAO,CAAC;IACvB,UAAU,EAAE,OAAO,CAAC;IACpB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,WAAW,CAAC,EAAE,OAAO,GAAG,YAAY,GAAG,WAAW,CAAC;CACpD;AAED,MAAM,WAAW,WAAW;IAC1B,GAAG,EAAE,MAAM,CAAC;IACZ,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,aAAa,CAAC;IACzB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAChB;AAcD,qBAAa,OAAO;IAClB,OAAO,CAAC,OAAO,CAAe;IAC9B,OAAO,CAAC,MAAM,CAAkB;IAChC,OAAO,CAAC,SAAS,CAAY;IAC7B,OAAO,CAAC,MAAM,CAAe;IAC7B,OAAO,CAAC,OAAO,CAAgB;IAC/B,OAAO,CAAC,KAAK,CAAe;IAC5B,OAAO,CAAC,OAAO,CAA0B;IACzC,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,SAAS,CAA0B;IAC3C,OAAO,CAAC,OAAO,CAAqB;IACpC,OAAO,CAAC,OAAO,CAA0D;gBAE7D,OAAO,GAAE,OAAO,CAAC,YAAY,CAAM;IAgB/C;;OAEG;IACI,KAAK,CAAC,SAAS,EAAE,MAAM,EAAE,GAAG,cAAc,CAAC,WAAW,CAAC;YAkEhD,aAAa;IA2B3B,OAAO,CAAC,UAAU;YAaJ,UAAU;IA+CxB,OAAO,CAAC,WAAW;IAyCnB;;OAEG;IACH,QAAQ;;;;;;;;;;;;;;;;IAUR;;OAEG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAI7B;AAED,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC"}
|