edgecrawl 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +260 -0
- package/index.mjs +22 -0
- package/package.json +63 -0
- package/schemas/article.json +15 -0
- package/schemas/news.json +19 -0
- package/schemas/product.json +20 -0
- package/src/cli.mjs +226 -0
- package/src/html2md.mjs +268 -0
- package/src/llm.mjs +281 -0
- package/src/pipeline.mjs +165 -0
- package/src/scraper.mjs +218 -0
- package/src/structured-extract.mjs +226 -0
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Koji Hayashida

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
package/README.md
ADDED
@@ -0,0 +1,260 @@
# edgecrawl

Local AI-powered web scraper. Extract structured JSON from any website using on-device ONNX LLMs. No API keys, no cloud, no Python.

## Features

- **100% Local AI** — Runs Qwen3 ONNX models on your machine via Transformers.js v4 (WebGPU/WASM)
- **Zero API Keys** — No OpenAI, no Anthropic, no cloud bills. Everything runs on-device
- **Structured JSON Output** — Define a schema, get clean JSON back
- **Smart Extraction** — Tries JSON-LD/Open Graph first, falls back to LLM only when needed
- **CLI + Library** — Use from the command line or import into your Node.js app

## Architecture

```
Playwright (headless browser)
        |
        v
JSDOM + node-html-markdown -> Clean Markdown
        |
        v
Qwen3 ONNX (Transformers.js v4) -> Structured JSON
```

## Quick Start

### Install from npm

```bash
npm install edgecrawl
npx playwright install chromium
```

### Install from source

```bash
git clone https://github.com/couzip/edgecrawl.git
cd edgecrawl
npm install
npx playwright install chromium
```

Models are downloaded automatically on first run:

- LLM: Qwen3 ONNX (0.4-2.5 GB depending on preset)

### CLI

```bash
# Extract structured data from a URL
edgecrawl extract https://example.com

# With custom schema
edgecrawl extract https://example.com -s schemas/product.json -o result.json

# Light model on WASM
edgecrawl extract https://example.com -p light -d wasm

# Convert to Markdown only (no LLM)
edgecrawl md https://example.com
```

### Library

```javascript
import { scrapeAndExtract, cleanup } from "edgecrawl";

const result = await scrapeAndExtract("https://example.com", {
  preset: "balanced",
});

console.log(result.extracted);
await cleanup();
```

## CLI Commands

### `extract <url>` — Structured extraction

```bash
# Default (balanced model, WebGPU)
edgecrawl extract https://example.com

# Light model on WASM
edgecrawl extract https://example.com -p light -d wasm

# Custom schema + output file
edgecrawl extract https://example.com -s schemas/product.json -o result.json

# Target a specific section
edgecrawl extract https://example.com --selector "main article"
```

### `batch <file>` — Batch processing

```bash
# Process URL list (one URL per line)
edgecrawl batch urls.txt -o results.json

# With concurrency control
edgecrawl batch urls.txt -c 5
```
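
The library-side equivalent of `batch` is `batchScrapeAndExtract`. A minimal sketch that reads the URL list the same way the CLI does (blank lines and `#` comments skipped, mirroring `src/cli.mjs`):

```javascript
import { readFileSync } from "fs";
import { batchScrapeAndExtract, cleanup } from "edgecrawl";

// urls.txt: one URL per line; blank lines and # comments are ignored.
const urls = readFileSync("urls.txt", "utf-8")
  .split("\n")
  .map((l) => l.trim())
  .filter((l) => l && !l.startsWith("#"));

const results = await batchScrapeAndExtract(urls, { concurrency: 5 });
console.log(`Extracted ${results.length} pages`);
await cleanup();
```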

### `query <url> <prompt>` — Custom question

```bash
# Ask a question about page content
edgecrawl query https://example.com "What are the main products?"
```

### `md <url>` — Markdown conversion only (no LLM)

```bash
edgecrawl md https://example.com
edgecrawl md https://example.com -o page.md --scroll
```

## CLI Options

### Common Options (extract, batch, query)

| Option | Description | Default |
|--------|-------------|---------|
| `-p, --preset <preset>` | Model preset: `light` / `balanced` / `quality` | `balanced` |
| `-d, --device <device>` | Inference device: `webgpu` / `wasm` | `webgpu` |
| `-s, --schema <file>` | Custom schema JSON file | built-in default |
| `-o, --output <file>` | Output file path | stdout |
| `-t, --max-tokens <n>` | Max input tokens for LLM | `2048` |
| `--selector <selector>` | CSS selector to narrow target content | - |

### Batch Options

| Option | Description | Default |
|--------|-------------|---------|
| `-c, --concurrency <n>` | Concurrent scraping limit | `3` |

### Browser Options (all commands)

| Option | Description | Default |
|--------|-------------|---------|
| `--headful` | Show browser window (for debugging) | `false` |
| `--user-agent <ua>` | Custom User-Agent string | - |
| `--timeout <ms>` | Page load timeout in milliseconds | `30000` |
| `--proxy <url>` | Proxy server URL | - |
| `--cookie <cookie>` | Cookie in `name=value` format (repeatable) | - |
| `--extra-header <header>` | HTTP header in `Key:Value` format (repeatable) | - |
| `--viewport <WxH>` | Viewport size | `1280x800` |
| `--wait-until <event>` | Navigation wait condition: `load` / `domcontentloaded` / `networkidle` | `load` |
| `--no-block-media` | Disable blocking of images/fonts/media | `false` |
| `--scroll` | Scroll to bottom (for lazy-loaded content) | `false` |
| `--wait <selector>` | Wait for CSS selector to appear | - |
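
These flags map onto the `browserOptions` and `scrapeOptions` objects that the pipeline functions accept; `src/cli.mjs` (later in this diff) shows the exact key names it builds. A sketch of the library-side equivalent of a flag-heavy CLI call:

```javascript
import { scrapeAndExtract, cleanup } from "edgecrawl";

// Roughly: edgecrawl extract <url> --headful --scroll --timeout 60000 --viewport 1920x1080
const result = await scrapeAndExtract("https://example.com", {
  browserOptions: { headless: false },  // --headful
  scrapeOptions: {
    scrollToBottom: true,               // --scroll
    timeout: 60000,                     // --timeout <ms>
    viewportWidth: 1920,                // --viewport <WxH>
    viewportHeight: 1080,
    waitUntil: "load",                  // --wait-until <event>
    cookies: ["session=abc123"],        // --cookie name=value
    extraHeaders: ["X-Debug:1"],        // --extra-header Key:Value
  },
});
await cleanup();
```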

## Library Usage

### High-level Pipeline

```javascript
import {
  scrapeAndExtract,
  batchScrapeAndExtract,
  scrapeAndQuery,
  cleanup,
} from "edgecrawl";

// Basic extraction
const result = await scrapeAndExtract("https://example.com");

// Custom schema
const product = await scrapeAndExtract("https://shop.example.com/item", {
  schema: {
    title: "Product name (string)",
    price: "Current price (string)",
    features: "Key features (array of strings)",
  },
});

// Batch processing
const results = await batchScrapeAndExtract(
  ["https://example.com/1", "https://example.com/2"],
  { concurrency: 3 }
);

// Custom query
const answer = await scrapeAndQuery(
  "https://example.com",
  "What are the main products?",
  { preset: "quality" }
);

await cleanup();
```

### Low-level APIs

```javascript
// Use individual modules
import { htmlToMarkdown, cleanMarkdown } from "edgecrawl/html2md";
import { launchBrowser, fetchPage, closeBrowser } from "edgecrawl/scraper";
import { initLLM, extractStructured } from "edgecrawl/llm";

// HTML to Markdown only
await launchBrowser();
const { html } = await fetchPage("https://example.com");
const { markdown, title } = htmlToMarkdown(html, "https://example.com");
const cleaned = cleanMarkdown(markdown);
await closeBrowser();

// Or use the root export
import { htmlToMarkdown, cleanMarkdown, fetchPage } from "edgecrawl";
```
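
To carry the low-level flow through to structured JSON, the LLM helpers can be wired in after `cleanMarkdown`. A hypothetical sketch: the exact signatures of `initLLM`, `truncateForLLM`, and `extractStructured` live in `src/llm.mjs` and `src/html2md.mjs` rather than in this README, so the argument shapes below are assumptions:

```javascript
import { initLLM, extractStructured } from "edgecrawl/llm";
import { truncateForLLM } from "edgecrawl/html2md";

// `cleaned` stands in for the cleaned Markdown from the snippet above.
const cleaned = "# Example Domain\n\nThis domain is for use in examples...";

await initLLM({ preset: "balanced", device: "webgpu" }); // assumed options shape
const input = truncateForLLM(cleaned, 2048);             // assumed (markdown, maxTokens)
const data = await extractStructured(input, {            // assumed (text, schema)
  title: "Page title (string)",
  summary: "Short summary (string)",
});
console.log(data);
```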

## Custom Schemas

Define what data to extract by providing a JSON schema file:

```json
{
  "title": "Product name (string)",
  "price": "Current price (string)",
  "currency": "Currency code (string)",
  "description": "Product description (string)",
  "features": "Key features (array of strings)",
  "availability": "In stock or out of stock (string)"
}
```

```bash
edgecrawl extract https://shop.example.com/product -s schema.json
```

See the `schemas/` directory for more examples.
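
The same schema file works from the library; loading it and passing it as the `schema` option is exactly what the CLI does internally (see `src/cli.mjs`):

```javascript
import { readFileSync } from "fs";
import { scrapeAndExtract, cleanup } from "edgecrawl";

// Parse the schema file and hand it to the pipeline.
const schema = JSON.parse(readFileSync("schema.json", "utf-8"));
const result = await scrapeAndExtract("https://shop.example.com/product", { schema });
console.log(result.extracted);
await cleanup();
```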

## Model Presets

| Preset | Model | Size | Speed | Quality |
|--------|-------|------|-------|---------|
| `light` | Qwen3-0.6B | ~0.4 GB | Fast | Good for simple pages |
| `balanced` | Qwen3-1.7B | ~1.2 GB | Medium | Best balance (default) |
| `quality` | Qwen3-4B | ~2.5 GB | Slower | Best accuracy |

All models run locally via ONNX Runtime. First run downloads the model to `.model-cache/`.
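
Presets and devices are chosen per call; the `preset` and `device` options below are the same ones the CLI forwards to the pipeline:

```javascript
import { scrapeAndExtract, cleanup } from "edgecrawl";

// Quick pass with the small model on CPU/WASM...
const quick = await scrapeAndExtract("https://example.com", {
  preset: "light",
  device: "wasm",
});

// ...or the large model on WebGPU for harder pages.
const best = await scrapeAndExtract("https://example.com", {
  preset: "quality",
  device: "webgpu",
});
await cleanup();
```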

## Tech Stack

| Component | Library | Role |
|-----------|---------|------|
| Browser | Playwright | Headless scraping |
| HTML -> Markdown | JSDOM + node-html-markdown | Content cleaning + Markdown conversion |
| LLM | Transformers.js v4 + Qwen3 ONNX | Local structured extraction |
| CLI | Commander.js | Command-line interface |

## Requirements

- Node.js >= 20.0.0
- Chromium (installed via `npx playwright install chromium`)
- ~1-3 GB disk space for models (downloaded on first run)
- GPU recommended for WebGPU mode (falls back to WASM/CPU)

## License

MIT
package/index.mjs
ADDED
@@ -0,0 +1,22 @@
// index.mjs — edgecrawl package entry point

// High-level pipeline functions
export {
  scrapeAndExtract,
  batchScrapeAndExtract,
  scrapeAndQuery,
  cleanup,
  DEFAULT_SCHEMA,
} from "./src/pipeline.mjs";

// LLM utilities
export { initLLM, extractStructured, queryLLM, MODEL_PRESETS } from "./src/llm.mjs";

// HTML-to-Markdown conversion
export { htmlToMarkdown, cleanMarkdown, truncateForLLM } from "./src/html2md.mjs";

// Structured data extraction (JSON-LD / Open Graph)
export { tryStructuredExtract } from "./src/structured-extract.mjs";

// Browser / scraping primitives
export { launchBrowser, fetchPage, fetchPages, closeBrowser } from "./src/scraper.mjs";
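
One export worth calling out is `tryStructuredExtract`, the JSON-LD/Open Graph fast path behind the README's "Smart Extraction" feature. A hypothetical sketch of using it to skip the LLM when page metadata is enough; its real signature and return shape live in `src/structured-extract.mjs`, so the `(html, url)` arguments and null-on-miss behavior below are assumptions:

```javascript
import {
  tryStructuredExtract,
  scrapeAndExtract,
  launchBrowser,
  fetchPage,
  closeBrowser,
  cleanup,
} from "edgecrawl";

await launchBrowser();
const { html, url } = await fetchPage("https://example.com");
await closeBrowser();

// Assumed: returns extracted data, or null when the page has no
// usable JSON-LD / Open Graph metadata.
const fast = tryStructuredExtract(html, url);
const result = fast ?? (await scrapeAndExtract(url)); // fall back to the LLM pipeline
await cleanup();
console.log(result);
```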
package/package.json
ADDED
@@ -0,0 +1,63 @@
{
  "name": "edgecrawl",
  "version": "0.3.0",
  "description": "Local AI-powered web scraper. Extract structured JSON from any website using on-device ONNX LLMs. No API keys, no cloud, no Python.",
  "type": "module",
  "main": "./index.mjs",
  "exports": {
    ".": "./index.mjs",
    "./scraper": "./src/scraper.mjs",
    "./html2md": "./src/html2md.mjs",
    "./llm": "./src/llm.mjs",
    "./structured-extract": "./src/structured-extract.mjs"
  },
  "bin": {
    "edgecrawl": "./src/cli.mjs"
  },
  "files": [
    "index.mjs",
    "src/",
    "schemas/",
    "LICENSE",
    "README.md"
  ],
  "scripts": {
    "extract": "node src/cli.mjs extract",
    "batch": "node src/cli.mjs batch",
    "query": "node src/cli.mjs query",
    "md": "node src/cli.mjs md"
  },
  "keywords": [
    "scraper",
    "web-scraping",
    "llm",
    "onnx",
    "local-ai",
    "structured-data",
    "json",
    "headless",
    "playwright",
    "edge-computing",
    "transformers"
  ],
  "author": "Koji Hayashida",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/couzip/edgecrawl.git"
  },
  "homepage": "https://github.com/couzip/edgecrawl#readme",
  "bugs": {
    "url": "https://github.com/couzip/edgecrawl/issues"
  },
  "engines": {
    "node": ">=20.0.0"
  },
  "dependencies": {
    "@huggingface/transformers": "4.0.0-next.3",
    "commander": "^12.0.0",
    "jsdom": "^25.0.0",
    "node-html-markdown": "^2.0.0",
    "playwright": "^1.48.0"
  }
}
package/schemas/article.json
ADDED
@@ -0,0 +1,15 @@
{
  "type": "object",
  "properties": {
    "title": { "type": "string", "description": "Article title" },
    "author": { "type": "string", "description": "Author name" },
    "published_date": { "type": "string", "description": "Publication date" },
    "summary": { "type": "string", "description": "Summary of the article (3-5 sentences)" },
    "tags": {
      "type": "array",
      "items": { "type": "string" },
      "description": "Topics or tags"
    }
  },
  "required": ["title", "summary"]
}
package/schemas/news.json
ADDED
@@ -0,0 +1,19 @@
{
  "type": "object",
  "properties": {
    "articles": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "title": { "type": "string", "description": "Article headline" },
          "url": { "type": "string", "description": "Article URL" },
          "summary": { "type": "string", "description": "Brief summary (if available)" },
          "date": { "type": "string", "description": "Published date (if available)" }
        },
        "required": ["title", "url"]
      }
    }
  },
  "required": ["articles"]
}
package/schemas/product.json
ADDED
@@ -0,0 +1,20 @@
{
  "type": "object",
  "properties": {
    "name": { "type": "string", "description": "Product name" },
    "brand": { "type": "string", "description": "Brand or manufacturer" },
    "price": { "type": "number", "description": "Price (numeric)" },
    "currency": { "type": "string", "description": "Currency code (e.g. USD, EUR, JPY)" },
    "availability": { "type": "string", "description": "Stock status (in stock, out of stock, etc.)" },
    "rating": { "type": "number", "description": "Rating score (e.g. 4.5)" },
    "review_count": { "type": "integer", "description": "Number of reviews" },
    "description": { "type": "string", "description": "Product description (1-3 sentences)" },
    "category": { "type": "string", "description": "Product category" },
    "features": {
      "type": "array",
      "items": { "type": "string" },
      "description": "Key features or specs"
    }
  },
  "required": ["name", "price", "currency"]
}
package/src/cli.mjs
ADDED
@@ -0,0 +1,226 @@
#!/usr/bin/env node
// src/cli.mjs
// CLI entry point

import { Command } from "commander";
import { readFileSync, writeFileSync, existsSync } from "fs";

async function loadPipeline() {
  return await import("./pipeline.mjs");
}

const program = new Command();

program
  .name("edgecrawl")
  .description("Local AI-powered web scraper — extract structured JSON using on-device ONNX LLMs")
  .version("0.3.0");

// ---- Repeatable option helper ----
function collect(val, memo) {
  memo.push(val);
  return memo;
}

// ---- Browser options ----
function addBrowserOptions(cmd) {
  return cmd
    .option("--headful", "Show browser window (for debugging)")
    .option("--user-agent <ua>", "Custom User-Agent string")
    .option("--timeout <ms>", "Page load timeout in milliseconds", "30000")
    .option("--no-block-media", "Disable blocking of images/fonts/media")
    .option("--viewport <WxH>", "Viewport size (e.g. 1920x1080)", "1280x800")
    .option("--wait-until <event>", "Navigation wait condition (load/domcontentloaded/networkidle)", "load")
    .option("--proxy <url>", "Proxy server URL")
    .option("--cookie <cookie>", "Cookie in name=value format (repeatable)", collect, [])
    .option("--extra-header <header>", "HTTP header in Key:Value format (repeatable)", collect, [])
    .option("--scroll", "Scroll to bottom (for lazy-loaded content)")
    .option("--wait <selector>", "Wait for CSS selector to appear");
}

// ---- Common options (LLM + browser) ----
function addCommonOptions(cmd) {
  cmd = cmd
    .option("-p, --preset <preset>", "Model preset (light/balanced/quality)", "balanced")
    .option("-d, --device <device>", "Inference device (webgpu/wasm)", "webgpu")
    .option("-s, --schema <file>", "Custom schema JSON file")
    .option("-o, --output <file>", "Output file (default: stdout)")
    .option("-t, --max-tokens <n>", "Max input tokens for LLM", "2048")
    .option("--selector <selector>", "CSS selector to narrow target content");
  return addBrowserOptions(cmd);
}

function buildBrowserOptions(opts) {
  return {
    headless: !opts.headful,
    ...(opts.proxy && { proxy: opts.proxy }),
  };
}

function buildScrapeOptions(opts) {
  const [vw, vh] = (opts.viewport || "1280x800").split("x").map(Number);
  return {
    scrollToBottom: opts.scroll,
    waitForSelector: opts.wait,
    userAgent: opts.userAgent,
    timeout: parseInt(opts.timeout || "30000"),
    blockMedia: opts.blockMedia,
    waitUntil: opts.waitUntil,
    viewportWidth: vw || 1280,
    viewportHeight: vh || 800,
    cookies: opts.cookie || [],
    extraHeaders: opts.extraHeader || [],
  };
}

function outputResult(result, opts) {
  const json = JSON.stringify(result, null, 2);
  if (opts.output) {
    writeFileSync(opts.output, json);
    console.error(`Result saved to ${opts.output}`);
  } else {
    console.log(json);
  }
}

// ---- Single URL extraction ----
addCommonOptions(
  program
    .command("extract <url>")
    .description("Extract structured JSON data from a URL")
).action(async (url, opts) => {
  try {
    const { scrapeAndExtract, cleanup } = await loadPipeline();
    const schema = opts.schema ? JSON.parse(readFileSync(opts.schema, "utf-8")) : undefined;

    console.error(`Scraping: ${url}`);
    console.error(`Model: ${opts.preset} / Device: ${opts.device}`);

    const result = await scrapeAndExtract(url, {
      schema,
      preset: opts.preset,
      device: opts.device,
      maxTokens: parseInt(opts.maxTokens),
      selector: opts.selector,
      browserOptions: buildBrowserOptions(opts),
      scrapeOptions: buildScrapeOptions(opts),
    });

    outputResult(result, opts);
  } catch (err) {
    console.error("Error:", err.message);
    process.exit(1);
  } finally {
    const { cleanup } = await loadPipeline();
    await cleanup();
  }
});

// ---- Batch processing ----
addCommonOptions(
  program
    .command("batch <file>")
    .description("Batch extract from a URL list file (one URL per line)")
    .option("-c, --concurrency <n>", "Concurrent scraping limit", "3")
).action(async (file, opts) => {
  try {
    const { batchScrapeAndExtract } = await loadPipeline();
    if (!existsSync(file)) {
      console.error(`File not found: ${file}`);
      process.exit(1);
    }

    const urls = readFileSync(file, "utf-8")
      .split("\n")
      .map((l) => l.trim())
      .filter((l) => l && !l.startsWith("#"));

    console.error(`Processing ${urls.length} URLs...`);

    const schema = opts.schema ? JSON.parse(readFileSync(opts.schema, "utf-8")) : undefined;

    const results = await batchScrapeAndExtract(urls, {
      schema,
      preset: opts.preset,
      device: opts.device,
      concurrency: parseInt(opts.concurrency),
      browserOptions: buildBrowserOptions(opts),
      scrapeOptions: buildScrapeOptions(opts),
    });

    const outFile = opts.output || "results.json";
    writeFileSync(outFile, JSON.stringify(results, null, 2));
    console.error(`Done. ${results.length} results saved to ${outFile}`);
  } catch (err) {
    console.error("Error:", err.message);
    process.exit(1);
  } finally {
    const { cleanup } = await loadPipeline();
    await cleanup();
  }
});

// ---- Custom query ----
addCommonOptions(
  program
    .command("query <url> <prompt>")
    .description("Ask a custom question about a page's content")
).action(async (url, prompt, opts) => {
  try {
    const { scrapeAndQuery, cleanup } = await loadPipeline();
    console.error(`Scraping: ${url}`);
    const result = await scrapeAndQuery(url, prompt, {
      preset: opts.preset,
      device: opts.device,
      browserOptions: buildBrowserOptions(opts),
      scrapeOptions: buildScrapeOptions(opts),
    });

    outputResult(result, opts);
  } catch (err) {
    console.error("Error:", err.message);
    process.exit(1);
  } finally {
    const { cleanup } = await loadPipeline();
    await cleanup();
  }
});

// ---- Markdown conversion only (no LLM) ----
addBrowserOptions(
  program
    .command("md <url>")
    .description("Convert HTML to Markdown (no LLM)")
    .option("-o, --output <file>", "Output file")
    .option("--selector <selector>", "CSS selector to narrow target content")
).action(async (url, opts) => {
  try {
    const { fetchPage, launchBrowser } = await import("./scraper.mjs");
    const { htmlToMarkdown, cleanMarkdown } = await import("./html2md.mjs");

    await launchBrowser(buildBrowserOptions(opts));
    const { html, url: finalUrl } = await fetchPage(url, buildScrapeOptions(opts));

    const { markdown, title } = htmlToMarkdown(html, finalUrl, { selector: opts.selector });
    const cleaned = cleanMarkdown(markdown);

    console.error(`Markdown: ${markdown.length} chars -> cleaned: ${cleaned.length} chars (${Math.round((1 - cleaned.length / markdown.length) * 100)}% reduction)`);

    const output = `# ${title}\n\nSource: ${finalUrl}\n\n---\n\n${cleaned}`;

    if (opts.output) {
      writeFileSync(opts.output, output);
      console.error(`Saved to ${opts.output}`);
    } else {
      console.log(output);
    }
  } catch (err) {
    console.error("Error:", err.message);
    process.exit(1);
  } finally {
    const { closeBrowser } = await import("./scraper.mjs");
    await closeBrowser();
  }
});

program.parse();