@vakra-dev/reader 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -26
- package/dist/cli/index.js +445 -734
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +205 -41
- package/dist/index.js +663 -715
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
@@ -66,7 +66,8 @@ All the hard stuff, browser pooling, challenge detection, proxy rotation, retrie
 ## Features
 
 - **Cloudflare Bypass** - TLS fingerprinting, DNS over TLS, WebRTC masking
-- **
+- **Clean Output** - Markdown and HTML with automatic main content extraction
+- **Smart Content Cleaning** - Removes nav, headers, footers, popups, cookie banners
 - **CLI & API** - Use from command line or programmatically
 - **Browser Pool** - Auto-recycling, health monitoring, queue management
 - **Concurrent Scraping** - Parallel URL processing with progress tracking
@@ -92,11 +93,11 @@ const reader = new ReaderClient();
 
 const result = await reader.scrape({
   urls: ["https://example.com"],
-  formats: ["markdown", "
+  formats: ["markdown", "html"],
 });
 
 console.log(result.data[0].markdown);
-console.log(result.data[0].
+console.log(result.data[0].html);
 
 await reader.close();
 ```
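
For reference, the quick-start fragment above reads end to end roughly as follows. This is a sketch, not the package's verbatim example: the import specifier and the `ReaderClient` export name are assumptions inferred from the package name and the `const reader = new ReaderClient()` context line in the hunk.

```ts
// Sketch of the 0.0.3 quick-start, assembled from the hunk above.
// Assumption: `ReaderClient` is exported from the package root.
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();

const result = await reader.scrape({
  urls: ["https://example.com"],
  formats: ["markdown", "html"], // both output formats documented in 0.0.3
});

console.log(result.data[0].markdown);
console.log(result.data[0].html);

await reader.close();
```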
@@ -242,7 +243,7 @@ Scrape one or more URLs.
 npx reader scrape https://example.com
 
 # Scrape with multiple formats
-npx reader scrape https://example.com -f markdown,
+npx reader scrape https://example.com -f markdown,html
 
 # Scrape multiple URLs concurrently
 npx reader scrape https://example.com https://example.org -c 2
@@ -253,7 +254,7 @@ npx reader scrape https://example.com -o output.md
 
 | Option | Type | Default | Description |
 | ------------------------ | ------ | ------------ | --------------------------------------------------------- |
-| `-f, --format <formats>` | string | `"markdown"` | Output formats (comma-separated: markdown,html
+| `-f, --format <formats>` | string | `"markdown"` | Output formats (comma-separated: markdown,html) |
 | `-o, --output <file>` | string | stdout | Output file path |
 | `-c, --concurrency <n>` | number | `1` | Parallel requests |
 | `-t, --timeout <ms>` | number | `30000` | Request timeout in milliseconds |
@@ -261,7 +262,9 @@ npx reader scrape https://example.com -o output.md
 | `--proxy <url>` | string | - | Proxy URL (e.g., http://user:pass@host:port) |
 | `--user-agent <string>` | string | - | Custom user agent string |
 | `--show-chrome` | flag | - | Show browser window for debugging |
-| `--no-
+| `--no-main-content` | flag | - | Disable main content extraction (include full page) |
+| `--include-tags <sel>` | string | - | CSS selectors for elements to include (comma-separated) |
+| `--exclude-tags <sel>` | string | - | CSS selectors for elements to exclude (comma-separated) |
 | `-v, --verbose` | flag | - | Enable verbose logging |
 
 ### `reader crawl <url>`
@@ -352,24 +355,26 @@ await reader.close();
 
 Scrape one or more URLs. Can be used directly or via `ReaderClient`.
 
-| Option | Type
-| ------------------ |
-| `urls` | `string[]`
-| `formats` | `Array<"markdown" \| "html"
-| `
-| `
-| `
-| `
-| `
-| `
-| `
-| `
-| `
-| `
-| `
-| `
-| `
-| `
+| Option | Type | Required | Default | Description |
+| ------------------ | --------------------------------- | -------- | -------------- | --------------------------------------------------------------- |
+| `urls` | `string[]` | Yes | - | Array of URLs to scrape |
+| `formats` | `Array<"markdown" \| "html">` | No | `["markdown"]` | Output formats |
+| `onlyMainContent` | `boolean` | No | `true` | Extract only main content (removes nav/header/footer) |
+| `includeTags` | `string[]` | No | `[]` | CSS selectors for elements to keep |
+| `excludeTags` | `string[]` | No | `[]` | CSS selectors for elements to remove |
+| `userAgent` | `string` | No | - | Custom user agent string |
+| `timeoutMs` | `number` | No | `30000` | Request timeout in milliseconds |
+| `includePatterns` | `string[]` | No | `[]` | URL patterns to include (regex strings) |
+| `excludePatterns` | `string[]` | No | `[]` | URL patterns to exclude (regex strings) |
+| `batchConcurrency` | `number` | No | `1` | Number of URLs to process in parallel |
+| `batchTimeoutMs` | `number` | No | `300000` | Total timeout for entire batch operation |
+| `maxRetries` | `number` | No | `2` | Maximum retry attempts for failed URLs |
+| `onProgress` | `function` | No | - | Progress callback: `({ completed, total, currentUrl }) => void` |
+| `proxy` | `ProxyConfig` | No | - | Proxy configuration object |
+| `waitForSelector` | `string` | No | - | CSS selector to wait for before page is loaded |
+| `verbose` | `boolean` | No | `false` | Enable verbose logging |
+| `showChrome` | `boolean` | No | `false` | Show Chrome window for debugging |
+| `connectionToCore` | `any` | No | - | Connection to shared Hero Core (for production) |
 
 **Returns:** `Promise<ScrapeResult>`
 
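
The expanded options table in this hunk documents several new knobs for batch scraping and content cleaning. A minimal sketch of a call that exercises them follows; the option names and the progress-callback shape come from the table itself, while the import path and `ReaderClient` export are assumptions, as in the quick-start sketch above.

```ts
// Sketch of a batch scrape using options documented in the 0.0.3 table above.
// Assumption: `ReaderClient` is exported from the package root.
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();

const result = await reader.scrape({
  urls: ["https://example.com/docs", "https://example.com/blog"],
  formats: ["markdown"],
  onlyMainContent: true,                          // default per the table: strip nav/header/footer
  excludeTags: [".cookie-banner", "#newsletter"], // CSS selectors to remove
  batchConcurrency: 2,                            // process two URLs in parallel
  maxRetries: 2,                                  // retry failed URLs
  onProgress: ({ completed, total, currentUrl }) =>
    console.log(`${completed}/${total} ${currentUrl}`),
});

await reader.close();
```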
@@ -382,8 +387,6 @@ interface ScrapeResult {
 interface WebsiteScrapeResult {
   markdown?: string;
   html?: string;
-  json?: string;
-  text?: string;
   metadata: {
     baseUrl: string;
     totalPages: number;