@vakra-dev/reader 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -16
- package/README.md +87 -57
- package/dist/cli/index.js +8 -14
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8 -14
- package/dist/index.js.map +1 -1
- package/package.json +2 -3
package/LICENSE
CHANGED
|
@@ -187,19 +187,4 @@
|
|
|
187
187
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
188
188
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
189
189
|
See the License for the specific language governing permissions and
|
|
190
|
-
limitations under the License.
|
|
191
|
-
|
|
192
|
-
---
|
|
193
|
-
|
|
194
|
-
Attribution Requirement
|
|
195
|
-
|
|
196
|
-
All distributions, publications, or public uses of this software, in whole or in
|
|
197
|
-
part, must include the following attribution in a clearly visible location (such
|
|
198
|
-
as documentation, a README file, or an "About" section in any user interface):
|
|
199
|
-
|
|
200
|
-
"This product includes software developed by Nihal Kaul
|
|
201
|
-
(https://www.linkedin.com/in/nihalwashere/) as part of the Reader project
|
|
202
|
-
(https://github.com/vakra-dev/reader)."
|
|
203
|
-
|
|
204
|
-
This attribution requirement is in addition to, and does not limit, any
|
|
205
|
-
obligations imposed by the Apache License, Version 2.0.
|
|
190
|
+
limitations under the License.
|
package/README.md
CHANGED
|
@@ -19,7 +19,11 @@
|
|
|
19
19
|
</p>
|
|
20
20
|
|
|
21
21
|
<p align="center">
|
|
22
|
-
|
|
22
|
+
<a href="https://docs.reader.dev">Docs</a> · <a href="https://docs.reader.dev/home/examples">Examples</a> · <a href="https://discord.gg/6tjkq7J5WV">Discord</a>
|
|
23
|
+
</p>
|
|
24
|
+
|
|
25
|
+
<p align="center">
|
|
26
|
+
<img src="./docs/assets/demo.gif" alt="Reader demo — scrape any URL to clean markdown" width="700" />
|
|
23
27
|
</p>
|
|
24
28
|
|
|
25
29
|
## The Problem
|
|
@@ -63,6 +67,9 @@ console.log(`Found ${pages.urls.length} pages`);
|
|
|
63
67
|
|
|
64
68
|
All the hard stuff, browser pooling, challenge detection, proxy rotation, retries, happens under the hood. You get clean markdown. Your agents get the web.
|
|
65
69
|
|
|
70
|
+
> [!TIP]
|
|
71
|
+
> If Reader is useful to you, a [star on GitHub](https://github.com/vakra-dev/reader) helps others discover the project.
|
|
72
|
+
|
|
66
73
|
## Features
|
|
67
74
|
|
|
68
75
|
- **Cloudflare Bypass** - TLS fingerprinting, DNS over TLS, WebRTC masking
|
|
@@ -252,20 +259,20 @@ npx reader scrape https://example.com https://example.org -c 2
|
|
|
252
259
|
npx reader scrape https://example.com -o output.md
|
|
253
260
|
```
|
|
254
261
|
|
|
255
|
-
| Option | Type | Default | Description
|
|
256
|
-
| ------------------------ | ------ | ------------ |
|
|
257
|
-
| `-f, --format <formats>` | string | `"markdown"` | Output formats (comma-separated: markdown,html)
|
|
258
|
-
| `-o, --output <file>` | string | stdout | Output file path
|
|
259
|
-
| `-c, --concurrency <n>` | number | `1` | Parallel requests
|
|
260
|
-
| `-t, --timeout <ms>` | number | `30000` | Request timeout in milliseconds
|
|
261
|
-
| `--batch-timeout <ms>` | number | `300000` | Total timeout for entire batch operation
|
|
262
|
-
| `--proxy <url>` | string | - | Proxy URL (e.g., http://user:pass@host:port)
|
|
263
|
-
| `--user-agent <string>` | string | - | Custom user agent string
|
|
264
|
-
| `--show-chrome` | flag | - | Show browser window for debugging
|
|
265
|
-
| `--no-main-content` | flag | - | Disable main content extraction (include full page)
|
|
266
|
-
| `--include-tags <sel>` | string | - | CSS selectors for elements to include (comma-separated)
|
|
267
|
-
| `--exclude-tags <sel>` | string | - | CSS selectors for elements to exclude (comma-separated)
|
|
268
|
-
| `-v, --verbose` | flag | - | Enable verbose logging
|
|
262
|
+
| Option | Type | Default | Description |
|
|
263
|
+
| ------------------------ | ------ | ------------ | ------------------------------------------------------- |
|
|
264
|
+
| `-f, --format <formats>` | string | `"markdown"` | Output formats (comma-separated: markdown,html) |
|
|
265
|
+
| `-o, --output <file>` | string | stdout | Output file path |
|
|
266
|
+
| `-c, --concurrency <n>` | number | `1` | Parallel requests |
|
|
267
|
+
| `-t, --timeout <ms>` | number | `30000` | Request timeout in milliseconds |
|
|
268
|
+
| `--batch-timeout <ms>` | number | `300000` | Total timeout for entire batch operation |
|
|
269
|
+
| `--proxy <url>` | string | - | Proxy URL (e.g., http://user:pass@host:port) |
|
|
270
|
+
| `--user-agent <string>` | string | - | Custom user agent string |
|
|
271
|
+
| `--show-chrome` | flag | - | Show browser window for debugging |
|
|
272
|
+
| `--no-main-content` | flag | - | Disable main content extraction (include full page) |
|
|
273
|
+
| `--include-tags <sel>` | string | - | CSS selectors for elements to include (comma-separated) |
|
|
274
|
+
| `--exclude-tags <sel>` | string | - | CSS selectors for elements to exclude (comma-separated) |
|
|
275
|
+
| `-v, --verbose` | flag | - | Enable verbose logging |
|
|
269
276
|
|
|
270
277
|
### `reader crawl <url>`
|
|
271
278
|
|
|
@@ -355,26 +362,26 @@ await reader.close();
|
|
|
355
362
|
|
|
356
363
|
Scrape one or more URLs. Can be used directly or via `ReaderClient`.
|
|
357
364
|
|
|
358
|
-
| Option | Type
|
|
359
|
-
| ------------------ |
|
|
360
|
-
| `urls` | `string[]`
|
|
361
|
-
| `formats` | `Array<"markdown" \| "html">`
|
|
362
|
-
| `onlyMainContent` | `boolean`
|
|
363
|
-
| `includeTags` | `string[]`
|
|
364
|
-
| `excludeTags` | `string[]`
|
|
365
|
-
| `userAgent` | `string`
|
|
366
|
-
| `timeoutMs` | `number`
|
|
367
|
-
| `includePatterns` | `string[]`
|
|
368
|
-
| `excludePatterns` | `string[]`
|
|
369
|
-
| `batchConcurrency` | `number`
|
|
370
|
-
| `batchTimeoutMs` | `number`
|
|
371
|
-
| `maxRetries` | `number`
|
|
372
|
-
| `onProgress` | `function`
|
|
373
|
-
| `proxy` | `ProxyConfig`
|
|
374
|
-
| `waitForSelector` | `string`
|
|
375
|
-
| `verbose` | `boolean`
|
|
376
|
-
| `showChrome` | `boolean`
|
|
377
|
-
| `connectionToCore` | `any`
|
|
365
|
+
| Option | Type | Required | Default | Description |
|
|
366
|
+
| ------------------ | ----------------------------- | -------- | -------------- | --------------------------------------------------------------- |
|
|
367
|
+
| `urls` | `string[]` | Yes | - | Array of URLs to scrape |
|
|
368
|
+
| `formats` | `Array<"markdown" \| "html">` | No | `["markdown"]` | Output formats |
|
|
369
|
+
| `onlyMainContent` | `boolean` | No | `true` | Extract only main content (removes nav/header/footer) |
|
|
370
|
+
| `includeTags` | `string[]` | No | `[]` | CSS selectors for elements to keep |
|
|
371
|
+
| `excludeTags` | `string[]` | No | `[]` | CSS selectors for elements to remove |
|
|
372
|
+
| `userAgent` | `string` | No | - | Custom user agent string |
|
|
373
|
+
| `timeoutMs` | `number` | No | `30000` | Request timeout in milliseconds |
|
|
374
|
+
| `includePatterns` | `string[]` | No | `[]` | URL patterns to include (regex strings) |
|
|
375
|
+
| `excludePatterns` | `string[]` | No | `[]` | URL patterns to exclude (regex strings) |
|
|
376
|
+
| `batchConcurrency` | `number` | No | `1` | Number of URLs to process in parallel |
|
|
377
|
+
| `batchTimeoutMs` | `number` | No | `300000` | Total timeout for entire batch operation |
|
|
378
|
+
| `maxRetries` | `number` | No | `2` | Maximum retry attempts for failed URLs |
|
|
379
|
+
| `onProgress` | `function` | No | - | Progress callback: `({ completed, total, currentUrl }) => void` |
|
|
380
|
+
| `proxy` | `ProxyConfig` | No | - | Proxy configuration object |
|
|
381
|
+
| `waitForSelector` | `string` | No | - | CSS selector to wait for before page is loaded |
|
|
382
|
+
| `verbose` | `boolean` | No | `false` | Enable verbose logging |
|
|
383
|
+
| `showChrome` | `boolean` | No | `false` | Show Chrome window for debugging |
|
|
384
|
+
| `connectionToCore` | `any` | No | - | Connection to shared Hero Core (for production) |
|
|
378
385
|
|
|
379
386
|
**Returns:** `Promise<ScrapeResult>`
|
|
380
387
|
|
|
@@ -572,32 +579,54 @@ Reader uses [Ulixee Hero](https://ulixee.org/), a headless browser with advanced
|
|
|
572
579
|
- **Health Monitoring** - Background health checks every 5 minutes
|
|
573
580
|
- **Request Queuing** - Queues requests when pool is full (max 100)
|
|
574
581
|
|
|
575
|
-
|
|
582
|
+
### HTML to Markdown: supermarkdown
|
|
583
|
+
|
|
584
|
+
Reader uses [**supermarkdown**](https://github.com/vakra-dev/supermarkdown) for HTML to Markdown conversion - a sister project we built from scratch specifically for web scraping and LLM pipelines.
|
|
576
585
|
|
|
577
|
-
|
|
578
|
-
| ------------------------------------------ | ------------------------------ |
|
|
579
|
-
| [Getting Started](docs/getting-started.md) | Detailed setup and first steps |
|
|
580
|
-
| [Architecture](docs/architecture.md) | System design and data flow |
|
|
581
|
-
| [API Reference](docs/api-reference.md) | Complete API documentation |
|
|
582
|
-
| [Troubleshooting](docs/troubleshooting.md) | Common errors and solutions |
|
|
586
|
+
**Why we built it:**
|
|
583
587
|
|
|
584
|
-
|
|
588
|
+
When you're scraping the web, you encounter messy, malformed HTML that breaks most converters. And when you're feeding content to LLMs, you need clean output without artifacts or noise. We needed a converter that handles real-world HTML reliably while producing high-quality markdown.
|
|
585
589
|
|
|
586
|
-
|
|
587
|
-
| --------------------------------------------------------- | ----------------------------- |
|
|
588
|
-
| [Cloudflare Bypass](docs/guides/cloudflare-bypass.md) | How antibot bypass works |
|
|
589
|
-
| [Proxy Configuration](docs/guides/proxy-configuration.md) | Setting up proxies |
|
|
590
|
-
| [Browser Pool](docs/guides/browser-pool.md) | Production browser management |
|
|
591
|
-
| [Output Formats](docs/guides/output-formats.md) | Understanding output formats |
|
|
590
|
+
**What supermarkdown offers:**
|
|
592
591
|
|
|
593
|
-
|
|
592
|
+
| Feature | Benefit |
|
|
593
|
+
| -------------------- | ---------------------------------------------------- |
|
|
594
|
+
| **Written in Rust** | Native performance with Node.js bindings via napi-rs |
|
|
595
|
+
| **Full GFM support** | Tables, task lists, strikethrough, autolinks |
|
|
596
|
+
| **LLM-optimized** | Clean output designed for AI consumption |
|
|
597
|
+
| **Battle-tested** | Handles malformed HTML from real web pages |
|
|
598
|
+
| **CSS selectors** | Include/exclude elements during conversion |
|
|
599
|
+
|
|
600
|
+
supermarkdown is open source and available as both a Rust crate and npm package:
|
|
601
|
+
|
|
602
|
+
```bash
|
|
603
|
+
# npm
|
|
604
|
+
npm install @vakra-dev/supermarkdown
|
|
605
|
+
|
|
606
|
+
# Rust
|
|
607
|
+
cargo add supermarkdown
|
|
608
|
+
```
|
|
609
|
+
|
|
610
|
+
Check out the [supermarkdown repository](https://github.com/vakra-dev/supermarkdown) for examples and documentation.
|
|
611
|
+
|
|
612
|
+
## Server Deployment
|
|
613
|
+
|
|
614
|
+
Reader uses a real Chromium browser under the hood. On headless Linux servers (VPS, EC2, etc.), you need to install Chrome's system dependencies:
|
|
615
|
+
|
|
616
|
+
```bash
|
|
617
|
+
# Debian/Ubuntu
|
|
618
|
+
sudo apt-get install -y libnspr4 libnss3 libatk1.0-0 libatk-bridge2.0-0 \
|
|
619
|
+
libcups2 libxcb1 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 \
|
|
620
|
+
libxext6 libxfixes3 libxrandr2 libgbm1 libcairo2 libpango-1.0-0 libasound2
|
|
621
|
+
```
|
|
622
|
+
|
|
623
|
+
This is the same requirement that Puppeteer and Playwright have on headless Linux. macOS, Windows, and Linux desktops already have these libraries.
|
|
624
|
+
|
|
625
|
+
For Docker and production deployment guides, see the [deployment documentation](https://docs.reader.dev/documentation/guides/deployment).
|
|
626
|
+
|
|
627
|
+
## Documentation
|
|
594
628
|
|
|
595
|
-
|
|
596
|
-
| --------------------------------------------------------- | -------------------------- |
|
|
597
|
-
| [Docker](docs/deployment/docker.md) | Container deployment |
|
|
598
|
-
| [Production Server](docs/deployment/production-server.md) | Express + shared Hero Core |
|
|
599
|
-
| [Job Queues](docs/deployment/job-queues.md) | BullMQ async scheduling |
|
|
600
|
-
| [Serverless](docs/deployment/serverless.md) | Lambda, Vercel, Workers |
|
|
629
|
+
Full documentation is available at **[docs.reader.dev](https://docs.reader.dev)**, including guides for scraping, crawling, proxy configuration, browser pool management, and deployment.
|
|
601
630
|
|
|
602
631
|
### Examples
|
|
603
632
|
|
|
@@ -658,4 +687,5 @@ If you use Reader in your research or project, please cite it:
|
|
|
658
687
|
## Support
|
|
659
688
|
|
|
660
689
|
- [GitHub Issues](https://github.com/vakra-dev/reader/issues)
|
|
661
|
-
- [Documentation](https://
|
|
690
|
+
- [Documentation](https://docs.reader.dev)
|
|
691
|
+
- [Discord](https://discord.gg/6tjkq7J5WV)
|
package/dist/cli/index.js
CHANGED
|
@@ -18,21 +18,15 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
|
|
|
18
18
|
import pLimit from "p-limit";
|
|
19
19
|
|
|
20
20
|
// src/formatters/markdown.ts
|
|
21
|
-
import TurndownService from "turndown";
|
|
22
|
-
var turndownService = new TurndownService({
|
|
23
|
-
headingStyle: "atx",
|
|
24
|
-
hr: "---",
|
|
25
|
-
bulletListMarker: "-",
|
|
26
|
-
codeBlockStyle: "fenced",
|
|
27
|
-
fence: "```",
|
|
28
|
-
emDelimiter: "*",
|
|
29
|
-
strongDelimiter: "**",
|
|
30
|
-
linkStyle: "inlined",
|
|
31
|
-
linkReferenceStyle: "full"
|
|
32
|
-
});
|
|
21
|
+
import { convert } from "@vakra-dev/supermarkdown";
|
|
33
22
|
function htmlToMarkdown(html) {
|
|
34
23
|
try {
|
|
35
|
-
return turndownService.turndown(html);
|
|
24
|
+
return convert(html, {
|
|
25
|
+
headingStyle: "atx",
|
|
26
|
+
bulletMarker: "-",
|
|
27
|
+
codeFence: "`",
|
|
28
|
+
linkStyle: "inline"
|
|
29
|
+
});
|
|
36
30
|
} catch (error) {
|
|
37
31
|
console.warn("Error converting HTML to Markdown:", error);
|
|
38
32
|
return html.replace(/<[^>]*>/g, "").trim();
|
|
@@ -1723,7 +1717,7 @@ var EngineOrchestrator = class {
|
|
|
1723
1717
|
return true;
|
|
1724
1718
|
}
|
|
1725
1719
|
if (error instanceof HttpError) {
|
|
1726
|
-
return error.statusCode === 403 || error.statusCode === 429 || error.statusCode >= 500;
|
|
1720
|
+
return error.statusCode === 403 || error.statusCode === 404 || error.statusCode === 429 || error.statusCode >= 500;
|
|
1727
1721
|
}
|
|
1728
1722
|
if (error instanceof EngineUnavailableError) {
|
|
1729
1723
|
return true;
|