@vakra-dev/reader 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +30 -0
- package/dist/cli/index.js +8 -14
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8 -14
- package/dist/index.js.map +1 -1
- package/package.json +2 -3
package/README.md
CHANGED
|
@@ -572,6 +572,36 @@ Reader uses [Ulixee Hero](https://ulixee.org/), a headless browser with advanced
|
|
|
572
572
|
- **Health Monitoring** - Background health checks every 5 minutes
|
|
573
573
|
- **Request Queuing** - Queues requests when pool is full (max 100)
|
|
574
574
|
|
|
575
|
+
### HTML to Markdown: supermarkdown
|
|
576
|
+
|
|
577
|
+
Reader uses [**supermarkdown**](https://github.com/vakra-dev/supermarkdown) for HTML to Markdown conversion - a sister project we built from scratch specifically for web scraping and LLM pipelines.
|
|
578
|
+
|
|
579
|
+
**Why we built it:**
|
|
580
|
+
|
|
581
|
+
When you're scraping the web, you encounter messy, malformed HTML that breaks most converters. And when you're feeding content to LLMs, you need clean output without artifacts or noise. We needed a converter that handles real-world HTML reliably while producing high-quality markdown.
|
|
582
|
+
|
|
583
|
+
**What supermarkdown offers:**
|
|
584
|
+
|
|
585
|
+
| Feature | Benefit |
|
|
586
|
+
|---------|---------|
|
|
587
|
+
| **Written in Rust** | Native performance with Node.js bindings via napi-rs |
|
|
588
|
+
| **Full GFM support** | Tables, task lists, strikethrough, autolinks |
|
|
589
|
+
| **LLM-optimized** | Clean output designed for AI consumption |
|
|
590
|
+
| **Battle-tested** | Handles malformed HTML from real web pages |
|
|
591
|
+
| **CSS selectors** | Include/exclude elements during conversion |
|
|
592
|
+
|
|
593
|
+
supermarkdown is open source and available as both a Rust crate and npm package:
|
|
594
|
+
|
|
595
|
+
```bash
|
|
596
|
+
# npm
|
|
597
|
+
npm install @vakra-dev/supermarkdown
|
|
598
|
+
|
|
599
|
+
# Rust
|
|
600
|
+
cargo add supermarkdown
|
|
601
|
+
```
|
|
602
|
+
|
|
603
|
+
Check out the [supermarkdown repository](https://github.com/vakra-dev/supermarkdown) for examples and documentation.
|
|
604
|
+
|
|
575
605
|
## Documentation
|
|
576
606
|
|
|
577
607
|
| Guide | Description |
|
package/dist/cli/index.js
CHANGED
|
@@ -18,21 +18,15 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
|
|
|
18
18
|
import pLimit from "p-limit";
|
|
19
19
|
|
|
20
20
|
// src/formatters/markdown.ts
|
|
21
|
-
import TurndownService from "turndown";
|
|
22
|
-
var turndownService = new TurndownService({
|
|
23
|
-
headingStyle: "atx",
|
|
24
|
-
hr: "---",
|
|
25
|
-
bulletListMarker: "-",
|
|
26
|
-
codeBlockStyle: "fenced",
|
|
27
|
-
fence: "```",
|
|
28
|
-
emDelimiter: "*",
|
|
29
|
-
strongDelimiter: "**",
|
|
30
|
-
linkStyle: "inlined",
|
|
31
|
-
linkReferenceStyle: "full"
|
|
32
|
-
});
|
|
21
|
+
import { convert } from "@vakra-dev/supermarkdown";
|
|
33
22
|
function htmlToMarkdown(html) {
|
|
34
23
|
try {
|
|
35
|
-
return turndownService.turndown(html);
|
|
24
|
+
return convert(html, {
|
|
25
|
+
headingStyle: "atx",
|
|
26
|
+
bulletMarker: "-",
|
|
27
|
+
codeFence: "`",
|
|
28
|
+
linkStyle: "inline"
|
|
29
|
+
});
|
|
36
30
|
} catch (error) {
|
|
37
31
|
console.warn("Error converting HTML to Markdown:", error);
|
|
38
32
|
return html.replace(/<[^>]*>/g, "").trim();
|
|
@@ -1723,7 +1717,7 @@ var EngineOrchestrator = class {
|
|
|
1723
1717
|
return true;
|
|
1724
1718
|
}
|
|
1725
1719
|
if (error instanceof HttpError) {
|
|
1726
|
-
return error.statusCode === 403 || error.statusCode === 429 || error.statusCode >= 500;
|
|
1720
|
+
return error.statusCode === 403 || error.statusCode === 404 || error.statusCode === 429 || error.statusCode >= 500;
|
|
1727
1721
|
}
|
|
1728
1722
|
if (error instanceof EngineUnavailableError) {
|
|
1729
1723
|
return true;
|