@vakra-dev/supermarkdown 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +368 -0
- package/package.json +9 -9
- package/supermarkdown.darwin-arm64.node +0 -0
- package/supermarkdown.darwin-x64.node +0 -0
- package/supermarkdown.linux-arm64-gnu.node +0 -0
- package/supermarkdown.linux-arm64-musl.node +0 -0
- package/supermarkdown.linux-x64-gnu.node +0 -0
- package/supermarkdown.linux-x64-musl.node +0 -0
- package/supermarkdown.win32-arm64-msvc.node +0 -0
- package/supermarkdown.win32-x64-msvc.node +0 -0
package/README.md
CHANGED
|
@@ -19,10 +19,56 @@ High-performance HTML to Markdown converter with full GitHub Flavored Markdown s
|
|
|
19
19
|
|
|
20
20
|
## Installation
|
|
21
21
|
|
|
22
|
+
### Node.js
|
|
23
|
+
|
|
22
24
|
```bash
|
|
23
25
|
npm install @vakra-dev/supermarkdown
|
|
24
26
|
```
|
|
25
27
|
|
|
28
|
+
### Rust
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
cargo add supermarkdown
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### CLI
|
|
35
|
+
|
|
36
|
+
Install the CLI binary via cargo:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
cargo install supermarkdown-cli
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Command Line Usage
|
|
43
|
+
|
|
44
|
+
The CLI converts HTML read from a file or piped in via stdin:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
# Convert a file
|
|
48
|
+
supermarkdown page.html > page.md
|
|
49
|
+
|
|
50
|
+
# Pipe HTML from curl
|
|
51
|
+
curl -s https://example.com | supermarkdown
|
|
52
|
+
|
|
53
|
+
# Exclude navigation and ads
|
|
54
|
+
supermarkdown --exclude "nav,.ad,#sidebar" page.html
|
|
55
|
+
|
|
56
|
+
# Use setext-style headings and referenced links
|
|
57
|
+
supermarkdown --heading-style setext --link-style referenced page.html
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### CLI Options
|
|
61
|
+
|
|
62
|
+
| Option | Description |
|
|
63
|
+
| ------ | ----------- |
|
|
64
|
+
| `-h, --help` | Print help message |
|
|
65
|
+
| `-v, --version` | Print version |
|
|
66
|
+
| `--heading-style <STYLE>` | `atx` (default) or `setext` |
|
|
67
|
+
| `--link-style <STYLE>` | `inline` (default) or `referenced` |
|
|
68
|
+
| `--code-fence <CHAR>` | `` ` `` (default) or `~` |
|
|
69
|
+
| `--bullet <CHAR>` | `-` (default), `*`, or `+` |
|
|
70
|
+
| `--exclude <SELECTORS>` | CSS selectors to exclude (comma-separated) |
|
|
71
|
+
|
|
26
72
|
## Quick Start
|
|
27
73
|
|
|
28
74
|
```javascript
|
|
@@ -40,6 +86,156 @@ console.log(markdown);
|
|
|
40
86
|
// This is a **test** with a [link](https://example.com).
|
|
41
87
|
```
|
|
42
88
|
|
|
89
|
+
## Common Use Cases
|
|
90
|
+
|
|
91
|
+
### Cleaning Web Scrapes
|
|
92
|
+
|
|
93
|
+
When scraping websites, HTML often contains navigation, ads, and other non-content elements. Use selectors to extract only what you need:
|
|
94
|
+
|
|
95
|
+
```javascript
|
|
96
|
+
import { convert } from "@vakra-dev/supermarkdown";
|
|
97
|
+
|
|
98
|
+
// Raw HTML from a web scrape
|
|
99
|
+
const scrapedHtml = await fetchPage("https://example.com/article");
|
|
100
|
+
|
|
101
|
+
// Clean conversion - remove nav, ads, sidebars
|
|
102
|
+
const markdown = convert(scrapedHtml, {
|
|
103
|
+
excludeSelectors: [
|
|
104
|
+
"nav",
|
|
105
|
+
"header",
|
|
106
|
+
"footer",
|
|
107
|
+
".sidebar",
|
|
108
|
+
".advertisement",
|
|
109
|
+
".cookie-banner",
|
|
110
|
+
".social-share",
|
|
111
|
+
".comments",
|
|
112
|
+
"script",
|
|
113
|
+
"style",
|
|
114
|
+
],
|
|
115
|
+
});
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Preparing Content for LLMs
|
|
119
|
+
|
|
120
|
+
When feeding web content to LLMs, you want clean, focused text without HTML artifacts:
|
|
121
|
+
|
|
122
|
+
```javascript
|
|
123
|
+
import { convert } from "@vakra-dev/supermarkdown";
|
|
124
|
+
|
|
125
|
+
// Extract just the article content for RAG pipelines
|
|
126
|
+
const markdown = convert(html, {
|
|
127
|
+
excludeSelectors: [
|
|
128
|
+
"nav",
|
|
129
|
+
"header",
|
|
130
|
+
"footer",
|
|
131
|
+
"aside",
|
|
132
|
+
".related-posts",
|
|
133
|
+
".author-bio",
|
|
134
|
+
],
|
|
135
|
+
includeSelectors: ["article", ".post-content", "main"],
|
|
136
|
+
});
|
|
137
|
+
|
|
138
|
+
// Now feed to your LLM
|
|
139
|
+
const response = await llm.chat({
|
|
140
|
+
messages: [
|
|
141
|
+
{
|
|
142
|
+
role: "user",
|
|
143
|
+
content: `Summarize this article:\n\n${markdown}`,
|
|
144
|
+
},
|
|
145
|
+
],
|
|
146
|
+
});
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Processing Blog Posts
|
|
150
|
+
|
|
151
|
+
Convert blog HTML while preserving code blocks and formatting:
|
|
152
|
+
|
|
153
|
+
```javascript
|
|
154
|
+
import { convert } from "@vakra-dev/supermarkdown";
|
|
155
|
+
|
|
156
|
+
const blogHtml = `
|
|
157
|
+
<article>
|
|
158
|
+
<h1>Getting Started with Rust</h1>
|
|
159
|
+
<p>Rust is a systems programming language focused on safety.</p>
|
|
160
|
+
<pre><code class="language-rust">fn main() {
|
|
161
|
+
println!("Hello, world!");
|
|
162
|
+
}</code></pre>
|
|
163
|
+
<p>The <code>println!</code> macro prints to stdout.</p>
|
|
164
|
+
</article>
|
|
165
|
+
`;
|
|
166
|
+
|
|
167
|
+
const markdown = convert(blogHtml);
|
|
168
|
+
// Output:
|
|
169
|
+
// # Getting Started with Rust
|
|
170
|
+
//
|
|
171
|
+
// Rust is a systems programming language focused on safety.
|
|
172
|
+
//
|
|
173
|
+
// ```rust
|
|
174
|
+
// fn main() {
|
|
175
|
+
// println!("Hello, world!");
|
|
176
|
+
// }
|
|
177
|
+
// ```
|
|
178
|
+
//
|
|
179
|
+
// The `println!` macro prints to stdout.
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### Converting Documentation Pages
|
|
183
|
+
|
|
184
|
+
Handle tables, definition lists, and nested structures common in docs:
|
|
185
|
+
|
|
186
|
+
```javascript
|
|
187
|
+
import { convert } from "@vakra-dev/supermarkdown";
|
|
188
|
+
|
|
189
|
+
const docsHtml = `
|
|
190
|
+
<h2>API Reference</h2>
|
|
191
|
+
<table>
|
|
192
|
+
<tr><th>Method</th><th>Description</th></tr>
|
|
193
|
+
<tr><td><code>convert()</code></td><td>Sync conversion</td></tr>
|
|
194
|
+
<tr><td><code>convertAsync()</code></td><td>Async conversion</td></tr>
|
|
195
|
+
</table>
|
|
196
|
+
<dl>
|
|
197
|
+
<dt>headingStyle</dt>
|
|
198
|
+
<dd>ATX (#) or Setext (underlines)</dd>
|
|
199
|
+
</dl>
|
|
200
|
+
`;
|
|
201
|
+
|
|
202
|
+
const markdown = convert(docsHtml);
|
|
203
|
+
// Output:
|
|
204
|
+
// ## API Reference
|
|
205
|
+
//
|
|
206
|
+
// | Method | Description |
|
|
207
|
+
// | --- | --- |
|
|
208
|
+
// | `convert()` | Sync conversion |
|
|
209
|
+
// | `convertAsync()` | Async conversion |
|
|
210
|
+
//
|
|
211
|
+
// headingStyle
|
|
212
|
+
// : ATX (#) or Setext (underlines)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Batch Processing
|
|
216
|
+
|
|
217
|
+
Process multiple documents efficiently with async conversion:
|
|
218
|
+
|
|
219
|
+
```javascript
|
|
220
|
+
import { convertAsync } from "@vakra-dev/supermarkdown";
|
|
221
|
+
|
|
222
|
+
const urls = [
|
|
223
|
+
"https://example.com/page1",
|
|
224
|
+
"https://example.com/page2",
|
|
225
|
+
"https://example.com/page3",
|
|
226
|
+
];
|
|
227
|
+
|
|
228
|
+
// Fetch and convert in parallel
|
|
229
|
+
const markdownDocs = await Promise.all(
|
|
230
|
+
urls.map(async (url) => {
|
|
231
|
+
const html = await fetch(url).then((r) => r.text());
|
|
232
|
+
return convertAsync(html, {
|
|
233
|
+
excludeSelectors: ["nav", "footer"],
|
|
234
|
+
});
|
|
235
|
+
})
|
|
236
|
+
);
|
|
237
|
+
```
|
|
238
|
+
|
|
43
239
|
## Usage
|
|
44
240
|
|
|
45
241
|
### Basic Conversion
|
|
@@ -288,6 +484,178 @@ Some HTML features cannot be fully represented in Markdown:
|
|
|
288
484
|
| CSS styling | Ignored (except `text-align` for tables) |
|
|
289
485
|
| Empty elements | Removed from output |
|
|
290
486
|
|
|
487
|
+
## Edge Cases
|
|
488
|
+
|
|
489
|
+
supermarkdown handles many edge cases gracefully:
|
|
490
|
+
|
|
491
|
+
### Malformed HTML
|
|
492
|
+
|
|
493
|
+
Invalid or malformed HTML is parsed via html5ever, which applies browser-like error recovery:
|
|
494
|
+
|
|
495
|
+
```javascript
|
|
496
|
+
// Missing closing tags, nested issues - all handled
|
|
497
|
+
const html = "<p>Unclosed paragraph<div>Mixed<p>nesting</div>";
|
|
498
|
+
const markdown = convert(html); // Produces sensible output
|
|
499
|
+
```
|
|
500
|
+
|
|
501
|
+
### Deeply Nested Lists
|
|
502
|
+
|
|
503
|
+
Nested lists maintain proper indentation:
|
|
504
|
+
|
|
505
|
+
```javascript
|
|
506
|
+
const html = `
|
|
507
|
+
<ul>
|
|
508
|
+
<li>Level 1
|
|
509
|
+
<ul>
|
|
510
|
+
<li>Level 2
|
|
511
|
+
<ul>
|
|
512
|
+
<li>Level 3</li>
|
|
513
|
+
</ul>
|
|
514
|
+
</li>
|
|
515
|
+
</ul>
|
|
516
|
+
</li>
|
|
517
|
+
</ul>`;
|
|
518
|
+
// Output:
|
|
519
|
+
// - Level 1
|
|
520
|
+
// - Level 2
|
|
521
|
+
// - Level 3
|
|
522
|
+
```
|
|
523
|
+
|
|
524
|
+
### Code Blocks with Backticks
|
|
525
|
+
|
|
526
|
+
When code contains backticks, the fence automatically uses more backticks:
|
|
527
|
+
|
|
528
|
+
```javascript
|
|
529
|
+
const html = "<pre><code>Use `backticks` for code</code></pre>";
|
|
530
|
+
// Output uses 4 backticks as fence:
|
|
531
|
+
// ````
|
|
532
|
+
// Use `backticks` for code
|
|
533
|
+
// ````
|
|
534
|
+
```
|
|
535
|
+
|
|
536
|
+
### Empty Elements
|
|
537
|
+
|
|
538
|
+
Empty paragraphs, divs, and spans are stripped to avoid blank lines:
|
|
539
|
+
|
|
540
|
+
```javascript
|
|
541
|
+
const html = "<p></p><p>Real content</p><p> </p>";
|
|
542
|
+
const markdown = convert(html);
|
|
543
|
+
// Output: "Real content" (empty paragraphs removed)
|
|
544
|
+
```
|
|
545
|
+
|
|
546
|
+
### Special Characters in URLs
|
|
547
|
+
|
|
548
|
+
Spaces, parentheses, and other special characters in URLs are percent-encoded:
|
|
549
|
+
|
|
550
|
+
```javascript
|
|
551
|
+
const html = '<a href="https://example.com/file (1).pdf">Download</a>';
|
|
552
|
+
// Output: [Download](https://example.com/file%20%281%29.pdf)
|
|
553
|
+
```
|
|
554
|
+
|
|
555
|
+
### Tables Without Headers
|
|
556
|
+
|
|
557
|
+
Tables missing `<thead>` use the first row as header:
|
|
558
|
+
|
|
559
|
+
```javascript
|
|
560
|
+
const html = `
|
|
561
|
+
<table>
|
|
562
|
+
<tr><td>A</td><td>B</td></tr>
|
|
563
|
+
<tr><td>1</td><td>2</td></tr>
|
|
564
|
+
</table>`;
|
|
565
|
+
// Output:
|
|
566
|
+
// | A | B |
|
|
567
|
+
// | --- | --- |
|
|
568
|
+
// | 1 | 2 |
|
|
569
|
+
```
|
|
570
|
+
|
|
571
|
+
### Mixed Content in Lists
|
|
572
|
+
|
|
573
|
+
List items with mixed block/inline content are handled:
|
|
574
|
+
|
|
575
|
+
```javascript
|
|
576
|
+
const html = `
|
|
577
|
+
<ul>
|
|
578
|
+
<li>Simple item</li>
|
|
579
|
+
<li>
|
|
580
|
+
<p>Paragraph in list</p>
|
|
581
|
+
<pre><code>code block</code></pre>
|
|
582
|
+
</li>
|
|
583
|
+
</ul>`;
|
|
584
|
+
// Outputs proper markdown with preserved formatting
|
|
585
|
+
```
|
|
586
|
+
|
|
587
|
+
## Troubleshooting
|
|
588
|
+
|
|
589
|
+
### Empty or Minimal Output
|
|
590
|
+
|
|
591
|
+
**Problem:** `convert()` returns empty string or very little content.
|
|
592
|
+
|
|
593
|
+
**Causes & Solutions:**
|
|
594
|
+
|
|
595
|
+
1. **Content is in excluded elements** - Check if your content is inside `nav`, `header`, etc. that might match default patterns
|
|
596
|
+
```javascript
|
|
597
|
+
// Try without selectors first
|
|
598
|
+
const markdown = convert(html);
|
|
599
|
+
```
|
|
600
|
+
|
|
601
|
+
2. **JavaScript-rendered content** - supermarkdown converts static HTML only. If the page uses client-side rendering, you need to render it first (e.g., with Puppeteer or Playwright)
|
|
602
|
+
|
|
603
|
+
3. **Content in iframes** - iframe content is not extracted. Fetch iframe src separately if needed
|
|
604
|
+
|
|
605
|
+
### Missing Code Block Language
|
|
606
|
+
|
|
607
|
+
**Problem:** Code blocks don't have language annotation.
|
|
608
|
+
|
|
609
|
+
**Solution:** supermarkdown looks for `language-*`, `lang-*`, or `highlight-*` class patterns. Ensure your HTML uses standard class naming:
|
|
610
|
+
|
|
611
|
+
```html
|
|
612
|
+
<!-- Detected -->
|
|
613
|
+
<pre><code class="language-python">...</code></pre>
|
|
614
|
+
<pre><code class="lang-js">...</code></pre>
|
|
615
|
+
|
|
616
|
+
<!-- Not detected -->
|
|
617
|
+
<pre><code class="python-code">...</code></pre>
|
|
618
|
+
```
|
|
619
|
+
|
|
620
|
+
### Tables Not Rendering Correctly
|
|
621
|
+
|
|
622
|
+
**Problem:** Tables appear as plain text or are malformed.
|
|
623
|
+
|
|
624
|
+
**Causes & Solutions:**
|
|
625
|
+
|
|
626
|
+
1. **Missing table structure** - Ensure proper `<table>`, `<tr>`, `<td>` structure
|
|
627
|
+
2. **Nested tables** - GFM doesn't support nested tables; inner tables are flattened
|
|
628
|
+
3. **colspan/rowspan** - These are not supported in GFM; the spanned content is placed in the first cell
|
|
629
|
+
|
|
630
|
+
### Links Missing or Broken
|
|
631
|
+
|
|
632
|
+
**Problem:** Links don't appear or have wrong URLs.
|
|
633
|
+
|
|
634
|
+
**Solutions:**
|
|
635
|
+
|
|
636
|
+
1. **Relative URLs** - Use `baseUrl` option to resolve relative links:
|
|
637
|
+
```javascript
|
|
638
|
+
convert(html, { baseUrl: "https://example.com" });
|
|
639
|
+
```
|
|
640
|
+
|
|
641
|
+
2. **Links in excluded elements** - Navigation links are often inside `<nav>`, which may be excluded
|
|
642
|
+
|
|
643
|
+
### Performance Issues with Large Documents
|
|
644
|
+
|
|
645
|
+
**Problem:** Conversion is slow for very large HTML files.
|
|
646
|
+
|
|
647
|
+
**Solutions:**
|
|
648
|
+
|
|
649
|
+
1. **Use async** - `convertAsync()` won't block the event loop
|
|
650
|
+
2. **Pre-filter HTML** - Remove obvious non-content before conversion
|
|
651
|
+
3. **Stream processing** - For very large docs, consider splitting the HTML into sections and converting each section separately
|
|
652
|
+
|
|
653
|
+
### Special Characters Appearing Wrong
|
|
654
|
+
|
|
655
|
+
**Problem:** Characters like `<`, `>`, `&` appear as entities.
|
|
656
|
+
|
|
657
|
+
**Solution:** This is usually correct behavior - these characters need escaping in markdown. If you're seeing `&amp;` where you expect `&`, the source HTML may have double-encoded entities.
|
|
658
|
+
|
|
291
659
|
## Rust Usage
|
|
292
660
|
|
|
293
661
|
Add to your `Cargo.toml`:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vakra-dev/supermarkdown",
|
|
3
|
-
"version": "0.0.3",
|
|
3
|
+
"version": "0.0.5",
|
|
4
4
|
"description": "High-performance HTML to Markdown converter with full GFM support",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"types": "index.d.ts",
|
|
@@ -62,13 +62,13 @@
|
|
|
62
62
|
"timeout": "3m"
|
|
63
63
|
},
|
|
64
64
|
"optionalDependencies": {
|
|
65
|
-
"@vakra-dev/supermarkdown-win32-x64-msvc": "0.0.
|
|
66
|
-
"@vakra-dev/supermarkdown-darwin-x64": "0.0.
|
|
67
|
-
"@vakra-dev/supermarkdown-linux-x64-gnu": "0.0.
|
|
68
|
-
"@vakra-dev/supermarkdown-linux-x64-musl": "0.0.
|
|
69
|
-
"@vakra-dev/supermarkdown-linux-arm64-gnu": "0.0.
|
|
70
|
-
"@vakra-dev/supermarkdown-linux-arm64-musl": "0.0.
|
|
71
|
-
"@vakra-dev/supermarkdown-darwin-arm64": "0.0.
|
|
72
|
-
"@vakra-dev/supermarkdown-win32-arm64-msvc": "0.0.
|
|
65
|
+
"@vakra-dev/supermarkdown-win32-x64-msvc": "0.0.5",
|
|
66
|
+
"@vakra-dev/supermarkdown-darwin-x64": "0.0.5",
|
|
67
|
+
"@vakra-dev/supermarkdown-linux-x64-gnu": "0.0.5",
|
|
68
|
+
"@vakra-dev/supermarkdown-linux-x64-musl": "0.0.5",
|
|
69
|
+
"@vakra-dev/supermarkdown-linux-arm64-gnu": "0.0.5",
|
|
70
|
+
"@vakra-dev/supermarkdown-linux-arm64-musl": "0.0.5",
|
|
71
|
+
"@vakra-dev/supermarkdown-darwin-arm64": "0.0.5",
|
|
72
|
+
"@vakra-dev/supermarkdown-win32-arm64-msvc": "0.0.5"
|
|
73
73
|
}
|
|
74
74
|
}
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|