html2llm 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +191 -161
- package/package.json +44 -44
- package/dist/browser.d.ts +0 -14
- package/dist/browser.js +0 -97
- package/dist/cli.d.ts +0 -2
- package/dist/cli.js +0 -45
- package/dist/html2llm.d.ts +0 -12
- package/dist/html2llm.js +0 -152
- package/dist/server.d.ts +0 -2
- package/dist/server.js +0 -58
package/README.md
CHANGED
|
@@ -1,161 +1,191 @@
|
|
|
1
|
-
# html2llm
|
|
2
|
-
|
|
3
|
-
Convert HTML to **CSX (Compact S-Expression)** — a token-efficient format for feeding web content to LLMs.
|
|
4
|
-
|
|
5
|
-
## Why CSX?
|
|
6
|
-
|
|
7
|
-
Raw HTML is token-expensive: every `<div class="wrapper">` pays the cost of angle brackets, tag names, attribute syntax, and closing tags. CSX strips all noise and encodes the same structure in a format LLMs understand natively.
|
|
8
|
-
|
|
9
|
-
| Format | Example |
|
|
10
|
-
|--------|---------|
|
|
11
|
-
| HTML | `<p class="lead">Hello <strong>world</strong></p>` |
|
|
12
|
-
| Hiccup | `[:p {:class "lead"} "Hello " [:strong "world"]]` |
|
|
13
|
-
| **CSX** | `(p.lead Hello (b world))` |
|
|
14
|
-
|
|
15
|
-
## Install
|
|
16
|
-
|
|
17
|
-
```bash
|
|
18
|
-
npm install html2llm
|
|
19
|
-
```
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
// (
|
|
41
|
-
|
|
42
|
-
//
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
//
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
-
|
|
114
|
-
-
|
|
115
|
-
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
```bash
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
#
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
##
|
|
160
|
-
|
|
161
|
-
|
|
1
|
+
# html2llm
|
|
2
|
+
|
|
3
|
+
Convert HTML to **CSX (Compact S-Expression)** — a token-efficient format for feeding web content to LLMs.
|
|
4
|
+
|
|
5
|
+
## Why CSX?
|
|
6
|
+
|
|
7
|
+
Raw HTML is token-expensive: every `<div class="wrapper">` pays the cost of angle brackets, tag names, attribute syntax, and closing tags. CSX strips all noise and encodes the same structure in a format LLMs understand natively.
|
|
8
|
+
|
|
9
|
+
| Format | Example |
|
|
10
|
+
|--------|---------|
|
|
11
|
+
| HTML | `<p class="lead">Hello <strong>world</strong></p>` |
|
|
12
|
+
| Hiccup | `[:p {:class "lead"} "Hello " [:strong "world"]]` |
|
|
13
|
+
| **CSX** | `(p.lead Hello (b world))` |
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
npm install html2llm
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
### As a module
|
|
24
|
+
|
|
25
|
+
```typescript
|
|
26
|
+
import { htmlToCSX, urlToCSX } from "html2llm";
|
|
27
|
+
|
|
28
|
+
// Convert HTML string to CSX
|
|
29
|
+
const html = `<article class="post">
|
|
30
|
+
<h1>Title</h1>
|
|
31
|
+
<p>This is <strong>important</strong> content.</p>
|
|
32
|
+
</article>`;
|
|
33
|
+
|
|
34
|
+
console.log(htmlToCSX(html));
|
|
35
|
+
// (article.post (h1 Title) (p "This is" (b important) content.))
|
|
36
|
+
|
|
37
|
+
console.log(htmlToCSX(html, { minify: false }));
|
|
38
|
+
// (article.post
|
|
39
|
+
// (h1 Title)
|
|
40
|
+
// (p "This is" (b important) content.))
|
|
41
|
+
|
|
42
|
+
// Fetch a URL and convert its HTML to CSX
|
|
43
|
+
const csx = await urlToCSX("https://example.com");
|
|
44
|
+
console.log(csx);
|
|
45
|
+
// (div (h1 "Example Domain") (p ...))
|
|
46
|
+
|
|
47
|
+
// Pretty-printed
|
|
48
|
+
const prettyCsx = await urlToCSX("https://example.com", { minify: false });
|
|
49
|
+
|
|
50
|
+
// Use headless Chromium to bypass JS anti-bot walls (zhihu, etc.)
|
|
51
|
+
const zhihuCsx = await urlToCSX("https://www.zhihu.com/question/...", {
|
|
52
|
+
headless: true,
|
|
53
|
+
});
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### As a CLI
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
# From file
|
|
60
|
+
html2llm page.html
|
|
61
|
+
|
|
62
|
+
# From URL (fetches and converts)
|
|
63
|
+
html2llm https://example.com
|
|
64
|
+
|
|
65
|
+
# From URL with headless browser (bypasses anti-bot JS challenges)
|
|
66
|
+
html2llm https://www.zhihu.com/question/... --headless
|
|
67
|
+
|
|
68
|
+
# From stdin
|
|
69
|
+
curl -s https://example.com | html2llm
|
|
70
|
+
|
|
71
|
+
# Pretty-printed
|
|
72
|
+
html2llm page.html --pretty
|
|
73
|
+
html2llm https://example.com --pretty
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### As a web service (like Jina)
|
|
77
|
+
|
|
78
|
+
Start the server:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
# Development (hot reload)
|
|
82
|
+
npm run dev:server
|
|
83
|
+
|
|
84
|
+
# Production
|
|
85
|
+
npm run build && npm start
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Then curl any URL to get CSX:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
# Fetch a page
|
|
92
|
+
curl "localhost:3000/https://example.com"
|
|
93
|
+
|
|
94
|
+
# Pretty-printed
|
|
95
|
+
curl "localhost:3000/https://example.com?pretty"
|
|
96
|
+
|
|
97
|
+
# Bypass JS anti-bot challenges with headless Chromium
|
|
98
|
+
curl "localhost:3000/https://www.zhihu.com/question/...?headless"
|
|
99
|
+
|
|
100
|
+
# Combine both
|
|
101
|
+
curl "localhost:3000/https://www.zhihu.com/question/...?headless&pretty"
|
|
102
|
+
|
|
103
|
+
# Auto-prepends https:// if you omit the protocol
|
|
104
|
+
curl "localhost:3000/example.com"
|
|
105
|
+
|
|
106
|
+
# Health check
|
|
107
|
+
curl "localhost:3000/health"
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
The server:
|
|
111
|
+
- Listens on `PORT` env (default 3000)
|
|
112
|
+
- Fetches URLs server-side with a 15s timeout
|
|
113
|
+
- Limits responses to 5 MB
|
|
114
|
+
- Returns CSX as `text/plain; charset=utf-8`
|
|
115
|
+
- Sets `X-Original-URL` response header
|
|
116
|
+
|
|
117
|
+
### Public instance
|
|
118
|
+
|
|
119
|
+
A hosted instance is available at **html2llm.cyncyn.xyz** — use it like `r.jina.ai`:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
# Fetch any webpage as CSX
|
|
123
|
+
curl "https://html2llm.cyncyn.xyz/https://example.com"
|
|
124
|
+
|
|
125
|
+
# Pretty-printed for readability
|
|
126
|
+
curl "https://html2llm.cyncyn.xyz/https://example.com?pretty"
|
|
127
|
+
|
|
128
|
+
# Feed to an LLM in one pipeline
|
|
129
|
+
curl -s "https://html2llm.cyncyn.xyz/https://example.com" | llm "summarize this page"
|
|
130
|
+
|
|
131
|
+
# Bypass JS anti-bot walls with headless Chromium
|
|
132
|
+
curl "https://html2llm.cyncyn.xyz/https://www.zhihu.com/question/...?headless"
|
|
133
|
+
|
|
134
|
+
# Omit https:// — auto-prepended
|
|
135
|
+
curl "https://html2llm.cyncyn.xyz/example.com"
|
|
136
|
+
|
|
137
|
+
# Health check
|
|
138
|
+
curl "https://html2llm.cyncyn.xyz/health"
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Query params:
|
|
142
|
+
|
|
143
|
+
| Param | Effect |
|
|
144
|
+
|---|---|
|
|
145
|
+
| `?pretty` | Indented, human-readable output |
|
|
146
|
+
| `?headless` | Use headless Chromium to bypass JS challenges (slower) |
|
|
147
|
+
| `?pretty&headless` | Combine both |
|
|
148
|
+
|
|
149
|
+
### Docker
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
docker build -t html2llm .
|
|
153
|
+
docker run -p 3000:3000 html2llm
|
|
154
|
+
|
|
155
|
+
# With custom port
|
|
156
|
+
docker run -p 8080:8080 -e PORT=8080 html2llm
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## CSX Format Rules
|
|
160
|
+
|
|
161
|
+
| Rule | Example |
|
|
162
|
+
|------|---------|
|
|
163
|
+
| Element | `(tag children...)` |
|
|
164
|
+
| ID shorthand | `(div#main ...)` |
|
|
165
|
+
| Class shorthand | `(div.card.dark ...)` |
|
|
166
|
+
| Attributes | `(img src=photo.jpg alt=photo)` |
|
|
167
|
+
| Quoted attr value | `(img alt="a nice photo")` |
|
|
168
|
+
| Single-word text | `(p Hello)` |
|
|
169
|
+
| Multi-word text | `(p "Hello world")` |
|
|
170
|
+
| Boolean attribute | `(input type=checkbox checked)` |
|
|
171
|
+
| Tag synonyms | `strong→b`, `em→i` |
|
|
172
|
+
|
|
173
|
+
**Stripped:** `script`, `style`, `svg`, `noscript`, `iframe`, `link`, `meta`, HTML comments, all non-semantic attributes.
|
|
174
|
+
|
|
175
|
+
**Flattened:** a bare `div`/`span` with no `id`, class, or kept attributes and exactly one child is dropped, and that child is hoisted into its place.
|
|
176
|
+
|
|
177
|
+
## Benchmark
|
|
178
|
+
|
|
179
|
+
Token counts approximated at 4 characters/token (GPT-family).
|
|
180
|
+
|
|
181
|
+
| URL | HTML tokens | CSX tokens | Reduction |
|
|
182
|
+
|-----|-------------|------------|-----------|
|
|
183
|
+
| [en.wikipedia.org/wiki/Large_language_model](https://en.wikipedia.org/wiki/Large_language_model) | 142,589 | 33,022 | 76.8% |
|
|
184
|
+
| [news.ycombinator.com](https://news.ycombinator.com) | 8,619 | 5,793 | 32.8% |
|
|
185
|
+
| [bbc.com/news](https://www.bbc.com/news) | 81,821 | 20,038 | 75.5% |
|
|
186
|
+
|
|
187
|
+
> Results from running `npm run benchmark` against live URLs on 2026-05-15.
|
|
188
|
+
|
|
189
|
+
## License
|
|
190
|
+
|
|
191
|
+
MIT
|
package/package.json
CHANGED
|
@@ -1,44 +1,44 @@
|
|
|
1
|
-
{
|
|
2
|
-
"name": "html2llm",
|
|
3
|
-
"version": "0.1.0",
|
|
4
|
-
"description": "Convert HTML to CSX (Compact S-Expression) for token-efficient LLM context",
|
|
5
|
-
"type": "module",
|
|
6
|
-
"main": "dist/html2llm.js",
|
|
7
|
-
"types": "dist/html2llm.d.ts",
|
|
8
|
-
"exports": {
|
|
9
|
-
".": "./dist/html2llm.js",
|
|
10
|
-
"./cli": "./dist/cli.js"
|
|
11
|
-
},
|
|
12
|
-
"files": [
|
|
13
|
-
"dist"
|
|
14
|
-
],
|
|
15
|
-
"bin": {
|
|
16
|
-
"html2llm": "./dist/cli.js"
|
|
17
|
-
},
|
|
18
|
-
"scripts": {
|
|
19
|
-
"build": "tsc",
|
|
20
|
-
"test": "vitest run",
|
|
21
|
-
"test:watch": "vitest",
|
|
22
|
-
"benchmark": "tsx tests/benchmark.ts",
|
|
23
|
-
"dev": "tsx src/cli.ts",
|
|
24
|
-
"start": "node dist/server.js",
|
|
25
|
-
"dev:server": "tsx watch src/server.ts",
|
|
26
|
-
"postinstall": "npx playwright install chromium"
|
|
27
|
-
},
|
|
28
|
-
"dependencies": {
|
|
29
|
-
"@hono/node-server": "^1.0.0",
|
|
30
|
-
"cheerio": "^1.0.0",
|
|
31
|
-
"got-scraping": "^4.0.0",
|
|
32
|
-
"hono": "^4.0.0",
|
|
33
|
-
"playwright": "^1.52.0",
|
|
34
|
-
"playwright-extra": "^4.3.0",
|
|
35
|
-
"puppeteer-extra-plugin-stealth": "^2.11.0"
|
|
36
|
-
},
|
|
37
|
-
"devDependencies": {
|
|
38
|
-
"@types/node": "^20.0.0",
|
|
39
|
-
"axios": "^1.6.0",
|
|
40
|
-
"tsx": "^4.0.0",
|
|
41
|
-
"typescript": "^5.0.0",
|
|
42
|
-
"vitest": "^1.0.0"
|
|
43
|
-
}
|
|
44
|
-
}
|
|
1
|
+
{
|
|
2
|
+
"name": "html2llm",
|
|
3
|
+
"version": "0.1.2",
|
|
4
|
+
"description": "Convert HTML to CSX (Compact S-Expression) for token-efficient LLM context",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/html2llm.js",
|
|
7
|
+
"types": "dist/html2llm.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": "./dist/html2llm.js",
|
|
10
|
+
"./cli": "./dist/cli.js"
|
|
11
|
+
},
|
|
12
|
+
"files": [
|
|
13
|
+
"dist"
|
|
14
|
+
],
|
|
15
|
+
"bin": {
|
|
16
|
+
"html2llm": "./dist/cli.js"
|
|
17
|
+
},
|
|
18
|
+
"scripts": {
|
|
19
|
+
"build": "tsc",
|
|
20
|
+
"test": "vitest run",
|
|
21
|
+
"test:watch": "vitest",
|
|
22
|
+
"benchmark": "tsx tests/benchmark.ts",
|
|
23
|
+
"dev": "tsx src/cli.ts",
|
|
24
|
+
"start": "node dist/server.js",
|
|
25
|
+
"dev:server": "tsx watch src/server.ts",
|
|
26
|
+
"postinstall": "npx playwright install chromium"
|
|
27
|
+
},
|
|
28
|
+
"dependencies": {
|
|
29
|
+
"@hono/node-server": "^1.0.0",
|
|
30
|
+
"cheerio": "^1.0.0",
|
|
31
|
+
"got-scraping": "^4.0.0",
|
|
32
|
+
"hono": "^4.0.0",
|
|
33
|
+
"playwright": "^1.52.0",
|
|
34
|
+
"playwright-extra": "^4.3.0",
|
|
35
|
+
"puppeteer-extra-plugin-stealth": "^2.11.0"
|
|
36
|
+
},
|
|
37
|
+
"devDependencies": {
|
|
38
|
+
"@types/node": "^20.0.0",
|
|
39
|
+
"axios": "^1.6.0",
|
|
40
|
+
"tsx": "^4.0.0",
|
|
41
|
+
"typescript": "^5.0.0",
|
|
42
|
+
"vitest": "^1.0.0"
|
|
43
|
+
}
|
|
44
|
+
}
|
package/dist/browser.d.ts
DELETED
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Fetch HTML from a URL using a headless Chromium browser with stealth patches.
|
|
3
|
-
* Handles JS challenge walls (e.g., zhihu) that got-scraping can't bypass.
|
|
4
|
-
*
|
|
5
|
-
* @param url - The URL to fetch
|
|
6
|
-
* @param timeoutMs - Page load timeout (default 30s)
|
|
7
|
-
* @returns The full HTML after JS execution
|
|
8
|
-
*/
|
|
9
|
-
export declare function getHTML(url: string, timeoutMs?: number): Promise<string>;
|
|
10
|
-
/**
|
|
11
|
-
* Gracefully close the browser. Safe to call multiple times.
|
|
12
|
-
* Registered on process exit signals.
|
|
13
|
-
*/
|
|
14
|
-
export declare function closeBrowser(): Promise<void>;
|
package/dist/browser.js
DELETED
|
@@ -1,97 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Headless browser manager — singleton Chromium instance with stealth plugin.
|
|
3
|
-
* Used as a fallback for sites that block got-scraping (JS challenge walls).
|
|
4
|
-
*/
|
|
5
|
-
import { chromium } from "playwright-extra";
|
|
6
|
-
import StealthPlugin from "puppeteer-extra-plugin-stealth";
|
|
7
|
-
// Apply stealth plugin once at module load
|
|
8
|
-
chromium.use(StealthPlugin());
|
|
9
|
-
let browser = null;
|
|
10
|
-
let launchPromise = null;
|
|
11
|
-
let shuttingDown = false;
|
|
12
|
-
const BROWSER_ARGS = [
|
|
13
|
-
"--no-sandbox",
|
|
14
|
-
"--disable-dev-shm-usage",
|
|
15
|
-
"--disable-gpu",
|
|
16
|
-
"--disable-setuid-sandbox",
|
|
17
|
-
"--single-process", // lighter weight for single-page fetches
|
|
18
|
-
];
|
|
19
|
-
async function getBrowser() {
|
|
20
|
-
if (shuttingDown) {
|
|
21
|
-
throw new Error("Browser is shutting down");
|
|
22
|
-
}
|
|
23
|
-
if (browser?.isConnected()) {
|
|
24
|
-
return browser;
|
|
25
|
-
}
|
|
26
|
-
// Avoid concurrent launches
|
|
27
|
-
if (!launchPromise) {
|
|
28
|
-
launchPromise = chromium.launch({
|
|
29
|
-
headless: true,
|
|
30
|
-
args: BROWSER_ARGS,
|
|
31
|
-
});
|
|
32
|
-
}
|
|
33
|
-
browser = await launchPromise;
|
|
34
|
-
launchPromise = null;
|
|
35
|
-
return browser;
|
|
36
|
-
}
|
|
37
|
-
async function getPage() {
|
|
38
|
-
const b = await getBrowser();
|
|
39
|
-
return b.newPage();
|
|
40
|
-
}
|
|
41
|
-
/**
|
|
42
|
-
* Fetch HTML from a URL using a headless Chromium browser with stealth patches.
|
|
43
|
-
* Handles JS challenge walls (e.g., zhihu) that got-scraping can't bypass.
|
|
44
|
-
*
|
|
45
|
-
* @param url - The URL to fetch
|
|
46
|
-
* @param timeoutMs - Page load timeout (default 30s)
|
|
47
|
-
* @returns The full HTML after JS execution
|
|
48
|
-
*/
|
|
49
|
-
export async function getHTML(url, timeoutMs = 30_000) {
|
|
50
|
-
let page = null;
|
|
51
|
-
try {
|
|
52
|
-
page = await getPage();
|
|
53
|
-
// Stealth extras: realistic viewport
|
|
54
|
-
await page.setViewportSize({ width: 1366, height: 768 });
|
|
55
|
-
await page.goto(url, {
|
|
56
|
-
waitUntil: "domcontentloaded",
|
|
57
|
-
timeout: timeoutMs,
|
|
58
|
-
});
|
|
59
|
-
// Wait a bit for JS-rendered content to appear (anti-bot challenges, SPAs)
|
|
60
|
-
await page.waitForTimeout(3000);
|
|
61
|
-
const html = await page.content();
|
|
62
|
-
// 5 MB guard
|
|
63
|
-
if (html.length > 5_000_000) {
|
|
64
|
-
throw new Error("Response too large (>5 MB)");
|
|
65
|
-
}
|
|
66
|
-
return html;
|
|
67
|
-
}
|
|
68
|
-
finally {
|
|
69
|
-
if (page) {
|
|
70
|
-
await page.close().catch(() => { });
|
|
71
|
-
}
|
|
72
|
-
}
|
|
73
|
-
}
|
|
74
|
-
/**
|
|
75
|
-
* Gracefully close the browser. Safe to call multiple times.
|
|
76
|
-
* Registered on process exit signals.
|
|
77
|
-
*/
|
|
78
|
-
export async function closeBrowser() {
|
|
79
|
-
shuttingDown = true;
|
|
80
|
-
if (browser) {
|
|
81
|
-
const b = browser;
|
|
82
|
-
browser = null;
|
|
83
|
-
await b.close().catch(() => { });
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
// Clean shutdown on exit
|
|
87
|
-
process.on("exit", () => {
|
|
88
|
-
if (browser) {
|
|
89
|
-
browser.close().catch(() => { });
|
|
90
|
-
}
|
|
91
|
-
});
|
|
92
|
-
process.on("SIGTERM", () => {
|
|
93
|
-
closeBrowser().then(() => process.exit(0));
|
|
94
|
-
});
|
|
95
|
-
process.on("SIGINT", () => {
|
|
96
|
-
closeBrowser().then(() => process.exit(0));
|
|
97
|
-
});
|
package/dist/cli.d.ts
DELETED
package/dist/cli.js
DELETED
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
import { readFileSync } from "fs";
|
|
3
|
-
import { htmlToCSX, urlToCSX } from "./html2llm.js";
|
|
4
|
-
import { closeBrowser } from "./browser.js";
|
|
5
|
-
const args = process.argv.slice(2);
|
|
6
|
-
const prettyFlag = args.includes("--pretty");
|
|
7
|
-
const headlessFlag = args.includes("--headless");
|
|
8
|
-
const fileArg = args.find((a) => !a.startsWith("--"));
|
|
9
|
-
function isURL(arg) {
|
|
10
|
-
return arg.startsWith("http://") || arg.startsWith("https://");
|
|
11
|
-
}
|
|
12
|
-
async function main() {
|
|
13
|
-
const options = { minify: !prettyFlag, headless: headlessFlag };
|
|
14
|
-
if (fileArg && isURL(fileArg)) {
|
|
15
|
-
const result = await urlToCSX(fileArg, options);
|
|
16
|
-
console.log(result);
|
|
17
|
-
return;
|
|
18
|
-
}
|
|
19
|
-
let html;
|
|
20
|
-
if (fileArg) {
|
|
21
|
-
try {
|
|
22
|
-
html = readFileSync(fileArg, "utf-8");
|
|
23
|
-
}
|
|
24
|
-
catch (e) {
|
|
25
|
-
console.error(`Error reading file: ${fileArg}: ${e.message}`);
|
|
26
|
-
process.exit(1);
|
|
27
|
-
}
|
|
28
|
-
}
|
|
29
|
-
else {
|
|
30
|
-
// Read from stdin
|
|
31
|
-
const chunks = [];
|
|
32
|
-
for await (const chunk of process.stdin) {
|
|
33
|
-
chunks.push(chunk);
|
|
34
|
-
}
|
|
35
|
-
html = Buffer.concat(chunks).toString("utf-8");
|
|
36
|
-
}
|
|
37
|
-
const result = htmlToCSX(html, options);
|
|
38
|
-
console.log(result);
|
|
39
|
-
}
|
|
40
|
-
main()
|
|
41
|
-
.then(() => closeBrowser())
|
|
42
|
-
.catch((e) => {
|
|
43
|
-
console.error(e);
|
|
44
|
-
closeBrowser().then(() => process.exit(1));
|
|
45
|
-
});
|
package/dist/html2llm.d.ts
DELETED
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
export interface CSXOptions {
|
|
2
|
-
minify?: boolean;
|
|
3
|
-
/** Use headless Chromium to bypass JS anti-bot challenges (slower but stronger) */
|
|
4
|
-
headless?: boolean;
|
|
5
|
-
}
|
|
6
|
-
export declare function htmlToCSX(html: string, options?: CSXOptions): string;
|
|
7
|
-
/**
|
|
8
|
-
* Fetch a URL and convert its HTML to CSX.
|
|
9
|
-
* Uses got-scraping under the hood for TLS fingerprint spoofing
|
|
10
|
-
* and browser-like headers to avoid 403 blocks.
|
|
11
|
-
*/
|
|
12
|
-
export declare function urlToCSX(url: string, options?: CSXOptions): Promise<string>;
|
package/dist/html2llm.js
DELETED
|
@@ -1,152 +0,0 @@
|
|
|
1
|
-
import * as cheerio from "cheerio";
|
|
2
|
-
import { gotScraping } from "got-scraping";
|
|
3
|
-
import { getHTML } from "./browser.js";
|
|
4
|
-
// Tags to remove entirely (with all children)
|
|
5
|
-
const STRIP_TAGS = new Set([
|
|
6
|
-
"script", "style", "svg", "noscript", "iframe", "link", "meta",
|
|
7
|
-
]);
|
|
8
|
-
// Tags to rename (synonym collapse)
|
|
9
|
-
const TAG_MAP = {
|
|
10
|
-
strong: "b",
|
|
11
|
-
em: "i",
|
|
12
|
-
};
|
|
13
|
-
// Attributes to keep (beyond id/class which become shorthand)
|
|
14
|
-
const KEEP_ATTRS = new Set(["href", "src", "alt", "title", "type"]);
|
|
15
|
-
// Boolean attributes to keep
|
|
16
|
-
const BOOL_ATTRS = new Set([
|
|
17
|
-
"checked", "disabled", "readonly", "required", "selected", "open",
|
|
18
|
-
]);
|
|
19
|
-
function isDataAttr(name) {
|
|
20
|
-
return name.startsWith("data-");
|
|
21
|
-
}
|
|
22
|
-
function normalizeTag(tag) {
|
|
23
|
-
return TAG_MAP[tag] ?? tag;
|
|
24
|
-
}
|
|
25
|
-
function getShorthand(el) {
|
|
26
|
-
const attribs = el.attribs ?? {};
|
|
27
|
-
const id = attribs["id"] ? `#${attribs["id"]}` : "";
|
|
28
|
-
const classes = attribs["class"]
|
|
29
|
-
? attribs["class"].trim().split(/\s+/).map((c) => `.${c}`).join("")
|
|
30
|
-
: "";
|
|
31
|
-
return id + classes;
|
|
32
|
-
}
|
|
33
|
-
function getAttrs(el) {
|
|
34
|
-
const attribs = el.attribs ?? {};
|
|
35
|
-
const parts = [];
|
|
36
|
-
for (const [name, value] of Object.entries(attribs)) {
|
|
37
|
-
if (name === "id" || name === "class")
|
|
38
|
-
continue; // handled in shorthand
|
|
39
|
-
if (BOOL_ATTRS.has(name) && (value === "" || value === name || value === "true")) {
|
|
40
|
-
parts.push(name);
|
|
41
|
-
}
|
|
42
|
-
else if (KEEP_ATTRS.has(name) || isDataAttr(name)) {
|
|
43
|
-
const needsQuotes = value.includes(" ") || value.includes("=") || value.includes("(") || value.includes(")") || value.includes("#");
|
|
44
|
-
const escapedValue = needsQuotes ? value.replace(/"/g, '\\"') : value;
|
|
45
|
-
parts.push(needsQuotes ? `${name}="${escapedValue}"` : `${name}=${value}`);
|
|
46
|
-
}
|
|
47
|
-
}
|
|
48
|
-
return parts.join(" ");
|
|
49
|
-
}
|
|
50
|
-
function serializeText(text) {
|
|
51
|
-
const trimmed = text.replace(/\s+/g, " ").trim();
|
|
52
|
-
if (!trimmed)
|
|
53
|
-
return "";
|
|
54
|
-
const escaped = trimmed.replace(/"/g, '\\"');
|
|
55
|
-
const needsQuotes = escaped.includes(" ") || escaped.includes("(") || escaped.includes(")") || trimmed.includes('"');
|
|
56
|
-
return needsQuotes ? `"${escaped}"` : escaped;
|
|
57
|
-
}
|
|
58
|
-
function convertNode(node, $, minify, depth = 0) {
|
|
59
|
-
// Strip HTML comments
|
|
60
|
-
if (node.type === "comment")
|
|
61
|
-
return null;
|
|
62
|
-
// Text nodes
|
|
63
|
-
if (node.type === "text") {
|
|
64
|
-
const text = node.data ?? "";
|
|
65
|
-
return serializeText(text) || null;
|
|
66
|
-
}
|
|
67
|
-
// Only process element (tag) nodes
|
|
68
|
-
if (node.type !== "tag")
|
|
69
|
-
return null;
|
|
70
|
-
const el = node;
|
|
71
|
-
const rawTag = el.name.toLowerCase();
|
|
72
|
-
// Strip noise tags entirely
|
|
73
|
-
if (STRIP_TAGS.has(rawTag))
|
|
74
|
-
return null;
|
|
75
|
-
const tag = normalizeTag(rawTag);
|
|
76
|
-
const shorthand = getShorthand(el);
|
|
77
|
-
const attrs = getAttrs(el);
|
|
78
|
-
// Check if this node is a candidate for flattening BEFORE recursing children
|
|
79
|
-
const isFlattenable = (rawTag === "div" || rawTag === "span") && !shorthand && !attrs;
|
|
80
|
-
const children = $(el)
|
|
81
|
-
.contents()
|
|
82
|
-
.toArray()
|
|
83
|
-
.map((child) => convertNode(child, $, minify, depth + 1))
|
|
84
|
-
.filter((s) => s !== null && s.length > 0);
|
|
85
|
-
// Structural flattening: skip bare div/span with exactly one child
|
|
86
|
-
if (isFlattenable && children.length === 1) {
|
|
87
|
-
if (minify)
|
|
88
|
-
return children[0];
|
|
89
|
-
// In pretty mode, depth matters — re-render at current depth (not depth+1)
|
|
90
|
-
const childNodes = $(el).contents().toArray();
|
|
91
|
-
for (const child of childNodes) {
|
|
92
|
-
const rendered = convertNode(child, $, minify, depth);
|
|
93
|
-
if (rendered)
|
|
94
|
-
return rendered;
|
|
95
|
-
}
|
|
96
|
-
}
|
|
97
|
-
const tagPart = tag + shorthand;
|
|
98
|
-
const attrPart = attrs ? ` ${attrs}` : "";
|
|
99
|
-
if (children.length === 0) {
|
|
100
|
-
return `(${tagPart}${attrPart})`;
|
|
101
|
-
}
|
|
102
|
-
if (minify) {
|
|
103
|
-
return `(${tagPart}${attrPart} ${children.join(" ")})`;
|
|
104
|
-
}
|
|
105
|
-
else {
|
|
106
|
-
const indent = " ".repeat(depth + 1);
|
|
107
|
-
const childStr = children.map((c) => `${indent}${c}`).join("\n");
|
|
108
|
-
return `(${tagPart}${attrPart}\n${childStr})`;
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
export function htmlToCSX(html, options) {
|
|
112
|
-
const minify = options?.minify ?? true;
|
|
113
|
-
let $;
|
|
114
|
-
try {
|
|
115
|
-
$ = cheerio.load(html);
|
|
116
|
-
}
|
|
117
|
-
catch {
|
|
118
|
-
$ = cheerio.load("");
|
|
119
|
-
}
|
|
120
|
-
const results = [];
|
|
121
|
-
$("body")
|
|
122
|
-
.contents()
|
|
123
|
-
.toArray()
|
|
124
|
-
.forEach((node) => {
|
|
125
|
-
const result = convertNode(node, $, minify, 0);
|
|
126
|
-
if (result)
|
|
127
|
-
results.push(result);
|
|
128
|
-
});
|
|
129
|
-
return results.join(minify ? " " : "\n");
|
|
130
|
-
}
|
|
131
|
-
/**
|
|
132
|
-
* Fetch a URL and convert its HTML to CSX.
|
|
133
|
-
* Uses got-scraping under the hood for TLS fingerprint spoofing
|
|
134
|
-
* and browser-like headers to avoid 403 blocks.
|
|
135
|
-
*/
|
|
136
|
-
export async function urlToCSX(url, options) {
|
|
137
|
-
const headless = options?.headless ?? false;
|
|
138
|
-
if (headless) {
|
|
139
|
-
const html = await getHTML(url);
|
|
140
|
-
return htmlToCSX(html, options);
|
|
141
|
-
}
|
|
142
|
-
const res = await gotScraping({
|
|
143
|
-
url,
|
|
144
|
-
responseType: "text",
|
|
145
|
-
timeout: { request: 15_000 },
|
|
146
|
-
retry: { limit: 1 },
|
|
147
|
-
});
|
|
148
|
-
if (res.body.length > 5_000_000) {
|
|
149
|
-
throw new Error("Response too large (>5 MB)");
|
|
150
|
-
}
|
|
151
|
-
return htmlToCSX(res.body, options);
|
|
152
|
-
}
|
package/dist/server.d.ts
DELETED
package/dist/server.js
DELETED
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
import { Hono } from "hono";
|
|
2
|
-
import { serve } from "@hono/node-server";
|
|
3
|
-
import { urlToCSX } from "./html2llm.js";
|
|
4
|
-
export const app = new Hono();
|
|
5
|
-
// Health check
|
|
6
|
-
app.get("/health", (c) => {
|
|
7
|
-
return c.json({ status: "ok" });
|
|
8
|
-
});
|
|
9
|
-
// Main endpoint: URL-as-path (like Jina)
|
|
10
|
-
// GET /https://example.com/page?pretty
|
|
11
|
-
app.get("/*", async (c) => {
|
|
12
|
-
const rawUrl = c.req.path.slice(1); // remove leading "/"
|
|
13
|
-
if (!rawUrl) {
|
|
14
|
-
return c.text("html2llm — Convert any webpage to CSX for LLM context.\n\n" +
|
|
15
|
-
"Usage: GET /https://example.com/page\n\n" +
|
|
16
|
-
"Query params:\n" +
|
|
17
|
-
" ?pretty — pretty-printed output for debugging\n" +
|
|
18
|
-
" ?headless — use headless Chromium to bypass JS anti-bot challenges\n", 400);
|
|
19
|
-
}
|
|
20
|
-
// Auto-prepend https:// if no protocol specified
|
|
21
|
-
let url;
|
|
22
|
-
try {
|
|
23
|
-
url = rawUrl.startsWith("http://") || rawUrl.startsWith("https://")
|
|
24
|
-
? rawUrl
|
|
25
|
-
: `https://${rawUrl}`;
|
|
26
|
-
new URL(url); // validate
|
|
27
|
-
}
|
|
28
|
-
catch {
|
|
29
|
-
return c.text(`Invalid URL: ${rawUrl}`, 400);
|
|
30
|
-
}
|
|
31
|
-
const pretty = c.req.query("pretty") !== undefined;
|
|
32
|
-
const headless = c.req.query("headless") !== undefined;
|
|
33
|
-
try {
|
|
34
|
-
const csx = await urlToCSX(url, { minify: !pretty, headless });
|
|
35
|
-
return new Response(csx, {
|
|
36
|
-
status: 200,
|
|
37
|
-
headers: {
|
|
38
|
-
"Content-Type": "text/plain; charset=utf-8",
|
|
39
|
-
"X-Original-URL": url,
|
|
40
|
-
},
|
|
41
|
-
});
|
|
42
|
-
}
|
|
43
|
-
catch (err) {
|
|
44
|
-
const name = err.name;
|
|
45
|
-
if (name === "TimeoutError") {
|
|
46
|
-
return c.text(`Request timed out (15s): ${url}`, 504);
|
|
47
|
-
}
|
|
48
|
-
if (name === "HTTPError") {
|
|
49
|
-
const httpErr = err;
|
|
50
|
-
return c.text(`Failed to fetch URL: ${url} (HTTP ${httpErr.response.statusCode})`, 502);
|
|
51
|
-
}
|
|
52
|
-
return c.text(`Failed to fetch URL: ${url}\n${err.message}`, 502);
|
|
53
|
-
}
|
|
54
|
-
});
|
|
55
|
-
// ---- Entry point ----
|
|
56
|
-
const port = parseInt(process.env.PORT || "3000");
|
|
57
|
-
serve({ fetch: app.fetch, port });
|
|
58
|
-
console.log(`html2llm server running on http://localhost:${port}`);
|