@qretaio/html2json 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/README.md +185 -0
  2. package/package.json +49 -0
package/README.md ADDED
@@ -0,0 +1,185 @@
1
+ # html2json
2
+
3
+ A Rust port of
4
+ [cheerio-json-mapper](https://github.com/denkan/cheerio-json-mapper).
5
+
6
+ ---
7
+
8
+ ## Overview
9
+
10
+ - **Input:** HTML source + Extractor spec (JSON)
11
+ - **Output:** JSON matching the structure defined in the spec
12
+ - **Available as:** Rust crate, CLI tool, and WebAssembly npm package
13
+
14
+ ## Installation
15
+
16
+ ### npm / WebAssembly
17
+
18
+ ```bash
19
+ npm install @qretaio/html2json
20
+ ```
21
+
22
+ ### From crates.io (Rust)
23
+
24
+ ```bash
25
+ cargo install html2json --features cli
26
+ ```
27
+
28
+ ### From source
29
+
30
+ ```bash
31
+ cargo install --path . --features cli
32
+ # or from a git repository
33
+ cargo install --git https://github.com/qretaio/html2json --features cli
34
+ ```
35
+
36
+ ### Using just
37
+
38
+ ```bash
39
+ just install
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ ### JavaScript / TypeScript
45
+
46
+ ```javascript
47
+ import { extract } from '@qretaio/html2json';
48
+
49
+ const html = `
50
+ <article class="post">
51
+ <h2>My Article</h2>
52
+ <p class="author">John Doe</p>
53
+ <div class="tags">
54
+ <span>rust</span>
55
+ <span>wasm</span>
56
+ </div>
57
+ </article>
58
+ `;
59
+
60
+ const spec = JSON.stringify({
61
+ title: "h2",
62
+ author: ".author",
63
+ tags: [{
64
+ "$": ".tags span",
65
+ "name": "$"
66
+ }]
67
+ });
68
+
69
+ const result = extract(html, spec);
70
+ console.log(result);
71
+ // {
72
+ // "title": "My Article",
73
+ // "author": "John Doe",
74
+ // "tags": [{"name": "rust"}, {"name": "wasm"}]
75
+ // }
76
+ ```
77
+
78
+ ### CLI
79
+
80
+ ```bash
81
+ # Extract from file
82
+ html2json examples/hn.html --spec examples/hn.json
83
+
84
+ # Extract from stdin (pipe from curl)
85
+ curl -s https://news.ycombinator.com/ | html2json --spec examples/hn.json
86
+
87
+ # Extract from stdin (pipe from cat)
88
+ cat examples/hn.html | html2json --spec examples/hn.json
89
+
90
+ # Check output matches expected JSON (useful for testing/CI)
91
+ html2json examples/hn.html --spec examples/hn.json --check expected.json
92
+ ```
93
+
94
+ ### CLI Options
95
+
96
+ - `--spec, -s <FILE>` - Path to JSON extractor spec file (required)
97
+ - `--check, -c <FILE>` - Compare the output against an expected JSON file. Exits with 0 if they match, or 1 if they differ (printing a colored diff).
98
+
99
+ ## Spec Format
100
+
101
+ The spec is a JSON object in which each key names an output field and each value describes how to extract it: a CSS selector (optionally followed by pipes), or a nested object or array as shown in the sections below.
102
+
103
+ ### Basic Selectors
104
+
105
+ ```json
106
+ {
107
+ "title": "h1",
108
+ "description": "p.description"
109
+ }
110
+ ```
111
+
112
+ ### Attributes
113
+
114
+ ```json
115
+ {
116
+ "link": "a.main | attr:href",
117
+ "image": "img.hero | attr:src"
118
+ }
119
+ ```
120
+
121
+ ### Pipes (Transformations)
122
+
123
+ ```json
124
+ {
125
+ "title": "h1 | trim",
126
+ "slug": "h1 | lower | regex:\\s+-",
127
+ "price": ".price | regex:\\$(\\d+\\.\\d+) | parseAs:float"
128
+ }
129
+ ```
130
+
131
+ Available pipes:
132
+ - `trim` - Trim whitespace
133
+ - `lower` - Convert to lowercase
134
+ - `upper` - Convert to uppercase
135
+ - `substr:start:end` - Extract substring
136
+ - `regex:pattern` - Regex capture (first group)
137
+ - `parseAs:int` - Parse as integer
138
+ - `parseAs:float` - Parse as float
139
+ - `attr:name` - Get attribute value
140
+ - `void` - Extract from void elements; useful when extracting from XML
141
+
142
+ ### Collections (Arrays)
143
+
144
+ ```json
145
+ {
146
+ "items": [{
147
+ "$": ".item",
148
+ "title": "h2",
149
+ "description": "p"
150
+ }]
151
+ }
152
+ ```
153
+
154
+ ### Scoping (`$` selector)
155
+
156
+ ```json
157
+ {
158
+ "$": "article",
159
+ "title": "h1",
160
+ "paragraphs": ["p"]
161
+ }
162
+ ```
163
+
164
+ ### Fallback Operators (`||`)
165
+
166
+ ```json
167
+ {
168
+ "title": "h1.main || h1.fallback || h1"
169
+ }
170
+ ```
171
+
172
+ ### Optional Fields (`?`)
173
+
174
+ ```json
175
+ {
176
+ "title": "h1",
177
+ "description?": "p.description"
178
+ }
179
+ ```
180
+
181
+ Optional fields that return `null` are removed from the output.
182
+
183
+ ## LICENSE
184
+
185
+ MIT
package/package.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "name": "@qretaio/html2json",
3
+ "version": "0.5.0",
4
+ "description": "HTML to JSON extractor using WebAssembly - Fast, powerful HTML parsing with CSS selectors",
5
+ "keywords": [
6
+ "html",
7
+ "json",
8
+ "parser",
9
+ "scraper",
10
+ "web-scraping",
11
+ "css-selector",
12
+ "html-parser",
13
+ "wasm",
14
+ "webassembly",
15
+ "extraction",
16
+ "cheerio"
17
+ ],
18
+ "license": "MIT",
19
+ "author": "Qreta Dev <qretadev@gmail.com>",
20
+ "repository": {
21
+ "type": "git",
22
+ "url": "git+https://github.com/qretaio/html2json.git"
23
+ },
24
+ "homepage": "https://github.com/qretaio/html2json#readme",
25
+ "bugs": {
26
+ "url": "https://github.com/qretaio/html2json/issues"
27
+ },
28
+ "main": "pkg/html2json.js",
29
+ "browser": "pkg/html2json.js",
30
+ "types": "pkg/html2json.d.ts",
31
+ "files": [
32
+ "pkg/*",
33
+ "README.md"
34
+ ],
35
+ "scripts": {
36
+ "build": "wasm-pack build --dev --target web --out-name html2json .",
37
+ "build:release": "wasm-pack build --release --target web --out-name html2json .",
38
+ "build:node": "wasm-pack build --dev --target nodejs --out-name html2json .",
39
+ "build:node:release": "wasm-pack build --release --target nodejs --out-name html2json .",
40
+ "test": "cargo test",
41
+ "prepublishOnly": "npm run build:release"
42
+ },
43
+ "engines": {
44
+ "node": ">=16"
45
+ },
46
+ "devDependencies": {
47
+ "wasm-pack": "^0.13.0"
48
+ }
49
+ }