@qretaio/html2json 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +185 -0
- package/package.json +49 -0
package/README.md
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# html2json
|
|
2
|
+
|
|
3
|
+
A Rust port of
|
|
4
|
+
[cheerio-json-mapper](https://github.com/denkan/cheerio-json-mapper).
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## Overview
|
|
9
|
+
|
|
10
|
+
- **Input:** HTML source + Extractor spec (JSON)
|
|
11
|
+
- **Output:** JSON matching the structure defined in the spec
|
|
12
|
+
- **Available as:** Rust crate, CLI tool, and WebAssembly npm package
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
### npm / WebAssembly
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
npm install @qretaio/html2json
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### From crates.io (Rust)
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
cargo install html2json --features cli
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### From source
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
cargo install --path . --features cli
|
|
32
|
+
# or from a git repository
|
|
33
|
+
cargo install --git https://github.com/qretaio/html2json --features cli
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### Using just
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
just install
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
### JavaScript / TypeScript
|
|
45
|
+
|
|
46
|
+
```javascript
|
|
47
|
+
import { extract } from '@qretaio/html2json';
|
|
48
|
+
|
|
49
|
+
const html = `
|
|
50
|
+
<article class="post">
|
|
51
|
+
<h2>My Article</h2>
|
|
52
|
+
<p class="author">John Doe</p>
|
|
53
|
+
<div class="tags">
|
|
54
|
+
<span>rust</span>
|
|
55
|
+
<span>wasm</span>
|
|
56
|
+
</div>
|
|
57
|
+
</article>
|
|
58
|
+
`;
|
|
59
|
+
|
|
60
|
+
const spec = JSON.stringify({
|
|
61
|
+
title: "h2",
|
|
62
|
+
author: ".author",
|
|
63
|
+
tags: [{
|
|
64
|
+
"$": ".tags span",
|
|
65
|
+
"name": "$"
|
|
66
|
+
}]
|
|
67
|
+
});
|
|
68
|
+
|
|
69
|
+
const result = extract(html, spec);
|
|
70
|
+
console.log(result);
|
|
71
|
+
// {
|
|
72
|
+
// "title": "My Article",
|
|
73
|
+
// "author": "John Doe",
|
|
74
|
+
// "tags": [{"name": "rust"}, {"name": "wasm"}]
|
|
75
|
+
// }
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### CLI
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
# Extract from file
|
|
82
|
+
html2json examples/hn.html --spec examples/hn.json
|
|
83
|
+
|
|
84
|
+
# Extract from stdin (pipe from curl)
|
|
85
|
+
curl -s https://news.ycombinator.com/ | html2json --spec examples/hn.json
|
|
86
|
+
|
|
87
|
+
# Extract from stdin (pipe from cat)
|
|
88
|
+
cat examples/hn.html | html2json --spec examples/hn.json
|
|
89
|
+
|
|
90
|
+
# Check output matches expected JSON (useful for testing/CI)
|
|
91
|
+
html2json examples/hn.html --spec examples/hn.json --check expected.json
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### CLI Options
|
|
95
|
+
|
|
96
|
+
- `--spec, -s <FILE>` - Path to JSON extractor spec file (required)
|
|
97
|
+
- `--check, -c <FILE>` - Compare the output against an expected JSON file. Exits with 0 if they match, or 1 if they differ (printing a colored diff).
|
|
98
|
+
|
|
99
|
+
## Spec Format
|
|
100
|
+
|
|
101
|
+
The spec is a JSON object where each key defines an output field and each value defines a CSS selector to extract that field.
|
|
102
|
+
|
|
103
|
+
### Basic Selectors
|
|
104
|
+
|
|
105
|
+
```json
|
|
106
|
+
{
|
|
107
|
+
"title": "h1",
|
|
108
|
+
"description": "p.description"
|
|
109
|
+
}
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Attributes
|
|
113
|
+
|
|
114
|
+
```json
|
|
115
|
+
{
|
|
116
|
+
"link": "a.main | attr:href",
|
|
117
|
+
"image": "img.hero | attr:src"
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Pipes (Transformations)
|
|
122
|
+
|
|
123
|
+
```json
|
|
124
|
+
{
|
|
125
|
+
"title": "h1 | trim",
|
|
126
|
+
"slug": "h1 | lower | regex:\\s+-",
|
|
127
|
+
  "price": ".price | regex:\\$(\\d+\\.\\d+) | parseAs:float"
|
|
128
|
+
}
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Available pipes:
|
|
132
|
+
- `trim` - Trim whitespace
|
|
133
|
+
- `lower` - Convert to lowercase
|
|
134
|
+
- `upper` - Convert to uppercase
|
|
135
|
+
- `substr:start:end` - Extract substring
|
|
136
|
+
- `regex:pattern` - Regex capture (first group)
|
|
137
|
+
- `parseAs:int` - Parse as integer
|
|
138
|
+
- `parseAs:float` - Parse as float
|
|
139
|
+
- `attr:name` - Get attribute value
|
|
140
|
+
- `void` - Extract from void elements (useful when extracting XML)
|
|
141
|
+
|
|
142
|
+
### Collections (Arrays)
|
|
143
|
+
|
|
144
|
+
```json
|
|
145
|
+
{
|
|
146
|
+
"items": [{
|
|
147
|
+
"$": ".item",
|
|
148
|
+
"title": "h2",
|
|
149
|
+
"description": "p"
|
|
150
|
+
}]
|
|
151
|
+
}
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Scoping (`$` selector)
|
|
155
|
+
|
|
156
|
+
```json
|
|
157
|
+
{
|
|
158
|
+
"$": "article",
|
|
159
|
+
"title": "h1",
|
|
160
|
+
"paragraphs": ["p"]
|
|
161
|
+
}
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### Fallback Operators (`||`)
|
|
165
|
+
|
|
166
|
+
```json
|
|
167
|
+
{
|
|
168
|
+
"title": "h1.main || h1.fallback || h1"
|
|
169
|
+
}
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### Optional Fields (`?`)
|
|
173
|
+
|
|
174
|
+
```json
|
|
175
|
+
{
|
|
176
|
+
"title": "h1",
|
|
177
|
+
"description?": "p.description"
|
|
178
|
+
}
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
Optional fields that return `null` are removed from the output.
|
|
182
|
+
|
|
183
|
+
## LICENSE
|
|
184
|
+
|
|
185
|
+
MIT
|
package/package.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@qretaio/html2json",
|
|
3
|
+
"version": "0.5.0",
|
|
4
|
+
"description": "HTML to JSON extractor using WebAssembly - Fast, powerful HTML parsing with CSS selectors",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"html",
|
|
7
|
+
"json",
|
|
8
|
+
"parser",
|
|
9
|
+
"scraper",
|
|
10
|
+
"web-scraping",
|
|
11
|
+
"css-selector",
|
|
12
|
+
"html-parser",
|
|
13
|
+
"wasm",
|
|
14
|
+
"webassembly",
|
|
15
|
+
"extraction",
|
|
16
|
+
"cheerio"
|
|
17
|
+
],
|
|
18
|
+
"license": "MIT",
|
|
19
|
+
"author": "Qreta Dev <qretadev@gmail.com>",
|
|
20
|
+
"repository": {
|
|
21
|
+
"type": "git",
|
|
22
|
+
"url": "git+https://github.com/qretaio/html2json.git"
|
|
23
|
+
},
|
|
24
|
+
"homepage": "https://github.com/qretaio/html2json#readme",
|
|
25
|
+
"bugs": {
|
|
26
|
+
"url": "https://github.com/qretaio/html2json/issues"
|
|
27
|
+
},
|
|
28
|
+
"main": "pkg/html2json.js",
|
|
29
|
+
"browser": "pkg/html2json.js",
|
|
30
|
+
"types": "pkg/html2json.d.ts",
|
|
31
|
+
"files": [
|
|
32
|
+
"pkg/*",
|
|
33
|
+
"README.md"
|
|
34
|
+
],
|
|
35
|
+
"scripts": {
|
|
36
|
+
"build": "wasm-pack build --dev --target web --out-name html2json .",
|
|
37
|
+
"build:release": "wasm-pack build --release --target web --out-name html2json .",
|
|
38
|
+
"build:node": "wasm-pack build --dev --target nodejs --out-name html2json .",
|
|
39
|
+
"build:node:release": "wasm-pack build --release --target nodejs --out-name html2json .",
|
|
40
|
+
"test": "cargo test",
|
|
41
|
+
"prepublishOnly": "npm run build:release"
|
|
42
|
+
},
|
|
43
|
+
"engines": {
|
|
44
|
+
"node": ">=16"
|
|
45
|
+
},
|
|
46
|
+
"devDependencies": {
|
|
47
|
+
"wasm-pack": "^0.13.0"
|
|
48
|
+
}
|
|
49
|
+
}
|