@thi.ng/hiccup-html-parse 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -1
- package/README.md +17 -13
- package/index.d.ts +17 -3
- package/index.js +18 -8
- package/package.json +10 -10
package/CHANGELOG.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Change Log
|
|
2
2
|
|
|
3
|
-
- **Last updated**: 2023-
|
|
3
|
+
- **Last updated**: 2023-10-23T07:37:37Z
|
|
4
4
|
- **Generator**: [thi.ng/monopub](https://thi.ng/monopub)
|
|
5
5
|
|
|
6
6
|
All notable changes to this project will be documented in this file.
|
|
@@ -9,6 +9,16 @@ See [Conventional Commits](https://conventionalcommits.org/) for commit guidelin
|
|
|
9
9
|
**Note:** Unlisted _patch_ versions only involve non-code or otherwise excluded changes
|
|
10
10
|
and/or version bumps of transitive dependencies.
|
|
11
11
|
|
|
12
|
+
## [0.3.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/hiccup-html-parse@0.3.0) (2023-09-28)
|
|
13
|
+
|
|
14
|
+
#### 🚀 Features
|
|
15
|
+
|
|
16
|
+
- update grammar, add opts ([a43142d](https://github.com/thi-ng/umbrella/commit/a43142d))
|
|
17
|
+
- update grammar to improve doctype & void tag handling
|
|
18
|
+
- add new ParseOpts: collapse & unescape
|
|
19
|
+
- increase default maxDepth to 128
|
|
20
|
+
- add/update tests
|
|
21
|
+
|
|
12
22
|
## [0.2.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/hiccup-html-parse@0.2.0) (2023-09-19)
|
|
13
23
|
|
|
14
24
|
#### 🚀 Features
|
package/README.md
CHANGED
|
@@ -27,8 +27,8 @@ This project is part of the
|
|
|
27
27
|
|
|
28
28
|
Well-formed HTML parsing and customizable transformation to nested JS arrays in [@thi.ng/hiccup](https://github.com/thi-ng/umbrella/tree/develop/packages/hiccup) format.
|
|
29
29
|
|
|
30
|
-
Note: This parser is intended to work with
|
|
31
|
-
any "quirky" (aka dodgy) markup...
|
|
30
|
+
Note: This parser is intended to work with well-formed HTML and will likely fail
|
|
31
|
+
for any "quirky" (aka malformed/dodgy) markup...
|
|
32
32
|
|
|
33
33
|
### Basic usage
|
|
34
34
|
|
|
@@ -78,16 +78,20 @@ console.log(result.result);
|
|
|
78
78
|
Parser behavior & results can be customized via supplied options and user
|
|
79
79
|
transformation functions:
|
|
80
80
|
|
|
81
|
-
| Option | Description
|
|
82
|
-
|
|
83
|
-
| `ignoreElements` | Array of element names to ignore
|
|
84
|
-
| `ignoreAttribs` | Array of attribute names to ignore
|
|
85
|
-
| `
|
|
86
|
-
| `
|
|
87
|
-
| `
|
|
88
|
-
| `
|
|
89
|
-
| `
|
|
90
|
-
| `
|
|
81
|
+
| Option | Description | Default |
|
|
82
|
+
|------------------|-----------------------------------------------------|---------|
|
|
83
|
+
| `ignoreElements` | Array of element names to ignore | [] |
|
|
84
|
+
| `ignoreAttribs` | Array of attribute names to ignore | [] |
|
|
85
|
+
| `dataAttribs` | Keep data attribs | true |
|
|
86
|
+
| `comments` | Keep `<!-- ... -->` comments | false |
|
|
87
|
+
| `doctype` | Keep `<!doctype ...>` element | false |
|
|
88
|
+
| `whitespace` | Keep whitespace-only text bodies | false |
|
|
89
|
+
| `collapse` | Collapse whitespace<sup>(1)</sup> | true |
|
|
90
|
+
| `unescape` | Replace named & numeric HTML entities<sup>(1)</sup> | true |
|
|
91
|
+
| `tx` | Element transform/filter function | |
|
|
92
|
+
| `txBody` | Plain text transform/filter function | |
|
|
93
|
+
|
|
94
|
+
- (1) - Not applied inside CDATA content sections, i.e. the bodies of `<script>` or `<style>` elements
|
|
91
95
|
|
|
92
96
|
## Status
|
|
93
97
|
|
|
@@ -121,7 +125,7 @@ For Node.js REPL:
|
|
|
121
125
|
const hiccupHtmlParse = await import("@thi.ng/hiccup-html-parse");
|
|
122
126
|
```
|
|
123
127
|
|
|
124
|
-
Package sizes (brotli'd, pre-treeshake): ESM: 1.
|
|
128
|
+
Package sizes (brotli'd, pre-treeshake): ESM: 1.18 KB
|
|
125
129
|
|
|
126
130
|
## Dependencies
|
|
127
131
|
|
package/index.d.ts
CHANGED
|
@@ -9,6 +9,12 @@ export interface ParseOpts {
|
|
|
9
9
|
* Array of attribute names to ignore.
|
|
10
10
|
*/
|
|
11
11
|
ignoreAttribs: string[];
|
|
12
|
+
/**
|
|
13
|
+
* Keep data attribs.
|
|
14
|
+
*
|
|
15
|
+
* @defaultValue true
|
|
16
|
+
*/
|
|
17
|
+
dataAttribs: boolean;
|
|
12
18
|
/**
|
|
13
19
|
* Keep `<!doctype ...>` element.
|
|
14
20
|
*
|
|
@@ -22,11 +28,19 @@ export interface ParseOpts {
|
|
|
22
28
|
*/
|
|
23
29
|
whitespace: boolean;
|
|
24
30
|
/**
|
|
25
|
-
*
|
|
31
|
+
* If enabled, collapses all whitespace to single space (`\u0020`)
|
|
32
|
+
* characters.
|
|
26
33
|
*
|
|
27
34
|
* @defaultValue true
|
|
28
35
|
*/
|
|
29
|
-
|
|
36
|
+
collapse: boolean;
|
|
37
|
+
/**
|
|
38
|
+
* If enabled, unescapes known named and numeric HTML entities (i.e.
|
|
39
|
+
* replaces them with their original characters).
|
|
40
|
+
*
|
|
41
|
+
* @defaultValue true
|
|
42
|
+
*/
|
|
43
|
+
unescape: boolean;
|
|
30
44
|
/**
|
|
31
45
|
* Keep comments.
|
|
32
46
|
*
|
|
@@ -49,7 +63,7 @@ export interface ParseOpts {
|
|
|
49
63
|
* Parser's internal max recursion limit. Parsing will terminate once this
|
|
50
64
|
* limit is reached.
|
|
51
65
|
*
|
|
52
|
-
* @defaultValue
|
|
66
|
+
* @defaultValue 128
|
|
53
67
|
*/
|
|
54
68
|
maxDepth: number;
|
|
55
69
|
/**
|
package/index.js
CHANGED
|
@@ -4,7 +4,7 @@ import { defGrammar } from "@thi.ng/parse/grammar";
|
|
|
4
4
|
import { unescapeEntities } from "@thi.ng/strings/entities";
|
|
5
5
|
// HTML parse grammar rules (see: thi.ng/parse readme for details)
|
|
6
6
|
// playground URL:
|
|
7
|
-
// https://demo.thi.ng/umbrella/parse-playground/#
|
|
7
|
+
// https://demo.thi.ng/umbrella/parse-playground/#l9oD3G5vZGU6ICc8JyEgKDxjb21tZW50PiB8IDxjZGF0YV9lbD4gfCA8dm9pZF9lbD4gfCA8ZWw-KSA7CmVsOiA8bmFtZT4gPGF0dHJpYj4qICg8ZWxfYm9keT4gfCA8ZWxfY2xvc2U-ISApIDsKZWxfYm9keTogPFdTMD4gJz4nISAoPGJvZHk-IHwgPG5vZGU-KSogIjwvIiEgPG5hbWU-ISA8V1MwPiAnPichID0-IGhvaXN0IDsKZWxfY2xvc2U6IDxXUzA-ICIvPiIhIDsKbmFtZTogW0EtWmEtejAtOV86XC1dKyA9PiBqb2luIDsKYXR0cmliOiA8V1MxPiA8bmFtZT4gPGF0dHZhbD4_IDsKYXR0dmFsOiAnPSchICg8dmFsPiB8IDxhbHRfdmFsPiB8IDxlbXB0eT4gfCA8YWx0X2VtcHR5PikgOwp2YWw6ICciJyEgLig_KyciJyEpID0-IGpvaW4gOwphbHRfdmFsOiAnXCcnISAuKD8rJ1wnJyEpID0-IGpvaW4gOwplbXB0eTogJyInICciJyA7CmFsdF9lbXB0eTogJ1wnJyEgJ1wnJyEgOwpib2R5OiAuKD8tJzwnISkgPT4gam9pbiA7Cgp2b2lkX2VsOiA8dm9pZF9uYW1lPiA8YXR0cmliPiogPFdTMD4gJy8nPyEgJz4nISA7CnZvaWRfbmFtZTogKCJhcmVhIiB8ICJiYXNlIiB8ICJiciIgfCAiY29sIiB8ICJlbWJlZCIgfCAiaHIiIHwgImltZyIgfCAiaW5wdXQiIHwgImxpbmsiIHwgIm1ldGEiIHwgInNvdXJjZSIgfCAidHJhY2siIHwgIndiciIpIDsKCmNkYXRhX2VsOiA8Y2RhdGFfbmFtZT4gPGF0dHJpYj4qICc-JyEgPGNkYXRhX2JvZHk-IDsKY2RhdGFfbmFtZTogKCJzY3JpcHQiIHwgInN0eWxlIikgOwpjZGF0YV9ib2R5OiAuKD8tPGNkYXRhX2Nsb3NlPiEpIDxjZGF0YV9jbG9zZT4hID0-IGpvaW4gOwpjZGF0YV9jbG9zZTogIjwvIiEgPGNkYXRhX25hbWU-ISA8V1MwPiAnPichIDsKCmRvY3R5cGU6ICI8ISIhICgiZG9jdHlwZSIgfCAiRE9DVFlQRSIpISA8V1MxPiAuKD8rJz4nISkgPFdTMD4gPT4gam9pbiA7CmNvbW1lbnQ6ICIhLS0iISAuKD8rIi0tPiIhKSA9PiBqb2luIDsKCm1haW46IDxTVEFSVD4gPGRvY3R5cGU-PyA8bm9kZT4rIDxFTkQ-IDukbWFpbtoBMjwhZG9jdHlwZSBodG1sPgo8aHRtbCBsYW5nPSJlbiI-CjxoZWFkPgogIDwhLS0gPGlnbm9yZT48L2lnbm9yZT4gLS0-CiAgPHNjcmlwdCBsYW5nPSJqYXZhc2NyaXB0Ij4KY29uc29sZS5sb2coIjwvIisic2NyaXB0PiIpOwogIDwvc2NyaXB0PgogIDxzdHlsZT4KYm9keSB7IG1hcmdpbjogMDsgfQogIDwvc3R5bGU-CjwvaGVhZD4KPGJvZHk-CiAgPGRpdiBpZD0iZm9vIiBib29sIGRhdGEteHl6PSIiIGVtcHR5PScnPgogICAgPGEgaHJlZj0iI2JhciI-YmF6IDxiPmJvbGQ8L2I-PC9hPjxici8-CiAgPC9kaXY-CjwvYm9keT4KPC9odG1sPqCgoKA
|
|
8
8
|
export const lang = defGrammar(`
|
|
9
9
|
node: '<'! (<comment> | <cdata_el> | <void_el> | <el>) ;
|
|
10
10
|
el: <name> <attrib>* (<el_body> | <el_close>! ) ;
|
|
@@ -20,14 +20,14 @@ alt_empty: '\\''! '\\''! ;
|
|
|
20
20
|
body: .(?-'<'!) => join ;
|
|
21
21
|
|
|
22
22
|
void_el: <void_name> <attrib>* <WS0> '/'?! '>'! ;
|
|
23
|
-
void_name: ("
|
|
23
|
+
void_name: ("area" | "base" | "br" | "col" | "embed" | "hr" | "img" | "input" | "link" | "meta" | "source" | "track" | "wbr") ;
|
|
24
24
|
|
|
25
25
|
cdata_el: <cdata_name> <attrib>* '>'! <cdata_body> ;
|
|
26
26
|
cdata_name: ("script" | "style") ;
|
|
27
27
|
cdata_body: .(?-<cdata_close>!) <cdata_close>! => join ;
|
|
28
28
|
cdata_close: "</"! <cdata_name>! <WS0> '>'! ;
|
|
29
29
|
|
|
30
|
-
doctype: "<!"! ("doctype" | "DOCTYPE")! <WS1>
|
|
30
|
+
doctype: "<!"! ("doctype" | "DOCTYPE")! <WS1> .(?+'>'!) <WS0> => join ;
|
|
31
31
|
comment: "!--"! .(?+"-->"!) => join ;
|
|
32
32
|
|
|
33
33
|
main: <START> <doctype>? <node>+ <END> ;
|
|
@@ -52,12 +52,19 @@ export const parseRaw = (src, opts) => {
|
|
|
52
52
|
* @param src
|
|
53
53
|
* @param opts
|
|
54
54
|
*/
|
|
55
|
-
export const parseHtml = (src, opts
|
|
55
|
+
export const parseHtml = (src, opts) => {
|
|
56
56
|
if (!src)
|
|
57
57
|
return { type: "success", result: [] };
|
|
58
|
+
opts = {
|
|
59
|
+
debug: false,
|
|
60
|
+
collapse: true,
|
|
61
|
+
unescape: true,
|
|
62
|
+
maxDepth: 128,
|
|
63
|
+
...opts,
|
|
64
|
+
};
|
|
58
65
|
try {
|
|
59
66
|
const { result, ctx } = parseRaw(src.trim(), {
|
|
60
|
-
debug: opts.debug
|
|
67
|
+
debug: opts.debug,
|
|
61
68
|
maxDepth: opts.maxDepth,
|
|
62
69
|
});
|
|
63
70
|
const loc = {
|
|
@@ -102,7 +109,7 @@ const transformScope = defmulti((x) => x.id, { cdata_el: "el", void_el: "el" },
|
|
|
102
109
|
return;
|
|
103
110
|
children = children[0].children;
|
|
104
111
|
if (opts.doctype && children?.[0]) {
|
|
105
|
-
acc.push(["!DOCTYPE", children[0].
|
|
112
|
+
acc.push(["!DOCTYPE", children[0].result]);
|
|
106
113
|
}
|
|
107
114
|
for (let x of children[1].children)
|
|
108
115
|
transformScope(x, opts, acc);
|
|
@@ -141,7 +148,7 @@ const transformScope = defmulti((x) => x.id, { cdata_el: "el", void_el: "el" },
|
|
|
141
148
|
}
|
|
142
149
|
if (body) {
|
|
143
150
|
if (body.result) {
|
|
144
|
-
el.push(
|
|
151
|
+
el.push(body.result.trim());
|
|
145
152
|
}
|
|
146
153
|
else if (body.children) {
|
|
147
154
|
for (let x of body.children)
|
|
@@ -156,7 +163,10 @@ const transformScope = defmulti((x) => x.id, { cdata_el: "el", void_el: "el" },
|
|
|
156
163
|
body: ({ result }, opts, acc) => {
|
|
157
164
|
if (!opts.whitespace && /^\s+$/.test(result))
|
|
158
165
|
return;
|
|
159
|
-
|
|
166
|
+
if (opts.collapse)
|
|
167
|
+
result = result.replace(/\s+/gm, " ");
|
|
168
|
+
if (opts.unescape)
|
|
169
|
+
result = unescapeEntities(result);
|
|
160
170
|
result = opts.txBody ? opts.txBody(result) : result;
|
|
161
171
|
if (result != null)
|
|
162
172
|
acc.push(result);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@thi.ng/hiccup-html-parse",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.3.1",
|
|
4
4
|
"description": "Well-formed HTML parsing and customizable transformation to nested JS arrays in @thi.ng/hiccup format",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"module": "./index.js",
|
|
@@ -34,17 +34,17 @@
|
|
|
34
34
|
"test": "testament test"
|
|
35
35
|
},
|
|
36
36
|
"dependencies": {
|
|
37
|
-
"@thi.ng/api": "^8.9.
|
|
38
|
-
"@thi.ng/defmulti": "^3.0.
|
|
39
|
-
"@thi.ng/parse": "^2.4.
|
|
40
|
-
"@thi.ng/strings": "^3.6.
|
|
37
|
+
"@thi.ng/api": "^8.9.6",
|
|
38
|
+
"@thi.ng/defmulti": "^3.0.1",
|
|
39
|
+
"@thi.ng/parse": "^2.4.1",
|
|
40
|
+
"@thi.ng/strings": "^3.6.1"
|
|
41
41
|
},
|
|
42
42
|
"devDependencies": {
|
|
43
|
-
"@microsoft/api-extractor": "^7.
|
|
44
|
-
"@thi.ng/testament": "^0.3.
|
|
45
|
-
"rimraf": "^5.0.
|
|
43
|
+
"@microsoft/api-extractor": "^7.38.0",
|
|
44
|
+
"@thi.ng/testament": "^0.3.24",
|
|
45
|
+
"rimraf": "^5.0.5",
|
|
46
46
|
"tools": "^0.0.1",
|
|
47
|
-
"typedoc": "^0.25.
|
|
47
|
+
"typedoc": "^0.25.2",
|
|
48
48
|
"typescript": "^5.2.2"
|
|
49
49
|
},
|
|
50
50
|
"keywords": [
|
|
@@ -84,5 +84,5 @@
|
|
|
84
84
|
"status": "alpha",
|
|
85
85
|
"year": 2023
|
|
86
86
|
},
|
|
87
|
-
"gitHead": "
|
|
87
|
+
"gitHead": "8d46d9326a9f9b81d65e7e274446f5964f9942ac\n"
|
|
88
88
|
}
|