@thi.ng/hiccup-html-parse 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -1
- package/README.md +23 -2
- package/index.d.ts +22 -3
- package/index.js +19 -10
- package/package.json +4 -4
package/CHANGELOG.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Change Log
|
|
2
2
|
|
|
3
|
-
- **Last updated**: 2023-09-
|
|
3
|
+
- **Last updated**: 2023-09-19T19:33:16Z
|
|
4
4
|
- **Generator**: [thi.ng/monopub](https://thi.ng/monopub)
|
|
5
5
|
|
|
6
6
|
All notable changes to this project will be documented in this file.
|
|
@@ -9,6 +9,16 @@ See [Conventional Commits](https://conventionalcommits.org/) for commit guidelin
|
|
|
9
9
|
**Note:** Unlisted _patch_ versions only involve non-code or otherwise excluded changes
|
|
10
10
|
and/or version bumps of transitive dependencies.
|
|
11
11
|
|
|
12
|
+
## [0.2.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/hiccup-html-parse@0.2.0) (2023-09-19)
|
|
13
|
+
|
|
14
|
+
#### 🚀 Features
|
|
15
|
+
|
|
16
|
+
- add support for comment nodes ([52390e9](https://github.com/thi-ng/umbrella/commit/52390e9))
|
|
17
|
+
- update grammar, add new options ([7cc3826](https://github.com/thi-ng/umbrella/commit/7cc3826))
|
|
18
|
+
- update whitespace inside tag handling
|
|
19
|
+
- add transformScope() impl for void elements
|
|
20
|
+
- add debug & maxDepth options
|
|
21
|
+
|
|
12
22
|
## [0.1.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/hiccup-html-parse@0.1.0) (2023-09-19)
|
|
13
23
|
|
|
14
24
|
#### 🚀 Features
|
package/README.md
CHANGED
|
@@ -19,12 +19,16 @@ This project is part of the
|
|
|
19
19
|
- [Dependencies](#dependencies)
|
|
20
20
|
- [Usage examples](#usage-examples)
|
|
21
21
|
- [API](#api)
|
|
22
|
+
- [Benchmarks](#benchmarks)
|
|
22
23
|
- [Authors](#authors)
|
|
23
24
|
- [License](#license)
|
|
24
25
|
|
|
25
26
|
## About
|
|
26
27
|
|
|
27
|
-
HTML parsing and transformation to nested JS arrays in
|
|
28
|
+
Well-formed HTML parsing and customizable transformation to nested JS arrays in [@thi.ng/hiccup](https://github.com/thi-ng/umbrella/tree/develop/packages/hiccup) format.
|
|
29
|
+
|
|
30
|
+
Note: This parser is intended to work with well-formed HTML and will likely fail for
|
|
31
|
+
any "quirky" (aka dodgy) markup...
|
|
28
32
|
|
|
29
33
|
### Basic usage
|
|
30
34
|
|
|
@@ -78,6 +82,7 @@ transformation functions:
|
|
|
78
82
|
|------------------|--------------------------------------|---------|
|
|
79
83
|
| `ignoreElements` | Array of element names to ignore | [] |
|
|
80
84
|
| `ignoreAttribs` | Array of attribute names to ignore | [] |
|
|
85
|
+
| `comments` | Keep `<!-- ... -->` comments | false |
|
|
81
86
|
| `doctype` | Keep `<!doctype ...>` element | false |
|
|
82
87
|
| `whitespace` | Keep whitespace-only text bodies | false |
|
|
83
88
|
| `dataAttribs` | Keep data attribs | true |
|
|
@@ -116,7 +121,7 @@ For Node.js REPL:
|
|
|
116
121
|
const hiccupHtmlParse = await import("@thi.ng/hiccup-html-parse");
|
|
117
122
|
```
|
|
118
123
|
|
|
119
|
-
Package sizes (brotli'd, pre-treeshake): ESM: 1.
|
|
124
|
+
Package sizes (brotli'd, pre-treeshake): ESM: 1.10 KB
|
|
120
125
|
|
|
121
126
|
## Dependencies
|
|
122
127
|
|
|
@@ -143,6 +148,22 @@ A selection:
|
|
|
143
148
|
|
|
144
149
|
TODO
|
|
145
150
|
|
|
151
|
+
## Benchmarks
|
|
152
|
+
|
|
153
|
+
Results from the
|
|
154
|
+
[benchmark](https://github.com/thi-ng/umbrella/blob/develop/packages/hiccup-html-parse/bench/index.ts)
|
|
155
|
+
parsing the HTML of the [thi.ng](https://thi.ng) website (MBA M1 2021, 16GB RAM,
|
|
156
|
+
Node.js v20.5.1):
|
|
157
|
+
|
|
158
|
+
```text
|
|
159
|
+
benchmarking: thi.ng html (87.97 KB)
|
|
160
|
+
warmup... 1951.76ms (100 runs)
|
|
161
|
+
total: 19375.49ms, runs: 1000 (@ 1 calls/iter)
|
|
162
|
+
mean: 19.38ms, median: 19.26ms, range: [18.12..28.45]
|
|
163
|
+
q1: 18.75ms, q3: 19.68ms
|
|
164
|
+
sd: 4.66%
|
|
165
|
+
```
|
|
166
|
+
|
|
146
167
|
## Authors
|
|
147
168
|
|
|
148
169
|
- [Karsten Schmidt](https://thi.ng)
|
package/index.d.ts
CHANGED
|
@@ -27,6 +27,12 @@ export interface ParseOpts {
|
|
|
27
27
|
* @defaultValue true
|
|
28
28
|
*/
|
|
29
29
|
dataAttribs: boolean;
|
|
30
|
+
/**
|
|
31
|
+
* Keep comments.
|
|
32
|
+
*
|
|
33
|
+
* @defaultValue false
|
|
34
|
+
*/
|
|
35
|
+
comments: boolean;
|
|
30
36
|
/**
|
|
31
37
|
* Element transform/filter. Receives a hiccup element before it is being
|
|
32
38
|
* added to its parent. The function has full freedom to replace the element
|
|
@@ -39,6 +45,19 @@ export interface ParseOpts {
|
|
|
39
45
|
* text will be skipped/omitted entirely.
|
|
40
46
|
*/
|
|
41
47
|
txBody: Fn<string, any>;
|
|
48
|
+
/**
|
|
49
|
+
* Parser's internal max recursion limit. Parsing will terminate once this
|
|
50
|
+
* limit is reached.
|
|
51
|
+
*
|
|
52
|
+
* @defaultValue 64
|
|
53
|
+
*/
|
|
54
|
+
maxDepth: number;
|
|
55
|
+
/**
|
|
56
|
+
* True to enable parser debug output. Will emit details of each parse scope.
|
|
57
|
+
*
|
|
58
|
+
* @defaultValue false
|
|
59
|
+
*/
|
|
60
|
+
debug: boolean;
|
|
42
61
|
}
|
|
43
62
|
export type Element = [string, Record<string, any>, ...ElementBody[]];
|
|
44
63
|
type ElementBody = string | Element;
|
|
@@ -66,9 +85,9 @@ export declare const parseRaw: (src: string, opts?: Partial<ContextOpts>) => {
|
|
|
66
85
|
ctx: import("@thi.ng/parse").ParseContext<string>;
|
|
67
86
|
};
|
|
68
87
|
/**
|
|
69
|
-
*
|
|
70
|
-
* thi.ng/hiccup format, using provided options to transform, clean
|
|
71
|
-
* elements.
|
|
88
|
+
* Trims given HTML source string and attempts to parse it into a collection of
|
|
89
|
+
* elements in thi.ng/hiccup format, using provided options to transform, clean
|
|
90
|
+
* or filter elements.
|
|
72
91
|
*
|
|
73
92
|
* @param src
|
|
74
93
|
* @param opts
|
package/index.js
CHANGED
|
@@ -4,11 +4,11 @@ import { defGrammar } from "@thi.ng/parse/grammar";
|
|
|
4
4
|
import { unescapeEntities } from "@thi.ng/strings/entities";
|
|
5
5
|
// HTML parse grammar rules (see: thi.ng/parse readme for details)
|
|
6
6
|
// playground URL:
|
|
7
|
-
// https://demo.thi.ng/umbrella/parse-playground/#
|
|
7
|
+
// https://demo.thi.ng/umbrella/parse-playground/#l9oDdW5vZGU6ICc8JyEgKDxjb21tZW50PiB8IDxjZGF0YV9lbD4gfCA8dm9pZF9lbD4gfCA8ZWw-KSA7CmVsOiA8bmFtZT4gPGF0dHJpYj4qICg8ZWxfYm9keT4gfCA8ZWxfY2xvc2U-ISApIDsKZWxfYm9keTogPFdTMD4gJz4nISAoPGJvZHk-IHwgPG5vZGU-KSogIjwvIiEgPG5hbWU-ISA8V1MwPiAnPichID0-IGhvaXN0IDsKZWxfY2xvc2U6IDxXUzA-ICIvPiIhIDsKbmFtZTogW0EtWmEtejAtOV86XC1dKyA9PiBqb2luIDsKYXR0cmliOiA8V1MxPiA8bmFtZT4gPGF0dHZhbD4_IDsKYXR0dmFsOiAnPSchICg8dmFsPiB8IDxhbHRfdmFsPiB8IDxlbXB0eT4gfCA8YWx0X2VtcHR5PikgOwp2YWw6ICciJyEgLig_KyciJyEpID0-IGpvaW4gOwphbHRfdmFsOiAnXCcnISAuKD8rJ1wnJyEpID0-IGpvaW4gOwplbXB0eTogJyInICciJyA7CmFsdF9lbXB0eTogJ1wnJyEgJ1wnJyEgOwpib2R5OiAuKD8tJzwnISkgPT4gam9pbiA7Cgp2b2lkX2VsOiA8dm9pZF9uYW1lPiA8YXR0cmliPiogPFdTMD4gJy8nPyEgJz4nISA7CnZvaWRfbmFtZTogKCJtZXRhIiB8ICJsaW5rIikgOwoKY2RhdGFfZWw6IDxjZGF0YV9uYW1lPiA8YXR0cmliPiogJz4nISA8Y2RhdGFfYm9keT4gOwpjZGF0YV9uYW1lOiAoInNjcmlwdCIgfCAic3R5bGUiKSA7CmNkYXRhX2JvZHk6IC4oPy08Y2RhdGFfY2xvc2U-ISkgPGNkYXRhX2Nsb3NlPiEgPT4gam9pbiA7CmNkYXRhX2Nsb3NlOiAiPC8iISA8Y2RhdGFfbmFtZT4hIDxXUzA-ICc-JyEgOwoKZG9jdHlwZTogIjwhIiEgKCJkb2N0eXBlIiB8ICJET0NUWVBFIikhIDxXUzE-IDxuYW1lPiAnPichIDxXUzA-IDsKY29tbWVudDogIiEtLSIhIC4oPysiLS0-IiEpID0-IGpvaW4gOwoKbWFpbjogPFNUQVJUPiA8ZG9jdHlwZT4_IDxub2RlPisgPEVORD4gO6RtYWlu2gEyPCFkb2N0eXBlIGh0bWw-CjxodG1sIGxhbmc9ImVuIj4KPGhlYWQ-CiAgPCEtLSA8aWdub3JlPjwvaWdub3JlPiAtLT4KICA8c2NyaXB0IGxhbmc9ImphdmFzY3JpcHQiPgpjb25zb2xlLmxvZygiPC8iKyJzY3JpcHQ-Iik7CiAgPC9zY3JpcHQ-CiAgPHN0eWxlPgpib2R5IHsgbWFyZ2luOiAwOyB9CiAgPC9zdHlsZT4KPC9oZWFkPgo8Ym9keT4KICA8ZGl2IGlkPSJmb28iIGJvb2wgZGF0YS14eXo9IiIgZW1wdHk9Jyc-CiAgICA8YSBocmVmPSIjYmFyIj5iYXogPGI-Ym9sZDwvYj48L2E-PGJyLz4KICA8L2Rpdj4KPC9ib2R5Pgo8L2h0bWw-oKCgoA
|
|
8
8
|
export const lang = defGrammar(`
|
|
9
|
-
node: '<'! (<cdata_el> | <void_el> | <el>) ;
|
|
9
|
+
node: '<'! (<comment> | <cdata_el> | <void_el> | <el>) ;
|
|
10
10
|
el: <name> <attrib>* (<el_body> | <el_close>! ) ;
|
|
11
|
-
el_body: '>'! (<body> | <node>)* "</"! <name>! '>'! => hoist ;
|
|
11
|
+
el_body: <WS0> '>'! (<body> | <node>)* "</"! <name>! <WS0> '>'! => hoist ;
|
|
12
12
|
el_close: <WS0> "/>"! ;
|
|
13
13
|
name: [A-Za-z0-9_:\\-]+ => join ;
|
|
14
14
|
attrib: <WS1> <name> <attval>? ;
|
|
@@ -25,10 +25,12 @@ void_name: ("meta" | "link") ;
|
|
|
25
25
|
cdata_el: <cdata_name> <attrib>* '>'! <cdata_body> ;
|
|
26
26
|
cdata_name: ("script" | "style") ;
|
|
27
27
|
cdata_body: .(?-<cdata_close>!) <cdata_close>! => join ;
|
|
28
|
-
cdata_close: "</"! <cdata_name>! '>'! ;
|
|
28
|
+
cdata_close: "</"! <cdata_name>! <WS0> '>'! ;
|
|
29
29
|
|
|
30
30
|
doctype: "<!"! ("doctype" | "DOCTYPE")! <WS1> <name> '>'! <WS0> ;
|
|
31
|
-
|
|
31
|
+
comment: "!--"! .(?+"-->"!) => join ;
|
|
32
|
+
|
|
33
|
+
main: <START> <doctype>? <node>+ <END> ;
|
|
32
34
|
`);
|
|
33
35
|
/**
|
|
34
36
|
* Creates a parser context for given source string and calls the main parser
|
|
@@ -43,9 +45,9 @@ export const parseRaw = (src, opts) => {
|
|
|
43
45
|
return { result: lang.rules.main(ctx), ctx };
|
|
44
46
|
};
|
|
45
47
|
/**
|
|
46
|
-
*
|
|
47
|
-
* thi.ng/hiccup format, using provided options to transform, clean
|
|
48
|
-
* elements.
|
|
48
|
+
* Trims given HTML source string and attempts to parse it into a collection of
|
|
49
|
+
* elements in thi.ng/hiccup format, using provided options to transform, clean
|
|
50
|
+
* or filter elements.
|
|
49
51
|
*
|
|
50
52
|
* @param src
|
|
51
53
|
* @param opts
|
|
@@ -54,7 +56,10 @@ export const parseHtml = (src, opts = {}) => {
|
|
|
54
56
|
if (!src)
|
|
55
57
|
return { type: "success", result: [] };
|
|
56
58
|
try {
|
|
57
|
-
const { result, ctx } = parseRaw(src)
|
|
59
|
+
const { result, ctx } = parseRaw(src.trim(), {
|
|
60
|
+
debug: opts.debug || false,
|
|
61
|
+
maxDepth: opts.maxDepth,
|
|
62
|
+
});
|
|
58
63
|
const loc = {
|
|
59
64
|
offset: ctx.state.p,
|
|
60
65
|
line: ctx.state.l,
|
|
@@ -87,7 +92,7 @@ export const parseHtml = (src, opts = {}) => {
|
|
|
87
92
|
*
|
|
88
93
|
* @internal
|
|
89
94
|
*/
|
|
90
|
-
const transformScope = defmulti((x) => x.id, { cdata_el: "el" }, {
|
|
95
|
+
const transformScope = defmulti((x) => x.id, { cdata_el: "el", void_el: "el" }, {
|
|
91
96
|
[DEFAULT]: (scope) => {
|
|
92
97
|
throw new Error(`missing impl for scope ID: ${scope.id}`);
|
|
93
98
|
},
|
|
@@ -105,6 +110,10 @@ const transformScope = defmulti((x) => x.id, { cdata_el: "el" }, {
|
|
|
105
110
|
node: ({ children }, opts, acc) => {
|
|
106
111
|
transformScope(children[0], opts, acc);
|
|
107
112
|
},
|
|
113
|
+
comment: ({ result }, opts, acc) => {
|
|
114
|
+
if (opts.comments)
|
|
115
|
+
acc.push(["__COMMENT__", result.trim()]);
|
|
116
|
+
},
|
|
108
117
|
// element node transformer, collects & filters attributes/children
|
|
109
118
|
// adds resulting hiccup element to accumulator array
|
|
110
119
|
el: ({ children }, opts, acc) => {
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@thi.ng/hiccup-html-parse",
|
|
3
|
-
"version": "0.
|
|
4
|
-
"description": "HTML parsing and transformation to nested JS arrays in hiccup format",
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"description": "Well-formed HTML parsing and customizable transformation to nested JS arrays in @thi.ng/hiccup format",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"module": "./index.js",
|
|
7
7
|
"typings": "./index.d.ts",
|
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
"dependencies": {
|
|
37
37
|
"@thi.ng/api": "^8.9.5",
|
|
38
38
|
"@thi.ng/defmulti": "^3.0.0",
|
|
39
|
-
"@thi.ng/parse": "^2.
|
|
39
|
+
"@thi.ng/parse": "^2.4.0",
|
|
40
40
|
"@thi.ng/strings": "^3.6.0"
|
|
41
41
|
},
|
|
42
42
|
"devDependencies": {
|
|
@@ -84,5 +84,5 @@
|
|
|
84
84
|
"status": "alpha",
|
|
85
85
|
"year": 2023
|
|
86
86
|
},
|
|
87
|
-
"gitHead": "
|
|
87
|
+
"gitHead": "acac49146d9720010a2cc6cc53d3ef8ef035f409\n"
|
|
88
88
|
}
|