@thi.ng/hiccup-html-parse 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Change Log
2
2
 
3
- - **Last updated**: 2023-09-19T10:42:50Z
3
+ - **Last updated**: 2023-09-25T07:43:28Z
4
4
  - **Generator**: [thi.ng/monopub](https://thi.ng/monopub)
5
5
 
6
6
  All notable changes to this project will be documented in this file.
@@ -9,6 +9,16 @@ See [Conventional Commits](https://conventionalcommits.org/) for commit guidelin
9
9
  **Note:** Unlisted _patch_ versions only involve non-code or otherwise excluded changes
10
10
  and/or version bumps of transitive dependencies.
11
11
 
12
+ ## [0.2.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/hiccup-html-parse@0.2.0) (2023-09-19)
13
+
14
+ #### 🚀 Features
15
+
16
+ - add support for comment nodes ([52390e9](https://github.com/thi-ng/umbrella/commit/52390e9))
17
+ - update grammar, add new options ([7cc3826](https://github.com/thi-ng/umbrella/commit/7cc3826))
18
+ - update whitespace inside tag handling
19
+ - add transformScope() impl for void elements
20
+ - add debug & maxDepth options
21
+
12
22
  ## [0.1.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/hiccup-html-parse@0.1.0) (2023-09-19)
13
23
 
14
24
  #### 🚀 Features
package/README.md CHANGED
@@ -19,12 +19,16 @@ This project is part of the
19
19
  - [Dependencies](#dependencies)
20
20
  - [Usage examples](#usage-examples)
21
21
  - [API](#api)
22
+ - [Benchmarks](#benchmarks)
22
23
  - [Authors](#authors)
23
24
  - [License](#license)
24
25
 
25
26
  ## About
26
27
 
27
- HTML parsing and transformation to nested JS arrays in hiccup format. This is a support package for [@thi.ng/hiccup](https://github.com/thi-ng/umbrella/tree/develop/packages/hiccup).
28
+ Well-formed HTML parsing and customizable transformation to nested JS arrays in [@thi.ng/hiccup](https://github.com/thi-ng/umbrella/tree/develop/packages/hiccup) format.
29
+
30
+ Note: This parser is intended to work with well-formed HTML and will likely fail for
31
+ any "quirky" (aka dodgy) markup...
28
32
 
29
33
  ### Basic usage
30
34
 
@@ -78,6 +82,7 @@ transformation functions:
78
82
  |------------------|--------------------------------------|---------|
79
83
  | `ignoreElements` | Array of element names to ignore | [] |
80
84
  | `ignoreAttribs` | Array of attribute names to ignore | [] |
85
+ | `comments` | Keep `<!-- ... -->` comments | false |
81
86
  | `doctype` | Keep `<!doctype ...>` element | false |
82
87
  | `whitespace` | Keep whitespace-only text bodies | false |
83
88
  | `dataAttribs` | Keep data attribs | true |
@@ -116,7 +121,7 @@ For Node.js REPL:
116
121
  const hiccupHtmlParse = await import("@thi.ng/hiccup-html-parse");
117
122
  ```
118
123
 
119
- Package sizes (brotli'd, pre-treeshake): ESM: 1.03 KB
124
+ Package sizes (brotli'd, pre-treeshake): ESM: 1.10 KB
120
125
 
121
126
  ## Dependencies
122
127
 
@@ -143,6 +148,22 @@ A selection:
143
148
 
144
149
  TODO
145
150
 
151
+ ## Benchmarks
152
+
153
+ Results from the
154
+ [benchmark](https://github.com/thi-ng/umbrella/blob/develop/packages/hiccup-html-parse/bench/index.ts)
155
+ parsing the HTML of the [thi.ng](https://thi.ng) website (MBA M1 2021, 16GB RAM,
156
+ Node.js v20.5.1):
157
+
158
+ ```text
159
+ benchmarking: thi.ng html (87.97 KB)
160
+ warmup... 1951.76ms (100 runs)
161
+ total: 19375.49ms, runs: 1000 (@ 1 calls/iter)
162
+ mean: 19.38ms, median: 19.26ms, range: [18.12..28.45]
163
+ q1: 18.75ms, q3: 19.68ms
164
+ sd: 4.66%
165
+ ```
166
+
146
167
  ## Authors
147
168
 
148
169
  - [Karsten Schmidt](https://thi.ng)
package/index.d.ts CHANGED
@@ -27,6 +27,12 @@ export interface ParseOpts {
27
27
  * @defaultValue true
28
28
  */
29
29
  dataAttribs: boolean;
30
+ /**
31
+ * Keep comments.
32
+ *
33
+ * @defaultValue false
34
+ */
35
+ comments: boolean;
30
36
  /**
31
37
  * Element transform/filter. Receives a hiccup element before it's being
32
38
  * added to its parent. The function has full freedom to replace the element
@@ -39,6 +45,19 @@ export interface ParseOpts {
39
45
  * text will be skipped/omitted entirely.
40
46
  */
41
47
  txBody: Fn<string, any>;
48
+ /**
49
+ * Parser's internal max recursion limit. Parsing will terminate once this
50
+ * limit is reached.
51
+ *
52
+ * @defaultValue 64
53
+ */
54
+ maxDepth: number;
55
+ /**
56
+ * True to enable parser debug output. Will emit details of each parse scope.
57
+ *
58
+ * @defaultValue false
59
+ */
60
+ debug: boolean;
42
61
  }
43
62
  export type Element = [string, Record<string, any>, ...ElementBody[]];
44
63
  type ElementBody = string | Element;
@@ -66,9 +85,9 @@ export declare const parseRaw: (src: string, opts?: Partial<ContextOpts>) => {
66
85
  ctx: import("@thi.ng/parse").ParseContext<string>;
67
86
  };
68
87
  /**
69
- * Parses given HTML source string into a collection of elements in
70
- * thi.ng/hiccup format, using provided options to transform, clean or filter
71
- * elements.
88
+ * Trims given HTML source string and attempts to parse it into a collection of
89
+ * elements in thi.ng/hiccup format, using provided options to transform, clean
90
+ * or filter elements.
72
91
  *
73
92
  * @param src
74
93
  * @param opts
package/index.js CHANGED
@@ -4,11 +4,11 @@ import { defGrammar } from "@thi.ng/parse/grammar";
4
4
  import { unescapeEntities } from "@thi.ng/strings/entities";
5
5
  // HTML parse grammar rules (see: thi.ng/parse readme for details)
6
6
  // playground URL:
7
- // https://demo.thi.ng/umbrella/parse-playground/#l9oDMG5vZGU6ICc8JyEgKDxjZGF0YV9lbD4gfCA8dm9pZF9lbD4gfCA8ZWw-KSA7CmVsOiA8bmFtZT4gPGF0dHJpYj4qICg8ZWxfYm9keT4gfCA8ZWxfY2xvc2U-ISApIDsKZWxfYm9keTogJz4nISAoPGJvZHk-IHwgPG5vZGU-KSogIjwvIiEgPG5hbWU-ISAnPichID0-IGhvaXN0IDsKZWxfY2xvc2U6IDxXUzA-ICIvPiIhIDsKbmFtZTogW0EtWmEtejAtOV86XC1dKyA9PiBqb2luIDsKYXR0cmliOiA8V1MxPiA8bmFtZT4gPGF0dHZhbD4_IDsKYXR0dmFsOiAnPSchICg8dmFsPiB8IDxhbHRfdmFsPiB8IDxlbXB0eT4gfCA8YWx0X2VtcHR5PikgOwp2YWw6ICciJyEgLig_KyciJyEpID0-IGpvaW4gOwphbHRfdmFsOiAnXCcnISAuKD8rJ1wnJyEpID0-IGpvaW4gOwplbXB0eTogJyInICciJyA7CmFsdF9lbXB0eTogJ1wnJyEgJ1wnJyEgOwpib2R5OiAuKD8tJzwnISkgPT4gam9pbiA7Cgp2b2lkX2VsOiA8dm9pZF9uYW1lPiA8YXR0cmliPiogPFdTMD4gJy8nPyEgJz4nISA7CnZvaWRfbmFtZTogKCJtZXRhIiB8ICJsaW5rIikgOwoKY2RhdGFfZWw6IDxjZGF0YV9uYW1lPiA8YXR0cmliPiogJz4nISA8Y2RhdGFfYm9keT4gOwpjZGF0YV9uYW1lOiAoInNjcmlwdCIgfCAic3R5bGUiKSA7CmNkYXRhX2JvZHk6IC4oPy08Y2RhdGFfY2xvc2U-ISkgPGNkYXRhX2Nsb3NlPiEgPT4gam9pbiA7CmNkYXRhX2Nsb3NlOiAiPC8iISA8Y2RhdGFfbmFtZT4hICc-JyEgOwoKZG9jdHlwZTogIjwhIiEgKCJkb2N0eXBlIiB8ICJET0NUWVBFIikhIDxXUzE-IDxuYW1lPiAnPichIDxXUzA-IDsKbWFpbjogPFNUQVJUPiA8ZG9jdHlwZT4_IDxub2RlPisgPEVORD4gO6RtYWlu2gEVPCFkb2N0eXBlIGh0bWw-CjxodG1sIGxhbmc9ImVuIj4KPGhlYWQ-CiAgPHNjcmlwdCBsYW5nPSJqYXZhc2NyaXB0Ij4KY29uc29sZS5sb2coIjwvIisic2NyaXB0PiIpOwogIDwvc2NyaXB0PgogIDxzdHlsZT4KYm9keSB7IG1hcmdpbjogMDsgfQogIDwvc3R5bGU-CjwvaGVhZD4KPGJvZHk-CiAgPGRpdiBpZD0iZm9vIiBib29sIGRhdGEteHl6PSIiIGVtcHR5PScnPgogICAgPGEgaHJlZj0iI2JhciI-YmF6IDxiPmJvbGQ8L2I-PC9hPjxici8-CiAgPC9kaXY-CjwvYm9keT4KPC9odG1sPqCgoKA
7
+ // https://demo.thi.ng/umbrella/parse-playground/#l9oDdW5vZGU6ICc8JyEgKDxjb21tZW50PiB8IDxjZGF0YV9lbD4gfCA8dm9pZF9lbD4gfCA8ZWw-KSA7CmVsOiA8bmFtZT4gPGF0dHJpYj4qICg8ZWxfYm9keT4gfCA8ZWxfY2xvc2U-ISApIDsKZWxfYm9keTogPFdTMD4gJz4nISAoPGJvZHk-IHwgPG5vZGU-KSogIjwvIiEgPG5hbWU-ISA8V1MwPiAnPichID0-IGhvaXN0IDsKZWxfY2xvc2U6IDxXUzA-ICIvPiIhIDsKbmFtZTogW0EtWmEtejAtOV86XC1dKyA9PiBqb2luIDsKYXR0cmliOiA8V1MxPiA8bmFtZT4gPGF0dHZhbD4_IDsKYXR0dmFsOiAnPSchICg8dmFsPiB8IDxhbHRfdmFsPiB8IDxlbXB0eT4gfCA8YWx0X2VtcHR5PikgOwp2YWw6ICciJyEgLig_KyciJyEpID0-IGpvaW4gOwphbHRfdmFsOiAnXCcnISAuKD8rJ1wnJyEpID0-IGpvaW4gOwplbXB0eTogJyInICciJyA7CmFsdF9lbXB0eTogJ1wnJyEgJ1wnJyEgOwpib2R5OiAuKD8tJzwnISkgPT4gam9pbiA7Cgp2b2lkX2VsOiA8dm9pZF9uYW1lPiA8YXR0cmliPiogPFdTMD4gJy8nPyEgJz4nISA7CnZvaWRfbmFtZTogKCJtZXRhIiB8ICJsaW5rIikgOwoKY2RhdGFfZWw6IDxjZGF0YV9uYW1lPiA8YXR0cmliPiogJz4nISA8Y2RhdGFfYm9keT4gOwpjZGF0YV9uYW1lOiAoInNjcmlwdCIgfCAic3R5bGUiKSA7CmNkYXRhX2JvZHk6IC4oPy08Y2RhdGFfY2xvc2U-ISkgPGNkYXRhX2Nsb3NlPiEgPT4gam9pbiA7CmNkYXRhX2Nsb3NlOiAiPC8iISA8Y2RhdGFfbmFtZT4hIDxXUzA-ICc-JyEgOwoKZG9jdHlwZTogIjwhIiEgKCJkb2N0eXBlIiB8ICJET0NUWVBFIikhIDxXUzE-IDxuYW1lPiAnPichIDxXUzA-IDsKY29tbWVudDogIiEtLSIhIC4oPysiLS0-IiEpID0-IGpvaW4gOwoKbWFpbjogPFNUQVJUPiA8ZG9jdHlwZT4_IDxub2RlPisgPEVORD4gO6RtYWlu2gEyPCFkb2N0eXBlIGh0bWw-CjxodG1sIGxhbmc9ImVuIj4KPGhlYWQ-CiAgPCEtLSA8aWdub3JlPjwvaWdub3JlPiAtLT4KICA8c2NyaXB0IGxhbmc9ImphdmFzY3JpcHQiPgpjb25zb2xlLmxvZygiPC8iKyJzY3JpcHQ-Iik7CiAgPC9zY3JpcHQ-CiAgPHN0eWxlPgpib2R5IHsgbWFyZ2luOiAwOyB9CiAgPC9zdHlsZT4KPC9oZWFkPgo8Ym9keT4KICA8ZGl2IGlkPSJmb28iIGJvb2wgZGF0YS14eXo9IiIgZW1wdHk9Jyc-CiAgICA8YSBocmVmPSIjYmFyIj5iYXogPGI-Ym9sZDwvYj48L2E-PGJyLz4KICA8L2Rpdj4KPC9ib2R5Pgo8L2h0bWw-oKCgoA
8
8
  export const lang = defGrammar(`
9
- node: '<'! (<cdata_el> | <void_el> | <el>) ;
9
+ node: '<'! (<comment> | <cdata_el> | <void_el> | <el>) ;
10
10
  el: <name> <attrib>* (<el_body> | <el_close>! ) ;
11
- el_body: '>'! (<body> | <node>)* "</"! <name>! '>'! => hoist ;
11
+ el_body: <WS0> '>'! (<body> | <node>)* "</"! <name>! <WS0> '>'! => hoist ;
12
12
  el_close: <WS0> "/>"! ;
13
13
  name: [A-Za-z0-9_:\\-]+ => join ;
14
14
  attrib: <WS1> <name> <attval>? ;
@@ -25,10 +25,12 @@ void_name: ("meta" | "link") ;
25
25
  cdata_el: <cdata_name> <attrib>* '>'! <cdata_body> ;
26
26
  cdata_name: ("script" | "style") ;
27
27
  cdata_body: .(?-<cdata_close>!) <cdata_close>! => join ;
28
- cdata_close: "</"! <cdata_name>! '>'! ;
28
+ cdata_close: "</"! <cdata_name>! <WS0> '>'! ;
29
29
 
30
30
  doctype: "<!"! ("doctype" | "DOCTYPE")! <WS1> <name> '>'! <WS0> ;
31
- main: <START> <doctype>? <node>* <END> ;
31
+ comment: "!--"! .(?+"-->"!) => join ;
32
+
33
+ main: <START> <doctype>? <node>+ <END> ;
32
34
  `);
33
35
  /**
34
36
  * Creates a parser context for given source string and calls the main parser
@@ -43,9 +45,9 @@ export const parseRaw = (src, opts) => {
43
45
  return { result: lang.rules.main(ctx), ctx };
44
46
  };
45
47
  /**
46
- * Parses given HTML source string into a collection of elements in
47
- * thi.ng/hiccup format, using provided options to transform, clean or filter
48
- * elements.
48
+ * Trims given HTML source string and attempts to parse it into a collection of
49
+ * elements in thi.ng/hiccup format, using provided options to transform, clean
50
+ * or filter elements.
49
51
  *
50
52
  * @param src
51
53
  * @param opts
@@ -54,7 +56,10 @@ export const parseHtml = (src, opts = {}) => {
54
56
  if (!src)
55
57
  return { type: "success", result: [] };
56
58
  try {
57
- const { result, ctx } = parseRaw(src);
59
+ const { result, ctx } = parseRaw(src.trim(), {
60
+ debug: opts.debug || false,
61
+ maxDepth: opts.maxDepth,
62
+ });
58
63
  const loc = {
59
64
  offset: ctx.state.p,
60
65
  line: ctx.state.l,
@@ -87,7 +92,7 @@ export const parseHtml = (src, opts = {}) => {
87
92
  *
88
93
  * @internal
89
94
  */
90
- const transformScope = defmulti((x) => x.id, { cdata_el: "el" }, {
95
+ const transformScope = defmulti((x) => x.id, { cdata_el: "el", void_el: "el" }, {
91
96
  [DEFAULT]: (scope) => {
92
97
  throw new Error(`missing impl for scope ID: ${scope.id}`);
93
98
  },
@@ -105,6 +110,10 @@ const transformScope = defmulti((x) => x.id, { cdata_el: "el" }, {
105
110
  node: ({ children }, opts, acc) => {
106
111
  transformScope(children[0], opts, acc);
107
112
  },
113
+ comment: ({ result }, opts, acc) => {
114
+ if (opts.comments)
115
+ acc.push(["__COMMENT__", result.trim()]);
116
+ },
108
117
  // element node transformer, collects & filters attributes/children
109
118
  // adds resulting hiccup element to accumulator array
110
119
  el: ({ children }, opts, acc) => {
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@thi.ng/hiccup-html-parse",
3
- "version": "0.1.0",
4
- "description": "HTML parsing and transformation to nested JS arrays in hiccup format",
3
+ "version": "0.2.1",
4
+ "description": "Well-formed HTML parsing and customizable transformation to nested JS arrays in @thi.ng/hiccup format",
5
5
  "type": "module",
6
6
  "module": "./index.js",
7
7
  "typings": "./index.d.ts",
@@ -36,7 +36,7 @@
36
36
  "dependencies": {
37
37
  "@thi.ng/api": "^8.9.5",
38
38
  "@thi.ng/defmulti": "^3.0.0",
39
- "@thi.ng/parse": "^2.3.2",
39
+ "@thi.ng/parse": "^2.4.0",
40
40
  "@thi.ng/strings": "^3.6.0"
41
41
  },
42
42
  "devDependencies": {
@@ -84,5 +84,5 @@
84
84
  "status": "alpha",
85
85
  "year": 2023
86
86
  },
87
- "gitHead": "c22e8996cee284ebe8ea88582beb1ab5fc6ee503\n"
87
+ "gitHead": "fb2697579b193b609ec52237ea0d99c7295b6d3c\n"
88
88
  }