@thi.ng/hiccup-html-parse 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Change Log
2
2
 
3
- - **Last updated**: 2023-09-25T07:43:28Z
3
+ - **Last updated**: 2023-10-23T07:37:37Z
4
4
  - **Generator**: [thi.ng/monopub](https://thi.ng/monopub)
5
5
 
6
6
  All notable changes to this project will be documented in this file.
@@ -9,6 +9,16 @@ See [Conventional Commits](https://conventionalcommits.org/) for commit guidelin
9
9
  **Note:** Unlisted _patch_ versions only involve non-code or otherwise excluded changes
10
10
  and/or version bumps of transitive dependencies.
11
11
 
12
+ ## [0.3.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/hiccup-html-parse@0.3.0) (2023-09-28)
13
+
14
+ #### 🚀 Features
15
+
16
+ - update grammar, add opts ([a43142d](https://github.com/thi-ng/umbrella/commit/a43142d))
17
+ - update grammar to improve doctype & void tag handling
18
+ - add new ParseOpts: collapse & unescape
19
+ - increase default maxDepth to 128
20
+ - add/update tests
21
+
12
22
  ## [0.2.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/hiccup-html-parse@0.2.0) (2023-09-19)
13
23
 
14
24
  #### 🚀 Features
package/README.md CHANGED
@@ -27,8 +27,8 @@ This project is part of the
27
27
 
28
28
  Well-formed HTML parsing and customizable transformation to nested JS arrays in [@thi.ng/hiccup](https://github.com/thi-ng/umbrella/tree/develop/packages/hiccup) format.
29
29
 
30
- Note: This parser is intended to work with formed HTML and will likely fail for
31
- any "quirky" (aka dodgy) markup...
30
+ Note: This parser is intended to work with well-formed HTML and will likely fail
31
+ for any "quirky" (aka malformed/dodgy) markup...
32
32
 
33
33
  ### Basic usage
34
34
 
@@ -78,16 +78,20 @@ console.log(result.result);
78
78
  Parser behavior & results can be customized via supplied options and user
79
79
  transformation functions:
80
80
 
81
- | Option | Description | Default |
82
- |------------------|--------------------------------------|---------|
83
- | `ignoreElements` | Array of element names to ignore | [] |
84
- | `ignoreAttribs` | Array of attribute names to ignore | [] |
85
- | `comments` | Keep `<!-- ... -->` comments | false |
86
- | `doctype` | Keep `<!doctype ...>` element | false |
87
- | `whitespace` | Keep whitespace-only text bodies | false |
88
- | `dataAttribs` | Keep data attribs | true |
89
- | `tx` | Element transform/filter function | |
90
- | `txBody` | Plain text transform/filter function | |
81
+ | Option | Description | Default |
82
+ |------------------|-----------------------------------------------------|---------|
83
+ | `ignoreElements` | Array of element names to ignore | [] |
84
+ | `ignoreAttribs` | Array of attribute names to ignore | [] |
85
+ | `dataAttribs` | Keep data attribs | true |
86
+ | `comments` | Keep `<!-- ... -->` comments | false |
87
+ | `doctype` | Keep `<!doctype ...>` element | false |
88
+ | `whitespace` | Keep whitespace-only text bodies | false |
89
+ | `collapse` | Collapse whitespace<sup>(1)</sup> | true |
90
+ | `unescape` | Replace named & numeric HTML entities<sup>(1)</sup> | true |
91
+ | `tx` | Element transform/filter function | |
92
+ | `txBody` | Plain text transform/filter function | |
93
+
94
+ - (1) - Not applied inside CDATA content sections, i.e. within `<script>` or `<style>` elements
91
95
 
92
96
  ## Status
93
97
 
@@ -121,7 +125,7 @@ For Node.js REPL:
121
125
  const hiccupHtmlParse = await import("@thi.ng/hiccup-html-parse");
122
126
  ```
123
127
 
124
- Package sizes (brotli'd, pre-treeshake): ESM: 1.10 KB
128
+ Package sizes (brotli'd, pre-treeshake): ESM: 1.18 KB
125
129
 
126
130
  ## Dependencies
127
131
 
package/index.d.ts CHANGED
@@ -9,6 +9,12 @@ export interface ParseOpts {
9
9
  * Array of attribute names to ignore.
10
10
  */
11
11
  ignoreAttribs: string[];
12
+ /**
13
+ * Keep data attribs.
14
+ *
15
+ * @defaultValue true
16
+ */
17
+ dataAttribs: boolean;
12
18
  /**
13
19
  * Keep `<!doctype ...>` element.
14
20
  *
@@ -22,11 +28,19 @@ export interface ParseOpts {
22
28
  */
23
29
  whitespace: boolean;
24
30
  /**
25
- * Keep data attribs.
31
+ * If enabled, collapses each run of whitespace into a single space
32
+ * character (`\u0020`).
26
33
  *
27
34
  * @defaultValue true
28
35
  */
29
- dataAttribs: boolean;
36
+ collapse: boolean;
37
+ /**
38
+ * If enabled, unescapes known named and numeric HTML entities (i.e.
39
+ * replaces them with their original characters).
40
+ *
41
+ * @defaultValue true
42
+ */
43
+ unescape: boolean;
30
44
  /**
31
45
  * Keep comments.
32
46
  *
@@ -49,7 +63,7 @@ export interface ParseOpts {
49
63
  * Parser's internal max recursion limit. Parsing will terminate once this
50
64
  * limit is reached.
51
65
  *
52
- * @defaultValue 64
66
+ * @defaultValue 128
53
67
  */
54
68
  maxDepth: number;
55
69
  /**
package/index.js CHANGED
@@ -4,7 +4,7 @@ import { defGrammar } from "@thi.ng/parse/grammar";
4
4
  import { unescapeEntities } from "@thi.ng/strings/entities";
5
5
  // HTML parse grammar rules (see: thi.ng/parse readme for details)
6
6
  // playground URL:
7
- // https://demo.thi.ng/umbrella/parse-playground/#l9oDdW5vZGU6ICc8JyEgKDxjb21tZW50PiB8IDxjZGF0YV9lbD4gfCA8dm9pZF9lbD4gfCA8ZWw-KSA7CmVsOiA8bmFtZT4gPGF0dHJpYj4qICg8ZWxfYm9keT4gfCA8ZWxfY2xvc2U-ISApIDsKZWxfYm9keTogPFdTMD4gJz4nISAoPGJvZHk-IHwgPG5vZGU-KSogIjwvIiEgPG5hbWU-ISA8V1MwPiAnPichID0-IGhvaXN0IDsKZWxfY2xvc2U6IDxXUzA-ICIvPiIhIDsKbmFtZTogW0EtWmEtejAtOV86XC1dKyA9PiBqb2luIDsKYXR0cmliOiA8V1MxPiA8bmFtZT4gPGF0dHZhbD4_IDsKYXR0dmFsOiAnPSchICg8dmFsPiB8IDxhbHRfdmFsPiB8IDxlbXB0eT4gfCA8YWx0X2VtcHR5PikgOwp2YWw6ICciJyEgLig_KyciJyEpID0-IGpvaW4gOwphbHRfdmFsOiAnXCcnISAuKD8rJ1wnJyEpID0-IGpvaW4gOwplbXB0eTogJyInICciJyA7CmFsdF9lbXB0eTogJ1wnJyEgJ1wnJyEgOwpib2R5OiAuKD8tJzwnISkgPT4gam9pbiA7Cgp2b2lkX2VsOiA8dm9pZF9uYW1lPiA8YXR0cmliPiogPFdTMD4gJy8nPyEgJz4nISA7CnZvaWRfbmFtZTogKCJtZXRhIiB8ICJsaW5rIikgOwoKY2RhdGFfZWw6IDxjZGF0YV9uYW1lPiA8YXR0cmliPiogJz4nISA8Y2RhdGFfYm9keT4gOwpjZGF0YV9uYW1lOiAoInNjcmlwdCIgfCAic3R5bGUiKSA7CmNkYXRhX2JvZHk6IC4oPy08Y2RhdGFfY2xvc2U-ISkgPGNkYXRhX2Nsb3NlPiEgPT4gam9pbiA7CmNkYXRhX2Nsb3NlOiAiPC8iISA8Y2RhdGFfbmFtZT4hIDxXUzA-ICc-JyEgOwoKZG9jdHlwZTogIjwhIiEgKCJkb2N0eXBlIiB8ICJET0NUWVBFIikhIDxXUzE-IDxuYW1lPiAnPichIDxXUzA-IDsKY29tbWVudDogIiEtLSIhIC4oPysiLS0-IiEpID0-IGpvaW4gOwoKbWFpbjogPFNUQVJUPiA8ZG9jdHlwZT4_IDxub2RlPisgPEVORD4gO6RtYWlu2gEyPCFkb2N0eXBlIGh0bWw-CjxodG1sIGxhbmc9ImVuIj4KPGhlYWQ-CiAgPCEtLSA8aWdub3JlPjwvaWdub3JlPiAtLT4KICA8c2NyaXB0IGxhbmc9ImphdmFzY3JpcHQiPgpjb25zb2xlLmxvZygiPC8iKyJzY3JpcHQ-Iik7CiAgPC9zY3JpcHQ-CiAgPHN0eWxlPgpib2R5IHsgbWFyZ2luOiAwOyB9CiAgPC9zdHlsZT4KPC9oZWFkPgo8Ym9keT4KICA8ZGl2IGlkPSJmb28iIGJvb2wgZGF0YS14eXo9IiIgZW1wdHk9Jyc-CiAgICA8YSBocmVmPSIjYmFyIj5iYXogPGI-Ym9sZDwvYj48L2E-PGJyLz4KICA8L2Rpdj4KPC9ib2R5Pgo8L2h0bWw-oKCgoA
7
+ // https://demo.thi.ng/umbrella/parse-playground/#l9oD3G5vZGU6ICc8JyEgKDxjb21tZW50PiB8IDxjZGF0YV9lbD4gfCA8dm9pZF9lbD4gfCA8ZWw-KSA7CmVsOiA8bmFtZT4gPGF0dHJpYj4qICg8ZWxfYm9keT4gfCA8ZWxfY2xvc2U-ISApIDsKZWxfYm9keTogPFdTMD4gJz4nISAoPGJvZHk-IHwgPG5vZGU-KSogIjwvIiEgPG5hbWU-ISA8V1MwPiAnPichID0-IGhvaXN0IDsKZWxfY2xvc2U6IDxXUzA-ICIvPiIhIDsKbmFtZTogW0EtWmEtejAtOV86XC1dKyA9PiBqb2luIDsKYXR0cmliOiA8V1MxPiA8bmFtZT4gPGF0dHZhbD4_IDsKYXR0dmFsOiAnPSchICg8dmFsPiB8IDxhbHRfdmFsPiB8IDxlbXB0eT4gfCA8YWx0X2VtcHR5PikgOwp2YWw6ICciJyEgLig_KyciJyEpID0-IGpvaW4gOwphbHRfdmFsOiAnXCcnISAuKD8rJ1wnJyEpID0-IGpvaW4gOwplbXB0eTogJyInICciJyA7CmFsdF9lbXB0eTogJ1wnJyEgJ1wnJyEgOwpib2R5OiAuKD8tJzwnISkgPT4gam9pbiA7Cgp2b2lkX2VsOiA8dm9pZF9uYW1lPiA8YXR0cmliPiogPFdTMD4gJy8nPyEgJz4nISA7CnZvaWRfbmFtZTogKCJhcmVhIiB8ICJiYXNlIiB8ICJiciIgfCAiY29sIiB8ICJlbWJlZCIgfCAiaHIiIHwgImltZyIgfCAiaW5wdXQiIHwgImxpbmsiIHwgIm1ldGEiIHwgInNvdXJjZSIgfCAidHJhY2siIHwgIndiciIpIDsKCmNkYXRhX2VsOiA8Y2RhdGFfbmFtZT4gPGF0dHJpYj4qICc-JyEgPGNkYXRhX2JvZHk-IDsKY2RhdGFfbmFtZTogKCJzY3JpcHQiIHwgInN0eWxlIikgOwpjZGF0YV9ib2R5OiAuKD8tPGNkYXRhX2Nsb3NlPiEpIDxjZGF0YV9jbG9zZT4hID0-IGpvaW4gOwpjZGF0YV9jbG9zZTogIjwvIiEgPGNkYXRhX25hbWU-ISA8V1MwPiAnPichIDsKCmRvY3R5cGU6ICI8ISIhICgiZG9jdHlwZSIgfCAiRE9DVFlQRSIpISA8V1MxPiAuKD8rJz4nISkgPFdTMD4gPT4gam9pbiA7CmNvbW1lbnQ6ICIhLS0iISAuKD8rIi0tPiIhKSA9PiBqb2luIDsKCm1haW46IDxTVEFSVD4gPGRvY3R5cGU-PyA8bm9kZT4rIDxFTkQ-IDukbWFpbtoBMjwhZG9jdHlwZSBodG1sPgo8aHRtbCBsYW5nPSJlbiI-CjxoZWFkPgogIDwhLS0gPGlnbm9yZT48L2lnbm9yZT4gLS0-CiAgPHNjcmlwdCBsYW5nPSJqYXZhc2NyaXB0Ij4KY29uc29sZS5sb2coIjwvIisic2NyaXB0PiIpOwogIDwvc2NyaXB0PgogIDxzdHlsZT4KYm9keSB7IG1hcmdpbjogMDsgfQogIDwvc3R5bGU-CjwvaGVhZD4KPGJvZHk-CiAgPGRpdiBpZD0iZm9vIiBib29sIGRhdGEteHl6PSIiIGVtcHR5PScnPgogICAgPGEgaHJlZj0iI2JhciI-YmF6IDxiPmJvbGQ8L2I-PC9hPjxici8-CiAgPC9kaXY-CjwvYm9keT4KPC9odG1sPqCgoKA
8
8
  export const lang = defGrammar(`
9
9
  node: '<'! (<comment> | <cdata_el> | <void_el> | <el>) ;
10
10
  el: <name> <attrib>* (<el_body> | <el_close>! ) ;
@@ -20,14 +20,14 @@ alt_empty: '\\''! '\\''! ;
20
20
  body: .(?-'<'!) => join ;
21
21
 
22
22
  void_el: <void_name> <attrib>* <WS0> '/'?! '>'! ;
23
- void_name: ("meta" | "link") ;
23
+ void_name: ("area" | "base" | "br" | "col" | "embed" | "hr" | "img" | "input" | "link" | "meta" | "source" | "track" | "wbr") ;
24
24
 
25
25
  cdata_el: <cdata_name> <attrib>* '>'! <cdata_body> ;
26
26
  cdata_name: ("script" | "style") ;
27
27
  cdata_body: .(?-<cdata_close>!) <cdata_close>! => join ;
28
28
  cdata_close: "</"! <cdata_name>! <WS0> '>'! ;
29
29
 
30
- doctype: "<!"! ("doctype" | "DOCTYPE")! <WS1> <name> '>'! <WS0> ;
30
+ doctype: "<!"! ("doctype" | "DOCTYPE")! <WS1> .(?+'>'!) <WS0> => join ;
31
31
  comment: "!--"! .(?+"-->"!) => join ;
32
32
 
33
33
  main: <START> <doctype>? <node>+ <END> ;
@@ -52,12 +52,19 @@ export const parseRaw = (src, opts) => {
52
52
  * @param src
53
53
  * @param opts
54
54
  */
55
- export const parseHtml = (src, opts = {}) => {
55
+ export const parseHtml = (src, opts) => {
56
56
  if (!src)
57
57
  return { type: "success", result: [] };
58
+ opts = {
59
+ debug: false,
60
+ collapse: true,
61
+ unescape: true,
62
+ maxDepth: 128,
63
+ ...opts,
64
+ };
58
65
  try {
59
66
  const { result, ctx } = parseRaw(src.trim(), {
60
- debug: opts.debug || false,
67
+ debug: opts.debug,
61
68
  maxDepth: opts.maxDepth,
62
69
  });
63
70
  const loc = {
@@ -102,7 +109,7 @@ const transformScope = defmulti((x) => x.id, { cdata_el: "el", void_el: "el" },
102
109
  return;
103
110
  children = children[0].children;
104
111
  if (opts.doctype && children?.[0]) {
105
- acc.push(["!DOCTYPE", children[0].children?.[0].result]);
112
+ acc.push(["!DOCTYPE", children[0].result]);
106
113
  }
107
114
  for (let x of children[1].children)
108
115
  transformScope(x, opts, acc);
@@ -141,7 +148,7 @@ const transformScope = defmulti((x) => x.id, { cdata_el: "el", void_el: "el" },
141
148
  }
142
149
  if (body) {
143
150
  if (body.result) {
144
- el.push(unescapeEntities(body.result.trim()));
151
+ el.push(body.result.trim());
145
152
  }
146
153
  else if (body.children) {
147
154
  for (let x of body.children)
@@ -156,7 +163,10 @@ const transformScope = defmulti((x) => x.id, { cdata_el: "el", void_el: "el" },
156
163
  body: ({ result }, opts, acc) => {
157
164
  if (!opts.whitespace && /^\s+$/.test(result))
158
165
  return;
159
- result = unescapeEntities(result);
166
+ if (opts.collapse)
167
+ result = result.replace(/\s+/gm, " ");
168
+ if (opts.unescape)
169
+ result = unescapeEntities(result);
160
170
  result = opts.txBody ? opts.txBody(result) : result;
161
171
  if (result != null)
162
172
  acc.push(result);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@thi.ng/hiccup-html-parse",
3
- "version": "0.2.1",
3
+ "version": "0.3.1",
4
4
  "description": "Well-formed HTML parsing and customizable transformation to nested JS arrays in @thi.ng/hiccup format",
5
5
  "type": "module",
6
6
  "module": "./index.js",
@@ -34,17 +34,17 @@
34
34
  "test": "testament test"
35
35
  },
36
36
  "dependencies": {
37
- "@thi.ng/api": "^8.9.5",
38
- "@thi.ng/defmulti": "^3.0.0",
39
- "@thi.ng/parse": "^2.4.0",
40
- "@thi.ng/strings": "^3.6.0"
37
+ "@thi.ng/api": "^8.9.6",
38
+ "@thi.ng/defmulti": "^3.0.1",
39
+ "@thi.ng/parse": "^2.4.1",
40
+ "@thi.ng/strings": "^3.6.1"
41
41
  },
42
42
  "devDependencies": {
43
- "@microsoft/api-extractor": "^7.36.4",
44
- "@thi.ng/testament": "^0.3.23",
45
- "rimraf": "^5.0.1",
43
+ "@microsoft/api-extractor": "^7.38.0",
44
+ "@thi.ng/testament": "^0.3.24",
45
+ "rimraf": "^5.0.5",
46
46
  "tools": "^0.0.1",
47
- "typedoc": "^0.25.0",
47
+ "typedoc": "^0.25.2",
48
48
  "typescript": "^5.2.2"
49
49
  },
50
50
  "keywords": [
@@ -84,5 +84,5 @@
84
84
  "status": "alpha",
85
85
  "year": 2023
86
86
  },
87
- "gitHead": "fb2697579b193b609ec52237ea0d99c7295b6d3c\n"
87
+ "gitHead": "8d46d9326a9f9b81d65e7e274446f5964f9942ac\n"
88
88
  }