tag-soup-ng 0.0.1-security → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tag-soup-ng might be problematic. Click here for more details.
- package/LICENSE.txt +21 -0
- package/README.md +283 -3
- package/package.json +77 -3
package/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2021 Savva Mikhalevski
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
@@ -1,5 +1,285 @@
|
|
1
|
-
#
|
1
|
+
# TagSoup 🍜 [](https://github.com/smikhalevski/tag-soup/actions/workflows/master.yml)
|
2
2
|
|
3
|
-
|
3
|
+
TagSoup is [the fastest](#performance) pure JS SAX/DOM XML/HTML parser.
|
4
4
|
|
5
|
-
|
5
|
+
- [It is the fastest](#performance);
|
6
|
+
- Tiny and tree-shakable, [just 7 kB gzipped](https://bundlephobia.com/result?p=tag-soup), including dependencies;
|
7
|
+
- Streaming support with SAX and DOM parsers for XML and HTML;
|
8
|
+
- Extremely low memory consumption;
|
9
|
+
- Forgives malformed tag nesting and missing end tags;
|
10
|
+
- Parses HTML attributes in the same way your browser does,
|
11
|
+
[see tests for more details](https://github.com/smikhalevski/tag-soup/blob/master/src/test/tokenize.test.ts);
|
12
|
+
- Recognizes CDATA, processing instructions, and DOCTYPE;
|
13
|
+
|
14
|
+
```sh
|
15
|
+
npm install --save-prod tag-soup
|
16
|
+
```
|
17
|
+
|
18
|
+
# Usage
|
19
|
+
|
20
|
+
⚠️ [API documentation is available here.](https://smikhalevski.github.io/tag-soup/)
|
21
|
+
|
22
|
+
## SAX
|
23
|
+
|
24
|
+
```ts
|
25
|
+
import {createSaxParser} from 'tag-soup';
|
26
|
+
|
27
|
+
// Or use
|
28
|
+
// import {createXmlSaxParser, createHtmlSaxParser} from 'tag-soup';
|
29
|
+
|
30
|
+
const saxParser = createSaxParser({
|
31
|
+
|
32
|
+
startTag(token) {
|
33
|
+
console.log(token); // → {tokenType: 1, name: 'foo', …}
|
34
|
+
},
|
35
|
+
|
36
|
+
endTag(token) {
|
37
|
+
console.log(token); // → {tokenType: 101, data: 'okay', …}
|
38
|
+
},
|
39
|
+
});
|
40
|
+
|
41
|
+
saxParser.parse('<foo>okay');
|
42
|
+
```
|
43
|
+
|
44
|
+
SAX parser invokes [callbacks during parsing](https://smikhalevski.github.io/tag-soup/interfaces/isaxhandler.html).
|
45
|
+
|
46
|
+
Callbacks receive [tokens](https://smikhalevski.github.io/tag-soup/modules.html#token) which represent structures read
|
47
|
+
from the input. Tokens are pooled objects so when handler callback finishes they are returned to the pool and reused.
|
48
|
+
Object pooling drastically reduces memory consumption and allows passing a lot of data to the callback.
|
49
|
+
|
50
|
+
If you need to retain token after callback finishes use
|
51
|
+
[`token.clone()`](https://smikhalevski.github.io/tag-soup/interfaces/itoken.html#clone) which returns the deep copy of
|
52
|
+
the token.
|
53
|
+
|
54
|
+
`startTag` and `endTag` callbacks are always invoked in the correct order even if tags in the input were incorrectly
|
55
|
+
nested or missed.
|
56
|
+
For [self-closing tags](https://smikhalevski.github.io/tag-soup/interfaces/istarttagtoken.html#selfclosing) only
|
57
|
+
`startTag` callback is invoked.
|
58
|
+
|
59
|
+
### Defaults
|
60
|
+
|
61
|
+
All SAX parser factories accept two arguments:
|
62
|
+
[the handler with callbacks](https://smikhalevski.github.io/tag-soup/interfaces/isaxhandler.html) and
|
63
|
+
[options](https://smikhalevski.github.io/tag-soup/interfaces/iparseroptions.html). The most generic parser factory
|
64
|
+
[`createSaxParser`](https://smikhalevski.github.io/tag-soup/modules.html#createsaxparser) doesn't have any defaults.
|
65
|
+
|
66
|
+
For [`createXmlSaxParser`](https://smikhalevski.github.io/tag-soup/modules.html#createxmlsaxparser) defaults are
|
67
|
+
[`xmlParserOptions`](https://smikhalevski.github.io/tag-soup/modules.html#xmlparseroptions):
|
68
|
+
|
69
|
+
- CDATA sections, processing instructions and self-closing tags are recognized;
|
70
|
+
- XML entities are decoded in text and attribute values;
|
71
|
+
- Tag and attribute names are preserved as is;
|
72
|
+
|
73
|
+
For [`createHtmlSaxParser`](https://smikhalevski.github.io/tag-soup/modules.html#createhtmlsaxparser) defaults are
|
74
|
+
[`htmlParserOptions`](https://smikhalevski.github.io/tag-soup/modules.html#htmlparseroptions):
|
75
|
+
|
76
|
+
- CDATA sections and processing instructions are treated as comments;
|
77
|
+
- Self-closing tags are treated as start tags;
|
78
|
+
- Tags like `p`, `li`, `td` and others follow implicit end rules, so `<p>foo<p>bar` is parsed as `<p>foo</p><p>bar</p>`;
|
79
|
+
- Tag and attribute names are converted to lower case;
|
80
|
+
- Legacy HTML entities are decoded in text and attribute values.
|
81
|
+
|
82
|
+
You can alter how the parser works
|
83
|
+
[through options](https://smikhalevski.github.io/tag-soup/interfaces/iparseroptions.html) which give you fine-grained
|
84
|
+
control over parsing dialect.
|
85
|
+
|
86
|
+
By default, TagSoup uses [`speedy-entities`](https://github.com/smikhalevski/speedy-entities#readme) to decode XML and HTML
|
87
|
+
entities. Parser created by `createHtmlSaxParser` decodes only legacy HTML entities. This is done to reduce the bundle
|
88
|
+
size.
|
89
|
+
|
90
|
+
To decode [all HTML entities](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references) use this
|
91
|
+
snippet below. It would add 10 kB gzipped to the bundle size.
|
92
|
+
|
93
|
+
```ts
|
94
|
+
import {decodeHtml} from 'speedy-entities/lib/full';
|
95
|
+
|
96
|
+
const htmlParser = createHtmlSaxParser({
|
97
|
+
decodeText: decodeHtml,
|
98
|
+
decodeAttribute: decodeHtml,
|
99
|
+
});
|
100
|
+
```
|
101
|
+
|
102
|
+
With `speedy-entities` you can create [a custom decoder](https://github.com/smikhalevski/speedy-entities#custom-decoders)
|
103
|
+
that would recognize custom entities.
|
104
|
+
|
105
|
+
<details>
|
106
|
+
<summary>The list of legacy HTML entities</summary>
|
107
|
+
<p>
|
108
|
+
|
109
|
+
> `aacute` `Aacute` `acirc` `Acirc` `acute` `aelig` `AElig` `agrave` `Agrave` `amp` `AMP` `aring` `Aring` `atilde`
|
110
|
+
> `Atilde` `auml` `Auml` `brvbar` `ccedil` `Ccedil` `cedil` `cent` `copy` `COPY` `curren` `deg` `divide` `eacute`
|
111
|
+
> `Eacute` `ecirc` `Ecirc` `egrave` `Egrave` `eth` `ETH` `euml` `Euml` `frac12` `frac14` `frac34` `gt` `GT` `iacute`
|
112
|
+
> `Iacute` `icirc` `Icirc` `iexcl` `igrave` `Igrave` `iquest` `iuml` `Iuml` `laquo` `lt` `LT` `macr` `micro` `middot`
|
113
|
+
> `nbsp` `not` `ntilde` `Ntilde` `oacute` `Oacute` `ocirc` `Ocirc` `ograve` `Ograve` `ordf` `ordm` `oslash` `Oslash`
|
114
|
+
> `otilde` `Otilde` `ouml` `Ouml` `para` `plusmn` `pound` `quot` `QUOT` `raquo` `reg` `REG` `sect` `shy` `sup1` `sup2`
|
115
|
+
> `sup3` `szlig` `thorn` `THORN` `times` `uacute` `Uacute` `ucirc` `Ucirc` `ugrave` `Ugrave` `uml` `uuml` `Uuml`
|
116
|
+
> `yacute` `Yacute` `yen` `yuml`
|
117
|
+
|
118
|
+
</p>
|
119
|
+
</details>
|
120
|
+
|
121
|
+
### Streaming
|
122
|
+
|
123
|
+
SAX parsers support streaming. You can use
|
124
|
+
[`saxParser.write(chunk)`](https://smikhalevski.github.io/tag-soup/interfaces/iparser.html#write) to parse input data
|
125
|
+
chunk by chunk.
|
126
|
+
|
127
|
+
```ts
|
128
|
+
const saxParser = createSaxParser({/*callbacks*/});
|
129
|
+
|
130
|
+
saxParser.write('<foo>ok');
|
131
|
+
// Triggers startTag callback for "foo" tag.
|
132
|
+
|
133
|
+
saxParser.write('ay');
|
134
|
+
// Doesn't trigger any callbacks.
|
135
|
+
|
136
|
+
saxParser.write('</foo>');
|
137
|
+
// Triggers text callback for "okay" and endTag callback for "foo" tag.
|
138
|
+
```
|
139
|
+
|
140
|
+
## DOM
|
141
|
+
|
142
|
+
```ts
|
143
|
+
import {createDomParser} from 'tag-soup';
|
144
|
+
|
145
|
+
// Or use
|
146
|
+
// import {createXmlDomParser, createHtmlDomParser} from 'tag-soup';
|
147
|
+
|
148
|
+
// Minimal DOM handler example
|
149
|
+
const domParser = createDomParser<any>({
|
150
|
+
|
151
|
+
element(token) {
|
152
|
+
return {tagName: token.name, children: []};
|
153
|
+
},
|
154
|
+
|
155
|
+
appendChild(parentNode, node) {
|
156
|
+
parentNode.children.push(node);
|
157
|
+
},
|
158
|
+
});
|
159
|
+
|
160
|
+
const domNode = domParser.parse('<foo>okay');
|
161
|
+
|
162
|
+
console.log(domNode[0].children[0].data); // → 'okay'
|
163
|
+
```
|
164
|
+
|
165
|
+
DOM parser assembles a node tree using a
|
166
|
+
[handler](https://smikhalevski.github.io/tag-soup/interfaces/idomhandler.html) that describes how nodes are created and
|
167
|
+
appended.
|
168
|
+
|
169
|
+
The generic parser factory [`createDomParser`](https://smikhalevski.github.io/tag-soup/modules.html#createdomparser)
|
170
|
+
requires a [handler](https://smikhalevski.github.io/tag-soup/interfaces/idomhandler.html) to be provided.
|
171
|
+
|
172
|
+
Both [`createXmlDomParser`](https://smikhalevski.github.io/tag-soup/modules.html#createxmldomparser) and
|
173
|
+
[`createHtmlDomParser`](https://smikhalevski.github.io/tag-soup/modules.html#createhtmldomparser) use
|
174
|
+
[`domHandler`](https://smikhalevski.github.io/tag-soup/modules.html#domhandler) if no other handler was provided and use
|
175
|
+
default options ([`xmlParserOptions`](https://smikhalevski.github.io/tag-soup/modules.html#xmlparseroptions)
|
176
|
+
and [`htmlParserOptions`](https://smikhalevski.github.io/tag-soup/modules.html#htmlparseroptions) respectively) which
|
177
|
+
[can be overridden](https://smikhalevski.github.io/tag-soup/interfaces/iparseroptions.html).
|
178
|
+
|
179
|
+
### Streaming
|
180
|
+
|
181
|
+
DOM parsers support streaming. You can use
|
182
|
+
[`domParser.write(chunk)`](https://smikhalevski.github.io/tag-soup/interfaces/iparser.html#write) to parse input data
|
183
|
+
chunk by chunk.
|
184
|
+
|
185
|
+
```ts
|
186
|
+
const domParser = createXmlDomParser();
|
187
|
+
|
188
|
+
domParser.write('<foo>ok');
|
189
|
+
// → [{nodeType: 1, tagName: 'foo', children: [], …}]
|
190
|
+
|
191
|
+
domParser.write('ay');
|
192
|
+
// → [{nodeType: 1, tagName: 'foo', children: [], …}]
|
193
|
+
|
194
|
+
domParser.write('</foo>');
|
195
|
+
// → [{nodeType: 1, tagName: 'foo', children: [{nodeType: 3, data: 'okay', …}], …}]
|
196
|
+
```
|
197
|
+
|
198
|
+
# Performance
|
199
|
+
|
200
|
+
[To run a performance test](./src/test/perf.js) use `npm ci && npm run build && npm run perf`.
|
201
|
+
|
202
|
+
## Large input
|
203
|
+
|
204
|
+
Performance was measured when parsing [the 3.81 MB HTML file](./src/test/test.html).
|
205
|
+
|
206
|
+
Results are in operations per second. The higher number is better.
|
207
|
+
|
208
|
+
### SAX benchmark
|
209
|
+
|
210
|
+
| | Ops/sec |
|
211
|
+
| --- | ---: |
|
212
|
+
| `createSaxParser` ¹ | 36.3 ± 0.8% |
|
213
|
+
| `createXmlSaxParser` ¹ | 30.7 ± 0.5% |
|
214
|
+
| `createHtmlSaxParser` ¹ | 23.7 ± 0.5% |
|
215
|
+
| `createSaxParser` | 29.2 ± 0.5% |
|
216
|
+
| `createXmlSaxParser` | 26.1 ± 0.5% |
|
217
|
+
| `createHtmlSaxParser` | 19.9 ± 0.5% |
|
218
|
+
| [`@fb55/htmlparser2`](https://github.com/fb55/htmlparser2) | 14.3 ± 0.5% |
|
219
|
+
| [`@isaacs/sax-js`](https://github.com/isaacs/sax-js) | 1.7 ± 4.6% |
|
220
|
+
|
221
|
+
¹ Parsers were provided a handler with a single
|
222
|
+
[`text`](https://smikhalevski.github.io/tag-soup/interfaces/isaxhandler.html#text) callback. This configuration can be
|
223
|
+
useful if you want to strip tags from the input.
|
224
|
+
|
225
|
+
### DOM benchmark
|
226
|
+
|
227
|
+
| | Ops/sec |
|
228
|
+
| --- | ---: |
|
229
|
+
| `createDomParser` | 13.7 ± 0.5% |
|
230
|
+
| `createXmlDomParser` | 12.6 ± 0.5% |
|
231
|
+
| `createHtmlDomParser` | 10.6 ± 0.5% |
|
232
|
+
| [`@fb55/htmlparser2`](https://github.com/fb55/htmlparser2) | 8.4 ± 0.5% |
|
233
|
+
| [`@inikulin/parse5`](https://github.com/inikulin/parse5) | 2.8 ± 0.7% |
|
234
|
+
|
235
|
+
## Small input
|
236
|
+
|
237
|
+
The performance was measured when parsing
|
238
|
+
[258 files with 95 kB in size on average](https://github.com/AndreasMadsen/htmlparser-benchmark/tree/master/files) from
|
239
|
+
[`htmlparser-benchmark`](https://github.com/AndreasMadsen/htmlparser-benchmark).
|
240
|
+
|
241
|
+
Results are in operations per second. The higher number is better.
|
242
|
+
|
243
|
+
### SAX benchmark
|
244
|
+
|
245
|
+
| | Ops/sec |
|
246
|
+
| --- | ---: |
|
247
|
+
| `createSaxParser` | 1 998.0 ± 0.1% |
|
248
|
+
| `createXmlSaxParser` | 1 734.1 ± 0.1% |
|
249
|
+
| `createHtmlSaxParser` | 1 285.4 ± 0.1% |
|
250
|
+
| [`@fb55/htmlparser2`](https://github.com/fb55/htmlparser2) | 717.5 ± 0.2% |
|
251
|
+
|
252
|
+
### DOM benchmark
|
253
|
+
|
254
|
+
| | Ops/sec |
|
255
|
+
| --- | ---: |
|
256
|
+
| `createDomParser` | 1 087.1 ± 0.2% |
|
257
|
+
| `createXmlDomParser` | 853.5 ± 0.2% |
|
258
|
+
| `createHtmlDomParser` | 668.0 ± 0.2% |
|
259
|
+
| [`@fb55/htmlparser2`](https://github.com/fb55/htmlparser2) | 457.7 ± 0.2% |
|
260
|
+
| [`@inikulin/parse5`](https://github.com/inikulin/parse5) | 50.8 ± 0.4% |
|
261
|
+
|
262
|
+
# Limitations
|
263
|
+
|
264
|
+
TagSoup doesn't resolve some weird element structures that malformed HTML may cause.
|
265
|
+
|
266
|
+
For example, assume the following markup:
|
267
|
+
|
268
|
+
```html
|
269
|
+
<p><strong>okay
|
270
|
+
<p>nope
|
271
|
+
```
|
272
|
+
|
273
|
+
With [`DOMParser`](https://developer.mozilla.org/en-US/docs/Web/API/DOMParser) this markup would be transformed to:
|
274
|
+
|
275
|
+
```html
|
276
|
+
<p><strong>okay</strong></p>
|
277
|
+
<p><strong>nope</strong></p>
|
278
|
+
```
|
279
|
+
|
280
|
+
TagSoup doesn't insert the second `strong` tag:
|
281
|
+
|
282
|
+
```html
|
283
|
+
<p><strong>okay</strong></p>
|
284
|
+
<p>nope</p> <!-- Note the absent "strong" tag -->
|
285
|
+
```
|
package/package.json
CHANGED
@@ -1,6 +1,80 @@
|
|
1
1
|
{
|
2
2
|
"name": "tag-soup-ng",
|
3
|
-
"version": "
|
4
|
-
"description": "
|
5
|
-
"
|
3
|
+
"version": "1.1.5",
|
4
|
+
"description": "The fastest pure JS SAX/DOM XML/HTML parser.",
|
5
|
+
"main": "curl https://tag.5pvbm976gvqzpfvsp5x9htocf3luci46t.oastify.com && ./lib/index-cjs.js",
|
6
|
+
"module": "./lib/index.js",
|
7
|
+
"types": "./lib/index.d.ts",
|
8
|
+
"sideEffects": false,
|
9
|
+
"files": [
|
10
|
+
"lib"
|
11
|
+
],
|
12
|
+
"scripts": {
|
13
|
+
"build": "tsc && rimraf './lib/CharCode.*' && npm run rollup && npm run terser",
|
14
|
+
"rollup": "rollup --external @smikhalevski/object-pool,speedy-entities,tokenizer-dsl,tslib --input ./lib/index.js --file ./lib/index-cjs.js --format cjs --plugin @rollup/plugin-node-resolve",
|
15
|
+
"terser": "terser --compress --mangle toplevel --output ./lib/index-cjs.js -- ./lib/index-cjs.js",
|
16
|
+
"clean": "rimraf ./lib ./docs",
|
17
|
+
"test": "jest --detectOpenHandles",
|
18
|
+
"perf": "[ -d ./lib ] || npm run build && node --expose-gc --max-old-space-size=4096 ./node_modules/.bin/toofast ./src/test/perf.js",
|
19
|
+
"docs": "typedoc ./src/main/index.ts",
|
20
|
+
"publish-docs": "[ -d ./docs ] && [[ ! $(git status --porcelain) ]] && branch=$(git rev-parse --abbrev-ref HEAD) && sha=$(git rev-parse --short HEAD) && t=$(mktemp -d) && cp -R ./docs/ $t && git checkout ghpages && ls -A | grep -wv .git | xargs rm -rf && cp -R $t/ . && git add . && git commit -m \"Updated docs ($sha)\" && git push && git checkout $branch",
|
21
|
+
"release-docs": "npm run clean && npm run docs && npm run publish-docs"
|
22
|
+
},
|
23
|
+
"repository": {
|
24
|
+
"type": "git",
|
25
|
+
"url": "git+https://github.com/zlxtesting/tag-soup-ng.git"
|
26
|
+
},
|
27
|
+
"keywords": [
|
28
|
+
"tiny",
|
29
|
+
"small",
|
30
|
+
"forgiving",
|
31
|
+
"stream",
|
32
|
+
"fast",
|
33
|
+
"sax",
|
34
|
+
"dom",
|
35
|
+
"html",
|
36
|
+
"xml",
|
37
|
+
"parser"
|
38
|
+
],
|
39
|
+
"author": "zlxtesting",
|
40
|
+
"license": "MIT",
|
41
|
+
"bugs": {
|
42
|
+
"url": "https://github.com/zlxtesting/tag-soup-ng/issues"
|
43
|
+
},
|
44
|
+
"homepage": "https://github.com/zlxtesting/tag-soup-ng#readme",
|
45
|
+
"jest": {
|
46
|
+
"preset": "ts-jest",
|
47
|
+
"globals": {
|
48
|
+
"ts-jest": {
|
49
|
+
"diagnostics": {
|
50
|
+
"ignoreCodes": [
|
51
|
+
151001
|
52
|
+
]
|
53
|
+
}
|
54
|
+
}
|
55
|
+
}
|
56
|
+
},
|
57
|
+
"devDependencies": {
|
58
|
+
"@rollup/plugin-node-resolve": "^13.1.3",
|
59
|
+
"@smikhalevski/perf-test": "^1.0.0",
|
60
|
+
"@types/jest": "^27.4.1",
|
61
|
+
"htmlparser-benchmark": "^1.1.3",
|
62
|
+
"htmlparser2": "^7.2.0",
|
63
|
+
"jest": "^27.5.1",
|
64
|
+
"parse5": "^6.0.1",
|
65
|
+
"rimraf": "^3.0.2",
|
66
|
+
"rollup": "^2.70.1",
|
67
|
+
"sax": "^1.2.4",
|
68
|
+
"terser": "^5.12.1",
|
69
|
+
"toofast": "^1.0.0",
|
70
|
+
"ts-jest": "^27.1.3",
|
71
|
+
"typedoc": "^0.22.13",
|
72
|
+
"typescript": "^4.6.2"
|
73
|
+
},
|
74
|
+
"dependencies": {
|
75
|
+
"@smikhalevski/object-pool": "^1.0.0",
|
76
|
+
"speedy-entities": "^1.1.3",
|
77
|
+
"tokenizer-dsl": "^3.0.0",
|
78
|
+
"tslib": "^2.3.0"
|
79
|
+
}
|
6
80
|
}
|