html-minifier-next 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +25 -0
- package/README.md +170 -0
- package/cli.js +308 -0
- package/dist/htmlminifier.cjs +1858 -0
- package/dist/htmlminifier.esm.bundle.js +59386 -0
- package/dist/htmlminifier.umd.bundle.js +59397 -0
- package/dist/htmlminifier.umd.bundle.min.js +9 -0
- package/package.json +95 -0
- package/src/htmlminifier.js +1366 -0
- package/src/htmlparser.js +565 -0
- package/src/tokenchain.js +68 -0
- package/src/utils.js +11 -0
|
@@ -0,0 +1,565 @@
|
|
|
1
|
+
/*!
|
|
2
|
+
* HTML Parser By John Resig (ejohn.org)
|
|
3
|
+
* Modified by Juriy "kangax" Zaytsev
|
|
4
|
+
* Original code by Erik Arvidsson, Mozilla Public License
|
|
5
|
+
* http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/*
|
|
9
|
+
* // Use like so:
|
|
10
|
+
* HTMLParser(htmlString, {
|
|
11
|
+
* start: function(tag, attrs, unary) {},
|
|
12
|
+
* end: function(tag) {},
|
|
13
|
+
* chars: function(text) {},
|
|
14
|
+
* comment: function(text) {}
|
|
15
|
+
* });
|
|
16
|
+
*
|
|
17
|
+
* // or to get an XML string:
|
|
18
|
+
* HTMLtoXML(htmlString);
|
|
19
|
+
*
|
|
20
|
+
* // or to get an XML DOM Document
|
|
21
|
+
* HTMLtoDOM(htmlString);
|
|
22
|
+
*
|
|
23
|
+
* // or to inject into an existing document/DOM node
|
|
24
|
+
* HTMLtoDOM(htmlString, document);
|
|
25
|
+
* HTMLtoDOM(htmlString, document.body);
|
|
26
|
+
*
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
/* global ActiveXObject, DOMDocument */
|
|
30
|
+
|
|
31
|
+
import { replaceAsync } from './utils.js';
|
|
32
|
+
|
|
33
|
+
/**
 * A Set of strings whose membership test ignores letter case.
 *
 * String members are stored lower-cased, and lookups lower-case the probe
 * before delegating to the native Set. Normalising on insertion as well as on
 * lookup means a member supplied in mixed case (e.g. 'IMG') is still found.
 */
class CaseInsensitiveSet extends Set {
  /**
   * Adds a member, lower-casing it first when it is a string.
   * The native Set constructor inserts its initial values through this
   * method, so values passed to `new CaseInsensitiveSet([...])` are
   * normalised too.
   * @param {*} value - Member to add; non-strings are stored unchanged.
   * @returns {CaseInsensitiveSet} This set, for chaining.
   */
  add(value) {
    return super.add(typeof value === 'string' ? value.toLowerCase() : value);
  }

  /**
   * Removes a member, ignoring case for strings.
   * @param {*} value - Member to remove.
   * @returns {boolean} True when a member was removed.
   */
  delete(value) {
    return super.delete(typeof value === 'string' ? value.toLowerCase() : value);
  }

  /**
   * Case-insensitive membership test.
   * @param {string} str - Name to look up; must be a string (it is lower-cased).
   * @returns {boolean} True when the lower-cased name is a member.
   */
  has(str) {
    return super.has(str.toLowerCase());
  }
}
|
|
38
|
+
|
|
39
|
+
// Regular Expressions for parsing tags and attributes

// Attribute name: one or more characters that are not whitespace, a quote,
// `<`, `>`, `/` or `=`. Captured as a single group.
const singleAttrIdentifier = /([^\s"'<>/=]+)/;
// Built-in assignment operators between an attribute name and its value.
// joinSingleAttrAssigns() appends handler.customAttrAssign to this list.
const singleAttrAssigns = [/=/];
// Pattern sources for the three attribute-value forms. Each contributes exactly
// one capture group, and attrForHandler() joins them with `|`, so at most one
// of the three groups is defined for any given attribute.
const singleAttrValues = [
  // attr value double quotes
  /"([^"]*)"+/.source,
  // attr value, single quotes
  /'([^']*)'+/.source,
  // attr value, no quotes
  /([^ \t\n\f\r"'`=<>]+)/.source
];
|
|
50
|
+
// https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-QName
// Regex source (a string, not a RegExp) with one capture group matching a
// qualified name: an optional `prefix:` followed by a local name. It is spliced
// into startTagOpen and endTag.
const qnameCapture = (function () {
  // based on https://www.npmjs.com/package/ncname
  // The four strings below are character-class fragments (ranges without the
  // surrounding brackets); backslashes are doubled because they are string
  // literals that are later handed to `new RegExp`.
  const combiningChar = '\\u0300-\\u0345\\u0360\\u0361\\u0483-\\u0486\\u0591-\\u05A1\\u05A3-\\u05B9\\u05BB-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u064B-\\u0652\\u0670\\u06D6-\\u06E4\\u06E7\\u06E8\\u06EA-\\u06ED\\u0901-\\u0903\\u093C\\u093E-\\u094D\\u0951-\\u0954\\u0962\\u0963\\u0981-\\u0983\\u09BC\\u09BE-\\u09C4\\u09C7\\u09C8\\u09CB-\\u09CD\\u09D7\\u09E2\\u09E3\\u0A02\\u0A3C\\u0A3E-\\u0A42\\u0A47\\u0A48\\u0A4B-\\u0A4D\\u0A70\\u0A71\\u0A81-\\u0A83\\u0ABC\\u0ABE-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD\\u0B01-\\u0B03\\u0B3C\\u0B3E-\\u0B43\\u0B47\\u0B48\\u0B4B-\\u0B4D\\u0B56\\u0B57\\u0B82\\u0B83\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0BD7\\u0C01-\\u0C03\\u0C3E-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56\\u0C82\\u0C83\\u0CBE-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0CD5\\u0CD6\\u0D02\\u0D03\\u0D3E-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4D\\u0D57\\u0E31\\u0E34-\\u0E3A\\u0E47-\\u0E4E\\u0EB1\\u0EB4-\\u0EB9\\u0EBB\\u0EBC\\u0EC8-\\u0ECD\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F3E\\u0F3F\\u0F71-\\u0F84\\u0F86-\\u0F8B\\u0F90-\\u0F95\\u0F97\\u0F99-\\u0FAD\\u0FB1-\\u0FB7\\u0FB9\\u20D0-\\u20DC\\u20E1\\u302A-\\u302F\\u3099\\u309A';
  const digit = '0-9\\u0660-\\u0669\\u06F0-\\u06F9\\u0966-\\u096F\\u09E6-\\u09EF\\u0A66-\\u0A6F\\u0AE6-\\u0AEF\\u0B66-\\u0B6F\\u0BE7-\\u0BEF\\u0C66-\\u0C6F\\u0CE6-\\u0CEF\\u0D66-\\u0D6F\\u0E50-\\u0E59\\u0ED0-\\u0ED9\\u0F20-\\u0F29';
  const extender = '\\xB7\\u02D0\\u02D1\\u0387\\u0640\\u0E46\\u0EC6\\u3005\\u3031-\\u3035\\u309D\\u309E\\u30FC-\\u30FE';
  const letter = 'A-Za-z\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\u0131\\u0134-\\u013E\\u0141-\\u0148\\u014A-\\u017E\\u0180-\\u01C3\\u01CD-\\u01F0\\u01F4\\u01F5\\u01FA-\\u0217\\u0250-\\u02A8\\u02BB-\\u02C1\\u0386\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03D0-\\u03D6\\u03DA\\u03DC\\u03DE\\u03E0\\u03E2-\\u03F3\\u0401-\\u040C\\u040E-\\u044F\\u0451-\\u045C\\u045E-\\u0481\\u0490-\\u04C4\\u04C7\\u04C8\\u04CB\\u04CC\\u04D0-\\u04EB\\u04EE-\\u04F5\\u04F8\\u04F9\\u0531-\\u0556\\u0559\\u0561-\\u0586\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0621-\\u063A\\u0641-\\u064A\\u0671-\\u06B7\\u06BA-\\u06BE\\u06C0-\\u06CE\\u06D0-\\u06D3\\u06D5\\u06E5\\u06E6\\u0905-\\u0939\\u093D\\u0958-\\u0961\\u0985-\\u098C\\u098F\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09DC\\u09DD\\u09DF-\\u09E1\\u09F0\\u09F1\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A33\\u0A35\\u0A36\\u0A38\\u0A39\\u0A59-\\u0A5C\\u0A5E\\u0A72-\\u0A74\\u0A85-\\u0A8B\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABD\\u0AE0\\u0B05-\\u0B0C\\u0B0F\\u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B36-\\u0B39\\u0B3D\\u0B5C\\u0B5D\\u0B5F-\\u0B61\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB5\\u0BB7-\\u0BB9\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C60\\u0C61\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CDE\\u0CE0\\u0CE1\\u0D05-\\u0D0C\\u0D0E-\\u0D10\\u0D12-\\u0D28\\u0D2A-\\u0D39\\u0D60\\u0D61\\u0E01-\\u0E2E\\u0E30\\u0E32\\u0E33\\u0E40-\\u0E45\\u0E81\\u0E82\\u0E84\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD\\u0EAE\\u0EB0\\u0EB2\\u0EB3\\u0EBD\\u0EC0-\\u0EC4\\u0F40-\\u0F47\\u0F49-\\u0F69\\u10A0-\\u10C5\\u10D0-\\u10F6\\u1100\\u1102\\u1103\\u1105-\\u1107\\u1109\\u110B\\u110C\\u110E-\\u1112\\u113C\\u113E\\u1140\\u114C\\u114E\\u1150\\u1154\\u1155\\u1159\\u115F-\\u1161\\u1163\\u1165\\u1167\\u1169\\u116D\\u116E\\u1172\\u1173\\u1175\\u119E\\u11A8\\u11AB\\u11AE\\u11AF\\u11B7\\u11B8\\u11BA\\u11BC-\\u11C2\\u11EB\\u11F0\\u11F9\\u1E00-\\u1E9B\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-\\u1FBC\\u1FBE\\u1FC2-\\u1FC4\\u1FC6-\\u1FCC\\u1FD0-\\u1FD3\\u1FD6-\\u1FDB\\u1FE0-\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FFC\\u2126\\u212A\\u212B\\u212E\\u2180-\\u2182\\u3007\\u3021-\\u3029\\u3041-\\u3094\\u30A1-\\u30FA\\u3105-\\u312C\\u4E00-\\u9FA5\\uAC00-\\uD7A3';
  // A non-colonised name: first character is a letter or `_`; the rest may also
  // be digits, `.`, `-`, `_`, combining characters or extenders.
  const ncname = '[' + letter + '_][' + letter + digit + '\\.\\-_' + combiningChar + extender + ']*';
  // Optional non-captured `prefix:` followed by the local name, all inside one
  // capture group so callers read the full qualified name from group 1.
  return '((?:' + ncname + '\\:)?' + ncname + ')';
})();
|
|
60
|
+
// Start of an opening tag: `<` immediately followed by a qualified name (group 1).
const startTagOpen = new RegExp('^<' + qnameCapture);
// End of an opening tag: optional whitespace, optional `/` (group 1), then `>`.
const startTagClose = /^\s*(\/?)>/;
// A closing tag: `</name`, then anything up to the next `>`. Group 1 is the
// qualified name. Exported for use outside this module.
export const endTag = new RegExp('^<\\/' + qnameCapture + '[^>]*>');
// A doctype declaration (case-insensitive), up to the first `>`.
const doctype = /^<!DOCTYPE\s?[^>]+>/i;
|
|
64
|
+
|
|
65
|
+
// Feature probe, run once at module load: replaces 'x' using a pattern whose
// optional group cannot participate in the match. The flag becomes true when
// the engine reports that unmatched group as '' rather than undefined.
// handleStartTag() consults it to delete such empty captures before deciding
// which attribute-value form matched.
let IS_REGEX_CAPTURING_BROKEN = false;
'x'.replace(/x(.)?/g, function (m, g) {
  IS_REGEX_CAPTURING_BROKEN = g === '';
});
|
|
69
|
+
|
|
70
|
+
// Empty Elements
// A start tag whose name is in this set is reported to handler.start as unary
// and is never pushed onto the open-element stack.
const empty = new CaseInsensitiveSet(['area', 'base', 'basefont', 'br', 'col', 'embed', 'frame', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']);

// Inline Elements
// Only consulted when handler.html5 is falsy: a start tag that is NOT in this
// set first closes every open inline element at the top of the stack.
const inline = new CaseInsensitiveSet(['a', 'abbr', 'acronym', 'applet', 'b', 'basefont', 'bdo', 'big', 'br', 'button', 'cite', 'code', 'del', 'dfn', 'em', 'font', 'i', 'iframe', 'img', 'input', 'ins', 'kbd', 'label', 'map', 'noscript', 'object', 'q', 's', 'samp', 'script', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'svg', 'textarea', 'tt', 'u', 'var']);

// Elements that you can, intentionally, leave open
// (and which close themselves)
// A start tag in this set closes the currently open element when that element
// has the same tag name.
const closeSelf = new CaseInsensitiveSet(['colgroup', 'dd', 'dt', 'li', 'option', 'p', 'td', 'tfoot', 'th', 'thead', 'tr', 'source']);

// Attributes that have their values filled in disabled='disabled'
// When one of these appears without any value, the parser reports the
// attribute name as its value.
const fillAttrs = new CaseInsensitiveSet(['checked', 'compact', 'declare', 'defer', 'disabled', 'ismap', 'multiple', 'nohref', 'noresize', 'noshade', 'nowrap', 'readonly', 'selected']);

// Special Elements (can contain anything)
// While one of these is the innermost open element, the parser does not
// tokenise; it consumes everything up to the matching close tag as raw text.
const special = new CaseInsensitiveSet(['script', 'style']);

// HTML5 tags https://html.spec.whatwg.org/multipage/indices.html#elements-3
// Phrasing Content https://html.spec.whatwg.org/multipage/dom.html#phrasing-content
// In html5 mode, a start tag in this set closes an open `p` element.
const nonPhrasing = new CaseInsensitiveSet(['address', 'article', 'aside', 'base', 'blockquote', 'body', 'caption', 'col', 'colgroup', 'dd', 'details', 'dialog', 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html', 'legend', 'li', 'menuitem', 'meta', 'ol', 'optgroup', 'option', 'param', 'rp', 'rt', 'source', 'style', 'summary', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'track', 'ul']);

// Module-wide cache of the "raw text up to </tag>" regexes built by
// HTMLParser#parse, keyed by lower-cased tag name.
const reCache = {};
|
|
91
|
+
|
|
92
|
+
/**
 * Builds the regex used to consume one attribute from the inside of a start tag.
 *
 * The base pattern is: name, then optionally an assignment operator and one of
 * the three value forms. When `handler.customAttrSurround` is set, one
 * alternative is emitted per [open, close] pair (open delimiter, base pattern,
 * close delimiter), followed by the plain base pattern as the last alternative.
 *
 * @param {object} handler - Parser handler; `customAttrAssign` and
 *   `customAttrSurround` (an array of [RegExp, RegExp] pairs) are read.
 * @returns {RegExp} Pattern anchored at the start of input, allowing leading whitespace.
 */
function attrForHandler(handler) {
  const assignAlternatives = joinSingleAttrAssigns(handler);
  const valueAlternatives = singleAttrValues.join('|');
  const single = `${singleAttrIdentifier.source}(?:\\s*(${assignAlternatives})[ \\t\\n\\f\\r]*(?:${valueAlternatives}))?`;

  const surround = handler.customAttrSurround;
  if (!surround) {
    return new RegExp('^\\s*' + single);
  }

  // One alternative per surround pair, in declaration order, so the capture
  // group offsets stay a fixed stride per pair.
  const alternatives = surround.map(function (pair) {
    return `(?:(${pair[0].source})\\s*${single}\\s*(${pair[1].source}))`;
  });
  // The un-surrounded form always comes last.
  alternatives.push(`(?:${single})`);

  return new RegExp('^\\s*' + `(?:${alternatives.join('|')})`);
}
|
|
110
|
+
|
|
111
|
+
/**
 * Produces the regex alternation of every accepted attribute assignment
 * operator: the built-in ones followed by `handler.customAttrAssign`, if any.
 *
 * @param {object} handler - Parser handler; `customAttrAssign` is read.
 * @returns {string} Non-capturing groups, one per operator, joined with `|`.
 */
function joinSingleAttrAssigns(handler) {
  const customAssigns = handler.customAttrAssign || [];
  const groups = [];
  for (const assign of singleAttrAssigns.concat(customAssigns)) {
    groups.push('(?:' + assign.source + ')');
  }
  return groups.join('|');
}
|
|
118
|
+
|
|
119
|
+
/**
 * Regex-driven HTML tokenizer that reports what it finds through callbacks.
 *
 * Callbacks read from the handler (all optional): `start(tag, attrs, unary,
 * unarySlash[, autoGenerated])`, `end(tag, attrs, autoGenerated)`,
 * `chars(text[, prevTag, nextTag])`, `comment(text[, nonStandard])` and
 * `doctype(text)`. Every callback is awaited, so handlers may be async and are
 * invoked strictly in document order.
 *
 * Options read from the handler: `html5`, `partialMarkup`,
 * `continueOnParseError`, `customAttrAssign`, `customAttrSurround`.
 */
export class HTMLParser {
  /**
   * @param {string} html - Markup to parse.
   * @param {object} handler - Callbacks and options (see the class description).
   */
  constructor(html, handler) {
    this.html = html;
    this.handler = handler;
  }

  /**
   * Parses the markup given to the constructor, invoking handler callbacks as
   * tokens are recognised.
   *
   * @returns {Promise<void>} Resolves once all input is consumed and, unless
   *   `handler.partialMarkup` is set, every still-open element has been closed.
   * @throws {Error} 'Parse Error: …' when an iteration consumes no input.
   */
  async parse() {
    let html = this.html;
    const handler = this.handler;

    // Open-element stack of { tag, attrs }; lastTag mirrors the top entry's tag.
    const stack = []; let lastTag;
    const attribute = attrForHandler(handler);
    // prevTag / nextTag describe the tokens on either side of a text run and
    // are passed to handler.chars.
    let last, prevTag, nextTag;
    while (html) {
      last = html;
      // Make sure we're not in a script or style element
      if (!lastTag || !special.has(lastTag)) {
        let textEnd = html.indexOf('<');
        if (textEnd === 0) {
          // Comment:
          if (/^<!--/.test(html)) {
            const commentEnd = html.indexOf('-->');

            if (commentEnd >= 0) {
              if (handler.comment) {
                await handler.comment(html.substring(4, commentEnd));
              }
              html = html.substring(commentEnd + 3);
              prevTag = '';
              continue;
            }
          }

          // https://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment
          if (/^<!\[/.test(html)) {
            const conditionalEnd = html.indexOf(']>');

            if (conditionalEnd >= 0) {
              if (handler.comment) {
                // Text passed is `[…]`, i.e. without the leading `<!` and trailing `>`.
                await handler.comment(html.substring(2, conditionalEnd + 1), true /* non-standard */);
              }
              html = html.substring(conditionalEnd + 2);
              prevTag = '';
              continue;
            }
          }

          // Doctype:
          const doctypeMatch = html.match(doctype);
          if (doctypeMatch) {
            if (handler.doctype) {
              // Awaited like every other callback so an async handler keeps
              // document order and its rejection propagates to the caller.
              await handler.doctype(doctypeMatch[0]);
            }
            html = html.substring(doctypeMatch[0].length);
            prevTag = '';
            continue;
          }

          // End tag:
          const endTagMatch = html.match(endTag);
          if (endTagMatch) {
            html = html.substring(endTagMatch[0].length);
            // replaceAsync is used only to call parseEndTag(fullMatch, tagName)
            // and wait for it; its string result is discarded.
            await replaceAsync(endTagMatch[0], endTag, parseEndTag);
            prevTag = '/' + endTagMatch[1].toLowerCase();
            continue;
          }

          // Start tag:
          const startTagMatch = parseStartTag(html);
          if (startTagMatch) {
            html = startTagMatch.rest;
            await handleStartTag(startTagMatch);
            prevTag = startTagMatch.tagName.toLowerCase();
            continue;
          }

          // Treat `<` as text
          // Without continueOnParseError, textEnd stays 0, nothing is consumed
          // below and the loop ends in the 'Parse Error' throw.
          if (handler.continueOnParseError) {
            textEnd = html.indexOf('<', 1);
          }
        }

        let text;
        if (textEnd >= 0) {
          text = html.substring(0, textEnd);
          html = html.substring(textEnd);
        } else {
          // No further `<`: the remainder is all text.
          text = html;
          html = '';
        }

        // next tag
        // Peek (without consuming) at what follows the text run.
        let nextTagMatch = parseStartTag(html);
        if (nextTagMatch) {
          nextTag = nextTagMatch.tagName;
        } else {
          nextTagMatch = html.match(endTag);
          if (nextTagMatch) {
            nextTag = '/' + nextTagMatch[1];
          } else {
            nextTag = '';
          }
        }

        if (handler.chars) {
          await handler.chars(text, prevTag, nextTag);
        }
        prevTag = '';
      } else {
        // Inside a special element: take everything up to its close tag as
        // raw text in one step.
        const stackedTag = lastTag.toLowerCase();
        const reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)</' + stackedTag + '[^>]*>', 'i'));

        html = await replaceAsync(html, reStackedTag, async (_, text) => {
          if (stackedTag !== 'script' && stackedTag !== 'style' && stackedTag !== 'noscript') {
            // Unwrap comment and CDATA markers, keeping their content.
            text = text
              .replace(/<!--([\s\S]*?)-->/g, '$1')
              .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, '$1');
          }

          if (handler.chars) {
            await handler.chars(text);
          }

          // Remove the raw text and the close tag from the remaining input.
          return '';
        });

        await parseEndTag('</' + stackedTag + '>', stackedTag);
      }

      // No progress in this iteration would loop forever; fail instead.
      if (html === last) {
        throw new Error('Parse Error: ' + html);
      }
    }

    if (!handler.partialMarkup) {
      // Clean up any remaining tags
      await parseEndTag();
    }

    /**
     * Tries to read a complete start tag from the beginning of `input`.
     * @param {string} input - Remaining markup.
     * @returns {object|undefined} `{ tagName, attrs, unarySlash, rest }` where
     *   `attrs` holds raw regex match arrays and `rest` is the input after the
     *   tag; undefined when `input` does not begin with a well-formed start tag.
     */
    function parseStartTag(input) {
      const start = input.match(startTagOpen);
      if (start) {
        const match = {
          tagName: start[1],
          attrs: []
        };
        input = input.slice(start[0].length);
        let end, attr;
        // Consume attributes until the tag closes or nothing more matches.
        while (!(end = input.match(startTagClose)) && (attr = input.match(attribute))) {
          input = input.slice(attr[0].length);
          match.attrs.push(attr);
        }
        if (end) {
          match.unarySlash = end[1];
          match.rest = input.slice(end[0].length);
          return match;
        }
      }
    }

    /**
     * Closes `tagName` (and everything opened after it) when it is on the stack.
     * @param {string} tagName - Tag to look for.
     * @returns {Promise<true|undefined>} True when the tag was found and closed.
     */
    async function closeIfFound(tagName) {
      if (findTag(tagName) >= 0) {
        await parseEndTag('', tagName);
        return true;
      }
    }

    /**
     * Applies implicit-close rules for a start tag, normalises its attributes,
     * updates the stack and reports the tag through handler.start.
     * @param {object} match - Result of parseStartTag.
     */
    async function handleStartTag(match) {
      const tagName = match.tagName;
      let unarySlash = match.unarySlash;

      if (handler.html5) {
        if (lastTag === 'p' && nonPhrasing.has(tagName)) {
          await parseEndTag('', lastTag);
        } else if (tagName === 'tbody') {
          await closeIfFound('thead');
        } else if (tagName === 'tfoot') {
          if (!await closeIfFound('tbody')) {
            await closeIfFound('thead');
          }
        }
        // A `col` with no open `colgroup` gets a synthesised one.
        if (tagName === 'col' && findTag('colgroup') < 0) {
          lastTag = 'colgroup';
          stack.push({ tag: lastTag, attrs: [] });
          if (handler.start) {
            await handler.start(lastTag, [], false, '');
          }
        }
      }

      if (!handler.html5 && !inline.has(tagName)) {
        while (lastTag && inline.has(lastTag)) {
          await parseEndTag('', lastTag);
        }
      }

      if (closeSelf.has(tagName) && lastTag === tagName) {
        await parseEndTag('', tagName);
      }

      const unary = empty.has(tagName) || (tagName === 'html' && lastTag === 'head') || !!unarySlash;

      const attrs = match.attrs.map(function (args) {
        let name, value, customOpen, customClose, customAssign, quote;
        // Each customAttrSurround alternative contributes 7 groups:
        // open, name, assign, three value forms, close.
        const ncp = 7; // number of captured parts, scalar

        // hackish work around FF bug https://bugzilla.mozilla.org/show_bug.cgi?id=369778
        if (IS_REGEX_CAPTURING_BROKEN && args[0].indexOf('""') === -1) {
          if (args[3] === '') { delete args[3]; }
          if (args[4] === '') { delete args[4]; }
          if (args[5] === '') { delete args[5]; }
        }

        // Reads the assignment operator at `index` and the value from whichever
        // of the next three groups matched; returns the quote character used.
        function populate(index) {
          customAssign = args[index];
          value = args[index + 1];
          if (typeof value !== 'undefined') {
            return '"';
          }
          value = args[index + 2];
          if (typeof value !== 'undefined') {
            return '\'';
          }
          value = args[index + 3];
          if (typeof value === 'undefined' && fillAttrs.has(name)) {
            // Valueless boolean attribute: report its own name as the value.
            value = name;
          }
          return '';
        }

        let j = 1;
        if (handler.customAttrSurround) {
          // Find the surround alternative that matched, if any.
          for (let i = 0, l = handler.customAttrSurround.length; i < l; i++, j += ncp) {
            name = args[j + 1];
            if (name) {
              quote = populate(j + 2);
              customOpen = args[j];
              customClose = args[j + 6];
              break;
            }
          }
        }

        // Otherwise j now indexes the plain (un-surrounded) alternative.
        if (!name && (name = args[j])) {
          quote = populate(j + 1);
        }

        return {
          name,
          value,
          customAssign: customAssign || '=',
          customOpen: customOpen || '',
          customClose: customClose || '',
          quote: quote || ''
        };
      });

      if (!unary) {
        stack.push({ tag: tagName, attrs });
        lastTag = tagName;
        unarySlash = '';
      }

      if (handler.start) {
        await handler.start(tagName, attrs, unary, unarySlash);
      }
    }

    /**
     * Finds the innermost open element with the given name (case-insensitive).
     * @param {string} tagName - Tag to look for.
     * @returns {number} Stack index of the match, or -1 when not open.
     */
    function findTag(tagName) {
      let pos;
      const needle = tagName.toLowerCase();
      for (pos = stack.length - 1; pos >= 0; pos--) {
        if (stack[pos].tag.toLowerCase() === needle) {
          break;
        }
      }
      return pos;
    }

    /**
     * Closes the innermost open `tagName` together with everything opened after
     * it, or every open element when called without a tag name.
     * @param {string} [tag] - Source text of the end tag; falsy when the close
     *   is implied rather than present in the input.
     * @param {string} [tagName] - Name of the element to close.
     */
    async function parseEndTag(tag, tagName) {
      let pos;

      // Find the closest opened tag of the same type
      if (tagName) {
        pos = findTag(tagName);
      } else { // If no tag name is provided, clean shop
        pos = 0;
      }

      if (pos >= 0) {
        // Close all the open elements, up the stack
        for (let i = stack.length - 1; i >= pos; i--) {
          if (handler.end) {
            // Third argument is true for elements closed implicitly.
            // Awaited so an async handler finishes before the next token.
            await handler.end(stack[i].tag, stack[i].attrs, i > pos || !tag);
          }
        }

        // Remove the open elements from the stack
        stack.length = pos;
        // Becomes 0 (falsy) when the stack is now empty.
        lastTag = pos && stack[pos - 1].tag;
      } else if (tagName.toLowerCase() === 'br') {
        // Stray `</br>`: reported as a unary `br` start tag.
        if (handler.start) {
          await handler.start(tagName, [], true, '');
        }
      } else if (tagName.toLowerCase() === 'p') {
        // Stray `</p>`: reported as an empty `p` element.
        if (handler.start) {
          await handler.start(tagName, [], false, '', true);
        }
        if (handler.end) {
          await handler.end(tagName, []);
        }
      }
    }
  }
}
|
|
434
|
+
|
|
435
|
+
/**
 * Re-serialises HTML as an XML-style string: every attribute gets a
 * double-quoted value, unary tags are self-closed, and elements the parser
 * closes implicitly get explicit end tags.
 *
 * HTMLParser#parse is asynchronous, so the result is only complete once the
 * parse has been awaited; this function therefore returns a Promise.
 *
 * @param {string} html - Markup to convert.
 * @returns {Promise<string>} The serialised markup.
 * @throws {Error} Rejects with the parser's 'Parse Error: …' on unparseable input.
 */
export const HTMLtoXML = async (html) => {
  let results = '';

  const parser = new HTMLParser(html, {
    start: function (tag, attrs, unary) {
      results += '<' + tag;

      for (const attr of attrs) {
        // Values are always emitted inside double quotes, so embedded double
        // quotes must be escaped to keep the attribute well-formed.
        results += ' ' + attr.name + '="' + (attr.value || '').replace(/"/g, '&quot;') + '"';
      }

      results += (unary ? '/' : '') + '>';
    },
    end: function (tag) {
      results += '</' + tag + '>';
    },
    chars: function (text) {
      results += text;
    },
    comment: function (text) {
      results += '<!--' + text + '-->';
    },
    ignore: function (text) {
      results += text;
    }
  });

  await parser.parse();

  return results;
};
|
|
466
|
+
|
|
467
|
+
/**
 * Parses HTML and appends the resulting elements and text nodes to a DOM
 * document.
 *
 * When `doc` is omitted, a document is created from whichever of
 * `DOMDocument`, `document.implementation.createDocument` or `ActiveXObject`
 * exists in the environment. HTMLParser#parse is asynchronous, so the document
 * is only fully populated once the returned Promise resolves.
 *
 * @param {string} html - Markup to parse.
 * @param {object} [doc] - A document, or a node whose owner document is used.
 * @returns {Promise<object>} The populated document.
 * @throws {TypeError} When no document is supplied and none can be created.
 */
export const HTMLtoDOM = async (html, doc) => {
  const hasOwn = (object, key) => Object.prototype.hasOwnProperty.call(object, key);

  // There can be only one of these elements
  const one = {
    html: true,
    head: true,
    body: true,
    title: true
  };

  // Enforce a structure for the document
  const structure = {
    link: 'head',
    base: 'head'
  };

  if (doc) {
    doc = doc.ownerDocument || (doc.getOwnerDocument && doc.getOwnerDocument()) || doc;
  } else if (typeof DOMDocument !== 'undefined') {
    doc = new DOMDocument();
  } else if (typeof document !== 'undefined' && document.implementation && document.implementation.createDocument) {
    doc = document.implementation.createDocument('', '', null);
  } else if (typeof ActiveXObject !== 'undefined') {
    // The guard must test the same global that is constructed.
    doc = new ActiveXObject('Msxml.DOMDocument');
  }

  if (!doc) {
    throw new TypeError('HTMLtoDOM: no document was supplied and none could be created in this environment');
  }

  // Stack of elements created by this call that are still open.
  const elems = [];
  const documentElement = doc.documentElement || (doc.getDocumentElement && doc.getDocumentElement());

  // If we're dealing with an empty document then we
  // need to pre-populate it with the HTML document structure
  if (!documentElement && doc.createElement) {
    const root = doc.createElement('html');
    const head = doc.createElement('head');
    head.appendChild(doc.createElement('title'));
    root.appendChild(head);
    root.appendChild(doc.createElement('body'));
    doc.appendChild(root);
  }

  // Find all the unique elements
  if (doc.getElementsByTagName) {
    for (const tag of Object.keys(one)) {
      one[tag] = doc.getElementsByTagName(tag)[0];
    }
  }

  // If we're working with a document, inject contents into
  // the body element
  let curParentNode = one.body;

  const parser = new HTMLParser(html, {
    start: function (tagName, attrs, unary) {
      // If it's a pre-built element, then we can ignore
      // its construction
      // Own-property check so tags named like Object.prototype members
      // (e.g. `constructor`) are not mistaken for pre-built elements.
      if (hasOwn(one, tagName) && one[tagName]) {
        curParentNode = one[tagName];
        return;
      }

      const elem = doc.createElement(tagName);

      for (const attr of attrs) {
        elem.setAttribute(attr.name, attr.value);
      }

      // Elements listed in `structure` are forced into their designated
      // parent when that parent is a real node (not the `true` placeholder
      // and not missing from the document).
      const forcedParent = hasOwn(structure, tagName) ? one[structure[tagName]] : undefined;
      if (forcedParent && typeof forcedParent !== 'boolean') {
        forcedParent.appendChild(elem);
      } else if (curParentNode && curParentNode.appendChild) {
        curParentNode.appendChild(elem);
      }

      if (!unary) {
        elems.push(elem);
        curParentNode = elem;
      }
    },
    end: function (/* tag */) {
      // pre-built elements are never pushed, so the stack may already be
      // empty here; pop() tolerates that where `length -= 1` would throw.
      elems.pop();

      // Init the new parentNode
      curParentNode = elems[elems.length - 1] || one.body;
    },
    chars: function (text) {
      if (curParentNode && curParentNode.appendChild) {
        curParentNode.appendChild(doc.createTextNode(text));
      }
    },
    comment: function (/* text */) {
      // create comment node
    },
    ignore: function (/* text */) {
      // What to do here?
    }
  });

  await parser.parse();

  return doc;
};
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
 * One node of the token-ordering tree produced by TokenChain#createSorter.
 * `keys` lists '$'-prefixed tokens in priority order, and each key is also a
 * property holding the child Sorter to continue with once that token is placed.
 */
class Sorter {
  /**
   * Reorders `tokens` in place, from `fromIndex` onward. The first key whose
   * token occurs at or after `fromIndex` has all its occurrences moved to the
   * front of that region; the remainder is then handed to that key's child.
   *
   * @param {string[]} tokens - Tokens to reorder; mutated.
   * @param {number} [fromIndex=0] - Start of the not-yet-ordered region.
   * @returns {string[]} The same `tokens` array.
   */
  sort(tokens, fromIndex = 0) {
    for (const key of this.keys) {
      const token = key.slice(1);
      let found = tokens.indexOf(token, fromIndex);

      if (found === -1) {
        continue;
      }

      while (found !== -1) {
        if (found !== fromIndex) {
          // Move this occurrence to the front of the unordered region.
          tokens.splice(found, 1);
          tokens.splice(fromIndex, 0, token);
        }
        fromIndex++;
        found = tokens.indexOf(token, fromIndex);
      }

      // Only the first matching key is applied at this level.
      return this[key].sort(tokens, fromIndex);
    }
    return tokens;
  }
}
|
|
24
|
+
|
|
25
|
+
/**
 * Collects token lists and derives a Sorter tree from how often each token
 * occurs. Each distinct token is stored under a '$'-prefixed own property
 * holding the lists that contain it, plus a `processed` counter on that array.
 */
class TokenChain {
  /**
   * Registers one token list under every token it contains. The same array
   * reference is stored in each bucket.
   * @param {string[]} tokens - Token list to register.
   */
  add(tokens) {
    for (const token of tokens) {
      const bucketKey = '$' + token;
      if (!this[bucketKey]) {
        const bucket = [];
        bucket.processed = 0;
        this[bucketKey] = bucket;
      }
      this[bucketKey].push(tokens);
    }
  }

  /**
   * Builds the Sorter for the lists added so far. Keys are visited from the
   * largest bucket to the smallest, ties broken by key name. A key is kept
   * only while its bucket still has unprocessed lists; for each kept key the
   * token is stripped from its lists and a child sorter is built recursively
   * from what remains.
   *
   * Mutates the registered token lists and the `processed` counters.
   *
   * @returns {Sorter} Root of the sorter tree.
   */
  createSorter() {
    const sorter = new Sorter();

    const byCountThenName = (left, right) => {
      const leftCount = this[left].length;
      const rightCount = this[right].length;
      if (leftCount < rightCount) {
        return 1;
      }
      if (leftCount > rightCount) {
        return -1;
      }
      if (left < right) {
        return -1;
      }
      return left > right ? 1 : 0;
    };

    const keptKeys = [];
    for (const key of Object.keys(this).sort(byCountThenName)) {
      const lists = this[key];
      if (!(lists.processed < lists.length)) {
        continue;
      }

      const token = key.slice(1);
      const chain = new TokenChain();

      for (const tokens of lists) {
        // Strip every occurrence of the current token from this list.
        for (let at = tokens.indexOf(token); at !== -1; at = tokens.indexOf(token)) {
          tokens.splice(at, 1);
        }
        // Credit the remaining tokens' buckets with one processed list each.
        for (const remaining of tokens) {
          this['$' + remaining].processed++;
        }
        chain.add(tokens.slice(0));
      }

      sorter[key] = chain.createSorter();
      keptKeys.push(key);
    }

    sorter.keys = keptKeys;
    return sorter;
  }
}
|
|
67
|
+
|
|
68
|
+
export default TokenChain;
|
package/src/utils.js
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Asynchronous counterpart of `String.prototype.replace` with a replacer
 * function. The replacer is started for every match (all matches at once, not
 * one after another); when all have settled, the matches are substituted with
 * the resolved values in match order.
 *
 * @param {string} str - Input string.
 * @param {RegExp} regex - Pattern; as with `replace`, only the first match is
 *   used unless the pattern is global.
 * @param {Function} asyncFn - Called with the usual replacer arguments (match,
 *   capture groups, offset, whole string); returns the replacement or a
 *   Promise for it.
 * @returns {Promise<string>} The string with replacements applied.
 */
export async function replaceAsync(str, regex, asyncFn) {
  const pending = [];

  // First pass: only collect the replacer results; the output is discarded.
  str.replace(regex, (...replacerArgs) => {
    pending.push(asyncFn(...replacerArgs));
  });

  const replacements = await Promise.all(pending);

  // Second pass: the same matches occur in the same order, so hand the
  // resolved values out sequentially.
  let next = 0;
  return str.replace(regex, () => replacements[next++]);
}
|