@cj-tech-master/excelts 9.5.5 → 9.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/modules/excel/worksheet.d.ts +11 -0
- package/dist/browser/modules/excel/worksheet.js +13 -0
- package/dist/browser/modules/formula/integration/apply-writeback-plan.js +17 -3
- package/dist/browser/modules/formula/integration/workbook-adapter.js +20 -1
- package/dist/browser/modules/formula/integration/workbook-snapshot.d.ts +12 -0
- package/dist/browser/modules/formula/materialize/build-writeback-plan.js +47 -0
- package/dist/browser/modules/formula/materialize/types.d.ts +19 -3
- package/dist/browser/modules/formula/materialize/types.js +13 -3
- package/dist/browser/modules/pdf/builder/document-builder.js +2 -2
- package/dist/browser/modules/pdf/font/system-fonts.d.ts +24 -4
- package/dist/browser/modules/pdf/font/system-fonts.js +76 -32
- package/dist/browser/modules/pdf/render/pdf-exporter.js +6 -3
- package/dist/browser/modules/word/advanced/field-engine.js +151 -23
- package/dist/browser/modules/word/advanced/math-convert.js +2 -1
- package/dist/browser/modules/word/advanced/style-map.js +44 -6
- package/dist/browser/modules/word/convert/html/html-import.js +434 -71
- package/dist/browser/modules/word/convert/markdown/markdown-renderer.js +11 -3
- package/dist/browser/modules/word/layout/layout-full.js +4 -1
- package/dist/browser/modules/word/security/digital-signatures.js +160 -33
- package/dist/browser/modules/word/security/encryption.js +109 -9
- package/dist/cjs/modules/excel/worksheet.js +13 -0
- package/dist/cjs/modules/formula/integration/apply-writeback-plan.js +17 -3
- package/dist/cjs/modules/formula/integration/workbook-adapter.js +20 -1
- package/dist/cjs/modules/formula/materialize/build-writeback-plan.js +47 -0
- package/dist/cjs/modules/formula/materialize/types.js +13 -3
- package/dist/cjs/modules/pdf/builder/document-builder.js +1 -1
- package/dist/cjs/modules/pdf/font/system-fonts.js +77 -32
- package/dist/cjs/modules/pdf/render/pdf-exporter.js +5 -2
- package/dist/cjs/modules/word/advanced/field-engine.js +151 -23
- package/dist/cjs/modules/word/advanced/math-convert.js +2 -1
- package/dist/cjs/modules/word/advanced/style-map.js +44 -6
- package/dist/cjs/modules/word/convert/html/html-import.js +434 -71
- package/dist/cjs/modules/word/convert/markdown/markdown-renderer.js +11 -3
- package/dist/cjs/modules/word/layout/layout-full.js +4 -1
- package/dist/cjs/modules/word/security/digital-signatures.js +160 -33
- package/dist/cjs/modules/word/security/encryption.js +109 -9
- package/dist/esm/modules/excel/worksheet.js +13 -0
- package/dist/esm/modules/formula/integration/apply-writeback-plan.js +17 -3
- package/dist/esm/modules/formula/integration/workbook-adapter.js +20 -1
- package/dist/esm/modules/formula/materialize/build-writeback-plan.js +47 -0
- package/dist/esm/modules/formula/materialize/types.js +13 -3
- package/dist/esm/modules/pdf/builder/document-builder.js +2 -2
- package/dist/esm/modules/pdf/font/system-fonts.js +76 -32
- package/dist/esm/modules/pdf/render/pdf-exporter.js +6 -3
- package/dist/esm/modules/word/advanced/field-engine.js +151 -23
- package/dist/esm/modules/word/advanced/math-convert.js +2 -1
- package/dist/esm/modules/word/advanced/style-map.js +44 -6
- package/dist/esm/modules/word/convert/html/html-import.js +434 -71
- package/dist/esm/modules/word/convert/markdown/markdown-renderer.js +11 -3
- package/dist/esm/modules/word/layout/layout-full.js +4 -1
- package/dist/esm/modules/word/security/digital-signatures.js +160 -33
- package/dist/esm/modules/word/security/encryption.js +109 -9
- package/dist/iife/excelts.iife.js +40 -26
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +3 -3
- package/dist/types/modules/excel/worksheet.d.ts +11 -0
- package/dist/types/modules/formula/integration/workbook-snapshot.d.ts +12 -0
- package/dist/types/modules/formula/materialize/types.d.ts +19 -3
- package/dist/types/modules/pdf/font/system-fonts.d.ts +24 -4
- package/package.json +1 -1
|
@@ -67,73 +67,342 @@ function tokenize(html) {
|
|
|
67
67
|
// instructions before tokenising — none of them should appear as text
|
|
68
68
|
// in the document body. The previous regex treated `<!doctype html>`
|
|
69
69
|
// as a text node containing `"!doctype html>"`.
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
//
|
|
76
|
-
//
|
|
77
|
-
//
|
|
78
|
-
//
|
|
79
|
-
const
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
70
|
+
//
|
|
71
|
+
// We use a single linear scan rather than chained `.replace()` calls so
|
|
72
|
+
// we are immune to two CodeQL findings:
|
|
73
|
+
// - Incomplete multi-character sanitization: chained replaces let
|
|
74
|
+
// payloads such as `<!--<!--x-->-->` leak through (each pass only
|
|
75
|
+
// removes one layer, leaving `-->` behind).
|
|
76
|
+
// - Polynomial regular expression on uncontrolled data: lazy
|
|
77
|
+
// quantifiers like `<!--[\s\S]*?-->` exhibit catastrophic
|
|
78
|
+
// backtracking on adversarial input.
|
|
79
|
+
const stripped = stripSgmlNoise(html);
|
|
80
|
+
// The tokenizer is implemented as a linear index scan rather than a
|
|
81
|
+
// global regex (`/<\/?…(?:\s+[^>]*?)?\/?\s*>|((?:[^<]|…)+)/g`). The
|
|
82
|
+
// previous regex form combined an optional lazy attribute span with
|
|
83
|
+
// an optional `\/?` and optional trailing whitespace, which CodeQL
|
|
84
|
+
// flagged as polynomial-redos: an adversarial payload such as
|
|
85
|
+
// `<a` followed by many spaces but no closing `>` triggered
|
|
86
|
+
// catastrophic backtracking.
|
|
87
|
+
//
|
|
88
|
+
// The scan below is strictly O(n):
|
|
89
|
+
// - At every position we either advance one character or jump
|
|
90
|
+
// forward to the next `<` / `>` via a single `indexOf`.
|
|
91
|
+
// - Attribute parsing is delegated to `parseHtmlAttrs`, which is
|
|
92
|
+
// itself a linear scanner.
|
|
93
|
+
const n = stripped.length;
|
|
94
|
+
let i = 0;
|
|
95
|
+
while (i < n) {
|
|
96
|
+
// Scan a text run: everything up to the next position that begins
|
|
97
|
+
// a tag (`<` followed by a letter, or `</` followed by a letter).
|
|
98
|
+
// Bare `<` characters and unfinished tag-like fragments are kept
|
|
99
|
+
// inside the text run so that input such as `1 < 2`, `a<b<c`,
|
|
100
|
+
// `<<<<` or `<unfinished` (with no closing `>` anywhere) is not
|
|
101
|
+
// shattered into a stream of single-character runs.
|
|
102
|
+
if (stripped.charCodeAt(i) !== 0x3c /* '<' */ || !isTagStart(stripped, i)) {
|
|
103
|
+
const textEnd = scanTextEnd(stripped, i);
|
|
104
|
+
const raw = stripped.slice(i, textEnd);
|
|
105
|
+
const text = decodeHtmlEntities(raw);
|
|
87
106
|
if (text) {
|
|
88
107
|
tokens.push({ type: "text", value: text });
|
|
89
108
|
}
|
|
109
|
+
i = textEnd;
|
|
110
|
+
if (i >= n) {
|
|
111
|
+
break;
|
|
112
|
+
}
|
|
113
|
+
// Fall through: position `i` is now at a real tag start.
|
|
114
|
+
}
|
|
115
|
+
// We are at '<' that introduces a tag (guaranteed by the
|
|
116
|
+
// `isTagStart` check above).
|
|
117
|
+
const next = stripped.charCodeAt(i + 1);
|
|
118
|
+
const isClose = next === 0x2f; /* '/' */
|
|
119
|
+
const nameStart = isClose ? i + 2 : i + 1;
|
|
120
|
+
// Defensive: the loop guard above should already ensure this, but
|
|
121
|
+
// keep the check so a future refactor cannot silently turn a bare
|
|
122
|
+
// `<` into an attempted tag parse.
|
|
123
|
+
if (!isAsciiAlpha(stripped.charCodeAt(nameStart))) {
|
|
124
|
+
tokens.push({ type: "text", value: "<" });
|
|
125
|
+
i++;
|
|
126
|
+
continue;
|
|
90
127
|
}
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
128
|
+
// Read the tag name: [A-Za-z][A-Za-z0-9]*.
|
|
129
|
+
let p = nameStart + 1;
|
|
130
|
+
while (p < n) {
|
|
131
|
+
const c = stripped.charCodeAt(p);
|
|
132
|
+
if (!isAsciiAlpha(c) && !isAsciiDigit(c)) {
|
|
133
|
+
break;
|
|
134
|
+
}
|
|
135
|
+
p++;
|
|
136
|
+
}
|
|
137
|
+
const tagName = stripped.slice(nameStart, p).toLowerCase();
|
|
138
|
+
// Find the closing '>' of the tag. We have to be careful not to
|
|
139
|
+
// mistake a '>' inside a quoted attribute value for the tag end.
|
|
140
|
+
const tagEnd = findTagEnd(stripped, p);
|
|
141
|
+
if (tagEnd < 0) {
|
|
142
|
+
// No closing '>' — the rest of the input is malformed; treat the
|
|
143
|
+
// remainder as text. (Original regex would simply not match and
|
|
144
|
+
// leave the same characters as text via the alternation.)
|
|
145
|
+
const text = decodeHtmlEntities(stripped.slice(i));
|
|
146
|
+
if (text) {
|
|
147
|
+
tokens.push({ type: "text", value: text });
|
|
148
|
+
}
|
|
149
|
+
// `break` exits the loop directly; no need to assign `i = n`
|
|
150
|
+
// first (CodeQL js/useless-assignment-to-local).
|
|
151
|
+
break;
|
|
152
|
+
}
|
|
153
|
+
// Inside [p, tagEnd) lie attributes (and possibly a trailing '/').
|
|
154
|
+
let inner = stripped.slice(p, tagEnd);
|
|
155
|
+
// Detect self-close: trailing '/'. Strip it so it is not parsed as
|
|
156
|
+
// an attribute name.
|
|
157
|
+
let selfClose = false;
|
|
158
|
+
// Trim trailing whitespace, then a single '/'.
|
|
159
|
+
let innerEnd = inner.length;
|
|
160
|
+
while (innerEnd > 0 && isHtmlSpace(inner.charCodeAt(innerEnd - 1))) {
|
|
161
|
+
innerEnd--;
|
|
162
|
+
}
|
|
163
|
+
if (innerEnd > 0 && inner.charCodeAt(innerEnd - 1) === 0x2f) {
|
|
164
|
+
selfClose = true;
|
|
165
|
+
innerEnd--;
|
|
166
|
+
}
|
|
167
|
+
inner = inner.slice(0, innerEnd);
|
|
168
|
+
if (isClose) {
|
|
169
|
+
tokens.push({ type: "close", tag: tagName, attrs: {} });
|
|
170
|
+
i = tagEnd + 1;
|
|
171
|
+
continue;
|
|
172
|
+
}
|
|
173
|
+
const attrs = parseHtmlAttrs(inner);
|
|
174
|
+
const isVoidElement = VOID_ELEMENTS.has(tagName);
|
|
175
|
+
if (selfClose || isVoidElement) {
|
|
176
|
+
tokens.push({ type: "selfclose", tag: tagName, attrs });
|
|
177
|
+
i = tagEnd + 1;
|
|
178
|
+
continue;
|
|
179
|
+
}
|
|
180
|
+
tokens.push({ type: "open", tag: tagName, attrs });
|
|
181
|
+
i = tagEnd + 1;
|
|
182
|
+
// Raw-text elements: their body must not be parsed as markup.
|
|
183
|
+
if (RAW_TEXT_ELEMENTS.has(tagName)) {
|
|
184
|
+
const closeIdx = findRawTextClose(stripped, i, tagName);
|
|
185
|
+
if (closeIdx === null) {
|
|
186
|
+
// No closing tag — discard the rest of the input for this
|
|
187
|
+
// raw-text element to avoid emitting markup as text.
|
|
188
|
+
i = n;
|
|
189
|
+
}
|
|
190
|
+
else {
|
|
191
|
+
const body = stripped.slice(i, closeIdx.bodyEnd);
|
|
192
|
+
if (RAW_TEXT_PRESERVE_BODY.has(tagName)) {
|
|
193
|
+
tokens.push({ type: "text", value: body });
|
|
131
194
|
}
|
|
195
|
+
tokens.push({ type: "close", tag: tagName, attrs: {} });
|
|
196
|
+
i = closeIdx.next;
|
|
132
197
|
}
|
|
133
198
|
}
|
|
134
199
|
}
|
|
135
200
|
return tokens;
|
|
136
201
|
}
|
|
202
|
+
/**
 * Report whether a UTF-16 code unit is an ASCII letter (`A`-`Z` or `a`-`z`).
 *
 * @param {number} c Code unit, typically the result of `charCodeAt`.
 * @returns {boolean} `true` for ASCII letters; `false` otherwise (including `NaN`).
 */
function isAsciiAlpha(c) {
    const isUpper = c >= 0x41 && c <= 0x5a;
    const isLower = c >= 0x61 && c <= 0x7a;
    return isUpper || isLower;
}
|
|
205
|
+
/**
 * Report whether a UTF-16 code unit is an ASCII decimal digit (`0`-`9`).
 *
 * @param {number} c Code unit, typically the result of `charCodeAt`.
 * @returns {boolean} `true` for `0`-`9`; `false` otherwise (including `NaN`).
 */
function isAsciiDigit(c) {
    const ZERO = 0x30;
    const NINE = 0x39;
    return ZERO <= c && c <= NINE;
}
|
|
208
|
+
/**
 * Report whether a UTF-16 code unit is one of the five characters this
 * parser treats as whitespace: space, tab, line feed, carriage return
 * or form feed.
 *
 * @param {number} c Code unit, typically the result of `charCodeAt`.
 * @returns {boolean} `true` for the five whitespace code units, else `false`.
 */
function isHtmlSpace(c) {
    switch (c) {
        case 0x20: // space
        case 0x09: // tab
        case 0x0a: // line feed
        case 0x0d: // carriage return
        case 0x0c: // form feed
            return true;
        default:
            return false;
    }
}
|
|
211
|
+
/**
 * Locate the end of the text run that begins at `from`.
 *
 * The run ends at the first '<' for which `isTagStart` reports a tag
 * start; every other '<' (for example the one in `1 < 2`) stays inside
 * the run.
 *
 * @param {string} s Text being tokenised.
 * @param {number} from Index at which the text run starts.
 * @returns {number} Index of the next tag-introducing '<', or `s.length`
 *   when there is none.
 */
function scanTextEnd(s, from) {
    const len = s.length;
    if (!(from < len)) {
        return len;
    }
    // Hop from one '<' to the next; only a genuine tag start ends the run.
    for (let lt = s.indexOf("<", from); lt !== -1; lt = s.indexOf("<", lt + 1)) {
        if (isTagStart(s, lt)) {
            return lt;
        }
    }
    return len;
}
|
|
232
|
+
/**
 * Decide whether the character at `pos` opens a tag: a '<' followed by
 * an ASCII letter (open tag), or `</` followed by an ASCII letter
 * (close tag). Any other '<' is treated as a literal character.
 *
 * @param {string} s Text being tokenised.
 * @param {number} pos Index to test.
 * @returns {boolean} `true` when a tag starts at `pos`.
 */
function isTagStart(s, pos) {
    if (s.charCodeAt(pos) !== 0x3c /* '<' */) {
        return false;
    }
    // For a close tag the name starts one character later, after the '/'.
    const follower = s.charCodeAt(pos + 1);
    const nameAt = follower === 0x2f /* '/' */ ? pos + 2 : pos + 1;
    return isAsciiAlpha(s.charCodeAt(nameAt));
}
|
|
250
|
+
/**
|
|
251
|
+
* Find the index of the '>' that closes the tag opened just before
|
|
252
|
+
* `from`. Honours quoted attribute values so that `<a href="x>y">`
|
|
253
|
+
* does not stop at the '>' inside quotes.
|
|
254
|
+
*
|
|
255
|
+
* Returns -1 if no closing '>' is found before EOF.
|
|
256
|
+
*/
|
|
257
|
+
function findTagEnd(s, from) {
|
|
258
|
+
const n = s.length;
|
|
259
|
+
let i = from;
|
|
260
|
+
while (i < n) {
|
|
261
|
+
const c = s.charCodeAt(i);
|
|
262
|
+
if (c === 0x22 /* '"' */ || c === 0x27 /* "'" */) {
|
|
263
|
+
const close = s.indexOf(c === 0x22 ? '"' : "'", i + 1);
|
|
264
|
+
if (close < 0) {
|
|
265
|
+
return -1;
|
|
266
|
+
}
|
|
267
|
+
i = close + 1;
|
|
268
|
+
continue;
|
|
269
|
+
}
|
|
270
|
+
if (c === 0x3e /* '>' */) {
|
|
271
|
+
return i;
|
|
272
|
+
}
|
|
273
|
+
i++;
|
|
274
|
+
}
|
|
275
|
+
return -1;
|
|
276
|
+
}
|
|
277
|
+
/**
 * Search for the close tag of a raw-text element (e.g. `</script>`),
 * starting at `from`.
 *
 * The tag name is compared with ASCII-only case folding against
 * `tagName`, and whitespace is allowed between the name and the final
 * '>'. The search is a plain forward scan; no `RegExp` is built from the
 * tag name.
 *
 * @param {string} s Text being tokenised.
 * @param {number} from Index at which the raw-text body starts.
 * @param {string} tagName Element name to match; compared as given, so
 *   it is expected to be lower-case already.
 * @returns {{bodyEnd: number, next: number} | null} `bodyEnd` is the
 *   index of the `</` that starts the close tag and `next` is the index
 *   just past its '>'; `null` when no close tag is found.
 */
function findRawTextClose(s, from, tagName) {
    const total = s.length;
    const nameLen = tagName.length;
    let cursor = from;
    while (cursor < total) {
        const open = s.indexOf("</", cursor);
        if (open === -1) {
            return null;
        }
        const nameAt = open + 2;
        // Restart just after this `</` whenever the candidate is rejected.
        cursor = nameAt;
        // Count how many leading characters agree with `tagName`.
        let matched = 0;
        while (matched < nameLen) {
            const unit = s.charCodeAt(nameAt + matched);
            const folded = unit >= 0x41 && unit <= 0x5a ? unit + 0x20 : unit;
            if (folded !== tagName.charCodeAt(matched)) {
                break;
            }
            matched++;
        }
        if (matched < nameLen) {
            continue;
        }
        // Optional whitespace, then the mandatory '>'.
        let gt = nameAt + nameLen;
        while (gt < total && isHtmlSpace(s.charCodeAt(gt))) {
            gt++;
        }
        if (gt < total && s.charCodeAt(gt) === 0x3e /* '>' */) {
            return { bodyEnd: open, next: gt + 1 };
        }
    }
    return null;
}
|
|
321
|
+
/**
 * Remove HTML comments (`<!-- … -->`), CDATA sections
 * (`<![CDATA[ … ]]>`), doctype declarations (`<!doctype …>`, matched
 * case-insensitively) and processing instructions (`<? … ?>`) from
 * `input` in one left-to-right pass.
 *
 * Each construct is cut from its opener through the FIRST matching
 * terminator. An opener with no terminator is malformed input: its '<'
 * is emitted literally and scanning resumes on the next character, so a
 * stray `<!--` does not swallow the rest of the document.
 *
 * A single scan is used instead of chained `.replace()` calls with lazy
 * regular expressions, which static analysis flags both as incomplete
 * multi-character sanitisation and as polynomial-time on adversarial
 * input.
 *
 * The scan is linear in `input.length`: plain runs are copied with one
 * slice per run, and a terminator search that has failed once is never
 * repeated (see the flags below). Without that memoisation, input such
 * as `"<!--".repeat(k)` costs one scan to the end of the string per
 * opener, i.e. quadratic time.
 *
 * @param {string} input Raw HTML text.
 * @returns {string} `input` with every terminated construct removed.
 */
function stripSgmlNoise(input) {
    const n = input.length;
    // A terminator that is absent from some index onwards is also absent
    // from every later index, and the search start only ever moves
    // forward. One failed search per construct kind is therefore final.
    let commentCanClose = true;
    let cdataCanClose = true;
    let doctypeCanClose = true;
    let piCanClose = true;
    let out = "";
    let i = 0;
    while (i < n) {
        const lt = input.indexOf("<", i);
        if (lt < 0) {
            out += input.slice(i);
            break;
        }
        // Copy the plain run in front of '<' in one piece.
        out += input.slice(i, lt);
        // Index to resume from when a construct is stripped; -1 = keep '<'.
        let resume = -1;
        if (input.startsWith("<!--", lt)) {
            if (commentCanClose) {
                const end = input.indexOf("-->", lt + 4);
                commentCanClose = end >= 0;
                if (commentCanClose) {
                    resume = end + 3;
                }
            }
        }
        else if (input.startsWith("<![CDATA[", lt)) {
            if (cdataCanClose) {
                const end = input.indexOf("]]>", lt + 9);
                cdataCanClose = end >= 0;
                if (cdataCanClose) {
                    resume = end + 3;
                }
            }
        }
        else if (input.charCodeAt(lt + 1) === 0x21 /* '!' */ &&
            input.slice(lt + 2, lt + 9).toLowerCase() === "doctype") {
            if (doctypeCanClose) {
                const end = input.indexOf(">", lt + 9);
                doctypeCanClose = end >= 0;
                if (doctypeCanClose) {
                    resume = end + 1;
                }
            }
        }
        else if (input.charCodeAt(lt + 1) === 0x3f /* '?' */) {
            if (piCanClose) {
                const end = input.indexOf("?>", lt + 2);
                piCanClose = end >= 0;
                if (piCanClose) {
                    resume = end + 2;
                }
            }
        }
        if (resume < 0) {
            // Not a (terminated) noise construct — emit the '<' literally.
            out += "<";
            i = lt + 1;
        }
        else {
            i = resume;
        }
    }
    return out;
}
|
|
137
406
|
/**
|
|
138
407
|
* HTML elements whose body is not parsed as markup. Their content is either
|
|
139
408
|
* preserved (style) for downstream processing or discarded entirely.
|
|
@@ -204,26 +473,110 @@ function extractStyleRules(tokens) {
|
|
|
204
473
|
}
|
|
205
474
|
return result;
|
|
206
475
|
}
|
|
476
|
+
/**
 * Parse the attribute portion of a start tag, e.g.
 * `class="x" id='y' disabled href=foo`, into a plain object.
 *
 * The string is walked once from left to right; no regular expression
 * is involved, so hostile start-tag content cannot cause backtracking.
 *
 * Rules:
 *   - Names match `[A-Za-z_][A-Za-z0-9_-]*` and are lower-cased.
 *   - A character that cannot start a name is skipped.
 *   - Values may be double-quoted, single-quoted or unquoted (a run of
 *     non-whitespace characters).
 *   - A name with no `=` (or with `=` and nothing after it) maps to `""`.
 *   - An unterminated quoted value takes the rest of the string.
 *   - When a name repeats, the last occurrence wins.
 *
 * @param {string} str Text between the tag name and the closing '>'.
 * @returns {Object<string, string>} Attribute name → value.
 */
function parseHtmlAttrs(str) {
    const attrs = {};
    const len = str.length;
    const skipSpaces = (at) => {
        let cursor = at;
        while (cursor < len && isHtmlSpace(str.charCodeAt(cursor))) {
            cursor += 1;
        }
        return cursor;
    };
    const isNameChar = (code) => isAsciiAlpha(code) || isAsciiDigit(code) || code === 0x5f /* '_' */ || code === 0x2d /* '-' */;
    let pos = 0;
    for (;;) {
        pos = skipSpaces(pos);
        if (pos >= len) {
            break;
        }
        const head = str.charCodeAt(pos);
        if (head !== 0x5f /* '_' */ && !isAsciiAlpha(head)) {
            // Cannot start a name here; drop the character and resync.
            pos += 1;
            continue;
        }
        let nameEnd = pos + 1;
        while (nameEnd < len && isNameChar(str.charCodeAt(nameEnd))) {
            nameEnd += 1;
        }
        const key = str.slice(pos, nameEnd).toLowerCase();
        pos = nameEnd;
        // Look ahead for `=` without committing `pos` yet.
        const eqAt = skipSpaces(nameEnd);
        if (eqAt >= len || str.charCodeAt(eqAt) !== 0x3d /* '=' */) {
            attrs[key] = "";
            continue;
        }
        const valueAt = skipSpaces(eqAt + 1);
        if (valueAt >= len) {
            attrs[key] = "";
            pos = valueAt;
            continue;
        }
        const quote = str.charAt(valueAt);
        if (quote === '"' || quote === "'") {
            const closing = str.indexOf(quote, valueAt + 1);
            if (closing < 0) {
                // No closing quote: the value is everything that remains.
                attrs[key] = str.slice(valueAt + 1);
                break;
            }
            attrs[key] = str.slice(valueAt + 1, closing);
            pos = closing + 1;
            continue;
        }
        let valueEnd = valueAt;
        while (valueEnd < len && !isHtmlSpace(str.charCodeAt(valueEnd))) {
            valueEnd += 1;
        }
        attrs[key] = str.slice(valueAt, valueEnd);
        pos = valueEnd;
    }
    return attrs;
}
|
|
216
560
|
/**
 * Decode numeric (`&#65;`, `&#x41;`) and named (`&amp;`) character
 * references in `text`.
 *
 * Numeric references are converted by `safeFromCodePoint`; named ones
 * are looked up in `HTML_ENTITIES`. A named reference that is not in
 * the table is returned unchanged.
 *
 * @param {string} text Text that may contain character references.
 * @returns {string} `text` with every recognised reference replaced.
 */
function decodeHtmlEntities(text) {
    // Decode every entity in a single pass. Chaining `.replace()` calls
    // (first `&amp;` → `&`, then `&lt;` → `<`, …) re-runs the later
    // replacements over the output of the earlier ones, so input like
    // `&amp;lt;` would round-trip to `<` instead of the intended `&lt;`.
    // CodeQL flags this as "Double escaping or unescaping". A single
    // alternation guarantees each source position is decoded at most once.
    return text.replace(/&(?:#(\d+)|#[xX]([a-fA-F0-9]+)|([a-zA-Z][a-zA-Z0-9]*));/g, (match, dec, hex, name) => {
        if (dec !== undefined) {
            return safeFromCodePoint(parseInt(dec, 10));
        }
        if (hex !== undefined) {
            return safeFromCodePoint(parseInt(hex, 16));
        }
        // Only the table's OWN keys are entity names. A bare
        // `HTML_ENTITIES[name]` also resolves inherited members, so
        // `&constructor;` or `&toString;` would return a function that
        // `replace` stringifies into the output.
        if (name !== undefined && Object.prototype.hasOwnProperty.call(HTML_ENTITIES, name)) {
            return HTML_ENTITIES[name] ?? match;
        }
        return match;
    });
}
|
|
228
581
|
/**
|
|
229
582
|
* Convert a numeric character reference to a string. Uses fromCodePoint so
|
|
@@ -243,6 +596,16 @@ function safeFromCodePoint(cp) {
|
|
|
243
596
|
}
|
|
244
597
|
/** Common HTML named entities mapped to their Unicode characters. */
|
|
245
598
|
const HTML_ENTITIES = {
|
|
599
|
+
// Core XML/HTML entities — these used to be handled as standalone
|
|
600
|
+
// chained `.replace()` calls in `decodeHtmlEntities`. They must live
|
|
601
|
+
// in this table so the single-pass decoder can resolve them without
|
|
602
|
+
// re-running over already-decoded output (CodeQL "double unescaping").
|
|
603
|
+
amp: "&",
|
|
604
|
+
lt: "<",
|
|
605
|
+
gt: ">",
|
|
606
|
+
quot: '"',
|
|
607
|
+
apos: "'",
|
|
608
|
+
nbsp: "\u00A0",
|
|
246
609
|
// Punctuation & Typography
|
|
247
610
|
mdash: "\u2014",
|
|
248
611
|
ndash: "\u2013",
|
|
@@ -188,8 +188,13 @@ function renderTable(state, table) {
|
|
|
188
188
|
cellParts.push(renderInlineChildren(state, block.children).trim());
|
|
189
189
|
}
|
|
190
190
|
}
|
|
191
|
-
// Escape pipe characters to prevent table structure corruption
|
|
192
|
-
|
|
191
|
+
// Escape pipe characters to prevent table structure corruption.
|
|
192
|
+
// Backslashes must be escaped *first*: if `|` were escaped first, the
|
|
193
|
+
// backslash of every freshly inserted `\|` would itself be doubled
|
|
194
|
+
// by the subsequent `\` → `\\` pass, turning it into `\\|` and
|
|
195
|
+
// breaking GFM tables. CodeQL flags the single-pass form as
|
|
196
|
+
// "Incomplete string escaping or encoding".
|
|
197
|
+
rowTexts.push(cellParts.join(" ").replace(/\\/g, "\\\\").replace(/\|/g, "\\|"));
|
|
193
198
|
}
|
|
194
199
|
grid.push(rowTexts);
|
|
195
200
|
}
|
|
@@ -470,7 +475,10 @@ function isMonospaceFont(font) {
|
|
|
470
475
|
if (typeof font === "string") {
|
|
471
476
|
return isMonospaceFontName(font);
|
|
472
477
|
}
|
|
473
|
-
|
|
478
|
+
// `!font` above already discarded `null`; `font !== null` here was
|
|
479
|
+
// therefore always true and CodeQL flagged it as a comparison
|
|
480
|
+
// between inconvertible types.
|
|
481
|
+
if (typeof font === "object") {
|
|
474
482
|
const f = font;
|
|
475
483
|
return (isMonospaceFontName(f.ascii) ||
|
|
476
484
|
isMonospaceFontName(f.hAnsi));
|
|
@@ -1254,7 +1254,10 @@ function resolveColorHex(color) {
|
|
|
1254
1254
|
if (typeof color === "string") {
|
|
1255
1255
|
return color;
|
|
1256
1256
|
}
|
|
1257
|
-
|
|
1257
|
+
// The `!color` check above already discarded `null`; an additional
|
|
1258
|
+
// `color !== null` test was always true and CodeQL flagged it as a
|
|
1259
|
+
// comparison between inconvertible types.
|
|
1260
|
+
if (typeof color === "object" && "value" in color) {
|
|
1258
1261
|
return color.value;
|
|
1259
1262
|
}
|
|
1260
1263
|
return undefined;
|