@cj-tech-master/excelts 9.5.5 → 9.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/modules/excel/worksheet.d.ts +11 -0
- package/dist/browser/modules/excel/worksheet.js +13 -0
- package/dist/browser/modules/formula/integration/apply-writeback-plan.js +17 -3
- package/dist/browser/modules/formula/integration/workbook-adapter.js +20 -1
- package/dist/browser/modules/formula/integration/workbook-snapshot.d.ts +12 -0
- package/dist/browser/modules/formula/materialize/build-writeback-plan.js +47 -0
- package/dist/browser/modules/formula/materialize/types.d.ts +19 -3
- package/dist/browser/modules/formula/materialize/types.js +13 -3
- package/dist/browser/modules/pdf/builder/document-builder.js +2 -2
- package/dist/browser/modules/pdf/font/system-fonts.d.ts +24 -4
- package/dist/browser/modules/pdf/font/system-fonts.js +76 -32
- package/dist/browser/modules/pdf/render/pdf-exporter.js +6 -3
- package/dist/browser/modules/word/advanced/field-engine.js +151 -23
- package/dist/browser/modules/word/advanced/math-convert.js +2 -1
- package/dist/browser/modules/word/advanced/style-map.js +44 -6
- package/dist/browser/modules/word/convert/html/html-import.js +434 -71
- package/dist/browser/modules/word/convert/markdown/markdown-renderer.js +11 -3
- package/dist/browser/modules/word/layout/layout-full.js +4 -1
- package/dist/browser/modules/word/security/digital-signatures.js +160 -33
- package/dist/browser/modules/word/security/encryption.js +109 -9
- package/dist/cjs/modules/excel/worksheet.js +13 -0
- package/dist/cjs/modules/formula/integration/apply-writeback-plan.js +17 -3
- package/dist/cjs/modules/formula/integration/workbook-adapter.js +20 -1
- package/dist/cjs/modules/formula/materialize/build-writeback-plan.js +47 -0
- package/dist/cjs/modules/formula/materialize/types.js +13 -3
- package/dist/cjs/modules/pdf/builder/document-builder.js +1 -1
- package/dist/cjs/modules/pdf/font/system-fonts.js +77 -32
- package/dist/cjs/modules/pdf/render/pdf-exporter.js +5 -2
- package/dist/cjs/modules/word/advanced/field-engine.js +151 -23
- package/dist/cjs/modules/word/advanced/math-convert.js +2 -1
- package/dist/cjs/modules/word/advanced/style-map.js +44 -6
- package/dist/cjs/modules/word/convert/html/html-import.js +434 -71
- package/dist/cjs/modules/word/convert/markdown/markdown-renderer.js +11 -3
- package/dist/cjs/modules/word/layout/layout-full.js +4 -1
- package/dist/cjs/modules/word/security/digital-signatures.js +160 -33
- package/dist/cjs/modules/word/security/encryption.js +109 -9
- package/dist/esm/modules/excel/worksheet.js +13 -0
- package/dist/esm/modules/formula/integration/apply-writeback-plan.js +17 -3
- package/dist/esm/modules/formula/integration/workbook-adapter.js +20 -1
- package/dist/esm/modules/formula/materialize/build-writeback-plan.js +47 -0
- package/dist/esm/modules/formula/materialize/types.js +13 -3
- package/dist/esm/modules/pdf/builder/document-builder.js +2 -2
- package/dist/esm/modules/pdf/font/system-fonts.js +76 -32
- package/dist/esm/modules/pdf/render/pdf-exporter.js +6 -3
- package/dist/esm/modules/word/advanced/field-engine.js +151 -23
- package/dist/esm/modules/word/advanced/math-convert.js +2 -1
- package/dist/esm/modules/word/advanced/style-map.js +44 -6
- package/dist/esm/modules/word/convert/html/html-import.js +434 -71
- package/dist/esm/modules/word/convert/markdown/markdown-renderer.js +11 -3
- package/dist/esm/modules/word/layout/layout-full.js +4 -1
- package/dist/esm/modules/word/security/digital-signatures.js +160 -33
- package/dist/esm/modules/word/security/encryption.js +109 -9
- package/dist/iife/excelts.iife.js +40 -26
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +3 -3
- package/dist/types/modules/excel/worksheet.d.ts +11 -0
- package/dist/types/modules/formula/integration/workbook-snapshot.d.ts +12 -0
- package/dist/types/modules/formula/materialize/types.d.ts +19 -3
- package/dist/types/modules/pdf/font/system-fonts.d.ts +24 -4
- package/package.json +1 -1
|
@@ -70,73 +70,342 @@ function tokenize(html) {
|
|
|
70
70
|
// instructions before tokenising — none of them should appear as text
|
|
71
71
|
// in the document body. The previous regex treated `<!doctype html>`
|
|
72
72
|
// as a text node containing `"!doctype html>"`.
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
//
|
|
79
|
-
//
|
|
80
|
-
//
|
|
81
|
-
//
|
|
82
|
-
const
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
73
|
+
//
|
|
74
|
+
// We use a single linear scan rather than chained `.replace()` calls so
|
|
75
|
+
// we are immune to two CodeQL findings:
|
|
76
|
+
// - Incomplete multi-character sanitization: chained replaces let
|
|
77
|
+
// payloads such as `<!--<!--x-->-->` leak through (each pass only
|
|
78
|
+
// removes one layer, leaving `-->` behind).
|
|
79
|
+
// - Polynomial regular expression on uncontrolled data: lazy
|
|
80
|
+
// quantifiers like `<!--[\s\S]*?-->` exhibit catastrophic
|
|
81
|
+
// backtracking on adversarial input.
|
|
82
|
+
const stripped = stripSgmlNoise(html);
|
|
83
|
+
// The tokenizer is implemented as a linear index scan rather than a
|
|
84
|
+
// global regex (`/<\/?…(?:\s+[^>]*?)?\/?\s*>|((?:[^<]|…)+)/g`). The
|
|
85
|
+
// previous regex form combined an optional lazy attribute span with
|
|
86
|
+
// an optional `\/?` and optional trailing whitespace, which CodeQL
|
|
87
|
+
// flagged as polynomial-redos: an adversarial payload such as
|
|
88
|
+
// `<a` followed by many spaces but no closing `>` triggered
|
|
89
|
+
// catastrophic backtracking.
|
|
90
|
+
//
|
|
91
|
+
// The scan below is strictly O(n):
|
|
92
|
+
// - At every position we either advance one character or jump
|
|
93
|
+
// forward to the next `<` / `>` via a single `indexOf`.
|
|
94
|
+
// - Attribute parsing is delegated to `parseHtmlAttrs`, which is
|
|
95
|
+
// itself a linear scanner.
|
|
96
|
+
const n = stripped.length;
|
|
97
|
+
let i = 0;
|
|
98
|
+
while (i < n) {
|
|
99
|
+
// Scan a text run: everything up to the next position that begins
|
|
100
|
+
// a tag (`<` followed by a letter, or `</` followed by a letter).
|
|
101
|
+
// Bare `<` characters and unfinished tag-like fragments are kept
|
|
102
|
+
// inside the text run so that input such as `1 < 2`, `a<b<c`,
|
|
103
|
+
// `<<<<` or `<unfinished` (with no closing `>` anywhere) is not
|
|
104
|
+
// shattered into a stream of single-character runs.
|
|
105
|
+
if (stripped.charCodeAt(i) !== 0x3c /* '<' */ || !isTagStart(stripped, i)) {
|
|
106
|
+
const textEnd = scanTextEnd(stripped, i);
|
|
107
|
+
const raw = stripped.slice(i, textEnd);
|
|
108
|
+
const text = decodeHtmlEntities(raw);
|
|
90
109
|
if (text) {
|
|
91
110
|
tokens.push({ type: "text", value: text });
|
|
92
111
|
}
|
|
112
|
+
i = textEnd;
|
|
113
|
+
if (i >= n) {
|
|
114
|
+
break;
|
|
115
|
+
}
|
|
116
|
+
// Fall through: position `i` is now at a real tag start.
|
|
117
|
+
}
|
|
118
|
+
// We are at '<' that introduces a tag (guaranteed by the
|
|
119
|
+
// `isTagStart` check above).
|
|
120
|
+
const next = stripped.charCodeAt(i + 1);
|
|
121
|
+
const isClose = next === 0x2f; /* '/' */
|
|
122
|
+
const nameStart = isClose ? i + 2 : i + 1;
|
|
123
|
+
// Defensive: the loop guard above should already ensure this, but
|
|
124
|
+
// keep the check so a future refactor cannot silently turn a bare
|
|
125
|
+
// `<` into an attempted tag parse.
|
|
126
|
+
if (!isAsciiAlpha(stripped.charCodeAt(nameStart))) {
|
|
127
|
+
tokens.push({ type: "text", value: "<" });
|
|
128
|
+
i++;
|
|
129
|
+
continue;
|
|
93
130
|
}
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
131
|
+
// Read the tag name: [A-Za-z][A-Za-z0-9]*.
|
|
132
|
+
let p = nameStart + 1;
|
|
133
|
+
while (p < n) {
|
|
134
|
+
const c = stripped.charCodeAt(p);
|
|
135
|
+
if (!isAsciiAlpha(c) && !isAsciiDigit(c)) {
|
|
136
|
+
break;
|
|
137
|
+
}
|
|
138
|
+
p++;
|
|
139
|
+
}
|
|
140
|
+
const tagName = stripped.slice(nameStart, p).toLowerCase();
|
|
141
|
+
// Find the closing '>' of the tag. We have to be careful not to
|
|
142
|
+
// mistake a '>' inside a quoted attribute value for the tag end.
|
|
143
|
+
const tagEnd = findTagEnd(stripped, p);
|
|
144
|
+
if (tagEnd < 0) {
|
|
145
|
+
// No closing '>' — the rest of the input is malformed; treat the
|
|
146
|
+
// remainder as text. (Original regex would simply not match and
|
|
147
|
+
// leave the same characters as text via the alternation.)
|
|
148
|
+
const text = decodeHtmlEntities(stripped.slice(i));
|
|
149
|
+
if (text) {
|
|
150
|
+
tokens.push({ type: "text", value: text });
|
|
151
|
+
}
|
|
152
|
+
// `break` exits the loop directly; no need to assign `i = n`
|
|
153
|
+
// first (CodeQL js/useless-assignment-to-local).
|
|
154
|
+
break;
|
|
155
|
+
}
|
|
156
|
+
// Inside [p, tagEnd) lie attributes (and possibly a trailing '/').
|
|
157
|
+
let inner = stripped.slice(p, tagEnd);
|
|
158
|
+
// Detect self-close: trailing '/'. Strip it so it is not parsed as
|
|
159
|
+
// an attribute name.
|
|
160
|
+
let selfClose = false;
|
|
161
|
+
// Trim trailing whitespace, then a single '/'.
|
|
162
|
+
let innerEnd = inner.length;
|
|
163
|
+
while (innerEnd > 0 && isHtmlSpace(inner.charCodeAt(innerEnd - 1))) {
|
|
164
|
+
innerEnd--;
|
|
165
|
+
}
|
|
166
|
+
if (innerEnd > 0 && inner.charCodeAt(innerEnd - 1) === 0x2f) {
|
|
167
|
+
selfClose = true;
|
|
168
|
+
innerEnd--;
|
|
169
|
+
}
|
|
170
|
+
inner = inner.slice(0, innerEnd);
|
|
171
|
+
if (isClose) {
|
|
172
|
+
tokens.push({ type: "close", tag: tagName, attrs: {} });
|
|
173
|
+
i = tagEnd + 1;
|
|
174
|
+
continue;
|
|
175
|
+
}
|
|
176
|
+
const attrs = parseHtmlAttrs(inner);
|
|
177
|
+
const isVoidElement = VOID_ELEMENTS.has(tagName);
|
|
178
|
+
if (selfClose || isVoidElement) {
|
|
179
|
+
tokens.push({ type: "selfclose", tag: tagName, attrs });
|
|
180
|
+
i = tagEnd + 1;
|
|
181
|
+
continue;
|
|
182
|
+
}
|
|
183
|
+
tokens.push({ type: "open", tag: tagName, attrs });
|
|
184
|
+
i = tagEnd + 1;
|
|
185
|
+
// Raw-text elements: their body must not be parsed as markup.
|
|
186
|
+
if (RAW_TEXT_ELEMENTS.has(tagName)) {
|
|
187
|
+
const closeIdx = findRawTextClose(stripped, i, tagName);
|
|
188
|
+
if (closeIdx === null) {
|
|
189
|
+
// No closing tag — discard the rest of the input for this
|
|
190
|
+
// raw-text element to avoid emitting markup as text.
|
|
191
|
+
i = n;
|
|
192
|
+
}
|
|
193
|
+
else {
|
|
194
|
+
const body = stripped.slice(i, closeIdx.bodyEnd);
|
|
195
|
+
if (RAW_TEXT_PRESERVE_BODY.has(tagName)) {
|
|
196
|
+
tokens.push({ type: "text", value: body });
|
|
134
197
|
}
|
|
198
|
+
tokens.push({ type: "close", tag: tagName, attrs: {} });
|
|
199
|
+
i = closeIdx.next;
|
|
135
200
|
}
|
|
136
201
|
}
|
|
137
202
|
}
|
|
138
203
|
return tokens;
|
|
139
204
|
}
|
|
205
|
+
/**
 * Report whether a UTF-16 code unit is an ASCII letter.
 *
 * @param {number} c - Code unit, typically from `String.prototype.charCodeAt`.
 * @returns {boolean} `true` for `A`–`Z` or `a`–`z`; `false` otherwise
 *   (including `NaN`, which `charCodeAt` yields past the end of a string).
 */
function isAsciiAlpha(c) {
    const isUpper = c >= 0x41 && c <= 0x5a;
    const isLower = c >= 0x61 && c <= 0x7a;
    return isUpper || isLower;
}
|
|
208
|
+
/**
 * Report whether a UTF-16 code unit is an ASCII decimal digit.
 *
 * @param {number} c - Code unit, typically from `String.prototype.charCodeAt`.
 * @returns {boolean} `true` for `0`–`9`; `false` otherwise (including `NaN`).
 */
function isAsciiDigit(c) {
    if (c < 0x30) {
        return false;
    }
    return c <= 0x39;
}
|
|
211
|
+
/**
 * Report whether a UTF-16 code unit is one of the five whitespace characters
 * this scanner recognises: space, tab, line feed, form feed, carriage return.
 *
 * @param {number} c - Code unit, typically from `String.prototype.charCodeAt`.
 * @returns {boolean} `true` for U+0020, U+0009, U+000A, U+000C or U+000D.
 */
function isHtmlSpace(c) {
    switch (c) {
        case 0x20: // space
        case 0x09: // tab
        case 0x0a: // line feed
        case 0x0c: // form feed
        case 0x0d: // carriage return
            return true;
        default:
            return false;
    }
}
|
|
214
|
+
/**
 * Locate the end of the text run that begins at `from`.
 *
 * The run stops at the first `<` for which `isTagStart` returns true. Any
 * other `<` (as in `1 < 2`) stays inside the run.
 *
 * @param {string} s - Input being tokenised.
 * @param {number} from - Index at which the text run starts.
 * @returns {number} Index of the next tag-introducing `<`, or `s.length`
 *   when there is none.
 */
function scanTextEnd(s, from) {
    const length = s.length;
    for (let cursor = from; cursor < length;) {
        const candidate = s.indexOf("<", cursor);
        if (candidate === -1) {
            break;
        }
        if (isTagStart(s, candidate)) {
            return candidate;
        }
        // Literal '<' — it belongs to the text; resume just after it.
        cursor = candidate + 1;
    }
    return length;
}
|
|
235
|
+
/**
 * Decide whether the character at `pos` opens a tag rather than being a
 * literal `<`.
 *
 * A tag start is `<` followed by an ASCII letter (open tag), or `</`
 * followed by an ASCII letter (close tag).
 *
 * @param {string} s - Input being tokenised.
 * @param {number} pos - Index to test.
 * @returns {boolean} `true` when a tag begins at `pos`.
 */
function isTagStart(s, pos) {
    if (s.charCodeAt(pos) !== 0x3c /* '<' */) {
        return false;
    }
    const following = s.charCodeAt(pos + 1);
    // For a close tag the name starts one character later, after the '/'.
    const nameChar = following === 0x2f /* '/' */ ? s.charCodeAt(pos + 2) : following;
    return isAsciiAlpha(nameChar);
}
|
|
253
|
+
/**
 * Locate the `>` that terminates a tag, scanning from `from`.
 *
 * Quoted spans (`"…"` or `'…'`) are skipped as a unit, so a `>` inside a
 * quoted attribute value such as `<a href="x>y">` is not taken as the end
 * of the tag.
 *
 * @param {string} s - Input being tokenised.
 * @param {number} from - Index at which to start scanning (just past the
 *   tag name in the caller visible here).
 * @returns {number} Index of the closing `>`, or `-1` when the input ends
 *   first or a quote is never closed.
 */
function findTagEnd(s, from) {
    const length = s.length;
    let cursor = from;
    while (cursor < length) {
        const ch = s[cursor];
        if (ch === ">") {
            return cursor;
        }
        if (ch === '"' || ch === "'") {
            // Jump straight to the matching quote; everything between is opaque.
            const closing = s.indexOf(ch, cursor + 1);
            if (closing === -1) {
                return -1;
            }
            cursor = closing + 1;
        }
        else {
            cursor += 1;
        }
    }
    return -1;
}
|
|
280
|
+
/**
 * Locate the end tag that closes a raw-text element (for example
 * `</script>`), searching from `from`.
 *
 * A candidate is `</`, then `tagName` compared with ASCII case folding of
 * the input, then optional whitespace, then `>`. Anything else (such as
 * `</scriptx>`) is rejected and the search resumes just past that `</`.
 * Plain index scanning is used instead of a dynamically built `RegExp`.
 *
 * @param {string} s - Input being tokenised.
 * @param {number} from - Index at which the raw-text body starts.
 * @param {string} tagName - Element name; compared against the ASCII-lowercased
 *   input, so it is expected to be lower-case already.
 * @returns {{bodyEnd: number, next: number} | null} `bodyEnd` is the index
 *   of the `</` that starts the close tag and `next` is the index just past
 *   its `>`; `null` when no close tag exists.
 */
function findRawTextClose(s, from, tagName) {
    const total = s.length;
    const nameLength = tagName.length;
    let searchFrom = from;
    while (searchFrom < total) {
        const open = s.indexOf("</", searchFrom);
        if (open === -1) {
            return null;
        }
        const nameAt = open + 2;
        // Whatever happens below, a rejected candidate resumes just past "</".
        searchFrom = nameAt;
        let nameMatches = true;
        for (let k = 0; nameMatches && k < nameLength; k += 1) {
            const code = s.charCodeAt(nameAt + k);
            // Fold only ASCII A–Z; every other code unit is compared verbatim.
            const folded = code >= 0x41 && code <= 0x5a ? code + 0x20 : code;
            nameMatches = folded === tagName.charCodeAt(k);
        }
        if (!nameMatches) {
            continue;
        }
        let gt = nameAt + nameLength;
        while (gt < total && isHtmlSpace(s.charCodeAt(gt))) {
            gt += 1;
        }
        // Past the end `charCodeAt` yields NaN, which never equals '>'.
        if (s.charCodeAt(gt) === 0x3e /* '>' */) {
            return { bodyEnd: open, next: gt + 1 };
        }
    }
    return null;
}
|
|
324
|
+
/**
 * Remove HTML comments, CDATA sections, doctype declarations and processing
 * instructions from `input`, leaving every other character untouched.
 *
 * Recognised constructs and their terminators:
 *   - `<!-- … -->`
 *   - `<![CDATA[ … ]]>`
 *   - `<!doctype … >` (keyword matched case-insensitively)
 *   - `<? … ?>`
 *
 * Each construct ends at the FIRST occurrence of its terminator; constructs
 * do not nest. For `<!--<!--x-->-->` the comment ends at the first `-->`
 * and the trailing `-->` is kept as ordinary text.
 *
 * An opener whose terminator never appears is not a construct: its `<` is
 * emitted literally and scanning resumes at the next character, so a stray
 * `<!--` does not swallow the rest of the document.
 *
 * The scan is linear in the input length:
 *   - text between `<` characters is copied in whole runs, and
 *   - a failed terminator search is remembered. Openers are visited left to
 *     right, so once a search from one position finds nothing, a search
 *     from any later position cannot succeed either. Without this, input
 *     such as `"<!--".repeat(k)` would re-scan to the end of the string for
 *     every opener (quadratic time).
 *
 * @param {string} input - Raw HTML text.
 * @returns {string} `input` with the recognised constructs removed.
 */
function stripSgmlNoise(input) {
    const n = input.length;
    // Terminators already proven absent from the remainder of the input.
    const exhausted = new Set();
    let out = "";
    let i = 0;
    while (i < n) {
        const lt = input.indexOf("<", i);
        if (lt < 0) {
            out += input.slice(i);
            break;
        }
        // Copy the plain-text run preceding this '<' in one step.
        out += input.slice(i, lt);
        // Classify the construct (if any) that starts at `lt`.
        let terminator = null;
        let searchFrom = 0;
        if (input.startsWith("<!--", lt)) {
            terminator = "-->";
            searchFrom = lt + 4;
        }
        else if (input.startsWith("<![CDATA[", lt)) {
            terminator = "]]>";
            searchFrom = lt + 9;
        }
        else if (input.charCodeAt(lt + 1) === 0x21 /* '!' */ &&
            input.slice(lt + 2, lt + 9).toLowerCase() === "doctype") {
            terminator = ">";
            searchFrom = lt + 9;
        }
        else if (input.charCodeAt(lt + 1) === 0x3f /* '?' */) {
            terminator = "?>";
            searchFrom = lt + 2;
        }
        let end = -1;
        if (terminator !== null && !exhausted.has(terminator)) {
            end = input.indexOf(terminator, searchFrom);
            if (end < 0) {
                exhausted.add(terminator);
            }
        }
        if (end < 0) {
            // Ordinary '<', or an unterminated opener: keep the '<' as text.
            out += "<";
            i = lt + 1;
        }
        else {
            // Drop the whole construct, terminator included.
            i = end + terminator.length;
        }
    }
    return out;
}
|
|
140
409
|
/**
|
|
141
410
|
* HTML elements whose body is not parsed as markup. Their content is either
|
|
142
411
|
* preserved (style) for downstream processing or discarded entirely.
|
|
@@ -207,26 +476,110 @@ function extractStyleRules(tokens) {
|
|
|
207
476
|
}
|
|
208
477
|
return result;
|
|
209
478
|
}
|
|
479
|
+
/**
 * Parse the attribute portion of a start tag, e.g.
 * `class="x" id='y' disabled href=foo`, into a name → value map.
 *
 * Written as an index scan instead of one global regular expression.
 *
 * Rules implemented below:
 *   - A name starts with an ASCII letter or `_` and continues with ASCII
 *     letters, digits, `_` or `-`. Names are lower-cased.
 *   - A character that cannot start a name is skipped.
 *   - `name` with no `=` (or `name=` with nothing after it) maps to `""`.
 *   - A value may be double-quoted, single-quoted, or an unquoted run of
 *     non-whitespace characters.
 *   - An unterminated quoted value takes the remainder of `str` and ends
 *     the parse.
 *   - When a name repeats, the later value overwrites the earlier one.
 *
 * @param {string} str - Text between the tag name and the closing `>`.
 * @returns {Object<string, string>} Attribute values keyed by lower-cased name.
 */
function parseHtmlAttrs(str) {
    const attrs = {};
    const end = str.length;
    const skipSpaces = (at) => {
        let k = at;
        while (k < end && isHtmlSpace(str.charCodeAt(k))) {
            k += 1;
        }
        return k;
    };
    const isNameStart = (code) => code === 0x5f /* '_' */ || isAsciiAlpha(code);
    const isNameChar = (code) => isNameStart(code) || isAsciiDigit(code) || code === 0x2d /* '-' */;
    let pos = 0;
    for (;;) {
        pos = skipSpaces(pos);
        if (pos >= end) {
            break;
        }
        if (!isNameStart(str.charCodeAt(pos))) {
            // Stray character — drop it and try again from the next one.
            pos += 1;
            continue;
        }
        const nameFrom = pos;
        do {
            pos += 1;
        } while (pos < end && isNameChar(str.charCodeAt(pos)));
        const key = str.slice(nameFrom, pos).toLowerCase();
        // Look ahead for `=`; `pos` itself only moves once a value is consumed.
        let valueAt = skipSpaces(pos);
        if (str[valueAt] !== "=") {
            // Boolean attribute (also covers running off the end of `str`).
            attrs[key] = "";
            continue;
        }
        valueAt = skipSpaces(valueAt + 1);
        if (valueAt >= end) {
            // `name=` with nothing after it.
            attrs[key] = "";
            pos = valueAt;
            continue;
        }
        const lead = str[valueAt];
        if (lead === '"' || lead === "'") {
            const closing = str.indexOf(lead, valueAt + 1);
            if (closing === -1) {
                // Unterminated quote: the rest of the string is the value.
                attrs[key] = str.slice(valueAt + 1);
                break;
            }
            attrs[key] = str.slice(valueAt + 1, closing);
            pos = closing + 1;
            continue;
        }
        // Unquoted value: everything up to the next whitespace character.
        let stop = valueAt;
        while (stop < end && !isHtmlSpace(str.charCodeAt(stop))) {
            stop += 1;
        }
        attrs[key] = str.slice(valueAt, stop);
        pos = stop;
    }
    return attrs;
}
|
|
219
563
|
/**
 * Decode HTML character references in `text`.
 *
 * Handles decimal (`&#65;`), hexadecimal (`&#x41;`) and named (`&amp;`)
 * references. Numeric references are converted by `safeFromCodePoint`;
 * named references are looked up in `HTML_ENTITIES`. Unknown names are left
 * exactly as written.
 *
 * @param {string} text - Text that may contain character references.
 * @returns {string} The text with every recognised reference decoded once.
 */
function decodeHtmlEntities(text) {
    // Decode every entity in a single pass. Chaining `.replace()` calls
    // (first `&amp;` → `&`, then `&lt;` → `<`, …) re-runs the later
    // replacements over the output of the earlier ones, so input like
    // `&amp;lt;` would round-trip to `<` instead of the intended `&lt;`.
    // A single alternation guarantees each source position is decoded at
    // most once.
    return text.replace(/&(?:#(\d+)|#[xX]([a-fA-F0-9]+)|([a-zA-Z][a-zA-Z0-9]*));/g, (match, dec, hex, name) => {
        if (dec !== undefined) {
            return safeFromCodePoint(Number.parseInt(dec, 10));
        }
        if (hex !== undefined) {
            return safeFromCodePoint(Number.parseInt(hex, 16));
        }
        // Only own keys of the table count as entities. A bare
        // `HTML_ENTITIES[name]` also resolves members inherited from
        // `Object.prototype`, so `&constructor;` or `&toString;` would be
        // replaced by a stringified native function.
        if (name !== undefined && Object.prototype.hasOwnProperty.call(HTML_ENTITIES, name)) {
            const replacement = HTML_ENTITIES[name];
            return replacement ?? match;
        }
        return match;
    });
}
|
|
231
584
|
/**
|
|
232
585
|
* Convert a numeric character reference to a string. Uses fromCodePoint so
|
|
@@ -246,6 +599,16 @@ function safeFromCodePoint(cp) {
|
|
|
246
599
|
}
|
|
247
600
|
/** Common HTML named entities mapped to their Unicode characters. */
|
|
248
601
|
const HTML_ENTITIES = {
|
|
602
|
+
// Core XML/HTML entities — these used to be handled as standalone
|
|
603
|
+
// chained `.replace()` calls in `decodeHtmlEntities`. They must live
|
|
604
|
+
// in this table so the single-pass decoder can resolve them without
|
|
605
|
+
// re-running over already-decoded output (CodeQL "double unescaping").
|
|
606
|
+
amp: "&",
|
|
607
|
+
lt: "<",
|
|
608
|
+
gt: ">",
|
|
609
|
+
quot: '"',
|
|
610
|
+
apos: "'",
|
|
611
|
+
nbsp: "\u00A0",
|
|
249
612
|
// Punctuation & Typography
|
|
250
613
|
mdash: "\u2014",
|
|
251
614
|
ndash: "\u2013",
|
|
@@ -191,8 +191,13 @@ function renderTable(state, table) {
|
|
|
191
191
|
cellParts.push(renderInlineChildren(state, block.children).trim());
|
|
192
192
|
}
|
|
193
193
|
}
|
|
194
|
-
// Escape pipe characters to prevent table structure corruption
|
|
195
|
-
|
|
194
|
+
// Escape pipe characters to prevent table structure corruption.
|
|
195
|
+
// Backslashes must be escaped *first*: replacing `|` first leaves
|
|
196
|
+
// a literal `\|` in the source untouched, but a subsequent
|
|
197
|
+
// `\` → `\\` pass would then double-escape it into `\\|`,
|
|
198
|
+
// breaking GFM tables. CodeQL flags the single-pass form as
|
|
199
|
+
// "Incomplete string escaping or encoding".
|
|
200
|
+
rowTexts.push(cellParts.join(" ").replace(/\\/g, "\\\\").replace(/\|/g, "\\|"));
|
|
196
201
|
}
|
|
197
202
|
grid.push(rowTexts);
|
|
198
203
|
}
|
|
@@ -473,7 +478,10 @@ function isMonospaceFont(font) {
|
|
|
473
478
|
if (typeof font === "string") {
|
|
474
479
|
return isMonospaceFontName(font);
|
|
475
480
|
}
|
|
476
|
-
|
|
481
|
+
// `!font` above already discarded `null`; `font !== null` here was
|
|
482
|
+
// therefore always true and CodeQL flagged it as a comparison
|
|
483
|
+
// between inconvertible types.
|
|
484
|
+
if (typeof font === "object") {
|
|
477
485
|
const f = font;
|
|
478
486
|
return (isMonospaceFontName(f.ascii) ||
|
|
479
487
|
isMonospaceFontName(f.hAnsi));
|
|
@@ -1257,7 +1257,10 @@ function resolveColorHex(color) {
|
|
|
1257
1257
|
if (typeof color === "string") {
|
|
1258
1258
|
return color;
|
|
1259
1259
|
}
|
|
1260
|
-
|
|
1260
|
+
// The `!color` check above already discarded `null`; an additional
|
|
1261
|
+
// `color !== null` test was always true and CodeQL flagged it as a
|
|
1262
|
+
// comparison between inconvertible types.
|
|
1263
|
+
if (typeof color === "object" && "value" in color) {
|
|
1261
1264
|
return color.value;
|
|
1262
1265
|
}
|
|
1263
1266
|
return undefined;
|