@exodus/bytes 1.0.0-rc.8 → 1.0.0-rc.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +128 -4
- package/encoding.js +234 -0
- package/fallback/_utils.js +88 -10
- package/fallback/encoding.labels.js +46 -0
- package/fallback/encoding.util.js +34 -0
- package/fallback/hex.js +2 -70
- package/fallback/latin1.js +2 -1
- package/fallback/multi-byte.encodings.cjs +1 -0
- package/fallback/multi-byte.encodings.json +545 -0
- package/fallback/multi-byte.js +449 -0
- package/fallback/multi-byte.table.js +114 -0
- package/fallback/single-byte.encodings.js +45 -0
- package/fallback/single-byte.js +83 -0
- package/fallback/utf16.js +180 -0
- package/hex.node.js +2 -0
- package/multi-byte.js +13 -0
- package/multi-byte.node.js +25 -0
- package/package.json +39 -8
- package/single-byte.js +55 -0
- package/single-byte.node.js +62 -0
- package/utf16.js +73 -0
- package/utf16.node.js +79 -0
- package/utf8.js +7 -9
- package/utf8.node.js +8 -5
|
@@ -0,0 +1,449 @@
|
|
|
1
|
+
import { asciiPrefix, decodeLatin1 } from './latin1.js'
|
|
2
|
+
import { getTable } from './multi-byte.table.js'
|
|
3
|
+
|
|
4
|
+
export const E_STRICT = 'Input is not well-formed for this encoding'
|
|
5
|
+
|
|
6
|
+
// TODO: optimize
|
|
7
|
+
|
|
8
|
+
// If the decoder is not cleared properly, state can be preserved between non-streaming calls!
|
|
9
|
+
// See comment about fatal stream
|
|
10
|
+
|
|
11
|
+
// All except iso-2022-jp are ASCII supersets
|
|
12
|
+
// When adding something that is not an ASCII superset, adjust the ASCII fast path
|
|
13
|
+
const REP = 0xff_fd
|
|
14
|
+
// Decoder state machines for the legacy multi-byte encodings, one factory per encoding label.
// Every factory returns a fresh `{ bytes, eof, pushback }` triple:
//   bytes(b)  - feed one input byte. Returns a code point (>= 0), -1 when the byte was consumed
//               without producing output yet, or -2 when a decoding error must be reported
//   eof()     - signal end of input. Returns null when nothing is pending, or -2 for a pending error
//   pushback  - stack of bytes the caller must feed back (popping from the end) before reading new input
// Table entries equal to REP are treated the same as missing entries
const mappers = {
  // https://encoding.spec.whatwg.org/#euc-kr-decoder
  'euc-kr': () => {
    const euc = getTable('euc-kr')
    let lead = 0

    const pushback = []
    const bytes = (b) => {
      if (lead) {
        const cp = b >= 0x41 && b <= 0xfe ? euc[(lead - 0x81) * 190 + b - 0x41] : undefined
        lead = 0
        if (cp !== undefined && cp !== REP) return cp
        if (b < 128) pushback.push(b) // an ASCII trail byte is re-decoded on its own
        return -2
      }

      if (b < 128) return b
      if (b < 0x81 || b === 0xff) return -2
      lead = b
      return -1
    }

    const eof = () => {
      if (!lead) return null
      lead = 0 // a dangling lead byte is an error
      return -2
    }

    return { bytes, eof, pushback }
  },
  // https://encoding.spec.whatwg.org/#euc-jp-decoder
  'euc-jp': () => {
    const jis0208 = getTable('jis0208')
    const jis0212 = getTable('jis0212')
    let j12 = false // set after a 0x8f prefix, selects the jis0212 table for the next pair
    let lead = 0

    const pushback = []
    const bytes = (b) => {
      if (lead === 0x8e && b >= 0xa1 && b <= 0xdf) {
        lead = 0
        return 0xfe_c0 + b
      }

      if (lead === 0x8f && b >= 0xa1 && b <= 0xfe) {
        j12 = true
        lead = b
        return -1
      }

      if (lead) {
        let cp
        if (lead >= 0xa1 && lead <= 0xfe && b >= 0xa1 && b <= 0xfe) {
          cp = (j12 ? jis0212 : jis0208)[(lead - 0xa1) * 94 + b - 0xa1]
        }

        lead = 0
        j12 = false
        if (cp !== undefined && cp !== REP) return cp
        if (b < 128) pushback.push(b)
        return -2
      }

      if (b < 128) return b
      if ((b < 0xa1 && b !== 0x8e && b !== 0x8f) || b === 0xff) return -2
      lead = b
      return -1
    }

    // eslint-disable-next-line sonarjs/no-identical-functions
    const eof = () => {
      if (!lead) return null
      lead = 0
      return -2
    }

    return { bytes, eof, pushback }
  },
  // https://encoding.spec.whatwg.org/#iso-2022-jp-decoder
  // Per-letter of the spec, don't shortcut on state changes on EOF. Some code is regrouped but preserving the logic
  // dState / oState values: 1 ASCII, 2 Roman, 3 Katakana, 4 lead byte, 5 trail byte, 6 escape start, 7 escape
  'iso-2022-jp': () => {
    const jis0208 = getTable('jis0208')
    const EOF = -1 // end of input is fed through bytes() as a pseudo-byte
    let dState = 1
    let oState = 1
    let lead = 0
    let out = false

    const pushback = []
    const bytes = (b) => {
      if (dState < 5) {
        if (b === EOF) return null
        if (b === 0x1b) {
          dState = 6 // escape start
          return -1
        }
      }

      switch (dState) {
        case 1:
        case 2:
          // ASCII, Roman (common)
          out = false
          if (dState === 2) {
            if (b === 0x5c) return 0xa5
            if (b === 0x7e) return 0x20_3e
          }

          if (b <= 0x7f && b !== 0x0e && b !== 0x0f) return b
          return -2
        case 3:
          // Katakana
          out = false
          if (b >= 0x21 && b <= 0x5f) return 0xff_40 + b
          return -2
        case 4:
          // Leading byte
          out = false
          // logical AND, consistent with every other range check in this file
          if (b >= 0x21 && b <= 0x7e) {
            lead = b
            dState = 5
            return -1
          }

          return -2
        case 5:
          // Trailing byte
          out = false
          if (b === 0x1b) {
            dState = 6 // escape start
            return -2
          }

          dState = 4
          if (b >= 0x21 && b <= 0x7e) {
            const cp = jis0208[(lead - 0x21) * 94 + b - 0x21]
            return cp !== undefined && cp !== REP ? cp : -2
          }

          return -2
        case 6:
          // Escape start
          if (b === 0x24 || b === 0x28) {
            lead = b
            dState = 7
            return -1
          }

          out = false
          dState = oState
          if (b !== EOF) pushback.push(b)
          return -2
        case 7: {
          // Escape
          const l = lead
          lead = 0
          let s
          if (l === 0x28) {
            // eslint-disable-next-line unicorn/prefer-switch
            if (b === 0x42) {
              s = 1
            } else if (b === 0x4a) {
              s = 2
            } else if (b === 0x49) {
              s = 3
            }
          } else if (l === 0x24 && (b === 0x40 || b === 0x42)) {
            s = 4
          }

          if (s) {
            dState = oState = s
            const output = out
            out = true
            return output ? -2 : -1 // two escape sequences in a row with no output between them is an error
          }

          out = false
          dState = oState
          // pushback is a stack: `l` is popped (re-fed) first, then `b`
          if (b !== EOF) pushback.push(b)
          pushback.push(l)
          return -2
        }
      }
    }

    const eof = () => bytes(EOF)

    return { bytes, eof, pushback }
  },
  // https://encoding.spec.whatwg.org/#shift_jis-decoder
  shift_jis: () => {
    const jis0208 = getTable('jis0208')
    let lead = 0

    const pushback = []
    const bytes = (b) => {
      if (lead) {
        const l = lead
        lead = 0
        if (b >= 0x40 && b <= 0xfc && b !== 0x7f) {
          const p = (l - (l < 0xa0 ? 0x81 : 0xc1)) * 188 + b - (b < 0x7f ? 0x40 : 0x41)
          if (p >= 8836 && p <= 10_715) return 0xe0_00 - 8836 + p // 16-bit
          const cp = jis0208[p]
          if (cp !== undefined && cp !== REP) return cp
        }

        if (b < 128) pushback.push(b)
        return -2
      }

      if (b <= 0x80) return b // 0x80 is allowed
      if (b >= 0xa1 && b <= 0xdf) return 0xff_61 - 0xa1 + b
      if (b < 0x81 || (b > 0x9f && b < 0xe0) || b > 0xfc) return -2
      lead = b
      return -1
    }

    // eslint-disable-next-line sonarjs/no-identical-functions
    const eof = () => {
      if (!lead) return null
      lead = 0 // this clears state completely on EOF
      return -2
    }

    return { bytes, eof, pushback }
  },
  // https://encoding.spec.whatwg.org/#gbk-decoder
  gbk: () => mappers.gb18030(), // 10.1.1. GBK’s decoder is gb18030’s decoder
  // https://encoding.spec.whatwg.org/#gb18030-decoder
  gb18030: () => {
    const gb18030 = getTable('gb18030')
    const gb18030r = getTable('gb18030-ranges')
    let g1 = 0, g2 = 0, g3 = 0 // prettier-ignore

    // Maps a four-byte pointer to a code point through the ranges table, undefined when out of range
    const index = (p) => {
      if ((p > 39_419 && p < 189_000) || p > 1_237_575) return
      if (p === 7457) return 0xe7_c7
      let a = 0, b = 0 // prettier-ignore
      // pick the last range whose starting pointer does not exceed p
      for (const [c, d] of gb18030r) {
        if (c > p) break
        a = c
        b = d
      }

      return b + p - a
    }

    const pushback = []
    const bytes = (b) => {
      if (g3) {
        if (b < 0x30 || b > 0x39) {
          pushback.push(b, g3, g2) // stack order: g2 is re-fed first, then g3, then b
          g1 = g2 = g3 = 0
          return -2
        }

        const cp = index((g1 - 0x81) * 12_600 + (g2 - 0x30) * 1260 + (g3 - 0x81) * 10 + b - 0x30)
        g1 = g2 = g3 = 0
        if (cp !== undefined) return cp // Can validly return replacement
        return -2
      }

      if (g2) {
        if (b >= 0x81 && b <= 0xfe) {
          g3 = b
          return -1
        }

        pushback.push(b, g2) // stack order: g2 is re-fed first, then b
        g1 = g2 = 0
        return -2
      }

      if (g1) {
        if (b >= 0x30 && b <= 0x39) {
          g2 = b
          return -1
        }

        let cp
        if (b >= 0x40 && b <= 0xfe && b !== 0x7f) {
          cp = gb18030[(g1 - 0x81) * 190 + b - (b < 0x7f ? 0x40 : 0x41)]
        }

        g1 = 0
        if (cp !== undefined && cp !== REP) return cp
        if (b < 128) pushback.push(b)
        return -2
      }

      if (b < 128) return b
      if (b === 0x80) return 0x20_ac
      if (b === 0xff) return -2
      g1 = b
      return -1
    }

    const eof = () => {
      if (!g1 && !g2 && !g3) return null
      g1 = g2 = g3 = 0
      return -2
    }

    return { bytes, eof, pushback }
  },
}
|
|
320
|
+
|
|
321
|
+
// iso-2022-jp is the one encoding here that is not an ASCII superset; all others can use the ASCII fast path
export const isAsciiSuperset = (enc) => !(enc === 'iso-2022-jp')
|
|
322
|
+
// True for the labels multibyteDecoder() accepts: big5 (decoded separately) or any own key of `mappers`
export const multibyteSupported = (enc) => {
  if (Object.hasOwn(mappers, enc)) return true
  return enc === 'big5'
}
|
|
323
|
+
|
|
324
|
+
// Builds a decode function `(arr, stream = false) => string` for a legacy multi-byte encoding.
// `loose = true` substitutes U+FFFD for malformed input; otherwise an Error(E_STRICT) is thrown.
// Throws RangeError for an encoding that is neither big5 nor a key of `mappers`.
export function multibyteDecoder(enc, loose = false) {
  if (enc === 'big5') return big5decoder(loose)
  if (!Object.hasOwn(mappers, enc)) throw new RangeError('Unsupported encoding')

  // Input is assumed to be typechecked already
  const fastAscii = isAsciiSuperset(enc)
  let state // live decoder instance; survives between calls only while streaming

  return (arr, stream = false) => {
    const fail = () => {
      if (loose) return '\uFFFD'
      state.pushback.length = 0 // the queue is cleared on returning an error
      // The correct way per spec seems to be not destroying the decoder state in stream mode, even when fatal
      // Decoders big5, euc-jp, euc-kr, shift_jis, gb18030 / gbk - all clear state before throwing unless EOF, so not affected
      // iso-2022-jp is the only tricky one where this !stream check matters in non-stream mode
      if (!stream) state = null // destroy state, effectively the same as 'do not flush' = false, but early
      throw new Error(E_STRICT)
    }

    const total = arr.length
    let text = ''
    if (fastAscii && !state) {
      // No pending state: the leading ASCII run can be copied without running the state machine
      text = decodeLatin1(arr, 0, asciiPrefix(arr))
      if (text.length === arr.length) return text // ascii
    }

    if (!state) state = mappers[enc]()
    const { bytes, eof, pushback } = state
    let at = text.length
    // Pushed-back bytes take priority over fresh input
    const nextByte = () => (pushback.length > 0 ? pushback.pop() : arr[at++])

    // First, dump everything until EOF
    // Same as the full loop, but without EOF handling
    while (at < total || pushback.length > 0) {
      const cp = bytes(nextByte())
      if (cp === -2) {
        text += fail()
      } else if (cp >= 0) {
        text += String.fromCodePoint(cp) // gb18030 returns codepoints above 0xFFFF from ranges
      }
    }

    if (stream) return text

    // Then, dump EOF. This needs the same loop as the characters can be pushed back
    // TODO: only some encodings need this, most can be optimized
    while (at <= total || pushback.length > 0) {
      const atEnd = at === total && pushback.length === 0
      const cp = atEnd ? eof() : bytes(nextByte())
      if (atEnd && cp === null) break // clean exit
      if (cp === -1) continue // consuming
      text += cp === -2 ? fail() : String.fromCodePoint(cp) // gb18030 returns codepoints above 0xFFFF from ranges
    }

    // Chrome and WebKit fail on this, we don't: completely destroy the old decoder instance when finished streaming
    // > If this’s do not flush is false, then set this’s decoder to a new instance of this’s encoding’s decoder,
    // > Set this’s do not flush to options["stream"]
    state = null
    return text
  }
}
|
|
389
|
+
|
|
390
|
+
// The only decoder which returns multiple codepoints per byte, also has non-charcode codepoints
|
|
391
|
+
// We store that as strings
|
|
392
|
+
// Builds the big5 decode function `(arr, stream = false) => string`.
// Table entries are strings (see getTable('big5')), so they are appended to the result directly.
// `loose = true` emits U+FFFD for malformed input; otherwise an Error(E_STRICT) is thrown.
function big5decoder(loose) {
  // Input is assumed to be typechecked already
  const queue = [] // pushback stack: bytes to re-decode before reading more input
  let pending = 0 // lead byte awaiting its trail byte, 0 when none
  let table // loaded on first non-ASCII input

  return (arr, stream = false) => {
    const fail = () => {
      if (loose) return '\uFFFD'
      queue.length = 0 // the queue is cleared on returning an error
      // Lead is always already cleared before throwing
      throw new Error(E_STRICT)
    }

    const total = arr.length
    let text = ''
    if (!pending) {
      text = decodeLatin1(arr, 0, asciiPrefix(arr))
      if (text.length === arr.length) return text // ascii
    }

    if (!table) table = getTable('big5')
    let at = text.length
    while (at < total || queue.length > 0) {
      const b = queue.length > 0 ? queue.pop() : arr[at++]

      if (!pending) {
        if (b < 128) {
          text += String.fromCharCode(b)
        } else if (b < 0x81 || b === 0xff) {
          text += fail()
        } else {
          pending = b
        }

        continue
      }

      // Trail byte: look the pair up, then reset the lead regardless of the outcome
      const trailInRange = (b >= 0x40 && b <= 0x7e) || (b >= 0xa1 && b !== 0xff)
      const entry = trailInRange
        ? table[(pending - 0x81) * 157 + b - (b < 0x7f ? 0x40 : 0x62)]
        : undefined
      pending = 0
      if (entry) {
        text += entry // strings
        continue
      }

      text += fail()
      if (b < 128) queue.push(b) // an ASCII trail byte is decoded again on its own
    }

    if (stream) return text

    // Destroy decoder state
    queue.length = 0
    if (pending) {
      pending = 0
      text += fail()
    }

    return text
  }
}
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import { fromBase64url } from '@exodus/bytes/base64.js' // eslint-disable-line @exodus/import/no-unresolved
|
|
2
|
+
import { utf16toString } from '@exodus/bytes/utf16.js' // eslint-disable-line @exodus/import/no-unresolved
|
|
3
|
+
import loadEncodings from './multi-byte.encodings.cjs'
|
|
4
|
+
import { to16input } from './utf16.js'
|
|
5
|
+
|
|
6
|
+
// Number of entries in each decoded index table; getTable() allocates its result with this length
export const sizes = {
  'jis0208': 11104,
  'jis0212': 7211,
  'euc-kr': 23750,
  'gb18030': 23940,
  'big5': 19782,
}
|
|
13
|
+
|
|
14
|
+
// This is huge. It's _much_ smaller than https://npmjs.com/text-encoding though
|
|
15
|
+
// Exactly as mapped by the index table
|
|
16
|
+
// 0,x - hole of x empty elements
|
|
17
|
+
// n,c - continuous [c, ...] of length n
|
|
18
|
+
// $.. - references to common chunks
|
|
19
|
+
// -{x} - same as 1,{x}
|
|
20
|
+
|
|
21
|
+
// See tests/multi-byte.test.js to verify that this data decodes exactly into the encoding spec tables
|
|
22
|
+
|
|
23
|
+
let indices
|
|
24
|
+
const tables = new Map()
|
|
25
|
+
/* eslint-disable @exodus/mutable/no-param-reassign-prop-only */
|
|
26
|
+
|
|
27
|
+
// Decodes a base64url chunk and undoes its delta packing.
// The first half of the decoded bytes holds (delta - 1) values for the even output positions,
// the second half holds deltas for the odd output positions; both are running sums modulo 256.
// Returns a Uint8Array of the same length as the decoded input.
function loadBase64(str) {
  const packed = fromBase64url(str)
  const size = packed.length
  const half = size >> 1
  const out = new Uint8Array(size)
  let even = -1 // starts at -1 because every even-position step adds an extra 1
  let odd = 0
  for (let pair = 0; pair * 2 < size; pair++) {
    even = (even + packed[pair] + 1) & 0xff
    odd = (odd + packed[half + pair]) & 0xff
    out[pair * 2] = even
    out[pair * 2 + 1] = odd // for an odd size the final write lands out of bounds and is ignored
  }

  return out
}
|
|
42
|
+
|
|
43
|
+
// Expands the compact table description `t` into `res`, starting at index `pos`.
// Returns the index just past the last position written or skipped.
// Items of `t`:
//   0, x       - skip x positions
//   n, c       - advance the running code by c, then write n consecutive codes
//   -x         - advance the running code by x, then write a single code
//   '$name'    - expand the shared chunk indices['$name'] at the current position
//   other str  - base64url chunk of UTF-16LE data, written as-is; the running code continues after its last value
// With stringMode, entries are stored as one-code-point strings instead of numbers
function unwrap(res, t, pos, stringMode = false) {
  let cursor = pos
  let next = 0 // running code, local to this call (a nested chunk keeps its own)
  let at = 0
  while (at < t.length) {
    const item = t[at++]

    if (typeof item !== 'number') {
      if (item[0] === '$' && Object.hasOwn(indices, item)) {
        cursor = unwrap(res, indices[item], cursor, stringMode) // self-reference using shared chunks
      } else if (stringMode) {
        const chars = [...utf16toString(loadBase64(item), 'uint8-le')] // splits by codepoints
        for (const ch of chars) res[cursor++] = ch
        next = chars[chars.length - 1].codePointAt(0) + 1
      } else {
        const units = to16input(loadBase64(item), true) // data is little-endian
        res.set(units, cursor)
        cursor += units.length
        next = units[units.length - 1] + 1
      }

      continue
    }

    if (item === 0) {
      cursor += t[at++]
      continue
    }

    let run = item
    if (item < 0) {
      next -= item
      run = 1
    } else {
      next += t[at++]
    }

    for (let k = 0; k < run; k++, cursor++, next++) {
      res[cursor] = stringMode ? String.fromCodePoint(next) : next
    }
  }

  return cursor
}
|
|
80
|
+
|
|
81
|
+
// Returns the decoded index table for `id`, building and caching it on first request.
//   '*-ranges' ids -> array of [pointer, codePoint] pairs, accumulated from the stored deltas
//   'big5'         -> Array of strings (or holes)
//   anything else  -> Uint16Array prefilled with 0xFFFD
// Throws Error('Unknown encoding') for ids without data or without a known size, and
// Error('Table already used ...') when the raw data was consumed but the cache has no entry
export function getTable(id) {
  const ready = tables.get(id)
  if (ready) return ready

  if (!indices) indices = loadEncodings() // lazy-load
  if (!Object.hasOwn(indices, id)) throw new Error('Unknown encoding')
  const raw = indices[id]
  if (!raw) throw new Error('Table already used (likely incorrect bundler dedupe)')

  const isRanges = id.endsWith('-ranges')
  if (!isRanges && !Object.hasOwn(sizes, id)) throw new Error('Unknown encoding')

  let table
  if (isRanges) {
    table = []
    let pointer = 0
    let codePoint = 0
    // destroying, we remove it later anyway
    while (raw.length > 0) {
      pointer += raw.shift()
      codePoint += raw.shift()
      table.push([pointer, codePoint])
    }
  } else if (id === 'big5') {
    table = new Array(sizes[id]) // array of strings or undefined
    unwrap(table, raw, 0, true)
    // Pointer code updates are embedded into the table
    table[1133] = '\xCA\u0304'
    table[1135] = '\xCA\u030C'
    table[1164] = '\xEA\u0304'
    table[1166] = '\xEA\u030C'
  } else {
    table = new Uint16Array(sizes[id])
    table.fill(0xff_fd)
    unwrap(table, raw, 0, false)
  }

  indices[id] = null // gc
  tables.set(id, table)
  return table
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
// See tests/fixtures/encodings/single-byte/dump.js for generator
|
|
2
|
+
|
|
3
|
+
const c =
|
|
4
|
+
'\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0'
|
|
5
|
+
const k8a = '─│┌┐└┘├┤┬┴┼▀▄█▌▐░▒▓⌠■∙√≈≤≥\xA0⌡°²·÷═║╒ё'
|
|
6
|
+
const k8b = '©юабцдефгхийклмнопярстужвьызшэщчъЮАБЦДЕФГХИЙКЛМНОПЯРСТУЖВЬЫЗШЭЩЧЪ'
|
|
7
|
+
const i2 = 'żŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢßŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙'
|
|
8
|
+
const ch = 'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
|
|
9
|
+
const cl1 = 'абвгдежзийклмнопрстуфхцчшщъыьэю'
|
|
10
|
+
const i8 = 'אבגדהוזחטיךכלםמןנסעףפץצקרשת'
|
|
11
|
+
const p = '€\x81‚ƒ„…†‡'
|
|
12
|
+
const s = 'µ¶·ø¹ŗ»¼½¾æĄĮĀĆÄÅĘĒČÉŹĖĢĶĪĻŠŃŅÓŌÕÖ×ŲŁŚŪÜŻŽßąįāćäåęēčéźėģķīļšńņóōõö÷ųłśūüżž'
|
|
13
|
+
// Builds a run of `n` U+FFFD replacement characters for the encoding strings below
const f = (n) => {
  const hole = '\uFFFD'
  return hole.repeat(n)
}
|
|
14
|
+
|
|
15
|
+
/* eslint-disable @exodus/export-default/named */
|
|
16
|
+
// prettier-ignore
|
|
17
|
+
export default {
|
|
18
|
+
ibm866: ch + "абвгдежзийклмноп░▒▓│┤╡╢╖╕╣║╗╝╜╛┐└┴┬├─┼╞╟╚╔╩╦╠═╬╧╨╤╥╙╘╒╓╫╪┘┌█▄▌▐▀рстуфхцчшщъыьэюяЁёЄєЇїЎў°∙·√№¤■\xA0",
|
|
19
|
+
"iso-8859-10": c + "ĄĒĢĪĨͧĻĐŠŦŽ\xADŪŊ°ąēģīĩķ·ļđšŧž―ūŋĀÁÂÃÄÅÆĮČÉĘËĖÍÎÏÐŅŌÓÔÕÖŨØŲÚÛÜÝÞßāáâãäåæįčéęëėíîïðņōóôõöũøųúûüýþĸ",
|
|
20
|
+
"iso-8859-13": c + "”¢£¤„¦§Ø©Ŗ«¬\xAD®Æ°±²³“" + s + "’",
|
|
21
|
+
"iso-8859-14": c + "Ḃḃ£ĊċḊ§Ẁ©ẂḋỲ\xAD®ŸḞḟĠġṀṁ¶ṖẁṗẃṠỳẄẅṡÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏŴÑÒÓÔÕÖṪØÙÚÛÜÝŶßàáâãäåæçèéêëìíîïŵñòóôõöṫøùúûüýŷ",
|
|
22
|
+
"iso-8859-15": c + "¡¢£€¥Š§š©ª«¬\xAD®¯°±²³Žµ¶·ž¹º»ŒœŸ",
|
|
23
|
+
"iso-8859-16": c + "ĄąŁ€„Чš©Ș«Ź\xADźŻ°±ČłŽ”¶·žčș»ŒœŸżÀÁÂĂÄĆÆÇÈÉÊËÌÍÎÏĐŃÒÓÔŐÖŚŰÙÚÛÜĘȚßàáâăäćæçèéêëìíîïđńòóôőöśűùúûüęț",
|
|
24
|
+
"iso-8859-2": c + "Ą˘Ł¤ĽŚ§¨ŠŞŤŹ\xADŽŻ°ą˛ł´ľśˇ¸šşťź˝ž" + i2,
|
|
25
|
+
"iso-8859-3": c + "Ħ˘£¤\uFFFDĤ§¨İŞĞĴ\xAD\uFFFDݰħ²³´µĥ·¸ışğĵ½\uFFFDżÀÁÂ\uFFFDÄĊĈÇÈÉÊËÌÍÎÏ\uFFFDÑÒÓÔĠÖ×ĜÙÚÛÜŬŜßàáâ\uFFFDäċĉçèéêëìíîï\uFFFDñòóôġö÷ĝùúûüŭŝ˙",
|
|
26
|
+
"iso-8859-4": c + "ĄĸŖ¤Ĩϧ¨ŠĒĢŦ\xADޝ°ą˛ŗ´ĩšēģŧŊžŋĀÁÂÃÄÅÆĮČÉĘËĖÍÎĪĐŅŌĶÔÕÖרŲÚÛÜŨŪßāáâãäåæįčéęëėíîīđņōķôõö÷øųúûüũū˙",
|
|
27
|
+
"iso-8859-5": c + "ЁЂЃЄЅІЇЈЉЊЋЌ\xADЎЏ" + ch + cl1 + "я№ёђѓєѕіїјљњћќ§ўџ",
|
|
28
|
+
"iso-8859-6": c + f(3) + "¤" + f(7) + "،\xAD" + f(13) + "؛" + f(3) + "؟\uFFFDءآأؤإئابةتثجحخدذرزسشصضطظعغ" + f(5) + "ـفقكلمنهوىي\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652" + f(13),
|
|
29
|
+
"iso-8859-7": c + "‘’£€₯¦§¨©ͺ«¬\xAD\uFFFD―°±²³΄΅Ά·ΈΉΊ»Ό½ΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡ\uFFFDΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ\uFFFD",
|
|
30
|
+
"iso-8859-8": c + "\uFFFD¢£¤¥¦§¨©×«¬\xAD®¯°±²³´µ¶·¸¹÷»¼½¾" + f(32) + "‗" + i8 + f(2) + "\u200E\u200F\uFFFD",
|
|
31
|
+
"koi8-r": k8a + "╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡Ё╢╣╤╥╦╧╨╩╪╫╬" + k8b,
|
|
32
|
+
"koi8-u": k8a + "є╔ії╗╘╙╚╛ґў╞╟╠╡ЁЄ╣ІЇ╦╧╨╩╪ҐЎ" + k8b,
|
|
33
|
+
macintosh: "ÄÅÇÉÑÖÜáàâäãåçéèêëíìîïñóòôöõúùûü†°¢£§•¶ß®©™´¨≠ÆØ∞±≤≥¥µ∂∑∏π∫ªºΩæø¿¡¬√ƒ≈∆«»…\xA0ÀÃÕŒœ–—“”‘’÷◊ÿŸ⁄€‹›fifl‡·‚„‰ÂÊÁËÈÍÎÏÌÓÔ\uF8FFÒÚÛÙıˆ˜¯˘˙˚¸˝˛ˇ",
|
|
34
|
+
"windows-1250": "€\x81‚\x83„…†‡\x88‰Š‹ŚŤŽŹ\x90‘’“”•–—\x98™š›śťžź\xA0ˇ˘Ł¤Ą¦§¨©Ş«¬\xAD®Ż°±˛ł´µ¶·¸ąş»Ľ˝ľ" + i2,
|
|
35
|
+
"windows-1251": "ЂЃ‚ѓ„…†‡€‰Љ‹ЊЌЋЏђ‘’“”•–—\x98™љ›њќћџ\xA0ЎўЈ¤Ґ¦§Ё©Є«¬\xAD®Ї°±Ііґµ¶·ё№є»јЅѕї" + ch + cl1 + 'я',
|
|
36
|
+
"windows-1252": p + "ˆ‰Š‹Œ\x8DŽ\x8F\x90‘’“”•–—˜™š›œ\x9DžŸ",
|
|
37
|
+
"windows-1253": p + "\x88‰\x8A‹\x8C\x8D\x8E\x8F\x90‘’“”•–—\x98™\x9A›\x9C\x9D\x9E\x9F\xA0΅Ά£¤¥¦§¨©\uFFFD«¬\xAD®―°±²³΄µ¶·ΈΉΊ»Ό½ΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡ\uFFFDΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ\uFFFD",
|
|
38
|
+
"windows-1254": p + "ˆ‰Š‹Œ\x8D\x8E\x8F\x90‘’“”•–—˜™š›œ\x9D\x9EŸ\xA0¡¢£¤¥¦§¨©ª«¬\xAD®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏĞÑÒÓÔÕÖרÙÚÛÜİŞßàáâãäåæçèéêëìíîïğñòóôõö÷øùúûüış",
|
|
39
|
+
"windows-1255": p + "ˆ‰\x8A‹\x8C\x8D\x8E\x8F\x90‘’“”•–—˜™\x9A›\x9C\x9D\x9E\x9F\xA0¡¢£₪¥¦§¨©×«¬\xAD®¯°±²³´µ¶·¸¹÷»¼½¾¿\u05B0\u05B1\u05B2\u05B3\u05B4\u05B5\u05B6\u05B7\u05B8\u05B9\u05BA\u05BB\u05BC\u05BD־\u05BF׀\u05C1\u05C2׃װױײ׳״" + f(7) + i8 + f(2) + "\u200E\u200F\uFFFD",
|
|
40
|
+
"windows-1256": "€پ‚ƒ„…†‡ˆ‰ٹ‹Œچژڈگ‘’“”•–—ک™ڑ›œ\u200C\u200Dں\xA0،¢£¤¥¦§¨©ھ«¬\xAD®¯°±²³´µ¶·¸¹؛»¼½¾؟ہءآأؤإئابةتثجحخدذرزسشصض×طظعغـفقكàلâمنهوçèéêëىيîï\u064B\u064C\u064D\u064Eô\u064F\u0650÷\u0651ù\u0652ûü\u200E\u200Fے",
|
|
41
|
+
"windows-1257": "€\x81‚\x83„…†‡\x88‰\x8A‹\x8C¨ˇ¸\x90‘’“”•–—\x98™\x9A›\x9C¯˛\x9F\xA0\uFFFD¢£¤\uFFFD¦§Ø©Ŗ«¬\xAD®Æ°±²³´" + s + "˙",
|
|
42
|
+
"windows-1258": p + "ˆ‰\x8A‹Œ\x8D\x8E\x8F\x90‘’“”•–—˜™\x9A›œ\x9D\x9EŸ\xA0¡¢£¤¥¦§¨©ª«¬\xAD®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂĂÄÅÆÇÈÉÊË\u0300ÍÎÏĐÑ\u0309ÓÔƠÖרÙÚÛÜƯ\u0303ßàáâăäåæçèéêë\u0301íîïđñ\u0323óôơö÷øùúûüư₫",
|
|
43
|
+
"windows-874": "€\x81\x82\x83\x84…\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90‘’“”•–—\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะ\u0E31าำ\u0E34\u0E35\u0E36\u0E37\u0E38\u0E39\u0E3A" + f(4) + "฿เแโใไๅๆ\u0E47\u0E48\u0E49\u0E4A\u0E4B\u0E4C\u0E4D\u0E4E๏๐๑๒๓๔๕๖๗๘๙๚๛" + f(4),
|
|
44
|
+
"x-mac-cyrillic": ch + "†°Ґ£§•¶І®©™Ђђ≠Ѓѓ∞±≤≥іµґЈЄєЇїЉљЊњјЅ¬√ƒ≈∆«»…\xA0ЋћЌќѕ–—“”‘’÷„ЎўЏџ№Ёёя" + cl1 + "€"
|
|
45
|
+
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import { asciiPrefix, decodeLatin1 } from './latin1.js'
|
|
2
|
+
import encodings from './single-byte.encodings.js'
|
|
3
|
+
import { decode2string } from './_utils.js'
|
|
4
|
+
|
|
5
|
+
export const E_STRICT = 'Input is not well-formed for this encoding'
|
|
6
|
+
const xUserDefined = 'x-user-defined'
|
|
7
|
+
|
|
8
|
+
// Throws RangeError unless `encoding` is x-user-defined or an own key of the single-byte encodings table
export const assertEncoding = (encoding) => {
  const supported = Object.hasOwn(encodings, encoding) || encoding === xUserDefined
  if (!supported) throw new RangeError('Unsupported encoding')
}
|
|
12
|
+
|
|
13
|
+
// Returns the string of characters for bytes 0x80 and up of a single-byte encoding.
// x-user-defined is generated on the fly as the 128 code units U+F780..U+F7FF.
// Throws RangeError (via assertEncoding) for unsupported encodings
function getEncoding(encoding) {
  assertEncoding(encoding)
  if (encoding !== xUserDefined) return encodings[encoding]

  let upper = ''
  for (let offset = 0; offset < 128; offset++) upper += String.fromCharCode(0xf7_80 + offset)
  return upper
}
|
|
21
|
+
|
|
22
|
+
const mappers = new Map()
|
|
23
|
+
const decoders = new Map()
|
|
24
|
+
|
|
25
|
+
// Used only on Node.js, no reason to optimize for anything else
|
|
26
|
+
// E.g. avoiding .from and filling zero-initialized arr manually is faster on Hermes, but we avoid this codepath on Hermes completely
|
|
27
|
+
// Returns `{ mapper, incomplete }` for a single-byte encoding; the same object is returned on every call.
//   mapper(arr, start = 0) - returns a new Uint16Array with bytes arr[start..] mapped to UTF-16 code units;
//                            `arr` itself is not modified
//   incomplete             - true when the encoding contains U+FFFD entries
// Throws whatever getEncoding() throws for an unsupported encoding
export function encodingMapper(encoding) {
  const cached = mappers.get(encoding)
  if (cached) return cached

  const incomplete = getEncoding(encoding).includes('\uFFFD')
  let map // byte -> code unit lookup, built on first use
  const mapper = (arr, start = 0) => {
    if (!map) {
      map = Uint16Array.from({ length: 256 }, (_, i) => i) // Unicode subset
      const strings = getEncoding(encoding).split('')
      // entries from 0x80 up come from the encoding; bytes past the end of its string keep the identity mapping
      map.set(Uint16Array.from(strings.map((x) => x.charCodeAt(0))), 128)
    }

    const o = Uint16Array.from(start === 0 ? arr : arr.subarray(start)) // copy to modify in-place, also those are 16-bit now
    let i = 0
    // manually unrolled: 8 elements per iteration, remainder handled below
    for (const end7 = o.length - 7; i < end7; i += 8) {
      o[i] = map[o[i]]
      o[i + 1] = map[o[i + 1]]
      o[i + 2] = map[o[i + 2]]
      o[i + 3] = map[o[i + 3]]
      o[i + 4] = map[o[i + 4]]
      o[i + 5] = map[o[i + 5]]
      o[i + 6] = map[o[i + 6]]
      o[i + 7] = map[o[i + 7]]
    }

    for (const end = o.length; i < end; i++) o[i] = map[o[i]]
    return o
  }

  // Cache and return one and the same object, so the first call and later calls agree on identity
  const entry = { mapper, incomplete }
  mappers.set(encoding, entry)
  return entry
}
|
|
60
|
+
|
|
61
|
+
// Returns a cached decode function `(arr, loose = false) => string` for a single-byte encoding.
// Unless `loose` is set, input that decodes to U+FFFD in an encoding containing U+FFFD entries
// throws TypeError(E_STRICT)
export function encodingDecoder(encoding) {
  const existing = decoders.get(encoding)
  if (existing) return existing

  const hasHoles = getEncoding(encoding).includes('\uFFFD')
  let table // one string per byte value, built on the first decode call

  const buildTable = () => {
    const entries = []
    for (let code = 0; code < 128; code++) entries.push(String.fromCharCode(code))
    entries.push(...getEncoding(encoding).split(''))
    // bytes past the end of the encoding string map to the code unit of the same value
    for (let code = entries.length; code < 256; code++) entries.push(String.fromCharCode(code))
    return entries
  }

  const decoder = (arr, loose = false) => {
    if (!table) table = buildTable()

    const head = decodeLatin1(arr, 0, asciiPrefix(arr))
    if (head.length === arr.length) return head

    const tail = decode2string(arr, head.length, arr.length, table)
    const malformed = hasHoles && tail.includes('\uFFFD')
    if (malformed && !loose) throw new TypeError(E_STRICT)
    return head + tail
  }

  decoders.set(encoding, decoder)
  return decoder
}
|