@gjsify/string_decoder 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts ADDED
@@ -0,0 +1,501 @@
1
+ // Reference: Node.js lib/string_decoder.js
2
+ // Reimplemented for GJS — handles incremental decoding of multi-byte character sequences
3
+ // Uses function constructor (not ES6 class) for compatibility with legacy CJS patterns
4
+ // that call StringDecoder.call(this, enc) (e.g., iconv-lite).
5
+
6
+ import { normalizeEncoding, checkEncoding, base64Encode as bytesToBase64 } from '@gjsify/utils';
7
+
8
/**
 * Resolve a caller-supplied encoding label to its normalized name.
 *
 * A truthy `enc` is first handed to `checkEncoding`; a falsy one (omitted or
 * empty string) skips that call. In both cases the value returned by
 * `normalizeEncoding(enc)` is the result.
 *
 * @param enc Optional encoding label as given by the caller.
 * @returns Whatever `normalizeEncoding` returns for `enc`.
 */
function normalizeAndValidateEncoding(enc?: string): string {
  if (!enc) {
    // Nothing to validate — let the normalizer pick its default.
    return normalizeEncoding(enc);
  }
  checkEncoding(enc);
  return normalizeEncoding(enc);
}
12
+
13
/**
 * Decode the bytes in `bytes[start, end)` as UTF-8 without any streaming state.
 *
 * Ill-formed input is replaced following the "maximal subpart" practice: the
 * longest prefix of a well-formed sequence that was actually present becomes a
 * single U+FFFD, and decoding resumes at the first byte that did not fit.
 *
 * Accepted byte ranges per position:
 *   1-byte: 00-7F
 *   2-byte: C2-DF 80-BF
 *   3-byte: E0 A0-BF 80-BF | E1-EC 80-BF 80-BF | ED 80-9F 80-BF | EE-EF 80-BF 80-BF
 *   4-byte: F0 90-BF 80-BF 80-BF | F1-F3 80-BF 80-BF 80-BF | F4 80-8F 80-BF 80-BF
 *
 * @param bytes Source bytes.
 * @param start Index of the first byte to decode (inclusive).
 * @param end   Index one past the last byte to decode (exclusive).
 * @returns The decoded string; 4-byte sequences become surrogate pairs.
 */
function utf8DecodeMaximalSubpart(bytes: Uint8Array, start: number, end: number): string {
  const REPLACEMENT = '\ufffd';
  const pieces: string[] = [];
  let pos = start;

  while (pos < end) {
    const lead = bytes[pos];

    // Plain ASCII maps straight through.
    if (lead <= 0x7f) {
      pieces.push(String.fromCharCode(lead));
      pos += 1;
      continue;
    }

    // Classify the lead byte: total sequence length and payload bit mask.
    let seqLen = 0;
    let leadMask = 0;
    if (lead >= 0xc2 && lead <= 0xdf) {
      seqLen = 2;
      leadMask = 0x1f;
    } else if (lead >= 0xe0 && lead <= 0xef) {
      seqLen = 3;
      leadMask = 0x0f;
    } else if (lead >= 0xf0 && lead <= 0xf4) {
      seqLen = 4;
      leadMask = 0x07;
    }

    // 80-BF (stray continuation), C0-C1 (overlong) and F5-FF are never leads.
    if (seqLen === 0) {
      pieces.push(REPLACEMENT);
      pos += 1;
      continue;
    }

    // Only the second byte has lead-dependent limits (excludes overlongs,
    // surrogates and code points above U+10FFFF).
    let secondMin = 0x80;
    let secondMax = 0xbf;
    switch (lead) {
      case 0xe0: secondMin = 0xa0; break;
      case 0xed: secondMax = 0x9f; break;
      case 0xf0: secondMin = 0x90; break;
      case 0xf4: secondMax = 0x8f; break;
    }

    // Accept continuation bytes until one is missing or out of range.
    let codePoint = lead & leadMask;
    let consumed = 1;
    while (consumed < seqLen) {
      const at = pos + consumed;
      if (at >= end) break;
      const next = bytes[at];
      const min = consumed === 1 ? secondMin : 0x80;
      const max = consumed === 1 ? secondMax : 0xbf;
      if (next < min || next > max) break;
      codePoint = (codePoint << 6) | (next & 0x3f);
      consumed += 1;
    }

    if (consumed < seqLen) {
      // Truncated or broken sequence: the accepted prefix is the maximal
      // subpart and collapses into one replacement character.
      pieces.push(REPLACEMENT);
      pos += consumed;
      continue;
    }

    if (seqLen === 4) {
      // Supplementary plane: split into a UTF-16 surrogate pair.
      const offset = codePoint - 0x10000;
      pieces.push(String.fromCharCode(0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff)));
    } else {
      pieces.push(String.fromCharCode(codePoint));
    }
    pos += seqLen;
  }

  return pieces.join('');
}
158
+
159
/**
 * Map a UTF-8 lead byte to the total length of the sequence it starts.
 *
 * @param byte Candidate first byte of a character.
 * @returns 1 when the high bit is clear, 2 for C2-DF, 3 for E0-EF, 4 for
 *          F0-F4, and 0 for anything else (continuation bytes 80-BF, the
 *          overlong leads C0-C1, and F5 and above).
 */
function utf8CharLength(byte: number): number {
  const highBitClear = (byte & 0x80) === 0;
  if (highBitClear) return 1;

  // [first lead, last lead, sequence length]
  const leadRanges: ReadonlyArray<readonly [number, number, number]> = [
    [0xc2, 0xdf, 2],
    [0xe0, 0xef, 3],
    [0xf0, 0xf4, 4],
  ];
  for (const [first, last, length] of leadRanges) {
    if (byte >= first && byte <= last) return length;
  }
  return 0;
}
171
+
172
/**
 * Decide whether `byte` is acceptable at `position` of a multi-byte UTF-8
 * sequence that starts with `leadByte` and is `charLen` bytes long.
 *
 * Every trailing byte must lie in 80-BF. The byte at position 1 is further
 * restricted for four lead bytes: E0 needs A0-BF, ED needs 80-9F (3-byte
 * sequences); F0 needs 90-BF, F4 needs 80-8F (4-byte sequences).
 *
 * @param leadByte First byte of the sequence.
 * @param charLen  Total sequence length implied by the lead byte.
 * @param position Zero-based index of `byte` within the sequence.
 * @param byte     The byte being checked.
 * @returns true when `byte` is within the allowed range for that position.
 */
function isValidContinuation(leadByte: number, charLen: number, position: number, byte: number): boolean {
  let lowest = 0x80;
  let highest = 0xbf;

  if (position === 1) {
    if (charLen === 3) {
      if (leadByte === 0xe0) lowest = 0xa0;
      else if (leadByte === 0xed) highest = 0x9f;
    } else if (charLen === 4) {
      if (leadByte === 0xf0) lowest = 0x90;
      else if (leadByte === 0xf4) highest = 0x8f;
    }
  }

  return byte >= lowest && byte <= highest;
}
193
+
194
/** Typed public surface of an object produced by the `StringDecoder` constructor. */
interface StringDecoderInstance {
  /** Encoding name assigned to the decoder when it is constructed. */
  readonly encoding: string;

  /** Decodes `buf` and returns the resulting text. */
  write(buf: Uint8Array): string;

  /** Optionally decodes a final `buf`, then returns the remaining output. */
  end(buf?: Uint8Array): string;
}
199
+
200
/**
 * Type of the `StringDecoder` function: usable with `new`, and also callable
 * with an explicit `this` (the `StringDecoder.call(this, enc)` pattern).
 */
interface StringDecoderConstructor {
  /** Shared prototype carrying the instance methods. */
  prototype: StringDecoderInstance;

  /** Plain-call form that initialises an existing `this`. */
  (this: StringDecoderInstance, encoding?: string): void;

  /** Construct-call form. */
  new (encoding?: string): StringDecoderInstance;
}
205
+
206
/**
 * StringDecoder turns a stream of byte chunks into strings while holding back
 * multi-byte characters that are split across chunk boundaries.
 *
 * It is deliberately a function constructor rather than an ES6 class so that
 * legacy CommonJS code can invoke it as `StringDecoder.call(this, enc)`.
 *
 * State initialised on `this`:
 *   - `encoding`      normalized encoding name
 *   - `_lastNeed`     bytes still missing from the held-back character
 *   - `_lastTotal`    full size of the held-back character or group
 *   - `_lastLeadByte` lead byte of the held-back UTF-8 character
 *   - `_lastChar`     carry buffer: 4 bytes for utf8 and utf16le, 3 for
 *                     base64, empty for every other encoding
 *
 * @param encoding Optional encoding label; resolved through
 *                 `normalizeAndValidateEncoding`.
 */
const StringDecoder = function StringDecoder(this: any, encoding?: string) {
  this.encoding = normalizeAndValidateEncoding(encoding);
  this._lastNeed = 0;
  this._lastTotal = 0;
  this._lastLeadByte = 0;

  // Size the carry buffer for the largest partial unit the encoding can leave behind.
  let carrySize: number;
  switch (this.encoding) {
    case 'utf8':
    case 'utf16le':
      carrySize = 4;
      break;
    case 'base64':
      carrySize = 3;
      break;
    default:
      carrySize = 0;
      break;
  }
  this._lastChar = new Uint8Array(carrySize);
} as unknown as StringDecoderConstructor;
230
+
231
/**
 * Decode one chunk and return the text that is complete so far.
 *
 * - A string argument is returned unchanged, as Node's `StringDecoder#write`
 *   does; without this guard the characters would be indexed as if they were
 *   bytes and decode to garbage.
 * - An empty chunk yields `''` and leaves any pending state untouched.
 * - utf8, utf16le and base64 go through their stateful writers, which may
 *   hold trailing bytes back until the next call.
 * - ascii, latin1 and hex are decoded statelessly; any other encoding name
 *   falls back to the ascii decoder.
 *
 * @param buf Bytes to decode, or an already-decoded string.
 * @returns Decoded text for this call.
 */
StringDecoder.prototype.write = function write(buf: Uint8Array | string): string {
  if (typeof buf === 'string') return buf;
  if (buf.length === 0) return '';

  switch (this.encoding) {
    case 'utf8':
      return writeUtf8(this, buf);
    case 'utf16le':
      return writeUtf16le(this, buf);
    case 'base64':
      return writeBase64(this, buf);
    case 'ascii':
      return decodeAscii(buf);
    case 'latin1':
      return decodeLatin1(buf);
    case 'hex':
      return decodeHex(buf);
    default:
      return decodeAscii(buf);
  }
};
251
+
252
/**
 * Finish decoding: optionally decode a last chunk, then flush whatever is
 * still held back and clear `_lastNeed` / `_lastTotal`.
 *
 * Flush rules when bytes are pending (`_lastNeed > 0`):
 *   - utf8:    the incomplete character becomes a single U+FFFD.
 *   - utf16le: each complete 16-bit unit already stored is emitted; a lone
 *              odd byte is dropped.
 *   - base64:  the stored bytes are encoded via `bytesToBase64`.
 * Other encodings keep no pending state and return only the decoded chunk.
 *
 * @param buf Optional final chunk, decoded through `write` when non-empty.
 * @returns Text from the final chunk followed by the flushed remainder.
 */
StringDecoder.prototype.end = function end(buf?: Uint8Array): string {
  const head = buf && buf.length > 0 ? this.write(buf) : '';

  // Nothing held back: the decoded chunk is the whole answer.
  if (!(this._lastNeed > 0)) return head;

  let tail = '';
  switch (this.encoding) {
    case 'utf8':
      tail = '\ufffd';
      break;
    case 'utf16le': {
      const held = this._lastTotal - this._lastNeed;
      let at = 0;
      while (at + 1 < held) {
        tail += String.fromCharCode(this._lastChar[at] | (this._lastChar[at + 1] << 8));
        at += 2;
      }
      break;
    }
    case 'base64':
      tail = bytesToBase64(this._lastChar.subarray(0, this._lastTotal - this._lastNeed));
      break;
    default:
      // Stateless encodings: leave the fields exactly as they are.
      return head;
  }

  this._lastNeed = 0;
  this._lastTotal = 0;
  return head + tail;
};
278
+
279
/**
 * Streaming UTF-8 decode step.
 *
 * Phase 1 — if a character was left incomplete by the previous call, feed it
 * continuation bytes from `buf`. An unacceptable byte ends the character with
 * one U+FFFD and is left unconsumed so phase 2 decodes it normally. If `buf`
 * runs out first, the function returns and waits for more input.
 *
 * Phase 2 — look at up to the last four bytes of `buf` for a lead byte whose
 * sequence is cut off by the end of the chunk. When its bytes so far are all
 * acceptable, they are moved into `self._lastChar` and excluded from this
 * call's output. Everything before that point is decoded immediately.
 *
 * @param self Decoder state (`_lastChar`, `_lastNeed`, `_lastTotal`, `_lastLeadByte`).
 * @param buf  Chunk of UTF-8 bytes.
 * @returns Text that could be fully decoded during this call.
 */
function writeUtf8(self: any, buf: Uint8Array): string {
  const total = buf.length;
  let cursor = 0;
  let out = '';

  // ---- Phase 1: complete the character carried over from the last chunk ----
  if (self._lastNeed > 0) {
    while (cursor < total && self._lastNeed > 0) {
      const slot = self._lastTotal - self._lastNeed;
      const next = buf[cursor];

      if (!isValidContinuation(self._lastLeadByte, self._lastTotal, slot, next)) {
        // The carried prefix is a maximal subpart: one replacement, state cleared.
        out += '\ufffd';
        self._lastNeed = 0;
        self._lastTotal = 0;
        self._lastLeadByte = 0;
        break;
      }

      self._lastChar[slot] = next;
      self._lastNeed -= 1;
      cursor += 1;
    }

    // Chunk exhausted before the character was finished.
    if (self._lastNeed > 0) return out;

    if (self._lastTotal > 0) {
      out += utf8DecodeMaximalSubpart(self._lastChar, 0, self._lastTotal);
      self._lastTotal = 0;
      self._lastLeadByte = 0;
    }
  }

  // ---- Phase 2: hold back an incomplete trailing character, decode the rest ----
  let decodeEnd = total;
  const lookback = Math.min(4, total - cursor);

  for (let back = 0; back < lookback; back++) {
    const at = total - 1 - back;
    if (at < cursor) break;

    const candidate = buf[at];
    // Skip continuation bytes until the last non-continuation byte is found.
    if ((candidate & 0xc0) === 0x80) continue;

    const expected = utf8CharLength(candidate);
    const have = total - at;
    if (expected > 0 && candidate >= 0x80 && have < expected) {
      let prefixOk = true;
      for (let k = 1; k < have && prefixOk; k++) {
        prefixOk = isValidContinuation(candidate, expected, k, buf[at + k]);
      }

      if (prefixOk) {
        decodeEnd = at;
        for (let k = 0; k < have; k++) {
          self._lastChar[k] = buf[at + k];
        }
        self._lastNeed = expected - have;
        self._lastTotal = expected;
        self._lastLeadByte = candidate;
      }
    }
    // Only the last non-continuation byte matters.
    break;
  }

  if (decodeEnd > cursor) {
    out += utf8DecodeMaximalSubpart(buf, cursor, decodeEnd);
  }

  return out;
}
349
+
350
/**
 * Streaming UTF-16LE decode step.
 *
 * Carry state kept on `self`:
 *   - one odd trailing byte            → `_lastNeed = 1`, `_lastTotal = 2`
 *   - a high surrogate ending a chunk  → `_lastNeed = 2`, `_lastTotal = 4`
 *     (held back so it can be emitted together with its low surrogate)
 *
 * On entry, pending bytes are topped up from `buf` first. Once the carry
 * buffer is full its code units are emitted in order. `buf` is consulted for
 * a matching low surrogate (or the high surrogate is held back again) only
 * when the high surrogate is the LAST unit in the carry buffer. If another
 * unit follows it inside the carry buffer and is not a low surrogate, the
 * high surrogate is emitted unpaired and the following unit is handled on the
 * next iteration. This keeps output in input order and avoids dropping the
 * unit that follows an unpaired high surrogate.
 *
 * @param self Decoder state (`_lastChar`, `_lastNeed`, `_lastTotal`).
 * @param buf  Chunk of UTF-16LE bytes.
 * @returns Text that could be decoded during this call.
 */
function writeUtf16le(self: any, buf: Uint8Array): string {
  let result = '';
  let i = 0;

  if (self._lastNeed > 0) {
    // Top up the carry buffer from the start of this chunk.
    const offset = self._lastTotal - self._lastNeed;
    const needed = Math.min(self._lastNeed, buf.length);
    for (let j = 0; j < needed; j++) {
      self._lastChar[offset + j] = buf[j];
    }
    self._lastNeed -= needed;
    i = needed;

    // Still incomplete: wait for more input.
    if (self._lastNeed > 0) return '';

    const stored = self._lastTotal;
    let j = 0;
    while (j + 1 < stored) {
      const code = self._lastChar[j] | (self._lastChar[j + 1] << 8);
      j += 2;

      if (code >= 0xD800 && code <= 0xDBFF) {
        if (j + 1 < stored) {
          // The next unit is also in the carry buffer; it alone decides pairing.
          const nextCode = self._lastChar[j] | (self._lastChar[j + 1] << 8);
          if (nextCode >= 0xDC00 && nextCode <= 0xDFFF) {
            result += String.fromCharCode(code, nextCode);
            j += 2;
            continue;
          }
          // Not a low surrogate: emit `code` unpaired below and let the next
          // loop iteration process the following unit.
        } else if (i + 1 < buf.length) {
          // High surrogate is the last carried unit; its partner may start `buf`.
          const nextCode = buf[i] | (buf[i + 1] << 8);
          if (nextCode >= 0xDC00 && nextCode <= 0xDFFF) {
            result += String.fromCharCode(code, nextCode);
            i += 2;
            continue;
          }
        } else if (i >= buf.length) {
          // Chunk exhausted: keep holding the high surrogate for the next call.
          self._lastChar[0] = self._lastChar[j - 2];
          self._lastChar[1] = self._lastChar[j - 1];
          self._lastNeed = 2;
          self._lastTotal = 4;
          return result;
        }
      }
      result += String.fromCharCode(code);
    }
    self._lastTotal = 0;
  }

  // Decode whole 16-bit units straight from the chunk.
  while (i + 1 < buf.length) {
    const code = buf[i] | (buf[i + 1] << 8);
    i += 2;

    if (code >= 0xD800 && code <= 0xDBFF) {
      if (i + 1 < buf.length) {
        const nextCode = buf[i] | (buf[i + 1] << 8);
        if (nextCode >= 0xDC00 && nextCode <= 0xDFFF) {
          result += String.fromCharCode(code, nextCode);
          i += 2;
          continue;
        }
      } else if (i < buf.length) {
        // Exactly one byte follows: emit the surrogate now and carry the odd byte.
        result += String.fromCharCode(code);
        self._lastChar[0] = buf[i];
        self._lastNeed = 1;
        self._lastTotal = 2;
        return result;
      } else {
        // High surrogate ends the chunk: hold it back for its partner.
        self._lastChar[0] = buf[i - 2];
        self._lastChar[1] = buf[i - 1];
        self._lastNeed = 2;
        self._lastTotal = 4;
        return result;
      }
    }
    result += String.fromCharCode(code);
  }

  // One odd byte left over: carry it into the next call.
  if (i < buf.length) {
    self._lastChar[0] = buf[i];
    self._lastNeed = 1;
    self._lastTotal = 2;
  }

  return result;
}
436
+
437
/**
 * Streaming base64 encode step.
 *
 * Base64 works on 3-byte groups, so up to two trailing bytes of a chunk are
 * held in `self._lastChar` (`_lastTotal = 3`, `_lastNeed = 3 - held`) until
 * the next call supplies the rest of the group.
 *
 * The carry buffer is filled at a fixed offset computed once, before
 * `_lastNeed` changes. Deriving the index from `_lastNeed` while decrementing
 * it in the same loop skips a slot when two bytes are filled at once, leaving
 * the third byte of the group unset.
 *
 * @param self Decoder state (`_lastChar`, `_lastNeed`, `_lastTotal`).
 * @param buf  Chunk of raw bytes.
 * @returns Base64 text for every complete 3-byte group available so far.
 */
function writeBase64(self: any, buf: Uint8Array): string {
  let start = 0;

  if (self._lastNeed > 0) {
    // Complete (or continue) the group carried over from the previous chunk.
    const offset = self._lastTotal - self._lastNeed;
    const needed = Math.min(self._lastNeed, buf.length);
    for (let i = 0; i < needed; i++) {
      self._lastChar[offset + i] = buf[i];
    }
    self._lastNeed -= needed;
    start = needed;
    if (self._lastNeed > 0) return '';
  }

  const remaining = buf.length - start;
  const complete = remaining - (remaining % 3);
  let result = '';

  // Emit the carried group now that it is whole.
  if (self._lastTotal > 0 && self._lastNeed === 0) {
    result += bytesToBase64(self._lastChar.subarray(0, self._lastTotal));
    self._lastTotal = 0;
  }

  // Emit every whole group contained in this chunk.
  if (complete > 0) {
    result += bytesToBase64(buf.subarray(start, start + complete));
  }

  // Hold back the 1–2 trailing bytes that do not form a full group yet.
  const leftover = remaining - complete;
  if (leftover > 0) {
    for (let i = 0; i < leftover; i++) {
      self._lastChar[i] = buf[start + complete + i];
    }
    self._lastNeed = 3 - leftover;
    self._lastTotal = 3;
  }

  return result;
}
474
+
475
/**
 * Decode bytes as 7-bit ASCII: the high bit of every byte is masked off
 * before it is turned into a character.
 *
 * @param buf Bytes to decode.
 * @returns One character per input byte.
 */
function decodeAscii(buf: Uint8Array): string {
  const chars = Array.from({ length: buf.length }, (_unused, at) =>
    String.fromCharCode(buf[at] & 0x7f),
  );
  return chars.join('');
}
482
+
483
/**
 * Decode bytes as Latin-1: each byte value is used directly as the code unit
 * of the corresponding character.
 *
 * @param buf Bytes to decode.
 * @returns One character per input byte.
 */
function decodeLatin1(buf: Uint8Array): string {
  const units = new Array<string>(buf.length);
  let at = buf.length;
  while (at-- > 0) {
    units[at] = String.fromCharCode(buf[at]);
  }
  return units.join('');
}
490
+
491
/**
 * Render bytes as lowercase hexadecimal, two digits per byte.
 *
 * @param buf Bytes to render.
 * @returns Hex string of length `2 * buf.length`.
 */
function decodeHex(buf: Uint8Array): string {
  const digits = '0123456789abcdef';
  let hex = '';
  for (let at = 0; at < buf.length; at += 1) {
    const value = buf[at];
    // High nibble first, then low nibble.
    hex += digits[value >> 4] + digits[value & 0x0f];
  }
  return hex;
}
498
+
499
+ export { StringDecoder };
500
+
501
+ export default { StringDecoder };
package/src/test.mts ADDED
@@ -0,0 +1,6 @@
1
+
2
+ import { run } from '@gjsify/unit';
3
+
4
+ import stringDecoderTestSuite from './index.spec.js';
5
+
6
+ // Pass the spec suite imported from './index.spec.js' to run() from '@gjsify/unit'.
+ // NOTE(review): presumably run() executes the suite and reports results — confirm against @gjsify/unit.
+ run({ stringDecoderTestSuite });
package/tsconfig.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "compilerOptions": {
3
+ "module": "ESNext",
4
+ "target": "ESNext",
5
+ "moduleResolution": "bundler",
6
+ "types": [
7
+ "node"
8
+ ],
9
+ "experimentalDecorators": true,
10
+ "emitDeclarationOnly": true,
11
+ "declaration": true,
12
+ "allowImportingTsExtensions": true,
13
+ "outDir": "lib",
14
+ "rootDir": "src",
15
+ "declarationDir": "lib/types",
16
+ "composite": true,
17
+ "skipLibCheck": true,
18
+ "allowJs": true,
19
+ "checkJs": false,
20
+ "strict": false
21
+ },
22
+ "include": [
23
+ "src/**/*.ts"
24
+ ],
25
+ "exclude": [
26
+ "src/test.ts",
27
+ "src/test.mts",
28
+ "src/**/*.spec.ts",
29
+ "src/**/*.spec.mts"
30
+ ]
31
+ }