@qevm/strings 5.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src.ts/utf8.ts ADDED
@@ -0,0 +1,295 @@
1
+ "use strict";
2
+
3
+ import { arrayify, BytesLike } from "@qevm/bytes";
4
+
5
+ import { Logger } from "@ethersproject/logger";
6
+ import { version } from "./_version";
7
+ const logger = new Logger(version); // Module-level logger, constructed with the version imported from "./_version"; used below for argument errors and the normalize check.
8
+
9
+ ///////////////////////////////
10
+
11
/**
 * Unicode normalization forms that callers may request before a string is
 * converted to UTF-8 bytes.
 *
 * Every member other than `current` has a value equal to its own name, and
 * that value is what gets handed to `String.prototype.normalize`.
 */
export enum UnicodeNormalizationForm {
    /** The empty string: keep the string exactly as it is (no normalization). */
    current = "",

    /** Canonical composition. */
    NFC = "NFC",

    /** Canonical decomposition. */
    NFD = "NFD",

    /** Compatibility composition. */
    NFKC = "NFKC",

    /** Compatibility decomposition. */
    NFKD = "NFKD"
}
18
+
19
/**
 * The kinds of problem the UTF-8 decoder can report to an error callback.
 *
 * The string value of each member is the human-readable description that is
 * embedded in error messages. Each member documents what the `offset` (and,
 * where present, `badCodepoint`) argument passed to the callback refers to.
 */
export enum Utf8ErrorReason {
    /**
     * A continuation byte was present where there was nothing to continue.
     * - offset = the index the codepoint began in
     */
    UNEXPECTED_CONTINUE = "unexpected continuation byte",

    /**
     * An invalid (non-continuation) byte to start a UTF-8 codepoint was found.
     * - offset = the index the codepoint began in
     */
    BAD_PREFIX = "bad codepoint prefix",

    /**
     * The string is too short to process the expected codepoint.
     * - offset = the index the codepoint began in
     */
    OVERRUN = "string overrun",

    /**
     * A continuation byte was expected but not found.
     * - offset = the index the continuation byte was expected at
     */
    MISSING_CONTINUE = "missing continuation byte",

    /**
     * The computed code point is outside the range for UTF-8.
     * - offset = start of this codepoint
     * - badCodepoint = the computed codepoint; outside the UTF-8 range
     */
    OUT_OF_RANGE = "out of UTF-8 range",

    /**
     * UTF-8 strings may not contain UTF-16 surrogate pairs.
     * - offset = start of this codepoint
     * - badCodepoint = the computed codepoint; inside the UTF-16 surrogate range
     */
    UTF16_SURROGATE = "UTF-16 surrogate",

    /**
     * The sequence is an overlong representation.
     * - offset = start of this codepoint
     * - badCodepoint = the computed codepoint; already bounds checked
     */
    OVERLONG = "overlong representation",
}
51
+
52
+
53
/**
 * Signature of the callback the UTF-8 decoder invokes when it meets invalid
 * input.
 *
 * @param reason - which kind of problem was detected
 * @param offset - byte index the problem refers to (its exact meaning depends
 *     on `reason`; see {@link Utf8ErrorReason})
 * @param bytes - the byte sequence being decoded
 * @param output - the code points decoded so far; the callback may append to it
 * @param badCodepoint - the computed code point, for the reasons that supply one
 * @returns how many additional bytes the decoder should skip; the decoder adds
 *     this value to its read position
 */
export type Utf8ErrorFunc = (
    reason: Utf8ErrorReason,
    offset: number,
    bytes: ArrayLike<number>,
    output: Array<number>,
    badCodepoint?: number
) => number;
54
+
55
/**
 * Error strategy that rejects invalid input outright.
 *
 * Builds a message from the offset and reason and hands it to
 * `logger.throwArgumentError` together with the offending bytes. That helper
 * is defined outside this file and is presumed to throw (TODO confirm), so in
 * practice no skip count is returned to the decoder.
 *
 * `output` and `badCodepoint` are accepted only to satisfy the callback
 * signature; they are not used.
 */
function errorFunc(reason: Utf8ErrorReason, offset: number, bytes: ArrayLike<number>, output: Array<number>, badCodepoint?: number): number {
    const message = `invalid codepoint at offset ${ offset }; ${ reason }`;
    return logger.throwArgumentError(message, "bytes", bytes);
}
58
+
59
/**
 * Error strategy that silently drops invalid input.
 *
 * Nothing is appended to `output`; the return value only tells the decoder
 * how many further bytes to skip.
 *
 * @param reason - the problem that was detected
 * @param offset - byte index the problem refers to
 * @param bytes - the byte sequence being decoded
 * @param output - unused by this strategy
 * @param badCodepoint - unused by this strategy
 * @returns the number of additional bytes to skip
 */
function ignoreFunc(reason: Utf8ErrorReason, offset: number, bytes: ArrayLike<number>, output: Array<number>, badCodepoint?: number): number {
    switch (reason) {
        // An invalid prefix (including a stray continuation byte): also skip
        // every continuation byte (10xx xxxx) that immediately follows it.
        case Utf8ErrorReason.BAD_PREFIX:
        case Utf8ErrorReason.UNEXPECTED_CONTINUE: {
            const first = offset + 1;
            let cursor = first;
            while (cursor < bytes.length && (bytes[cursor] >> 6) === 0x02) {
                cursor++;
            }
            return cursor - first;
        }

        // The sequence runs past the end of the data, so jump to the end
        // (the first byte was already read and is therefore not counted).
        case Utf8ErrorReason.OVERRUN:
            return bytes.length - offset - 1;

        // Every other reason: nothing extra to skip.
        default:
            return 0;
    }
}
80
+
81
/**
 * Error strategy that substitutes the replacement character U+FFFD for
 * invalid input.
 *
 * Overlong representations are the exception: they decode to an otherwise
 * "valid" code point (just not in its shortest form), so that code point is
 * kept as-is.
 *
 * @param reason - the problem that was detected
 * @param offset - byte index the problem refers to
 * @param bytes - the byte sequence being decoded
 * @param output - code points decoded so far; one entry is appended here
 * @param badCodepoint - the computed code point (used for overlong sequences)
 * @returns the number of additional bytes to skip
 */
function replaceFunc(reason: Utf8ErrorReason, offset: number, bytes: ArrayLike<number>, output: Array<number>, badCodepoint?: number): number {
    if (reason !== Utf8ErrorReason.OVERLONG) {
        // Emit the replacement character, then skip bytes exactly as the
        // "ignore" strategy would.
        output.push(0xfffd);
        return ignoreFunc(reason, offset, bytes, output, badCodepoint);
    }

    // Overlong: keep the decoded code point; nothing further to skip.
    output.push(badCodepoint);
    return 0;
}
95
+
96
/**
 * Common error handling strategies, keyed by name:
 *  - `error`: report the invalid input through the logger
 *  - `ignore`: drop the invalid bytes
 *  - `replace`: substitute U+FFFD (overlong sequences keep their code point)
 *
 * The object is frozen, so the strategies cannot be reassigned.
 */
export const Utf8ErrorFuncs: { [ name: string ]: Utf8ErrorFunc } = Object.freeze({
    error: errorFunc,
    ignore: ignoreFunc,
    replace: replaceFunc
});
102
+
103
// http://stackoverflow.com/questions/13356493/decode-utf-8-with-javascript#13691499
/**
 * Decodes UTF-8 bytes into an array of Unicode code points.
 *
 * Invalid input is never dropped silently by this function itself: each
 * problem is reported to `onError`, which decides what (if anything) to append
 * to the output and how many further bytes to skip.
 *
 * @param bytes - the data to decode; converted with `arrayify` first
 * @param onError - error strategy; when omitted (or null) `Utf8ErrorFuncs.error`
 *     is used
 * @returns the decoded code points, in order
 */
function getUtf8CodePoints(bytes: BytesLike, onError?: Utf8ErrorFunc): Array<number> {
    const handler: Utf8ErrorFunc = (onError == null) ? Utf8ErrorFuncs.error : onError;
    const data = arrayify(bytes);

    const codePoints: Array<number> = [];
    let pos = 0;

    while (pos < data.length) {
        // Index of the byte that starts this code point
        const start = pos;
        const lead = data[pos++];

        // 0xxx xxxx: single byte
        if ((lead >> 7) === 0) {
            codePoints.push(lead);
            continue;
        }

        // Multibyte: number of continuation bytes that must follow, and the
        // largest value that a shorter encoding could already represent.
        let extraLength: number;
        let overlongMask: number;

        if ((lead & 0xe0) === 0xc0) {
            // 110x xxxx 10xx xxxx
            extraLength = 1;
            overlongMask = 0x7f;

        } else if ((lead & 0xf0) === 0xe0) {
            // 1110 xxxx 10xx xxxx 10xx xxxx
            extraLength = 2;
            overlongMask = 0x7ff;

        } else if ((lead & 0xf8) === 0xf0) {
            // 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
            extraLength = 3;
            overlongMask = 0xffff;

        } else {
            // Either a stray continuation byte (10xx xxxx) or a prefix that
            // is not valid at all.
            const reason = ((lead & 0xc0) === 0x80)
                ? Utf8ErrorReason.UNEXPECTED_CONTINUE
                : Utf8ErrorReason.BAD_PREFIX;
            pos += handler(reason, start, data, codePoints);
            continue;
        }

        // Do we have enough bytes in our data?
        if (start + extraLength >= data.length) {
            pos += handler(Utf8ErrorReason.OVERRUN, start, data, codePoints);
            continue;
        }

        // Strip the length prefix from the lead byte, keeping its payload bits
        let value = lead & ((1 << (8 - extraLength - 1)) - 1);

        // Fold in the 6 payload bits of each continuation byte
        let complete = true;
        for (let k = 0; k < extraLength; k++) {
            const next = data[pos];

            // Not a continuation byte: report it at its own index and leave
            // it unread so the outer loop processes it as a fresh lead byte.
            if ((next & 0xc0) !== 0x80) {
                pos += handler(Utf8ErrorReason.MISSING_CONTINUE, pos, data, codePoints);
                complete = false;
                break;
            }

            value = (value << 6) | (next & 0x3f);
            pos++;
        }

        if (!complete) { continue; }

        // Maximum code point
        if (value > 0x10ffff) {
            pos += handler(Utf8ErrorReason.OUT_OF_RANGE, start, data, codePoints, value);
            continue;
        }

        // Reserved for UTF-16 surrogate halves
        if (value >= 0xd800 && value <= 0xdfff) {
            pos += handler(Utf8ErrorReason.UTF16_SURROGATE, start, data, codePoints, value);
            continue;
        }

        // Overlong sequence (more bytes than needed for this value)
        if (value <= overlongMask) {
            pos += handler(Utf8ErrorReason.OVERLONG, start, data, codePoints, value);
            continue;
        }

        codePoints.push(value);
    }

    return codePoints;
}
200
+
201
// http://stackoverflow.com/questions/18729405/how-to-convert-utf8-string-to-byte-array
/**
 * Encodes a JavaScript (UTF-16) string as UTF-8 bytes.
 *
 * @param str - the string to encode
 * @param form - normalization form applied before encoding; the default,
 *     `UnicodeNormalizationForm.current`, leaves the string untouched
 * @returns the UTF-8 encoding of `str`, as produced by `arrayify`
 * @throws Error ("invalid utf-8 string") if `str` contains an unpaired
 *     surrogate: a high surrogate that is not followed by a low surrogate, or
 *     a low surrogate that is not preceded by a high surrogate. Surrogate code
 *     points have no valid UTF-8 encoding, so neither case can be encoded.
 */
export function toUtf8Bytes(str: string, form: UnicodeNormalizationForm = UnicodeNormalizationForm.current): Uint8Array {

    if (form !== UnicodeNormalizationForm.current) {
        logger.checkNormalize();
        str = str.normalize(form);
    }

    const result: Array<number> = [];
    for (let i = 0; i < str.length; i++) {
        const c = str.charCodeAt(i);

        if (c < 0x80) {
            // 1 byte: 0xxx xxxx
            result.push(c);

        } else if (c < 0x800) {
            // 2 bytes: 110x xxxx 10xx xxxx
            result.push((c >> 6) | 0xc0);
            result.push((c & 0x3f) | 0x80);

        } else if ((c & 0xf800) === 0xd800) {
            // Any surrogate code unit (0xD800-0xDFFF). The only valid shape is
            // a high surrogate immediately followed by a low surrogate.
            i++;
            const c2 = str.charCodeAt(i);

            const isHigh = ((c & 0xfc00) === 0xd800);
            if (!isHigh || i >= str.length || (c2 & 0xfc00) !== 0xdc00) {
                throw new Error("invalid utf-8 string");
            }

            // Surrogate pair -> 4 bytes: 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
            const pair = 0x10000 + ((c & 0x03ff) << 10) + (c2 & 0x03ff);
            result.push((pair >> 18) | 0xf0);
            result.push(((pair >> 12) & 0x3f) | 0x80);
            result.push(((pair >> 6) & 0x3f) | 0x80);
            result.push((pair & 0x3f) | 0x80);

        } else {
            // 3 bytes: 1110 xxxx 10xx xxxx 10xx xxxx
            result.push((c >> 12) | 0xe0);
            result.push(((c >> 6) & 0x3f) | 0x80);
            result.push((c & 0x3f) | 0x80);
        }
    }

    return arrayify(result);
}
244
+
245
/**
 * Formats a value as a `\uXXXX` escape: a backslash, the letter `u`, and the
 * last four lowercase hexadecimal digits of `value`, zero-padded on the left.
 */
function escapeChar(value: number) {
    const padded = "0000" + value.toString(16);
    return "\\u" + padded.slice(-4);
}
249
+
250
/**
 * Decodes UTF-8 bytes and renders the result as a double-quoted string
 * literal that contains only printable ASCII.
 *
 * - backspace, tab, newline, carriage return, `"` and `\` use their short
 *   backslash escapes
 * - other code points from 32 up to (not including) 127 are emitted verbatim
 * - everything else up to 0xFFFF becomes a single `\uXXXX` escape
 * - code points above 0xFFFF become two `\uXXXX` escapes (a surrogate pair)
 *
 * @param bytes - the UTF-8 data to decode
 * @param onError - optional error strategy forwarded to the decoder
 * @returns the escaped text wrapped in double quotes
 */
export function _toEscapedUtf8String(bytes: BytesLike, onError?: Utf8ErrorFunc): string {
    // Code points that have a dedicated short escape
    const shortEscapes: { [ codePoint: number ]: string } = {
        8: "\\b",
        9: "\\t",
        10: "\\n",
        13: "\\r",
        34: "\\\"",
        92: "\\\\"
    };

    const pieces = getUtf8CodePoints(bytes, onError).map((codePoint) => {
        const short = shortEscapes[codePoint];
        if (short !== undefined) { return short; }

        // Printable ASCII is passed through unchanged
        if (codePoint >= 32 && codePoint < 127) {
            return String.fromCharCode(codePoint);
        }

        if (codePoint <= 0xffff) {
            return escapeChar(codePoint);
        }

        // Beyond the BMP: split into a high/low surrogate pair
        const offset = codePoint - 0x10000;
        const high = ((offset >> 10) & 0x3ff) + 0xd800;
        const low = (offset & 0x3ff) + 0xdc00;
        return escapeChar(high) + escapeChar(low);
    });

    return '"' + pieces.join("") + '"';
}
275
+
276
/**
 * Builds a JavaScript string from an array of Unicode code points.
 *
 * Code points up to 0xFFFF map to a single UTF-16 code unit; larger ones are
 * emitted as a high/low surrogate pair.
 *
 * @param codePoints - the code points, in order
 * @returns the resulting string
 */
export function _toUtf8String(codePoints: Array<number>): string {
    const chunks: Array<string> = [];

    codePoints.forEach((codePoint) => {
        if (codePoint <= 0xffff) {
            chunks.push(String.fromCharCode(codePoint));
            return;
        }

        // Beyond the BMP: encode as a surrogate pair
        const offset = codePoint - 0x10000;
        const high = ((offset >> 10) & 0x3ff) + 0xd800;
        const low = (offset & 0x3ff) + 0xdc00;
        chunks.push(String.fromCharCode(high, low));
    });

    return chunks.join("");
}
288
+
289
/**
 * Decodes UTF-8 bytes into a JavaScript string.
 *
 * @param bytes - the UTF-8 data to decode
 * @param onError - optional error strategy forwarded to the decoder
 * @returns the decoded string
 */
export function toUtf8String(bytes: BytesLike, onError?: Utf8ErrorFunc): string {
    const codePoints = getUtf8CodePoints(bytes, onError);
    return _toUtf8String(codePoints);
}
292
+
293
/**
 * Converts a string into its Unicode code points by encoding it to UTF-8
 * bytes and decoding those bytes again with the default error strategy.
 *
 * @param str - the string to convert
 * @param form - normalization form applied before encoding; defaults to
 *     `UnicodeNormalizationForm.current`
 * @returns the code points of the (optionally normalized) string
 */
export function toUtf8CodePoints(str: string, form: UnicodeNormalizationForm = UnicodeNormalizationForm.current): Array<number> {
    const encoded = toUtf8Bytes(str, form);
    return getUtf8CodePoints(encoded);
}