functionalscript 0.3.13 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/text/utf8/module.f.d.ts +31 -2
- package/text/utf8/module.f.js +116 -28
- package/types/bit_vec/module.f.d.ts +2 -2
- package/types/bit_vec/module.f.js +17 -4
package/package.json
CHANGED
package/text/utf8/module.f.d.ts
CHANGED
|
@@ -1,9 +1,38 @@
|
|
|
1
1
|
import { type List, type Thunk } from '../../types/list/module.f.ts';
|
|
2
2
|
import type { Array1, Array2, Array3 } from '../../types/array/module.f.ts';
|
|
3
|
+
/**
|
|
4
|
+
* An unsigned 8-bit integer, represents a single byte.
|
|
5
|
+
*/
|
|
6
|
+
export type U8 = number;
|
|
7
|
+
/**
|
|
8
|
+
* A signed 32-bit integer.
|
|
9
|
+
*/
|
|
10
|
+
export type I32 = number;
|
|
11
|
+
/**
|
|
12
|
+
* Represents an unsigned 8-bit type - U8 or the end-of-file indicator.
|
|
13
|
+
* The U8 represents the byte itself, and null indicates that reading does not return anything else.
|
|
14
|
+
*/
|
|
3
15
|
export type ByteOrEof = U8 | null;
|
|
16
|
+
/**
|
|
17
|
+
* Represents the state of a UTF-8 decoding operation that contains at least one byte.
|
|
18
|
+
*/
|
|
4
19
|
export type Utf8NonEmptyState = Array1<number> | Array2<number> | Array3<number>;
|
|
20
|
+
/**
|
|
21
|
+
* Represents the state of a UTF-8 decoding operation, which can be either `null` (no state)
|
|
22
|
+
* or a non-empty state containing one or more bytes.
|
|
23
|
+
*/
|
|
5
24
|
export type Utf8State = null | Utf8NonEmptyState;
|
|
6
|
-
|
|
7
|
-
|
|
25
|
+
/**
|
|
26
|
+
* Maps a list of Unicode code points to a stream of UTF-8 bytes.
|
|
27
|
+
*
|
|
28
|
+
* @param input - A list of Unicode code points to be converted.
|
|
29
|
+
* @returns A thunk that lazily produces a sequence of UTF-8 bytes.
|
|
30
|
+
*/
|
|
8
31
|
export declare const fromCodePointList: (input: List<number>) => Thunk<U8>;
|
|
32
|
+
/**
|
|
33
|
+
* Converts a list of UTF-8 bytes into a list of Unicode code points.
|
|
34
|
+
*
|
|
35
|
+
* @param input - A list of UTF-8 bytes.
|
|
36
|
+
* @returns A list of Unicode code points or error codes.
|
|
37
|
+
*/
|
|
9
38
|
export declare const toCodePointList: (input: List<U8>) => List<I32>;
|
package/text/utf8/module.f.js
CHANGED
|
@@ -1,5 +1,18 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { flat, flatMap, stateScan } from "../../types/list/module.f.js";
|
|
2
|
+
/**
|
|
3
|
+
* Error mask constant used to represent invalid code points or encoding errors in UTF-8.
|
|
4
|
+
*/
|
|
2
5
|
const errorMask = 0b1000_0000_0000_0000_0000_0000_0000_0000;
|
|
6
|
+
/**
|
|
7
|
+
* Converts a Unicode code point to a sequence of UTF-8 bytes.
|
|
8
|
+
* @param input The Unicode code point to be converted. Valid range:
|
|
9
|
+
* - 0x0000 to 0x007F for 1-byte sequences.
|
|
10
|
+
* - 0x0080 to 0x07FF for 2-byte sequences.
|
|
11
|
+
* - 0x0800 to 0xFFFF for 3-byte sequences.
|
|
12
|
+
* - 0x10000 to 0x10FFFF for 4-byte sequences.
|
|
13
|
+
* @returns A readonly array of UTF-8 bytes representing the input code point.
|
|
14
|
+
* - Returns `[errorMask]` if the input does not match valid UTF-8 encoding rules.
|
|
15
|
+
*/
|
|
3
16
|
const codePointToUtf8 = (input) => {
|
|
4
17
|
if (input >= 0x0000 && input <= 0x007f) {
|
|
5
18
|
return [input & 0b01111_1111];
|
|
@@ -8,28 +21,58 @@ const codePointToUtf8 = (input) => {
|
|
|
8
21
|
return [input >> 6 | 0b1100_0000, input & 0b0011_1111 | 0b1000_0000];
|
|
9
22
|
}
|
|
10
23
|
if (input >= 0x0800 && input <= 0xffff) {
|
|
11
|
-
return [
|
|
24
|
+
return [
|
|
25
|
+
input >> 12 | 0b1110_0000,
|
|
26
|
+
input >> 6 & 0b0011_1111 | 0b1000_0000,
|
|
27
|
+
input & 0b0011_1111 | 0b1000_0000,
|
|
28
|
+
];
|
|
12
29
|
}
|
|
13
30
|
if (input >= 0x10000 && input <= 0x10ffff) {
|
|
14
|
-
return [
|
|
31
|
+
return [
|
|
32
|
+
input >> 18 | 0b1111_0000,
|
|
33
|
+
input >> 12 & 0b0011_1111 | 0b1000_0000,
|
|
34
|
+
input >> 6 & 0b0011_1111 | 0b1000_0000,
|
|
35
|
+
input & 0b0011_1111 | 0b1000_0000,
|
|
36
|
+
];
|
|
15
37
|
}
|
|
16
38
|
if ((input & errorMask) !== 0) {
|
|
17
39
|
if ((input & 0b1000_0000_0000_0000) !== 0) {
|
|
18
|
-
return [
|
|
40
|
+
return [
|
|
41
|
+
input >> 12 & 0b0000_0111 | 0b1111_0000,
|
|
42
|
+
input >> 6 & 0b0011_1111 | 0b1000_0000,
|
|
43
|
+
input & 0b0011_1111 | 0b1000_0000,
|
|
44
|
+
];
|
|
19
45
|
}
|
|
20
46
|
if ((input & 0b0000_0100_0000_0000) !== 0) {
|
|
21
|
-
return [
|
|
47
|
+
return [
|
|
48
|
+
input >> 6 & 0b0000_1111 | 0b1110_0000,
|
|
49
|
+
input & 0b0011_1111 | 0b1000_0000,
|
|
50
|
+
];
|
|
22
51
|
}
|
|
23
52
|
if ((input & 0b0000_0010_0000_0000) !== 0) {
|
|
24
|
-
return [
|
|
53
|
+
return [
|
|
54
|
+
input >> 6 & 0b0000_0111 | 0b1111_0000,
|
|
55
|
+
input & 0b0011_1111 | 0b1000_0000,
|
|
56
|
+
];
|
|
25
57
|
}
|
|
26
|
-
if ((input & 0b0000_0000_1000_0000) !== 0)
|
|
58
|
+
if ((input & 0b0000_0000_1000_0000) !== 0)
|
|
27
59
|
return [input & 0b1111_1111];
|
|
28
|
-
}
|
|
29
60
|
}
|
|
30
61
|
return [errorMask];
|
|
31
62
|
};
|
|
63
|
+
/**
|
|
64
|
+
* Maps a list of Unicode code points to a stream of UTF-8 bytes.
|
|
65
|
+
*
|
|
66
|
+
* @param input - A list of Unicode code points to be converted.
|
|
67
|
+
* @returns A thunk that lazily produces a sequence of UTF-8 bytes.
|
|
68
|
+
*/
|
|
32
69
|
export const fromCodePointList = flatMap(codePointToUtf8);
|
|
70
|
+
/**
|
|
71
|
+
* Converts a non-empty UTF-8 decoding state to an error code.
|
|
72
|
+
*
|
|
73
|
+
* @param state - A non-empty UTF-8 decoding state.
|
|
74
|
+
* @returns An I32 error code derived from the invalid UTF-8 state.
|
|
75
|
+
*/
|
|
33
76
|
const utf8StateToError = (state) => {
|
|
34
77
|
let x;
|
|
35
78
|
switch (state.length) {
|
|
@@ -41,12 +84,14 @@ const utf8StateToError = (state) => {
|
|
|
41
84
|
const [s0, s1] = state;
|
|
42
85
|
x = s0 < 0b1111_0000
|
|
43
86
|
? ((s0 & 0b0000_1111) << 6) + (s1 & 0b0011_1111) + 0b0000_0100_0000_0000
|
|
44
|
-
: ((s0 & 0b0000_0111) << 6) + (s1 & 0b0011_1111) +
|
|
87
|
+
: ((s0 & 0b0000_0111) << 6) + (s1 & 0b0011_1111) +
|
|
88
|
+
0b0000_0010_0000_0000;
|
|
45
89
|
break;
|
|
46
90
|
}
|
|
47
91
|
case 3: {
|
|
48
92
|
const [s0, s1, s2] = state;
|
|
49
|
-
x = ((s0 & 0b0000_0111) << 12) + ((s1 & 0b0011_1111) << 6) +
|
|
93
|
+
x = ((s0 & 0b0000_0111) << 12) + ((s1 & 0b0011_1111) << 6) +
|
|
94
|
+
(s2 & 0b0011_1111) + 0b1000_0000_0000_0000;
|
|
50
95
|
break;
|
|
51
96
|
}
|
|
52
97
|
default:
|
|
@@ -54,17 +99,24 @@ const utf8StateToError = (state) => {
|
|
|
54
99
|
}
|
|
55
100
|
return x | errorMask;
|
|
56
101
|
};
|
|
57
|
-
|
|
102
|
+
/**
|
|
103
|
+
* Decodes a byte into a Unicode code point, using a given UTF-8 state.
|
|
104
|
+
*
|
|
105
|
+
* @param state - The current UTF-8 decoding state.
|
|
106
|
+
* @param byte - A single byte to decode.
|
|
107
|
+
* @returns A tuple containing:
|
|
108
|
+
* - A list of decoded Unicode code points or error codes.
|
|
109
|
+
* - The updated UTF-8 state.
|
|
110
|
+
*/
|
|
111
|
+
const utf8ByteToCodePointOp = (state) => (byte) => {
|
|
58
112
|
if (byte < 0x00 || byte > 0xff) {
|
|
59
113
|
return [[errorMask], state];
|
|
60
114
|
}
|
|
61
115
|
if (state === null) {
|
|
62
|
-
if (byte < 0b1000_0000)
|
|
116
|
+
if (byte < 0b1000_0000)
|
|
63
117
|
return [[byte], null];
|
|
64
|
-
|
|
65
|
-
if (byte >= 0b1100_0010 && byte <= 0b1111_0100) {
|
|
118
|
+
if (byte >= 0b1100_0010 && byte <= 0b1111_0100)
|
|
66
119
|
return [[], [byte]];
|
|
67
|
-
}
|
|
68
120
|
return [[byte | errorMask], null];
|
|
69
121
|
}
|
|
70
122
|
if (byte >= 0b1000_0000 && byte < 0b1100_0000) {
|
|
@@ -74,37 +126,73 @@ const utf8ByteToCodePointOp = state => byte => {
|
|
|
74
126
|
if (s0 < 0b1110_0000) {
|
|
75
127
|
return [[((s0 & 0b0001_1111) << 6) + (byte & 0b0011_1111)], null];
|
|
76
128
|
}
|
|
77
|
-
if (s0 < 0b1111_1000)
|
|
129
|
+
if (s0 < 0b1111_1000)
|
|
78
130
|
return [[], [s0, byte]];
|
|
79
|
-
}
|
|
80
131
|
break;
|
|
81
132
|
}
|
|
82
133
|
case 2: {
|
|
83
134
|
const [s0, s1] = state;
|
|
84
135
|
if (s0 < 0b1111_0000) {
|
|
85
|
-
return [[
|
|
136
|
+
return [[
|
|
137
|
+
((s0 & 0b0000_1111) << 12) + ((s1 & 0b0011_1111) << 6) +
|
|
138
|
+
(byte & 0b0011_1111),
|
|
139
|
+
], null];
|
|
86
140
|
}
|
|
87
|
-
if (s0 < 0b1111_1000)
|
|
141
|
+
if (s0 < 0b1111_1000)
|
|
88
142
|
return [[], [s0, s1, byte]];
|
|
89
|
-
}
|
|
90
143
|
break;
|
|
91
144
|
}
|
|
92
145
|
case 3: {
|
|
93
146
|
const [s0, s1, s2] = state;
|
|
94
|
-
return [[
|
|
147
|
+
return [[
|
|
148
|
+
((s0 & 0b0000_0111) << 18) + ((s1 & 0b0011_1111) << 12) +
|
|
149
|
+
((s2 & 0b0011_1111) << 6) + (byte & 0b0011_1111),
|
|
150
|
+
], null];
|
|
95
151
|
}
|
|
96
152
|
}
|
|
97
153
|
}
|
|
98
154
|
const error = utf8StateToError(state);
|
|
99
|
-
if (byte < 0b1000_0000)
|
|
155
|
+
if (byte < 0b1000_0000)
|
|
100
156
|
return [[error, byte], null];
|
|
101
|
-
|
|
102
|
-
if (byte >= 0b1100_0010 && byte <= 0b1111_0100) {
|
|
157
|
+
if (byte >= 0b1100_0010 && byte <= 0b1111_0100)
|
|
103
158
|
return [[error], [byte]];
|
|
104
|
-
}
|
|
105
159
|
return [[error, byte | errorMask], null];
|
|
106
160
|
};
|
|
107
|
-
|
|
108
|
-
|
|
161
|
+
/**
|
|
162
|
+
* Handles the end-of-file (EOF) case for UTF-8 decoding.
|
|
163
|
+
*
|
|
164
|
+
* @param state - The current UTF-8 decoding state.
|
|
165
|
+
* @returns A tuple containing:
|
|
166
|
+
* - A list of decoded Unicode code points or error codes.
|
|
167
|
+
* - The reset UTF-8 state (`null`).
|
|
168
|
+
*/
|
|
169
|
+
const utf8EofToCodePointOp = (state) => [
|
|
170
|
+
state === null ? null : [utf8StateToError(state)],
|
|
171
|
+
null,
|
|
172
|
+
];
|
|
173
|
+
/**
|
|
174
|
+
* Combines UTF-8 byte and EOF handling into a single decoding operation.
|
|
175
|
+
*
|
|
176
|
+
* @param state - The current UTF-8 decoding state.
|
|
177
|
+
* @param input - The next byte or EOF indicator.
|
|
178
|
+
* @returns A tuple containing:
|
|
179
|
+
* - A list of decoded Unicode code points or error codes.
|
|
180
|
+
* - The updated UTF-8 state.
|
|
181
|
+
*/
|
|
182
|
+
const utf8ByteOrEofToCodePointOp = (state) => (input) => input === null ? utf8EofToCodePointOp(state) : utf8ByteToCodePointOp(state)(input);
|
|
183
|
+
/**
|
|
184
|
+
* A constant representing the end-of-file (EOF) marker for UTF-8 decoding.
|
|
185
|
+
*
|
|
186
|
+
* @remarks
|
|
187
|
+
* This is used as a sentinel value in decoding operations to signify the
|
|
188
|
+
* termination of input. The list contains a single `null` value, which
|
|
189
|
+
* represents the EOF condition.
|
|
190
|
+
*/
|
|
109
191
|
const eofList = [null];
|
|
110
|
-
|
|
192
|
+
/**
|
|
193
|
+
* Converts a list of UTF-8 bytes into a list of Unicode code points.
|
|
194
|
+
*
|
|
195
|
+
* @param input - A list of UTF-8 bytes.
|
|
196
|
+
* @returns A list of Unicode code points or error codes.
|
|
197
|
+
*/
|
|
198
|
+
export const toCodePointList = (input) => flat(stateScan(utf8ByteOrEofToCodePointOp)(null)(flat([input, eofList])));
|
|
@@ -18,8 +18,8 @@ export declare const length: (v: bigint) => bigint;
|
|
|
18
18
|
*
|
|
19
19
|
* ```js
|
|
20
20
|
* const vec4 = vec(4n)
|
|
21
|
-
* const v0 = vec4(5n) // 0x15n
|
|
22
|
-
* const v1 = vec4(0x5FEn) // 0x1En
|
|
21
|
+
* const v0 = vec4(5n) // 0x15n = 0b1_0101
|
|
22
|
+
* const v1 = vec4(0x5FEn) // 0x1En = 0b1_1110
|
|
23
23
|
* ```
|
|
24
24
|
*/
|
|
25
25
|
export declare const vec: (len: bigint) => (ui: bigint) => Vec;
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MSb is most-significant bit first.
|
|
3
|
+
* ```
|
|
4
|
+
* - byte: 0x53 = 0b0101_0011
|
|
5
|
+
* - 0123_4567
|
|
6
|
+
* ```
|
|
7
|
+
* LSb is least-significant bit first.
|
|
8
|
+
* ```
|
|
9
|
+
* - byte: 0x53 = 0b0101_0011
|
|
10
|
+
* - 7654_3210
|
|
11
|
+
* ```
|
|
12
|
+
* @module
|
|
13
|
+
*/
|
|
1
14
|
import { log2, mask } from "../bigint/module.f.js";
|
|
2
15
|
import { flip } from "../function/module.f.js";
|
|
3
16
|
import { fold } from "../list/module.f.js";
|
|
@@ -16,8 +29,8 @@ export const length = log2;
|
|
|
16
29
|
*
|
|
17
30
|
* ```js
|
|
18
31
|
* const vec4 = vec(4n)
|
|
19
|
-
* const v0 = vec4(5n) // 0x15n
|
|
20
|
-
* const v1 = vec4(0x5FEn) // 0x1En
|
|
32
|
+
* const v0 = vec4(5n) // 0x15n = 0b1_0101
|
|
33
|
+
* const v1 = vec4(0x5FEn) // 0x1En = 0b1_1110
|
|
21
34
|
* ```
|
|
22
35
|
*/
|
|
23
36
|
export const vec = (len) => {
|
|
@@ -73,7 +86,7 @@ export const lsb = {
|
|
|
73
86
|
const aLen = length(a);
|
|
74
87
|
const m = mask(aLen);
|
|
75
88
|
return b => (b << aLen) | (a & m);
|
|
76
|
-
}
|
|
89
|
+
},
|
|
77
90
|
};
|
|
78
91
|
/**
|
|
79
92
|
* Implements operations for handling vectors in a most-significant-bit (MSb) first order.
|
|
@@ -95,7 +108,7 @@ export const msb = {
|
|
|
95
108
|
return [(v >> d) & m, vec(d)(v)];
|
|
96
109
|
};
|
|
97
110
|
},
|
|
98
|
-
concat: flip(lsb.concat)
|
|
111
|
+
concat: flip(lsb.concat),
|
|
99
112
|
};
|
|
100
113
|
const appendU8 = ({ concat }) => (u8) => (a) => concat(a)(vec8(BigInt(u8)));
|
|
101
114
|
/**
|