functionalscript 0.0.373 → 0.0.376
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/text/encoding/README.md +90 -0
- package/text/encoding/module.f.cjs +104 -6
- package/text/encoding/test.f.cjs +80 -0
package/package.json
CHANGED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# UNICODE
|
|
2
|
+
|
|
3
|
+
## UTF-8
|
|
4
|
+
|
|
5
|
+
Requirement: no loss for UTF8 => codepoint => UTF8
|
|
6
|
+
|
|
7
|
+
|utf8 |codepoint |size |
|
|
8
|
+
|---------|---------------------------------------|---------|
|
|
9
|
+
|[a] |0xxx_xxxx |7 bit |
|
|
10
|
+
|[b,a] |110x_xxxx 10xx_xxxx |11 bit |
|
|
11
|
+
|[c,b,a] |1110_xxxx 10xx_xxxx 10xx_xxxx |16 bit |
|
|
12
|
+
|[d,c,b,a]|1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx|21 bit |
|
|
13
|
+
|
|
14
|
+
|utf8 error|codepoint |size |
|
|
15
|
+
|----------|-----------------------------|------|
|
|
16
|
+
|[e] |10xx_xxxx |6 bit |
|
|
17
|
+
|[e] |1111_1xxx |3 bit |
|
|
18
|
+
|[b,] |110x_xxxx |5 bit |
|
|
19
|
+
|[c,] |1110_xxxx |4 bit |
|
|
20
|
+
|[c,b,] |1110_xxxx 10xx_xxxx |10 bit|
|
|
21
|
+
|[d,] |1111_0xxx |3 bit |
|
|
22
|
+
|[d,c,] |1111_0xxx 10xx_xxxx |9 bit |
|
|
23
|
+
|[d,c,b] |1111_0xxx 10xx_xxxx 10xx_xxxx|15 bit|
|
|
24
|
+
|
|
25
|
+
Total error states:
|
|
26
|
+
|
|
27
|
+
- 2^6 + 2^3 + 2^5 + 2^4 + 2^10 + 2^3 + 2^9 + 2^15
|
|
28
|
+
- 2^4 + 2^6 + 2^5 + 2^4 + 2^10 + 2^9 + 2^15
|
|
29
|
+
- 2^5 + 2^6 + 2^5 + 2^10 + 2^9 + 2^15
|
|
30
|
+
- 2^6 + 2^6 + 2^10 + 2^9 + 2^15
|
|
31
|
+
- 2^7 + 2^9 + 2^10 + 2^15
|
|
32
|
+
- < 2^16
|
|
33
|
+
|
|
34
|
+
|utf8 error|codepoint |size |map |
|
|
35
|
+
|----------|-----------------------------|------|-------------------|
|
|
36
|
+
|[e] |1111_1xxx | 3 bit| |
|
|
37
|
+
|[d,] |1111_0xxx | 3 bit| |
|
|
38
|
+
|[c,] |1110_xxxx | 4 bit| |
|
|
39
|
+
|[b,] |110x_xxxx | 5 bit| |
|
|
40
|
+
|[e] |10xx_xxxx | 6 bit|1111_1111 1xxx_xxxx|
|
|
41
|
+
|[d,c,] |1111_0xxx 10xx_xxxx | 9 bit|1111_0xxx 10xx_xxxx|
|
|
42
|
+
|[c,b,] |1110_xxxx 10xx_xxxx |10 bit|1110_xxxx 10xx_xxxx|
|
|
43
|
+
|[d,c,b] |1111_0xxx 10xx_xxxx 10xx_xxxx|15 bit|0xxx_xxxx xxxx_xxxx|
|
|
44
|
+
|
|
45
|
+
```js
|
|
46
|
+
/** @type {(input: List<u8|undefined>) => List<i32>} */
|
|
47
|
+
const utf8ToCodePoint
|
|
48
|
+
|
|
49
|
+
/** @type {(input: List<i32>) => List<u8>} */
|
|
50
|
+
const codePointToUtf8
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## UTF-16
|
|
54
|
+
|
|
55
|
+
Requirement: no loss for UTF16 => codepoint => UTF16
|
|
56
|
+
|
|
57
|
+
0xD800..0xDFFF
|
|
58
|
+
0b_1101_1000_0000_0000
|
|
59
|
+
0b_1101_1111_1111_1111
|
|
60
|
+
|
|
61
|
+
0b_1101_1xxx_xxxx_xxxx : 11 bits
|
|
62
|
+
|
|
63
|
+
- first : 0xD800: 0b_1101_10xx_xxxx_xxxx : 10 bit
|
|
64
|
+
- second: 0xDC00: 0b_1101_11xx_xxxx_xxxx : 10 bit
|
|
65
|
+
|
|
66
|
+
|utf16 |codepoint |size |
|
|
67
|
+
|---------|---------------------------------------|------|
|
|
68
|
+
|[a] |xxxx_xxxx_xxxx_xxxx |16 bit|
|
|
69
|
+
|[b,a] |1101_10xx_xxxx_xxxx 1101_11xx_xxxx_xxxx|20 bit|
|
|
70
|
+
|
|
71
|
+
|utf16 error|codepoint |size |
|
|
72
|
+
|-----------|-------------------|------|
|
|
73
|
+
|[e] |1101_11xx_xxxx_xxxx|10 bit|
|
|
74
|
+
|[b,] |1101_10xx_xxxx_xxxx|10 bit|
|
|
75
|
+
|
|
76
|
+
Total error states: 11 bit
|
|
77
|
+
|
|
78
|
+
```js
|
|
79
|
+
/** @type {(input: List<u16|undefined>) => List<i32>} */
|
|
80
|
+
const utf16ToCodePoint
|
|
81
|
+
|
|
82
|
+
/** @type {(input: List<i32>) => List<u16>} */
|
|
83
|
+
const codePointToUtf16
|
|
84
|
+
|
|
85
|
+
/** @type {(input: string) => List<u16>} */
|
|
86
|
+
const stringToUtf16
|
|
87
|
+
|
|
88
|
+
/** @type {(input: List<u16>) => string} */
|
|
89
|
+
const utf16ToString
|
|
90
|
+
```
|
|
@@ -1,10 +1,29 @@
|
|
|
1
1
|
const result = require('../../types/result/module.f.cjs')
|
|
2
2
|
const list = require('../../types/list/module.f.cjs')
|
|
3
|
+
const operator = require('../../types/function/operator/module.f.cjs')
|
|
4
|
+
const array = require('../../types/array/module.f.cjs')
|
|
5
|
+
const { contains } = require('../../types/range/module.f.cjs')
|
|
3
6
|
const { ok, error } = result
|
|
4
7
|
|
|
5
|
-
/** @typedef {result.Result<number,number>}
|
|
8
|
+
/** @typedef {result.Result<number,number>} ByteResult */
|
|
6
9
|
|
|
7
|
-
/** @
|
|
10
|
+
/** @typedef {result.Result<number,readonly number[]>} CodePointResult */
|
|
11
|
+
|
|
12
|
+
/** @typedef {number|undefined} ByteOrEof */
|
|
13
|
+
|
|
14
|
+
/** @typedef {undefined|array.Array1<number>|array.Array2<number>|array.Array3<number>} Utf8State */
|
|
15
|
+
|
|
16
|
+
/** @typedef {undefined|array.Array1<number>|array.Array2<number>|array.Array3<number>} Utf16State */
|
|
17
|
+
|
|
18
|
+
/** @type {(a:number) => boolean} */
|
|
19
|
+
// True for values in 0x0000..0xffff that are not inside the surrogate gap 0xd800..0xdfff.
const isBmpCodePoint = a => {
    const belowSurrogates = a >= 0x0000 && a <= 0xd7ff
    const aboveSurrogates = a >= 0xe000 && a <= 0xffff
    return belowSurrogates || aboveSurrogates
}
|
|
20
|
+
|
|
21
|
+
// Predicate for UTF-16 high (leading) surrogate code units, built from the range [0xd800, 0xdbff].
// NOTE(review): assumes `contains` treats both bounds as inclusive — confirm in types/range.
/** @type {(a:number) => boolean} */
const isHighSurrogate = contains([0xd800, 0xdbff])
|
|
22
|
+
|
|
23
|
+
/** @type {(a:number) => boolean} */
|
|
24
|
+
// Predicate for UTF-16 low (trailing) surrogate code units, built from the range [0xdc00, 0xdfff].
// NOTE(review): assumes `contains` treats both bounds as inclusive — confirm in types/range.
const isLowSurrogate = contains([0xdc00, 0xdfff])
|
|
25
|
+
|
|
26
|
+
/** @type {(input:number) => list.List<ByteResult>} */
|
|
8
27
|
const codePointToUtf8 = input =>
|
|
9
28
|
{
|
|
10
29
|
if (input >= 0x0000 && input <= 0x007f) { return [ok(input & 0x7f)] }
|
|
@@ -14,10 +33,10 @@ const codePointToUtf8 = input =>
|
|
|
14
33
|
return [error(input)]
|
|
15
34
|
}
|
|
16
35
|
|
|
17
|
-
/** @type {(input:number) => list.List<
|
|
36
|
+
/** @type {(input:number) => list.List<ByteResult>} */
|
|
18
37
|
const codePointToUtf16 = input =>
|
|
19
38
|
{
|
|
20
|
-
if (input
|
|
39
|
+
if (isBmpCodePoint(input)) { return [ok(input >> 8), ok(input & 0xff)] }
|
|
21
40
|
if (input >= 0x010000 && input <= 0x10ffff) {
|
|
22
41
|
const high = ((input - 0x10000) >> 10) + 0xd800
|
|
23
42
|
const low = ((input - 0x10000) & 0x3ff) + 0xdc00
|
|
@@ -26,15 +45,94 @@ const codePointToUtf16 = input =>
|
|
|
26
45
|
return [error(input)]
|
|
27
46
|
}
|
|
28
47
|
|
|
29
|
-
/** @type {(input: list.List<number>) => list.List<
|
|
48
|
+
/** @type {(input: list.List<number>) => list.List<ByteResult>} */
|
|
30
49
|
// List version of `codePointToUtf8`: the per-code-point encoder is handed to `list.flatMap`,
// so the result is one list of ByteResult items (ok byte, or error carrying the rejected code point).
const codePointListToUtf8 = list.flatMap(codePointToUtf8)
|
|
31
50
|
|
|
32
|
-
/** @type {(input: list.List<number>) => list.List<
|
|
51
|
+
/** @type {(input: list.List<number>) => list.List<ByteResult>} */
|
|
33
52
|
// List version of `codePointToUtf16`: the per-code-point encoder is handed to `list.flatMap`.
// The output is a list of ByteResult items; a BMP code point yields two bytes, high byte first.
const codePointListToUtf16 = list.flatMap(codePointToUtf16)
|
|
34
53
|
|
|
54
|
+
/**
 * Tells whether `second` may follow `lead` in a well-formed UTF-8 sequence.
 * Only the four lead bytes with a narrowed second-byte range are checked;
 * the caller has already verified that `second` is a continuation byte (0x80..0xbf).
 *
 * @type {(lead: number, second: number) => boolean}
 */
const isWellFormedUtf8SecondByte = (lead, second) => {
    switch (lead) {
        // E0 80..9F would be an overlong form of U+0000..U+07FF
        case 0xe0: return second >= 0xa0
        // ED A0..BF would decode to a UTF-16 surrogate (U+D800..U+DFFF)
        case 0xed: return second <= 0x9f
        // F0 80..8F would be an overlong form of U+0000..U+FFFF
        case 0xf0: return second >= 0x90
        // F4 90..BF would decode to a value above U+10FFFF
        case 0xf4: return second <= 0x8f
        default: return true
    }
}

/**
 * One step of the incremental UTF-8 decoder.
 *
 * `state` holds the bytes of the sequence collected so far (`undefined` when no
 * sequence is pending). The step returns a pair `[results, nextState]`:
 * - `results` is empty while a sequence is still incomplete, `[ok(codePoint)]`
 *   when a sequence completes, or `[error(bytes)]` with the offending bytes.
 * - a value outside 0x00..0xff is reported alone and leaves `state` untouched.
 *
 * Ill-formed sequences (overlong forms, surrogates, values above U+10FFFF) are
 * rejected at the second byte and reported as `error([lead, second])`, so that
 * every `ok` code point encodes back to exactly the bytes it was decoded from.
 *
 * @type {operator.StateScan<number, Utf8State, list.List<CodePointResult>>}
 */
const utf8ByteToCodePointOp = state => byte => {
    // Not a byte at all: report it on its own and keep the pending sequence.
    if (byte < 0x00 || byte > 0xff) {
        return [[error([byte])], state]
    }
    if (state == null) {
        if (byte < 0x80) { return [[ok(byte)], undefined] }
        // 0xc0/0xc1 (always overlong) and 0xf5..0xff can never start a sequence.
        if (byte >= 0xc2 && byte <= 0xf4) { return [[], [byte]] }
        return [[error([byte])], undefined]
    }
    const lead = state[0]
    if (byte >= 0x80 && byte < 0xc0) {
        switch (state.length) {
            case 1: {
                if (lead < 0xe0) { return [[ok(((lead & 0x1f) << 6) + (byte & 0x3f))], undefined] }
                if (lead < 0xf8 && isWellFormedUtf8SecondByte(lead, byte)) { return [[], [lead, byte]] }
                break
            }
            case 2: {
                if (lead < 0xf0) { return [[ok(((lead & 0x0f) << 12) + ((state[1] & 0x3f) << 6) + (byte & 0x3f))], undefined] }
                if (lead < 0xf8) { return [[], [lead, state[1], byte]] }
                break
            }
            case 3:
                return [[ok(((lead & 0x07) << 18) + ((state[1] & 0x3f) << 12) + ((state[2] & 0x3f) << 6) + (byte & 0x3f))], undefined]
        }
    }
    // Broken sequence: report everything collected so far plus the offending byte.
    return [[error([...state, byte])], undefined]
}
|
|
82
|
+
|
|
83
|
+
/** @type {(state: Utf8State) => readonly[list.List<CodePointResult>, Utf8State]} */
|
|
84
|
+
// End-of-input step of the UTF-8 decoder: with nothing pending it produces no results,
// otherwise the unfinished bytes are reported as a single error. The next state is always undefined.
const utf8EofToCodePointOp = state => {
    if (state == null) { return [undefined, undefined] }
    return [[error(state)], undefined]
}
|
|
85
|
+
|
|
86
|
+
/** @type {operator.StateScan<ByteOrEof, Utf8State, list.List<CodePointResult>>} */
|
|
87
|
+
// Dispatches one scan step: `undefined` input is the end-of-input marker,
// anything else is forwarded to the per-byte decoder.
const utf8ByteOrEofToCodePointOp = state => input => {
    if (input === undefined) { return utf8EofToCodePointOp(state) }
    return utf8ByteToCodePointOp(state)(input)
}
|
|
88
|
+
|
|
89
|
+
/** @type {(input: list.List<number>) => list.List<CodePointResult>} */
|
|
90
|
+
// Decodes a list of UTF-8 bytes: an `undefined` end-of-input marker is appended, the
// byte-or-EOF step is run over the list with `list.stateScan` (initial state `undefined`),
// and the per-step result lists are merged with `list.flat`.
const utf8ListToCodePoint = input => {
    const decode = list.stateScan(utf8ByteOrEofToCodePointOp)(undefined)
    // The cast only widens the element type so the EOF marker can be appended.
    const bytesThenEof = list.concat(/** @type {list.List<ByteOrEof>} */(input))([undefined])
    return list.flat(decode(bytesThenEof))
}
|
|
91
|
+
|
|
92
|
+
/**
 * One step of the incremental UTF-16 decoder working on single bytes, high byte first.
 *
 * `state` holds the bytes collected so far (`undefined` when nothing is pending).
 * The step returns a pair `[results, nextState]`:
 * - a value outside 0x00..0xff is reported as `error([byte])` and leaves `state` untouched;
 * - one pending byte plus `byte` forms a 16-bit unit: a non-surrogate unit is emitted
 *   as `ok`, a high surrogate is kept pending, anything else is an error;
 * - with a pending high surrogate, two more bytes form the second unit; if it is a low
 *   surrogate the pair is combined into one code point, otherwise all four bytes are
 *   reported as a single error.
 *
 * @type {operator.StateScan<number, Utf16State, list.List<CodePointResult>>}
 */
const utf16ByteToCodePointOp = state => byte => {
    if (byte < 0x00 || byte > 0xff) {
        return [[error([byte])], state]
    }
    if (state == null) {
        return [[], [byte]]
    }
    switch (state.length) {
        // Braces give each case its own scope for the `const` declarations.
        case 1: {
            const codeUnit = (state[0] << 8) + byte
            if (isBmpCodePoint(codeUnit)) { return [[ok(codeUnit)], undefined] }
            if (isHighSurrogate(codeUnit)) { return [[], [state[0], byte]] }
            break
        }
        case 2:
            // First byte of the unit following a high surrogate: nothing to decide yet.
            return [[], [state[0], state[1], byte]]
        case 3: {
            const trailingUnit = (state[2] << 8) + byte
            if (isLowSurrogate(trailingUnit)) {
                const high = (state[0] << 8) + state[1] - 0xd800
                const low = trailingUnit - 0xdc00
                return [[ok((high << 10) + low + 0x10000)], undefined]
            }
            break
        }
    }
    // Broken sequence: report everything collected so far plus the offending byte.
    return [[error([...state, byte])], undefined]
}
|
|
119
|
+
|
|
120
|
+
/** @type {(state: Utf16State) => readonly[list.List<CodePointResult>, Utf16State]} */
|
|
121
|
+
// End-of-input step of the UTF-16 decoder: with nothing pending it produces no results,
// otherwise the unfinished bytes are reported as a single error. The next state is always undefined.
const utf16EofToCodePointOp = state => {
    const results = state == null ? undefined : [error(state)]
    return [results, undefined]
}
|
|
122
|
+
|
|
123
|
+
/** @type {operator.StateScan<ByteOrEof, Utf16State, list.List<CodePointResult>>} */
|
|
124
|
+
// Dispatches one scan step: `undefined` input is the end-of-input marker,
// anything else is forwarded to the per-byte decoder.
const utf16ByteOrEofToCodePointOp = state => input => {
    if (input !== undefined) { return utf16ByteToCodePointOp(state)(input) }
    return utf16EofToCodePointOp(state)
}
|
|
125
|
+
|
|
126
|
+
/** @type {(input: list.List<number>) => list.List<CodePointResult>} */
|
|
127
|
+
// Decodes a list of UTF-16 bytes: an `undefined` end-of-input marker is appended, the
// byte-or-EOF step is run over the list with `list.stateScan` (initial state `undefined`),
// and the per-step result lists are merged with `list.flat`.
const utf16ListToCodePoint = input => {
    const decode = list.stateScan(utf16ByteOrEofToCodePointOp)(undefined)
    // The cast only widens the element type so the EOF marker can be appended.
    const bytesThenEof = list.concat(/** @type {list.List<ByteOrEof>} */(input))([undefined])
    return list.flat(decode(bytesThenEof))
}
|
|
128
|
+
|
|
35
129
|
module.exports = {
|
|
36
130
|
/** @readonly */
|
|
37
131
|
codePointListToUtf8,
|
|
38
132
|
/** @readonly */
|
|
39
133
|
codePointListToUtf16,
|
|
134
|
+
/** @readonly */
|
|
135
|
+
utf8ListToCodePoint,
|
|
136
|
+
/** @readonly */
|
|
137
|
+
utf16ListToCodePoint
|
|
40
138
|
}
|
package/text/encoding/test.f.cjs
CHANGED
|
@@ -116,4 +116,84 @@ const stringify = a => json.stringify(sort)(a)
|
|
|
116
116
|
if (result !== '[["error",-1],["error",55296],["error",57343],["error",1114112]]') { throw result }
|
|
117
117
|
}
|
|
118
118
|
|
|
119
|
+
{
|
|
120
|
+
const result = stringify(list.toArray(encoding.utf8ListToCodePoint([-1, 256])))
|
|
121
|
+
if (result !== '[["error",[-1]],["error",[256]]]') { throw result }
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
{
|
|
125
|
+
const result = stringify(list.toArray(encoding.utf8ListToCodePoint([128, 193, 245, 255])))
|
|
126
|
+
if (result !== '[["error",[128]],["error",[193]],["error",[245]],["error",[255]]]') { throw result }
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
{
|
|
130
|
+
const result = stringify(list.toArray(encoding.utf8ListToCodePoint([0, 1, 127])))
|
|
131
|
+
if (result !== '[["ok",0],["ok",1],["ok",127]]') { throw result }
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
{
|
|
135
|
+
const result = stringify(list.toArray(encoding.utf8ListToCodePoint([194, 128, 194, 169, 223, 191])))
|
|
136
|
+
if (result !== '[["ok",128],["ok",169],["ok",2047]]') { throw result }
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
{
|
|
140
|
+
const result = stringify(list.toArray(encoding.utf8ListToCodePoint([194, 127, 194, 192, 194])))
|
|
141
|
+
if (result !== '[["error",[194,127]],["error",[194,192]],["error",[194]]]') { throw result }
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
{
|
|
145
|
+
const result = stringify(list.toArray(encoding.utf8ListToCodePoint([224, 160, 128, 224, 160, 129, 239, 191, 191])))
|
|
146
|
+
if (result !== '[["ok",2048],["ok",2049],["ok",65535]]') { throw result }
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
{
|
|
150
|
+
const result = stringify(list.toArray(encoding.utf8ListToCodePoint([224, 160, 127, 224, 160, 192, 224, 160])))
|
|
151
|
+
if (result !== '[["error",[224,160,127]],["error",[224,160,192]],["error",[224,160]]]') { throw result }
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
{
|
|
155
|
+
const result = stringify(list.toArray(encoding.utf8ListToCodePoint([240, 144, 128, 128, 240, 144, 128, 129, 244, 143, 191, 191])))
|
|
156
|
+
if (result !== '[["ok",65536],["ok",65537],["ok",1114111]]') { throw result }
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
{
|
|
160
|
+
const result = stringify(list.toArray(encoding.utf8ListToCodePoint([240, 144, 128, 127, 240, 144, 128, 192, 240, 144, 128])))
|
|
161
|
+
if (result !== '[["error",[240,144,128,127]],["error",[240,144,128,192]],["error",[240,144,128]]]') { throw result }
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
{
|
|
165
|
+
const result = stringify(list.toArray(encoding.utf8ListToCodePoint([194, -1, 128])))
|
|
166
|
+
if (result !== '[["error",[-1]],["ok",128]]') { throw result }
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
{
|
|
170
|
+
const result = stringify(list.toArray(encoding.utf16ListToCodePoint([-1, 256,])))
|
|
171
|
+
if (result !== '[["error",[-1]],["error",[256]]]') { throw result }
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
{
|
|
175
|
+
const result = stringify(list.toArray(encoding.utf16ListToCodePoint([0, 0, 0, 36, 32, 172, 215, 255, 224, 0, 255, 255])))
|
|
176
|
+
if (result !== '[["ok",0],["ok",36],["ok",8364],["ok",55295],["ok",57344],["ok",65535]]') { throw result }
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
{
|
|
180
|
+
const result = stringify(list.toArray(encoding.utf16ListToCodePoint([220, 0, 223, 255])))
|
|
181
|
+
if (result !== '[["error",[220,0]],["error",[223,255]]]') { throw result }
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
{
|
|
185
|
+
const result = stringify(list.toArray(encoding.utf16ListToCodePoint([216, 0, 220, 0, 216, 1, 220, 55, 216, 82, 223, 98, 219, 255, 223, 255])))
|
|
186
|
+
if (result !== '[["ok",65536],["ok",66615],["ok",150370],["ok",1114111]]') { throw result }
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
{
|
|
190
|
+
const result = stringify(list.toArray(encoding.utf16ListToCodePoint([216, 0, 216, 0])))
|
|
191
|
+
if (result !== '[["error",[216,0,216,0]]]') { throw result }
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
{
|
|
195
|
+
const result = stringify(list.toArray(encoding.utf16ListToCodePoint([216, 0, 0, 0])))
|
|
196
|
+
if (result !== '[["error",[216,0,0,0]]]') { throw result }
|
|
197
|
+
}
|
|
198
|
+
|
|
119
199
|
module.exports = {}
|