@bufbuild/re2 0.0.1-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +30 -0
- package/dist/cjs/CharClass.d.ts +30 -0
- package/dist/cjs/CharClass.js +284 -0
- package/dist/cjs/CharGroup.d.ts +8 -0
- package/dist/cjs/CharGroup.js +83 -0
- package/dist/cjs/Codepoint.d.ts +3 -0
- package/dist/cjs/Codepoint.js +62 -0
- package/dist/cjs/Compiler.d.ts +40 -0
- package/dist/cjs/Compiler.js +262 -0
- package/dist/cjs/DFA.d.ts +36 -0
- package/dist/cjs/DFA.js +350 -0
- package/dist/cjs/Inst.d.ts +26 -0
- package/dist/cjs/Inst.js +86 -0
- package/dist/cjs/MachineInput.d.ts +17 -0
- package/dist/cjs/MachineInput.js +72 -0
- package/dist/cjs/Parser.d.ts +111 -0
- package/dist/cjs/Parser.js +1538 -0
- package/dist/cjs/Prefilter.d.ts +19 -0
- package/dist/cjs/Prefilter.js +163 -0
- package/dist/cjs/Prog.d.ts +39 -0
- package/dist/cjs/Prog.js +154 -0
- package/dist/cjs/RE2.d.ts +27 -0
- package/dist/cjs/RE2.js +221 -0
- package/dist/cjs/RE2Flags.d.ts +16 -0
- package/dist/cjs/RE2Flags.js +58 -0
- package/dist/cjs/Regexp.d.ts +43 -0
- package/dist/cjs/Regexp.js +98 -0
- package/dist/cjs/Simplify.d.ts +3 -0
- package/dist/cjs/Simplify.js +230 -0
- package/dist/cjs/Unicode.d.ts +17 -0
- package/dist/cjs/Unicode.js +165 -0
- package/dist/cjs/UnicodeRangeTable.d.ts +12 -0
- package/dist/cjs/UnicodeRangeTable.js +31 -0
- package/dist/cjs/UnicodeTables.d.ts +29 -0
- package/dist/cjs/UnicodeTables.js +571 -0
- package/dist/cjs/Utils.d.ts +22 -0
- package/dist/cjs/Utils.js +119 -0
- package/dist/cjs/__fixtures__/find.d.ts +9 -0
- package/dist/cjs/__fixtures__/find.js +115 -0
- package/dist/cjs/chars.d.ts +2 -0
- package/dist/cjs/chars.js +19 -0
- package/dist/cjs/exceptions.d.ts +55 -0
- package/dist/cjs/exceptions.js +94 -0
- package/dist/cjs/index.d.ts +102 -0
- package/dist/cjs/index.js +173 -0
- package/dist/cjs/package.json +1 -0
- package/dist/cjs/testParser.d.ts +3 -0
- package/dist/cjs/testParser.js +143 -0
- package/dist/esm/CharClass.d.ts +30 -0
- package/dist/esm/CharClass.js +281 -0
- package/dist/esm/CharGroup.d.ts +8 -0
- package/dist/esm/CharGroup.js +78 -0
- package/dist/esm/Codepoint.d.ts +3 -0
- package/dist/esm/Codepoint.js +59 -0
- package/dist/esm/Compiler.d.ts +40 -0
- package/dist/esm/Compiler.js +259 -0
- package/dist/esm/DFA.d.ts +36 -0
- package/dist/esm/DFA.js +347 -0
- package/dist/esm/Inst.d.ts +26 -0
- package/dist/esm/Inst.js +83 -0
- package/dist/esm/MachineInput.d.ts +17 -0
- package/dist/esm/MachineInput.js +68 -0
- package/dist/esm/Parser.d.ts +111 -0
- package/dist/esm/Parser.js +1535 -0
- package/dist/esm/Prefilter.d.ts +19 -0
- package/dist/esm/Prefilter.js +159 -0
- package/dist/esm/Prog.d.ts +39 -0
- package/dist/esm/Prog.js +150 -0
- package/dist/esm/RE2.d.ts +27 -0
- package/dist/esm/RE2.js +218 -0
- package/dist/esm/RE2Flags.d.ts +16 -0
- package/dist/esm/RE2Flags.js +41 -0
- package/dist/esm/Regexp.d.ts +43 -0
- package/dist/esm/Regexp.js +94 -0
- package/dist/esm/Simplify.d.ts +3 -0
- package/dist/esm/Simplify.js +228 -0
- package/dist/esm/Unicode.d.ts +17 -0
- package/dist/esm/Unicode.js +150 -0
- package/dist/esm/UnicodeRangeTable.d.ts +12 -0
- package/dist/esm/UnicodeRangeTable.js +28 -0
- package/dist/esm/UnicodeTables.d.ts +29 -0
- package/dist/esm/UnicodeTables.js +568 -0
- package/dist/esm/Utils.d.ts +22 -0
- package/dist/esm/Utils.js +103 -0
- package/dist/esm/__fixtures__/find.d.ts +9 -0
- package/dist/esm/__fixtures__/find.js +112 -0
- package/dist/esm/chars.d.ts +2 -0
- package/dist/esm/chars.js +14 -0
- package/dist/esm/exceptions.d.ts +55 -0
- package/dist/esm/exceptions.js +86 -0
- package/dist/esm/index.d.ts +102 -0
- package/dist/esm/index.js +163 -0
- package/dist/esm/testParser.d.ts +3 -0
- package/dist/esm/testParser.js +138 -0
- package/package.json +49 -0
|
@@ -0,0 +1,568 @@
|
|
|
1
|
+
// GENERATED BY tools/scripts/genUnicodeTable.js; DO NOT EDIT.
|
|
2
|
+
// yarn node ./tools/scripts/genUnicodeTable.js > src/UnicodeTables.ts
|
|
3
|
+
import { UnicodeRangeTable } from "./UnicodeRangeTable.js";
|
|
4
|
+
// Lazily-built lookup table mapping a character code (0-255) to its 6-bit
// digit value in the bundled alphabet. The alphabet ends in "+-" rather than
// the usual "+/". Codes outside the alphabet keep the typed array's default 0.
let _B64_MAP = null;
const getB64Map = () => {
    if (_B64_MAP) {
        return _B64_MAP;
    }
    const alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-";
    const table = new Uint8Array(256);
    [...alphabet].forEach((ch, value) => {
        table[ch.charCodeAt(0)] = value;
    });
    // Publish only after the table is fully populated; later calls reuse it.
    _B64_MAP = table;
    return table;
};
|
|
15
|
+
// Decodes a string of VLQ digits into an array of integers.
// Every character is looked up in the table from getB64Map(); its low five
// bits are payload (least-significant group first) and bit 0x20 means "more
// digits follow". A trailing group that never terminates is not emitted.
const decodeVLQ = (str) => {
    const digitOf = getB64Map();
    const values = [];
    let acc = 0;
    let bitOffset = 0;
    for (let pos = 0; pos < str.length; pos += 1) {
        const digit = digitOf[str.charCodeAt(pos)];
        acc |= (digit & 0x1f) << bitOffset;
        const hasMore = (digit & 0x20) !== 0;
        if (hasMore) {
            bitOffset += 5;
            continue;
        }
        // Final digit of this value: emit it and reset the accumulator.
        values.push(acc);
        acc = 0;
        bitOffset = 0;
    }
    return values;
};
|
|
33
|
+
// Decodes a VLQ-encoded range list into a flat Uint32Array of
// [lo, hi, stride] triples.
// The values are delta-coded against a running total that starts at 0: each
// lo is an offset from the previous range's hi, and each hi is an offset from
// its own lo. When isStride1 is truthy the stride is not stored in the input
// and 1 is written for every range; otherwise the stride is read verbatim.
// An empty input string yields an empty array.
const decodeRanges = (str, isStride1) => {
    if (str.length === 0) {
        return new Uint32Array(0);
    }
    const values = decodeVLQ(str);
    const fieldsPerRange = isStride1 ? 2 : 3;
    const numRanges = values.length / fieldsPerRange;
    const triples = new Uint32Array(numRanges * 3);
    let running = 0;
    let next = 0;
    for (let i = 0, slot = 0; i < numRanges; i += 1, slot += 3) {
        running += values[next++];
        triples[slot] = running;
        running += values[next++];
        triples[slot + 1] = running;
        triples[slot + 2] = isStride1 ? 1 : values[next++];
    }
    return triples;
};
|
|
49
|
+
// Decodes a VLQ-encoded sequence of (keyDelta, value) pairs into a Map.
// Keys are delta-coded against the previous key (starting from 0); the value
// of each pair is stored exactly as decoded.
const decodeOrbit = (str) => {
    const values = decodeVLQ(str);
    const orbit = new Map();
    let key = 0;
    let idx = 0;
    while (idx < values.length) {
        key += values[idx];
        orbit.set(key, values[idx + 1]);
        idx += 2;
    }
    return orbit;
};
|
|
59
|
+
// Merges two flat [lo, hi, stride] range arrays into one array of stride-1
// ranges. Ranges with stride > 1 are expanded to individual code points; the
// combined spans are then sorted by lo and overlapping or adjacent spans are
// coalesced.
//
// If either input is empty the other is returned as-is (same reference, its
// strides untouched).
//
// Throws RangeError when a range covering more than one code point has a
// stride <= 0: stepping by such a stride never passes hi, so expanding it
// would loop forever. A single-code-point range (lo === hi) is accepted with
// any stride because the stride cannot affect it.
const mergeRanges = (a, b) => {
    if (b.length === 0) {
        return a;
    }
    if (a.length === 0) {
        return b;
    }
    const spans = [];
    const collect = (triples) => {
        for (let i = 0; i < triples.length; i += 3) {
            const lo = triples[i];
            const hi = triples[i + 1];
            const stride = triples[i + 2];
            if (stride === 1 || lo === hi) {
                spans.push([lo, hi]);
                continue;
            }
            if (stride <= 0) {
                throw new RangeError(`mergeRanges: invalid stride ${stride} for range [${lo}, ${hi}]`);
            }
            for (let cp = lo; cp <= hi; cp += stride) {
                spans.push([cp, cp]);
            }
        }
    };
    collect(a);
    collect(b);
    spans.sort((x, y) => x[0] - y[0]);
    const merged = [];
    for (const [lo, hi] of spans) {
        const last = merged[merged.length - 1];
        // "+ 1" also joins spans that merely touch (e.g. [1,3] and [4,6]).
        if (last && last[1] + 1 >= lo) {
            if (hi > last[1]) {
                last[1] = hi;
            }
        }
        else {
            merged.push([lo, hi]);
        }
    }
    return Uint32Array.from(merged.flatMap(([lo, hi]) => [lo, hi, 1]));
};
|
|
101
|
+
// Tests every code point from 0 through 0x10FFFF against `pattern` (compiled
// with the "u" flag) and returns the maximal runs of matching code points as
// a flat Uint32Array of [lo, hi, 1] triples.
// Surrogates are swept like any other code point: String.fromCodePoint
// returns the lone surrogate char and platform regex matches \p{Cs} on it.
const sweepPlatform = (pattern) => {
    const LAST_CODE_POINT = 0x10ffff;
    const matcher = new RegExp(pattern, "u");
    const triples = [];
    let runStart = -1; // -1 means "not currently inside a matching run"
    for (let cp = 0; cp <= LAST_CODE_POINT; cp += 1) {
        const matches = matcher.test(String.fromCodePoint(cp));
        if (matches && runStart < 0) {
            runStart = cp;
        }
        else if (!matches && runStart >= 0) {
            triples.push(runStart, cp - 1, 1);
            runStart = -1;
        }
    }
    // Close a run that extends to the very last code point.
    if (runStart >= 0) {
        triples.push(runStart, LAST_CODE_POINT, 1);
    }
    return Uint32Array.from(triples);
};
|
|
122
|
+
// Memoizing wrapper around a plain object of zero-argument factory functions.
// Each factory runs at most once; its result (or null) is cached per key.
//
// Only the initializer's OWN properties count as keys. Names inherited from
// Object.prototype (e.g. "toString", "constructor") are reported as absent
// and are never invoked as factories.
class LazyDecoder {
    initializer;
    cache;
    // initializer: object whose own properties map a key to a () => value factory.
    constructor(initializer) {
        this.initializer = initializer;
        this.cache = new Map();
    }
    // Returns true iff the initializer has an own property named `key`.
    has(key) {
        return Object.prototype.hasOwnProperty.call(this.initializer, key);
    }
    // Returns the (cached) value produced by the factory for `key`, or null
    // when there is no such factory or the factory produced null/undefined.
    get(key) {
        if (this.cache.has(key)) {
            return this.cache.get(key);
        }
        const fn = Object.prototype.hasOwnProperty.call(this.initializer, key)
            ? this.initializer[key]
            : undefined;
        // Normalize undefined to null so the first call and every cached call
        // return the same value.
        const val = (fn ? fn() : null) ?? null;
        this.cache.set(key, val);
        return val;
    }
}
|
|
143
|
+
let _CASE_ORBIT = null;
|
|
144
|
+
const getCASE_ORBIT = () => {
|
|
145
|
+
if (!_CASE_ORBIT) {
|
|
146
|
+
_CASE_ORBIT = decodeOrbit("rDqpII-LsD+0HGrpIsCxJzElODoODrOnByP-Mz+HTieNj-HCweD1fDxeB+9HBwfC1FE2eBxfBjeE1eDmpII0fjB4c+BgkHChkHKikHDjkHBkkHImkHZnkHhhGlkH9O70H-Io8HBp8HBq8HBr8HBs8HBt8HBu8HBv8HJ48HB58HB68HB78HB88HB98HB+8HB-8HJo9HBp9HBq9HBr9HBs9HBt9HBu9HBv9HM89HLlaFs+HQwcQwdQ8-HzJpdErCBlGgphBokHjMu+pBBv+pBDy+pBBz+pBB0+pBB1+pBw5Um4+BBl4+B68cg17CBh17CBi17CBj17CBk17CBl17CBm17CBn17CBo17CBp17CBq17CBr17CBs17CBt17CBu17CBv17CBw17CBx17CBy17CBz17CB017CB117CB217CB317CB417CD717CB817CB917CB+17CB-17CBg27CBh27CBi27CBj27CBk27CBl27CBm27CBn27CBo27CBp27CBq27CBr27CBs27CBt27CBu27CBv27CBw27CBx27CBy27CBz27C");
|
|
147
|
+
}
|
|
148
|
+
return _CASE_ORBIT;
|
|
149
|
+
};
|
|
150
|
+
// Additions from Unicode 15.0 → 16.0 per stable general-category name.
|
|
151
|
+
// Merged unconditionally with platform sweep output; no-op on 16.0+ engines.
|
|
152
|
+
const _DELTA_CATEGORIES = /*#__PURE__*/ new LazyDecoder({
|
|
153
|
+
L: () => decodeRanges("pkHBBh6iBCBNCBkvXzBB36BbBKWB9JCB8lBJBCDDClBBCaaCt-Bt-BBfBgkG68DBmoHdBjhDsBBz8HxmWxmWBcBDgwhCgwhCBsTB", false),
|
|
154
|
+
LC: () => decodeRanges("pkHBh6iBCNC0rZVLV", true),
|
|
155
|
+
Ll: () => decodeRanges("qkHj6iBj6iBO1sZ1sZBUB", false),
|
|
156
|
+
Lm: () => decodeRanges("uqjChBhBx+XCBpBBB", false),
|
|
157
|
+
Lo: () => decodeRanges("guhCzBB36BDBCzLzLBBB8lBJBCDDClBBCaaCt-Bt-BBfBgkG68DBmoHdBmhDnBB18HxmWxmWBcBDgwhCgwhCBsTB", false),
|
|
158
|
+
Lu: () => decodeRanges("pkHi6iBi6iBBOOC0rZ0rZBUB", false),
|
|
159
|
+
M: () => decodeRanges("3kCymhCymhCBDBvM8lB8lBBHBCDDCDBCEBCPPB47C47CkuQRB-lhBBB", false),
|
|
160
|
+
Mc: () => decodeRanges("49kCCBIDDCDBCBBCvavaswSCB", false),
|
|
161
|
+
Mn: () => decodeRanges("3kCymhCymhCBDBvM-lB-lBBEBOECPBB47CkuQkuQBKBECB-lhBBB", false),
|
|
162
|
+
N: () => decodeRanges("gqjCJnsCTtoBJ3pRJ3hDJ37XJ4nGJ", true),
|
|
163
|
+
Nd: () => decodeRanges("gqjCJnsCTtoBJ3pRJ3hDJ37XJ4nGJ", true),
|
|
164
|
+
P: () => decodeRanges("u6GBBwBvv8Bvv8BmzBBBCBBpgCssUssUBBBwkeAB", false),
|
|
165
|
+
Pd: () => decodeRanges("urjCA", true),
|
|
166
|
+
Po: () => decodeRanges("u6GBBwB1i+B1i+BBCCBpgCpgCssUCBwkeAB", false),
|
|
167
|
+
S: () => decodeRanges("nhJCBz+CDBlPBBK-82B-82BBxzvBxzvBBuHBRzNB-vKJBFBBoOGGvBIIWDDKiHiHBjBB", false),
|
|
168
|
+
Sm: () => decodeRanges("usjCB", true),
|
|
169
|
+
So: () => decodeRanges("nhJCBz+CDBlPBBKxwmDxwmDBuHBRzNB-vKJBFBBoOGGvBIIWDDKiHiHBjBB", false),
|
|
170
|
+
});
|
|
171
|
+
// Additions from Unicode 15.0 → 16.0 per stable script name.
|
|
172
|
+
const _DELTA_SCRIPTS = /*#__PURE__*/ new LazyDecoder({
|
|
173
|
+
Arabic: () => decodeRanges("3kCrxhCrxhCBBB4BAB", false),
|
|
174
|
+
Balinese: () => decodeRanges("u6GBwBA", true),
|
|
175
|
+
Common: () => decodeRanges("nhJCBz+CDBlPBBKxwmDxwmDB4HBHzNB-vKJBFBBoOGGvBIIWDDKiHiHBjBB", false),
|
|
176
|
+
Cyrillic: () => decodeRanges("pkHB", true),
|
|
177
|
+
Egyptian_Hieroglyphs: () => decodeRanges("gjtC68D", true),
|
|
178
|
+
Han: () => decodeRanges("w-6FtT", true),
|
|
179
|
+
Kawi: () => decodeRanges("66nCA", true),
|
|
180
|
+
Khitan_Small_Script: () => decodeRanges("-njDA", true),
|
|
181
|
+
Latin: () => decodeRanges("r+pBCNC", true),
|
|
182
|
+
Myanmar: () => decodeRanges("w2lCT", true),
|
|
183
|
+
});
|
|
184
|
+
// Full tables for scripts added in Unicode 16.0. Engines < 16.0 throw
|
|
185
|
+
// SyntaxError on these names, so platform sweep is impossible.
|
|
186
|
+
const _NEW_SCRIPTS = /*#__PURE__*/ new LazyDecoder({
|
|
187
|
+
Garay: () => new UnicodeRangeTable(decodeRanges("gqjClBEcJB", true)),
|
|
188
|
+
Gurung_Khema: () => new UnicodeRangeTable(decodeRanges("go4C5B", true)),
|
|
189
|
+
Kirat_Rai: () => new UnicodeRangeTable(decodeRanges("gq7C5B", true)),
|
|
190
|
+
Ol_Onal: () => new UnicodeRangeTable(decodeRanges("wu5DqBFA", true)),
|
|
191
|
+
Sunuwar: () => new UnicodeRangeTable(decodeRanges("g+mChBPJ", true)),
|
|
192
|
+
Todhri: () => new UnicodeRangeTable(decodeRanges("guhCzB", true)),
|
|
193
|
+
Tulu_Tigalari: () => new UnicodeRangeTable(decodeRanges("g8kCJBCDDClBBCJBCDDCDBCJBCBBJBB", false)),
|
|
194
|
+
});
|
|
195
|
+
const STABLE_CATEGORY_NAMES = new Set([
|
|
196
|
+
"C",
|
|
197
|
+
"Cc",
|
|
198
|
+
"Cf",
|
|
199
|
+
"Cn",
|
|
200
|
+
"Co",
|
|
201
|
+
"Cs",
|
|
202
|
+
"L",
|
|
203
|
+
"LC",
|
|
204
|
+
"Ll",
|
|
205
|
+
"Lm",
|
|
206
|
+
"Lo",
|
|
207
|
+
"Lt",
|
|
208
|
+
"Lu",
|
|
209
|
+
"M",
|
|
210
|
+
"Mc",
|
|
211
|
+
"Me",
|
|
212
|
+
"Mn",
|
|
213
|
+
"N",
|
|
214
|
+
"Nd",
|
|
215
|
+
"Nl",
|
|
216
|
+
"No",
|
|
217
|
+
"P",
|
|
218
|
+
"Pc",
|
|
219
|
+
"Pd",
|
|
220
|
+
"Pe",
|
|
221
|
+
"Pf",
|
|
222
|
+
"Pi",
|
|
223
|
+
"Po",
|
|
224
|
+
"Ps",
|
|
225
|
+
"S",
|
|
226
|
+
"Sc",
|
|
227
|
+
"Sk",
|
|
228
|
+
"Sm",
|
|
229
|
+
"So",
|
|
230
|
+
"Z",
|
|
231
|
+
"Zl",
|
|
232
|
+
"Zp",
|
|
233
|
+
"Zs",
|
|
234
|
+
]);
|
|
235
|
+
const STABLE_SCRIPT_NAMES = new Set([
|
|
236
|
+
"Adlam",
|
|
237
|
+
"Ahom",
|
|
238
|
+
"Anatolian_Hieroglyphs",
|
|
239
|
+
"Arabic",
|
|
240
|
+
"Armenian",
|
|
241
|
+
"Avestan",
|
|
242
|
+
"Balinese",
|
|
243
|
+
"Bamum",
|
|
244
|
+
"Bassa_Vah",
|
|
245
|
+
"Batak",
|
|
246
|
+
"Bengali",
|
|
247
|
+
"Bhaiksuki",
|
|
248
|
+
"Bopomofo",
|
|
249
|
+
"Brahmi",
|
|
250
|
+
"Braille",
|
|
251
|
+
"Buginese",
|
|
252
|
+
"Buhid",
|
|
253
|
+
"Canadian_Aboriginal",
|
|
254
|
+
"Carian",
|
|
255
|
+
"Caucasian_Albanian",
|
|
256
|
+
"Chakma",
|
|
257
|
+
"Cham",
|
|
258
|
+
"Cherokee",
|
|
259
|
+
"Chorasmian",
|
|
260
|
+
"Common",
|
|
261
|
+
"Coptic",
|
|
262
|
+
"Cuneiform",
|
|
263
|
+
"Cypriot",
|
|
264
|
+
"Cypro_Minoan",
|
|
265
|
+
"Cyrillic",
|
|
266
|
+
"Deseret",
|
|
267
|
+
"Devanagari",
|
|
268
|
+
"Dives_Akuru",
|
|
269
|
+
"Dogra",
|
|
270
|
+
"Duployan",
|
|
271
|
+
"Egyptian_Hieroglyphs",
|
|
272
|
+
"Elbasan",
|
|
273
|
+
"Elymaic",
|
|
274
|
+
"Ethiopic",
|
|
275
|
+
"Georgian",
|
|
276
|
+
"Glagolitic",
|
|
277
|
+
"Gothic",
|
|
278
|
+
"Grantha",
|
|
279
|
+
"Greek",
|
|
280
|
+
"Gujarati",
|
|
281
|
+
"Gunjala_Gondi",
|
|
282
|
+
"Gurmukhi",
|
|
283
|
+
"Han",
|
|
284
|
+
"Hangul",
|
|
285
|
+
"Hanifi_Rohingya",
|
|
286
|
+
"Hanunoo",
|
|
287
|
+
"Hatran",
|
|
288
|
+
"Hebrew",
|
|
289
|
+
"Hiragana",
|
|
290
|
+
"Imperial_Aramaic",
|
|
291
|
+
"Inherited",
|
|
292
|
+
"Inscriptional_Pahlavi",
|
|
293
|
+
"Inscriptional_Parthian",
|
|
294
|
+
"Javanese",
|
|
295
|
+
"Kaithi",
|
|
296
|
+
"Kannada",
|
|
297
|
+
"Katakana",
|
|
298
|
+
"Kawi",
|
|
299
|
+
"Kayah_Li",
|
|
300
|
+
"Kharoshthi",
|
|
301
|
+
"Khitan_Small_Script",
|
|
302
|
+
"Khmer",
|
|
303
|
+
"Khojki",
|
|
304
|
+
"Khudawadi",
|
|
305
|
+
"Lao",
|
|
306
|
+
"Latin",
|
|
307
|
+
"Lepcha",
|
|
308
|
+
"Limbu",
|
|
309
|
+
"Linear_A",
|
|
310
|
+
"Linear_B",
|
|
311
|
+
"Lisu",
|
|
312
|
+
"Lycian",
|
|
313
|
+
"Lydian",
|
|
314
|
+
"Mahajani",
|
|
315
|
+
"Makasar",
|
|
316
|
+
"Malayalam",
|
|
317
|
+
"Mandaic",
|
|
318
|
+
"Manichaean",
|
|
319
|
+
"Marchen",
|
|
320
|
+
"Masaram_Gondi",
|
|
321
|
+
"Medefaidrin",
|
|
322
|
+
"Meetei_Mayek",
|
|
323
|
+
"Mende_Kikakui",
|
|
324
|
+
"Meroitic_Cursive",
|
|
325
|
+
"Meroitic_Hieroglyphs",
|
|
326
|
+
"Miao",
|
|
327
|
+
"Modi",
|
|
328
|
+
"Mongolian",
|
|
329
|
+
"Mro",
|
|
330
|
+
"Multani",
|
|
331
|
+
"Myanmar",
|
|
332
|
+
"Nabataean",
|
|
333
|
+
"Nag_Mundari",
|
|
334
|
+
"Nandinagari",
|
|
335
|
+
"New_Tai_Lue",
|
|
336
|
+
"Newa",
|
|
337
|
+
"Nko",
|
|
338
|
+
"Nushu",
|
|
339
|
+
"Nyiakeng_Puachue_Hmong",
|
|
340
|
+
"Ogham",
|
|
341
|
+
"Ol_Chiki",
|
|
342
|
+
"Old_Hungarian",
|
|
343
|
+
"Old_Italic",
|
|
344
|
+
"Old_North_Arabian",
|
|
345
|
+
"Old_Permic",
|
|
346
|
+
"Old_Persian",
|
|
347
|
+
"Old_Sogdian",
|
|
348
|
+
"Old_South_Arabian",
|
|
349
|
+
"Old_Turkic",
|
|
350
|
+
"Old_Uyghur",
|
|
351
|
+
"Oriya",
|
|
352
|
+
"Osage",
|
|
353
|
+
"Osmanya",
|
|
354
|
+
"Pahawh_Hmong",
|
|
355
|
+
"Palmyrene",
|
|
356
|
+
"Pau_Cin_Hau",
|
|
357
|
+
"Phags_Pa",
|
|
358
|
+
"Phoenician",
|
|
359
|
+
"Psalter_Pahlavi",
|
|
360
|
+
"Rejang",
|
|
361
|
+
"Runic",
|
|
362
|
+
"Samaritan",
|
|
363
|
+
"Saurashtra",
|
|
364
|
+
"Sharada",
|
|
365
|
+
"Shavian",
|
|
366
|
+
"Siddham",
|
|
367
|
+
"SignWriting",
|
|
368
|
+
"Sinhala",
|
|
369
|
+
"Sogdian",
|
|
370
|
+
"Sora_Sompeng",
|
|
371
|
+
"Soyombo",
|
|
372
|
+
"Sundanese",
|
|
373
|
+
"Syloti_Nagri",
|
|
374
|
+
"Syriac",
|
|
375
|
+
"Tagalog",
|
|
376
|
+
"Tagbanwa",
|
|
377
|
+
"Tai_Le",
|
|
378
|
+
"Tai_Tham",
|
|
379
|
+
"Tai_Viet",
|
|
380
|
+
"Takri",
|
|
381
|
+
"Tamil",
|
|
382
|
+
"Tangsa",
|
|
383
|
+
"Tangut",
|
|
384
|
+
"Telugu",
|
|
385
|
+
"Thaana",
|
|
386
|
+
"Thai",
|
|
387
|
+
"Tibetan",
|
|
388
|
+
"Tifinagh",
|
|
389
|
+
"Tirhuta",
|
|
390
|
+
"Toto",
|
|
391
|
+
"Ugaritic",
|
|
392
|
+
"Unknown",
|
|
393
|
+
"Vai",
|
|
394
|
+
"Vithkuqi",
|
|
395
|
+
"Wancho",
|
|
396
|
+
"Warang_Citi",
|
|
397
|
+
"Yezidi",
|
|
398
|
+
"Yi",
|
|
399
|
+
"Zanabazar_Square",
|
|
400
|
+
]);
|
|
401
|
+
const NEW_SCRIPT_NAMES = new Set([
|
|
402
|
+
"Garay",
|
|
403
|
+
"Gurung_Khema",
|
|
404
|
+
"Kirat_Rai",
|
|
405
|
+
"Ol_Onal",
|
|
406
|
+
"Sunuwar",
|
|
407
|
+
"Todhri",
|
|
408
|
+
"Tulu_Tigalari",
|
|
409
|
+
]);
|
|
410
|
+
const _sweepCache = new Map();
|
|
411
|
+
const _foldCache = new Map();
|
|
412
|
+
// Returns the base range table for a property name, or null if the name is
// in none of the known name sets.
//   - New-in-16.0 script names: served straight from the bundled full tables.
//   - Stable category / script names: the platform sweep of the matching
//     \p{...} escape, merged with the bundled 15.0 → 16.0 delta when one
//     exists. The resulting table is cached per kind and name.
// Category names are checked before script names.
const buildForProperty = (name) => {
    if (NEW_SCRIPT_NAMES.has(name)) {
        return _NEW_SCRIPTS.get(name);
    }
    const isCategory = STABLE_CATEGORY_NAMES.has(name);
    if (!isCategory && !STABLE_SCRIPT_NAMES.has(name)) {
        return null;
    }
    const kind = isCategory ? "category" : "script";
    const cacheKey = `${kind}:${name}`;
    const hit = _sweepCache.get(cacheKey);
    if (hit) {
        return hit;
    }
    const pattern = isCategory
        ? `\\p{General_Category=${name}}`
        : `\\p{Script=${name}}`;
    const swept = sweepPlatform(pattern);
    const deltaSource = isCategory ? _DELTA_CATEGORIES : _DELTA_SCRIPTS;
    const delta = deltaSource.get(name);
    const ranges = delta ? mergeRanges(swept, delta) : swept;
    const table = new UnicodeRangeTable(ranges);
    _sweepCache.set(cacheKey, table);
    return table;
};
|
|
444
|
+
// Computes the fold-overlay for a property name: the runes that are reachable
// by case folding from some rune in the base class but are not themselves in
// the base class. Returns null when the name has no base table or when no
// such runes exist. Results (including null) are cached per name.
const buildFoldOverlay = (name) => {
    const memo = _foldCache.get(name);
    if (memo !== undefined) {
        return memo;
    }
    const base = buildForProperty(name);
    if (!base) {
        _foldCache.set(name, null);
        return null;
    }
    // Binary search over the base table's ranges; a rune inside a range only
    // counts as a member when it lands on that range's stride.
    const inBase = (r) => {
        let left = 0;
        let right = base.length;
        while (left < right) {
            const mid = (left + right) >> 1;
            const rangeLo = base.getLo(mid);
            const rangeHi = base.getHi(mid);
            if (r < rangeLo) {
                right = mid;
            }
            else if (r > rangeHi) {
                left = mid + 1;
            }
            else {
                return (r - rangeLo) % base.getStride(mid) === 0;
            }
        }
        return false;
    };
    // Inline simpleFold to avoid circular import with Unicode.ts.
    // Order of preference: explicit CASE_ORBIT entry, then the single-rune
    // lowercase form, then the single-rune uppercase form, else r itself.
    const orbit = getCASE_ORBIT();
    const simpleFold = (r) => {
        const mapped = orbit.get(r);
        if (mapped !== undefined) {
            return mapped;
        }
        const ch = String.fromCodePoint(r);
        for (const convert of ["toLowerCase", "toUpperCase"]) {
            const cased = ch[convert]();
            // Ignore case mappings that change the string's length.
            if (cased.length !== ch.length) {
                continue;
            }
            const casedCp = cased.codePointAt(0);
            if (casedCp !== undefined && casedCp !== r) {
                return casedCp;
            }
        }
        return r;
    };
    // Walk each base rune's fold chain until it returns to the starting rune.
    // NOTE(review): this walk only terminates if repeated simpleFold calls
    // eventually yield `cp` again — presumably CASE_ORBIT is generated to
    // close every chain; confirm against the generator.
    const extras = new Set();
    for (let i = 0; i < base.length; i += 1) {
        const lo = base.getLo(i);
        const hi = base.getHi(i);
        const stride = base.getStride(i);
        for (let cp = lo; cp <= hi; cp += stride) {
            for (let r = simpleFold(cp); r !== cp; r = simpleFold(r)) {
                if (!inBase(r)) {
                    extras.add(r);
                }
            }
        }
    }
    if (extras.size === 0) {
        _foldCache.set(name, null);
        return null;
    }
    // Collapse the sorted extra runes into contiguous stride-1 runs.
    const ordered = [...extras].sort((x, y) => x - y);
    const triples = [];
    let runLo = ordered[0];
    let runHi = ordered[0];
    for (let k = 1; k < ordered.length; k += 1) {
        const cp = ordered[k];
        if (cp === runHi + 1) {
            runHi = cp;
            continue;
        }
        triples.push(runLo, runHi, 1);
        runLo = cp;
        runHi = cp;
    }
    triples.push(runLo, runHi, 1);
    const table = new UnicodeRangeTable(Uint32Array.from(triples));
    _foldCache.set(name, table);
    return table;
};
|
|
526
|
+
// Returns the range table built for the "Lu" property name.
// Throws an Error if buildForProperty reports the name as unknown (null).
const getUpper = () => {
    const upper = buildForProperty("Lu");
    if (upper !== null) {
        return upper;
    }
    throw new Error("Upper: missing Lu property");
};
|
|
533
|
+
// --- Legacy API surface used by Parser ---
// Each CATEGORIES / SCRIPTS / FOLD_* entry exposes a Map-like { has, get }
// pair: `has` checks the relevant name set(s), while `get` delegates to
// buildForProperty / buildFoldOverlay without re-checking the name's kind.
export const UnicodeTables = {
    get CASE_ORBIT() {
        return getCASE_ORBIT();
    },
    STABLE_CATEGORY_NAMES,
    STABLE_SCRIPT_NAMES,
    NEW_SCRIPT_NAMES,
    buildForProperty,
    buildFoldOverlay,
    CATEGORIES: {
        has(name) {
            return STABLE_CATEGORY_NAMES.has(name);
        },
        get(name) {
            return buildForProperty(name);
        },
    },
    SCRIPTS: {
        has(name) {
            return STABLE_SCRIPT_NAMES.has(name) || NEW_SCRIPT_NAMES.has(name);
        },
        get(name) {
            return buildForProperty(name);
        },
    },
    FOLD_CATEGORIES: {
        has(name) {
            return STABLE_CATEGORY_NAMES.has(name);
        },
        get(name) {
            return buildFoldOverlay(name);
        },
    },
    FOLD_SCRIPT: {
        has(name) {
            return STABLE_SCRIPT_NAMES.has(name) || NEW_SCRIPT_NAMES.has(name);
        },
        get(name) {
            return buildFoldOverlay(name);
        },
    },
    get Upper() {
        return getUpper();
    },
    // --- Test-only hooks: expose the raw bundled 15.0→16.0 delta and
    // new-in-16.0 script data so tests can verify the generator output.
    // These are not part of the public API.
    _deltaCategoryRanges(name) {
        return _DELTA_CATEGORIES.get(name);
    },
    _deltaScriptRanges(name) {
        return _DELTA_SCRIPTS.get(name);
    },
    _newScriptTable(name) {
        return _NEW_SCRIPTS.get(name);
    },
};
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
declare const EMPTY_BEGIN_LINE = 1;
|
|
2
|
+
declare const EMPTY_END_LINE = 2;
|
|
3
|
+
declare const EMPTY_BEGIN_TEXT = 4;
|
|
4
|
+
declare const EMPTY_END_TEXT = 8;
|
|
5
|
+
declare const EMPTY_WORD_BOUNDARY = 16;
|
|
6
|
+
declare const EMPTY_NO_WORD_BOUNDARY = 32;
|
|
7
|
+
declare function emptyInts(): number[];
|
|
8
|
+
declare function isalnum(c: number): boolean;
|
|
9
|
+
declare function unhex(c: number): number;
|
|
10
|
+
declare function stringToRunes(str: string): number[];
|
|
11
|
+
declare function runeToString(r: number): string;
|
|
12
|
+
declare function emptyOpContext(r1: number, r2: number): number;
|
|
13
|
+
/**
|
|
14
|
+
* Returns a string that quotes all regular expression metacharacters inside the argument text;
|
|
15
|
+
* the returned string is a regular expression matching the literal text. For example,
|
|
16
|
+
* `quoteMeta("[foo]") === "\\[foo\\]"`.
|
|
17
|
+
* @param {string} str
|
|
18
|
+
* @returns {string}
|
|
19
|
+
*/
|
|
20
|
+
declare function quoteMeta(str: string): string;
|
|
21
|
+
declare function charCount(codePoint: number): number;
|
|
22
|
+
export { emptyInts, runeToString, emptyOpContext, charCount, stringToRunes, isalnum, unhex, quoteMeta, EMPTY_BEGIN_LINE, EMPTY_END_LINE, EMPTY_WORD_BOUNDARY, EMPTY_BEGIN_TEXT, EMPTY_END_TEXT, EMPTY_NO_WORD_BOUNDARY, };
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import { MAX_BMP } from "./Unicode.js";
|
|
2
|
+
import { codePointAtOrThrow } from "./chars.js";
|
|
3
|
+
/**
|
|
4
|
+
* Various constants and helper utilities.
|
|
5
|
+
*/
|
|
6
|
+
// Every character that has a special meaning in a regular expression.
const METACHARACTERS = "\\.+*?()|[]{}^$";
//// EMPTY_* flags: one bit each, so they can be OR-ed into a single mask.
const EMPTY_BEGIN_LINE = 1 << 0;
const EMPTY_END_LINE = 1 << 1;
const EMPTY_BEGIN_TEXT = 1 << 2;
const EMPTY_END_TEXT = 1 << 3;
const EMPTY_WORD_BOUNDARY = 1 << 4;
const EMPTY_NO_WORD_BOUNDARY = 1 << 5;
|
|
14
|
+
// Returns a fresh, empty array on every call.
function emptyInts() {
    return Array.of();
}
|
|
17
|
+
// Reports whether the code point |c| is an ASCII decimal digit (0-9) or an
// ASCII letter (a-z, A-Z).
function isalnum(c) {
    const isDigit = 0x30 <= c && c <= 0x39;
    const isLower = 0x61 <= c && c <= 0x7a;
    const isUpper = 0x41 <= c && c <= 0x5a;
    return isDigit || isLower || isUpper;
}
|
|
23
|
+
// Returns the numeric value (0-15) of the ASCII hex digit with code point
// |c|, accepting both lower- and upper-case letters; returns -1 for anything
// else.
function unhex(c) {
    // [first code point, last code point, value of the first code point]
    const digitRanges = [
        [0x30, 0x39, 0], // '0'-'9'
        [0x61, 0x66, 10], // 'a'-'f'
        [0x41, 0x46, 10], // 'A'-'F'
    ];
    for (const [first, last, valueOfFirst] of digitRanges) {
        if (first <= c && c <= last) {
            return c - first + valueOfFirst;
        }
    }
    return -1;
}
|
|
36
|
+
// Converts |str| (coerced with String()) into an array of code points, one
// entry per code point rather than per UTF-16 code unit.
function stringToRunes(str) {
    const text = String(str);
    return Array.from(text, (ch) => codePointAtOrThrow(ch, 0));
}
|
|
40
|
+
// Returns the string consisting of the single code point |r|.
function runeToString(r) {
    const text = String.fromCodePoint(r);
    return text;
}
|
|
44
|
+
// isWordRune reports whether r is considered a "word character" during the
// evaluation of the \b and \B zero-width assertions.
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
function isWordRune(r) {
    if (r === 0x5f) {
        return true; // '_'
    }
    const isLower = 0x61 <= r && r <= 0x7a;
    const isUpper = 0x41 <= r && r <= 0x5a;
    const isDigit = 0x30 <= r && r <= 0x39;
    return isLower || isUpper || isDigit;
}
|
|
53
|
+
// emptyOpContext returns, as a bitmask of EMPTY_* flags, the zero-width
// assertions that hold at the position between the runes r1 and r2.
// A negative r1 means the position is at the beginning of the text; a
// negative r2 means it is at the end of the text. A newline (0x0a) on either
// side satisfies the corresponding begin-/end-of-line assertion. Exactly one
// of the two word-boundary flags is always set.
// eslint-disable-next-line no-warning-comments
// TODO(adonovan): move to Machine.
function emptyOpContext(r1, r2) {
    const NEWLINE = 0x0a;
    const beginFlags = (r1 < 0 ? EMPTY_BEGIN_TEXT | EMPTY_BEGIN_LINE : 0) |
        (r1 === NEWLINE ? EMPTY_BEGIN_LINE : 0);
    const endFlags = (r2 < 0 ? EMPTY_END_TEXT | EMPTY_END_LINE : 0) |
        (r2 === NEWLINE ? EMPTY_END_LINE : 0);
    const wordFlag = isWordRune(r1) !== isWordRune(r2)
        ? EMPTY_WORD_BOUNDARY
        : EMPTY_NO_WORD_BOUNDARY;
    return beginFlags | endFlags | wordFlag;
}
|
|
82
|
+
/**
 * Escapes every regular expression metacharacter in the argument text with a
 * backslash, so the returned string is a regular expression that matches the
 * text literally. For example, `quoteMeta("[foo]") === "\\[foo\\]"`.
 * @param {string} str
 * @returns {string}
 */
function quoteMeta(str) {
    // Working per UTF-16 code unit is safe: every metacharacter is a single unit.
    const units = str.split("");
    let quoted = "";
    for (const unit of units) {
        const isMeta = METACHARACTERS.indexOf(unit) >= 0;
        quoted += isMeta ? `\\${unit}` : unit;
    }
    return quoted;
}
|
|
100
|
+
// Returns 2 when |codePoint| is greater than MAX_BMP and 1 otherwise.
function charCount(codePoint) {
    if (codePoint > MAX_BMP) {
        return 2;
    }
    return 1;
}
|
|
103
|
+
export { emptyInts, runeToString, emptyOpContext, charCount, stringToRunes, isalnum, unhex, quoteMeta, EMPTY_BEGIN_LINE, EMPTY_END_LINE, EMPTY_WORD_BOUNDARY, EMPTY_BEGIN_TEXT, EMPTY_END_TEXT, EMPTY_NO_WORD_BOUNDARY, };
|