@shd101wyy/yo 0.0.28 → 0.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,365 @@
1
+ // std/regex/unicode.yo - Unicode property ranges for \p{...} support
2
+ //
3
+ // Provides character ranges for common Unicode general categories.
4
+ // Uses compact range representation covering the most commonly used
5
+ // Unicode blocks. Not exhaustive but covers practical use cases.
6
+
7
+ open import "std/collections/array_list";
8
+ open import "std/string";
9
+ { CharRange } :: import "./node.yo";
10
+
11
// Appends a compact approximation of the Unicode Letter (L) general category
// to `rs`.
//
// The table is deliberately not exhaustive: it lists the letters of the most
// commonly used scripts only. Ranges are pushed in the order written below,
// which is not sorted by code point.
//
// Within the covered scripts, code points that are not letters are excluded
// (e.g. U+03F6 is a math symbol, U+3099-U+309C are sound marks, U+30A0 and
// U+30FB are punctuation) so that \p{L} does not produce false positives.
_add_letter_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  // Basic Latin: A-Z, a-z
  rs.push(CharRange(low: u32(0x0041), high: u32(0x005A)));
  rs.push(CharRange(low: u32(0x0061), high: u32(0x007A)));
  // Latin-1 Supplement and Latin Extended-A/B (skipping U+00D7 and U+00F7,
  // the multiplication and division signs)
  rs.push(CharRange(low: u32(0x00C0), high: u32(0x00D6)));
  rs.push(CharRange(low: u32(0x00D8), high: u32(0x00F6)));
  rs.push(CharRange(low: u32(0x00F8), high: u32(0x024F)));
  // Greek and Coptic
  rs.push(CharRange(low: u32(0x0370), high: u32(0x0373)));
  rs.push(CharRange(low: u32(0x0376), high: u32(0x0377)));
  rs.push(CharRange(low: u32(0x037B), high: u32(0x037D)));
  rs.push(CharRange(low: u32(0x0386), high: u32(0x0386)));
  rs.push(CharRange(low: u32(0x0388), high: u32(0x038A)));
  rs.push(CharRange(low: u32(0x038C), high: u32(0x038C)));
  rs.push(CharRange(low: u32(0x038E), high: u32(0x03A1)));
  // Split around U+03F6 GREEK REVERSED LUNATE EPSILON SYMBOL (category Sm)
  rs.push(CharRange(low: u32(0x03A3), high: u32(0x03F5)));
  rs.push(CharRange(low: u32(0x03F7), high: u32(0x03FF)));
  // Cyrillic (skipping the sign/combining code points U+0482-U+0489)
  rs.push(CharRange(low: u32(0x0400), high: u32(0x0481)));
  rs.push(CharRange(low: u32(0x048A), high: u32(0x052F)));
  // Armenian
  rs.push(CharRange(low: u32(0x0531), high: u32(0x0556)));
  rs.push(CharRange(low: u32(0x0560), high: u32(0x0588)));
  // Hebrew
  rs.push(CharRange(low: u32(0x05D0), high: u32(0x05EA)));
  // Arabic
  rs.push(CharRange(low: u32(0x0620), high: u32(0x064A)));
  rs.push(CharRange(low: u32(0x066E), high: u32(0x066F)));
  rs.push(CharRange(low: u32(0x0671), high: u32(0x06D3)));
  // Devanagari
  rs.push(CharRange(low: u32(0x0904), high: u32(0x0939)));
  rs.push(CharRange(low: u32(0x0958), high: u32(0x0961)));
  // Thai (U+0E31 is a combining mark and is skipped)
  rs.push(CharRange(low: u32(0x0E01), high: u32(0x0E30)));
  rs.push(CharRange(low: u32(0x0E32), high: u32(0x0E33)));
  // CJK Unified Ideographs
  rs.push(CharRange(low: u32(0x4E00), high: u32(0x9FFF)));
  // Hangul Syllables
  rs.push(CharRange(low: u32(0xAC00), high: u32(0xD7A3)));
  // Hiragana letters only: U+3040 and U+3097-U+3098 are unassigned,
  // U+3099-U+309C are (semi-)voiced sound marks, not letters
  rs.push(CharRange(low: u32(0x3041), high: u32(0x3096)));
  rs.push(CharRange(low: u32(0x309D), high: u32(0x309F)));
  // Katakana letters only: U+30A0 (double hyphen) and U+30FB (middle dot)
  // are punctuation
  rs.push(CharRange(low: u32(0x30A1), high: u32(0x30FA)));
  rs.push(CharRange(low: u32(0x30FC), high: u32(0x30FF)));
  // CJK Extension A
  rs.push(CharRange(low: u32(0x3400), high: u32(0x4DBF)));
  // CJK Extension B
  rs.push(CharRange(low: u32(0x20000), high: u32(0x2A6DF)));
});
59
+
60
// Appends a sample of Uppercase_Letter (Lu) ranges to `rs`: Basic Latin,
// Latin-1, the first few Latin Extended-A capitals, Greek and Cyrillic.
// Ranges are appended in the order they are declared below.
_add_uppercase_letter_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  // Basic Latin A-Z
  ascii_caps := CharRange(low: u32(0x0041), high: u32(0x005A));
  rs.push(ascii_caps);

  // Latin-1 capitals, split around U+00D7 (multiplication sign)
  latin1_caps_before_times := CharRange(low: u32(0x00C0), high: u32(0x00D6));
  latin1_caps_after_times := CharRange(low: u32(0x00D8), high: u32(0x00DE));
  rs.push(latin1_caps_before_times);
  rs.push(latin1_caps_after_times);

  // Latin Extended-A alternates upper/lower case, so each capital is a
  // single-code-point range.
  a_macron := CharRange(low: u32(0x0100), high: u32(0x0100));
  a_breve := CharRange(low: u32(0x0102), high: u32(0x0102));
  a_ogonek := CharRange(low: u32(0x0104), high: u32(0x0104));
  c_acute := CharRange(low: u32(0x0106), high: u32(0x0106));
  rs.push(a_macron);
  rs.push(a_breve);
  rs.push(a_ogonek);
  rs.push(c_acute);

  // Greek capitals Alpha-Rho and Sigma-Omega (U+03A2 is unassigned)
  greek_alpha_to_rho := CharRange(low: u32(0x0391), high: u32(0x03A1));
  greek_sigma_to_omega := CharRange(low: u32(0x03A3), high: u32(0x03A9));
  rs.push(greek_alpha_to_rho);
  rs.push(greek_sigma_to_omega);

  // Basic Cyrillic capitals
  cyrillic_caps := CharRange(low: u32(0x0410), high: u32(0x042F));
  rs.push(cyrillic_caps);
});
72
+
73
// Appends a sample of Lowercase_Letter (Ll) ranges to `rs`: Basic Latin,
// Latin-1, the first few Latin Extended-A small letters, Greek and Cyrillic.
//
// The Latin Extended-A entries are the lowercase partners of the capitals
// listed in the uppercase table (U+0100, U+0102, U+0104, U+0106), so that a
// letter matched by \p{Lu} has its lowercase form matched by \p{Ll}.
_add_lowercase_letter_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  // Basic Latin a-z
  rs.push(CharRange(low: u32(0x0061), high: u32(0x007A)));
  // Latin-1 small letters, split around U+00F7 (division sign)
  rs.push(CharRange(low: u32(0x00DF), high: u32(0x00F6)));
  rs.push(CharRange(low: u32(0x00F8), high: u32(0x00FF)));
  // Latin Extended-A: a with macron, a with breve, a with ogonek, c with acute
  rs.push(CharRange(low: u32(0x0101), high: u32(0x0101)));
  rs.push(CharRange(low: u32(0x0103), high: u32(0x0103)));
  rs.push(CharRange(low: u32(0x0105), high: u32(0x0105)));
  rs.push(CharRange(low: u32(0x0107), high: u32(0x0107)));
  // Greek small letters alpha-omega (includes final sigma U+03C2)
  rs.push(CharRange(low: u32(0x03B1), high: u32(0x03C9)));
  // Basic Cyrillic small letters
  rs.push(CharRange(low: u32(0x0430), high: u32(0x044F)));
});
83
+
84
// Appends a compact approximation of the Unicode Number (N) general category
// to `rs`.
//
// N is the union of Nd (decimal digits), Nl and No, so every range in the
// decimal-digit table must also appear here; otherwise a character could match
// \p{Nd} without matching \p{N}.
_add_number_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  // ASCII digits
  rs.push(CharRange(low: u32(0x0030), high: u32(0x0039)));
  // Latin-1 superscript digits (two, three, one)
  rs.push(CharRange(low: u32(0x00B2), high: u32(0x00B3)));
  rs.push(CharRange(low: u32(0x00B9), high: u32(0x00B9)));
  // Latin-1 vulgar fractions (1/4, 1/2, 3/4)
  rs.push(CharRange(low: u32(0x00BC), high: u32(0x00BE)));
  // Arabic-Indic and Extended Arabic-Indic digits
  rs.push(CharRange(low: u32(0x0660), high: u32(0x0669)));
  rs.push(CharRange(low: u32(0x06F0), high: u32(0x06F9)));
  // Devanagari digits
  rs.push(CharRange(low: u32(0x0966), high: u32(0x096F)));
  // Bengali digits
  rs.push(CharRange(low: u32(0x09E6), high: u32(0x09EF)));
  // Gurmukhi digits
  rs.push(CharRange(low: u32(0x0A66), high: u32(0x0A6F)));
  // Fullwidth digits
  rs.push(CharRange(low: u32(0xFF10), high: u32(0xFF19)));
});
99
+
100
// Appends Decimal_Number (Nd) ranges for a handful of scripts to `rs`.
// Each range is one run of ten digits, 0 through 9.
_add_digit_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  ascii_digits := CharRange(low: u32(0x0030), high: u32(0x0039));
  arabic_indic_digits := CharRange(low: u32(0x0660), high: u32(0x0669));
  extended_arabic_indic_digits := CharRange(low: u32(0x06F0), high: u32(0x06F9));
  devanagari_digits := CharRange(low: u32(0x0966), high: u32(0x096F));
  bengali_digits := CharRange(low: u32(0x09E6), high: u32(0x09EF));
  gurmukhi_digits := CharRange(low: u32(0x0A66), high: u32(0x0A6F));
  fullwidth_digits := CharRange(low: u32(0xFF10), high: u32(0xFF19));

  // Append in ascending code-point order.
  rs.push(ascii_digits);
  rs.push(arabic_indic_digits);
  rs.push(extended_arabic_indic_digits);
  rs.push(devanagari_digits);
  rs.push(bengali_digits);
  rs.push(gurmukhi_digits);
  rs.push(fullwidth_digits);
});
109
+
110
// Appends a compact approximation of the Unicode Punctuation (P) general
// category to `rs`: ASCII, Latin-1, General Punctuation, CJK punctuation and
// the fullwidth forms of the ASCII punctuation.
//
// Symbol characters that sit between punctuation marks ($, +, <, =, >, ^, `,
// |, ~ and their fullwidth forms) belong to category S and are skipped here.
_add_punctuation_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  // ASCII: ! " #   % & ' ( ) *   , - . /   : ;   ? @   [ \ ]   _   {   }
  rs.push(CharRange(low: u32(0x0021), high: u32(0x0023)));
  rs.push(CharRange(low: u32(0x0025), high: u32(0x002A)));
  rs.push(CharRange(low: u32(0x002C), high: u32(0x002F)));
  rs.push(CharRange(low: u32(0x003A), high: u32(0x003B)));
  rs.push(CharRange(low: u32(0x003F), high: u32(0x0040)));
  rs.push(CharRange(low: u32(0x005B), high: u32(0x005D)));
  rs.push(CharRange(low: u32(0x005F), high: u32(0x005F)));
  rs.push(CharRange(low: u32(0x007B), high: u32(0x007B)));
  rs.push(CharRange(low: u32(0x007D), high: u32(0x007D)));
  // Latin-1: inverted !, section sign, guillemets, pilcrow, middle dot,
  // inverted ?
  rs.push(CharRange(low: u32(0x00A1), high: u32(0x00A1)));
  rs.push(CharRange(low: u32(0x00A7), high: u32(0x00A7)));
  rs.push(CharRange(low: u32(0x00AB), high: u32(0x00AB)));
  rs.push(CharRange(low: u32(0x00B6), high: u32(0x00B7)));
  rs.push(CharRange(low: u32(0x00BB), high: u32(0x00BB)));
  rs.push(CharRange(low: u32(0x00BF), high: u32(0x00BF)));
  // General Punctuation: dashes, quotes, daggers, bullets, ellipsis, ...
  rs.push(CharRange(low: u32(0x2010), high: u32(0x2027)));
  rs.push(CharRange(low: u32(0x2030), high: u32(0x2043)));
  // CJK Symbols and Punctuation: ideographic comma/full stop, brackets
  rs.push(CharRange(low: u32(0x3001), high: u32(0x3003)));
  rs.push(CharRange(low: u32(0x3008), high: u32(0x3011)));
  // Fullwidth forms, mirroring the ASCII rows above. U+FF04 (fullwidth
  // dollar sign, Sc) and U+FF0B (fullwidth plus sign, Sm) are symbols, so
  // the run U+FF01-U+FF0F is split around them.
  rs.push(CharRange(low: u32(0xFF01), high: u32(0xFF03)));
  rs.push(CharRange(low: u32(0xFF05), high: u32(0xFF0A)));
  rs.push(CharRange(low: u32(0xFF0C), high: u32(0xFF0F)));
  rs.push(CharRange(low: u32(0xFF1A), high: u32(0xFF1B)));
  rs.push(CharRange(low: u32(0xFF1F), high: u32(0xFF20)));
});
134
+
135
// Appends a compact approximation of the Unicode Symbol (S) general category
// to `rs`: the ASCII and Latin-1 symbol characters followed by four whole
// symbol blocks.
_add_symbol_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  // --- ASCII symbols ---
  dollar := CharRange(low: u32(0x0024), high: u32(0x0024));
  plus := CharRange(low: u32(0x002B), high: u32(0x002B));
  less_equals_greater := CharRange(low: u32(0x003C), high: u32(0x003E));
  circumflex := CharRange(low: u32(0x005E), high: u32(0x005E));
  grave_accent := CharRange(low: u32(0x0060), high: u32(0x0060));
  vertical_bar := CharRange(low: u32(0x007C), high: u32(0x007C));
  tilde := CharRange(low: u32(0x007E), high: u32(0x007E));
  rs.push(dollar);
  rs.push(plus);
  rs.push(less_equals_greater);
  rs.push(circumflex);
  rs.push(grave_accent);
  rs.push(vertical_bar);
  rs.push(tilde);

  // --- Latin-1 symbols ---
  cent_to_broken_bar := CharRange(low: u32(0x00A2), high: u32(0x00A6));
  diaeresis_and_copyright := CharRange(low: u32(0x00A8), high: u32(0x00A9));
  not_sign := CharRange(low: u32(0x00AC), high: u32(0x00AC));
  registered_to_plus_minus := CharRange(low: u32(0x00AE), high: u32(0x00B1));
  rs.push(cent_to_broken_bar);
  rs.push(diaeresis_and_copyright);
  rs.push(not_sign);
  rs.push(registered_to_plus_minus);

  // --- Whole blocks ---
  arrows_block := CharRange(low: u32(0x2190), high: u32(0x21FF));
  math_operators_block := CharRange(low: u32(0x2200), high: u32(0x22FF));
  misc_symbols_block := CharRange(low: u32(0x2600), high: u32(0x26FF));
  dingbats_block := CharRange(low: u32(0x2700), high: u32(0x27BF));
  rs.push(arrows_block);
  rs.push(math_operators_block);
  rs.push(misc_symbols_block);
  rs.push(dingbats_block);
});
152
+
153
// Appends the Unicode Separator (Z) general category to `rs`: the space
// separators plus the line and paragraph separators.
_add_separator_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  space := CharRange(low: u32(0x0020), high: u32(0x0020));
  no_break_space := CharRange(low: u32(0x00A0), high: u32(0x00A0));
  ogham_space_mark := CharRange(low: u32(0x1680), high: u32(0x1680));
  // En quad through hair space
  typographic_spaces := CharRange(low: u32(0x2000), high: u32(0x200A));
  // U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR
  line_and_paragraph_separators := CharRange(low: u32(0x2028), high: u32(0x2029));
  narrow_no_break_space := CharRange(low: u32(0x202F), high: u32(0x202F));
  medium_mathematical_space := CharRange(low: u32(0x205F), high: u32(0x205F));
  ideographic_space := CharRange(low: u32(0x3000), high: u32(0x3000));

  // Append in ascending code-point order.
  rs.push(space);
  rs.push(no_break_space);
  rs.push(ogham_space_mark);
  rs.push(typographic_spaces);
  rs.push(line_and_paragraph_separators);
  rs.push(narrow_no_break_space);
  rs.push(medium_mathematical_space);
  rs.push(ideographic_space);
});
163
+
164
// Appends a compact approximation of the Unicode Mark (M) general category to
// `rs`. Only combining characters are listed; base letters that sit inside a
// run of marks are skipped so that \p{M} does not match letters.
_add_mark_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  // Combining Diacritical Marks
  rs.push(CharRange(low: u32(0x0300), high: u32(0x036F)));
  // Devanagari nukta, dependent vowel signs and virama.
  // U+093D DEVANAGARI SIGN AVAGRAHA is a letter (Lo), so the run is split
  // around it.
  rs.push(CharRange(low: u32(0x093C), high: u32(0x093C)));
  rs.push(CharRange(low: u32(0x093E), high: u32(0x094F)));
  // Arabic combining marks (fathatan .. wavy hamza below)
  rs.push(CharRange(low: u32(0x064B), high: u32(0x065F)));
  // Thai combining vowel signs
  rs.push(CharRange(low: u32(0x0E31), high: u32(0x0E31)));
  rs.push(CharRange(low: u32(0x0E34), high: u32(0x0E3A)));
  // Combining Half Marks
  rs.push(CharRange(low: u32(0xFE20), high: u32(0xFE2F)));
});
177
+
178
// Appends a compact approximation of the Unicode Other (C) general category
// to `rs`: control characters, surrogates and two runs of noncharacters.
_add_other_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  // Control characters: C0 set, DEL, then the C1 set
  c0_controls := CharRange(low: u32(0x0000), high: u32(0x001F));
  delete_char := CharRange(low: u32(0x007F), high: u32(0x007F));
  c1_controls := CharRange(low: u32(0x0080), high: u32(0x009F));
  rs.push(c0_controls);
  rs.push(delete_char);
  rs.push(c1_controls);

  // Surrogate code points cannot appear in well-formed UTF-8; they are
  // listed only so the table is complete.
  surrogates := CharRange(low: u32(0xD800), high: u32(0xDFFF));
  rs.push(surrogates);

  // Noncharacters: U+FDD0-U+FDEF and the last two code points of the BMP
  arabic_block_noncharacters := CharRange(low: u32(0xFDD0), high: u32(0xFDEF));
  bmp_tail_noncharacters := CharRange(low: u32(0xFFFE), high: u32(0xFFFF));
  rs.push(arabic_block_noncharacters);
  rs.push(bmp_tail_noncharacters);
});
191
+
192
// Appends the ranges used for the Latin script to `rs`: Basic Latin letters,
// Latin-1 letters, Latin Extended-A/B and Latin Extended Additional.
_add_latin_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  // A-Z and a-z
  ascii_upper := CharRange(low: u32(0x0041), high: u32(0x005A));
  ascii_lower := CharRange(low: u32(0x0061), high: u32(0x007A));
  rs.push(ascii_upper);
  rs.push(ascii_lower);

  // Latin-1 letters, skipping U+00D7 (multiplication sign) and U+00F7
  // (division sign); the last range runs on through Latin Extended-A and -B.
  latin1_before_times := CharRange(low: u32(0x00C0), high: u32(0x00D6));
  latin1_between_signs := CharRange(low: u32(0x00D8), high: u32(0x00F6));
  latin1_tail_and_extended_ab := CharRange(low: u32(0x00F8), high: u32(0x024F));
  rs.push(latin1_before_times);
  rs.push(latin1_between_signs);
  rs.push(latin1_tail_and_extended_ab);

  // Latin Extended Additional
  latin_extended_additional := CharRange(low: u32(0x1E00), high: u32(0x1EFF));
  rs.push(latin_extended_additional);
});
200
+
201
// Appends the ranges used for the Han script to `rs`. Each range is a whole
// Unicode block; they are appended in the order declared (not sorted by
// code point).
_add_han_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  // Ideograph blocks
  cjk_unified := CharRange(low: u32(0x4E00), high: u32(0x9FFF));
  cjk_extension_a := CharRange(low: u32(0x3400), high: u32(0x4DBF));
  cjk_extension_b := CharRange(low: u32(0x20000), high: u32(0x2A6DF));
  cjk_compatibility := CharRange(low: u32(0xF900), high: u32(0xFAFF));
  rs.push(cjk_unified);
  rs.push(cjk_extension_a);
  rs.push(cjk_extension_b);
  rs.push(cjk_compatibility);

  // Radical blocks
  cjk_radicals_supplement := CharRange(low: u32(0x2E80), high: u32(0x2EFF));
  kangxi_radicals := CharRange(low: u32(0x2F00), high: u32(0x2FDF));
  rs.push(cjk_radicals_supplement);
  rs.push(kangxi_radicals);
});
215
+
216
// Appends a block-level approximation of Emoji to `rs`. Every range is a
// whole Unicode block that contains emoji; ranges are appended in the order
// declared (not sorted by code point).
_add_emoji_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  // BMP blocks
  misc_symbols := CharRange(low: u32(0x2600), high: u32(0x26FF));
  dingbats := CharRange(low: u32(0x2700), high: u32(0x27BF));
  rs.push(misc_symbols);
  rs.push(dingbats);

  // Supplementary-plane blocks
  emoticons := CharRange(low: u32(0x1F600), high: u32(0x1F64F));
  misc_symbols_and_pictographs := CharRange(low: u32(0x1F300), high: u32(0x1F5FF));
  transport_and_map := CharRange(low: u32(0x1F680), high: u32(0x1F6FF));
  supplemental_symbols_and_pictographs := CharRange(low: u32(0x1F900), high: u32(0x1F9FF));
  rs.push(emoticons);
  rs.push(misc_symbols_and_pictographs);
  rs.push(transport_and_map);
  rs.push(supplemental_symbols_and_pictographs);

  // U+1FA00-U+1FA6F (Chess Symbols block) followed by
  // U+1FA70-U+1FAFF (Symbols and Pictographs Extended-A block)
  chess_symbols := CharRange(low: u32(0x1FA00), high: u32(0x1FA6F));
  symbols_and_pictographs_ext_a := CharRange(low: u32(0x1FA70), high: u32(0x1FAFF));
  rs.push(chess_symbols);
  rs.push(symbols_and_pictographs_ext_a);
});
233
+
234
// Appends the code points of the Unicode White_Space property to `rs`, in
// ascending code-point order.
_add_whitespace_ranges :: (fn(rs : ArrayList(CharRange)) -> unit)({
  // Control characters that count as white space
  tab_through_carriage_return := CharRange(low: u32(0x0009), high: u32(0x000D));
  space := CharRange(low: u32(0x0020), high: u32(0x0020));
  next_line := CharRange(low: u32(0x0085), high: u32(0x0085));
  no_break_space := CharRange(low: u32(0x00A0), high: u32(0x00A0));
  rs.push(tab_through_carriage_return);
  rs.push(space);
  rs.push(next_line);
  rs.push(no_break_space);

  // Spaces and separators outside Latin-1
  ogham_space_mark := CharRange(low: u32(0x1680), high: u32(0x1680));
  typographic_spaces := CharRange(low: u32(0x2000), high: u32(0x200A));
  line_and_paragraph_separators := CharRange(low: u32(0x2028), high: u32(0x2029));
  narrow_no_break_space := CharRange(low: u32(0x202F), high: u32(0x202F));
  medium_mathematical_space := CharRange(low: u32(0x205F), high: u32(0x205F));
  ideographic_space := CharRange(low: u32(0x3000), high: u32(0x3000));
  rs.push(ogham_space_mark);
  rs.push(typographic_spaces);
  rs.push(line_and_paragraph_separators);
  rs.push(narrow_no_break_space);
  rs.push(medium_mathematical_space);
  rs.push(ideographic_space);
});
246
+
247
// Unicode property range lookup for \p{...}.
//
// `name` is compared exactly (case-sensitive) against the names below:
//   - General categories, short or long name:
//       L / Letter, Lu / Uppercase_Letter, Ll / Lowercase_Letter,
//       N / Number, Nd / Decimal_Number (plus the alias Digit),
//       P / Punctuation, S / Symbol, Z / Separator, M / Mark, C / Other
//   - Scripts and blocks, capitalised or all-lowercase:
//       ASCII, Latin, Greek, Cyrillic, Han, Hiragana, Katakana, Hangul,
//       Arabic, Devanagari, Thai, Emoji
//   - Boolean properties:
//       White_Space (alias space), Alphabetic (alias Alpha)
//
// Returns .Some(ranges) with a freshly built list for a known name and .None
// for any other name. The ranges are compact approximations, not the full
// Unicode tables.
unicode_property_ranges :: (fn(name : String) -> Option(ArrayList(CharRange)))({
  rs := ArrayList(CharRange).new();
  // Stays true unless the fall-through arm at the end of the cond runs.
  (found : bool) = true;

  cond(
    // === General Category: Letter (L) ===
    ((name == `L`) || (name == `Letter`)) => {
      _add_letter_ranges(rs);
    },
    ((name == `Lu`) || (name == `Uppercase_Letter`)) => {
      _add_uppercase_letter_ranges(rs);
    },
    ((name == `Ll`) || (name == `Lowercase_Letter`)) => {
      _add_lowercase_letter_ranges(rs);
    },

    // === General Category: Number (N) ===
    ((name == `N`) || (name == `Number`)) => {
      _add_number_ranges(rs);
    },
    // `Decimal_Number` is the official long name of Nd; `Digit` is kept as
    // an accepted alias for backward compatibility.
    (((name == `Nd`) || (name == `Decimal_Number`)) || (name == `Digit`)) => {
      _add_digit_ranges(rs);
    },

    // === General Category: Punctuation (P) ===
    ((name == `P`) || (name == `Punctuation`)) => {
      _add_punctuation_ranges(rs);
    },

    // === General Category: Symbol (S) ===
    ((name == `S`) || (name == `Symbol`)) => {
      _add_symbol_ranges(rs);
    },

    // === General Category: Separator (Z) ===
    ((name == `Z`) || (name == `Separator`)) => {
      _add_separator_ranges(rs);
    },

    // === General Category: Mark (M) ===
    ((name == `M`) || (name == `Mark`)) => {
      _add_mark_ranges(rs);
    },

    // === General Category: Other (C) ===
    ((name == `C`) || (name == `Other`)) => {
      _add_other_ranges(rs);
    },

    // === Scripts and blocks ===
    ((name == `ASCII`) || (name == `ascii`)) => {
      rs.push(CharRange(low: u32(0x0000), high: u32(0x007F)));
    },
    ((name == `Latin`) || (name == `latin`)) => {
      _add_latin_ranges(rs);
    },
    ((name == `Greek`) || (name == `greek`)) => {
      // Greek and Coptic block, then Greek Extended
      rs.push(CharRange(low: u32(0x0370), high: u32(0x03FF)));
      rs.push(CharRange(low: u32(0x1F00), high: u32(0x1FFF)));
    },
    ((name == `Cyrillic`) || (name == `cyrillic`)) => {
      // Cyrillic block, then Cyrillic Supplement
      rs.push(CharRange(low: u32(0x0400), high: u32(0x04FF)));
      rs.push(CharRange(low: u32(0x0500), high: u32(0x052F)));
    },
    ((name == `Han`) || (name == `han`)) => {
      _add_han_ranges(rs);
    },
    ((name == `Hiragana`) || (name == `hiragana`)) => {
      rs.push(CharRange(low: u32(0x3040), high: u32(0x309F)));
    },
    ((name == `Katakana`) || (name == `katakana`)) => {
      // Katakana block, then Katakana Phonetic Extensions
      rs.push(CharRange(low: u32(0x30A0), high: u32(0x30FF)));
      rs.push(CharRange(low: u32(0x31F0), high: u32(0x31FF)));
    },
    ((name == `Hangul`) || (name == `hangul`)) => {
      // Hangul Syllables, Hangul Jamo, Hangul Compatibility Jamo
      rs.push(CharRange(low: u32(0xAC00), high: u32(0xD7AF)));
      rs.push(CharRange(low: u32(0x1100), high: u32(0x11FF)));
      rs.push(CharRange(low: u32(0x3130), high: u32(0x318F)));
    },
    ((name == `Arabic`) || (name == `arabic`)) => {
      // Arabic block, then Arabic Supplement
      rs.push(CharRange(low: u32(0x0600), high: u32(0x06FF)));
      rs.push(CharRange(low: u32(0x0750), high: u32(0x077F)));
    },
    ((name == `Devanagari`) || (name == `devanagari`)) => {
      rs.push(CharRange(low: u32(0x0900), high: u32(0x097F)));
    },
    ((name == `Thai`) || (name == `thai`)) => {
      rs.push(CharRange(low: u32(0x0E00), high: u32(0x0E7F)));
    },
    ((name == `Emoji`) || (name == `emoji`)) => {
      _add_emoji_ranges(rs);
    },

    // === Boolean properties ===
    ((name == `White_Space`) || (name == `space`)) => {
      _add_whitespace_ranges(rs);
    },
    // Alphabetic is approximated with the Letter table.
    ((name == `Alphabetic`) || (name == `Alpha`)) => {
      _add_letter_ranges(rs);
    },

    // Unknown property name: report it through `found` so the caller
    // receives .None instead of an empty range list.
    true => {
      found = false;
    }
  );

  cond(
    found => .Some(rs),
    true => .None
  )
});
362
+
363
+ export
364
+ unicode_property_ranges
365
+ ;