quickjs 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE +21 -0
  4. data/Rakefile +22 -0
  5. data/ext/quickjsrb/extconf.rb +45 -0
  6. data/ext/quickjsrb/quickjs/LICENSE +22 -0
  7. data/ext/quickjsrb/quickjs/cutils.c +631 -0
  8. data/ext/quickjsrb/quickjs/cutils.h +347 -0
  9. data/ext/quickjsrb/quickjs/libbf.c +8475 -0
  10. data/ext/quickjsrb/quickjs/libbf.h +535 -0
  11. data/ext/quickjsrb/quickjs/libregexp-opcode.h +57 -0
  12. data/ext/quickjsrb/quickjs/libregexp.c +2501 -0
  13. data/ext/quickjsrb/quickjs/libregexp.h +55 -0
  14. data/ext/quickjsrb/quickjs/libunicode-table.h +4557 -0
  15. data/ext/quickjsrb/quickjs/libunicode.c +1910 -0
  16. data/ext/quickjsrb/quickjs/libunicode.h +182 -0
  17. data/ext/quickjsrb/quickjs/list.h +99 -0
  18. data/ext/quickjsrb/quickjs/qjs.c +564 -0
  19. data/ext/quickjsrb/quickjs/qjsc.c +761 -0
  20. data/ext/quickjsrb/quickjs/qjscalc.c +4005 -0
  21. data/ext/quickjsrb/quickjs/quickjs-atom.h +273 -0
  22. data/ext/quickjsrb/quickjs/quickjs-libc.c +4052 -0
  23. data/ext/quickjsrb/quickjs/quickjs-libc.h +60 -0
  24. data/ext/quickjsrb/quickjs/quickjs-opcode.h +372 -0
  25. data/ext/quickjsrb/quickjs/quickjs.c +55978 -0
  26. data/ext/quickjsrb/quickjs/quickjs.h +1087 -0
  27. data/ext/quickjsrb/quickjs/repl.c +2057 -0
  28. data/ext/quickjsrb/quickjs/run-test262.c +2216 -0
  29. data/ext/quickjsrb/quickjs/unicode_gen.c +3225 -0
  30. data/ext/quickjsrb/quickjs/unicode_gen_def.h +291 -0
  31. data/ext/quickjsrb/quickjsrb.c +105 -0
  32. data/ext/quickjsrb/quickjsrb.h +14 -0
  33. data/lib/quickjs/version.rb +5 -0
  34. data/lib/quickjs.rb +28 -0
  35. data/sig/quickjs.rbs +4 -0
  36. metadata +81 -0
@@ -0,0 +1,1910 @@
1
+ /*
2
+ * Unicode utilities
3
+ *
4
+ * Copyright (c) 2017-2018 Fabrice Bellard
5
+ *
6
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ * of this software and associated documentation files (the "Software"), to deal
8
+ * in the Software without restriction, including without limitation the rights
9
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ * copies of the Software, and to permit persons to whom the Software is
11
+ * furnished to do so, subject to the following conditions:
12
+ *
13
+ * The above copyright notice and this permission notice shall be included in
14
+ * all copies or substantial portions of the Software.
15
+ *
16
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ * THE SOFTWARE.
23
+ */
24
+ #include <stdlib.h>
25
+ #include <stdio.h>
26
+ #include <stdarg.h>
27
+ #include <string.h>
28
+ #include <assert.h>
29
+
30
+ #include "cutils.h"
31
+ #include "libunicode.h"
32
+ #include "libunicode-table.h"
33
+
34
+ enum {
35
+ RUN_TYPE_U,
36
+ RUN_TYPE_L,
37
+ RUN_TYPE_UF,
38
+ RUN_TYPE_LF,
39
+ RUN_TYPE_UL,
40
+ RUN_TYPE_LSU,
41
+ RUN_TYPE_U2L_399_EXT2,
42
+ RUN_TYPE_UF_D20,
43
+ RUN_TYPE_UF_D1_EXT,
44
+ RUN_TYPE_U_EXT,
45
+ RUN_TYPE_LF_EXT,
46
+ RUN_TYPE_UF_EXT2,
47
+ RUN_TYPE_LF_EXT2,
48
+ RUN_TYPE_UF_EXT3,
49
+ };
50
+
51
+ static int lre_case_conv1(uint32_t c, int conv_type)
52
+ {
53
+ uint32_t res[LRE_CC_RES_LEN_MAX];
54
+ lre_case_conv(res, c, conv_type);
55
+ return res[0];
56
+ }
57
+
58
+ /* case conversion using the table entry 'idx' with value 'v' */
59
+ static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v)
60
+ {
61
+ uint32_t code, data, type, a, is_lower;
62
+ is_lower = (conv_type != 0);
63
+ type = (v >> (32 - 17 - 7 - 4)) & 0xf;
64
+ data = ((v & 0xf) << 8) | case_conv_table2[idx];
65
+ code = v >> (32 - 17);
66
+ switch(type) {
67
+ case RUN_TYPE_U:
68
+ case RUN_TYPE_L:
69
+ case RUN_TYPE_UF:
70
+ case RUN_TYPE_LF:
71
+ if (conv_type == (type & 1) ||
72
+ (type >= RUN_TYPE_UF && conv_type == 2)) {
73
+ c = c - code + (case_conv_table1[data] >> (32 - 17));
74
+ }
75
+ break;
76
+ case RUN_TYPE_UL:
77
+ a = c - code;
78
+ if ((a & 1) != (1 - is_lower))
79
+ break;
80
+ c = (a ^ 1) + code;
81
+ break;
82
+ case RUN_TYPE_LSU:
83
+ a = c - code;
84
+ if (a == 1) {
85
+ c += 2 * is_lower - 1;
86
+ } else if (a == (1 - is_lower) * 2) {
87
+ c += (2 * is_lower - 1) * 2;
88
+ }
89
+ break;
90
+ case RUN_TYPE_U2L_399_EXT2:
91
+ if (!is_lower) {
92
+ res[0] = c - code + case_conv_ext[data >> 6];
93
+ res[1] = 0x399;
94
+ return 2;
95
+ } else {
96
+ c = c - code + case_conv_ext[data & 0x3f];
97
+ }
98
+ break;
99
+ case RUN_TYPE_UF_D20:
100
+ if (conv_type == 1)
101
+ break;
102
+ c = data + (conv_type == 2) * 0x20;
103
+ break;
104
+ case RUN_TYPE_UF_D1_EXT:
105
+ if (conv_type == 1)
106
+ break;
107
+ c = case_conv_ext[data] + (conv_type == 2);
108
+ break;
109
+ case RUN_TYPE_U_EXT:
110
+ case RUN_TYPE_LF_EXT:
111
+ if (is_lower != (type - RUN_TYPE_U_EXT))
112
+ break;
113
+ c = case_conv_ext[data];
114
+ break;
115
+ case RUN_TYPE_LF_EXT2:
116
+ if (!is_lower)
117
+ break;
118
+ res[0] = c - code + case_conv_ext[data >> 6];
119
+ res[1] = case_conv_ext[data & 0x3f];
120
+ return 2;
121
+ case RUN_TYPE_UF_EXT2:
122
+ if (conv_type == 1)
123
+ break;
124
+ res[0] = c - code + case_conv_ext[data >> 6];
125
+ res[1] = case_conv_ext[data & 0x3f];
126
+ if (conv_type == 2) {
127
+ /* convert to lower */
128
+ res[0] = lre_case_conv1(res[0], 1);
129
+ res[1] = lre_case_conv1(res[1], 1);
130
+ }
131
+ return 2;
132
+ default:
133
+ case RUN_TYPE_UF_EXT3:
134
+ if (conv_type == 1)
135
+ break;
136
+ res[0] = case_conv_ext[data >> 8];
137
+ res[1] = case_conv_ext[(data >> 4) & 0xf];
138
+ res[2] = case_conv_ext[data & 0xf];
139
+ if (conv_type == 2) {
140
+ /* convert to lower */
141
+ res[0] = lre_case_conv1(res[0], 1);
142
+ res[1] = lre_case_conv1(res[1], 1);
143
+ res[2] = lre_case_conv1(res[2], 1);
144
+ }
145
+ return 3;
146
+ }
147
+ res[0] = c;
148
+ return 1;
149
+ }
150
+
151
+ /* conv_type:
152
+ 0 = to upper
153
+ 1 = to lower
154
+ 2 = case folding (= to lower with modifications)
155
+ */
156
+ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
157
+ {
158
+ if (c < 128) {
159
+ if (conv_type) {
160
+ if (c >= 'A' && c <= 'Z') {
161
+ c = c - 'A' + 'a';
162
+ }
163
+ } else {
164
+ if (c >= 'a' && c <= 'z') {
165
+ c = c - 'a' + 'A';
166
+ }
167
+ }
168
+ } else {
169
+ uint32_t v, code, len;
170
+ int idx, idx_min, idx_max;
171
+
172
+ idx_min = 0;
173
+ idx_max = countof(case_conv_table1) - 1;
174
+ while (idx_min <= idx_max) {
175
+ idx = (unsigned)(idx_max + idx_min) / 2;
176
+ v = case_conv_table1[idx];
177
+ code = v >> (32 - 17);
178
+ len = (v >> (32 - 17 - 7)) & 0x7f;
179
+ if (c < code) {
180
+ idx_max = idx - 1;
181
+ } else if (c >= code + len) {
182
+ idx_min = idx + 1;
183
+ } else {
184
+ return lre_case_conv_entry(res, c, conv_type, idx, v);
185
+ }
186
+ }
187
+ }
188
+ res[0] = c;
189
+ return 1;
190
+ }
191
+
192
+ static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode)
193
+ {
194
+ uint32_t res[LRE_CC_RES_LEN_MAX];
195
+ int len;
196
+
197
+ if (is_unicode) {
198
+ len = lre_case_conv_entry(res, c, 2, idx, v);
199
+ if (len == 1) {
200
+ c = res[0];
201
+ } else {
202
+ /* handle the few specific multi-character cases (see
203
+ unicode_gen.c:dump_case_folding_special_cases()) */
204
+ if (c == 0xfb06) {
205
+ c = 0xfb05;
206
+ } else if (c == 0x01fd3) {
207
+ c = 0x390;
208
+ } else if (c == 0x01fe3) {
209
+ c = 0x3b0;
210
+ }
211
+ }
212
+ } else {
213
+ if (likely(c < 128)) {
214
+ if (c >= 'a' && c <= 'z')
215
+ c = c - 'a' + 'A';
216
+ } else {
217
+ /* legacy regexp: to upper case if single char >= 128 */
218
+ len = lre_case_conv_entry(res, c, FALSE, idx, v);
219
+ if (len == 1 && res[0] >= 128)
220
+ c = res[0];
221
+ }
222
+ }
223
+ return c;
224
+ }
225
+
226
+ /* JS regexp specific rules for case folding */
227
+ int lre_canonicalize(uint32_t c, BOOL is_unicode)
228
+ {
229
+ if (c < 128) {
230
+ /* fast case */
231
+ if (is_unicode) {
232
+ if (c >= 'A' && c <= 'Z') {
233
+ c = c - 'A' + 'a';
234
+ }
235
+ } else {
236
+ if (c >= 'a' && c <= 'z') {
237
+ c = c - 'a' + 'A';
238
+ }
239
+ }
240
+ } else {
241
+ uint32_t v, code, len;
242
+ int idx, idx_min, idx_max;
243
+
244
+ idx_min = 0;
245
+ idx_max = countof(case_conv_table1) - 1;
246
+ while (idx_min <= idx_max) {
247
+ idx = (unsigned)(idx_max + idx_min) / 2;
248
+ v = case_conv_table1[idx];
249
+ code = v >> (32 - 17);
250
+ len = (v >> (32 - 17 - 7)) & 0x7f;
251
+ if (c < code) {
252
+ idx_max = idx - 1;
253
+ } else if (c >= code + len) {
254
+ idx_min = idx + 1;
255
+ } else {
256
+ return lre_case_folding_entry(c, idx, v, is_unicode);
257
+ }
258
+ }
259
+ }
260
+ return c;
261
+ }
262
+
263
+ static uint32_t get_le24(const uint8_t *ptr)
264
+ {
265
+ return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
266
+ }
267
+
268
+ #define UNICODE_INDEX_BLOCK_LEN 32
269
+
270
+ /* return -1 if not in table, otherwise the offset in the block */
271
+ static int get_index_pos(uint32_t *pcode, uint32_t c,
272
+ const uint8_t *index_table, int index_table_len)
273
+ {
274
+ uint32_t code, v;
275
+ int idx_min, idx_max, idx;
276
+
277
+ idx_min = 0;
278
+ v = get_le24(index_table);
279
+ code = v & ((1 << 21) - 1);
280
+ if (c < code) {
281
+ *pcode = 0;
282
+ return 0;
283
+ }
284
+ idx_max = index_table_len - 1;
285
+ code = get_le24(index_table + idx_max * 3);
286
+ if (c >= code)
287
+ return -1;
288
+ /* invariant: tab[idx_min] <= c < tab2[idx_max] */
289
+ while ((idx_max - idx_min) > 1) {
290
+ idx = (idx_max + idx_min) / 2;
291
+ v = get_le24(index_table + idx * 3);
292
+ code = v & ((1 << 21) - 1);
293
+ if (c < code) {
294
+ idx_max = idx;
295
+ } else {
296
+ idx_min = idx;
297
+ }
298
+ }
299
+ v = get_le24(index_table + idx_min * 3);
300
+ *pcode = v & ((1 << 21) - 1);
301
+ return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21);
302
+ }
303
+
304
+ static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
305
+ const uint8_t *index_table, int index_table_len)
306
+ {
307
+ uint32_t code, b, bit;
308
+ int pos;
309
+ const uint8_t *p;
310
+
311
+ pos = get_index_pos(&code, c, index_table, index_table_len);
312
+ if (pos < 0)
313
+ return FALSE; /* outside the table */
314
+ p = table + pos;
315
+ bit = 0;
316
+ /* Compressed run length encoding:
317
+ 00..3F: 2 packed lengths: 3-bit + 3-bit
318
+ 40..5F: 5-bits plus extra byte for length
319
+ 60..7F: 5-bits plus 2 extra bytes for length
320
+ 80..FF: 7-bit length
321
+ lengths must be incremented to get character count
322
+ Ranges alternate between false and true return value.
323
+ */
324
+ for(;;) {
325
+ b = *p++;
326
+ if (b < 64) {
327
+ code += (b >> 3) + 1;
328
+ if (c < code)
329
+ return bit;
330
+ bit ^= 1;
331
+ code += (b & 7) + 1;
332
+ } else if (b >= 0x80) {
333
+ code += b - 0x80 + 1;
334
+ } else if (b < 0x60) {
335
+ code += (((b - 0x40) << 8) | p[0]) + 1;
336
+ p++;
337
+ } else {
338
+ code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
339
+ p += 2;
340
+ }
341
+ if (c < code)
342
+ return bit;
343
+ bit ^= 1;
344
+ }
345
+ }
346
+
347
+ BOOL lre_is_cased(uint32_t c)
348
+ {
349
+ uint32_t v, code, len;
350
+ int idx, idx_min, idx_max;
351
+
352
+ idx_min = 0;
353
+ idx_max = countof(case_conv_table1) - 1;
354
+ while (idx_min <= idx_max) {
355
+ idx = (unsigned)(idx_max + idx_min) / 2;
356
+ v = case_conv_table1[idx];
357
+ code = v >> (32 - 17);
358
+ len = (v >> (32 - 17 - 7)) & 0x7f;
359
+ if (c < code) {
360
+ idx_max = idx - 1;
361
+ } else if (c >= code + len) {
362
+ idx_min = idx + 1;
363
+ } else {
364
+ return TRUE;
365
+ }
366
+ }
367
+ return lre_is_in_table(c, unicode_prop_Cased1_table,
368
+ unicode_prop_Cased1_index,
369
+ sizeof(unicode_prop_Cased1_index) / 3);
370
+ }
371
+
372
+ BOOL lre_is_case_ignorable(uint32_t c)
373
+ {
374
+ return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
375
+ unicode_prop_Case_Ignorable_index,
376
+ sizeof(unicode_prop_Case_Ignorable_index) / 3);
377
+ }
378
+
379
+ /* character range */
380
+
381
+ static __maybe_unused void cr_dump(CharRange *cr)
382
+ {
383
+ int i;
384
+ for(i = 0; i < cr->len; i++)
385
+ printf("%d: 0x%04x\n", i, cr->points[i]);
386
+ }
387
+
388
+ static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
389
+ {
390
+ return realloc(ptr, size);
391
+ }
392
+
393
+ void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
394
+ {
395
+ cr->len = cr->size = 0;
396
+ cr->points = NULL;
397
+ cr->mem_opaque = mem_opaque;
398
+ cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc;
399
+ }
400
+
401
+ void cr_free(CharRange *cr)
402
+ {
403
+ cr->realloc_func(cr->mem_opaque, cr->points, 0);
404
+ }
405
+
406
+ int cr_realloc(CharRange *cr, int size)
407
+ {
408
+ int new_size;
409
+ uint32_t *new_buf;
410
+
411
+ if (size > cr->size) {
412
+ new_size = max_int(size, cr->size * 3 / 2);
413
+ new_buf = cr->realloc_func(cr->mem_opaque, cr->points,
414
+ new_size * sizeof(cr->points[0]));
415
+ if (!new_buf)
416
+ return -1;
417
+ cr->points = new_buf;
418
+ cr->size = new_size;
419
+ }
420
+ return 0;
421
+ }
422
+
423
+ int cr_copy(CharRange *cr, const CharRange *cr1)
424
+ {
425
+ if (cr_realloc(cr, cr1->len))
426
+ return -1;
427
+ memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len);
428
+ cr->len = cr1->len;
429
+ return 0;
430
+ }
431
+
432
+ /* merge consecutive intervals and remove empty intervals */
433
+ static void cr_compress(CharRange *cr)
434
+ {
435
+ int i, j, k, len;
436
+ uint32_t *pt;
437
+
438
+ pt = cr->points;
439
+ len = cr->len;
440
+ i = 0;
441
+ j = 0;
442
+ k = 0;
443
+ while ((i + 1) < len) {
444
+ if (pt[i] == pt[i + 1]) {
445
+ /* empty interval */
446
+ i += 2;
447
+ } else {
448
+ j = i;
449
+ while ((j + 3) < len && pt[j + 1] == pt[j + 2])
450
+ j += 2;
451
+ /* just copy */
452
+ pt[k] = pt[i];
453
+ pt[k + 1] = pt[j + 1];
454
+ k += 2;
455
+ i = j + 2;
456
+ }
457
+ }
458
+ cr->len = k;
459
+ }
460
+
461
+ /* union or intersection */
462
+ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
463
+ const uint32_t *b_pt, int b_len, int op)
464
+ {
465
+ int a_idx, b_idx, is_in;
466
+ uint32_t v;
467
+
468
+ a_idx = 0;
469
+ b_idx = 0;
470
+ for(;;) {
471
+ /* get one more point from a or b in increasing order */
472
+ if (a_idx < a_len && b_idx < b_len) {
473
+ if (a_pt[a_idx] < b_pt[b_idx]) {
474
+ goto a_add;
475
+ } else if (a_pt[a_idx] == b_pt[b_idx]) {
476
+ v = a_pt[a_idx];
477
+ a_idx++;
478
+ b_idx++;
479
+ } else {
480
+ goto b_add;
481
+ }
482
+ } else if (a_idx < a_len) {
483
+ a_add:
484
+ v = a_pt[a_idx++];
485
+ } else if (b_idx < b_len) {
486
+ b_add:
487
+ v = b_pt[b_idx++];
488
+ } else {
489
+ break;
490
+ }
491
+ /* add the point if the in/out status changes */
492
+ switch(op) {
493
+ case CR_OP_UNION:
494
+ is_in = (a_idx & 1) | (b_idx & 1);
495
+ break;
496
+ case CR_OP_INTER:
497
+ is_in = (a_idx & 1) & (b_idx & 1);
498
+ break;
499
+ case CR_OP_XOR:
500
+ is_in = (a_idx & 1) ^ (b_idx & 1);
501
+ break;
502
+ default:
503
+ abort();
504
+ }
505
+ if (is_in != (cr->len & 1)) {
506
+ if (cr_add_point(cr, v))
507
+ return -1;
508
+ }
509
+ }
510
+ cr_compress(cr);
511
+ return 0;
512
+ }
513
+
514
+ int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len)
515
+ {
516
+ CharRange a = *cr;
517
+ int ret;
518
+ cr->len = 0;
519
+ cr->size = 0;
520
+ cr->points = NULL;
521
+ ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION);
522
+ cr_free(&a);
523
+ return ret;
524
+ }
525
+
526
+ int cr_invert(CharRange *cr)
527
+ {
528
+ int len;
529
+ len = cr->len;
530
+ if (cr_realloc(cr, len + 2))
531
+ return -1;
532
+ memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0]));
533
+ cr->points[0] = 0;
534
+ cr->points[len + 1] = UINT32_MAX;
535
+ cr->len = len + 2;
536
+ cr_compress(cr);
537
+ return 0;
538
+ }
539
+
540
+ #ifdef CONFIG_ALL_UNICODE
541
+
542
+ BOOL lre_is_id_start(uint32_t c)
543
+ {
544
+ return lre_is_in_table(c, unicode_prop_ID_Start_table,
545
+ unicode_prop_ID_Start_index,
546
+ sizeof(unicode_prop_ID_Start_index) / 3);
547
+ }
548
+
549
+ BOOL lre_is_id_continue(uint32_t c)
550
+ {
551
+ return lre_is_id_start(c) ||
552
+ lre_is_in_table(c, unicode_prop_ID_Continue1_table,
553
+ unicode_prop_ID_Continue1_index,
554
+ sizeof(unicode_prop_ID_Continue1_index) / 3);
555
+ }
556
+
557
+ #define UNICODE_DECOMP_LEN_MAX 18
558
+
559
+ typedef enum {
560
+ DECOMP_TYPE_C1, /* 16 bit char */
561
+ DECOMP_TYPE_L1, /* 16 bit char table */
562
+ DECOMP_TYPE_L2,
563
+ DECOMP_TYPE_L3,
564
+ DECOMP_TYPE_L4,
565
+ DECOMP_TYPE_L5, /* XXX: not used */
566
+ DECOMP_TYPE_L6, /* XXX: could remove */
567
+ DECOMP_TYPE_L7, /* XXX: could remove */
568
+ DECOMP_TYPE_LL1, /* 18 bit char table */
569
+ DECOMP_TYPE_LL2,
570
+ DECOMP_TYPE_S1, /* 8 bit char table */
571
+ DECOMP_TYPE_S2,
572
+ DECOMP_TYPE_S3,
573
+ DECOMP_TYPE_S4,
574
+ DECOMP_TYPE_S5,
575
+ DECOMP_TYPE_I1, /* increment 16 bit char value */
576
+ DECOMP_TYPE_I2_0,
577
+ DECOMP_TYPE_I2_1,
578
+ DECOMP_TYPE_I3_1,
579
+ DECOMP_TYPE_I3_2,
580
+ DECOMP_TYPE_I4_1,
581
+ DECOMP_TYPE_I4_2,
582
+ DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
583
+ DECOMP_TYPE_B2,
584
+ DECOMP_TYPE_B3,
585
+ DECOMP_TYPE_B4,
586
+ DECOMP_TYPE_B5,
587
+ DECOMP_TYPE_B6,
588
+ DECOMP_TYPE_B7,
589
+ DECOMP_TYPE_B8,
590
+ DECOMP_TYPE_B18,
591
+ DECOMP_TYPE_LS2,
592
+ DECOMP_TYPE_PAT3,
593
+ DECOMP_TYPE_S2_UL,
594
+ DECOMP_TYPE_LS2_UL,
595
+ } DecompTypeEnum;
596
+
597
+ static uint32_t unicode_get_short_code(uint32_t c)
598
+ {
599
+ static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };
600
+
601
+ if (c < 0x80)
602
+ return c;
603
+ else if (c < 0x80 + 0x50)
604
+ return c - 0x80 + 0x300;
605
+ else
606
+ return unicode_short_table[c - 0x80 - 0x50];
607
+ }
608
+
609
+ static uint32_t unicode_get_lower_simple(uint32_t c)
610
+ {
611
+ if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
612
+ c += 0x20;
613
+ else
614
+ c++;
615
+ return c;
616
+ }
617
+
618
+ static uint16_t unicode_get16(const uint8_t *p)
619
+ {
620
+ return p[0] | (p[1] << 8);
621
+ }
622
+
623
+ static int unicode_decomp_entry(uint32_t *res, uint32_t c,
624
+ int idx, uint32_t code, uint32_t len,
625
+ uint32_t type)
626
+ {
627
+ uint32_t c1;
628
+ int l, i, p;
629
+ const uint8_t *d;
630
+
631
+ if (type == DECOMP_TYPE_C1) {
632
+ res[0] = unicode_decomp_table2[idx];
633
+ return 1;
634
+ } else {
635
+ d = unicode_decomp_data + unicode_decomp_table2[idx];
636
+ switch(type) {
637
+ case DECOMP_TYPE_L1:
638
+ case DECOMP_TYPE_L2:
639
+ case DECOMP_TYPE_L3:
640
+ case DECOMP_TYPE_L4:
641
+ case DECOMP_TYPE_L5:
642
+ case DECOMP_TYPE_L6:
643
+ case DECOMP_TYPE_L7:
644
+ l = type - DECOMP_TYPE_L1 + 1;
645
+ d += (c - code) * l * 2;
646
+ for(i = 0; i < l; i++) {
647
+ if ((res[i] = unicode_get16(d + 2 * i)) == 0)
648
+ return 0;
649
+ }
650
+ return l;
651
+ case DECOMP_TYPE_LL1:
652
+ case DECOMP_TYPE_LL2:
653
+ {
654
+ uint32_t k, p;
655
+ l = type - DECOMP_TYPE_LL1 + 1;
656
+ k = (c - code) * l;
657
+ p = len * l * 2;
658
+ for(i = 0; i < l; i++) {
659
+ c1 = unicode_get16(d + 2 * k) |
660
+ (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
661
+ if (!c1)
662
+ return 0;
663
+ res[i] = c1;
664
+ k++;
665
+ }
666
+ }
667
+ return l;
668
+ case DECOMP_TYPE_S1:
669
+ case DECOMP_TYPE_S2:
670
+ case DECOMP_TYPE_S3:
671
+ case DECOMP_TYPE_S4:
672
+ case DECOMP_TYPE_S5:
673
+ l = type - DECOMP_TYPE_S1 + 1;
674
+ d += (c - code) * l;
675
+ for(i = 0; i < l; i++) {
676
+ if ((res[i] = unicode_get_short_code(d[i])) == 0)
677
+ return 0;
678
+ }
679
+ return l;
680
+ case DECOMP_TYPE_I1:
681
+ l = 1;
682
+ p = 0;
683
+ goto decomp_type_i;
684
+ case DECOMP_TYPE_I2_0:
685
+ case DECOMP_TYPE_I2_1:
686
+ case DECOMP_TYPE_I3_1:
687
+ case DECOMP_TYPE_I3_2:
688
+ case DECOMP_TYPE_I4_1:
689
+ case DECOMP_TYPE_I4_2:
690
+ l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
691
+ p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
692
+ decomp_type_i:
693
+ for(i = 0; i < l; i++) {
694
+ c1 = unicode_get16(d + 2 * i);
695
+ if (i == p)
696
+ c1 += c - code;
697
+ res[i] = c1;
698
+ }
699
+ return l;
700
+ case DECOMP_TYPE_B18:
701
+ l = 18;
702
+ goto decomp_type_b;
703
+ case DECOMP_TYPE_B1:
704
+ case DECOMP_TYPE_B2:
705
+ case DECOMP_TYPE_B3:
706
+ case DECOMP_TYPE_B4:
707
+ case DECOMP_TYPE_B5:
708
+ case DECOMP_TYPE_B6:
709
+ case DECOMP_TYPE_B7:
710
+ case DECOMP_TYPE_B8:
711
+ l = type - DECOMP_TYPE_B1 + 1;
712
+ decomp_type_b:
713
+ {
714
+ uint32_t c_min;
715
+ c_min = unicode_get16(d);
716
+ d += 2 + (c - code) * l;
717
+ for(i = 0; i < l; i++) {
718
+ c1 = d[i];
719
+ if (c1 == 0xff)
720
+ c1 = 0x20;
721
+ else
722
+ c1 += c_min;
723
+ res[i] = c1;
724
+ }
725
+ }
726
+ return l;
727
+ case DECOMP_TYPE_LS2:
728
+ d += (c - code) * 3;
729
+ if (!(res[0] = unicode_get16(d)))
730
+ return 0;
731
+ res[1] = unicode_get_short_code(d[2]);
732
+ return 2;
733
+ case DECOMP_TYPE_PAT3:
734
+ res[0] = unicode_get16(d);
735
+ res[2] = unicode_get16(d + 2);
736
+ d += 4 + (c - code) * 2;
737
+ res[1] = unicode_get16(d);
738
+ return 3;
739
+ case DECOMP_TYPE_S2_UL:
740
+ case DECOMP_TYPE_LS2_UL:
741
+ c1 = c - code;
742
+ if (type == DECOMP_TYPE_S2_UL) {
743
+ d += c1 & ~1;
744
+ c = unicode_get_short_code(*d);
745
+ d++;
746
+ } else {
747
+ d += (c1 >> 1) * 3;
748
+ c = unicode_get16(d);
749
+ d += 2;
750
+ }
751
+ if (c1 & 1)
752
+ c = unicode_get_lower_simple(c);
753
+ res[0] = c;
754
+ res[1] = unicode_get_short_code(*d);
755
+ return 2;
756
+ }
757
+ }
758
+ return 0;
759
+ }
760
+
761
+
762
+ /* return the length of the decomposition (length <=
763
+ UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
764
+ static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
765
+ {
766
+ uint32_t v, type, is_compat, code, len;
767
+ int idx_min, idx_max, idx;
768
+
769
+ idx_min = 0;
770
+ idx_max = countof(unicode_decomp_table1) - 1;
771
+ while (idx_min <= idx_max) {
772
+ idx = (idx_max + idx_min) / 2;
773
+ v = unicode_decomp_table1[idx];
774
+ code = v >> (32 - 18);
775
+ len = (v >> (32 - 18 - 7)) & 0x7f;
776
+ // printf("idx=%d code=%05x len=%d\n", idx, code, len);
777
+ if (c < code) {
778
+ idx_max = idx - 1;
779
+ } else if (c >= code + len) {
780
+ idx_min = idx + 1;
781
+ } else {
782
+ is_compat = v & 1;
783
+ if (is_compat1 < is_compat)
784
+ break;
785
+ type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
786
+ return unicode_decomp_entry(res, c, idx, code, len, type);
787
+ }
788
+ }
789
+ return 0;
790
+ }
791
+
792
+ /* return 0 if no pair found */
793
+ static int unicode_compose_pair(uint32_t c0, uint32_t c1)
794
+ {
795
+ uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
796
+ int idx_min, idx_max, idx, d;
797
+ uint32_t pair[2];
798
+
799
+ idx_min = 0;
800
+ idx_max = countof(unicode_comp_table) - 1;
801
+ while (idx_min <= idx_max) {
802
+ idx = (idx_max + idx_min) / 2;
803
+ idx1 = unicode_comp_table[idx];
804
+
805
+ /* idx1 represent an entry of the decomposition table */
806
+ d_idx = idx1 >> 6;
807
+ d_offset = idx1 & 0x3f;
808
+ v = unicode_decomp_table1[d_idx];
809
+ code = v >> (32 - 18);
810
+ len = (v >> (32 - 18 - 7)) & 0x7f;
811
+ type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
812
+ ch = code + d_offset;
813
+ unicode_decomp_entry(pair, ch, d_idx, code, len, type);
814
+ d = c0 - pair[0];
815
+ if (d == 0)
816
+ d = c1 - pair[1];
817
+ if (d < 0) {
818
+ idx_max = idx - 1;
819
+ } else if (d > 0) {
820
+ idx_min = idx + 1;
821
+ } else {
822
+ return ch;
823
+ }
824
+ }
825
+ return 0;
826
+ }
827
+
828
+ /* return the combining class of character c (between 0 and 255) */
829
+ static int unicode_get_cc(uint32_t c)
830
+ {
831
+ uint32_t code, n, type, cc, c1, b;
832
+ int pos;
833
+ const uint8_t *p;
834
+
835
+ pos = get_index_pos(&code, c,
836
+ unicode_cc_index, sizeof(unicode_cc_index) / 3);
837
+ if (pos < 0)
838
+ return 0;
839
+ p = unicode_cc_table + pos;
840
+ /* Compressed run length encoding:
841
+ - 2 high order bits are combining class type
842
+ - 0:0, 1:230, 2:extra byte linear progression, 3:extra byte
843
+ - 00..2F: range length (add 1)
844
+ - 30..37: 3-bit range-length + 1 extra byte
845
+ - 38..3F: 3-bit range-length + 2 extra byte
846
+ */
847
+ for(;;) {
848
+ b = *p++;
849
+ type = b >> 6;
850
+ n = b & 0x3f;
851
+ if (n < 48) {
852
+ } else if (n < 56) {
853
+ n = (n - 48) << 8;
854
+ n |= *p++;
855
+ n += 48;
856
+ } else {
857
+ n = (n - 56) << 8;
858
+ n |= *p++ << 8;
859
+ n |= *p++;
860
+ n += 48 + (1 << 11);
861
+ }
862
+ if (type <= 1)
863
+ p++;
864
+ c1 = code + n + 1;
865
+ if (c < c1) {
866
+ switch(type) {
867
+ case 0:
868
+ cc = p[-1];
869
+ break;
870
+ case 1:
871
+ cc = p[-1] + c - code;
872
+ break;
873
+ case 2:
874
+ cc = 0;
875
+ break;
876
+ default:
877
+ case 3:
878
+ cc = 230;
879
+ break;
880
+ }
881
+ return cc;
882
+ }
883
+ code = c1;
884
+ }
885
+ }
886
+
887
+ static void sort_cc(int *buf, int len)
888
+ {
889
+ int i, j, k, cc, cc1, start, ch1;
890
+
891
+ for(i = 0; i < len; i++) {
892
+ cc = unicode_get_cc(buf[i]);
893
+ if (cc != 0) {
894
+ start = i;
895
+ j = i + 1;
896
+ while (j < len) {
897
+ ch1 = buf[j];
898
+ cc1 = unicode_get_cc(ch1);
899
+ if (cc1 == 0)
900
+ break;
901
+ k = j - 1;
902
+ while (k >= start) {
903
+ if (unicode_get_cc(buf[k]) <= cc1)
904
+ break;
905
+ buf[k + 1] = buf[k];
906
+ k--;
907
+ }
908
+ buf[k + 1] = ch1;
909
+ j++;
910
+ }
911
+ #if 0
912
+ printf("cc:");
913
+ for(k = start; k < j; k++) {
914
+ printf(" %3d", unicode_get_cc(buf[k]));
915
+ }
916
+ printf("\n");
917
+ #endif
918
+ i = j;
919
+ }
920
+ }
921
+ }
922
+
923
+ static void to_nfd_rec(DynBuf *dbuf,
924
+ const int *src, int src_len, int is_compat)
925
+ {
926
+ uint32_t c, v;
927
+ int i, l;
928
+ uint32_t res[UNICODE_DECOMP_LEN_MAX];
929
+
930
+ for(i = 0; i < src_len; i++) {
931
+ c = src[i];
932
+ if (c >= 0xac00 && c < 0xd7a4) {
933
+ /* Hangul decomposition */
934
+ c -= 0xac00;
935
+ dbuf_put_u32(dbuf, 0x1100 + c / 588);
936
+ dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28);
937
+ v = c % 28;
938
+ if (v != 0)
939
+ dbuf_put_u32(dbuf, 0x11a7 + v);
940
+ } else {
941
+ l = unicode_decomp_char(res, c, is_compat);
942
+ if (l) {
943
+ to_nfd_rec(dbuf, (int *)res, l, is_compat);
944
+ } else {
945
+ dbuf_put_u32(dbuf, c);
946
+ }
947
+ }
948
+ }
949
+ }
950
+
951
+ /* return 0 if not found */
952
+ static int compose_pair(uint32_t c0, uint32_t c1)
953
+ {
954
+ /* Hangul composition */
955
+ if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
956
+ c1 >= 0x1161 && c1 < 0x1161 + 21) {
957
+ return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
958
+ } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
959
+ (c0 - 0xac00) % 28 == 0 &&
960
+ c1 >= 0x11a7 && c1 < 0x11a7 + 28) {
961
+ return c0 + c1 - 0x11a7;
962
+ } else {
963
+ return unicode_compose_pair(c0, c1);
964
+ }
965
+ }
966
+
967
+ int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
968
+ UnicodeNormalizationEnum n_type,
969
+ void *opaque, DynBufReallocFunc *realloc_func)
970
+ {
971
+ int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
972
+ BOOL is_compat;
973
+ DynBuf dbuf_s, *dbuf = &dbuf_s;
974
+
975
+ is_compat = n_type >> 1;
976
+
977
+ dbuf_init2(dbuf, opaque, realloc_func);
978
+ if (dbuf_realloc(dbuf, sizeof(int) * src_len))
979
+ goto fail;
980
+
981
+ /* common case: latin1 is unaffected by NFC */
982
+ if (n_type == UNICODE_NFC) {
983
+ for(i = 0; i < src_len; i++) {
984
+ if (src[i] >= 0x100)
985
+ goto not_latin1;
986
+ }
987
+ buf = (int *)dbuf->buf;
988
+ memcpy(buf, src, src_len * sizeof(int));
989
+ *pdst = (uint32_t *)buf;
990
+ return src_len;
991
+ not_latin1: ;
992
+ }
993
+
994
+ to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
995
+ if (dbuf_error(dbuf)) {
996
+ fail:
997
+ *pdst = NULL;
998
+ return -1;
999
+ }
1000
+ buf = (int *)dbuf->buf;
1001
+ buf_len = dbuf->size / sizeof(int);
1002
+
1003
+ sort_cc(buf, buf_len);
1004
+
1005
+ if (buf_len <= 1 || (n_type & 1) != 0) {
1006
+ /* NFD / NFKD */
1007
+ *pdst = (uint32_t *)buf;
1008
+ return buf_len;
1009
+ }
1010
+
1011
+ i = 1;
1012
+ out_len = 1;
1013
+ while (i < buf_len) {
1014
+ /* find the starter character and test if it is blocked from
1015
+ the character at 'i' */
1016
+ last_cc = unicode_get_cc(buf[i]);
1017
+ starter_pos = out_len - 1;
1018
+ while (starter_pos >= 0) {
1019
+ cc = unicode_get_cc(buf[starter_pos]);
1020
+ if (cc == 0)
1021
+ break;
1022
+ if (cc >= last_cc)
1023
+ goto next;
1024
+ last_cc = 256;
1025
+ starter_pos--;
1026
+ }
1027
+ if (starter_pos >= 0 &&
1028
+ (p = compose_pair(buf[starter_pos], buf[i])) != 0) {
1029
+ buf[starter_pos] = p;
1030
+ i++;
1031
+ } else {
1032
+ next:
1033
+ buf[out_len++] = buf[i++];
1034
+ }
1035
+ }
1036
+ *pdst = (uint32_t *)buf;
1037
+ return out_len;
1038
+ }
1039
+
1040
+ /* char ranges for various unicode properties */
1041
+
1042
+ static int unicode_find_name(const char *name_table, const char *name)
1043
+ {
1044
+ const char *p, *r;
1045
+ int pos;
1046
+ size_t name_len, len;
1047
+
1048
+ p = name_table;
1049
+ pos = 0;
1050
+ name_len = strlen(name);
1051
+ while (*p) {
1052
+ for(;;) {
1053
+ r = strchr(p, ',');
1054
+ if (!r)
1055
+ len = strlen(p);
1056
+ else
1057
+ len = r - p;
1058
+ if (len == name_len && !memcmp(p, name, name_len))
1059
+ return pos;
1060
+ p += len + 1;
1061
+ if (!r)
1062
+ break;
1063
+ }
1064
+ pos++;
1065
+ }
1066
+ return -1;
1067
+ }
1068
+
1069
+ /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1070
+ if not found */
1071
+ int unicode_script(CharRange *cr,
1072
+ const char *script_name, BOOL is_ext)
1073
+ {
1074
+ int script_idx;
1075
+ const uint8_t *p, *p_end;
1076
+ uint32_t c, c1, b, n, v, v_len, i, type;
1077
+ CharRange cr1_s, *cr1;
1078
+ CharRange cr2_s, *cr2 = &cr2_s;
1079
+ BOOL is_common;
1080
+
1081
+ script_idx = unicode_find_name(unicode_script_name_table, script_name);
1082
+ if (script_idx < 0)
1083
+ return -2;
1084
+ /* Note: we remove the "Unknown" Script */
1085
+ script_idx += UNICODE_SCRIPT_Unknown + 1;
1086
+
1087
+ is_common = (script_idx == UNICODE_SCRIPT_Common ||
1088
+ script_idx == UNICODE_SCRIPT_Inherited);
1089
+ if (is_ext) {
1090
+ cr1 = &cr1_s;
1091
+ cr_init(cr1, cr->mem_opaque, cr->realloc_func);
1092
+ cr_init(cr2, cr->mem_opaque, cr->realloc_func);
1093
+ } else {
1094
+ cr1 = cr;
1095
+ }
1096
+
1097
+ p = unicode_script_table;
1098
+ p_end = unicode_script_table + countof(unicode_script_table);
1099
+ c = 0;
1100
+ while (p < p_end) {
1101
+ b = *p++;
1102
+ type = b >> 7;
1103
+ n = b & 0x7f;
1104
+ if (n < 96) {
1105
+ } else if (n < 112) {
1106
+ n = (n - 96) << 8;
1107
+ n |= *p++;
1108
+ n += 96;
1109
+ } else {
1110
+ n = (n - 112) << 16;
1111
+ n |= *p++ << 8;
1112
+ n |= *p++;
1113
+ n += 96 + (1 << 12);
1114
+ }
1115
+ if (type == 0)
1116
+ v = 0;
1117
+ else
1118
+ v = *p++;
1119
+ c1 = c + n + 1;
1120
+ if (v == script_idx) {
1121
+ if (cr_add_interval(cr1, c, c1))
1122
+ goto fail;
1123
+ }
1124
+ c = c1;
1125
+ }
1126
+
1127
+ if (is_ext) {
1128
+ /* add the script extensions */
1129
+ p = unicode_script_ext_table;
1130
+ p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
1131
+ c = 0;
1132
+ while (p < p_end) {
1133
+ b = *p++;
1134
+ if (b < 128) {
1135
+ n = b;
1136
+ } else if (b < 128 + 64) {
1137
+ n = (b - 128) << 8;
1138
+ n |= *p++;
1139
+ n += 128;
1140
+ } else {
1141
+ n = (b - 128 - 64) << 16;
1142
+ n |= *p++ << 8;
1143
+ n |= *p++;
1144
+ n += 128 + (1 << 14);
1145
+ }
1146
+ c1 = c + n + 1;
1147
+ v_len = *p++;
1148
+ if (is_common) {
1149
+ if (v_len != 0) {
1150
+ if (cr_add_interval(cr2, c, c1))
1151
+ goto fail;
1152
+ }
1153
+ } else {
1154
+ for(i = 0; i < v_len; i++) {
1155
+ if (p[i] == script_idx) {
1156
+ if (cr_add_interval(cr2, c, c1))
1157
+ goto fail;
1158
+ break;
1159
+ }
1160
+ }
1161
+ }
1162
+ p += v_len;
1163
+ c = c1;
1164
+ }
1165
+ if (is_common) {
1166
+ /* remove all the characters with script extensions */
1167
+ if (cr_invert(cr2))
1168
+ goto fail;
1169
+ if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1170
+ CR_OP_INTER))
1171
+ goto fail;
1172
+ } else {
1173
+ if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1174
+ CR_OP_UNION))
1175
+ goto fail;
1176
+ }
1177
+ cr_free(cr1);
1178
+ cr_free(cr2);
1179
+ }
1180
+ return 0;
1181
+ fail:
1182
+ if (is_ext) {
1183
+ cr_free(cr1);
1184
+ cr_free(cr2);
1185
+ }
1186
+ goto fail;
1187
+ }
1188
+
1189
+ #define M(id) (1U << UNICODE_GC_ ## id)
1190
+
1191
+ static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
1192
+ {
1193
+ const uint8_t *p, *p_end;
1194
+ uint32_t c, c0, b, n, v;
1195
+
1196
+ p = unicode_gc_table;
1197
+ p_end = unicode_gc_table + countof(unicode_gc_table);
1198
+ c = 0;
1199
+ /* Compressed range encoding:
1200
+ initial byte:
1201
+ bits 0..4: category number (special case 31)
1202
+ bits 5..7: range length (add 1)
1203
+ special case bits 5..7 == 7: read an extra byte
1204
+ - 00..7F: range length (add 7 + 1)
1205
+ - 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
1206
+ - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
1207
+ */
1208
+ while (p < p_end) {
1209
+ b = *p++;
1210
+ n = b >> 5;
1211
+ v = b & 0x1f;
1212
+ if (n == 7) {
1213
+ n = *p++;
1214
+ if (n < 128) {
1215
+ n += 7;
1216
+ } else if (n < 128 + 64) {
1217
+ n = (n - 128) << 8;
1218
+ n |= *p++;
1219
+ n += 7 + 128;
1220
+ } else {
1221
+ n = (n - 128 - 64) << 16;
1222
+ n |= *p++ << 8;
1223
+ n |= *p++;
1224
+ n += 7 + 128 + (1 << 14);
1225
+ }
1226
+ }
1227
+ c0 = c;
1228
+ c += n + 1;
1229
+ if (v == 31) {
1230
+ /* run of Lu / Ll */
1231
+ b = gc_mask & (M(Lu) | M(Ll));
1232
+ if (b != 0) {
1233
+ if (b == (M(Lu) | M(Ll))) {
1234
+ goto add_range;
1235
+ } else {
1236
+ c0 += ((gc_mask & M(Ll)) != 0);
1237
+ for(; c0 < c; c0 += 2) {
1238
+ if (cr_add_interval(cr, c0, c0 + 1))
1239
+ return -1;
1240
+ }
1241
+ }
1242
+ }
1243
+ } else if ((gc_mask >> v) & 1) {
1244
+ add_range:
1245
+ if (cr_add_interval(cr, c0, c))
1246
+ return -1;
1247
+ }
1248
+ }
1249
+ return 0;
1250
+ }
1251
+
1252
+ static int unicode_prop1(CharRange *cr, int prop_idx)
1253
+ {
1254
+ const uint8_t *p, *p_end;
1255
+ uint32_t c, c0, b, bit;
1256
+
1257
+ p = unicode_prop_table[prop_idx];
1258
+ p_end = p + unicode_prop_len_table[prop_idx];
1259
+ c = 0;
1260
+ bit = 0;
1261
+ /* Compressed range encoding:
1262
+ 00..3F: 2 packed lengths: 3-bit + 3-bit
1263
+ 40..5F: 5-bits plus extra byte for length
1264
+ 60..7F: 5-bits plus 2 extra bytes for length
1265
+ 80..FF: 7-bit length
1266
+ lengths must be incremented to get character count
1267
+ Ranges alternate between false and true return value.
1268
+ */
1269
+ while (p < p_end) {
1270
+ c0 = c;
1271
+ b = *p++;
1272
+ if (b < 64) {
1273
+ c += (b >> 3) + 1;
1274
+ if (bit) {
1275
+ if (cr_add_interval(cr, c0, c))
1276
+ return -1;
1277
+ }
1278
+ bit ^= 1;
1279
+ c0 = c;
1280
+ c += (b & 7) + 1;
1281
+ } else if (b >= 0x80) {
1282
+ c += b - 0x80 + 1;
1283
+ } else if (b < 0x60) {
1284
+ c += (((b - 0x40) << 8) | p[0]) + 1;
1285
+ p++;
1286
+ } else {
1287
+ c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
1288
+ p += 2;
1289
+ }
1290
+ if (bit) {
1291
+ if (cr_add_interval(cr, c0, c))
1292
+ return -1;
1293
+ }
1294
+ bit ^= 1;
1295
+ }
1296
+ return 0;
1297
+ }
1298
+
1299
+ #define CASE_U (1 << 0)
1300
+ #define CASE_L (1 << 1)
1301
+ #define CASE_F (1 << 2)
1302
+
1303
+ /* use the case conversion table to generate range of characters.
1304
+ CASE_U: set char if modified by uppercasing,
1305
+ CASE_L: set char if modified by lowercasing,
1306
+ CASE_F: set char if modified by case folding,
1307
+ */
1308
+ static int unicode_case1(CharRange *cr, int case_mask)
1309
+ {
1310
+ #define MR(x) (1 << RUN_TYPE_ ## x)
1311
+ const uint32_t tab_run_mask[3] = {
1312
+ MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
1313
+ MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
1314
+
1315
+ MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
1316
+
1317
+ MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
1318
+ };
1319
+ #undef MR
1320
+ uint32_t mask, v, code, type, len, i, idx;
1321
+
1322
+ if (case_mask == 0)
1323
+ return 0;
1324
+ mask = 0;
1325
+ for(i = 0; i < 3; i++) {
1326
+ if ((case_mask >> i) & 1)
1327
+ mask |= tab_run_mask[i];
1328
+ }
1329
+ for(idx = 0; idx < countof(case_conv_table1); idx++) {
1330
+ v = case_conv_table1[idx];
1331
+ type = (v >> (32 - 17 - 7 - 4)) & 0xf;
1332
+ code = v >> (32 - 17);
1333
+ len = (v >> (32 - 17 - 7)) & 0x7f;
1334
+ if ((mask >> type) & 1) {
1335
+ // printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
1336
+ switch(type) {
1337
+ case RUN_TYPE_UL:
1338
+ if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
1339
+ goto def_case;
1340
+ code += ((case_mask & CASE_U) != 0);
1341
+ for(i = 0; i < len; i += 2) {
1342
+ if (cr_add_interval(cr, code + i, code + i + 1))
1343
+ return -1;
1344
+ }
1345
+ break;
1346
+ case RUN_TYPE_LSU:
1347
+ if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
1348
+ goto def_case;
1349
+ if (!(case_mask & CASE_U)) {
1350
+ if (cr_add_interval(cr, code, code + 1))
1351
+ return -1;
1352
+ }
1353
+ if (cr_add_interval(cr, code + 1, code + 2))
1354
+ return -1;
1355
+ if (case_mask & CASE_U) {
1356
+ if (cr_add_interval(cr, code + 2, code + 3))
1357
+ return -1;
1358
+ }
1359
+ break;
1360
+ default:
1361
+ def_case:
1362
+ if (cr_add_interval(cr, code, code + len))
1363
+ return -1;
1364
+ break;
1365
+ }
1366
+ }
1367
+ }
1368
+ return 0;
1369
+ }
1370
+
1371
+ static int point_cmp(const void *p1, const void *p2, void *arg)
1372
+ {
1373
+ uint32_t v1 = *(uint32_t *)p1;
1374
+ uint32_t v2 = *(uint32_t *)p2;
1375
+ return (v1 > v2) - (v1 < v2);
1376
+ }
1377
+
1378
+ static void cr_sort_and_remove_overlap(CharRange *cr)
1379
+ {
1380
+ uint32_t start, end, start1, end1, i, j;
1381
+
1382
+ /* the resulting ranges are not necessarily sorted and may overlap */
1383
+ rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);
1384
+ j = 0;
1385
+ for(i = 0; i < cr->len; ) {
1386
+ start = cr->points[i];
1387
+ end = cr->points[i + 1];
1388
+ i += 2;
1389
+ while (i < cr->len) {
1390
+ start1 = cr->points[i];
1391
+ end1 = cr->points[i + 1];
1392
+ if (start1 > end) {
1393
+ /* |------|
1394
+ * |-------| */
1395
+ break;
1396
+ } else if (end1 <= end) {
1397
+ /* |------|
1398
+ * |--| */
1399
+ i += 2;
1400
+ } else {
1401
+ /* |------|
1402
+ * |-------| */
1403
+ end = end1;
1404
+ i += 2;
1405
+ }
1406
+ }
1407
+ cr->points[j] = start;
1408
+ cr->points[j + 1] = end;
1409
+ j += 2;
1410
+ }
1411
+ cr->len = j;
1412
+ }
1413
+
1414
+ /* canonicalize a character set using the JS regex case folding rules
1415
+ (see lre_canonicalize()) */
1416
+ int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
1417
+ {
1418
+ CharRange cr_inter, cr_mask, cr_result, cr_sub;
1419
+ uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
1420
+
1421
+ cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
1422
+ cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
1423
+ cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
1424
+ cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
1425
+
1426
+ if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
1427
+ goto fail;
1428
+ if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
1429
+ goto fail;
1430
+
1431
+ if (cr_invert(&cr_mask))
1432
+ goto fail;
1433
+ if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
1434
+ goto fail;
1435
+
1436
+ /* cr_inter = cr & cr_mask */
1437
+ /* cr_sub = cr & ~cr_mask */
1438
+
1439
+ /* use the case conversion table to compute the result */
1440
+ d_start = -1;
1441
+ d_end = -1;
1442
+ idx = 0;
1443
+ v = case_conv_table1[idx];
1444
+ code = v >> (32 - 17);
1445
+ len = (v >> (32 - 17 - 7)) & 0x7f;
1446
+ for(i = 0; i < cr_inter.len; i += 2) {
1447
+ start = cr_inter.points[i];
1448
+ end = cr_inter.points[i + 1];
1449
+
1450
+ for(c = start; c < end; c++) {
1451
+ for(;;) {
1452
+ if (c >= code && c < code + len)
1453
+ break;
1454
+ idx++;
1455
+ assert(idx < countof(case_conv_table1));
1456
+ v = case_conv_table1[idx];
1457
+ code = v >> (32 - 17);
1458
+ len = (v >> (32 - 17 - 7)) & 0x7f;
1459
+ }
1460
+ d = lre_case_folding_entry(c, idx, v, is_unicode);
1461
+ /* try to merge with the current interval */
1462
+ if (d_start == -1) {
1463
+ d_start = d;
1464
+ d_end = d + 1;
1465
+ } else if (d_end == d) {
1466
+ d_end++;
1467
+ } else {
1468
+ cr_add_interval(&cr_result, d_start, d_end);
1469
+ d_start = d;
1470
+ d_end = d + 1;
1471
+ }
1472
+ }
1473
+ }
1474
+ if (d_start != -1) {
1475
+ if (cr_add_interval(&cr_result, d_start, d_end))
1476
+ goto fail;
1477
+ }
1478
+
1479
+ /* the resulting ranges are not necessarily sorted and may overlap */
1480
+ cr_sort_and_remove_overlap(&cr_result);
1481
+
1482
+ /* or with the character not affected by the case folding */
1483
+ cr->len = 0;
1484
+ if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
1485
+ goto fail;
1486
+
1487
+ cr_free(&cr_inter);
1488
+ cr_free(&cr_mask);
1489
+ cr_free(&cr_result);
1490
+ cr_free(&cr_sub);
1491
+ return 0;
1492
+ fail:
1493
+ cr_free(&cr_inter);
1494
+ cr_free(&cr_mask);
1495
+ cr_free(&cr_result);
1496
+ cr_free(&cr_sub);
1497
+ return -1;
1498
+ }
1499
+
1500
+ typedef enum {
1501
+ POP_GC,
1502
+ POP_PROP,
1503
+ POP_CASE,
1504
+ POP_UNION,
1505
+ POP_INTER,
1506
+ POP_XOR,
1507
+ POP_INVERT,
1508
+ POP_END,
1509
+ } PropOPEnum;
1510
+
1511
+ #define POP_STACK_LEN_MAX 4
1512
+
1513
+ static int unicode_prop_ops(CharRange *cr, ...)
1514
+ {
1515
+ va_list ap;
1516
+ CharRange stack[POP_STACK_LEN_MAX];
1517
+ int stack_len, op, ret, i;
1518
+ uint32_t a;
1519
+
1520
+ va_start(ap, cr);
1521
+ stack_len = 0;
1522
+ for(;;) {
1523
+ op = va_arg(ap, int);
1524
+ switch(op) {
1525
+ case POP_GC:
1526
+ assert(stack_len < POP_STACK_LEN_MAX);
1527
+ a = va_arg(ap, int);
1528
+ cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1529
+ if (unicode_general_category1(&stack[stack_len - 1], a))
1530
+ goto fail;
1531
+ break;
1532
+ case POP_PROP:
1533
+ assert(stack_len < POP_STACK_LEN_MAX);
1534
+ a = va_arg(ap, int);
1535
+ cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1536
+ if (unicode_prop1(&stack[stack_len - 1], a))
1537
+ goto fail;
1538
+ break;
1539
+ case POP_CASE:
1540
+ assert(stack_len < POP_STACK_LEN_MAX);
1541
+ a = va_arg(ap, int);
1542
+ cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
1543
+ if (unicode_case1(&stack[stack_len - 1], a))
1544
+ goto fail;
1545
+ break;
1546
+ case POP_UNION:
1547
+ case POP_INTER:
1548
+ case POP_XOR:
1549
+ {
1550
+ CharRange *cr1, *cr2, *cr3;
1551
+ assert(stack_len >= 2);
1552
+ assert(stack_len < POP_STACK_LEN_MAX);
1553
+ cr1 = &stack[stack_len - 2];
1554
+ cr2 = &stack[stack_len - 1];
1555
+ cr3 = &stack[stack_len++];
1556
+ cr_init(cr3, cr->mem_opaque, cr->realloc_func);
1557
+ if (cr_op(cr3, cr1->points, cr1->len,
1558
+ cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
1559
+ goto fail;
1560
+ cr_free(cr1);
1561
+ cr_free(cr2);
1562
+ *cr1 = *cr3;
1563
+ stack_len -= 2;
1564
+ }
1565
+ break;
1566
+ case POP_INVERT:
1567
+ assert(stack_len >= 1);
1568
+ if (cr_invert(&stack[stack_len - 1]))
1569
+ goto fail;
1570
+ break;
1571
+ case POP_END:
1572
+ goto done;
1573
+ default:
1574
+ abort();
1575
+ }
1576
+ }
1577
+ done:
1578
+ assert(stack_len == 1);
1579
+ ret = cr_copy(cr, &stack[0]);
1580
+ cr_free(&stack[0]);
1581
+ return ret;
1582
+ fail:
1583
+ for(i = 0; i < stack_len; i++)
1584
+ cr_free(&stack[i]);
1585
+ return -1;
1586
+ }
1587
+
1588
+ static const uint32_t unicode_gc_mask_table[] = {
1589
+ M(Lu) | M(Ll) | M(Lt), /* LC */
1590
+ M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
1591
+ M(Mn) | M(Mc) | M(Me), /* M */
1592
+ M(Nd) | M(Nl) | M(No), /* N */
1593
+ M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
1594
+ M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
1595
+ M(Zs) | M(Zl) | M(Zp), /* Z */
1596
+ M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
1597
+ };
1598
+
1599
+ /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1600
+ if not found */
1601
+ int unicode_general_category(CharRange *cr, const char *gc_name)
1602
+ {
1603
+ int gc_idx;
1604
+ uint32_t gc_mask;
1605
+
1606
+ gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
1607
+ if (gc_idx < 0)
1608
+ return -2;
1609
+ if (gc_idx <= UNICODE_GC_Co) {
1610
+ gc_mask = (uint64_t)1 << gc_idx;
1611
+ } else {
1612
+ gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
1613
+ }
1614
+ return unicode_general_category1(cr, gc_mask);
1615
+ }
1616
+
1617
+
1618
+ /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1619
+ if not found */
1620
+ int unicode_prop(CharRange *cr, const char *prop_name)
1621
+ {
1622
+ int prop_idx, ret;
1623
+
1624
+ prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
1625
+ if (prop_idx < 0)
1626
+ return -2;
1627
+ prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
1628
+
1629
+ ret = 0;
1630
+ switch(prop_idx) {
1631
+ case UNICODE_PROP_ASCII:
1632
+ if (cr_add_interval(cr, 0x00, 0x7f + 1))
1633
+ return -1;
1634
+ break;
1635
+ case UNICODE_PROP_Any:
1636
+ if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
1637
+ return -1;
1638
+ break;
1639
+ case UNICODE_PROP_Assigned:
1640
+ ret = unicode_prop_ops(cr,
1641
+ POP_GC, M(Cn),
1642
+ POP_INVERT,
1643
+ POP_END);
1644
+ break;
1645
+ case UNICODE_PROP_Math:
1646
+ ret = unicode_prop_ops(cr,
1647
+ POP_GC, M(Sm),
1648
+ POP_PROP, UNICODE_PROP_Other_Math,
1649
+ POP_UNION,
1650
+ POP_END);
1651
+ break;
1652
+ case UNICODE_PROP_Lowercase:
1653
+ ret = unicode_prop_ops(cr,
1654
+ POP_GC, M(Ll),
1655
+ POP_PROP, UNICODE_PROP_Other_Lowercase,
1656
+ POP_UNION,
1657
+ POP_END);
1658
+ break;
1659
+ case UNICODE_PROP_Uppercase:
1660
+ ret = unicode_prop_ops(cr,
1661
+ POP_GC, M(Lu),
1662
+ POP_PROP, UNICODE_PROP_Other_Uppercase,
1663
+ POP_UNION,
1664
+ POP_END);
1665
+ break;
1666
+ case UNICODE_PROP_Cased:
1667
+ ret = unicode_prop_ops(cr,
1668
+ POP_GC, M(Lu) | M(Ll) | M(Lt),
1669
+ POP_PROP, UNICODE_PROP_Other_Uppercase,
1670
+ POP_UNION,
1671
+ POP_PROP, UNICODE_PROP_Other_Lowercase,
1672
+ POP_UNION,
1673
+ POP_END);
1674
+ break;
1675
+ case UNICODE_PROP_Alphabetic:
1676
+ ret = unicode_prop_ops(cr,
1677
+ POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1678
+ POP_PROP, UNICODE_PROP_Other_Uppercase,
1679
+ POP_UNION,
1680
+ POP_PROP, UNICODE_PROP_Other_Lowercase,
1681
+ POP_UNION,
1682
+ POP_PROP, UNICODE_PROP_Other_Alphabetic,
1683
+ POP_UNION,
1684
+ POP_END);
1685
+ break;
1686
+ case UNICODE_PROP_Grapheme_Base:
1687
+ ret = unicode_prop_ops(cr,
1688
+ POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
1689
+ POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1690
+ POP_UNION,
1691
+ POP_INVERT,
1692
+ POP_END);
1693
+ break;
1694
+ case UNICODE_PROP_Grapheme_Extend:
1695
+ ret = unicode_prop_ops(cr,
1696
+ POP_GC, M(Me) | M(Mn),
1697
+ POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1698
+ POP_UNION,
1699
+ POP_END);
1700
+ break;
1701
+ case UNICODE_PROP_XID_Start:
1702
+ ret = unicode_prop_ops(cr,
1703
+ POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1704
+ POP_PROP, UNICODE_PROP_Other_ID_Start,
1705
+ POP_UNION,
1706
+ POP_PROP, UNICODE_PROP_Pattern_Syntax,
1707
+ POP_PROP, UNICODE_PROP_Pattern_White_Space,
1708
+ POP_UNION,
1709
+ POP_PROP, UNICODE_PROP_XID_Start1,
1710
+ POP_UNION,
1711
+ POP_INVERT,
1712
+ POP_INTER,
1713
+ POP_END);
1714
+ break;
1715
+ case UNICODE_PROP_XID_Continue:
1716
+ ret = unicode_prop_ops(cr,
1717
+ POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1718
+ M(Mn) | M(Mc) | M(Nd) | M(Pc),
1719
+ POP_PROP, UNICODE_PROP_Other_ID_Start,
1720
+ POP_UNION,
1721
+ POP_PROP, UNICODE_PROP_Other_ID_Continue,
1722
+ POP_UNION,
1723
+ POP_PROP, UNICODE_PROP_Pattern_Syntax,
1724
+ POP_PROP, UNICODE_PROP_Pattern_White_Space,
1725
+ POP_UNION,
1726
+ POP_PROP, UNICODE_PROP_XID_Continue1,
1727
+ POP_UNION,
1728
+ POP_INVERT,
1729
+ POP_INTER,
1730
+ POP_END);
1731
+ break;
1732
+ case UNICODE_PROP_Changes_When_Uppercased:
1733
+ ret = unicode_case1(cr, CASE_U);
1734
+ break;
1735
+ case UNICODE_PROP_Changes_When_Lowercased:
1736
+ ret = unicode_case1(cr, CASE_L);
1737
+ break;
1738
+ case UNICODE_PROP_Changes_When_Casemapped:
1739
+ ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
1740
+ break;
1741
+ case UNICODE_PROP_Changes_When_Titlecased:
1742
+ ret = unicode_prop_ops(cr,
1743
+ POP_CASE, CASE_U,
1744
+ POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
1745
+ POP_XOR,
1746
+ POP_END);
1747
+ break;
1748
+ case UNICODE_PROP_Changes_When_Casefolded:
1749
+ ret = unicode_prop_ops(cr,
1750
+ POP_CASE, CASE_F,
1751
+ POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
1752
+ POP_XOR,
1753
+ POP_END);
1754
+ break;
1755
+ case UNICODE_PROP_Changes_When_NFKC_Casefolded:
1756
+ ret = unicode_prop_ops(cr,
1757
+ POP_CASE, CASE_F,
1758
+ POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
1759
+ POP_XOR,
1760
+ POP_END);
1761
+ break;
1762
+ #if 0
1763
+ case UNICODE_PROP_ID_Start:
1764
+ ret = unicode_prop_ops(cr,
1765
+ POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1766
+ POP_PROP, UNICODE_PROP_Other_ID_Start,
1767
+ POP_UNION,
1768
+ POP_PROP, UNICODE_PROP_Pattern_Syntax,
1769
+ POP_PROP, UNICODE_PROP_Pattern_White_Space,
1770
+ POP_UNION,
1771
+ POP_INVERT,
1772
+ POP_INTER,
1773
+ POP_END);
1774
+ break;
1775
+ case UNICODE_PROP_ID_Continue:
1776
+ ret = unicode_prop_ops(cr,
1777
+ POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1778
+ M(Mn) | M(Mc) | M(Nd) | M(Pc),
1779
+ POP_PROP, UNICODE_PROP_Other_ID_Start,
1780
+ POP_UNION,
1781
+ POP_PROP, UNICODE_PROP_Other_ID_Continue,
1782
+ POP_UNION,
1783
+ POP_PROP, UNICODE_PROP_Pattern_Syntax,
1784
+ POP_PROP, UNICODE_PROP_Pattern_White_Space,
1785
+ POP_UNION,
1786
+ POP_INVERT,
1787
+ POP_INTER,
1788
+ POP_END);
1789
+ break;
1790
+ case UNICODE_PROP_Case_Ignorable:
1791
+ ret = unicode_prop_ops(cr,
1792
+ POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
1793
+ POP_PROP, UNICODE_PROP_Case_Ignorable1,
1794
+ POP_XOR,
1795
+ POP_END);
1796
+ break;
1797
+ #else
1798
+ /* we use the existing tables */
1799
+ case UNICODE_PROP_ID_Continue:
1800
+ ret = unicode_prop_ops(cr,
1801
+ POP_PROP, UNICODE_PROP_ID_Start,
1802
+ POP_PROP, UNICODE_PROP_ID_Continue1,
1803
+ POP_XOR,
1804
+ POP_END);
1805
+ break;
1806
+ #endif
1807
+ default:
1808
+ if (prop_idx >= countof(unicode_prop_table))
1809
+ return -2;
1810
+ ret = unicode_prop1(cr, prop_idx);
1811
+ break;
1812
+ }
1813
+ return ret;
1814
+ }
1815
+
1816
+ #endif /* CONFIG_ALL_UNICODE */
1817
+
1818
+ /*---- lre codepoint categorizing functions ----*/
1819
+
1820
+ #define S UNICODE_C_SPACE
1821
+ #define D UNICODE_C_DIGIT
1822
+ #define X UNICODE_C_XDIGIT
1823
+ #define U UNICODE_C_UPPER
1824
+ #define L UNICODE_C_LOWER
1825
+ #define _ UNICODE_C_UNDER
1826
+ #define d UNICODE_C_DOLLAR
1827
+
1828
+ uint8_t const lre_ctype_bits[256] = {
1829
+ 0, 0, 0, 0, 0, 0, 0, 0,
1830
+ 0, S, S, S, S, S, 0, 0,
1831
+ 0, 0, 0, 0, 0, 0, 0, 0,
1832
+ 0, 0, 0, 0, 0, 0, 0, 0,
1833
+
1834
+ S, 0, 0, 0, d, 0, 0, 0,
1835
+ 0, 0, 0, 0, 0, 0, 0, 0,
1836
+ X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
1837
+ X|D, X|D, 0, 0, 0, 0, 0, 0,
1838
+
1839
+ 0, X|U, X|U, X|U, X|U, X|U, X|U, U,
1840
+ U, U, U, U, U, U, U, U,
1841
+ U, U, U, U, U, U, U, U,
1842
+ U, U, U, 0, 0, 0, 0, _,
1843
+
1844
+ 0, X|L, X|L, X|L, X|L, X|L, X|L, L,
1845
+ L, L, L, L, L, L, L, L,
1846
+ L, L, L, L, L, L, L, L,
1847
+ L, L, L, 0, 0, 0, 0, 0,
1848
+
1849
+ 0, 0, 0, 0, 0, 0, 0, 0,
1850
+ 0, 0, 0, 0, 0, 0, 0, 0,
1851
+ 0, 0, 0, 0, 0, 0, 0, 0,
1852
+ 0, 0, 0, 0, 0, 0, 0, 0,
1853
+
1854
+ S, 0, 0, 0, 0, 0, 0, 0,
1855
+ 0, 0, 0, 0, 0, 0, 0, 0,
1856
+ 0, 0, 0, 0, 0, 0, 0, 0,
1857
+ 0, 0, 0, 0, 0, 0, 0, 0,
1858
+
1859
+ 0, 0, 0, 0, 0, 0, 0, 0,
1860
+ 0, 0, 0, 0, 0, 0, 0, 0,
1861
+ 0, 0, 0, 0, 0, 0, 0, 0,
1862
+ 0, 0, 0, 0, 0, 0, 0, 0,
1863
+
1864
+ 0, 0, 0, 0, 0, 0, 0, 0,
1865
+ 0, 0, 0, 0, 0, 0, 0, 0,
1866
+ 0, 0, 0, 0, 0, 0, 0, 0,
1867
+ 0, 0, 0, 0, 0, 0, 0, 0,
1868
+ };
1869
+
1870
+ #undef S
1871
+ #undef D
1872
+ #undef X
1873
+ #undef U
1874
+ #undef L
1875
+ #undef _
1876
+ #undef d
1877
+
1878
+ /* code point ranges for Zs,Zl or Zp property */
1879
+ static const uint16_t char_range_s[] = {
1880
+ 10,
1881
+ 0x0009, 0x000D + 1,
1882
+ 0x0020, 0x0020 + 1,
1883
+ 0x00A0, 0x00A0 + 1,
1884
+ 0x1680, 0x1680 + 1,
1885
+ 0x2000, 0x200A + 1,
1886
+ /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
1887
+ /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
1888
+ 0x2028, 0x2029 + 1,
1889
+ 0x202F, 0x202F + 1,
1890
+ 0x205F, 0x205F + 1,
1891
+ 0x3000, 0x3000 + 1,
1892
+ /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
1893
+ 0xFEFF, 0xFEFF + 1,
1894
+ };
1895
+
1896
+ BOOL lre_is_space_non_ascii(uint32_t c)
1897
+ {
1898
+ size_t i, n;
1899
+
1900
+ n = countof(char_range_s);
1901
+ for(i = 5; i < n; i += 2) {
1902
+ uint32_t low = char_range_s[i];
1903
+ uint32_t high = char_range_s[i + 1];
1904
+ if (c < low)
1905
+ return FALSE;
1906
+ if (c < high)
1907
+ return TRUE;
1908
+ }
1909
+ return FALSE;
1910
+ }