quickjs 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE +21 -0
  4. data/Rakefile +22 -0
  5. data/ext/quickjsrb/extconf.rb +45 -0
  6. data/ext/quickjsrb/quickjs/LICENSE +22 -0
  7. data/ext/quickjsrb/quickjs/cutils.c +631 -0
  8. data/ext/quickjsrb/quickjs/cutils.h +347 -0
  9. data/ext/quickjsrb/quickjs/libbf.c +8475 -0
  10. data/ext/quickjsrb/quickjs/libbf.h +535 -0
  11. data/ext/quickjsrb/quickjs/libregexp-opcode.h +57 -0
  12. data/ext/quickjsrb/quickjs/libregexp.c +2501 -0
  13. data/ext/quickjsrb/quickjs/libregexp.h +55 -0
  14. data/ext/quickjsrb/quickjs/libunicode-table.h +4557 -0
  15. data/ext/quickjsrb/quickjs/libunicode.c +1910 -0
  16. data/ext/quickjsrb/quickjs/libunicode.h +182 -0
  17. data/ext/quickjsrb/quickjs/list.h +99 -0
  18. data/ext/quickjsrb/quickjs/qjs.c +564 -0
  19. data/ext/quickjsrb/quickjs/qjsc.c +761 -0
  20. data/ext/quickjsrb/quickjs/qjscalc.c +4005 -0
  21. data/ext/quickjsrb/quickjs/quickjs-atom.h +273 -0
  22. data/ext/quickjsrb/quickjs/quickjs-libc.c +4052 -0
  23. data/ext/quickjsrb/quickjs/quickjs-libc.h +60 -0
  24. data/ext/quickjsrb/quickjs/quickjs-opcode.h +372 -0
  25. data/ext/quickjsrb/quickjs/quickjs.c +55978 -0
  26. data/ext/quickjsrb/quickjs/quickjs.h +1087 -0
  27. data/ext/quickjsrb/quickjs/repl.c +2057 -0
  28. data/ext/quickjsrb/quickjs/run-test262.c +2216 -0
  29. data/ext/quickjsrb/quickjs/unicode_gen.c +3225 -0
  30. data/ext/quickjsrb/quickjs/unicode_gen_def.h +291 -0
  31. data/ext/quickjsrb/quickjsrb.c +105 -0
  32. data/ext/quickjsrb/quickjsrb.h +14 -0
  33. data/lib/quickjs/version.rb +5 -0
  34. data/lib/quickjs.rb +28 -0
  35. data/sig/quickjs.rbs +4 -0
  36. metadata +81 -0
@@ -0,0 +1,3225 @@
1
+ /*
2
+ * Generation of Unicode tables
3
+ *
4
+ * Copyright (c) 2017-2018 Fabrice Bellard
5
+ * Copyright (c) 2017-2018 Charlie Gordon
6
+ *
7
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ * of this software and associated documentation files (the "Software"), to deal
9
+ * in the Software without restriction, including without limitation the rights
10
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ * copies of the Software, and to permit persons to whom the Software is
12
+ * furnished to do so, subject to the following conditions:
13
+ *
14
+ * The above copyright notice and this permission notice shall be included in
15
+ * all copies or substantial portions of the Software.
16
+ *
17
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
+ * THE SOFTWARE.
24
+ */
25
+ #include <stdlib.h>
26
+ #include <stdio.h>
27
+ #include <stdarg.h>
28
+ #include <inttypes.h>
29
+ #include <string.h>
30
+ #include <assert.h>
31
+ #include <ctype.h>
32
+ #include <time.h>
33
+
34
+ #include "cutils.h"
35
+
36
/* Global counters. NOTE(review): nothing in the visible part of this file
   reads or writes them; presumably they accumulate size statistics for the
   generated tables and indexes -- confirm against the table emitters. */
uint32_t total_tables;
uint32_t total_table_bytes;
uint32_t total_index;
uint32_t total_index_bytes;
40
+
41
+ /* define it to be able to test unicode.c */
42
+ //#define USE_TEST
43
+ /* profile tests */
44
+ //#define PROFILE
45
+
46
+ //#define DUMP_CASE_CONV_TABLE
47
+ //#define DUMP_TABLE_SIZE
48
+ //#define DUMP_CC_TABLE
49
+ //#define DUMP_DECOMP_TABLE
50
+ //#define DUMP_CASE_FOLDING_SPECIAL_CASES
51
+
52
+ /* Ideas:
53
+ - Generalize run length encoding + index for all tables
54
+ - remove redundant tables for ID_start, ID_continue, Case_Ignorable, Cased
55
+
56
+ Case conversion:
57
+ - use a single entry for consecutive U/LF runs
58
+ - allow EXT runs of length > 1
59
+
60
+ Decomposition:
61
+ - Greek lower case (+1f10/1f10) ?
62
+ - allow holes in B runs
63
+ - suppress more upper / lower case redundancy
64
+ */
65
+
66
+ #ifdef USE_TEST
67
+ #include "libunicode.c"
68
+ #endif
69
+
70
/* Highest code point handled; the parsers assert against it and the
   per-code-point loops run from 0 to CHARCODE_MAX inclusive. */
#define CHARCODE_MAX 0x10ffff
/* Capacity of the u_data / l_data / f_data case mapping arrays in CCInfo. */
#define CC_LEN_MAX 3
72
+
73
/* Allocate 'size' bytes of zero-initialised memory.
 *
 * The generator has no way to recover from memory exhaustion, so a failed
 * allocation prints a message and terminates the program instead of
 * handing memset() a NULL pointer. A NULL result for size == 0 is passed
 * through unchanged because malloc(0) may legitimately return NULL.
 * The caller releases the block with free(). */
void *mallocz(size_t size)
{
    void *ptr;

    ptr = malloc(size);
    if (!ptr) {
        if (size != 0) {
            fprintf(stderr, "mallocz: out of memory\n");
            exit(1);
        }
        return NULL;
    }
    memset(ptr, 0, size);
    return ptr;
}
80
+
81
/* Return a pointer to the start of the n-th (0-based) ';'-separated field
 * of the string p, or NULL when the string ends before that field is
 * reached. Field 0 is the string itself. The returned pointer aliases p;
 * nothing is copied. */
const char *get_field(const char *p, int n)
{
    for (; n > 0; n--) {
        /* advance to the separator that ends the current field */
        for (; *p != ';'; p++) {
            if (*p == '\0')
                return NULL;
        }
        p++; /* step over the ';' */
    }
    return p;
}
93
+
94
/* Copy the n-th ';'-separated field of p into buf as a NUL-terminated
 * string and return buf.
 *
 * The copy is truncated to fit buf_size bytes including the terminator.
 * When the field does not exist (get_field() returns NULL) buf receives
 * the empty string instead of the NULL pointer being dereferenced.
 * When buf_size is 0 nothing is written at all. */
const char *get_field_buf(char *buf, size_t buf_size, const char *p, int n)
{
    char *q;

    if (buf_size == 0)
        return buf; /* no room even for the terminator */
    q = buf;
    p = get_field(p, n);
    if (p != NULL) {
        while (*p != ';' && *p != '\0') {
            if ((size_t)(q - buf) < buf_size - 1)
                *q++ = *p;
            p++;
        }
    }
    *q = '\0';
    return buf;
}
107
+
108
/* Append the value c to a growable int array.
 *
 *   pbuf  - address of the array pointer (may point to NULL when empty)
 *   psize - address of the allocated capacity, in elements
 *   plen  - address of the number of elements in use
 *
 * When the array is full its capacity grows to max(len + 1, size * 3 / 2).
 * A failed reallocation terminates the program with a message; the
 * previous code stored the NULL result and then wrote through it. */
void add_char(int **pbuf, int *psize, int *plen, int c)
{
    int len, size, *buf;

    buf = *pbuf;
    size = *psize;
    len = *plen;
    if (len >= size) {
        int grown = size * 3 / 2;
        int *new_buf;

        size = (len + 1 > grown) ? len + 1 : grown;
        new_buf = realloc(buf, sizeof(buf[0]) * size);
        if (!new_buf) {
            fprintf(stderr, "add_char: out of memory\n");
            exit(1);
        }
        buf = new_buf;
        *pbuf = buf;
        *psize = size;
    }
    buf[len++] = c;
    *plen = len;
}
124
+
125
/* Parse the n-th ';'-separated field of str as a whitespace-separated list
 * of hexadecimal numbers.
 *
 * Returns an array built with add_char() (the caller frees it) and stores
 * its element count in *plen. Returns NULL with *plen == 0 when the field
 * does not exist; the result is also NULL when the field holds no numbers.
 *
 * Characters are converted to unsigned char before being handed to the
 * <ctype.h> classifiers, and strtoul() reports its end position through a
 * real 'char *' rather than through a cast 'const char **'. */
int *get_field_str(int *plen, const char *str, int n)
{
    const char *p;
    char *end;
    int *buf, len, size;

    p = get_field(str, n);
    if (!p) {
        *plen = 0;
        return NULL;
    }
    len = 0;
    size = 0;
    buf = NULL;
    for(;;) {
        while (isspace((unsigned char)*p))
            p++;
        if (!isxdigit((unsigned char)*p))
            break;
        add_char(&buf, &size, &len, strtoul(p, &end, 16));
        p = end;
    }
    *plen = len;
    return buf;
}
147
+
148
/* Read one line from f into buf (at most buf_size - 1 characters) and
 * strip the trailing '\n' if one was read. Returns buf, or NULL when
 * fgets() reports end of file or an error. */
char *get_line(char *buf, int buf_size, FILE *f)
{
    char *tail;

    if (fgets(buf, buf_size, f) == NULL)
        return NULL;
    tail = buf + strlen(buf);
    if (tail != buf && tail[-1] == '\n')
        tail[-1] = '\0';
    return buf;
}
158
+
159
/* unicode_gen_def.h is included three times below with a different DEF()
   each time, producing an enum and two parallel string tables from one
   list. NOTE(review): UNICODE_GENERAL_CATEGORY presumably selects the
   general-category section of that header -- the header is not visible
   here, confirm. */
#define UNICODE_GENERAL_CATEGORY

/* One GCAT_<id> enumerator per DEF() entry; GCAT_COUNT is the entry count. */
typedef enum {
#define DEF(id, str) GCAT_ ## id,
#include "unicode_gen_def.h"
#undef DEF
    GCAT_COUNT,
} UnicodeGCEnum1;

/* The DEF() identifiers as strings, indexed by the enum above. This is the
   table parse_unicode_data() searches with find_name(). */
static const char *unicode_gc_name[] = {
#define DEF(id, str) #id,
#include "unicode_gen_def.h"
#undef DEF
};

/* The second DEF() argument of each entry, indexed by the enum above. */
static const char *unicode_gc_short_name[] = {
#define DEF(id, str) str,
#include "unicode_gen_def.h"
#undef DEF
};

#undef UNICODE_GENERAL_CATEGORY
181
+
182
/* Same triple-include pattern as for the general categories.
   NOTE(review): UNICODE_SCRIPT presumably selects the script section of
   unicode_gen_def.h -- the header is not visible here, confirm. */
#define UNICODE_SCRIPT

/* One SCRIPT_<id> enumerator per DEF() entry; SCRIPT_COUNT is the entry
   count. */
typedef enum {
#define DEF(id, str) SCRIPT_ ## id,
#include "unicode_gen_def.h"
#undef DEF
    SCRIPT_COUNT,
} UnicodeScriptEnum1;

/* The DEF() identifiers as strings; searched by parse_scripts(). */
static const char *unicode_script_name[] = {
#define DEF(id, str) #id,
#include "unicode_gen_def.h"
#undef DEF
};

/* The second DEF() argument of each entry; searched by
   parse_script_extensions(). find_name() treats ',' inside an entry as a
   separator between alternative names. Unlike the other name tables this
   one is not static. */
const char *unicode_script_short_name[] = {
#define DEF(id, str) str,
#include "unicode_gen_def.h"
#undef DEF
};

#undef UNICODE_SCRIPT
204
+
205
/* Same triple-include pattern as for the general categories.
   NOTE(review): UNICODE_PROP_LIST presumably selects the binary property
   section of unicode_gen_def.h -- the header is not visible here,
   confirm. */
#define UNICODE_PROP_LIST

/* One PROP_<id> enumerator per DEF() entry; PROP_COUNT is the entry count.
   The enumerator value is the bit index used by get_prop() / set_prop(). */
typedef enum {
#define DEF(id, str) PROP_ ## id,
#include "unicode_gen_def.h"
#undef DEF
    PROP_COUNT,
} UnicodePropEnum1;

/* The DEF() identifiers as strings; searched by
   parse_derived_core_properties() and parse_prop_list(). */
static const char *unicode_prop_name[] = {
#define DEF(id, str) #id,
#include "unicode_gen_def.h"
#undef DEF
};

/* The second DEF() argument of each entry, indexed by the enum above. */
static const char *unicode_prop_short_name[] = {
#define DEF(id, str) str,
#include "unicode_gen_def.h"
#undef DEF
};

#undef UNICODE_PROP_LIST
227
+
228
/* Everything the generator records about one code point. */
typedef struct {
    /* case conv */
    /* number of valid entries in u_data / l_data / f_data (0 = no mapping) */
    uint8_t u_len;
    uint8_t l_len;
    uint8_t f_len;
    int u_data[CC_LEN_MAX]; /* to upper case */
    int l_data[CC_LEN_MAX]; /* to lower case */
    int f_data[CC_LEN_MAX]; /* to case folding */

    uint8_t combining_class; /* UnicodeData field 3, stored when non-zero */
    uint8_t is_compat:1;     /* decomposition field started with a <tag> */
    uint8_t is_excluded:1;   /* set by parse_composition_exclusions() */
    uint8_t general_category; /* index into unicode_gc_name[] */
    uint8_t script;          /* index into unicode_script_name[] */
    uint8_t script_ext_len;  /* number of entries in script_ext */
    uint8_t *script_ext;     /* malloc'ed indexes into unicode_script_short_name[] */
    /* one bit per property index, 32 bits per word (see set_prop()) */
    uint32_t prop_bitmap_tab[3];
    /* decomposition */
    int decomp_len;          /* number of entries in decomp_data */
    int *decomp_data;        /* array grown with add_char() */
} CCInfo;

/* The database, indexed by code point. It is allocated outside the visible
   part of this file. */
CCInfo *unicode_db;
251
+
252
/* Look up 'name' in a table of strings where each entry may hold several
 * alternative names separated by ','. Returns the index of the first entry
 * containing an exact (case-sensitive, full-length) match, or -1 when no
 * entry matches. */
int find_name(const char **tab, int tab_len, const char *name)
{
    int wanted_len = strlen(name);
    int idx;

    for (idx = 0; idx < tab_len; idx++) {
        const char *alias = tab[idx];

        while (alias != NULL) {
            const char *comma = strchr(alias, ',');
            int alias_len;

            if (comma != NULL)
                alias_len = comma - alias;
            else
                alias_len = strlen(alias);
            if (alias_len == wanted_len &&
                memcmp(alias, name, alias_len) == 0)
                return idx;
            /* move to the next alternative, if any */
            alias = (comma != NULL) ? comma + 1 : NULL;
        }
    }
    return -1;
}
275
+
276
+ static BOOL get_prop(uint32_t c, int prop_idx)
277
+ {
278
+ return (unicode_db[c].prop_bitmap_tab[prop_idx >> 5] >> (prop_idx & 0x1f)) & 1;
279
+ }
280
+
281
+ static void set_prop(uint32_t c, int prop_idx, int val)
282
+ {
283
+ uint32_t mask;
284
+ mask = 1U << (prop_idx & 0x1f);
285
+ if (val)
286
+ unicode_db[c].prop_bitmap_tab[prop_idx >> 5] |= mask;
287
+ else
288
+ unicode_db[c].prop_bitmap_tab[prop_idx >> 5] &= ~mask;
289
+ }
290
+
291
+ void parse_unicode_data(const char *filename)
292
+ {
293
+ FILE *f;
294
+ char line[1024];
295
+ char buf1[256];
296
+ const char *p;
297
+ int code, lc, uc, last_code;
298
+ CCInfo *ci, *tab = unicode_db;
299
+
300
+ f = fopen(filename, "rb");
301
+ if (!f) {
302
+ perror(filename);
303
+ exit(1);
304
+ }
305
+
306
+ last_code = 0;
307
+ for(;;) {
308
+ if (!get_line(line, sizeof(line), f))
309
+ break;
310
+ p = line;
311
+ while (isspace(*p))
312
+ p++;
313
+ if (*p == '#')
314
+ continue;
315
+
316
+ p = get_field(line, 0);
317
+ if (!p)
318
+ continue;
319
+ code = strtoul(p, NULL, 16);
320
+ lc = 0;
321
+ uc = 0;
322
+
323
+ p = get_field(line, 12);
324
+ if (p && *p != ';') {
325
+ uc = strtoul(p, NULL, 16);
326
+ }
327
+
328
+ p = get_field(line, 13);
329
+ if (p && *p != ';') {
330
+ lc = strtoul(p, NULL, 16);
331
+ }
332
+ ci = &tab[code];
333
+ if (uc > 0 || lc > 0) {
334
+ assert(code <= CHARCODE_MAX);
335
+ if (uc > 0) {
336
+ assert(ci->u_len == 0);
337
+ ci->u_len = 1;
338
+ ci->u_data[0] = uc;
339
+ }
340
+ if (lc > 0) {
341
+ assert(ci->l_len == 0);
342
+ ci->l_len = 1;
343
+ ci->l_data[0] = lc;
344
+ }
345
+ }
346
+
347
+ {
348
+ int i;
349
+ get_field_buf(buf1, sizeof(buf1), line, 2);
350
+ i = find_name(unicode_gc_name, countof(unicode_gc_name), buf1);
351
+ if (i < 0) {
352
+ fprintf(stderr, "General category '%s' not found\n",
353
+ buf1);
354
+ exit(1);
355
+ }
356
+ ci->general_category = i;
357
+ }
358
+
359
+ p = get_field(line, 3);
360
+ if (p && *p != ';' && *p != '\0') {
361
+ int cc;
362
+ cc = strtoul(p, NULL, 0);
363
+ if (cc != 0) {
364
+ assert(code <= CHARCODE_MAX);
365
+ ci->combining_class = cc;
366
+ // printf("%05x: %d\n", code, ci->combining_class);
367
+ }
368
+ }
369
+
370
+ p = get_field(line, 5);
371
+ if (p && *p != ';' && *p != '\0') {
372
+ int size;
373
+ assert(code <= CHARCODE_MAX);
374
+ ci->is_compat = 0;
375
+ if (*p == '<') {
376
+ while (*p != '\0' && *p != '>')
377
+ p++;
378
+ if (*p == '>')
379
+ p++;
380
+ ci->is_compat = 1;
381
+ }
382
+ size = 0;
383
+ for(;;) {
384
+ while (isspace(*p))
385
+ p++;
386
+ if (!isxdigit(*p))
387
+ break;
388
+ add_char(&ci->decomp_data, &size, &ci->decomp_len, strtoul(p, (char **)&p, 16));
389
+ }
390
+ #if 0
391
+ {
392
+ int i;
393
+ static int count, d_count;
394
+
395
+ printf("%05x: %c", code, ci->is_compat ? 'C': ' ');
396
+ for(i = 0; i < ci->decomp_len; i++)
397
+ printf(" %05x", ci->decomp_data[i]);
398
+ printf("\n");
399
+ count++;
400
+ d_count += ci->decomp_len;
401
+ // printf("%d %d\n", count, d_count);
402
+ }
403
+ #endif
404
+ }
405
+
406
+ p = get_field(line, 9);
407
+ if (p && *p == 'Y') {
408
+ set_prop(code, PROP_Bidi_Mirrored, 1);
409
+ }
410
+
411
+ /* handle ranges */
412
+ get_field_buf(buf1, sizeof(buf1), line, 1);
413
+ if (strstr(buf1, " Last>")) {
414
+ int i;
415
+ // printf("range: 0x%x-%0x\n", last_code, code);
416
+ assert(ci->decomp_len == 0);
417
+ assert(ci->script_ext_len == 0);
418
+ for(i = last_code + 1; i < code; i++) {
419
+ unicode_db[i] = *ci;
420
+ }
421
+ }
422
+ last_code = code;
423
+ }
424
+
425
+ fclose(f);
426
+ }
427
+
428
+ void parse_special_casing(CCInfo *tab, const char *filename)
429
+ {
430
+ FILE *f;
431
+ char line[1024];
432
+ const char *p;
433
+ int code;
434
+ CCInfo *ci;
435
+
436
+ f = fopen(filename, "rb");
437
+ if (!f) {
438
+ perror(filename);
439
+ exit(1);
440
+ }
441
+
442
+ for(;;) {
443
+ if (!get_line(line, sizeof(line), f))
444
+ break;
445
+ p = line;
446
+ while (isspace(*p))
447
+ p++;
448
+ if (*p == '#')
449
+ continue;
450
+
451
+ p = get_field(line, 0);
452
+ if (!p)
453
+ continue;
454
+ code = strtoul(p, NULL, 16);
455
+ assert(code <= CHARCODE_MAX);
456
+ ci = &tab[code];
457
+
458
+ p = get_field(line, 4);
459
+ if (p) {
460
+ /* locale dependent casing */
461
+ while (isspace(*p))
462
+ p++;
463
+ if (*p != '#' && *p != '\0')
464
+ continue;
465
+ }
466
+
467
+
468
+ p = get_field(line, 1);
469
+ if (p && *p != ';') {
470
+ ci->l_len = 0;
471
+ for(;;) {
472
+ while (isspace(*p))
473
+ p++;
474
+ if (*p == ';')
475
+ break;
476
+ assert(ci->l_len < CC_LEN_MAX);
477
+ ci->l_data[ci->l_len++] = strtoul(p, (char **)&p, 16);
478
+ }
479
+
480
+ if (ci->l_len == 1 && ci->l_data[0] == code)
481
+ ci->l_len = 0;
482
+ }
483
+
484
+ p = get_field(line, 3);
485
+ if (p && *p != ';') {
486
+ ci->u_len = 0;
487
+ for(;;) {
488
+ while (isspace(*p))
489
+ p++;
490
+ if (*p == ';')
491
+ break;
492
+ assert(ci->u_len < CC_LEN_MAX);
493
+ ci->u_data[ci->u_len++] = strtoul(p, (char **)&p, 16);
494
+ }
495
+
496
+ if (ci->u_len == 1 && ci->u_data[0] == code)
497
+ ci->u_len = 0;
498
+ }
499
+ }
500
+
501
+ fclose(f);
502
+ }
503
+
504
+ void parse_case_folding(CCInfo *tab, const char *filename)
505
+ {
506
+ FILE *f;
507
+ char line[1024];
508
+ const char *p;
509
+ int code, status;
510
+ CCInfo *ci;
511
+
512
+ f = fopen(filename, "rb");
513
+ if (!f) {
514
+ perror(filename);
515
+ exit(1);
516
+ }
517
+
518
+ for(;;) {
519
+ if (!get_line(line, sizeof(line), f))
520
+ break;
521
+ p = line;
522
+ while (isspace(*p))
523
+ p++;
524
+ if (*p == '#')
525
+ continue;
526
+
527
+ p = get_field(line, 0);
528
+ if (!p)
529
+ continue;
530
+ code = strtoul(p, NULL, 16);
531
+ assert(code <= CHARCODE_MAX);
532
+ ci = &tab[code];
533
+
534
+ p = get_field(line, 1);
535
+ if (!p)
536
+ continue;
537
+ /* locale dependent casing */
538
+ while (isspace(*p))
539
+ p++;
540
+ status = *p;
541
+ if (status != 'C' && status != 'S' && status != 'F')
542
+ continue;
543
+
544
+ p = get_field(line, 2);
545
+ assert(p != NULL);
546
+ if (status == 'S') {
547
+ /* we always select the simple case folding and assume it
548
+ * comes after the full case folding case */
549
+ assert(ci->f_len >= 2);
550
+ ci->f_len = 0;
551
+ } else {
552
+ assert(ci->f_len == 0);
553
+ }
554
+ for(;;) {
555
+ while (isspace(*p))
556
+ p++;
557
+ if (*p == ';')
558
+ break;
559
+ assert(ci->l_len < CC_LEN_MAX);
560
+ ci->f_data[ci->f_len++] = strtoul(p, (char **)&p, 16);
561
+ }
562
+ }
563
+
564
+ fclose(f);
565
+ }
566
+
567
+ void parse_composition_exclusions(const char *filename)
568
+ {
569
+ FILE *f;
570
+ char line[4096], *p;
571
+ uint32_t c0;
572
+
573
+ f = fopen(filename, "rb");
574
+ if (!f) {
575
+ perror(filename);
576
+ exit(1);
577
+ }
578
+
579
+ for(;;) {
580
+ if (!get_line(line, sizeof(line), f))
581
+ break;
582
+ p = line;
583
+ while (isspace(*p))
584
+ p++;
585
+ if (*p == '#' || *p == '@' || *p == '\0')
586
+ continue;
587
+ c0 = strtoul(p, (char **)&p, 16);
588
+ assert(c0 > 0 && c0 <= CHARCODE_MAX);
589
+ unicode_db[c0].is_excluded = TRUE;
590
+ }
591
+ fclose(f);
592
+ }
593
+
594
+ void parse_derived_core_properties(const char *filename)
595
+ {
596
+ FILE *f;
597
+ char line[4096], *p, buf[256], *q;
598
+ uint32_t c0, c1, c;
599
+ int i;
600
+
601
+ f = fopen(filename, "rb");
602
+ if (!f) {
603
+ perror(filename);
604
+ exit(1);
605
+ }
606
+
607
+ for(;;) {
608
+ if (!get_line(line, sizeof(line), f))
609
+ break;
610
+ p = line;
611
+ while (isspace(*p))
612
+ p++;
613
+ if (*p == '#' || *p == '@' || *p == '\0')
614
+ continue;
615
+ c0 = strtoul(p, (char **)&p, 16);
616
+ if (*p == '.' && p[1] == '.') {
617
+ p += 2;
618
+ c1 = strtoul(p, (char **)&p, 16);
619
+ } else {
620
+ c1 = c0;
621
+ }
622
+ assert(c1 <= CHARCODE_MAX);
623
+ p += strspn(p, " \t");
624
+ if (*p == ';') {
625
+ p++;
626
+ p += strspn(p, " \t");
627
+ q = buf;
628
+ while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
629
+ if ((q - buf) < sizeof(buf) - 1)
630
+ *q++ = *p;
631
+ p++;
632
+ }
633
+ *q = '\0';
634
+ i = find_name(unicode_prop_name,
635
+ countof(unicode_prop_name), buf);
636
+ if (i < 0) {
637
+ if (!strcmp(buf, "Grapheme_Link"))
638
+ goto next;
639
+ fprintf(stderr, "Property not found: %s\n", buf);
640
+ exit(1);
641
+ }
642
+ for(c = c0; c <= c1; c++) {
643
+ set_prop(c, i, 1);
644
+ }
645
+ next: ;
646
+ }
647
+ }
648
+ fclose(f);
649
+ }
650
+
651
+ void parse_derived_norm_properties(const char *filename)
652
+ {
653
+ FILE *f;
654
+ char line[4096], *p, buf[256], *q;
655
+ uint32_t c0, c1, c;
656
+
657
+ f = fopen(filename, "rb");
658
+ if (!f) {
659
+ perror(filename);
660
+ exit(1);
661
+ }
662
+
663
+ for(;;) {
664
+ if (!get_line(line, sizeof(line), f))
665
+ break;
666
+ p = line;
667
+ while (isspace(*p))
668
+ p++;
669
+ if (*p == '#' || *p == '@' || *p == '\0')
670
+ continue;
671
+ c0 = strtoul(p, (char **)&p, 16);
672
+ if (*p == '.' && p[1] == '.') {
673
+ p += 2;
674
+ c1 = strtoul(p, (char **)&p, 16);
675
+ } else {
676
+ c1 = c0;
677
+ }
678
+ assert(c1 <= CHARCODE_MAX);
679
+ p += strspn(p, " \t");
680
+ if (*p == ';') {
681
+ p++;
682
+ p += strspn(p, " \t");
683
+ q = buf;
684
+ while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
685
+ if ((q - buf) < sizeof(buf) - 1)
686
+ *q++ = *p;
687
+ p++;
688
+ }
689
+ *q = '\0';
690
+ if (!strcmp(buf, "Changes_When_NFKC_Casefolded")) {
691
+ for(c = c0; c <= c1; c++) {
692
+ set_prop(c, PROP_Changes_When_NFKC_Casefolded, 1);
693
+ }
694
+ }
695
+ }
696
+ }
697
+ fclose(f);
698
+ }
699
+
700
+ void parse_prop_list(const char *filename)
701
+ {
702
+ FILE *f;
703
+ char line[4096], *p, buf[256], *q;
704
+ uint32_t c0, c1, c;
705
+ int i;
706
+
707
+ f = fopen(filename, "rb");
708
+ if (!f) {
709
+ perror(filename);
710
+ exit(1);
711
+ }
712
+
713
+ for(;;) {
714
+ if (!get_line(line, sizeof(line), f))
715
+ break;
716
+ p = line;
717
+ while (isspace(*p))
718
+ p++;
719
+ if (*p == '#' || *p == '@' || *p == '\0')
720
+ continue;
721
+ c0 = strtoul(p, (char **)&p, 16);
722
+ if (*p == '.' && p[1] == '.') {
723
+ p += 2;
724
+ c1 = strtoul(p, (char **)&p, 16);
725
+ } else {
726
+ c1 = c0;
727
+ }
728
+ assert(c1 <= CHARCODE_MAX);
729
+ p += strspn(p, " \t");
730
+ if (*p == ';') {
731
+ p++;
732
+ p += strspn(p, " \t");
733
+ q = buf;
734
+ while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
735
+ if ((q - buf) < sizeof(buf) - 1)
736
+ *q++ = *p;
737
+ p++;
738
+ }
739
+ *q = '\0';
740
+ i = find_name(unicode_prop_name,
741
+ countof(unicode_prop_name), buf);
742
+ if (i < 0) {
743
+ fprintf(stderr, "Property not found: %s\n", buf);
744
+ exit(1);
745
+ }
746
+ for(c = c0; c <= c1; c++) {
747
+ set_prop(c, i, 1);
748
+ }
749
+ }
750
+ }
751
+ fclose(f);
752
+ }
753
+
754
+ void parse_scripts(const char *filename)
755
+ {
756
+ FILE *f;
757
+ char line[4096], *p, buf[256], *q;
758
+ uint32_t c0, c1, c;
759
+ int i;
760
+
761
+ f = fopen(filename, "rb");
762
+ if (!f) {
763
+ perror(filename);
764
+ exit(1);
765
+ }
766
+
767
+ for(;;) {
768
+ if (!get_line(line, sizeof(line), f))
769
+ break;
770
+ p = line;
771
+ while (isspace(*p))
772
+ p++;
773
+ if (*p == '#' || *p == '@' || *p == '\0')
774
+ continue;
775
+ c0 = strtoul(p, (char **)&p, 16);
776
+ if (*p == '.' && p[1] == '.') {
777
+ p += 2;
778
+ c1 = strtoul(p, (char **)&p, 16);
779
+ } else {
780
+ c1 = c0;
781
+ }
782
+ assert(c1 <= CHARCODE_MAX);
783
+ p += strspn(p, " \t");
784
+ if (*p == ';') {
785
+ p++;
786
+ p += strspn(p, " \t");
787
+ q = buf;
788
+ while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
789
+ if ((q - buf) < sizeof(buf) - 1)
790
+ *q++ = *p;
791
+ p++;
792
+ }
793
+ *q = '\0';
794
+ i = find_name(unicode_script_name,
795
+ countof(unicode_script_name), buf);
796
+ if (i < 0) {
797
+ fprintf(stderr, "Unknown script: '%s'\n", buf);
798
+ exit(1);
799
+ }
800
+ for(c = c0; c <= c1; c++)
801
+ unicode_db[c].script = i;
802
+ }
803
+ }
804
+ fclose(f);
805
+ }
806
+
807
+ void parse_script_extensions(const char *filename)
808
+ {
809
+ FILE *f;
810
+ char line[4096], *p, buf[256], *q;
811
+ uint32_t c0, c1, c;
812
+ int i;
813
+ uint8_t script_ext[255];
814
+ int script_ext_len;
815
+
816
+ f = fopen(filename, "rb");
817
+ if (!f) {
818
+ perror(filename);
819
+ exit(1);
820
+ }
821
+
822
+ for(;;) {
823
+ if (!get_line(line, sizeof(line), f))
824
+ break;
825
+ p = line;
826
+ while (isspace(*p))
827
+ p++;
828
+ if (*p == '#' || *p == '@' || *p == '\0')
829
+ continue;
830
+ c0 = strtoul(p, (char **)&p, 16);
831
+ if (*p == '.' && p[1] == '.') {
832
+ p += 2;
833
+ c1 = strtoul(p, (char **)&p, 16);
834
+ } else {
835
+ c1 = c0;
836
+ }
837
+ assert(c1 <= CHARCODE_MAX);
838
+ p += strspn(p, " \t");
839
+ script_ext_len = 0;
840
+ if (*p == ';') {
841
+ p++;
842
+ for(;;) {
843
+ p += strspn(p, " \t");
844
+ q = buf;
845
+ while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
846
+ if ((q - buf) < sizeof(buf) - 1)
847
+ *q++ = *p;
848
+ p++;
849
+ }
850
+ *q = '\0';
851
+ if (buf[0] == '\0')
852
+ break;
853
+ i = find_name(unicode_script_short_name,
854
+ countof(unicode_script_short_name), buf);
855
+ if (i < 0) {
856
+ fprintf(stderr, "Script not found: %s\n", buf);
857
+ exit(1);
858
+ }
859
+ assert(script_ext_len < sizeof(script_ext));
860
+ script_ext[script_ext_len++] = i;
861
+ }
862
+ for(c = c0; c <= c1; c++) {
863
+ CCInfo *ci = &unicode_db[c];
864
+ ci->script_ext_len = script_ext_len;
865
+ ci->script_ext = malloc(sizeof(ci->script_ext[0]) * script_ext_len);
866
+ for(i = 0; i < script_ext_len; i++)
867
+ ci->script_ext[i] = script_ext[i];
868
+ }
869
+ }
870
+ }
871
+ fclose(f);
872
+ }
873
+
874
+ void dump_cc_info(CCInfo *ci, int i)
875
+ {
876
+ int j;
877
+ printf("%05x:", i);
878
+ if (ci->u_len != 0) {
879
+ printf(" U:");
880
+ for(j = 0; j < ci->u_len; j++)
881
+ printf(" %05x", ci->u_data[j]);
882
+ }
883
+ if (ci->l_len != 0) {
884
+ printf(" L:");
885
+ for(j = 0; j < ci->l_len; j++)
886
+ printf(" %05x", ci->l_data[j]);
887
+ }
888
+ if (ci->f_len != 0) {
889
+ printf(" F:");
890
+ for(j = 0; j < ci->f_len; j++)
891
+ printf(" %05x", ci->f_data[j]);
892
+ }
893
+ printf("\n");
894
+ }
895
+
896
+ void dump_unicode_data(CCInfo *tab)
897
+ {
898
+ int i;
899
+ CCInfo *ci;
900
+ for(i = 0; i <= CHARCODE_MAX; i++) {
901
+ ci = &tab[i];
902
+ if (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0) {
903
+ dump_cc_info(ci, i);
904
+ }
905
+ }
906
+ }
907
+
908
+ BOOL is_complicated_case(const CCInfo *ci)
909
+ {
910
+ return (ci->u_len > 1 || ci->l_len > 1 ||
911
+ (ci->u_len > 0 && ci->l_len > 0) ||
912
+ (ci->f_len != ci->l_len) ||
913
+ (memcmp(ci->f_data, ci->l_data, ci->f_len * sizeof(ci->f_data[0])) != 0));
914
+ }
915
+
916
/* Kinds of case conversion runs produced by find_run_type() and
   build_conv_table(). The order must match run_type_str[].
   NOTE(review): with USE_TEST defined this enum is skipped and libunicode.c
   is included instead; presumably that file supplies the same constants --
   confirm. */
#ifndef USE_TEST
enum {
    RUN_TYPE_U,            /* run of single upper mappings, consecutive targets */
    RUN_TYPE_L,            /* run of single lower mappings without folding */
    RUN_TYPE_UF,           /* run of single upper mappings, folding == upper */
    RUN_TYPE_LF,           /* run of single lower mappings, folding == lower */
    RUN_TYPE_UL,           /* alternating pairs: c -> c+1 lower, c+1 -> c upper */
    RUN_TYPE_LSU,          /* triple: c and c+1 lower to c+2; c+1 and c+2 upper to c */
    RUN_TYPE_U2L_399_EXT2, /* upper = 2 chars ending in 0x399, plus single lower == folding */
    RUN_TYPE_UF_D20,       /* one entry: upper < 0x1000, folding = upper + 0x20 */
    RUN_TYPE_UF_D1_EXT,    /* one entry: folding = upper + 1 */
    RUN_TYPE_U_EXT,        /* RUN_TYPE_U whose target is not the start of a table entry */
    RUN_TYPE_LF_EXT,       /* RUN_TYPE_LF whose target is not the start of a table entry */
    RUN_TYPE_UF_EXT2,      /* upper = 2 chars, folding = 2 chars */
    RUN_TYPE_LF_EXT2,      /* lower = 2 chars, folding identical */
    RUN_TYPE_UF_EXT3,      /* upper = 3 chars, folding = their simple lower cases */
};
#endif
934
+
935
/* Printable names of the RUN_TYPE_* constants, indexed by run type; used
   by dump_case_conv_table1(). The order must follow the enum. */
const char *run_type_str[] = {
    "U",
    "L",
    "UF",
    "LF",
    "UL",
    "LSU",
    "U2L_399_EXT2",
    "UF_D20",
    "UF_D1_EXT",
    "U_EXT",
    "LF_EXT",
    "UF_EXT2",
    "LF_EXT2",
    "UF_EXT3",
};
951
+
952
/* One run of the case conversion table, filled in by find_run_type() and
   completed by build_conv_table(). */
typedef struct {
    int code;        /* first code point of the run */
    int len;         /* number of consecutive code points covered */
    int type;        /* RUN_TYPE_xxx */
    int data;        /* first target code point for the simple run types */
    int ext_len;     /* number of valid entries in ext_data */
    int ext_data[3]; /* code points for the _EXT run types */
    int data_index; /* 'data' coming from the table */
} TableEntry;
961
+
962
+ static int simple_to_lower(CCInfo *tab, int c)
963
+ {
964
+ if (tab[c].l_len != 1)
965
+ return c;
966
+ return tab[c].l_data[0];
967
+ }
968
+
969
+ /* code (17), len (7), type (4) */
970
+
971
/* Classify the case mappings starting at 'code' and describe them in *te:
   te->code, te->len and te->type are always set; te->data or
   te->ext_data / te->ext_len are set depending on the type. Fields that a
   given type does not use are left untouched. Aborts the program when the
   mappings of 'code' fit none of the supported encodings.

   The tests are ordered: the LSU triple first, then the "complicated"
   encodings (see is_complicated_case()), then the simple runs. */
void find_run_type(TableEntry *te, CCInfo *tab, int code)
{
    int is_lower, len;
    CCInfo *ci, *ci1, *ci2;

    ci = &tab[code];
    ci1 = &tab[code + 1];
    ci2 = &tab[code + 2];
    te->code = code;

    /* three consecutive code points: code and code+1 lower (and fold) to
       code+2, code+1 and code+2 upper to code */
    if (ci->l_len == 1 && ci->l_data[0] == code + 2 &&
        ci->f_len == 1 && ci->f_data[0] == ci->l_data[0] &&
        ci->u_len == 0 &&

        ci1->l_len == 1 && ci1->l_data[0] == code + 2 &&
        ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0] &&
        ci1->u_len == 1 && ci1->u_data[0] == code &&

        ci2->l_len == 0 &&
        ci2->f_len == 0 &&
        ci2->u_len == 1 && ci2->u_data[0] == code) {
        te->len = 3;
        te->data = 0;
        te->type = RUN_TYPE_LSU;
        return;
    }

    if (is_complicated_case(ci)) {
        /* run of single upper mappings with consecutive targets, no lower
           mapping and folding equal to the upper mapping; only the
           following code points are tested, the targets are derived from
           ci->u_data[0] */
        len = 1;
        while (code + len <= CHARCODE_MAX) {
            ci1 = &tab[code + len];
            if (ci1->u_len != 1 ||
                ci1->u_data[0] != ci->u_data[0] + len ||
                ci1->l_len != 0 ||
                ci1->f_len != 1 || ci1->f_data[0] != ci1->u_data[0])
                break;
            len++;
        }
        if (len > 1) {
            te->len = len;
            te->type = RUN_TYPE_UF;
            te->data = ci->u_data[0];
            return;
        }

        /* upper = X + 0x399, folding = simple_lower(X) + 0x3B9, no lower
           mapping; extended over following code points with consecutive X */
        if (ci->l_len == 0 &&
            ci->u_len == 2 && ci->u_data[1] == 0x399 &&
            ci->f_len == 2 && ci->f_data[1] == 0x3B9 &&
            ci->f_data[0] == simple_to_lower(tab, ci->u_data[0])) {
            len = 1;
            while (code + len <= CHARCODE_MAX) {
                ci1 = &tab[code + len];
                if (!(ci1->u_len == 2 &&
                      ci1->u_data[1] == ci->u_data[1] &&
                      ci1->u_data[0] == ci->u_data[0] + len &&
                      ci1->f_len == 2 &&
                      ci1->f_data[1] == ci->f_data[1] &&
                      ci1->f_data[0] == ci->f_data[0] + len &&
                      ci1->l_len == 0))
                    break;
                len++;
            }
            te->len = len;
            te->type = RUN_TYPE_UF_EXT2;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->u_data[1];
            te->ext_len = 2;
            return;
        }

        /* upper = X + 0x399 together with a single lower mapping that is
           also the folding; extended while X and the lower target stay
           consecutive */
        if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
            ci->l_len == 1 &&
            ci->f_len == 1 && ci->f_data[0] == ci->l_data[0]) {
            len = 1;
            while (code + len <= CHARCODE_MAX) {
                ci1 = &tab[code + len];
                if (!(ci1->u_len == 2 &&
                      ci1->u_data[1] == 0x399 &&
                      ci1->u_data[0] == ci->u_data[0] + len &&
                      ci1->l_len == 1 &&
                      ci1->l_data[0] == ci->l_data[0] + len &&
                      ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0]))
                    break;
                len++;
            }
            te->len = len;
            te->type = RUN_TYPE_U2L_399_EXT2;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->l_data[0];
            te->ext_len = 2;
            return;
        }

        /* single lower mapping with neither upper mapping nor folding */
        if (ci->l_len == 1 && ci->u_len == 0 && ci->f_len == 0) {
            len = 1;
            while (code + len <= CHARCODE_MAX) {
                ci1 = &tab[code + len];
                if (!(ci1->l_len == 1 &&
                      ci1->l_data[0] == ci->l_data[0] + len &&
                      ci1->u_len == 0 && ci1->f_len == 0))
                    break;
                len++;
            }
            te->len = len;
            te->type = RUN_TYPE_L;
            te->data = ci->l_data[0];
            return;
        }

        /* remaining encodings always describe a single code point */
        if (ci->l_len == 0 &&
            ci->u_len == 1 &&
            ci->u_data[0] < 0x1000 &&
            ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 0x20) {
            te->len = 1;
            te->type = RUN_TYPE_UF_D20;
            te->data = ci->u_data[0];
        } else if (ci->l_len == 0 &&
            ci->u_len == 1 &&
            ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 1) {
            te->len = 1;
            te->type = RUN_TYPE_UF_D1_EXT;
            te->ext_data[0] = ci->u_data[0];
            te->ext_len = 1;
        } else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_len == 2 &&
                   ci->l_data[0] == ci->f_data[0] &&
                   ci->l_data[1] == ci->f_data[1]) {
            te->len = 1;
            te->type = RUN_TYPE_LF_EXT2;
            te->ext_data[0] = ci->l_data[0];
            te->ext_data[1] = ci->l_data[1];
            te->ext_len = 2;
        } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_len == 2 &&
                   ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) &&
                   ci->f_data[1] == simple_to_lower(tab, ci->u_data[1])) {
            te->len = 1;
            te->type = RUN_TYPE_UF_EXT2;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->u_data[1];
            te->ext_len = 2;
        } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_len == 3 &&
                   ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) &&
                   ci->f_data[1] == simple_to_lower(tab, ci->u_data[1]) &&
                   ci->f_data[2] == simple_to_lower(tab, ci->u_data[2])) {
            te->len = 1;
            te->type = RUN_TYPE_UF_EXT3;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->u_data[1];
            te->ext_data[2] = ci->u_data[2];
            te->ext_len = 3;
        } else {
            printf("unsupported encoding case:\n");
            dump_cc_info(ci, code);
            abort();
        }
    } else {
        /* look for a run of identical conversions */
        /* pairs (c, c+1) where c lowers to c+1 and c+1 uppers to c; the
           126 limit keeps the run length within the 127 asserted by
           build_conv_table().
           NOTE(review): the bound tests 'code', not 'code + len', so
           tab[code + len + 1] is not checked against CHARCODE_MAX --
           confirm the table is large enough or tighten the test. */
        len = 0;
        for(;;) {
            if (code >= CHARCODE_MAX || len >= 126)
                break;
            ci = &tab[code + len];
            ci1 = &tab[code + len + 1];
            if (is_complicated_case(ci) || is_complicated_case(ci1)) {
                break;
            }
            if (ci->l_len != 1 || ci->l_data[0] != code + len + 1)
                break;
            if (ci1->u_len != 1 || ci1->u_data[0] != code + len)
                break;
            len += 2;
        }
        if (len > 0) {
            te->len = len;
            te->type = RUN_TYPE_UL;
            te->data = 0;
            return;
        }

        /* run of simple mappings in one direction with consecutive
           targets; the direction is taken from the first code point */
        ci = &tab[code];
        is_lower = ci->l_len > 0;
        len = 1;
        while (code + len <= CHARCODE_MAX) {
            ci1 = &tab[code + len];
            if (is_complicated_case(ci1))
                break;
            if (is_lower) {
                if (ci1->l_len != 1 ||
                    ci1->l_data[0] != ci->l_data[0] + len)
                    break;
            } else {
                if (ci1->u_len != 1 ||
                    ci1->u_data[0] != ci->u_data[0] + len)
                    break;
            }
            len++;
        }
        te->len = len;
        if (is_lower) {
            te->type = RUN_TYPE_LF;
            te->data = ci->l_data[0];
        } else {
            te->type = RUN_TYPE_U;
            te->data = ci->u_data[0];
        }
    }
}
1177
+
1178
/* Case conversion runs produced by build_conv_table(); the fixed capacity
   is enforced there with an assert. */
TableEntry conv_table[1000];
int conv_table_len; /* number of valid entries in conv_table */
/* Pool of distinct values; find_ext_data_index() appends a value the first
   time it is requested and asserts that the capacity is not exceeded. */
int ext_data[1000];
int ext_data_len; /* number of valid entries in ext_data */
1182
+
1183
+ void dump_case_conv_table1(void)
1184
+ {
1185
+ int i, j;
1186
+ const TableEntry *te;
1187
+
1188
+ for(i = 0; i < conv_table_len; i++) {
1189
+ te = &conv_table[i];
1190
+ printf("%05x %02x %-10s %05x",
1191
+ te->code, te->len, run_type_str[te->type], te->data);
1192
+ for(j = 0; j < te->ext_len; j++) {
1193
+ printf(" %05x", te->ext_data[j]);
1194
+ }
1195
+ printf("\n");
1196
+ }
1197
+ printf("table_len=%d ext_len=%d\n", conv_table_len, ext_data_len);
1198
+ }
1199
+
1200
+ int find_data_index(const TableEntry *conv_table, int len, int data)
1201
+ {
1202
+ int i;
1203
+ const TableEntry *te;
1204
+ for(i = 0; i < len; i++) {
1205
+ te = &conv_table[i];
1206
+ if (te->code == data)
1207
+ return i;
1208
+ }
1209
+ return -1;
1210
+ }
1211
+
1212
+ int find_ext_data_index(int data)
1213
+ {
1214
+ int i;
1215
+ for(i = 0; i < ext_data_len; i++) {
1216
+ if (ext_data[i] == data)
1217
+ return i;
1218
+ }
1219
+ assert(ext_data_len < countof(ext_data));
1220
+ ext_data[ext_data_len++] = data;
1221
+ return ext_data_len - 1;
1222
+ }
1223
+
1224
+ void build_conv_table(CCInfo *tab)
1225
+ {
1226
+ int code, i, j;
1227
+ CCInfo *ci;
1228
+ TableEntry *te;
1229
+
1230
+ te = conv_table;
1231
+ for(code = 0; code <= CHARCODE_MAX; code++) {
1232
+ ci = &tab[code];
1233
+ if (ci->u_len == 0 && ci->l_len == 0 && ci->f_len == 0)
1234
+ continue;
1235
+ assert(te - conv_table < countof(conv_table));
1236
+ find_run_type(te, tab, code);
1237
+ #if 0
1238
+ if (te->type == RUN_TYPE_TODO) {
1239
+ printf("TODO: ");
1240
+ dump_cc_info(ci, code);
1241
+ }
1242
+ #endif
1243
+ assert(te->len <= 127);
1244
+ code += te->len - 1;
1245
+ te++;
1246
+ }
1247
+ conv_table_len = te - conv_table;
1248
+
1249
+ /* find the data index */
1250
+ for(i = 0; i < conv_table_len; i++) {
1251
+ int data_index;
1252
+ te = &conv_table[i];
1253
+
1254
+ switch(te->type) {
1255
+ case RUN_TYPE_U:
1256
+ case RUN_TYPE_L:
1257
+ case RUN_TYPE_UF:
1258
+ case RUN_TYPE_LF:
1259
+ data_index = find_data_index(conv_table, conv_table_len, te->data);
1260
+ if (data_index < 0) {
1261
+ switch(te->type) {
1262
+ case RUN_TYPE_U:
1263
+ te->type = RUN_TYPE_U_EXT;
1264
+ te->ext_len = 1;
1265
+ te->ext_data[0] = te->data;
1266
+ break;
1267
+ case RUN_TYPE_LF:
1268
+ te->type = RUN_TYPE_LF_EXT;
1269
+ te->ext_len = 1;
1270
+ te->ext_data[0] = te->data;
1271
+ break;
1272
+ default:
1273
+ printf("%05x: index not found\n", te->code);
1274
+ exit(1);
1275
+ }
1276
+ } else {
1277
+ te->data_index = data_index;
1278
+ }
1279
+ break;
1280
+ case RUN_TYPE_UF_D20:
1281
+ te->data_index = te->data;
1282
+ break;
1283
+ }
1284
+ }
1285
+
1286
+ /* find the data index for ext_data */
1287
+ for(i = 0; i < conv_table_len; i++) {
1288
+ te = &conv_table[i];
1289
+ if (te->type == RUN_TYPE_UF_EXT3) {
1290
+ int p, v;
1291
+ v = 0;
1292
+ for(j = 0; j < 3; j++) {
1293
+ p = find_ext_data_index(te->ext_data[j]);
1294
+ assert(p < 16);
1295
+ v = (v << 4) | p;
1296
+ }
1297
+ te->data_index = v;
1298
+ }
1299
+ }
1300
+
1301
+ for(i = 0; i < conv_table_len; i++) {
1302
+ te = &conv_table[i];
1303
+ if (te->type == RUN_TYPE_LF_EXT2 ||
1304
+ te->type == RUN_TYPE_UF_EXT2 ||
1305
+ te->type == RUN_TYPE_U2L_399_EXT2) {
1306
+ int p, v;
1307
+ v = 0;
1308
+ for(j = 0; j < 2; j++) {
1309
+ p = find_ext_data_index(te->ext_data[j]);
1310
+ assert(p < 64);
1311
+ v = (v << 6) | p;
1312
+ }
1313
+ te->data_index = v;
1314
+ }
1315
+ }
1316
+
1317
+ for(i = 0; i < conv_table_len; i++) {
1318
+ te = &conv_table[i];
1319
+ if (te->type == RUN_TYPE_UF_D1_EXT ||
1320
+ te->type == RUN_TYPE_U_EXT ||
1321
+ te->type == RUN_TYPE_LF_EXT) {
1322
+ te->data_index = find_ext_data_index(te->ext_data[0]);
1323
+ }
1324
+ }
1325
+ #ifdef DUMP_CASE_CONV_TABLE
1326
+ dump_case_conv_table1();
1327
+ #endif
1328
+ }
1329
+
1330
+ void dump_case_conv_table(FILE *f)
1331
+ {
1332
+ int i;
1333
+ uint32_t v;
1334
+ const TableEntry *te;
1335
+
1336
+ total_tables++;
1337
+ total_table_bytes += conv_table_len * sizeof(uint32_t);
1338
+ fprintf(f, "static const uint32_t case_conv_table1[%d] = {", conv_table_len);
1339
+ for(i = 0; i < conv_table_len; i++) {
1340
+ if (i % 4 == 0)
1341
+ fprintf(f, "\n ");
1342
+ te = &conv_table[i];
1343
+ v = te->code << (32 - 17);
1344
+ v |= te->len << (32 - 17 - 7);
1345
+ v |= te->type << (32 - 17 - 7 - 4);
1346
+ v |= te->data_index >> 8;
1347
+ fprintf(f, " 0x%08x,", v);
1348
+ }
1349
+ fprintf(f, "\n};\n\n");
1350
+
1351
+ total_tables++;
1352
+ total_table_bytes += conv_table_len;
1353
+ fprintf(f, "static const uint8_t case_conv_table2[%d] = {", conv_table_len);
1354
+ for(i = 0; i < conv_table_len; i++) {
1355
+ if (i % 8 == 0)
1356
+ fprintf(f, "\n ");
1357
+ te = &conv_table[i];
1358
+ fprintf(f, " 0x%02x,", te->data_index & 0xff);
1359
+ }
1360
+ fprintf(f, "\n};\n\n");
1361
+
1362
+ total_tables++;
1363
+ total_table_bytes += ext_data_len * sizeof(uint16_t);
1364
+ fprintf(f, "static const uint16_t case_conv_ext[%d] = {", ext_data_len);
1365
+ for(i = 0; i < ext_data_len; i++) {
1366
+ if (i % 8 == 0)
1367
+ fprintf(f, "\n ");
1368
+ fprintf(f, " 0x%04x,", ext_data[i]);
1369
+ }
1370
+ fprintf(f, "\n};\n\n");
1371
+ }
1372
+
1373
+
1374
+ static CCInfo *global_tab;
1375
+
1376
+ static int sp_cc_cmp(const void *p1, const void *p2)
1377
+ {
1378
+ CCInfo *c1 = &global_tab[*(const int *)p1];
1379
+ CCInfo *c2 = &global_tab[*(const int *)p2];
1380
+ if (c1->f_len < c2->f_len) {
1381
+ return -1;
1382
+ } else if (c2->f_len < c1->f_len) {
1383
+ return 1;
1384
+ } else {
1385
+ return memcmp(c1->f_data, c2->f_data, sizeof(c1->f_data[0]) * c1->f_len);
1386
+ }
1387
+ }
1388
+
1389
+ /* dump the case special cases (multi character results which are
1390
+ identical and need specific handling in lre_canonicalize() */
1391
+ void dump_case_folding_special_cases(CCInfo *tab)
1392
+ {
1393
+ int i, len, j;
1394
+ int *perm;
1395
+
1396
+ perm = malloc(sizeof(perm[0]) * (CHARCODE_MAX + 1));
1397
+ for(i = 0; i <= CHARCODE_MAX; i++)
1398
+ perm[i] = i;
1399
+ global_tab = tab;
1400
+ qsort(perm, CHARCODE_MAX + 1, sizeof(perm[0]), sp_cc_cmp);
1401
+ for(i = 0; i <= CHARCODE_MAX;) {
1402
+ if (tab[perm[i]].f_len <= 1) {
1403
+ i++;
1404
+ } else {
1405
+ len = 1;
1406
+ while ((i + len) <= CHARCODE_MAX && !sp_cc_cmp(&perm[i], &perm[i + len]))
1407
+ len++;
1408
+
1409
+ if (len > 1) {
1410
+ for(j = i; j < i + len; j++)
1411
+ dump_cc_info(&tab[perm[j]], perm[j]);
1412
+ }
1413
+ i += len;
1414
+ }
1415
+ }
1416
+ free(perm);
1417
+ global_tab = NULL;
1418
+ }
1419
+
1420
+
1421
/* Compare the first `n` ints of `tab1` and `tab2`.
   Returns 0 when they are all equal (also when n <= 0), -1 otherwise. */
int tabcmp(const int *tab1, const int *tab2, int n)
{
    const int *lhs = tab1;
    const int *rhs = tab2;
    int remaining = n;

    while (remaining > 0) {
        if (*lhs != *rhs)
            return -1;
        lhs++;
        rhs++;
        remaining--;
    }
    return 0;
}
1430
+
1431
/* Print "<str>=" followed by the `len` values of `buf` in hexadecimal
   (at least 5 digits each) and a newline, on stdout. */
void dump_str(const char *str, const int *buf, int len)
{
    int pos = 0;

    printf("%s=", str);
    while (pos < len) {
        printf(" %05x", buf[pos]);
        pos++;
    }
    printf("\n");
}
1439
+
1440
+ void compute_internal_props(void)
1441
+ {
1442
+ int i;
1443
+ BOOL has_ul;
1444
+
1445
+ for(i = 0; i <= CHARCODE_MAX; i++) {
1446
+ CCInfo *ci = &unicode_db[i];
1447
+ has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0);
1448
+ if (has_ul) {
1449
+ assert(get_prop(i, PROP_Cased));
1450
+ } else {
1451
+ set_prop(i, PROP_Cased1, get_prop(i, PROP_Cased));
1452
+ }
1453
+ set_prop(i, PROP_ID_Continue1,
1454
+ get_prop(i, PROP_ID_Continue) & (get_prop(i, PROP_ID_Start) ^ 1));
1455
+ set_prop(i, PROP_XID_Start1,
1456
+ get_prop(i, PROP_ID_Start) ^ get_prop(i, PROP_XID_Start));
1457
+ set_prop(i, PROP_XID_Continue1,
1458
+ get_prop(i, PROP_ID_Continue) ^ get_prop(i, PROP_XID_Continue));
1459
+ set_prop(i, PROP_Changes_When_Titlecased1,
1460
+ get_prop(i, PROP_Changes_When_Titlecased) ^ (ci->u_len != 0));
1461
+ set_prop(i, PROP_Changes_When_Casefolded1,
1462
+ get_prop(i, PROP_Changes_When_Casefolded) ^ (ci->f_len != 0));
1463
+ /* XXX: reduce table size (438 bytes) */
1464
+ set_prop(i, PROP_Changes_When_NFKC_Casefolded1,
1465
+ get_prop(i, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_len != 0));
1466
+ #if 0
1467
+ /* TEST */
1468
+ #define M(x) (1U << GCAT_ ## x)
1469
+ {
1470
+ int b;
1471
+ b = ((M(Mn) | M(Cf) | M(Lm) | M(Sk)) >>
1472
+ unicode_db[i].general_category) & 1;
1473
+ set_prop(i, PROP_Cased1,
1474
+ get_prop(i, PROP_Case_Ignorable) ^ b);
1475
+ }
1476
+ #undef M
1477
+ #endif
1478
+ }
1479
+ }
1480
+
1481
+ void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len)
1482
+ {
1483
+ int i;
1484
+
1485
+ total_tables++;
1486
+ total_table_bytes += len;
1487
+ fprintf(f, "static const uint8_t %s[%d] = {", cname, len);
1488
+ for(i = 0; i < len; i++) {
1489
+ if (i % 8 == 0)
1490
+ fprintf(f, "\n ");
1491
+ fprintf(f, " 0x%02x,", tab[i]);
1492
+ }
1493
+ fprintf(f, "\n};\n\n");
1494
+ }
1495
+
1496
+ void dump_index_table(FILE *f, const char *cname, const uint8_t *tab, int len)
1497
+ {
1498
+ int i, code, offset;
1499
+
1500
+ total_index++;
1501
+ total_index_bytes += len;
1502
+ fprintf(f, "static const uint8_t %s[%d] = {\n", cname, len);
1503
+ for(i = 0; i < len; i += 3) {
1504
+ code = tab[i] + (tab[i+1] << 8) + ((tab[i+2] & 0x1f) << 16);
1505
+ offset = ((i / 3) + 1) * 32 + (tab[i+2] >> 5);
1506
+ fprintf(f, " 0x%02x, 0x%02x, 0x%02x,", tab[i], tab[i+1], tab[i+2]);
1507
+ fprintf(f, " // %6.5X at %d%s\n", code, offset,
1508
+ i == len - 3 ? " (upper bound)" : "");
1509
+ }
1510
+ fprintf(f, "};\n\n");
1511
+ }
1512
+
1513
+ #define PROP_BLOCK_LEN 32
1514
+
1515
+ void build_prop_table(FILE *f, const char *name, int prop_index, BOOL add_index)
1516
+ {
1517
+ int i, j, n, v, offset, code;
1518
+ DynBuf dbuf_s, *dbuf = &dbuf_s;
1519
+ DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
1520
+ DynBuf dbuf2_s, *dbuf2 = &dbuf2_s;
1521
+ const uint32_t *buf;
1522
+ int buf_len, block_end_pos, bit;
1523
+ char cname[128];
1524
+
1525
+ dbuf_init(dbuf1);
1526
+
1527
+ for(i = 0; i <= CHARCODE_MAX;) {
1528
+ v = get_prop(i, prop_index);
1529
+ j = i + 1;
1530
+ while (j <= CHARCODE_MAX && get_prop(j, prop_index) == v) {
1531
+ j++;
1532
+ }
1533
+ n = j - i;
1534
+ if (j == (CHARCODE_MAX + 1) && v == 0)
1535
+ break; /* no need to encode last zero run */
1536
+ //printf("%05x: %d %d\n", i, n, v);
1537
+ dbuf_put_u32(dbuf1, n - 1);
1538
+ i += n;
1539
+ }
1540
+
1541
+ dbuf_init(dbuf);
1542
+ dbuf_init(dbuf2);
1543
+ buf = (uint32_t *)dbuf1->buf;
1544
+ buf_len = dbuf1->size / sizeof(buf[0]);
1545
+
1546
+ /* the first value is assumed to be 0 */
1547
+ assert(get_prop(0, prop_index) == 0);
1548
+
1549
+ block_end_pos = PROP_BLOCK_LEN;
1550
+ i = 0;
1551
+ code = 0;
1552
+ bit = 0;
1553
+ while (i < buf_len) {
1554
+ if (add_index && dbuf->size >= block_end_pos && bit == 0) {
1555
+ offset = (dbuf->size - block_end_pos);
1556
+ /* XXX: offset could be larger in case of runs of small
1557
+ lengths. Could add code to change the encoding to
1558
+ prevent it at the expense of one byte loss */
1559
+ assert(offset <= 7);
1560
+ v = code | (offset << 21);
1561
+ dbuf_putc(dbuf2, v);
1562
+ dbuf_putc(dbuf2, v >> 8);
1563
+ dbuf_putc(dbuf2, v >> 16);
1564
+ block_end_pos += PROP_BLOCK_LEN;
1565
+ }
1566
+
1567
+ /* Compressed byte encoding:
1568
+ 00..3F: 2 packed lengths: 3-bit + 3-bit
1569
+ 40..5F: 5-bits plus extra byte for length
1570
+ 60..7F: 5-bits plus 2 extra bytes for length
1571
+ 80..FF: 7-bit length
1572
+ lengths must be incremented to get character count
1573
+ Ranges alternate between false and true return value.
1574
+ */
1575
+ v = buf[i];
1576
+ code += v + 1;
1577
+ bit ^= 1;
1578
+ if (v < 8 && (i + 1) < buf_len && buf[i + 1] < 8) {
1579
+ code += buf[i + 1] + 1;
1580
+ bit ^= 1;
1581
+ dbuf_putc(dbuf, (v << 3) | buf[i + 1]);
1582
+ i += 2;
1583
+ } else if (v < 128) {
1584
+ dbuf_putc(dbuf, 0x80 + v);
1585
+ i++;
1586
+ } else if (v < (1 << 13)) {
1587
+ dbuf_putc(dbuf, 0x40 + (v >> 8));
1588
+ dbuf_putc(dbuf, v);
1589
+ i++;
1590
+ } else {
1591
+ assert(v < (1 << 21));
1592
+ dbuf_putc(dbuf, 0x60 + (v >> 16));
1593
+ dbuf_putc(dbuf, v >> 8);
1594
+ dbuf_putc(dbuf, v);
1595
+ i++;
1596
+ }
1597
+ }
1598
+
1599
+ if (add_index) {
1600
+ /* last index entry */
1601
+ v = code;
1602
+ dbuf_putc(dbuf2, v);
1603
+ dbuf_putc(dbuf2, v >> 8);
1604
+ dbuf_putc(dbuf2, v >> 16);
1605
+ }
1606
+
1607
+ #ifdef DUMP_TABLE_SIZE
1608
+ printf("prop %s: length=%d bytes\n", unicode_prop_name[prop_index],
1609
+ (int)(dbuf->size + dbuf2->size));
1610
+ #endif
1611
+ snprintf(cname, sizeof(cname), "unicode_prop_%s_table", unicode_prop_name[prop_index]);
1612
+ dump_byte_table(f, cname, dbuf->buf, dbuf->size);
1613
+ if (add_index) {
1614
+ snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]);
1615
+ dump_index_table(f, cname, dbuf2->buf, dbuf2->size);
1616
+ }
1617
+
1618
+ dbuf_free(dbuf);
1619
+ dbuf_free(dbuf1);
1620
+ dbuf_free(dbuf2);
1621
+ }
1622
+
1623
+ void build_flags_tables(FILE *f)
1624
+ {
1625
+ build_prop_table(f, "Cased1", PROP_Cased1, TRUE);
1626
+ build_prop_table(f, "Case_Ignorable", PROP_Case_Ignorable, TRUE);
1627
+ build_prop_table(f, "ID_Start", PROP_ID_Start, TRUE);
1628
+ build_prop_table(f, "ID_Continue1", PROP_ID_Continue1, TRUE);
1629
+ }
1630
+
1631
/* Write a name table to `f` as "static const char cname[]": one string
   literal per entry, each followed by an explicit "\0" literal so the
   array is a sequence of NUL terminated strings ended by an empty one.
   An entry is "name" or "name,short" when the short name is not empty.
   The "\0" literals are aligned on the widest entry. */
void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
                     const char **tab_short_name)
{
    int idx, width, widest = 0;

    /* first pass: width of the widest "name[,short]" entry */
    for (idx = 0; idx < len; idx++) {
        const char *alias = tab_short_name[idx];

        width = strlen(tab_name[idx]);
        if (*alias != '\0')
            width += 1 + strlen(alias);
        if (width > widest)
            widest = width;
    }

    /* generate a sequence of strings terminated by an empty string */
    fprintf(f, "static const char %s[] =\n", cname);
    for (idx = 0; idx < len; idx++) {
        const char *alias = tab_short_name[idx];

        fprintf(f, " \"");
        width = fprintf(f, "%s", tab_name[idx]);
        if (*alias != '\0')
            width += fprintf(f, ",%s", alias);
        /* pad with spaces so that the "\0" literals line up */
        fprintf(f, "\"%*s\"\\0\"\n", 1 + widest - width, "");
    }
    fprintf(f, ";\n\n");
}
1658
+
1659
+ void build_general_category_table(FILE *f)
1660
+ {
1661
+ int i, v, j, n, n1;
1662
+ DynBuf dbuf_s, *dbuf = &dbuf_s;
1663
+ #ifdef DUMP_TABLE_SIZE
1664
+ int cw_count, cw_len_count[4], cw_start;
1665
+ #endif
1666
+
1667
+ fprintf(f, "typedef enum {\n");
1668
+ for(i = 0; i < GCAT_COUNT; i++)
1669
+ fprintf(f, " UNICODE_GC_%s,\n", unicode_gc_name[i]);
1670
+ fprintf(f, " UNICODE_GC_COUNT,\n");
1671
+ fprintf(f, "} UnicodeGCEnum;\n\n");
1672
+
1673
+ dump_name_table(f, "unicode_gc_name_table",
1674
+ unicode_gc_name, GCAT_COUNT,
1675
+ unicode_gc_short_name);
1676
+
1677
+
1678
+ dbuf_init(dbuf);
1679
+ #ifdef DUMP_TABLE_SIZE
1680
+ cw_count = 0;
1681
+ for(i = 0; i < 4; i++)
1682
+ cw_len_count[i] = 0;
1683
+ #endif
1684
+ for(i = 0; i <= CHARCODE_MAX;) {
1685
+ v = unicode_db[i].general_category;
1686
+ j = i + 1;
1687
+ while (j <= CHARCODE_MAX && unicode_db[j].general_category == v)
1688
+ j++;
1689
+ n = j - i;
1690
+ /* compress Lu/Ll runs */
1691
+ if (v == GCAT_Lu) {
1692
+ n1 = 1;
1693
+ while ((i + n1) <= CHARCODE_MAX && unicode_db[i + n1].general_category == (v + (n1 & 1))) {
1694
+ n1++;
1695
+ }
1696
+ if (n1 > n) {
1697
+ v = 31;
1698
+ n = n1;
1699
+ }
1700
+ }
1701
+ // printf("%05x %05x %d\n", i, n, v);
1702
+ n--;
1703
+ #ifdef DUMP_TABLE_SIZE
1704
+ cw_count++;
1705
+ cw_start = dbuf->size;
1706
+ #endif
1707
+ if (n < 7) {
1708
+ dbuf_putc(dbuf, (n << 5) | v);
1709
+ } else if (n < 7 + 128) {
1710
+ n1 = n - 7;
1711
+ assert(n1 < 128);
1712
+ dbuf_putc(dbuf, (0xf << 5) | v);
1713
+ dbuf_putc(dbuf, n1);
1714
+ } else if (n < 7 + 128 + (1 << 14)) {
1715
+ n1 = n - (7 + 128);
1716
+ assert(n1 < (1 << 14));
1717
+ dbuf_putc(dbuf, (0xf << 5) | v);
1718
+ dbuf_putc(dbuf, (n1 >> 8) + 128);
1719
+ dbuf_putc(dbuf, n1);
1720
+ } else {
1721
+ n1 = n - (7 + 128 + (1 << 14));
1722
+ assert(n1 < (1 << 22));
1723
+ dbuf_putc(dbuf, (0xf << 5) | v);
1724
+ dbuf_putc(dbuf, (n1 >> 16) + 128 + 64);
1725
+ dbuf_putc(dbuf, n1 >> 8);
1726
+ dbuf_putc(dbuf, n1);
1727
+ }
1728
+ #ifdef DUMP_TABLE_SIZE
1729
+ cw_len_count[dbuf->size - cw_start - 1]++;
1730
+ #endif
1731
+ i += n + 1;
1732
+ }
1733
+ #ifdef DUMP_TABLE_SIZE
1734
+ printf("general category: %d entries [", cw_count);
1735
+ for(i = 0; i < 4; i++)
1736
+ printf(" %d", cw_len_count[i]);
1737
+ printf(" ], length=%d bytes\n", (int)dbuf->size);
1738
+ #endif
1739
+
1740
+ dump_byte_table(f, "unicode_gc_table", dbuf->buf, dbuf->size);
1741
+
1742
+ dbuf_free(dbuf);
1743
+ }
1744
+
1745
+ void build_script_table(FILE *f)
1746
+ {
1747
+ int i, v, j, n, n1, type;
1748
+ DynBuf dbuf_s, *dbuf = &dbuf_s;
1749
+ #ifdef DUMP_TABLE_SIZE
1750
+ int cw_count, cw_len_count[4], cw_start;
1751
+ #endif
1752
+
1753
+ fprintf(f, "typedef enum {\n");
1754
+ for(i = 0; i < SCRIPT_COUNT; i++)
1755
+ fprintf(f, " UNICODE_SCRIPT_%s,\n", unicode_script_name[i]);
1756
+ fprintf(f, " UNICODE_SCRIPT_COUNT,\n");
1757
+ fprintf(f, "} UnicodeScriptEnum;\n\n");
1758
+
1759
+ i = 1;
1760
+ dump_name_table(f, "unicode_script_name_table",
1761
+ unicode_script_name + i, SCRIPT_COUNT - i,
1762
+ unicode_script_short_name + i);
1763
+
1764
+ dbuf_init(dbuf);
1765
+ #ifdef DUMP_TABLE_SIZE
1766
+ cw_count = 0;
1767
+ for(i = 0; i < 4; i++)
1768
+ cw_len_count[i] = 0;
1769
+ #endif
1770
+ for(i = 0; i <= CHARCODE_MAX;) {
1771
+ v = unicode_db[i].script;
1772
+ j = i + 1;
1773
+ while (j <= CHARCODE_MAX && unicode_db[j].script == v)
1774
+ j++;
1775
+ n = j - i;
1776
+ if (v == 0 && j == (CHARCODE_MAX + 1))
1777
+ break;
1778
+ // printf("%05x %05x %d\n", i, n, v);
1779
+ n--;
1780
+ #ifdef DUMP_TABLE_SIZE
1781
+ cw_count++;
1782
+ cw_start = dbuf->size;
1783
+ #endif
1784
+ if (v == 0)
1785
+ type = 0;
1786
+ else
1787
+ type = 1;
1788
+ if (n < 96) {
1789
+ dbuf_putc(dbuf, n | (type << 7));
1790
+ } else if (n < 96 + (1 << 12)) {
1791
+ n1 = n - 96;
1792
+ assert(n1 < (1 << 12));
1793
+ dbuf_putc(dbuf, ((n1 >> 8) + 96) | (type << 7));
1794
+ dbuf_putc(dbuf, n1);
1795
+ } else {
1796
+ n1 = n - (96 + (1 << 12));
1797
+ assert(n1 < (1 << 20));
1798
+ dbuf_putc(dbuf, ((n1 >> 16) + 112) | (type << 7));
1799
+ dbuf_putc(dbuf, n1 >> 8);
1800
+ dbuf_putc(dbuf, n1);
1801
+ }
1802
+ if (type != 0)
1803
+ dbuf_putc(dbuf, v);
1804
+
1805
+ #ifdef DUMP_TABLE_SIZE
1806
+ cw_len_count[dbuf->size - cw_start - 1]++;
1807
+ #endif
1808
+ i += n + 1;
1809
+ }
1810
+ #ifdef DUMP_TABLE_SIZE
1811
+ printf("script: %d entries [", cw_count);
1812
+ for(i = 0; i < 4; i++)
1813
+ printf(" %d", cw_len_count[i]);
1814
+ printf(" ], length=%d bytes\n", (int)dbuf->size);
1815
+ #endif
1816
+
1817
+ dump_byte_table(f, "unicode_script_table", dbuf->buf, dbuf->size);
1818
+
1819
+ dbuf_free(dbuf);
1820
+ }
1821
+
1822
+ void build_script_ext_table(FILE *f)
1823
+ {
1824
+ int i, j, n, n1, script_ext_len;
1825
+ DynBuf dbuf_s, *dbuf = &dbuf_s;
1826
+ #if defined(DUMP_TABLE_SIZE)
1827
+ int cw_count = 0;
1828
+ #endif
1829
+
1830
+ dbuf_init(dbuf);
1831
+ for(i = 0; i <= CHARCODE_MAX;) {
1832
+ script_ext_len = unicode_db[i].script_ext_len;
1833
+ j = i + 1;
1834
+ while (j <= CHARCODE_MAX &&
1835
+ unicode_db[j].script_ext_len == script_ext_len &&
1836
+ !memcmp(unicode_db[j].script_ext, unicode_db[i].script_ext,
1837
+ script_ext_len)) {
1838
+ j++;
1839
+ }
1840
+ n = j - i;
1841
+ #if defined(DUMP_TABLE_SIZE)
1842
+ cw_count++;
1843
+ #endif
1844
+ n--;
1845
+ if (n < 128) {
1846
+ dbuf_putc(dbuf, n);
1847
+ } else if (n < 128 + (1 << 14)) {
1848
+ n1 = n - 128;
1849
+ assert(n1 < (1 << 14));
1850
+ dbuf_putc(dbuf, (n1 >> 8) + 128);
1851
+ dbuf_putc(dbuf, n1);
1852
+ } else {
1853
+ n1 = n - (128 + (1 << 14));
1854
+ assert(n1 < (1 << 22));
1855
+ dbuf_putc(dbuf, (n1 >> 16) + 128 + 64);
1856
+ dbuf_putc(dbuf, n1 >> 8);
1857
+ dbuf_putc(dbuf, n1);
1858
+ }
1859
+ dbuf_putc(dbuf, script_ext_len);
1860
+ for(j = 0; j < script_ext_len; j++)
1861
+ dbuf_putc(dbuf, unicode_db[i].script_ext[j]);
1862
+ i += n + 1;
1863
+ }
1864
+ #ifdef DUMP_TABLE_SIZE
1865
+ printf("script_ext: %d entries", cw_count);
1866
+ printf(", length=%d bytes\n", (int)dbuf->size);
1867
+ #endif
1868
+
1869
+ dump_byte_table(f, "unicode_script_ext_table", dbuf->buf, dbuf->size);
1870
+
1871
+ dbuf_free(dbuf);
1872
+ }
1873
+
1874
+ /* the following properties are synthetized so no table is necessary */
1875
+ #define PROP_TABLE_COUNT PROP_ASCII
1876
+
1877
+ void build_prop_list_table(FILE *f)
1878
+ {
1879
+ int i;
1880
+
1881
+ for(i = 0; i < PROP_TABLE_COUNT; i++) {
1882
+ if (i == PROP_ID_Start ||
1883
+ i == PROP_Case_Ignorable ||
1884
+ i == PROP_ID_Continue1) {
1885
+ /* already generated */
1886
+ } else {
1887
+ build_prop_table(f, unicode_prop_name[i], i, FALSE);
1888
+ }
1889
+ }
1890
+
1891
+ fprintf(f, "typedef enum {\n");
1892
+ for(i = 0; i < PROP_COUNT; i++)
1893
+ fprintf(f, " UNICODE_PROP_%s,\n", unicode_prop_name[i]);
1894
+ fprintf(f, " UNICODE_PROP_COUNT,\n");
1895
+ fprintf(f, "} UnicodePropertyEnum;\n\n");
1896
+
1897
+ i = PROP_ASCII_Hex_Digit;
1898
+ dump_name_table(f, "unicode_prop_name_table",
1899
+ unicode_prop_name + i, PROP_XID_Start - i + 1,
1900
+ unicode_prop_short_name + i);
1901
+
1902
+ fprintf(f, "static const uint8_t * const unicode_prop_table[] = {\n");
1903
+ for(i = 0; i < PROP_TABLE_COUNT; i++) {
1904
+ fprintf(f, " unicode_prop_%s_table,\n", unicode_prop_name[i]);
1905
+ }
1906
+ fprintf(f, "};\n\n");
1907
+
1908
+ fprintf(f, "static const uint16_t unicode_prop_len_table[] = {\n");
1909
+ for(i = 0; i < PROP_TABLE_COUNT; i++) {
1910
+ fprintf(f, " countof(unicode_prop_%s_table),\n", unicode_prop_name[i]);
1911
+ }
1912
+ fprintf(f, "};\n\n");
1913
+ }
1914
+
1915
+ #ifdef USE_TEST
1916
/* Thin wrapper around lre_case_conv(): convert `c` with `conv_type`,
   store the result in `res` and return what lre_case_conv() returns. */
int check_conv(uint32_t *res, uint32_t c, int conv_type)
{
    int res_len = lre_case_conv(res, c, conv_type);

    return res_len;
}
1920
+
1921
+ void check_case_conv(void)
1922
+ {
1923
+ CCInfo *tab = unicode_db;
1924
+ uint32_t res[3];
1925
+ int l, error;
1926
+ CCInfo ci_s, *ci1, *ci = &ci_s;
1927
+ int code;
1928
+
1929
+ for(code = 0; code <= CHARCODE_MAX; code++) {
1930
+ ci1 = &tab[code];
1931
+ *ci = *ci1;
1932
+ if (ci->l_len == 0) {
1933
+ ci->l_len = 1;
1934
+ ci->l_data[0] = code;
1935
+ }
1936
+ if (ci->u_len == 0) {
1937
+ ci->u_len = 1;
1938
+ ci->u_data[0] = code;
1939
+ }
1940
+ if (ci->f_len == 0) {
1941
+ ci->f_len = 1;
1942
+ ci->f_data[0] = code;
1943
+ }
1944
+
1945
+ error = 0;
1946
+ l = check_conv(res, code, 0);
1947
+ if (l != ci->u_len || tabcmp((int *)res, ci->u_data, l)) {
1948
+ printf("ERROR: L\n");
1949
+ error++;
1950
+ }
1951
+ l = check_conv(res, code, 1);
1952
+ if (l != ci->l_len || tabcmp((int *)res, ci->l_data, l)) {
1953
+ printf("ERROR: U\n");
1954
+ error++;
1955
+ }
1956
+ l = check_conv(res, code, 2);
1957
+ if (l != ci->f_len || tabcmp((int *)res, ci->f_data, l)) {
1958
+ printf("ERROR: F\n");
1959
+ error++;
1960
+ }
1961
+ if (error) {
1962
+ dump_cc_info(ci, code);
1963
+ exit(1);
1964
+ }
1965
+ }
1966
+ }
1967
+
1968
+ #ifdef PROFILE
1969
+ static int64_t get_time_ns(void)
1970
+ {
1971
+ struct timespec ts;
1972
+ clock_gettime(CLOCK_MONOTONIC, &ts);
1973
+ return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
1974
+ }
1975
+ #endif
1976
+
1977
+
1978
+ void check_flags(void)
1979
+ {
1980
+ int c;
1981
+ BOOL flag_ref, flag;
1982
+ for(c = 0; c <= CHARCODE_MAX; c++) {
1983
+ flag_ref = get_prop(c, PROP_Cased);
1984
+ flag = !!lre_is_cased(c);
1985
+ if (flag != flag_ref) {
1986
+ printf("ERROR: c=%05x cased=%d ref=%d\n",
1987
+ c, flag, flag_ref);
1988
+ exit(1);
1989
+ }
1990
+
1991
+ flag_ref = get_prop(c, PROP_Case_Ignorable);
1992
+ flag = !!lre_is_case_ignorable(c);
1993
+ if (flag != flag_ref) {
1994
+ printf("ERROR: c=%05x case_ignorable=%d ref=%d\n",
1995
+ c, flag, flag_ref);
1996
+ exit(1);
1997
+ }
1998
+
1999
+ flag_ref = get_prop(c, PROP_ID_Start);
2000
+ flag = !!lre_is_id_start(c);
2001
+ if (flag != flag_ref) {
2002
+ printf("ERROR: c=%05x id_start=%d ref=%d\n",
2003
+ c, flag, flag_ref);
2004
+ exit(1);
2005
+ }
2006
+
2007
+ flag_ref = get_prop(c, PROP_ID_Continue);
2008
+ flag = !!lre_is_id_continue(c);
2009
+ if (flag != flag_ref) {
2010
+ printf("ERROR: c=%05x id_cont=%d ref=%d\n",
2011
+ c, flag, flag_ref);
2012
+ exit(1);
2013
+ }
2014
+ }
2015
+ #ifdef PROFILE
2016
+ {
2017
+ int64_t ti, count;
2018
+ ti = get_time_ns();
2019
+ count = 0;
2020
+ for(c = 0x20; c <= 0xffff; c++) {
2021
+ flag_ref = get_prop(c, PROP_ID_Start);
2022
+ flag = !!lre_is_id_start(c);
2023
+ assert(flag == flag_ref);
2024
+ count++;
2025
+ }
2026
+ ti = get_time_ns() - ti;
2027
+ printf("flags time=%0.1f ns/char\n",
2028
+ (double)ti / count);
2029
+ }
2030
+ #endif
2031
+ }
2032
+
2033
+ #endif
2034
+
2035
+ #define CC_BLOCK_LEN 32
2036
+
2037
+ void build_cc_table(FILE *f)
2038
+ {
2039
+ // Compress combining class table
2040
+ // see: https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
2041
+ int i, cc, n, type, n1, block_end_pos;
2042
+ DynBuf dbuf_s, *dbuf = &dbuf_s;
2043
+ DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
2044
+ #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
2045
+ int cw_len_tab[3], cw_start, cc_table_len;
2046
+ #endif
2047
+ uint32_t v;
2048
+
2049
+ dbuf_init(dbuf);
2050
+ dbuf_init(dbuf1);
2051
+ #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
2052
+ cc_table_len = 0;
2053
+ for(i = 0; i < countof(cw_len_tab); i++)
2054
+ cw_len_tab[i] = 0;
2055
+ #endif
2056
+ block_end_pos = CC_BLOCK_LEN;
2057
+ for(i = 0; i <= CHARCODE_MAX;) {
2058
+ cc = unicode_db[i].combining_class;
2059
+ assert(cc <= 255);
2060
+ /* check increasing values */
2061
+ n = 1;
2062
+ while ((i + n) <= CHARCODE_MAX &&
2063
+ unicode_db[i + n].combining_class == (cc + n))
2064
+ n++;
2065
+ if (n >= 2) {
2066
+ type = 1;
2067
+ } else {
2068
+ type = 0;
2069
+ n = 1;
2070
+ while ((i + n) <= CHARCODE_MAX &&
2071
+ unicode_db[i + n].combining_class == cc)
2072
+ n++;
2073
+ }
2074
+ /* no need to encode the last run */
2075
+ if (cc == 0 && (i + n - 1) == CHARCODE_MAX)
2076
+ break;
2077
+ #ifdef DUMP_CC_TABLE
2078
+ printf("%05x %6d %d %d\n", i, n, type, cc);
2079
+ #endif
2080
+ if (type == 0) {
2081
+ if (cc == 0)
2082
+ type = 2;
2083
+ else if (cc == 230)
2084
+ type = 3;
2085
+ }
2086
+ n1 = n - 1;
2087
+
2088
+ /* add an entry to the index if necessary */
2089
+ if (dbuf->size >= block_end_pos) {
2090
+ v = i | ((dbuf->size - block_end_pos) << 21);
2091
+ dbuf_putc(dbuf1, v);
2092
+ dbuf_putc(dbuf1, v >> 8);
2093
+ dbuf_putc(dbuf1, v >> 16);
2094
+ block_end_pos += CC_BLOCK_LEN;
2095
+ }
2096
+ #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
2097
+ cw_start = dbuf->size;
2098
+ #endif
2099
+ /* Compressed run length encoding:
2100
+ - 2 high order bits are combining class type
2101
+ - 0:0, 1:230, 2:extra byte linear progression, 3:extra byte
2102
+ - 00..2F: range length (add 1)
2103
+ - 30..37: 3-bit range-length + 1 extra byte
2104
+ - 38..3F: 3-bit range-length + 2 extra byte
2105
+ */
2106
+ if (n1 < 48) {
2107
+ dbuf_putc(dbuf, n1 | (type << 6));
2108
+ } else if (n1 < 48 + (1 << 11)) {
2109
+ n1 -= 48;
2110
+ dbuf_putc(dbuf, ((n1 >> 8) + 48) | (type << 6));
2111
+ dbuf_putc(dbuf, n1);
2112
+ } else {
2113
+ n1 -= 48 + (1 << 11);
2114
+ assert(n1 < (1 << 20));
2115
+ dbuf_putc(dbuf, ((n1 >> 16) + 56) | (type << 6));
2116
+ dbuf_putc(dbuf, n1 >> 8);
2117
+ dbuf_putc(dbuf, n1);
2118
+ }
2119
+ #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
2120
+ cw_len_tab[dbuf->size - cw_start - 1]++;
2121
+ cc_table_len++;
2122
+ #endif
2123
+ if (type == 0 || type == 1)
2124
+ dbuf_putc(dbuf, cc);
2125
+ i += n;
2126
+ }
2127
+
2128
+ /* last index entry */
2129
+ v = i;
2130
+ dbuf_putc(dbuf1, v);
2131
+ dbuf_putc(dbuf1, v >> 8);
2132
+ dbuf_putc(dbuf1, v >> 16);
2133
+
2134
+ dump_byte_table(f, "unicode_cc_table", dbuf->buf, dbuf->size);
2135
+ dump_index_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size);
2136
+
2137
+ #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
2138
+ printf("CC table: size=%d (%d entries) [",
2139
+ (int)(dbuf->size + dbuf1->size),
2140
+ cc_table_len);
2141
+ for(i = 0; i < countof(cw_len_tab); i++)
2142
+ printf(" %d", cw_len_tab[i]);
2143
+ printf(" ]\n");
2144
+ #endif
2145
+ dbuf_free(dbuf);
2146
+ dbuf_free(dbuf1);
2147
+ }
2148
+
2149
+ /* maximum length of decomposition: 18 chars (1), then 8 */
2150
+ #ifndef USE_TEST
2151
/* Encodings available for a run of entries in the decomposition table.
   The constants are consecutive starting at 0 and are listed in the same
   order as the names in decomp_type_str[]. get_decomp_run_size() relies
   on this order: it selects the size formula with "type <= last constant
   of the group" comparisons. */
+ typedef enum {
2152
+ DECOMP_TYPE_C1, /* 16 bit char */
2153
+ DECOMP_TYPE_L1, /* 16 bit char table */
2154
+ DECOMP_TYPE_L2,
2155
+ DECOMP_TYPE_L3,
2156
+ DECOMP_TYPE_L4,
2157
+ DECOMP_TYPE_L5, /* XXX: not used */
2158
+ DECOMP_TYPE_L6, /* XXX: could remove */
2159
+ DECOMP_TYPE_L7, /* XXX: could remove */
2160
+ DECOMP_TYPE_LL1, /* 18 bit char table */
2161
+ DECOMP_TYPE_LL2,
2162
+ DECOMP_TYPE_S1, /* 8 bit char table */
2163
+ DECOMP_TYPE_S2,
2164
+ DECOMP_TYPE_S3,
2165
+ DECOMP_TYPE_S4,
2166
+ DECOMP_TYPE_S5,
2167
+ DECOMP_TYPE_I1, /* increment 16 bit char value */
2168
+ DECOMP_TYPE_I2_0,
2169
+ DECOMP_TYPE_I2_1,
2170
+ DECOMP_TYPE_I3_1,
2171
+ DECOMP_TYPE_I3_2,
2172
+ DECOMP_TYPE_I4_1,
2173
+ DECOMP_TYPE_I4_2,
2174
+ DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
2175
+ DECOMP_TYPE_B2,
2176
+ DECOMP_TYPE_B3,
2177
+ DECOMP_TYPE_B4,
2178
+ DECOMP_TYPE_B5,
2179
+ DECOMP_TYPE_B6,
2180
+ DECOMP_TYPE_B7,
2181
+ DECOMP_TYPE_B8,
2182
+ DECOMP_TYPE_B18,
2183
+ DECOMP_TYPE_LS2,
2184
+ DECOMP_TYPE_PAT3,
2185
+ DECOMP_TYPE_S2_UL,
2186
+ DECOMP_TYPE_LS2_UL,
2187
+ } DecompTypeEnum;
2188
+ #endif
2189
+
2190
+ const char *decomp_type_str[] = {
2191
+ "C1",
2192
+ "L1",
2193
+ "L2",
2194
+ "L3",
2195
+ "L4",
2196
+ "L5",
2197
+ "L6",
2198
+ "L7",
2199
+ "LL1",
2200
+ "LL2",
2201
+ "S1",
2202
+ "S2",
2203
+ "S3",
2204
+ "S4",
2205
+ "S5",
2206
+ "I1",
2207
+ "I2_0",
2208
+ "I2_1",
2209
+ "I3_1",
2210
+ "I3_2",
2211
+ "I4_1",
2212
+ "I4_2",
2213
+ "B1",
2214
+ "B2",
2215
+ "B3",
2216
+ "B4",
2217
+ "B5",
2218
+ "B6",
2219
+ "B7",
2220
+ "B8",
2221
+ "B18",
2222
+ "LS2",
2223
+ "PAT3",
2224
+ "S2_UL",
2225
+ "LS2_UL",
2226
+ };
2227
+
2228
/* One row per DECOMP_TYPE_I* family: the first value is a DECOMP_TYPE_I*
   constant, the following small values end with -1. Rows with fewer than
   4 initializers are zero-padded by the compiler.
   NOTE(review): the code reading this table is outside this section; the
   listed values are presumably positions inside the decomposition and
   the row index presumably relates to the decomposition length - confirm
   against the user of the table. */
+ const int decomp_incr_tab[4][4] = {
2229
+ { DECOMP_TYPE_I1, 0, -1 },
2230
+ { DECOMP_TYPE_I2_0, 0, 1, -1 },
2231
+ { DECOMP_TYPE_I3_1, 1, 2, -1 },
2232
+ { DECOMP_TYPE_I4_1, 1, 2, -1 },
2233
+ };
2234
+
2235
+ /*
2236
+ entry size:
2237
+ type bits
2238
+ code 18
2239
+ len 7
2240
+ compat 1
2241
+ type 5
2242
+ index 16
2243
+ total 47
2244
+ */
2245
+
2246
/* Description of one run of the decomposition table, as selected by
   find_decomp_run() for the run starting at a given code point. */
+ typedef struct {
/* first code point of the run */
2247
+ int code;
/* number of code points covered by the run */
2248
+ uint8_t len;
/* encoding of the run (a DECOMP_TYPE_xxx value) */
2249
+ uint8_t type;
/* decomposition length of the entries of the run */
2250
+ uint8_t c_len;
/* NOTE(review): c_min and data_index are not written in the visible part
   of the file; presumably a minimum character value used by the offset
   based encodings and the position of the run data - confirm. */
2251
+ uint16_t c_min;
2252
+ uint16_t data_index;
2253
+ int cost; /* size in bytes from this entry to the end */
2254
+ } DecompEntry;
2255
+
2256
+ int get_decomp_run_size(const DecompEntry *de)
2257
+ {
2258
+ int s;
2259
+ s = 6;
2260
+ if (de->type <= DECOMP_TYPE_C1) {
2261
+ /* nothing more */
2262
+ } else if (de->type <= DECOMP_TYPE_L7) {
2263
+ s += de->len * de->c_len * 2;
2264
+ } else if (de->type <= DECOMP_TYPE_LL2) {
2265
+ /* 18 bits per char */
2266
+ s += (de->len * de->c_len * 18 + 7) / 8;
2267
+ } else if (de->type <= DECOMP_TYPE_S5) {
2268
+ s += de->len * de->c_len;
2269
+ } else if (de->type <= DECOMP_TYPE_I4_2) {
2270
+ s += de->c_len * 2;
2271
+ } else if (de->type <= DECOMP_TYPE_B18) {
2272
+ s += 2 + de->len * de->c_len;
2273
+ } else if (de->type <= DECOMP_TYPE_LS2) {
2274
+ s += de->len * 3;
2275
+ } else if (de->type <= DECOMP_TYPE_PAT3) {
2276
+ s += 4 + de->len * 2;
2277
+ } else if (de->type <= DECOMP_TYPE_S2_UL) {
2278
+ s += de->len;
2279
+ } else if (de->type <= DECOMP_TYPE_LS2_UL) {
2280
+ s += (de->len / 2) * 3;
2281
+ } else {
2282
+ abort();
2283
+ }
2284
+ return s;
2285
+ }
2286
+
2287
/* extra characters which have a short (8 bit) code */
static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };

/* Map `c` to its short code:
     c < 0x80        -> c itself
     0x300..0x34f    -> 0x80..0xcf
     unicode_short_table[i] -> 0xd0 + i
   return -1 if not found */
int get_short_code(int c)
{
    const size_t extra_count =
        sizeof(unicode_short_table) / sizeof(unicode_short_table[0]);
    size_t k;

    if (c < 0x80)
        return c;
    if (c >= 0x300 && c < 0x350)
        return c - 0x300 + 0x80;
    for (k = 0; k < extra_count; k++) {
        if (c == unicode_short_table[k])
            return (int)k + 0x80 + 0x50;
    }
    return -1;
}
2305
+
2306
+ static BOOL is_short(int code)
2307
+ {
2308
+ return get_short_code(code) >= 0;
2309
+ }
2310
+
2311
+ static BOOL is_short_tab(const int *tab, int len)
2312
+ {
2313
+ int i;
2314
+ for(i = 0; i < len; i++) {
2315
+ if (!is_short(tab[i]))
2316
+ return FALSE;
2317
+ }
2318
+ return TRUE;
2319
+ }
2320
+
2321
+ static BOOL is_16bit(const int *tab, int len)
2322
+ {
2323
+ int i;
2324
+ for(i = 0; i < len; i++) {
2325
+ if (tab[i] > 0xffff)
2326
+ return FALSE;
2327
+ }
2328
+ return TRUE;
2329
+ }
2330
+
2331
/* Simplified lower case mapping: add 0x20 for code points below 0x100
   (Latin1) and for 0x410..0x42f (Cyrillic), add 1 for everything else. */
static uint32_t to_lower_simple(uint32_t c)
{
    const int wide_gap = (c < 0x100) || (c >= 0x410 && c <= 0x42f);

    return c + (wide_gap ? 0x20 : 1);
}
2340
+
2341
+ /* select best encoding with dynamic programming.
     For code point 'i', try every run encoding that can start at 'i' and keep
     in tab_de[i] the candidate with the smallest cost. The cost of a candidate
     covering n code points is get_decomp_run_size() of the run plus
     tab_de[i + n].cost, so the entries after 'i' must already be filled
     (build_decompose_table() calls this from CHARCODE_MAX down to 0).
     If 'i' has no decomposition, only the cost of i + 1 is propagated. */
2342
+ void find_decomp_run(DecompEntry *tab_de, int i)
2343
+ {
2344
+ DecompEntry de_s, *de = &de_s;
2345
+ CCInfo *ci, *ci1, *ci2;
2346
+ int l, j, n, len_max;
2347
+
2348
+ ci = &unicode_db[i];
2349
+ l = ci->decomp_len;
2350
+ if (l == 0) { /* no decomposition: no run starts here */
2351
+ tab_de[i].cost = tab_de[i + 1].cost;
2352
+ return;
2353
+ }
2354
+
2355
+ /* the offset for the compose table has only 6 bits, so we must
2356
+ limit if it can be used by the compose table */
2357
+ if (!ci->is_compat && !ci->is_excluded && l == 2)
2358
+ len_max = 64;
2359
+ else
2360
+ len_max = 127;
2361
+
2362
+ tab_de[i].cost = 0x7fffffff; /* large sentinel: any cheaper candidate replaces it */
2363
+
2364
+ if (!is_16bit(ci->decomp_data, l)) { /* a char above 0xffff: only the LL types are tried */
2365
+ assert(l <= 2);
2366
+
2367
+ n = 1;
2368
+ for(;;) {
2369
+ de->code = i;
2370
+ de->len = n;
2371
+ de->type = DECOMP_TYPE_LL1 + l - 1; /* LL type selected by the decomposition length */
2372
+ de->c_len = l;
2373
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2374
+ if (de->cost < tab_de[i].cost) {
2375
+ tab_de[i] = *de;
2376
+ }
2377
+ if (!((i + n) <= CHARCODE_MAX && n < len_max)) /* end of the database or maximum run length */
2378
+ break;
2379
+ ci1 = &unicode_db[i + n];
2380
+ /* Note: we accept a hole */
2381
+ if (!(ci1->decomp_len == 0 ||
2382
+ (ci1->decomp_len == l &&
2383
+ ci1->is_compat == ci->is_compat)))
2384
+ break;
2385
+ n++;
2386
+ }
2387
+ return;
2388
+ }
2389
+
2390
+ if (l <= 7) { /* 16 bit chars stored as is: C1 for a single one-char entry, else an L type */
2391
+ n = 1;
2392
+ for(;;) {
2393
+ de->code = i;
2394
+ de->len = n;
2395
+ if (l == 1 && n == 1) {
2396
+ de->type = DECOMP_TYPE_C1;
2397
+ } else {
2398
+ assert(l <= 8);
2399
+ de->type = DECOMP_TYPE_L1 + l - 1;
2400
+ }
2401
+ de->c_len = l;
2402
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2403
+ if (de->cost < tab_de[i].cost) {
2404
+ tab_de[i] = *de;
2405
+ }
2406
+
2407
+ if (!((i + n) <= CHARCODE_MAX && n < len_max))
2408
+ break;
2409
+ ci1 = &unicode_db[i + n];
2410
+ /* Note: we accept a hole */
2411
+ if (!(ci1->decomp_len == 0 ||
2412
+ (ci1->decomp_len == l &&
2413
+ ci1->is_compat == ci->is_compat &&
2414
+ is_16bit(ci1->decomp_data, l))))
2415
+ break;
2416
+ n++;
2417
+ }
2418
+ }
2419
+
2420
+ if (l <= 8 || l == 18) { /* B types: one byte per char, relative to the smallest char of the run */
2421
+ int c_min, c_max, c;
2422
+ c_min = c_max = -1;
2423
+ n = 1;
2424
+ for(;;) {
2425
+ ci1 = &unicode_db[i + n - 1]; /* last entry of the current candidate run */
2426
+ for(j = 0; j < l; j++) {
2427
+ c = ci1->decomp_data[j];
2428
+ if (c == 0x20) {
2429
+ /* we accept space for Arabic */
2430
+ } else if (c_min == -1) {
2431
+ c_min = c_max = c;
2432
+ } else {
2433
+ c_min = min_int(c_min, c);
2434
+ c_max = max_int(c_max, c);
2435
+ }
2436
+ }
2437
+ if ((c_max - c_min) > 254) /* offsets must fit in 0..254; add_decomp_data() stores a space as 0xff */
2438
+ break;
2439
+ de->code = i;
2440
+ de->len = n;
2441
+ if (l == 18)
2442
+ de->type = DECOMP_TYPE_B18;
2443
+ else
2444
+ de->type = DECOMP_TYPE_B1 + l - 1;
2445
+ de->c_len = l;
2446
+ de->c_min = c_min;
2447
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2448
+ if (de->cost < tab_de[i].cost) {
2449
+ tab_de[i] = *de;
2450
+ }
2451
+ if (!((i + n) <= CHARCODE_MAX && n < len_max))
2452
+ break;
2453
+ ci1 = &unicode_db[i + n];
2454
+ if (!(ci1->decomp_len == l && /* no hole accepted for this encoding */
2455
+ ci1->is_compat == ci->is_compat))
2456
+ break;
2457
+ n++;
2458
+ }
2459
+ }
2460
+
2461
+ /* find an ascii run */
2462
+ if (l <= 5 && is_short_tab(ci->decomp_data, l)) {
2463
+ n = 1;
2464
+ for(;;) {
2465
+ de->code = i;
2466
+ de->len = n;
2467
+ de->type = DECOMP_TYPE_S1 + l - 1;
2468
+ de->c_len = l;
2469
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2470
+ if (de->cost < tab_de[i].cost) {
2471
+ tab_de[i] = *de;
2472
+ }
2473
+
2474
+ if (!((i + n) <= CHARCODE_MAX && n < len_max))
2475
+ break;
2476
+ ci1 = &unicode_db[i + n];
2477
+ /* Note: we accept a hole */
2478
+ if (!(ci1->decomp_len == 0 ||
2479
+ (ci1->decomp_len == l &&
2480
+ ci1->is_compat == ci->is_compat &&
2481
+ is_short_tab(ci1->decomp_data, l))))
2482
+ break;
2483
+ n++;
2484
+ }
2485
+ }
2486
+
2487
+ /* check if a single char is increasing */
2488
+ if (l <= 4) {
2489
+ int idx1, idx;
2490
+
2491
+ for(idx1 = 1; (idx = decomp_incr_tab[l - 1][idx1]) >= 0; idx1++) { /* idx: position of the increasing char; column 0 holds the base type */
2492
+ n = 1;
2493
+ for(;;) {
2494
+ de->code = i;
2495
+ de->len = n;
2496
+ de->type = decomp_incr_tab[l - 1][0] + idx1 - 1;
2497
+ de->c_len = l;
2498
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2499
+ if (de->cost < tab_de[i].cost) {
2500
+ tab_de[i] = *de;
2501
+ }
2502
+
2503
+ if (!((i + n) <= CHARCODE_MAX && n < len_max))
2504
+ break;
2505
+ ci1 = &unicode_db[i + n];
2506
+ if (!(ci1->decomp_len == l &&
2507
+ ci1->is_compat == ci->is_compat))
2508
+ goto next1;
2509
+ for(j = 0; j < l; j++) {
2510
+ if (j == idx) {
2511
+ if (ci1->decomp_data[j] != ci->decomp_data[j] + n) /* must grow by one per entry */
2512
+ goto next1;
2513
+ } else {
2514
+ if (ci1->decomp_data[j] != ci->decomp_data[j]) /* all other chars must be identical */
2515
+ goto next1;
2516
+ }
2517
+ }
2518
+ n++;
2519
+ }
2520
+ next1: ;
2521
+ }
2522
+ }
2523
+
2524
+ if (l == 3) { /* PAT3: first and last chars shared by the run, middle char stored per entry */
2525
+ n = 1;
2526
+ for(;;) {
2527
+ de->code = i;
2528
+ de->len = n;
2529
+ de->type = DECOMP_TYPE_PAT3;
2530
+ de->c_len = l;
2531
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2532
+ if (de->cost < tab_de[i].cost) {
2533
+ tab_de[i] = *de;
2534
+ }
2535
+ if (!((i + n) <= CHARCODE_MAX && n < len_max))
2536
+ break;
2537
+ ci1 = &unicode_db[i + n];
2538
+ if (!(ci1->decomp_len == l &&
2539
+ ci1->is_compat == ci->is_compat &&
2540
+ ci1->decomp_data[1] <= 0xffff &&
2541
+ ci1->decomp_data[0] == ci->decomp_data[0] &&
2542
+ ci1->decomp_data[l - 1] == ci->decomp_data[l - 1]))
2543
+ break;
2544
+ n++;
2545
+ }
2546
+ }
2547
+
2548
+ if (l == 2 && is_short(ci->decomp_data[1])) { /* LS2: 16 bit first char, short second char */
2549
+ n = 1;
2550
+ for(;;) {
2551
+ de->code = i;
2552
+ de->len = n;
2553
+ de->type = DECOMP_TYPE_LS2;
2554
+ de->c_len = l;
2555
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2556
+ if (de->cost < tab_de[i].cost) {
2557
+ tab_de[i] = *de;
2558
+ }
2559
+ if (!((i + n) <= CHARCODE_MAX && n < len_max))
2560
+ break;
2561
+ ci1 = &unicode_db[i + n];
2562
+ if (!(ci1->decomp_len == 0 || /* a hole is accepted here too */
2563
+ (ci1->decomp_len == l &&
2564
+ ci1->is_compat == ci->is_compat &&
2565
+ ci1->decomp_data[0] <= 0xffff &&
2566
+ is_short(ci1->decomp_data[1]))))
2567
+ break;
2568
+ n++;
2569
+ }
2570
+ }
2571
+
2572
+ if (l == 2) { /* UL types: pairs of consecutive entries, the second one being the to_lower_simple() variant of the first */
2573
+ BOOL is_16bit; /* NOTE: local flag, shadows the is_16bit() function in this scope */
2574
+
2575
+ n = 0;
2576
+ is_16bit = FALSE;
2577
+ for(;;) {
2578
+ if (!((i + n + 1) <= CHARCODE_MAX && n + 2 <= len_max)) /* room for one more pair? */
2579
+ break;
2580
+ ci1 = &unicode_db[i + n];
2581
+ if (!(ci1->decomp_len == l &&
2582
+ ci1->is_compat == ci->is_compat &&
2583
+ is_short(ci1->decomp_data[1])))
2584
+ break;
2585
+ if (!is_16bit && !is_short(ci1->decomp_data[0])) /* a first char without short code: 16 bit storage needed */
2586
+ is_16bit = TRUE;
2587
+ ci2 = &unicode_db[i + n + 1];
2588
+ if (!(ci2->decomp_len == l &&
2589
+ ci2->is_compat == ci->is_compat &&
2590
+ ci2->decomp_data[0] == to_lower_simple(ci1->decomp_data[0]) &&
2591
+ ci2->decomp_data[1] == ci1->decomp_data[1]))
2592
+ break;
2593
+ n += 2;
2594
+ de->code = i;
2595
+ de->len = n;
2596
+ de->type = DECOMP_TYPE_S2_UL + is_16bit; /* presumably DECOMP_TYPE_LS2_UL when is_16bit is set - confirm against the enum */
2597
+ de->c_len = l;
2598
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2599
+ if (de->cost < tab_de[i].cost) {
2600
+ tab_de[i] = *de;
2601
+ }
2602
+ }
2603
+ }
2604
+ }
2605
+
/* Store 'c' at data_buf[*pidx] as two bytes, low byte first, and advance
   *pidx by 2. */
void put16(uint8_t *data_buf, int *pidx, uint16_t c)
{
    uint8_t *dst = data_buf + *pidx;

    dst[0] = (uint8_t)c;
    dst[1] = (uint8_t)(c >> 8);
    *pidx += 2;
}
2614
+
2615
/* Append the encoded data of run 'de' to data_buf starting at *pidx and
   advance *pidx past it. de->data_index is set to the start offset of the
   run data, except for DECOMP_TYPE_C1 where it holds the single decomposed
   code point and nothing is written to data_buf.
   The layout depends on de->type; entries without decomposition inside a
   run (holes) are written as 0 by the L, LL, S and LS2 branches.
   No bound check is done on data_buf: the caller must provide enough room. */
+ void add_decomp_data(uint8_t *data_buf, int *pidx, DecompEntry *de)
2616
+ {
2617
+ int i, j, idx, c;
2618
+ CCInfo *ci;
2619
+
2620
+ idx = *pidx;
2621
+ de->data_index = idx;
2622
+ if (de->type <= DECOMP_TYPE_C1) {
2623
+ ci = &unicode_db[de->code];
2624
+ assert(ci->decomp_len == 1);
2625
+ de->data_index = ci->decomp_data[0]; /* the code point itself replaces the offset */
2626
+ } else if (de->type <= DECOMP_TYPE_L7) { /* 16 bits per char */
2627
+ for(i = 0; i < de->len; i++) {
2628
+ ci = &unicode_db[de->code + i];
2629
+ for(j = 0; j < de->c_len; j++) {
2630
+ if (ci->decomp_len == 0)
2631
+ c = 0;
2632
+ else
2633
+ c = ci->decomp_data[j];
2634
+ put16(data_buf, &idx, c);
2635
+ }
2636
+ }
2637
+ } else if (de->type <= DECOMP_TYPE_LL2) { /* low 16 bits of every char, then the upper bits packed 2 bits per char */
2638
+ int n, p, k;
2639
+ n = (de->len * de->c_len * 18 + 7) / 8; /* total size in bytes */
2640
+ p = de->len * de->c_len * 2; /* offset of the packed upper bits */
2641
+ memset(data_buf + idx, 0, n);
2642
+ k = 0;
2643
+ for(i = 0; i < de->len; i++) {
2644
+ ci = &unicode_db[de->code + i];
2645
+ for(j = 0; j < de->c_len; j++) {
2646
+ if (ci->decomp_len == 0)
2647
+ c = 0;
2648
+ else
2649
+ c = ci->decomp_data[j];
2650
+ data_buf[idx + k * 2] = c;
2651
+ data_buf[idx + k * 2 + 1] = c >> 8;
2652
+ data_buf[idx + p + (k / 4)] |= (c >> 16) << ((k % 4) * 2); /* 4 chars per byte */
2653
+ k++;
2654
+ }
2655
+ }
2656
+ idx += n;
2657
+ } else if (de->type <= DECOMP_TYPE_S5) { /* one short code byte per char */
2658
+ for(i = 0; i < de->len; i++) {
2659
+ ci = &unicode_db[de->code + i];
2660
+ for(j = 0; j < de->c_len; j++) {
2661
+ if (ci->decomp_len == 0)
2662
+ c = 0;
2663
+ else
2664
+ c = ci->decomp_data[j];
2665
+ c = get_short_code(c);
2666
+ assert(c >= 0);
2667
+ data_buf[idx++] = c;
2668
+ }
2669
+ }
2670
+ } else if (de->type <= DECOMP_TYPE_I4_2) { /* only the decomposition of the first entry is stored */
2671
+ ci = &unicode_db[de->code];
2672
+ assert(ci->decomp_len == de->c_len);
2673
+ for(j = 0; j < de->c_len; j++)
2674
+ put16(data_buf, &idx, ci->decomp_data[j]);
2675
+ } else if (de->type <= DECOMP_TYPE_B18) { /* 16 bit base (c_min), then one offset byte per char */
2676
+ c = de->c_min;
2677
+ data_buf[idx++] = c;
2678
+ data_buf[idx++] = c >> 8;
2679
+ for(i = 0; i < de->len; i++) {
2680
+ ci = &unicode_db[de->code + i];
2681
+ for(j = 0; j < de->c_len; j++) {
2682
+ assert(ci->decomp_len == de->c_len);
2683
+ c = ci->decomp_data[j];
2684
+ if (c == 0x20) {
2685
+ c = 0xff; /* 0xff is the escape value for a space */
2686
+ } else {
2687
+ c -= de->c_min;
2688
+ assert((uint32_t)c <= 254);
2689
+ }
2690
+ data_buf[idx++] = c;
2691
+ }
2692
+ }
2693
+ } else if (de->type <= DECOMP_TYPE_LS2) { /* 3 bytes per entry: 16 bit first char + short code of the second */
2694
+ assert(de->c_len == 2);
2695
+ for(i = 0; i < de->len; i++) {
2696
+ ci = &unicode_db[de->code + i];
2697
+ if (ci->decomp_len == 0)
2698
+ c = 0;
2699
+ else
2700
+ c = ci->decomp_data[0];
2701
+ put16(data_buf, &idx, c);
2702
+
2703
+ if (ci->decomp_len == 0)
2704
+ c = 0;
2705
+ else
2706
+ c = ci->decomp_data[1];
2707
+ c = get_short_code(c);
2708
+ assert(c >= 0);
2709
+ data_buf[idx++] = c;
2710
+ }
2711
+ } else if (de->type <= DECOMP_TYPE_PAT3) { /* shared first and last chars, then the middle char of every entry */
2712
+ ci = &unicode_db[de->code];
2713
+ assert(ci->decomp_len == 3);
2714
+ put16(data_buf, &idx, ci->decomp_data[0]);
2715
+ put16(data_buf, &idx, ci->decomp_data[2]);
2716
+ for(i = 0; i < de->len; i++) {
2717
+ ci = &unicode_db[de->code + i];
2718
+ assert(ci->decomp_len == 3);
2719
+ put16(data_buf, &idx, ci->decomp_data[1]);
2720
+ }
2721
+ } else if (de->type <= DECOMP_TYPE_S2_UL) { /* two short code bytes for the first entry of each pair */
2722
+ for(i = 0; i < de->len; i += 2) {
2723
+ ci = &unicode_db[de->code + i];
2724
+ c = ci->decomp_data[0];
2725
+ c = get_short_code(c);
2726
+ assert(c >= 0);
2727
+ data_buf[idx++] = c;
2728
+ c = ci->decomp_data[1];
2729
+ c = get_short_code(c);
2730
+ assert(c >= 0);
2731
+ data_buf[idx++] = c;
2732
+ }
2733
+ } else if (de->type <= DECOMP_TYPE_LS2_UL) { /* 16 bit first char + short code byte for the first entry of each pair */
2734
+ for(i = 0; i < de->len; i += 2) {
2735
+ ci = &unicode_db[de->code + i];
2736
+ c = ci->decomp_data[0];
2737
+ put16(data_buf, &idx, c);
2738
+ c = ci->decomp_data[1];
2739
+ c = get_short_code(c);
2740
+ assert(c >= 0);
2741
+ data_buf[idx++] = c;
2742
+ }
2743
+ } else {
2744
+ abort(); /* unknown type */
2745
+ }
2746
+ *pidx = idx;
2747
+ }
2748
+
#if 0
/* Debug helper (compiled out): print every decomposition code point
   above 0xffff found in unicode_db, one per line. */
void dump_large_char(void)
{
    int code, k;
    CCInfo *info;

    for(code = 0; code <= CHARCODE_MAX; code++) {
        info = &unicode_db[code];
        for(k = 0; k < info->decomp_len; k++) {
            if (info->decomp_data[k] <= 0xffff)
                continue;
            printf("%05x\n", info->decomp_data[k]);
        }
    }
}
#endif
2762
+
2763
+ void build_compose_table(FILE *f, const DecompEntry *tab_de);
2764
+
2765
+ void build_decompose_table(FILE *f)
2766
+ {
2767
+ int i, array_len, code_max, data_len, count;
2768
+ DecompEntry *tab_de, de_s, *de = &de_s;
2769
+ uint8_t *data_buf;
2770
+
2771
+ code_max = CHARCODE_MAX;
2772
+
2773
+ tab_de = mallocz((code_max + 2) * sizeof(*tab_de));
2774
+
2775
+ for(i = code_max; i >= 0; i--) {
2776
+ find_decomp_run(tab_de, i);
2777
+ }
2778
+
2779
+ /* build the data buffer */
2780
+ data_buf = malloc(100000);
2781
+ data_len = 0;
2782
+ array_len = 0;
2783
+ for(i = 0; i <= code_max; i++) {
2784
+ de = &tab_de[i];
2785
+ if (de->len != 0) {
2786
+ add_decomp_data(data_buf, &data_len, de);
2787
+ i += de->len - 1;
2788
+ array_len++;
2789
+ }
2790
+ }
2791
+
2792
+ #ifdef DUMP_DECOMP_TABLE
2793
+ /* dump */
2794
+ {
2795
+ int size, size1;
2796
+
2797
+ printf("START LEN TYPE L C SIZE\n");
2798
+ size = 0;
2799
+ for(i = 0; i <= code_max; i++) {
2800
+ de = &tab_de[i];
2801
+ if (de->len != 0) {
2802
+ size1 = get_decomp_run_size(de);
2803
+ printf("%05x %3d %6s %2d %1d %4d\n", i, de->len,
2804
+ decomp_type_str[de->type], de->c_len,
2805
+ unicode_db[i].is_compat, size1);
2806
+ i += de->len - 1;
2807
+ size += size1;
2808
+ }
2809
+ }
2810
+
2811
+ printf("array_len=%d estimated size=%d bytes actual=%d bytes\n",
2812
+ array_len, size, array_len * 6 + data_len);
2813
+ }
2814
+ #endif
2815
+
2816
+ total_tables++;
2817
+ total_table_bytes += array_len * sizeof(uint32_t);
2818
+ fprintf(f, "static const uint32_t unicode_decomp_table1[%d] = {", array_len);
2819
+ count = 0;
2820
+ for(i = 0; i <= code_max; i++) {
2821
+ de = &tab_de[i];
2822
+ if (de->len != 0) {
2823
+ uint32_t v;
2824
+ if (count++ % 4 == 0)
2825
+ fprintf(f, "\n ");
2826
+ v = (de->code << (32 - 18)) |
2827
+ (de->len << (32 - 18 - 7)) |
2828
+ (de->type << (32 - 18 - 7 - 6)) |
2829
+ unicode_db[de->code].is_compat;
2830
+ fprintf(f, " 0x%08x,", v);
2831
+ i += de->len - 1;
2832
+ }
2833
+ }
2834
+ fprintf(f, "\n};\n\n");
2835
+
2836
+ total_tables++;
2837
+ total_table_bytes += array_len * sizeof(uint16_t);
2838
+ fprintf(f, "static const uint16_t unicode_decomp_table2[%d] = {", array_len);
2839
+ count = 0;
2840
+ for(i = 0; i <= code_max; i++) {
2841
+ de = &tab_de[i];
2842
+ if (de->len != 0) {
2843
+ if (count++ % 8 == 0)
2844
+ fprintf(f, "\n ");
2845
+ fprintf(f, " 0x%04x,", de->data_index);
2846
+ i += de->len - 1;
2847
+ }
2848
+ }
2849
+ fprintf(f, "\n};\n\n");
2850
+
2851
+ total_tables++;
2852
+ total_table_bytes += data_len;
2853
+ fprintf(f, "static const uint8_t unicode_decomp_data[%d] = {", data_len);
2854
+ for(i = 0; i < data_len; i++) {
2855
+ if (i % 8 == 0)
2856
+ fprintf(f, "\n ");
2857
+ fprintf(f, " 0x%02x,", data_buf[i]);
2858
+ }
2859
+ fprintf(f, "\n};\n\n");
2860
+
2861
+ build_compose_table(f, tab_de);
2862
+
2863
+ free(data_buf);
2864
+
2865
+ free(tab_de);
2866
+ }
2867
+
/* Compose table entry: c[] holds the two code points of a decomposition
   and p the code point it belongs to (see build_compose_table()). */
typedef struct {
    uint32_t c[2];
    uint32_t p;
} ComposeEntry;

/* Capacity of the temporary array of compose entries. */
#define COMPOSE_LEN_MAX 10000

/* qsort() comparison function: order by c[0], then by c[1]; 'p' is not
   compared. */
static int ce_cmp(const void *p1, const void *p2)
{
    const ComposeEntry *lhs = p1, *rhs = p2;

    if (lhs->c[0] != rhs->c[0])
        return (lhs->c[0] < rhs->c[0]) ? -1 : 1;
    if (lhs->c[1] != rhs->c[1])
        return (lhs->c[1] < rhs->c[1]) ? -1 : 1;
    return 0;
}
2889
+
2890
+
2891
+ static int get_decomp_pos(const DecompEntry *tab_de, int c)
2892
+ {
2893
+ int i, v, k;
2894
+ const DecompEntry *de;
2895
+
2896
+ k = 0;
2897
+ for(i = 0; i <= CHARCODE_MAX; i++) {
2898
+ de = &tab_de[i];
2899
+ if (de->len != 0) {
2900
+ if (c >= de->code && c < de->code + de->len) {
2901
+ v = c - de->code;
2902
+ assert(v < 64);
2903
+ v |= k << 6;
2904
+ assert(v < 65536);
2905
+ return v;
2906
+ }
2907
+ i += de->len - 1;
2908
+ k++;
2909
+ }
2910
+ }
2911
+ return -1;
2912
+ }
2913
+
2914
+ void build_compose_table(FILE *f, const DecompEntry *tab_de)
2915
+ {
2916
+ int i, v, tab_ce_len;
2917
+ ComposeEntry *ce, *tab_ce;
2918
+
2919
+ tab_ce = malloc(sizeof(*tab_ce) * COMPOSE_LEN_MAX);
2920
+ tab_ce_len = 0;
2921
+ for(i = 0; i <= CHARCODE_MAX; i++) {
2922
+ CCInfo *ci = &unicode_db[i];
2923
+ if (ci->decomp_len == 2 && !ci->is_compat &&
2924
+ !ci->is_excluded) {
2925
+ assert(tab_ce_len < COMPOSE_LEN_MAX);
2926
+ ce = &tab_ce[tab_ce_len++];
2927
+ ce->c[0] = ci->decomp_data[0];
2928
+ ce->c[1] = ci->decomp_data[1];
2929
+ ce->p = i;
2930
+ }
2931
+ }
2932
+ qsort(tab_ce, tab_ce_len, sizeof(*tab_ce), ce_cmp);
2933
+
2934
+ #if 0
2935
+ {
2936
+ printf("tab_ce_len=%d\n", tab_ce_len);
2937
+ for(i = 0; i < tab_ce_len; i++) {
2938
+ ce = &tab_ce[i];
2939
+ printf("%05x %05x %05x\n", ce->c[0], ce->c[1], ce->p);
2940
+ }
2941
+ }
2942
+ #endif
2943
+
2944
+ total_tables++;
2945
+ total_table_bytes += tab_ce_len * sizeof(uint16_t);
2946
+ fprintf(f, "static const uint16_t unicode_comp_table[%u] = {", tab_ce_len);
2947
+ for(i = 0; i < tab_ce_len; i++) {
2948
+ if (i % 8 == 0)
2949
+ fprintf(f, "\n ");
2950
+ v = get_decomp_pos(tab_de, tab_ce[i].p);
2951
+ if (v < 0) {
2952
+ printf("ERROR: entry for c=%04x not found\n",
2953
+ tab_ce[i].p);
2954
+ exit(1);
2955
+ }
2956
+ fprintf(f, " 0x%04x,", v);
2957
+ }
2958
+ fprintf(f, "\n};\n\n");
2959
+
2960
+ free(tab_ce);
2961
+ }
2962
+
2963
+ #ifdef USE_TEST
2964
+ void check_decompose_table(void)
2965
+ {
2966
+ int c;
2967
+ CCInfo *ci;
2968
+ int res[UNICODE_DECOMP_LEN_MAX], *ref;
2969
+ int len, ref_len, is_compat;
2970
+
2971
+ for(is_compat = 0; is_compat <= 1; is_compat++) {
2972
+ for(c = 0; c < CHARCODE_MAX; c++) {
2973
+ ci = &unicode_db[c];
2974
+ ref_len = ci->decomp_len;
2975
+ ref = ci->decomp_data;
2976
+ if (!is_compat && ci->is_compat) {
2977
+ ref_len = 0;
2978
+ }
2979
+ len = unicode_decomp_char((uint32_t *)res, c, is_compat);
2980
+ if (len != ref_len ||
2981
+ tabcmp(res, ref, ref_len) != 0) {
2982
+ printf("ERROR c=%05x compat=%d\n", c, is_compat);
2983
+ dump_str("res", res, len);
2984
+ dump_str("ref", ref, ref_len);
2985
+ exit(1);
2986
+ }
2987
+ }
2988
+ }
2989
+ }
2990
+
2991
+ void check_compose_table(void)
2992
+ {
2993
+ int i, p;
2994
+ /* XXX: we don't test all the cases */
2995
+
2996
+ for(i = 0; i <= CHARCODE_MAX; i++) {
2997
+ CCInfo *ci = &unicode_db[i];
2998
+ if (ci->decomp_len == 2 && !ci->is_compat &&
2999
+ !ci->is_excluded) {
3000
+ p = unicode_compose_pair(ci->decomp_data[0], ci->decomp_data[1]);
3001
+ if (p != i) {
3002
+ printf("ERROR compose: c=%05x %05x -> %05x ref=%05x\n",
3003
+ ci->decomp_data[0], ci->decomp_data[1], p, i);
3004
+ exit(1);
3005
+ }
3006
+ }
3007
+ }
3008
+
3009
+
3010
+
3011
+ }
3012
+
3013
+ #endif
3014
+
3015
+
3016
+
3017
+ #ifdef USE_TEST
3018
+
/* Compare the result buf1/len1 with the reference buf2/len2. If the
   lengths or the contents differ, print the test number 'num', the label
   'msg', the input string and both strings, then exit. */
void check_str(const char *msg, int num, const int *in_buf, int in_len,
               const int *buf1, int len1,
               const int *buf2, int len2)
{
    if (len1 == len2 && tabcmp(buf1, buf2, len1) == 0)
        return;

    printf("%d: ERROR %s:\n", num, msg);
    dump_str(" in", in_buf, in_len);
    dump_str("res", buf1, len1);
    dump_str("ref", buf2, len2);
    exit(1);
}
3031
+
3032
+ void check_cc_table(void)
3033
+ {
3034
+ int cc, cc_ref, c;
3035
+
3036
+ for(c = 0; c <= CHARCODE_MAX; c++) {
3037
+ cc_ref = unicode_db[c].combining_class;
3038
+ cc = unicode_get_cc(c);
3039
+ if (cc != cc_ref) {
3040
+ printf("ERROR: c=%04x cc=%d cc_ref=%d\n",
3041
+ c, cc, cc_ref);
3042
+ exit(1);
3043
+ }
3044
+ }
3045
+ #ifdef PROFILE
3046
+ {
3047
+ int64_t ti, count;
3048
+
3049
+ ti = get_time_ns();
3050
+ count = 0;
3051
+ /* only do it on meaningful chars */
3052
+ for(c = 0x20; c <= 0xffff; c++) {
3053
+ cc_ref = unicode_db[c].combining_class;
3054
+ cc = unicode_get_cc(c);
3055
+ count++;
3056
+ }
3057
+ ti = get_time_ns() - ti;
3058
+ printf("cc time=%0.1f ns/char\n",
3059
+ (double)ti / count);
3060
+ }
3061
+ #endif
3062
+ }
3063
+
3064
+ void normalization_test(const char *filename)
3065
+ {
3066
+ FILE *f;
3067
+ char line[4096], *p;
3068
+ int *in_str, *nfc_str, *nfd_str, *nfkc_str, *nfkd_str;
3069
+ int in_len, nfc_len, nfd_len, nfkc_len, nfkd_len;
3070
+ int *buf, buf_len, pos;
3071
+
3072
+ f = fopen(filename, "rb");
3073
+ if (!f) {
3074
+ perror(filename);
3075
+ exit(1);
3076
+ }
3077
+ pos = 0;
3078
+ for(;;) {
3079
+ if (!get_line(line, sizeof(line), f))
3080
+ break;
3081
+ pos++;
3082
+ p = line;
3083
+ while (isspace(*p))
3084
+ p++;
3085
+ if (*p == '#' || *p == '@')
3086
+ continue;
3087
+ in_str = get_field_str(&in_len, p, 0);
3088
+ nfc_str = get_field_str(&nfc_len, p, 1);
3089
+ nfd_str = get_field_str(&nfd_len, p, 2);
3090
+ nfkc_str = get_field_str(&nfkc_len, p, 3);
3091
+ nfkd_str = get_field_str(&nfkd_len, p, 4);
3092
+
3093
+ // dump_str("in", in_str, in_len);
3094
+
3095
+ buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFD, NULL, NULL);
3096
+ check_str("nfd", pos, in_str, in_len, buf, buf_len, nfd_str, nfd_len);
3097
+ free(buf);
3098
+
3099
+ buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKD, NULL, NULL);
3100
+ check_str("nfkd", pos, in_str, in_len, buf, buf_len, nfkd_str, nfkd_len);
3101
+ free(buf);
3102
+
3103
+ buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFC, NULL, NULL);
3104
+ check_str("nfc", pos, in_str, in_len, buf, buf_len, nfc_str, nfc_len);
3105
+ free(buf);
3106
+
3107
+ buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKC, NULL, NULL);
3108
+ check_str("nfkc", pos, in_str, in_len, buf, buf_len, nfkc_str, nfkc_len);
3109
+ free(buf);
3110
+
3111
+ free(in_str);
3112
+ free(nfc_str);
3113
+ free(nfd_str);
3114
+ free(nfkc_str);
3115
+ free(nfkd_str);
3116
+ }
3117
+ fclose(f);
3118
+ }
3119
+ #endif
3120
+
3121
+ int main(int argc, char *argv[])
3122
+ {
3123
+ const char *unicode_db_path, *outfilename;
3124
+ char filename[1024];
3125
+ int arg = 1;
3126
+
3127
+ if (arg >= argc || (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "--help"))) {
3128
+ printf("usage: %s PATH [OUTPUT]\n"
3129
+ " PATH path to the Unicode database directory\n"
3130
+ " OUTPUT name of the output file. If omitted, a self test is performed\n"
3131
+ " using the files from the Unicode library\n"
3132
+ , argv[0]);
3133
+ return 1;
3134
+ }
3135
+ unicode_db_path = argv[arg++];
3136
+ outfilename = NULL;
3137
+ if (arg < argc)
3138
+ outfilename = argv[arg++];
3139
+
3140
+ unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1));
3141
+
3142
+ snprintf(filename, sizeof(filename), "%s/UnicodeData.txt", unicode_db_path);
3143
+
3144
+ parse_unicode_data(filename);
3145
+
3146
+ snprintf(filename, sizeof(filename), "%s/SpecialCasing.txt", unicode_db_path);
3147
+ parse_special_casing(unicode_db, filename);
3148
+
3149
+ snprintf(filename, sizeof(filename), "%s/CaseFolding.txt", unicode_db_path);
3150
+ parse_case_folding(unicode_db, filename);
3151
+
3152
+ snprintf(filename, sizeof(filename), "%s/CompositionExclusions.txt", unicode_db_path);
3153
+ parse_composition_exclusions(filename);
3154
+
3155
+ snprintf(filename, sizeof(filename), "%s/DerivedCoreProperties.txt", unicode_db_path);
3156
+ parse_derived_core_properties(filename);
3157
+
3158
+ snprintf(filename, sizeof(filename), "%s/DerivedNormalizationProps.txt", unicode_db_path);
3159
+ parse_derived_norm_properties(filename);
3160
+
3161
+ snprintf(filename, sizeof(filename), "%s/PropList.txt", unicode_db_path);
3162
+ parse_prop_list(filename);
3163
+
3164
+ snprintf(filename, sizeof(filename), "%s/Scripts.txt", unicode_db_path);
3165
+ parse_scripts(filename);
3166
+
3167
+ snprintf(filename, sizeof(filename), "%s/ScriptExtensions.txt",
3168
+ unicode_db_path);
3169
+ parse_script_extensions(filename);
3170
+
3171
+ snprintf(filename, sizeof(filename), "%s/emoji-data.txt",
3172
+ unicode_db_path);
3173
+ parse_prop_list(filename);
3174
+
3175
+ // dump_unicode_data(unicode_db);
3176
+ build_conv_table(unicode_db);
3177
+
3178
+ #ifdef DUMP_CASE_FOLDING_SPECIAL_CASES
3179
+ dump_case_folding_special_cases(unicode_db);
3180
+ #endif
3181
+
3182
+ if (!outfilename) {
3183
+ #ifdef USE_TEST
3184
+ check_case_conv();
3185
+ check_flags();
3186
+ check_decompose_table();
3187
+ check_compose_table();
3188
+ check_cc_table();
3189
+ snprintf(filename, sizeof(filename), "%s/NormalizationTest.txt", unicode_db_path);
3190
+ normalization_test(filename);
3191
+ #else
3192
+ fprintf(stderr, "Tests are not compiled\n");
3193
+ exit(1);
3194
+ #endif
3195
+ } else
3196
+ {
3197
+ FILE *fo = fopen(outfilename, "wb");
3198
+
3199
+ if (!fo) {
3200
+ perror(outfilename);
3201
+ exit(1);
3202
+ }
3203
+ fprintf(fo,
3204
+ "/* Compressed unicode tables */\n"
3205
+ "/* Automatically generated file - do not edit */\n"
3206
+ "\n"
3207
+ "#include <stdint.h>\n"
3208
+ "\n");
3209
+ dump_case_conv_table(fo);
3210
+ compute_internal_props();
3211
+ build_flags_tables(fo);
3212
+ fprintf(fo, "#ifdef CONFIG_ALL_UNICODE\n\n");
3213
+ build_cc_table(fo);
3214
+ build_decompose_table(fo);
3215
+ build_general_category_table(fo);
3216
+ build_script_table(fo);
3217
+ build_script_ext_table(fo);
3218
+ build_prop_list_table(fo);
3219
+ fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n");
3220
+ fprintf(fo, "/* %u tables / %u bytes, %u index / %u bytes */\n",
3221
+ total_tables, total_table_bytes, total_index, total_index_bytes);
3222
+ fclose(fo);
3223
+ }
3224
+ return 0;
3225
+ }