quickjs 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE +21 -0
- data/Rakefile +22 -0
- data/ext/quickjsrb/extconf.rb +45 -0
- data/ext/quickjsrb/quickjs/LICENSE +22 -0
- data/ext/quickjsrb/quickjs/cutils.c +631 -0
- data/ext/quickjsrb/quickjs/cutils.h +347 -0
- data/ext/quickjsrb/quickjs/libbf.c +8475 -0
- data/ext/quickjsrb/quickjs/libbf.h +535 -0
- data/ext/quickjsrb/quickjs/libregexp-opcode.h +57 -0
- data/ext/quickjsrb/quickjs/libregexp.c +2501 -0
- data/ext/quickjsrb/quickjs/libregexp.h +55 -0
- data/ext/quickjsrb/quickjs/libunicode-table.h +4557 -0
- data/ext/quickjsrb/quickjs/libunicode.c +1910 -0
- data/ext/quickjsrb/quickjs/libunicode.h +182 -0
- data/ext/quickjsrb/quickjs/list.h +99 -0
- data/ext/quickjsrb/quickjs/qjs.c +564 -0
- data/ext/quickjsrb/quickjs/qjsc.c +761 -0
- data/ext/quickjsrb/quickjs/qjscalc.c +4005 -0
- data/ext/quickjsrb/quickjs/quickjs-atom.h +273 -0
- data/ext/quickjsrb/quickjs/quickjs-libc.c +4052 -0
- data/ext/quickjsrb/quickjs/quickjs-libc.h +60 -0
- data/ext/quickjsrb/quickjs/quickjs-opcode.h +372 -0
- data/ext/quickjsrb/quickjs/quickjs.c +55978 -0
- data/ext/quickjsrb/quickjs/quickjs.h +1087 -0
- data/ext/quickjsrb/quickjs/repl.c +2057 -0
- data/ext/quickjsrb/quickjs/run-test262.c +2216 -0
- data/ext/quickjsrb/quickjs/unicode_gen.c +3225 -0
- data/ext/quickjsrb/quickjs/unicode_gen_def.h +291 -0
- data/ext/quickjsrb/quickjsrb.c +105 -0
- data/ext/quickjsrb/quickjsrb.h +14 -0
- data/lib/quickjs/version.rb +5 -0
- data/lib/quickjs.rb +28 -0
- data/sig/quickjs.rbs +4 -0
- metadata +81 -0
@@ -0,0 +1,3225 @@
|
|
1
|
+
/*
|
2
|
+
* Generation of Unicode tables
|
3
|
+
*
|
4
|
+
* Copyright (c) 2017-2018 Fabrice Bellard
|
5
|
+
* Copyright (c) 2017-2018 Charlie Gordon
|
6
|
+
*
|
7
|
+
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
8
|
+
* of this software and associated documentation files (the "Software"), to deal
|
9
|
+
* in the Software without restriction, including without limitation the rights
|
10
|
+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
11
|
+
* copies of the Software, and to permit persons to whom the Software is
|
12
|
+
* furnished to do so, subject to the following conditions:
|
13
|
+
*
|
14
|
+
* The above copyright notice and this permission notice shall be included in
|
15
|
+
* all copies or substantial portions of the Software.
|
16
|
+
*
|
17
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
18
|
+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
19
|
+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
20
|
+
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
21
|
+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
22
|
+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
23
|
+
* THE SOFTWARE.
|
24
|
+
*/
|
25
|
+
#include <stdlib.h>
|
26
|
+
#include <stdio.h>
|
27
|
+
#include <stdarg.h>
|
28
|
+
#include <inttypes.h>
|
29
|
+
#include <string.h>
|
30
|
+
#include <assert.h>
|
31
|
+
#include <ctype.h>
|
32
|
+
#include <time.h>
|
33
|
+
|
34
|
+
#include "cutils.h"
|
35
|
+
|
36
|
+
/* Global size statistics. They are only defined here (zero-initialized);
   NOTE(review): presumably accumulated by the table generation code
   further down in the file - confirm against their users. */
uint32_t total_tables;      /* table count */
uint32_t total_table_bytes; /* table size, in bytes */
uint32_t total_index;       /* index count */
uint32_t total_index_bytes; /* index size, in bytes */
|
40
|
+
|
41
|
+
/* define it to be able to test unicode.c */
|
42
|
+
//#define USE_TEST
|
43
|
+
/* profile tests */
|
44
|
+
//#define PROFILE
|
45
|
+
|
46
|
+
//#define DUMP_CASE_CONV_TABLE
|
47
|
+
//#define DUMP_TABLE_SIZE
|
48
|
+
//#define DUMP_CC_TABLE
|
49
|
+
//#define DUMP_DECOMP_TABLE
|
50
|
+
//#define DUMP_CASE_FOLDING_SPECIAL_CASES
|
51
|
+
|
52
|
+
/* Ideas:
|
53
|
+
- Generalize run length encoding + index for all tables
|
54
|
+
- remove redundant tables for ID_start, ID_continue, Case_Ignorable, Cased
|
55
|
+
|
56
|
+
Case conversion:
|
57
|
+
- use a single entry for consecutive U/LF runs
|
58
|
+
- allow EXT runs of length > 1
|
59
|
+
|
60
|
+
Decomposition:
|
61
|
+
- Greek lower case (+1f10/1f10) ?
|
62
|
+
- allow holes in B runs
|
63
|
+
- suppress more upper / lower case redundancy
|
64
|
+
*/
|
65
|
+
|
66
|
+
#ifdef USE_TEST
|
67
|
+
#include "libunicode.c"
|
68
|
+
#endif
|
69
|
+
|
70
|
+
/* largest code point accepted by the parsers (upper bound of the asserts
   on parsed code points) */
#define CHARCODE_MAX 0x10ffff
/* capacity of the per-character case mapping arrays
   (u_data / l_data / f_data in CCInfo) */
#define CC_LEN_MAX 3
|
72
|
+
|
73
|
+
/*
 * Allocate 'size' bytes of zero-initialized memory.
 *
 * The result is never NULL and must be released with free(). An
 * allocation failure is fatal: a message is printed on stderr and the
 * program exits with status 1, like the other fatal errors of this tool.
 * (Passing an unchecked malloc() result to memset() would be undefined
 * behavior when the allocation fails.)
 */
void *mallocz(size_t size)
{
    void *ptr;

    /* calloc() returns zeroed memory; request at least one byte so that
       a NULL result always means that the allocation failed */
    ptr = calloc(1, size ? size : 1);
    if (!ptr) {
        fprintf(stderr, "mallocz: out of memory (%zu bytes)\n", size);
        exit(1);
    }
    return ptr;
}
|
80
|
+
|
81
|
+
/*
 * Locate field number 'n' (0-based) of the ';'-separated record 'p'.
 *
 * Returns a pointer into 'p' just after the n-th separator (the field
 * may be empty), or NULL when the string ends before 'n' separators
 * have been found. With n <= 0 the record itself is returned.
 */
const char *get_field(const char *p, int n)
{
    int remaining;

    for (remaining = n; remaining > 0; remaining--) {
        /* strchr() stops at the terminating NUL, i.e. "no more fields" */
        p = strchr(p, ';');
        if (p == NULL)
            return NULL;
        p++; /* step over the separator */
    }
    return p;
}
|
93
|
+
|
94
|
+
/*
 * Copy field number 'n' (0-based) of the ';'-separated record 'p' into
 * 'buf' as a NUL-terminated string.
 *
 * At most buf_size - 1 characters are stored; a longer field is silently
 * truncated. When the record has no field 'n', 'buf' receives the empty
 * string instead of dereferencing the NULL returned by get_field().
 * With buf_size == 0 nothing is written at all.
 *
 * Returns 'buf'.
 */
const char *get_field_buf(char *buf, size_t buf_size, const char *p, int n)
{
    size_t len = 0;

    if (buf_size == 0)
        return buf; /* no room, not even for the terminator */

    p = get_field(p, n);
    if (p) {
        for (; *p != ';' && *p != '\0'; p++) {
            /* always keep one byte for the terminator */
            if (len + 1 < buf_size)
                buf[len++] = *p;
        }
    }
    buf[len] = '\0';
    return buf;
}
|
107
|
+
|
108
|
+
/*
 * Append 'c' to a growable array of int.
 *
 *   pbuf  - in/out: the array (may be NULL while *psize is 0)
 *   psize - in/out: allocated capacity, in elements
 *   plen  - in/out: number of elements in use
 *
 * When the array is full the capacity grows to max(len + 1, size * 3 / 2).
 * An allocation failure is fatal (message on stderr, exit status 1)
 * rather than being stored as a NULL array and written through.
 */
void add_char(int **pbuf, int *psize, int *plen, int c)
{
    int len = *plen;

    if (len >= *psize) {
        int new_size = *psize * 3 / 2;
        int *new_buf;

        if (new_size < len + 1)
            new_size = len + 1;
        /* keep the old pointer untouched until realloc() has succeeded */
        new_buf = realloc(*pbuf, sizeof(new_buf[0]) * new_size);
        if (!new_buf) {
            fprintf(stderr, "add_char: out of memory\n");
            exit(1);
        }
        *pbuf = new_buf;
        *psize = new_size;
    }
    (*pbuf)[len] = c;
    *plen = len + 1;
}
|
124
|
+
|
125
|
+
/*
 * Parse field number 'n' (0-based) of the ';'-separated record 'str' as
 * a whitespace-separated list of hexadecimal numbers.
 *
 * Returns an array built with add_char() (to be released with free())
 * and stores its length in *plen. When the field is missing or holds no
 * number, NULL is returned and *plen is set to 0.
 */
int *get_field_str(int *plen, const char *str, int n)
{
    const char *p;
    char *end;
    int *buf = NULL;
    int len = 0, size = 0;

    p = get_field(str, n);
    if (p) {
        for (;;) {
            /* the <ctype.h> functions need an unsigned char value */
            while (isspace((unsigned char)*p))
                p++;
            if (!isxdigit((unsigned char)*p))
                break;
            /* at least one hex digit follows, so strtoul() advances */
            add_char(&buf, &size, &len, strtoul(p, &end, 16));
            p = end;
        }
    }
    *plen = len;
    return buf;
}
|
147
|
+
|
148
|
+
/*
 * Read the next line of 'f' into 'buf' (at most buf_size - 1 characters,
 * as with fgets()) and strip one trailing '\n' if present.
 *
 * Returns 'buf', or NULL when fgets() reports end of file or an error.
 */
char *get_line(char *buf, int buf_size, FILE *f)
{
    int last;

    if (fgets(buf, buf_size, f) == NULL)
        return NULL;

    last = (int)strlen(buf) - 1; /* index of the final character, if any */
    if (last >= 0 && buf[last] == '\n')
        buf[last] = '\0';
    return buf;
}
|
158
|
+
|
159
|
+
/* General categories. NOTE(review): UNICODE_GENERAL_CATEGORY presumably
   selects the matching DEF() list inside unicode_gen_def.h - confirm
   against that header. The list is expanded three times below. */
#define UNICODE_GENERAL_CATEGORY

/* GCAT_xxx: one enumerator per DEF() entry; GCAT_COUNT is their number */
typedef enum {
#define DEF(id, str) GCAT_ ## id,
#include "unicode_gen_def.h"
#undef DEF
    GCAT_COUNT,
} UnicodeGCEnum1;

/* DEF() identifiers spelled as strings, indexed by GCAT_xxx */
static const char *unicode_gc_name[] = {
#define DEF(id, str) #id,
#include "unicode_gen_def.h"
#undef DEF
};

/* second DEF() argument of each entry, indexed by GCAT_xxx */
static const char *unicode_gc_short_name[] = {
#define DEF(id, str) str,
#include "unicode_gen_def.h"
#undef DEF
};

#undef UNICODE_GENERAL_CATEGORY
|
181
|
+
|
182
|
+
/* Scripts. NOTE(review): UNICODE_SCRIPT presumably selects the matching
   DEF() list inside unicode_gen_def.h - confirm against that header.
   The list is expanded three times below. */
#define UNICODE_SCRIPT

/* SCRIPT_xxx: one enumerator per DEF() entry; SCRIPT_COUNT is their number */
typedef enum {
#define DEF(id, str) SCRIPT_ ## id,
#include "unicode_gen_def.h"
#undef DEF
    SCRIPT_COUNT,
} UnicodeScriptEnum1;

/* DEF() identifiers spelled as strings, indexed by SCRIPT_xxx */
static const char *unicode_script_name[] = {
#define DEF(id, str) #id,
#include "unicode_gen_def.h"
#undef DEF
};

/* second DEF() argument of each entry, indexed by SCRIPT_xxx; it is
   searched with find_name(), which accepts ','-separated alternatives */
const char *unicode_script_short_name[] = {
#define DEF(id, str) str,
#include "unicode_gen_def.h"
#undef DEF
};

#undef UNICODE_SCRIPT
|
204
|
+
|
205
|
+
/* Binary properties. NOTE(review): UNICODE_PROP_LIST presumably selects
   the matching DEF() list inside unicode_gen_def.h - confirm against
   that header. The list is expanded three times below. */
#define UNICODE_PROP_LIST

/* PROP_xxx: one enumerator per DEF() entry; PROP_COUNT is their number.
   The PROP_xxx values are the bit indexes used by get_prop()/set_prop(). */
typedef enum {
#define DEF(id, str) PROP_ ## id,
#include "unicode_gen_def.h"
#undef DEF
    PROP_COUNT,
} UnicodePropEnum1;

/* DEF() identifiers spelled as strings, indexed by PROP_xxx */
static const char *unicode_prop_name[] = {
#define DEF(id, str) #id,
#include "unicode_gen_def.h"
#undef DEF
};

/* second DEF() argument of each entry, indexed by PROP_xxx */
static const char *unicode_prop_short_name[] = {
#define DEF(id, str) str,
#include "unicode_gen_def.h"
#undef DEF
};

#undef UNICODE_PROP_LIST
|
227
|
+
|
228
|
+
typedef struct {
|
229
|
+
/* case conv */
|
230
|
+
uint8_t u_len;
|
231
|
+
uint8_t l_len;
|
232
|
+
uint8_t f_len;
|
233
|
+
int u_data[CC_LEN_MAX]; /* to upper case */
|
234
|
+
int l_data[CC_LEN_MAX]; /* to lower case */
|
235
|
+
int f_data[CC_LEN_MAX]; /* to case folding */
|
236
|
+
|
237
|
+
uint8_t combining_class;
|
238
|
+
uint8_t is_compat:1;
|
239
|
+
uint8_t is_excluded:1;
|
240
|
+
uint8_t general_category;
|
241
|
+
uint8_t script;
|
242
|
+
uint8_t script_ext_len;
|
243
|
+
uint8_t *script_ext;
|
244
|
+
uint32_t prop_bitmap_tab[3];
|
245
|
+
/* decomposition */
|
246
|
+
int decomp_len;
|
247
|
+
int *decomp_data;
|
248
|
+
} CCInfo;
|
249
|
+
|
250
|
+
CCInfo *unicode_db;
|
251
|
+
|
252
|
+
/*
 * Look up 'name' in a table of 'tab_len' strings, where each string is a
 * ','-separated list of alternative spellings. The comparison is exact
 * (case sensitive, whole alternative).
 *
 * Returns the index of the first table entry having a matching
 * alternative, or -1 when there is none.
 */
int find_name(const char **tab, int tab_len, const char *name)
{
    const size_t name_len = strlen(name);
    int idx;

    for (idx = 0; idx < tab_len; idx++) {
        const char *alias = tab[idx];

        for (;;) {
            /* length of the current alternative, up to ',' or the end */
            size_t alias_len = strcspn(alias, ",");

            if (alias_len == name_len && memcmp(alias, name, alias_len) == 0)
                return idx;
            if (alias[alias_len] == '\0')
                break; /* that was the last alternative of this entry */
            alias += alias_len + 1;
        }
    }
    return -1;
}
|
275
|
+
|
276
|
+
static BOOL get_prop(uint32_t c, int prop_idx)
|
277
|
+
{
|
278
|
+
return (unicode_db[c].prop_bitmap_tab[prop_idx >> 5] >> (prop_idx & 0x1f)) & 1;
|
279
|
+
}
|
280
|
+
|
281
|
+
static void set_prop(uint32_t c, int prop_idx, int val)
|
282
|
+
{
|
283
|
+
uint32_t mask;
|
284
|
+
mask = 1U << (prop_idx & 0x1f);
|
285
|
+
if (val)
|
286
|
+
unicode_db[c].prop_bitmap_tab[prop_idx >> 5] |= mask;
|
287
|
+
else
|
288
|
+
unicode_db[c].prop_bitmap_tab[prop_idx >> 5] &= ~mask;
|
289
|
+
}
|
290
|
+
|
291
|
+
void parse_unicode_data(const char *filename)
|
292
|
+
{
|
293
|
+
FILE *f;
|
294
|
+
char line[1024];
|
295
|
+
char buf1[256];
|
296
|
+
const char *p;
|
297
|
+
int code, lc, uc, last_code;
|
298
|
+
CCInfo *ci, *tab = unicode_db;
|
299
|
+
|
300
|
+
f = fopen(filename, "rb");
|
301
|
+
if (!f) {
|
302
|
+
perror(filename);
|
303
|
+
exit(1);
|
304
|
+
}
|
305
|
+
|
306
|
+
last_code = 0;
|
307
|
+
for(;;) {
|
308
|
+
if (!get_line(line, sizeof(line), f))
|
309
|
+
break;
|
310
|
+
p = line;
|
311
|
+
while (isspace(*p))
|
312
|
+
p++;
|
313
|
+
if (*p == '#')
|
314
|
+
continue;
|
315
|
+
|
316
|
+
p = get_field(line, 0);
|
317
|
+
if (!p)
|
318
|
+
continue;
|
319
|
+
code = strtoul(p, NULL, 16);
|
320
|
+
lc = 0;
|
321
|
+
uc = 0;
|
322
|
+
|
323
|
+
p = get_field(line, 12);
|
324
|
+
if (p && *p != ';') {
|
325
|
+
uc = strtoul(p, NULL, 16);
|
326
|
+
}
|
327
|
+
|
328
|
+
p = get_field(line, 13);
|
329
|
+
if (p && *p != ';') {
|
330
|
+
lc = strtoul(p, NULL, 16);
|
331
|
+
}
|
332
|
+
ci = &tab[code];
|
333
|
+
if (uc > 0 || lc > 0) {
|
334
|
+
assert(code <= CHARCODE_MAX);
|
335
|
+
if (uc > 0) {
|
336
|
+
assert(ci->u_len == 0);
|
337
|
+
ci->u_len = 1;
|
338
|
+
ci->u_data[0] = uc;
|
339
|
+
}
|
340
|
+
if (lc > 0) {
|
341
|
+
assert(ci->l_len == 0);
|
342
|
+
ci->l_len = 1;
|
343
|
+
ci->l_data[0] = lc;
|
344
|
+
}
|
345
|
+
}
|
346
|
+
|
347
|
+
{
|
348
|
+
int i;
|
349
|
+
get_field_buf(buf1, sizeof(buf1), line, 2);
|
350
|
+
i = find_name(unicode_gc_name, countof(unicode_gc_name), buf1);
|
351
|
+
if (i < 0) {
|
352
|
+
fprintf(stderr, "General category '%s' not found\n",
|
353
|
+
buf1);
|
354
|
+
exit(1);
|
355
|
+
}
|
356
|
+
ci->general_category = i;
|
357
|
+
}
|
358
|
+
|
359
|
+
p = get_field(line, 3);
|
360
|
+
if (p && *p != ';' && *p != '\0') {
|
361
|
+
int cc;
|
362
|
+
cc = strtoul(p, NULL, 0);
|
363
|
+
if (cc != 0) {
|
364
|
+
assert(code <= CHARCODE_MAX);
|
365
|
+
ci->combining_class = cc;
|
366
|
+
// printf("%05x: %d\n", code, ci->combining_class);
|
367
|
+
}
|
368
|
+
}
|
369
|
+
|
370
|
+
p = get_field(line, 5);
|
371
|
+
if (p && *p != ';' && *p != '\0') {
|
372
|
+
int size;
|
373
|
+
assert(code <= CHARCODE_MAX);
|
374
|
+
ci->is_compat = 0;
|
375
|
+
if (*p == '<') {
|
376
|
+
while (*p != '\0' && *p != '>')
|
377
|
+
p++;
|
378
|
+
if (*p == '>')
|
379
|
+
p++;
|
380
|
+
ci->is_compat = 1;
|
381
|
+
}
|
382
|
+
size = 0;
|
383
|
+
for(;;) {
|
384
|
+
while (isspace(*p))
|
385
|
+
p++;
|
386
|
+
if (!isxdigit(*p))
|
387
|
+
break;
|
388
|
+
add_char(&ci->decomp_data, &size, &ci->decomp_len, strtoul(p, (char **)&p, 16));
|
389
|
+
}
|
390
|
+
#if 0
|
391
|
+
{
|
392
|
+
int i;
|
393
|
+
static int count, d_count;
|
394
|
+
|
395
|
+
printf("%05x: %c", code, ci->is_compat ? 'C': ' ');
|
396
|
+
for(i = 0; i < ci->decomp_len; i++)
|
397
|
+
printf(" %05x", ci->decomp_data[i]);
|
398
|
+
printf("\n");
|
399
|
+
count++;
|
400
|
+
d_count += ci->decomp_len;
|
401
|
+
// printf("%d %d\n", count, d_count);
|
402
|
+
}
|
403
|
+
#endif
|
404
|
+
}
|
405
|
+
|
406
|
+
p = get_field(line, 9);
|
407
|
+
if (p && *p == 'Y') {
|
408
|
+
set_prop(code, PROP_Bidi_Mirrored, 1);
|
409
|
+
}
|
410
|
+
|
411
|
+
/* handle ranges */
|
412
|
+
get_field_buf(buf1, sizeof(buf1), line, 1);
|
413
|
+
if (strstr(buf1, " Last>")) {
|
414
|
+
int i;
|
415
|
+
// printf("range: 0x%x-%0x\n", last_code, code);
|
416
|
+
assert(ci->decomp_len == 0);
|
417
|
+
assert(ci->script_ext_len == 0);
|
418
|
+
for(i = last_code + 1; i < code; i++) {
|
419
|
+
unicode_db[i] = *ci;
|
420
|
+
}
|
421
|
+
}
|
422
|
+
last_code = code;
|
423
|
+
}
|
424
|
+
|
425
|
+
fclose(f);
|
426
|
+
}
|
427
|
+
|
428
|
+
void parse_special_casing(CCInfo *tab, const char *filename)
|
429
|
+
{
|
430
|
+
FILE *f;
|
431
|
+
char line[1024];
|
432
|
+
const char *p;
|
433
|
+
int code;
|
434
|
+
CCInfo *ci;
|
435
|
+
|
436
|
+
f = fopen(filename, "rb");
|
437
|
+
if (!f) {
|
438
|
+
perror(filename);
|
439
|
+
exit(1);
|
440
|
+
}
|
441
|
+
|
442
|
+
for(;;) {
|
443
|
+
if (!get_line(line, sizeof(line), f))
|
444
|
+
break;
|
445
|
+
p = line;
|
446
|
+
while (isspace(*p))
|
447
|
+
p++;
|
448
|
+
if (*p == '#')
|
449
|
+
continue;
|
450
|
+
|
451
|
+
p = get_field(line, 0);
|
452
|
+
if (!p)
|
453
|
+
continue;
|
454
|
+
code = strtoul(p, NULL, 16);
|
455
|
+
assert(code <= CHARCODE_MAX);
|
456
|
+
ci = &tab[code];
|
457
|
+
|
458
|
+
p = get_field(line, 4);
|
459
|
+
if (p) {
|
460
|
+
/* locale dependent casing */
|
461
|
+
while (isspace(*p))
|
462
|
+
p++;
|
463
|
+
if (*p != '#' && *p != '\0')
|
464
|
+
continue;
|
465
|
+
}
|
466
|
+
|
467
|
+
|
468
|
+
p = get_field(line, 1);
|
469
|
+
if (p && *p != ';') {
|
470
|
+
ci->l_len = 0;
|
471
|
+
for(;;) {
|
472
|
+
while (isspace(*p))
|
473
|
+
p++;
|
474
|
+
if (*p == ';')
|
475
|
+
break;
|
476
|
+
assert(ci->l_len < CC_LEN_MAX);
|
477
|
+
ci->l_data[ci->l_len++] = strtoul(p, (char **)&p, 16);
|
478
|
+
}
|
479
|
+
|
480
|
+
if (ci->l_len == 1 && ci->l_data[0] == code)
|
481
|
+
ci->l_len = 0;
|
482
|
+
}
|
483
|
+
|
484
|
+
p = get_field(line, 3);
|
485
|
+
if (p && *p != ';') {
|
486
|
+
ci->u_len = 0;
|
487
|
+
for(;;) {
|
488
|
+
while (isspace(*p))
|
489
|
+
p++;
|
490
|
+
if (*p == ';')
|
491
|
+
break;
|
492
|
+
assert(ci->u_len < CC_LEN_MAX);
|
493
|
+
ci->u_data[ci->u_len++] = strtoul(p, (char **)&p, 16);
|
494
|
+
}
|
495
|
+
|
496
|
+
if (ci->u_len == 1 && ci->u_data[0] == code)
|
497
|
+
ci->u_len = 0;
|
498
|
+
}
|
499
|
+
}
|
500
|
+
|
501
|
+
fclose(f);
|
502
|
+
}
|
503
|
+
|
504
|
+
/*
 * Load the case folding mappings of a ';'-separated file
 * (CaseFolding.txt layout) into the f_len / f_data members of 'tab',
 * indexed by code point.
 *
 * Fields used (0-based): 0 code point, 1 status, 2 mapping. Only the
 * statuses 'C', 'S' and 'F' are used; lines with any other status are
 * skipped. An 'S' line replaces the mapping already stored for the same
 * code point.
 *
 * A file that cannot be opened terminates the program with exit status 1.
 */
void parse_case_folding(CCInfo *tab, const char *filename)
{
    FILE *f;
    char line[1024];
    const char *p;
    char *end;
    int code, status;
    CCInfo *ci;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }

    for(;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        /* the <ctype.h> functions need an unsigned char value */
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#')
            continue;

        p = get_field(line, 0);
        if (!p)
            continue;
        code = strtoul(p, NULL, 16);
        assert(code <= CHARCODE_MAX);
        ci = &tab[code];

        p = get_field(line, 1);
        if (!p)
            continue;
        /* the status is the first non-blank character of field 1 */
        while (isspace((unsigned char)*p))
            p++;
        status = *p;
        if (status != 'C' && status != 'S' && status != 'F')
            continue;

        p = get_field(line, 2);
        assert(p != NULL);
        if (status == 'S') {
            /* we always select the simple case folding and assume it
             * comes after the full case folding case */
            assert(ci->f_len >= 2);
            ci->f_len = 0;
        } else {
            assert(ci->f_len == 0);
        }
        for(;;) {
            while (isspace((unsigned char)*p))
                p++;
            if (*p == ';')
                break;
            /* guard the array that is written below: f_data, hence
               f_len (checking l_len here protected nothing) */
            assert(ci->f_len < CC_LEN_MAX);
            ci->f_data[ci->f_len++] = strtoul(p, &end, 16);
            p = end;
        }
    }

    fclose(f);
}
|
566
|
+
|
567
|
+
void parse_composition_exclusions(const char *filename)
|
568
|
+
{
|
569
|
+
FILE *f;
|
570
|
+
char line[4096], *p;
|
571
|
+
uint32_t c0;
|
572
|
+
|
573
|
+
f = fopen(filename, "rb");
|
574
|
+
if (!f) {
|
575
|
+
perror(filename);
|
576
|
+
exit(1);
|
577
|
+
}
|
578
|
+
|
579
|
+
for(;;) {
|
580
|
+
if (!get_line(line, sizeof(line), f))
|
581
|
+
break;
|
582
|
+
p = line;
|
583
|
+
while (isspace(*p))
|
584
|
+
p++;
|
585
|
+
if (*p == '#' || *p == '@' || *p == '\0')
|
586
|
+
continue;
|
587
|
+
c0 = strtoul(p, (char **)&p, 16);
|
588
|
+
assert(c0 > 0 && c0 <= CHARCODE_MAX);
|
589
|
+
unicode_db[c0].is_excluded = TRUE;
|
590
|
+
}
|
591
|
+
fclose(f);
|
592
|
+
}
|
593
|
+
|
594
|
+
void parse_derived_core_properties(const char *filename)
|
595
|
+
{
|
596
|
+
FILE *f;
|
597
|
+
char line[4096], *p, buf[256], *q;
|
598
|
+
uint32_t c0, c1, c;
|
599
|
+
int i;
|
600
|
+
|
601
|
+
f = fopen(filename, "rb");
|
602
|
+
if (!f) {
|
603
|
+
perror(filename);
|
604
|
+
exit(1);
|
605
|
+
}
|
606
|
+
|
607
|
+
for(;;) {
|
608
|
+
if (!get_line(line, sizeof(line), f))
|
609
|
+
break;
|
610
|
+
p = line;
|
611
|
+
while (isspace(*p))
|
612
|
+
p++;
|
613
|
+
if (*p == '#' || *p == '@' || *p == '\0')
|
614
|
+
continue;
|
615
|
+
c0 = strtoul(p, (char **)&p, 16);
|
616
|
+
if (*p == '.' && p[1] == '.') {
|
617
|
+
p += 2;
|
618
|
+
c1 = strtoul(p, (char **)&p, 16);
|
619
|
+
} else {
|
620
|
+
c1 = c0;
|
621
|
+
}
|
622
|
+
assert(c1 <= CHARCODE_MAX);
|
623
|
+
p += strspn(p, " \t");
|
624
|
+
if (*p == ';') {
|
625
|
+
p++;
|
626
|
+
p += strspn(p, " \t");
|
627
|
+
q = buf;
|
628
|
+
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
|
629
|
+
if ((q - buf) < sizeof(buf) - 1)
|
630
|
+
*q++ = *p;
|
631
|
+
p++;
|
632
|
+
}
|
633
|
+
*q = '\0';
|
634
|
+
i = find_name(unicode_prop_name,
|
635
|
+
countof(unicode_prop_name), buf);
|
636
|
+
if (i < 0) {
|
637
|
+
if (!strcmp(buf, "Grapheme_Link"))
|
638
|
+
goto next;
|
639
|
+
fprintf(stderr, "Property not found: %s\n", buf);
|
640
|
+
exit(1);
|
641
|
+
}
|
642
|
+
for(c = c0; c <= c1; c++) {
|
643
|
+
set_prop(c, i, 1);
|
644
|
+
}
|
645
|
+
next: ;
|
646
|
+
}
|
647
|
+
}
|
648
|
+
fclose(f);
|
649
|
+
}
|
650
|
+
|
651
|
+
void parse_derived_norm_properties(const char *filename)
|
652
|
+
{
|
653
|
+
FILE *f;
|
654
|
+
char line[4096], *p, buf[256], *q;
|
655
|
+
uint32_t c0, c1, c;
|
656
|
+
|
657
|
+
f = fopen(filename, "rb");
|
658
|
+
if (!f) {
|
659
|
+
perror(filename);
|
660
|
+
exit(1);
|
661
|
+
}
|
662
|
+
|
663
|
+
for(;;) {
|
664
|
+
if (!get_line(line, sizeof(line), f))
|
665
|
+
break;
|
666
|
+
p = line;
|
667
|
+
while (isspace(*p))
|
668
|
+
p++;
|
669
|
+
if (*p == '#' || *p == '@' || *p == '\0')
|
670
|
+
continue;
|
671
|
+
c0 = strtoul(p, (char **)&p, 16);
|
672
|
+
if (*p == '.' && p[1] == '.') {
|
673
|
+
p += 2;
|
674
|
+
c1 = strtoul(p, (char **)&p, 16);
|
675
|
+
} else {
|
676
|
+
c1 = c0;
|
677
|
+
}
|
678
|
+
assert(c1 <= CHARCODE_MAX);
|
679
|
+
p += strspn(p, " \t");
|
680
|
+
if (*p == ';') {
|
681
|
+
p++;
|
682
|
+
p += strspn(p, " \t");
|
683
|
+
q = buf;
|
684
|
+
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
|
685
|
+
if ((q - buf) < sizeof(buf) - 1)
|
686
|
+
*q++ = *p;
|
687
|
+
p++;
|
688
|
+
}
|
689
|
+
*q = '\0';
|
690
|
+
if (!strcmp(buf, "Changes_When_NFKC_Casefolded")) {
|
691
|
+
for(c = c0; c <= c1; c++) {
|
692
|
+
set_prop(c, PROP_Changes_When_NFKC_Casefolded, 1);
|
693
|
+
}
|
694
|
+
}
|
695
|
+
}
|
696
|
+
}
|
697
|
+
fclose(f);
|
698
|
+
}
|
699
|
+
|
700
|
+
void parse_prop_list(const char *filename)
|
701
|
+
{
|
702
|
+
FILE *f;
|
703
|
+
char line[4096], *p, buf[256], *q;
|
704
|
+
uint32_t c0, c1, c;
|
705
|
+
int i;
|
706
|
+
|
707
|
+
f = fopen(filename, "rb");
|
708
|
+
if (!f) {
|
709
|
+
perror(filename);
|
710
|
+
exit(1);
|
711
|
+
}
|
712
|
+
|
713
|
+
for(;;) {
|
714
|
+
if (!get_line(line, sizeof(line), f))
|
715
|
+
break;
|
716
|
+
p = line;
|
717
|
+
while (isspace(*p))
|
718
|
+
p++;
|
719
|
+
if (*p == '#' || *p == '@' || *p == '\0')
|
720
|
+
continue;
|
721
|
+
c0 = strtoul(p, (char **)&p, 16);
|
722
|
+
if (*p == '.' && p[1] == '.') {
|
723
|
+
p += 2;
|
724
|
+
c1 = strtoul(p, (char **)&p, 16);
|
725
|
+
} else {
|
726
|
+
c1 = c0;
|
727
|
+
}
|
728
|
+
assert(c1 <= CHARCODE_MAX);
|
729
|
+
p += strspn(p, " \t");
|
730
|
+
if (*p == ';') {
|
731
|
+
p++;
|
732
|
+
p += strspn(p, " \t");
|
733
|
+
q = buf;
|
734
|
+
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
|
735
|
+
if ((q - buf) < sizeof(buf) - 1)
|
736
|
+
*q++ = *p;
|
737
|
+
p++;
|
738
|
+
}
|
739
|
+
*q = '\0';
|
740
|
+
i = find_name(unicode_prop_name,
|
741
|
+
countof(unicode_prop_name), buf);
|
742
|
+
if (i < 0) {
|
743
|
+
fprintf(stderr, "Property not found: %s\n", buf);
|
744
|
+
exit(1);
|
745
|
+
}
|
746
|
+
for(c = c0; c <= c1; c++) {
|
747
|
+
set_prop(c, i, 1);
|
748
|
+
}
|
749
|
+
}
|
750
|
+
}
|
751
|
+
fclose(f);
|
752
|
+
}
|
753
|
+
|
754
|
+
void parse_scripts(const char *filename)
|
755
|
+
{
|
756
|
+
FILE *f;
|
757
|
+
char line[4096], *p, buf[256], *q;
|
758
|
+
uint32_t c0, c1, c;
|
759
|
+
int i;
|
760
|
+
|
761
|
+
f = fopen(filename, "rb");
|
762
|
+
if (!f) {
|
763
|
+
perror(filename);
|
764
|
+
exit(1);
|
765
|
+
}
|
766
|
+
|
767
|
+
for(;;) {
|
768
|
+
if (!get_line(line, sizeof(line), f))
|
769
|
+
break;
|
770
|
+
p = line;
|
771
|
+
while (isspace(*p))
|
772
|
+
p++;
|
773
|
+
if (*p == '#' || *p == '@' || *p == '\0')
|
774
|
+
continue;
|
775
|
+
c0 = strtoul(p, (char **)&p, 16);
|
776
|
+
if (*p == '.' && p[1] == '.') {
|
777
|
+
p += 2;
|
778
|
+
c1 = strtoul(p, (char **)&p, 16);
|
779
|
+
} else {
|
780
|
+
c1 = c0;
|
781
|
+
}
|
782
|
+
assert(c1 <= CHARCODE_MAX);
|
783
|
+
p += strspn(p, " \t");
|
784
|
+
if (*p == ';') {
|
785
|
+
p++;
|
786
|
+
p += strspn(p, " \t");
|
787
|
+
q = buf;
|
788
|
+
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
|
789
|
+
if ((q - buf) < sizeof(buf) - 1)
|
790
|
+
*q++ = *p;
|
791
|
+
p++;
|
792
|
+
}
|
793
|
+
*q = '\0';
|
794
|
+
i = find_name(unicode_script_name,
|
795
|
+
countof(unicode_script_name), buf);
|
796
|
+
if (i < 0) {
|
797
|
+
fprintf(stderr, "Unknown script: '%s'\n", buf);
|
798
|
+
exit(1);
|
799
|
+
}
|
800
|
+
for(c = c0; c <= c1; c++)
|
801
|
+
unicode_db[c].script = i;
|
802
|
+
}
|
803
|
+
}
|
804
|
+
fclose(f);
|
805
|
+
}
|
806
|
+
|
807
|
+
void parse_script_extensions(const char *filename)
|
808
|
+
{
|
809
|
+
FILE *f;
|
810
|
+
char line[4096], *p, buf[256], *q;
|
811
|
+
uint32_t c0, c1, c;
|
812
|
+
int i;
|
813
|
+
uint8_t script_ext[255];
|
814
|
+
int script_ext_len;
|
815
|
+
|
816
|
+
f = fopen(filename, "rb");
|
817
|
+
if (!f) {
|
818
|
+
perror(filename);
|
819
|
+
exit(1);
|
820
|
+
}
|
821
|
+
|
822
|
+
for(;;) {
|
823
|
+
if (!get_line(line, sizeof(line), f))
|
824
|
+
break;
|
825
|
+
p = line;
|
826
|
+
while (isspace(*p))
|
827
|
+
p++;
|
828
|
+
if (*p == '#' || *p == '@' || *p == '\0')
|
829
|
+
continue;
|
830
|
+
c0 = strtoul(p, (char **)&p, 16);
|
831
|
+
if (*p == '.' && p[1] == '.') {
|
832
|
+
p += 2;
|
833
|
+
c1 = strtoul(p, (char **)&p, 16);
|
834
|
+
} else {
|
835
|
+
c1 = c0;
|
836
|
+
}
|
837
|
+
assert(c1 <= CHARCODE_MAX);
|
838
|
+
p += strspn(p, " \t");
|
839
|
+
script_ext_len = 0;
|
840
|
+
if (*p == ';') {
|
841
|
+
p++;
|
842
|
+
for(;;) {
|
843
|
+
p += strspn(p, " \t");
|
844
|
+
q = buf;
|
845
|
+
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
|
846
|
+
if ((q - buf) < sizeof(buf) - 1)
|
847
|
+
*q++ = *p;
|
848
|
+
p++;
|
849
|
+
}
|
850
|
+
*q = '\0';
|
851
|
+
if (buf[0] == '\0')
|
852
|
+
break;
|
853
|
+
i = find_name(unicode_script_short_name,
|
854
|
+
countof(unicode_script_short_name), buf);
|
855
|
+
if (i < 0) {
|
856
|
+
fprintf(stderr, "Script not found: %s\n", buf);
|
857
|
+
exit(1);
|
858
|
+
}
|
859
|
+
assert(script_ext_len < sizeof(script_ext));
|
860
|
+
script_ext[script_ext_len++] = i;
|
861
|
+
}
|
862
|
+
for(c = c0; c <= c1; c++) {
|
863
|
+
CCInfo *ci = &unicode_db[c];
|
864
|
+
ci->script_ext_len = script_ext_len;
|
865
|
+
ci->script_ext = malloc(sizeof(ci->script_ext[0]) * script_ext_len);
|
866
|
+
for(i = 0; i < script_ext_len; i++)
|
867
|
+
ci->script_ext[i] = script_ext[i];
|
868
|
+
}
|
869
|
+
}
|
870
|
+
}
|
871
|
+
fclose(f);
|
872
|
+
}
|
873
|
+
|
874
|
+
/* Print 'tag' followed by the 'len' values of 'data' in hexadecimal;
   prints nothing when the mapping is empty. */
static void dump_cc_mapping(const char *tag, const int *data, int len)
{
    int k;

    if (len == 0)
        return;
    printf("%s", tag);
    for (k = 0; k < len; k++)
        printf(" %05x", data[k]);
}

/*
 * Print one line on stdout describing the case mappings of 'ci':
 * the code point 'i' in hexadecimal, then the non-empty upper case (U),
 * lower case (L) and case folding (F) mappings.
 */
void dump_cc_info(CCInfo *ci, int i)
{
    printf("%05x:", i);
    dump_cc_mapping(" U:", ci->u_data, ci->u_len);
    dump_cc_mapping(" L:", ci->l_data, ci->l_len);
    dump_cc_mapping(" F:", ci->f_data, ci->f_len);
    printf("\n");
}
|
895
|
+
|
896
|
+
/*
 * Print, with dump_cc_info(), every entry of 'tab' (code points 0 to
 * CHARCODE_MAX) that has at least one case mapping.
 */
void dump_unicode_data(CCInfo *tab)
{
    int code;

    for (code = 0; code <= CHARCODE_MAX; code++) {
        CCInfo *entry = tab + code;

        /* nothing to show for characters without any mapping */
        if (entry->u_len == 0 && entry->l_len == 0 && entry->f_len == 0)
            continue;
        dump_cc_info(entry, code);
    }
}
|
907
|
+
|
908
|
+
BOOL is_complicated_case(const CCInfo *ci)
|
909
|
+
{
|
910
|
+
return (ci->u_len > 1 || ci->l_len > 1 ||
|
911
|
+
(ci->u_len > 0 && ci->l_len > 0) ||
|
912
|
+
(ci->f_len != ci->l_len) ||
|
913
|
+
(memcmp(ci->f_data, ci->l_data, ci->f_len * sizeof(ci->f_data[0])) != 0));
|
914
|
+
}
|
915
|
+
|
916
|
+
/* Case conversion run types. The list is expanded twice below: into the
   RUN_TYPE_xxx enumerators and into their printable names, so that
   run_type_str[RUN_TYPE_xxx] is always "xxx". The order of the entries
   fixes the numeric values. */
#define RUN_TYPE_LIST(X) \
    X(U)                 \
    X(L)                 \
    X(UF)                \
    X(LF)                \
    X(UL)                \
    X(LSU)               \
    X(U2L_399_EXT2)      \
    X(UF_D20)            \
    X(UF_D1_EXT)         \
    X(U_EXT)             \
    X(LF_EXT)            \
    X(UF_EXT2)           \
    X(LF_EXT2)           \
    X(UF_EXT3)

/* the enumeration is only declared here when USE_TEST is not defined */
#ifndef USE_TEST
enum {
#define X(id) RUN_TYPE_ ## id,
    RUN_TYPE_LIST(X)
#undef X
};
#endif

/* printable name of each run type, indexed by RUN_TYPE_xxx */
const char *run_type_str[] = {
#define X(id) #id,
    RUN_TYPE_LIST(X)
#undef X
};
|
951
|
+
|
952
|
+
/* One run of the case conversion table, as filled in by find_run_type(). */
typedef struct {
    int code;        /* first code point of the run */
    int len;         /* number of consecutive code points in the run */
    int type;        /* RUN_TYPE_xxx */
    int data;        /* run payload; its meaning depends on 'type' */
    int ext_len;     /* NOTE(review): presumably the number of valid
                        entries in ext_data - confirm against the users */
    int ext_data[3];
    int data_index;  /* 'data' coming from the table */
} TableEntry;
|
961
|
+
|
962
|
+
/* Return the lower case mapping of 'c' when 'tab' records exactly one
   lower case character for it, otherwise 'c' itself. */
static int simple_to_lower(CCInfo *tab, int c)
{
    const CCInfo *ci = &tab[c];

    return (ci->l_len == 1) ? ci->l_data[0] : c;
}
|
968
|
+
|
969
|
+
/* code (17), len (7), type (4) */
|
970
|
+
|
971
|
+
void find_run_type(TableEntry *te, CCInfo *tab, int code)
|
972
|
+
{
|
973
|
+
int is_lower, len;
|
974
|
+
CCInfo *ci, *ci1, *ci2;
|
975
|
+
|
976
|
+
ci = &tab[code];
|
977
|
+
ci1 = &tab[code + 1];
|
978
|
+
ci2 = &tab[code + 2];
|
979
|
+
te->code = code;
|
980
|
+
|
981
|
+
if (ci->l_len == 1 && ci->l_data[0] == code + 2 &&
|
982
|
+
ci->f_len == 1 && ci->f_data[0] == ci->l_data[0] &&
|
983
|
+
ci->u_len == 0 &&
|
984
|
+
|
985
|
+
ci1->l_len == 1 && ci1->l_data[0] == code + 2 &&
|
986
|
+
ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0] &&
|
987
|
+
ci1->u_len == 1 && ci1->u_data[0] == code &&
|
988
|
+
|
989
|
+
ci2->l_len == 0 &&
|
990
|
+
ci2->f_len == 0 &&
|
991
|
+
ci2->u_len == 1 && ci2->u_data[0] == code) {
|
992
|
+
te->len = 3;
|
993
|
+
te->data = 0;
|
994
|
+
te->type = RUN_TYPE_LSU;
|
995
|
+
return;
|
996
|
+
}
|
997
|
+
|
998
|
+
if (is_complicated_case(ci)) {
|
999
|
+
len = 1;
|
1000
|
+
while (code + len <= CHARCODE_MAX) {
|
1001
|
+
ci1 = &tab[code + len];
|
1002
|
+
if (ci1->u_len != 1 ||
|
1003
|
+
ci1->u_data[0] != ci->u_data[0] + len ||
|
1004
|
+
ci1->l_len != 0 ||
|
1005
|
+
ci1->f_len != 1 || ci1->f_data[0] != ci1->u_data[0])
|
1006
|
+
break;
|
1007
|
+
len++;
|
1008
|
+
}
|
1009
|
+
if (len > 1) {
|
1010
|
+
te->len = len;
|
1011
|
+
te->type = RUN_TYPE_UF;
|
1012
|
+
te->data = ci->u_data[0];
|
1013
|
+
return;
|
1014
|
+
}
|
1015
|
+
|
1016
|
+
if (ci->l_len == 0 &&
|
1017
|
+
ci->u_len == 2 && ci->u_data[1] == 0x399 &&
|
1018
|
+
ci->f_len == 2 && ci->f_data[1] == 0x3B9 &&
|
1019
|
+
ci->f_data[0] == simple_to_lower(tab, ci->u_data[0])) {
|
1020
|
+
len = 1;
|
1021
|
+
while (code + len <= CHARCODE_MAX) {
|
1022
|
+
ci1 = &tab[code + len];
|
1023
|
+
if (!(ci1->u_len == 2 &&
|
1024
|
+
ci1->u_data[1] == ci->u_data[1] &&
|
1025
|
+
ci1->u_data[0] == ci->u_data[0] + len &&
|
1026
|
+
ci1->f_len == 2 &&
|
1027
|
+
ci1->f_data[1] == ci->f_data[1] &&
|
1028
|
+
ci1->f_data[0] == ci->f_data[0] + len &&
|
1029
|
+
ci1->l_len == 0))
|
1030
|
+
break;
|
1031
|
+
len++;
|
1032
|
+
}
|
1033
|
+
te->len = len;
|
1034
|
+
te->type = RUN_TYPE_UF_EXT2;
|
1035
|
+
te->ext_data[0] = ci->u_data[0];
|
1036
|
+
te->ext_data[1] = ci->u_data[1];
|
1037
|
+
te->ext_len = 2;
|
1038
|
+
return;
|
1039
|
+
}
|
1040
|
+
|
1041
|
+
if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
|
1042
|
+
ci->l_len == 1 &&
|
1043
|
+
ci->f_len == 1 && ci->f_data[0] == ci->l_data[0]) {
|
1044
|
+
len = 1;
|
1045
|
+
while (code + len <= CHARCODE_MAX) {
|
1046
|
+
ci1 = &tab[code + len];
|
1047
|
+
if (!(ci1->u_len == 2 &&
|
1048
|
+
ci1->u_data[1] == 0x399 &&
|
1049
|
+
ci1->u_data[0] == ci->u_data[0] + len &&
|
1050
|
+
ci1->l_len == 1 &&
|
1051
|
+
ci1->l_data[0] == ci->l_data[0] + len &&
|
1052
|
+
ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0]))
|
1053
|
+
break;
|
1054
|
+
len++;
|
1055
|
+
}
|
1056
|
+
te->len = len;
|
1057
|
+
te->type = RUN_TYPE_U2L_399_EXT2;
|
1058
|
+
te->ext_data[0] = ci->u_data[0];
|
1059
|
+
te->ext_data[1] = ci->l_data[0];
|
1060
|
+
te->ext_len = 2;
|
1061
|
+
return;
|
1062
|
+
}
|
1063
|
+
|
1064
|
+
if (ci->l_len == 1 && ci->u_len == 0 && ci->f_len == 0) {
|
1065
|
+
len = 1;
|
1066
|
+
while (code + len <= CHARCODE_MAX) {
|
1067
|
+
ci1 = &tab[code + len];
|
1068
|
+
if (!(ci1->l_len == 1 &&
|
1069
|
+
ci1->l_data[0] == ci->l_data[0] + len &&
|
1070
|
+
ci1->u_len == 0 && ci1->f_len == 0))
|
1071
|
+
break;
|
1072
|
+
len++;
|
1073
|
+
}
|
1074
|
+
te->len = len;
|
1075
|
+
te->type = RUN_TYPE_L;
|
1076
|
+
te->data = ci->l_data[0];
|
1077
|
+
return;
|
1078
|
+
}
|
1079
|
+
|
1080
|
+
if (ci->l_len == 0 &&
|
1081
|
+
ci->u_len == 1 &&
|
1082
|
+
ci->u_data[0] < 0x1000 &&
|
1083
|
+
ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 0x20) {
|
1084
|
+
te->len = 1;
|
1085
|
+
te->type = RUN_TYPE_UF_D20;
|
1086
|
+
te->data = ci->u_data[0];
|
1087
|
+
} else if (ci->l_len == 0 &&
|
1088
|
+
ci->u_len == 1 &&
|
1089
|
+
ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 1) {
|
1090
|
+
te->len = 1;
|
1091
|
+
te->type = RUN_TYPE_UF_D1_EXT;
|
1092
|
+
te->ext_data[0] = ci->u_data[0];
|
1093
|
+
te->ext_len = 1;
|
1094
|
+
} else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_len == 2 &&
|
1095
|
+
ci->l_data[0] == ci->f_data[0] &&
|
1096
|
+
ci->l_data[1] == ci->f_data[1]) {
|
1097
|
+
te->len = 1;
|
1098
|
+
te->type = RUN_TYPE_LF_EXT2;
|
1099
|
+
te->ext_data[0] = ci->l_data[0];
|
1100
|
+
te->ext_data[1] = ci->l_data[1];
|
1101
|
+
te->ext_len = 2;
|
1102
|
+
} else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_len == 2 &&
|
1103
|
+
ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) &&
|
1104
|
+
ci->f_data[1] == simple_to_lower(tab, ci->u_data[1])) {
|
1105
|
+
te->len = 1;
|
1106
|
+
te->type = RUN_TYPE_UF_EXT2;
|
1107
|
+
te->ext_data[0] = ci->u_data[0];
|
1108
|
+
te->ext_data[1] = ci->u_data[1];
|
1109
|
+
te->ext_len = 2;
|
1110
|
+
} else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_len == 3 &&
|
1111
|
+
ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) &&
|
1112
|
+
ci->f_data[1] == simple_to_lower(tab, ci->u_data[1]) &&
|
1113
|
+
ci->f_data[2] == simple_to_lower(tab, ci->u_data[2])) {
|
1114
|
+
te->len = 1;
|
1115
|
+
te->type = RUN_TYPE_UF_EXT3;
|
1116
|
+
te->ext_data[0] = ci->u_data[0];
|
1117
|
+
te->ext_data[1] = ci->u_data[1];
|
1118
|
+
te->ext_data[2] = ci->u_data[2];
|
1119
|
+
te->ext_len = 3;
|
1120
|
+
} else {
|
1121
|
+
printf("unsupported encoding case:\n");
|
1122
|
+
dump_cc_info(ci, code);
|
1123
|
+
abort();
|
1124
|
+
}
|
1125
|
+
} else {
|
1126
|
+
/* look for a run of identical conversions */
|
1127
|
+
len = 0;
|
1128
|
+
for(;;) {
|
1129
|
+
if (code >= CHARCODE_MAX || len >= 126)
|
1130
|
+
break;
|
1131
|
+
ci = &tab[code + len];
|
1132
|
+
ci1 = &tab[code + len + 1];
|
1133
|
+
if (is_complicated_case(ci) || is_complicated_case(ci1)) {
|
1134
|
+
break;
|
1135
|
+
}
|
1136
|
+
if (ci->l_len != 1 || ci->l_data[0] != code + len + 1)
|
1137
|
+
break;
|
1138
|
+
if (ci1->u_len != 1 || ci1->u_data[0] != code + len)
|
1139
|
+
break;
|
1140
|
+
len += 2;
|
1141
|
+
}
|
1142
|
+
if (len > 0) {
|
1143
|
+
te->len = len;
|
1144
|
+
te->type = RUN_TYPE_UL;
|
1145
|
+
te->data = 0;
|
1146
|
+
return;
|
1147
|
+
}
|
1148
|
+
|
1149
|
+
ci = &tab[code];
|
1150
|
+
is_lower = ci->l_len > 0;
|
1151
|
+
len = 1;
|
1152
|
+
while (code + len <= CHARCODE_MAX) {
|
1153
|
+
ci1 = &tab[code + len];
|
1154
|
+
if (is_complicated_case(ci1))
|
1155
|
+
break;
|
1156
|
+
if (is_lower) {
|
1157
|
+
if (ci1->l_len != 1 ||
|
1158
|
+
ci1->l_data[0] != ci->l_data[0] + len)
|
1159
|
+
break;
|
1160
|
+
} else {
|
1161
|
+
if (ci1->u_len != 1 ||
|
1162
|
+
ci1->u_data[0] != ci->u_data[0] + len)
|
1163
|
+
break;
|
1164
|
+
}
|
1165
|
+
len++;
|
1166
|
+
}
|
1167
|
+
te->len = len;
|
1168
|
+
if (is_lower) {
|
1169
|
+
te->type = RUN_TYPE_LF;
|
1170
|
+
te->data = ci->l_data[0];
|
1171
|
+
} else {
|
1172
|
+
te->type = RUN_TYPE_U;
|
1173
|
+
te->data = ci->u_data[0];
|
1174
|
+
}
|
1175
|
+
}
|
1176
|
+
}
|
1177
|
+
|
1178
|
+
TableEntry conv_table[1000];
|
1179
|
+
int conv_table_len;
|
1180
|
+
int ext_data[1000];
|
1181
|
+
int ext_data_len;
|
1182
|
+
|
1183
|
+
void dump_case_conv_table1(void)
|
1184
|
+
{
|
1185
|
+
int i, j;
|
1186
|
+
const TableEntry *te;
|
1187
|
+
|
1188
|
+
for(i = 0; i < conv_table_len; i++) {
|
1189
|
+
te = &conv_table[i];
|
1190
|
+
printf("%05x %02x %-10s %05x",
|
1191
|
+
te->code, te->len, run_type_str[te->type], te->data);
|
1192
|
+
for(j = 0; j < te->ext_len; j++) {
|
1193
|
+
printf(" %05x", te->ext_data[j]);
|
1194
|
+
}
|
1195
|
+
printf("\n");
|
1196
|
+
}
|
1197
|
+
printf("table_len=%d ext_len=%d\n", conv_table_len, ext_data_len);
|
1198
|
+
}
|
1199
|
+
|
1200
|
+
int find_data_index(const TableEntry *conv_table, int len, int data)
|
1201
|
+
{
|
1202
|
+
int i;
|
1203
|
+
const TableEntry *te;
|
1204
|
+
for(i = 0; i < len; i++) {
|
1205
|
+
te = &conv_table[i];
|
1206
|
+
if (te->code == data)
|
1207
|
+
return i;
|
1208
|
+
}
|
1209
|
+
return -1;
|
1210
|
+
}
|
1211
|
+
|
1212
|
+
int find_ext_data_index(int data)
|
1213
|
+
{
|
1214
|
+
int i;
|
1215
|
+
for(i = 0; i < ext_data_len; i++) {
|
1216
|
+
if (ext_data[i] == data)
|
1217
|
+
return i;
|
1218
|
+
}
|
1219
|
+
assert(ext_data_len < countof(ext_data));
|
1220
|
+
ext_data[ext_data_len++] = data;
|
1221
|
+
return ext_data_len - 1;
|
1222
|
+
}
|
1223
|
+
|
1224
|
+
/* Build conv_table and ext_data from the per code point case data in
   'tab'.

   1) the code point range is split into runs with find_run_type();
   2) for the U, L, UF and LF runs, 'data' is replaced by the index of
      the run starting at that code point; U and LF runs without such a
      run are turned into their _EXT variant;
   3) the extension table indexes are allocated, first for the 3 code
      point entries (4 bits each), then for the 2 code point entries
      (6 bits each), then for the single code point entries.  This
      order decides which code points get the small indexes. */
void build_conv_table(CCInfo *tab)
{
    int code, i, j;
    CCInfo *ci;
    TableEntry *te;

    te = conv_table;
    code = 0;
    while (code <= CHARCODE_MAX) {
        ci = &tab[code];
        if (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0) {
            assert(te - conv_table < countof(conv_table));
            find_run_type(te, tab, code);
#if 0
            if (te->type == RUN_TYPE_TODO) {
                printf("TODO: ");
                dump_cc_info(ci, code);
            }
#endif
            assert(te->len <= 127);
            /* skip the code points covered by the run */
            code += te->len - 1;
            te++;
        }
        code++;
    }
    conv_table_len = te - conv_table;

    /* find the data index */
    for (i = 0; i < conv_table_len; i++) {
        int data_index;

        te = &conv_table[i];
        if (te->type == RUN_TYPE_UF_D20) {
            te->data_index = te->data;
            continue;
        }
        if (te->type != RUN_TYPE_U && te->type != RUN_TYPE_L &&
            te->type != RUN_TYPE_UF && te->type != RUN_TYPE_LF)
            continue;

        data_index = find_data_index(conv_table, conv_table_len, te->data);
        if (data_index >= 0) {
            te->data_index = data_index;
        } else if (te->type == RUN_TYPE_U) {
            te->type = RUN_TYPE_U_EXT;
            te->ext_len = 1;
            te->ext_data[0] = te->data;
        } else if (te->type == RUN_TYPE_LF) {
            te->type = RUN_TYPE_LF_EXT;
            te->ext_len = 1;
            te->ext_data[0] = te->data;
        } else {
            printf("%05x: index not found\n", te->code);
            exit(1);
        }
    }

    /* find the data index for ext_data */
    for (i = 0; i < conv_table_len; i++) {
        int p, v;

        te = &conv_table[i];
        if (te->type != RUN_TYPE_UF_EXT3)
            continue;
        v = 0;
        for (j = 0; j < 3; j++) {
            p = find_ext_data_index(te->ext_data[j]);
            assert(p < 16);
            v = (v << 4) | p;
        }
        te->data_index = v;
    }

    for (i = 0; i < conv_table_len; i++) {
        int p, v;

        te = &conv_table[i];
        if (te->type != RUN_TYPE_LF_EXT2 &&
            te->type != RUN_TYPE_UF_EXT2 &&
            te->type != RUN_TYPE_U2L_399_EXT2)
            continue;
        v = 0;
        for (j = 0; j < 2; j++) {
            p = find_ext_data_index(te->ext_data[j]);
            assert(p < 64);
            v = (v << 6) | p;
        }
        te->data_index = v;
    }

    for (i = 0; i < conv_table_len; i++) {
        te = &conv_table[i];
        if (te->type != RUN_TYPE_UF_D1_EXT &&
            te->type != RUN_TYPE_U_EXT &&
            te->type != RUN_TYPE_LF_EXT)
            continue;
        te->data_index = find_ext_data_index(te->ext_data[0]);
    }
#ifdef DUMP_CASE_CONV_TABLE
    dump_case_conv_table1();
#endif
}
|
1329
|
+
|
1330
|
+
void dump_case_conv_table(FILE *f)
|
1331
|
+
{
|
1332
|
+
int i;
|
1333
|
+
uint32_t v;
|
1334
|
+
const TableEntry *te;
|
1335
|
+
|
1336
|
+
total_tables++;
|
1337
|
+
total_table_bytes += conv_table_len * sizeof(uint32_t);
|
1338
|
+
fprintf(f, "static const uint32_t case_conv_table1[%d] = {", conv_table_len);
|
1339
|
+
for(i = 0; i < conv_table_len; i++) {
|
1340
|
+
if (i % 4 == 0)
|
1341
|
+
fprintf(f, "\n ");
|
1342
|
+
te = &conv_table[i];
|
1343
|
+
v = te->code << (32 - 17);
|
1344
|
+
v |= te->len << (32 - 17 - 7);
|
1345
|
+
v |= te->type << (32 - 17 - 7 - 4);
|
1346
|
+
v |= te->data_index >> 8;
|
1347
|
+
fprintf(f, " 0x%08x,", v);
|
1348
|
+
}
|
1349
|
+
fprintf(f, "\n};\n\n");
|
1350
|
+
|
1351
|
+
total_tables++;
|
1352
|
+
total_table_bytes += conv_table_len;
|
1353
|
+
fprintf(f, "static const uint8_t case_conv_table2[%d] = {", conv_table_len);
|
1354
|
+
for(i = 0; i < conv_table_len; i++) {
|
1355
|
+
if (i % 8 == 0)
|
1356
|
+
fprintf(f, "\n ");
|
1357
|
+
te = &conv_table[i];
|
1358
|
+
fprintf(f, " 0x%02x,", te->data_index & 0xff);
|
1359
|
+
}
|
1360
|
+
fprintf(f, "\n};\n\n");
|
1361
|
+
|
1362
|
+
total_tables++;
|
1363
|
+
total_table_bytes += ext_data_len * sizeof(uint16_t);
|
1364
|
+
fprintf(f, "static const uint16_t case_conv_ext[%d] = {", ext_data_len);
|
1365
|
+
for(i = 0; i < ext_data_len; i++) {
|
1366
|
+
if (i % 8 == 0)
|
1367
|
+
fprintf(f, "\n ");
|
1368
|
+
fprintf(f, " 0x%04x,", ext_data[i]);
|
1369
|
+
}
|
1370
|
+
fprintf(f, "\n};\n\n");
|
1371
|
+
}
|
1372
|
+
|
1373
|
+
|
1374
|
+
static CCInfo *global_tab;
|
1375
|
+
|
1376
|
+
static int sp_cc_cmp(const void *p1, const void *p2)
|
1377
|
+
{
|
1378
|
+
CCInfo *c1 = &global_tab[*(const int *)p1];
|
1379
|
+
CCInfo *c2 = &global_tab[*(const int *)p2];
|
1380
|
+
if (c1->f_len < c2->f_len) {
|
1381
|
+
return -1;
|
1382
|
+
} else if (c2->f_len < c1->f_len) {
|
1383
|
+
return 1;
|
1384
|
+
} else {
|
1385
|
+
return memcmp(c1->f_data, c2->f_data, sizeof(c1->f_data[0]) * c1->f_len);
|
1386
|
+
}
|
1387
|
+
}
|
1388
|
+
|
1389
|
+
/* dump the case special cases (multi character results which are
|
1390
|
+
identical and need specific handling in lre_canonicalize() */
|
1391
|
+
void dump_case_folding_special_cases(CCInfo *tab)
|
1392
|
+
{
|
1393
|
+
int i, len, j;
|
1394
|
+
int *perm;
|
1395
|
+
|
1396
|
+
perm = malloc(sizeof(perm[0]) * (CHARCODE_MAX + 1));
|
1397
|
+
for(i = 0; i <= CHARCODE_MAX; i++)
|
1398
|
+
perm[i] = i;
|
1399
|
+
global_tab = tab;
|
1400
|
+
qsort(perm, CHARCODE_MAX + 1, sizeof(perm[0]), sp_cc_cmp);
|
1401
|
+
for(i = 0; i <= CHARCODE_MAX;) {
|
1402
|
+
if (tab[perm[i]].f_len <= 1) {
|
1403
|
+
i++;
|
1404
|
+
} else {
|
1405
|
+
len = 1;
|
1406
|
+
while ((i + len) <= CHARCODE_MAX && !sp_cc_cmp(&perm[i], &perm[i + len]))
|
1407
|
+
len++;
|
1408
|
+
|
1409
|
+
if (len > 1) {
|
1410
|
+
for(j = i; j < i + len; j++)
|
1411
|
+
dump_cc_info(&tab[perm[j]], perm[j]);
|
1412
|
+
}
|
1413
|
+
i += len;
|
1414
|
+
}
|
1415
|
+
}
|
1416
|
+
free(perm);
|
1417
|
+
global_tab = NULL;
|
1418
|
+
}
|
1419
|
+
|
1420
|
+
|
1421
|
+
/* Compare the first 'n' integers of 'tab1' and 'tab2'.  Return 0 when
   they are all equal and -1 otherwise. */
int tabcmp(const int *tab1, const int *tab2, int n)
{
    while (n-- > 0) {
        if (*tab1++ != *tab2++)
            return -1;
    }
    return 0;
}
|
1430
|
+
|
1431
|
+
/* Print "<str>=" followed by the 'len' values of 'buf' in hexadecimal
   and a newline on the standard output. */
void dump_str(const char *str, const int *buf, int len)
{
    const int *p = buf;

    printf("%s=", str);
    while (len-- > 0)
        printf(" %05x", *p++);
    printf("\n");
}
|
1439
|
+
|
1440
|
+
void compute_internal_props(void)
|
1441
|
+
{
|
1442
|
+
int i;
|
1443
|
+
BOOL has_ul;
|
1444
|
+
|
1445
|
+
for(i = 0; i <= CHARCODE_MAX; i++) {
|
1446
|
+
CCInfo *ci = &unicode_db[i];
|
1447
|
+
has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0);
|
1448
|
+
if (has_ul) {
|
1449
|
+
assert(get_prop(i, PROP_Cased));
|
1450
|
+
} else {
|
1451
|
+
set_prop(i, PROP_Cased1, get_prop(i, PROP_Cased));
|
1452
|
+
}
|
1453
|
+
set_prop(i, PROP_ID_Continue1,
|
1454
|
+
get_prop(i, PROP_ID_Continue) & (get_prop(i, PROP_ID_Start) ^ 1));
|
1455
|
+
set_prop(i, PROP_XID_Start1,
|
1456
|
+
get_prop(i, PROP_ID_Start) ^ get_prop(i, PROP_XID_Start));
|
1457
|
+
set_prop(i, PROP_XID_Continue1,
|
1458
|
+
get_prop(i, PROP_ID_Continue) ^ get_prop(i, PROP_XID_Continue));
|
1459
|
+
set_prop(i, PROP_Changes_When_Titlecased1,
|
1460
|
+
get_prop(i, PROP_Changes_When_Titlecased) ^ (ci->u_len != 0));
|
1461
|
+
set_prop(i, PROP_Changes_When_Casefolded1,
|
1462
|
+
get_prop(i, PROP_Changes_When_Casefolded) ^ (ci->f_len != 0));
|
1463
|
+
/* XXX: reduce table size (438 bytes) */
|
1464
|
+
set_prop(i, PROP_Changes_When_NFKC_Casefolded1,
|
1465
|
+
get_prop(i, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_len != 0));
|
1466
|
+
#if 0
|
1467
|
+
/* TEST */
|
1468
|
+
#define M(x) (1U << GCAT_ ## x)
|
1469
|
+
{
|
1470
|
+
int b;
|
1471
|
+
b = ((M(Mn) | M(Cf) | M(Lm) | M(Sk)) >>
|
1472
|
+
unicode_db[i].general_category) & 1;
|
1473
|
+
set_prop(i, PROP_Cased1,
|
1474
|
+
get_prop(i, PROP_Case_Ignorable) ^ b);
|
1475
|
+
}
|
1476
|
+
#undef M
|
1477
|
+
#endif
|
1478
|
+
}
|
1479
|
+
}
|
1480
|
+
|
1481
|
+
void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len)
|
1482
|
+
{
|
1483
|
+
int i;
|
1484
|
+
|
1485
|
+
total_tables++;
|
1486
|
+
total_table_bytes += len;
|
1487
|
+
fprintf(f, "static const uint8_t %s[%d] = {", cname, len);
|
1488
|
+
for(i = 0; i < len; i++) {
|
1489
|
+
if (i % 8 == 0)
|
1490
|
+
fprintf(f, "\n ");
|
1491
|
+
fprintf(f, " 0x%02x,", tab[i]);
|
1492
|
+
}
|
1493
|
+
fprintf(f, "\n};\n\n");
|
1494
|
+
}
|
1495
|
+
|
1496
|
+
void dump_index_table(FILE *f, const char *cname, const uint8_t *tab, int len)
|
1497
|
+
{
|
1498
|
+
int i, code, offset;
|
1499
|
+
|
1500
|
+
total_index++;
|
1501
|
+
total_index_bytes += len;
|
1502
|
+
fprintf(f, "static const uint8_t %s[%d] = {\n", cname, len);
|
1503
|
+
for(i = 0; i < len; i += 3) {
|
1504
|
+
code = tab[i] + (tab[i+1] << 8) + ((tab[i+2] & 0x1f) << 16);
|
1505
|
+
offset = ((i / 3) + 1) * 32 + (tab[i+2] >> 5);
|
1506
|
+
fprintf(f, " 0x%02x, 0x%02x, 0x%02x,", tab[i], tab[i+1], tab[i+2]);
|
1507
|
+
fprintf(f, " // %6.5X at %d%s\n", code, offset,
|
1508
|
+
i == len - 3 ? " (upper bound)" : "");
|
1509
|
+
}
|
1510
|
+
fprintf(f, "};\n\n");
|
1511
|
+
}
|
1512
|
+
|
1513
|
+
#define PROP_BLOCK_LEN 32
|
1514
|
+
|
1515
|
+
void build_prop_table(FILE *f, const char *name, int prop_index, BOOL add_index)
|
1516
|
+
{
|
1517
|
+
int i, j, n, v, offset, code;
|
1518
|
+
DynBuf dbuf_s, *dbuf = &dbuf_s;
|
1519
|
+
DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
|
1520
|
+
DynBuf dbuf2_s, *dbuf2 = &dbuf2_s;
|
1521
|
+
const uint32_t *buf;
|
1522
|
+
int buf_len, block_end_pos, bit;
|
1523
|
+
char cname[128];
|
1524
|
+
|
1525
|
+
dbuf_init(dbuf1);
|
1526
|
+
|
1527
|
+
for(i = 0; i <= CHARCODE_MAX;) {
|
1528
|
+
v = get_prop(i, prop_index);
|
1529
|
+
j = i + 1;
|
1530
|
+
while (j <= CHARCODE_MAX && get_prop(j, prop_index) == v) {
|
1531
|
+
j++;
|
1532
|
+
}
|
1533
|
+
n = j - i;
|
1534
|
+
if (j == (CHARCODE_MAX + 1) && v == 0)
|
1535
|
+
break; /* no need to encode last zero run */
|
1536
|
+
//printf("%05x: %d %d\n", i, n, v);
|
1537
|
+
dbuf_put_u32(dbuf1, n - 1);
|
1538
|
+
i += n;
|
1539
|
+
}
|
1540
|
+
|
1541
|
+
dbuf_init(dbuf);
|
1542
|
+
dbuf_init(dbuf2);
|
1543
|
+
buf = (uint32_t *)dbuf1->buf;
|
1544
|
+
buf_len = dbuf1->size / sizeof(buf[0]);
|
1545
|
+
|
1546
|
+
/* the first value is assumed to be 0 */
|
1547
|
+
assert(get_prop(0, prop_index) == 0);
|
1548
|
+
|
1549
|
+
block_end_pos = PROP_BLOCK_LEN;
|
1550
|
+
i = 0;
|
1551
|
+
code = 0;
|
1552
|
+
bit = 0;
|
1553
|
+
while (i < buf_len) {
|
1554
|
+
if (add_index && dbuf->size >= block_end_pos && bit == 0) {
|
1555
|
+
offset = (dbuf->size - block_end_pos);
|
1556
|
+
/* XXX: offset could be larger in case of runs of small
|
1557
|
+
lengths. Could add code to change the encoding to
|
1558
|
+
prevent it at the expense of one byte loss */
|
1559
|
+
assert(offset <= 7);
|
1560
|
+
v = code | (offset << 21);
|
1561
|
+
dbuf_putc(dbuf2, v);
|
1562
|
+
dbuf_putc(dbuf2, v >> 8);
|
1563
|
+
dbuf_putc(dbuf2, v >> 16);
|
1564
|
+
block_end_pos += PROP_BLOCK_LEN;
|
1565
|
+
}
|
1566
|
+
|
1567
|
+
/* Compressed byte encoding:
|
1568
|
+
00..3F: 2 packed lengths: 3-bit + 3-bit
|
1569
|
+
40..5F: 5-bits plus extra byte for length
|
1570
|
+
60..7F: 5-bits plus 2 extra bytes for length
|
1571
|
+
80..FF: 7-bit length
|
1572
|
+
lengths must be incremented to get character count
|
1573
|
+
Ranges alternate between false and true return value.
|
1574
|
+
*/
|
1575
|
+
v = buf[i];
|
1576
|
+
code += v + 1;
|
1577
|
+
bit ^= 1;
|
1578
|
+
if (v < 8 && (i + 1) < buf_len && buf[i + 1] < 8) {
|
1579
|
+
code += buf[i + 1] + 1;
|
1580
|
+
bit ^= 1;
|
1581
|
+
dbuf_putc(dbuf, (v << 3) | buf[i + 1]);
|
1582
|
+
i += 2;
|
1583
|
+
} else if (v < 128) {
|
1584
|
+
dbuf_putc(dbuf, 0x80 + v);
|
1585
|
+
i++;
|
1586
|
+
} else if (v < (1 << 13)) {
|
1587
|
+
dbuf_putc(dbuf, 0x40 + (v >> 8));
|
1588
|
+
dbuf_putc(dbuf, v);
|
1589
|
+
i++;
|
1590
|
+
} else {
|
1591
|
+
assert(v < (1 << 21));
|
1592
|
+
dbuf_putc(dbuf, 0x60 + (v >> 16));
|
1593
|
+
dbuf_putc(dbuf, v >> 8);
|
1594
|
+
dbuf_putc(dbuf, v);
|
1595
|
+
i++;
|
1596
|
+
}
|
1597
|
+
}
|
1598
|
+
|
1599
|
+
if (add_index) {
|
1600
|
+
/* last index entry */
|
1601
|
+
v = code;
|
1602
|
+
dbuf_putc(dbuf2, v);
|
1603
|
+
dbuf_putc(dbuf2, v >> 8);
|
1604
|
+
dbuf_putc(dbuf2, v >> 16);
|
1605
|
+
}
|
1606
|
+
|
1607
|
+
#ifdef DUMP_TABLE_SIZE
|
1608
|
+
printf("prop %s: length=%d bytes\n", unicode_prop_name[prop_index],
|
1609
|
+
(int)(dbuf->size + dbuf2->size));
|
1610
|
+
#endif
|
1611
|
+
snprintf(cname, sizeof(cname), "unicode_prop_%s_table", unicode_prop_name[prop_index]);
|
1612
|
+
dump_byte_table(f, cname, dbuf->buf, dbuf->size);
|
1613
|
+
if (add_index) {
|
1614
|
+
snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]);
|
1615
|
+
dump_index_table(f, cname, dbuf2->buf, dbuf2->size);
|
1616
|
+
}
|
1617
|
+
|
1618
|
+
dbuf_free(dbuf);
|
1619
|
+
dbuf_free(dbuf1);
|
1620
|
+
dbuf_free(dbuf2);
|
1621
|
+
}
|
1622
|
+
|
1623
|
+
void build_flags_tables(FILE *f)
|
1624
|
+
{
|
1625
|
+
build_prop_table(f, "Cased1", PROP_Cased1, TRUE);
|
1626
|
+
build_prop_table(f, "Case_Ignorable", PROP_Case_Ignorable, TRUE);
|
1627
|
+
build_prop_table(f, "ID_Start", PROP_ID_Start, TRUE);
|
1628
|
+
build_prop_table(f, "ID_Continue1", PROP_ID_Continue1, TRUE);
|
1629
|
+
}
|
1630
|
+
|
1631
|
+
/* Write to 'f' the C string "static const char <cname>[]" made of the
   'len' names of 'tab_name', each one terminated by an explicit "\0".
   When the matching entry of 'tab_short_name' is not empty it is
   appended to the name after a comma.  The "\0" parts are aligned on
   the widest entry. */
void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
                     const char **tab_short_name)
{
    int i, w, maxw;

    /* width of the widest "name[,short_name]" entry */
    maxw = 0;
    for (i = 0; i < len; i++) {
        const char *short_name = tab_short_name[i];

        w = strlen(tab_name[i]);
        if (short_name[0] != '\0')
            w += 1 + strlen(short_name);
        if (w > maxw)
            maxw = w;
    }

    /* generate a sequence of strings terminated by an empty string */
    fprintf(f, "static const char %s[] =\n", cname);
    for (i = 0; i < len; i++) {
        const char *short_name = tab_short_name[i];

        fprintf(f, " \"");
        w = fprintf(f, "%s", tab_name[i]);
        if (short_name[0] != '\0')
            w += fprintf(f, ",%s", short_name);
        /* pad so that the terminators line up */
        fprintf(f, "\"%*s\"\\0\"\n", 1 + maxw - w, "");
    }
    fprintf(f, ";\n\n");
}
|
1658
|
+
|
1659
|
+
void build_general_category_table(FILE *f)
|
1660
|
+
{
|
1661
|
+
int i, v, j, n, n1;
|
1662
|
+
DynBuf dbuf_s, *dbuf = &dbuf_s;
|
1663
|
+
#ifdef DUMP_TABLE_SIZE
|
1664
|
+
int cw_count, cw_len_count[4], cw_start;
|
1665
|
+
#endif
|
1666
|
+
|
1667
|
+
fprintf(f, "typedef enum {\n");
|
1668
|
+
for(i = 0; i < GCAT_COUNT; i++)
|
1669
|
+
fprintf(f, " UNICODE_GC_%s,\n", unicode_gc_name[i]);
|
1670
|
+
fprintf(f, " UNICODE_GC_COUNT,\n");
|
1671
|
+
fprintf(f, "} UnicodeGCEnum;\n\n");
|
1672
|
+
|
1673
|
+
dump_name_table(f, "unicode_gc_name_table",
|
1674
|
+
unicode_gc_name, GCAT_COUNT,
|
1675
|
+
unicode_gc_short_name);
|
1676
|
+
|
1677
|
+
|
1678
|
+
dbuf_init(dbuf);
|
1679
|
+
#ifdef DUMP_TABLE_SIZE
|
1680
|
+
cw_count = 0;
|
1681
|
+
for(i = 0; i < 4; i++)
|
1682
|
+
cw_len_count[i] = 0;
|
1683
|
+
#endif
|
1684
|
+
for(i = 0; i <= CHARCODE_MAX;) {
|
1685
|
+
v = unicode_db[i].general_category;
|
1686
|
+
j = i + 1;
|
1687
|
+
while (j <= CHARCODE_MAX && unicode_db[j].general_category == v)
|
1688
|
+
j++;
|
1689
|
+
n = j - i;
|
1690
|
+
/* compress Lu/Ll runs */
|
1691
|
+
if (v == GCAT_Lu) {
|
1692
|
+
n1 = 1;
|
1693
|
+
while ((i + n1) <= CHARCODE_MAX && unicode_db[i + n1].general_category == (v + (n1 & 1))) {
|
1694
|
+
n1++;
|
1695
|
+
}
|
1696
|
+
if (n1 > n) {
|
1697
|
+
v = 31;
|
1698
|
+
n = n1;
|
1699
|
+
}
|
1700
|
+
}
|
1701
|
+
// printf("%05x %05x %d\n", i, n, v);
|
1702
|
+
n--;
|
1703
|
+
#ifdef DUMP_TABLE_SIZE
|
1704
|
+
cw_count++;
|
1705
|
+
cw_start = dbuf->size;
|
1706
|
+
#endif
|
1707
|
+
if (n < 7) {
|
1708
|
+
dbuf_putc(dbuf, (n << 5) | v);
|
1709
|
+
} else if (n < 7 + 128) {
|
1710
|
+
n1 = n - 7;
|
1711
|
+
assert(n1 < 128);
|
1712
|
+
dbuf_putc(dbuf, (0xf << 5) | v);
|
1713
|
+
dbuf_putc(dbuf, n1);
|
1714
|
+
} else if (n < 7 + 128 + (1 << 14)) {
|
1715
|
+
n1 = n - (7 + 128);
|
1716
|
+
assert(n1 < (1 << 14));
|
1717
|
+
dbuf_putc(dbuf, (0xf << 5) | v);
|
1718
|
+
dbuf_putc(dbuf, (n1 >> 8) + 128);
|
1719
|
+
dbuf_putc(dbuf, n1);
|
1720
|
+
} else {
|
1721
|
+
n1 = n - (7 + 128 + (1 << 14));
|
1722
|
+
assert(n1 < (1 << 22));
|
1723
|
+
dbuf_putc(dbuf, (0xf << 5) | v);
|
1724
|
+
dbuf_putc(dbuf, (n1 >> 16) + 128 + 64);
|
1725
|
+
dbuf_putc(dbuf, n1 >> 8);
|
1726
|
+
dbuf_putc(dbuf, n1);
|
1727
|
+
}
|
1728
|
+
#ifdef DUMP_TABLE_SIZE
|
1729
|
+
cw_len_count[dbuf->size - cw_start - 1]++;
|
1730
|
+
#endif
|
1731
|
+
i += n + 1;
|
1732
|
+
}
|
1733
|
+
#ifdef DUMP_TABLE_SIZE
|
1734
|
+
printf("general category: %d entries [", cw_count);
|
1735
|
+
for(i = 0; i < 4; i++)
|
1736
|
+
printf(" %d", cw_len_count[i]);
|
1737
|
+
printf(" ], length=%d bytes\n", (int)dbuf->size);
|
1738
|
+
#endif
|
1739
|
+
|
1740
|
+
dump_byte_table(f, "unicode_gc_table", dbuf->buf, dbuf->size);
|
1741
|
+
|
1742
|
+
dbuf_free(dbuf);
|
1743
|
+
}
|
1744
|
+
|
1745
|
+
void build_script_table(FILE *f)
|
1746
|
+
{
|
1747
|
+
int i, v, j, n, n1, type;
|
1748
|
+
DynBuf dbuf_s, *dbuf = &dbuf_s;
|
1749
|
+
#ifdef DUMP_TABLE_SIZE
|
1750
|
+
int cw_count, cw_len_count[4], cw_start;
|
1751
|
+
#endif
|
1752
|
+
|
1753
|
+
fprintf(f, "typedef enum {\n");
|
1754
|
+
for(i = 0; i < SCRIPT_COUNT; i++)
|
1755
|
+
fprintf(f, " UNICODE_SCRIPT_%s,\n", unicode_script_name[i]);
|
1756
|
+
fprintf(f, " UNICODE_SCRIPT_COUNT,\n");
|
1757
|
+
fprintf(f, "} UnicodeScriptEnum;\n\n");
|
1758
|
+
|
1759
|
+
i = 1;
|
1760
|
+
dump_name_table(f, "unicode_script_name_table",
|
1761
|
+
unicode_script_name + i, SCRIPT_COUNT - i,
|
1762
|
+
unicode_script_short_name + i);
|
1763
|
+
|
1764
|
+
dbuf_init(dbuf);
|
1765
|
+
#ifdef DUMP_TABLE_SIZE
|
1766
|
+
cw_count = 0;
|
1767
|
+
for(i = 0; i < 4; i++)
|
1768
|
+
cw_len_count[i] = 0;
|
1769
|
+
#endif
|
1770
|
+
for(i = 0; i <= CHARCODE_MAX;) {
|
1771
|
+
v = unicode_db[i].script;
|
1772
|
+
j = i + 1;
|
1773
|
+
while (j <= CHARCODE_MAX && unicode_db[j].script == v)
|
1774
|
+
j++;
|
1775
|
+
n = j - i;
|
1776
|
+
if (v == 0 && j == (CHARCODE_MAX + 1))
|
1777
|
+
break;
|
1778
|
+
// printf("%05x %05x %d\n", i, n, v);
|
1779
|
+
n--;
|
1780
|
+
#ifdef DUMP_TABLE_SIZE
|
1781
|
+
cw_count++;
|
1782
|
+
cw_start = dbuf->size;
|
1783
|
+
#endif
|
1784
|
+
if (v == 0)
|
1785
|
+
type = 0;
|
1786
|
+
else
|
1787
|
+
type = 1;
|
1788
|
+
if (n < 96) {
|
1789
|
+
dbuf_putc(dbuf, n | (type << 7));
|
1790
|
+
} else if (n < 96 + (1 << 12)) {
|
1791
|
+
n1 = n - 96;
|
1792
|
+
assert(n1 < (1 << 12));
|
1793
|
+
dbuf_putc(dbuf, ((n1 >> 8) + 96) | (type << 7));
|
1794
|
+
dbuf_putc(dbuf, n1);
|
1795
|
+
} else {
|
1796
|
+
n1 = n - (96 + (1 << 12));
|
1797
|
+
assert(n1 < (1 << 20));
|
1798
|
+
dbuf_putc(dbuf, ((n1 >> 16) + 112) | (type << 7));
|
1799
|
+
dbuf_putc(dbuf, n1 >> 8);
|
1800
|
+
dbuf_putc(dbuf, n1);
|
1801
|
+
}
|
1802
|
+
if (type != 0)
|
1803
|
+
dbuf_putc(dbuf, v);
|
1804
|
+
|
1805
|
+
#ifdef DUMP_TABLE_SIZE
|
1806
|
+
cw_len_count[dbuf->size - cw_start - 1]++;
|
1807
|
+
#endif
|
1808
|
+
i += n + 1;
|
1809
|
+
}
|
1810
|
+
#ifdef DUMP_TABLE_SIZE
|
1811
|
+
printf("script: %d entries [", cw_count);
|
1812
|
+
for(i = 0; i < 4; i++)
|
1813
|
+
printf(" %d", cw_len_count[i]);
|
1814
|
+
printf(" ], length=%d bytes\n", (int)dbuf->size);
|
1815
|
+
#endif
|
1816
|
+
|
1817
|
+
dump_byte_table(f, "unicode_script_table", dbuf->buf, dbuf->size);
|
1818
|
+
|
1819
|
+
dbuf_free(dbuf);
|
1820
|
+
}
|
1821
|
+
|
1822
|
+
void build_script_ext_table(FILE *f)
|
1823
|
+
{
|
1824
|
+
int i, j, n, n1, script_ext_len;
|
1825
|
+
DynBuf dbuf_s, *dbuf = &dbuf_s;
|
1826
|
+
#if defined(DUMP_TABLE_SIZE)
|
1827
|
+
int cw_count = 0;
|
1828
|
+
#endif
|
1829
|
+
|
1830
|
+
dbuf_init(dbuf);
|
1831
|
+
for(i = 0; i <= CHARCODE_MAX;) {
|
1832
|
+
script_ext_len = unicode_db[i].script_ext_len;
|
1833
|
+
j = i + 1;
|
1834
|
+
while (j <= CHARCODE_MAX &&
|
1835
|
+
unicode_db[j].script_ext_len == script_ext_len &&
|
1836
|
+
!memcmp(unicode_db[j].script_ext, unicode_db[i].script_ext,
|
1837
|
+
script_ext_len)) {
|
1838
|
+
j++;
|
1839
|
+
}
|
1840
|
+
n = j - i;
|
1841
|
+
#if defined(DUMP_TABLE_SIZE)
|
1842
|
+
cw_count++;
|
1843
|
+
#endif
|
1844
|
+
n--;
|
1845
|
+
if (n < 128) {
|
1846
|
+
dbuf_putc(dbuf, n);
|
1847
|
+
} else if (n < 128 + (1 << 14)) {
|
1848
|
+
n1 = n - 128;
|
1849
|
+
assert(n1 < (1 << 14));
|
1850
|
+
dbuf_putc(dbuf, (n1 >> 8) + 128);
|
1851
|
+
dbuf_putc(dbuf, n1);
|
1852
|
+
} else {
|
1853
|
+
n1 = n - (128 + (1 << 14));
|
1854
|
+
assert(n1 < (1 << 22));
|
1855
|
+
dbuf_putc(dbuf, (n1 >> 16) + 128 + 64);
|
1856
|
+
dbuf_putc(dbuf, n1 >> 8);
|
1857
|
+
dbuf_putc(dbuf, n1);
|
1858
|
+
}
|
1859
|
+
dbuf_putc(dbuf, script_ext_len);
|
1860
|
+
for(j = 0; j < script_ext_len; j++)
|
1861
|
+
dbuf_putc(dbuf, unicode_db[i].script_ext[j]);
|
1862
|
+
i += n + 1;
|
1863
|
+
}
|
1864
|
+
#ifdef DUMP_TABLE_SIZE
|
1865
|
+
printf("script_ext: %d entries", cw_count);
|
1866
|
+
printf(", length=%d bytes\n", (int)dbuf->size);
|
1867
|
+
#endif
|
1868
|
+
|
1869
|
+
dump_byte_table(f, "unicode_script_ext_table", dbuf->buf, dbuf->size);
|
1870
|
+
|
1871
|
+
dbuf_free(dbuf);
|
1872
|
+
}
|
1873
|
+
|
1874
|
+
/* the following properties are synthesized so no table is necessary */
|
1875
|
+
#define PROP_TABLE_COUNT PROP_ASCII
|
1876
|
+
|
1877
|
+
void build_prop_list_table(FILE *f)
|
1878
|
+
{
|
1879
|
+
int i;
|
1880
|
+
|
1881
|
+
for(i = 0; i < PROP_TABLE_COUNT; i++) {
|
1882
|
+
if (i == PROP_ID_Start ||
|
1883
|
+
i == PROP_Case_Ignorable ||
|
1884
|
+
i == PROP_ID_Continue1) {
|
1885
|
+
/* already generated */
|
1886
|
+
} else {
|
1887
|
+
build_prop_table(f, unicode_prop_name[i], i, FALSE);
|
1888
|
+
}
|
1889
|
+
}
|
1890
|
+
|
1891
|
+
fprintf(f, "typedef enum {\n");
|
1892
|
+
for(i = 0; i < PROP_COUNT; i++)
|
1893
|
+
fprintf(f, " UNICODE_PROP_%s,\n", unicode_prop_name[i]);
|
1894
|
+
fprintf(f, " UNICODE_PROP_COUNT,\n");
|
1895
|
+
fprintf(f, "} UnicodePropertyEnum;\n\n");
|
1896
|
+
|
1897
|
+
i = PROP_ASCII_Hex_Digit;
|
1898
|
+
dump_name_table(f, "unicode_prop_name_table",
|
1899
|
+
unicode_prop_name + i, PROP_XID_Start - i + 1,
|
1900
|
+
unicode_prop_short_name + i);
|
1901
|
+
|
1902
|
+
fprintf(f, "static const uint8_t * const unicode_prop_table[] = {\n");
|
1903
|
+
for(i = 0; i < PROP_TABLE_COUNT; i++) {
|
1904
|
+
fprintf(f, " unicode_prop_%s_table,\n", unicode_prop_name[i]);
|
1905
|
+
}
|
1906
|
+
fprintf(f, "};\n\n");
|
1907
|
+
|
1908
|
+
fprintf(f, "static const uint16_t unicode_prop_len_table[] = {\n");
|
1909
|
+
for(i = 0; i < PROP_TABLE_COUNT; i++) {
|
1910
|
+
fprintf(f, " countof(unicode_prop_%s_table),\n", unicode_prop_name[i]);
|
1911
|
+
}
|
1912
|
+
fprintf(f, "};\n\n");
|
1913
|
+
}
|
1914
|
+
|
1915
|
+
#ifdef USE_TEST
|
1916
|
+
/* Convert 'c' with lre_case_conv() using 'conv_type', store the
   resulting code points in 'res' and return the value returned by
   lre_case_conv(). */
int check_conv(uint32_t *res, uint32_t c, int conv_type)
{
    int res_len;

    res_len = lre_case_conv(res, c, conv_type);
    return res_len;
}
|
1920
|
+
|
1921
|
+
void check_case_conv(void)
|
1922
|
+
{
|
1923
|
+
CCInfo *tab = unicode_db;
|
1924
|
+
uint32_t res[3];
|
1925
|
+
int l, error;
|
1926
|
+
CCInfo ci_s, *ci1, *ci = &ci_s;
|
1927
|
+
int code;
|
1928
|
+
|
1929
|
+
for(code = 0; code <= CHARCODE_MAX; code++) {
|
1930
|
+
ci1 = &tab[code];
|
1931
|
+
*ci = *ci1;
|
1932
|
+
if (ci->l_len == 0) {
|
1933
|
+
ci->l_len = 1;
|
1934
|
+
ci->l_data[0] = code;
|
1935
|
+
}
|
1936
|
+
if (ci->u_len == 0) {
|
1937
|
+
ci->u_len = 1;
|
1938
|
+
ci->u_data[0] = code;
|
1939
|
+
}
|
1940
|
+
if (ci->f_len == 0) {
|
1941
|
+
ci->f_len = 1;
|
1942
|
+
ci->f_data[0] = code;
|
1943
|
+
}
|
1944
|
+
|
1945
|
+
error = 0;
|
1946
|
+
l = check_conv(res, code, 0);
|
1947
|
+
if (l != ci->u_len || tabcmp((int *)res, ci->u_data, l)) {
|
1948
|
+
printf("ERROR: L\n");
|
1949
|
+
error++;
|
1950
|
+
}
|
1951
|
+
l = check_conv(res, code, 1);
|
1952
|
+
if (l != ci->l_len || tabcmp((int *)res, ci->l_data, l)) {
|
1953
|
+
printf("ERROR: U\n");
|
1954
|
+
error++;
|
1955
|
+
}
|
1956
|
+
l = check_conv(res, code, 2);
|
1957
|
+
if (l != ci->f_len || tabcmp((int *)res, ci->f_data, l)) {
|
1958
|
+
printf("ERROR: F\n");
|
1959
|
+
error++;
|
1960
|
+
}
|
1961
|
+
if (error) {
|
1962
|
+
dump_cc_info(ci, code);
|
1963
|
+
exit(1);
|
1964
|
+
}
|
1965
|
+
}
|
1966
|
+
}
|
1967
|
+
|
1968
|
+
#ifdef PROFILE
|
1969
|
+
static int64_t get_time_ns(void)
|
1970
|
+
{
|
1971
|
+
struct timespec ts;
|
1972
|
+
clock_gettime(CLOCK_MONOTONIC, &ts);
|
1973
|
+
return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
|
1974
|
+
}
|
1975
|
+
#endif
|
1976
|
+
|
1977
|
+
|
1978
|
+
void check_flags(void)
|
1979
|
+
{
|
1980
|
+
int c;
|
1981
|
+
BOOL flag_ref, flag;
|
1982
|
+
for(c = 0; c <= CHARCODE_MAX; c++) {
|
1983
|
+
flag_ref = get_prop(c, PROP_Cased);
|
1984
|
+
flag = !!lre_is_cased(c);
|
1985
|
+
if (flag != flag_ref) {
|
1986
|
+
printf("ERROR: c=%05x cased=%d ref=%d\n",
|
1987
|
+
c, flag, flag_ref);
|
1988
|
+
exit(1);
|
1989
|
+
}
|
1990
|
+
|
1991
|
+
flag_ref = get_prop(c, PROP_Case_Ignorable);
|
1992
|
+
flag = !!lre_is_case_ignorable(c);
|
1993
|
+
if (flag != flag_ref) {
|
1994
|
+
printf("ERROR: c=%05x case_ignorable=%d ref=%d\n",
|
1995
|
+
c, flag, flag_ref);
|
1996
|
+
exit(1);
|
1997
|
+
}
|
1998
|
+
|
1999
|
+
flag_ref = get_prop(c, PROP_ID_Start);
|
2000
|
+
flag = !!lre_is_id_start(c);
|
2001
|
+
if (flag != flag_ref) {
|
2002
|
+
printf("ERROR: c=%05x id_start=%d ref=%d\n",
|
2003
|
+
c, flag, flag_ref);
|
2004
|
+
exit(1);
|
2005
|
+
}
|
2006
|
+
|
2007
|
+
flag_ref = get_prop(c, PROP_ID_Continue);
|
2008
|
+
flag = !!lre_is_id_continue(c);
|
2009
|
+
if (flag != flag_ref) {
|
2010
|
+
printf("ERROR: c=%05x id_cont=%d ref=%d\n",
|
2011
|
+
c, flag, flag_ref);
|
2012
|
+
exit(1);
|
2013
|
+
}
|
2014
|
+
}
|
2015
|
+
#ifdef PROFILE
|
2016
|
+
{
|
2017
|
+
int64_t ti, count;
|
2018
|
+
ti = get_time_ns();
|
2019
|
+
count = 0;
|
2020
|
+
for(c = 0x20; c <= 0xffff; c++) {
|
2021
|
+
flag_ref = get_prop(c, PROP_ID_Start);
|
2022
|
+
flag = !!lre_is_id_start(c);
|
2023
|
+
assert(flag == flag_ref);
|
2024
|
+
count++;
|
2025
|
+
}
|
2026
|
+
ti = get_time_ns() - ti;
|
2027
|
+
printf("flags time=%0.1f ns/char\n",
|
2028
|
+
(double)ti / count);
|
2029
|
+
}
|
2030
|
+
#endif
|
2031
|
+
}
|
2032
|
+
|
2033
|
+
#endif
|
2034
|
+
|
2035
|
+
#define CC_BLOCK_LEN 32
|
2036
|
+
|
2037
|
+
void build_cc_table(FILE *f)
|
2038
|
+
{
|
2039
|
+
// Compress combining class table
|
2040
|
+
// see: https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
|
2041
|
+
int i, cc, n, type, n1, block_end_pos;
|
2042
|
+
DynBuf dbuf_s, *dbuf = &dbuf_s;
|
2043
|
+
DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
|
2044
|
+
#if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
|
2045
|
+
int cw_len_tab[3], cw_start, cc_table_len;
|
2046
|
+
#endif
|
2047
|
+
uint32_t v;
|
2048
|
+
|
2049
|
+
dbuf_init(dbuf);
|
2050
|
+
dbuf_init(dbuf1);
|
2051
|
+
#if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
|
2052
|
+
cc_table_len = 0;
|
2053
|
+
for(i = 0; i < countof(cw_len_tab); i++)
|
2054
|
+
cw_len_tab[i] = 0;
|
2055
|
+
#endif
|
2056
|
+
block_end_pos = CC_BLOCK_LEN;
|
2057
|
+
for(i = 0; i <= CHARCODE_MAX;) {
|
2058
|
+
cc = unicode_db[i].combining_class;
|
2059
|
+
assert(cc <= 255);
|
2060
|
+
/* check increasing values */
|
2061
|
+
n = 1;
|
2062
|
+
while ((i + n) <= CHARCODE_MAX &&
|
2063
|
+
unicode_db[i + n].combining_class == (cc + n))
|
2064
|
+
n++;
|
2065
|
+
if (n >= 2) {
|
2066
|
+
type = 1;
|
2067
|
+
} else {
|
2068
|
+
type = 0;
|
2069
|
+
n = 1;
|
2070
|
+
while ((i + n) <= CHARCODE_MAX &&
|
2071
|
+
unicode_db[i + n].combining_class == cc)
|
2072
|
+
n++;
|
2073
|
+
}
|
2074
|
+
/* no need to encode the last run */
|
2075
|
+
if (cc == 0 && (i + n - 1) == CHARCODE_MAX)
|
2076
|
+
break;
|
2077
|
+
#ifdef DUMP_CC_TABLE
|
2078
|
+
printf("%05x %6d %d %d\n", i, n, type, cc);
|
2079
|
+
#endif
|
2080
|
+
if (type == 0) {
|
2081
|
+
if (cc == 0)
|
2082
|
+
type = 2;
|
2083
|
+
else if (cc == 230)
|
2084
|
+
type = 3;
|
2085
|
+
}
|
2086
|
+
n1 = n - 1;
|
2087
|
+
|
2088
|
+
/* add an entry to the index if necessary */
|
2089
|
+
if (dbuf->size >= block_end_pos) {
|
2090
|
+
v = i | ((dbuf->size - block_end_pos) << 21);
|
2091
|
+
dbuf_putc(dbuf1, v);
|
2092
|
+
dbuf_putc(dbuf1, v >> 8);
|
2093
|
+
dbuf_putc(dbuf1, v >> 16);
|
2094
|
+
block_end_pos += CC_BLOCK_LEN;
|
2095
|
+
}
|
2096
|
+
#if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
|
2097
|
+
cw_start = dbuf->size;
|
2098
|
+
#endif
|
2099
|
+
/* Compressed run length encoding:
|
2100
|
+
- 2 high order bits are combining class type
|
2101
|
+
- 0:0, 1:230, 2:extra byte linear progression, 3:extra byte
|
2102
|
+
- 00..2F: range length (add 1)
|
2103
|
+
- 30..37: 3-bit range-length + 1 extra byte
|
2104
|
+
- 38..3F: 3-bit range-length + 2 extra byte
|
2105
|
+
*/
|
2106
|
+
if (n1 < 48) {
|
2107
|
+
dbuf_putc(dbuf, n1 | (type << 6));
|
2108
|
+
} else if (n1 < 48 + (1 << 11)) {
|
2109
|
+
n1 -= 48;
|
2110
|
+
dbuf_putc(dbuf, ((n1 >> 8) + 48) | (type << 6));
|
2111
|
+
dbuf_putc(dbuf, n1);
|
2112
|
+
} else {
|
2113
|
+
n1 -= 48 + (1 << 11);
|
2114
|
+
assert(n1 < (1 << 20));
|
2115
|
+
dbuf_putc(dbuf, ((n1 >> 16) + 56) | (type << 6));
|
2116
|
+
dbuf_putc(dbuf, n1 >> 8);
|
2117
|
+
dbuf_putc(dbuf, n1);
|
2118
|
+
}
|
2119
|
+
#if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
|
2120
|
+
cw_len_tab[dbuf->size - cw_start - 1]++;
|
2121
|
+
cc_table_len++;
|
2122
|
+
#endif
|
2123
|
+
if (type == 0 || type == 1)
|
2124
|
+
dbuf_putc(dbuf, cc);
|
2125
|
+
i += n;
|
2126
|
+
}
|
2127
|
+
|
2128
|
+
/* last index entry */
|
2129
|
+
v = i;
|
2130
|
+
dbuf_putc(dbuf1, v);
|
2131
|
+
dbuf_putc(dbuf1, v >> 8);
|
2132
|
+
dbuf_putc(dbuf1, v >> 16);
|
2133
|
+
|
2134
|
+
dump_byte_table(f, "unicode_cc_table", dbuf->buf, dbuf->size);
|
2135
|
+
dump_index_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size);
|
2136
|
+
|
2137
|
+
#if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
|
2138
|
+
printf("CC table: size=%d (%d entries) [",
|
2139
|
+
(int)(dbuf->size + dbuf1->size),
|
2140
|
+
cc_table_len);
|
2141
|
+
for(i = 0; i < countof(cw_len_tab); i++)
|
2142
|
+
printf(" %d", cw_len_tab[i]);
|
2143
|
+
printf(" ]\n");
|
2144
|
+
#endif
|
2145
|
+
dbuf_free(dbuf);
|
2146
|
+
dbuf_free(dbuf1);
|
2147
|
+
}
|
2148
|
+
|
2149
|
+
/* maximum length of decomposition: 18 chars (1), then 8 */
|
2150
|
+
#ifndef USE_TEST
|
2151
|
+
/* Encodings available for a run of decompositions. The numeric order
   matters: the size computation and the data writer select the
   encoding family with "type <= last member of the family" tests. */
typedef enum {
    DECOMP_TYPE_C1, /* 16 bit char */

    /* 16 bit char table */
    DECOMP_TYPE_L1, DECOMP_TYPE_L2, DECOMP_TYPE_L3, DECOMP_TYPE_L4,
    DECOMP_TYPE_L5, /* XXX: not used */
    DECOMP_TYPE_L6, /* XXX: could remove */
    DECOMP_TYPE_L7, /* XXX: could remove */

    /* 18 bit char table */
    DECOMP_TYPE_LL1, DECOMP_TYPE_LL2,

    /* 8 bit char table */
    DECOMP_TYPE_S1, DECOMP_TYPE_S2, DECOMP_TYPE_S3, DECOMP_TYPE_S4,
    DECOMP_TYPE_S5,

    /* increment 16 bit char value */
    DECOMP_TYPE_I1,
    DECOMP_TYPE_I2_0, DECOMP_TYPE_I2_1,
    DECOMP_TYPE_I3_1, DECOMP_TYPE_I3_2,
    DECOMP_TYPE_I4_1, DECOMP_TYPE_I4_2,

    /* 16 bit base + 8 bit offset */
    DECOMP_TYPE_B1, DECOMP_TYPE_B2, DECOMP_TYPE_B3, DECOMP_TYPE_B4,
    DECOMP_TYPE_B5, DECOMP_TYPE_B6, DECOMP_TYPE_B7, DECOMP_TYPE_B8,
    DECOMP_TYPE_B18,

    DECOMP_TYPE_LS2,
    DECOMP_TYPE_PAT3,
    DECOMP_TYPE_S2_UL,
    DECOMP_TYPE_LS2_UL,
} DecompTypeEnum;
|
2188
|
+
#endif
|
2189
|
+
|
2190
|
+
/* Printable names of the decomposition types; the position of each
   name is the numeric value of the matching DECOMP_TYPE_xxx constant. */
const char *decomp_type_str[] = {
    "C1",
    "L1", "L2", "L3", "L4", "L5", "L6", "L7",
    "LL1", "LL2",
    "S1", "S2", "S3", "S4", "S5",
    "I1",
    "I2_0", "I2_1",
    "I3_1", "I3_2",
    "I4_1", "I4_2",
    "B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8",
    "B18",
    "LS2",
    "PAT3",
    "S2_UL",
    "LS2_UL",
};
|
2227
|
+
|
2228
|
+
const int decomp_incr_tab[4][4] = {
|
2229
|
+
{ DECOMP_TYPE_I1, 0, -1 },
|
2230
|
+
{ DECOMP_TYPE_I2_0, 0, 1, -1 },
|
2231
|
+
{ DECOMP_TYPE_I3_1, 1, 2, -1 },
|
2232
|
+
{ DECOMP_TYPE_I4_1, 1, 2, -1 },
|
2233
|
+
};
|
2234
|
+
|
2235
|
+
/*
   entry size:
   type   bits
   code   18
   len    7
   compat 1
   type   6
   index  16
   total  48
*/
|
2245
|
+
|
2246
|
+
/* Description of one run of consecutive code points sharing the same
   decomposition encoding. */
typedef struct {
    int code;            /* first code point of the run */
    uint8_t len;         /* number of code points in the run */
    uint8_t type;        /* encoding of the run (DECOMP_TYPE_xxx) */
    uint8_t c_len;       /* decomposition length of each code point */
    uint16_t c_min;      /* base char, only set for the B encodings */
    uint16_t data_index; /* position of the run in the data buffer; for
                            DECOMP_TYPE_C1 the decomposed char itself */
    int cost;            /* size in bytes from this entry to the end */
} DecompEntry;
|
2255
|
+
|
2256
|
+
int get_decomp_run_size(const DecompEntry *de)
|
2257
|
+
{
|
2258
|
+
int s;
|
2259
|
+
s = 6;
|
2260
|
+
if (de->type <= DECOMP_TYPE_C1) {
|
2261
|
+
/* nothing more */
|
2262
|
+
} else if (de->type <= DECOMP_TYPE_L7) {
|
2263
|
+
s += de->len * de->c_len * 2;
|
2264
|
+
} else if (de->type <= DECOMP_TYPE_LL2) {
|
2265
|
+
/* 18 bits per char */
|
2266
|
+
s += (de->len * de->c_len * 18 + 7) / 8;
|
2267
|
+
} else if (de->type <= DECOMP_TYPE_S5) {
|
2268
|
+
s += de->len * de->c_len;
|
2269
|
+
} else if (de->type <= DECOMP_TYPE_I4_2) {
|
2270
|
+
s += de->c_len * 2;
|
2271
|
+
} else if (de->type <= DECOMP_TYPE_B18) {
|
2272
|
+
s += 2 + de->len * de->c_len;
|
2273
|
+
} else if (de->type <= DECOMP_TYPE_LS2) {
|
2274
|
+
s += de->len * 3;
|
2275
|
+
} else if (de->type <= DECOMP_TYPE_PAT3) {
|
2276
|
+
s += 4 + de->len * 2;
|
2277
|
+
} else if (de->type <= DECOMP_TYPE_S2_UL) {
|
2278
|
+
s += de->len;
|
2279
|
+
} else if (de->type <= DECOMP_TYPE_LS2_UL) {
|
2280
|
+
s += (de->len / 2) * 3;
|
2281
|
+
} else {
|
2282
|
+
abort();
|
2283
|
+
}
|
2284
|
+
return s;
|
2285
|
+
}
|
2286
|
+
|
2287
|
+
/* chars outside the two contiguous ranges that still have a short code */
static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };

/* Map 'c' to its one byte short code:
   - values below 0x80 are returned unchanged;
   - 0x300..0x34f map to 0x80..0xcf;
   - the chars of unicode_short_table map to 0xd0 and above.
   return -1 if not found */
int get_short_code(int c)
{
    int k;

    if (c < 0x80)
        return c;
    if (c >= 0x300 && c < 0x350)
        return 0x80 + (c - 0x300);
    for (k = 0; k < countof(unicode_short_table); k++) {
        if (unicode_short_table[k] == c)
            return 0x80 + 0x50 + k;
    }
    return -1;
}
|
2305
|
+
|
2306
|
+
/* TRUE when get_short_code() returns a non negative value for 'code' */
static BOOL is_short(int code)
{
    int short_code;

    short_code = get_short_code(code);
    return short_code >= 0;
}
|
2310
|
+
|
2311
|
+
static BOOL is_short_tab(const int *tab, int len)
|
2312
|
+
{
|
2313
|
+
int i;
|
2314
|
+
for(i = 0; i < len; i++) {
|
2315
|
+
if (!is_short(tab[i]))
|
2316
|
+
return FALSE;
|
2317
|
+
}
|
2318
|
+
return TRUE;
|
2319
|
+
}
|
2320
|
+
|
2321
|
+
static BOOL is_16bit(const int *tab, int len)
|
2322
|
+
{
|
2323
|
+
int i;
|
2324
|
+
for(i = 0; i < len; i++) {
|
2325
|
+
if (tab[i] > 0xffff)
|
2326
|
+
return FALSE;
|
2327
|
+
}
|
2328
|
+
return TRUE;
|
2329
|
+
}
|
2330
|
+
|
2331
|
+
/* Simplified lower case mapping: add 0x20 for values below 0x100 and
   for 0x410..0x42f (Latin1 and Cyrillic), add 1 for everything else. */
static uint32_t to_lower_simple(uint32_t c)
{
    uint32_t delta;

    if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
        delta = 0x20;
    else
        delta = 1;
    return c + delta;
}
|
2340
|
+
|
2341
|
+
/* select best encoding with dynamic programming */
|
2342
|
+
void find_decomp_run(DecompEntry *tab_de, int i)
|
2343
|
+
{
|
2344
|
+
DecompEntry de_s, *de = &de_s;
|
2345
|
+
CCInfo *ci, *ci1, *ci2;
|
2346
|
+
int l, j, n, len_max;
|
2347
|
+
|
2348
|
+
ci = &unicode_db[i];
|
2349
|
+
l = ci->decomp_len;
|
2350
|
+
if (l == 0) {
|
2351
|
+
tab_de[i].cost = tab_de[i + 1].cost;
|
2352
|
+
return;
|
2353
|
+
}
|
2354
|
+
|
2355
|
+
/* the offset for the compose table has only 6 bits, so we must
|
2356
|
+
limit if it can be used by the compose table */
|
2357
|
+
if (!ci->is_compat && !ci->is_excluded && l == 2)
|
2358
|
+
len_max = 64;
|
2359
|
+
else
|
2360
|
+
len_max = 127;
|
2361
|
+
|
2362
|
+
tab_de[i].cost = 0x7fffffff;
|
2363
|
+
|
2364
|
+
if (!is_16bit(ci->decomp_data, l)) {
|
2365
|
+
assert(l <= 2);
|
2366
|
+
|
2367
|
+
n = 1;
|
2368
|
+
for(;;) {
|
2369
|
+
de->code = i;
|
2370
|
+
de->len = n;
|
2371
|
+
de->type = DECOMP_TYPE_LL1 + l - 1;
|
2372
|
+
de->c_len = l;
|
2373
|
+
de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
|
2374
|
+
if (de->cost < tab_de[i].cost) {
|
2375
|
+
tab_de[i] = *de;
|
2376
|
+
}
|
2377
|
+
if (!((i + n) <= CHARCODE_MAX && n < len_max))
|
2378
|
+
break;
|
2379
|
+
ci1 = &unicode_db[i + n];
|
2380
|
+
/* Note: we accept a hole */
|
2381
|
+
if (!(ci1->decomp_len == 0 ||
|
2382
|
+
(ci1->decomp_len == l &&
|
2383
|
+
ci1->is_compat == ci->is_compat)))
|
2384
|
+
break;
|
2385
|
+
n++;
|
2386
|
+
}
|
2387
|
+
return;
|
2388
|
+
}
|
2389
|
+
|
2390
|
+
if (l <= 7) {
|
2391
|
+
n = 1;
|
2392
|
+
for(;;) {
|
2393
|
+
de->code = i;
|
2394
|
+
de->len = n;
|
2395
|
+
if (l == 1 && n == 1) {
|
2396
|
+
de->type = DECOMP_TYPE_C1;
|
2397
|
+
} else {
|
2398
|
+
assert(l <= 8);
|
2399
|
+
de->type = DECOMP_TYPE_L1 + l - 1;
|
2400
|
+
}
|
2401
|
+
de->c_len = l;
|
2402
|
+
de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
|
2403
|
+
if (de->cost < tab_de[i].cost) {
|
2404
|
+
tab_de[i] = *de;
|
2405
|
+
}
|
2406
|
+
|
2407
|
+
if (!((i + n) <= CHARCODE_MAX && n < len_max))
|
2408
|
+
break;
|
2409
|
+
ci1 = &unicode_db[i + n];
|
2410
|
+
/* Note: we accept a hole */
|
2411
|
+
if (!(ci1->decomp_len == 0 ||
|
2412
|
+
(ci1->decomp_len == l &&
|
2413
|
+
ci1->is_compat == ci->is_compat &&
|
2414
|
+
is_16bit(ci1->decomp_data, l))))
|
2415
|
+
break;
|
2416
|
+
n++;
|
2417
|
+
}
|
2418
|
+
}
|
2419
|
+
|
2420
|
+
if (l <= 8 || l == 18) {
|
2421
|
+
int c_min, c_max, c;
|
2422
|
+
c_min = c_max = -1;
|
2423
|
+
n = 1;
|
2424
|
+
for(;;) {
|
2425
|
+
ci1 = &unicode_db[i + n - 1];
|
2426
|
+
for(j = 0; j < l; j++) {
|
2427
|
+
c = ci1->decomp_data[j];
|
2428
|
+
if (c == 0x20) {
|
2429
|
+
/* we accept space for Arabic */
|
2430
|
+
} else if (c_min == -1) {
|
2431
|
+
c_min = c_max = c;
|
2432
|
+
} else {
|
2433
|
+
c_min = min_int(c_min, c);
|
2434
|
+
c_max = max_int(c_max, c);
|
2435
|
+
}
|
2436
|
+
}
|
2437
|
+
if ((c_max - c_min) > 254)
|
2438
|
+
break;
|
2439
|
+
de->code = i;
|
2440
|
+
de->len = n;
|
2441
|
+
if (l == 18)
|
2442
|
+
de->type = DECOMP_TYPE_B18;
|
2443
|
+
else
|
2444
|
+
de->type = DECOMP_TYPE_B1 + l - 1;
|
2445
|
+
de->c_len = l;
|
2446
|
+
de->c_min = c_min;
|
2447
|
+
de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
|
2448
|
+
if (de->cost < tab_de[i].cost) {
|
2449
|
+
tab_de[i] = *de;
|
2450
|
+
}
|
2451
|
+
if (!((i + n) <= CHARCODE_MAX && n < len_max))
|
2452
|
+
break;
|
2453
|
+
ci1 = &unicode_db[i + n];
|
2454
|
+
if (!(ci1->decomp_len == l &&
|
2455
|
+
ci1->is_compat == ci->is_compat))
|
2456
|
+
break;
|
2457
|
+
n++;
|
2458
|
+
}
|
2459
|
+
}
|
2460
|
+
|
2461
|
+
/* find an ascii run */
|
2462
|
+
if (l <= 5 && is_short_tab(ci->decomp_data, l)) {
|
2463
|
+
n = 1;
|
2464
|
+
for(;;) {
|
2465
|
+
de->code = i;
|
2466
|
+
de->len = n;
|
2467
|
+
de->type = DECOMP_TYPE_S1 + l - 1;
|
2468
|
+
de->c_len = l;
|
2469
|
+
de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
|
2470
|
+
if (de->cost < tab_de[i].cost) {
|
2471
|
+
tab_de[i] = *de;
|
2472
|
+
}
|
2473
|
+
|
2474
|
+
if (!((i + n) <= CHARCODE_MAX && n < len_max))
|
2475
|
+
break;
|
2476
|
+
ci1 = &unicode_db[i + n];
|
2477
|
+
/* Note: we accept a hole */
|
2478
|
+
if (!(ci1->decomp_len == 0 ||
|
2479
|
+
(ci1->decomp_len == l &&
|
2480
|
+
ci1->is_compat == ci->is_compat &&
|
2481
|
+
is_short_tab(ci1->decomp_data, l))))
|
2482
|
+
break;
|
2483
|
+
n++;
|
2484
|
+
}
|
2485
|
+
}
|
2486
|
+
|
2487
|
+
/* check if a single char is increasing */
|
2488
|
+
if (l <= 4) {
|
2489
|
+
int idx1, idx;
|
2490
|
+
|
2491
|
+
for(idx1 = 1; (idx = decomp_incr_tab[l - 1][idx1]) >= 0; idx1++) {
|
2492
|
+
n = 1;
|
2493
|
+
for(;;) {
|
2494
|
+
de->code = i;
|
2495
|
+
de->len = n;
|
2496
|
+
de->type = decomp_incr_tab[l - 1][0] + idx1 - 1;
|
2497
|
+
de->c_len = l;
|
2498
|
+
de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
|
2499
|
+
if (de->cost < tab_de[i].cost) {
|
2500
|
+
tab_de[i] = *de;
|
2501
|
+
}
|
2502
|
+
|
2503
|
+
if (!((i + n) <= CHARCODE_MAX && n < len_max))
|
2504
|
+
break;
|
2505
|
+
ci1 = &unicode_db[i + n];
|
2506
|
+
if (!(ci1->decomp_len == l &&
|
2507
|
+
ci1->is_compat == ci->is_compat))
|
2508
|
+
goto next1;
|
2509
|
+
for(j = 0; j < l; j++) {
|
2510
|
+
if (j == idx) {
|
2511
|
+
if (ci1->decomp_data[j] != ci->decomp_data[j] + n)
|
2512
|
+
goto next1;
|
2513
|
+
} else {
|
2514
|
+
if (ci1->decomp_data[j] != ci->decomp_data[j])
|
2515
|
+
goto next1;
|
2516
|
+
}
|
2517
|
+
}
|
2518
|
+
n++;
|
2519
|
+
}
|
2520
|
+
next1: ;
|
2521
|
+
}
|
2522
|
+
}
|
2523
|
+
|
2524
|
+
if (l == 3) {
|
2525
|
+
n = 1;
|
2526
|
+
for(;;) {
|
2527
|
+
de->code = i;
|
2528
|
+
de->len = n;
|
2529
|
+
de->type = DECOMP_TYPE_PAT3;
|
2530
|
+
de->c_len = l;
|
2531
|
+
de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
|
2532
|
+
if (de->cost < tab_de[i].cost) {
|
2533
|
+
tab_de[i] = *de;
|
2534
|
+
}
|
2535
|
+
if (!((i + n) <= CHARCODE_MAX && n < len_max))
|
2536
|
+
break;
|
2537
|
+
ci1 = &unicode_db[i + n];
|
2538
|
+
if (!(ci1->decomp_len == l &&
|
2539
|
+
ci1->is_compat == ci->is_compat &&
|
2540
|
+
ci1->decomp_data[1] <= 0xffff &&
|
2541
|
+
ci1->decomp_data[0] == ci->decomp_data[0] &&
|
2542
|
+
ci1->decomp_data[l - 1] == ci->decomp_data[l - 1]))
|
2543
|
+
break;
|
2544
|
+
n++;
|
2545
|
+
}
|
2546
|
+
}
|
2547
|
+
|
2548
|
+
if (l == 2 && is_short(ci->decomp_data[1])) {
|
2549
|
+
n = 1;
|
2550
|
+
for(;;) {
|
2551
|
+
de->code = i;
|
2552
|
+
de->len = n;
|
2553
|
+
de->type = DECOMP_TYPE_LS2;
|
2554
|
+
de->c_len = l;
|
2555
|
+
de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
|
2556
|
+
if (de->cost < tab_de[i].cost) {
|
2557
|
+
tab_de[i] = *de;
|
2558
|
+
}
|
2559
|
+
if (!((i + n) <= CHARCODE_MAX && n < len_max))
|
2560
|
+
break;
|
2561
|
+
ci1 = &unicode_db[i + n];
|
2562
|
+
if (!(ci1->decomp_len == 0 ||
|
2563
|
+
(ci1->decomp_len == l &&
|
2564
|
+
ci1->is_compat == ci->is_compat &&
|
2565
|
+
ci1->decomp_data[0] <= 0xffff &&
|
2566
|
+
is_short(ci1->decomp_data[1]))))
|
2567
|
+
break;
|
2568
|
+
n++;
|
2569
|
+
}
|
2570
|
+
}
|
2571
|
+
|
2572
|
+
if (l == 2) {
|
2573
|
+
BOOL is_16bit;
|
2574
|
+
|
2575
|
+
n = 0;
|
2576
|
+
is_16bit = FALSE;
|
2577
|
+
for(;;) {
|
2578
|
+
if (!((i + n + 1) <= CHARCODE_MAX && n + 2 <= len_max))
|
2579
|
+
break;
|
2580
|
+
ci1 = &unicode_db[i + n];
|
2581
|
+
if (!(ci1->decomp_len == l &&
|
2582
|
+
ci1->is_compat == ci->is_compat &&
|
2583
|
+
is_short(ci1->decomp_data[1])))
|
2584
|
+
break;
|
2585
|
+
if (!is_16bit && !is_short(ci1->decomp_data[0]))
|
2586
|
+
is_16bit = TRUE;
|
2587
|
+
ci2 = &unicode_db[i + n + 1];
|
2588
|
+
if (!(ci2->decomp_len == l &&
|
2589
|
+
ci2->is_compat == ci->is_compat &&
|
2590
|
+
ci2->decomp_data[0] == to_lower_simple(ci1->decomp_data[0]) &&
|
2591
|
+
ci2->decomp_data[1] == ci1->decomp_data[1]))
|
2592
|
+
break;
|
2593
|
+
n += 2;
|
2594
|
+
de->code = i;
|
2595
|
+
de->len = n;
|
2596
|
+
de->type = DECOMP_TYPE_S2_UL + is_16bit;
|
2597
|
+
de->c_len = l;
|
2598
|
+
de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
|
2599
|
+
if (de->cost < tab_de[i].cost) {
|
2600
|
+
tab_de[i] = *de;
|
2601
|
+
}
|
2602
|
+
}
|
2603
|
+
}
|
2604
|
+
}
|
2605
|
+
|
2606
|
+
/* Store 'c' in little endian order at data_buf[*pidx] and advance
   *pidx by 2. */
void put16(uint8_t *data_buf, int *pidx, uint16_t c)
{
    uint8_t *dst = data_buf + *pidx;

    dst[0] = c & 0xff;
    dst[1] = c >> 8;
    *pidx += 2;
}
|
2614
|
+
|
2615
|
+
/* Append the data of the run 'de' to data_buf at position *pidx, using
   the encoding selected by de->type, and advance *pidx accordingly.
   de->data_index receives the start position of the data, except for
   DECOMP_TYPE_C1 where it receives the decomposed char itself and no
   data is written. A code point without decomposition inside a run (a
   hole) is stored as zeros by the encodings that accept holes.
   Aborts on an unknown type. */
void add_decomp_data(uint8_t *data_buf, int *pidx, DecompEntry *de)
{
    int run, pos, idx, c;
    int type = de->type;
    CCInfo *ci;

    idx = *pidx;
    de->data_index = idx;
    if (type <= DECOMP_TYPE_C1) {
        ci = &unicode_db[de->code];
        assert(ci->decomp_len == 1);
        de->data_index = ci->decomp_data[0];
    } else if (type <= DECOMP_TYPE_L7) {
        /* 16 bit chars */
        for (run = 0; run < de->len; run++) {
            ci = &unicode_db[de->code + run];
            for (pos = 0; pos < de->c_len; pos++) {
                c = (ci->decomp_len == 0) ? 0 : ci->decomp_data[pos];
                put16(data_buf, &idx, c);
            }
        }
    } else if (type <= DECOMP_TYPE_LL2) {
        /* 18 bit chars: all the 16 low bits first, then the 2 high
           bits packed 4 chars per byte */
        int total = de->len * de->c_len;
        int nbytes = (total * 18 + 7) / 8;
        uint8_t *low = data_buf + idx;
        uint8_t *high = low + total * 2;
        int k = 0;

        memset(low, 0, nbytes);
        for (run = 0; run < de->len; run++) {
            ci = &unicode_db[de->code + run];
            for (pos = 0; pos < de->c_len; pos++) {
                c = (ci->decomp_len == 0) ? 0 : ci->decomp_data[pos];
                low[k * 2] = c;
                low[k * 2 + 1] = c >> 8;
                high[k / 4] |= (c >> 16) << ((k % 4) * 2);
                k++;
            }
        }
        idx += nbytes;
    } else if (type <= DECOMP_TYPE_S5) {
        /* one short code per char */
        for (run = 0; run < de->len; run++) {
            ci = &unicode_db[de->code + run];
            for (pos = 0; pos < de->c_len; pos++) {
                c = (ci->decomp_len == 0) ? 0 : ci->decomp_data[pos];
                c = get_short_code(c);
                assert(c >= 0);
                data_buf[idx++] = c;
            }
        }
    } else if (type <= DECOMP_TYPE_I4_2) {
        /* only the decomposition of the first code point is stored */
        ci = &unicode_db[de->code];
        assert(ci->decomp_len == de->c_len);
        for (pos = 0; pos < de->c_len; pos++)
            put16(data_buf, &idx, ci->decomp_data[pos]);
    } else if (type <= DECOMP_TYPE_B18) {
        /* 16 bit base, then one offset byte per char; 0xff stands for
           a space */
        put16(data_buf, &idx, de->c_min);
        for (run = 0; run < de->len; run++) {
            ci = &unicode_db[de->code + run];
            for (pos = 0; pos < de->c_len; pos++) {
                assert(ci->decomp_len == de->c_len);
                c = ci->decomp_data[pos];
                if (c == 0x20) {
                    c = 0xff;
                } else {
                    c -= de->c_min;
                    assert((uint32_t)c <= 254);
                }
                data_buf[idx++] = c;
            }
        }
    } else if (type <= DECOMP_TYPE_LS2) {
        /* 16 bit char followed by a short code */
        assert(de->c_len == 2);
        for (run = 0; run < de->len; run++) {
            ci = &unicode_db[de->code + run];
            c = (ci->decomp_len == 0) ? 0 : ci->decomp_data[0];
            put16(data_buf, &idx, c);

            c = (ci->decomp_len == 0) ? 0 : ci->decomp_data[1];
            c = get_short_code(c);
            assert(c >= 0);
            data_buf[idx++] = c;
        }
    } else if (type <= DECOMP_TYPE_PAT3) {
        /* first and last char stored once, then each middle char */
        ci = &unicode_db[de->code];
        assert(ci->decomp_len == 3);
        put16(data_buf, &idx, ci->decomp_data[0]);
        put16(data_buf, &idx, ci->decomp_data[2]);
        for (run = 0; run < de->len; run++) {
            ci = &unicode_db[de->code + run];
            assert(ci->decomp_len == 3);
            put16(data_buf, &idx, ci->decomp_data[1]);
        }
    } else if (type <= DECOMP_TYPE_S2_UL) {
        /* only the first code point of each pair is stored */
        for (run = 0; run < de->len; run += 2) {
            ci = &unicode_db[de->code + run];
            c = ci->decomp_data[0];
            c = get_short_code(c);
            assert(c >= 0);
            data_buf[idx++] = c;
            c = ci->decomp_data[1];
            c = get_short_code(c);
            assert(c >= 0);
            data_buf[idx++] = c;
        }
    } else if (type <= DECOMP_TYPE_LS2_UL) {
        /* same as above with a 16 bit first char */
        for (run = 0; run < de->len; run += 2) {
            ci = &unicode_db[de->code + run];
            c = ci->decomp_data[0];
            put16(data_buf, &idx, c);
            c = ci->decomp_data[1];
            c = get_short_code(c);
            assert(c >= 0);
            data_buf[idx++] = c;
        }
    } else {
        abort();
    }
    *pidx = idx;
}
|
2748
|
+
|
2749
|
+
#if 0
|
2750
|
+
void dump_large_char(void)
|
2751
|
+
{
|
2752
|
+
int i, j;
|
2753
|
+
for(i = 0; i <= CHARCODE_MAX; i++) {
|
2754
|
+
CCInfo *ci = &unicode_db[i];
|
2755
|
+
for(j = 0; j < ci->decomp_len; j++) {
|
2756
|
+
if (ci->decomp_data[j] > 0xffff)
|
2757
|
+
printf("%05x\n", ci->decomp_data[j]);
|
2758
|
+
}
|
2759
|
+
}
|
2760
|
+
}
|
2761
|
+
#endif
|
2762
|
+
|
2763
|
+
void build_compose_table(FILE *f, const DecompEntry *tab_de);
|
2764
|
+
|
2765
|
+
void build_decompose_table(FILE *f)
|
2766
|
+
{
|
2767
|
+
int i, array_len, code_max, data_len, count;
|
2768
|
+
DecompEntry *tab_de, de_s, *de = &de_s;
|
2769
|
+
uint8_t *data_buf;
|
2770
|
+
|
2771
|
+
code_max = CHARCODE_MAX;
|
2772
|
+
|
2773
|
+
tab_de = mallocz((code_max + 2) * sizeof(*tab_de));
|
2774
|
+
|
2775
|
+
for(i = code_max; i >= 0; i--) {
|
2776
|
+
find_decomp_run(tab_de, i);
|
2777
|
+
}
|
2778
|
+
|
2779
|
+
/* build the data buffer */
|
2780
|
+
data_buf = malloc(100000);
|
2781
|
+
data_len = 0;
|
2782
|
+
array_len = 0;
|
2783
|
+
for(i = 0; i <= code_max; i++) {
|
2784
|
+
de = &tab_de[i];
|
2785
|
+
if (de->len != 0) {
|
2786
|
+
add_decomp_data(data_buf, &data_len, de);
|
2787
|
+
i += de->len - 1;
|
2788
|
+
array_len++;
|
2789
|
+
}
|
2790
|
+
}
|
2791
|
+
|
2792
|
+
#ifdef DUMP_DECOMP_TABLE
|
2793
|
+
/* dump */
|
2794
|
+
{
|
2795
|
+
int size, size1;
|
2796
|
+
|
2797
|
+
printf("START LEN TYPE L C SIZE\n");
|
2798
|
+
size = 0;
|
2799
|
+
for(i = 0; i <= code_max; i++) {
|
2800
|
+
de = &tab_de[i];
|
2801
|
+
if (de->len != 0) {
|
2802
|
+
size1 = get_decomp_run_size(de);
|
2803
|
+
printf("%05x %3d %6s %2d %1d %4d\n", i, de->len,
|
2804
|
+
decomp_type_str[de->type], de->c_len,
|
2805
|
+
unicode_db[i].is_compat, size1);
|
2806
|
+
i += de->len - 1;
|
2807
|
+
size += size1;
|
2808
|
+
}
|
2809
|
+
}
|
2810
|
+
|
2811
|
+
printf("array_len=%d estimated size=%d bytes actual=%d bytes\n",
|
2812
|
+
array_len, size, array_len * 6 + data_len);
|
2813
|
+
}
|
2814
|
+
#endif
|
2815
|
+
|
2816
|
+
total_tables++;
|
2817
|
+
total_table_bytes += array_len * sizeof(uint32_t);
|
2818
|
+
fprintf(f, "static const uint32_t unicode_decomp_table1[%d] = {", array_len);
|
2819
|
+
count = 0;
|
2820
|
+
for(i = 0; i <= code_max; i++) {
|
2821
|
+
de = &tab_de[i];
|
2822
|
+
if (de->len != 0) {
|
2823
|
+
uint32_t v;
|
2824
|
+
if (count++ % 4 == 0)
|
2825
|
+
fprintf(f, "\n ");
|
2826
|
+
v = (de->code << (32 - 18)) |
|
2827
|
+
(de->len << (32 - 18 - 7)) |
|
2828
|
+
(de->type << (32 - 18 - 7 - 6)) |
|
2829
|
+
unicode_db[de->code].is_compat;
|
2830
|
+
fprintf(f, " 0x%08x,", v);
|
2831
|
+
i += de->len - 1;
|
2832
|
+
}
|
2833
|
+
}
|
2834
|
+
fprintf(f, "\n};\n\n");
|
2835
|
+
|
2836
|
+
total_tables++;
|
2837
|
+
total_table_bytes += array_len * sizeof(uint16_t);
|
2838
|
+
fprintf(f, "static const uint16_t unicode_decomp_table2[%d] = {", array_len);
|
2839
|
+
count = 0;
|
2840
|
+
for(i = 0; i <= code_max; i++) {
|
2841
|
+
de = &tab_de[i];
|
2842
|
+
if (de->len != 0) {
|
2843
|
+
if (count++ % 8 == 0)
|
2844
|
+
fprintf(f, "\n ");
|
2845
|
+
fprintf(f, " 0x%04x,", de->data_index);
|
2846
|
+
i += de->len - 1;
|
2847
|
+
}
|
2848
|
+
}
|
2849
|
+
fprintf(f, "\n};\n\n");
|
2850
|
+
|
2851
|
+
total_tables++;
|
2852
|
+
total_table_bytes += data_len;
|
2853
|
+
fprintf(f, "static const uint8_t unicode_decomp_data[%d] = {", data_len);
|
2854
|
+
for(i = 0; i < data_len; i++) {
|
2855
|
+
if (i % 8 == 0)
|
2856
|
+
fprintf(f, "\n ");
|
2857
|
+
fprintf(f, " 0x%02x,", data_buf[i]);
|
2858
|
+
}
|
2859
|
+
fprintf(f, "\n};\n\n");
|
2860
|
+
|
2861
|
+
build_compose_table(f, tab_de);
|
2862
|
+
|
2863
|
+
free(data_buf);
|
2864
|
+
|
2865
|
+
free(tab_de);
|
2866
|
+
}
|
2867
|
+
|
2868
|
+
/* One composition: the pair of chars c[0], c[1] composes to 'p' */
typedef struct {
    uint32_t c[2];
    uint32_t p;
} ComposeEntry;

#define COMPOSE_LEN_MAX 10000

/* qsort() comparison of two ComposeEntry: order by c[0], then by c[1];
   'p' is not compared. */
static int ce_cmp(const void *p1, const void *p2)
{
    const ComposeEntry *a = p1;
    const ComposeEntry *b = p2;

    if (a->c[0] != b->c[0])
        return a->c[0] < b->c[0] ? -1 : 1;
    if (a->c[1] != b->c[1])
        return a->c[1] < b->c[1] ? -1 : 1;
    return 0;
}
|
2889
|
+
|
2890
|
+
|
2891
|
+
static int get_decomp_pos(const DecompEntry *tab_de, int c)
|
2892
|
+
{
|
2893
|
+
int i, v, k;
|
2894
|
+
const DecompEntry *de;
|
2895
|
+
|
2896
|
+
k = 0;
|
2897
|
+
for(i = 0; i <= CHARCODE_MAX; i++) {
|
2898
|
+
de = &tab_de[i];
|
2899
|
+
if (de->len != 0) {
|
2900
|
+
if (c >= de->code && c < de->code + de->len) {
|
2901
|
+
v = c - de->code;
|
2902
|
+
assert(v < 64);
|
2903
|
+
v |= k << 6;
|
2904
|
+
assert(v < 65536);
|
2905
|
+
return v;
|
2906
|
+
}
|
2907
|
+
i += de->len - 1;
|
2908
|
+
k++;
|
2909
|
+
}
|
2910
|
+
}
|
2911
|
+
return -1;
|
2912
|
+
}
|
2913
|
+
|
2914
|
+
void build_compose_table(FILE *f, const DecompEntry *tab_de)
|
2915
|
+
{
|
2916
|
+
int i, v, tab_ce_len;
|
2917
|
+
ComposeEntry *ce, *tab_ce;
|
2918
|
+
|
2919
|
+
tab_ce = malloc(sizeof(*tab_ce) * COMPOSE_LEN_MAX);
|
2920
|
+
tab_ce_len = 0;
|
2921
|
+
for(i = 0; i <= CHARCODE_MAX; i++) {
|
2922
|
+
CCInfo *ci = &unicode_db[i];
|
2923
|
+
if (ci->decomp_len == 2 && !ci->is_compat &&
|
2924
|
+
!ci->is_excluded) {
|
2925
|
+
assert(tab_ce_len < COMPOSE_LEN_MAX);
|
2926
|
+
ce = &tab_ce[tab_ce_len++];
|
2927
|
+
ce->c[0] = ci->decomp_data[0];
|
2928
|
+
ce->c[1] = ci->decomp_data[1];
|
2929
|
+
ce->p = i;
|
2930
|
+
}
|
2931
|
+
}
|
2932
|
+
qsort(tab_ce, tab_ce_len, sizeof(*tab_ce), ce_cmp);
|
2933
|
+
|
2934
|
+
#if 0
|
2935
|
+
{
|
2936
|
+
printf("tab_ce_len=%d\n", tab_ce_len);
|
2937
|
+
for(i = 0; i < tab_ce_len; i++) {
|
2938
|
+
ce = &tab_ce[i];
|
2939
|
+
printf("%05x %05x %05x\n", ce->c[0], ce->c[1], ce->p);
|
2940
|
+
}
|
2941
|
+
}
|
2942
|
+
#endif
|
2943
|
+
|
2944
|
+
total_tables++;
|
2945
|
+
total_table_bytes += tab_ce_len * sizeof(uint16_t);
|
2946
|
+
fprintf(f, "static const uint16_t unicode_comp_table[%u] = {", tab_ce_len);
|
2947
|
+
for(i = 0; i < tab_ce_len; i++) {
|
2948
|
+
if (i % 8 == 0)
|
2949
|
+
fprintf(f, "\n ");
|
2950
|
+
v = get_decomp_pos(tab_de, tab_ce[i].p);
|
2951
|
+
if (v < 0) {
|
2952
|
+
printf("ERROR: entry for c=%04x not found\n",
|
2953
|
+
tab_ce[i].p);
|
2954
|
+
exit(1);
|
2955
|
+
}
|
2956
|
+
fprintf(f, " 0x%04x,", v);
|
2957
|
+
}
|
2958
|
+
fprintf(f, "\n};\n\n");
|
2959
|
+
|
2960
|
+
free(tab_ce);
|
2961
|
+
}
|
2962
|
+
|
2963
|
+
#ifdef USE_TEST
|
2964
|
+
void check_decompose_table(void)
|
2965
|
+
{
|
2966
|
+
int c;
|
2967
|
+
CCInfo *ci;
|
2968
|
+
int res[UNICODE_DECOMP_LEN_MAX], *ref;
|
2969
|
+
int len, ref_len, is_compat;
|
2970
|
+
|
2971
|
+
for(is_compat = 0; is_compat <= 1; is_compat++) {
|
2972
|
+
for(c = 0; c < CHARCODE_MAX; c++) {
|
2973
|
+
ci = &unicode_db[c];
|
2974
|
+
ref_len = ci->decomp_len;
|
2975
|
+
ref = ci->decomp_data;
|
2976
|
+
if (!is_compat && ci->is_compat) {
|
2977
|
+
ref_len = 0;
|
2978
|
+
}
|
2979
|
+
len = unicode_decomp_char((uint32_t *)res, c, is_compat);
|
2980
|
+
if (len != ref_len ||
|
2981
|
+
tabcmp(res, ref, ref_len) != 0) {
|
2982
|
+
printf("ERROR c=%05x compat=%d\n", c, is_compat);
|
2983
|
+
dump_str("res", res, len);
|
2984
|
+
dump_str("ref", ref, ref_len);
|
2985
|
+
exit(1);
|
2986
|
+
}
|
2987
|
+
}
|
2988
|
+
}
|
2989
|
+
}
|
2990
|
+
|
2991
|
+
void check_compose_table(void)
|
2992
|
+
{
|
2993
|
+
int i, p;
|
2994
|
+
/* XXX: we don't test all the cases */
|
2995
|
+
|
2996
|
+
for(i = 0; i <= CHARCODE_MAX; i++) {
|
2997
|
+
CCInfo *ci = &unicode_db[i];
|
2998
|
+
if (ci->decomp_len == 2 && !ci->is_compat &&
|
2999
|
+
!ci->is_excluded) {
|
3000
|
+
p = unicode_compose_pair(ci->decomp_data[0], ci->decomp_data[1]);
|
3001
|
+
if (p != i) {
|
3002
|
+
printf("ERROR compose: c=%05x %05x -> %05x ref=%05x\n",
|
3003
|
+
ci->decomp_data[0], ci->decomp_data[1], p, i);
|
3004
|
+
exit(1);
|
3005
|
+
}
|
3006
|
+
}
|
3007
|
+
}
|
3008
|
+
|
3009
|
+
|
3010
|
+
|
3011
|
+
}
|
3012
|
+
|
3013
|
+
#endif
|
3014
|
+
|
3015
|
+
|
3016
|
+
|
3017
|
+
#ifdef USE_TEST
|
3018
|
+
|
3019
|
+
/*
 * Compare the sequence (buf1, len1) with the reference (buf2, len2).
 * Nothing happens when they have the same length and tabcmp() reports
 * no difference. Otherwise an error tagged with 'num' and 'msg' is
 * printed, the input (in_buf, in_len), the result and the reference
 * are dumped, and the program exits with status 1.
 */
void check_str(const char *msg, int num, const int *in_buf, int in_len,
               const int *buf1, int len1,
               const int *buf2, int len2)
{
    /* the lengths are compared first so that tabcmp() only sees
       sequences of equal length */
    if (len1 == len2 && tabcmp(buf1, buf2, len1) == 0)
        return;

    printf("%d: ERROR %s:\n", num, msg);
    dump_str(" in", in_buf, in_len);
    dump_str("res", buf1, len1);
    dump_str("ref", buf2, len2);
    exit(1);
}
|
3031
|
+
|
3032
|
+
void check_cc_table(void)
|
3033
|
+
{
|
3034
|
+
int cc, cc_ref, c;
|
3035
|
+
|
3036
|
+
for(c = 0; c <= CHARCODE_MAX; c++) {
|
3037
|
+
cc_ref = unicode_db[c].combining_class;
|
3038
|
+
cc = unicode_get_cc(c);
|
3039
|
+
if (cc != cc_ref) {
|
3040
|
+
printf("ERROR: c=%04x cc=%d cc_ref=%d\n",
|
3041
|
+
c, cc, cc_ref);
|
3042
|
+
exit(1);
|
3043
|
+
}
|
3044
|
+
}
|
3045
|
+
#ifdef PROFILE
|
3046
|
+
{
|
3047
|
+
int64_t ti, count;
|
3048
|
+
|
3049
|
+
ti = get_time_ns();
|
3050
|
+
count = 0;
|
3051
|
+
/* only do it on meaningful chars */
|
3052
|
+
for(c = 0x20; c <= 0xffff; c++) {
|
3053
|
+
cc_ref = unicode_db[c].combining_class;
|
3054
|
+
cc = unicode_get_cc(c);
|
3055
|
+
count++;
|
3056
|
+
}
|
3057
|
+
ti = get_time_ns() - ti;
|
3058
|
+
printf("cc time=%0.1f ns/char\n",
|
3059
|
+
(double)ti / count);
|
3060
|
+
}
|
3061
|
+
#endif
|
3062
|
+
}
|
3063
|
+
|
3064
|
+
void normalization_test(const char *filename)
|
3065
|
+
{
|
3066
|
+
FILE *f;
|
3067
|
+
char line[4096], *p;
|
3068
|
+
int *in_str, *nfc_str, *nfd_str, *nfkc_str, *nfkd_str;
|
3069
|
+
int in_len, nfc_len, nfd_len, nfkc_len, nfkd_len;
|
3070
|
+
int *buf, buf_len, pos;
|
3071
|
+
|
3072
|
+
f = fopen(filename, "rb");
|
3073
|
+
if (!f) {
|
3074
|
+
perror(filename);
|
3075
|
+
exit(1);
|
3076
|
+
}
|
3077
|
+
pos = 0;
|
3078
|
+
for(;;) {
|
3079
|
+
if (!get_line(line, sizeof(line), f))
|
3080
|
+
break;
|
3081
|
+
pos++;
|
3082
|
+
p = line;
|
3083
|
+
while (isspace(*p))
|
3084
|
+
p++;
|
3085
|
+
if (*p == '#' || *p == '@')
|
3086
|
+
continue;
|
3087
|
+
in_str = get_field_str(&in_len, p, 0);
|
3088
|
+
nfc_str = get_field_str(&nfc_len, p, 1);
|
3089
|
+
nfd_str = get_field_str(&nfd_len, p, 2);
|
3090
|
+
nfkc_str = get_field_str(&nfkc_len, p, 3);
|
3091
|
+
nfkd_str = get_field_str(&nfkd_len, p, 4);
|
3092
|
+
|
3093
|
+
// dump_str("in", in_str, in_len);
|
3094
|
+
|
3095
|
+
buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFD, NULL, NULL);
|
3096
|
+
check_str("nfd", pos, in_str, in_len, buf, buf_len, nfd_str, nfd_len);
|
3097
|
+
free(buf);
|
3098
|
+
|
3099
|
+
buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKD, NULL, NULL);
|
3100
|
+
check_str("nfkd", pos, in_str, in_len, buf, buf_len, nfkd_str, nfkd_len);
|
3101
|
+
free(buf);
|
3102
|
+
|
3103
|
+
buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFC, NULL, NULL);
|
3104
|
+
check_str("nfc", pos, in_str, in_len, buf, buf_len, nfc_str, nfc_len);
|
3105
|
+
free(buf);
|
3106
|
+
|
3107
|
+
buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKC, NULL, NULL);
|
3108
|
+
check_str("nfkc", pos, in_str, in_len, buf, buf_len, nfkc_str, nfkc_len);
|
3109
|
+
free(buf);
|
3110
|
+
|
3111
|
+
free(in_str);
|
3112
|
+
free(nfc_str);
|
3113
|
+
free(nfd_str);
|
3114
|
+
free(nfkc_str);
|
3115
|
+
free(nfkd_str);
|
3116
|
+
}
|
3117
|
+
fclose(f);
|
3118
|
+
}
|
3119
|
+
#endif
|
3120
|
+
|
3121
|
+
int main(int argc, char *argv[])
|
3122
|
+
{
|
3123
|
+
const char *unicode_db_path, *outfilename;
|
3124
|
+
char filename[1024];
|
3125
|
+
int arg = 1;
|
3126
|
+
|
3127
|
+
if (arg >= argc || (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "--help"))) {
|
3128
|
+
printf("usage: %s PATH [OUTPUT]\n"
|
3129
|
+
" PATH path to the Unicode database directory\n"
|
3130
|
+
" OUTPUT name of the output file. If omitted, a self test is performed\n"
|
3131
|
+
" using the files from the Unicode library\n"
|
3132
|
+
, argv[0]);
|
3133
|
+
return 1;
|
3134
|
+
}
|
3135
|
+
unicode_db_path = argv[arg++];
|
3136
|
+
outfilename = NULL;
|
3137
|
+
if (arg < argc)
|
3138
|
+
outfilename = argv[arg++];
|
3139
|
+
|
3140
|
+
unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1));
|
3141
|
+
|
3142
|
+
snprintf(filename, sizeof(filename), "%s/UnicodeData.txt", unicode_db_path);
|
3143
|
+
|
3144
|
+
parse_unicode_data(filename);
|
3145
|
+
|
3146
|
+
snprintf(filename, sizeof(filename), "%s/SpecialCasing.txt", unicode_db_path);
|
3147
|
+
parse_special_casing(unicode_db, filename);
|
3148
|
+
|
3149
|
+
snprintf(filename, sizeof(filename), "%s/CaseFolding.txt", unicode_db_path);
|
3150
|
+
parse_case_folding(unicode_db, filename);
|
3151
|
+
|
3152
|
+
snprintf(filename, sizeof(filename), "%s/CompositionExclusions.txt", unicode_db_path);
|
3153
|
+
parse_composition_exclusions(filename);
|
3154
|
+
|
3155
|
+
snprintf(filename, sizeof(filename), "%s/DerivedCoreProperties.txt", unicode_db_path);
|
3156
|
+
parse_derived_core_properties(filename);
|
3157
|
+
|
3158
|
+
snprintf(filename, sizeof(filename), "%s/DerivedNormalizationProps.txt", unicode_db_path);
|
3159
|
+
parse_derived_norm_properties(filename);
|
3160
|
+
|
3161
|
+
snprintf(filename, sizeof(filename), "%s/PropList.txt", unicode_db_path);
|
3162
|
+
parse_prop_list(filename);
|
3163
|
+
|
3164
|
+
snprintf(filename, sizeof(filename), "%s/Scripts.txt", unicode_db_path);
|
3165
|
+
parse_scripts(filename);
|
3166
|
+
|
3167
|
+
snprintf(filename, sizeof(filename), "%s/ScriptExtensions.txt",
|
3168
|
+
unicode_db_path);
|
3169
|
+
parse_script_extensions(filename);
|
3170
|
+
|
3171
|
+
snprintf(filename, sizeof(filename), "%s/emoji-data.txt",
|
3172
|
+
unicode_db_path);
|
3173
|
+
parse_prop_list(filename);
|
3174
|
+
|
3175
|
+
// dump_unicode_data(unicode_db);
|
3176
|
+
build_conv_table(unicode_db);
|
3177
|
+
|
3178
|
+
#ifdef DUMP_CASE_FOLDING_SPECIAL_CASES
|
3179
|
+
dump_case_folding_special_cases(unicode_db);
|
3180
|
+
#endif
|
3181
|
+
|
3182
|
+
if (!outfilename) {
|
3183
|
+
#ifdef USE_TEST
|
3184
|
+
check_case_conv();
|
3185
|
+
check_flags();
|
3186
|
+
check_decompose_table();
|
3187
|
+
check_compose_table();
|
3188
|
+
check_cc_table();
|
3189
|
+
snprintf(filename, sizeof(filename), "%s/NormalizationTest.txt", unicode_db_path);
|
3190
|
+
normalization_test(filename);
|
3191
|
+
#else
|
3192
|
+
fprintf(stderr, "Tests are not compiled\n");
|
3193
|
+
exit(1);
|
3194
|
+
#endif
|
3195
|
+
} else
|
3196
|
+
{
|
3197
|
+
FILE *fo = fopen(outfilename, "wb");
|
3198
|
+
|
3199
|
+
if (!fo) {
|
3200
|
+
perror(outfilename);
|
3201
|
+
exit(1);
|
3202
|
+
}
|
3203
|
+
fprintf(fo,
|
3204
|
+
"/* Compressed unicode tables */\n"
|
3205
|
+
"/* Automatically generated file - do not edit */\n"
|
3206
|
+
"\n"
|
3207
|
+
"#include <stdint.h>\n"
|
3208
|
+
"\n");
|
3209
|
+
dump_case_conv_table(fo);
|
3210
|
+
compute_internal_props();
|
3211
|
+
build_flags_tables(fo);
|
3212
|
+
fprintf(fo, "#ifdef CONFIG_ALL_UNICODE\n\n");
|
3213
|
+
build_cc_table(fo);
|
3214
|
+
build_decompose_table(fo);
|
3215
|
+
build_general_category_table(fo);
|
3216
|
+
build_script_table(fo);
|
3217
|
+
build_script_ext_table(fo);
|
3218
|
+
build_prop_list_table(fo);
|
3219
|
+
fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n");
|
3220
|
+
fprintf(fo, "/* %u tables / %u bytes, %u index / %u bytes */\n",
|
3221
|
+
total_tables, total_table_bytes, total_index, total_index_bytes);
|
3222
|
+
fclose(fo);
|
3223
|
+
}
|
3224
|
+
return 0;
|
3225
|
+
}
|