quickjs 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/quickjsrb/quickjs/cutils.c +2 -0
- data/ext/quickjsrb/quickjs/cutils.h +56 -0
- data/ext/quickjsrb/quickjs/dtoa.c +5 -11
- data/ext/quickjsrb/quickjs/libregexp-opcode.h +11 -1
- data/ext/quickjsrb/quickjs/libregexp.c +883 -132
- data/ext/quickjsrb/quickjs/libregexp.h +1 -0
- data/ext/quickjsrb/quickjs/libunicode-table.h +2211 -1619
- data/ext/quickjsrb/quickjs/libunicode.c +224 -11
- data/ext/quickjsrb/quickjs/libunicode.h +9 -5
- data/ext/quickjsrb/quickjs/qjs.c +48 -9
- data/ext/quickjsrb/quickjs/qjsc.c +216 -73
- data/ext/quickjsrb/quickjs/quickjs-atom.h +14 -0
- data/ext/quickjsrb/quickjs/quickjs-libc.c +460 -174
- data/ext/quickjsrb/quickjs/quickjs-libc.h +7 -1
- data/ext/quickjsrb/quickjs/quickjs-opcode.h +5 -4
- data/ext/quickjsrb/quickjs/quickjs.c +4503 -1614
- data/ext/quickjsrb/quickjs/quickjs.h +82 -15
- data/ext/quickjsrb/quickjs/run-test262.c +119 -33
- data/ext/quickjsrb/quickjs/unicode_gen.c +560 -6
- data/ext/quickjsrb/quickjs/unicode_gen_def.h +27 -0
- data/ext/quickjsrb/quickjsrb.c +1 -1
- data/lib/quickjs/version.rb +1 -1
- metadata +2 -2
@@ -156,6 +156,153 @@ char *get_line(char *buf, int buf_size, FILE *f)
|
|
156
156
|
return buf;
|
157
157
|
}
|
158
158
|
|
159
|
+
/* One stored string of 32 bit elements. Entries falling into the same
   hash bucket are chained through 'next'. */
typedef struct REString {
    struct REString *next; /* next entry of the same bucket, NULL at the end */
    uint32_t hash;         /* full 32 bit hash of buf[0 .. len - 1] */
    uint32_t len;          /* number of elements in buf[] */
    uint32_t flags;        /* cleared when the entry is created */
    uint32_t buf[];        /* the string contents */
} REString;
|
166
|
+
|
167
|
+
typedef struct {
|
168
|
+
uint32_t n_strings;
|
169
|
+
uint32_t hash_size;
|
170
|
+
int hash_bits;
|
171
|
+
REString **hash_table;
|
172
|
+
} REStringList;
|
173
|
+
|
174
|
+
/* Return the 32 bit hash of the 'len' elements of 'buf'. The accumulator
   starts at 1, is multiplied by 263 before each element is added and is
   finally multiplied by 0x61C88647. 'buf' is not read when len <= 0. */
static uint32_t re_string_hash(int len, const uint32_t *buf)
{
    uint32_t acc = 1;
    int pos;

    for (pos = 0; pos < len; pos++) {
        acc *= 263;
        acc += buf[pos];
    }
    acc *= 0x61C88647;
    return acc;
}
|
183
|
+
|
184
|
+
/* Put 's' in the empty state: no bucket array and no stored string. */
static void re_string_list_init(REStringList *s)
{
    /* no table is allocated until the first insertion */
    s->hash_table = NULL;
    s->hash_bits = 0;
    s->hash_size = 0;
    s->n_strings = 0;
}
|
191
|
+
|
192
|
+
/* Free every string of 's' and its bucket array. On return the list is in
   the same empty state as after re_string_list_init(), so it can be reused
   and freeing it a second time is harmless. */
static __maybe_unused void re_string_list_free(REStringList *s)
{
    REString *p, *p_next;
    uint32_t i; /* same type as hash_size: no signed/unsigned comparison */

    for(i = 0; i < s->hash_size; i++) {
        for(p = s->hash_table[i]; p != NULL; p = p_next) {
            /* read the link before the node is released */
            p_next = p->next;
            free(p);
        }
    }
    free(s->hash_table);

    /* do not leave dangling pointers or stale counters behind */
    s->hash_table = NULL;
    s->hash_size = 0;
    s->hash_bits = 0;
    s->n_strings = 0;
}
|
204
|
+
|
205
|
+
/* Print the character 'c' on stdout. Quote and backslash are always
   escaped with a backslash; '-' and ']' are also escaped when 'is_range'
   is set. Other characters between ' ' and 126 are printed as is and
   everything else is printed as \u{xxxx}. */
static void lre_print_char(int c, BOOL is_range)
{
    BOOL need_backslash;

    need_backslash = (c == '\'' || c == '\\');
    if (!need_backslash && is_range)
        need_backslash = (c == '-' || c == ']');

    if (need_backslash) {
        printf("\\%c", c);
        return;
    }
    if (c < ' ' || c > 126) {
        printf("\\u{%04x}", c);
        return;
    }
    printf("%c", c);
}
|
216
|
+
|
217
|
+
static __maybe_unused void re_string_list_dump(const char *str, const REStringList *s)
|
218
|
+
{
|
219
|
+
REString *p;
|
220
|
+
int i, j, k;
|
221
|
+
|
222
|
+
printf("%s:\n", str);
|
223
|
+
|
224
|
+
j = 0;
|
225
|
+
for(i = 0; i < s->hash_size; i++) {
|
226
|
+
for(p = s->hash_table[i]; p != NULL; p = p->next) {
|
227
|
+
printf(" %d/%d: '", j, s->n_strings);
|
228
|
+
for(k = 0; k < p->len; k++) {
|
229
|
+
lre_print_char(p->buf[k], FALSE);
|
230
|
+
}
|
231
|
+
printf("'\n");
|
232
|
+
j++;
|
233
|
+
}
|
234
|
+
}
|
235
|
+
}
|
236
|
+
|
237
|
+
/* Look up the string buf[0 .. len - 1], whose hash is 'h0', in 's'.
   Return the existing entry if there is one. Otherwise return NULL when
   'add_flag' is false, or insert a copy of the string (with flags = 0) and
   return the new entry when 'add_flag' is true. NULL is also returned if
   a memory allocation fails. */
static REString *re_string_find2(REStringList *s, int len, const uint32_t *buf,
                                 uint32_t h0, BOOL add_flag)
{
    uint32_t h = 0; /* avoid warning */
    REString *p;

    /* The bucket index must be computed as soon as a table exists, even
       when it holds no string yet (a previous insertion may have
       allocated the table and then failed to allocate its entry),
       otherwise the new entry would be linked into bucket 0 whatever its
       hash is. */
    if (s->hash_size != 0) {
        h = h0 >> (32 - s->hash_bits);
        for(p = s->hash_table[h]; p != NULL; p = p->next) {
            if (p->hash == h0 && p->len == (uint32_t)len &&
                !memcmp(p->buf, buf, len * sizeof(buf[0]))) {
                return p;
            }
        }
    }
    /* not found */
    if (!add_flag)
        return NULL;
    /* increase the size of the hash table if needed */
    if (unlikely((s->n_strings + 1) > s->hash_size)) {
        REString **new_hash_table, *p_next;
        int new_hash_bits;
        uint32_t new_hash_size, i;

        /* the table size doubles each time and starts at 16 buckets */
        new_hash_bits = max_int(s->hash_bits + 1, 4);
        new_hash_size = (uint32_t)1 << new_hash_bits;
        new_hash_table = calloc(new_hash_size, sizeof(new_hash_table[0]));
        if (!new_hash_table)
            return NULL;
        /* relink every entry into the bucket given by the new index width */
        for(i = 0; i < s->hash_size; i++) {
            for(p = s->hash_table[i]; p != NULL; p = p_next) {
                p_next = p->next;
                h = p->hash >> (32 - new_hash_bits);
                p->next = new_hash_table[h];
                new_hash_table[h] = p;
            }
        }
        free(s->hash_table);
        s->hash_bits = new_hash_bits;
        s->hash_size = new_hash_size;
        s->hash_table = new_hash_table;
        /* the bucket of the new string changed with the table size */
        h = h0 >> (32 - s->hash_bits);
    }

    p = malloc(sizeof(REString) + len * sizeof(buf[0]));
    if (!p)
        return NULL;
    /* insert at the head of the bucket */
    p->next = s->hash_table[h];
    s->hash_table[h] = p;
    s->n_strings++;
    p->hash = h0;
    p->len = len;
    p->flags = 0;
    memcpy(p->buf, buf, sizeof(buf[0]) * len);
    return p;
}
|
292
|
+
|
293
|
+
static REString *re_string_find(REStringList *s, int len, const uint32_t *buf,
|
294
|
+
BOOL add_flag)
|
295
|
+
{
|
296
|
+
uint32_t h0;
|
297
|
+
h0 = re_string_hash(len, buf);
|
298
|
+
return re_string_find2(s, len, buf, h0, add_flag);
|
299
|
+
}
|
300
|
+
|
301
|
+
/* Add the string buf[0 .. len - 1] to 's' if it is not already present.
   With add_flag set, re_string_find() only returns NULL when a memory
   allocation failed; the program is then stopped instead of silently
   losing the string. */
static void re_string_add(REStringList *s, int len, const uint32_t *buf)
{
    if (!re_string_find(s, len, buf, TRUE)) {
        fprintf(stderr, "re_string_add: out of memory\n");
        exit(1);
    }
}
|
305
|
+
|
159
306
|
#define UNICODE_GENERAL_CATEGORY
|
160
307
|
|
161
308
|
typedef enum {
|
@@ -225,6 +372,23 @@ static const char *unicode_prop_short_name[] = {
|
|
225
372
|
|
226
373
|
#undef UNICODE_PROP_LIST
|
227
374
|
|
375
|
+
#define UNICODE_SEQUENCE_PROP_LIST
|
376
|
+
|
377
|
+
/* One SEQUENCE_PROP_xxx constant per DEF() entry expanded from
   "unicode_gen_def.h", followed by the number of entries. */
typedef enum {
#define DEF(name) SEQUENCE_PROP_ ## name,
#include "unicode_gen_def.h"
#undef DEF
    SEQUENCE_PROP_COUNT,
} UnicodeSequencePropEnum1;
|
383
|
+
|
384
|
+
/* Names of the sequence properties: each DEF() entry expanded from
   "unicode_gen_def.h" is stringified, so the array index of a name is the
   value of the corresponding SEQUENCE_PROP_xxx constant. */
static const char *unicode_sequence_prop_name[] = {
#define DEF(name) #name,
#include "unicode_gen_def.h"
#undef DEF
};
|
389
|
+
|
390
|
+
#undef UNICODE_SEQUENCE_PROP_LIST
|
391
|
+
|
228
392
|
typedef struct {
|
229
393
|
/* case conv */
|
230
394
|
uint8_t u_len;
|
@@ -247,7 +411,15 @@ typedef struct {
|
|
247
411
|
int *decomp_data;
|
248
412
|
} CCInfo;
|
249
413
|
|
414
|
+
/* NOTE(review): no use of this type is visible in this part of the file;
   the field comments below are inferred from the names only - confirm. */
typedef struct {
    int count; /* presumably the number of used entries of tab */
    int size;  /* presumably the allocated capacity of tab */
    int *tab;
} UnicodeSequenceProperties;
|
419
|
+
|
250
420
|
CCInfo *unicode_db;
|
421
|
+
REStringList rgi_emoji_zwj_sequence;
|
422
|
+
DynBuf rgi_emoji_tag_sequence;
|
251
423
|
|
252
424
|
int find_name(const char **tab, int tab_len, const char *name)
|
253
425
|
{
|
@@ -625,7 +797,7 @@ void parse_derived_core_properties(const char *filename)
|
|
625
797
|
p++;
|
626
798
|
p += strspn(p, " \t");
|
627
799
|
q = buf;
|
628
|
-
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
|
800
|
+
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t' && *p != ';') {
|
629
801
|
if ((q - buf) < sizeof(buf) - 1)
|
630
802
|
*q++ = *p;
|
631
803
|
p++;
|
@@ -751,6 +923,147 @@ void parse_prop_list(const char *filename)
|
|
751
923
|
fclose(f);
|
752
924
|
}
|
753
925
|
|
926
|
+
#define SEQ_MAX_LEN 16
|
927
|
+
|
928
|
+
static BOOL is_emoji_modifier(uint32_t c)
|
929
|
+
{
|
930
|
+
return (c >= 0x1f3fb && c <= 0x1f3ff);
|
931
|
+
}
|
932
|
+
|
933
|
+
static void add_sequence_prop(int idx, int seq_len, int *seq)
|
934
|
+
{
|
935
|
+
int i;
|
936
|
+
|
937
|
+
assert(idx < SEQUENCE_PROP_COUNT);
|
938
|
+
switch(idx) {
|
939
|
+
case SEQUENCE_PROP_Basic_Emoji:
|
940
|
+
/* convert to 2 properties lists */
|
941
|
+
if (seq_len == 1) {
|
942
|
+
set_prop(seq[0], PROP_Basic_Emoji1, 1);
|
943
|
+
} else if (seq_len == 2 && seq[1] == 0xfe0f) {
|
944
|
+
set_prop(seq[0], PROP_Basic_Emoji2, 1);
|
945
|
+
} else {
|
946
|
+
abort();
|
947
|
+
}
|
948
|
+
break;
|
949
|
+
case SEQUENCE_PROP_RGI_Emoji_Modifier_Sequence:
|
950
|
+
assert(seq_len == 2);
|
951
|
+
assert(is_emoji_modifier(seq[1]));
|
952
|
+
assert(get_prop(seq[0], PROP_Emoji_Modifier_Base));
|
953
|
+
set_prop(seq[0], PROP_RGI_Emoji_Modifier_Sequence, 1);
|
954
|
+
break;
|
955
|
+
case SEQUENCE_PROP_RGI_Emoji_Flag_Sequence:
|
956
|
+
{
|
957
|
+
int code;
|
958
|
+
assert(seq_len == 2);
|
959
|
+
assert(seq[0] >= 0x1F1E6 && seq[0] <= 0x1F1FF);
|
960
|
+
assert(seq[1] >= 0x1F1E6 && seq[1] <= 0x1F1FF);
|
961
|
+
code = (seq[0] - 0x1F1E6) * 26 + (seq[1] - 0x1F1E6);
|
962
|
+
/* XXX: would be more compact with a simple bitmap -> 676 bits */
|
963
|
+
set_prop(code, PROP_RGI_Emoji_Flag_Sequence, 1);
|
964
|
+
}
|
965
|
+
break;
|
966
|
+
case SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence:
|
967
|
+
re_string_add(&rgi_emoji_zwj_sequence, seq_len, (uint32_t *)seq);
|
968
|
+
break;
|
969
|
+
case SEQUENCE_PROP_RGI_Emoji_Tag_Sequence:
|
970
|
+
{
|
971
|
+
assert(seq_len >= 3);
|
972
|
+
assert(seq[0] == 0x1F3F4);
|
973
|
+
assert(seq[seq_len - 1] == 0xE007F);
|
974
|
+
for(i = 1; i < seq_len - 1; i++) {
|
975
|
+
assert(seq[i] >= 0xe0001 && seq[i] <= 0xe007e);
|
976
|
+
dbuf_putc(&rgi_emoji_tag_sequence, seq[i] - 0xe0000);
|
977
|
+
}
|
978
|
+
dbuf_putc(&rgi_emoji_tag_sequence, 0);
|
979
|
+
}
|
980
|
+
break;
|
981
|
+
case SEQUENCE_PROP_Emoji_Keycap_Sequence:
|
982
|
+
assert(seq_len == 3);
|
983
|
+
assert(seq[1] == 0xfe0f);
|
984
|
+
assert(seq[2] == 0x20e3);
|
985
|
+
set_prop(seq[0], PROP_Emoji_Keycap_Sequence, 1);
|
986
|
+
break;
|
987
|
+
default:
|
988
|
+
assert(0);
|
989
|
+
}
|
990
|
+
}
|
991
|
+
|
992
|
+
void parse_sequence_prop_list(const char *filename)
|
993
|
+
{
|
994
|
+
FILE *f;
|
995
|
+
char line[4096], *p, buf[256], *q, *p_start;
|
996
|
+
uint32_t c0, c1, c;
|
997
|
+
int idx, seq_len;
|
998
|
+
int seq[SEQ_MAX_LEN];
|
999
|
+
|
1000
|
+
f = fopen(filename, "rb");
|
1001
|
+
if (!f) {
|
1002
|
+
perror(filename);
|
1003
|
+
exit(1);
|
1004
|
+
}
|
1005
|
+
|
1006
|
+
for(;;) {
|
1007
|
+
if (!get_line(line, sizeof(line), f))
|
1008
|
+
break;
|
1009
|
+
p = line;
|
1010
|
+
while (isspace(*p))
|
1011
|
+
p++;
|
1012
|
+
if (*p == '#' || *p == '@' || *p == '\0')
|
1013
|
+
continue;
|
1014
|
+
p_start = p;
|
1015
|
+
|
1016
|
+
/* find the sequence property name */
|
1017
|
+
p = strchr(p, ';');
|
1018
|
+
if (!p)
|
1019
|
+
continue;
|
1020
|
+
p++;
|
1021
|
+
p += strspn(p, " \t");
|
1022
|
+
q = buf;
|
1023
|
+
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t' && *p != ';') {
|
1024
|
+
if ((q - buf) < sizeof(buf) - 1)
|
1025
|
+
*q++ = *p;
|
1026
|
+
p++;
|
1027
|
+
}
|
1028
|
+
*q = '\0';
|
1029
|
+
idx = find_name(unicode_sequence_prop_name,
|
1030
|
+
countof(unicode_sequence_prop_name), buf);
|
1031
|
+
if (idx < 0) {
|
1032
|
+
fprintf(stderr, "Property not found: %s\n", buf);
|
1033
|
+
exit(1);
|
1034
|
+
}
|
1035
|
+
|
1036
|
+
p = p_start;
|
1037
|
+
c0 = strtoul(p, (char **)&p, 16);
|
1038
|
+
assert(c0 <= CHARCODE_MAX);
|
1039
|
+
|
1040
|
+
if (*p == '.' && p[1] == '.') {
|
1041
|
+
p += 2;
|
1042
|
+
c1 = strtoul(p, (char **)&p, 16);
|
1043
|
+
assert(c1 <= CHARCODE_MAX);
|
1044
|
+
for(c = c0; c <= c1; c++) {
|
1045
|
+
seq[0] = c;
|
1046
|
+
add_sequence_prop(idx, 1, seq);
|
1047
|
+
}
|
1048
|
+
} else {
|
1049
|
+
seq_len = 0;
|
1050
|
+
seq[seq_len++] = c0;
|
1051
|
+
for(;;) {
|
1052
|
+
while (isspace(*p))
|
1053
|
+
p++;
|
1054
|
+
if (*p == ';' || *p == '\0')
|
1055
|
+
break;
|
1056
|
+
c0 = strtoul(p, (char **)&p, 16);
|
1057
|
+
assert(c0 <= CHARCODE_MAX);
|
1058
|
+
assert(seq_len < countof(seq));
|
1059
|
+
seq[seq_len++] = c0;
|
1060
|
+
}
|
1061
|
+
add_sequence_prop(idx, seq_len, seq);
|
1062
|
+
}
|
1063
|
+
}
|
1064
|
+
fclose(f);
|
1065
|
+
}
|
1066
|
+
|
754
1067
|
void parse_scripts(const char *filename)
|
755
1068
|
{
|
756
1069
|
FILE *f;
|
@@ -1117,6 +1430,24 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code)
|
|
1117
1430
|
te->ext_data[1] = ci->u_data[1];
|
1118
1431
|
te->ext_data[2] = ci->u_data[2];
|
1119
1432
|
te->ext_len = 3;
|
1433
|
+
} else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_len == 1) {
|
1434
|
+
// U+FB05 LATIN SMALL LIGATURE LONG S T
|
1435
|
+
assert(code == 0xFB05);
|
1436
|
+
te->len = 1;
|
1437
|
+
te->type = RUN_TYPE_UF_EXT2;
|
1438
|
+
te->ext_data[0] = ci->u_data[0];
|
1439
|
+
te->ext_data[1] = ci->u_data[1];
|
1440
|
+
te->ext_len = 2;
|
1441
|
+
} else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_len == 1) {
|
1442
|
+
// U+1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA or
|
1443
|
+
// U+1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
|
1444
|
+
assert(code == 0x1FD3 || code == 0x1FE3);
|
1445
|
+
te->len = 1;
|
1446
|
+
te->type = RUN_TYPE_UF_EXT3;
|
1447
|
+
te->ext_data[0] = ci->u_data[0];
|
1448
|
+
te->ext_data[1] = ci->u_data[1];
|
1449
|
+
te->ext_data[2] = ci->u_data[2];
|
1450
|
+
te->ext_len = 3;
|
1120
1451
|
} else {
|
1121
1452
|
printf("unsupported encoding case:\n");
|
1122
1453
|
dump_cc_info(ci, code);
|
@@ -1636,7 +1967,7 @@ void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
|
|
1636
1967
|
maxw = 0;
|
1637
1968
|
for(i = 0; i < len; i++) {
|
1638
1969
|
w = strlen(tab_name[i]);
|
1639
|
-
if (tab_short_name[i][0] != '\0') {
|
1970
|
+
if (tab_short_name && tab_short_name[i][0] != '\0') {
|
1640
1971
|
w += 1 + strlen(tab_short_name[i]);
|
1641
1972
|
}
|
1642
1973
|
if (maxw < w)
|
@@ -1648,7 +1979,7 @@ void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
|
|
1648
1979
|
for(i = 0; i < len; i++) {
|
1649
1980
|
fprintf(f, " \"");
|
1650
1981
|
w = fprintf(f, "%s", tab_name[i]);
|
1651
|
-
if (tab_short_name[i][0] != '\0') {
|
1982
|
+
if (tab_short_name && tab_short_name[i][0] != '\0') {
|
1652
1983
|
w += fprintf(f, ",%s", tab_short_name[i]);
|
1653
1984
|
}
|
1654
1985
|
fprintf(f, "\"%*s\"\\0\"\n", 1 + maxw - w, "");
|
@@ -1756,10 +2087,9 @@ void build_script_table(FILE *f)
|
|
1756
2087
|
fprintf(f, " UNICODE_SCRIPT_COUNT,\n");
|
1757
2088
|
fprintf(f, "} UnicodeScriptEnum;\n\n");
|
1758
2089
|
|
1759
|
-
i = 1;
|
1760
2090
|
dump_name_table(f, "unicode_script_name_table",
|
1761
|
-
unicode_script_name
|
1762
|
-
unicode_script_short_name
|
2091
|
+
unicode_script_name, SCRIPT_COUNT,
|
2092
|
+
unicode_script_short_name);
|
1763
2093
|
|
1764
2094
|
dbuf_init(dbuf);
|
1765
2095
|
#ifdef DUMP_TABLE_SIZE
|
@@ -1912,6 +2242,218 @@ void build_prop_list_table(FILE *f)
|
|
1912
2242
|
fprintf(f, "};\n\n");
|
1913
2243
|
}
|
1914
2244
|
|
2245
|
+
static BOOL is_emoji_hair_color(uint32_t c)
|
2246
|
+
{
|
2247
|
+
return (c >= 0x1F9B0 && c <= 0x1F9B3);
|
2248
|
+
}
|
2249
|
+
|
2250
|
+
#define EMOJI_MOD_NONE 0
|
2251
|
+
#define EMOJI_MOD_TYPE1 1
|
2252
|
+
#define EMOJI_MOD_TYPE2 2
|
2253
|
+
#define EMOJI_MOD_TYPE2D 3
|
2254
|
+
|
2255
|
+
/* Check that every variant of the string buf[0 .. len - 1] is present in
   'sl'. The variants are obtained by writing the modifiers U+1F3FB ..
   U+1F3FF at the position(s) given by 'mod_pos' and, when hc_pos >= 0,
   the hair colors U+1F9B0 .. U+1F9B3 at 'hc_pos':
     EMOJI_MOD_NONE   no modifier
     EMOJI_MOD_TYPE1  1 modifier, 5 variants
     EMOJI_MOD_TYPE2  2 modifiers, the 25 pairs
     EMOJI_MOD_TYPE2D 2 modifiers, the 20 pairs of different values
   FALSE is returned as soon as a variant is missing. When 'mark_flag' is
   set, bit 0 of the flags of each variant found is set. 'buf' is
   modified: on return it holds the last variant tried. */
static BOOL mark_zwj_string(REStringList *sl, uint32_t *buf, int len, int mod_type, int *mod_pos,
                            int hc_pos, BOOL mark_flag)
{
    REString *entry;
    int variant, variant_count, hair, hair_count, first_mod, second_mod;

    /* number of modifier combinations to try */
    if (mod_type == EMOJI_MOD_NONE) {
        variant_count = 1;
    } else if (mod_type == EMOJI_MOD_TYPE1) {
        variant_count = 5;
    } else if (mod_type == EMOJI_MOD_TYPE2) {
        variant_count = 25;
    } else if (mod_type == EMOJI_MOD_TYPE2D) {
        variant_count = 20;
    } else {
        assert(0);
    }
    hair_count = (hc_pos >= 0) ? 4 : 1;

    /* check that all the related strings are present */
    for (hair = 0; hair < hair_count; hair++) {
        for (variant = 0; variant < variant_count; variant++) {
            if (mod_type == EMOJI_MOD_TYPE1) {
                buf[mod_pos[0]] = 0x1f3fb + variant;
            } else if (mod_type == EMOJI_MOD_TYPE2 ||
                       mod_type == EMOJI_MOD_TYPE2D) {
                first_mod = variant / 5;
                second_mod = variant % 5;
                /* avoid identical values */
                if (mod_type == EMOJI_MOD_TYPE2D && first_mod >= second_mod)
                    first_mod++;
                buf[mod_pos[0]] = 0x1f3fb + first_mod;
                buf[mod_pos[1]] = 0x1f3fb + second_mod;
            } else if (mod_type != EMOJI_MOD_NONE) {
                assert(0);
            }

            if (hc_pos >= 0)
                buf[hc_pos] = 0x1F9B0 + hair;

            entry = re_string_find(sl, len, buf, FALSE);
            if (!entry)
                return FALSE;
            if (mark_flag)
                entry->flags |= 1;
        }
    }
    return TRUE;
}
|
2321
|
+
|
2322
|
+
/* Append the compact encoding of the ZWJ sequence buf[0 .. len - 1] to
   'dbuf': one byte giving the number of 16 bit codes, then each code as
   two bytes, low byte first. Each code describes one group
   "base [modifier] [U+FE0F] [U+200D]" of the sequence:
     bits 0-12   base: c - 0x2000 for U+2000 .. U+2FFF,
                 c - 0x1f000 + 0x1000 for U+1F000 .. U+1FFFF
     bits 13-14  'mod_type' if a modifier follows the base
     bit 15      set if U+FE0F follows
   The value of the modifier itself is not stored. 'mod_pos' and 'hc_pos'
   are not used. */
static void zwj_encode_string(DynBuf *dbuf, const uint32_t *buf, int len, int mod_type, int *mod_pos,
                              int hc_pos)
{
    uint32_t units[SEQ_MAX_LEN];
    int pos, n_units, k;
    int ch, unit;

    n_units = 0;
    pos = 0;
    while (pos < len) {
        ch = buf[pos++];
        if (ch >= 0x2000 && ch <= 0x2fff) {
            unit = ch - 0x2000;
        } else if (ch >= 0x1f000 && ch <= 0x1ffff) {
            unit = ch - 0x1f000 + 0x1000;
        } else {
            assert(0);
        }
        if (pos < len && is_emoji_modifier(buf[pos])) {
            /* modifier */
            unit |= (mod_type << 13);
            pos++;
        }
        if (pos < len && buf[pos] == 0xfe0f) {
            /* presentation selector present */
            unit |= 0x8000;
            pos++;
        }
        if (pos < len) {
            /* zero width join */
            assert(buf[pos] == 0x200d);
            pos++;
        }
        units[n_units++] = unit;
    }

    dbuf_putc(dbuf, n_units);
    for (k = 0; k < n_units; k++) {
        dbuf_putc(dbuf, units[k]);
        dbuf_putc(dbuf, units[k] >> 8);
    }
}
|
2362
|
+
|
2363
|
+
/* Write the "unicode_rgi_emoji_zwj_sequence" byte table to 'f' from the
   strings of 'sl'. Strings which only differ by their emoji modifiers
   (U+1F3FB .. U+1F3FF) or by their hair color (U+1F9B0 .. U+1F9B3) are
   encoded once when all the variants are present: the variants are then
   flagged with mark_zwj_string() so that they are skipped later. */
static void build_rgi_emoji_zwj_sequence(FILE *f, REStringList *sl)
{
    int mod_pos[2], mod_count, hair_color_pos, j;
    uint32_t h;
    REString *p;
    uint32_t buf[SEQ_MAX_LEN];
    DynBuf dbuf;

    dbuf_init(&dbuf);

    /* avoid duplicating strings with emoji modifiers or hair colors */
    for(h = 0; h < sl->hash_size; h++) {
        for(p = sl->hash_table[h]; p != NULL; p = p->next) {
            if (p->flags) /* already examined */
                continue;
            /* the string is copied into the local buffer below */
            assert(p->len <= countof(buf));
            mod_count = 0;
            hair_color_pos = -1;
            for(j = 0; j < (int)p->len; j++) {
                if (is_emoji_modifier(p->buf[j])) {
                    assert(mod_count < 2);
                    mod_pos[mod_count++] = j;
                } else if (is_emoji_hair_color(p->buf[j])) {
                    hair_color_pos = j;
                }
                buf[j] = p->buf[j];
            }

            if (mod_count != 0 || hair_color_pos >= 0) {
                int mod_type;
                if (mod_count == 0)
                    mod_type = EMOJI_MOD_NONE;
                else if (mod_count == 1)
                    mod_type = EMOJI_MOD_TYPE1;
                else
                    mod_type = EMOJI_MOD_TYPE2;

                /* first pass: check that all the variants are present,
                   second pass: flag them */
                if (mark_zwj_string(sl, buf, p->len, mod_type, mod_pos, hair_color_pos, FALSE)) {
                    mark_zwj_string(sl, buf, p->len, mod_type, mod_pos, hair_color_pos, TRUE);
                } else if (mod_type == EMOJI_MOD_TYPE2) {
                    /* retry with the pairs of different modifiers only */
                    mod_type = EMOJI_MOD_TYPE2D;
                    if (mark_zwj_string(sl, buf, p->len, mod_type, mod_pos, hair_color_pos, FALSE)) {
                        mark_zwj_string(sl, buf, p->len, mod_type, mod_pos, hair_color_pos, TRUE);
                    } else {
                        dump_str("not_found", (int *)p->buf, p->len);
                        /* the failed lookups overwrote the modifier and
                           hair color slots of buf: restore the original
                           string before encoding it as is */
                        memcpy(buf, p->buf, p->len * sizeof(buf[0]));
                        goto keep;
                    }
                }
                /* NOTE(review): when the first check fails with
                   EMOJI_MOD_NONE or EMOJI_MOD_TYPE1, the string is still
                   encoded below as a complete family - confirm that this
                   cannot happen with the Unicode data files. */
                if (hair_color_pos >= 0)
                    buf[hair_color_pos] = 0x1f9b0;
                /* encode the string */
                zwj_encode_string(&dbuf, buf, p->len, mod_type, mod_pos, hair_color_pos);
            } else {
            keep:
                zwj_encode_string(&dbuf, buf, p->len, EMOJI_MOD_NONE, NULL, -1);
            }
        }
    }

    /* Encode */
    dump_byte_table(f, "unicode_rgi_emoji_zwj_sequence", dbuf.buf, dbuf.size);

    dbuf_free(&dbuf);
}
|
2439
|
+
|
2440
|
+
void build_sequence_prop_list_table(FILE *f)
|
2441
|
+
{
|
2442
|
+
int i;
|
2443
|
+
fprintf(f, "typedef enum {\n");
|
2444
|
+
for(i = 0; i < SEQUENCE_PROP_COUNT; i++)
|
2445
|
+
fprintf(f, " UNICODE_SEQUENCE_PROP_%s,\n", unicode_sequence_prop_name[i]);
|
2446
|
+
fprintf(f, " UNICODE_SEQUENCE_PROP_COUNT,\n");
|
2447
|
+
fprintf(f, "} UnicodeSequencePropertyEnum;\n\n");
|
2448
|
+
|
2449
|
+
dump_name_table(f, "unicode_sequence_prop_name_table",
|
2450
|
+
unicode_sequence_prop_name, SEQUENCE_PROP_COUNT, NULL);
|
2451
|
+
|
2452
|
+
dump_byte_table(f, "unicode_rgi_emoji_tag_sequence", rgi_emoji_tag_sequence.buf, rgi_emoji_tag_sequence.size);
|
2453
|
+
|
2454
|
+
build_rgi_emoji_zwj_sequence(f, &rgi_emoji_zwj_sequence);
|
2455
|
+
}
|
2456
|
+
|
1915
2457
|
#ifdef USE_TEST
|
1916
2458
|
int check_conv(uint32_t *res, uint32_t c, int conv_type)
|
1917
2459
|
{
|
@@ -3138,6 +3680,8 @@ int main(int argc, char *argv[])
|
|
3138
3680
|
outfilename = argv[arg++];
|
3139
3681
|
|
3140
3682
|
unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1));
|
3683
|
+
re_string_list_init(&rgi_emoji_zwj_sequence);
|
3684
|
+
dbuf_init(&rgi_emoji_tag_sequence);
|
3141
3685
|
|
3142
3686
|
snprintf(filename, sizeof(filename), "%s/UnicodeData.txt", unicode_db_path);
|
3143
3687
|
|
@@ -3172,6 +3716,14 @@ int main(int argc, char *argv[])
|
|
3172
3716
|
unicode_db_path);
|
3173
3717
|
parse_prop_list(filename);
|
3174
3718
|
|
3719
|
+
snprintf(filename, sizeof(filename), "%s/emoji-sequences.txt",
|
3720
|
+
unicode_db_path);
|
3721
|
+
parse_sequence_prop_list(filename);
|
3722
|
+
|
3723
|
+
snprintf(filename, sizeof(filename), "%s/emoji-zwj-sequences.txt",
|
3724
|
+
unicode_db_path);
|
3725
|
+
parse_sequence_prop_list(filename);
|
3726
|
+
|
3175
3727
|
// dump_unicode_data(unicode_db);
|
3176
3728
|
build_conv_table(unicode_db);
|
3177
3729
|
|
@@ -3216,10 +3768,12 @@ int main(int argc, char *argv[])
|
|
3216
3768
|
build_script_table(fo);
|
3217
3769
|
build_script_ext_table(fo);
|
3218
3770
|
build_prop_list_table(fo);
|
3771
|
+
build_sequence_prop_list_table(fo);
|
3219
3772
|
fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n");
|
3220
3773
|
fprintf(fo, "/* %u tables / %u bytes, %u index / %u bytes */\n",
|
3221
3774
|
total_tables, total_table_bytes, total_index, total_index_bytes);
|
3222
3775
|
fclose(fo);
|
3223
3776
|
}
|
3777
|
+
re_string_list_free(&rgi_emoji_zwj_sequence);
|
3224
3778
|
return 0;
|
3225
3779
|
}
|