quickjs 0.8.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/quickjsrb/extconf.rb +4 -6
- data/ext/quickjsrb/quickjs/cutils.h +20 -0
- data/ext/quickjsrb/quickjs/dtoa.c +1626 -0
- data/ext/quickjsrb/quickjs/dtoa.h +83 -0
- data/ext/quickjsrb/quickjs/libregexp.c +34 -6
- data/ext/quickjsrb/quickjs/libregexp.h +5 -0
- data/ext/quickjsrb/quickjs/libunicode.c +201 -201
- data/ext/quickjsrb/quickjs/qjs.c +0 -52
- data/ext/quickjsrb/quickjs/qjsc.c +1 -29
- data/ext/quickjsrb/quickjs/quickjs-atom.h +0 -17
- data/ext/quickjsrb/quickjs/quickjs-opcode.h +1 -4
- data/ext/quickjsrb/quickjs/quickjs.c +3482 -6322
- data/ext/quickjsrb/quickjs/quickjs.h +39 -25
- data/ext/quickjsrb/quickjsrb.c +9 -10
- data/lib/quickjs/version.rb +1 -1
- metadata +4 -4
- data/ext/quickjsrb/quickjs/libbf.c +0 -8475
- data/ext/quickjsrb/quickjs/libbf.h +0 -535
@@ -0,0 +1,83 @@
|
|
1
|
+
/*
|
2
|
+
* Tiny float64 printing and parsing library
|
3
|
+
*
|
4
|
+
* Copyright (c) 2024 Fabrice Bellard
|
5
|
+
*
|
6
|
+
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
* of this software and associated documentation files (the "Software"), to deal
|
8
|
+
* in the Software without restriction, including without limitation the rights
|
9
|
+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
* copies of the Software, and to permit persons to whom the Software is
|
11
|
+
* furnished to do so, subject to the following conditions:
|
12
|
+
*
|
13
|
+
* The above copyright notice and this permission notice shall be included in
|
14
|
+
* all copies or substantial portions of the Software.
|
15
|
+
*
|
16
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
19
|
+
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
* THE SOFTWARE.
|
23
|
+
*/
|
24
|
+
|
25
|
+
//#define JS_DTOA_DUMP_STATS
|
26
|
+
|
27
|
+
/* maximum number of digits for fixed and frac formats */
|
28
|
+
#define JS_DTOA_MAX_DIGITS 101
|
29
|
+
|
30
|
+
/* radix != 10 is only supported with flags = JS_DTOA_FORMAT_FREE */
|
31
|
+
/* use as many digits as necessary */
|
32
|
+
#define JS_DTOA_FORMAT_FREE (0 << 0)
|
33
|
+
/* use n_digits significant digits (1 <= n_digits <= JS_DTOA_MAX_DIGITS) */
|
34
|
+
#define JS_DTOA_FORMAT_FIXED (1 << 0)
|
35
|
+
/* force fractional format: [-]dd.dd with n_digits fractional digits.
|
36
|
+
0 <= n_digits <= JS_DTOA_MAX_DIGITS */
|
37
|
+
#define JS_DTOA_FORMAT_FRAC (2 << 0)
|
38
|
+
#define JS_DTOA_FORMAT_MASK (3 << 0)
|
39
|
+
|
40
|
+
/* select exponential notation either in fixed or free format */
|
41
|
+
#define JS_DTOA_EXP_AUTO (0 << 2)
|
42
|
+
#define JS_DTOA_EXP_ENABLED (1 << 2)
|
43
|
+
#define JS_DTOA_EXP_DISABLED (2 << 2)
|
44
|
+
#define JS_DTOA_EXP_MASK (3 << 2)
|
45
|
+
|
46
|
+
#define JS_DTOA_MINUS_ZERO (1 << 4) /* show the minus sign for -0 */
|
47
|
+
|
48
|
+
/* only accepts integers (no dot, no exponent) */
|
49
|
+
#define JS_ATOD_INT_ONLY (1 << 0)
|
50
|
+
/* accept 0o and 0b prefixes in addition to 0x prefix if radix == 0 */
|
51
|
+
#define JS_ATOD_ACCEPT_BIN_OCT (1 << 1)
|
52
|
+
/* accept 0 prefix as octal if radix == 0 and properly formed (Annex B) */
|
53
|
+
#define JS_ATOD_ACCEPT_LEGACY_OCTAL (1 << 2)
|
54
|
+
/* accept _ between digits as a digit separator */
|
55
|
+
#define JS_ATOD_ACCEPT_UNDERSCORES (1 << 3)
|
56
|
+
|
57
|
+
/* Temporary-memory block of 37 uint64_t words. A pointer to it is passed as
   the tmp_mem argument of js_dtoa(); how js_dtoa() uses the words is not
   visible from this header. */
typedef struct {
|
58
|
+
uint64_t mem[37];
|
59
|
+
} JSDTOATempMem;
|
60
|
+
|
61
|
+
/* Temporary-memory block of 27 uint64_t words. A pointer to it is passed as
   the tmp_mem argument of js_atod(); how js_atod() uses the words is not
   visible from this header. */
typedef struct {
|
62
|
+
uint64_t mem[27];
|
63
|
+
} JSATODTempMem;
|
64
|
+
|
65
|
+
/* return a maximum bound of the string length */
|
66
|
+
int js_dtoa_max_len(double d, int radix, int n_digits, int flags);
|
67
|
+
/* return the string length */
|
68
|
+
int js_dtoa(char *buf, double d, int radix, int n_digits, int flags,
|
69
|
+
JSDTOATempMem *tmp_mem);
|
70
|
+
double js_atod(const char *str, const char **pnext, int radix, int flags,
|
71
|
+
JSATODTempMem *tmp_mem);
|
72
|
+
|
73
|
+
#ifdef JS_DTOA_DUMP_STATS
|
74
|
+
void js_dtoa_dump_stats(void);
|
75
|
+
#endif
|
76
|
+
|
77
|
+
/* additional exported functions */
|
78
|
+
size_t u32toa(char *buf, uint32_t n);
|
79
|
+
size_t i32toa(char *buf, int32_t n);
|
80
|
+
size_t u64toa(char *buf, uint64_t n);
|
81
|
+
size_t i64toa(char *buf, int64_t n);
|
82
|
+
size_t u64toa_radix(char *buf, uint64_t n, unsigned int radix);
|
83
|
+
size_t i64toa_radix(char *buf, int64_t n, unsigned int radix);
|
@@ -54,6 +54,9 @@ typedef enum {
|
|
54
54
|
|
55
55
|
#define CAPTURE_COUNT_MAX 255
|
56
56
|
#define STACK_SIZE_MAX 255
|
57
|
+
/* must be large enough to have a negligible runtime cost and small
|
58
|
+
enough to call the interrupt callback often. */
|
59
|
+
#define INTERRUPT_COUNTER_INIT 10000
|
57
60
|
|
58
61
|
/* unicode code points */
|
59
62
|
#define CP_LS 0x2028
|
@@ -683,6 +686,10 @@ static int get_class_atom(REParseState *s, CharRange *cr,
|
|
683
686
|
c = '\\';
|
684
687
|
}
|
685
688
|
break;
|
689
|
+
case '-':
|
690
|
+
if (!inclass && s->is_unicode)
|
691
|
+
goto invalid_escape;
|
692
|
+
break;
|
686
693
|
#ifdef CONFIG_ALL_UNICODE
|
687
694
|
case 'p':
|
688
695
|
case 'P':
|
@@ -1931,6 +1938,7 @@ typedef struct {
|
|
1931
1938
|
BOOL multi_line;
|
1932
1939
|
BOOL ignore_case;
|
1933
1940
|
BOOL is_unicode;
|
1941
|
+
int interrupt_counter;
|
1934
1942
|
void *opaque; /* used for stack overflow check */
|
1935
1943
|
|
1936
1944
|
size_t state_size;
|
@@ -1977,7 +1985,17 @@ static int push_state(REExecContext *s,
|
|
1977
1985
|
return 0;
|
1978
1986
|
}
|
1979
1987
|
|
1980
|
-
|
1988
|
+
/* Count down s->interrupt_counter by one. When it reaches zero (or below),
   reload it with INTERRUPT_COUNTER_INIT and ask the user-provided
   lre_check_timeout() callback whether execution must stop.
   Returns LRE_RET_TIMEOUT if the callback reports a timeout, 0 otherwise. */
static int lre_poll_timeout(REExecContext *s)
{
    int timed_out = 0;

    s->interrupt_counter -= 1;
    if (unlikely(s->interrupt_counter <= 0)) {
        /* the callback is only consulted once per INTERRUPT_COUNTER_INIT
           calls so that polling stays cheap */
        s->interrupt_counter = INTERRUPT_COUNTER_INIT;
        if (lre_check_timeout(s->opaque) != 0)
            timed_out = 1;
    }
    if (timed_out)
        return LRE_RET_TIMEOUT;
    return 0;
}
|
1997
|
+
|
1998
|
+
/* return 1 if match, 0 if not match or < 0 if error. */
|
1981
1999
|
static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
1982
2000
|
StackInt *stack, int stack_len,
|
1983
2001
|
const uint8_t *pc, const uint8_t *cptr,
|
@@ -2008,6 +2026,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2008
2026
|
ret = 0;
|
2009
2027
|
recurse:
|
2010
2028
|
for(;;) {
|
2029
|
+
if (lre_poll_timeout(s))
|
2030
|
+
return LRE_RET_TIMEOUT;
|
2011
2031
|
if (s->state_stack_len == 0)
|
2012
2032
|
return ret;
|
2013
2033
|
rs = (REExecState *)(s->state_stack +
|
@@ -2097,7 +2117,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2097
2117
|
ret = push_state(s, capture, stack, stack_len,
|
2098
2118
|
pc1, cptr, RE_EXEC_STATE_SPLIT, 0);
|
2099
2119
|
if (ret < 0)
|
2100
|
-
return
|
2120
|
+
return LRE_RET_MEMORY_ERROR;
|
2101
2121
|
break;
|
2102
2122
|
}
|
2103
2123
|
case REOP_lookahead:
|
@@ -2109,12 +2129,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2109
2129
|
RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
|
2110
2130
|
0);
|
2111
2131
|
if (ret < 0)
|
2112
|
-
return
|
2132
|
+
return LRE_RET_MEMORY_ERROR;
|
2113
2133
|
break;
|
2114
2134
|
|
2115
2135
|
case REOP_goto:
|
2116
2136
|
val = get_u32(pc);
|
2117
2137
|
pc += 4 + (int)val;
|
2138
|
+
if (lre_poll_timeout(s))
|
2139
|
+
return LRE_RET_TIMEOUT;
|
2118
2140
|
break;
|
2119
2141
|
case REOP_line_start:
|
2120
2142
|
if (cptr == s->cbuf)
|
@@ -2179,6 +2201,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2179
2201
|
pc += 4;
|
2180
2202
|
if (--stack[stack_len - 1] != 0) {
|
2181
2203
|
pc += (int)val;
|
2204
|
+
if (lre_poll_timeout(s))
|
2205
|
+
return LRE_RET_TIMEOUT;
|
2182
2206
|
}
|
2183
2207
|
break;
|
2184
2208
|
case REOP_push_char_pos:
|
@@ -2353,9 +2377,12 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2353
2377
|
|
2354
2378
|
q = 0;
|
2355
2379
|
for(;;) {
|
2380
|
+
if (lre_poll_timeout(s))
|
2381
|
+
return LRE_RET_TIMEOUT;
|
2356
2382
|
res = lre_exec_backtrack(s, capture, stack, stack_len,
|
2357
2383
|
pc1, cptr, TRUE);
|
2358
|
-
if (res ==
|
2384
|
+
if (res == LRE_RET_MEMORY_ERROR ||
|
2385
|
+
res == LRE_RET_TIMEOUT)
|
2359
2386
|
return res;
|
2360
2387
|
if (!res)
|
2361
2388
|
break;
|
@@ -2373,7 +2400,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2373
2400
|
RE_EXEC_STATE_GREEDY_QUANT,
|
2374
2401
|
q - quant_min);
|
2375
2402
|
if (ret < 0)
|
2376
|
-
return
|
2403
|
+
return LRE_RET_MEMORY_ERROR;
|
2377
2404
|
}
|
2378
2405
|
}
|
2379
2406
|
break;
|
@@ -2383,7 +2410,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2383
2410
|
}
|
2384
2411
|
}
|
2385
2412
|
|
2386
|
-
/* Return 1 if match, 0 if not match or
|
2413
|
+
/* Return 1 if match, 0 if not match or < 0 if error (see LRE_RET_x). cindex is the
|
2387
2414
|
starting position of the match and must be such that 0 <= cindex <=
|
2388
2415
|
clen. */
|
2389
2416
|
int lre_exec(uint8_t **capture,
|
@@ -2405,6 +2432,7 @@ int lre_exec(uint8_t **capture,
|
|
2405
2432
|
s->cbuf_type = cbuf_type;
|
2406
2433
|
if (s->cbuf_type == 1 && s->is_unicode)
|
2407
2434
|
s->cbuf_type = 2;
|
2435
|
+
s->interrupt_counter = INTERRUPT_COUNTER_INIT;
|
2408
2436
|
s->opaque = opaque;
|
2409
2437
|
|
2410
2438
|
s->state_size = sizeof(REExecState) +
|
@@ -36,6 +36,9 @@
|
|
36
36
|
#define LRE_FLAG_INDICES (1 << 6) /* Unused by libregexp, just recorded. */
|
37
37
|
#define LRE_FLAG_NAMED_GROUPS (1 << 7) /* named groups are present in the regexp */
|
38
38
|
|
39
|
+
#define LRE_RET_MEMORY_ERROR (-1)
|
40
|
+
#define LRE_RET_TIMEOUT (-2)
|
41
|
+
|
39
42
|
uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
|
40
43
|
const char *buf, size_t buf_len, int re_flags,
|
41
44
|
void *opaque);
|
@@ -50,6 +53,8 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16);
|
|
50
53
|
|
51
54
|
/* must be provided by the user, return non zero if overflow */
|
52
55
|
int lre_check_stack_overflow(void *opaque, size_t alloca_size);
|
56
|
+
/* must be provided by the user, return non zero if time out */
|
57
|
+
int lre_check_timeout(void *opaque);
|
53
58
|
void *lre_realloc(void *opaque, void *ptr, size_t size);
|
54
59
|
|
55
60
|
#endif /* LIBREGEXP_H */
|
@@ -537,6 +537,207 @@ int cr_invert(CharRange *cr)
|
|
537
537
|
return 0;
|
538
538
|
}
|
539
539
|
|
540
|
+
#define CASE_U (1 << 0)
|
541
|
+
#define CASE_L (1 << 1)
|
542
|
+
#define CASE_F (1 << 2)
|
543
|
+
|
544
|
+
/* use the case conversion table to generate range of characters.
|
545
|
+
CASE_U: set char if modified by uppercasing,
|
546
|
+
CASE_L: set char if modified by lowercasing,
|
547
|
+
CASE_F: set char if modified by case folding,
|
548
|
+
*/
|
549
|
+
/* Walk case_conv_table1 and add to 'cr' the code point intervals of every
   run whose type is selected by 'case_mask' (an OR of CASE_U, CASE_L and
   CASE_F). Each table word packs, from the most significant bit down:
   17 bits of first code point, 7 bits of run length, 4 bits of run type.
   Returns 0 on success (nothing is added when case_mask is 0) and -1 as
   soon as cr_add_interval() reports a failure. */
static int unicode_case1(CharRange *cr, int case_mask)
{
#define MR(x) (1 << RUN_TYPE_ ## x)
    /* run types to select, indexed by the bit number of the CASE_x flag */
    const uint32_t tab_run_mask[3] = {
        /* bit 0: CASE_U */
        MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
        MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),

        /* bit 1: CASE_L */
        MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) |
        MR(LF_EXT2),

        /* bit 2: CASE_F */
        MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) |
        MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
    };
#undef MR
    uint32_t run_types, entry, run_type, first, count, k, idx;
    int want_upper, upper_plus_other;

    if (case_mask == 0)
        return 0;

    run_types = 0;
    for (k = 0; k < 3; k++) {
        if ((case_mask >> k) & 1)
            run_types |= tab_run_mask[k];
    }

    want_upper = (case_mask & CASE_U) != 0;
    /* when CASE_U is combined with CASE_L or CASE_F, UL and LSU runs are
       added in full, like any other run type */
    upper_plus_other = want_upper && (case_mask & (CASE_L | CASE_F)) != 0;

    for (idx = 0; idx < countof(case_conv_table1); idx++) {
        entry = case_conv_table1[idx];
        run_type = (entry >> (32 - 17 - 7 - 4)) & 0xf;
        first = entry >> (32 - 17);
        count = (entry >> (32 - 17 - 7)) & 0x7f;

        if (!((run_types >> run_type) & 1))
            continue;
        // printf("%d: type=%d %04x %04x\n", idx, run_type, first, first + count - 1);

        if (run_type == RUN_TYPE_UL && !upper_plus_other) {
            /* only every second code point of the run is added, starting
               at offset 1 for CASE_U and at offset 0 otherwise */
            first += want_upper;
            for (k = 0; k < count; k += 2) {
                if (cr_add_interval(cr, first + k, first + k + 1))
                    return -1;
            }
        } else if (run_type == RUN_TYPE_LSU && !upper_plus_other) {
            /* three-code-point run: the middle one is always added, the
               first only without CASE_U, the last only with CASE_U */
            if (!want_upper) {
                if (cr_add_interval(cr, first, first + 1))
                    return -1;
            }
            if (cr_add_interval(cr, first + 1, first + 2))
                return -1;
            if (want_upper) {
                if (cr_add_interval(cr, first + 2, first + 3))
                    return -1;
            }
        } else {
            /* whole run [first, first + count) */
            if (cr_add_interval(cr, first, first + count))
                return -1;
        }
    }
    return 0;
}
|
611
|
+
|
612
|
+
/* Three-way comparison callback (used with rqsort()): compares the uint32_t
   values found at p1 and p2 and returns -1, 0 or 1 when the first is
   smaller than, equal to or greater than the second. 'arg' is unused.
   The pointed-to values are only read, so the const qualifier of the
   parameters is kept instead of being cast away. */
static int point_cmp(const void *p1, const void *p2, void *arg)
{
    const uint32_t *a = p1;
    const uint32_t *b = p2;

    (void)arg; /* required by the callback signature only */
    return (*a > *b) - (*a < *b);
}
|
618
|
+
|
619
|
+
/* Normalize 'cr' in place: sort its (start, end) pairs by start value, then
   fuse every pair whose start is <= the end of the pair being built into
   that pair. cr->len is updated to the number of points kept. */
static void cr_sort_and_remove_overlap(CharRange *cr)
{
    uint32_t rd, wr, lo, hi, next_hi;

    /* each sorted element is one pair of points; point_cmp() only looks at
       the first point of the pair */
    rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);

    wr = 0;
    rd = 0;
    while (rd < cr->len) {
        lo = cr->points[rd];
        hi = cr->points[rd + 1];
        /* absorb the following pairs as long as they start at or before
           the current end, extending the end when they reach further */
        for (rd += 2; rd < cr->len && cr->points[rd] <= hi; rd += 2) {
            next_hi = cr->points[rd + 1];
            if (next_hi > hi)
                hi = next_hi;
        }
        /* wr never runs ahead of rd, so unread pairs are not clobbered */
        cr->points[wr] = lo;
        cr->points[wr + 1] = hi;
        wr += 2;
    }
    cr->len = wr;
}
|
654
|
+
|
655
|
+
/* canonicalize a character set using the JS regex case folding rules
|
656
|
+
(see lre_canonicalize()) */
|
657
|
+
/* Replace the character set 'cr' by its canonical form under the JS regexp
   case folding rules (CASE_F runs when is_unicode is set, CASE_U runs
   otherwise).

   The set is split into the part touched by case conversion (cr_inter) and
   the untouched part (cr_sub). Every code point of cr_inter is mapped with
   lre_case_folding_entry(), the mapped values are gathered into intervals,
   and the result is unioned with cr_sub.

   Returns 0 on success, -1 if any range operation fails. All temporary
   ranges are released on every path. */
int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
{
    CharRange cr_inter, cr_mask, cr_result, cr_sub;
    uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
    int ret = -1;

    cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
    cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
    cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
    cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);

    if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
        goto done;
    /* cr_inter = cr & cr_mask */
    if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
        goto done;

    /* cr_sub = cr & ~cr_mask */
    if (cr_invert(&cr_mask))
        goto done;
    if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
        goto done;

    /* use the case conversion table to compute the result */
    d_start = (uint32_t)-1; /* sentinel: no pending interval yet */
    d_end = (uint32_t)-1;
    idx = 0;
    v = case_conv_table1[idx];
    code = v >> (32 - 17);
    len = (v >> (32 - 17 - 7)) & 0x7f;
    for(i = 0; i < cr_inter.len; i += 2) {
        start = cr_inter.points[i];
        end = cr_inter.points[i + 1];

        for(c = start; c < end; c++) {
            /* advance to the table run containing c; idx only ever moves
               forward (presumably cr_inter is in ascending order), and the
               assert guards against running off the table */
            while (!(c >= code && c < code + len)) {
                idx++;
                assert(idx < countof(case_conv_table1));
                v = case_conv_table1[idx];
                code = v >> (32 - 17);
                len = (v >> (32 - 17 - 7)) & 0x7f;
            }
            d = lre_case_folding_entry(c, idx, v, is_unicode);
            /* try to merge with the current interval */
            if (d_start == (uint32_t)-1) {
                d_start = d;
                d_end = d + 1;
            } else if (d_end == d) {
                d_end++;
            } else {
                /* flush the pending interval; a failure here must not be
                   ignored or the result would silently miss characters */
                if (cr_add_interval(&cr_result, d_start, d_end))
                    goto done;
                d_start = d;
                d_end = d + 1;
            }
        }
    }
    if (d_start != (uint32_t)-1) {
        if (cr_add_interval(&cr_result, d_start, d_end))
            goto done;
    }

    /* the resulting ranges are not necessarily sorted and may overlap */
    cr_sort_and_remove_overlap(&cr_result);

    /* or with the character not affected by the case folding */
    cr->len = 0;
    if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
        goto done;

    ret = 0;
 done:
    cr_free(&cr_inter);
    cr_free(&cr_mask);
    cr_free(&cr_result);
    cr_free(&cr_sub);
    return ret;
}
|
740
|
+
|
540
741
|
#ifdef CONFIG_ALL_UNICODE
|
541
742
|
|
542
743
|
BOOL lre_is_id_start(uint32_t c)
|
@@ -1296,207 +1497,6 @@ static int unicode_prop1(CharRange *cr, int prop_idx)
|
|
1296
1497
|
return 0;
|
1297
1498
|
}
|
1298
1499
|
|
1299
|
-
#define CASE_U (1 << 0)
|
1300
|
-
#define CASE_L (1 << 1)
|
1301
|
-
#define CASE_F (1 << 2)
|
1302
|
-
|
1303
|
-
/* use the case conversion table to generate range of characters.
|
1304
|
-
CASE_U: set char if modified by uppercasing,
|
1305
|
-
CASE_L: set char if modified by lowercasing,
|
1306
|
-
CASE_F: set char if modified by case folding,
|
1307
|
-
*/
|
1308
|
-
static int unicode_case1(CharRange *cr, int case_mask)
|
1309
|
-
{
|
1310
|
-
#define MR(x) (1 << RUN_TYPE_ ## x)
|
1311
|
-
const uint32_t tab_run_mask[3] = {
|
1312
|
-
MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
|
1313
|
-
MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
|
1314
|
-
|
1315
|
-
MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
|
1316
|
-
|
1317
|
-
MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
|
1318
|
-
};
|
1319
|
-
#undef MR
|
1320
|
-
uint32_t mask, v, code, type, len, i, idx;
|
1321
|
-
|
1322
|
-
if (case_mask == 0)
|
1323
|
-
return 0;
|
1324
|
-
mask = 0;
|
1325
|
-
for(i = 0; i < 3; i++) {
|
1326
|
-
if ((case_mask >> i) & 1)
|
1327
|
-
mask |= tab_run_mask[i];
|
1328
|
-
}
|
1329
|
-
for(idx = 0; idx < countof(case_conv_table1); idx++) {
|
1330
|
-
v = case_conv_table1[idx];
|
1331
|
-
type = (v >> (32 - 17 - 7 - 4)) & 0xf;
|
1332
|
-
code = v >> (32 - 17);
|
1333
|
-
len = (v >> (32 - 17 - 7)) & 0x7f;
|
1334
|
-
if ((mask >> type) & 1) {
|
1335
|
-
// printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
|
1336
|
-
switch(type) {
|
1337
|
-
case RUN_TYPE_UL:
|
1338
|
-
if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
|
1339
|
-
goto def_case;
|
1340
|
-
code += ((case_mask & CASE_U) != 0);
|
1341
|
-
for(i = 0; i < len; i += 2) {
|
1342
|
-
if (cr_add_interval(cr, code + i, code + i + 1))
|
1343
|
-
return -1;
|
1344
|
-
}
|
1345
|
-
break;
|
1346
|
-
case RUN_TYPE_LSU:
|
1347
|
-
if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
|
1348
|
-
goto def_case;
|
1349
|
-
if (!(case_mask & CASE_U)) {
|
1350
|
-
if (cr_add_interval(cr, code, code + 1))
|
1351
|
-
return -1;
|
1352
|
-
}
|
1353
|
-
if (cr_add_interval(cr, code + 1, code + 2))
|
1354
|
-
return -1;
|
1355
|
-
if (case_mask & CASE_U) {
|
1356
|
-
if (cr_add_interval(cr, code + 2, code + 3))
|
1357
|
-
return -1;
|
1358
|
-
}
|
1359
|
-
break;
|
1360
|
-
default:
|
1361
|
-
def_case:
|
1362
|
-
if (cr_add_interval(cr, code, code + len))
|
1363
|
-
return -1;
|
1364
|
-
break;
|
1365
|
-
}
|
1366
|
-
}
|
1367
|
-
}
|
1368
|
-
return 0;
|
1369
|
-
}
|
1370
|
-
|
1371
|
-
static int point_cmp(const void *p1, const void *p2, void *arg)
|
1372
|
-
{
|
1373
|
-
uint32_t v1 = *(uint32_t *)p1;
|
1374
|
-
uint32_t v2 = *(uint32_t *)p2;
|
1375
|
-
return (v1 > v2) - (v1 < v2);
|
1376
|
-
}
|
1377
|
-
|
1378
|
-
static void cr_sort_and_remove_overlap(CharRange *cr)
|
1379
|
-
{
|
1380
|
-
uint32_t start, end, start1, end1, i, j;
|
1381
|
-
|
1382
|
-
/* the resulting ranges are not necessarily sorted and may overlap */
|
1383
|
-
rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);
|
1384
|
-
j = 0;
|
1385
|
-
for(i = 0; i < cr->len; ) {
|
1386
|
-
start = cr->points[i];
|
1387
|
-
end = cr->points[i + 1];
|
1388
|
-
i += 2;
|
1389
|
-
while (i < cr->len) {
|
1390
|
-
start1 = cr->points[i];
|
1391
|
-
end1 = cr->points[i + 1];
|
1392
|
-
if (start1 > end) {
|
1393
|
-
/* |------|
|
1394
|
-
* |-------| */
|
1395
|
-
break;
|
1396
|
-
} else if (end1 <= end) {
|
1397
|
-
/* |------|
|
1398
|
-
* |--| */
|
1399
|
-
i += 2;
|
1400
|
-
} else {
|
1401
|
-
/* |------|
|
1402
|
-
* |-------| */
|
1403
|
-
end = end1;
|
1404
|
-
i += 2;
|
1405
|
-
}
|
1406
|
-
}
|
1407
|
-
cr->points[j] = start;
|
1408
|
-
cr->points[j + 1] = end;
|
1409
|
-
j += 2;
|
1410
|
-
}
|
1411
|
-
cr->len = j;
|
1412
|
-
}
|
1413
|
-
|
1414
|
-
/* canonicalize a character set using the JS regex case folding rules
|
1415
|
-
(see lre_canonicalize()) */
|
1416
|
-
int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
|
1417
|
-
{
|
1418
|
-
CharRange cr_inter, cr_mask, cr_result, cr_sub;
|
1419
|
-
uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
|
1420
|
-
|
1421
|
-
cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
|
1422
|
-
cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
|
1423
|
-
cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
|
1424
|
-
cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
|
1425
|
-
|
1426
|
-
if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
|
1427
|
-
goto fail;
|
1428
|
-
if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
|
1429
|
-
goto fail;
|
1430
|
-
|
1431
|
-
if (cr_invert(&cr_mask))
|
1432
|
-
goto fail;
|
1433
|
-
if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
|
1434
|
-
goto fail;
|
1435
|
-
|
1436
|
-
/* cr_inter = cr & cr_mask */
|
1437
|
-
/* cr_sub = cr & ~cr_mask */
|
1438
|
-
|
1439
|
-
/* use the case conversion table to compute the result */
|
1440
|
-
d_start = -1;
|
1441
|
-
d_end = -1;
|
1442
|
-
idx = 0;
|
1443
|
-
v = case_conv_table1[idx];
|
1444
|
-
code = v >> (32 - 17);
|
1445
|
-
len = (v >> (32 - 17 - 7)) & 0x7f;
|
1446
|
-
for(i = 0; i < cr_inter.len; i += 2) {
|
1447
|
-
start = cr_inter.points[i];
|
1448
|
-
end = cr_inter.points[i + 1];
|
1449
|
-
|
1450
|
-
for(c = start; c < end; c++) {
|
1451
|
-
for(;;) {
|
1452
|
-
if (c >= code && c < code + len)
|
1453
|
-
break;
|
1454
|
-
idx++;
|
1455
|
-
assert(idx < countof(case_conv_table1));
|
1456
|
-
v = case_conv_table1[idx];
|
1457
|
-
code = v >> (32 - 17);
|
1458
|
-
len = (v >> (32 - 17 - 7)) & 0x7f;
|
1459
|
-
}
|
1460
|
-
d = lre_case_folding_entry(c, idx, v, is_unicode);
|
1461
|
-
/* try to merge with the current interval */
|
1462
|
-
if (d_start == -1) {
|
1463
|
-
d_start = d;
|
1464
|
-
d_end = d + 1;
|
1465
|
-
} else if (d_end == d) {
|
1466
|
-
d_end++;
|
1467
|
-
} else {
|
1468
|
-
cr_add_interval(&cr_result, d_start, d_end);
|
1469
|
-
d_start = d;
|
1470
|
-
d_end = d + 1;
|
1471
|
-
}
|
1472
|
-
}
|
1473
|
-
}
|
1474
|
-
if (d_start != -1) {
|
1475
|
-
if (cr_add_interval(&cr_result, d_start, d_end))
|
1476
|
-
goto fail;
|
1477
|
-
}
|
1478
|
-
|
1479
|
-
/* the resulting ranges are not necessarily sorted and may overlap */
|
1480
|
-
cr_sort_and_remove_overlap(&cr_result);
|
1481
|
-
|
1482
|
-
/* or with the character not affected by the case folding */
|
1483
|
-
cr->len = 0;
|
1484
|
-
if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
|
1485
|
-
goto fail;
|
1486
|
-
|
1487
|
-
cr_free(&cr_inter);
|
1488
|
-
cr_free(&cr_mask);
|
1489
|
-
cr_free(&cr_result);
|
1490
|
-
cr_free(&cr_sub);
|
1491
|
-
return 0;
|
1492
|
-
fail:
|
1493
|
-
cr_free(&cr_inter);
|
1494
|
-
cr_free(&cr_mask);
|
1495
|
-
cr_free(&cr_result);
|
1496
|
-
cr_free(&cr_sub);
|
1497
|
-
return -1;
|
1498
|
-
}
|
1499
|
-
|
1500
1500
|
typedef enum {
|
1501
1501
|
POP_GC,
|
1502
1502
|
POP_PROP,
|