quickjs 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/quickjsrb/quickjs/cutils.h +56 -0
- data/ext/quickjsrb/quickjs/libregexp-opcode.h +11 -1
- data/ext/quickjsrb/quickjs/libregexp.c +883 -132
- data/ext/quickjsrb/quickjs/libregexp.h +1 -0
- data/ext/quickjsrb/quickjs/libunicode-table.h +420 -1
- data/ext/quickjsrb/quickjs/libunicode.c +224 -11
- data/ext/quickjsrb/quickjs/libunicode.h +9 -5
- data/ext/quickjsrb/quickjs/qjs.c +1 -1
- data/ext/quickjsrb/quickjs/qjsc.c +81 -26
- data/ext/quickjsrb/quickjs/quickjs-atom.h +7 -0
- data/ext/quickjsrb/quickjs/quickjs-libc.c +254 -65
- data/ext/quickjsrb/quickjs/quickjs-libc.h +7 -1
- data/ext/quickjsrb/quickjs/quickjs-opcode.h +2 -2
- data/ext/quickjsrb/quickjs/quickjs.c +2021 -686
- data/ext/quickjsrb/quickjs/quickjs.h +52 -8
- data/ext/quickjsrb/quickjs/run-test262.c +109 -32
- data/ext/quickjsrb/quickjs/unicode_gen.c +541 -5
- data/ext/quickjsrb/quickjs/unicode_gen_def.h +15 -0
- data/ext/quickjsrb/quickjsrb.c +1 -1
- data/lib/quickjs/version.rb +1 -1
- metadata +2 -2
@@ -71,7 +71,9 @@ typedef struct {
|
|
71
71
|
const uint8_t *buf_start;
|
72
72
|
int re_flags;
|
73
73
|
BOOL is_unicode;
|
74
|
+
BOOL unicode_sets; /* if set, is_unicode is also set */
|
74
75
|
BOOL ignore_case;
|
76
|
+
BOOL multi_line;
|
75
77
|
BOOL dotall;
|
76
78
|
int capture_count;
|
77
79
|
int total_capture_count; /* -1 = not computed yet */
|
@@ -102,11 +104,11 @@ static const REOpCode reopcode_info[REOP_COUNT] = {
|
|
102
104
|
};
|
103
105
|
|
104
106
|
#define RE_HEADER_FLAGS 0
|
105
|
-
#define RE_HEADER_CAPTURE_COUNT
|
106
|
-
#define RE_HEADER_STACK_SIZE
|
107
|
-
#define RE_HEADER_BYTECODE_LEN
|
107
|
+
#define RE_HEADER_CAPTURE_COUNT 2
|
108
|
+
#define RE_HEADER_STACK_SIZE 3
|
109
|
+
#define RE_HEADER_BYTECODE_LEN 4
|
108
110
|
|
109
|
-
#define RE_HEADER_LEN
|
111
|
+
#define RE_HEADER_LEN 8
|
110
112
|
|
111
113
|
static inline int is_digit(int c) {
|
112
114
|
return c >= '0' && c <= '9';
|
@@ -122,6 +124,264 @@ static int dbuf_insert(DynBuf *s, int pos, int len)
|
|
122
124
|
return 0;
|
123
125
|
}
|
124
126
|
|
127
|
+
typedef struct REString {
|
128
|
+
struct REString *next;
|
129
|
+
uint32_t hash;
|
130
|
+
uint32_t len;
|
131
|
+
uint32_t buf[];
|
132
|
+
} REString;
|
133
|
+
|
134
|
+
typedef struct {
|
135
|
+
/* the string list is the union of 'char_range' and of the strings
|
136
|
+
in hash_table[]. The strings in hash_table[] have a length !=
|
137
|
+
1. */
|
138
|
+
CharRange cr;
|
139
|
+
uint32_t n_strings;
|
140
|
+
uint32_t hash_size;
|
141
|
+
int hash_bits;
|
142
|
+
REString **hash_table;
|
143
|
+
} REStringList;
|
144
|
+
|
145
|
+
static uint32_t re_string_hash(int len, const uint32_t *buf)
|
146
|
+
{
|
147
|
+
int i;
|
148
|
+
uint32_t h;
|
149
|
+
h = 1;
|
150
|
+
for(i = 0; i < len; i++)
|
151
|
+
h = h * 263 + buf[i];
|
152
|
+
return h * 0x61C88647;
|
153
|
+
}
|
154
|
+
|
155
|
+
static void re_string_list_init(REParseState *s1, REStringList *s)
|
156
|
+
{
|
157
|
+
cr_init(&s->cr, s1->opaque, lre_realloc);
|
158
|
+
s->n_strings = 0;
|
159
|
+
s->hash_size = 0;
|
160
|
+
s->hash_bits = 0;
|
161
|
+
s->hash_table = NULL;
|
162
|
+
}
|
163
|
+
|
164
|
+
static void re_string_list_free(REStringList *s)
|
165
|
+
{
|
166
|
+
REString *p, *p_next;
|
167
|
+
int i;
|
168
|
+
for(i = 0; i < s->hash_size; i++) {
|
169
|
+
for(p = s->hash_table[i]; p != NULL; p = p_next) {
|
170
|
+
p_next = p->next;
|
171
|
+
lre_realloc(s->cr.mem_opaque, p, 0);
|
172
|
+
}
|
173
|
+
}
|
174
|
+
lre_realloc(s->cr.mem_opaque, s->hash_table, 0);
|
175
|
+
|
176
|
+
cr_free(&s->cr);
|
177
|
+
}
|
178
|
+
|
179
|
+
static void lre_print_char(int c, BOOL is_range)
|
180
|
+
{
|
181
|
+
if (c == '\'' || c == '\\' ||
|
182
|
+
(is_range && (c == '-' || c == ']'))) {
|
183
|
+
printf("\\%c", c);
|
184
|
+
} else if (c >= ' ' && c <= 126) {
|
185
|
+
printf("%c", c);
|
186
|
+
} else {
|
187
|
+
printf("\\u{%04x}", c);
|
188
|
+
}
|
189
|
+
}
|
190
|
+
|
191
|
+
static __maybe_unused void re_string_list_dump(const char *str, const REStringList *s)
|
192
|
+
{
|
193
|
+
REString *p;
|
194
|
+
const CharRange *cr;
|
195
|
+
int i, j, k;
|
196
|
+
|
197
|
+
printf("%s:\n", str);
|
198
|
+
printf(" ranges: [");
|
199
|
+
cr = &s->cr;
|
200
|
+
for(i = 0; i < cr->len; i += 2) {
|
201
|
+
lre_print_char(cr->points[i], TRUE);
|
202
|
+
if (cr->points[i] != cr->points[i + 1] - 1) {
|
203
|
+
printf("-");
|
204
|
+
lre_print_char(cr->points[i + 1] - 1, TRUE);
|
205
|
+
}
|
206
|
+
}
|
207
|
+
printf("]\n");
|
208
|
+
|
209
|
+
j = 0;
|
210
|
+
for(i = 0; i < s->hash_size; i++) {
|
211
|
+
for(p = s->hash_table[i]; p != NULL; p = p->next) {
|
212
|
+
printf(" %d/%d: '", j, s->n_strings);
|
213
|
+
for(k = 0; k < p->len; k++) {
|
214
|
+
lre_print_char(p->buf[k], FALSE);
|
215
|
+
}
|
216
|
+
printf("'\n");
|
217
|
+
j++;
|
218
|
+
}
|
219
|
+
}
|
220
|
+
}
|
221
|
+
|
222
|
+
static int re_string_find2(REStringList *s, int len, const uint32_t *buf,
|
223
|
+
uint32_t h0, BOOL add_flag)
|
224
|
+
{
|
225
|
+
uint32_t h = 0; /* avoid warning */
|
226
|
+
REString *p;
|
227
|
+
if (s->n_strings != 0) {
|
228
|
+
h = h0 >> (32 - s->hash_bits);
|
229
|
+
for(p = s->hash_table[h]; p != NULL; p = p->next) {
|
230
|
+
if (p->hash == h0 && p->len == len &&
|
231
|
+
!memcmp(p->buf, buf, len * sizeof(buf[0]))) {
|
232
|
+
return 1;
|
233
|
+
}
|
234
|
+
}
|
235
|
+
}
|
236
|
+
/* not found */
|
237
|
+
if (!add_flag)
|
238
|
+
return 0;
|
239
|
+
/* increase the size of the hash table if needed */
|
240
|
+
if (unlikely((s->n_strings + 1) > s->hash_size)) {
|
241
|
+
REString **new_hash_table, *p_next;
|
242
|
+
int new_hash_bits, i;
|
243
|
+
uint32_t new_hash_size;
|
244
|
+
new_hash_bits = max_int(s->hash_bits + 1, 4);
|
245
|
+
new_hash_size = 1 << new_hash_bits;
|
246
|
+
new_hash_table = lre_realloc(s->cr.mem_opaque, NULL,
|
247
|
+
sizeof(new_hash_table[0]) * new_hash_size);
|
248
|
+
if (!new_hash_table)
|
249
|
+
return -1;
|
250
|
+
memset(new_hash_table, 0, sizeof(new_hash_table[0]) * new_hash_size);
|
251
|
+
for(i = 0; i < s->hash_size; i++) {
|
252
|
+
for(p = s->hash_table[i]; p != NULL; p = p_next) {
|
253
|
+
p_next = p->next;
|
254
|
+
h = p->hash >> (32 - new_hash_bits);
|
255
|
+
p->next = new_hash_table[h];
|
256
|
+
new_hash_table[h] = p;
|
257
|
+
}
|
258
|
+
}
|
259
|
+
lre_realloc(s->cr.mem_opaque, s->hash_table, 0);
|
260
|
+
s->hash_bits = new_hash_bits;
|
261
|
+
s->hash_size = new_hash_size;
|
262
|
+
s->hash_table = new_hash_table;
|
263
|
+
h = h0 >> (32 - s->hash_bits);
|
264
|
+
}
|
265
|
+
|
266
|
+
p = lre_realloc(s->cr.mem_opaque, NULL, sizeof(REString) + len * sizeof(buf[0]));
|
267
|
+
if (!p)
|
268
|
+
return -1;
|
269
|
+
p->next = s->hash_table[h];
|
270
|
+
s->hash_table[h] = p;
|
271
|
+
s->n_strings++;
|
272
|
+
p->hash = h0;
|
273
|
+
p->len = len;
|
274
|
+
memcpy(p->buf, buf, sizeof(buf[0]) * len);
|
275
|
+
return 1;
|
276
|
+
}
|
277
|
+
|
278
|
+
static int re_string_find(REStringList *s, int len, const uint32_t *buf,
|
279
|
+
BOOL add_flag)
|
280
|
+
{
|
281
|
+
uint32_t h0;
|
282
|
+
h0 = re_string_hash(len, buf);
|
283
|
+
return re_string_find2(s, len, buf, h0, add_flag);
|
284
|
+
}
|
285
|
+
|
286
|
+
/* return -1 if memory error, 0 if OK */
|
287
|
+
static int re_string_add(REStringList *s, int len, const uint32_t *buf)
|
288
|
+
{
|
289
|
+
if (len == 1) {
|
290
|
+
return cr_union_interval(&s->cr, buf[0], buf[0]);
|
291
|
+
}
|
292
|
+
if (re_string_find(s, len, buf, TRUE) < 0)
|
293
|
+
return -1;
|
294
|
+
return 0;
|
295
|
+
}
|
296
|
+
|
297
|
+
/* a = a op b */
|
298
|
+
static int re_string_list_op(REStringList *a, REStringList *b, int op)
|
299
|
+
{
|
300
|
+
int i, ret;
|
301
|
+
REString *p, **pp;
|
302
|
+
|
303
|
+
if (cr_op1(&a->cr, b->cr.points, b->cr.len, op))
|
304
|
+
return -1;
|
305
|
+
|
306
|
+
switch(op) {
|
307
|
+
case CR_OP_UNION:
|
308
|
+
if (b->n_strings != 0) {
|
309
|
+
for(i = 0; i < b->hash_size; i++) {
|
310
|
+
for(p = b->hash_table[i]; p != NULL; p = p->next) {
|
311
|
+
if (re_string_find2(a, p->len, p->buf, p->hash, TRUE) < 0)
|
312
|
+
return -1;
|
313
|
+
}
|
314
|
+
}
|
315
|
+
}
|
316
|
+
break;
|
317
|
+
case CR_OP_INTER:
|
318
|
+
case CR_OP_SUB:
|
319
|
+
for(i = 0; i < a->hash_size; i++) {
|
320
|
+
pp = &a->hash_table[i];
|
321
|
+
for(;;) {
|
322
|
+
p = *pp;
|
323
|
+
if (p == NULL)
|
324
|
+
break;
|
325
|
+
ret = re_string_find2(b, p->len, p->buf, p->hash, FALSE);
|
326
|
+
if (op == CR_OP_SUB)
|
327
|
+
ret = !ret;
|
328
|
+
if (!ret) {
|
329
|
+
/* remove it */
|
330
|
+
*pp = p->next;
|
331
|
+
a->n_strings--;
|
332
|
+
lre_realloc(a->cr.mem_opaque, p, 0);
|
333
|
+
} else {
|
334
|
+
/* keep it */
|
335
|
+
pp = &p->next;
|
336
|
+
}
|
337
|
+
}
|
338
|
+
}
|
339
|
+
break;
|
340
|
+
default:
|
341
|
+
abort();
|
342
|
+
}
|
343
|
+
return 0;
|
344
|
+
}
|
345
|
+
|
346
|
+
static int re_string_list_canonicalize(REParseState *s1,
|
347
|
+
REStringList *s, BOOL is_unicode)
|
348
|
+
{
|
349
|
+
if (cr_regexp_canonicalize(&s->cr, is_unicode))
|
350
|
+
return -1;
|
351
|
+
if (s->n_strings != 0) {
|
352
|
+
REStringList a_s, *a = &a_s;
|
353
|
+
int i, j;
|
354
|
+
REString *p;
|
355
|
+
|
356
|
+
/* XXX: simplify */
|
357
|
+
re_string_list_init(s1, a);
|
358
|
+
|
359
|
+
a->n_strings = s->n_strings;
|
360
|
+
a->hash_size = s->hash_size;
|
361
|
+
a->hash_bits = s->hash_bits;
|
362
|
+
a->hash_table = s->hash_table;
|
363
|
+
|
364
|
+
s->n_strings = 0;
|
365
|
+
s->hash_size = 0;
|
366
|
+
s->hash_bits = 0;
|
367
|
+
s->hash_table = NULL;
|
368
|
+
|
369
|
+
for(i = 0; i < a->hash_size; i++) {
|
370
|
+
for(p = a->hash_table[i]; p != NULL; p = p->next) {
|
371
|
+
for(j = 0; j < p->len; j++) {
|
372
|
+
p->buf[j] = lre_canonicalize(p->buf[j], is_unicode);
|
373
|
+
}
|
374
|
+
if (re_string_add(s, p->len, p->buf)) {
|
375
|
+
re_string_list_free(a);
|
376
|
+
return -1;
|
377
|
+
}
|
378
|
+
}
|
379
|
+
}
|
380
|
+
re_string_list_free(a);
|
381
|
+
}
|
382
|
+
return 0;
|
383
|
+
}
|
384
|
+
|
125
385
|
static const uint16_t char_range_d[] = {
|
126
386
|
1,
|
127
387
|
0x0030, 0x0039 + 1,
|
@@ -170,7 +430,7 @@ static const uint16_t * const char_range_table[] = {
|
|
170
430
|
char_range_w,
|
171
431
|
};
|
172
432
|
|
173
|
-
static int cr_init_char_range(REParseState *s,
|
433
|
+
static int cr_init_char_range(REParseState *s, REStringList *cr, uint32_t c)
|
174
434
|
{
|
175
435
|
BOOL invert;
|
176
436
|
const uint16_t *c_pt;
|
@@ -179,18 +439,18 @@ static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
|
|
179
439
|
invert = c & 1;
|
180
440
|
c_pt = char_range_table[c >> 1];
|
181
441
|
len = *c_pt++;
|
182
|
-
|
442
|
+
re_string_list_init(s, cr);
|
183
443
|
for(i = 0; i < len * 2; i++) {
|
184
|
-
if (cr_add_point(cr, c_pt[i]))
|
444
|
+
if (cr_add_point(&cr->cr, c_pt[i]))
|
185
445
|
goto fail;
|
186
446
|
}
|
187
447
|
if (invert) {
|
188
|
-
if (cr_invert(cr))
|
448
|
+
if (cr_invert(&cr->cr))
|
189
449
|
goto fail;
|
190
450
|
}
|
191
451
|
return 0;
|
192
452
|
fail:
|
193
|
-
|
453
|
+
re_string_list_free(cr);
|
194
454
|
return -1;
|
195
455
|
}
|
196
456
|
|
@@ -240,6 +500,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
|
|
240
500
|
printf("%s", reopcode_info[opcode].name);
|
241
501
|
switch(opcode) {
|
242
502
|
case REOP_char:
|
503
|
+
case REOP_char_i:
|
243
504
|
val = get_u16(buf + pos + 1);
|
244
505
|
if (val >= ' ' && val <= 126)
|
245
506
|
printf(" '%c'", val);
|
@@ -247,6 +508,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
|
|
247
508
|
printf(" 0x%04x", val);
|
248
509
|
break;
|
249
510
|
case REOP_char32:
|
511
|
+
case REOP_char32_i:
|
250
512
|
val = get_u32(buf + pos + 1);
|
251
513
|
if (val >= ' ' && val <= 126)
|
252
514
|
printf(" '%c'", val);
|
@@ -273,7 +535,9 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
|
|
273
535
|
case REOP_save_start:
|
274
536
|
case REOP_save_end:
|
275
537
|
case REOP_back_reference:
|
538
|
+
case REOP_back_reference_i:
|
276
539
|
case REOP_backward_back_reference:
|
540
|
+
case REOP_backward_back_reference_i:
|
277
541
|
printf(" %u", buf[pos + 1]);
|
278
542
|
break;
|
279
543
|
case REOP_save_reset:
|
@@ -284,6 +548,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
|
|
284
548
|
printf(" %d", val);
|
285
549
|
break;
|
286
550
|
case REOP_range:
|
551
|
+
case REOP_range_i:
|
287
552
|
{
|
288
553
|
int n, i;
|
289
554
|
n = get_u16(buf + pos + 1);
|
@@ -295,6 +560,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
|
|
295
560
|
}
|
296
561
|
break;
|
297
562
|
case REOP_range32:
|
563
|
+
case REOP_range32_i:
|
298
564
|
{
|
299
565
|
int n, i;
|
300
566
|
n = get_u16(buf + pos + 1);
|
@@ -533,8 +799,16 @@ static BOOL is_unicode_char(int c)
|
|
533
799
|
(c == '_'));
|
534
800
|
}
|
535
801
|
|
536
|
-
|
537
|
-
|
802
|
+
/* XXX: memory error test */
|
803
|
+
static void seq_prop_cb(void *opaque, const uint32_t *seq, int seq_len)
|
804
|
+
{
|
805
|
+
REStringList *sl = opaque;
|
806
|
+
re_string_add(sl, seq_len, seq);
|
807
|
+
}
|
808
|
+
|
809
|
+
static int parse_unicode_property(REParseState *s, REStringList *cr,
|
810
|
+
const uint8_t **pp, BOOL is_inv,
|
811
|
+
BOOL allow_sequence_prop)
|
538
812
|
{
|
539
813
|
const uint8_t *p;
|
540
814
|
char name[64], value[64];
|
@@ -574,51 +848,76 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
|
|
574
848
|
} else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
|
575
849
|
script_ext = TRUE;
|
576
850
|
do_script:
|
577
|
-
|
578
|
-
ret = unicode_script(cr, value, script_ext);
|
851
|
+
re_string_list_init(s, cr);
|
852
|
+
ret = unicode_script(&cr->cr, value, script_ext);
|
579
853
|
if (ret) {
|
580
|
-
|
854
|
+
re_string_list_free(cr);
|
581
855
|
if (ret == -2)
|
582
856
|
return re_parse_error(s, "unknown unicode script");
|
583
857
|
else
|
584
858
|
goto out_of_memory;
|
585
859
|
}
|
586
860
|
} else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
|
587
|
-
|
588
|
-
ret = unicode_general_category(cr, value);
|
861
|
+
re_string_list_init(s, cr);
|
862
|
+
ret = unicode_general_category(&cr->cr, value);
|
589
863
|
if (ret) {
|
590
|
-
|
864
|
+
re_string_list_free(cr);
|
591
865
|
if (ret == -2)
|
592
866
|
return re_parse_error(s, "unknown unicode general category");
|
593
867
|
else
|
594
868
|
goto out_of_memory;
|
595
869
|
}
|
596
870
|
} else if (value[0] == '\0') {
|
597
|
-
|
598
|
-
ret = unicode_general_category(cr, name);
|
871
|
+
re_string_list_init(s, cr);
|
872
|
+
ret = unicode_general_category(&cr->cr, name);
|
599
873
|
if (ret == -1) {
|
600
|
-
|
874
|
+
re_string_list_free(cr);
|
601
875
|
goto out_of_memory;
|
602
876
|
}
|
603
877
|
if (ret < 0) {
|
604
|
-
ret = unicode_prop(cr, name);
|
605
|
-
if (ret) {
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
878
|
+
ret = unicode_prop(&cr->cr, name);
|
879
|
+
if (ret == -1) {
|
880
|
+
re_string_list_free(cr);
|
881
|
+
goto out_of_memory;
|
882
|
+
}
|
883
|
+
}
|
884
|
+
if (ret < 0 && !is_inv && allow_sequence_prop) {
|
885
|
+
CharRange cr_tmp;
|
886
|
+
cr_init(&cr_tmp, s->opaque, lre_realloc);
|
887
|
+
ret = unicode_sequence_prop(name, seq_prop_cb, cr, &cr_tmp);
|
888
|
+
cr_free(&cr_tmp);
|
889
|
+
if (ret == -1) {
|
890
|
+
re_string_list_free(cr);
|
891
|
+
goto out_of_memory;
|
611
892
|
}
|
612
893
|
}
|
894
|
+
if (ret < 0)
|
895
|
+
goto unknown_property_name;
|
613
896
|
} else {
|
614
897
|
unknown_property_name:
|
615
898
|
return re_parse_error(s, "unknown unicode property name");
|
616
899
|
}
|
617
900
|
|
901
|
+
/* the ordering of case folding and inversion differs with
|
902
|
+
unicode_sets. 'unicode_sets' ordering is more consistent */
|
903
|
+
/* XXX: the spec seems incorrect, we do it as the other engines
|
904
|
+
seem to do it. */
|
905
|
+
if (s->ignore_case && s->unicode_sets) {
|
906
|
+
if (re_string_list_canonicalize(s, cr, s->is_unicode)) {
|
907
|
+
re_string_list_free(cr);
|
908
|
+
goto out_of_memory;
|
909
|
+
}
|
910
|
+
}
|
618
911
|
if (is_inv) {
|
619
|
-
if (cr_invert(cr)) {
|
620
|
-
|
621
|
-
|
912
|
+
if (cr_invert(&cr->cr)) {
|
913
|
+
re_string_list_free(cr);
|
914
|
+
goto out_of_memory;
|
915
|
+
}
|
916
|
+
}
|
917
|
+
if (s->ignore_case && !s->unicode_sets) {
|
918
|
+
if (re_string_list_canonicalize(s, cr, s->is_unicode)) {
|
919
|
+
re_string_list_free(cr);
|
920
|
+
goto out_of_memory;
|
622
921
|
}
|
623
922
|
}
|
624
923
|
*pp = p;
|
@@ -628,10 +927,61 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
|
|
628
927
|
}
|
629
928
|
#endif /* CONFIG_ALL_UNICODE */
|
630
929
|
|
930
|
+
static int get_class_atom(REParseState *s, REStringList *cr,
|
931
|
+
const uint8_t **pp, BOOL inclass);
|
932
|
+
|
933
|
+
static int parse_class_string_disjunction(REParseState *s, REStringList *cr,
|
934
|
+
const uint8_t **pp)
|
935
|
+
{
|
936
|
+
const uint8_t *p;
|
937
|
+
DynBuf str;
|
938
|
+
int c;
|
939
|
+
|
940
|
+
p = *pp;
|
941
|
+
if (*p != '{')
|
942
|
+
return re_parse_error(s, "expecting '{' after \\q");
|
943
|
+
|
944
|
+
dbuf_init2(&str, s->opaque, lre_realloc);
|
945
|
+
re_string_list_init(s, cr);
|
946
|
+
|
947
|
+
p++;
|
948
|
+
for(;;) {
|
949
|
+
str.size = 0;
|
950
|
+
while (*p != '}' && *p != '|') {
|
951
|
+
c = get_class_atom(s, NULL, &p, FALSE);
|
952
|
+
if (c < 0)
|
953
|
+
goto fail;
|
954
|
+
if (dbuf_put_u32(&str, c)) {
|
955
|
+
re_parse_out_of_memory(s);
|
956
|
+
goto fail;
|
957
|
+
}
|
958
|
+
}
|
959
|
+
if (re_string_add(cr, str.size / 4, (uint32_t *)str.buf)) {
|
960
|
+
re_parse_out_of_memory(s);
|
961
|
+
goto fail;
|
962
|
+
}
|
963
|
+
if (*p == '}')
|
964
|
+
break;
|
965
|
+
p++;
|
966
|
+
}
|
967
|
+
if (s->ignore_case) {
|
968
|
+
if (re_string_list_canonicalize(s, cr, TRUE))
|
969
|
+
goto fail;
|
970
|
+
}
|
971
|
+
p++; /* skip the '}' */
|
972
|
+
dbuf_free(&str);
|
973
|
+
*pp = p;
|
974
|
+
return 0;
|
975
|
+
fail:
|
976
|
+
dbuf_free(&str);
|
977
|
+
re_string_list_free(cr);
|
978
|
+
return -1;
|
979
|
+
}
|
980
|
+
|
631
981
|
/* return -1 if error otherwise the character or a class range
|
632
|
-
(CLASS_RANGE_BASE). In case of class range, 'cr' is
|
982
|
+
(CLASS_RANGE_BASE) if cr != NULL. In case of class range, 'cr' is
|
633
983
|
initialized. Otherwise, it is ignored. */
|
634
|
-
static int get_class_atom(REParseState *s,
|
984
|
+
static int get_class_atom(REParseState *s, REStringList *cr,
|
635
985
|
const uint8_t **pp, BOOL inclass)
|
636
986
|
{
|
637
987
|
const uint8_t *p;
|
@@ -666,6 +1016,8 @@ static int get_class_atom(REParseState *s, CharRange *cr,
|
|
666
1016
|
case 'W':
|
667
1017
|
c = CHAR_RANGE_W;
|
668
1018
|
class_range:
|
1019
|
+
if (!cr)
|
1020
|
+
goto default_escape;
|
669
1021
|
if (cr_init_char_range(s, cr, c))
|
670
1022
|
return -1;
|
671
1023
|
c = CLASS_RANGE_BASE;
|
@@ -690,27 +1042,50 @@ static int get_class_atom(REParseState *s, CharRange *cr,
|
|
690
1042
|
if (!inclass && s->is_unicode)
|
691
1043
|
goto invalid_escape;
|
692
1044
|
break;
|
1045
|
+
case '^':
|
1046
|
+
case '$':
|
1047
|
+
case '\\':
|
1048
|
+
case '.':
|
1049
|
+
case '*':
|
1050
|
+
case '+':
|
1051
|
+
case '?':
|
1052
|
+
case '(':
|
1053
|
+
case ')':
|
1054
|
+
case '[':
|
1055
|
+
case ']':
|
1056
|
+
case '{':
|
1057
|
+
case '}':
|
1058
|
+
case '|':
|
1059
|
+
case '/':
|
1060
|
+
/* always valid to escape these characters */
|
1061
|
+
break;
|
693
1062
|
#ifdef CONFIG_ALL_UNICODE
|
694
1063
|
case 'p':
|
695
1064
|
case 'P':
|
696
|
-
if (s->is_unicode) {
|
697
|
-
if (parse_unicode_property(s, cr, &p, (c == 'P')))
|
1065
|
+
if (s->is_unicode && cr) {
|
1066
|
+
if (parse_unicode_property(s, cr, &p, (c == 'P'), s->unicode_sets))
|
698
1067
|
return -1;
|
699
1068
|
c = CLASS_RANGE_BASE;
|
700
1069
|
break;
|
701
1070
|
}
|
702
|
-
|
1071
|
+
goto default_escape;
|
703
1072
|
#endif
|
1073
|
+
case 'q':
|
1074
|
+
if (s->unicode_sets && cr && inclass) {
|
1075
|
+
if (parse_class_string_disjunction(s, cr, &p))
|
1076
|
+
return -1;
|
1077
|
+
c = CLASS_RANGE_BASE;
|
1078
|
+
break;
|
1079
|
+
}
|
1080
|
+
goto default_escape;
|
704
1081
|
default:
|
1082
|
+
default_escape:
|
705
1083
|
p--;
|
706
1084
|
ret = lre_parse_escape(&p, s->is_unicode * 2);
|
707
1085
|
if (ret >= 0) {
|
708
1086
|
c = ret;
|
709
1087
|
} else {
|
710
|
-
if (
|
711
|
-
/* always valid to escape these characters */
|
712
|
-
goto normal_char;
|
713
|
-
} else if (s->is_unicode) {
|
1088
|
+
if (s->is_unicode) {
|
714
1089
|
invalid_escape:
|
715
1090
|
return re_parse_error(s, "invalid escape sequence in regular expression");
|
716
1091
|
} else {
|
@@ -727,6 +1102,48 @@ static int get_class_atom(REParseState *s, CharRange *cr,
|
|
727
1102
|
return re_parse_error(s, "unexpected end");
|
728
1103
|
}
|
729
1104
|
/* fall thru */
|
1105
|
+
goto normal_char;
|
1106
|
+
|
1107
|
+
case '&':
|
1108
|
+
case '!':
|
1109
|
+
case '#':
|
1110
|
+
case '$':
|
1111
|
+
case '%':
|
1112
|
+
case '*':
|
1113
|
+
case '+':
|
1114
|
+
case ',':
|
1115
|
+
case '.':
|
1116
|
+
case ':':
|
1117
|
+
case ';':
|
1118
|
+
case '<':
|
1119
|
+
case '=':
|
1120
|
+
case '>':
|
1121
|
+
case '?':
|
1122
|
+
case '@':
|
1123
|
+
case '^':
|
1124
|
+
case '`':
|
1125
|
+
case '~':
|
1126
|
+
if (s->unicode_sets && p[1] == c) {
|
1127
|
+
/* forbidden double characters */
|
1128
|
+
return re_parse_error(s, "invalid class set operation in regular expression");
|
1129
|
+
}
|
1130
|
+
goto normal_char;
|
1131
|
+
|
1132
|
+
case '(':
|
1133
|
+
case ')':
|
1134
|
+
case '[':
|
1135
|
+
case ']':
|
1136
|
+
case '{':
|
1137
|
+
case '}':
|
1138
|
+
case '/':
|
1139
|
+
case '-':
|
1140
|
+
case '|':
|
1141
|
+
if (s->unicode_sets) {
|
1142
|
+
/* invalid characters in unicode sets */
|
1143
|
+
return re_parse_error(s, "invalid character in class in regular expression");
|
1144
|
+
}
|
1145
|
+
goto normal_char;
|
1146
|
+
|
730
1147
|
default:
|
731
1148
|
normal_char:
|
732
1149
|
/* normal char */
|
@@ -754,8 +1171,6 @@ static int re_emit_range(REParseState *s, const CharRange *cr)
|
|
754
1171
|
if (len >= 65535)
|
755
1172
|
return re_parse_error(s, "too many ranges");
|
756
1173
|
if (len == 0) {
|
757
|
-
/* not sure it can really happen. Emit a match that is always
|
758
|
-
false */
|
759
1174
|
re_emit_op_u32(s, REOP_char32, -1);
|
760
1175
|
} else {
|
761
1176
|
high = cr->points[cr->len - 1];
|
@@ -764,7 +1179,7 @@ static int re_emit_range(REParseState *s, const CharRange *cr)
|
|
764
1179
|
if (high <= 0xffff) {
|
765
1180
|
/* can use 16 bit ranges with the conversion that 0xffff =
|
766
1181
|
infinity */
|
767
|
-
re_emit_op_u16(s, REOP_range, len);
|
1182
|
+
re_emit_op_u16(s, s->ignore_case ? REOP_range_i : REOP_range, len);
|
768
1183
|
for(i = 0; i < cr->len; i += 2) {
|
769
1184
|
dbuf_put_u16(&s->byte_code, cr->points[i]);
|
770
1185
|
high = cr->points[i + 1] - 1;
|
@@ -773,7 +1188,7 @@ static int re_emit_range(REParseState *s, const CharRange *cr)
|
|
773
1188
|
dbuf_put_u16(&s->byte_code, high);
|
774
1189
|
}
|
775
1190
|
} else {
|
776
|
-
re_emit_op_u16(s, REOP_range32, len);
|
1191
|
+
re_emit_op_u16(s, s->ignore_case ? REOP_range32_i : REOP_range32, len);
|
777
1192
|
for(i = 0; i < cr->len; i += 2) {
|
778
1193
|
dbuf_put_u32(&s->byte_code, cr->points[i]);
|
779
1194
|
dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
|
@@ -783,15 +1198,139 @@ static int re_emit_range(REParseState *s, const CharRange *cr)
|
|
783
1198
|
return 0;
|
784
1199
|
}
|
785
1200
|
|
786
|
-
static int
|
1201
|
+
static int re_string_cmp_len(const void *a, const void *b, void *arg)
|
1202
|
+
{
|
1203
|
+
REString *p1 = *(REString **)a;
|
1204
|
+
REString *p2 = *(REString **)b;
|
1205
|
+
return (p1->len < p2->len) - (p1->len > p2->len);
|
1206
|
+
}
|
1207
|
+
|
1208
|
+
static void re_emit_char(REParseState *s, int c)
|
1209
|
+
{
|
1210
|
+
if (c <= 0xffff)
|
1211
|
+
re_emit_op_u16(s, s->ignore_case ? REOP_char_i : REOP_char, c);
|
1212
|
+
else
|
1213
|
+
re_emit_op_u32(s, s->ignore_case ? REOP_char32_i : REOP_char32, c);
|
1214
|
+
}
|
1215
|
+
|
1216
|
+
static int re_emit_string_list(REParseState *s, const REStringList *sl)
|
1217
|
+
{
|
1218
|
+
REString **tab, *p;
|
1219
|
+
int i, j, split_pos, last_match_pos, n;
|
1220
|
+
BOOL has_empty_string, is_last;
|
1221
|
+
|
1222
|
+
// re_string_list_dump("sl", sl);
|
1223
|
+
if (sl->n_strings == 0) {
|
1224
|
+
/* simple case: only characters */
|
1225
|
+
if (re_emit_range(s, &sl->cr))
|
1226
|
+
return -1;
|
1227
|
+
} else {
|
1228
|
+
/* at least one string list is present : match the longest ones first */
|
1229
|
+
/* XXX: add a new op_switch opcode to compile as a trie */
|
1230
|
+
tab = lre_realloc(s->opaque, NULL, sizeof(tab[0]) * sl->n_strings);
|
1231
|
+
if (!tab) {
|
1232
|
+
re_parse_out_of_memory(s);
|
1233
|
+
return -1;
|
1234
|
+
}
|
1235
|
+
has_empty_string = FALSE;
|
1236
|
+
n = 0;
|
1237
|
+
for(i = 0; i < sl->hash_size; i++) {
|
1238
|
+
for(p = sl->hash_table[i]; p != NULL; p = p->next) {
|
1239
|
+
if (p->len == 0) {
|
1240
|
+
has_empty_string = TRUE;
|
1241
|
+
} else {
|
1242
|
+
tab[n++] = p;
|
1243
|
+
}
|
1244
|
+
}
|
1245
|
+
}
|
1246
|
+
assert(n <= sl->n_strings);
|
1247
|
+
|
1248
|
+
rqsort(tab, n, sizeof(tab[0]), re_string_cmp_len, NULL);
|
1249
|
+
|
1250
|
+
last_match_pos = -1;
|
1251
|
+
for(i = 0; i < n; i++) {
|
1252
|
+
p = tab[i];
|
1253
|
+
is_last = !has_empty_string && sl->cr.len == 0 && i == (n - 1);
|
1254
|
+
if (!is_last)
|
1255
|
+
split_pos = re_emit_op_u32(s, REOP_split_next_first, 0);
|
1256
|
+
else
|
1257
|
+
split_pos = 0;
|
1258
|
+
for(j = 0; j < p->len; j++) {
|
1259
|
+
re_emit_char(s, p->buf[j]);
|
1260
|
+
}
|
1261
|
+
if (!is_last) {
|
1262
|
+
last_match_pos = re_emit_op_u32(s, REOP_goto, last_match_pos);
|
1263
|
+
put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4));
|
1264
|
+
}
|
1265
|
+
}
|
1266
|
+
|
1267
|
+
if (sl->cr.len != 0) {
|
1268
|
+
/* char range */
|
1269
|
+
is_last = !has_empty_string;
|
1270
|
+
if (!is_last)
|
1271
|
+
split_pos = re_emit_op_u32(s, REOP_split_next_first, 0);
|
1272
|
+
else
|
1273
|
+
split_pos = 0; /* not used */
|
1274
|
+
if (re_emit_range(s, &sl->cr)) {
|
1275
|
+
lre_realloc(s->opaque, tab, 0);
|
1276
|
+
return -1;
|
1277
|
+
}
|
1278
|
+
if (!is_last)
|
1279
|
+
put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4));
|
1280
|
+
}
|
1281
|
+
|
1282
|
+
/* patch the 'goto match' */
|
1283
|
+
while (last_match_pos != -1) {
|
1284
|
+
int next_pos = get_u32(s->byte_code.buf + last_match_pos);
|
1285
|
+
put_u32(s->byte_code.buf + last_match_pos, s->byte_code.size - (last_match_pos + 4));
|
1286
|
+
last_match_pos = next_pos;
|
1287
|
+
}
|
1288
|
+
|
1289
|
+
lre_realloc(s->opaque, tab, 0);
|
1290
|
+
}
|
1291
|
+
return 0;
|
1292
|
+
}
|
1293
|
+
|
1294
|
+
static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp);
|
1295
|
+
|
1296
|
+
static int re_parse_class_set_operand(REParseState *s, REStringList *cr, const uint8_t **pp)
|
1297
|
+
{
|
1298
|
+
int c1;
|
1299
|
+
const uint8_t *p = *pp;
|
1300
|
+
|
1301
|
+
if (*p == '[') {
|
1302
|
+
if (re_parse_nested_class(s, cr, pp))
|
1303
|
+
return -1;
|
1304
|
+
} else {
|
1305
|
+
c1 = get_class_atom(s, cr, pp, TRUE);
|
1306
|
+
if (c1 < 0)
|
1307
|
+
return -1;
|
1308
|
+
if (c1 < CLASS_RANGE_BASE) {
|
1309
|
+
/* create a range with a single character */
|
1310
|
+
re_string_list_init(s, cr);
|
1311
|
+
if (s->ignore_case)
|
1312
|
+
c1 = lre_canonicalize(c1, s->is_unicode);
|
1313
|
+
if (cr_union_interval(&cr->cr, c1, c1)) {
|
1314
|
+
re_string_list_free(cr);
|
1315
|
+
return -1;
|
1316
|
+
}
|
1317
|
+
}
|
1318
|
+
}
|
1319
|
+
return 0;
|
1320
|
+
}
|
1321
|
+
|
1322
|
+
static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp)
|
787
1323
|
{
|
788
1324
|
const uint8_t *p;
|
789
1325
|
uint32_t c1, c2;
|
790
|
-
|
791
|
-
|
792
|
-
BOOL invert;
|
1326
|
+
int ret;
|
1327
|
+
REStringList cr1_s, *cr1 = &cr1_s;
|
1328
|
+
BOOL invert, is_first;
|
793
1329
|
|
794
|
-
|
1330
|
+
if (lre_check_stack_overflow(s->opaque, 0))
|
1331
|
+
return re_parse_error(s, "stack overflow");
|
1332
|
+
|
1333
|
+
re_string_list_init(s, cr);
|
795
1334
|
p = *pp;
|
796
1335
|
p++; /* skip '[' */
|
797
1336
|
|
@@ -800,74 +1339,155 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
|
|
800
1339
|
p++;
|
801
1340
|
invert = TRUE;
|
802
1341
|
}
|
803
|
-
|
1342
|
+
|
1343
|
+
/* handle unions */
|
1344
|
+
is_first = TRUE;
|
804
1345
|
for(;;) {
|
805
1346
|
if (*p == ']')
|
806
1347
|
break;
|
807
|
-
|
808
|
-
|
809
|
-
goto fail;
|
810
|
-
if (*p == '-' && p[1] != ']') {
|
811
|
-
const uint8_t *p0 = p + 1;
|
812
|
-
if (c1 >= CLASS_RANGE_BASE) {
|
813
|
-
if (s->is_unicode) {
|
814
|
-
cr_free(cr1);
|
815
|
-
goto invalid_class_range;
|
816
|
-
}
|
817
|
-
/* Annex B: match '-' character */
|
818
|
-
goto class_atom;
|
819
|
-
}
|
820
|
-
c2 = get_class_atom(s, cr1, &p0, TRUE);
|
821
|
-
if ((int)c2 < 0)
|
822
|
-
goto fail;
|
823
|
-
if (c2 >= CLASS_RANGE_BASE) {
|
824
|
-
cr_free(cr1);
|
825
|
-
if (s->is_unicode) {
|
826
|
-
goto invalid_class_range;
|
827
|
-
}
|
828
|
-
/* Annex B: match '-' character */
|
829
|
-
goto class_atom;
|
830
|
-
}
|
831
|
-
p = p0;
|
832
|
-
if (c2 < c1) {
|
833
|
-
invalid_class_range:
|
834
|
-
re_parse_error(s, "invalid class range");
|
1348
|
+
if (*p == '[' && s->unicode_sets) {
|
1349
|
+
if (re_parse_nested_class(s, cr1, &p))
|
835
1350
|
goto fail;
|
836
|
-
|
837
|
-
if (cr_union_interval(cr, c1, c2))
|
838
|
-
goto memory_error;
|
1351
|
+
goto class_union;
|
839
1352
|
} else {
|
840
|
-
|
841
|
-
if (c1
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
if (
|
846
|
-
goto
|
1353
|
+
c1 = get_class_atom(s, cr1, &p, TRUE);
|
1354
|
+
if ((int)c1 < 0)
|
1355
|
+
goto fail;
|
1356
|
+
if (*p == '-' && p[1] != ']') {
|
1357
|
+
const uint8_t *p0 = p + 1;
|
1358
|
+
if (p[1] == '-' && s->unicode_sets && is_first)
|
1359
|
+
goto class_atom; /* first character class followed by '--' */
|
1360
|
+
if (c1 >= CLASS_RANGE_BASE) {
|
1361
|
+
if (s->is_unicode) {
|
1362
|
+
re_string_list_free(cr1);
|
1363
|
+
goto invalid_class_range;
|
1364
|
+
}
|
1365
|
+
/* Annex B: match '-' character */
|
1366
|
+
goto class_atom;
|
1367
|
+
}
|
1368
|
+
c2 = get_class_atom(s, cr1, &p0, TRUE);
|
1369
|
+
if ((int)c2 < 0)
|
1370
|
+
goto fail;
|
1371
|
+
if (c2 >= CLASS_RANGE_BASE) {
|
1372
|
+
re_string_list_free(cr1);
|
1373
|
+
if (s->is_unicode) {
|
1374
|
+
goto invalid_class_range;
|
1375
|
+
}
|
1376
|
+
/* Annex B: match '-' character */
|
1377
|
+
goto class_atom;
|
1378
|
+
}
|
1379
|
+
p = p0;
|
1380
|
+
if (c2 < c1) {
|
1381
|
+
invalid_class_range:
|
1382
|
+
re_parse_error(s, "invalid class range");
|
1383
|
+
goto fail;
|
1384
|
+
}
|
1385
|
+
if (s->ignore_case) {
|
1386
|
+
CharRange cr2_s, *cr2 = &cr2_s;
|
1387
|
+
cr_init(cr2, s->opaque, lre_realloc);
|
1388
|
+
if (cr_add_interval(cr2, c1, c2 + 1) ||
|
1389
|
+
cr_regexp_canonicalize(cr2, s->is_unicode) ||
|
1390
|
+
cr_op1(&cr->cr, cr2->points, cr2->len, CR_OP_UNION)) {
|
1391
|
+
cr_free(cr2);
|
1392
|
+
goto memory_error;
|
1393
|
+
}
|
1394
|
+
cr_free(cr2);
|
1395
|
+
} else {
|
1396
|
+
if (cr_union_interval(&cr->cr, c1, c2))
|
1397
|
+
goto memory_error;
|
1398
|
+
}
|
1399
|
+
is_first = FALSE; /* union operation */
|
847
1400
|
} else {
|
848
|
-
|
849
|
-
|
1401
|
+
class_atom:
|
1402
|
+
if (c1 >= CLASS_RANGE_BASE) {
|
1403
|
+
class_union:
|
1404
|
+
ret = re_string_list_op(cr, cr1, CR_OP_UNION);
|
1405
|
+
re_string_list_free(cr1);
|
1406
|
+
if (ret)
|
1407
|
+
goto memory_error;
|
1408
|
+
} else {
|
1409
|
+
if (s->ignore_case)
|
1410
|
+
c1 = lre_canonicalize(c1, s->is_unicode);
|
1411
|
+
if (cr_union_interval(&cr->cr, c1, c1))
|
1412
|
+
goto memory_error;
|
1413
|
+
}
|
850
1414
|
}
|
851
1415
|
}
|
1416
|
+
if (s->unicode_sets && is_first) {
|
1417
|
+
if (*p == '&' && p[1] == '&' && p[2] != '&') {
|
1418
|
+
/* handle '&&' */
|
1419
|
+
for(;;) {
|
1420
|
+
if (*p == ']') {
|
1421
|
+
break;
|
1422
|
+
} else if (*p == '&' && p[1] == '&' && p[2] != '&') {
|
1423
|
+
p += 2;
|
1424
|
+
} else {
|
1425
|
+
goto invalid_operation;
|
1426
|
+
}
|
1427
|
+
if (re_parse_class_set_operand(s, cr1, &p))
|
1428
|
+
goto fail;
|
1429
|
+
ret = re_string_list_op(cr, cr1, CR_OP_INTER);
|
1430
|
+
re_string_list_free(cr1);
|
1431
|
+
if (ret)
|
1432
|
+
goto memory_error;
|
1433
|
+
}
|
1434
|
+
} else if (*p == '-' && p[1] == '-') {
|
1435
|
+
/* handle '--' */
|
1436
|
+
for(;;) {
|
1437
|
+
if (*p == ']') {
|
1438
|
+
break;
|
1439
|
+
} else if (*p == '-' && p[1] == '-') {
|
1440
|
+
p += 2;
|
1441
|
+
} else {
|
1442
|
+
invalid_operation:
|
1443
|
+
re_parse_error(s, "invalid operation in regular expression");
|
1444
|
+
goto fail;
|
1445
|
+
}
|
1446
|
+
if (re_parse_class_set_operand(s, cr1, &p))
|
1447
|
+
goto fail;
|
1448
|
+
ret = re_string_list_op(cr, cr1, CR_OP_SUB);
|
1449
|
+
re_string_list_free(cr1);
|
1450
|
+
if (ret)
|
1451
|
+
goto memory_error;
|
1452
|
+
}
|
1453
|
+
}
|
1454
|
+
}
|
1455
|
+
is_first = FALSE;
|
852
1456
|
}
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
}
|
1457
|
+
|
1458
|
+
p++; /* skip ']' */
|
1459
|
+
*pp = p;
|
857
1460
|
if (invert) {
|
858
|
-
|
1461
|
+
/* XXX: add may_contain_string syntax check to be fully
|
1462
|
+
compliant. The test here accepts more input than the
|
1463
|
+
spec. */
|
1464
|
+
if (cr->n_strings != 0) {
|
1465
|
+
re_parse_error(s, "negated character class with strings in regular expression debugger eval code");
|
1466
|
+
goto fail;
|
1467
|
+
}
|
1468
|
+
if (cr_invert(&cr->cr))
|
859
1469
|
goto memory_error;
|
860
1470
|
}
|
861
|
-
if (re_emit_range(s, cr))
|
862
|
-
goto fail;
|
863
|
-
cr_free(cr);
|
864
|
-
p++; /* skip ']' */
|
865
|
-
*pp = p;
|
866
1471
|
return 0;
|
867
1472
|
memory_error:
|
868
1473
|
re_parse_out_of_memory(s);
|
869
1474
|
fail:
|
870
|
-
|
1475
|
+
re_string_list_free(cr);
|
1476
|
+
return -1;
|
1477
|
+
}
|
1478
|
+
|
1479
|
+
static int re_parse_char_class(REParseState *s, const uint8_t **pp)
|
1480
|
+
{
|
1481
|
+
REStringList cr_s, *cr = &cr_s;
|
1482
|
+
|
1483
|
+
if (re_parse_nested_class(s, cr, pp))
|
1484
|
+
return -1;
|
1485
|
+
if (re_emit_string_list(s, cr))
|
1486
|
+
goto fail;
|
1487
|
+
re_string_list_free(cr);
|
1488
|
+
return 0;
|
1489
|
+
fail:
|
1490
|
+
re_string_list_free(cr);
|
871
1491
|
return -1;
|
872
1492
|
}
|
873
1493
|
|
@@ -888,27 +1508,35 @@ static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
|
|
888
1508
|
len = reopcode_info[opcode].size;
|
889
1509
|
switch(opcode) {
|
890
1510
|
case REOP_range:
|
1511
|
+
case REOP_range_i:
|
891
1512
|
val = get_u16(bc_buf + pos + 1);
|
892
1513
|
len += val * 4;
|
893
1514
|
goto simple_char;
|
894
1515
|
case REOP_range32:
|
1516
|
+
case REOP_range32_i:
|
895
1517
|
val = get_u16(bc_buf + pos + 1);
|
896
1518
|
len += val * 8;
|
897
1519
|
goto simple_char;
|
898
1520
|
case REOP_char:
|
1521
|
+
case REOP_char_i:
|
899
1522
|
case REOP_char32:
|
1523
|
+
case REOP_char32_i:
|
900
1524
|
case REOP_dot:
|
901
1525
|
case REOP_any:
|
902
1526
|
simple_char:
|
903
1527
|
ret = FALSE;
|
904
1528
|
break;
|
905
1529
|
case REOP_line_start:
|
1530
|
+
case REOP_line_start_m:
|
906
1531
|
case REOP_line_end:
|
1532
|
+
case REOP_line_end_m:
|
907
1533
|
case REOP_push_i32:
|
908
1534
|
case REOP_push_char_pos:
|
909
1535
|
case REOP_drop:
|
910
1536
|
case REOP_word_boundary:
|
1537
|
+
case REOP_word_boundary_i:
|
911
1538
|
case REOP_not_word_boundary:
|
1539
|
+
case REOP_not_word_boundary_i:
|
912
1540
|
case REOP_prev:
|
913
1541
|
/* no effect */
|
914
1542
|
break;
|
@@ -916,7 +1544,9 @@ static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
|
|
916
1544
|
case REOP_save_end:
|
917
1545
|
case REOP_save_reset:
|
918
1546
|
case REOP_back_reference:
|
1547
|
+
case REOP_back_reference_i:
|
919
1548
|
case REOP_backward_back_reference:
|
1549
|
+
case REOP_backward_back_reference_i:
|
920
1550
|
break;
|
921
1551
|
default:
|
922
1552
|
/* safe behavior: we cannot predict the outcome */
|
@@ -941,24 +1571,32 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
|
|
941
1571
|
len = reopcode_info[opcode].size;
|
942
1572
|
switch(opcode) {
|
943
1573
|
case REOP_range:
|
1574
|
+
case REOP_range_i:
|
944
1575
|
val = get_u16(bc_buf + pos + 1);
|
945
1576
|
len += val * 4;
|
946
1577
|
goto simple_char;
|
947
1578
|
case REOP_range32:
|
1579
|
+
case REOP_range32_i:
|
948
1580
|
val = get_u16(bc_buf + pos + 1);
|
949
1581
|
len += val * 8;
|
950
1582
|
goto simple_char;
|
951
1583
|
case REOP_char:
|
1584
|
+
case REOP_char_i:
|
952
1585
|
case REOP_char32:
|
1586
|
+
case REOP_char32_i:
|
953
1587
|
case REOP_dot:
|
954
1588
|
case REOP_any:
|
955
1589
|
simple_char:
|
956
1590
|
count++;
|
957
1591
|
break;
|
958
1592
|
case REOP_line_start:
|
1593
|
+
case REOP_line_start_m:
|
959
1594
|
case REOP_line_end:
|
1595
|
+
case REOP_line_end_m:
|
960
1596
|
case REOP_word_boundary:
|
1597
|
+
case REOP_word_boundary_i:
|
961
1598
|
case REOP_not_word_boundary:
|
1599
|
+
case REOP_not_word_boundary_i:
|
962
1600
|
break;
|
963
1601
|
default:
|
964
1602
|
return -1;
|
@@ -1116,12 +1754,47 @@ static int find_group_name(REParseState *s, const char *name)
|
|
1116
1754
|
|
1117
1755
|
static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
|
1118
1756
|
|
1757
|
+
static int re_parse_modifiers(REParseState *s, const uint8_t **pp)
|
1758
|
+
{
|
1759
|
+
const uint8_t *p = *pp;
|
1760
|
+
int mask = 0;
|
1761
|
+
int val;
|
1762
|
+
|
1763
|
+
for(;;) {
|
1764
|
+
if (*p == 'i') {
|
1765
|
+
val = LRE_FLAG_IGNORECASE;
|
1766
|
+
} else if (*p == 'm') {
|
1767
|
+
val = LRE_FLAG_MULTILINE;
|
1768
|
+
} else if (*p == 's') {
|
1769
|
+
val = LRE_FLAG_DOTALL;
|
1770
|
+
} else {
|
1771
|
+
break;
|
1772
|
+
}
|
1773
|
+
if (mask & val)
|
1774
|
+
return re_parse_error(s, "duplicate modifier: '%c'", *p);
|
1775
|
+
mask |= val;
|
1776
|
+
p++;
|
1777
|
+
}
|
1778
|
+
*pp = p;
|
1779
|
+
return mask;
|
1780
|
+
}
|
1781
|
+
|
1782
|
+
static BOOL update_modifier(BOOL val, int add_mask, int remove_mask,
|
1783
|
+
int mask)
|
1784
|
+
{
|
1785
|
+
if (add_mask & mask)
|
1786
|
+
val = TRUE;
|
1787
|
+
if (remove_mask & mask)
|
1788
|
+
val = FALSE;
|
1789
|
+
return val;
|
1790
|
+
}
|
1791
|
+
|
1119
1792
|
static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
1120
1793
|
{
|
1121
1794
|
const uint8_t *p;
|
1122
1795
|
int c, last_atom_start, quant_min, quant_max, last_capture_count;
|
1123
1796
|
BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
|
1124
|
-
|
1797
|
+
REStringList cr_s, *cr = &cr_s;
|
1125
1798
|
|
1126
1799
|
last_atom_start = -1;
|
1127
1800
|
last_capture_count = 0;
|
@@ -1130,11 +1803,11 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
|
1130
1803
|
switch(c) {
|
1131
1804
|
case '^':
|
1132
1805
|
p++;
|
1133
|
-
re_emit_op(s, REOP_line_start);
|
1806
|
+
re_emit_op(s, s->multi_line ? REOP_line_start_m : REOP_line_start);
|
1134
1807
|
break;
|
1135
1808
|
case '$':
|
1136
1809
|
p++;
|
1137
|
-
re_emit_op(s, REOP_line_end);
|
1810
|
+
re_emit_op(s, s->multi_line ? REOP_line_end_m : REOP_line_end);
|
1138
1811
|
break;
|
1139
1812
|
case '.':
|
1140
1813
|
p++;
|
@@ -1184,6 +1857,44 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
|
1184
1857
|
p = s->buf_ptr;
|
1185
1858
|
if (re_parse_expect(s, &p, ')'))
|
1186
1859
|
return -1;
|
1860
|
+
} else if (p[2] == 'i' || p[2] == 'm' || p[2] == 's' || p[2] == '-') {
|
1861
|
+
BOOL saved_ignore_case, saved_multi_line, saved_dotall;
|
1862
|
+
int add_mask, remove_mask;
|
1863
|
+
p += 2;
|
1864
|
+
remove_mask = 0;
|
1865
|
+
add_mask = re_parse_modifiers(s, &p);
|
1866
|
+
if (add_mask < 0)
|
1867
|
+
return -1;
|
1868
|
+
if (*p == '-') {
|
1869
|
+
p++;
|
1870
|
+
remove_mask = re_parse_modifiers(s, &p);
|
1871
|
+
if (remove_mask < 0)
|
1872
|
+
return -1;
|
1873
|
+
}
|
1874
|
+
if ((add_mask == 0 && remove_mask == 0) ||
|
1875
|
+
(add_mask & remove_mask) != 0) {
|
1876
|
+
return re_parse_error(s, "invalid modifiers");
|
1877
|
+
}
|
1878
|
+
if (re_parse_expect(s, &p, ':'))
|
1879
|
+
return -1;
|
1880
|
+
saved_ignore_case = s->ignore_case;
|
1881
|
+
saved_multi_line = s->multi_line;
|
1882
|
+
saved_dotall = s->dotall;
|
1883
|
+
s->ignore_case = update_modifier(s->ignore_case, add_mask, remove_mask, LRE_FLAG_IGNORECASE);
|
1884
|
+
s->multi_line = update_modifier(s->multi_line, add_mask, remove_mask, LRE_FLAG_MULTILINE);
|
1885
|
+
s->dotall = update_modifier(s->dotall, add_mask, remove_mask, LRE_FLAG_DOTALL);
|
1886
|
+
|
1887
|
+
last_atom_start = s->byte_code.size;
|
1888
|
+
last_capture_count = s->capture_count;
|
1889
|
+
s->buf_ptr = p;
|
1890
|
+
if (re_parse_disjunction(s, is_backward_dir))
|
1891
|
+
return -1;
|
1892
|
+
p = s->buf_ptr;
|
1893
|
+
if (re_parse_expect(s, &p, ')'))
|
1894
|
+
return -1;
|
1895
|
+
s->ignore_case = saved_ignore_case;
|
1896
|
+
s->multi_line = saved_multi_line;
|
1897
|
+
s->dotall = saved_dotall;
|
1187
1898
|
} else if ((p[2] == '=' || p[2] == '!')) {
|
1188
1899
|
is_neg = (p[2] == '!');
|
1189
1900
|
is_backward_lookahead = FALSE;
|
@@ -1262,7 +1973,11 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
|
1262
1973
|
switch(p[1]) {
|
1263
1974
|
case 'b':
|
1264
1975
|
case 'B':
|
1265
|
-
|
1976
|
+
if (p[1] != 'b') {
|
1977
|
+
re_emit_op(s, s->ignore_case ? REOP_not_word_boundary_i : REOP_not_word_boundary);
|
1978
|
+
} else {
|
1979
|
+
re_emit_op(s, s->ignore_case ? REOP_word_boundary_i : REOP_word_boundary);
|
1980
|
+
}
|
1266
1981
|
p += 2;
|
1267
1982
|
break;
|
1268
1983
|
case 'k':
|
@@ -1351,7 +2066,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
|
1351
2066
|
emit_back_reference:
|
1352
2067
|
last_atom_start = s->byte_code.size;
|
1353
2068
|
last_capture_count = s->capture_count;
|
1354
|
-
|
2069
|
+
|
2070
|
+
re_emit_op_u8(s, REOP_back_reference + 2 * is_backward_dir + s->ignore_case, c);
|
1355
2071
|
}
|
1356
2072
|
break;
|
1357
2073
|
default:
|
@@ -1385,18 +2101,14 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
|
|
1385
2101
|
re_emit_op(s, REOP_prev);
|
1386
2102
|
if (c >= CLASS_RANGE_BASE) {
|
1387
2103
|
int ret;
|
1388
|
-
|
1389
|
-
|
1390
|
-
cr_free(cr);
|
2104
|
+
ret = re_emit_string_list(s, cr);
|
2105
|
+
re_string_list_free(cr);
|
1391
2106
|
if (ret)
|
1392
2107
|
return -1;
|
1393
2108
|
} else {
|
1394
2109
|
if (s->ignore_case)
|
1395
2110
|
c = lre_canonicalize(c, s->is_unicode);
|
1396
|
-
|
1397
|
-
re_emit_op_u16(s, REOP_char, c);
|
1398
|
-
else
|
1399
|
-
re_emit_op_u32(s, REOP_char32, c);
|
2111
|
+
re_emit_char(s, c);
|
1400
2112
|
}
|
1401
2113
|
if (is_backward_dir)
|
1402
2114
|
re_emit_op(s, REOP_prev);
|
@@ -1706,10 +2418,12 @@ static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
|
|
1706
2418
|
stack_size--;
|
1707
2419
|
break;
|
1708
2420
|
case REOP_range:
|
2421
|
+
case REOP_range_i:
|
1709
2422
|
val = get_u16(bc_buf + pos + 1);
|
1710
2423
|
len += val * 4;
|
1711
2424
|
break;
|
1712
2425
|
case REOP_range32:
|
2426
|
+
case REOP_range32_i:
|
1713
2427
|
val = get_u16(bc_buf + pos + 1);
|
1714
2428
|
len += val * 8;
|
1715
2429
|
break;
|
@@ -1719,6 +2433,17 @@ static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
|
|
1719
2433
|
return stack_size_max;
|
1720
2434
|
}
|
1721
2435
|
|
2436
|
+
static void *lre_bytecode_realloc(void *opaque, void *ptr, size_t size)
|
2437
|
+
{
|
2438
|
+
if (size > (INT32_MAX / 2)) {
|
2439
|
+
/* the bytecode cannot be larger than 2G. Leave some slack to
|
2440
|
+
avoid some overflows. */
|
2441
|
+
return NULL;
|
2442
|
+
} else {
|
2443
|
+
return lre_realloc(opaque, ptr, size);
|
2444
|
+
}
|
2445
|
+
}
|
2446
|
+
|
1722
2447
|
/* 'buf' must be a zero terminated UTF-8 string of length buf_len.
|
1723
2448
|
Return NULL if error and allocate an error message in *perror_msg,
|
1724
2449
|
otherwise the compiled bytecode and its length in plen.
|
@@ -1737,18 +2462,20 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
|
|
1737
2462
|
s->buf_end = s->buf_ptr + buf_len;
|
1738
2463
|
s->buf_start = s->buf_ptr;
|
1739
2464
|
s->re_flags = re_flags;
|
1740
|
-
s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0);
|
2465
|
+
s->is_unicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0);
|
1741
2466
|
is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
|
1742
2467
|
s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
|
2468
|
+
s->multi_line = ((re_flags & LRE_FLAG_MULTILINE) != 0);
|
1743
2469
|
s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
|
2470
|
+
s->unicode_sets = ((re_flags & LRE_FLAG_UNICODE_SETS) != 0);
|
1744
2471
|
s->capture_count = 1;
|
1745
2472
|
s->total_capture_count = -1;
|
1746
2473
|
s->has_named_captures = -1;
|
1747
2474
|
|
1748
|
-
dbuf_init2(&s->byte_code, opaque,
|
2475
|
+
dbuf_init2(&s->byte_code, opaque, lre_bytecode_realloc);
|
1749
2476
|
dbuf_init2(&s->group_names, opaque, lre_realloc);
|
1750
2477
|
|
1751
|
-
|
2478
|
+
dbuf_put_u16(&s->byte_code, re_flags); /* first element is the flags */
|
1752
2479
|
dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
|
1753
2480
|
dbuf_putc(&s->byte_code, 0); /* stack size */
|
1754
2481
|
dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
|
@@ -1801,7 +2528,8 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
|
|
1801
2528
|
/* add the named groups if needed */
|
1802
2529
|
if (s->group_names.size > (s->capture_count - 1)) {
|
1803
2530
|
dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
|
1804
|
-
s->byte_code.buf
|
2531
|
+
put_u16(s->byte_code.buf + RE_HEADER_FLAGS,
|
2532
|
+
lre_get_flags(s->byte_code.buf) | LRE_FLAG_NAMED_GROUPS);
|
1805
2533
|
}
|
1806
2534
|
dbuf_free(&s->group_names);
|
1807
2535
|
|
@@ -1935,8 +2663,6 @@ typedef struct {
|
|
1935
2663
|
int cbuf_type;
|
1936
2664
|
int capture_count;
|
1937
2665
|
int stack_size_max;
|
1938
|
-
BOOL multi_line;
|
1939
|
-
BOOL ignore_case;
|
1940
2666
|
BOOL is_unicode;
|
1941
2667
|
int interrupt_counter;
|
1942
2668
|
void *opaque; /* used for stack overflow check */
|
@@ -2085,17 +2811,19 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2085
2811
|
}
|
2086
2812
|
break;
|
2087
2813
|
case REOP_char32:
|
2814
|
+
case REOP_char32_i:
|
2088
2815
|
val = get_u32(pc);
|
2089
2816
|
pc += 4;
|
2090
2817
|
goto test_char;
|
2091
2818
|
case REOP_char:
|
2819
|
+
case REOP_char_i:
|
2092
2820
|
val = get_u16(pc);
|
2093
2821
|
pc += 2;
|
2094
2822
|
test_char:
|
2095
2823
|
if (cptr >= cbuf_end)
|
2096
2824
|
goto no_match;
|
2097
2825
|
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
|
2098
|
-
if (
|
2826
|
+
if (opcode == REOP_char_i || opcode == REOP_char32_i) {
|
2099
2827
|
c = lre_canonicalize(c, s->is_unicode);
|
2100
2828
|
}
|
2101
2829
|
if (val != c)
|
@@ -2139,18 +2867,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2139
2867
|
return LRE_RET_TIMEOUT;
|
2140
2868
|
break;
|
2141
2869
|
case REOP_line_start:
|
2870
|
+
case REOP_line_start_m:
|
2142
2871
|
if (cptr == s->cbuf)
|
2143
2872
|
break;
|
2144
|
-
if (
|
2873
|
+
if (opcode == REOP_line_start)
|
2145
2874
|
goto no_match;
|
2146
2875
|
PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
|
2147
2876
|
if (!is_line_terminator(c))
|
2148
2877
|
goto no_match;
|
2149
2878
|
break;
|
2150
2879
|
case REOP_line_end:
|
2880
|
+
case REOP_line_end_m:
|
2151
2881
|
if (cptr == cbuf_end)
|
2152
2882
|
break;
|
2153
|
-
if (
|
2883
|
+
if (opcode == REOP_line_end)
|
2154
2884
|
goto no_match;
|
2155
2885
|
PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
|
2156
2886
|
if (!is_line_terminator(c))
|
@@ -2213,14 +2943,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2213
2943
|
goto no_match;
|
2214
2944
|
break;
|
2215
2945
|
case REOP_word_boundary:
|
2946
|
+
case REOP_word_boundary_i:
|
2216
2947
|
case REOP_not_word_boundary:
|
2948
|
+
case REOP_not_word_boundary_i:
|
2217
2949
|
{
|
2218
2950
|
BOOL v1, v2;
|
2951
|
+
int ignore_case = (opcode == REOP_word_boundary_i || opcode == REOP_not_word_boundary_i);
|
2952
|
+
BOOL is_boundary = (opcode == REOP_word_boundary || opcode == REOP_word_boundary_i);
|
2219
2953
|
/* char before */
|
2220
2954
|
if (cptr == s->cbuf) {
|
2221
2955
|
v1 = FALSE;
|
2222
2956
|
} else {
|
2223
2957
|
PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
|
2958
|
+
if (ignore_case)
|
2959
|
+
c = lre_canonicalize(c, s->is_unicode);
|
2224
2960
|
v1 = is_word_char(c);
|
2225
2961
|
}
|
2226
2962
|
/* current char */
|
@@ -2228,14 +2964,18 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2228
2964
|
v2 = FALSE;
|
2229
2965
|
} else {
|
2230
2966
|
PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
|
2967
|
+
if (ignore_case)
|
2968
|
+
c = lre_canonicalize(c, s->is_unicode);
|
2231
2969
|
v2 = is_word_char(c);
|
2232
2970
|
}
|
2233
|
-
if (v1 ^ v2 ^
|
2971
|
+
if (v1 ^ v2 ^ is_boundary)
|
2234
2972
|
goto no_match;
|
2235
2973
|
}
|
2236
2974
|
break;
|
2237
2975
|
case REOP_back_reference:
|
2976
|
+
case REOP_back_reference_i:
|
2238
2977
|
case REOP_backward_back_reference:
|
2978
|
+
case REOP_backward_back_reference_i:
|
2239
2979
|
{
|
2240
2980
|
const uint8_t *cptr1, *cptr1_end, *cptr1_start;
|
2241
2981
|
uint32_t c1, c2;
|
@@ -2247,14 +2987,15 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2247
2987
|
cptr1_end = capture[2 * val + 1];
|
2248
2988
|
if (!cptr1_start || !cptr1_end)
|
2249
2989
|
break;
|
2250
|
-
if (opcode == REOP_back_reference
|
2990
|
+
if (opcode == REOP_back_reference ||
|
2991
|
+
opcode == REOP_back_reference_i) {
|
2251
2992
|
cptr1 = cptr1_start;
|
2252
2993
|
while (cptr1 < cptr1_end) {
|
2253
2994
|
if (cptr >= cbuf_end)
|
2254
2995
|
goto no_match;
|
2255
2996
|
GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
|
2256
2997
|
GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
|
2257
|
-
if (
|
2998
|
+
if (opcode == REOP_back_reference_i) {
|
2258
2999
|
c1 = lre_canonicalize(c1, s->is_unicode);
|
2259
3000
|
c2 = lre_canonicalize(c2, s->is_unicode);
|
2260
3001
|
}
|
@@ -2268,7 +3009,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2268
3009
|
goto no_match;
|
2269
3010
|
GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
|
2270
3011
|
GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
|
2271
|
-
if (
|
3012
|
+
if (opcode == REOP_backward_back_reference_i) {
|
2272
3013
|
c1 = lre_canonicalize(c1, s->is_unicode);
|
2273
3014
|
c2 = lre_canonicalize(c2, s->is_unicode);
|
2274
3015
|
}
|
@@ -2279,6 +3020,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2279
3020
|
}
|
2280
3021
|
break;
|
2281
3022
|
case REOP_range:
|
3023
|
+
case REOP_range_i:
|
2282
3024
|
{
|
2283
3025
|
int n;
|
2284
3026
|
uint32_t low, high, idx_min, idx_max, idx;
|
@@ -2288,7 +3030,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2288
3030
|
if (cptr >= cbuf_end)
|
2289
3031
|
goto no_match;
|
2290
3032
|
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
|
2291
|
-
if (
|
3033
|
+
if (opcode == REOP_range_i) {
|
2292
3034
|
c = lre_canonicalize(c, s->is_unicode);
|
2293
3035
|
}
|
2294
3036
|
idx_min = 0;
|
@@ -2319,6 +3061,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2319
3061
|
}
|
2320
3062
|
break;
|
2321
3063
|
case REOP_range32:
|
3064
|
+
case REOP_range32_i:
|
2322
3065
|
{
|
2323
3066
|
int n;
|
2324
3067
|
uint32_t low, high, idx_min, idx_max, idx;
|
@@ -2328,7 +3071,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
|
|
2328
3071
|
if (cptr >= cbuf_end)
|
2329
3072
|
goto no_match;
|
2330
3073
|
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
|
2331
|
-
if (
|
3074
|
+
if (opcode == REOP_range32_i) {
|
2332
3075
|
c = lre_canonicalize(c, s->is_unicode);
|
2333
3076
|
}
|
2334
3077
|
idx_min = 0;
|
@@ -2420,11 +3163,10 @@ int lre_exec(uint8_t **capture,
|
|
2420
3163
|
REExecContext s_s, *s = &s_s;
|
2421
3164
|
int re_flags, i, alloca_size, ret;
|
2422
3165
|
StackInt *stack_buf;
|
3166
|
+
const uint8_t *cptr;
|
2423
3167
|
|
2424
3168
|
re_flags = lre_get_flags(bc_buf);
|
2425
|
-
s->
|
2426
|
-
s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
|
2427
|
-
s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
|
3169
|
+
s->is_unicode = (re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0;
|
2428
3170
|
s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
|
2429
3171
|
s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
|
2430
3172
|
s->cbuf = cbuf;
|
@@ -2446,8 +3188,17 @@ int lre_exec(uint8_t **capture,
|
|
2446
3188
|
capture[i] = NULL;
|
2447
3189
|
alloca_size = s->stack_size_max * sizeof(stack_buf[0]);
|
2448
3190
|
stack_buf = alloca(alloca_size);
|
3191
|
+
|
3192
|
+
cptr = cbuf + (cindex << cbuf_type);
|
3193
|
+
if (0 < cindex && cindex < clen && s->cbuf_type == 2) {
|
3194
|
+
const uint16_t *p = (const uint16_t *)cptr;
|
3195
|
+
if (is_lo_surrogate(*p) && is_hi_surrogate(p[-1])) {
|
3196
|
+
cptr = (const uint8_t *)(p - 1);
|
3197
|
+
}
|
3198
|
+
}
|
3199
|
+
|
2449
3200
|
ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN,
|
2450
|
-
|
3201
|
+
cptr, FALSE);
|
2451
3202
|
lre_realloc(s->opaque, s->state_stack, 0);
|
2452
3203
|
return ret;
|
2453
3204
|
}
|
@@ -2459,7 +3210,7 @@ int lre_get_capture_count(const uint8_t *bc_buf)
|
|
2459
3210
|
|
2460
3211
|
int lre_get_flags(const uint8_t *bc_buf)
|
2461
3212
|
{
|
2462
|
-
return bc_buf
|
3213
|
+
return get_u16(bc_buf + RE_HEADER_FLAGS);
|
2463
3214
|
}
|
2464
3215
|
|
2465
3216
|
/* Return NULL if no group names. Otherwise, return a pointer to
|