quickjs 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -71,7 +71,9 @@ typedef struct {
71
71
  const uint8_t *buf_start;
72
72
  int re_flags;
73
73
  BOOL is_unicode;
74
+ BOOL unicode_sets; /* if set, is_unicode is also set */
74
75
  BOOL ignore_case;
76
+ BOOL multi_line;
75
77
  BOOL dotall;
76
78
  int capture_count;
77
79
  int total_capture_count; /* -1 = not computed yet */
@@ -102,11 +104,11 @@ static const REOpCode reopcode_info[REOP_COUNT] = {
102
104
  };
103
105
 
104
106
  #define RE_HEADER_FLAGS 0
105
- #define RE_HEADER_CAPTURE_COUNT 1
106
- #define RE_HEADER_STACK_SIZE 2
107
- #define RE_HEADER_BYTECODE_LEN 3
107
+ #define RE_HEADER_CAPTURE_COUNT 2
108
+ #define RE_HEADER_STACK_SIZE 3
109
+ #define RE_HEADER_BYTECODE_LEN 4
108
110
 
109
- #define RE_HEADER_LEN 7
111
+ #define RE_HEADER_LEN 8
110
112
 
111
113
  static inline int is_digit(int c) {
112
114
  return c >= '0' && c <= '9';
@@ -122,6 +124,264 @@ static int dbuf_insert(DynBuf *s, int pos, int len)
122
124
  return 0;
123
125
  }
124
126
 
127
/* One string of an REStringList, chained inside a hash bucket. */
typedef struct REString {
    struct REString *next; /* next entry of the same bucket (NULL at the end) */
    uint32_t hash;         /* full 32 bit hash of buf[0..len-1] */
    uint32_t len;          /* number of characters stored in buf[] */
    uint32_t buf[];        /* the characters, one uint32_t each */
} REString;
133
+
134
+ typedef struct {
135
+ /* the string list is the union of 'char_range' and of the strings
136
+ in hash_table[]. The strings in hash_table[] have a length !=
137
+ 1. */
138
+ CharRange cr;
139
+ uint32_t n_strings;
140
+ uint32_t hash_size;
141
+ int hash_bits;
142
+ REString **hash_table;
143
+ } REStringList;
144
+
145
/* Return the 32 bit hash of the 'len' characters of 'buf'. The
   polynomial accumulation is followed by a final multiplication so
   that the upper bits (used as bucket index) are well mixed. */
static uint32_t re_string_hash(int len, const uint32_t *buf)
{
    uint32_t acc = 1;
    int k = 0;

    while (k < len) {
        acc = acc * 263 + buf[k];
        k++;
    }
    return acc * 0x61C88647;
}
154
+
155
+ static void re_string_list_init(REParseState *s1, REStringList *s)
156
+ {
157
+ cr_init(&s->cr, s1->opaque, lre_realloc);
158
+ s->n_strings = 0;
159
+ s->hash_size = 0;
160
+ s->hash_bits = 0;
161
+ s->hash_table = NULL;
162
+ }
163
+
164
+ static void re_string_list_free(REStringList *s)
165
+ {
166
+ REString *p, *p_next;
167
+ int i;
168
+ for(i = 0; i < s->hash_size; i++) {
169
+ for(p = s->hash_table[i]; p != NULL; p = p_next) {
170
+ p_next = p->next;
171
+ lre_realloc(s->cr.mem_opaque, p, 0);
172
+ }
173
+ }
174
+ lre_realloc(s->cr.mem_opaque, s->hash_table, 0);
175
+
176
+ cr_free(&s->cr);
177
+ }
178
+
179
+ static void lre_print_char(int c, BOOL is_range)
180
+ {
181
+ if (c == '\'' || c == '\\' ||
182
+ (is_range && (c == '-' || c == ']'))) {
183
+ printf("\\%c", c);
184
+ } else if (c >= ' ' && c <= 126) {
185
+ printf("%c", c);
186
+ } else {
187
+ printf("\\u{%04x}", c);
188
+ }
189
+ }
190
+
191
+ static __maybe_unused void re_string_list_dump(const char *str, const REStringList *s)
192
+ {
193
+ REString *p;
194
+ const CharRange *cr;
195
+ int i, j, k;
196
+
197
+ printf("%s:\n", str);
198
+ printf(" ranges: [");
199
+ cr = &s->cr;
200
+ for(i = 0; i < cr->len; i += 2) {
201
+ lre_print_char(cr->points[i], TRUE);
202
+ if (cr->points[i] != cr->points[i + 1] - 1) {
203
+ printf("-");
204
+ lre_print_char(cr->points[i + 1] - 1, TRUE);
205
+ }
206
+ }
207
+ printf("]\n");
208
+
209
+ j = 0;
210
+ for(i = 0; i < s->hash_size; i++) {
211
+ for(p = s->hash_table[i]; p != NULL; p = p->next) {
212
+ printf(" %d/%d: '", j, s->n_strings);
213
+ for(k = 0; k < p->len; k++) {
214
+ lre_print_char(p->buf[k], FALSE);
215
+ }
216
+ printf("'\n");
217
+ j++;
218
+ }
219
+ }
220
+ }
221
+
222
+ static int re_string_find2(REStringList *s, int len, const uint32_t *buf,
223
+ uint32_t h0, BOOL add_flag)
224
+ {
225
+ uint32_t h = 0; /* avoid warning */
226
+ REString *p;
227
+ if (s->n_strings != 0) {
228
+ h = h0 >> (32 - s->hash_bits);
229
+ for(p = s->hash_table[h]; p != NULL; p = p->next) {
230
+ if (p->hash == h0 && p->len == len &&
231
+ !memcmp(p->buf, buf, len * sizeof(buf[0]))) {
232
+ return 1;
233
+ }
234
+ }
235
+ }
236
+ /* not found */
237
+ if (!add_flag)
238
+ return 0;
239
+ /* increase the size of the hash table if needed */
240
+ if (unlikely((s->n_strings + 1) > s->hash_size)) {
241
+ REString **new_hash_table, *p_next;
242
+ int new_hash_bits, i;
243
+ uint32_t new_hash_size;
244
+ new_hash_bits = max_int(s->hash_bits + 1, 4);
245
+ new_hash_size = 1 << new_hash_bits;
246
+ new_hash_table = lre_realloc(s->cr.mem_opaque, NULL,
247
+ sizeof(new_hash_table[0]) * new_hash_size);
248
+ if (!new_hash_table)
249
+ return -1;
250
+ memset(new_hash_table, 0, sizeof(new_hash_table[0]) * new_hash_size);
251
+ for(i = 0; i < s->hash_size; i++) {
252
+ for(p = s->hash_table[i]; p != NULL; p = p_next) {
253
+ p_next = p->next;
254
+ h = p->hash >> (32 - new_hash_bits);
255
+ p->next = new_hash_table[h];
256
+ new_hash_table[h] = p;
257
+ }
258
+ }
259
+ lre_realloc(s->cr.mem_opaque, s->hash_table, 0);
260
+ s->hash_bits = new_hash_bits;
261
+ s->hash_size = new_hash_size;
262
+ s->hash_table = new_hash_table;
263
+ h = h0 >> (32 - s->hash_bits);
264
+ }
265
+
266
+ p = lre_realloc(s->cr.mem_opaque, NULL, sizeof(REString) + len * sizeof(buf[0]));
267
+ if (!p)
268
+ return -1;
269
+ p->next = s->hash_table[h];
270
+ s->hash_table[h] = p;
271
+ s->n_strings++;
272
+ p->hash = h0;
273
+ p->len = len;
274
+ memcpy(p->buf, buf, sizeof(buf[0]) * len);
275
+ return 1;
276
+ }
277
+
278
+ static int re_string_find(REStringList *s, int len, const uint32_t *buf,
279
+ BOOL add_flag)
280
+ {
281
+ uint32_t h0;
282
+ h0 = re_string_hash(len, buf);
283
+ return re_string_find2(s, len, buf, h0, add_flag);
284
+ }
285
+
286
+ /* return -1 if memory error, 0 if OK */
287
+ static int re_string_add(REStringList *s, int len, const uint32_t *buf)
288
+ {
289
+ if (len == 1) {
290
+ return cr_union_interval(&s->cr, buf[0], buf[0]);
291
+ }
292
+ if (re_string_find(s, len, buf, TRUE) < 0)
293
+ return -1;
294
+ return 0;
295
+ }
296
+
297
+ /* a = a op b */
298
+ static int re_string_list_op(REStringList *a, REStringList *b, int op)
299
+ {
300
+ int i, ret;
301
+ REString *p, **pp;
302
+
303
+ if (cr_op1(&a->cr, b->cr.points, b->cr.len, op))
304
+ return -1;
305
+
306
+ switch(op) {
307
+ case CR_OP_UNION:
308
+ if (b->n_strings != 0) {
309
+ for(i = 0; i < b->hash_size; i++) {
310
+ for(p = b->hash_table[i]; p != NULL; p = p->next) {
311
+ if (re_string_find2(a, p->len, p->buf, p->hash, TRUE) < 0)
312
+ return -1;
313
+ }
314
+ }
315
+ }
316
+ break;
317
+ case CR_OP_INTER:
318
+ case CR_OP_SUB:
319
+ for(i = 0; i < a->hash_size; i++) {
320
+ pp = &a->hash_table[i];
321
+ for(;;) {
322
+ p = *pp;
323
+ if (p == NULL)
324
+ break;
325
+ ret = re_string_find2(b, p->len, p->buf, p->hash, FALSE);
326
+ if (op == CR_OP_SUB)
327
+ ret = !ret;
328
+ if (!ret) {
329
+ /* remove it */
330
+ *pp = p->next;
331
+ a->n_strings--;
332
+ lre_realloc(a->cr.mem_opaque, p, 0);
333
+ } else {
334
+ /* keep it */
335
+ pp = &p->next;
336
+ }
337
+ }
338
+ }
339
+ break;
340
+ default:
341
+ abort();
342
+ }
343
+ return 0;
344
+ }
345
+
346
+ static int re_string_list_canonicalize(REParseState *s1,
347
+ REStringList *s, BOOL is_unicode)
348
+ {
349
+ if (cr_regexp_canonicalize(&s->cr, is_unicode))
350
+ return -1;
351
+ if (s->n_strings != 0) {
352
+ REStringList a_s, *a = &a_s;
353
+ int i, j;
354
+ REString *p;
355
+
356
+ /* XXX: simplify */
357
+ re_string_list_init(s1, a);
358
+
359
+ a->n_strings = s->n_strings;
360
+ a->hash_size = s->hash_size;
361
+ a->hash_bits = s->hash_bits;
362
+ a->hash_table = s->hash_table;
363
+
364
+ s->n_strings = 0;
365
+ s->hash_size = 0;
366
+ s->hash_bits = 0;
367
+ s->hash_table = NULL;
368
+
369
+ for(i = 0; i < a->hash_size; i++) {
370
+ for(p = a->hash_table[i]; p != NULL; p = p->next) {
371
+ for(j = 0; j < p->len; j++) {
372
+ p->buf[j] = lre_canonicalize(p->buf[j], is_unicode);
373
+ }
374
+ if (re_string_add(s, p->len, p->buf)) {
375
+ re_string_list_free(a);
376
+ return -1;
377
+ }
378
+ }
379
+ }
380
+ re_string_list_free(a);
381
+ }
382
+ return 0;
383
+ }
384
+
125
385
  static const uint16_t char_range_d[] = {
126
386
  1,
127
387
  0x0030, 0x0039 + 1,
@@ -170,7 +430,7 @@ static const uint16_t * const char_range_table[] = {
170
430
  char_range_w,
171
431
  };
172
432
 
173
- static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
433
+ static int cr_init_char_range(REParseState *s, REStringList *cr, uint32_t c)
174
434
  {
175
435
  BOOL invert;
176
436
  const uint16_t *c_pt;
@@ -179,18 +439,18 @@ static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
179
439
  invert = c & 1;
180
440
  c_pt = char_range_table[c >> 1];
181
441
  len = *c_pt++;
182
- cr_init(cr, s->opaque, lre_realloc);
442
+ re_string_list_init(s, cr);
183
443
  for(i = 0; i < len * 2; i++) {
184
- if (cr_add_point(cr, c_pt[i]))
444
+ if (cr_add_point(&cr->cr, c_pt[i]))
185
445
  goto fail;
186
446
  }
187
447
  if (invert) {
188
- if (cr_invert(cr))
448
+ if (cr_invert(&cr->cr))
189
449
  goto fail;
190
450
  }
191
451
  return 0;
192
452
  fail:
193
- cr_free(cr);
453
+ re_string_list_free(cr);
194
454
  return -1;
195
455
  }
196
456
 
@@ -240,6 +500,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
240
500
  printf("%s", reopcode_info[opcode].name);
241
501
  switch(opcode) {
242
502
  case REOP_char:
503
+ case REOP_char_i:
243
504
  val = get_u16(buf + pos + 1);
244
505
  if (val >= ' ' && val <= 126)
245
506
  printf(" '%c'", val);
@@ -247,6 +508,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
247
508
  printf(" 0x%04x", val);
248
509
  break;
249
510
  case REOP_char32:
511
+ case REOP_char32_i:
250
512
  val = get_u32(buf + pos + 1);
251
513
  if (val >= ' ' && val <= 126)
252
514
  printf(" '%c'", val);
@@ -273,7 +535,9 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
273
535
  case REOP_save_start:
274
536
  case REOP_save_end:
275
537
  case REOP_back_reference:
538
+ case REOP_back_reference_i:
276
539
  case REOP_backward_back_reference:
540
+ case REOP_backward_back_reference_i:
277
541
  printf(" %u", buf[pos + 1]);
278
542
  break;
279
543
  case REOP_save_reset:
@@ -284,6 +548,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
284
548
  printf(" %d", val);
285
549
  break;
286
550
  case REOP_range:
551
+ case REOP_range_i:
287
552
  {
288
553
  int n, i;
289
554
  n = get_u16(buf + pos + 1);
@@ -295,6 +560,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
295
560
  }
296
561
  break;
297
562
  case REOP_range32:
563
+ case REOP_range32_i:
298
564
  {
299
565
  int n, i;
300
566
  n = get_u16(buf + pos + 1);
@@ -533,8 +799,16 @@ static BOOL is_unicode_char(int c)
533
799
  (c == '_'));
534
800
  }
535
801
 
536
- static int parse_unicode_property(REParseState *s, CharRange *cr,
537
- const uint8_t **pp, BOOL is_inv)
802
+ /* XXX: memory error test */
803
+ static void seq_prop_cb(void *opaque, const uint32_t *seq, int seq_len)
804
+ {
805
+ REStringList *sl = opaque;
806
+ re_string_add(sl, seq_len, seq);
807
+ }
808
+
809
+ static int parse_unicode_property(REParseState *s, REStringList *cr,
810
+ const uint8_t **pp, BOOL is_inv,
811
+ BOOL allow_sequence_prop)
538
812
  {
539
813
  const uint8_t *p;
540
814
  char name[64], value[64];
@@ -574,51 +848,76 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
574
848
  } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
575
849
  script_ext = TRUE;
576
850
  do_script:
577
- cr_init(cr, s->opaque, lre_realloc);
578
- ret = unicode_script(cr, value, script_ext);
851
+ re_string_list_init(s, cr);
852
+ ret = unicode_script(&cr->cr, value, script_ext);
579
853
  if (ret) {
580
- cr_free(cr);
854
+ re_string_list_free(cr);
581
855
  if (ret == -2)
582
856
  return re_parse_error(s, "unknown unicode script");
583
857
  else
584
858
  goto out_of_memory;
585
859
  }
586
860
  } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
587
- cr_init(cr, s->opaque, lre_realloc);
588
- ret = unicode_general_category(cr, value);
861
+ re_string_list_init(s, cr);
862
+ ret = unicode_general_category(&cr->cr, value);
589
863
  if (ret) {
590
- cr_free(cr);
864
+ re_string_list_free(cr);
591
865
  if (ret == -2)
592
866
  return re_parse_error(s, "unknown unicode general category");
593
867
  else
594
868
  goto out_of_memory;
595
869
  }
596
870
  } else if (value[0] == '\0') {
597
- cr_init(cr, s->opaque, lre_realloc);
598
- ret = unicode_general_category(cr, name);
871
+ re_string_list_init(s, cr);
872
+ ret = unicode_general_category(&cr->cr, name);
599
873
  if (ret == -1) {
600
- cr_free(cr);
874
+ re_string_list_free(cr);
601
875
  goto out_of_memory;
602
876
  }
603
877
  if (ret < 0) {
604
- ret = unicode_prop(cr, name);
605
- if (ret) {
606
- cr_free(cr);
607
- if (ret == -2)
608
- goto unknown_property_name;
609
- else
610
- goto out_of_memory;
878
+ ret = unicode_prop(&cr->cr, name);
879
+ if (ret == -1) {
880
+ re_string_list_free(cr);
881
+ goto out_of_memory;
882
+ }
883
+ }
884
+ if (ret < 0 && !is_inv && allow_sequence_prop) {
885
+ CharRange cr_tmp;
886
+ cr_init(&cr_tmp, s->opaque, lre_realloc);
887
+ ret = unicode_sequence_prop(name, seq_prop_cb, cr, &cr_tmp);
888
+ cr_free(&cr_tmp);
889
+ if (ret == -1) {
890
+ re_string_list_free(cr);
891
+ goto out_of_memory;
611
892
  }
612
893
  }
894
+ if (ret < 0)
895
+ goto unknown_property_name;
613
896
  } else {
614
897
  unknown_property_name:
615
898
  return re_parse_error(s, "unknown unicode property name");
616
899
  }
617
900
 
901
+ /* the ordering of case folding and inversion differs with
902
+ unicode_sets. 'unicode_sets' ordering is more consistent */
903
+ /* XXX: the spec seems incorrect, we do it as the other engines
904
+ seem to do it. */
905
+ if (s->ignore_case && s->unicode_sets) {
906
+ if (re_string_list_canonicalize(s, cr, s->is_unicode)) {
907
+ re_string_list_free(cr);
908
+ goto out_of_memory;
909
+ }
910
+ }
618
911
  if (is_inv) {
619
- if (cr_invert(cr)) {
620
- cr_free(cr);
621
- return -1;
912
+ if (cr_invert(&cr->cr)) {
913
+ re_string_list_free(cr);
914
+ goto out_of_memory;
915
+ }
916
+ }
917
+ if (s->ignore_case && !s->unicode_sets) {
918
+ if (re_string_list_canonicalize(s, cr, s->is_unicode)) {
919
+ re_string_list_free(cr);
920
+ goto out_of_memory;
622
921
  }
623
922
  }
624
923
  *pp = p;
@@ -628,10 +927,61 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
628
927
  }
629
928
  #endif /* CONFIG_ALL_UNICODE */
630
929
 
930
+ static int get_class_atom(REParseState *s, REStringList *cr,
931
+ const uint8_t **pp, BOOL inclass);
932
+
933
+ static int parse_class_string_disjunction(REParseState *s, REStringList *cr,
934
+ const uint8_t **pp)
935
+ {
936
+ const uint8_t *p;
937
+ DynBuf str;
938
+ int c;
939
+
940
+ p = *pp;
941
+ if (*p != '{')
942
+ return re_parse_error(s, "expecting '{' after \\q");
943
+
944
+ dbuf_init2(&str, s->opaque, lre_realloc);
945
+ re_string_list_init(s, cr);
946
+
947
+ p++;
948
+ for(;;) {
949
+ str.size = 0;
950
+ while (*p != '}' && *p != '|') {
951
+ c = get_class_atom(s, NULL, &p, FALSE);
952
+ if (c < 0)
953
+ goto fail;
954
+ if (dbuf_put_u32(&str, c)) {
955
+ re_parse_out_of_memory(s);
956
+ goto fail;
957
+ }
958
+ }
959
+ if (re_string_add(cr, str.size / 4, (uint32_t *)str.buf)) {
960
+ re_parse_out_of_memory(s);
961
+ goto fail;
962
+ }
963
+ if (*p == '}')
964
+ break;
965
+ p++;
966
+ }
967
+ if (s->ignore_case) {
968
+ if (re_string_list_canonicalize(s, cr, TRUE))
969
+ goto fail;
970
+ }
971
+ p++; /* skip the '}' */
972
+ dbuf_free(&str);
973
+ *pp = p;
974
+ return 0;
975
+ fail:
976
+ dbuf_free(&str);
977
+ re_string_list_free(cr);
978
+ return -1;
979
+ }
980
+
631
981
  /* return -1 if error otherwise the character or a class range
632
- (CLASS_RANGE_BASE). In case of class range, 'cr' is
982
+ (CLASS_RANGE_BASE) if cr != NULL. In case of class range, 'cr' is
633
983
  initialized. Otherwise, it is ignored. */
634
- static int get_class_atom(REParseState *s, CharRange *cr,
984
+ static int get_class_atom(REParseState *s, REStringList *cr,
635
985
  const uint8_t **pp, BOOL inclass)
636
986
  {
637
987
  const uint8_t *p;
@@ -666,6 +1016,8 @@ static int get_class_atom(REParseState *s, CharRange *cr,
666
1016
  case 'W':
667
1017
  c = CHAR_RANGE_W;
668
1018
  class_range:
1019
+ if (!cr)
1020
+ goto default_escape;
669
1021
  if (cr_init_char_range(s, cr, c))
670
1022
  return -1;
671
1023
  c = CLASS_RANGE_BASE;
@@ -690,27 +1042,50 @@ static int get_class_atom(REParseState *s, CharRange *cr,
690
1042
  if (!inclass && s->is_unicode)
691
1043
  goto invalid_escape;
692
1044
  break;
1045
+ case '^':
1046
+ case '$':
1047
+ case '\\':
1048
+ case '.':
1049
+ case '*':
1050
+ case '+':
1051
+ case '?':
1052
+ case '(':
1053
+ case ')':
1054
+ case '[':
1055
+ case ']':
1056
+ case '{':
1057
+ case '}':
1058
+ case '|':
1059
+ case '/':
1060
+ /* always valid to escape these characters */
1061
+ break;
693
1062
  #ifdef CONFIG_ALL_UNICODE
694
1063
  case 'p':
695
1064
  case 'P':
696
- if (s->is_unicode) {
697
- if (parse_unicode_property(s, cr, &p, (c == 'P')))
1065
+ if (s->is_unicode && cr) {
1066
+ if (parse_unicode_property(s, cr, &p, (c == 'P'), s->unicode_sets))
698
1067
  return -1;
699
1068
  c = CLASS_RANGE_BASE;
700
1069
  break;
701
1070
  }
702
- /* fall thru */
1071
+ goto default_escape;
703
1072
  #endif
1073
+ case 'q':
1074
+ if (s->unicode_sets && cr && inclass) {
1075
+ if (parse_class_string_disjunction(s, cr, &p))
1076
+ return -1;
1077
+ c = CLASS_RANGE_BASE;
1078
+ break;
1079
+ }
1080
+ goto default_escape;
704
1081
  default:
1082
+ default_escape:
705
1083
  p--;
706
1084
  ret = lre_parse_escape(&p, s->is_unicode * 2);
707
1085
  if (ret >= 0) {
708
1086
  c = ret;
709
1087
  } else {
710
- if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
711
- /* always valid to escape these characters */
712
- goto normal_char;
713
- } else if (s->is_unicode) {
1088
+ if (s->is_unicode) {
714
1089
  invalid_escape:
715
1090
  return re_parse_error(s, "invalid escape sequence in regular expression");
716
1091
  } else {
@@ -727,6 +1102,48 @@ static int get_class_atom(REParseState *s, CharRange *cr,
727
1102
  return re_parse_error(s, "unexpected end");
728
1103
  }
729
1104
  /* fall thru */
1105
+ goto normal_char;
1106
+
1107
+ case '&':
1108
+ case '!':
1109
+ case '#':
1110
+ case '$':
1111
+ case '%':
1112
+ case '*':
1113
+ case '+':
1114
+ case ',':
1115
+ case '.':
1116
+ case ':':
1117
+ case ';':
1118
+ case '<':
1119
+ case '=':
1120
+ case '>':
1121
+ case '?':
1122
+ case '@':
1123
+ case '^':
1124
+ case '`':
1125
+ case '~':
1126
+ if (s->unicode_sets && p[1] == c) {
1127
+ /* forbidden double characters */
1128
+ return re_parse_error(s, "invalid class set operation in regular expression");
1129
+ }
1130
+ goto normal_char;
1131
+
1132
+ case '(':
1133
+ case ')':
1134
+ case '[':
1135
+ case ']':
1136
+ case '{':
1137
+ case '}':
1138
+ case '/':
1139
+ case '-':
1140
+ case '|':
1141
+ if (s->unicode_sets) {
1142
+ /* invalid characters in unicode sets */
1143
+ return re_parse_error(s, "invalid character in class in regular expression");
1144
+ }
1145
+ goto normal_char;
1146
+
730
1147
  default:
731
1148
  normal_char:
732
1149
  /* normal char */
@@ -754,8 +1171,6 @@ static int re_emit_range(REParseState *s, const CharRange *cr)
754
1171
  if (len >= 65535)
755
1172
  return re_parse_error(s, "too many ranges");
756
1173
  if (len == 0) {
757
- /* not sure it can really happen. Emit a match that is always
758
- false */
759
1174
  re_emit_op_u32(s, REOP_char32, -1);
760
1175
  } else {
761
1176
  high = cr->points[cr->len - 1];
@@ -764,7 +1179,7 @@ static int re_emit_range(REParseState *s, const CharRange *cr)
764
1179
  if (high <= 0xffff) {
765
1180
  /* can use 16 bit ranges with the conversion that 0xffff =
766
1181
  infinity */
767
- re_emit_op_u16(s, REOP_range, len);
1182
+ re_emit_op_u16(s, s->ignore_case ? REOP_range_i : REOP_range, len);
768
1183
  for(i = 0; i < cr->len; i += 2) {
769
1184
  dbuf_put_u16(&s->byte_code, cr->points[i]);
770
1185
  high = cr->points[i + 1] - 1;
@@ -773,7 +1188,7 @@ static int re_emit_range(REParseState *s, const CharRange *cr)
773
1188
  dbuf_put_u16(&s->byte_code, high);
774
1189
  }
775
1190
  } else {
776
- re_emit_op_u16(s, REOP_range32, len);
1191
+ re_emit_op_u16(s, s->ignore_case ? REOP_range32_i : REOP_range32, len);
777
1192
  for(i = 0; i < cr->len; i += 2) {
778
1193
  dbuf_put_u32(&s->byte_code, cr->points[i]);
779
1194
  dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
@@ -783,15 +1198,139 @@ static int re_emit_range(REParseState *s, const CharRange *cr)
783
1198
  return 0;
784
1199
  }
785
1200
 
786
- static int re_parse_char_class(REParseState *s, const uint8_t **pp)
1201
+ static int re_string_cmp_len(const void *a, const void *b, void *arg)
1202
+ {
1203
+ REString *p1 = *(REString **)a;
1204
+ REString *p2 = *(REString **)b;
1205
+ return (p1->len < p2->len) - (p1->len > p2->len);
1206
+ }
1207
+
1208
+ static void re_emit_char(REParseState *s, int c)
1209
+ {
1210
+ if (c <= 0xffff)
1211
+ re_emit_op_u16(s, s->ignore_case ? REOP_char_i : REOP_char, c);
1212
+ else
1213
+ re_emit_op_u32(s, s->ignore_case ? REOP_char32_i : REOP_char32, c);
1214
+ }
1215
+
1216
+ static int re_emit_string_list(REParseState *s, const REStringList *sl)
1217
+ {
1218
+ REString **tab, *p;
1219
+ int i, j, split_pos, last_match_pos, n;
1220
+ BOOL has_empty_string, is_last;
1221
+
1222
+ // re_string_list_dump("sl", sl);
1223
+ if (sl->n_strings == 0) {
1224
+ /* simple case: only characters */
1225
+ if (re_emit_range(s, &sl->cr))
1226
+ return -1;
1227
+ } else {
1228
+ /* at least one string list is present : match the longest ones first */
1229
+ /* XXX: add a new op_switch opcode to compile as a trie */
1230
+ tab = lre_realloc(s->opaque, NULL, sizeof(tab[0]) * sl->n_strings);
1231
+ if (!tab) {
1232
+ re_parse_out_of_memory(s);
1233
+ return -1;
1234
+ }
1235
+ has_empty_string = FALSE;
1236
+ n = 0;
1237
+ for(i = 0; i < sl->hash_size; i++) {
1238
+ for(p = sl->hash_table[i]; p != NULL; p = p->next) {
1239
+ if (p->len == 0) {
1240
+ has_empty_string = TRUE;
1241
+ } else {
1242
+ tab[n++] = p;
1243
+ }
1244
+ }
1245
+ }
1246
+ assert(n <= sl->n_strings);
1247
+
1248
+ rqsort(tab, n, sizeof(tab[0]), re_string_cmp_len, NULL);
1249
+
1250
+ last_match_pos = -1;
1251
+ for(i = 0; i < n; i++) {
1252
+ p = tab[i];
1253
+ is_last = !has_empty_string && sl->cr.len == 0 && i == (n - 1);
1254
+ if (!is_last)
1255
+ split_pos = re_emit_op_u32(s, REOP_split_next_first, 0);
1256
+ else
1257
+ split_pos = 0;
1258
+ for(j = 0; j < p->len; j++) {
1259
+ re_emit_char(s, p->buf[j]);
1260
+ }
1261
+ if (!is_last) {
1262
+ last_match_pos = re_emit_op_u32(s, REOP_goto, last_match_pos);
1263
+ put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4));
1264
+ }
1265
+ }
1266
+
1267
+ if (sl->cr.len != 0) {
1268
+ /* char range */
1269
+ is_last = !has_empty_string;
1270
+ if (!is_last)
1271
+ split_pos = re_emit_op_u32(s, REOP_split_next_first, 0);
1272
+ else
1273
+ split_pos = 0; /* not used */
1274
+ if (re_emit_range(s, &sl->cr)) {
1275
+ lre_realloc(s->opaque, tab, 0);
1276
+ return -1;
1277
+ }
1278
+ if (!is_last)
1279
+ put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4));
1280
+ }
1281
+
1282
+ /* patch the 'goto match' */
1283
+ while (last_match_pos != -1) {
1284
+ int next_pos = get_u32(s->byte_code.buf + last_match_pos);
1285
+ put_u32(s->byte_code.buf + last_match_pos, s->byte_code.size - (last_match_pos + 4));
1286
+ last_match_pos = next_pos;
1287
+ }
1288
+
1289
+ lre_realloc(s->opaque, tab, 0);
1290
+ }
1291
+ return 0;
1292
+ }
1293
+
1294
+ static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp);
1295
+
1296
+ static int re_parse_class_set_operand(REParseState *s, REStringList *cr, const uint8_t **pp)
1297
+ {
1298
+ int c1;
1299
+ const uint8_t *p = *pp;
1300
+
1301
+ if (*p == '[') {
1302
+ if (re_parse_nested_class(s, cr, pp))
1303
+ return -1;
1304
+ } else {
1305
+ c1 = get_class_atom(s, cr, pp, TRUE);
1306
+ if (c1 < 0)
1307
+ return -1;
1308
+ if (c1 < CLASS_RANGE_BASE) {
1309
+ /* create a range with a single character */
1310
+ re_string_list_init(s, cr);
1311
+ if (s->ignore_case)
1312
+ c1 = lre_canonicalize(c1, s->is_unicode);
1313
+ if (cr_union_interval(&cr->cr, c1, c1)) {
1314
+ re_string_list_free(cr);
1315
+ return -1;
1316
+ }
1317
+ }
1318
+ }
1319
+ return 0;
1320
+ }
1321
+
1322
+ static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp)
787
1323
  {
788
1324
  const uint8_t *p;
789
1325
  uint32_t c1, c2;
790
- CharRange cr_s, *cr = &cr_s;
791
- CharRange cr1_s, *cr1 = &cr1_s;
792
- BOOL invert;
1326
+ int ret;
1327
+ REStringList cr1_s, *cr1 = &cr1_s;
1328
+ BOOL invert, is_first;
793
1329
 
794
- cr_init(cr, s->opaque, lre_realloc);
1330
+ if (lre_check_stack_overflow(s->opaque, 0))
1331
+ return re_parse_error(s, "stack overflow");
1332
+
1333
+ re_string_list_init(s, cr);
795
1334
  p = *pp;
796
1335
  p++; /* skip '[' */
797
1336
 
@@ -800,74 +1339,155 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
800
1339
  p++;
801
1340
  invert = TRUE;
802
1341
  }
803
-
1342
+
1343
+ /* handle unions */
1344
+ is_first = TRUE;
804
1345
  for(;;) {
805
1346
  if (*p == ']')
806
1347
  break;
807
- c1 = get_class_atom(s, cr1, &p, TRUE);
808
- if ((int)c1 < 0)
809
- goto fail;
810
- if (*p == '-' && p[1] != ']') {
811
- const uint8_t *p0 = p + 1;
812
- if (c1 >= CLASS_RANGE_BASE) {
813
- if (s->is_unicode) {
814
- cr_free(cr1);
815
- goto invalid_class_range;
816
- }
817
- /* Annex B: match '-' character */
818
- goto class_atom;
819
- }
820
- c2 = get_class_atom(s, cr1, &p0, TRUE);
821
- if ((int)c2 < 0)
822
- goto fail;
823
- if (c2 >= CLASS_RANGE_BASE) {
824
- cr_free(cr1);
825
- if (s->is_unicode) {
826
- goto invalid_class_range;
827
- }
828
- /* Annex B: match '-' character */
829
- goto class_atom;
830
- }
831
- p = p0;
832
- if (c2 < c1) {
833
- invalid_class_range:
834
- re_parse_error(s, "invalid class range");
1348
+ if (*p == '[' && s->unicode_sets) {
1349
+ if (re_parse_nested_class(s, cr1, &p))
835
1350
  goto fail;
836
- }
837
- if (cr_union_interval(cr, c1, c2))
838
- goto memory_error;
1351
+ goto class_union;
839
1352
  } else {
840
- class_atom:
841
- if (c1 >= CLASS_RANGE_BASE) {
842
- int ret;
843
- ret = cr_union1(cr, cr1->points, cr1->len);
844
- cr_free(cr1);
845
- if (ret)
846
- goto memory_error;
1353
+ c1 = get_class_atom(s, cr1, &p, TRUE);
1354
+ if ((int)c1 < 0)
1355
+ goto fail;
1356
+ if (*p == '-' && p[1] != ']') {
1357
+ const uint8_t *p0 = p + 1;
1358
+ if (p[1] == '-' && s->unicode_sets && is_first)
1359
+ goto class_atom; /* first character class followed by '--' */
1360
+ if (c1 >= CLASS_RANGE_BASE) {
1361
+ if (s->is_unicode) {
1362
+ re_string_list_free(cr1);
1363
+ goto invalid_class_range;
1364
+ }
1365
+ /* Annex B: match '-' character */
1366
+ goto class_atom;
1367
+ }
1368
+ c2 = get_class_atom(s, cr1, &p0, TRUE);
1369
+ if ((int)c2 < 0)
1370
+ goto fail;
1371
+ if (c2 >= CLASS_RANGE_BASE) {
1372
+ re_string_list_free(cr1);
1373
+ if (s->is_unicode) {
1374
+ goto invalid_class_range;
1375
+ }
1376
+ /* Annex B: match '-' character */
1377
+ goto class_atom;
1378
+ }
1379
+ p = p0;
1380
+ if (c2 < c1) {
1381
+ invalid_class_range:
1382
+ re_parse_error(s, "invalid class range");
1383
+ goto fail;
1384
+ }
1385
+ if (s->ignore_case) {
1386
+ CharRange cr2_s, *cr2 = &cr2_s;
1387
+ cr_init(cr2, s->opaque, lre_realloc);
1388
+ if (cr_add_interval(cr2, c1, c2 + 1) ||
1389
+ cr_regexp_canonicalize(cr2, s->is_unicode) ||
1390
+ cr_op1(&cr->cr, cr2->points, cr2->len, CR_OP_UNION)) {
1391
+ cr_free(cr2);
1392
+ goto memory_error;
1393
+ }
1394
+ cr_free(cr2);
1395
+ } else {
1396
+ if (cr_union_interval(&cr->cr, c1, c2))
1397
+ goto memory_error;
1398
+ }
1399
+ is_first = FALSE; /* union operation */
847
1400
  } else {
848
- if (cr_union_interval(cr, c1, c1))
849
- goto memory_error;
1401
+ class_atom:
1402
+ if (c1 >= CLASS_RANGE_BASE) {
1403
+ class_union:
1404
+ ret = re_string_list_op(cr, cr1, CR_OP_UNION);
1405
+ re_string_list_free(cr1);
1406
+ if (ret)
1407
+ goto memory_error;
1408
+ } else {
1409
+ if (s->ignore_case)
1410
+ c1 = lre_canonicalize(c1, s->is_unicode);
1411
+ if (cr_union_interval(&cr->cr, c1, c1))
1412
+ goto memory_error;
1413
+ }
850
1414
  }
851
1415
  }
1416
+ if (s->unicode_sets && is_first) {
1417
+ if (*p == '&' && p[1] == '&' && p[2] != '&') {
1418
+ /* handle '&&' */
1419
+ for(;;) {
1420
+ if (*p == ']') {
1421
+ break;
1422
+ } else if (*p == '&' && p[1] == '&' && p[2] != '&') {
1423
+ p += 2;
1424
+ } else {
1425
+ goto invalid_operation;
1426
+ }
1427
+ if (re_parse_class_set_operand(s, cr1, &p))
1428
+ goto fail;
1429
+ ret = re_string_list_op(cr, cr1, CR_OP_INTER);
1430
+ re_string_list_free(cr1);
1431
+ if (ret)
1432
+ goto memory_error;
1433
+ }
1434
+ } else if (*p == '-' && p[1] == '-') {
1435
+ /* handle '--' */
1436
+ for(;;) {
1437
+ if (*p == ']') {
1438
+ break;
1439
+ } else if (*p == '-' && p[1] == '-') {
1440
+ p += 2;
1441
+ } else {
1442
+ invalid_operation:
1443
+ re_parse_error(s, "invalid operation in regular expression");
1444
+ goto fail;
1445
+ }
1446
+ if (re_parse_class_set_operand(s, cr1, &p))
1447
+ goto fail;
1448
+ ret = re_string_list_op(cr, cr1, CR_OP_SUB);
1449
+ re_string_list_free(cr1);
1450
+ if (ret)
1451
+ goto memory_error;
1452
+ }
1453
+ }
1454
+ }
1455
+ is_first = FALSE;
852
1456
  }
853
- if (s->ignore_case) {
854
- if (cr_regexp_canonicalize(cr, s->is_unicode))
855
- goto memory_error;
856
- }
1457
+
1458
+ p++; /* skip ']' */
1459
+ *pp = p;
857
1460
  if (invert) {
858
- if (cr_invert(cr))
1461
+ /* XXX: add may_contain_string syntax check to be fully
1462
+ compliant. The test here accepts more input than the
1463
+ spec. */
1464
+ if (cr->n_strings != 0) {
1465
+ re_parse_error(s, "negated character class with strings in regular expression debugger eval code");
1466
+ goto fail;
1467
+ }
1468
+ if (cr_invert(&cr->cr))
859
1469
  goto memory_error;
860
1470
  }
861
- if (re_emit_range(s, cr))
862
- goto fail;
863
- cr_free(cr);
864
- p++; /* skip ']' */
865
- *pp = p;
866
1471
  return 0;
867
1472
  memory_error:
868
1473
  re_parse_out_of_memory(s);
869
1474
  fail:
870
- cr_free(cr);
1475
+ re_string_list_free(cr);
1476
+ return -1;
1477
+ }
1478
+
1479
+ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
1480
+ {
1481
+ REStringList cr_s, *cr = &cr_s;
1482
+
1483
+ if (re_parse_nested_class(s, cr, pp))
1484
+ return -1;
1485
+ if (re_emit_string_list(s, cr))
1486
+ goto fail;
1487
+ re_string_list_free(cr);
1488
+ return 0;
1489
+ fail:
1490
+ re_string_list_free(cr);
871
1491
  return -1;
872
1492
  }
873
1493
 
@@ -888,27 +1508,35 @@ static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
888
1508
  len = reopcode_info[opcode].size;
889
1509
  switch(opcode) {
890
1510
  case REOP_range:
1511
+ case REOP_range_i:
891
1512
  val = get_u16(bc_buf + pos + 1);
892
1513
  len += val * 4;
893
1514
  goto simple_char;
894
1515
  case REOP_range32:
1516
+ case REOP_range32_i:
895
1517
  val = get_u16(bc_buf + pos + 1);
896
1518
  len += val * 8;
897
1519
  goto simple_char;
898
1520
  case REOP_char:
1521
+ case REOP_char_i:
899
1522
  case REOP_char32:
1523
+ case REOP_char32_i:
900
1524
  case REOP_dot:
901
1525
  case REOP_any:
902
1526
  simple_char:
903
1527
  ret = FALSE;
904
1528
  break;
905
1529
  case REOP_line_start:
1530
+ case REOP_line_start_m:
906
1531
  case REOP_line_end:
1532
+ case REOP_line_end_m:
907
1533
  case REOP_push_i32:
908
1534
  case REOP_push_char_pos:
909
1535
  case REOP_drop:
910
1536
  case REOP_word_boundary:
1537
+ case REOP_word_boundary_i:
911
1538
  case REOP_not_word_boundary:
1539
+ case REOP_not_word_boundary_i:
912
1540
  case REOP_prev:
913
1541
  /* no effect */
914
1542
  break;
@@ -916,7 +1544,9 @@ static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
916
1544
  case REOP_save_end:
917
1545
  case REOP_save_reset:
918
1546
  case REOP_back_reference:
1547
+ case REOP_back_reference_i:
919
1548
  case REOP_backward_back_reference:
1549
+ case REOP_backward_back_reference_i:
920
1550
  break;
921
1551
  default:
922
1552
  /* safe behavior: we cannot predict the outcome */
@@ -941,24 +1571,32 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
941
1571
  len = reopcode_info[opcode].size;
942
1572
  switch(opcode) {
943
1573
  case REOP_range:
1574
+ case REOP_range_i:
944
1575
  val = get_u16(bc_buf + pos + 1);
945
1576
  len += val * 4;
946
1577
  goto simple_char;
947
1578
  case REOP_range32:
1579
+ case REOP_range32_i:
948
1580
  val = get_u16(bc_buf + pos + 1);
949
1581
  len += val * 8;
950
1582
  goto simple_char;
951
1583
  case REOP_char:
1584
+ case REOP_char_i:
952
1585
  case REOP_char32:
1586
+ case REOP_char32_i:
953
1587
  case REOP_dot:
954
1588
  case REOP_any:
955
1589
  simple_char:
956
1590
  count++;
957
1591
  break;
958
1592
  case REOP_line_start:
1593
+ case REOP_line_start_m:
959
1594
  case REOP_line_end:
1595
+ case REOP_line_end_m:
960
1596
  case REOP_word_boundary:
1597
+ case REOP_word_boundary_i:
961
1598
  case REOP_not_word_boundary:
1599
+ case REOP_not_word_boundary_i:
962
1600
  break;
963
1601
  default:
964
1602
  return -1;
@@ -1116,12 +1754,47 @@ static int find_group_name(REParseState *s, const char *name)
1116
1754
 
1117
1755
  static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
1118
1756
 
1757
+ static int re_parse_modifiers(REParseState *s, const uint8_t **pp)
1758
+ {
1759
+ const uint8_t *p = *pp;
1760
+ int mask = 0;
1761
+ int val;
1762
+
1763
+ for(;;) {
1764
+ if (*p == 'i') {
1765
+ val = LRE_FLAG_IGNORECASE;
1766
+ } else if (*p == 'm') {
1767
+ val = LRE_FLAG_MULTILINE;
1768
+ } else if (*p == 's') {
1769
+ val = LRE_FLAG_DOTALL;
1770
+ } else {
1771
+ break;
1772
+ }
1773
+ if (mask & val)
1774
+ return re_parse_error(s, "duplicate modifier: '%c'", *p);
1775
+ mask |= val;
1776
+ p++;
1777
+ }
1778
+ *pp = p;
1779
+ return mask;
1780
+ }
1781
+
1782
+ static BOOL update_modifier(BOOL val, int add_mask, int remove_mask,
1783
+ int mask)
1784
+ {
1785
+ if (add_mask & mask)
1786
+ val = TRUE;
1787
+ if (remove_mask & mask)
1788
+ val = FALSE;
1789
+ return val;
1790
+ }
1791
+
1119
1792
  static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1120
1793
  {
1121
1794
  const uint8_t *p;
1122
1795
  int c, last_atom_start, quant_min, quant_max, last_capture_count;
1123
1796
  BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
1124
- CharRange cr_s, *cr = &cr_s;
1797
+ REStringList cr_s, *cr = &cr_s;
1125
1798
 
1126
1799
  last_atom_start = -1;
1127
1800
  last_capture_count = 0;
@@ -1130,11 +1803,11 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1130
1803
  switch(c) {
1131
1804
  case '^':
1132
1805
  p++;
1133
- re_emit_op(s, REOP_line_start);
1806
+ re_emit_op(s, s->multi_line ? REOP_line_start_m : REOP_line_start);
1134
1807
  break;
1135
1808
  case '$':
1136
1809
  p++;
1137
- re_emit_op(s, REOP_line_end);
1810
+ re_emit_op(s, s->multi_line ? REOP_line_end_m : REOP_line_end);
1138
1811
  break;
1139
1812
  case '.':
1140
1813
  p++;
@@ -1184,6 +1857,44 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1184
1857
  p = s->buf_ptr;
1185
1858
  if (re_parse_expect(s, &p, ')'))
1186
1859
  return -1;
1860
+ } else if (p[2] == 'i' || p[2] == 'm' || p[2] == 's' || p[2] == '-') {
1861
+ BOOL saved_ignore_case, saved_multi_line, saved_dotall;
1862
+ int add_mask, remove_mask;
1863
+ p += 2;
1864
+ remove_mask = 0;
1865
+ add_mask = re_parse_modifiers(s, &p);
1866
+ if (add_mask < 0)
1867
+ return -1;
1868
+ if (*p == '-') {
1869
+ p++;
1870
+ remove_mask = re_parse_modifiers(s, &p);
1871
+ if (remove_mask < 0)
1872
+ return -1;
1873
+ }
1874
+ if ((add_mask == 0 && remove_mask == 0) ||
1875
+ (add_mask & remove_mask) != 0) {
1876
+ return re_parse_error(s, "invalid modifiers");
1877
+ }
1878
+ if (re_parse_expect(s, &p, ':'))
1879
+ return -1;
1880
+ saved_ignore_case = s->ignore_case;
1881
+ saved_multi_line = s->multi_line;
1882
+ saved_dotall = s->dotall;
1883
+ s->ignore_case = update_modifier(s->ignore_case, add_mask, remove_mask, LRE_FLAG_IGNORECASE);
1884
+ s->multi_line = update_modifier(s->multi_line, add_mask, remove_mask, LRE_FLAG_MULTILINE);
1885
+ s->dotall = update_modifier(s->dotall, add_mask, remove_mask, LRE_FLAG_DOTALL);
1886
+
1887
+ last_atom_start = s->byte_code.size;
1888
+ last_capture_count = s->capture_count;
1889
+ s->buf_ptr = p;
1890
+ if (re_parse_disjunction(s, is_backward_dir))
1891
+ return -1;
1892
+ p = s->buf_ptr;
1893
+ if (re_parse_expect(s, &p, ')'))
1894
+ return -1;
1895
+ s->ignore_case = saved_ignore_case;
1896
+ s->multi_line = saved_multi_line;
1897
+ s->dotall = saved_dotall;
1187
1898
  } else if ((p[2] == '=' || p[2] == '!')) {
1188
1899
  is_neg = (p[2] == '!');
1189
1900
  is_backward_lookahead = FALSE;
@@ -1262,7 +1973,11 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1262
1973
  switch(p[1]) {
1263
1974
  case 'b':
1264
1975
  case 'B':
1265
- re_emit_op(s, REOP_word_boundary + (p[1] != 'b'));
1976
+ if (p[1] != 'b') {
1977
+ re_emit_op(s, s->ignore_case ? REOP_not_word_boundary_i : REOP_not_word_boundary);
1978
+ } else {
1979
+ re_emit_op(s, s->ignore_case ? REOP_word_boundary_i : REOP_word_boundary);
1980
+ }
1266
1981
  p += 2;
1267
1982
  break;
1268
1983
  case 'k':
@@ -1351,7 +2066,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1351
2066
  emit_back_reference:
1352
2067
  last_atom_start = s->byte_code.size;
1353
2068
  last_capture_count = s->capture_count;
1354
- re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
2069
+
2070
+ re_emit_op_u8(s, REOP_back_reference + 2 * is_backward_dir + s->ignore_case, c);
1355
2071
  }
1356
2072
  break;
1357
2073
  default:
@@ -1385,18 +2101,14 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1385
2101
  re_emit_op(s, REOP_prev);
1386
2102
  if (c >= CLASS_RANGE_BASE) {
1387
2103
  int ret;
1388
- /* Note: canonicalization is not needed */
1389
- ret = re_emit_range(s, cr);
1390
- cr_free(cr);
2104
+ ret = re_emit_string_list(s, cr);
2105
+ re_string_list_free(cr);
1391
2106
  if (ret)
1392
2107
  return -1;
1393
2108
  } else {
1394
2109
  if (s->ignore_case)
1395
2110
  c = lre_canonicalize(c, s->is_unicode);
1396
- if (c <= 0xffff)
1397
- re_emit_op_u16(s, REOP_char, c);
1398
- else
1399
- re_emit_op_u32(s, REOP_char32, c);
2111
+ re_emit_char(s, c);
1400
2112
  }
1401
2113
  if (is_backward_dir)
1402
2114
  re_emit_op(s, REOP_prev);
@@ -1706,10 +2418,12 @@ static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
1706
2418
  stack_size--;
1707
2419
  break;
1708
2420
  case REOP_range:
2421
+ case REOP_range_i:
1709
2422
  val = get_u16(bc_buf + pos + 1);
1710
2423
  len += val * 4;
1711
2424
  break;
1712
2425
  case REOP_range32:
2426
+ case REOP_range32_i:
1713
2427
  val = get_u16(bc_buf + pos + 1);
1714
2428
  len += val * 8;
1715
2429
  break;
@@ -1719,6 +2433,17 @@ static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
1719
2433
  return stack_size_max;
1720
2434
  }
1721
2435
 
2436
/* Reallocation callback that refuses any request larger than
   INT32_MAX / 2 bytes (returning NULL) and forwards every other
   request to lre_realloc(). The bytecode cannot be larger than 2G;
   the limit is kept lower to leave some slack and avoid some
   overflows. */
static void *lre_bytecode_realloc(void *opaque, void *ptr, size_t size)
{
    const size_t max_size = INT32_MAX / 2;

    if (size <= max_size)
        return lre_realloc(opaque, ptr, size);
    return NULL;
}
2446
+
1722
2447
  /* 'buf' must be a zero terminated UTF-8 string of length buf_len.
1723
2448
  Return NULL if error and allocate an error message in *perror_msg,
1724
2449
  otherwise the compiled bytecode and its length in plen.
@@ -1737,18 +2462,20 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
1737
2462
  s->buf_end = s->buf_ptr + buf_len;
1738
2463
  s->buf_start = s->buf_ptr;
1739
2464
  s->re_flags = re_flags;
1740
- s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0);
2465
+ s->is_unicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0);
1741
2466
  is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
1742
2467
  s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
2468
+ s->multi_line = ((re_flags & LRE_FLAG_MULTILINE) != 0);
1743
2469
  s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
2470
+ s->unicode_sets = ((re_flags & LRE_FLAG_UNICODE_SETS) != 0);
1744
2471
  s->capture_count = 1;
1745
2472
  s->total_capture_count = -1;
1746
2473
  s->has_named_captures = -1;
1747
2474
 
1748
- dbuf_init2(&s->byte_code, opaque, lre_realloc);
2475
+ dbuf_init2(&s->byte_code, opaque, lre_bytecode_realloc);
1749
2476
  dbuf_init2(&s->group_names, opaque, lre_realloc);
1750
2477
 
1751
- dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */
2478
+ dbuf_put_u16(&s->byte_code, re_flags); /* first element is the flags */
1752
2479
  dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
1753
2480
  dbuf_putc(&s->byte_code, 0); /* stack size */
1754
2481
  dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
@@ -1801,7 +2528,8 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
1801
2528
  /* add the named groups if needed */
1802
2529
  if (s->group_names.size > (s->capture_count - 1)) {
1803
2530
  dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
1804
- s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS;
2531
+ put_u16(s->byte_code.buf + RE_HEADER_FLAGS,
2532
+ lre_get_flags(s->byte_code.buf) | LRE_FLAG_NAMED_GROUPS);
1805
2533
  }
1806
2534
  dbuf_free(&s->group_names);
1807
2535
 
@@ -1935,8 +2663,6 @@ typedef struct {
1935
2663
  int cbuf_type;
1936
2664
  int capture_count;
1937
2665
  int stack_size_max;
1938
- BOOL multi_line;
1939
- BOOL ignore_case;
1940
2666
  BOOL is_unicode;
1941
2667
  int interrupt_counter;
1942
2668
  void *opaque; /* used for stack overflow check */
@@ -2085,17 +2811,19 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2085
2811
  }
2086
2812
  break;
2087
2813
  case REOP_char32:
2814
+ case REOP_char32_i:
2088
2815
  val = get_u32(pc);
2089
2816
  pc += 4;
2090
2817
  goto test_char;
2091
2818
  case REOP_char:
2819
+ case REOP_char_i:
2092
2820
  val = get_u16(pc);
2093
2821
  pc += 2;
2094
2822
  test_char:
2095
2823
  if (cptr >= cbuf_end)
2096
2824
  goto no_match;
2097
2825
  GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2098
- if (s->ignore_case) {
2826
+ if (opcode == REOP_char_i || opcode == REOP_char32_i) {
2099
2827
  c = lre_canonicalize(c, s->is_unicode);
2100
2828
  }
2101
2829
  if (val != c)
@@ -2139,18 +2867,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2139
2867
  return LRE_RET_TIMEOUT;
2140
2868
  break;
2141
2869
  case REOP_line_start:
2870
+ case REOP_line_start_m:
2142
2871
  if (cptr == s->cbuf)
2143
2872
  break;
2144
- if (!s->multi_line)
2873
+ if (opcode == REOP_line_start)
2145
2874
  goto no_match;
2146
2875
  PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
2147
2876
  if (!is_line_terminator(c))
2148
2877
  goto no_match;
2149
2878
  break;
2150
2879
  case REOP_line_end:
2880
+ case REOP_line_end_m:
2151
2881
  if (cptr == cbuf_end)
2152
2882
  break;
2153
- if (!s->multi_line)
2883
+ if (opcode == REOP_line_end)
2154
2884
  goto no_match;
2155
2885
  PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
2156
2886
  if (!is_line_terminator(c))
@@ -2213,14 +2943,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2213
2943
  goto no_match;
2214
2944
  break;
2215
2945
  case REOP_word_boundary:
2946
+ case REOP_word_boundary_i:
2216
2947
  case REOP_not_word_boundary:
2948
+ case REOP_not_word_boundary_i:
2217
2949
  {
2218
2950
  BOOL v1, v2;
2951
+ int ignore_case = (opcode == REOP_word_boundary_i || opcode == REOP_not_word_boundary_i);
2952
+ BOOL is_boundary = (opcode == REOP_word_boundary || opcode == REOP_word_boundary_i);
2219
2953
  /* char before */
2220
2954
  if (cptr == s->cbuf) {
2221
2955
  v1 = FALSE;
2222
2956
  } else {
2223
2957
  PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
2958
+ if (ignore_case)
2959
+ c = lre_canonicalize(c, s->is_unicode);
2224
2960
  v1 = is_word_char(c);
2225
2961
  }
2226
2962
  /* current char */
@@ -2228,14 +2964,18 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2228
2964
  v2 = FALSE;
2229
2965
  } else {
2230
2966
  PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
2967
+ if (ignore_case)
2968
+ c = lre_canonicalize(c, s->is_unicode);
2231
2969
  v2 = is_word_char(c);
2232
2970
  }
2233
- if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
2971
+ if (v1 ^ v2 ^ is_boundary)
2234
2972
  goto no_match;
2235
2973
  }
2236
2974
  break;
2237
2975
  case REOP_back_reference:
2976
+ case REOP_back_reference_i:
2238
2977
  case REOP_backward_back_reference:
2978
+ case REOP_backward_back_reference_i:
2239
2979
  {
2240
2980
  const uint8_t *cptr1, *cptr1_end, *cptr1_start;
2241
2981
  uint32_t c1, c2;
@@ -2247,14 +2987,15 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2247
2987
  cptr1_end = capture[2 * val + 1];
2248
2988
  if (!cptr1_start || !cptr1_end)
2249
2989
  break;
2250
- if (opcode == REOP_back_reference) {
2990
+ if (opcode == REOP_back_reference ||
2991
+ opcode == REOP_back_reference_i) {
2251
2992
  cptr1 = cptr1_start;
2252
2993
  while (cptr1 < cptr1_end) {
2253
2994
  if (cptr >= cbuf_end)
2254
2995
  goto no_match;
2255
2996
  GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
2256
2997
  GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
2257
- if (s->ignore_case) {
2998
+ if (opcode == REOP_back_reference_i) {
2258
2999
  c1 = lre_canonicalize(c1, s->is_unicode);
2259
3000
  c2 = lre_canonicalize(c2, s->is_unicode);
2260
3001
  }
@@ -2268,7 +3009,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2268
3009
  goto no_match;
2269
3010
  GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
2270
3011
  GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
2271
- if (s->ignore_case) {
3012
+ if (opcode == REOP_backward_back_reference_i) {
2272
3013
  c1 = lre_canonicalize(c1, s->is_unicode);
2273
3014
  c2 = lre_canonicalize(c2, s->is_unicode);
2274
3015
  }
@@ -2279,6 +3020,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2279
3020
  }
2280
3021
  break;
2281
3022
  case REOP_range:
3023
+ case REOP_range_i:
2282
3024
  {
2283
3025
  int n;
2284
3026
  uint32_t low, high, idx_min, idx_max, idx;
@@ -2288,7 +3030,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2288
3030
  if (cptr >= cbuf_end)
2289
3031
  goto no_match;
2290
3032
  GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2291
- if (s->ignore_case) {
3033
+ if (opcode == REOP_range_i) {
2292
3034
  c = lre_canonicalize(c, s->is_unicode);
2293
3035
  }
2294
3036
  idx_min = 0;
@@ -2319,6 +3061,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2319
3061
  }
2320
3062
  break;
2321
3063
  case REOP_range32:
3064
+ case REOP_range32_i:
2322
3065
  {
2323
3066
  int n;
2324
3067
  uint32_t low, high, idx_min, idx_max, idx;
@@ -2328,7 +3071,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2328
3071
  if (cptr >= cbuf_end)
2329
3072
  goto no_match;
2330
3073
  GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2331
- if (s->ignore_case) {
3074
+ if (opcode == REOP_range32_i) {
2332
3075
  c = lre_canonicalize(c, s->is_unicode);
2333
3076
  }
2334
3077
  idx_min = 0;
@@ -2420,11 +3163,10 @@ int lre_exec(uint8_t **capture,
2420
3163
  REExecContext s_s, *s = &s_s;
2421
3164
  int re_flags, i, alloca_size, ret;
2422
3165
  StackInt *stack_buf;
3166
+ const uint8_t *cptr;
2423
3167
 
2424
3168
  re_flags = lre_get_flags(bc_buf);
2425
- s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
2426
- s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
2427
- s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
3169
+ s->is_unicode = (re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0;
2428
3170
  s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
2429
3171
  s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
2430
3172
  s->cbuf = cbuf;
@@ -2446,8 +3188,17 @@ int lre_exec(uint8_t **capture,
2446
3188
  capture[i] = NULL;
2447
3189
  alloca_size = s->stack_size_max * sizeof(stack_buf[0]);
2448
3190
  stack_buf = alloca(alloca_size);
3191
+
3192
+ cptr = cbuf + (cindex << cbuf_type);
3193
+ if (0 < cindex && cindex < clen && s->cbuf_type == 2) {
3194
+ const uint16_t *p = (const uint16_t *)cptr;
3195
+ if (is_lo_surrogate(*p) && is_hi_surrogate(p[-1])) {
3196
+ cptr = (const uint8_t *)(p - 1);
3197
+ }
3198
+ }
3199
+
2449
3200
  ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN,
2450
- cbuf + (cindex << cbuf_type), FALSE);
3201
+ cptr, FALSE);
2451
3202
  lre_realloc(s->opaque, s->state_stack, 0);
2452
3203
  return ret;
2453
3204
  }
@@ -2459,7 +3210,7 @@ int lre_get_capture_count(const uint8_t *bc_buf)
2459
3210
 
2460
3211
  int lre_get_flags(const uint8_t *bc_buf)
2461
3212
  {
2462
- return bc_buf[RE_HEADER_FLAGS];
3213
+ return get_u16(bc_buf + RE_HEADER_FLAGS);
2463
3214
  }
2464
3215
 
2465
3216
  /* Return NULL if no group names. Otherwise, return a pointer to