quickjs 0.19.0 → 0.20.0.rc1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -34,16 +34,20 @@
34
34
 
35
35
  /*
36
36
  TODO:
37
-
37
+ - remove REOP_char_i and REOP_range_i by precomputing the case folding.
38
+ - add specific opcodes for simple unicode property tests so that the
39
+ generated bytecode is smaller.
38
40
  - Add a lock step execution mode (=linear time execution guaranteed)
39
41
  when the regular expression is "simple" i.e. no backreference nor
40
42
  complicated lookahead. The opcodes are designed for this execution
41
43
  model.
42
44
  */
43
45
 
44
- #if defined(TEST)
46
+ #if defined(TEST)
45
47
  #define DUMP_REOP
46
48
  #endif
49
+ //#define DUMP_REOP
50
+ //#define DUMP_EXEC
47
51
 
48
52
  typedef enum {
49
53
  #define DEF(id, size) REOP_ ## id,
@@ -53,7 +57,7 @@ typedef enum {
53
57
  } REOPCodeEnum;
54
58
 
55
59
  #define CAPTURE_COUNT_MAX 255
56
- #define STACK_SIZE_MAX 255
60
+ #define REGISTER_COUNT_MAX 255
57
61
  /* must be large enough to have a negligible runtime cost and small
58
62
  enough to call the interrupt callback often. */
59
63
  #define INTERRUPT_COUNTER_INIT 10000
@@ -75,6 +79,7 @@ typedef struct {
75
79
  BOOL ignore_case;
76
80
  BOOL multi_line;
77
81
  BOOL dotall;
82
+ uint8_t group_name_scope;
78
83
  int capture_count;
79
84
  int total_capture_count; /* -1 = not computed yet */
80
85
  int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
@@ -103,10 +108,10 @@ static const REOpCode reopcode_info[REOP_COUNT] = {
103
108
  #undef DEF
104
109
  };
105
110
 
106
- #define RE_HEADER_FLAGS 0
107
- #define RE_HEADER_CAPTURE_COUNT 2
108
- #define RE_HEADER_STACK_SIZE 3
109
- #define RE_HEADER_BYTECODE_LEN 4
111
+ #define RE_HEADER_FLAGS 0
112
+ #define RE_HEADER_CAPTURE_COUNT 2
113
+ #define RE_HEADER_REGISTER_COUNT 3
114
+ #define RE_HEADER_BYTECODE_LEN 4
110
115
 
111
116
  #define RE_HEADER_LEN 8
112
117
 
@@ -117,7 +122,7 @@ static inline int is_digit(int c) {
117
122
  /* insert 'len' bytes at position 'pos'. Return < 0 if error. */
118
123
  static int dbuf_insert(DynBuf *s, int pos, int len)
119
124
  {
120
- if (dbuf_realloc(s, s->size + len))
125
+ if (dbuf_claim(s, len))
121
126
  return -1;
122
127
  memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
123
128
  s->size += len;
@@ -459,15 +464,15 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
459
464
  int buf_len)
460
465
  {
461
466
  int pos, len, opcode, bc_len, re_flags, i;
462
- uint32_t val;
467
+ uint32_t val, val2;
463
468
 
464
469
  assert(buf_len >= RE_HEADER_LEN);
465
470
 
466
471
  re_flags = lre_get_flags(buf);
467
472
  bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
468
473
  assert(bc_len + RE_HEADER_LEN <= buf_len);
469
- printf("flags: 0x%x capture_count=%d stack_size=%d\n",
470
- re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]);
474
+ printf("flags: 0x%x capture_count=%d reg_count=%d\n",
475
+ re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_REGISTER_COUNT]);
471
476
  if (re_flags & LRE_FLAG_NAMED_GROUPS) {
472
477
  const char *p;
473
478
  p = (char *)buf + RE_HEADER_LEN + bc_len;
@@ -476,7 +481,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
476
481
  if (i != 1)
477
482
  printf(",");
478
483
  printf("<%s>", p);
479
- p += strlen(p) + 1;
484
+ p += strlen(p) + LRE_GROUP_NAME_TRAILER_LEN;
480
485
  }
481
486
  printf("\n");
482
487
  assert(p == (char *)(buf + buf_len));
@@ -518,34 +523,62 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
518
523
  case REOP_goto:
519
524
  case REOP_split_goto_first:
520
525
  case REOP_split_next_first:
521
- case REOP_loop:
522
526
  case REOP_lookahead:
523
527
  case REOP_negative_lookahead:
524
528
  val = get_u32(buf + pos + 1);
525
529
  val += (pos + 5);
526
530
  printf(" %u", val);
527
531
  break;
528
- case REOP_simple_greedy_quant:
529
- printf(" %u %u %u %u",
530
- get_u32(buf + pos + 1) + (pos + 17),
531
- get_u32(buf + pos + 1 + 4),
532
- get_u32(buf + pos + 1 + 8),
533
- get_u32(buf + pos + 1 + 12));
532
+ case REOP_loop:
533
+ val2 = buf[pos + 1];
534
+ val = get_u32(buf + pos + 2);
535
+ val += (pos + 6);
536
+ printf(" r%u, %u", val2, val);
537
+ break;
538
+ case REOP_loop_split_goto_first:
539
+ case REOP_loop_split_next_first:
540
+ case REOP_loop_check_adv_split_goto_first:
541
+ case REOP_loop_check_adv_split_next_first:
542
+ {
543
+ uint32_t limit;
544
+ val2 = buf[pos + 1];
545
+ limit = get_u32(buf + pos + 2);
546
+ val = get_u32(buf + pos + 6);
547
+ val += (pos + 10);
548
+ printf(" r%u, %u, %u", val2, limit, val);
549
+ }
534
550
  break;
535
551
  case REOP_save_start:
536
552
  case REOP_save_end:
553
+ printf(" %u", buf[pos + 1]);
554
+ break;
537
555
  case REOP_back_reference:
538
556
  case REOP_back_reference_i:
539
557
  case REOP_backward_back_reference:
540
558
  case REOP_backward_back_reference_i:
541
- printf(" %u", buf[pos + 1]);
559
+ {
560
+ int n, i;
561
+ n = buf[pos + 1];
562
+ len += n;
563
+ for(i = 0; i < n; i++) {
564
+ if (i != 0)
565
+ printf(",");
566
+ printf(" %u", buf[pos + 2 + i]);
567
+ }
568
+ }
542
569
  break;
543
570
  case REOP_save_reset:
544
571
  printf(" %u %u", buf[pos + 1], buf[pos + 2]);
545
572
  break;
546
- case REOP_push_i32:
547
- val = get_u32(buf + pos + 1);
548
- printf(" %d", val);
573
+ case REOP_set_i32:
574
+ val = buf[pos + 1];
575
+ val2 = get_u32(buf + pos + 2);
576
+ printf(" r%u, %d", val, val2);
577
+ break;
578
+ case REOP_set_char_pos:
579
+ case REOP_check_advance:
580
+ val = buf[pos + 1];
581
+ printf(" r%u", val);
549
582
  break;
550
583
  case REOP_range:
551
584
  case REOP_range_i:
@@ -604,6 +637,27 @@ static int re_emit_goto(REParseState *s, int op, uint32_t val)
604
637
  return pos;
605
638
  }
606
639
 
640
+ static int re_emit_goto_u8(REParseState *s, int op, uint32_t arg, uint32_t val)
641
+ {
642
+ int pos;
643
+ dbuf_putc(&s->byte_code, op);
644
+ dbuf_putc(&s->byte_code, arg);
645
+ pos = s->byte_code.size;
646
+ dbuf_put_u32(&s->byte_code, val - (pos + 4));
647
+ return pos;
648
+ }
649
+
650
+ static int re_emit_goto_u8_u32(REParseState *s, int op, uint32_t arg0, uint32_t arg1, uint32_t val)
651
+ {
652
+ int pos;
653
+ dbuf_putc(&s->byte_code, op);
654
+ dbuf_putc(&s->byte_code, arg0);
655
+ dbuf_put_u32(&s->byte_code, arg1);
656
+ pos = s->byte_code.size;
657
+ dbuf_put_u32(&s->byte_code, val - (pos + 4));
658
+ return pos;
659
+ }
660
+
607
661
  static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
608
662
  {
609
663
  dbuf_putc(&s->byte_code, op);
@@ -705,9 +759,21 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16)
705
759
  c = '\v';
706
760
  break;
707
761
  case 'x':
762
+ {
763
+ int h0, h1;
764
+
765
+ h0 = from_hex(*p++);
766
+ if (h0 < 0)
767
+ return -1;
768
+ h1 = from_hex(*p++);
769
+ if (h1 < 0)
770
+ return -1;
771
+ c = (h0 << 4) | h1;
772
+ }
773
+ break;
708
774
  case 'u':
709
775
  {
710
- int h, n, i;
776
+ int h, i;
711
777
  uint32_t c1;
712
778
 
713
779
  if (*p == '{' && allow_utf16) {
@@ -725,14 +791,8 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16)
725
791
  }
726
792
  p++;
727
793
  } else {
728
- if (c == 'x') {
729
- n = 2;
730
- } else {
731
- n = 4;
732
- }
733
-
734
794
  c = 0;
735
- for(i = 0; i < n; i++) {
795
+ for(i = 0; i < 4; i++) {
736
796
  h = from_hex(*p++);
737
797
  if (h < 0) {
738
798
  return -1;
@@ -1020,7 +1080,7 @@ static int get_class_atom(REParseState *s, REStringList *cr,
1020
1080
  goto default_escape;
1021
1081
  if (cr_init_char_range(s, cr, c))
1022
1082
  return -1;
1023
- c = CLASS_RANGE_BASE;
1083
+ c += CLASS_RANGE_BASE;
1024
1084
  break;
1025
1085
  case 'c':
1026
1086
  c = *p;
@@ -1491,17 +1551,18 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
1491
1551
  return -1;
1492
1552
  }
1493
1553
 
1494
- /* Return:
1495
- - true if the opcodes may not advance the char pointer
1496
- - false if the opcodes always advance the char pointer
1554
+ /* need_check_adv: false if the opcodes always advance the char pointer
1555
+ need_capture_init: true if all the captures in the atom are not set
1497
1556
  */
1498
- static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
1557
+ static BOOL re_need_check_adv_and_capture_init(BOOL *pneed_capture_init,
1558
+ const uint8_t *bc_buf, int bc_buf_len)
1499
1559
  {
1500
1560
  int pos, opcode, len;
1501
1561
  uint32_t val;
1502
- BOOL ret;
1562
+ BOOL need_check_adv, need_capture_init;
1503
1563
 
1504
- ret = TRUE;
1564
+ need_check_adv = TRUE;
1565
+ need_capture_init = FALSE;
1505
1566
  pos = 0;
1506
1567
  while (pos < bc_buf_len) {
1507
1568
  opcode = bc_buf[pos];
@@ -1511,28 +1572,30 @@ static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
1511
1572
  case REOP_range_i:
1512
1573
  val = get_u16(bc_buf + pos + 1);
1513
1574
  len += val * 4;
1514
- goto simple_char;
1575
+ need_check_adv = FALSE;
1576
+ break;
1515
1577
  case REOP_range32:
1516
1578
  case REOP_range32_i:
1517
1579
  val = get_u16(bc_buf + pos + 1);
1518
1580
  len += val * 8;
1519
- goto simple_char;
1581
+ need_check_adv = FALSE;
1582
+ break;
1520
1583
  case REOP_char:
1521
1584
  case REOP_char_i:
1522
1585
  case REOP_char32:
1523
1586
  case REOP_char32_i:
1524
1587
  case REOP_dot:
1525
1588
  case REOP_any:
1526
- simple_char:
1527
- ret = FALSE;
1589
+ case REOP_space:
1590
+ case REOP_not_space:
1591
+ need_check_adv = FALSE;
1528
1592
  break;
1529
1593
  case REOP_line_start:
1530
1594
  case REOP_line_start_m:
1531
1595
  case REOP_line_end:
1532
1596
  case REOP_line_end_m:
1533
- case REOP_push_i32:
1534
- case REOP_push_char_pos:
1535
- case REOP_drop:
1597
+ case REOP_set_i32:
1598
+ case REOP_set_char_pos:
1536
1599
  case REOP_word_boundary:
1537
1600
  case REOP_word_boundary_i:
1538
1601
  case REOP_not_word_boundary:
@@ -1543,67 +1606,25 @@ static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
1543
1606
  case REOP_save_start:
1544
1607
  case REOP_save_end:
1545
1608
  case REOP_save_reset:
1609
+ break;
1546
1610
  case REOP_back_reference:
1547
1611
  case REOP_back_reference_i:
1548
1612
  case REOP_backward_back_reference:
1549
1613
  case REOP_backward_back_reference_i:
1614
+ val = bc_buf[pos + 1];
1615
+ len += val;
1616
+ need_capture_init = TRUE;
1550
1617
  break;
1551
1618
  default:
1552
1619
  /* safe behavior: we cannot predict the outcome */
1553
- return TRUE;
1620
+ need_capture_init = TRUE;
1621
+ goto done;
1554
1622
  }
1555
1623
  pos += len;
1556
1624
  }
1557
- return ret;
1558
- }
1559
-
1560
- /* return -1 if a simple quantifier cannot be used. Otherwise return
1561
- the number of characters in the atom. */
1562
- static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
1563
- {
1564
- int pos, opcode, len, count;
1565
- uint32_t val;
1566
-
1567
- count = 0;
1568
- pos = 0;
1569
- while (pos < bc_buf_len) {
1570
- opcode = bc_buf[pos];
1571
- len = reopcode_info[opcode].size;
1572
- switch(opcode) {
1573
- case REOP_range:
1574
- case REOP_range_i:
1575
- val = get_u16(bc_buf + pos + 1);
1576
- len += val * 4;
1577
- goto simple_char;
1578
- case REOP_range32:
1579
- case REOP_range32_i:
1580
- val = get_u16(bc_buf + pos + 1);
1581
- len += val * 8;
1582
- goto simple_char;
1583
- case REOP_char:
1584
- case REOP_char_i:
1585
- case REOP_char32:
1586
- case REOP_char32_i:
1587
- case REOP_dot:
1588
- case REOP_any:
1589
- simple_char:
1590
- count++;
1591
- break;
1592
- case REOP_line_start:
1593
- case REOP_line_start_m:
1594
- case REOP_line_end:
1595
- case REOP_line_end_m:
1596
- case REOP_word_boundary:
1597
- case REOP_word_boundary_i:
1598
- case REOP_not_word_boundary:
1599
- case REOP_not_word_boundary_i:
1600
- break;
1601
- default:
1602
- return -1;
1603
- }
1604
- pos += len;
1605
- }
1606
- return count;
1625
+ done:
1626
+ *pneed_capture_init = need_capture_init;
1627
+ return need_check_adv;
1607
1628
  }
1608
1629
 
1609
1630
  /* '*pp' is the first char after '<' */
@@ -1662,16 +1683,16 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
1662
1683
  }
1663
1684
 
1664
1685
  /* if capture_name = NULL: return the number of captures + 1.
1665
- Otherwise, return the capture index corresponding to capture_name
1666
- or -1 if none */
1686
+ Otherwise, return the number of matching capture groups */
1667
1687
  static int re_parse_captures(REParseState *s, int *phas_named_captures,
1668
- const char *capture_name)
1688
+ const char *capture_name, BOOL emit_group_index)
1669
1689
  {
1670
1690
  const uint8_t *p;
1671
- int capture_index;
1691
+ int capture_index, n;
1672
1692
  char name[TMP_BUF_SIZE];
1673
1693
 
1674
1694
  capture_index = 1;
1695
+ n = 0;
1675
1696
  *phas_named_captures = 0;
1676
1697
  for (p = s->buf_start; p < s->buf_end; p++) {
1677
1698
  switch (*p) {
@@ -1683,8 +1704,11 @@ static int re_parse_captures(REParseState *s, int *phas_named_captures,
1683
1704
  if (capture_name) {
1684
1705
  p += 3;
1685
1706
  if (re_parse_group_name(name, sizeof(name), &p) == 0) {
1686
- if (!strcmp(name, capture_name))
1687
- return capture_index;
1707
+ if (!strcmp(name, capture_name)) {
1708
+ if (emit_group_index)
1709
+ dbuf_putc(&s->byte_code, capture_index);
1710
+ n++;
1711
+ }
1688
1712
  }
1689
1713
  }
1690
1714
  capture_index++;
@@ -1709,17 +1733,18 @@ static int re_parse_captures(REParseState *s, int *phas_named_captures,
1709
1733
  }
1710
1734
  }
1711
1735
  done:
1712
- if (capture_name)
1713
- return -1;
1714
- else
1736
+ if (capture_name) {
1737
+ return n;
1738
+ } else {
1715
1739
  return capture_index;
1740
+ }
1716
1741
  }
1717
1742
 
1718
1743
  static int re_count_captures(REParseState *s)
1719
1744
  {
1720
1745
  if (s->total_capture_count < 0) {
1721
1746
  s->total_capture_count = re_parse_captures(s, &s->has_named_captures,
1722
- NULL);
1747
+ NULL, FALSE);
1723
1748
  }
1724
1749
  return s->total_capture_count;
1725
1750
  }
@@ -1731,25 +1756,53 @@ static BOOL re_has_named_captures(REParseState *s)
1731
1756
  return s->has_named_captures;
1732
1757
  }
1733
1758
 
1734
- static int find_group_name(REParseState *s, const char *name)
1759
+ static int find_group_name(REParseState *s, const char *name, BOOL emit_group_index)
1735
1760
  {
1736
1761
  const char *p, *buf_end;
1737
1762
  size_t len, name_len;
1738
- int capture_index;
1763
+ int capture_index, n;
1739
1764
 
1740
1765
  p = (char *)s->group_names.buf;
1741
- if (!p) return -1;
1766
+ if (!p)
1767
+ return 0;
1742
1768
  buf_end = (char *)s->group_names.buf + s->group_names.size;
1743
1769
  name_len = strlen(name);
1744
1770
  capture_index = 1;
1771
+ n = 0;
1745
1772
  while (p < buf_end) {
1746
1773
  len = strlen(p);
1747
- if (len == name_len && memcmp(name, p, name_len) == 0)
1748
- return capture_index;
1749
- p += len + 1;
1774
+ if (len == name_len && memcmp(name, p, name_len) == 0) {
1775
+ if (emit_group_index)
1776
+ dbuf_putc(&s->byte_code, capture_index);
1777
+ n++;
1778
+ }
1779
+ p += len + LRE_GROUP_NAME_TRAILER_LEN;
1750
1780
  capture_index++;
1751
1781
  }
1752
- return -1;
1782
+ return n;
1783
+ }
1784
+
1785
+ static BOOL is_duplicate_group_name(REParseState *s, const char *name, int scope)
1786
+ {
1787
+ const char *p, *buf_end;
1788
+ size_t len, name_len;
1789
+ int scope1;
1790
+
1791
+ p = (char *)s->group_names.buf;
1792
+ if (!p)
1793
+ return 0;
1794
+ buf_end = (char *)s->group_names.buf + s->group_names.size;
1795
+ name_len = strlen(name);
1796
+ while (p < buf_end) {
1797
+ len = strlen(p);
1798
+ if (len == name_len && memcmp(name, p, name_len) == 0) {
1799
+ scope1 = (uint8_t)p[len + 1];
1800
+ if (scope == scope1)
1801
+ return TRUE;
1802
+ }
1803
+ p += len + LRE_GROUP_NAME_TRAILER_LEN;
1804
+ }
1805
+ return FALSE;
1753
1806
  }
1754
1807
 
1755
1808
  static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
@@ -1793,7 +1846,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1793
1846
  {
1794
1847
  const uint8_t *p;
1795
1848
  int c, last_atom_start, quant_min, quant_max, last_capture_count;
1796
- BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
1849
+ BOOL greedy, is_neg, is_backward_lookahead;
1797
1850
  REStringList cr_s, *cr = &cr_s;
1798
1851
 
1799
1852
  last_atom_start = -1;
@@ -1921,7 +1974,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1921
1974
  p = s->buf_ptr;
1922
1975
  if (re_parse_expect(s, &p, ')'))
1923
1976
  return -1;
1924
- re_emit_op(s, REOP_match);
1977
+ re_emit_op(s, REOP_lookahead_match + is_neg);
1925
1978
  /* jump after the 'match' after the lookahead is successful */
1926
1979
  if (dbuf_error(&s->byte_code))
1927
1980
  return -1;
@@ -1932,12 +1985,16 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1932
1985
  &p)) {
1933
1986
  return re_parse_error(s, "invalid group name");
1934
1987
  }
1935
- if (find_group_name(s, s->u.tmp_buf) > 0) {
1988
+ /* poor man's method to test duplicate group
1989
+ names. */
1990
+ /* XXX: this method does not catch all the errors */
1991
+ if (is_duplicate_group_name(s, s->u.tmp_buf, s->group_name_scope)) {
1936
1992
  return re_parse_error(s, "duplicate group name");
1937
1993
  }
1938
1994
  /* group name with a trailing zero */
1939
1995
  dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
1940
1996
  strlen(s->u.tmp_buf) + 1);
1997
+ dbuf_putc(&s->group_names, s->group_name_scope);
1941
1998
  s->has_named_captures = 1;
1942
1999
  goto parse_capture;
1943
2000
  } else {
@@ -1948,6 +2005,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1948
2005
  p++;
1949
2006
  /* capture without group name */
1950
2007
  dbuf_putc(&s->group_names, 0);
2008
+ dbuf_putc(&s->group_names, 0);
1951
2009
  parse_capture:
1952
2010
  if (s->capture_count >= CAPTURE_COUNT_MAX)
1953
2011
  return re_parse_error(s, "too many captures");
@@ -1974,17 +2032,18 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1974
2032
  case 'b':
1975
2033
  case 'B':
1976
2034
  if (p[1] != 'b') {
1977
- re_emit_op(s, s->ignore_case ? REOP_not_word_boundary_i : REOP_not_word_boundary);
2035
+ re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_not_word_boundary_i : REOP_not_word_boundary);
1978
2036
  } else {
1979
- re_emit_op(s, s->ignore_case ? REOP_word_boundary_i : REOP_word_boundary);
2037
+ re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_word_boundary_i : REOP_word_boundary);
1980
2038
  }
1981
2039
  p += 2;
1982
2040
  break;
1983
2041
  case 'k':
1984
2042
  {
1985
2043
  const uint8_t *p1;
1986
- int dummy_res;
1987
-
2044
+ int dummy_res, n;
2045
+ BOOL is_forward;
2046
+
1988
2047
  p1 = p;
1989
2048
  if (p1[2] != '<') {
1990
2049
  /* annex B: we tolerate invalid group names in non
@@ -2003,21 +2062,33 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
2003
2062
  else
2004
2063
  goto parse_class_atom;
2005
2064
  }
2006
- c = find_group_name(s, s->u.tmp_buf);
2007
- if (c < 0) {
2065
+ is_forward = FALSE;
2066
+ n = find_group_name(s, s->u.tmp_buf, FALSE);
2067
+ if (n == 0) {
2008
2068
  /* no capture name parsed before, try to look
2009
2069
  after (inefficient, but hopefully not common) */
2010
- c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
2011
- if (c < 0) {
2070
+ n = re_parse_captures(s, &dummy_res, s->u.tmp_buf, FALSE);
2071
+ if (n == 0) {
2012
2072
  if (s->is_unicode || re_has_named_captures(s))
2013
2073
  return re_parse_error(s, "group name not defined");
2014
2074
  else
2015
2075
  goto parse_class_atom;
2016
2076
  }
2077
+ is_forward = TRUE;
2078
+ }
2079
+ last_atom_start = s->byte_code.size;
2080
+ last_capture_count = s->capture_count;
2081
+
2082
+ /* emit back references to all the capture indexes matching the group name */
2083
+ re_emit_op_u8(s, REOP_back_reference + 2 * is_backward_dir + s->ignore_case, n);
2084
+ if (is_forward) {
2085
+ re_parse_captures(s, &dummy_res, s->u.tmp_buf, TRUE);
2086
+ } else {
2087
+ find_group_name(s, s->u.tmp_buf, TRUE);
2017
2088
  }
2018
2089
  p = p1;
2019
2090
  }
2020
- goto emit_back_reference;
2091
+ break;
2021
2092
  case '0':
2022
2093
  p += 2;
2023
2094
  c = 0;
@@ -2063,11 +2134,11 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
2063
2134
  }
2064
2135
  return re_parse_error(s, "back reference out of range in regular expression");
2065
2136
  }
2066
- emit_back_reference:
2067
2137
  last_atom_start = s->byte_code.size;
2068
2138
  last_capture_count = s->capture_count;
2069
2139
 
2070
- re_emit_op_u8(s, REOP_back_reference + 2 * is_backward_dir + s->ignore_case, c);
2140
+ re_emit_op_u8(s, REOP_back_reference + 2 * is_backward_dir + s->ignore_case, 1);
2141
+ dbuf_putc(&s->byte_code, c);
2071
2142
  }
2072
2143
  break;
2073
2144
  default:
@@ -2100,8 +2171,15 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
2100
2171
  if (is_backward_dir)
2101
2172
  re_emit_op(s, REOP_prev);
2102
2173
  if (c >= CLASS_RANGE_BASE) {
2103
- int ret;
2104
- ret = re_emit_string_list(s, cr);
2174
+ int ret = 0;
2175
+ /* optimize the common 'space' tests */
2176
+ if (c == (CLASS_RANGE_BASE + CHAR_RANGE_s)) {
2177
+ re_emit_op(s, REOP_space);
2178
+ } else if (c == (CLASS_RANGE_BASE + CHAR_RANGE_S)) {
2179
+ re_emit_op(s, REOP_not_space);
2180
+ } else {
2181
+ ret = re_emit_string_list(s, cr);
2182
+ }
2105
2183
  re_string_list_free(cr);
2106
2184
  if (ret)
2107
2185
  return -1;
@@ -2176,52 +2254,39 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
2176
2254
  if (last_atom_start < 0) {
2177
2255
  return re_parse_error(s, "nothing to repeat");
2178
2256
  }
2179
- if (greedy) {
2257
+ {
2258
+ BOOL need_capture_init, add_zero_advance_check;
2180
2259
  int len, pos;
2181
-
2182
- if (quant_max > 0) {
2183
- /* specific optimization for simple quantifiers */
2184
- if (dbuf_error(&s->byte_code))
2260
+
2261
+ /* the spec tells that if there is no advance when
2262
+ running the atom after the first quant_min times,
2263
+ then there is no match. We remove this test when we
2264
+ are sure the atom always advances the position. */
2265
+ add_zero_advance_check =
2266
+ re_need_check_adv_and_capture_init(&need_capture_init,
2267
+ s->byte_code.buf + last_atom_start,
2268
+ s->byte_code.size - last_atom_start);
2269
+
2270
+ /* general case: need to reset the capture at each
2271
+ iteration. We don't do it if there are no captures
2272
+ in the atom or if we are sure all captures are
2273
+ initialized in the atom. If quant_min = 0, we still
2274
+ need to reset once the captures in case the atom
2275
+ does not match. */
2276
+ if (need_capture_init && last_capture_count != s->capture_count) {
2277
+ if (dbuf_insert(&s->byte_code, last_atom_start, 3))
2185
2278
  goto out_of_memory;
2186
- len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
2187
- s->byte_code.size - last_atom_start);
2188
- if (len > 0) {
2189
- re_emit_op(s, REOP_match);
2190
-
2191
- if (dbuf_insert(&s->byte_code, last_atom_start, 17))
2192
- goto out_of_memory;
2193
- pos = last_atom_start;
2194
- s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
2195
- put_u32(&s->byte_code.buf[pos],
2196
- s->byte_code.size - last_atom_start - 17);
2197
- pos += 4;
2198
- put_u32(&s->byte_code.buf[pos], quant_min);
2199
- pos += 4;
2200
- put_u32(&s->byte_code.buf[pos], quant_max);
2201
- pos += 4;
2202
- put_u32(&s->byte_code.buf[pos], len);
2203
- pos += 4;
2204
- goto done;
2205
- }
2279
+ int pos = last_atom_start;
2280
+ s->byte_code.buf[pos++] = REOP_save_reset;
2281
+ s->byte_code.buf[pos++] = last_capture_count;
2282
+ s->byte_code.buf[pos++] = s->capture_count - 1;
2206
2283
  }
2207
2284
 
2208
- if (dbuf_error(&s->byte_code))
2209
- goto out_of_memory;
2210
- }
2211
- /* the spec tells that if there is no advance when
2212
- running the atom after the first quant_min times,
2213
- then there is no match. We remove this test when we
2214
- are sure the atom always advances the position. */
2215
- add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start,
2216
- s->byte_code.size - last_atom_start);
2217
-
2218
- {
2219
- int len, pos;
2220
2285
  len = s->byte_code.size - last_atom_start;
2221
2286
  if (quant_min == 0) {
2222
2287
  /* need to reset the capture in case the atom is
2223
2288
  not executed */
2224
- if (last_capture_count != s->capture_count) {
2289
+ if (!need_capture_init && last_capture_count != s->capture_count) {
2225
2290
  if (dbuf_insert(&s->byte_code, last_atom_start, 3))
2226
2291
  goto out_of_memory;
2227
2292
  s->byte_code.buf[last_atom_start++] = REOP_save_reset;
@@ -2232,76 +2297,63 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
2232
2297
  s->byte_code.size = last_atom_start;
2233
2298
  } else if (quant_max == 1 || quant_max == INT32_MAX) {
2234
2299
  BOOL has_goto = (quant_max == INT32_MAX);
2235
- if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check))
2300
+ if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check * 2))
2236
2301
  goto out_of_memory;
2237
2302
  s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
2238
2303
  greedy;
2239
2304
  put_u32(s->byte_code.buf + last_atom_start + 1,
2240
- len + 5 * has_goto + add_zero_advance_check * 2);
2305
+ len + 5 * has_goto + add_zero_advance_check * 2 * 2);
2241
2306
  if (add_zero_advance_check) {
2242
- s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
2243
- re_emit_op(s, REOP_check_advance);
2307
+ s->byte_code.buf[last_atom_start + 1 + 4] = REOP_set_char_pos;
2308
+ s->byte_code.buf[last_atom_start + 1 + 4 + 1] = 0;
2309
+ re_emit_op_u8(s, REOP_check_advance, 0);
2244
2310
  }
2245
2311
  if (has_goto)
2246
2312
  re_emit_goto(s, REOP_goto, last_atom_start);
2247
2313
  } else {
2248
- if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check))
2314
+ if (dbuf_insert(&s->byte_code, last_atom_start, 11 + add_zero_advance_check * 2))
2249
2315
  goto out_of_memory;
2250
2316
  pos = last_atom_start;
2251
- s->byte_code.buf[pos++] = REOP_push_i32;
2252
- put_u32(s->byte_code.buf + pos, quant_max);
2253
- pos += 4;
2254
2317
  s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
2255
- put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2);
2318
+ put_u32(s->byte_code.buf + pos, 6 + add_zero_advance_check * 2 + len + 10);
2256
2319
  pos += 4;
2320
+
2321
+ s->byte_code.buf[pos++] = REOP_set_i32;
2322
+ s->byte_code.buf[pos++] = 0;
2323
+ put_u32(s->byte_code.buf + pos, quant_max);
2324
+ pos += 4;
2325
+ last_atom_start = pos;
2257
2326
  if (add_zero_advance_check) {
2258
- s->byte_code.buf[pos++] = REOP_push_char_pos;
2259
- re_emit_op(s, REOP_check_advance);
2327
+ s->byte_code.buf[pos++] = REOP_set_char_pos;
2328
+ s->byte_code.buf[pos++] = 0;
2260
2329
  }
2261
- re_emit_goto(s, REOP_loop, last_atom_start + 5);
2262
- re_emit_op(s, REOP_drop);
2330
+ re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max, last_atom_start);
2263
2331
  }
2264
2332
  } else if (quant_min == 1 && quant_max == INT32_MAX &&
2265
2333
  !add_zero_advance_check) {
2266
2334
  re_emit_goto(s, REOP_split_next_first - greedy,
2267
2335
  last_atom_start);
2268
2336
  } else {
2269
- if (quant_min == 1) {
2270
- /* nothing to add */
2271
- } else {
2272
- if (dbuf_insert(&s->byte_code, last_atom_start, 5))
2273
- goto out_of_memory;
2274
- s->byte_code.buf[last_atom_start] = REOP_push_i32;
2275
- put_u32(s->byte_code.buf + last_atom_start + 1,
2276
- quant_min);
2277
- last_atom_start += 5;
2278
- re_emit_goto(s, REOP_loop, last_atom_start);
2279
- re_emit_op(s, REOP_drop);
2337
+ if (quant_min == quant_max)
2338
+ add_zero_advance_check = FALSE;
2339
+ if (dbuf_insert(&s->byte_code, last_atom_start, 6 + add_zero_advance_check * 2))
2340
+ goto out_of_memory;
2341
+ /* Note: we assume the string length is < INT32_MAX */
2342
+ pos = last_atom_start;
2343
+ s->byte_code.buf[pos++] = REOP_set_i32;
2344
+ s->byte_code.buf[pos++] = 0;
2345
+ put_u32(s->byte_code.buf + pos, quant_max);
2346
+ pos += 4;
2347
+ last_atom_start = pos;
2348
+ if (add_zero_advance_check) {
2349
+ s->byte_code.buf[pos++] = REOP_set_char_pos;
2350
+ s->byte_code.buf[pos++] = 0;
2280
2351
  }
2281
- if (quant_max == INT32_MAX) {
2282
- pos = s->byte_code.size;
2283
- re_emit_op_u32(s, REOP_split_goto_first + greedy,
2284
- len + 5 + add_zero_advance_check * 2);
2285
- if (add_zero_advance_check)
2286
- re_emit_op(s, REOP_push_char_pos);
2287
- /* copy the atom */
2288
- dbuf_put_self(&s->byte_code, last_atom_start, len);
2289
- if (add_zero_advance_check)
2290
- re_emit_op(s, REOP_check_advance);
2291
- re_emit_goto(s, REOP_goto, pos);
2292
- } else if (quant_max > quant_min) {
2293
- re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
2294
- pos = s->byte_code.size;
2295
- re_emit_op_u32(s, REOP_split_goto_first + greedy,
2296
- len + 5 + add_zero_advance_check * 2);
2297
- if (add_zero_advance_check)
2298
- re_emit_op(s, REOP_push_char_pos);
2299
- /* copy the atom */
2300
- dbuf_put_self(&s->byte_code, last_atom_start, len);
2301
- if (add_zero_advance_check)
2302
- re_emit_op(s, REOP_check_advance);
2303
- re_emit_goto(s, REOP_loop, pos);
2304
- re_emit_op(s, REOP_drop);
2352
+ if (quant_min == quant_max) {
2353
+ /* a simple loop is enough */
2354
+ re_emit_goto_u8(s, REOP_loop, 0, last_atom_start);
2355
+ } else {
2356
+ re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max - quant_min, last_atom_start);
2305
2357
  }
2306
2358
  }
2307
2359
  last_atom_start = -1;
@@ -2311,7 +2363,6 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
2311
2363
  break;
2312
2364
  }
2313
2365
  }
2314
- done:
2315
2366
  s->buf_ptr = p;
2316
2367
  return 0;
2317
2368
  out_of_memory:
@@ -2340,7 +2391,7 @@ static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
2340
2391
  speed is not really critical here) */
2341
2392
  end = s->byte_code.size;
2342
2393
  term_size = end - term_start;
2343
- if (dbuf_realloc(&s->byte_code, end + term_size))
2394
+ if (dbuf_claim(&s->byte_code, term_size))
2344
2395
  return -1;
2345
2396
  memmove(s->byte_code.buf + start + term_size,
2346
2397
  s->byte_code.buf + start,
@@ -2376,6 +2427,8 @@ static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
2376
2427
 
2377
2428
  pos = re_emit_op_u32(s, REOP_goto, 0);
2378
2429
 
2430
+ s->group_name_scope++;
2431
+
2379
2432
  if (re_parse_alternative(s, is_backward_dir))
2380
2433
  return -1;
2381
2434
 
@@ -2386,8 +2439,9 @@ static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
2386
2439
  return 0;
2387
2440
  }
2388
2441
 
2389
- /* the control flow is recursive so the analysis can be linear */
2390
- static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
2442
+ /* Allocate the registers as a stack. The control flow is recursive so
2443
+ the analysis can be linear. */
2444
+ static int compute_register_count(uint8_t *bc_buf, int bc_buf_len)
2391
2445
  {
2392
2446
  int stack_size, stack_size_max, pos, opcode, len;
2393
2447
  uint32_t val;
@@ -2403,19 +2457,29 @@ static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
2403
2457
  assert(opcode < REOP_COUNT);
2404
2458
  assert((pos + len) <= bc_buf_len);
2405
2459
  switch(opcode) {
2406
- case REOP_push_i32:
2407
- case REOP_push_char_pos:
2460
+ case REOP_set_i32:
2461
+ case REOP_set_char_pos:
2462
+ bc_buf[pos + 1] = stack_size;
2408
2463
  stack_size++;
2409
2464
  if (stack_size > stack_size_max) {
2410
- if (stack_size > STACK_SIZE_MAX)
2465
+ if (stack_size > REGISTER_COUNT_MAX)
2411
2466
  return -1;
2412
2467
  stack_size_max = stack_size;
2413
2468
  }
2414
2469
  break;
2415
- case REOP_drop:
2416
2470
  case REOP_check_advance:
2471
+ case REOP_loop:
2472
+ case REOP_loop_split_goto_first:
2473
+ case REOP_loop_split_next_first:
2417
2474
  assert(stack_size > 0);
2418
2475
  stack_size--;
2476
+ bc_buf[pos + 1] = stack_size;
2477
+ break;
2478
+ case REOP_loop_check_adv_split_goto_first:
2479
+ case REOP_loop_check_adv_split_next_first:
2480
+ assert(stack_size >= 2);
2481
+ stack_size -= 2;
2482
+ bc_buf[pos + 1] = stack_size;
2419
2483
  break;
2420
2484
  case REOP_range:
2421
2485
  case REOP_range_i:
@@ -2427,6 +2491,13 @@ static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
2427
2491
  val = get_u16(bc_buf + pos + 1);
2428
2492
  len += val * 8;
2429
2493
  break;
2494
+ case REOP_back_reference:
2495
+ case REOP_back_reference_i:
2496
+ case REOP_backward_back_reference:
2497
+ case REOP_backward_back_reference_i:
2498
+ val = bc_buf[pos + 1];
2499
+ len += val;
2500
+ break;
2430
2501
  }
2431
2502
  pos += len;
2432
2503
  }
@@ -2453,7 +2524,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
2453
2524
  void *opaque)
2454
2525
  {
2455
2526
  REParseState s_s, *s = &s_s;
2456
- int stack_size;
2527
+ int register_count;
2457
2528
  BOOL is_sticky;
2458
2529
 
2459
2530
  memset(s, 0, sizeof(*s));
@@ -2514,19 +2585,19 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
2514
2585
  goto error;
2515
2586
  }
2516
2587
 
2517
- stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size);
2518
- if (stack_size < 0) {
2588
+ register_count = compute_register_count(s->byte_code.buf, s->byte_code.size);
2589
+ if (register_count < 0) {
2519
2590
  re_parse_error(s, "too many imbricated quantifiers");
2520
2591
  goto error;
2521
2592
  }
2522
2593
 
2523
2594
  s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
2524
- s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
2595
+ s->byte_code.buf[RE_HEADER_REGISTER_COUNT] = register_count;
2525
2596
  put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
2526
2597
  s->byte_code.size - RE_HEADER_LEN);
2527
2598
 
2528
2599
  /* add the named groups if needed */
2529
- if (s->group_names.size > (s->capture_count - 1)) {
2600
+ if (s->group_names.size > (s->capture_count - 1) * LRE_GROUP_NAME_TRAILER_LEN) {
2530
2601
  dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
2531
2602
  put_u16(s->byte_code.buf + RE_HEADER_FLAGS,
2532
2603
  lre_get_flags(s->byte_code.buf) | LRE_FLAG_NAMED_GROUPS);
@@ -2547,14 +2618,6 @@ static BOOL is_line_terminator(uint32_t c)
2547
2618
  return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
2548
2619
  }
2549
2620
 
2550
- static BOOL is_word_char(uint32_t c)
2551
- {
2552
- return ((c >= '0' && c <= '9') ||
2553
- (c >= 'a' && c <= 'z') ||
2554
- (c >= 'A' && c <= 'Z') ||
2555
- (c == '_'));
2556
- }
2557
-
2558
2621
  #define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \
2559
2622
  do { \
2560
2623
  if (cbuf_type == 0) { \
@@ -2638,23 +2701,26 @@ static BOOL is_word_char(uint32_t c)
2638
2701
  } \
2639
2702
  } while (0)
2640
2703
 
2641
- typedef uintptr_t StackInt;
2642
-
/* Kind of backtracking state saved on the execution stack.
   RE_EXEC_STATE_SPLIT is pushed by the split and loop-split opcodes.
   The two lookahead kinds are computed by the interpreter as
   RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead, so they must stay
   consecutive and in the same order as the corresponding opcodes.
   The value is stored in the 'type' bit-field of a saved bp entry,
   which is BP_TYPE_BITS wide, so every enumerator must fit in it. */
2643
2704
  typedef enum {
2644
2705
  RE_EXEC_STATE_SPLIT,
2645
2706
  RE_EXEC_STATE_LOOKAHEAD,
2646
2707
  RE_EXEC_STATE_NEGATIVE_LOOKAHEAD,
2647
- RE_EXEC_STATE_GREEDY_QUANT,
2648
2708
  } REExecStateEnum;
2649
2709
 
2650
- typedef struct REExecState {
2651
- REExecStateEnum type : 8;
2652
- uint8_t stack_len;
2653
- size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */
2654
- const uint8_t *cptr;
2655
- const uint8_t *pc;
2656
- void *buf[0];
2657
- } REExecState;
2710
+ #if INTPTR_MAX >= INT64_MAX
2711
+ #define BP_TYPE_BITS 3
2712
+ #else
2713
+ #define BP_TYPE_BITS 2
2714
+ #endif
2715
+
/* One slot of the backtracking stack. The interpreter pushes either an
   undo record of two slots (capture[] index in 'val', previous
   capture[] value in 'ptr') or a saved state of three slots (resume
   pc in 'ptr', resume cptr in 'ptr', previous base pointer and state
   kind in 'bp'). */
2716
+ typedef union {
2717
+ uint8_t *ptr; /* saved pc, saved cptr or saved capture[] value */
2718
+ intptr_t val; /* capture[] index of an undo record; a saved bp uses the 'bp' bit-field view below, where BP_TYPE_BITS bits hold the REExecStateEnum (their position in the word depends on the ABI bit-field layout) */
2719
+ struct {
2720
+ uintptr_t val : sizeof(uintptr_t) * 8 - BP_TYPE_BITS; /* previous bp, as an element offset from stack_buf */
2721
+ uintptr_t type : BP_TYPE_BITS; /* REExecStateEnum of the saved state */
2722
+ } bp;
2723
+ } StackElem;
2658
2724
 
2659
2725
  typedef struct {
2660
2726
  const uint8_t *cbuf;
@@ -2662,55 +2728,15 @@ typedef struct {
2662
2728
  /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
2663
2729
  int cbuf_type;
2664
2730
  int capture_count;
2665
- int stack_size_max;
2666
2731
  BOOL is_unicode;
2667
2732
  int interrupt_counter;
2668
2733
  void *opaque; /* used for stack overflow check */
2669
2734
 
2670
- size_t state_size;
2671
- uint8_t *state_stack;
2672
- size_t state_stack_size;
2673
- size_t state_stack_len;
2735
+ StackElem *stack_buf;
2736
+ size_t stack_size;
2737
+ StackElem static_stack_buf[32]; /* static stack to avoid allocation in most cases */
2674
2738
  } REExecContext;
2675
2739
 
2676
- static int push_state(REExecContext *s,
2677
- uint8_t **capture,
2678
- StackInt *stack, size_t stack_len,
2679
- const uint8_t *pc, const uint8_t *cptr,
2680
- REExecStateEnum type, size_t count)
2681
- {
2682
- REExecState *rs;
2683
- uint8_t *new_stack;
2684
- size_t new_size, i, n;
2685
- StackInt *stack_buf;
2686
-
2687
- if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) {
2688
- /* reallocate the stack */
2689
- new_size = s->state_stack_size * 3 / 2;
2690
- if (new_size < 8)
2691
- new_size = 8;
2692
- new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size);
2693
- if (!new_stack)
2694
- return -1;
2695
- s->state_stack_size = new_size;
2696
- s->state_stack = new_stack;
2697
- }
2698
- rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size);
2699
- s->state_stack_len++;
2700
- rs->type = type;
2701
- rs->count = count;
2702
- rs->stack_len = stack_len;
2703
- rs->cptr = cptr;
2704
- rs->pc = pc;
2705
- n = 2 * s->capture_count;
2706
- for(i = 0; i < n; i++)
2707
- rs->buf[i] = capture[i];
2708
- stack_buf = (StackInt *)(rs->buf + n);
2709
- for(i = 0; i < stack_len; i++)
2710
- stack_buf[i] = stack[i];
2711
- return 0;
2712
- }
2713
-
2714
2740
  static int lre_poll_timeout(REExecContext *s)
2715
2741
  {
2716
2742
  if (unlikely(--s->interrupt_counter <= 0)) {
@@ -2721,95 +2747,182 @@ static int lre_poll_timeout(REExecContext *s)
2721
2747
  return 0;
2722
2748
  }
2723
2749
 
/* Grow the backtracking stack of 's' so that it can hold at least 'n'
   elements. The new capacity is the larger of 'n' and 1.5 times the
   current capacity. Return 0 on success, or -1 if lre_realloc() fails,
   in which case s->stack_buf and s->stack_size are left untouched.
   The buffer may move, so the caller must recompute every pointer
   into the stack afterwards (CHECK_STACK_SPACE does this for sp, bp
   and stack_end). */
2750
+ static no_inline int stack_realloc(REExecContext *s, size_t n)
2751
+ {
2752
+ StackElem *new_stack;
2753
+ size_t new_size;
2754
+ new_size = s->stack_size * 3 / 2; /* geometric growth */
2755
+ if (new_size < n)
2756
+ new_size = n;
2757
+ if (s->stack_buf == s->static_stack_buf) {
2758
+ new_stack = lre_realloc(s->opaque, NULL, new_size * sizeof(StackElem)); /* first growth: the embedded static buffer cannot be passed to lre_realloc(), so allocate a fresh one */
2759
+ if (!new_stack)
2760
+ return -1;
2761
+ /* XXX: this copies all s->stack_size elements of the static buffer; presumably copying only the elements currently in use would be enough */
2762
+ memcpy(new_stack, s->stack_buf, s->stack_size * sizeof(StackElem));
2763
+ } else {
2764
+ new_stack = lre_realloc(s->opaque, s->stack_buf, new_size * sizeof(StackElem)); /* already dynamically allocated: resize it */
2765
+ if (!new_stack)
2766
+ return -1;
2767
+ }
2768
+ s->stack_size = new_size;
2769
+ s->stack_buf = new_stack;
2770
+ return 0;
2771
+ }
2772
+
2724
2773
  /* return 1 if match, 0 if not match or < 0 if error. */
2725
2774
  static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2726
- StackInt *stack, int stack_len,
2727
- const uint8_t *pc, const uint8_t *cptr,
2728
- BOOL no_recurse)
2775
+ const uint8_t *pc, const uint8_t *cptr)
2729
2776
  {
2730
- int opcode, ret;
2777
+ int opcode;
2731
2778
  int cbuf_type;
2732
- uint32_t val, c;
2779
+ uint32_t val, c, idx;
2733
2780
  const uint8_t *cbuf_end;
2734
-
2781
+ StackElem *sp, *bp, *stack_end;
2782
+ #ifdef DUMP_EXEC
2783
+ const uint8_t *pc_start = pc; /* TEST */
2784
+ #endif
2735
2785
  cbuf_type = s->cbuf_type;
2736
2786
  cbuf_end = s->cbuf_end;
2737
2787
 
2788
+ sp = s->stack_buf;
2789
+ bp = s->stack_buf;
2790
+ stack_end = s->stack_buf + s->stack_size;
2791
+
2792
+ #define CHECK_STACK_SPACE(n) \
2793
+ if (unlikely((stack_end - sp) < (n))) { \
2794
+ size_t saved_sp = sp - s->stack_buf; \
2795
+ size_t saved_bp = bp - s->stack_buf; \
2796
+ if (stack_realloc(s, sp - s->stack_buf + (n))) \
2797
+ return LRE_RET_MEMORY_ERROR; \
2798
+ stack_end = s->stack_buf + s->stack_size; \
2799
+ sp = s->stack_buf + saved_sp; \
2800
+ bp = s->stack_buf + saved_bp; \
2801
+ }
2802
+
2803
+ /* XXX: could test if the value was saved to reduce the stack size
2804
+ but slower */
2805
+ #define SAVE_CAPTURE(idx, value) \
2806
+ { \
2807
+ CHECK_STACK_SPACE(2); \
2808
+ sp[0].val = idx; \
2809
+ sp[1].ptr = capture[idx]; \
2810
+ sp += 2; \
2811
+ capture[idx] = (value); \
2812
+ }
2813
+
2814
+ /* avoid saving the previous value if already saved */
2815
+ #define SAVE_CAPTURE_CHECK(idx, value) \
2816
+ { \
2817
+ StackElem *sp1; \
2818
+ sp1 = sp; \
2819
+ for(;;) { \
2820
+ if (sp1 > bp) { \
2821
+ if (sp1[-2].val == idx) \
2822
+ break; \
2823
+ sp1 -= 2; \
2824
+ } else { \
2825
+ CHECK_STACK_SPACE(2); \
2826
+ sp[0].val = idx; \
2827
+ sp[1].ptr = capture[idx]; \
2828
+ sp += 2; \
2829
+ break; \
2830
+ } \
2831
+ } \
2832
+ capture[idx] = (value); \
2833
+ }
2834
+
2835
+
2836
+ #ifdef DUMP_EXEC
2837
+ printf("%5s %5s %5s %5s %s\n", "PC", "CP", "BP", "SP", "OPCODE");
2838
+ #endif
2738
2839
  for(;;) {
2739
- // printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN)));
2740
2840
  opcode = *pc++;
2841
+ #ifdef DUMP_EXEC
2842
+ printf("%5ld %5ld %5ld %5ld %s\n",
2843
+ pc - 1 - pc_start,
2844
+ cbuf_type == 0 ? cptr - s->cbuf : (cptr - s->cbuf) / 2,
2845
+ bp - s->stack_buf,
2846
+ sp - s->stack_buf,
2847
+ reopcode_info[opcode].name);
2848
+ #endif
2741
2849
  switch(opcode) {
2742
2850
  case REOP_match:
2743
- {
2744
- REExecState *rs;
2745
- if (no_recurse)
2746
- return (intptr_t)cptr;
2747
- ret = 1;
2748
- goto recurse;
2749
- no_match:
2750
- if (no_recurse)
2851
+ return 1;
2852
+ no_match:
2853
+ for(;;) {
2854
+ REExecStateEnum type;
2855
+ if (bp == s->stack_buf)
2751
2856
  return 0;
2752
- ret = 0;
2753
- recurse:
2857
+ /* undo the modifications to capture[] */
2858
+ while (sp > bp) {
2859
+ capture[sp[-2].val] = sp[-1].ptr;
2860
+ sp -= 2;
2861
+ }
2862
+
2863
+ pc = sp[-3].ptr;
2864
+ cptr = sp[-2].ptr;
2865
+ type = sp[-1].bp.type;
2866
+ bp = s->stack_buf + sp[-1].bp.val;
2867
+ sp -= 3;
2868
+ if (type != RE_EXEC_STATE_LOOKAHEAD)
2869
+ break;
2870
+ }
2871
+ if (lre_poll_timeout(s))
2872
+ return LRE_RET_TIMEOUT;
2873
+ break;
2874
+ case REOP_lookahead_match:
2875
+ /* pop all the saved states until reaching the start of
2876
+ the lookahead and keep the updated captures and
2877
+ variables and the corresponding undo info. */
2878
+ {
2879
+ StackElem *sp1, *sp_top, *next_sp;
2880
+ REExecStateEnum type;
2881
+
2882
+ sp_top = sp;
2754
2883
  for(;;) {
2755
- if (lre_poll_timeout(s))
2756
- return LRE_RET_TIMEOUT;
2757
- if (s->state_stack_len == 0)
2758
- return ret;
2759
- rs = (REExecState *)(s->state_stack +
2760
- (s->state_stack_len - 1) * s->state_size);
2761
- if (rs->type == RE_EXEC_STATE_SPLIT) {
2762
- if (!ret) {
2763
- pop_state:
2764
- memcpy(capture, rs->buf,
2765
- sizeof(capture[0]) * 2 * s->capture_count);
2766
- pop_state1:
2767
- pc = rs->pc;
2768
- cptr = rs->cptr;
2769
- stack_len = rs->stack_len;
2770
- memcpy(stack, rs->buf + 2 * s->capture_count,
2771
- stack_len * sizeof(stack[0]));
2772
- s->state_stack_len--;
2773
- break;
2774
- }
2775
- } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) {
2776
- if (!ret) {
2777
- uint32_t char_count, i;
2778
- memcpy(capture, rs->buf,
2779
- sizeof(capture[0]) * 2 * s->capture_count);
2780
- stack_len = rs->stack_len;
2781
- memcpy(stack, rs->buf + 2 * s->capture_count,
2782
- stack_len * sizeof(stack[0]));
2783
- pc = rs->pc;
2784
- cptr = rs->cptr;
2785
- /* go backward */
2786
- char_count = get_u32(pc + 12);
2787
- for(i = 0; i < char_count; i++) {
2788
- PREV_CHAR(cptr, s->cbuf, cbuf_type);
2789
- }
2790
- pc = (pc + 16) + (int)get_u32(pc);
2791
- rs->cptr = cptr;
2792
- rs->count--;
2793
- if (rs->count == 0) {
2794
- s->state_stack_len--;
2795
- }
2796
- break;
2797
- }
2798
- } else {
2799
- ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) ||
2800
- (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret));
2801
- if (ret) {
2802
- /* keep the capture in case of positive lookahead */
2803
- if (rs->type == RE_EXEC_STATE_LOOKAHEAD)
2804
- goto pop_state1;
2805
- else
2806
- goto pop_state;
2807
- }
2884
+ sp1 = sp;
2885
+ sp = bp;
2886
+ pc = sp[-3].ptr;
2887
+ cptr = sp[-2].ptr;
2888
+ type = sp[-1].bp.type;
2889
+ bp = s->stack_buf + sp[-1].bp.val;
2890
+ sp[-1].ptr = (void *)sp1; /* save the next value for the copy step */
2891
+ sp -= 3;
2892
+ if (type == RE_EXEC_STATE_LOOKAHEAD)
2893
+ break;
2894
+ }
2895
+ if (sp != s->stack_buf) {
2896
+ /* keep the undo info if there is a saved state */
2897
+ sp1 = sp;
2898
+ while (sp1 < sp_top) {
2899
+ next_sp = (void *)sp1[2].ptr;
2900
+ sp1 += 3;
2901
+ while (sp1 < next_sp)
2902
+ *sp++ = *sp1++;
2808
2903
  }
2809
- s->state_stack_len--;
2810
2904
  }
2811
2905
  }
2812
2906
  break;
2907
+ case REOP_negative_lookahead_match:
2908
+ /* pop all the saved states until reaching start of the negative lookahead */
2909
+ for(;;) {
2910
+ REExecStateEnum type;
2911
+ type = bp[-1].bp.type;
2912
+ /* undo the modifications to capture[] */
2913
+ while (sp > bp) {
2914
+ capture[sp[-2].val] = sp[-1].ptr;
2915
+ sp -= 2;
2916
+ }
2917
+ pc = sp[-3].ptr;
2918
+ cptr = sp[-2].ptr;
2919
+ type = sp[-1].bp.type;
2920
+ bp = s->stack_buf + sp[-1].bp.val;
2921
+ sp -= 3;
2922
+ if (type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD)
2923
+ break;
2924
+ }
2925
+ goto no_match;
2813
2926
  case REOP_char32:
2814
2927
  case REOP_char32_i:
2815
2928
  val = get_u32(pc);
@@ -2842,24 +2955,27 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2842
2955
  pc1 = pc;
2843
2956
  pc = pc + (int)val;
2844
2957
  }
2845
- ret = push_state(s, capture, stack, stack_len,
2846
- pc1, cptr, RE_EXEC_STATE_SPLIT, 0);
2847
- if (ret < 0)
2848
- return LRE_RET_MEMORY_ERROR;
2849
- break;
2958
+ CHECK_STACK_SPACE(3);
2959
+ sp[0].ptr = (uint8_t *)pc1;
2960
+ sp[1].ptr = (uint8_t *)cptr;
2961
+ sp[2].bp.val = bp - s->stack_buf;
2962
+ sp[2].bp.type = RE_EXEC_STATE_SPLIT;
2963
+ sp += 3;
2964
+ bp = sp;
2850
2965
  }
2966
+ break;
2851
2967
  case REOP_lookahead:
2852
2968
  case REOP_negative_lookahead:
2853
2969
  val = get_u32(pc);
2854
2970
  pc += 4;
2855
- ret = push_state(s, capture, stack, stack_len,
2856
- pc + (int)val, cptr,
2857
- RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
2858
- 0);
2859
- if (ret < 0)
2860
- return LRE_RET_MEMORY_ERROR;
2971
+ CHECK_STACK_SPACE(3);
2972
+ sp[0].ptr = (uint8_t *)(pc + (int)val);
2973
+ sp[1].ptr = (uint8_t *)cptr;
2974
+ sp[2].bp.val = bp - s->stack_buf;
2975
+ sp[2].bp.type = RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead;
2976
+ sp += 3;
2977
+ bp = sp;
2861
2978
  break;
2862
-
2863
2979
  case REOP_goto:
2864
2980
  val = get_u32(pc);
2865
2981
  pc += 4 + (int)val;
@@ -2898,11 +3014,26 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2898
3014
  goto no_match;
2899
3015
  GET_CHAR(c, cptr, cbuf_end, cbuf_type);
2900
3016
  break;
3017
+ case REOP_space:
3018
+ if (cptr == cbuf_end)
3019
+ goto no_match;
3020
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3021
+ if (!lre_is_space(c))
3022
+ goto no_match;
3023
+ break;
3024
+ case REOP_not_space:
3025
+ if (cptr == cbuf_end)
3026
+ goto no_match;
3027
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3028
+ if (lre_is_space(c))
3029
+ goto no_match;
3030
+ break;
2901
3031
  case REOP_save_start:
2902
3032
  case REOP_save_end:
2903
3033
  val = *pc++;
2904
3034
  assert(val < s->capture_count);
2905
- capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr;
3035
+ idx = 2 * val + opcode - REOP_save_start;
3036
+ SAVE_CAPTURE(idx, (uint8_t *)cptr);
2906
3037
  break;
2907
3038
  case REOP_save_reset:
2908
3039
  {
@@ -2911,35 +3042,97 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2911
3042
  val2 = pc[1];
2912
3043
  pc += 2;
2913
3044
  assert(val2 < s->capture_count);
3045
+ CHECK_STACK_SPACE(2 * (val2 - val + 1));
2914
3046
  while (val <= val2) {
2915
- capture[2 * val] = NULL;
2916
- capture[2 * val + 1] = NULL;
3047
+ idx = 2 * val;
3048
+ SAVE_CAPTURE(idx, NULL);
3049
+ idx = 2 * val + 1;
3050
+ SAVE_CAPTURE(idx, NULL);
2917
3051
  val++;
2918
3052
  }
2919
3053
  }
2920
3054
  break;
2921
- case REOP_push_i32:
2922
- val = get_u32(pc);
2923
- pc += 4;
2924
- stack[stack_len++] = val;
2925
- break;
2926
- case REOP_drop:
2927
- stack_len--;
3055
+ case REOP_set_i32:
3056
+ idx = 2 * s->capture_count + pc[0];
3057
+ val = get_u32(pc + 1);
3058
+ pc += 5;
3059
+ SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val);
2928
3060
  break;
2929
3061
  case REOP_loop:
2930
- val = get_u32(pc);
2931
- pc += 4;
2932
- if (--stack[stack_len - 1] != 0) {
2933
- pc += (int)val;
2934
- if (lre_poll_timeout(s))
2935
- return LRE_RET_TIMEOUT;
3062
+ {
3063
+ uint32_t val2;
3064
+ idx = 2 * s->capture_count + pc[0];
3065
+ val = get_u32(pc + 1);
3066
+ pc += 5;
3067
+
3068
+ val2 = (uintptr_t)capture[idx] - 1;
3069
+ SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
3070
+ if (val2 != 0) {
3071
+ pc += (int)val;
3072
+ if (lre_poll_timeout(s))
3073
+ return LRE_RET_TIMEOUT;
3074
+ }
3075
+ }
3076
+ break;
3077
+ case REOP_loop_split_goto_first:
3078
+ case REOP_loop_split_next_first:
3079
+ case REOP_loop_check_adv_split_goto_first:
3080
+ case REOP_loop_check_adv_split_next_first:
3081
+ {
3082
+ const uint8_t *pc1;
3083
+ uint32_t val2, limit;
3084
+ idx = 2 * s->capture_count + pc[0];
3085
+ limit = get_u32(pc + 1);
3086
+ val = get_u32(pc + 5);
3087
+ pc += 9;
3088
+
3089
+ /* decrement the counter */
3090
+ val2 = (uintptr_t)capture[idx] - 1;
3091
+ SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
3092
+
3093
+ if (val2 > limit) {
3094
+ /* normal loop if counter > limit */
3095
+ pc += (int)val;
3096
+ if (lre_poll_timeout(s))
3097
+ return LRE_RET_TIMEOUT;
3098
+ } else {
3099
+ /* check advance */
3100
+ if ((opcode == REOP_loop_check_adv_split_goto_first ||
3101
+ opcode == REOP_loop_check_adv_split_next_first) &&
3102
+ capture[idx + 1] == cptr &&
3103
+ val2 != limit) {
3104
+ goto no_match;
3105
+ }
3106
+
3107
+ /* otherwise conditional split */
3108
+ if (val2 != 0) {
3109
+ if (opcode == REOP_loop_split_next_first ||
3110
+ opcode == REOP_loop_check_adv_split_next_first) {
3111
+ pc1 = pc + (int)val;
3112
+ } else {
3113
+ pc1 = pc;
3114
+ pc = pc + (int)val;
3115
+ }
3116
+ CHECK_STACK_SPACE(3);
3117
+ sp[0].ptr = (uint8_t *)pc1;
3118
+ sp[1].ptr = (uint8_t *)cptr;
3119
+ sp[2].bp.val = bp - s->stack_buf;
3120
+ sp[2].bp.type = RE_EXEC_STATE_SPLIT;
3121
+ sp += 3;
3122
+ bp = sp;
3123
+ }
3124
+ }
2936
3125
  }
2937
3126
  break;
2938
- case REOP_push_char_pos:
2939
- stack[stack_len++] = (uintptr_t)cptr;
3127
+ case REOP_set_char_pos:
3128
+ idx = 2 * s->capture_count + pc[0];
3129
+ pc++;
3130
+ SAVE_CAPTURE_CHECK(idx, (uint8_t *)cptr);
2940
3131
  break;
2941
3132
  case REOP_check_advance:
2942
- if (stack[--stack_len] == (uintptr_t)cptr)
3133
+ idx = 2 * s->capture_count + pc[0];
3134
+ pc++;
3135
+ if (capture[idx] == cptr)
2943
3136
  goto no_match;
2944
3137
  break;
2945
3138
  case REOP_word_boundary:
@@ -2955,18 +3148,22 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2955
3148
  v1 = FALSE;
2956
3149
  } else {
2957
3150
  PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
2958
- if (ignore_case)
2959
- c = lre_canonicalize(c, s->is_unicode);
2960
- v1 = is_word_char(c);
3151
+ if (c < 256) {
3152
+ v1 = (lre_is_word_byte(c) != 0);
3153
+ } else {
3154
+ v1 = ignore_case && (c == 0x017f || c == 0x212a);
3155
+ }
2961
3156
  }
2962
3157
  /* current char */
2963
3158
  if (cptr >= cbuf_end) {
2964
3159
  v2 = FALSE;
2965
3160
  } else {
2966
3161
  PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
2967
- if (ignore_case)
2968
- c = lre_canonicalize(c, s->is_unicode);
2969
- v2 = is_word_char(c);
3162
+ if (c < 256) {
3163
+ v2 = (lre_is_word_byte(c) != 0);
3164
+ } else {
3165
+ v2 = ignore_case && (c == 0x017f || c == 0x212a);
3166
+ }
2970
3167
  }
2971
3168
  if (v1 ^ v2 ^ is_boundary)
2972
3169
  goto no_match;
@@ -2978,43 +3175,53 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2978
3175
  case REOP_backward_back_reference_i:
2979
3176
  {
2980
3177
  const uint8_t *cptr1, *cptr1_end, *cptr1_start;
3178
+ const uint8_t *pc1;
2981
3179
  uint32_t c1, c2;
3180
+ int i, n;
2982
3181
 
2983
- val = *pc++;
2984
- if (val >= s->capture_count)
2985
- goto no_match;
2986
- cptr1_start = capture[2 * val];
2987
- cptr1_end = capture[2 * val + 1];
2988
- if (!cptr1_start || !cptr1_end)
2989
- break;
2990
- if (opcode == REOP_back_reference ||
2991
- opcode == REOP_back_reference_i) {
2992
- cptr1 = cptr1_start;
2993
- while (cptr1 < cptr1_end) {
2994
- if (cptr >= cbuf_end)
2995
- goto no_match;
2996
- GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
2997
- GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
2998
- if (opcode == REOP_back_reference_i) {
2999
- c1 = lre_canonicalize(c1, s->is_unicode);
3000
- c2 = lre_canonicalize(c2, s->is_unicode);
3001
- }
3002
- if (c1 != c2)
3003
- goto no_match;
3004
- }
3005
- } else {
3006
- cptr1 = cptr1_end;
3007
- while (cptr1 > cptr1_start) {
3008
- if (cptr == s->cbuf)
3009
- goto no_match;
3010
- GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
3011
- GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
3012
- if (opcode == REOP_backward_back_reference_i) {
3013
- c1 = lre_canonicalize(c1, s->is_unicode);
3014
- c2 = lre_canonicalize(c2, s->is_unicode);
3182
+ n = *pc++;
3183
+ pc1 = pc;
3184
+ pc += n;
3185
+
3186
+ for(i = 0; i < n; i++) {
3187
+ val = pc1[i];
3188
+ if (val >= s->capture_count)
3189
+ goto no_match;
3190
+ cptr1_start = capture[2 * val];
3191
+ cptr1_end = capture[2 * val + 1];
3192
+ /* test the first not empty capture */
3193
+ if (cptr1_start && cptr1_end) {
3194
+ if (opcode == REOP_back_reference ||
3195
+ opcode == REOP_back_reference_i) {
3196
+ cptr1 = cptr1_start;
3197
+ while (cptr1 < cptr1_end) {
3198
+ if (cptr >= cbuf_end)
3199
+ goto no_match;
3200
+ GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
3201
+ GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
3202
+ if (opcode == REOP_back_reference_i) {
3203
+ c1 = lre_canonicalize(c1, s->is_unicode);
3204
+ c2 = lre_canonicalize(c2, s->is_unicode);
3205
+ }
3206
+ if (c1 != c2)
3207
+ goto no_match;
3208
+ }
3209
+ } else {
3210
+ cptr1 = cptr1_end;
3211
+ while (cptr1 > cptr1_start) {
3212
+ if (cptr == s->cbuf)
3213
+ goto no_match;
3214
+ GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
3215
+ GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
3216
+ if (opcode == REOP_backward_back_reference_i) {
3217
+ c1 = lre_canonicalize(c1, s->is_unicode);
3218
+ c2 = lre_canonicalize(c2, s->is_unicode);
3219
+ }
3220
+ if (c1 != c2)
3221
+ goto no_match;
3222
+ }
3015
3223
  }
3016
- if (c1 != c2)
3017
- goto no_match;
3224
+ break;
3018
3225
  }
3019
3226
  }
3020
3227
  }
@@ -3104,50 +3311,10 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
3104
3311
  goto no_match;
3105
3312
  PREV_CHAR(cptr, s->cbuf, cbuf_type);
3106
3313
  break;
3107
- case REOP_simple_greedy_quant:
3108
- {
3109
- uint32_t next_pos, quant_min, quant_max;
3110
- size_t q;
3111
- intptr_t res;
3112
- const uint8_t *pc1;
3113
-
3114
- next_pos = get_u32(pc);
3115
- quant_min = get_u32(pc + 4);
3116
- quant_max = get_u32(pc + 8);
3117
- pc += 16;
3118
- pc1 = pc;
3119
- pc += (int)next_pos;
3120
-
3121
- q = 0;
3122
- for(;;) {
3123
- if (lre_poll_timeout(s))
3124
- return LRE_RET_TIMEOUT;
3125
- res = lre_exec_backtrack(s, capture, stack, stack_len,
3126
- pc1, cptr, TRUE);
3127
- if (res == LRE_RET_MEMORY_ERROR ||
3128
- res == LRE_RET_TIMEOUT)
3129
- return res;
3130
- if (!res)
3131
- break;
3132
- cptr = (uint8_t *)res;
3133
- q++;
3134
- if (q >= quant_max && quant_max != INT32_MAX)
3135
- break;
3136
- }
3137
- if (q < quant_min)
3138
- goto no_match;
3139
- if (q > quant_min) {
3140
- /* will examine all matches down to quant_min */
3141
- ret = push_state(s, capture, stack, stack_len,
3142
- pc1 - 16, cptr,
3143
- RE_EXEC_STATE_GREEDY_QUANT,
3144
- q - quant_min);
3145
- if (ret < 0)
3146
- return LRE_RET_MEMORY_ERROR;
3147
- }
3148
- }
3149
- break;
3150
3314
  default:
3315
+ #ifdef DUMP_EXEC
3316
+ printf("unknown opcode pc=%ld\n", pc - 1 - pc_start);
3317
+ #endif
3151
3318
  abort();
3152
3319
  }
3153
3320
  }
@@ -3161,14 +3328,12 @@ int lre_exec(uint8_t **capture,
3161
3328
  int cbuf_type, void *opaque)
3162
3329
  {
3163
3330
  REExecContext s_s, *s = &s_s;
3164
- int re_flags, i, alloca_size, ret;
3165
- StackInt *stack_buf;
3331
+ int re_flags, i, ret;
3166
3332
  const uint8_t *cptr;
3167
3333
 
3168
3334
  re_flags = lre_get_flags(bc_buf);
3169
3335
  s->is_unicode = (re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0;
3170
3336
  s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
3171
- s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
3172
3337
  s->cbuf = cbuf;
3173
3338
  s->cbuf_end = cbuf + (clen << cbuf_type);
3174
3339
  s->cbuf_type = cbuf_type;
@@ -3177,17 +3342,11 @@ int lre_exec(uint8_t **capture,
3177
3342
  s->interrupt_counter = INTERRUPT_COUNTER_INIT;
3178
3343
  s->opaque = opaque;
3179
3344
 
3180
- s->state_size = sizeof(REExecState) +
3181
- s->capture_count * sizeof(capture[0]) * 2 +
3182
- s->stack_size_max * sizeof(stack_buf[0]);
3183
- s->state_stack = NULL;
3184
- s->state_stack_len = 0;
3185
- s->state_stack_size = 0;
3345
+ s->stack_buf = s->static_stack_buf;
3346
+ s->stack_size = countof(s->static_stack_buf);
3186
3347
 
3187
3348
  for(i = 0; i < s->capture_count * 2; i++)
3188
3349
  capture[i] = NULL;
3189
- alloca_size = s->stack_size_max * sizeof(stack_buf[0]);
3190
- stack_buf = alloca(alloca_size);
3191
3350
 
3192
3351
  cptr = cbuf + (cindex << cbuf_type);
3193
3352
  if (0 < cindex && cindex < clen && s->cbuf_type == 2) {
@@ -3197,12 +3356,19 @@ int lre_exec(uint8_t **capture,
3197
3356
  }
3198
3357
  }
3199
3358
 
3200
- ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN,
3201
- cptr, FALSE);
3202
- lre_realloc(s->opaque, s->state_stack, 0);
3359
+ ret = lre_exec_backtrack(s, capture, bc_buf + RE_HEADER_LEN, cptr);
3360
+
3361
+ if (s->stack_buf != s->static_stack_buf)
3362
+ lre_realloc(s->opaque, s->stack_buf, 0);
3203
3363
  return ret;
3204
3364
  }
3205
3365
 
/* Return the number of entries needed in the capture[] array given to
   lre_exec() for the compiled regexp 'bc_buf': two per capture group
   (start and end pointers), followed by one per register. The
   interpreter addresses register n as
   capture[2 * capture_count + n]. Both counts are read from single
   bytes of the bytecode header. */
3366
+ int lre_get_alloc_count(const uint8_t *bc_buf)
3367
+ {
3368
+ return bc_buf[RE_HEADER_CAPTURE_COUNT] * 2 +
3369
+ bc_buf[RE_HEADER_REGISTER_COUNT];
3370
+ }
3371
+
3206
3372
  int lre_get_capture_count(const uint8_t *bc_buf)
3207
3373
  {
3208
3374
  return bc_buf[RE_HEADER_CAPTURE_COUNT];
@@ -3241,7 +3407,7 @@ int main(int argc, char **argv)
3241
3407
  int len, flags, ret, i;
3242
3408
  uint8_t *bc;
3243
3409
  char error_msg[64];
3244
- uint8_t *capture[CAPTURE_COUNT_MAX * 2];
3410
+ uint8_t *capture;
3245
3411
  const char *input;
3246
3412
  int input_len, capture_count;
3247
3413
 
@@ -3260,6 +3426,7 @@ int main(int argc, char **argv)
3260
3426
  input = argv[3];
3261
3427
  input_len = strlen(input);
3262
3428
 
3429
+ capture = malloc(sizeof(capture[0]) * lre_get_alloc_count(bc));
3263
3430
  ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
3264
3431
  printf("ret=%d\n", ret);
3265
3432
  if (ret == 1) {
@@ -3275,6 +3442,7 @@ int main(int argc, char **argv)
3275
3442
  printf("\n");
3276
3443
  }
3277
3444
  }
3445
+ free(capture);
3278
3446
  return 0;
3279
3447
  }
3280
3448
  #endif