re2 1.4.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ce0e303b87738a767776165216cbbd8cf0f63ec9dba3ef4389f657cbb5da8dc3
4
- data.tar.gz: 61983a9e93dc64334d43a41f3f978ff71de020bd9d6f27bc409431a313ea58e0
3
+ metadata.gz: 2151621115d04b197403c0f67276347c205928136405c05aac1fc70bf3a00dec
4
+ data.tar.gz: 8ba805a95c535ab7d30296a830448dea51da4f5f699a6aa3f858296b2590d188
5
5
  SHA512:
6
- metadata.gz: 229f667e12094ae2d42ae3d72a08aa0567cf17e1d666d9677b6b0bdfccc549f241870085ac918789fbbcd0fcb538a942a31f6922f32d88cc7b54043b553f35e3
7
- data.tar.gz: 850d6dc79bcfbfe96a913ac93ddc2c8d4e1b79ddd8aa50eb951bfaae99e3a1688ff31fd2e55c2e497946b9ea4913bcca4938070b6fa37f2ca77d88721652ca66
6
+ metadata.gz: 010e20dae629df302c35c6d5cc412c5a0add1cc3f0b6357114ce64c5d77e6c3bbca7401e57c0b86be5fe87acae17162cd7c860c079dc2c57898761f9fd8d4ce4
7
+ data.tar.gz: 6bfa2db432e91b87ab5d6fa21babd07a675df3b22904e20f1491322b1d34c8fe1c5814783049c33f84ab8f22e0b3c2ef9c2308161f8694dd6bcbd8ec4da278a4
data/README.md CHANGED
@@ -4,8 +4,8 @@ re2 [![Build Status](https://github.com/mudge/re2/actions/workflows/tests.yml/ba
4
4
  A Ruby binding to [re2][], an "efficient, principled regular expression
5
5
  library".
6
6
 
7
- **Current version:** 1.4.0
8
- **Supported Ruby versions:** 1.8.7, 1.9.3, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 3.0
7
+ **Current version:** 1.6.0
8
+ **Supported Ruby versions:** 1.8.7, 1.9.3, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 3.0, 3.1
9
9
  **Supported re2 versions:** libre2.0 (< 2020-03-02), libre2.1 (2020-03-02), libre2.6 (2020-03-03), libre2.7 (2020-05-01), libre2.8 (2020-07-06), libre2.9 (2020-11-01)
10
10
 
11
11
  Installation
@@ -131,6 +131,43 @@ enum.next #=> ["It"]
131
131
  enum.next #=> ["is"]
132
132
  ```
133
133
 
134
+ As of 1.5.0, you can use `RE2::Set` to match multiple patterns against a
135
+ string. Calling `RE2::Set#add` with a pattern will return an integer index of
136
+ the pattern. After all patterns have been added, the set can be compiled using
137
+ `RE2::Set#compile`, and then `RE2::Set#match` will return an `Array<Integer>`
138
+ containing the indices of all the patterns that matched.
139
+
140
+ ```ruby
141
+ set = RE2::Set.new
142
+ set.add("abc") #=> 0
143
+ set.add("def") #=> 1
144
+ set.add("ghi") #=> 2
145
+ set.compile #=> true
146
+ set.match("abcdefghi") #=> [0, 1, 2]
147
+ set.match("ghidefabc") #=> [2, 1, 0]
148
+ ```
149
+
150
+ As of 1.6.0, you can use [Ruby's pattern matching](https://docs.ruby-lang.org/en/3.0/syntax/pattern_matching_rdoc.html) against `RE2::MatchData` with both array patterns and hash patterns:
151
+
152
+ ```ruby
153
+ case RE2('(\w+) (\d+)').match("Alice 42")
154
+ in [name, age]
155
+ puts "My name is #{name} and I am #{age} years old"
156
+ else
157
+ puts "No match!"
158
+ end
159
+ # My name is Alice and I am 42 years old
160
+
161
+
162
+ case RE2('(?P<name>\w+) (?P<age>\d+)').match("Alice 42")
163
+ in {name:, age:}
164
+ puts "My name is #{name} and I am #{age} years old"
165
+ else
166
+ puts "No match!"
167
+ end
168
+ # My name is Alice and I am 42 years old
169
+ ```
170
+
134
171
  Features
135
172
  --------
136
173
 
@@ -149,6 +186,8 @@ Features
149
186
 
150
187
  * Incrementally scanning text with `re2.scan(text)`
151
188
 
189
+ * Searching a collection of patterns simultaneously with `RE2::Set`
190
+
152
191
  * Checking regular expression compilation with `re2.ok?`, `re2.error` and
153
192
  `re2.error_arg`
154
193
 
@@ -167,6 +206,8 @@ Features
167
206
  [`RE2.escape(unquoted)`](https://github.com/google/re2/blob/2016-02-01/re2/re2.h#L418) and
168
207
  `RE2.quote(unquoted)`
169
208
 
209
+ * Pattern matching with `RE2::MatchData`
210
+
170
211
  Contributions
171
212
  -------------
172
213
 
@@ -177,7 +218,9 @@ Contributions
177
218
  * Thanks to [Sebastian Reitenbach](https://github.com/buzzdeee) for reporting
178
219
  the deprecation and removal of the `utf8` encoding option in re2;
179
220
  * Thanks to [Sergio Medina](https://github.com/serch) for reporting a bug when
180
- using `RE2::Scanner#scan` with an invalid regular expression.
221
+ using `RE2::Scanner#scan` with an invalid regular expression;
222
+ * Thanks to [Pritam Baral](https://github.com/pritambaral) for contributing the
223
+ initial support for `RE2::Set`.
181
224
 
182
225
  Contact
183
226
  -------
data/ext/re2/extconf.rb CHANGED
@@ -88,4 +88,28 @@ SRC
88
88
  end
89
89
  end
90
90
 
91
+ checking_for("RE2::Set::Match() with error information") do
92
+ test_re2_set_match_signature = <<SRC
93
+ #include <vector>
94
+ #include <re2/re2.h>
95
+ #include <re2/set.h>
96
+
97
+ int main() {
98
+ RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
99
+ s.Add("foo", NULL);
100
+ s.Compile();
101
+
102
+ std::vector<int> v;
103
+ RE2::Set::ErrorInfo ei;
104
+ s.Match("foo", &v, &ei);
105
+
106
+ return 0;
107
+ }
108
+ SRC
109
+
110
+ if try_compile(test_re2_set_match_signature, compile_options)
111
+ $defs.push("-DHAVE_ERROR_INFO_ARGUMENT")
112
+ end
113
+ end
114
+
91
115
  create_makefile("re2")
data/ext/re2/re2.cc CHANGED
@@ -8,6 +8,7 @@
8
8
 
9
9
  #include <ruby.h>
10
10
  #include <re2/re2.h>
11
+ #include <re2/set.h>
11
12
  #include <stdint.h>
12
13
  #include <string>
13
14
  #include <sstream>
@@ -93,12 +94,82 @@ typedef struct {
93
94
  VALUE regexp, text;
94
95
  } re2_scanner;
95
96
 
96
- VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cScanner;
97
+ typedef struct {
98
+ RE2::Set *set;
99
+ } re2_set;
100
+
101
+ VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cScanner, re2_cSet,
102
+ re2_eSetMatchError, re2_eSetUnsupportedError;
97
103
 
98
104
  /* Symbols used in RE2 options. */
99
105
  static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors,
100
106
  id_max_mem, id_literal, id_never_nl, id_case_sensitive,
101
- id_perl_classes, id_word_boundary, id_one_line;
107
+ id_perl_classes, id_word_boundary, id_one_line,
108
+ id_unanchored, id_anchor_start, id_anchor_both, id_exception;
109
+
110
+ void parse_re2_options(RE2::Options& re2_options, VALUE options) {
111
+ if (TYPE(options) != T_HASH) {
112
+ rb_raise(rb_eArgError, "options should be a hash");
113
+ }
114
+ VALUE utf8, posix_syntax, longest_match, log_errors,
115
+ max_mem, literal, never_nl, case_sensitive, perl_classes,
116
+ word_boundary, one_line;
117
+
118
+ utf8 = rb_hash_aref(options, ID2SYM(id_utf8));
119
+ if (!NIL_P(utf8)) {
120
+ re2_options.set_encoding(RTEST(utf8) ? RE2::Options::EncodingUTF8 : RE2::Options::EncodingLatin1);
121
+ }
122
+
123
+ posix_syntax = rb_hash_aref(options, ID2SYM(id_posix_syntax));
124
+ if (!NIL_P(posix_syntax)) {
125
+ re2_options.set_posix_syntax(RTEST(posix_syntax));
126
+ }
127
+
128
+ longest_match = rb_hash_aref(options, ID2SYM(id_longest_match));
129
+ if (!NIL_P(longest_match)) {
130
+ re2_options.set_longest_match(RTEST(longest_match));
131
+ }
132
+
133
+ log_errors = rb_hash_aref(options, ID2SYM(id_log_errors));
134
+ if (!NIL_P(log_errors)) {
135
+ re2_options.set_log_errors(RTEST(log_errors));
136
+ }
137
+
138
+ max_mem = rb_hash_aref(options, ID2SYM(id_max_mem));
139
+ if (!NIL_P(max_mem)) {
140
+ re2_options.set_max_mem(NUM2INT(max_mem));
141
+ }
142
+
143
+ literal = rb_hash_aref(options, ID2SYM(id_literal));
144
+ if (!NIL_P(literal)) {
145
+ re2_options.set_literal(RTEST(literal));
146
+ }
147
+
148
+ never_nl = rb_hash_aref(options, ID2SYM(id_never_nl));
149
+ if (!NIL_P(never_nl)) {
150
+ re2_options.set_never_nl(RTEST(never_nl));
151
+ }
152
+
153
+ case_sensitive = rb_hash_aref(options, ID2SYM(id_case_sensitive));
154
+ if (!NIL_P(case_sensitive)) {
155
+ re2_options.set_case_sensitive(RTEST(case_sensitive));
156
+ }
157
+
158
+ perl_classes = rb_hash_aref(options, ID2SYM(id_perl_classes));
159
+ if (!NIL_P(perl_classes)) {
160
+ re2_options.set_perl_classes(RTEST(perl_classes));
161
+ }
162
+
163
+ word_boundary = rb_hash_aref(options, ID2SYM(id_word_boundary));
164
+ if (!NIL_P(word_boundary)) {
165
+ re2_options.set_word_boundary(RTEST(word_boundary));
166
+ }
167
+
168
+ one_line = rb_hash_aref(options, ID2SYM(id_one_line));
169
+ if (!NIL_P(one_line)) {
170
+ re2_options.set_one_line(RTEST(one_line));
171
+ }
172
+ }
102
173
 
103
174
  void re2_matchdata_mark(re2_matchdata* self) {
104
175
  rb_gc_mark(self->regexp);
@@ -616,6 +687,112 @@ static VALUE re2_matchdata_inspect(VALUE self) {
616
687
  return result;
617
688
  }
618
689
 
690
+ /*
691
+ * Returns the array of submatches for pattern matching.
692
+ *
693
+ * @return [Array<String, nil>] the array of submatches
694
+ * @example
695
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
696
+ * m.deconstruct #=> ["123"]
697
+ *
698
+ * @example pattern matching
699
+ * case RE2::Regexp.new('(\d+) (\d+)').match("bob 123 456")
700
+ * in x, y
701
+ * puts "Matched #{x} #{y}"
702
+ * else
703
+ * puts "Unrecognised match"
704
+ * end
705
+ */
706
+ static VALUE re2_matchdata_deconstruct(VALUE self) {
707
+ int i;
708
+ re2_matchdata *m;
709
+ re2_pattern *p;
710
+ re2::StringPiece *match;
711
+ VALUE array;
712
+
713
+ Data_Get_Struct(self, re2_matchdata, m);
714
+ Data_Get_Struct(m->regexp, re2_pattern, p);
715
+
716
+ array = rb_ary_new2(m->number_of_matches - 1);
717
+ for (i = 1; i < m->number_of_matches; i++) {
718
+ match = &m->matches[i];
719
+
720
+ if (match->empty()) {
721
+ rb_ary_push(array, Qnil);
722
+ } else {
723
+ rb_ary_push(array, ENCODED_STR_NEW(match->data(), match->size(),
724
+ p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1"));
725
+ }
726
+ }
727
+
728
+ return array;
729
+ }
730
+
731
+ /*
732
+ * Returns a hash of capturing group names to submatches for pattern matching.
733
+ *
734
+ * As this is used by Ruby's pattern matching, it will return an empty hash if given
735
+ * more keys than there are capturing groups. Given keys will populate the hash in
736
+ * order but an invalid name will cause the hash to be immediately returned.
737
+ *
738
+ * @return [Hash] a hash of capturing group names to submatches
739
+ * @param [Array<Symbol>, nil] keys an array of Symbol capturing group names or nil to return all names
740
+ * @example
741
+ * m = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
742
+ * m.deconstruct_keys(nil) #=> {:numbers => "123", :letters => "abc"}
743
+ * m.deconstruct_keys([:numbers]) #=> {:numbers => "123"}
744
+ * m.deconstruct_keys([:fruit]) #=> {}
745
+ * m.deconstruct_keys([:letters, :fruit]) #=> {:letters => "abc"}
746
+ *
747
+ * @example pattern matching
748
+ * case RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
749
+ * in numbers:, letters:
750
+ * puts "Numbers: #{numbers}, letters: #{letters}"
751
+ * else
752
+ * puts "Unrecognised match"
753
+ * end
754
+ */
755
+ static VALUE re2_matchdata_deconstruct_keys(VALUE self, VALUE keys) {
756
+ int i;
757
+ VALUE capturing_groups, key;
758
+ re2_matchdata *m;
759
+ re2_pattern *p;
760
+ map<string, int> groups;
761
+ map<string, int>::iterator iterator;
762
+
763
+ Data_Get_Struct(self, re2_matchdata, m);
764
+ Data_Get_Struct(m->regexp, re2_pattern, p);
765
+
766
+ groups = p->pattern->NamedCapturingGroups();
767
+ capturing_groups = rb_hash_new();
768
+
769
+ if (NIL_P(keys)) {
770
+ for (iterator = groups.begin(); iterator != groups.end(); iterator++) {
771
+ rb_hash_aset(capturing_groups,
772
+ ID2SYM(rb_intern(iterator->first.data())),
773
+ re2_matchdata_nth_match(iterator->second, self));
774
+ }
775
+ } else {
776
+ Check_Type(keys, T_ARRAY);
777
+
778
+ if (p->pattern->NumberOfCapturingGroups() >= RARRAY_LEN(keys)) {
779
+ for (i = 0; i < RARRAY_LEN(keys); i++) {
780
+ key = rb_ary_entry(keys, i);
781
+ Check_Type(key, T_SYMBOL);
782
+ string name(rb_id2name(SYM2ID(key)));
783
+
784
+ if (groups.count(name) == 0) {
785
+ break;
786
+ }
787
+
788
+ rb_hash_aset(capturing_groups, key, re2_matchdata_nth_match(groups[name], self));
789
+ }
790
+ }
791
+ }
792
+
793
+ return capturing_groups;
794
+ }
795
+
619
796
  /*
620
797
  * Returns a new RE2 object with a compiled version of
621
798
  * +pattern+ stored inside. Equivalent to +RE2.new+.
@@ -667,75 +844,15 @@ static VALUE re2_re2(int argc, VALUE *argv, VALUE self) {
667
844
  * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
668
845
  */
669
846
  static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
670
- VALUE pattern, options, utf8, posix_syntax, longest_match, log_errors,
671
- max_mem, literal, never_nl, case_sensitive, perl_classes,
672
- word_boundary, one_line;
847
+ VALUE pattern, options;
673
848
  re2_pattern *p;
674
849
 
675
850
  rb_scan_args(argc, argv, "11", &pattern, &options);
676
851
  Data_Get_Struct(self, re2_pattern, p);
677
852
 
678
853
  if (RTEST(options)) {
679
- if (TYPE(options) != T_HASH) {
680
- rb_raise(rb_eArgError, "options should be a hash");
681
- }
682
-
683
854
  RE2::Options re2_options;
684
-
685
- utf8 = rb_hash_aref(options, ID2SYM(id_utf8));
686
- if (!NIL_P(utf8)) {
687
- re2_options.set_encoding(RTEST(utf8) ? RE2::Options::EncodingUTF8 : RE2::Options::EncodingLatin1);
688
- }
689
-
690
- posix_syntax = rb_hash_aref(options, ID2SYM(id_posix_syntax));
691
- if (!NIL_P(posix_syntax)) {
692
- re2_options.set_posix_syntax(RTEST(posix_syntax));
693
- }
694
-
695
- longest_match = rb_hash_aref(options, ID2SYM(id_longest_match));
696
- if (!NIL_P(longest_match)) {
697
- re2_options.set_longest_match(RTEST(longest_match));
698
- }
699
-
700
- log_errors = rb_hash_aref(options, ID2SYM(id_log_errors));
701
- if (!NIL_P(log_errors)) {
702
- re2_options.set_log_errors(RTEST(log_errors));
703
- }
704
-
705
- max_mem = rb_hash_aref(options, ID2SYM(id_max_mem));
706
- if (!NIL_P(max_mem)) {
707
- re2_options.set_max_mem(NUM2INT(max_mem));
708
- }
709
-
710
- literal = rb_hash_aref(options, ID2SYM(id_literal));
711
- if (!NIL_P(literal)) {
712
- re2_options.set_literal(RTEST(literal));
713
- }
714
-
715
- never_nl = rb_hash_aref(options, ID2SYM(id_never_nl));
716
- if (!NIL_P(never_nl)) {
717
- re2_options.set_never_nl(RTEST(never_nl));
718
- }
719
-
720
- case_sensitive = rb_hash_aref(options, ID2SYM(id_case_sensitive));
721
- if (!NIL_P(case_sensitive)) {
722
- re2_options.set_case_sensitive(RTEST(case_sensitive));
723
- }
724
-
725
- perl_classes = rb_hash_aref(options, ID2SYM(id_perl_classes));
726
- if (!NIL_P(perl_classes)) {
727
- re2_options.set_perl_classes(RTEST(perl_classes));
728
- }
729
-
730
- word_boundary = rb_hash_aref(options, ID2SYM(id_word_boundary));
731
- if (!NIL_P(word_boundary)) {
732
- re2_options.set_word_boundary(RTEST(word_boundary));
733
- }
734
-
735
- one_line = rb_hash_aref(options, ID2SYM(id_one_line));
736
- if (!NIL_P(one_line)) {
737
- re2_options.set_one_line(RTEST(one_line));
738
- }
855
+ parse_re2_options(re2_options, options);
739
856
 
740
857
  p->pattern = new(nothrow) RE2(StringValuePtr(pattern), re2_options);
741
858
  } else {
@@ -1234,7 +1351,7 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
1234
1351
  *
1235
1352
  * @return [Boolean] whether the match was successful
1236
1353
  */
1237
- static VALUE re2_regexp_match_query(VALUE self, VALUE text) {
1354
+ static VALUE re2_regexp_match_p(VALUE self, VALUE text) {
1238
1355
  VALUE argv[2];
1239
1356
  argv[0] = text;
1240
1357
  argv[1] = INT2FIX(0);
@@ -1362,6 +1479,257 @@ static VALUE re2_QuoteMeta(VALUE self, VALUE unquoted) {
1362
1479
  return rb_str_new(quoted_string.data(), quoted_string.size());
1363
1480
  }
1364
1481
 
1482
+ void re2_set_free(re2_set *self) {
1483
+ if (self->set) {
1484
+ delete self->set;
1485
+ }
1486
+ free(self);
1487
+ }
1488
+
1489
+ static VALUE re2_set_allocate(VALUE klass) {
1490
+ re2_set *s;
1491
+ VALUE result = Data_Make_Struct(klass, re2_set, 0, re2_set_free, s);
1492
+ return result;
1493
+ }
1494
+
1495
+ /*
1496
+ * Returns a new {RE2::Set} object, a collection of patterns that can be
1497
+ * searched for simultaneously.
1498
+ *
1499
+ * @return [RE2::Set]
1500
+ *
1501
+ * @overload initialize
1502
+ * Returns a new {RE2::Set} object for unanchored patterns with the default
1503
+ * options.
1504
+ *
1505
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
1506
+ * @return [RE2::Set]
1507
+ *
1508
+ * @overload initialize(anchor)
1509
+ * Returns a new {RE2::Set} object for the specified anchor with the default
1510
+ * options.
1511
+ *
1512
+ * @param [Symbol] anchor One of :unanchored, :anchor_start, :anchor_both
1513
+ * @raise [ArgumentError] if anchor is not :unanchored, :anchor_start or :anchor_both
1514
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
1515
+ *
1516
+ * @overload initialize(anchor, options)
1517
+ * Returns a new {RE2::Set} object with the specified options.
1518
+ *
1519
+ * @param [Symbol] anchor One of :unanchored, :anchor_start, :anchor_both
1520
+ * @param [Hash] options the options with which to compile the pattern
1521
+ * @option options [Boolean] :utf8 (true) text and pattern are UTF-8; otherwise Latin-1
1522
+ * @option options [Boolean] :posix_syntax (false) restrict regexps to POSIX egrep syntax
1523
+ * @option options [Boolean] :longest_match (false) search for longest match, not first match
1524
+ * @option options [Boolean] :log_errors (true) log syntax and execution errors to ERROR
1525
+ * @option options [Fixnum] :max_mem approx. max memory footprint of RE2
1526
+ * @option options [Boolean] :literal (false) interpret string as literal, not regexp
1527
+ * @option options [Boolean] :never_nl (false) never match \n, even if it is in regexp
1528
+ * @option options [Boolean] :case_sensitive (true) match is case-sensitive (regexp can override with (?i) unless in posix_syntax mode)
1529
+ * @option options [Boolean] :perl_classes (false) allow Perl's \d \s \w \D \S \W when in posix_syntax mode
1530
+ * @option options [Boolean] :word_boundary (false) allow \b \B (word boundary and not) when in posix_syntax mode
1531
+ * @option options [Boolean] :one_line (false) ^ and $ only match beginning and end of text when in posix_syntax mode
1532
+ * @return [RE2::Set] an RE2::Set with the specified anchor and options
1533
+ * @raise [ArgumentError] if anchor is not one of the accepted choices
1534
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
1535
+ */
1536
+ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1537
+ VALUE anchor, options;
1538
+ re2_set *s;
1539
+ RE2::Anchor re2_anchor;
1540
+ RE2::Options re2_options;
1541
+
1542
+ rb_scan_args(argc, argv, "02", &anchor, &options);
1543
+ Data_Get_Struct(self, re2_set, s);
1544
+
1545
+ if (RTEST(options)) {
1546
+ parse_re2_options(re2_options, options);
1547
+ }
1548
+ if (NIL_P(anchor)) {
1549
+ re2_anchor = RE2::UNANCHORED;
1550
+ } else {
1551
+ Check_Type(anchor, T_SYMBOL);
1552
+ ID id_anchor = SYM2ID(anchor);
1553
+ if (id_anchor == id_unanchored) {
1554
+ re2_anchor = RE2::UNANCHORED;
1555
+ } else if (id_anchor == id_anchor_start) {
1556
+ re2_anchor = RE2::ANCHOR_START;
1557
+ } else if (id_anchor == id_anchor_both) {
1558
+ re2_anchor = RE2::ANCHOR_BOTH;
1559
+ } else {
1560
+ rb_raise(rb_eArgError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both");
1561
+ }
1562
+ }
1563
+
1564
+ s->set = new(nothrow) RE2::Set(re2_options, re2_anchor);
1565
+ if (s->set == 0) {
1566
+ rb_raise(rb_eNoMemError, "not enough memory to allocate RE2::Set object");
1567
+ }
1568
+
1569
+ return self;
1570
+ }
1571
+
1572
+ /*
1573
+ * Adds a pattern to the set. Returns the index that will identify the pattern
1574
+ * in the output of #match. Cannot be called after #compile has been called.
1575
+ *
1576
+ * @param [String] pattern the regex pattern
1577
+ * @return [Integer] the index of the pattern in the set
1578
+ * @raise [ArgumentError] if called after compile or the pattern is rejected
1579
+ * @example
1580
+ * set = RE2::Set.new
1581
+ * set.add("abc") #=> 0
1582
+ * set.add("def") #=> 1
1583
+ */
1584
+ static VALUE re2_set_add(VALUE self, VALUE pattern) {
1585
+ Check_Type(pattern, T_STRING);
1586
+ re2::StringPiece regex(RSTRING_PTR(pattern), RSTRING_LEN(pattern));
1587
+ std::string err;
1588
+ re2_set *s;
1589
+ Data_Get_Struct(self, re2_set, s);
1590
+ int index = s->set->Add(regex, &err);
1591
+ if (index < 0) {
1592
+ rb_raise(rb_eArgError, "str rejected by RE2::Set->Add(): %s", err.c_str());
1593
+ }
1594
+
1595
+ return INT2FIX(index);
1596
+ }
1597
+
1598
+ /*
1599
+ * Compiles a Set so it can be used to match against. Must be called after #add
1600
+ * and before #match.
1601
+ *
1602
+ * @return [Boolean] whether compilation was a success
1603
+ * @example
1604
+ * set = RE2::Set.new
1605
+ * set.add("abc")
1606
+ * set.compile # => true
1607
+ */
1608
+ static VALUE re2_set_compile(VALUE self) {
1609
+ re2_set *s;
1610
+ Data_Get_Struct(self, re2_set, s);
1611
+
1612
+ return BOOL2RUBY(s->set->Compile());
1613
+ }
1614
+
1615
+ /*
1616
+ * Returns whether the underlying re2 version outputs error information from
1617
+ * RE2::Set::Match. If not, #match will raise an error if attempting to set its
1618
+ * :exception option to true.
1619
+ *
1620
+ * @return [Boolean] whether the underlying re2 outputs error information from Set matches
1621
+ */
1622
+ static VALUE re2_set_match_raises_errors_p(VALUE self) {
1623
+ UNUSED(self);
1624
+ #ifdef HAVE_ERROR_INFO_ARGUMENT
1625
+ return Qtrue;
1626
+ #else
1627
+ return Qfalse;
1628
+ #endif
1629
+ }
1630
+
1631
+ /*
1632
+ * Matches the given text against patterns in the set, returning an array of
1633
+ * integer indices of the matching patterns if matched or an empty array if
1634
+ * there are no matches.
1635
+ *
1636
+ * @return [Array<Integer>]
1637
+ *
1638
+ * @overload match(str)
1639
+ * Returns an array of integer indices of patterns matching the given string
1640
+ * (if any). Raises exceptions if there are any errors while matching.
1641
+ *
1642
+ * @param [String] str the text to match against
1643
+ * @return [Array<Integer>] the indices of matching regexps
1644
+ * @raise [MatchError] if an error occurs while matching
1645
+ * @raise [UnsupportedError] if the underlying version of re2 does not output error information
1646
+ * @example
1647
+ * set = RE2::Set.new
1648
+ * set.add("abc")
1649
+ * set.add("def")
1650
+ * set.compile
1651
+ * set.match("abcdef") # => [0, 1]
1652
+ *
1653
+ * @overload match(str, options)
1654
+ * Returns an array of integer indices of patterns matching the given string
1655
+ * (if any). Raises exceptions if there are any errors while matching and the
1656
+ * :exception option is set to true.
1657
+ *
1658
+ * @param [String] str the text to match against
1659
+ * @param [Hash] options the options with which to match
1660
+ * @option options [Boolean] :exception (true) whether to raise exceptions with re2's error information (not supported on ABI version 0 of re2)
1661
+ * @return [Array<Integer>] the indices of matching regexps
1662
+ * @raise [MatchError] if an error occurs while matching
1663
+ * @raise [UnsupportedError] if the underlying version of re2 does not output error information
1664
+ * @example
1665
+ * set = RE2::Set.new
1666
+ * set.add("abc")
1667
+ * set.add("def")
1668
+ * set.compile
1669
+ * set.match("abcdef", :exception => true) # => [0, 1]
1670
+ */
1671
+ static VALUE re2_set_match(int argc, VALUE *argv, VALUE self) {
1672
+ VALUE str, options, exception_option;
1673
+ bool raise_exception = true;
1674
+ rb_scan_args(argc, argv, "11", &str, &options);
1675
+ Check_Type(str, T_STRING);
1676
+ re2::StringPiece data(RSTRING_PTR(str), RSTRING_LEN(str));
1677
+ std::vector<int> v;
1678
+ re2_set *s;
1679
+ Data_Get_Struct(self, re2_set, s);
1680
+
1681
+ if (RTEST(options)) {
1682
+ Check_Type(options, T_HASH);
1683
+
1684
+ exception_option = rb_hash_aref(options, ID2SYM(id_exception));
1685
+ if (!NIL_P(exception_option)) {
1686
+ raise_exception = RTEST(exception_option);
1687
+ }
1688
+ }
1689
+
1690
+ if (raise_exception) {
1691
+ #ifdef HAVE_ERROR_INFO_ARGUMENT
1692
+ RE2::Set::ErrorInfo e;
1693
+ bool match_failed = !s->set->Match(data, &v, &e);
1694
+ VALUE result = rb_ary_new2(v.size());
1695
+
1696
+ if (match_failed) {
1697
+ switch (e.kind) {
1698
+ case RE2::Set::kNoError:
1699
+ break;
1700
+ case RE2::Set::kNotCompiled:
1701
+ rb_raise(re2_eSetMatchError, "#match must not be called before #compile");
1702
+ case RE2::Set::kOutOfMemory:
1703
+ rb_raise(re2_eSetMatchError, "The DFA ran out of memory");
1704
+ case RE2::Set::kInconsistent:
1705
+ rb_raise(re2_eSetMatchError, "RE2::Prog internal error");
1706
+ default: // Just in case a future version of libre2 adds new ErrorKinds
1707
+ rb_raise(re2_eSetMatchError, "Unknown RE2::Set::ErrorKind: %d", e.kind);
1708
+ }
1709
+ } else {
1710
+ for (size_t i = 0; i < v.size(); i++) {
1711
+ rb_ary_push(result, INT2FIX(v[i]));
1712
+ }
1713
+ }
1714
+
1715
+ return result;
1716
+ #else
1717
+ rb_raise(re2_eSetUnsupportedError, "current version of RE2::Set::Match() does not output error information, :exception option can only be set to false");
1718
+ #endif
1719
+ } else {
1720
+ bool matched = s->set->Match(data, &v);
1721
+ VALUE result = rb_ary_new2(v.size());
1722
+
1723
+ if (matched) {
1724
+ for (size_t i = 0; i < v.size(); i++) {
1725
+ rb_ary_push(result, INT2FIX(v[i]));
1726
+ }
1727
+ }
1728
+
1729
+ return result;
1730
+ }
1731
+ }
1732
+
1365
1733
  /* Forward declare Init_re2 to be called by C code but define it separately so
1366
1734
  * that YARD can parse it.
1367
1735
  */
@@ -1372,12 +1740,18 @@ void Init_re2(void) {
1372
1740
  re2_cRegexp = rb_define_class_under(re2_mRE2, "Regexp", rb_cObject);
1373
1741
  re2_cMatchData = rb_define_class_under(re2_mRE2, "MatchData", rb_cObject);
1374
1742
  re2_cScanner = rb_define_class_under(re2_mRE2, "Scanner", rb_cObject);
1743
+ re2_cSet = rb_define_class_under(re2_mRE2, "Set", rb_cObject);
1744
+ re2_eSetMatchError = rb_define_class_under(re2_cSet, "MatchError",
1745
+ rb_const_get(rb_cObject, rb_intern("StandardError")));
1746
+ re2_eSetUnsupportedError = rb_define_class_under(re2_cSet, "UnsupportedError",
1747
+ rb_const_get(rb_cObject, rb_intern("StandardError")));
1375
1748
 
1376
1749
  rb_define_alloc_func(re2_cRegexp, (VALUE (*)(VALUE))re2_regexp_allocate);
1377
1750
  rb_define_alloc_func(re2_cMatchData,
1378
1751
  (VALUE (*)(VALUE))re2_matchdata_allocate);
1379
1752
  rb_define_alloc_func(re2_cScanner,
1380
1753
  (VALUE (*)(VALUE))re2_scanner_allocate);
1754
+ rb_define_alloc_func(re2_cSet, (VALUE (*)(VALUE))re2_set_allocate);
1381
1755
 
1382
1756
  rb_define_method(re2_cMatchData, "string",
1383
1757
  RUBY_METHOD_FUNC(re2_matchdata_string), 0);
@@ -1394,10 +1768,15 @@ void Init_re2(void) {
1394
1768
  rb_define_method(re2_cMatchData, "end",
1395
1769
  RUBY_METHOD_FUNC(re2_matchdata_end), 1);
1396
1770
  rb_define_method(re2_cMatchData, "[]", RUBY_METHOD_FUNC(re2_matchdata_aref),
1397
- -1); rb_define_method(re2_cMatchData, "to_s",
1771
+ -1);
1772
+ rb_define_method(re2_cMatchData, "to_s",
1398
1773
  RUBY_METHOD_FUNC(re2_matchdata_to_s), 0);
1399
1774
  rb_define_method(re2_cMatchData, "inspect",
1400
1775
  RUBY_METHOD_FUNC(re2_matchdata_inspect), 0);
1776
+ rb_define_method(re2_cMatchData, "deconstruct",
1777
+ RUBY_METHOD_FUNC(re2_matchdata_deconstruct), 0);
1778
+ rb_define_method(re2_cMatchData, "deconstruct_keys",
1779
+ RUBY_METHOD_FUNC(re2_matchdata_deconstruct_keys), 1);
1401
1780
 
1402
1781
  rb_define_method(re2_cScanner, "string",
1403
1782
  RUBY_METHOD_FUNC(re2_scanner_string), 0);
@@ -1428,11 +1807,11 @@ void Init_re2(void) {
1428
1807
  rb_define_method(re2_cRegexp, "match", RUBY_METHOD_FUNC(re2_regexp_match),
1429
1808
  -1);
1430
1809
  rb_define_method(re2_cRegexp, "match?",
1431
- RUBY_METHOD_FUNC(re2_regexp_match_query), 1);
1810
+ RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
1432
1811
  rb_define_method(re2_cRegexp, "=~",
1433
- RUBY_METHOD_FUNC(re2_regexp_match_query), 1);
1812
+ RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
1434
1813
  rb_define_method(re2_cRegexp, "===",
1435
- RUBY_METHOD_FUNC(re2_regexp_match_query), 1);
1814
+ RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
1436
1815
  rb_define_method(re2_cRegexp, "scan",
1437
1816
  RUBY_METHOD_FUNC(re2_regexp_scan), 1);
1438
1817
  rb_define_method(re2_cRegexp, "to_s", RUBY_METHOD_FUNC(re2_regexp_to_s), 0);
@@ -1471,6 +1850,14 @@ void Init_re2(void) {
1471
1850
  rb_define_method(re2_cRegexp, "one_line?",
1472
1851
  RUBY_METHOD_FUNC(re2_regexp_one_line), 0);
1473
1852
 
1853
+ rb_define_singleton_method(re2_cSet, "match_raises_errors?",
1854
+ RUBY_METHOD_FUNC(re2_set_match_raises_errors_p), 0);
1855
+ rb_define_method(re2_cSet, "initialize",
1856
+ RUBY_METHOD_FUNC(re2_set_initialize), -1);
1857
+ rb_define_method(re2_cSet, "add", RUBY_METHOD_FUNC(re2_set_add), 1);
1858
+ rb_define_method(re2_cSet, "compile", RUBY_METHOD_FUNC(re2_set_compile), 0);
1859
+ rb_define_method(re2_cSet, "match", RUBY_METHOD_FUNC(re2_set_match), -1);
1860
+
1474
1861
  rb_define_module_function(re2_mRE2, "Replace",
1475
1862
  RUBY_METHOD_FUNC(re2_Replace), 3);
1476
1863
  rb_define_module_function(re2_mRE2, "GlobalReplace",
@@ -1498,6 +1885,10 @@ void Init_re2(void) {
1498
1885
  id_perl_classes = rb_intern("perl_classes");
1499
1886
  id_word_boundary = rb_intern("word_boundary");
1500
1887
  id_one_line = rb_intern("one_line");
1888
+ id_unanchored = rb_intern("unanchored");
1889
+ id_anchor_start = rb_intern("anchor_start");
1890
+ id_anchor_both = rb_intern("anchor_both");
1891
+ id_exception = rb_intern("exception");
1501
1892
 
1502
1893
  #if 0
1503
1894
  /* Fake so YARD generates the file. */
@@ -241,4 +241,62 @@ RSpec.describe RE2::MatchData do
241
241
  expect(md.end(:foo)).to be_nil
242
242
  end
243
243
  end
244
+
245
+ describe "#deconstruct" do
246
+ it "returns all capturing groups" do
247
+ md = RE2::Regexp.new('w(o)(o)').match('woo')
248
+
249
+ expect(md.deconstruct).to eq(['o', 'o'])
250
+ end
251
+
252
+ it "includes optional capturing groups as nil" do
253
+ md = RE2::Regexp.new('w(.)(.)(.)?').match('woo')
254
+
255
+ expect(md.deconstruct).to eq(['o', 'o', nil])
256
+ end
257
+ end
258
+
259
+ describe "#deconstruct_keys" do
260
+ it "returns all named captures if given nil" do
261
+ md = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
262
+
263
+ expect(md.deconstruct_keys(nil)).to eq(:numbers => '123', :letters => 'abc')
264
+ end
265
+
266
+ it "returns only named captures if given names" do
267
+ md = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
268
+
269
+ expect(md.deconstruct_keys([:numbers])).to eq(:numbers => '123')
270
+ end
271
+
272
+ it "returns named captures up until an invalid name is given" do
273
+ md = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
274
+
275
+ expect(md.deconstruct_keys([:numbers, :punctuation])).to eq(:numbers => '123')
276
+ end
277
+
278
+ it "returns an empty hash if given more capture names than exist" do
279
+ md = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
280
+
281
+ expect(md.deconstruct_keys([:numbers, :letters, :punctuation])).to eq({})
282
+ end
283
+
284
+ it "returns an empty hash if there are no named capturing groups" do
285
+ md = RE2::Regexp.new('(\d+) ([a-zA-Z]+)').match('123 abc')
286
+
287
+ expect(md.deconstruct_keys(nil)).to eq({})
288
+ end
289
+
290
+ it "raises an error if given a non-array of keys" do
291
+ md = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
292
+
293
+ expect { md.deconstruct_keys(0) }.to raise_error(TypeError)
294
+ end
295
+
296
+ it "raises an error if given keys as non-symbols" do
297
+ md = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
298
+
299
+ expect { md.deconstruct_keys([0]) }.to raise_error(TypeError)
300
+ end
301
+ end
244
302
  end
@@ -0,0 +1,168 @@
1
+ RSpec.describe RE2::Set do
2
+ describe "#initialize" do
3
+ it "returns an instance given no args" do
4
+ set = RE2::Set.new
5
+
6
+ expect(set).to be_a(RE2::Set)
7
+ end
8
+
9
+ it "returns an instance given only an anchor of :unanchored" do
10
+ set = RE2::Set.new(:unanchored)
11
+
12
+ expect(set).to be_a(RE2::Set)
13
+ end
14
+
15
+ it "returns an instance given only an anchor of :anchor_start" do
16
+ set = RE2::Set.new(:anchor_start)
17
+
18
+ expect(set).to be_a(RE2::Set)
19
+ end
20
+
21
+ it "returns an instance given only an anchor of :anchor_both" do
22
+ set = RE2::Set.new(:anchor_both)
23
+
24
+ expect(set).to be_a(RE2::Set)
25
+ end
26
+
27
+ it "returns an instance given an anchor and options" do
28
+ set = RE2::Set.new(:unanchored, :case_sensitive => false)
29
+
30
+ expect(set).to be_a(RE2::Set)
31
+ end
32
+
33
+ it "raises an error if given an inappropriate type" do
34
+ expect { RE2::Set.new(0) }.to raise_error(TypeError)
35
+ end
36
+
37
+ it "raises an error if given an invalid anchor" do
38
+ expect { RE2::Set.new(:not_a_valid_anchor) }.to raise_error(
39
+ ArgumentError,
40
+ "anchor should be one of: :unanchored, :anchor_start, :anchor_both"
41
+ )
42
+ end
43
+ end
44
+
45
+ describe "#add" do
46
+ it "allows multiple patterns to be added", :aggregate_failures do
47
+ set = RE2::Set.new
48
+
49
+ expect(set.add("abc")).to eq(0)
50
+ expect(set.add("def")).to eq(1)
51
+ expect(set.add("ghi")).to eq(2)
52
+ end
53
+
54
+ it "rejects invalid patterns when added" do
55
+ set = RE2::Set.new(:unanchored, :log_errors => false)
56
+
57
+ expect { set.add("???") }.to raise_error(ArgumentError, /str rejected by RE2::Set->Add()/)
58
+ end
59
+
60
+ it "raises an error if called after #compile" do
61
+ set = RE2::Set.new(:unanchored, :log_errors => false)
62
+ set.add("abc")
63
+ set.compile
64
+
65
+ silence_stderr do
66
+ expect { set.add("def") }.to raise_error(ArgumentError)
67
+ end
68
+ end
69
+
70
+ it "raises an error if given a non-string pattern" do
71
+ set = RE2::Set.new(:unanchored, :log_errors => false)
72
+
73
+ expect { set.add(0) }.to raise_error(TypeError)
74
+ end
75
+ end
76
+
77
+ describe "#compile" do
78
+ it "compiles the set without error" do
79
+ set = RE2::Set.new
80
+ set.add("abc")
81
+ set.add("def")
82
+ set.add("ghi")
83
+
84
+ expect(set.compile).to be_truthy
85
+ end
86
+ end
87
+
88
+ describe "#match" do
89
+ it "matches against multiple patterns" do
90
+ set = RE2::Set.new
91
+ set.add("abc")
92
+ set.add("def")
93
+ set.add("ghi")
94
+ set.compile
95
+
96
+ expect(set.match("abcdefghi", :exception => false)).to eq([0, 1, 2])
97
+ end
98
+
99
+ it "raises an error if called before #compile by default" do
100
+ skip "Underlying RE2::Set::Match does not output error information" unless RE2::Set.match_raises_errors?
101
+
102
+ set = RE2::Set.new(:unanchored, :log_errors => false)
103
+
104
+ silence_stderr do
105
+ expect { set.match("") }.to raise_error(RE2::Set::MatchError)
106
+ end
107
+ end
108
+
109
+ it "raises an error if called before #compile when :exception is true" do
110
+ skip "Underlying RE2::Set::Match does not output error information" unless RE2::Set.match_raises_errors?
111
+
112
+ set = RE2::Set.new(:unanchored, :log_errors => false)
113
+
114
+ silence_stderr do
115
+ expect { set.match("", :exception => true) }.to raise_error(RE2::Set::MatchError)
116
+ end
117
+ end
118
+
119
+ it "returns an empty array if called before #compile when :exception is false" do
120
+ set = RE2::Set.new(:unanchored, :log_errors => false)
121
+
122
+ silence_stderr do
123
+ expect(set.match("", :exception => false)).to be_empty
124
+ end
125
+ end
126
+
127
+ it "raises an error if :exception is true and re2 does not support it" do
128
+ skip "Underlying RE2::Set::Match outputs error information" if RE2::Set.match_raises_errors?
129
+
130
+ set = RE2::Set.new(:unanchored, :log_errors => false)
131
+
132
+ silence_stderr do
133
+ expect { set.match("", :exception => true) }.to raise_error(RE2::Set::UnsupportedError)
134
+ end
135
+ end
136
+
137
+ it "raises an error if given non-hash options" do
138
+ set = RE2::Set.new
139
+
140
+ expect { set.match("", 0) }.to raise_error(TypeError)
141
+ end
142
+ end
143
+
144
+ def silence_stderr
145
+ original_stream = STDERR
146
+
147
+ if File.const_defined?(:NULL)
148
+ STDERR.reopen(File::NULL)
149
+ else
150
+ platform = RUBY_PLATFORM == 'java' ? RbConfig::CONFIG['host_os'] : RUBY_PLATFORM
151
+
152
+ case platform
153
+ when /mswin|mingw/i
154
+ STDERR.reopen('NUL')
155
+ when /amiga/i
156
+ STDERR.reopen('NIL')
157
+ when /openvms/i
158
+ STDERR.reopen('NL:')
159
+ else
160
+ STDERR.reopen('/dev/null')
161
+ end
162
+ end
163
+
164
+ yield
165
+ ensure
166
+ STDERR.reopen(original_stream)
167
+ end
168
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: re2
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Mucur
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-29 00:00:00.000000000 Z
11
+ date: 2022-10-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake-compiler
@@ -57,6 +57,7 @@ files:
57
57
  - spec/re2/match_data_spec.rb
58
58
  - spec/re2/regexp_spec.rb
59
59
  - spec/re2/scanner_spec.rb
60
+ - spec/re2/set_spec.rb
60
61
  - spec/re2/string_spec.rb
61
62
  - spec/re2_spec.rb
62
63
  - spec/spec_helper.rb
@@ -79,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
79
80
  - !ruby/object:Gem::Version
80
81
  version: '0'
81
82
  requirements: []
82
- rubygems_version: 3.2.3
83
+ rubygems_version: 3.3.7
83
84
  signing_key:
84
85
  specification_version: 4
85
86
  summary: Ruby bindings to re2.
@@ -90,4 +91,5 @@ test_files:
90
91
  - spec/re2/regexp_spec.rb
91
92
  - spec/re2/match_data_spec.rb
92
93
  - spec/re2/string_spec.rb
94
+ - spec/re2/set_spec.rb
93
95
  - spec/re2/scanner_spec.rb