re2 2.23.0 → 2.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/re2/re2.cc CHANGED
@@ -8,7 +8,7 @@
8
8
  * Released under the BSD Licence, please see LICENSE.txt
9
9
  */
10
10
 
11
- #include <stdint.h>
11
+ #include <cstdint>
12
12
 
13
13
  #include <map>
14
14
  #include <sstream>
@@ -19,6 +19,7 @@
19
19
  #include <re2/set.h>
20
20
  #include <ruby.h>
21
21
  #include <ruby/encoding.h>
22
+ #include <ruby/thread.h>
22
23
 
23
24
  #define BOOL2RUBY(v) (v ? Qtrue : Qfalse)
24
25
 
@@ -43,6 +44,132 @@ typedef struct {
43
44
  RE2::Set *set;
44
45
  } re2_set;
45
46
 
47
+ struct nogvl_match_arg {
48
+ const RE2 *pattern;
49
+ re2::StringPiece text;
50
+ size_t startpos;
51
+ size_t endpos;
52
+ RE2::Anchor anchor;
53
+ re2::StringPiece *matches;
54
+ int n;
55
+ bool matched;
56
+ };
57
+
58
+ static void *nogvl_match(void *ptr) {
59
+ auto *arg = static_cast<nogvl_match_arg *>(ptr);
60
+ #ifdef HAVE_ENDPOS_ARGUMENT
61
+ arg->matched = arg->pattern->Match(
62
+ arg->text, arg->startpos, arg->endpos,
63
+ arg->anchor, arg->matches, arg->n);
64
+ #else
65
+ arg->matched = arg->pattern->Match(
66
+ arg->text, arg->startpos,
67
+ arg->anchor, arg->matches, arg->n);
68
+ #endif
69
+ return nullptr;
70
+ }
71
+
72
+ static bool re2_match_without_gvl(
73
+ const RE2 *pattern, VALUE text, size_t startpos, size_t endpos,
74
+ RE2::Anchor anchor, re2::StringPiece *matches, int n) {
75
+ nogvl_match_arg arg;
76
+ arg.pattern = pattern;
77
+ arg.text = re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text));
78
+ arg.startpos = startpos;
79
+ arg.endpos = endpos;
80
+ arg.anchor = anchor;
81
+ arg.matches = matches;
82
+ arg.n = n;
83
+ arg.matched = false;
84
+
85
+ /* Abseil's synchronization primitives (SRWLOCK, SleepConditionVariableSRW)
86
+ * are incompatible with Ruby's Win32 Mutex-based GVL, causing
87
+ * WAIT_ABANDONED crashes when multiple threads match concurrently.
88
+ */
89
+ #ifdef _WIN32
90
+ nogvl_match(&arg);
91
+ #else
92
+ /* No unblocking function is needed: RE2 matching is CPU-bound computation,
93
+ * not a blocking system call, so a signal cannot safely interrupt it.
94
+ */
95
+ rb_thread_call_without_gvl(nogvl_match, &arg, NULL, NULL);
96
+ #endif
97
+
98
+ return arg.matched;
99
+ }
100
+
101
+ struct nogvl_set_match_arg {
102
+ const RE2::Set *set;
103
+ re2::StringPiece text;
104
+ std::vector<int> *v;
105
+ #ifdef HAVE_ERROR_INFO_ARGUMENT
106
+ RE2::Set::ErrorInfo *error_info;
107
+ #endif
108
+ bool matched;
109
+ };
110
+
111
+ static void *nogvl_set_match(void *ptr) {
112
+ auto *arg = static_cast<nogvl_set_match_arg *>(ptr);
113
+ #ifdef HAVE_ERROR_INFO_ARGUMENT
114
+ if (arg->error_info) {
115
+ arg->matched = arg->set->Match(arg->text, arg->v, arg->error_info);
116
+ } else {
117
+ arg->matched = arg->set->Match(arg->text, arg->v);
118
+ }
119
+ #else
120
+ arg->matched = arg->set->Match(arg->text, arg->v);
121
+ #endif
122
+ return nullptr;
123
+ }
124
+
125
+ struct nogvl_replace_arg {
126
+ std::string *str;
127
+ const RE2 *pattern;
128
+ re2::StringPiece string_pattern;
129
+ re2::StringPiece rewrite;
130
+ };
131
+
132
+ static void *nogvl_replace(void *ptr) {
133
+ auto *arg = static_cast<nogvl_replace_arg *>(ptr);
134
+ if (arg->pattern) {
135
+ RE2::Replace(arg->str, *arg->pattern, arg->rewrite);
136
+ } else {
137
+ RE2::Replace(arg->str, arg->string_pattern, arg->rewrite);
138
+ }
139
+ return nullptr;
140
+ }
141
+
142
+ static void *nogvl_global_replace(void *ptr) {
143
+ auto *arg = static_cast<nogvl_replace_arg *>(ptr);
144
+ if (arg->pattern) {
145
+ RE2::GlobalReplace(arg->str, *arg->pattern, arg->rewrite);
146
+ } else {
147
+ RE2::GlobalReplace(arg->str, arg->string_pattern, arg->rewrite);
148
+ }
149
+ return nullptr;
150
+ }
151
+
152
+ struct nogvl_extract_arg {
153
+ re2::StringPiece text;
154
+ const RE2 *pattern;
155
+ re2::StringPiece string_pattern;
156
+ re2::StringPiece rewrite;
157
+ std::string *out;
158
+ bool extracted;
159
+ };
160
+
161
+ static void *nogvl_extract(void *ptr) {
162
+ auto *arg = static_cast<nogvl_extract_arg *>(ptr);
163
+ if (arg->pattern) {
164
+ arg->extracted = RE2::Extract(arg->text, *arg->pattern,
165
+ arg->rewrite, arg->out);
166
+ } else {
167
+ arg->extracted = RE2::Extract(arg->text, RE2(arg->string_pattern),
168
+ arg->rewrite, arg->out);
169
+ }
170
+ return nullptr;
171
+ }
172
+
46
173
  VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cScanner, re2_cSet,
47
174
  re2_eSetMatchError, re2_eSetUnsupportedError, re2_eRegexpUnsupportedError;
48
175
 
@@ -51,7 +178,7 @@ static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors,
51
178
  id_max_mem, id_literal, id_never_nl, id_case_sensitive,
52
179
  id_perl_classes, id_word_boundary, id_one_line, id_unanchored,
53
180
  id_anchor, id_anchor_start, id_anchor_both, id_exception,
54
- id_submatches, id_startpos, id_endpos;
181
+ id_submatches, id_startpos, id_endpos, id_symbolize_names;
55
182
 
56
183
  inline VALUE encoded_str_new(const char *str, long length, RE2::Options::Encoding encoding) {
57
184
  if (encoding == RE2::Options::EncodingUTF8) {
@@ -126,18 +253,22 @@ static void parse_re2_options(RE2::Options* re2_options, const VALUE options) {
126
253
  }
127
254
 
128
255
  static void re2_matchdata_mark(void *ptr) {
129
- re2_matchdata *m = reinterpret_cast<re2_matchdata *>(ptr);
256
+ re2_matchdata *m = static_cast<re2_matchdata *>(ptr);
130
257
  rb_gc_mark_movable(m->regexp);
258
+
259
+ /* Text must not be movable because StringPiece matches hold pointers into
260
+ * its underlying buffer; moving the string would invalidate them.
261
+ */
131
262
  rb_gc_mark(m->text);
132
263
  }
133
264
 
134
265
  static void re2_matchdata_compact(void *ptr) {
135
- re2_matchdata *m = reinterpret_cast<re2_matchdata *>(ptr);
266
+ re2_matchdata *m = static_cast<re2_matchdata *>(ptr);
136
267
  m->regexp = rb_gc_location(m->regexp);
137
268
  }
138
269
 
139
270
  static void re2_matchdata_free(void *ptr) {
140
- re2_matchdata *m = reinterpret_cast<re2_matchdata *>(ptr);
271
+ re2_matchdata *m = static_cast<re2_matchdata *>(ptr);
141
272
  if (m->matches) {
142
273
  delete[] m->matches;
143
274
  }
@@ -145,7 +276,7 @@ static void re2_matchdata_free(void *ptr) {
145
276
  }
146
277
 
147
278
  static size_t re2_matchdata_memsize(const void *ptr) {
148
- const re2_matchdata *m = reinterpret_cast<const re2_matchdata *>(ptr);
279
+ const re2_matchdata *m = static_cast<const re2_matchdata *>(ptr);
149
280
  size_t size = sizeof(*m);
150
281
  if (m->matches) {
151
282
  size += sizeof(*m->matches) * m->number_of_matches;
@@ -170,18 +301,22 @@ static const rb_data_type_t re2_matchdata_data_type = {
170
301
  };
171
302
 
172
303
  static void re2_scanner_mark(void *ptr) {
173
- re2_scanner *s = reinterpret_cast<re2_scanner *>(ptr);
304
+ re2_scanner *s = static_cast<re2_scanner *>(ptr);
174
305
  rb_gc_mark_movable(s->regexp);
306
+
307
+ /* Text must not be movable because the StringPiece input holds a pointer
308
+ * into its underlying buffer; moving the string would invalidate it.
309
+ */
175
310
  rb_gc_mark(s->text);
176
311
  }
177
312
 
178
313
  static void re2_scanner_compact(void *ptr) {
179
- re2_scanner *s = reinterpret_cast<re2_scanner *>(ptr);
314
+ re2_scanner *s = static_cast<re2_scanner *>(ptr);
180
315
  s->regexp = rb_gc_location(s->regexp);
181
316
  }
182
317
 
183
318
  static void re2_scanner_free(void *ptr) {
184
- re2_scanner *s = reinterpret_cast<re2_scanner *>(ptr);
319
+ re2_scanner *s = static_cast<re2_scanner *>(ptr);
185
320
  if (s->input) {
186
321
  delete s->input;
187
322
  }
@@ -189,7 +324,7 @@ static void re2_scanner_free(void *ptr) {
189
324
  }
190
325
 
191
326
  static size_t re2_scanner_memsize(const void *ptr) {
192
- const re2_scanner *s = reinterpret_cast<const re2_scanner *>(ptr);
327
+ const re2_scanner *s = static_cast<const re2_scanner *>(ptr);
193
328
  size_t size = sizeof(*s);
194
329
  if (s->input) {
195
330
  size += sizeof(*s->input);
@@ -214,7 +349,7 @@ static const rb_data_type_t re2_scanner_data_type = {
214
349
  };
215
350
 
216
351
  static void re2_regexp_free(void *ptr) {
217
- re2_pattern *p = reinterpret_cast<re2_pattern *>(ptr);
352
+ re2_pattern *p = static_cast<re2_pattern *>(ptr);
218
353
  if (p->pattern) {
219
354
  delete p->pattern;
220
355
  }
@@ -222,7 +357,7 @@ static void re2_regexp_free(void *ptr) {
222
357
  }
223
358
 
224
359
  static size_t re2_regexp_memsize(const void *ptr) {
225
- const re2_pattern *p = reinterpret_cast<const re2_pattern *>(ptr);
360
+ const re2_pattern *p = static_cast<const re2_pattern *>(ptr);
226
361
  size_t size = sizeof(*p);
227
362
  if (p->pattern) {
228
363
  size += sizeof(*p->pattern);
@@ -242,9 +377,64 @@ static const rb_data_type_t re2_regexp_data_type = {
242
377
  0,
243
378
  // IMPORTANT: WB_PROTECTED objects must only use the RB_OBJ_WRITE()
244
379
  // macro to update VALUE references, as to trigger write barriers.
245
- RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
380
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE
246
381
  };
247
382
 
383
+ static re2_pattern *unwrap_re2_regexp(VALUE self) {
384
+ re2_pattern *p;
385
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
386
+ if (!p->pattern) {
387
+ rb_raise(rb_eTypeError, "uninitialized RE2::Regexp");
388
+ }
389
+ return p;
390
+ }
391
+
392
+ static re2_matchdata *unwrap_re2_matchdata(VALUE self) {
393
+ re2_matchdata *m;
394
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
395
+ if (!RTEST(m->regexp)) {
396
+ rb_raise(rb_eTypeError, "uninitialized RE2::MatchData");
397
+ }
398
+ return m;
399
+ }
400
+
401
+ static re2_scanner *unwrap_re2_scanner(VALUE self) {
402
+ re2_scanner *c;
403
+ TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
404
+ if (!RTEST(c->regexp)) {
405
+ rb_raise(rb_eTypeError, "uninitialized RE2::Scanner");
406
+ }
407
+ return c;
408
+ }
409
+
410
+ /*
411
+ * Returns an array of names of all named capturing groups. Names are returned
412
+ * in alphabetical order rather than definition order, as RE2 stores named
413
+ * groups internally in a sorted map.
414
+ *
415
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
416
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
417
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
418
+ *
419
+ * @return [Array<String>] an array of names of named capturing groups
420
+ * @example
421
+ * RE2::Regexp.new('(?P<a>\d+) (?P<b>\w+)').names #=> ["a", "b"]
422
+ */
423
+ static VALUE re2_regexp_names(const VALUE self) {
424
+ re2_pattern *p = unwrap_re2_regexp(self);
425
+
426
+ const auto& groups = p->pattern->NamedCapturingGroups();
427
+ VALUE names = rb_ary_new2(groups.size());
428
+
429
+ for (const auto& group : groups) {
430
+ rb_ary_push(names,
431
+ encoded_str_new(group.first.data(), group.first.size(),
432
+ p->pattern->options().encoding()));
433
+ }
434
+
435
+ return names;
436
+ }
437
+
248
438
  static VALUE re2_matchdata_allocate(VALUE klass) {
249
439
  re2_matchdata *m;
250
440
 
@@ -269,8 +459,7 @@ static VALUE re2_scanner_allocate(VALUE klass) {
269
459
  * m.string #=> "bob 123"
270
460
  */
271
461
  static VALUE re2_matchdata_string(const VALUE self) {
272
- re2_matchdata *m;
273
- TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
462
+ re2_matchdata *m = unwrap_re2_matchdata(self);
274
463
 
275
464
  return m->text;
276
465
  }
@@ -287,8 +476,7 @@ static VALUE re2_matchdata_string(const VALUE self) {
287
476
  * c.string #=> "foo"
288
477
  */
289
478
  static VALUE re2_scanner_string(const VALUE self) {
290
- re2_scanner *c;
291
- TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
479
+ re2_scanner *c = unwrap_re2_scanner(self);
292
480
 
293
481
  return c->text;
294
482
  }
@@ -302,8 +490,7 @@ static VALUE re2_scanner_string(const VALUE self) {
302
490
  * c.eof? #=> true
303
491
  */
304
492
  static VALUE re2_scanner_eof(const VALUE self) {
305
- re2_scanner *c;
306
- TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
493
+ re2_scanner *c = unwrap_re2_scanner(self);
307
494
 
308
495
  return BOOL2RUBY(c->eof);
309
496
  }
@@ -320,13 +507,12 @@ static VALUE re2_scanner_eof(const VALUE self) {
320
507
  * e.scan #=> ["1"]
321
508
  */
322
509
  static VALUE re2_scanner_rewind(VALUE self) {
323
- re2_scanner *c;
324
- TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
510
+ re2_scanner *c = unwrap_re2_scanner(self);
325
511
 
326
512
  delete c->input;
327
513
  c->input = new(std::nothrow) re2::StringPiece(
328
514
  RSTRING_PTR(c->text), RSTRING_LEN(c->text));
329
- if (c->input == 0) {
515
+ if (c->input == nullptr) {
330
516
  rb_raise(rb_eNoMemError,
331
517
  "not enough memory to allocate StringPiece for input");
332
518
  }
@@ -336,6 +522,35 @@ static VALUE re2_scanner_rewind(VALUE self) {
336
522
  return self;
337
523
  }
338
524
 
525
+ static VALUE re2_scanner_initialize_copy(VALUE self, VALUE other) {
526
+ re2_scanner *self_c;
527
+ re2_scanner *other_c = unwrap_re2_scanner(other);
528
+
529
+ TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, self_c);
530
+
531
+ if (self_c->input) {
532
+ delete self_c->input;
533
+ self_c->input = nullptr;
534
+ }
535
+
536
+ RB_OBJ_WRITE(self, &self_c->regexp, other_c->regexp);
537
+ RB_OBJ_WRITE(self, &self_c->text, other_c->text);
538
+ self_c->number_of_capturing_groups = other_c->number_of_capturing_groups;
539
+ self_c->eof = other_c->eof;
540
+
541
+ if (other_c->input) {
542
+ self_c->input = new(std::nothrow) re2::StringPiece(*other_c->input);
543
+ if (self_c->input == nullptr) {
544
+ rb_raise(rb_eNoMemError,
545
+ "not enough memory to allocate StringPiece for input");
546
+ }
547
+ } else {
548
+ self_c->input = nullptr;
549
+ }
550
+
551
+ return self;
552
+ }
553
+
339
554
  /*
340
555
  * Scan the given text incrementally for matches using
341
556
  * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L447-L463
@@ -356,11 +571,8 @@ static VALUE re2_scanner_rewind(VALUE self) {
356
571
  * s.scan #=> ["bar"]
357
572
  */
358
573
  static VALUE re2_scanner_scan(VALUE self) {
359
- re2_pattern *p;
360
- re2_scanner *c;
361
-
362
- TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
363
- TypedData_Get_Struct(c->regexp, re2_pattern, &re2_regexp_data_type, p);
574
+ re2_scanner *c = unwrap_re2_scanner(self);
575
+ re2_pattern *p = unwrap_re2_regexp(c->regexp);
364
576
 
365
577
  std::vector<RE2::Arg> argv(c->number_of_capturing_groups);
366
578
  std::vector<RE2::Arg*> args(c->number_of_capturing_groups);
@@ -385,7 +597,7 @@ static VALUE re2_scanner_scan(VALUE self) {
385
597
  VALUE result = rb_ary_new2(c->number_of_capturing_groups);
386
598
 
387
599
  for (int i = 0; i < c->number_of_capturing_groups; ++i) {
388
- if (matches[i].empty()) {
600
+ if (matches[i].data() == nullptr) {
389
601
  rb_ary_push(result, Qnil);
390
602
  } else {
391
603
  rb_ary_push(result, encoded_str_new(matches[i].data(),
@@ -397,9 +609,27 @@ static VALUE re2_scanner_scan(VALUE self) {
397
609
  /* Check whether we've exhausted the input yet. */
398
610
  c->eof = new_input_size == 0;
399
611
 
400
- /* If the match didn't advance the input, we need to do this ourselves. */
612
+ /* If the match didn't advance the input, we need to do this ourselves,
613
+ * advancing by a whole character to avoid splitting multi-byte characters.
614
+ *
615
+ * The lookup table approach is taken from RE2's own Python extension: the
616
+ * high 4 bits of a UTF-8 lead byte determine the character's byte length.
617
+ *
618
+ * See https://github.com/google/re2/blob/972a15cedd008d846f1a39b2e88ce48d7f166cbd/python/_re2.cc#L46-L48
619
+ */
401
620
  if (!input_advanced && new_input_size > 0) {
402
- c->input->remove_prefix(1);
621
+ size_t char_size = 1;
622
+
623
+ if (p->pattern->options().encoding() == RE2::Options::EncodingUTF8) {
624
+ char_size = "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"
625
+ [((*c->input)[0] & 0xFF) >> 4];
626
+
627
+ if (char_size > new_input_size) {
628
+ char_size = new_input_size;
629
+ }
630
+ }
631
+
632
+ c->input->remove_prefix(char_size);
403
633
  }
404
634
 
405
635
  return result;
@@ -409,47 +639,44 @@ static VALUE re2_scanner_scan(VALUE self) {
409
639
  }
410
640
 
411
641
  static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) {
412
- re2_matchdata *m;
413
- re2_pattern *p;
414
-
415
- TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
416
- TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
642
+ re2_matchdata *m = unwrap_re2_matchdata(self);
643
+ re2_pattern *p = unwrap_re2_regexp(m->regexp);
417
644
 
418
645
  int id;
419
646
 
420
647
  if (RB_INTEGER_TYPE_P(idx)) {
421
648
  id = NUM2INT(idx);
422
649
  } else if (SYMBOL_P(idx)) {
423
- const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
424
- std::map<std::string, int>::const_iterator search = groups.find(rb_id2name(SYM2ID(idx)));
650
+ const auto& groups = p->pattern->NamedCapturingGroups();
651
+ auto search = groups.find(rb_id2name(SYM2ID(idx)));
425
652
 
426
653
  if (search != groups.end()) {
427
654
  id = search->second;
428
655
  } else {
429
- return NULL;
656
+ return nullptr;
430
657
  }
431
658
  } else {
432
659
  StringValue(idx);
433
660
 
434
- const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
435
- std::map<std::string, int>::const_iterator search = groups.find(std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)));
661
+ const auto& groups = p->pattern->NamedCapturingGroups();
662
+ auto search = groups.find(std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)));
436
663
 
437
664
  if (search != groups.end()) {
438
665
  id = search->second;
439
666
  } else {
440
- return NULL;
667
+ return nullptr;
441
668
  }
442
669
  }
443
670
 
444
671
  if (id >= 0 && id < m->number_of_matches) {
445
672
  re2::StringPiece *match = &m->matches[id];
446
673
 
447
- if (!match->empty()) {
674
+ if (match->data() != nullptr) {
448
675
  return match;
449
676
  }
450
677
  }
451
678
 
452
- return NULL;
679
+ return nullptr;
453
680
  }
454
681
 
455
682
  /*
@@ -458,14 +685,12 @@ static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) {
458
685
  *
459
686
  * @return [Integer] the number of elements
460
687
  * @example
461
- * m = RE2::Regexp.new('(\d+)').match("bob 123")
688
+ * m = RE2::Regexp.new('(\d+)').partial_match("bob 123")
462
689
  * m.size #=> 2
463
690
  * m.length #=> 2
464
691
  */
465
692
  static VALUE re2_matchdata_size(const VALUE self) {
466
- re2_matchdata *m;
467
-
468
- TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
693
+ re2_matchdata *m = unwrap_re2_matchdata(self);
469
694
 
470
695
  return INT2FIX(m->number_of_matches);
471
696
  }
@@ -477,17 +702,15 @@ static VALUE re2_matchdata_size(const VALUE self) {
477
702
  * @return [Integer, nil] the offset of the start of the match or `nil` if
478
703
  * there is no such submatch
479
704
  * @example
480
- * m = RE2::Regexp.new('ob (\d+)').match("bob 123")
705
+ * m = RE2::Regexp.new('ob (\d+)').partial_match("bob 123")
481
706
  * m.begin(0) #=> 1
482
707
  * m.begin(1) #=> 4
483
708
  */
484
709
  static VALUE re2_matchdata_begin(const VALUE self, VALUE n) {
485
- re2_matchdata *m;
486
-
487
- TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
710
+ re2_matchdata *m = unwrap_re2_matchdata(self);
488
711
 
489
712
  re2::StringPiece *match = re2_matchdata_find_match(n, self);
490
- if (match == NULL) {
713
+ if (match == nullptr) {
491
714
  return Qnil;
492
715
  } else {
493
716
  long offset = match->data() - RSTRING_PTR(m->text);
@@ -504,17 +727,15 @@ static VALUE re2_matchdata_begin(const VALUE self, VALUE n) {
504
727
  * @return [Integer, nil] the offset of the character following the end of the
505
728
  * match or `nil` if there is no such match
506
729
  * @example
507
- * m = RE2::Regexp.new('ob (\d+) b').match("bob 123 bob")
730
+ * m = RE2::Regexp.new('ob (\d+) b').partial_match("bob 123 bob")
508
731
  * m.end(0) #=> 9
509
732
  * m.end(1) #=> 7
510
733
  */
511
734
  static VALUE re2_matchdata_end(const VALUE self, VALUE n) {
512
- re2_matchdata *m;
513
-
514
- TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
735
+ re2_matchdata *m = unwrap_re2_matchdata(self);
515
736
 
516
737
  re2::StringPiece *match = re2_matchdata_find_match(n, self);
517
- if (match == NULL) {
738
+ if (match == nullptr) {
518
739
  return Qnil;
519
740
  } else {
520
741
  long offset = (match->data() - RSTRING_PTR(m->text)) + match->size();
@@ -523,17 +744,129 @@ static VALUE re2_matchdata_end(const VALUE self, VALUE n) {
523
744
  }
524
745
  }
525
746
 
747
+ /*
748
+ * Returns the portion of the original string before the match.
749
+ *
750
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
751
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
752
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
753
+ *
754
+ * @return [String] the portion of the original string before the match
755
+ * @example
756
+ * m = RE2::Regexp.new('(\d+)').partial_match("bob 123 456")
757
+ * m.pre_match #=> "bob "
758
+ */
759
+ static VALUE re2_matchdata_pre_match(const VALUE self) {
760
+ re2_matchdata *m = unwrap_re2_matchdata(self);
761
+ re2_pattern *p = unwrap_re2_regexp(m->regexp);
762
+
763
+ re2::StringPiece *match = &m->matches[0];
764
+ if (match->data() == nullptr) {
765
+ return Qnil;
766
+ }
767
+
768
+ long offset = match->data() - RSTRING_PTR(m->text);
769
+
770
+ return encoded_str_new(RSTRING_PTR(m->text), offset,
771
+ p->pattern->options().encoding());
772
+ }
773
+
774
+ /*
775
+ * Returns the portion of the original string after the match.
776
+ *
777
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
778
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
779
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
780
+ *
781
+ * @return [String] the portion of the original string after the match
782
+ * @example
783
+ * m = RE2::Regexp.new('(\d+)').partial_match("bob 123 456")
784
+ * m.post_match #=> " 456"
785
+ */
786
+ static VALUE re2_matchdata_post_match(const VALUE self) {
787
+ re2_matchdata *m = unwrap_re2_matchdata(self);
788
+ re2_pattern *p = unwrap_re2_regexp(m->regexp);
789
+
790
+ re2::StringPiece *match = &m->matches[0];
791
+ if (match->data() == nullptr) {
792
+ return Qnil;
793
+ }
794
+
795
+ long start = (match->data() - RSTRING_PTR(m->text)) + match->size();
796
+ long remaining = RSTRING_LEN(m->text) - start;
797
+
798
+ return encoded_str_new(RSTRING_PTR(m->text) + start, remaining,
799
+ p->pattern->options().encoding());
800
+ }
801
+
802
+ /*
803
+ * Returns a two-element array containing the beginning and ending offsets of
804
+ * the nth match.
805
+ *
806
+ * @param [Integer, String, Symbol] n the name or number of the match
807
+ * @return [Array<Integer>, nil] a two-element array with the beginning and
808
+ * ending offsets of the match or `nil` if there is no such match
809
+ * @example
810
+ * m = RE2::Regexp.new('ob (\d+)').partial_match("bob 123")
811
+ * m.offset(0) #=> [1, 7]
812
+ * m.offset(1) #=> [4, 7]
813
+ */
814
+ static VALUE re2_matchdata_offset(const VALUE self, VALUE n) {
815
+ re2_matchdata *m = unwrap_re2_matchdata(self);
816
+
817
+ re2::StringPiece *match = re2_matchdata_find_match(n, self);
818
+ if (match == nullptr) {
819
+ return Qnil;
820
+ }
821
+
822
+ long start = match->data() - RSTRING_PTR(m->text);
823
+ long end_pos = start + match->size();
824
+
825
+ VALUE array = rb_ary_new2(2);
826
+ rb_ary_push(array, LONG2NUM(rb_str_sublen(m->text, start)));
827
+ rb_ary_push(array, LONG2NUM(rb_str_sublen(m->text, end_pos)));
828
+
829
+ return array;
830
+ }
831
+
832
+ /*
833
+ * Returns the length of the nth match in characters. This is equivalent to
834
+ * `m[n].length` but without allocating a new string.
835
+ *
836
+ * @param [Integer, String, Symbol] n the name or number of the match
837
+ * @return [Integer, nil] the length of the match or `nil` if there is no such
838
+ * match
839
+ * @example
840
+ * m = RE2::Regexp.new('(?P<word>\w+) (?P<number>\d+)').partial_match("alice 123")
841
+ * m.match_length(0) #=> 9
842
+ * m.match_length(1) #=> 5
843
+ * m.match_length(:number) #=> 3
844
+ */
845
+ static VALUE re2_matchdata_match_length(const VALUE self, VALUE n) {
846
+ re2_matchdata *m = unwrap_re2_matchdata(self);
847
+
848
+ re2::StringPiece *match = re2_matchdata_find_match(n, self);
849
+ if (match == nullptr) {
850
+ return Qnil;
851
+ }
852
+
853
+ long start = match->data() - RSTRING_PTR(m->text);
854
+ long end_pos = start + match->size();
855
+ long char_len = rb_str_sublen(m->text, end_pos) - rb_str_sublen(m->text, start);
856
+
857
+ return LONG2NUM(char_len);
858
+ }
859
+
526
860
  /*
527
861
  * Returns the {RE2::Regexp} used in the match.
528
862
  *
529
863
  * @return [RE2::Regexp] the regular expression used in the match
530
864
  * @example
531
- * m = RE2::Regexp.new('(\d+)').match("bob 123")
865
+ * m = RE2::Regexp.new('(\d+)').partial_match("bob 123")
532
866
  * m.regexp #=> #<RE2::Regexp /(\d+)/>
533
867
  */
534
868
  static VALUE re2_matchdata_regexp(const VALUE self) {
535
- re2_matchdata *m;
536
- TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
869
+ re2_matchdata *m = unwrap_re2_matchdata(self);
537
870
 
538
871
  return m->regexp;
539
872
  }
@@ -547,8 +880,7 @@ static VALUE re2_matchdata_regexp(const VALUE self) {
547
880
  * c.regexp #=> #<RE2::Regexp /(\d+)/>
548
881
  */
549
882
  static VALUE re2_scanner_regexp(const VALUE self) {
550
- re2_scanner *c;
551
- TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
883
+ re2_scanner *c = unwrap_re2_scanner(self);
552
884
 
553
885
  return c->regexp;
554
886
  }
@@ -569,21 +901,18 @@ static VALUE re2_regexp_allocate(VALUE klass) {
569
901
  *
570
902
  * @return [Array<String, nil>] the array of matches
571
903
  * @example
572
- * m = RE2::Regexp.new('(\d+)').match("bob 123")
904
+ * m = RE2::Regexp.new('(\d+)').partial_match("bob 123")
573
905
  * m.to_a #=> ["123", "123"]
574
906
  */
575
907
  static VALUE re2_matchdata_to_a(const VALUE self) {
576
- re2_matchdata *m;
577
- re2_pattern *p;
578
-
579
- TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
580
- TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
908
+ re2_matchdata *m = unwrap_re2_matchdata(self);
909
+ re2_pattern *p = unwrap_re2_regexp(m->regexp);
581
910
 
582
911
  VALUE array = rb_ary_new2(m->number_of_matches);
583
912
  for (int i = 0; i < m->number_of_matches; ++i) {
584
913
  re2::StringPiece *match = &m->matches[i];
585
914
 
586
- if (match->empty()) {
915
+ if (match->data() == nullptr) {
587
916
  rb_ary_push(array, Qnil);
588
917
  } else {
589
918
  rb_ary_push(array, encoded_str_new(match->data(), match->size(),
@@ -595,18 +924,15 @@ static VALUE re2_matchdata_to_a(const VALUE self) {
595
924
  }
596
925
 
597
926
  static VALUE re2_matchdata_nth_match(int nth, const VALUE self) {
598
- re2_matchdata *m;
599
- re2_pattern *p;
600
-
601
- TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
602
- TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
927
+ re2_matchdata *m = unwrap_re2_matchdata(self);
928
+ re2_pattern *p = unwrap_re2_regexp(m->regexp);
603
929
 
604
930
  if (nth < 0 || nth >= m->number_of_matches) {
605
931
  return Qnil;
606
932
  } else {
607
933
  re2::StringPiece *match = &m->matches[nth];
608
934
 
609
- if (match->empty()) {
935
+ if (match->data() == nullptr) {
610
936
  return Qnil;
611
937
  } else {
612
938
  return encoded_str_new(match->data(), match->size(),
@@ -616,14 +942,11 @@ static VALUE re2_matchdata_nth_match(int nth, const VALUE self) {
616
942
  }
617
943
 
618
944
  static VALUE re2_matchdata_named_match(const std::string &name, const VALUE self) {
619
- re2_matchdata *m;
620
- re2_pattern *p;
945
+ re2_matchdata *m = unwrap_re2_matchdata(self);
946
+ re2_pattern *p = unwrap_re2_regexp(m->regexp);
621
947
 
622
- TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
623
- TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
624
-
625
- const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
626
- std::map<std::string, int>::const_iterator search = groups.find(name);
948
+ const auto& groups = p->pattern->NamedCapturingGroups();
949
+ auto search = groups.find(name);
627
950
 
628
951
  if (search != groups.end()) {
629
952
  return re2_matchdata_nth_match(search->second, self);
@@ -645,7 +968,7 @@ static VALUE re2_matchdata_named_match(const std::string &name, const VALUE self
645
968
  * @param [Integer] index the index of the match to fetch
646
969
  * @return [String, nil] the specified match or `nil` if it isn't present
647
970
  * @example
648
- * m = RE2::Regexp.new('(\d+)').match("bob 123")
971
+ * m = RE2::Regexp.new('(\d+)').partial_match("bob 123")
649
972
  * m[0] #=> "123"
650
973
  *
651
974
  * @overload [](start, length)
@@ -655,7 +978,7 @@ static VALUE re2_matchdata_named_match(const std::string &name, const VALUE self
655
978
  * @param [Integer] length the number of elements to fetch
656
979
  * @return [Array<String, nil>] the specified matches
657
980
  * @example
658
- * m = RE2::Regexp.new('(\d+)').match("bob 123")
981
+ * m = RE2::Regexp.new('(\d+)').partial_match("bob 123")
659
982
  * m[0, 1] #=> ["123"]
660
983
  *
661
984
  * @overload [](range)
@@ -664,8 +987,8 @@ static VALUE re2_matchdata_named_match(const std::string &name, const VALUE self
664
987
  * @param [Range] range the range of match indexes to fetch
665
988
  * @return [Array<String, nil>] the specified matches
666
989
  * @example
667
- * m = RE2::Regexp.new('(\d+)').match("bob 123")
668
- * m[0..1] #=> "[123", "123"]
990
+ * m = RE2::Regexp.new('(\d+)').partial_match("bob 123")
991
+ * m[0..1] #=> ["123", "123"]
669
992
  *
670
993
  * @overload [](name)
671
994
  * Access a particular match by name.
@@ -673,7 +996,7 @@ static VALUE re2_matchdata_named_match(const std::string &name, const VALUE self
673
996
  * @param [String, Symbol] name the name of the match to fetch
674
997
  * @return [String, nil] the specific match or `nil` if it isn't present
675
998
  * @example
676
- * m = RE2::Regexp.new('(?P<number>\d+)').match("bob 123")
999
+ * m = RE2::Regexp.new('(?P<number>\d+)').partial_match("bob 123")
677
1000
  * m["number"] #=> "123"
678
1001
  * m[:number] #=> "123"
679
1002
  */
@@ -697,6 +1020,9 @@ static VALUE re2_matchdata_aref(int argc, VALUE *argv, const VALUE self) {
697
1020
  * Returns the entire matched string.
698
1021
  *
699
1022
  * @return [String] the entire matched string
1023
+ * @example
1024
+ * m = RE2::Regexp.new('(?P<number>\d+)').partial_match("bob 123")
1025
+ * m.to_s #=> "123"
700
1026
  */
701
1027
  static VALUE re2_matchdata_to_s(const VALUE self) {
702
1028
  return re2_matchdata_nth_match(0, self);
@@ -711,15 +1037,12 @@ static VALUE re2_matchdata_to_s(const VALUE self) {
711
1037
  *
712
1038
  * @return [String] a printable version of the match
713
1039
  * @example
714
- * m = RE2::Regexp.new('(\d+)').match("bob 123")
1040
+ * m = RE2::Regexp.new('(\d+)').partial_match("bob 123")
715
1041
  * m.inspect #=> "#<RE2::MatchData \"123\" 1:\"123\">"
716
1042
  */
717
1043
  static VALUE re2_matchdata_inspect(const VALUE self) {
718
- re2_matchdata *m;
719
- re2_pattern *p;
720
-
721
- TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
722
- TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
1044
+ re2_matchdata *m = unwrap_re2_matchdata(self);
1045
+ re2_pattern *p = unwrap_re2_regexp(m->regexp);
723
1046
 
724
1047
  std::ostringstream output;
725
1048
  output << "#<RE2::MatchData";
@@ -749,7 +1072,7 @@ static VALUE re2_matchdata_inspect(const VALUE self) {
749
1072
  }
750
1073
 
751
1074
  /*
752
- * Returns the array of submatches for pattern matching.
1075
+ * Returns the array of submatches.
753
1076
  *
754
1077
  * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
755
1078
  * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
@@ -758,11 +1081,12 @@ static VALUE re2_matchdata_inspect(const VALUE self) {
758
1081
  *
759
1082
  * @return [Array<String, nil>] the array of submatches
760
1083
  * @example
761
- * m = RE2::Regexp.new('(\d+)').match("bob 123")
1084
+ * m = RE2::Regexp.new('(\d+)').partial_match("bob 123")
1085
+ * m.captures #=> ["123"]
762
1086
  * m.deconstruct #=> ["123"]
763
1087
  *
764
1088
  * @example pattern matching
765
- * case RE2::Regexp.new('(\d+) (\d+)').match("bob 123 456")
1089
+ * case RE2::Regexp.new('(\d+) (\d+)').partial_match("bob 123 456")
766
1090
  * in x, y
767
1091
  * puts "Matched #{x} #{y}"
768
1092
  * else
@@ -770,17 +1094,14 @@ static VALUE re2_matchdata_inspect(const VALUE self) {
770
1094
  * end
771
1095
  */
772
1096
  static VALUE re2_matchdata_deconstruct(const VALUE self) {
773
- re2_matchdata *m;
774
- re2_pattern *p;
775
-
776
- TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
777
- TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
1097
+ re2_matchdata *m = unwrap_re2_matchdata(self);
1098
+ re2_pattern *p = unwrap_re2_regexp(m->regexp);
778
1099
 
779
1100
  VALUE array = rb_ary_new2(m->number_of_matches - 1);
780
1101
  for (int i = 1; i < m->number_of_matches; ++i) {
781
1102
  re2::StringPiece *match = &m->matches[i];
782
1103
 
783
- if (match->empty()) {
1104
+ if (match->data() == nullptr) {
784
1105
  rb_ary_push(array, Qnil);
785
1106
  } else {
786
1107
  rb_ary_push(array, encoded_str_new(match->data(), match->size(),
@@ -806,14 +1127,14 @@ static VALUE re2_matchdata_deconstruct(const VALUE self) {
806
1127
  * @param [Array<Symbol>, nil] keys an array of `Symbol` capturing group names
807
1128
  * or `nil` to return all names
808
1129
  * @example
809
- * m = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
1130
+ * m = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').partial_match('123 abc')
810
1131
  * m.deconstruct_keys(nil) #=> {numbers: "123", letters: "abc"}
811
1132
  * m.deconstruct_keys([:numbers]) #=> {numbers: "123"}
812
1133
  * m.deconstruct_keys([:fruit]) #=> {}
813
1134
  * m.deconstruct_keys([:letters, :fruit]) #=> {letters: "abc"}
814
1135
  *
815
1136
  * @example pattern matching
816
- * case RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
1137
+ * case RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').partial_match('123 abc')
817
1138
  * in numbers:, letters:
818
1139
  * puts "Numbers: #{numbers}, letters: #{letters}"
819
1140
  * else
@@ -821,20 +1142,17 @@ static VALUE re2_matchdata_deconstruct(const VALUE self) {
821
1142
  * end
822
1143
  */
823
1144
  static VALUE re2_matchdata_deconstruct_keys(const VALUE self, const VALUE keys) {
824
- re2_matchdata *m;
825
- re2_pattern *p;
826
-
827
- TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
828
- TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
1145
+ re2_matchdata *m = unwrap_re2_matchdata(self);
1146
+ re2_pattern *p = unwrap_re2_regexp(m->regexp);
829
1147
 
830
- const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
1148
+ const auto& groups = p->pattern->NamedCapturingGroups();
831
1149
  VALUE capturing_groups = rb_hash_new();
832
1150
 
833
1151
  if (NIL_P(keys)) {
834
- for (std::map<std::string, int>::const_iterator it = groups.begin(); it != groups.end(); ++it) {
1152
+ for (const auto& group : groups) {
835
1153
  rb_hash_aset(capturing_groups,
836
- ID2SYM(rb_intern(it->first.data())),
837
- re2_matchdata_nth_match(it->second, self));
1154
+ ID2SYM(rb_intern2(group.first.data(), group.first.size())),
1155
+ re2_matchdata_nth_match(group.second, self));
838
1156
  }
839
1157
  } else {
840
1158
  Check_Type(keys, T_ARRAY);
@@ -844,7 +1162,7 @@ static VALUE re2_matchdata_deconstruct_keys(const VALUE self, const VALUE keys)
844
1162
  VALUE key = rb_ary_entry(keys, i);
845
1163
  Check_Type(key, T_SYMBOL);
846
1164
  const char *name = rb_id2name(SYM2ID(key));
847
- std::map<std::string, int>::const_iterator search = groups.find(name);
1165
+ auto search = groups.find(name);
848
1166
 
849
1167
  if (search != groups.end()) {
850
1168
  rb_hash_aset(capturing_groups, key, re2_matchdata_nth_match(search->second, self));
@@ -858,6 +1176,151 @@ static VALUE re2_matchdata_deconstruct_keys(const VALUE self, const VALUE keys)
858
1176
  return capturing_groups;
859
1177
  }
860
1178
 
1179
+ /*
1180
+ * Returns a hash of capturing group names to matched strings.
1181
+ *
1182
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1183
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1184
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1185
+ *
1186
+ * @overload named_captures
1187
+ * Returns a hash with string keys.
1188
+ *
1189
+ * @return [Hash] a hash of capturing group names to matching strings
1190
+ * @example
1191
+ * m = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').partial_match('123 abc')
1192
+ * m.named_captures #=> {"numbers" => "123", "letters" => "abc"}
1193
+ *
1194
+ * @overload named_captures(symbolize_names:)
1195
+ * Returns a hash with string or symbol keys.
1196
+ *
1197
+ * @param [Boolean] symbolize_names whether to return group names as symbols
1198
+ * @return [Hash] a hash of capturing group names to matching strings
1199
+ * @example
1200
+ * m = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').partial_match('123 abc')
1201
+ * m.named_captures
1202
+ * #=> {"numbers" => "123", "letters" => "abc"}
1203
+ * m.named_captures(symbolize_names: true) #=> {numbers: "123", letters: "abc"}
1204
+ */
1205
+ static VALUE re2_matchdata_named_captures(int argc, VALUE *argv, const VALUE self) {
1206
+ VALUE opts;
1207
+ rb_scan_args(argc, argv, "0:", &opts);
1208
+
1209
+ bool symbolize = false;
1210
+ if (!NIL_P(opts)) {
1211
+ VALUE sym = rb_hash_aref(opts, ID2SYM(id_symbolize_names));
1212
+ symbolize = RTEST(sym);
1213
+ }
1214
+
1215
+ re2_matchdata *m = unwrap_re2_matchdata(self);
1216
+ re2_pattern *p = unwrap_re2_regexp(m->regexp);
1217
+
1218
+ const auto& groups = p->pattern->NamedCapturingGroups();
1219
+ VALUE result = rb_hash_new();
1220
+
1221
+ for (const auto& group : groups) {
1222
+ VALUE key;
1223
+ if (symbolize) {
1224
+ key = ID2SYM(rb_intern2(group.first.data(), group.first.size()));
1225
+ } else {
1226
+ key = encoded_str_new(group.first.data(), group.first.size(),
1227
+ p->pattern->options().encoding());
1228
+ }
1229
+ rb_hash_aset(result, key, re2_matchdata_nth_match(group.second, self));
1230
+ }
1231
+
1232
+ return result;
1233
+ }
1234
+
1235
+ /*
1236
+ * Returns an array of names of named capturing groups. Names are returned in
1237
+ * alphabetical order rather than definition order, as RE2 stores named groups
1238
+ * internally in a sorted map.
1239
+ *
1240
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1241
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1242
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1243
+ *
1244
+ * @return [Array<String>] an array of names of named capturing groups
1245
+ * @example
1246
+ * m = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').partial_match('123 abc')
1247
+ * m.names #=> ["letters", "numbers"]
1248
+ */
1249
+ static VALUE re2_matchdata_names(const VALUE self) {
1250
+ re2_matchdata *m = unwrap_re2_matchdata(self);
1251
+
1252
+ return re2_regexp_names(m->regexp);
1253
+ }
1254
+
1255
+ /*
1256
+ * Returns an array of match values at the given indices or names.
1257
+ *
1258
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1259
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1260
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1261
+ *
1262
+ * @param [Integer, String, Symbol] indexes the indices or names of
1263
+ * the matches to fetch
1264
+ * @return [Array<String, nil>] the values at the given indices or names
1265
+ * @example
1266
+ * m = RE2::Regexp.new('(?P<a>\d+) (?P<b>\d+)').partial_match("123 456")
1267
+ * m.values_at(1, 2) #=> ["123", "456"]
1268
+ * m.values_at(:a, :b) #=> ["123", "456"]
1269
+ * m.values_at(1, :b) #=> ["123", "456"]
1270
+ */
1271
+ static VALUE re2_matchdata_values_at(int argc, VALUE *argv, const VALUE self) {
1272
+ unwrap_re2_matchdata(self);
1273
+
1274
+ VALUE result = rb_ary_new2(argc);
1275
+
1276
+ for (int i = 0; i < argc; ++i) {
1277
+ VALUE idx = argv[i];
1278
+
1279
+ if (TYPE(idx) == T_STRING) {
1280
+ rb_ary_push(result, re2_matchdata_named_match(
1281
+ std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)), self));
1282
+ } else if (SYMBOL_P(idx)) {
1283
+ rb_ary_push(result, re2_matchdata_named_match(
1284
+ rb_id2name(SYM2ID(idx)), self));
1285
+ } else {
1286
+ rb_ary_push(result, re2_matchdata_nth_match(NUM2INT(idx), self));
1287
+ }
1288
+ }
1289
+
1290
+ return result;
1291
+ }
1292
+
1293
+ static VALUE re2_matchdata_initialize_copy(VALUE self, VALUE other) {
1294
+ re2_matchdata *self_m;
1295
+ re2_matchdata *other_m = unwrap_re2_matchdata(other);
1296
+
1297
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, self_m);
1298
+
1299
+ if (self_m->matches) {
1300
+ delete[] self_m->matches;
1301
+ self_m->matches = nullptr;
1302
+ }
1303
+
1304
+ self_m->number_of_matches = other_m->number_of_matches;
1305
+ RB_OBJ_WRITE(self, &self_m->regexp, other_m->regexp);
1306
+ RB_OBJ_WRITE(self, &self_m->text, other_m->text);
1307
+
1308
+ if (other_m->matches) {
1309
+ self_m->matches = new(std::nothrow) re2::StringPiece[other_m->number_of_matches];
1310
+ if (self_m->matches == nullptr) {
1311
+ rb_raise(rb_eNoMemError,
1312
+ "not enough memory to allocate StringPiece for matches");
1313
+ }
1314
+ for (int i = 0; i < other_m->number_of_matches; ++i) {
1315
+ self_m->matches[i] = other_m->matches[i];
1316
+ }
1317
+ } else {
1318
+ self_m->matches = nullptr;
1319
+ }
1320
+
1321
+ return self;
1322
+ }
1323
+
861
1324
  /*
862
1325
  * Shorthand to compile a new {RE2::Regexp}.
863
1326
  *
@@ -913,6 +1376,13 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
913
1376
 
914
1377
  TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
915
1378
 
1379
+ rb_check_frozen(self);
1380
+
1381
+ if (p->pattern) {
1382
+ delete p->pattern;
1383
+ p->pattern = nullptr;
1384
+ }
1385
+
916
1386
  if (RTEST(options)) {
917
1387
  RE2::Options re2_options;
918
1388
  parse_re2_options(&re2_options, options);
@@ -924,10 +1394,36 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
924
1394
  re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)));
925
1395
  }
926
1396
 
927
- if (p->pattern == 0) {
1397
+ if (p->pattern == nullptr) {
928
1398
  rb_raise(rb_eNoMemError, "not enough memory to allocate RE2 object");
929
1399
  }
930
1400
 
1401
+ rb_obj_freeze(self);
1402
+
1403
+ return self;
1404
+ }
1405
+
1406
+ static VALUE re2_regexp_initialize_copy(VALUE self, VALUE other) {
1407
+ re2_pattern *self_p;
1408
+ re2_pattern *other_p = unwrap_re2_regexp(other);
1409
+
1410
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, self_p);
1411
+
1412
+ rb_check_frozen(self);
1413
+
1414
+ if (self_p->pattern) {
1415
+ delete self_p->pattern;
1416
+ self_p->pattern = nullptr;
1417
+ }
1418
+
1419
+ self_p->pattern = new(std::nothrow) RE2(other_p->pattern->pattern(),
1420
+ other_p->pattern->options());
1421
+ if (self_p->pattern == nullptr) {
1422
+ rb_raise(rb_eNoMemError, "not enough memory to allocate RE2 object");
1423
+ }
1424
+
1425
+ rb_obj_freeze(self);
1426
+
931
1427
  return self;
932
1428
  }
933
1429
 
@@ -945,9 +1441,7 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
945
1441
  * re2.inspect #=> "#<RE2::Regexp /woo?/>"
946
1442
  */
947
1443
  static VALUE re2_regexp_inspect(const VALUE self) {
948
- re2_pattern *p;
949
-
950
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1444
+ re2_pattern *p = unwrap_re2_regexp(self);
951
1445
 
952
1446
  std::ostringstream output;
953
1447
 
@@ -970,8 +1464,7 @@ static VALUE re2_regexp_inspect(const VALUE self) {
970
1464
  * re2.to_s #=> "woo?"
971
1465
  */
972
1466
  static VALUE re2_regexp_to_s(const VALUE self) {
973
- re2_pattern *p;
974
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1467
+ re2_pattern *p = unwrap_re2_regexp(self);
975
1468
 
976
1469
  return encoded_str_new(p->pattern->pattern().data(),
977
1470
  p->pattern->pattern().size(),
@@ -987,8 +1480,7 @@ static VALUE re2_regexp_to_s(const VALUE self) {
987
1480
  * re2.ok? #=> true
988
1481
  */
989
1482
  static VALUE re2_regexp_ok(const VALUE self) {
990
- re2_pattern *p;
991
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1483
+ re2_pattern *p = unwrap_re2_regexp(self);
992
1484
 
993
1485
  return BOOL2RUBY(p->pattern->ok());
994
1486
  }
@@ -1003,8 +1495,7 @@ static VALUE re2_regexp_ok(const VALUE self) {
1003
1495
  * re2.utf8? #=> true
1004
1496
  */
1005
1497
  static VALUE re2_regexp_utf8(const VALUE self) {
1006
- re2_pattern *p;
1007
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1498
+ re2_pattern *p = unwrap_re2_regexp(self);
1008
1499
 
1009
1500
  return BOOL2RUBY(p->pattern->options().encoding() == RE2::Options::EncodingUTF8);
1010
1501
  }
@@ -1019,8 +1510,7 @@ static VALUE re2_regexp_utf8(const VALUE self) {
1019
1510
  * re2.posix_syntax? #=> true
1020
1511
  */
1021
1512
  static VALUE re2_regexp_posix_syntax(const VALUE self) {
1022
- re2_pattern *p;
1023
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1513
+ re2_pattern *p = unwrap_re2_regexp(self);
1024
1514
 
1025
1515
  return BOOL2RUBY(p->pattern->options().posix_syntax());
1026
1516
  }
@@ -1035,8 +1525,7 @@ static VALUE re2_regexp_posix_syntax(const VALUE self) {
1035
1525
  * re2.longest_match? #=> true
1036
1526
  */
1037
1527
  static VALUE re2_regexp_longest_match(const VALUE self) {
1038
- re2_pattern *p;
1039
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1528
+ re2_pattern *p = unwrap_re2_regexp(self);
1040
1529
 
1041
1530
  return BOOL2RUBY(p->pattern->options().longest_match());
1042
1531
  }
@@ -1051,8 +1540,7 @@ static VALUE re2_regexp_longest_match(const VALUE self) {
1051
1540
  * re2.log_errors? #=> true
1052
1541
  */
1053
1542
  static VALUE re2_regexp_log_errors(const VALUE self) {
1054
- re2_pattern *p;
1055
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1543
+ re2_pattern *p = unwrap_re2_regexp(self);
1056
1544
 
1057
1545
  return BOOL2RUBY(p->pattern->options().log_errors());
1058
1546
  }
@@ -1066,8 +1554,7 @@ static VALUE re2_regexp_log_errors(const VALUE self) {
1066
1554
  * re2.max_mem #=> 1024
1067
1555
  */
1068
1556
  static VALUE re2_regexp_max_mem(const VALUE self) {
1069
- re2_pattern *p;
1070
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1557
+ re2_pattern *p = unwrap_re2_regexp(self);
1071
1558
 
1072
1559
  return INT2FIX(p->pattern->options().max_mem());
1073
1560
  }
@@ -1082,8 +1569,7 @@ static VALUE re2_regexp_max_mem(const VALUE self) {
1082
1569
  * re2.literal? #=> true
1083
1570
  */
1084
1571
  static VALUE re2_regexp_literal(const VALUE self) {
1085
- re2_pattern *p;
1086
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1572
+ re2_pattern *p = unwrap_re2_regexp(self);
1087
1573
 
1088
1574
  return BOOL2RUBY(p->pattern->options().literal());
1089
1575
  }
@@ -1098,8 +1584,7 @@ static VALUE re2_regexp_literal(const VALUE self) {
1098
1584
  * re2.never_nl? #=> true
1099
1585
  */
1100
1586
  static VALUE re2_regexp_never_nl(const VALUE self) {
1101
- re2_pattern *p;
1102
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1587
+ re2_pattern *p = unwrap_re2_regexp(self);
1103
1588
 
1104
1589
  return BOOL2RUBY(p->pattern->options().never_nl());
1105
1590
  }
@@ -1114,8 +1599,7 @@ static VALUE re2_regexp_never_nl(const VALUE self) {
1114
1599
  * re2.case_sensitive? #=> true
1115
1600
  */
1116
1601
  static VALUE re2_regexp_case_sensitive(const VALUE self) {
1117
- re2_pattern *p;
1118
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1602
+ re2_pattern *p = unwrap_re2_regexp(self);
1119
1603
 
1120
1604
  return BOOL2RUBY(p->pattern->options().case_sensitive());
1121
1605
  }
@@ -1144,8 +1628,7 @@ static VALUE re2_regexp_case_insensitive(const VALUE self) {
1144
1628
  * re2.perl_classes? #=> true
1145
1629
  */
1146
1630
  static VALUE re2_regexp_perl_classes(const VALUE self) {
1147
- re2_pattern *p;
1148
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1631
+ re2_pattern *p = unwrap_re2_regexp(self);
1149
1632
 
1150
1633
  return BOOL2RUBY(p->pattern->options().perl_classes());
1151
1634
  }
@@ -1160,8 +1643,7 @@ static VALUE re2_regexp_perl_classes(const VALUE self) {
1160
1643
  * re2.word_boundary? #=> true
1161
1644
  */
1162
1645
  static VALUE re2_regexp_word_boundary(const VALUE self) {
1163
- re2_pattern *p;
1164
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1646
+ re2_pattern *p = unwrap_re2_regexp(self);
1165
1647
 
1166
1648
  return BOOL2RUBY(p->pattern->options().word_boundary());
1167
1649
  }
@@ -1176,8 +1658,7 @@ static VALUE re2_regexp_word_boundary(const VALUE self) {
1176
1658
  * re2.one_line? #=> true
1177
1659
  */
1178
1660
  static VALUE re2_regexp_one_line(const VALUE self) {
1179
- re2_pattern *p;
1180
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1661
+ re2_pattern *p = unwrap_re2_regexp(self);
1181
1662
 
1182
1663
  return BOOL2RUBY(p->pattern->options().one_line());
1183
1664
  }
@@ -1189,8 +1670,7 @@ static VALUE re2_regexp_one_line(const VALUE self) {
1189
1670
  * @return [String, nil] the error string or `nil`
1190
1671
  */
1191
1672
  static VALUE re2_regexp_error(const VALUE self) {
1192
- re2_pattern *p;
1193
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1673
+ re2_pattern *p = unwrap_re2_regexp(self);
1194
1674
 
1195
1675
  if (p->pattern->ok()) {
1196
1676
  return Qnil;
@@ -1210,8 +1690,7 @@ static VALUE re2_regexp_error(const VALUE self) {
1210
1690
  * @return [String, nil] the offending portion of the regexp or `nil`
1211
1691
  */
1212
1692
  static VALUE re2_regexp_error_arg(const VALUE self) {
1213
- re2_pattern *p;
1214
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1693
+ re2_pattern *p = unwrap_re2_regexp(self);
1215
1694
 
1216
1695
  if (p->pattern->ok()) {
1217
1696
  return Qnil;
@@ -1230,8 +1709,7 @@ static VALUE re2_regexp_error_arg(const VALUE self) {
1230
1709
  * @return [Integer] the regexp "cost"
1231
1710
  */
1232
1711
  static VALUE re2_regexp_program_size(const VALUE self) {
1233
- re2_pattern *p;
1234
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1712
+ re2_pattern *p = unwrap_re2_regexp(self);
1235
1713
 
1236
1714
  return INT2FIX(p->pattern->ProgramSize());
1237
1715
  }
@@ -1242,9 +1720,7 @@ static VALUE re2_regexp_program_size(const VALUE self) {
1242
1720
  * @return [Hash] the options
1243
1721
  */
1244
1722
  static VALUE re2_regexp_options(const VALUE self) {
1245
- re2_pattern *p;
1246
-
1247
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1723
+ re2_pattern *p = unwrap_re2_regexp(self);
1248
1724
  VALUE options = rb_hash_new();
1249
1725
 
1250
1726
  rb_hash_aset(options, ID2SYM(id_utf8),
@@ -1294,8 +1770,7 @@ static VALUE re2_regexp_options(const VALUE self) {
1294
1770
  * @return [Integer] the number of capturing subpatterns
1295
1771
  */
1296
1772
  static VALUE re2_regexp_number_of_capturing_groups(const VALUE self) {
1297
- re2_pattern *p;
1298
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1773
+ re2_pattern *p = unwrap_re2_regexp(self);
1299
1774
 
1300
1775
  return INT2FIX(p->pattern->NumberOfCapturingGroups());
1301
1776
  }
@@ -1310,17 +1785,15 @@ static VALUE re2_regexp_number_of_capturing_groups(const VALUE self) {
1310
1785
  * @return [Hash] a hash of names to capturing indices
1311
1786
  */
1312
1787
  static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
1313
- re2_pattern *p;
1314
-
1315
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1316
- const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
1788
+ re2_pattern *p = unwrap_re2_regexp(self);
1789
+ const auto& groups = p->pattern->NamedCapturingGroups();
1317
1790
  VALUE capturing_groups = rb_hash_new();
1318
1791
 
1319
- for (std::map<std::string, int>::const_iterator it = groups.begin(); it != groups.end(); ++it) {
1792
+ for (const auto& group : groups) {
1320
1793
  rb_hash_aset(capturing_groups,
1321
- encoded_str_new(it->first.data(), it->first.size(),
1794
+ encoded_str_new(group.first.data(), group.first.size(),
1322
1795
  p->pattern->options().encoding()),
1323
- INT2FIX(it->second));
1796
+ INT2FIX(group.second));
1324
1797
  }
1325
1798
 
1326
1799
  return capturing_groups;
@@ -1415,14 +1888,15 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1415
1888
 
1416
1889
  rb_scan_args(argc, argv, "11", &text, &options);
1417
1890
 
1418
- /* Ensure text is a string. */
1891
+ /* Coerce and freeze text to prevent mutation. */
1419
1892
  StringValue(text);
1893
+ text = rb_str_new_frozen(text);
1420
1894
 
1421
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1895
+ p = unwrap_re2_regexp(self);
1422
1896
 
1423
1897
  int n;
1424
- int startpos = 0;
1425
- int endpos = RSTRING_LEN(text);
1898
+ size_t startpos = 0;
1899
+ size_t endpos = RSTRING_LEN(text);
1426
1900
  RE2::Anchor anchor = RE2::UNANCHORED;
1427
1901
 
1428
1902
  if (RTEST(options)) {
@@ -1440,11 +1914,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1440
1914
  VALUE endpos_option = rb_hash_aref(options, ID2SYM(id_endpos));
1441
1915
  if (!NIL_P(endpos_option)) {
1442
1916
  #ifdef HAVE_ENDPOS_ARGUMENT
1443
- endpos = NUM2INT(endpos_option);
1917
+ ssize_t endpos_value = NUM2SSIZET(endpos_option);
1444
1918
 
1445
- if (endpos < 0) {
1919
+ if (endpos_value < 0) {
1446
1920
  rb_raise(rb_eArgError, "endpos should be >= 0");
1447
1921
  }
1922
+
1923
+ endpos = static_cast<size_t>(endpos_value);
1448
1924
  #else
1449
1925
  rb_raise(re2_eRegexpUnsupportedError, "current version of RE2::Match() does not support endpos argument");
1450
1926
  #endif
@@ -1483,11 +1959,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1483
1959
 
1484
1960
  VALUE startpos_option = rb_hash_aref(options, ID2SYM(id_startpos));
1485
1961
  if (!NIL_P(startpos_option)) {
1486
- startpos = NUM2INT(startpos_option);
1962
+ ssize_t startpos_value = NUM2SSIZET(startpos_option);
1487
1963
 
1488
- if (startpos < 0) {
1964
+ if (startpos_value < 0) {
1489
1965
  rb_raise(rb_eArgError, "startpos should be >= 0");
1490
1966
  }
1967
+
1968
+ startpos = static_cast<size_t>(startpos_value);
1491
1969
  }
1492
1970
  }
1493
1971
  } else {
@@ -1502,16 +1980,18 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1502
1980
  rb_raise(rb_eArgError, "startpos should be <= endpos");
1503
1981
  }
1504
1982
 
1505
- if (n == 0) {
1506
- #ifdef HAVE_ENDPOS_ARGUMENT
1507
- bool matched = p->pattern->Match(
1508
- re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
1509
- startpos, endpos, anchor, 0, 0);
1510
- #else
1511
- bool matched = p->pattern->Match(
1512
- re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
1513
- startpos, anchor, 0, 0);
1983
+ #ifndef HAVE_ENDPOS_ARGUMENT
1984
+ /* Old RE2's Match() takes int startpos. Reject values that would overflow. */
1985
+ if (startpos > INT_MAX) {
1986
+ rb_raise(rb_eRangeError, "startpos should be <= %d", INT_MAX);
1987
+ }
1514
1988
  #endif
1989
+
1990
+ if (n == 0) {
1991
+ bool matched = re2_match_without_gvl(
1992
+ p->pattern, text, startpos, endpos, anchor, 0, 0);
1993
+ RB_GC_GUARD(text);
1994
+
1515
1995
  return BOOL2RUBY(matched);
1516
1996
  } else {
1517
1997
  if (n == INT_MAX) {
@@ -1522,22 +2002,15 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1522
2002
  n += 1;
1523
2003
 
1524
2004
  re2::StringPiece *matches = new(std::nothrow) re2::StringPiece[n];
1525
- if (matches == 0) {
2005
+ if (matches == nullptr) {
1526
2006
  rb_raise(rb_eNoMemError,
1527
2007
  "not enough memory to allocate StringPieces for matches");
1528
2008
  }
1529
2009
 
1530
- text = rb_str_new_frozen(text);
2010
+ bool matched = re2_match_without_gvl(
2011
+ p->pattern, text, startpos, endpos, anchor, matches, n);
2012
+ RB_GC_GUARD(text);
1531
2013
 
1532
- #ifdef HAVE_ENDPOS_ARGUMENT
1533
- bool matched = p->pattern->Match(
1534
- re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
1535
- startpos, endpos, anchor, matches, n);
1536
- #else
1537
- bool matched = p->pattern->Match(
1538
- re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
1539
- startpos, anchor, matches, n);
1540
- #endif
1541
2014
  if (matched) {
1542
2015
  VALUE matchdata = rb_class_new_instance(0, 0, re2_cMatchData);
1543
2016
  TypedData_Get_Struct(matchdata, re2_matchdata, &re2_matchdata_data_type, m);
@@ -1561,19 +2034,20 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1561
2034
  * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L413-L427
1562
2035
  * `PartialMatch`}.
1563
2036
  *
2037
+ * @param [String] text the text to search
1564
2038
  * @return [Boolean] whether the match was successful
1565
2039
  * @raise [TypeError] if text cannot be coerced to a `String`
1566
2040
  */
1567
2041
  static VALUE re2_regexp_match_p(const VALUE self, VALUE text) {
1568
- re2_pattern *p;
1569
-
1570
- /* Ensure text is a string. */
1571
2042
  StringValue(text);
2043
+ text = rb_str_new_frozen(text);
1572
2044
 
1573
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
2045
+ re2_pattern *p = unwrap_re2_regexp(self);
2046
+ bool matched = re2_match_without_gvl(
2047
+ p->pattern, text, 0, RSTRING_LEN(text), RE2::UNANCHORED, 0, 0);
2048
+ RB_GC_GUARD(text);
1574
2049
 
1575
- return BOOL2RUBY(RE2::PartialMatch(
1576
- re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
2050
+ return BOOL2RUBY(matched);
1577
2051
  }
1578
2052
 
1579
2053
  /*
@@ -1581,19 +2055,20 @@ static VALUE re2_regexp_match_p(const VALUE self, VALUE text) {
1581
2055
  * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L376-L411
1582
2056
  * `FullMatch`}.
1583
2057
  *
2058
+ * @param [String] text the text to search
1584
2059
  * @return [Boolean] whether the match was successful
1585
2060
  * @raise [TypeError] if text cannot be coerced to a `String`
1586
2061
  */
1587
2062
  static VALUE re2_regexp_full_match_p(const VALUE self, VALUE text) {
1588
- re2_pattern *p;
1589
-
1590
- /* Ensure text is a string. */
1591
2063
  StringValue(text);
2064
+ text = rb_str_new_frozen(text);
1592
2065
 
1593
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
2066
+ re2_pattern *p = unwrap_re2_regexp(self);
2067
+ bool matched = re2_match_without_gvl(
2068
+ p->pattern, text, 0, RSTRING_LEN(text), RE2::ANCHOR_BOTH, 0, 0);
2069
+ RB_GC_GUARD(text);
1594
2070
 
1595
- return BOOL2RUBY(RE2::FullMatch(
1596
- re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
2071
+ return BOOL2RUBY(matched);
1597
2072
  }
1598
2073
 
1599
2074
  /*
@@ -1609,21 +2084,19 @@ static VALUE re2_regexp_full_match_p(const VALUE self, VALUE text) {
1609
2084
  * #=> #<RE2::Scanner:0x0000000000000001>
1610
2085
  */
1611
2086
  static VALUE re2_regexp_scan(const VALUE self, VALUE text) {
1612
- /* Ensure text is a string. */
1613
2087
  StringValue(text);
2088
+ text = rb_str_new_frozen(text);
1614
2089
 
1615
- re2_pattern *p;
2090
+ re2_pattern *p = unwrap_re2_regexp(self);
1616
2091
  re2_scanner *c;
1617
-
1618
- TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1619
2092
  VALUE scanner = rb_class_new_instance(0, 0, re2_cScanner);
1620
2093
  TypedData_Get_Struct(scanner, re2_scanner, &re2_scanner_data_type, c);
1621
2094
 
1622
2095
  RB_OBJ_WRITE(scanner, &c->regexp, self);
1623
- RB_OBJ_WRITE(scanner, &c->text, rb_str_new_frozen(text));
2096
+ RB_OBJ_WRITE(scanner, &c->text, text);
1624
2097
  c->input = new(std::nothrow) re2::StringPiece(
1625
2098
  RSTRING_PTR(c->text), RSTRING_LEN(c->text));
1626
- if (c->input == 0) {
2099
+ if (c->input == nullptr) {
1627
2100
  rb_raise(rb_eNoMemError,
1628
2101
  "not enough memory to allocate StringPiece for input");
1629
2102
  }
@@ -1675,40 +2148,59 @@ static VALUE re2_regexp_match_has_endpos_argument_p(VALUE) {
1675
2148
  * @raise [TypeError] if the given rewrite or pattern (if not provided as a
1676
2149
  * {RE2::Regexp}) cannot be coerced to `String`s
1677
2150
  * @example
1678
- * RE2.Replace("hello there", "hello", "howdy") #=> "howdy there"
2151
+ * RE2.replace("hello there", "hello", "howdy") #=> "howdy there"
1679
2152
  * re2 = RE2::Regexp.new("hel+o")
1680
- * RE2.Replace("hello there", re2, "yo") #=> "yo there"
2153
+ * RE2.replace("hello there", re2, "yo") #=> "yo there"
1681
2154
  */
1682
- static VALUE re2_Replace(VALUE, VALUE str, VALUE pattern,
2155
+ static VALUE re2_replace(VALUE, VALUE str, VALUE pattern,
1683
2156
  VALUE rewrite) {
1684
- /* Ensure rewrite is a string. */
1685
- StringValue(rewrite);
2157
+ re2_pattern *p = nullptr;
1686
2158
 
1687
- re2_pattern *p;
1688
-
1689
- /* Take a copy of str so it can be modified in-place by
1690
- * RE2::Replace.
2159
+ /* Coerce and freeze all arguments before any C++ allocations so that any
2160
+ * Ruby exceptions (via longjmp) cannot bypass C++ destructors and leak
2161
+ * memory, and later coercions cannot mutate earlier strings.
1691
2162
  */
1692
2163
  StringValue(str);
2164
+ str = rb_str_new_frozen(str);
2165
+ if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
2166
+ p = unwrap_re2_regexp(pattern);
2167
+ } else {
2168
+ StringValue(pattern);
2169
+ pattern = rb_str_new_frozen(pattern);
2170
+ }
2171
+ StringValue(rewrite);
2172
+ rewrite = rb_str_new_frozen(rewrite);
2173
+
2174
+ /* Take a copy of str so it can be modified in-place by RE2::Replace. */
1693
2175
  std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str));
1694
2176
 
1695
- /* Do the replacement. */
1696
- if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1697
- TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
1698
- RE2::Replace(&str_as_string, *p->pattern,
1699
- re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
2177
+ nogvl_replace_arg arg;
2178
+ arg.str = &str_as_string;
2179
+ if (p) {
2180
+ arg.pattern = p->pattern;
2181
+ } else {
2182
+ arg.pattern = nullptr;
2183
+ arg.string_pattern = re2::StringPiece(
2184
+ RSTRING_PTR(pattern), RSTRING_LEN(pattern));
2185
+ }
2186
+ arg.rewrite = re2::StringPiece(
2187
+ RSTRING_PTR(rewrite), RSTRING_LEN(rewrite));
2188
+
2189
+ #ifdef _WIN32
2190
+ nogvl_replace(&arg);
2191
+ #else
2192
+ rb_thread_call_without_gvl(nogvl_replace, &arg, NULL, NULL);
2193
+ #endif
1700
2194
 
2195
+ RB_GC_GUARD(rewrite);
2196
+ RB_GC_GUARD(pattern);
2197
+
2198
+ if (p) {
1701
2199
  return encoded_str_new(str_as_string.data(), str_as_string.size(),
1702
2200
  p->pattern->options().encoding());
1703
2201
  } else {
1704
- /* Ensure pattern is a string. */
1705
- StringValue(pattern);
1706
-
1707
- RE2::Replace(&str_as_string,
1708
- re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
1709
- re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
1710
-
1711
- return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
2202
+ return encoded_str_new(str_as_string.data(), str_as_string.size(),
2203
+ RE2::Options::EncodingUTF8);
1712
2204
  }
1713
2205
  }
1714
2206
 
@@ -1729,38 +2221,136 @@ static VALUE re2_Replace(VALUE, VALUE str, VALUE pattern,
1729
2221
  * @return [String] the resulting string
1730
2222
  * @example
1731
2223
  * re2 = RE2::Regexp.new("oo?")
1732
- * RE2.GlobalReplace("whoops-doops", re2, "e") #=> "wheps-deps"
1733
- * RE2.GlobalReplace("hello there", "e", "i") #=> "hillo thiri"
2224
+ * RE2.global_replace("whoops-doops", re2, "e") #=> "wheps-deps"
2225
+ * RE2.global_replace("hello there", "e", "i") #=> "hillo thiri"
1734
2226
  */
1735
- static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
2227
+ static VALUE re2_global_replace(VALUE, VALUE str, VALUE pattern,
1736
2228
  VALUE rewrite) {
1737
- /* Ensure rewrite is a string. */
2229
+ re2_pattern *p = nullptr;
2230
+
2231
+ /* Coerce and freeze all arguments before any C++ allocations so that any
2232
+ * Ruby exceptions (via longjmp) cannot bypass C++ destructors and leak
2233
+ * memory, and later coercions cannot mutate earlier strings.
2234
+ */
2235
+ StringValue(str);
2236
+ str = rb_str_new_frozen(str);
2237
+ if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
2238
+ p = unwrap_re2_regexp(pattern);
2239
+ } else {
2240
+ StringValue(pattern);
2241
+ pattern = rb_str_new_frozen(pattern);
2242
+ }
1738
2243
  StringValue(rewrite);
2244
+ rewrite = rb_str_new_frozen(rewrite);
1739
2245
 
1740
2246
  /* Take a copy of str so it can be modified in-place by
1741
2247
  * RE2::GlobalReplace.
1742
2248
  */
1743
- re2_pattern *p;
1744
- StringValue(str);
1745
2249
  std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str));
1746
2250
 
1747
- /* Do the replacement. */
1748
- if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1749
- TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
1750
- RE2::GlobalReplace(&str_as_string, *p->pattern,
1751
- re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
2251
+ nogvl_replace_arg arg;
2252
+ arg.str = &str_as_string;
2253
+ if (p) {
2254
+ arg.pattern = p->pattern;
2255
+ } else {
2256
+ arg.pattern = nullptr;
2257
+ arg.string_pattern = re2::StringPiece(
2258
+ RSTRING_PTR(pattern), RSTRING_LEN(pattern));
2259
+ }
2260
+ arg.rewrite = re2::StringPiece(
2261
+ RSTRING_PTR(rewrite), RSTRING_LEN(rewrite));
1752
2262
 
2263
+ #ifdef _WIN32
2264
+ nogvl_global_replace(&arg);
2265
+ #else
2266
+ rb_thread_call_without_gvl(nogvl_global_replace, &arg, NULL, NULL);
2267
+ #endif
2268
+
2269
+ RB_GC_GUARD(rewrite);
2270
+ RB_GC_GUARD(pattern);
2271
+
2272
+ if (p) {
1753
2273
  return encoded_str_new(str_as_string.data(), str_as_string.size(),
1754
2274
  p->pattern->options().encoding());
1755
2275
  } else {
1756
- /* Ensure pattern is a string. */
2276
+ return encoded_str_new(str_as_string.data(), str_as_string.size(),
2277
+ RE2::Options::EncodingUTF8);
2278
+ }
2279
+ }
2280
+
2281
+ /*
2282
+ * If `pattern` matches `text`, returns a copy of `rewrite` with substitutions
2283
+ * using
2284
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L499-L510
2285
+ * `Extract`}. Non-matching portions of `text` are ignored.
2286
+ *
2287
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
2288
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
2289
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
2290
+ *
2291
+ * @param [String] text the string from which to extract
2292
+ * @param [String, RE2::Regexp] pattern a regexp matching the text
2293
+ * @param [String] rewrite the rewrite string with `\1`-style substitutions
2294
+ * @return [String, nil] the extracted string on a successful match or nil if
2295
+ * there is no match
2296
+ * @raise [TypeError] if the given rewrite or pattern (if not provided as a
2297
+ * {RE2::Regexp}) cannot be coerced to `String`s
2298
+ * @example
2299
+ * RE2.extract("alice@example.com", '(\w+)@(\w+)', '\2-\1')
2300
+ * #=> "example-alice"
2301
+ * RE2.extract("no match", '(\d+)', '\1') #=> nil
2302
+ */
2303
+ static VALUE re2_extract(VALUE, VALUE text, VALUE pattern,
2304
+ VALUE rewrite) {
2305
+ re2_pattern *p = nullptr;
2306
+
2307
+ /* Coerce and freeze all arguments before any C++ allocations so that any
2308
+ * Ruby exceptions (via longjmp) cannot bypass C++ destructors and leak
2309
+ * memory, and later coercions cannot mutate earlier strings.
2310
+ */
2311
+ StringValue(text);
2312
+ text = rb_str_new_frozen(text);
2313
+ if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
2314
+ p = unwrap_re2_regexp(pattern);
2315
+ } else {
1757
2316
  StringValue(pattern);
2317
+ pattern = rb_str_new_frozen(pattern);
2318
+ }
2319
+ StringValue(rewrite);
2320
+ rewrite = rb_str_new_frozen(rewrite);
1758
2321
 
1759
- RE2::GlobalReplace(&str_as_string,
1760
- re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
1761
- re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
2322
+ std::string out;
1762
2323
 
1763
- return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
2324
+ nogvl_extract_arg arg;
2325
+ arg.text = re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text));
2326
+ if (p) {
2327
+ arg.pattern = p->pattern;
2328
+ } else {
2329
+ arg.pattern = nullptr;
2330
+ arg.string_pattern = re2::StringPiece(
2331
+ RSTRING_PTR(pattern), RSTRING_LEN(pattern));
2332
+ }
2333
+ arg.rewrite = re2::StringPiece(
2334
+ RSTRING_PTR(rewrite), RSTRING_LEN(rewrite));
2335
+ arg.out = &out;
2336
+ arg.extracted = false;
2337
+
2338
+ #ifdef _WIN32
2339
+ nogvl_extract(&arg);
2340
+ #else
2341
+ rb_thread_call_without_gvl(nogvl_extract, &arg, NULL, NULL);
2342
+ #endif
2343
+
2344
+ RB_GC_GUARD(text);
2345
+ RB_GC_GUARD(rewrite);
2346
+ RB_GC_GUARD(pattern);
2347
+
2348
+ if (arg.extracted) {
2349
+ return encoded_str_new(out.data(), out.size(),
2350
+ p ? p->pattern->options().encoding()
2351
+ : RE2::Options::EncodingUTF8);
2352
+ } else {
2353
+ return Qnil;
1764
2354
  }
1765
2355
  }
1766
2356
 
@@ -1775,9 +2365,12 @@ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
1775
2365
  * @raise [TypeError] if the given unquoted string cannot be coerced to a `String`
1776
2366
  * @return [String] the escaped string
1777
2367
  * @example
1778
- * RE2::Regexp.escape("1.5-2.0?") #=> "1\.5\-2\.0\?"
2368
+ * RE2.escape("1.5-2.0?") #=> "1\\.5\\-2\\.0\\?"
2369
+ * RE2.quote("1.5-2.0?") #=> "1\\.5\\-2\\.0\\?"
2370
+ * RE2::Regexp.escape("1.5-2.0?") #=> "1\\.5\\-2\\.0\\?"
2371
+ * RE2::Regexp.quote("1.5-2.0?") #=> "1\\.5\\-2\\.0\\?"
1779
2372
  */
1780
- static VALUE re2_QuoteMeta(VALUE, VALUE unquoted) {
2373
+ static VALUE re2_escape(VALUE, VALUE unquoted) {
1781
2374
  StringValue(unquoted);
1782
2375
 
1783
2376
  std::string quoted_string = RE2::QuoteMeta(
@@ -1787,7 +2380,7 @@ static VALUE re2_QuoteMeta(VALUE, VALUE unquoted) {
1787
2380
  }
1788
2381
 
1789
2382
  static void re2_set_free(void *ptr) {
1790
- re2_set *s = reinterpret_cast<re2_set *>(ptr);
2383
+ re2_set *s = static_cast<re2_set *>(ptr);
1791
2384
  if (s->set) {
1792
2385
  delete s->set;
1793
2386
  }
@@ -1795,7 +2388,7 @@ static void re2_set_free(void *ptr) {
1795
2388
  }
1796
2389
 
1797
2390
  static size_t re2_set_memsize(const void *ptr) {
1798
- const re2_set *s = reinterpret_cast<const re2_set *>(ptr);
2391
+ const re2_set *s = static_cast<const re2_set *>(ptr);
1799
2392
  size_t size = sizeof(*s);
1800
2393
  if (s->set) {
1801
2394
  size += sizeof(*s->set);
@@ -1815,9 +2408,18 @@ static const rb_data_type_t re2_set_data_type = {
1815
2408
  0,
1816
2409
  // IMPORTANT: WB_PROTECTED objects must only use the RB_OBJ_WRITE()
1817
2410
  // macro to update VALUE references, as to trigger write barriers.
1818
- RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
2411
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE
1819
2412
  };
1820
2413
 
2414
+ static re2_set *unwrap_re2_set(VALUE self) {
2415
+ re2_set *s;
2416
+ TypedData_Get_Struct(self, re2_set, &re2_set_data_type, s);
2417
+ if (!s->set) {
2418
+ rb_raise(rb_eTypeError, "uninitialized RE2::Set");
2419
+ }
2420
+ return s;
2421
+ }
2422
+
1821
2423
  static VALUE re2_set_allocate(VALUE klass) {
1822
2424
  re2_set *s;
1823
2425
  VALUE result = TypedData_Make_Struct(klass, re2_set, &re2_set_data_type, s);
@@ -1825,6 +2427,10 @@ static VALUE re2_set_allocate(VALUE klass) {
1825
2427
  return result;
1826
2428
  }
1827
2429
 
2430
+ static VALUE re2_set_initialize_copy(VALUE, VALUE) {
2431
+ rb_raise(rb_eTypeError, "cannot copy RE2::Set");
2432
+ }
2433
+
1828
2434
  /*
1829
2435
  * Returns a new {RE2::Set} object, a collection of patterns that can be
1830
2436
  * searched for simultaneously.
@@ -1895,8 +2501,15 @@ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1895
2501
  parse_re2_options(&re2_options, options);
1896
2502
  }
1897
2503
 
2504
+ rb_check_frozen(self);
2505
+
2506
+ if (s->set) {
2507
+ delete s->set;
2508
+ s->set = nullptr;
2509
+ }
2510
+
1898
2511
  s->set = new(std::nothrow) RE2::Set(re2_options, re2_anchor);
1899
- if (s->set == 0) {
2512
+ if (s->set == nullptr) {
1900
2513
  rb_raise(rb_eNoMemError, "not enough memory to allocate RE2::Set object");
1901
2514
  }
1902
2515
 
@@ -1919,8 +2532,8 @@ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1919
2532
  static VALUE re2_set_add(VALUE self, VALUE pattern) {
1920
2533
  StringValue(pattern);
1921
2534
 
1922
- re2_set *s;
1923
- TypedData_Get_Struct(self, re2_set, &re2_set_data_type, s);
2535
+ re2_set *s = unwrap_re2_set(self);
2536
+ rb_check_frozen(self);
1924
2537
 
1925
2538
  int index;
1926
2539
  VALUE msg;
@@ -1951,10 +2564,16 @@ static VALUE re2_set_add(VALUE self, VALUE pattern) {
1951
2564
  * set.compile #=> true
1952
2565
  */
1953
2566
  static VALUE re2_set_compile(VALUE self) {
1954
- re2_set *s;
1955
- TypedData_Get_Struct(self, re2_set, &re2_set_data_type, s);
2567
+ re2_set *s = unwrap_re2_set(self);
2568
+ rb_check_frozen(self);
1956
2569
 
1957
- return BOOL2RUBY(s->set->Compile());
2570
+ bool compiled = s->set->Compile();
2571
+
2572
+ if (compiled) {
2573
+ rb_obj_freeze(self);
2574
+ }
2575
+
2576
+ return BOOL2RUBY(compiled);
1958
2577
  }
1959
2578
 
1960
2579
  /*
@@ -1968,8 +2587,7 @@ static VALUE re2_set_compile(VALUE self) {
1968
2587
  */
1969
2588
  static VALUE re2_set_size(VALUE self) {
1970
2589
  #ifdef HAVE_SET_SIZE
1971
- re2_set *s;
1972
- TypedData_Get_Struct(self, re2_set, &re2_set_data_type, s);
2590
+ re2_set *s = unwrap_re2_set(self);
1973
2591
 
1974
2592
  return INT2FIX(s->set->Size());
1975
2593
  #else
@@ -2052,8 +2670,9 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
2052
2670
  rb_scan_args(argc, argv, "11", &str, &options);
2053
2671
 
2054
2672
  StringValue(str);
2055
- re2_set *s;
2056
- TypedData_Get_Struct(self, re2_set, &re2_set_data_type, s);
2673
+ str = rb_str_new_frozen(str);
2674
+
2675
+ re2_set *s = unwrap_re2_set(self);
2057
2676
 
2058
2677
  if (RTEST(options)) {
2059
2678
  Check_Type(options, T_HASH);
@@ -2069,8 +2688,21 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
2069
2688
  if (raise_exception) {
2070
2689
  #ifdef HAVE_ERROR_INFO_ARGUMENT
2071
2690
  RE2::Set::ErrorInfo e;
2072
- bool match_failed = !s->set->Match(
2073
- re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v, &e);
2691
+ nogvl_set_match_arg arg;
2692
+ arg.set = s->set;
2693
+ arg.text = re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str));
2694
+ arg.v = &v;
2695
+ arg.error_info = &e;
2696
+ arg.matched = false;
2697
+
2698
+ #ifdef _WIN32
2699
+ nogvl_set_match(&arg);
2700
+ #else
2701
+ rb_thread_call_without_gvl(nogvl_set_match, &arg, NULL, NULL);
2702
+ #endif
2703
+ RB_GC_GUARD(str);
2704
+
2705
+ bool match_failed = !arg.matched;
2074
2706
  VALUE result = rb_ary_new2(v.size());
2075
2707
 
2076
2708
  if (match_failed) {
@@ -2087,8 +2719,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
2087
2719
  rb_raise(re2_eSetMatchError, "Unknown RE2::Set::ErrorKind: %d", e.kind);
2088
2720
  }
2089
2721
  } else {
2090
- for (std::vector<int>::size_type i = 0; i < v.size(); ++i) {
2091
- rb_ary_push(result, INT2FIX(v[i]));
2722
+ for (int index : v) {
2723
+ rb_ary_push(result, INT2FIX(index));
2092
2724
  }
2093
2725
  }
2094
2726
 
@@ -2097,13 +2729,27 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
2097
2729
  rb_raise(re2_eSetUnsupportedError, "current version of RE2::Set::Match() does not output error information, :exception option can only be set to false");
2098
2730
  #endif
2099
2731
  } else {
2100
- bool matched = s->set->Match(
2101
- re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v);
2732
+ nogvl_set_match_arg arg;
2733
+ arg.set = s->set;
2734
+ arg.text = re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str));
2735
+ arg.v = &v;
2736
+ #ifdef HAVE_ERROR_INFO_ARGUMENT
2737
+ arg.error_info = nullptr;
2738
+ #endif
2739
+ arg.matched = false;
2740
+
2741
+ #ifdef _WIN32
2742
+ nogvl_set_match(&arg);
2743
+ #else
2744
+ rb_thread_call_without_gvl(nogvl_set_match, &arg, NULL, NULL);
2745
+ #endif
2746
+ RB_GC_GUARD(str);
2747
+
2102
2748
  VALUE result = rb_ary_new2(v.size());
2103
2749
 
2104
- if (matched) {
2105
- for (std::vector<int>::size_type i = 0; i < v.size(); ++i) {
2106
- rb_ary_push(result, INT2FIX(v[i]));
2750
+ if (arg.matched) {
2751
+ for (int index : v) {
2752
+ rb_ary_push(result, INT2FIX(index));
2107
2753
  }
2108
2754
  }
2109
2755
 
@@ -2112,6 +2758,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
2112
2758
  }
2113
2759
 
2114
2760
  extern "C" void Init_re2(void) {
2761
+ rb_ext_ractor_safe(true);
2762
+
2115
2763
  re2_mRE2 = rb_define_module("RE2");
2116
2764
  re2_cRegexp = rb_define_class_under(re2_mRE2, "Regexp", rb_cObject);
2117
2765
  re2_eRegexpUnsupportedError = rb_define_class_under(re2_cRegexp,
@@ -2147,6 +2795,14 @@ extern "C" void Init_re2(void) {
2147
2795
  RUBY_METHOD_FUNC(re2_matchdata_begin), 1);
2148
2796
  rb_define_method(re2_cMatchData, "end",
2149
2797
  RUBY_METHOD_FUNC(re2_matchdata_end), 1);
2798
+ rb_define_method(re2_cMatchData, "pre_match",
2799
+ RUBY_METHOD_FUNC(re2_matchdata_pre_match), 0);
2800
+ rb_define_method(re2_cMatchData, "post_match",
2801
+ RUBY_METHOD_FUNC(re2_matchdata_post_match), 0);
2802
+ rb_define_method(re2_cMatchData, "offset",
2803
+ RUBY_METHOD_FUNC(re2_matchdata_offset), 1);
2804
+ rb_define_method(re2_cMatchData, "match_length",
2805
+ RUBY_METHOD_FUNC(re2_matchdata_match_length), 1);
2150
2806
  rb_define_method(re2_cMatchData, "[]", RUBY_METHOD_FUNC(re2_matchdata_aref),
2151
2807
  -1);
2152
2808
  rb_define_method(re2_cMatchData, "to_s",
@@ -2155,8 +2811,18 @@ extern "C" void Init_re2(void) {
2155
2811
  RUBY_METHOD_FUNC(re2_matchdata_inspect), 0);
2156
2812
  rb_define_method(re2_cMatchData, "deconstruct",
2157
2813
  RUBY_METHOD_FUNC(re2_matchdata_deconstruct), 0);
2814
+ rb_define_method(re2_cMatchData, "captures",
2815
+ RUBY_METHOD_FUNC(re2_matchdata_deconstruct), 0);
2816
+ rb_define_method(re2_cMatchData, "named_captures",
2817
+ RUBY_METHOD_FUNC(re2_matchdata_named_captures), -1);
2818
+ rb_define_method(re2_cMatchData, "names",
2819
+ RUBY_METHOD_FUNC(re2_matchdata_names), 0);
2820
+ rb_define_method(re2_cMatchData, "values_at",
2821
+ RUBY_METHOD_FUNC(re2_matchdata_values_at), -1);
2158
2822
  rb_define_method(re2_cMatchData, "deconstruct_keys",
2159
2823
  RUBY_METHOD_FUNC(re2_matchdata_deconstruct_keys), 1);
2824
+ rb_define_method(re2_cMatchData, "initialize_copy",
2825
+ RUBY_METHOD_FUNC(re2_matchdata_initialize_copy), 1);
2160
2826
 
2161
2827
  rb_define_method(re2_cScanner, "string",
2162
2828
  RUBY_METHOD_FUNC(re2_scanner_string), 0);
@@ -2168,11 +2834,15 @@ extern "C" void Init_re2(void) {
2168
2834
  RUBY_METHOD_FUNC(re2_scanner_scan), 0);
2169
2835
  rb_define_method(re2_cScanner, "rewind",
2170
2836
  RUBY_METHOD_FUNC(re2_scanner_rewind), 0);
2837
+ rb_define_method(re2_cScanner, "initialize_copy",
2838
+ RUBY_METHOD_FUNC(re2_scanner_initialize_copy), 1);
2171
2839
 
2172
2840
  rb_define_singleton_method(re2_cRegexp, "match_has_endpos_argument?",
2173
2841
  RUBY_METHOD_FUNC(re2_regexp_match_has_endpos_argument_p), 0);
2174
2842
  rb_define_method(re2_cRegexp, "initialize",
2175
2843
  RUBY_METHOD_FUNC(re2_regexp_initialize), -1);
2844
+ rb_define_method(re2_cRegexp, "initialize_copy",
2845
+ RUBY_METHOD_FUNC(re2_regexp_initialize_copy), 1);
2176
2846
  rb_define_method(re2_cRegexp, "ok?", RUBY_METHOD_FUNC(re2_regexp_ok), 0);
2177
2847
  rb_define_method(re2_cRegexp, "error", RUBY_METHOD_FUNC(re2_regexp_error),
2178
2848
  0);
@@ -2186,6 +2856,10 @@ extern "C" void Init_re2(void) {
2186
2856
  RUBY_METHOD_FUNC(re2_regexp_number_of_capturing_groups), 0);
2187
2857
  rb_define_method(re2_cRegexp, "named_capturing_groups",
2188
2858
  RUBY_METHOD_FUNC(re2_regexp_named_capturing_groups), 0);
2859
+ rb_define_method(re2_cRegexp, "named_captures",
2860
+ RUBY_METHOD_FUNC(re2_regexp_named_capturing_groups), 0);
2861
+ rb_define_method(re2_cRegexp, "names",
2862
+ RUBY_METHOD_FUNC(re2_regexp_names), 0);
2189
2863
  rb_define_method(re2_cRegexp, "match", RUBY_METHOD_FUNC(re2_regexp_match),
2190
2864
  -1);
2191
2865
  rb_define_method(re2_cRegexp, "match?", RUBY_METHOD_FUNC(re2_regexp_match_p),
@@ -2240,22 +2914,34 @@ extern "C" void Init_re2(void) {
2240
2914
  RUBY_METHOD_FUNC(re2_set_size_p), 0);
2241
2915
  rb_define_method(re2_cSet, "initialize",
2242
2916
  RUBY_METHOD_FUNC(re2_set_initialize), -1);
2917
+ rb_define_method(re2_cSet, "initialize_copy",
2918
+ RUBY_METHOD_FUNC(re2_set_initialize_copy), 1);
2243
2919
  rb_define_method(re2_cSet, "add", RUBY_METHOD_FUNC(re2_set_add), 1);
2244
2920
  rb_define_method(re2_cSet, "compile", RUBY_METHOD_FUNC(re2_set_compile), 0);
2245
2921
  rb_define_method(re2_cSet, "match", RUBY_METHOD_FUNC(re2_set_match), -1);
2246
2922
  rb_define_method(re2_cSet, "size", RUBY_METHOD_FUNC(re2_set_size), 0);
2247
2923
  rb_define_method(re2_cSet, "length", RUBY_METHOD_FUNC(re2_set_size), 0);
2248
2924
 
2925
+ rb_define_module_function(re2_mRE2, "replace",
2926
+ RUBY_METHOD_FUNC(re2_replace), 3);
2249
2927
  rb_define_module_function(re2_mRE2, "Replace",
2250
- RUBY_METHOD_FUNC(re2_Replace), 3);
2928
+ RUBY_METHOD_FUNC(re2_replace), 3);
2929
+ rb_define_module_function(re2_mRE2, "global_replace",
2930
+ RUBY_METHOD_FUNC(re2_global_replace), 3);
2251
2931
  rb_define_module_function(re2_mRE2, "GlobalReplace",
2252
- RUBY_METHOD_FUNC(re2_GlobalReplace), 3);
2932
+ RUBY_METHOD_FUNC(re2_global_replace), 3);
2933
+ rb_define_module_function(re2_mRE2, "extract",
2934
+ RUBY_METHOD_FUNC(re2_extract), 3);
2253
2935
  rb_define_module_function(re2_mRE2, "QuoteMeta",
2254
- RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
2936
+ RUBY_METHOD_FUNC(re2_escape), 1);
2937
+ rb_define_module_function(re2_mRE2, "escape",
2938
+ RUBY_METHOD_FUNC(re2_escape), 1);
2939
+ rb_define_module_function(re2_mRE2, "quote",
2940
+ RUBY_METHOD_FUNC(re2_escape), 1);
2255
2941
  rb_define_singleton_method(re2_cRegexp, "escape",
2256
- RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
2942
+ RUBY_METHOD_FUNC(re2_escape), 1);
2257
2943
  rb_define_singleton_method(re2_cRegexp, "quote",
2258
- RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
2944
+ RUBY_METHOD_FUNC(re2_escape), 1);
2259
2945
 
2260
2946
  // (see RE2::Regexp#initialize)
2261
2947
  rb_define_singleton_method(re2_cRegexp, "compile",
@@ -2283,4 +2969,5 @@ extern "C" void Init_re2(void) {
2283
2969
  id_submatches = rb_intern("submatches");
2284
2970
  id_startpos = rb_intern("startpos");
2285
2971
  id_endpos = rb_intern("endpos");
2972
+ id_symbolize_names = rb_intern("symbolize_names");
2286
2973
  }