re2 2.4.3 → 2.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/re2/re2.cc CHANGED
@@ -1,8 +1,10 @@
1
1
  /*
2
- * re2 (http://github.com/mudge/re2)
3
- * Ruby bindings to re2, an "efficient, principled regular expression library"
2
+ * re2 (https://github.com/mudge/re2)
3
+ * Ruby bindings to RE2, a "fast, safe, thread-friendly alternative to
4
+ * backtracking regular expression engines like those used in PCRE, Perl, and
5
+ * Python".
4
6
  *
5
- * Copyright (c) 2010-2014, Paul Mucur (http://mudge.name)
7
+ * Copyright (c) 2010, Paul Mucur (https://mudge.name)
6
8
  * Released under the BSD Licence, please see LICENSE.txt
7
9
  */
8
10
 
@@ -42,13 +44,14 @@ typedef struct {
42
44
  } re2_set;
43
45
 
44
46
  VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cScanner, re2_cSet,
45
- re2_eSetMatchError, re2_eSetUnsupportedError;
47
+ re2_eSetMatchError, re2_eSetUnsupportedError, re2_eRegexpUnsupportedError;
46
48
 
47
49
  /* Symbols used in RE2 options. */
48
50
  static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors,
49
51
  id_max_mem, id_literal, id_never_nl, id_case_sensitive,
50
- id_perl_classes, id_word_boundary, id_one_line,
51
- id_unanchored, id_anchor_start, id_anchor_both, id_exception;
52
+ id_perl_classes, id_word_boundary, id_one_line, id_unanchored,
53
+ id_anchor, id_anchor_start, id_anchor_both, id_exception,
54
+ id_submatches, id_startpos, id_endpos;
52
55
 
53
56
  inline VALUE encoded_str_new(const char *str, long length, RE2::Options::Encoding encoding) {
54
57
  if (encoding == RE2::Options::EncodingUTF8) {
@@ -122,7 +125,7 @@ static void parse_re2_options(RE2::Options* re2_options, const VALUE options) {
122
125
  }
123
126
  }
124
127
 
125
- /* For compatibility with ruby < 2.7 */
128
+ /* For compatibility with Ruby < 2.7 */
126
129
  #ifdef HAVE_RB_GC_MARK_MOVABLE
127
130
  #define re2_compact_callback(x) (x),
128
131
  #else
@@ -270,12 +273,14 @@ static VALUE re2_scanner_allocate(VALUE klass) {
270
273
  }
271
274
 
272
275
  /*
273
- * Returns a frozen copy of the string passed into +match+.
276
+ * Returns a frozen copy of the text supplied when matching.
274
277
  *
275
- * @return [String] a frozen copy of the passed string.
278
+ * If the text was already a frozen string, returns the original.
279
+ *
280
+ * @return [String] a frozen string with the text supplied when matching
276
281
  * @example
277
- * m = RE2::Regexp.new('(\d+)').match("bob 123")
278
- * m.string #=> "bob 123"
282
+ * m = RE2::Regexp.new('(\d+)').partial_match("bob 123")
283
+ * m.string #=> "bob 123"
279
284
  */
280
285
  static VALUE re2_matchdata_string(const VALUE self) {
281
286
  re2_matchdata *m;
@@ -285,9 +290,10 @@ static VALUE re2_matchdata_string(const VALUE self) {
285
290
  }
286
291
 
287
292
  /*
288
- * Returns the string passed into the scanner.
293
+ * Returns the text supplied when incrementally matching with
294
+ * {RE2::Regexp#scan}.
289
295
  *
290
- * @return [String] the original string.
296
+ * @return [String] the original string passed to {RE2::Regexp#scan}
291
297
  * @example
292
298
  * c = RE2::Regexp.new('(\d+)').scan("foo")
293
299
  * c.string #=> "foo"
@@ -300,9 +306,9 @@ static VALUE re2_scanner_string(const VALUE self) {
300
306
  }
301
307
 
302
308
  /*
303
- * Returns whether the scanner has consumed all input or not.
309
+ * Returns whether the {RE2::Scanner} has consumed all input or not.
304
310
  *
305
- * @return [Boolean] whether the scanner has consumed all input or not
311
+ * @return [Boolean] whether the {RE2::Scanner} has consumed all input or not
306
312
  * @example
307
313
  * c = RE2::Regexp.new('(\d+)').scan("foo")
308
314
  * c.eof? #=> true
@@ -315,7 +321,7 @@ static VALUE re2_scanner_eof(const VALUE self) {
315
321
  }
316
322
 
317
323
  /*
318
- * Rewind the scanner to the start of the string.
324
+ * Rewind the {RE2::Scanner} to the start of the string.
319
325
  *
320
326
  * @example
321
327
  * s = RE2::Regexp.new('(\d+)').scan("1 2 3")
@@ -330,21 +336,27 @@ static VALUE re2_scanner_rewind(VALUE self) {
330
336
  TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
331
337
 
332
338
  delete c->input;
333
- c->input = new(std::nothrow) re2::StringPiece(RSTRING_PTR(c->text));
339
+ c->input = new(std::nothrow) re2::StringPiece(
340
+ RSTRING_PTR(c->text), RSTRING_LEN(c->text));
334
341
  c->eof = false;
335
342
 
336
343
  return self;
337
344
  }
338
345
 
339
346
  /*
340
- * Scan the given text incrementally for matches, returning an array of
341
- * matches on each subsequent call. Returns nil if no matches are found.
347
+ * Scan the given text incrementally for matches using
348
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L447-L463
349
+ * `FindAndConsume`}, returning an array of submatches on each subsequent
350
+ * call. Returns `nil` if no matches are found or an empty array for every
351
+ * match if the pattern has no capturing groups.
342
352
  *
343
353
  * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
344
- * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
345
- * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
354
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
355
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
346
356
  *
347
- * @return [Array<String>] the matches.
357
+ * @return [Array<String>] if the pattern has capturing groups
358
+ * @return [[]] if the pattern does not have capturing groups
359
+ * @return [nil] if no matches are found
348
360
  * @example
349
361
  * s = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
350
362
  * s.scan #=> ["Foo"]
@@ -359,7 +371,7 @@ static VALUE re2_scanner_scan(VALUE self) {
359
371
 
360
372
  std::vector<RE2::Arg> argv(c->number_of_capturing_groups);
361
373
  std::vector<RE2::Arg*> args(c->number_of_capturing_groups);
362
- std::vector<std::string> matches(c->number_of_capturing_groups);
374
+ std::vector<re2::StringPiece> matches(c->number_of_capturing_groups);
363
375
 
364
376
  if (c->eof) {
365
377
  return Qnil;
@@ -403,9 +415,6 @@ static VALUE re2_scanner_scan(VALUE self) {
403
415
  }
404
416
  }
405
417
 
406
- /*
407
- * Retrieve a matchdata by index or name.
408
- */
409
418
  static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) {
410
419
  re2_matchdata *m;
411
420
  re2_pattern *p;
@@ -417,10 +426,20 @@ static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) {
417
426
 
418
427
  if (FIXNUM_P(idx)) {
419
428
  id = FIX2INT(idx);
429
+ } else if (SYMBOL_P(idx)) {
430
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
431
+ std::map<std::string, int>::const_iterator search = groups.find(rb_id2name(SYM2ID(idx)));
432
+
433
+ if (search != groups.end()) {
434
+ id = search->second;
435
+ } else {
436
+ return NULL;
437
+ }
420
438
  } else {
421
- const char *name = SYMBOL_P(idx) ? rb_id2name(SYM2ID(idx)) : StringValuePtr(idx);
439
+ StringValue(idx);
440
+
422
441
  const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
423
- std::map<std::string, int>::const_iterator search = groups.find(name);
442
+ std::map<std::string, int>::const_iterator search = groups.find(std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)));
424
443
 
425
444
  if (search != groups.end()) {
426
445
  id = search->second;
@@ -441,13 +460,14 @@ static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) {
441
460
  }
442
461
 
443
462
  /*
444
- * Returns the number of elements in the match array (including nils).
463
+ * Returns the number of elements in the {RE2::MatchData} (including the
464
+ * overall match, submatches and any `nils`).
445
465
  *
446
466
  * @return [Integer] the number of elements
447
467
  * @example
448
468
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
449
- * m.size #=> 2
450
- * m.length #=> 2
469
+ * m.size #=> 2
470
+ * m.length #=> 2
451
471
  */
452
472
  static VALUE re2_matchdata_size(const VALUE self) {
453
473
  re2_matchdata *m;
@@ -458,14 +478,15 @@ static VALUE re2_matchdata_size(const VALUE self) {
458
478
  }
459
479
 
460
480
  /*
461
- * Returns the offset of the start of the nth element of the matchdata.
481
+ * Returns the offset of the start of the nth element of the {RE2::MatchData}.
462
482
  *
463
- * @param [Integer, String, Symbol] n the name or number of the match
464
- * @return [Integer] the offset of the start of the match
483
+ * @param [Integer, String, Symbol] n the name or number of the submatch
484
+ * @return [Integer, nil] the offset of the start of the match or `nil` if
485
+ * there is no such submatch
465
486
  * @example
466
487
  * m = RE2::Regexp.new('ob (\d+)').match("bob 123")
467
- * m.begin(0) #=> 1
468
- * m.begin(1) #=> 4
488
+ * m.begin(0) #=> 1
489
+ * m.begin(1) #=> 4
469
490
  */
470
491
  static VALUE re2_matchdata_begin(const VALUE self, VALUE n) {
471
492
  re2_matchdata *m;
@@ -483,14 +504,16 @@ static VALUE re2_matchdata_begin(const VALUE self, VALUE n) {
483
504
  }
484
505
 
485
506
  /*
486
- * Returns the offset of the character following the end of the nth element of the matchdata.
507
+ * Returns the offset of the character following the end of the nth element of
508
+ * the {RE2::MatchData}.
487
509
  *
488
510
  * @param [Integer, String, Symbol] n the name or number of the match
489
- * @return [Integer] the offset of the character following the end of the match
511
+ * @return [Integer, nil] the offset of the character following the end of the
512
+ * match or `nil` if there is no such match
490
513
  * @example
491
514
  * m = RE2::Regexp.new('ob (\d+) b').match("bob 123 bob")
492
- * m.end(0) #=> 9
493
- * m.end(1) #=> 7
515
+ * m.end(0) #=> 9
516
+ * m.end(1) #=> 7
494
517
  */
495
518
  static VALUE re2_matchdata_end(const VALUE self, VALUE n) {
496
519
  re2_matchdata *m;
@@ -510,10 +533,10 @@ static VALUE re2_matchdata_end(const VALUE self, VALUE n) {
510
533
  /*
511
534
  * Returns the {RE2::Regexp} used in the match.
512
535
  *
513
- * @return [RE2::Regexp] the regexp used in the match
536
+ * @return [RE2::Regexp] the regular expression used in the match
514
537
  * @example
515
538
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
516
- * m.regexp #=> #<RE2::Regexp /(\d+)/>
539
+ * m.regexp #=> #<RE2::Regexp /(\d+)/>
517
540
  */
518
541
  static VALUE re2_matchdata_regexp(const VALUE self) {
519
542
  re2_matchdata *m;
@@ -523,12 +546,12 @@ static VALUE re2_matchdata_regexp(const VALUE self) {
523
546
  }
524
547
 
525
548
  /*
526
- * Returns the {RE2::Regexp} used in the scanner.
549
+ * Returns the {RE2::Regexp} used in the {RE2::Scanner}.
527
550
  *
528
- * @return [RE2::Regexp] the regexp used in the scanner
551
+ * @return [RE2::Regexp] the regular expression used in the {RE2::Scanner}
529
552
  * @example
530
553
  * c = RE2::Regexp.new('(\d+)').scan("bob 123")
531
- * c.regexp #=> #<RE2::Regexp /(\d+)/>
554
+ * c.regexp #=> #<RE2::Regexp /(\d+)/>
532
555
  */
533
556
  static VALUE re2_scanner_regexp(const VALUE self) {
534
557
  re2_scanner *c;
@@ -544,16 +567,17 @@ static VALUE re2_regexp_allocate(VALUE klass) {
544
567
  }
545
568
 
546
569
  /*
547
- * Returns the array of matches.
570
+ * Returns the array of matches including the overall match, submatches and any
571
+ * `nil`s.
548
572
  *
549
573
  * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
550
- * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
551
- * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
574
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
575
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
552
576
  *
553
577
  * @return [Array<String, nil>] the array of matches
554
578
  * @example
555
579
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
556
- * m.to_a #=> ["123", "123"]
580
+ * m.to_a #=> ["123", "123"]
557
581
  */
558
582
  static VALUE re2_matchdata_to_a(const VALUE self) {
559
583
  re2_matchdata *m;
@@ -598,7 +622,7 @@ static VALUE re2_matchdata_nth_match(int nth, const VALUE self) {
598
622
  }
599
623
  }
600
624
 
601
- static VALUE re2_matchdata_named_match(const char* name, const VALUE self) {
625
+ static VALUE re2_matchdata_named_match(const std::string &name, const VALUE self) {
602
626
  re2_matchdata *m;
603
627
  re2_pattern *p;
604
628
 
@@ -619,19 +643,17 @@ static VALUE re2_matchdata_named_match(const char* name, const VALUE self) {
619
643
  * Retrieve zero, one or more matches by index or name.
620
644
  *
621
645
  * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
622
- * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
623
- * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
624
- *
625
- * @return [Array<String, nil>, String, Boolean]
646
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
647
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
626
648
  *
627
649
  * @overload [](index)
628
650
  * Access a particular match by index.
629
651
  *
630
652
  * @param [Integer] index the index of the match to fetch
631
- * @return [String, nil] the specified match
653
+ * @return [String, nil] the specified match or `nil` if it isn't present
632
654
  * @example
633
655
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
634
- * m[0] #=> "123"
656
+ * m[0] #=> "123"
635
657
  *
636
658
  * @overload [](start, length)
637
659
  * Access a range of matches by starting index and length.
@@ -641,7 +663,7 @@ static VALUE re2_matchdata_named_match(const char* name, const VALUE self) {
641
663
  * @return [Array<String, nil>] the specified matches
642
664
  * @example
643
665
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
644
- * m[0, 1] #=> ["123"]
666
+ * m[0, 1] #=> ["123"]
645
667
  *
646
668
  * @overload [](range)
647
669
  * Access a range of matches by index.
@@ -650,13 +672,13 @@ static VALUE re2_matchdata_named_match(const char* name, const VALUE self) {
650
672
  * @return [Array<String, nil>] the specified matches
651
673
  * @example
652
674
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
653
- * m[0..1] #=> "[123", "123"]
675
+ * m[0..1] #=> ["123", "123"]
654
676
  *
655
677
  * @overload [](name)
656
678
  * Access a particular match by name.
657
679
  *
658
680
  * @param [String, Symbol] name the name of the match to fetch
659
- * @return [String, nil] the specific match
681
+ * @return [String, nil] the specific match or `nil` if it isn't present
660
682
  * @example
661
683
  * m = RE2::Regexp.new('(?P<number>\d+)').match("bob 123")
662
684
  * m["number"] #=> "123"
@@ -667,7 +689,8 @@ static VALUE re2_matchdata_aref(int argc, VALUE *argv, const VALUE self) {
667
689
  rb_scan_args(argc, argv, "11", &idx, &rest);
668
690
 
669
691
  if (TYPE(idx) == T_STRING) {
670
- return re2_matchdata_named_match(RSTRING_PTR(idx), self);
692
+ return re2_matchdata_named_match(
693
+ std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)), self);
671
694
  } else if (SYMBOL_P(idx)) {
672
695
  return re2_matchdata_named_match(rb_id2name(SYM2ID(idx)), self);
673
696
  } else if (!NIL_P(rest) || !FIXNUM_P(idx) || FIX2INT(idx) < 0) {
@@ -690,13 +713,13 @@ static VALUE re2_matchdata_to_s(const VALUE self) {
690
713
  * Returns a printable version of the match.
691
714
  *
692
715
  * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
693
- * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
694
- * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
716
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
717
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
695
718
  *
696
719
  * @return [String] a printable version of the match
697
720
  * @example
698
721
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
699
- * m.inspect #=> "#<RE2::MatchData \"123\" 1:\"123\">"
722
+ * m.inspect #=> "#<RE2::MatchData \"123\" 1:\"123\">"
700
723
  */
701
724
  static VALUE re2_matchdata_inspect(const VALUE self) {
702
725
  re2_matchdata *m;
@@ -720,7 +743,9 @@ static VALUE re2_matchdata_inspect(const VALUE self) {
720
743
  if (match == Qnil) {
721
744
  output << "nil";
722
745
  } else {
723
- output << "\"" << RSTRING_PTR(match) << "\"";
746
+ output << "\"";
747
+ output.write(RSTRING_PTR(match), RSTRING_LEN(match));
748
+ output << "\"";
724
749
  }
725
750
  }
726
751
 
@@ -734,13 +759,14 @@ static VALUE re2_matchdata_inspect(const VALUE self) {
734
759
  * Returns the array of submatches for pattern matching.
735
760
  *
736
761
  * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
737
- * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
738
- * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
762
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
763
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is
764
+ * undefined).
739
765
  *
740
766
  * @return [Array<String, nil>] the array of submatches
741
767
  * @example
742
768
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
743
- * m.deconstruct #=> ["123"]
769
+ * m.deconstruct #=> ["123"]
744
770
  *
745
771
  * @example pattern matching
746
772
  * case RE2::Regexp.new('(\d+) (\d+)').match("bob 123 456")
@@ -780,17 +806,18 @@ static VALUE re2_matchdata_deconstruct(const VALUE self) {
780
806
  * order but an invalid name will cause the hash to be immediately returned.
781
807
  *
782
808
  * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
783
- * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
784
- * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
809
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
810
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
785
811
  *
786
812
  * @return [Hash] a hash of capturing group names to submatches
787
- * @param [Array<Symbol>, nil] keys an array of Symbol capturing group names or nil to return all names
813
+ * @param [Array<Symbol>, nil] keys an array of `Symbol` capturing group names
814
+ * or `nil` to return all names
788
815
  * @example
789
816
  * m = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
790
- * m.deconstruct_keys(nil) #=> {:numbers => "123", :letters => "abc"}
791
- * m.deconstruct_keys([:numbers]) #=> {:numbers => "123"}
792
- * m.deconstruct_keys([:fruit]) #=> {}
793
- * m.deconstruct_keys([:letters, :fruit]) #=> {:letters => "abc"}
817
+ * m.deconstruct_keys(nil) #=> {numbers: "123", letters: "abc"}
818
+ * m.deconstruct_keys([:numbers]) #=> {numbers: "123"}
819
+ * m.deconstruct_keys([:fruit]) #=> {}
820
+ * m.deconstruct_keys([:letters, :fruit]) #=> {letters: "abc"}
794
821
  *
795
822
  * @example pattern matching
796
823
  * case RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
@@ -839,11 +866,9 @@ static VALUE re2_matchdata_deconstruct_keys(const VALUE self, const VALUE keys)
839
866
  }
840
867
 
841
868
  /*
842
- * Returns a new RE2 object with a compiled version of
843
- * +pattern+ stored inside. Equivalent to +RE2::Regexp.new+.
869
+ * Shorthand to compile a new {RE2::Regexp}.
844
870
  *
845
871
  * @see RE2::Regexp#initialize
846
- *
847
872
  */
848
873
  static VALUE re2_re2(int argc, VALUE *argv, VALUE) {
849
874
  return rb_class_new_instance(argc, argv, re2_cRegexp);
@@ -851,22 +876,21 @@ static VALUE re2_re2(int argc, VALUE *argv, VALUE) {
851
876
 
852
877
  /*
853
878
  * Returns a new {RE2::Regexp} object with a compiled version of
854
- * +pattern+ stored inside.
855
- *
856
- * @return [RE2::Regexp]
879
+ * `pattern` stored inside.
857
880
  *
858
881
  * @overload initialize(pattern)
859
882
  * Returns a new {RE2::Regexp} object with a compiled version of
860
- * +pattern+ stored inside with the default options.
883
+ * `pattern` stored inside with the default options.
861
884
  *
862
885
  * @param [String] pattern the pattern to compile
863
- * @return [RE2::Regexp] an RE2::Regexp with the specified pattern
886
+ * @return [RE2::Regexp] a {RE2::Regexp} with the specified pattern
887
+ * @raise [TypeError] if the given pattern can't be coerced to a `String`
864
888
  * @raise [NoMemoryError] if memory could not be allocated for the compiled
865
- * pattern
889
+ * pattern
866
890
  *
867
891
  * @overload initialize(pattern, options)
868
892
  * Returns a new {RE2::Regexp} object with a compiled version of
869
- * +pattern+ stored inside with the specified options.
893
+ * `pattern` stored inside with the specified options.
870
894
  *
871
895
  * @param [String] pattern the pattern to compile
872
896
  * @param [Hash] options the options with which to compile the pattern
@@ -876,12 +900,13 @@ static VALUE re2_re2(int argc, VALUE *argv, VALUE) {
876
900
  * @option options [Boolean] :log_errors (true) log syntax and execution errors to ERROR
877
901
  * @option options [Integer] :max_mem approx. max memory footprint of RE2
878
902
  * @option options [Boolean] :literal (false) interpret string as literal, not regexp
879
- * @option options [Boolean] :never_nl (false) never match \n, even if it is in regexp
880
- * @option options [Boolean] :case_sensitive (true) match is case-sensitive (regexp can override with (?i) unless in posix_syntax mode)
881
- * @option options [Boolean] :perl_classes (false) allow Perl's \d \s \w \D \S \W when in posix_syntax mode
882
- * @option options [Boolean] :word_boundary (false) allow \b \B (word boundary and not) when in posix_syntax mode
883
- * @option options [Boolean] :one_line (false) ^ and $ only match beginning and end of text when in posix_syntax mode
884
- * @return [RE2::Regexp] an RE2::Regexp with the specified pattern and options
903
+ * @option options [Boolean] :never_nl (false) never match `\n`, even if it is in regexp
904
+ * @option options [Boolean] :case_sensitive (true) match is case-sensitive (regexp can override with `(?i)` unless in `posix_syntax` mode)
905
+ * @option options [Boolean] :perl_classes (false) allow Perl's `\d` `\s` `\w` `\D` `\S` `\W` when in `posix_syntax` mode
906
+ * @option options [Boolean] :word_boundary (false) allow `\b` `\B` (word boundary and not) when in `posix_syntax` mode
907
+ * @option options [Boolean] :one_line (false) `^` and `$` only match beginning and end of text when in `posix_syntax` mode
908
+ * @return [RE2::Regexp] a {RE2::Regexp} with the specified pattern and options
909
+ * @raise [TypeError] if the given pattern can't be coerced to a `String`
885
910
  * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
886
911
  */
887
912
  static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
@@ -899,9 +924,11 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
899
924
  RE2::Options re2_options;
900
925
  parse_re2_options(&re2_options, options);
901
926
 
902
- p->pattern = new(std::nothrow) RE2(RSTRING_PTR(pattern), re2_options);
927
+ p->pattern = new(std::nothrow) RE2(
928
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), re2_options);
903
929
  } else {
904
- p->pattern = new(std::nothrow) RE2(RSTRING_PTR(pattern));
930
+ p->pattern = new(std::nothrow) RE2(
931
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)));
905
932
  }
906
933
 
907
934
  if (p->pattern == 0) {
@@ -912,16 +939,17 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
912
939
  }
913
940
 
914
941
  /*
915
- * Returns a printable version of the regular expression +re2+.
942
+ * Returns a printable version of the regular expression.
916
943
  *
917
944
  * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
918
- * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
919
- * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
945
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
946
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is
947
+ * undefined).
920
948
  *
921
949
  * @return [String] a printable version of the regular expression
922
950
  * @example
923
951
  * re2 = RE2::Regexp.new("woo?")
924
- * re2.inspect #=> "#<RE2::Regexp /woo?/>"
952
+ * re2.inspect #=> "#<RE2::Regexp /woo?/>"
925
953
  */
926
954
  static VALUE re2_regexp_inspect(const VALUE self) {
927
955
  re2_pattern *p;
@@ -937,16 +965,16 @@ static VALUE re2_regexp_inspect(const VALUE self) {
937
965
  }
938
966
 
939
967
  /*
940
- * Returns a string version of the regular expression +re2+.
968
+ * Returns a string version of the regular expression.
941
969
  *
942
970
  * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
943
- * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
944
- * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
971
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
972
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
945
973
  *
946
974
  * @return [String] a string version of the regular expression
947
975
  * @example
948
976
  * re2 = RE2::Regexp.new("woo?")
949
- * re2.to_s #=> "woo?"
977
+ * re2.to_s #=> "woo?"
950
978
  */
951
979
  static VALUE re2_regexp_to_s(const VALUE self) {
952
980
  re2_pattern *p;
@@ -958,13 +986,12 @@ static VALUE re2_regexp_to_s(const VALUE self) {
958
986
  }
959
987
 
960
988
  /*
961
- * Returns whether or not the regular expression +re2+
962
- * was compiled successfully or not.
989
+ * Returns whether or not the regular expression was compiled successfully.
963
990
  *
964
991
  * @return [Boolean] whether or not compilation was successful
965
992
  * @example
966
993
  * re2 = RE2::Regexp.new("woo?")
967
- * re2.ok? #=> true
994
+ * re2.ok? #=> true
968
995
  */
969
996
  static VALUE re2_regexp_ok(const VALUE self) {
970
997
  re2_pattern *p;
@@ -974,13 +1001,13 @@ static VALUE re2_regexp_ok(const VALUE self) {
974
1001
  }
975
1002
 
976
1003
  /*
977
- * Returns whether or not the regular expression +re2+
978
- * was compiled with the utf8 option set to true.
1004
+ * Returns whether or not the regular expression was compiled with the `utf8`
1005
+ * option set to `true`.
979
1006
  *
980
- * @return [Boolean] the utf8 option
1007
+ * @return [Boolean] the `utf8` option
981
1008
  * @example
982
- * re2 = RE2::Regexp.new("woo?", :utf8 => true)
983
- * re2.utf8? #=> true
1009
+ * re2 = RE2::Regexp.new("woo?", utf8: true)
1010
+ * re2.utf8? #=> true
984
1011
  */
985
1012
  static VALUE re2_regexp_utf8(const VALUE self) {
986
1013
  re2_pattern *p;
@@ -990,13 +1017,13 @@ static VALUE re2_regexp_utf8(const VALUE self) {
990
1017
  }
991
1018
 
992
1019
  /*
993
- * Returns whether or not the regular expression +re2+
994
- * was compiled with the posix_syntax option set to true.
1020
+ * Returns whether or not the regular expression was compiled with the
1021
+ * `posix_syntax` option set to `true`.
995
1022
  *
996
- * @return [Boolean] the posix_syntax option
1023
+ * @return [Boolean] the `posix_syntax` option
997
1024
  * @example
998
- * re2 = RE2::Regexp.new("woo?", :posix_syntax => true)
999
- * re2.posix_syntax? #=> true
1025
+ * re2 = RE2::Regexp.new("woo?", posix_syntax: true)
1026
+ * re2.posix_syntax? #=> true
1000
1027
  */
1001
1028
  static VALUE re2_regexp_posix_syntax(const VALUE self) {
1002
1029
  re2_pattern *p;
@@ -1006,13 +1033,13 @@ static VALUE re2_regexp_posix_syntax(const VALUE self) {
1006
1033
  }
1007
1034
 
1008
1035
  /*
1009
- * Returns whether or not the regular expression +re2+
1010
- * was compiled with the longest_match option set to true.
1036
+ * Returns whether or not the regular expression was compiled with the
1037
+ * `longest_match` option set to `true`.
1011
1038
  *
1012
- * @return [Boolean] the longest_match option
1039
+ * @return [Boolean] the `longest_match` option
1013
1040
  * @example
1014
- * re2 = RE2::Regexp.new("woo?", :longest_match => true)
1015
- * re2.longest_match? #=> true
1041
+ * re2 = RE2::Regexp.new("woo?", longest_match: true)
1042
+ * re2.longest_match? #=> true
1016
1043
  */
1017
1044
  static VALUE re2_regexp_longest_match(const VALUE self) {
1018
1045
  re2_pattern *p;
@@ -1022,13 +1049,13 @@ static VALUE re2_regexp_longest_match(const VALUE self) {
1022
1049
  }
1023
1050
 
1024
1051
  /*
1025
- * Returns whether or not the regular expression +re2+
1026
- * was compiled with the log_errors option set to true.
1052
+ * Returns whether or not the regular expression was compiled with the
1053
+ * `log_errors` option set to `true`.
1027
1054
  *
1028
- * @return [Boolean] the log_errors option
1055
+ * @return [Boolean] the `log_errors` option
1029
1056
  * @example
1030
- * re2 = RE2::Regexp.new("woo?", :log_errors => true)
1031
- * re2.log_errors? #=> true
1057
+ * re2 = RE2::Regexp.new("woo?", log_errors: true)
1058
+ * re2.log_errors? #=> true
1032
1059
  */
1033
1060
  static VALUE re2_regexp_log_errors(const VALUE self) {
1034
1061
  re2_pattern *p;
@@ -1038,13 +1065,12 @@ static VALUE re2_regexp_log_errors(const VALUE self) {
1038
1065
  }
1039
1066
 
1040
1067
  /*
1041
- * Returns the max_mem setting for the regular expression
1042
- * +re2+.
1068
+ * Returns the `max_mem` setting for the regular expression.
1043
1069
  *
1044
- * @return [Integer] the max_mem option
1070
+ * @return [Integer] the `max_mem` option
1045
1071
  * @example
1046
- * re2 = RE2::Regexp.new("woo?", :max_mem => 1024)
1047
- * re2.max_mem #=> 1024
1072
+ * re2 = RE2::Regexp.new("woo?", max_mem: 1024)
1073
+ * re2.max_mem #=> 1024
1048
1074
  */
1049
1075
  static VALUE re2_regexp_max_mem(const VALUE self) {
1050
1076
  re2_pattern *p;
@@ -1054,13 +1080,13 @@ static VALUE re2_regexp_max_mem(const VALUE self) {
1054
1080
  }
1055
1081
 
1056
1082
  /*
1057
- * Returns whether or not the regular expression +re2+
1058
- * was compiled with the literal option set to true.
1083
+ * Returns whether or not the regular expression was compiled with the
1084
+ * `literal` option set to `true`.
1059
1085
  *
1060
- * @return [Boolean] the literal option
1086
+ * @return [Boolean] the `literal` option
1061
1087
  * @example
1062
- * re2 = RE2::Regexp.new("woo?", :literal => true)
1063
- * re2.literal? #=> true
1088
+ * re2 = RE2::Regexp.new("woo?", literal: true)
1089
+ * re2.literal? #=> true
1064
1090
  */
1065
1091
  static VALUE re2_regexp_literal(const VALUE self) {
1066
1092
  re2_pattern *p;
@@ -1070,13 +1096,13 @@ static VALUE re2_regexp_literal(const VALUE self) {
1070
1096
  }
1071
1097
 
1072
1098
  /*
1073
- * Returns whether or not the regular expression +re2+
1074
- * was compiled with the never_nl option set to true.
1099
+ * Returns whether or not the regular expression was compiled with the
1100
+ * `never_nl` option set to `true`.
1075
1101
  *
1076
- * @return [Boolean] the never_nl option
1102
+ * @return [Boolean] the `never_nl` option
1077
1103
  * @example
1078
- * re2 = RE2::Regexp.new("woo?", :never_nl => true)
1079
- * re2.never_nl? #=> true
1104
+ * re2 = RE2::Regexp.new("woo?", never_nl: true)
1105
+ * re2.never_nl? #=> true
1080
1106
  */
1081
1107
  static VALUE re2_regexp_never_nl(const VALUE self) {
1082
1108
  re2_pattern *p;
@@ -1086,13 +1112,13 @@ static VALUE re2_regexp_never_nl(const VALUE self) {
1086
1112
  }
1087
1113
 
1088
1114
  /*
1089
- * Returns whether or not the regular expression +re2+
1090
- * was compiled with the case_sensitive option set to true.
1115
+ * Returns whether or not the regular expression was compiled with the
1116
+ * `case_sensitive` option set to `true`.
1091
1117
  *
1092
- * @return [Boolean] the case_sensitive option
1118
+ * @return [Boolean] the `case_sensitive` option
1093
1119
  * @example
1094
- * re2 = RE2::Regexp.new("woo?", :case_sensitive => true)
1095
- * re2.case_sensitive? #=> true
1120
+ * re2 = RE2::Regexp.new("woo?", case_sensitive: true)
1121
+ * re2.case_sensitive? #=> true
1096
1122
  */
1097
1123
  static VALUE re2_regexp_case_sensitive(const VALUE self) {
1098
1124
  re2_pattern *p;
@@ -1102,27 +1128,27 @@ static VALUE re2_regexp_case_sensitive(const VALUE self) {
1102
1128
  }
1103
1129
 
1104
1130
  /*
1105
- * Returns whether or not the regular expression +re2+
1106
- * was compiled with the case_sensitive option set to false.
1131
+ * Returns whether or not the regular expression was compiled with the
1132
+ * `case_sensitive` option set to `false`.
1107
1133
  *
1108
- * @return [Boolean] the inverse of the case_sensitive option
1134
+ * @return [Boolean] the inverse of the `case_sensitive` option
1109
1135
  * @example
1110
- * re2 = RE2::Regexp.new("woo?", :case_sensitive => true)
1111
- * re2.case_insensitive? #=> false
1112
- * re2.casefold? #=> false
1136
+ * re2 = RE2::Regexp.new("woo?", case_sensitive: true)
1137
+ * re2.case_insensitive? #=> false
1138
+ * re2.casefold? #=> false
1113
1139
  */
1114
1140
  static VALUE re2_regexp_case_insensitive(const VALUE self) {
1115
1141
  return BOOL2RUBY(re2_regexp_case_sensitive(self) != Qtrue);
1116
1142
  }
1117
1143
 
1118
1144
  /*
1119
- * Returns whether or not the regular expression +re2+
1120
- * was compiled with the perl_classes option set to true.
1145
+ * Returns whether or not the regular expression was compiled with the
1146
+ * `perl_classes` option set to `true`.
1121
1147
  *
1122
- * @return [Boolean] the perl_classes option
1148
+ * @return [Boolean] the `perl_classes` option
1123
1149
  * @example
1124
- * re2 = RE2::Regexp.new("woo?", :perl_classes => true)
1125
- * re2.perl_classes? #=> true
1150
+ * re2 = RE2::Regexp.new("woo?", perl_classes: true)
1151
+ * re2.perl_classes? #=> true
1126
1152
  */
1127
1153
  static VALUE re2_regexp_perl_classes(const VALUE self) {
1128
1154
  re2_pattern *p;
@@ -1132,13 +1158,13 @@ static VALUE re2_regexp_perl_classes(const VALUE self) {
1132
1158
  }
1133
1159
 
1134
1160
  /*
1135
- * Returns whether or not the regular expression +re2+
1136
- * was compiled with the word_boundary option set to true.
1161
+ * Returns whether or not the regular expression was compiled with the
1162
+ * `word_boundary` option set to `true`.
1137
1163
  *
1138
- * @return [Boolean] the word_boundary option
1164
+ * @return [Boolean] the `word_boundary` option
1139
1165
  * @example
1140
- * re2 = RE2::Regexp.new("woo?", :word_boundary => true)
1141
- * re2.word_boundary? #=> true
1166
+ * re2 = RE2::Regexp.new("woo?", word_boundary: true)
1167
+ * re2.word_boundary? #=> true
1142
1168
  */
1143
1169
  static VALUE re2_regexp_word_boundary(const VALUE self) {
1144
1170
  re2_pattern *p;
@@ -1148,13 +1174,13 @@ static VALUE re2_regexp_word_boundary(const VALUE self) {
1148
1174
  }
1149
1175
 
1150
1176
  /*
1151
- * Returns whether or not the regular expression +re2+
1152
- * was compiled with the one_line option set to true.
1177
+ * Returns whether or not the regular expression was compiled with the
1178
+ * `one_line` option set to `true`.
1153
1179
  *
1154
- * @return [Boolean] the one_line option
1180
+ * @return [Boolean] the `one_line` option
1155
1181
  * @example
1156
- * re2 = RE2::Regexp.new("woo?", :one_line => true)
1157
- * re2.one_line? #=> true
1182
+ * re2 = RE2::Regexp.new("woo?", one_line: true)
1183
+ * re2.one_line? #=> true
1158
1184
  */
1159
1185
  static VALUE re2_regexp_one_line(const VALUE self) {
1160
1186
  re2_pattern *p;
@@ -1164,10 +1190,10 @@ static VALUE re2_regexp_one_line(const VALUE self) {
1164
1190
  }
1165
1191
 
1166
1192
  /*
1167
- * If the RE2 could not be created properly, returns an
1168
- * error string otherwise returns nil.
1193
+ * If the {RE2::Regexp} could not be created properly, returns an error string,
1194
+ * otherwise returns `nil`.
1169
1195
  *
1170
- * @return [String, nil] the error string or nil
1196
+ * @return [String, nil] the error string or `nil`
1171
1197
  */
1172
1198
  static VALUE re2_regexp_error(const VALUE self) {
1173
1199
  re2_pattern *p;
@@ -1181,14 +1207,14 @@ static VALUE re2_regexp_error(const VALUE self) {
1181
1207
  }
1182
1208
 
1183
1209
  /*
1184
- * If the RE2 could not be created properly, returns
1185
- * the offending portion of the regexp otherwise returns nil.
1210
+ * If the {RE2::Regexp} could not be created properly, returns
1211
+ * the offending portion of the regexp, otherwise returns `nil`.
1186
1212
  *
1187
1213
  * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1188
- * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1189
- * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1214
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1215
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1190
1216
  *
1191
- * @return [String, nil] the offending portion of the regexp or nil
1217
+ * @return [String, nil] the offending portion of the regexp or `nil`
1192
1218
  */
1193
1219
  static VALUE re2_regexp_error_arg(const VALUE self) {
1194
1220
  re2_pattern *p;
@@ -1218,8 +1244,7 @@ static VALUE re2_regexp_program_size(const VALUE self) {
1218
1244
  }
1219
1245
 
1220
1246
  /*
1221
- * Returns a hash of the options currently set for
1222
- * +re2+.
1247
+ * Returns a hash of the options currently set for the {RE2::Regexp}.
1223
1248
  *
1224
1249
  * @return [Hash] the options
1225
1250
  */
@@ -1270,8 +1295,8 @@ static VALUE re2_regexp_options(const VALUE self) {
1270
1295
 
1271
1296
  /*
1272
1297
  * Returns the number of capturing subpatterns, or -1 if the regexp
1273
- * wasn't valid on construction. The overall match ($0) does not
1274
- * count: if the regexp is "(a)(b)", returns 2.
1298
+ * wasn't valid on construction. The overall match (`$0`) does not
1299
+ * count: if the regexp is `"(a)(b)"`, returns 2.
1275
1300
  *
1276
1301
  * @return [Integer] the number of capturing subpatterns
1277
1302
  */
@@ -1286,8 +1311,8 @@ static VALUE re2_regexp_number_of_capturing_groups(const VALUE self) {
1286
1311
  * Returns a hash of names to capturing indices of groups.
1287
1312
  *
1288
1313
  * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1289
- * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1290
- * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1314
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1315
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1291
1316
  *
1292
1317
  * @return [Hash] a hash of names to capturing indices
1293
1318
  */
@@ -1309,63 +1334,93 @@ static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
1309
1334
  }
1310
1335
 
1311
1336
  /*
1312
- * Match the pattern against the given +text+ and return either
1313
- * a boolean (if no submatches are required) or a {RE2::MatchData}
1314
- * instance.
1337
+ * General matching: match the pattern against the given `text` using
1338
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L562-L588
1339
+ * `Match`} and return a {RE2::MatchData} instance with the specified number of
1340
+ * submatches (defaults to the total number of capturing groups) or a boolean
1341
+ * (if no submatches are required).
1315
1342
  *
1316
- * @return [Boolean, RE2::MatchData]
1343
+ * The number of submatches has a significant impact on performance: requesting
1344
+ * one submatch is much faster than requesting more than one and requesting
1345
+ * zero submatches is faster still.
1317
1346
  *
1318
1347
  * @overload match(text)
1319
- * Returns an {RE2::MatchData} containing the matching pattern and all
1320
- * subpatterns resulting from looking for the regexp in +text+ if the pattern
1348
+ * Returns a {RE2::MatchData} containing the matching pattern and all
1349
+ * submatches resulting from looking for the regexp in `text` if the pattern
1321
1350
  * contains capturing groups.
1322
1351
  *
1323
- * Returns either true or false indicating whether a successful match was
1352
+ * Returns either `true` or `false` indicating whether a successful match was
1324
1353
  * made if the pattern contains no capturing groups.
1325
1354
  *
1326
1355
  * @param [String] text the text to search
1327
- * @return [RE2::MatchData] if the pattern contains capturing groups
1356
+ * @return [RE2::MatchData, nil] if the pattern contains capturing groups
1328
1357
  * @return [Boolean] if the pattern does not contain capturing groups
1329
- * @raise [NoMemoryError] if there was not enough memory to allocate the matches
1358
+ * @raise [NoMemoryError] if there was not enough memory to allocate the submatches
1359
+ * @raise [TypeError] if given text that cannot be coerced to a `String`
1330
1360
  * @example Matching with capturing groups
1331
1361
  * r = RE2::Regexp.new('w(o)(o)')
1332
- * r.match('woo') #=> #<RE2::MatchData "woo" 1:"o" 2:"o">
1362
+ * r.match('woo') #=> #<RE2::MatchData "woo" 1:"o" 2:"o">
1333
1363
  * @example Matching without capturing groups
1334
1364
  * r = RE2::Regexp.new('woo')
1335
- * r.match('woo') #=> true
1365
+ * r.match('woo') #=> true
1336
1366
  *
1337
- * @overload match(text, 0)
1338
- * Returns either true or false indicating whether a
1339
- * successful match was made.
1367
+ * @overload match(text, options)
1368
+ * See `match(text)` but with customisable offsets for starting and ending
1369
+ * matches, optional anchoring to the start or both ends of the text and a
1370
+ * specific number of submatches to extract (padded with `nil`s if
1371
+ * necessary).
1340
1372
  *
1341
1373
  * @param [String] text the text to search
1342
- * @return [Boolean] whether the match was successful
1374
+ * @param [Hash] options the options with which to perform the match
1375
+ * @option options [Integer] :startpos (0) offset at which to start matching
1376
+ * @option options [Integer] :endpos offset at which to stop matching, defaults to the text length
1377
+ * @option options [Symbol] :anchor (:unanchored) one of `:unanchored`, `:anchor_start`, `:anchor_both` to anchor the match
1378
+ * @option options [Integer] :submatches how many submatches to extract (0 is
1379
+ * fastest), defaults to the number of capturing groups
1380
+ * @return [RE2::MatchData, nil] if extracting any submatches
1381
+ * @return [Boolean] if not extracting any submatches
1382
+ * @raise [ArgumentError] if given a negative number of submatches, invalid
1383
+ * anchor or invalid startpos, endpos pair
1343
1384
  * @raise [NoMemoryError] if there was not enough memory to allocate the matches
1344
- * @example
1385
+ * @raise [TypeError] if given non-String text, non-numeric number of
1386
+ * submatches, non-symbol anchor or non-hash options
1387
+ * @raise [RE2::Regexp::UnsupportedError] if given an endpos argument on a
1388
+ * version of RE2 that does not support it
1389
+ * @example Matching with capturing groups
1345
1390
  * r = RE2::Regexp.new('w(o)(o)')
1346
- * r.match('woo', 0) #=> true
1347
- * r.match('bob', 0) #=> false
1391
+ * r.match('woo', submatches: 1) #=> #<RE2::MatchData "woo" 1:"o">
1392
+ * r.match('woo', submatches: 3) #=> #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
1393
+ * r.match('woot', anchor: :anchor_both, submatches: 0)
1394
+ * #=> false
1395
+ * r.match('woot', anchor: :anchor_start, submatches: 0)
1396
+ * #=> true
1397
+ * @example Matching without capturing groups
1398
+ * r = RE2::Regexp.new('wo+')
1399
+ * r.match('woot', anchor: :anchor_both) #=> false
1400
+ * r.match('woot', anchor: :anchor_start) #=> true
1348
1401
  *
1349
- * @overload match(text, number_of_matches)
1350
- * See +match(text)+ but with a specific number of
1351
- * matches returned (padded with nils if necessary).
1402
+ * @overload match(text, submatches)
1403
+ * @deprecated Legacy syntax for matching against `text` with a specific
1404
+ * number of submatches to extract. Use `match(text, submatches: n)` instead.
1352
1405
  *
1353
1406
  * @param [String] text the text to search
1354
- * @param [Integer] number_of_matches the number of matches to return
1355
- * @return [RE2::MatchData] the matches
1356
- * @raise [ArgumentError] if given a negative number of matches
1357
- * @raise [NoMemoryError] if there was not enough memory to allocate the matches
1407
+ * @param [Integer] submatches the number of submatches to extract
1408
+ * @return [RE2::MatchData, nil] if extracting any submatches
1409
+ * @return [Boolean] if not extracting any submatches
1410
+ * @raise [NoMemoryError] if there was not enough memory to allocate the submatches
1411
+ * @raise [TypeError] if given non-numeric number of submatches
1358
1412
  * @example
1359
1413
  * r = RE2::Regexp.new('w(o)(o)')
1414
+ * r.match('woo', 0) #=> true
1360
1415
  * r.match('woo', 1) #=> #<RE2::MatchData "woo" 1:"o">
1361
- * r.match('woo', 3) #=> #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
1416
+ * r.match('woo', 2) #=> #<RE2::MatchData "woo" 1:"o" 2:"o">
1362
1417
  */
1363
1418
  static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1364
1419
  re2_pattern *p;
1365
1420
  re2_matchdata *m;
1366
- VALUE text, number_of_matches;
1421
+ VALUE text, options;
1367
1422
 
1368
- rb_scan_args(argc, argv, "11", &text, &number_of_matches);
1423
+ rb_scan_args(argc, argv, "11", &text, &options);
1369
1424
 
1370
1425
  /* Ensure text is a string. */
1371
1426
  StringValue(text);
@@ -1373,12 +1428,80 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1373
1428
  TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1374
1429
 
1375
1430
  int n;
1431
+ int startpos = 0;
1432
+ int endpos = RSTRING_LEN(text);
1433
+ RE2::Anchor anchor = RE2::UNANCHORED;
1434
+
1435
+ if (RTEST(options)) {
1436
+ if (FIXNUM_P(options)) {
1437
+ n = NUM2INT(options);
1438
+
1439
+ if (n < 0) {
1440
+ rb_raise(rb_eArgError, "number of matches should be >= 0");
1441
+ }
1442
+ } else {
1443
+ if (TYPE(options) != T_HASH) {
1444
+ options = rb_Hash(options);
1445
+ }
1446
+
1447
+ VALUE endpos_option = rb_hash_aref(options, ID2SYM(id_endpos));
1448
+ if (!NIL_P(endpos_option)) {
1449
+ #ifdef HAVE_ENDPOS_ARGUMENT
1450
+ Check_Type(endpos_option, T_FIXNUM);
1451
+
1452
+ endpos = NUM2INT(endpos_option);
1453
+
1454
+ if (endpos < 0) {
1455
+ rb_raise(rb_eArgError, "endpos should be >= 0");
1456
+ }
1457
+ #else
1458
+ rb_raise(re2_eRegexpUnsupportedError, "current version of RE2::Match() does not support endpos argument");
1459
+ #endif
1460
+ }
1461
+
1462
+ VALUE anchor_option = rb_hash_aref(options, ID2SYM(id_anchor));
1463
+ if (!NIL_P(anchor_option)) {
1464
+ Check_Type(anchor_option, T_SYMBOL);
1465
+
1466
+ ID id_anchor_option = SYM2ID(anchor_option);
1467
+ if (id_anchor_option == id_unanchored) {
1468
+ anchor = RE2::UNANCHORED;
1469
+ } else if (id_anchor_option == id_anchor_start) {
1470
+ anchor = RE2::ANCHOR_START;
1471
+ } else if (id_anchor_option == id_anchor_both) {
1472
+ anchor = RE2::ANCHOR_BOTH;
1473
+ } else {
1474
+ rb_raise(rb_eArgError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both");
1475
+ }
1476
+ }
1477
+
1478
+ VALUE submatches_option = rb_hash_aref(options, ID2SYM(id_submatches));
1479
+ if (!NIL_P(submatches_option)) {
1480
+ Check_Type(submatches_option, T_FIXNUM);
1481
+
1482
+ n = NUM2INT(submatches_option);
1376
1483
 
1377
- if (RTEST(number_of_matches)) {
1378
- n = NUM2INT(number_of_matches);
1484
+ if (n < 0) {
1485
+ rb_raise(rb_eArgError, "number of matches should be >= 0");
1486
+ }
1487
+ } else {
1488
+ if (!p->pattern->ok()) {
1489
+ return Qnil;
1490
+ }
1491
+
1492
+ n = p->pattern->NumberOfCapturingGroups();
1493
+ }
1494
+
1495
+ VALUE startpos_option = rb_hash_aref(options, ID2SYM(id_startpos));
1496
+ if (!NIL_P(startpos_option)) {
1497
+ Check_Type(startpos_option, T_FIXNUM);
1379
1498
 
1380
- if (n < 0) {
1381
- rb_raise(rb_eArgError, "number of matches should be >= 0");
1499
+ startpos = NUM2INT(startpos_option);
1500
+
1501
+ if (startpos < 0) {
1502
+ rb_raise(rb_eArgError, "startpos should be >= 0");
1503
+ }
1504
+ }
1382
1505
  }
1383
1506
  } else {
1384
1507
  if (!p->pattern->ok()) {
@@ -1388,13 +1511,19 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1388
1511
  n = p->pattern->NumberOfCapturingGroups();
1389
1512
  }
1390
1513
 
1514
+ if (startpos > endpos) {
1515
+ rb_raise(rb_eArgError, "startpos should be <= endpos");
1516
+ }
1517
+
1391
1518
  if (n == 0) {
1392
1519
  #ifdef HAVE_ENDPOS_ARGUMENT
1393
- bool matched = p->pattern->Match(RSTRING_PTR(text), 0,
1394
- RSTRING_LEN(text), RE2::UNANCHORED, 0, 0);
1520
+ bool matched = p->pattern->Match(
1521
+ re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
1522
+ startpos, endpos, anchor, 0, 0);
1395
1523
  #else
1396
- bool matched = p->pattern->Match(RSTRING_PTR(text), 0, RE2::UNANCHORED,
1397
- 0, 0);
1524
+ bool matched = p->pattern->Match(
1525
+ re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
1526
+ startpos, anchor, 0, 0);
1398
1527
  #endif
1399
1528
  return BOOL2RUBY(matched);
1400
1529
  } else {
@@ -1418,11 +1547,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1418
1547
  m->number_of_matches = n;
1419
1548
 
1420
1549
  #ifdef HAVE_ENDPOS_ARGUMENT
1421
- bool matched = p->pattern->Match(RSTRING_PTR(m->text), 0,
1422
- RSTRING_LEN(m->text), RE2::UNANCHORED, m->matches, n);
1550
+ bool matched = p->pattern->Match(
1551
+ re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)),
1552
+ startpos, endpos, anchor, m->matches, n);
1423
1553
  #else
1424
- bool matched = p->pattern->Match(RSTRING_PTR(m->text), 0,
1425
- RE2::UNANCHORED, m->matches, n);
1554
+ bool matched = p->pattern->Match(
1555
+ re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)),
1556
+ startpos, anchor, m->matches, n);
1426
1557
  #endif
1427
1558
  if (matched) {
1428
1559
  return matchdata;
@@ -1433,22 +1564,56 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1433
1564
  }
1434
1565
 
1435
1566
  /*
1436
- * Returns true or false to indicate a successful match.
1437
- * Equivalent to +re2.match(text, 0)+.
1567
+ * Returns true if the pattern matches any substring of the given text using
1568
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L413-L427
1569
+ * `PartialMatch`}.
1438
1570
  *
1439
1571
  * @return [Boolean] whether the match was successful
1572
+ * @raise [TypeError] if text cannot be coerced to a `String`
1440
1573
  */
1441
1574
  static VALUE re2_regexp_match_p(const VALUE self, VALUE text) {
1442
- VALUE argv[2] = { text, INT2FIX(0) };
1575
+ re2_pattern *p;
1576
+
1577
+ /* Ensure text is a string. */
1578
+ StringValue(text);
1579
+
1580
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1581
+
1582
+ return BOOL2RUBY(RE2::PartialMatch(
1583
+ re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
1584
+ }
1585
+
1586
+ /*
1587
+ * Returns true if the pattern matches the given text using
1588
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L376-L411
1589
+ * `FullMatch`}.
1590
+ *
1591
+ * @return [Boolean] whether the match was successful
1592
+ * @raise [TypeError] if text cannot be coerced to a `String`
1593
+ */
1594
+ static VALUE re2_regexp_full_match_p(const VALUE self, VALUE text) {
1595
+ re2_pattern *p;
1596
+
1597
+ /* Ensure text is a string. */
1598
+ StringValue(text);
1443
1599
 
1444
- return re2_regexp_match(2, argv, self);
1600
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1601
+
1602
+ return BOOL2RUBY(RE2::FullMatch(
1603
+ re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
1445
1604
  }
1446
1605
 
1447
1606
  /*
1448
- * Returns a {RE2::Scanner} for scanning the given text incrementally.
1607
+ * Returns a {RE2::Scanner} for scanning the given text incrementally with
1608
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L447-L463
1609
+ * `FindAndConsume`}.
1449
1610
  *
1611
+ * @param [String] text the text to scan incrementally
1612
+ * @return [RE2::Scanner] an `Enumerable` {RE2::Scanner} object
1613
+ * @raise [TypeError] if `text` cannot be coerced to a `String`
1450
1614
  * @example
1451
1615
  * c = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
1616
+ * #=> #<RE2::Scanner:0x0000000000000001>
1452
1617
  */
1453
1618
  static VALUE re2_regexp_scan(const VALUE self, VALUE text) {
1454
1619
  /* Ensure text is a string. */
@@ -1461,7 +1626,8 @@ static VALUE re2_regexp_scan(const VALUE self, VALUE text) {
1461
1626
  VALUE scanner = rb_class_new_instance(0, 0, re2_cScanner);
1462
1627
  TypedData_Get_Struct(scanner, re2_scanner, &re2_scanner_data_type, c);
1463
1628
 
1464
- c->input = new(std::nothrow) re2::StringPiece(RSTRING_PTR(text));
1629
+ c->input = new(std::nothrow) re2::StringPiece(
1630
+ RSTRING_PTR(text), RSTRING_LEN(text));
1465
1631
  RB_OBJ_WRITE(scanner, &c->regexp, self);
1466
1632
  RB_OBJ_WRITE(scanner, &c->text, text);
1467
1633
 
@@ -1477,17 +1643,40 @@ static VALUE re2_regexp_scan(const VALUE self, VALUE text) {
1477
1643
  }
1478
1644
 
1479
1645
  /*
1480
- * Returns a copy of +str+ with the first occurrence +pattern+
1481
- * replaced with +rewrite+.
1646
+ * Returns whether the underlying RE2 version supports passing an `endpos`
1647
+ * argument to
1648
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L562-L588
1649
+ * `Match`}. If not, {RE2::Regexp#match} will raise an error if attempting to
1650
+ * pass an `endpos`.
1651
+ *
1652
+ * @return [Boolean] whether the underlying
1653
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L562-L588
1654
+ * `Match`} has an `endpos` argument
1655
+ */
1656
+ static VALUE re2_regexp_match_has_endpos_argument_p(VALUE) {
1657
+ #ifdef HAVE_ENDPOS_ARGUMENT
1658
+ return Qtrue;
1659
+ #else
1660
+ return Qfalse;
1661
+ #endif
1662
+ }
1663
+
1664
+ /*
1665
+ * Returns a copy of `str` with the first occurrence of `pattern` replaced with
1666
+ * `rewrite` using
1667
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L465-L480
1668
+ * `Replace`}.
1482
1669
  *
1483
1670
  * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1484
- * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1485
- * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1671
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1672
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1486
1673
  *
1487
1674
  * @param [String] str the string to modify
1488
1675
  * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
1489
1676
  * @param [String] rewrite the string to replace with
1490
1677
  * @return [String] the resulting string
1678
+ * @raise [TypeError] if the given rewrite or pattern (if not provided as a
1679
+ * {RE2::Regexp}) cannot be coerced to `String`s
1491
1680
  * @example
1492
1681
  * RE2.Replace("hello there", "hello", "howdy") #=> "howdy there"
1493
1682
  * re2 = RE2::Regexp.new("hel+o")
@@ -1503,12 +1692,14 @@ static VALUE re2_Replace(VALUE, VALUE str, VALUE pattern,
1503
1692
  /* Take a copy of str so it can be modified in-place by
1504
1693
  * RE2::Replace.
1505
1694
  */
1506
- std::string str_as_string(StringValuePtr(str));
1695
+ StringValue(str);
1696
+ std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str));
1507
1697
 
1508
1698
  /* Do the replacement. */
1509
1699
  if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1510
1700
  TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
1511
- RE2::Replace(&str_as_string, *p->pattern, RSTRING_PTR(rewrite));
1701
+ RE2::Replace(&str_as_string, *p->pattern,
1702
+ re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
1512
1703
 
1513
1704
  return encoded_str_new(str_as_string.data(), str_as_string.size(),
1514
1705
  p->pattern->options().encoding());
@@ -1516,27 +1707,33 @@ static VALUE re2_Replace(VALUE, VALUE str, VALUE pattern,
1516
1707
  /* Ensure pattern is a string. */
1517
1708
  StringValue(pattern);
1518
1709
 
1519
- RE2::Replace(&str_as_string, RSTRING_PTR(pattern), RSTRING_PTR(rewrite));
1710
+ RE2::Replace(&str_as_string,
1711
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
1712
+ re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
1520
1713
 
1521
1714
  return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
1522
1715
  }
1523
1716
  }
1524
1717
 
1525
1718
  /*
1526
- * Return a copy of +str+ with +pattern+ replaced by +rewrite+.
1719
+ * Return a copy of `str` with `pattern` replaced by `rewrite` using
1720
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L482-L497
1721
+ * `GlobalReplace`}.
1527
1722
  *
1528
1723
  * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1529
- * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1530
- * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1724
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1725
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1531
1726
  *
1532
1727
  * @param [String] str the string to modify
1533
1728
  * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
1534
1729
  * @param [String] rewrite the string to replace with
1730
+ * @raise [TypeError] if the given rewrite or pattern (if not provided as a
1731
+ * {RE2::Regexp}) cannot be coerced to `String`s
1535
1732
  * @return [String] the resulting string
1536
1733
  * @example
1537
1734
  * re2 = RE2::Regexp.new("oo?")
1538
- * RE2.GlobalReplace("whoops-doops", re2, "e") #=> "wheps-deps"
1539
- * RE2.GlobalReplace("hello there", "e", "i") #=> "hillo thiri"
1735
+ * RE2.GlobalReplace("whoops-doops", re2, "e") #=> "wheps-deps"
1736
+ * RE2.GlobalReplace("hello there", "e", "i") #=> "hillo thiri"
1540
1737
  */
1541
1738
  static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
1542
1739
  VALUE rewrite) {
@@ -1547,12 +1744,14 @@ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
1547
1744
  * RE2::GlobalReplace.
1548
1745
  */
1549
1746
  re2_pattern *p;
1550
- std::string str_as_string(StringValuePtr(str));
1747
+ StringValue(str);
1748
+ std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str));
1551
1749
 
1552
1750
  /* Do the replacement. */
1553
1751
  if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1554
1752
  TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
1555
- RE2::GlobalReplace(&str_as_string, *p->pattern, RSTRING_PTR(rewrite));
1753
+ RE2::GlobalReplace(&str_as_string, *p->pattern,
1754
+ re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
1556
1755
 
1557
1756
  return encoded_str_new(str_as_string.data(), str_as_string.size(),
1558
1757
  p->pattern->options().encoding());
@@ -1560,27 +1759,32 @@ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
1560
1759
  /* Ensure pattern is a string. */
1561
1760
  StringValue(pattern);
1562
1761
 
1563
- RE2::GlobalReplace(&str_as_string, RSTRING_PTR(pattern),
1564
- RSTRING_PTR(rewrite));
1762
+ RE2::GlobalReplace(&str_as_string,
1763
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
1764
+ re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
1565
1765
 
1566
1766
  return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
1567
1767
  }
1568
1768
  }
1569
1769
 
1570
1770
  /*
1571
- * Returns a version of str with all potentially meaningful regexp
1572
- * characters escaped. The returned string, used as a regular
1573
- * expression, will exactly match the original string.
1771
+ * Returns a version of `str` with all potentially meaningful regexp characters
1772
+ * escaped using
1773
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L512-L518
1774
+ * `QuoteMeta`}. The returned string, used as a regular expression, will
1775
+ * exactly match the original string.
1574
1776
  *
1575
1777
  * @param [String] unquoted the unquoted string
1778
+ * @raise [TypeError] if the given unquoted string cannot be coerced to a `String`
1576
1779
  * @return [String] the escaped string
1577
1780
  * @example
1578
- * RE2::Regexp.escape("1.5-2.0?") #=> "1\.5\-2\.0\?"
1781
+ * RE2::Regexp.escape("1.5-2.0?") #=> "1\.5\-2\.0\?"
1579
1782
  */
1580
1783
  static VALUE re2_QuoteMeta(VALUE, VALUE unquoted) {
1581
1784
  StringValue(unquoted);
1582
1785
 
1583
- std::string quoted_string = RE2::QuoteMeta(RSTRING_PTR(unquoted));
1786
+ std::string quoted_string = RE2::QuoteMeta(
1787
+ re2::StringPiece(RSTRING_PTR(unquoted), RSTRING_LEN(unquoted)));
1584
1788
 
1585
1789
  return rb_str_new(quoted_string.data(), quoted_string.size());
1586
1790
  }
@@ -1641,14 +1845,14 @@ static VALUE re2_set_allocate(VALUE klass) {
1641
1845
  * Returns a new {RE2::Set} object for the specified anchor with the default
1642
1846
  * options.
1643
1847
  *
1644
- * @param [Symbol] anchor One of :unanchored, :anchor_start, :anchor_both
1645
- * @raise [ArgumentError] if anchor is not :unanchored, :anchor_start or :anchor_both
1848
+ * @param [Symbol] anchor one of `:unanchored`, `:anchor_start`, `:anchor_both`
1849
+ * @raise [ArgumentError] if anchor is not `:unanchored`, `:anchor_start` or `:anchor_both`
1646
1850
  * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
1647
1851
  *
1648
1852
  * @overload initialize(anchor, options)
1649
1853
  * Returns a new {RE2::Set} object with the specified options.
1650
1854
  *
1651
- * @param [Symbol] anchor One of :unanchored, :anchor_start, :anchor_both
1855
+ * @param [Symbol] anchor one of `:unanchored`, `:anchor_start`, `:anchor_both`
1652
1856
  * @param [Hash] options the options with which to compile the pattern
1653
1857
  * @option options [Boolean] :utf8 (true) text and pattern are UTF-8; otherwise Latin-1
1654
1858
  * @option options [Boolean] :posix_syntax (false) restrict regexps to POSIX egrep syntax
@@ -1656,13 +1860,13 @@ static VALUE re2_set_allocate(VALUE klass) {
1656
1860
  * @option options [Boolean] :log_errors (true) log syntax and execution errors to ERROR
1657
1861
  * @option options [Integer] :max_mem approx. max memory footprint of RE2
1658
1862
  * @option options [Boolean] :literal (false) interpret string as literal, not regexp
1659
- * @option options [Boolean] :never_nl (false) never match \n, even if it is in regexp
1660
- * @option options [Boolean] :case_sensitive (true) match is case-sensitive (regexp can override with (?i) unless in posix_syntax mode)
1661
- * @option options [Boolean] :perl_classes (false) allow Perl's \d \s \w \D \S \W when in posix_syntax mode
1662
- * @option options [Boolean] :word_boundary (false) allow \b \B (word boundary and not) when in posix_syntax mode
1663
- * @option options [Boolean] :one_line (false) ^ and $ only match beginning and end of text when in posix_syntax mode
1664
- * @return [RE2::Set] an RE2::Set with the specified anchor and options
1665
- * @raise [ArgumentError] if anchor is not one of the accepted choices
1863
+ * @option options [Boolean] :never_nl (false) never match `\n`, even if it is in regexp
1864
+ * @option options [Boolean] :case_sensitive (true) match is case-sensitive (regexp can override with `(?i)` unless in `posix_syntax` mode)
1865
+ * @option options [Boolean] :perl_classes (false) allow Perl's `\d` `\s` `\w` `\D` `\S` `\W` when in `posix_syntax` mode
1866
+ * @option options [Boolean] :word_boundary (false) allow `\b` `\B` (word boundary and not) when in `posix_syntax` mode
1867
+ * @option options [Boolean] :one_line (false) `^` and `$` only match beginning and end of text when in `posix_syntax` mode
1868
+ * @return [RE2::Set] a {RE2::Set} with the specified anchor and options
1869
+ * @raise [ArgumentError] if `anchor` is not one of the accepted choices
1666
1870
  * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
1667
1871
  */
1668
1872
  static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
@@ -1676,12 +1880,12 @@ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1676
1880
 
1677
1881
  if (!NIL_P(anchor)) {
1678
1882
  Check_Type(anchor, T_SYMBOL);
1679
- ID id_anchor = SYM2ID(anchor);
1680
- if (id_anchor == id_unanchored) {
1883
+ ID id_anchor_arg = SYM2ID(anchor);
1884
+ if (id_anchor_arg == id_unanchored) {
1681
1885
  re2_anchor = RE2::UNANCHORED;
1682
- } else if (id_anchor == id_anchor_start) {
1886
+ } else if (id_anchor_arg == id_anchor_start) {
1683
1887
  re2_anchor = RE2::ANCHOR_START;
1684
- } else if (id_anchor == id_anchor_both) {
1888
+ } else if (id_anchor_arg == id_anchor_both) {
1685
1889
  re2_anchor = RE2::ANCHOR_BOTH;
1686
1890
  } else {
1687
1891
  rb_raise(rb_eArgError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both");
@@ -1704,15 +1908,16 @@ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1704
1908
 
1705
1909
  /*
1706
1910
  * Adds a pattern to the set. Returns the index that will identify the pattern
1707
- * in the output of #match. Cannot be called after #compile has been called.
1911
+ * in the output of {RE2::Set#match}. Cannot be called after {RE2::Set#compile}
1912
+ * has been called.
1708
1913
  *
1709
1914
  * @param [String] pattern the regex pattern
1710
1915
  * @return [Integer] the index of the pattern in the set
1711
1916
  * @raise [ArgumentError] if called after compile or the pattern is rejected
1712
1917
  * @example
1713
1918
  * set = RE2::Set.new
1714
- * set.add("abc") #=> 0
1715
- * set.add("def") #=> 1
1919
+ * set.add("abc") #=> 0
1920
+ * set.add("def") #=> 1
1716
1921
  */
1717
1922
  static VALUE re2_set_add(VALUE self, VALUE pattern) {
1718
1923
  StringValue(pattern);
@@ -1728,7 +1933,8 @@ static VALUE re2_set_add(VALUE self, VALUE pattern) {
1728
1933
 
1729
1934
  {
1730
1935
  std::string err;
1731
- index = s->set->Add(RSTRING_PTR(pattern), &err);
1936
+ index = s->set->Add(
1937
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), &err);
1732
1938
  strlcpy(msg, err.c_str(), sizeof(msg));
1733
1939
  }
1734
1940
 
@@ -1740,14 +1946,14 @@ static VALUE re2_set_add(VALUE self, VALUE pattern) {
1740
1946
  }
1741
1947
 
1742
1948
  /*
1743
- * Compiles a Set so it can be used to match against. Must be called after #add
1744
- * and before #match.
1949
+ * Compiles a {RE2::Set} so it can be used to match against. Must be called
1950
+ * after {RE2::Set#add} and before {RE2::Set#match}.
1745
1951
  *
1746
- * @return [Bool] whether compilation was a success
1952
+ * @return [Boolean] whether compilation was a success
1747
1953
  * @example
1748
1954
  * set = RE2::Set.new
1749
1955
  * set.add("abc")
1750
- * set.compile # => true
1956
+ * set.compile #=> true
1751
1957
  */
1752
1958
  static VALUE re2_set_compile(VALUE self) {
1753
1959
  re2_set *s;
@@ -1757,11 +1963,12 @@ static VALUE re2_set_compile(VALUE self) {
1757
1963
  }
1758
1964
 
1759
1965
  /*
1760
- * Returns whether the underlying re2 version outputs error information from
1761
- * RE2::Set::Match. If not, #match will raise an error if attempting to set its
1762
- * :exception option to true.
1966
+ * Returns whether the underlying RE2 version outputs error information from
1967
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/set.h#L62-L65
1968
+ * `RE2::Set::Match`}. If not, {RE2::Set#match} will raise an error if attempting to set
1969
+ * its `:exception` option to `true`.
1763
1970
  *
1764
- * @return [Bool] whether the underlying re2 outputs error information from Set matches
1971
+ * @return [Boolean] whether the underlying RE2 outputs error information from {RE2::Set} matches
1765
1972
  */
1766
1973
  static VALUE re2_set_match_raises_errors_p(VALUE) {
1767
1974
  #ifdef HAVE_ERROR_INFO_ARGUMENT
@@ -1785,31 +1992,31 @@ static VALUE re2_set_match_raises_errors_p(VALUE) {
1785
1992
  * @param [String] str the text to match against
1786
1993
  * @return [Array<Integer>] the indices of matching regexps
1787
1994
  * @raise [MatchError] if an error occurs while matching
1788
- * @raise [UnsupportedError] if the underlying version of re2 does not output error information
1995
+ * @raise [UnsupportedError] if the underlying version of RE2 does not output error information
1789
1996
  * @example
1790
1997
  * set = RE2::Set.new
1791
1998
  * set.add("abc")
1792
1999
  * set.add("def")
1793
2000
  * set.compile
1794
- * set.match("abcdef") # => [0, 1]
2001
+ * set.match("abcdef") #=> [0, 1]
1795
2002
  *
1796
2003
  * @overload match(str, options)
1797
2004
  * Returns an array of integer indices of patterns matching the given string
1798
2005
  * (if any). Raises exceptions if there are any errors while matching and the
1799
- * :exception option is set to true.
2006
+ * `:exception` option is set to true.
1800
2007
  *
1801
2008
  * @param [String] str the text to match against
1802
2009
  * @param [Hash] options the options with which to match
1803
- * @option options [Boolean] :exception (true) whether to raise exceptions with re2's error information (not supported on ABI version 0 of re2)
2010
+ * @option options [Boolean] :exception (true) whether to raise exceptions with RE2's error information (not supported on ABI version 0 of RE2)
1804
2011
  * @return [Array<Integer>] the indices of matching regexps
1805
2012
  * @raise [MatchError] if an error occurs while matching
1806
- * @raise [UnsupportedError] if the underlying version of re2 does not output error information
2013
+ * @raise [UnsupportedError] if the underlying version of RE2 does not output error information
1807
2014
  * @example
1808
2015
  * set = RE2::Set.new
1809
2016
  * set.add("abc")
1810
2017
  * set.add("def")
1811
2018
  * set.compile
1812
- * set.match("abcdef", :exception => true) # => [0, 1]
2019
+ * set.match("abcdef", exception: true) #=> [0, 1]
1813
2020
  */
1814
2021
  static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
1815
2022
  VALUE str, options;
@@ -1834,7 +2041,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
1834
2041
  if (raise_exception) {
1835
2042
  #ifdef HAVE_ERROR_INFO_ARGUMENT
1836
2043
  RE2::Set::ErrorInfo e;
1837
- bool match_failed = !s->set->Match(RSTRING_PTR(str), &v, &e);
2044
+ bool match_failed = !s->set->Match(
2045
+ re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v, &e);
1838
2046
  VALUE result = rb_ary_new2(v.size());
1839
2047
 
1840
2048
  if (match_failed) {
@@ -1861,7 +2069,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
1861
2069
  rb_raise(re2_eSetUnsupportedError, "current version of RE2::Set::Match() does not output error information, :exception option can only be set to false");
1862
2070
  #endif
1863
2071
  } else {
1864
- bool matched = s->set->Match(RSTRING_PTR(str), &v);
2072
+ bool matched = s->set->Match(
2073
+ re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v);
1865
2074
  VALUE result = rb_ary_new2(v.size());
1866
2075
 
1867
2076
  if (matched) {
@@ -1877,6 +2086,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
1877
2086
  extern "C" void Init_re2(void) {
1878
2087
  re2_mRE2 = rb_define_module("RE2");
1879
2088
  re2_cRegexp = rb_define_class_under(re2_mRE2, "Regexp", rb_cObject);
2089
+ re2_eRegexpUnsupportedError = rb_define_class_under(re2_cRegexp,
2090
+ "UnsupportedError", rb_const_get(rb_cObject, rb_intern("StandardError")));
1880
2091
  re2_cMatchData = rb_define_class_under(re2_mRE2, "MatchData", rb_cObject);
1881
2092
  re2_cScanner = rb_define_class_under(re2_mRE2, "Scanner", rb_cObject);
1882
2093
  re2_cSet = rb_define_class_under(re2_mRE2, "Set", rb_cObject);
@@ -1930,6 +2141,8 @@ extern "C" void Init_re2(void) {
1930
2141
  rb_define_method(re2_cScanner, "rewind",
1931
2142
  RUBY_METHOD_FUNC(re2_scanner_rewind), 0);
1932
2143
 
2144
+ rb_define_singleton_method(re2_cRegexp, "match_has_endpos_argument?",
2145
+ RUBY_METHOD_FUNC(re2_regexp_match_has_endpos_argument_p), 0);
1933
2146
  rb_define_method(re2_cRegexp, "initialize",
1934
2147
  RUBY_METHOD_FUNC(re2_regexp_initialize), -1);
1935
2148
  rb_define_method(re2_cRegexp, "ok?", RUBY_METHOD_FUNC(re2_regexp_ok), 0);
@@ -1947,12 +2160,14 @@ extern "C" void Init_re2(void) {
1947
2160
  RUBY_METHOD_FUNC(re2_regexp_named_capturing_groups), 0);
1948
2161
  rb_define_method(re2_cRegexp, "match", RUBY_METHOD_FUNC(re2_regexp_match),
1949
2162
  -1);
1950
- rb_define_method(re2_cRegexp, "match?",
1951
- RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
1952
- rb_define_method(re2_cRegexp, "=~",
1953
- RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
1954
- rb_define_method(re2_cRegexp, "===",
2163
+ rb_define_method(re2_cRegexp, "match?", RUBY_METHOD_FUNC(re2_regexp_match_p),
2164
+ 1);
2165
+ rb_define_method(re2_cRegexp, "partial_match?",
1955
2166
  RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
2167
+ rb_define_method(re2_cRegexp, "=~", RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
2168
+ rb_define_method(re2_cRegexp, "===", RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
2169
+ rb_define_method(re2_cRegexp, "full_match?",
2170
+ RUBY_METHOD_FUNC(re2_regexp_full_match_p), 1);
1956
2171
  rb_define_method(re2_cRegexp, "scan",
1957
2172
  RUBY_METHOD_FUNC(re2_regexp_scan), 1);
1958
2173
  rb_define_method(re2_cRegexp, "to_s", RUBY_METHOD_FUNC(re2_regexp_to_s), 0);
@@ -2009,6 +2224,8 @@ extern "C" void Init_re2(void) {
2009
2224
  RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
2010
2225
  rb_define_singleton_method(re2_cRegexp, "quote",
2011
2226
  RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
2227
+
2228
+ // (see RE2::Regexp#initialize)
2012
2229
  rb_define_singleton_method(re2_cRegexp, "compile",
2013
2230
  RUBY_METHOD_FUNC(rb_class_new_instance), -1);
2014
2231
 
@@ -2027,7 +2244,11 @@ extern "C" void Init_re2(void) {
2027
2244
  id_word_boundary = rb_intern("word_boundary");
2028
2245
  id_one_line = rb_intern("one_line");
2029
2246
  id_unanchored = rb_intern("unanchored");
2247
+ id_anchor = rb_intern("anchor");
2030
2248
  id_anchor_start = rb_intern("anchor_start");
2031
2249
  id_anchor_both = rb_intern("anchor_both");
2032
2250
  id_exception = rb_intern("exception");
2251
+ id_submatches = rb_intern("submatches");
2252
+ id_startpos = rb_intern("startpos");
2253
+ id_endpos = rb_intern("endpos");
2033
2254
  }