re2 2.15.0.rc1-x86-linux-gnu

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/re2/re2.cc ADDED
@@ -0,0 +1,2254 @@
1
+ /*
2
+ * re2 (https://github.com/mudge/re2)
3
+ * Ruby bindings to RE2, a "fast, safe, thread-friendly alternative to
4
+ * backtracking regular expression engines like those used in PCRE, Perl, and
5
+ * Python".
6
+ *
7
+ * Copyright (c) 2010, Paul Mucur (https://mudge.name)
8
+ * Released under the BSD Licence, please see LICENSE.txt
9
+ */
10
+
11
+ #include <stdint.h>
12
+
13
+ #include <map>
14
+ #include <sstream>
15
+ #include <string>
16
+ #include <vector>
17
+
18
+ #include <re2/re2.h>
19
+ #include <re2/set.h>
20
+ #include <ruby.h>
21
+ #include <ruby/encoding.h>
22
+
23
/*
 * Convert a C/C++ truth value into Ruby's true/false. The argument is
 * parenthesised so that compound expressions (e.g. `a || b`, `x = y`) bind
 * as a whole before the conditional operator is applied.
 */
#define BOOL2RUBY(v) ((v) ? Qtrue : Qfalse)
24
+
25
+ typedef struct {
26
+ RE2 *pattern;
27
+ } re2_pattern;
28
+
29
+ typedef struct {
30
+ re2::StringPiece *matches;
31
+ int number_of_matches;
32
+ VALUE regexp, text;
33
+ } re2_matchdata;
34
+
35
+ typedef struct {
36
+ re2::StringPiece *input;
37
+ int number_of_capturing_groups;
38
+ bool eof;
39
+ VALUE regexp, text;
40
+ } re2_scanner;
41
+
42
+ typedef struct {
43
+ RE2::Set *set;
44
+ } re2_set;
45
+
46
+ VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cScanner, re2_cSet,
47
+ re2_eSetMatchError, re2_eSetUnsupportedError, re2_eRegexpUnsupportedError;
48
+
49
+ /* Symbols used in RE2 options. */
50
+ static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors,
51
+ id_max_mem, id_literal, id_never_nl, id_case_sensitive,
52
+ id_perl_classes, id_word_boundary, id_one_line, id_unanchored,
53
+ id_anchor, id_anchor_start, id_anchor_both, id_exception,
54
+ id_submatches, id_startpos, id_endpos;
55
+
56
+ inline VALUE encoded_str_new(const char *str, long length, RE2::Options::Encoding encoding) {
57
+ if (encoding == RE2::Options::EncodingUTF8) {
58
+ return rb_utf8_str_new(str, length);
59
+ }
60
+
61
+ VALUE string = rb_str_new(str, length);
62
+ rb_enc_associate_index(string, rb_enc_find_index("ISO-8859-1"));
63
+
64
+ return string;
65
+ }
66
+
67
+ static void parse_re2_options(RE2::Options* re2_options, const VALUE options) {
68
+ if (TYPE(options) != T_HASH) {
69
+ rb_raise(rb_eArgError, "options should be a hash");
70
+ }
71
+
72
+ VALUE utf8 = rb_hash_aref(options, ID2SYM(id_utf8));
73
+ if (!NIL_P(utf8)) {
74
+ re2_options->set_encoding(RTEST(utf8) ? RE2::Options::EncodingUTF8 : RE2::Options::EncodingLatin1);
75
+ }
76
+
77
+ VALUE posix_syntax = rb_hash_aref(options, ID2SYM(id_posix_syntax));
78
+ if (!NIL_P(posix_syntax)) {
79
+ re2_options->set_posix_syntax(RTEST(posix_syntax));
80
+ }
81
+
82
+ VALUE longest_match = rb_hash_aref(options, ID2SYM(id_longest_match));
83
+ if (!NIL_P(longest_match)) {
84
+ re2_options->set_longest_match(RTEST(longest_match));
85
+ }
86
+
87
+ VALUE log_errors = rb_hash_aref(options, ID2SYM(id_log_errors));
88
+ if (!NIL_P(log_errors)) {
89
+ re2_options->set_log_errors(RTEST(log_errors));
90
+ }
91
+
92
+ VALUE max_mem = rb_hash_aref(options, ID2SYM(id_max_mem));
93
+ if (!NIL_P(max_mem)) {
94
+ re2_options->set_max_mem(NUM2INT(max_mem));
95
+ }
96
+
97
+ VALUE literal = rb_hash_aref(options, ID2SYM(id_literal));
98
+ if (!NIL_P(literal)) {
99
+ re2_options->set_literal(RTEST(literal));
100
+ }
101
+
102
+ VALUE never_nl = rb_hash_aref(options, ID2SYM(id_never_nl));
103
+ if (!NIL_P(never_nl)) {
104
+ re2_options->set_never_nl(RTEST(never_nl));
105
+ }
106
+
107
+ VALUE case_sensitive = rb_hash_aref(options, ID2SYM(id_case_sensitive));
108
+ if (!NIL_P(case_sensitive)) {
109
+ re2_options->set_case_sensitive(RTEST(case_sensitive));
110
+ }
111
+
112
+ VALUE perl_classes = rb_hash_aref(options, ID2SYM(id_perl_classes));
113
+ if (!NIL_P(perl_classes)) {
114
+ re2_options->set_perl_classes(RTEST(perl_classes));
115
+ }
116
+
117
+ VALUE word_boundary = rb_hash_aref(options, ID2SYM(id_word_boundary));
118
+ if (!NIL_P(word_boundary)) {
119
+ re2_options->set_word_boundary(RTEST(word_boundary));
120
+ }
121
+
122
+ VALUE one_line = rb_hash_aref(options, ID2SYM(id_one_line));
123
+ if (!NIL_P(one_line)) {
124
+ re2_options->set_one_line(RTEST(one_line));
125
+ }
126
+ }
127
+
128
/*
 * For compatibility with Ruby < 2.7: when rb_gc_mark_movable is unavailable,
 * fall back to rb_gc_mark and make the compaction callback slot expand to
 * nothing. Otherwise the callback is emitted followed by a comma so it can
 * sit inside an rb_data_type_t initialiser.
 */
#ifndef HAVE_RB_GC_MARK_MOVABLE
#define rb_gc_mark_movable(x) rb_gc_mark(x)
#define re2_compact_callback(x)
#else
#define re2_compact_callback(x) (x),
#endif
135
+
136
+ static void re2_matchdata_mark(void *ptr) {
137
+ re2_matchdata *m = reinterpret_cast<re2_matchdata *>(ptr);
138
+ rb_gc_mark_movable(m->regexp);
139
+ rb_gc_mark_movable(m->text);
140
+ }
141
+
142
#ifdef HAVE_RB_GC_MARK_MOVABLE
/* GC compaction callback: refresh both VALUE references to their new locations. */
static void re2_matchdata_compact(void *ptr) {
  re2_matchdata *data = static_cast<re2_matchdata *>(ptr);

  data->regexp = rb_gc_location(data->regexp);
  data->text = rb_gc_location(data->text);
}
#endif
149
+
150
+ static void re2_matchdata_free(void *ptr) {
151
+ re2_matchdata *m = reinterpret_cast<re2_matchdata *>(ptr);
152
+ if (m->matches) {
153
+ delete[] m->matches;
154
+ }
155
+ xfree(m);
156
+ }
157
+
158
+ static size_t re2_matchdata_memsize(const void *ptr) {
159
+ const re2_matchdata *m = reinterpret_cast<const re2_matchdata *>(ptr);
160
+ size_t size = sizeof(*m);
161
+ if (m->matches) {
162
+ size += sizeof(*m->matches) * m->number_of_matches;
163
+ }
164
+
165
+ return size;
166
+ }
167
+
168
+ static const rb_data_type_t re2_matchdata_data_type = {
169
+ "RE2::MatchData",
170
+ {
171
+ re2_matchdata_mark,
172
+ re2_matchdata_free,
173
+ re2_matchdata_memsize,
174
+ re2_compact_callback(re2_matchdata_compact)
175
+ },
176
+ 0,
177
+ 0,
178
+ // IMPORTANT: WB_PROTECTED objects must only use the RB_OBJ_WRITE()
179
+ // macro to update VALUE references, as to trigger write barriers.
180
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
181
+ };
182
+
183
+ static void re2_scanner_mark(void *ptr) {
184
+ re2_scanner *s = reinterpret_cast<re2_scanner *>(ptr);
185
+ rb_gc_mark_movable(s->regexp);
186
+ rb_gc_mark_movable(s->text);
187
+ }
188
+
189
#ifdef HAVE_RB_GC_MARK_MOVABLE
/* GC compaction callback: refresh both VALUE references to their new locations. */
static void re2_scanner_compact(void *ptr) {
  re2_scanner *scanner = static_cast<re2_scanner *>(ptr);

  scanner->regexp = rb_gc_location(scanner->regexp);
  scanner->text = rb_gc_location(scanner->text);
}
#endif
196
+
197
+ static void re2_scanner_free(void *ptr) {
198
+ re2_scanner *s = reinterpret_cast<re2_scanner *>(ptr);
199
+ if (s->input) {
200
+ delete s->input;
201
+ }
202
+ xfree(s);
203
+ }
204
+
205
+ static size_t re2_scanner_memsize(const void *ptr) {
206
+ const re2_scanner *s = reinterpret_cast<const re2_scanner *>(ptr);
207
+ size_t size = sizeof(*s);
208
+ if (s->input) {
209
+ size += sizeof(*s->input);
210
+ }
211
+
212
+ return size;
213
+ }
214
+
215
+ static const rb_data_type_t re2_scanner_data_type = {
216
+ "RE2::Scanner",
217
+ {
218
+ re2_scanner_mark,
219
+ re2_scanner_free,
220
+ re2_scanner_memsize,
221
+ re2_compact_callback(re2_scanner_compact)
222
+ },
223
+ 0,
224
+ 0,
225
+ // IMPORTANT: WB_PROTECTED objects must only use the RB_OBJ_WRITE()
226
+ // macro to update VALUE references, as to trigger write barriers.
227
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
228
+ };
229
+
230
+ static void re2_regexp_free(void *ptr) {
231
+ re2_pattern *p = reinterpret_cast<re2_pattern *>(ptr);
232
+ if (p->pattern) {
233
+ delete p->pattern;
234
+ }
235
+ xfree(p);
236
+ }
237
+
238
+ static size_t re2_regexp_memsize(const void *ptr) {
239
+ const re2_pattern *p = reinterpret_cast<const re2_pattern *>(ptr);
240
+ size_t size = sizeof(*p);
241
+ if (p->pattern) {
242
+ size += sizeof(*p->pattern);
243
+ }
244
+
245
+ return size;
246
+ }
247
+
248
+ static const rb_data_type_t re2_regexp_data_type = {
249
+ "RE2::Regexp",
250
+ {
251
+ 0,
252
+ re2_regexp_free,
253
+ re2_regexp_memsize,
254
+ },
255
+ 0,
256
+ 0,
257
+ // IMPORTANT: WB_PROTECTED objects must only use the RB_OBJ_WRITE()
258
+ // macro to update VALUE references, as to trigger write barriers.
259
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
260
+ };
261
+
262
+ static VALUE re2_matchdata_allocate(VALUE klass) {
263
+ re2_matchdata *m;
264
+
265
+ return TypedData_Make_Struct(klass, re2_matchdata, &re2_matchdata_data_type,
266
+ m);
267
+ }
268
+
269
+ static VALUE re2_scanner_allocate(VALUE klass) {
270
+ re2_scanner *c;
271
+
272
+ return TypedData_Make_Struct(klass, re2_scanner, &re2_scanner_data_type, c);
273
+ }
274
+
275
+ /*
276
+ * Returns a frozen copy of the text supplied when matching.
277
+ *
278
+ * If the text was already a frozen string, returns the original.
279
+ *
280
+ * @return [String] a frozen string with the text supplied when matching
281
+ * @example
282
+ * m = RE2::Regexp.new('(\d+)').partial_match("bob 123")
283
+ * m.string #=> "bob 123"
284
+ */
285
+ static VALUE re2_matchdata_string(const VALUE self) {
286
+ re2_matchdata *m;
287
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
288
+
289
+ return m->text;
290
+ }
291
+
292
+ /*
293
+ * Returns the text supplied when incrementally matching with
294
+ * {RE2::Regexp#scan}.
295
+ *
296
+ * @return [String] the original string passed to {RE2::Regexp#scan}
297
+ * @example
298
+ * c = RE2::Regexp.new('(\d+)').scan("foo")
299
+ * c.string #=> "foo"
300
+ */
301
+ static VALUE re2_scanner_string(const VALUE self) {
302
+ re2_scanner *c;
303
+ TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
304
+
305
+ return c->text;
306
+ }
307
+
308
+ /*
309
+ * Returns whether the {RE2::Scanner} has consumed all input or not.
310
+ *
311
+ * @return [Boolean] whether the {RE2::Scanner} has consumed all input or not
312
+ * @example
313
+ * c = RE2::Regexp.new('(\d+)').scan("foo")
314
+ * c.eof? #=> true
315
+ */
316
+ static VALUE re2_scanner_eof(const VALUE self) {
317
+ re2_scanner *c;
318
+ TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
319
+
320
+ return BOOL2RUBY(c->eof);
321
+ }
322
+
323
+ /*
324
+ * Rewind the {RE2::Scanner} to the start of the string.
325
+ *
326
+ * @example
327
+ * s = RE2::Regexp.new('(\d+)').scan("1 2 3")
328
+ * e = s.to_enum
329
+ * e.next #=> ["1"]
330
+ * e.next #=> ["2"]
331
+ * s.rewind
332
+ * e.next #=> ["1"]
333
+ */
334
+ static VALUE re2_scanner_rewind(VALUE self) {
335
+ re2_scanner *c;
336
+ TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
337
+
338
+ delete c->input;
339
+ c->input = new(std::nothrow) re2::StringPiece(
340
+ RSTRING_PTR(c->text), RSTRING_LEN(c->text));
341
+ c->eof = false;
342
+
343
+ return self;
344
+ }
345
+
346
+ /*
347
+ * Scan the given text incrementally for matches using
348
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L447-L463
349
+ * `FindAndConsume`}, returning an array of submatches on each subsequent
350
+ * call. Returns `nil` if no matches are found or an empty array for every
351
+ * match if the pattern has no capturing groups.
352
+ *
353
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
354
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
355
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
356
+ *
357
+ * @return [Array<String>] if the pattern has capturing groups
358
+ * @return [[]] if the pattern does not have capturing groups
359
+ * @return [nil] if no matches are found
360
+ * @example
361
+ * s = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
362
+ * s.scan #=> ["Foo"]
363
+ * s.scan #=> ["bar"]
364
+ */
365
+ static VALUE re2_scanner_scan(VALUE self) {
366
+ re2_pattern *p;
367
+ re2_scanner *c;
368
+
369
+ TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
370
+ TypedData_Get_Struct(c->regexp, re2_pattern, &re2_regexp_data_type, p);
371
+
372
+ std::vector<RE2::Arg> argv(c->number_of_capturing_groups);
373
+ std::vector<RE2::Arg*> args(c->number_of_capturing_groups);
374
+ std::vector<re2::StringPiece> matches(c->number_of_capturing_groups);
375
+
376
+ if (c->eof) {
377
+ return Qnil;
378
+ }
379
+
380
+ re2::StringPiece::size_type original_input_size = c->input->size();
381
+
382
+ for (int i = 0; i < c->number_of_capturing_groups; ++i) {
383
+ argv[i] = &matches[i];
384
+ args[i] = &argv[i];
385
+ }
386
+
387
+ if (RE2::FindAndConsumeN(c->input, *p->pattern, args.data(),
388
+ c->number_of_capturing_groups)) {
389
+ re2::StringPiece::size_type new_input_size = c->input->size();
390
+ bool input_advanced = new_input_size < original_input_size;
391
+
392
+ VALUE result = rb_ary_new2(c->number_of_capturing_groups);
393
+
394
+ for (int i = 0; i < c->number_of_capturing_groups; ++i) {
395
+ if (matches[i].empty()) {
396
+ rb_ary_push(result, Qnil);
397
+ } else {
398
+ rb_ary_push(result, encoded_str_new(matches[i].data(),
399
+ matches[i].size(),
400
+ p->pattern->options().encoding()));
401
+ }
402
+ }
403
+
404
+ /* Check whether we've exhausted the input yet. */
405
+ c->eof = new_input_size == 0;
406
+
407
+ /* If the match didn't advance the input, we need to do this ourselves. */
408
+ if (!input_advanced && new_input_size > 0) {
409
+ c->input->remove_prefix(1);
410
+ }
411
+
412
+ return result;
413
+ } else {
414
+ return Qnil;
415
+ }
416
+ }
417
+
418
+ static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) {
419
+ re2_matchdata *m;
420
+ re2_pattern *p;
421
+
422
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
423
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
424
+
425
+ int id;
426
+
427
+ if (FIXNUM_P(idx)) {
428
+ id = FIX2INT(idx);
429
+ } else if (SYMBOL_P(idx)) {
430
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
431
+ std::map<std::string, int>::const_iterator search = groups.find(rb_id2name(SYM2ID(idx)));
432
+
433
+ if (search != groups.end()) {
434
+ id = search->second;
435
+ } else {
436
+ return NULL;
437
+ }
438
+ } else {
439
+ StringValue(idx);
440
+
441
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
442
+ std::map<std::string, int>::const_iterator search = groups.find(std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)));
443
+
444
+ if (search != groups.end()) {
445
+ id = search->second;
446
+ } else {
447
+ return NULL;
448
+ }
449
+ }
450
+
451
+ if (id >= 0 && id < m->number_of_matches) {
452
+ re2::StringPiece *match = &m->matches[id];
453
+
454
+ if (!match->empty()) {
455
+ return match;
456
+ }
457
+ }
458
+
459
+ return NULL;
460
+ }
461
+
462
+ /*
463
+ * Returns the number of elements in the {RE2::MatchData} (including the
464
+ * overall match, submatches and any `nils`).
465
+ *
466
+ * @return [Integer] the number of elements
467
+ * @example
468
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
469
+ * m.size #=> 2
470
+ * m.length #=> 2
471
+ */
472
+ static VALUE re2_matchdata_size(const VALUE self) {
473
+ re2_matchdata *m;
474
+
475
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
476
+
477
+ return INT2FIX(m->number_of_matches);
478
+ }
479
+
480
+ /*
481
+ * Returns the offset of the start of the nth element of the {RE2::MatchData}.
482
+ *
483
+ * @param [Integer, String, Symbol] n the name or number of the submatch
484
+ * @return [Integer, nil] the offset of the start of the match or `nil` if
485
+ * there is no such submatch
486
+ * @example
487
+ * m = RE2::Regexp.new('ob (\d+)').match("bob 123")
488
+ * m.begin(0) #=> 1
489
+ * m.begin(1) #=> 4
490
+ */
491
+ static VALUE re2_matchdata_begin(const VALUE self, VALUE n) {
492
+ re2_matchdata *m;
493
+
494
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
495
+
496
+ re2::StringPiece *match = re2_matchdata_find_match(n, self);
497
+ if (match == NULL) {
498
+ return Qnil;
499
+ } else {
500
+ long offset = match->data() - RSTRING_PTR(m->text);
501
+
502
+ return LONG2NUM(rb_str_sublen(m->text, offset));
503
+ }
504
+ }
505
+
506
+ /*
507
+ * Returns the offset of the character following the end of the nth element of
508
+ * the {RE2::MatchData}.
509
+ *
510
+ * @param [Integer, String, Symbol] n the name or number of the match
511
+ * @return [Integer, nil] the offset of the character following the end of the
512
+ * match or `nil` if there is no such match
513
+ * @example
514
+ * m = RE2::Regexp.new('ob (\d+) b').match("bob 123 bob")
515
+ * m.end(0) #=> 9
516
+ * m.end(1) #=> 7
517
+ */
518
+ static VALUE re2_matchdata_end(const VALUE self, VALUE n) {
519
+ re2_matchdata *m;
520
+
521
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
522
+
523
+ re2::StringPiece *match = re2_matchdata_find_match(n, self);
524
+ if (match == NULL) {
525
+ return Qnil;
526
+ } else {
527
+ long offset = (match->data() - RSTRING_PTR(m->text)) + match->size();
528
+
529
+ return LONG2NUM(rb_str_sublen(m->text, offset));
530
+ }
531
+ }
532
+
533
+ /*
534
+ * Returns the {RE2::Regexp} used in the match.
535
+ *
536
+ * @return [RE2::Regexp] the regular expression used in the match
537
+ * @example
538
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
539
+ * m.regexp #=> #<RE2::Regexp /(\d+)/>
540
+ */
541
+ static VALUE re2_matchdata_regexp(const VALUE self) {
542
+ re2_matchdata *m;
543
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
544
+
545
+ return m->regexp;
546
+ }
547
+
548
+ /*
549
+ * Returns the {RE2::Regexp} used in the {RE2::Scanner}.
550
+ *
551
+ * @return [RE2::Regexp] the regular expression used in the {RE2::Scanner}
552
+ * @example
553
+ * c = RE2::Regexp.new('(\d+)').scan("bob 123")
554
+ * c.regexp #=> #<RE2::Regexp /(\d+)/>
555
+ */
556
+ static VALUE re2_scanner_regexp(const VALUE self) {
557
+ re2_scanner *c;
558
+ TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
559
+
560
+ return c->regexp;
561
+ }
562
+
563
+ static VALUE re2_regexp_allocate(VALUE klass) {
564
+ re2_pattern *p;
565
+
566
+ return TypedData_Make_Struct(klass, re2_pattern, &re2_regexp_data_type, p);
567
+ }
568
+
569
+ /*
570
+ * Returns the array of matches including the overall match, submatches and any
571
+ * `nil`s.
572
+ *
573
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
574
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
575
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
576
+ *
577
+ * @return [Array<String, nil>] the array of matches
578
+ * @example
579
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
580
+ * m.to_a #=> ["123", "123"]
581
+ */
582
+ static VALUE re2_matchdata_to_a(const VALUE self) {
583
+ re2_matchdata *m;
584
+ re2_pattern *p;
585
+
586
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
587
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
588
+
589
+ VALUE array = rb_ary_new2(m->number_of_matches);
590
+ for (int i = 0; i < m->number_of_matches; ++i) {
591
+ re2::StringPiece *match = &m->matches[i];
592
+
593
+ if (match->empty()) {
594
+ rb_ary_push(array, Qnil);
595
+ } else {
596
+ rb_ary_push(array, encoded_str_new(match->data(), match->size(),
597
+ p->pattern->options().encoding()));
598
+ }
599
+ }
600
+
601
+ return array;
602
+ }
603
+
604
+ static VALUE re2_matchdata_nth_match(int nth, const VALUE self) {
605
+ re2_matchdata *m;
606
+ re2_pattern *p;
607
+
608
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
609
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
610
+
611
+ if (nth < 0 || nth >= m->number_of_matches) {
612
+ return Qnil;
613
+ } else {
614
+ re2::StringPiece *match = &m->matches[nth];
615
+
616
+ if (match->empty()) {
617
+ return Qnil;
618
+ } else {
619
+ return encoded_str_new(match->data(), match->size(),
620
+ p->pattern->options().encoding());
621
+ }
622
+ }
623
+ }
624
+
625
+ static VALUE re2_matchdata_named_match(const std::string &name, const VALUE self) {
626
+ re2_matchdata *m;
627
+ re2_pattern *p;
628
+
629
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
630
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
631
+
632
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
633
+ std::map<std::string, int>::const_iterator search = groups.find(name);
634
+
635
+ if (search != groups.end()) {
636
+ return re2_matchdata_nth_match(search->second, self);
637
+ } else {
638
+ return Qnil;
639
+ }
640
+ }
641
+
642
+ /*
643
+ * Retrieve zero, one or more matches by index or name.
644
+ *
645
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
646
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
647
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
648
+ *
649
+ * @overload [](index)
650
+ * Access a particular match by index.
651
+ *
652
+ * @param [Integer] index the index of the match to fetch
653
+ * @return [String, nil] the specified match or `nil` if it isn't present
654
+ * @example
655
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
656
+ * m[0] #=> "123"
657
+ *
658
+ * @overload [](start, length)
659
+ * Access a range of matches by starting index and length.
660
+ *
661
+ * @param [Integer] start the index from which to start
662
+ * @param [Integer] length the number of elements to fetch
663
+ * @return [Array<String, nil>] the specified matches
664
+ * @example
665
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
666
+ * m[0, 1] #=> ["123"]
667
+ *
668
+ * @overload [](range)
669
+ * Access a range of matches by index.
670
+ *
671
+ * @param [Range] range the range of match indexes to fetch
672
+ * @return [Array<String, nil>] the specified matches
673
+ * @example
674
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
675
+ * m[0..1] #=> ["123", "123"]
676
+ *
677
+ * @overload [](name)
678
+ * Access a particular match by name.
679
+ *
680
+ * @param [String, Symbol] name the name of the match to fetch
681
+ * @return [String, nil] the specific match or `nil` if it isn't present
682
+ * @example
683
+ * m = RE2::Regexp.new('(?P<number>\d+)').match("bob 123")
684
+ * m["number"] #=> "123"
685
+ * m[:number] #=> "123"
686
+ */
687
+ static VALUE re2_matchdata_aref(int argc, VALUE *argv, const VALUE self) {
688
+ VALUE idx, rest;
689
+ rb_scan_args(argc, argv, "11", &idx, &rest);
690
+
691
+ if (TYPE(idx) == T_STRING) {
692
+ return re2_matchdata_named_match(
693
+ std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)), self);
694
+ } else if (SYMBOL_P(idx)) {
695
+ return re2_matchdata_named_match(rb_id2name(SYM2ID(idx)), self);
696
+ } else if (!NIL_P(rest) || !FIXNUM_P(idx) || FIX2INT(idx) < 0) {
697
+ return rb_ary_aref(argc, argv, re2_matchdata_to_a(self));
698
+ } else {
699
+ return re2_matchdata_nth_match(FIX2INT(idx), self);
700
+ }
701
+ }
702
+
703
+ /*
704
+ * Returns the entire matched string.
705
+ *
706
+ * @return [String] the entire matched string
707
+ */
708
+ static VALUE re2_matchdata_to_s(const VALUE self) {
709
+ return re2_matchdata_nth_match(0, self);
710
+ }
711
+
712
+ /*
713
+ * Returns a printable version of the match.
714
+ *
715
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
716
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
717
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
718
+ *
719
+ * @return [String] a printable version of the match
720
+ * @example
721
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
722
+ * m.inspect #=> "#<RE2::MatchData \"123\" 1:\"123\">"
723
+ */
724
+ static VALUE re2_matchdata_inspect(const VALUE self) {
725
+ re2_matchdata *m;
726
+ re2_pattern *p;
727
+
728
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
729
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
730
+
731
+ std::ostringstream output;
732
+ output << "#<RE2::MatchData";
733
+
734
+ for (int i = 0; i < m->number_of_matches; ++i) {
735
+ output << " ";
736
+
737
+ if (i > 0) {
738
+ output << i << ":";
739
+ }
740
+
741
+ VALUE match = re2_matchdata_nth_match(i, self);
742
+
743
+ if (match == Qnil) {
744
+ output << "nil";
745
+ } else {
746
+ output << "\"";
747
+ output.write(RSTRING_PTR(match), RSTRING_LEN(match));
748
+ output << "\"";
749
+ }
750
+ }
751
+
752
+ output << ">";
753
+
754
+ return encoded_str_new(output.str().data(), output.str().length(),
755
+ p->pattern->options().encoding());
756
+ }
757
+
758
+ /*
759
+ * Returns the array of submatches for pattern matching.
760
+ *
761
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
762
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
763
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is
764
+ * undefined).
765
+ *
766
+ * @return [Array<String, nil>] the array of submatches
767
+ * @example
768
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
769
+ * m.deconstruct #=> ["123"]
770
+ *
771
+ * @example pattern matching
772
+ * case RE2::Regexp.new('(\d+) (\d+)').match("bob 123 456")
773
+ * in x, y
774
+ * puts "Matched #{x} #{y}"
775
+ * else
776
+ * puts "Unrecognised match"
777
+ * end
778
+ */
779
+ static VALUE re2_matchdata_deconstruct(const VALUE self) {
780
+ re2_matchdata *m;
781
+ re2_pattern *p;
782
+
783
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
784
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
785
+
786
+ VALUE array = rb_ary_new2(m->number_of_matches - 1);
787
+ for (int i = 1; i < m->number_of_matches; ++i) {
788
+ re2::StringPiece *match = &m->matches[i];
789
+
790
+ if (match->empty()) {
791
+ rb_ary_push(array, Qnil);
792
+ } else {
793
+ rb_ary_push(array, encoded_str_new(match->data(), match->size(),
794
+ p->pattern->options().encoding()));
795
+ }
796
+ }
797
+
798
+ return array;
799
+ }
800
+
801
+ /*
802
+ * Returns a hash of capturing group names to submatches for pattern matching.
803
+ *
804
+ * As this is used by Ruby's pattern matching, it will return an empty hash if given
805
+ * more keys than there are capturing groups. Given keys will populate the hash in
806
+ * order but an invalid name will cause the hash to be immediately returned.
807
+ *
808
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
809
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
810
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
811
+ *
812
+ * @return [Hash] a hash of capturing group names to submatches
813
+ * @param [Array<Symbol>, nil] keys an array of `Symbol` capturing group names
814
+ * or `nil` to return all names
815
+ * @example
816
+ * m = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
817
+ * m.deconstruct_keys(nil) #=> {numbers: "123", letters: "abc"}
818
+ * m.deconstruct_keys([:numbers]) #=> {numbers: "123"}
819
+ * m.deconstruct_keys([:fruit]) #=> {}
820
+ * m.deconstruct_keys([:letters, :fruit]) #=> {letters: "abc"}
821
+ *
822
+ * @example pattern matching
823
+ * case RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
824
+ * in numbers:, letters:
825
+ * puts "Numbers: #{numbers}, letters: #{letters}"
826
+ * else
827
+ * puts "Unrecognised match"
828
+ * end
829
+ */
830
+ static VALUE re2_matchdata_deconstruct_keys(const VALUE self, const VALUE keys) {
831
+ re2_matchdata *m;
832
+ re2_pattern *p;
833
+
834
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
835
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
836
+
837
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
838
+ VALUE capturing_groups = rb_hash_new();
839
+
840
+ if (NIL_P(keys)) {
841
+ for (std::map<std::string, int>::const_iterator it = groups.begin(); it != groups.end(); ++it) {
842
+ rb_hash_aset(capturing_groups,
843
+ ID2SYM(rb_intern(it->first.data())),
844
+ re2_matchdata_nth_match(it->second, self));
845
+ }
846
+ } else {
847
+ Check_Type(keys, T_ARRAY);
848
+
849
+ if (p->pattern->NumberOfCapturingGroups() >= RARRAY_LEN(keys)) {
850
+ for (int i = 0; i < RARRAY_LEN(keys); ++i) {
851
+ VALUE key = rb_ary_entry(keys, i);
852
+ Check_Type(key, T_SYMBOL);
853
+ const char *name = rb_id2name(SYM2ID(key));
854
+ std::map<std::string, int>::const_iterator search = groups.find(name);
855
+
856
+ if (search != groups.end()) {
857
+ rb_hash_aset(capturing_groups, key, re2_matchdata_nth_match(search->second, self));
858
+ } else {
859
+ break;
860
+ }
861
+ }
862
+ }
863
+ }
864
+
865
+ return capturing_groups;
866
+ }
867
+
868
+ /*
869
+ * Shorthand to compile a new {RE2::Regexp}.
870
+ *
871
+ * @see RE2::Regexp#initialize
872
+ */
873
+ static VALUE re2_re2(int argc, VALUE *argv, VALUE) {
874
+ return rb_class_new_instance(argc, argv, re2_cRegexp);
875
+ }
876
+
877
+ /*
878
+ * Returns a new {RE2::Regexp} object with a compiled version of
879
+ * `pattern` stored inside.
880
+ *
881
+ * @overload initialize(pattern)
882
+ * Returns a new {RE2::Regexp} object with a compiled version of
883
+ * `pattern` stored inside with the default options.
884
+ *
885
+ * @param [String] pattern the pattern to compile
886
+ * @return [RE2::Regexp] a {RE2::Regexp} with the specified pattern
887
+ * @raise [TypeError] if the given pattern can't be coerced to a `String`
888
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled
889
+ * pattern
890
+ *
891
+ * @overload initialize(pattern, options)
892
+ * Returns a new {RE2::Regexp} object with a compiled version of
893
+ * `pattern` stored inside with the specified options.
894
+ *
895
+ * @param [String] pattern the pattern to compile
896
+ * @param [Hash] options the options with which to compile the pattern
897
+ * @option options [Boolean] :utf8 (true) text and pattern are UTF-8; otherwise Latin-1
898
+ * @option options [Boolean] :posix_syntax (false) restrict regexps to POSIX egrep syntax
899
+ * @option options [Boolean] :longest_match (false) search for longest match, not first match
900
+ * @option options [Boolean] :log_errors (true) log syntax and execution errors to ERROR
901
+ * @option options [Integer] :max_mem approx. max memory footprint of RE2
902
+ * @option options [Boolean] :literal (false) interpret string as literal, not regexp
903
+ * @option options [Boolean] :never_nl (false) never match `\n`, even if it is in regexp
904
+ * @option options [Boolean] :case_sensitive (true) match is case-sensitive (regexp can override with `(?i)` unless in `posix_syntax` mode)
905
+ * @option options [Boolean] :perl_classes (false) allow Perl's `\d` `\s` `\w` `\D` `\S` `\W` when in `posix_syntax` mode
906
+ * @option options [Boolean] :word_boundary (false) allow `\b` `\B` (word boundary and not) when in `posix_syntax` mode
907
+ * @option options [Boolean] :one_line (false) `^` and `$` only match beginning and end of text when in `posix_syntax` mode
908
+ * @return [RE2::Regexp] a {RE2::Regexp} with the specified pattern and options
909
+ * @raise [TypeError] if the given pattern can't be coerced to a `String`
910
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
911
+ */
912
+ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
913
+ VALUE pattern, options;
914
+ re2_pattern *p;
915
+
916
+ rb_scan_args(argc, argv, "11", &pattern, &options);
917
+
918
+ /* Ensure pattern is a string. */
919
+ StringValue(pattern);
920
+
921
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
922
+
923
+ if (RTEST(options)) {
924
+ RE2::Options re2_options;
925
+ parse_re2_options(&re2_options, options);
926
+
927
+ p->pattern = new(std::nothrow) RE2(
928
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), re2_options);
929
+ } else {
930
+ p->pattern = new(std::nothrow) RE2(
931
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)));
932
+ }
933
+
934
+ if (p->pattern == 0) {
935
+ rb_raise(rb_eNoMemError, "not enough memory to allocate RE2 object");
936
+ }
937
+
938
+ return self;
939
+ }
940
+
941
+ /*
942
+ * Returns a printable version of the regular expression.
943
+ *
944
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
945
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
946
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is
947
+ * undefined).
948
+ *
949
+ * @return [String] a printable version of the regular expression
950
+ * @example
951
+ * re2 = RE2::Regexp.new("woo?")
952
+ * re2.inspect #=> "#<RE2::Regexp /woo?/>"
953
+ */
954
+ static VALUE re2_regexp_inspect(const VALUE self) {
955
+ re2_pattern *p;
956
+
957
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
958
+
959
+ std::ostringstream output;
960
+
961
+ output << "#<RE2::Regexp /" << p->pattern->pattern() << "/>";
962
+
963
+ return encoded_str_new(output.str().data(), output.str().length(),
964
+ p->pattern->options().encoding());
965
+ }
966
+
967
+ /*
968
+ * Returns a string version of the regular expression.
969
+ *
970
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
971
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
972
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
973
+ *
974
+ * @return [String] a string version of the regular expression
975
+ * @example
976
+ * re2 = RE2::Regexp.new("woo?")
977
+ * re2.to_s #=> "woo?"
978
+ */
979
+ static VALUE re2_regexp_to_s(const VALUE self) {
980
+ re2_pattern *p;
981
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
982
+
983
+ return encoded_str_new(p->pattern->pattern().data(),
984
+ p->pattern->pattern().size(),
985
+ p->pattern->options().encoding());
986
+ }
987
+
988
+ /*
989
+ * Returns whether or not the regular expression was compiled successfully.
990
+ *
991
+ * @return [Boolean] whether or not compilation was successful
992
+ * @example
993
+ * re2 = RE2::Regexp.new("woo?")
994
+ * re2.ok? #=> true
995
+ */
996
+ static VALUE re2_regexp_ok(const VALUE self) {
997
+ re2_pattern *p;
998
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
999
+
1000
+ return BOOL2RUBY(p->pattern->ok());
1001
+ }
1002
+
1003
+ /*
1004
+ * Returns whether or not the regular expression was compiled with the `utf8`
1005
+ * option set to `true`.
1006
+ *
1007
+ * @return [Boolean] the `utf8` option
1008
+ * @example
1009
+ * re2 = RE2::Regexp.new("woo?", utf8: true)
1010
+ * re2.utf8? #=> true
1011
+ */
1012
+ static VALUE re2_regexp_utf8(const VALUE self) {
1013
+ re2_pattern *p;
1014
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1015
+
1016
+ return BOOL2RUBY(p->pattern->options().encoding() == RE2::Options::EncodingUTF8);
1017
+ }
1018
+
1019
+ /*
1020
+ * Returns whether or not the regular expression was compiled with the
1021
+ * `posix_syntax` option set to `true`.
1022
+ *
1023
+ * @return [Boolean] the `posix_syntax` option
1024
+ * @example
1025
+ * re2 = RE2::Regexp.new("woo?", posix_syntax: true)
1026
+ * re2.posix_syntax? #=> true
1027
+ */
1028
+ static VALUE re2_regexp_posix_syntax(const VALUE self) {
1029
+ re2_pattern *p;
1030
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1031
+
1032
+ return BOOL2RUBY(p->pattern->options().posix_syntax());
1033
+ }
1034
+
1035
+ /*
1036
+ * Returns whether or not the regular expression was compiled with the
1037
+ * `longest_match` option set to `true`.
1038
+ *
1039
+ * @return [Boolean] the `longest_match` option
1040
+ * @example
1041
+ * re2 = RE2::Regexp.new("woo?", longest_match: true)
1042
+ * re2.longest_match? #=> true
1043
+ */
1044
+ static VALUE re2_regexp_longest_match(const VALUE self) {
1045
+ re2_pattern *p;
1046
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1047
+
1048
+ return BOOL2RUBY(p->pattern->options().longest_match());
1049
+ }
1050
+
1051
+ /*
1052
+ * Returns whether or not the regular expression was compiled with the
1053
+ * `log_errors` option set to `true`.
1054
+ *
1055
+ * @return [Boolean] the `log_errors` option
1056
+ * @example
1057
+ * re2 = RE2::Regexp.new("woo?", log_errors: true)
1058
+ * re2.log_errors? #=> true
1059
+ */
1060
+ static VALUE re2_regexp_log_errors(const VALUE self) {
1061
+ re2_pattern *p;
1062
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1063
+
1064
+ return BOOL2RUBY(p->pattern->options().log_errors());
1065
+ }
1066
+
1067
+ /*
1068
+ * Returns the `max_mem` setting for the regular expression.
1069
+ *
1070
+ * @return [Integer] the `max_mem` option
1071
+ * @example
1072
+ * re2 = RE2::Regexp.new("woo?", max_mem: 1024)
1073
+ * re2.max_mem #=> 1024
1074
+ */
1075
+ static VALUE re2_regexp_max_mem(const VALUE self) {
1076
+ re2_pattern *p;
1077
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1078
+
1079
+ return INT2FIX(p->pattern->options().max_mem());
1080
+ }
1081
+
1082
+ /*
1083
+ * Returns whether or not the regular expression was compiled with the
1084
+ * `literal` option set to `true`.
1085
+ *
1086
+ * @return [Boolean] the `literal` option
1087
+ * @example
1088
+ * re2 = RE2::Regexp.new("woo?", literal: true)
1089
+ * re2.literal? #=> true
1090
+ */
1091
+ static VALUE re2_regexp_literal(const VALUE self) {
1092
+ re2_pattern *p;
1093
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1094
+
1095
+ return BOOL2RUBY(p->pattern->options().literal());
1096
+ }
1097
+
1098
+ /*
1099
+ * Returns whether or not the regular expression was compiled with the
1100
+ * `never_nl` option set to `true`.
1101
+ *
1102
+ * @return [Boolean] the `never_nl` option
1103
+ * @example
1104
+ * re2 = RE2::Regexp.new("woo?", never_nl: true)
1105
+ * re2.never_nl? #=> true
1106
+ */
1107
+ static VALUE re2_regexp_never_nl(const VALUE self) {
1108
+ re2_pattern *p;
1109
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1110
+
1111
+ return BOOL2RUBY(p->pattern->options().never_nl());
1112
+ }
1113
+
1114
+ /*
1115
+ * Returns whether or not the regular expression was compiled with the
1116
+ * `case_sensitive` option set to `true`.
1117
+ *
1118
+ * @return [Boolean] the `case_sensitive` option
1119
+ * @example
1120
+ * re2 = RE2::Regexp.new("woo?", case_sensitive: true)
1121
+ * re2.case_sensitive? #=> true
1122
+ */
1123
+ static VALUE re2_regexp_case_sensitive(const VALUE self) {
1124
+ re2_pattern *p;
1125
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1126
+
1127
+ return BOOL2RUBY(p->pattern->options().case_sensitive());
1128
+ }
1129
+
1130
+ /*
1131
+ * Returns whether or not the regular expression was compiled with the
1132
+ * `case_sensitive` option set to `false`.
1133
+ *
1134
+ * @return [Boolean] the inverse of the `case_sensitive` option
1135
+ * @example
1136
+ * re2 = RE2::Regexp.new("woo?", case_sensitive: true)
1137
+ * re2.case_insensitive? #=> false
1138
+ * re2.casefold? #=> false
1139
+ */
1140
+ static VALUE re2_regexp_case_insensitive(const VALUE self) {
1141
+ return BOOL2RUBY(re2_regexp_case_sensitive(self) != Qtrue);
1142
+ }
1143
+
1144
+ /*
1145
+ * Returns whether or not the regular expression was compiled with the
1146
+ * `perl_classes` option set to `true`.
1147
+ *
1148
+ * @return [Boolean] the `perl_classes` option
1149
+ * @example
1150
+ * re2 = RE2::Regexp.new("woo?", perl_classes: true)
1151
+ * re2.perl_classes? #=> true
1152
+ */
1153
+ static VALUE re2_regexp_perl_classes(const VALUE self) {
1154
+ re2_pattern *p;
1155
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1156
+
1157
+ return BOOL2RUBY(p->pattern->options().perl_classes());
1158
+ }
1159
+
1160
+ /*
1161
+ * Returns whether or not the regular expression was compiled with the
1162
+ * `word_boundary` option set to `true`.
1163
+ *
1164
+ * @return [Boolean] the `word_boundary` option
1165
+ * @example
1166
+ * re2 = RE2::Regexp.new("woo?", word_boundary: true)
1167
+ * re2.word_boundary? #=> true
1168
+ */
1169
+ static VALUE re2_regexp_word_boundary(const VALUE self) {
1170
+ re2_pattern *p;
1171
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1172
+
1173
+ return BOOL2RUBY(p->pattern->options().word_boundary());
1174
+ }
1175
+
1176
+ /*
1177
+ * Returns whether or not the regular expression was compiled with the
1178
+ * `one_line` option set to `true`.
1179
+ *
1180
+ * @return [Boolean] the `one_line` option
1181
+ * @example
1182
+ * re2 = RE2::Regexp.new("woo?", one_line: true)
1183
+ * re2.one_line? #=> true
1184
+ */
1185
+ static VALUE re2_regexp_one_line(const VALUE self) {
1186
+ re2_pattern *p;
1187
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1188
+
1189
+ return BOOL2RUBY(p->pattern->options().one_line());
1190
+ }
1191
+
1192
+ /*
1193
+ * If the {RE2::Regexp} could not be created properly, returns an error string
1194
+ * otherwise returns `nil`.
1195
+ *
1196
+ * @return [String, nil] the error string or `nil`
1197
+ */
1198
+ static VALUE re2_regexp_error(const VALUE self) {
1199
+ re2_pattern *p;
1200
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1201
+
1202
+ if (p->pattern->ok()) {
1203
+ return Qnil;
1204
+ } else {
1205
+ return rb_str_new(p->pattern->error().data(), p->pattern->error().size());
1206
+ }
1207
+ }
1208
+
1209
+ /*
1210
+ * If the {RE2::Regexp} could not be created properly, returns
1211
+ * the offending portion of the regexp otherwise returns `nil`.
1212
+ *
1213
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1214
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1215
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1216
+ *
1217
+ * @return [String, nil] the offending portion of the regexp or `nil`
1218
+ */
1219
+ static VALUE re2_regexp_error_arg(const VALUE self) {
1220
+ re2_pattern *p;
1221
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1222
+
1223
+ if (p->pattern->ok()) {
1224
+ return Qnil;
1225
+ } else {
1226
+ return encoded_str_new(p->pattern->error_arg().data(),
1227
+ p->pattern->error_arg().size(),
1228
+ p->pattern->options().encoding());
1229
+ }
1230
+ }
1231
+
1232
+ /*
1233
+ * Returns the program size, a very approximate measure
1234
+ * of a regexp's "cost". Larger numbers are more expensive
1235
+ * than smaller numbers.
1236
+ *
1237
+ * @return [Integer] the regexp "cost"
1238
+ */
1239
+ static VALUE re2_regexp_program_size(const VALUE self) {
1240
+ re2_pattern *p;
1241
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1242
+
1243
+ return INT2FIX(p->pattern->ProgramSize());
1244
+ }
1245
+
1246
+ /*
1247
+ * Returns a hash of the options currently set for the {RE2::Regexp}.
1248
+ *
1249
+ * @return [Hash] the options
1250
+ */
1251
+ static VALUE re2_regexp_options(const VALUE self) {
1252
+ re2_pattern *p;
1253
+
1254
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1255
+ VALUE options = rb_hash_new();
1256
+
1257
+ rb_hash_aset(options, ID2SYM(id_utf8),
1258
+ BOOL2RUBY(p->pattern->options().encoding() == RE2::Options::EncodingUTF8));
1259
+
1260
+ rb_hash_aset(options, ID2SYM(id_posix_syntax),
1261
+ BOOL2RUBY(p->pattern->options().posix_syntax()));
1262
+
1263
+ rb_hash_aset(options, ID2SYM(id_longest_match),
1264
+ BOOL2RUBY(p->pattern->options().longest_match()));
1265
+
1266
+ rb_hash_aset(options, ID2SYM(id_log_errors),
1267
+ BOOL2RUBY(p->pattern->options().log_errors()));
1268
+
1269
+ rb_hash_aset(options, ID2SYM(id_max_mem),
1270
+ INT2FIX(p->pattern->options().max_mem()));
1271
+
1272
+ rb_hash_aset(options, ID2SYM(id_literal),
1273
+ BOOL2RUBY(p->pattern->options().literal()));
1274
+
1275
+ rb_hash_aset(options, ID2SYM(id_never_nl),
1276
+ BOOL2RUBY(p->pattern->options().never_nl()));
1277
+
1278
+ rb_hash_aset(options, ID2SYM(id_case_sensitive),
1279
+ BOOL2RUBY(p->pattern->options().case_sensitive()));
1280
+
1281
+ rb_hash_aset(options, ID2SYM(id_perl_classes),
1282
+ BOOL2RUBY(p->pattern->options().perl_classes()));
1283
+
1284
+ rb_hash_aset(options, ID2SYM(id_word_boundary),
1285
+ BOOL2RUBY(p->pattern->options().word_boundary()));
1286
+
1287
+ rb_hash_aset(options, ID2SYM(id_one_line),
1288
+ BOOL2RUBY(p->pattern->options().one_line()));
1289
+
1290
+ /* This is a read-only hash after all... */
1291
+ rb_obj_freeze(options);
1292
+
1293
+ return options;
1294
+ }
1295
+
1296
+ /*
1297
+ * Returns the number of capturing subpatterns, or -1 if the regexp
1298
+ * wasn't valid on construction. The overall match (`$0`) does not
1299
+ * count: if the regexp is `"(a)(b)"`, returns 2.
1300
+ *
1301
+ * @return [Integer] the number of capturing subpatterns
1302
+ */
1303
+ static VALUE re2_regexp_number_of_capturing_groups(const VALUE self) {
1304
+ re2_pattern *p;
1305
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1306
+
1307
+ return INT2FIX(p->pattern->NumberOfCapturingGroups());
1308
+ }
1309
+
1310
+ /*
1311
+ * Returns a hash of names to capturing indices of groups.
1312
+ *
1313
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1314
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1315
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1316
+ *
1317
+ * @return [Hash] a hash of names to capturing indices
1318
+ */
1319
+ static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
1320
+ re2_pattern *p;
1321
+
1322
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1323
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
1324
+ VALUE capturing_groups = rb_hash_new();
1325
+
1326
+ for (std::map<std::string, int>::const_iterator it = groups.begin(); it != groups.end(); ++it) {
1327
+ rb_hash_aset(capturing_groups,
1328
+ encoded_str_new(it->first.data(), it->first.size(),
1329
+ p->pattern->options().encoding()),
1330
+ INT2FIX(it->second));
1331
+ }
1332
+
1333
+ return capturing_groups;
1334
+ }
1335
+
1336
+ /*
1337
+ * General matching: match the pattern against the given `text` using
1338
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L562-L588
1339
+ * `Match`} and return a {RE2::MatchData} instance with the specified number of
1340
+ * submatches (defaults to the total number of capturing groups) or a boolean
1341
+ * (if no submatches are required).
1342
+ *
1343
+ * The number of submatches has a significant impact on performance: requesting
1344
+ * one submatch is much faster than requesting more than one and requesting
1345
+ * zero submatches is faster still.
1346
+ *
1347
+ * @overload match(text)
1348
+ * Returns a {RE2::MatchData} containing the matching pattern and all
1349
+ * submatches resulting from looking for the regexp in `text` if the pattern
1350
+ * contains capturing groups.
1351
+ *
1352
+ * Returns either `true` or `false` indicating whether a successful match was
1353
+ * made if the pattern contains no capturing groups.
1354
+ *
1355
+ * @param [String] text the text to search
1356
+ * @return [RE2::MatchData, nil] if the pattern contains capturing groups
1357
+ * @return [Boolean] if the pattern does not contain capturing groups
1358
+ * @raise [NoMemoryError] if there was not enough memory to allocate the submatches
1359
+ * @raise [TypeError] if given text that cannot be coerced to a `String`
1360
+ * @example Matching with capturing groups
1361
+ * r = RE2::Regexp.new('w(o)(o)')
1362
+ * r.match('woo') #=> #<RE2::MatchData "woo" 1:"o" 2:"o">
1363
+ * @example Matching without capturing groups
1364
+ * r = RE2::Regexp.new('woo')
1365
+ * r.match('woo') #=> true
1366
+ *
1367
+ * @overload match(text, options)
1368
+ * See `match(text)` but with customisable offsets for starting and ending
1369
+ * matches, optional anchoring to the start or both ends of the text and a
1370
+ * specific number of submatches to extract (padded with `nil`s if
1371
+ * necessary).
1372
+ *
1373
+ * @param [String] text the text to search
1374
+ * @param [Hash] options the options with which to perform the match
1375
+ * @option options [Integer] :startpos (0) offset at which to start matching
1376
+ * @option options [Integer] :endpos offset at which to stop matching, defaults to the text length
1377
+ * @option options [Symbol] :anchor (:unanchored) one of :unanchored, :anchor_start, :anchor_both to anchor the match
1378
+ * @option options [Integer] :submatches how many submatches to extract (0 is
1379
+ * fastest), defaults to the number of capturing groups
1380
+ * @return [RE2::MatchData, nil] if extracting any submatches
1381
+ * @return [Boolean] if not extracting any submatches
1382
+ * @raise [ArgumentError] if given a negative number of submatches, invalid
1383
+ * anchor or invalid startpos, endpos pair
1384
+ * @raise [NoMemoryError] if there was not enough memory to allocate the matches
1385
+ * @raise [TypeError] if given non-String text, non-numeric number of
1386
+ * submatches, non-symbol anchor or non-hash options
1387
+ * @raise [RE2::Regexp::UnsupportedError] if given an endpos argument on a
1388
+ * version of RE2 that does not support it
1389
+ * @example Matching with capturing groups
1390
+ * r = RE2::Regexp.new('w(o)(o)')
1391
+ * r.match('woo', submatches: 1) #=> #<RE2::MatchData "woo" 1:"o">
1392
+ * r.match('woo', submatches: 3) #=> #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
1393
+ * r.match('woot', anchor: :anchor_both, submatches: 0)
1394
+ * #=> false
1395
+ * r.match('woot', anchor: :anchor_start, submatches: 0)
1396
+ * #=> true
1397
+ * @example Matching without capturing groups
1398
+ * r = RE2::Regexp.new('wo+')
1399
+ * r.match('woot', anchor: :anchor_both) #=> false
1400
+ * r.match('woot', anchor: :anchor_start) #=> true
1401
+ *
1402
+ * @overload match(text, submatches)
1403
+ * @deprecated Legacy syntax for matching against `text` with a specific
1404
+ * number of submatches to extract. Use `match(text, submatches: n)` instead.
1405
+ *
1406
+ * @param [String] text the text to search
1407
+ * @param [Integer] submatches the number of submatches to extract
1408
+ * @return [RE2::MatchData, nil] if extracting any submatches
1409
+ * @return [Boolean] if not extracting any submatches
1410
+ * @raise [NoMemoryError] if there was not enough memory to allocate the submatches
1411
+ * @raise [TypeError] if given non-numeric number of submatches
1412
+ * @example
1413
+ * r = RE2::Regexp.new('w(o)(o)')
1414
+ * r.match('woo', 0) #=> true
1415
+ * r.match('woo', 1) #=> #<RE2::MatchData "woo" 1:"o">
1416
+ * r.match('woo', 2) #=> #<RE2::MatchData "woo" 1:"o" 2:"o">
1417
+ */
1418
+ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1419
+ re2_pattern *p;
1420
+ re2_matchdata *m;
1421
+ VALUE text, options;
1422
+
1423
+ rb_scan_args(argc, argv, "11", &text, &options);
1424
+
1425
+ /* Ensure text is a string. */
1426
+ StringValue(text);
1427
+
1428
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1429
+
1430
+ int n;
1431
+ int startpos = 0;
1432
+ int endpos = RSTRING_LEN(text);
1433
+ RE2::Anchor anchor = RE2::UNANCHORED;
1434
+
1435
+ if (RTEST(options)) {
1436
+ if (FIXNUM_P(options)) {
1437
+ n = NUM2INT(options);
1438
+
1439
+ if (n < 0) {
1440
+ rb_raise(rb_eArgError, "number of matches should be >= 0");
1441
+ }
1442
+ } else {
1443
+ if (TYPE(options) != T_HASH) {
1444
+ options = rb_Hash(options);
1445
+ }
1446
+
1447
+ VALUE endpos_option = rb_hash_aref(options, ID2SYM(id_endpos));
1448
+ if (!NIL_P(endpos_option)) {
1449
+ #ifdef HAVE_ENDPOS_ARGUMENT
1450
+ Check_Type(endpos_option, T_FIXNUM);
1451
+
1452
+ endpos = NUM2INT(endpos_option);
1453
+
1454
+ if (endpos < 0) {
1455
+ rb_raise(rb_eArgError, "endpos should be >= 0");
1456
+ }
1457
+ #else
1458
+ rb_raise(re2_eRegexpUnsupportedError, "current version of RE2::Match() does not support endpos argument");
1459
+ #endif
1460
+ }
1461
+
1462
+ VALUE anchor_option = rb_hash_aref(options, ID2SYM(id_anchor));
1463
+ if (!NIL_P(anchor_option)) {
1464
+ Check_Type(anchor_option, T_SYMBOL);
1465
+
1466
+ ID id_anchor_option = SYM2ID(anchor_option);
1467
+ if (id_anchor_option == id_unanchored) {
1468
+ anchor = RE2::UNANCHORED;
1469
+ } else if (id_anchor_option == id_anchor_start) {
1470
+ anchor = RE2::ANCHOR_START;
1471
+ } else if (id_anchor_option == id_anchor_both) {
1472
+ anchor = RE2::ANCHOR_BOTH;
1473
+ } else {
1474
+ rb_raise(rb_eArgError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both");
1475
+ }
1476
+ }
1477
+
1478
+ VALUE submatches_option = rb_hash_aref(options, ID2SYM(id_submatches));
1479
+ if (!NIL_P(submatches_option)) {
1480
+ Check_Type(submatches_option, T_FIXNUM);
1481
+
1482
+ n = NUM2INT(submatches_option);
1483
+
1484
+ if (n < 0) {
1485
+ rb_raise(rb_eArgError, "number of matches should be >= 0");
1486
+ }
1487
+ } else {
1488
+ if (!p->pattern->ok()) {
1489
+ return Qnil;
1490
+ }
1491
+
1492
+ n = p->pattern->NumberOfCapturingGroups();
1493
+ }
1494
+
1495
+ VALUE startpos_option = rb_hash_aref(options, ID2SYM(id_startpos));
1496
+ if (!NIL_P(startpos_option)) {
1497
+ Check_Type(startpos_option, T_FIXNUM);
1498
+
1499
+ startpos = NUM2INT(startpos_option);
1500
+
1501
+ if (startpos < 0) {
1502
+ rb_raise(rb_eArgError, "startpos should be >= 0");
1503
+ }
1504
+ }
1505
+ }
1506
+ } else {
1507
+ if (!p->pattern->ok()) {
1508
+ return Qnil;
1509
+ }
1510
+
1511
+ n = p->pattern->NumberOfCapturingGroups();
1512
+ }
1513
+
1514
+ if (startpos > endpos) {
1515
+ rb_raise(rb_eArgError, "startpos should be <= endpos");
1516
+ }
1517
+
1518
+ if (n == 0) {
1519
+ #ifdef HAVE_ENDPOS_ARGUMENT
1520
+ bool matched = p->pattern->Match(
1521
+ re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
1522
+ startpos, endpos, anchor, 0, 0);
1523
+ #else
1524
+ bool matched = p->pattern->Match(
1525
+ re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
1526
+ startpos, anchor, 0, 0);
1527
+ #endif
1528
+ return BOOL2RUBY(matched);
1529
+ } else {
1530
+ /* Because match returns the whole match as well. */
1531
+ n += 1;
1532
+
1533
+ VALUE matchdata = rb_class_new_instance(0, 0, re2_cMatchData);
1534
+ TypedData_Get_Struct(matchdata, re2_matchdata, &re2_matchdata_data_type, m);
1535
+ m->matches = new(std::nothrow) re2::StringPiece[n];
1536
+ RB_OBJ_WRITE(matchdata, &m->regexp, self);
1537
+ if (!RTEST(rb_obj_frozen_p(text))) {
1538
+ text = rb_str_freeze(rb_str_dup(text));
1539
+ }
1540
+ RB_OBJ_WRITE(matchdata, &m->text, text);
1541
+
1542
+ if (m->matches == 0) {
1543
+ rb_raise(rb_eNoMemError,
1544
+ "not enough memory to allocate StringPieces for matches");
1545
+ }
1546
+
1547
+ m->number_of_matches = n;
1548
+
1549
+ #ifdef HAVE_ENDPOS_ARGUMENT
1550
+ bool matched = p->pattern->Match(
1551
+ re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)),
1552
+ startpos, endpos, anchor, m->matches, n);
1553
+ #else
1554
+ bool matched = p->pattern->Match(
1555
+ re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)),
1556
+ startpos, anchor, m->matches, n);
1557
+ #endif
1558
+ if (matched) {
1559
+ return matchdata;
1560
+ } else {
1561
+ return Qnil;
1562
+ }
1563
+ }
1564
+ }
1565
+
1566
+ /*
1567
+ * Returns true if the pattern matches any substring of the given text using
1568
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L413-L427
1569
+ * `PartialMatch`}.
1570
+ *
1571
+ * @return [Boolean] whether the match was successful
1572
+ * @raise [TypeError] if text cannot be coerced to a `String`
1573
+ */
1574
+ static VALUE re2_regexp_match_p(const VALUE self, VALUE text) {
1575
+ re2_pattern *p;
1576
+
1577
+ /* Ensure text is a string. */
1578
+ StringValue(text);
1579
+
1580
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1581
+
1582
+ return BOOL2RUBY(RE2::PartialMatch(
1583
+ re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
1584
+ }
1585
+
1586
+ /*
1587
+ * Returns true if the pattern matches the given text using
1588
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L376-L411
1589
+ * `FullMatch`}.
1590
+ *
1591
+ * @return [Boolean] whether the match was successful
1592
+ * @raise [TypeError] if text cannot be coerced to a `String`
1593
+ */
1594
+ static VALUE re2_regexp_full_match_p(const VALUE self, VALUE text) {
1595
+ re2_pattern *p;
1596
+
1597
+ /* Ensure text is a string. */
1598
+ StringValue(text);
1599
+
1600
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1601
+
1602
+ return BOOL2RUBY(RE2::FullMatch(
1603
+ re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
1604
+ }
1605
+
1606
+ /*
1607
+ * Returns a {RE2::Scanner} for scanning the given text incrementally with
1608
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L447-L463
1609
+ * `FindAndConsume`}.
1610
+ *
1611
+ * @param [String] text the text to scan incrementally
1612
+ * @return [RE2::Scanner] an `Enumerable` {RE2::Scanner} object
1613
+ * @raise [TypeError] if `text` cannot be coerced to a `String`
1614
+ * @example
1615
+ * c = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
1616
+ * #=> #<RE2::Scanner:0x0000000000000001>
1617
+ */
1618
+ static VALUE re2_regexp_scan(const VALUE self, VALUE text) {
1619
+ /* Ensure text is a string. */
1620
+ StringValue(text);
1621
+
1622
+ re2_pattern *p;
1623
+ re2_scanner *c;
1624
+
1625
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1626
+ VALUE scanner = rb_class_new_instance(0, 0, re2_cScanner);
1627
+ TypedData_Get_Struct(scanner, re2_scanner, &re2_scanner_data_type, c);
1628
+
1629
+ c->input = new(std::nothrow) re2::StringPiece(
1630
+ RSTRING_PTR(text), RSTRING_LEN(text));
1631
+ RB_OBJ_WRITE(scanner, &c->regexp, self);
1632
+ RB_OBJ_WRITE(scanner, &c->text, text);
1633
+
1634
+ if (p->pattern->ok()) {
1635
+ c->number_of_capturing_groups = p->pattern->NumberOfCapturingGroups();
1636
+ } else {
1637
+ c->number_of_capturing_groups = 0;
1638
+ }
1639
+
1640
+ c->eof = false;
1641
+
1642
+ return scanner;
1643
+ }
1644
+
1645
+ /*
1646
+ * Returns whether the underlying RE2 version supports passing an `endpos`
1647
+ * argument to
1648
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L562-L588
1649
+ * Match}. If not, {RE2::Regexp#match} will raise an error if attempting to
1650
+ * pass an `endpos`.
1651
+ *
1652
+ * @return [Boolean] whether the underlying
1653
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L562-L588
1654
+ * Match} has an endpos argument
1655
+ */
1656
+ static VALUE re2_regexp_match_has_endpos_argument_p(VALUE) {
1657
+ #ifdef HAVE_ENDPOS_ARGUMENT
1658
+ return Qtrue;
1659
+ #else
1660
+ return Qfalse;
1661
+ #endif
1662
+ }
1663
+
1664
+ /*
1665
+ * Returns a copy of `str` with the first occurrence of `pattern` replaced with
1666
+ * `rewrite` using
1667
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L465-L480
1668
+ * `Replace`}.
1669
+ *
1670
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1671
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1672
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1673
+ *
1674
+ * @param [String] str the string to modify
1675
+ * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
1676
+ * @param [String] rewrite the string to replace with
1677
+ * @return [String] the resulting string
1678
+ * @raise [TypeError] if the given rewrite or pattern (if not provided as a
1679
+ * {RE2::Regexp}) cannot be coerced to `String`s
1680
+ * @example
1681
+ * RE2.Replace("hello there", "hello", "howdy") #=> "howdy there"
1682
+ * re2 = RE2::Regexp.new("hel+o")
1683
+ * RE2.Replace("hello there", re2, "yo") #=> "yo there"
1684
+ */
1685
+ static VALUE re2_Replace(VALUE, VALUE str, VALUE pattern,
1686
+ VALUE rewrite) {
1687
+ /* Ensure rewrite is a string. */
1688
+ StringValue(rewrite);
1689
+
1690
+ re2_pattern *p;
1691
+
1692
+ /* Take a copy of str so it can be modified in-place by
1693
+ * RE2::Replace.
1694
+ */
1695
+ StringValue(str);
1696
+ std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str));
1697
+
1698
+ /* Do the replacement. */
1699
+ if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1700
+ TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
1701
+ RE2::Replace(&str_as_string, *p->pattern,
1702
+ re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
1703
+
1704
+ return encoded_str_new(str_as_string.data(), str_as_string.size(),
1705
+ p->pattern->options().encoding());
1706
+ } else {
1707
+ /* Ensure pattern is a string. */
1708
+ StringValue(pattern);
1709
+
1710
+ RE2::Replace(&str_as_string,
1711
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
1712
+ re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
1713
+
1714
+ return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
1715
+ }
1716
+ }
1717
+
1718
+ /*
1719
+ * Return a copy of `str` with `pattern` replaced by `rewrite` using
1720
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L482-L497
1721
+ * `GlobalReplace`}.
1722
+ *
1723
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1724
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1725
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1726
+ *
1727
+ * @param [String] str the string to modify
1728
+ * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
1729
+ * @param [String] rewrite the string to replace with
1730
+ * @raise [TypeError] if the given rewrite or pattern (if not provided as a
1731
+ * {RE2::Regexp}) cannot be coerced to `String`s
1732
+ * @return [String] the resulting string
1733
+ * @example
1734
+ * re2 = RE2::Regexp.new("oo?")
1735
+ * RE2.GlobalReplace("whoops-doops", re2, "e") #=> "wheps-deps"
1736
+ * RE2.GlobalReplace("hello there", "e", "i") #=> "hillo thiri"
1737
+ */
1738
+ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
1739
+ VALUE rewrite) {
1740
+ /* Ensure rewrite is a string. */
1741
+ StringValue(rewrite);
1742
+
1743
+ /* Take a copy of str so it can be modified in-place by
1744
+ * RE2::GlobalReplace.
1745
+ */
1746
+ re2_pattern *p;
1747
+ StringValue(str);
1748
+ std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str));
1749
+
1750
+ /* Do the replacement. */
1751
+ if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1752
+ TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
1753
+ RE2::GlobalReplace(&str_as_string, *p->pattern,
1754
+ re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
1755
+
1756
+ return encoded_str_new(str_as_string.data(), str_as_string.size(),
1757
+ p->pattern->options().encoding());
1758
+ } else {
1759
+ /* Ensure pattern is a string. */
1760
+ StringValue(pattern);
1761
+
1762
+ RE2::GlobalReplace(&str_as_string,
1763
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
1764
+ re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
1765
+
1766
+ return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
1767
+ }
1768
+ }
1769
+
1770
+ /*
1771
+ * Returns a version of `str` with all potentially meaningful regexp characters
1772
+ * escaped using
1773
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L512-L518
1774
+ * `QuoteMeta`}. The returned string, used as a regular expression, will
1775
+ * exactly match the original string.
1776
+ *
1777
+ * @param [String] unquoted the unquoted string
1778
+ * @raise [TypeError] if the given unquoted string cannot be coerced to a `String`
1779
+ * @return [String] the escaped string
1780
+ * @example
1781
+ * RE2::Regexp.escape("1.5-2.0?") #=> "1\.5\-2\.0\?"
1782
+ */
1783
+ static VALUE re2_QuoteMeta(VALUE, VALUE unquoted) {
1784
+ StringValue(unquoted);
1785
+
1786
+ std::string quoted_string = RE2::QuoteMeta(
1787
+ re2::StringPiece(RSTRING_PTR(unquoted), RSTRING_LEN(unquoted)));
1788
+
1789
+ return rb_str_new(quoted_string.data(), quoted_string.size());
1790
+ }
1791
+
1792
+ static void re2_set_free(void *ptr) {
1793
+ re2_set *s = reinterpret_cast<re2_set *>(ptr);
1794
+ if (s->set) {
1795
+ delete s->set;
1796
+ }
1797
+ xfree(s);
1798
+ }
1799
+
1800
+ static size_t re2_set_memsize(const void *ptr) {
1801
+ const re2_set *s = reinterpret_cast<const re2_set *>(ptr);
1802
+ size_t size = sizeof(*s);
1803
+ if (s->set) {
1804
+ size += sizeof(*s->set);
1805
+ }
1806
+
1807
+ return size;
1808
+ }
1809
+
1810
+ static const rb_data_type_t re2_set_data_type = {
1811
+ "RE2::Set",
1812
+ {
1813
+ 0,
1814
+ re2_set_free,
1815
+ re2_set_memsize,
1816
+ },
1817
+ 0,
1818
+ 0,
1819
+ // IMPORTANT: WB_PROTECTED objects must only use the RB_OBJ_WRITE()
1820
+ // macro to update VALUE references, as to trigger write barriers.
1821
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
1822
+ };
1823
+
1824
+ static VALUE re2_set_allocate(VALUE klass) {
1825
+ re2_set *s;
1826
+ VALUE result = TypedData_Make_Struct(klass, re2_set, &re2_set_data_type, s);
1827
+
1828
+ return result;
1829
+ }
1830
+
1831
+ /*
1832
+ * Returns a new {RE2::Set} object, a collection of patterns that can be
1833
+ * searched for simultaneously.
1834
+ *
1835
+ * @return [RE2::Set]
1836
+ *
1837
+ * @overload initialize
1838
+ * Returns a new {RE2::Set} object for unanchored patterns with the default
1839
+ * options.
1840
+ *
1841
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
1842
+ * @return [RE2::Set]
1843
+ *
1844
+ * @overload initialize(anchor)
1845
+ * Returns a new {RE2::Set} object for the specified anchor with the default
1846
+ * options.
1847
+ *
1848
+ * @param [Symbol] anchor one of `:unanchored`, `:anchor_start`, `:anchor_both`
1849
+ * @raise [ArgumentError] if anchor is not `:unanchored`, `:anchor_start` or `:anchor_both`
1850
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
1851
+ *
1852
+ * @overload initialize(anchor, options)
1853
+ * Returns a new {RE2::Set} object with the specified options.
1854
+ *
1855
+ * @param [Symbol] anchor one of `:unanchored`, `:anchor_start`, `:anchor_both`
1856
+ * @param [Hash] options the options with which to compile the pattern
1857
+ * @option options [Boolean] :utf8 (true) text and pattern are UTF-8; otherwise Latin-1
1858
+ * @option options [Boolean] :posix_syntax (false) restrict regexps to POSIX egrep syntax
1859
+ * @option options [Boolean] :longest_match (false) search for longest match, not first match
1860
+ * @option options [Boolean] :log_errors (true) log syntax and execution errors to ERROR
1861
+ * @option options [Integer] :max_mem approx. max memory footprint of RE2
1862
+ * @option options [Boolean] :literal (false) interpret string as literal, not regexp
1863
+ * @option options [Boolean] :never_nl (false) never match `\n`, even if it is in regexp
1864
+ * @option options [Boolean] :case_sensitive (true) match is case-sensitive (regexp can override with `(?i)` unless in `posix_syntax` mode)
1865
+ * @option options [Boolean] :perl_classes (false) allow Perl's `\d` `\s` `\w` `\D` `\S` `\W` when in `posix_syntax` mode
1866
+ * @option options [Boolean] :word_boundary (false) allow `\b` `\B` (word boundary and not) when in `posix_syntax` mode
1867
+ * @option options [Boolean] :one_line (false) `^` and `$` only match beginning and end of text when in `posix_syntax` mode
1868
+ * @return [RE2::Set] a {RE2::Set} with the specified anchor and options
1869
+ * @raise [ArgumentError] if `anchor` is not one of the accepted choices
1870
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
1871
+ */
1872
+ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1873
+ VALUE anchor, options;
1874
+ re2_set *s;
1875
+
1876
+ rb_scan_args(argc, argv, "02", &anchor, &options);
1877
+ TypedData_Get_Struct(self, re2_set, &re2_set_data_type, s);
1878
+
1879
+ RE2::Anchor re2_anchor = RE2::UNANCHORED;
1880
+
1881
+ if (!NIL_P(anchor)) {
1882
+ Check_Type(anchor, T_SYMBOL);
1883
+ ID id_anchor_arg = SYM2ID(anchor);
1884
+ if (id_anchor_arg == id_unanchored) {
1885
+ re2_anchor = RE2::UNANCHORED;
1886
+ } else if (id_anchor_arg == id_anchor_start) {
1887
+ re2_anchor = RE2::ANCHOR_START;
1888
+ } else if (id_anchor_arg == id_anchor_both) {
1889
+ re2_anchor = RE2::ANCHOR_BOTH;
1890
+ } else {
1891
+ rb_raise(rb_eArgError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both");
1892
+ }
1893
+ }
1894
+
1895
+ RE2::Options re2_options;
1896
+
1897
+ if (RTEST(options)) {
1898
+ parse_re2_options(&re2_options, options);
1899
+ }
1900
+
1901
+ s->set = new(std::nothrow) RE2::Set(re2_options, re2_anchor);
1902
+ if (s->set == 0) {
1903
+ rb_raise(rb_eNoMemError, "not enough memory to allocate RE2::Set object");
1904
+ }
1905
+
1906
+ return self;
1907
+ }
1908
+
1909
+ /*
1910
+ * Adds a pattern to the set. Returns the index that will identify the pattern
1911
+ * in the output of {RE2::Set#match}. Cannot be called after {RE2::Set#compile}
1912
+ * has been called.
1913
+ *
1914
+ * @param [String] pattern the regex pattern
1915
+ * @return [Integer] the index of the pattern in the set
1916
+ * @raise [ArgumentError] if called after compile or the pattern is rejected
1917
+ * @example
1918
+ * set = RE2::Set.new
1919
+ * set.add("abc") #=> 0
1920
+ * set.add("def") #=> 1
1921
+ */
1922
+ static VALUE re2_set_add(VALUE self, VALUE pattern) {
1923
+ StringValue(pattern);
1924
+
1925
+ re2_set *s;
1926
+ TypedData_Get_Struct(self, re2_set, &re2_set_data_type, s);
1927
+
1928
+ /* To prevent the memory of the err string leaking when we call rb_raise,
1929
+ * take a copy of it and let it go out of scope.
1930
+ */
1931
+ char msg[100];
1932
+ int index;
1933
+
1934
+ {
1935
+ std::string err;
1936
+ index = s->set->Add(
1937
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), &err);
1938
+ strlcpy(msg, err.c_str(), sizeof(msg));
1939
+ }
1940
+
1941
+ if (index < 0) {
1942
+ rb_raise(rb_eArgError, "str rejected by RE2::Set->Add(): %s", msg);
1943
+ }
1944
+
1945
+ return INT2FIX(index);
1946
+ }
1947
+
1948
+ /*
1949
+ * Compiles a {RE2::Set} so it can be used to match against. Must be called
1950
+ * after {RE2::Set#add} and before {RE2::Set#match}.
1951
+ *
1952
+ * @return [Boolean] whether compilation was a success
1953
+ * @example
1954
+ * set = RE2::Set.new
1955
+ * set.add("abc")
1956
+ * set.compile #=> true
1957
+ */
1958
+ static VALUE re2_set_compile(VALUE self) {
1959
+ re2_set *s;
1960
+ TypedData_Get_Struct(self, re2_set, &re2_set_data_type, s);
1961
+
1962
+ return BOOL2RUBY(s->set->Compile());
1963
+ }
1964
+
1965
+ /*
1966
+ * Returns whether the underlying RE2 version outputs error information from
1967
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/set.h#L62-L65
1968
+ * `RE2::Set::Match`}. If not, {RE2::Set#match} will raise an error if attempting to set
1969
+ * its `:exception` option to `true`.
1970
+ *
1971
+ * @return [Boolean] whether the underlying RE2 outputs error information from {RE2::Set} matches
1972
+ */
1973
+ static VALUE re2_set_match_raises_errors_p(VALUE) {
1974
+ #ifdef HAVE_ERROR_INFO_ARGUMENT
1975
+ return Qtrue;
1976
+ #else
1977
+ return Qfalse;
1978
+ #endif
1979
+ }
1980
+
1981
+ /*
1982
+ * Matches the given text against patterns in the set, returning an array of
1983
+ * integer indices of the matching patterns if matched or an empty array if
1984
+ * there are no matches.
1985
+ *
1986
+ * @return [Array<Integer>]
1987
+ *
1988
+ * @overload match(str)
1989
+ * Returns an array of integer indices of patterns matching the given string
1990
+ * (if any). Raises exceptions if there are any errors while matching.
1991
+ *
1992
+ * @param [String] str the text to match against
1993
+ * @return [Array<Integer>] the indices of matching regexps
1994
+ * @raise [MatchError] if an error occurs while matching
1995
+ * @raise [UnsupportedError] if the underlying version of RE2 does not output error information
1996
+ * @example
1997
+ * set = RE2::Set.new
1998
+ * set.add("abc")
1999
+ * set.add("def")
2000
+ * set.compile
2001
+ * set.match("abcdef") #=> [0, 1]
2002
+ *
2003
+ * @overload match(str, options)
2004
+ * Returns an array of integer indices of patterns matching the given string
2005
+ * (if any). Raises exceptions if there are any errors while matching and the
2006
+ * `:exception` option is set to true.
2007
+ *
2008
+ * @param [String] str the text to match against
2009
+ * @param [Hash] options the options with which to match
2010
+ * @option options [Boolean] :exception (true) whether to raise exceptions with RE2's error information (not supported on ABI version 0 of RE2)
2011
+ * @return [Array<Integer>] the indices of matching regexps
2012
+ * @raise [MatchError] if an error occurs while matching
2013
+ * @raise [UnsupportedError] if the underlying version of RE2 does not output error information
2014
+ * @example
2015
+ * set = RE2::Set.new
2016
+ * set.add("abc")
2017
+ * set.add("def")
2018
+ * set.compile
2019
+ * set.match("abcdef", exception: true) #=> [0, 1]
2020
+ */
2021
+ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
2022
+ VALUE str, options;
2023
+ bool raise_exception = true;
2024
+ rb_scan_args(argc, argv, "11", &str, &options);
2025
+
2026
+ StringValue(str);
2027
+ re2_set *s;
2028
+ TypedData_Get_Struct(self, re2_set, &re2_set_data_type, s);
2029
+
2030
+ if (RTEST(options)) {
2031
+ Check_Type(options, T_HASH);
2032
+
2033
+ VALUE exception_option = rb_hash_aref(options, ID2SYM(id_exception));
2034
+ if (!NIL_P(exception_option)) {
2035
+ raise_exception = RTEST(exception_option);
2036
+ }
2037
+ }
2038
+
2039
+ std::vector<int> v;
2040
+
2041
+ if (raise_exception) {
2042
+ #ifdef HAVE_ERROR_INFO_ARGUMENT
2043
+ RE2::Set::ErrorInfo e;
2044
+ bool match_failed = !s->set->Match(
2045
+ re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v, &e);
2046
+ VALUE result = rb_ary_new2(v.size());
2047
+
2048
+ if (match_failed) {
2049
+ switch (e.kind) {
2050
+ case RE2::Set::kNoError:
2051
+ break;
2052
+ case RE2::Set::kNotCompiled:
2053
+ rb_raise(re2_eSetMatchError, "#match must not be called before #compile");
2054
+ case RE2::Set::kOutOfMemory:
2055
+ rb_raise(re2_eSetMatchError, "The DFA ran out of memory");
2056
+ case RE2::Set::kInconsistent:
2057
+ rb_raise(re2_eSetMatchError, "RE2::Prog internal error");
2058
+ default: // Just in case a future version of libre2 adds new ErrorKinds
2059
+ rb_raise(re2_eSetMatchError, "Unknown RE2::Set::ErrorKind: %d", e.kind);
2060
+ }
2061
+ } else {
2062
+ for (std::vector<int>::size_type i = 0; i < v.size(); ++i) {
2063
+ rb_ary_push(result, INT2FIX(v[i]));
2064
+ }
2065
+ }
2066
+
2067
+ return result;
2068
+ #else
2069
+ rb_raise(re2_eSetUnsupportedError, "current version of RE2::Set::Match() does not output error information, :exception option can only be set to false");
2070
+ #endif
2071
+ } else {
2072
+ bool matched = s->set->Match(
2073
+ re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v);
2074
+ VALUE result = rb_ary_new2(v.size());
2075
+
2076
+ if (matched) {
2077
+ for (std::vector<int>::size_type i = 0; i < v.size(); ++i) {
2078
+ rb_ary_push(result, INT2FIX(v[i]));
2079
+ }
2080
+ }
2081
+
2082
+ return result;
2083
+ }
2084
+ }
2085
+
2086
+ extern "C" void Init_re2(void) {
2087
+ re2_mRE2 = rb_define_module("RE2");
2088
+ re2_cRegexp = rb_define_class_under(re2_mRE2, "Regexp", rb_cObject);
2089
+ re2_eRegexpUnsupportedError = rb_define_class_under(re2_cRegexp,
2090
+ "UnsupportedError", rb_const_get(rb_cObject, rb_intern("StandardError")));
2091
+ re2_cMatchData = rb_define_class_under(re2_mRE2, "MatchData", rb_cObject);
2092
+ re2_cScanner = rb_define_class_under(re2_mRE2, "Scanner", rb_cObject);
2093
+ re2_cSet = rb_define_class_under(re2_mRE2, "Set", rb_cObject);
2094
+ re2_eSetMatchError = rb_define_class_under(re2_cSet, "MatchError",
2095
+ rb_const_get(rb_cObject, rb_intern("StandardError")));
2096
+ re2_eSetUnsupportedError = rb_define_class_under(re2_cSet, "UnsupportedError",
2097
+ rb_const_get(rb_cObject, rb_intern("StandardError")));
2098
+
2099
+ rb_define_alloc_func(re2_cRegexp,
2100
+ reinterpret_cast<VALUE (*)(VALUE)>(re2_regexp_allocate));
2101
+ rb_define_alloc_func(re2_cMatchData,
2102
+ reinterpret_cast<VALUE (*)(VALUE)>(re2_matchdata_allocate));
2103
+ rb_define_alloc_func(re2_cScanner,
2104
+ reinterpret_cast<VALUE (*)(VALUE)>(re2_scanner_allocate));
2105
+ rb_define_alloc_func(re2_cSet,
2106
+ reinterpret_cast<VALUE (*)(VALUE)>(re2_set_allocate));
2107
+
2108
+ rb_define_method(re2_cMatchData, "string",
2109
+ RUBY_METHOD_FUNC(re2_matchdata_string), 0);
2110
+ rb_define_method(re2_cMatchData, "regexp",
2111
+ RUBY_METHOD_FUNC(re2_matchdata_regexp), 0);
2112
+ rb_define_method(re2_cMatchData, "to_a",
2113
+ RUBY_METHOD_FUNC(re2_matchdata_to_a), 0);
2114
+ rb_define_method(re2_cMatchData, "size",
2115
+ RUBY_METHOD_FUNC(re2_matchdata_size), 0);
2116
+ rb_define_method(re2_cMatchData, "length",
2117
+ RUBY_METHOD_FUNC(re2_matchdata_size), 0);
2118
+ rb_define_method(re2_cMatchData, "begin",
2119
+ RUBY_METHOD_FUNC(re2_matchdata_begin), 1);
2120
+ rb_define_method(re2_cMatchData, "end",
2121
+ RUBY_METHOD_FUNC(re2_matchdata_end), 1);
2122
+ rb_define_method(re2_cMatchData, "[]", RUBY_METHOD_FUNC(re2_matchdata_aref),
2123
+ -1);
2124
+ rb_define_method(re2_cMatchData, "to_s",
2125
+ RUBY_METHOD_FUNC(re2_matchdata_to_s), 0);
2126
+ rb_define_method(re2_cMatchData, "inspect",
2127
+ RUBY_METHOD_FUNC(re2_matchdata_inspect), 0);
2128
+ rb_define_method(re2_cMatchData, "deconstruct",
2129
+ RUBY_METHOD_FUNC(re2_matchdata_deconstruct), 0);
2130
+ rb_define_method(re2_cMatchData, "deconstruct_keys",
2131
+ RUBY_METHOD_FUNC(re2_matchdata_deconstruct_keys), 1);
2132
+
2133
+ rb_define_method(re2_cScanner, "string",
2134
+ RUBY_METHOD_FUNC(re2_scanner_string), 0);
2135
+ rb_define_method(re2_cScanner, "eof?",
2136
+ RUBY_METHOD_FUNC(re2_scanner_eof), 0);
2137
+ rb_define_method(re2_cScanner, "regexp",
2138
+ RUBY_METHOD_FUNC(re2_scanner_regexp), 0);
2139
+ rb_define_method(re2_cScanner, "scan",
2140
+ RUBY_METHOD_FUNC(re2_scanner_scan), 0);
2141
+ rb_define_method(re2_cScanner, "rewind",
2142
+ RUBY_METHOD_FUNC(re2_scanner_rewind), 0);
2143
+
2144
+ rb_define_singleton_method(re2_cRegexp, "match_has_endpos_argument?",
2145
+ RUBY_METHOD_FUNC(re2_regexp_match_has_endpos_argument_p), 0);
2146
+ rb_define_method(re2_cRegexp, "initialize",
2147
+ RUBY_METHOD_FUNC(re2_regexp_initialize), -1);
2148
+ rb_define_method(re2_cRegexp, "ok?", RUBY_METHOD_FUNC(re2_regexp_ok), 0);
2149
+ rb_define_method(re2_cRegexp, "error", RUBY_METHOD_FUNC(re2_regexp_error),
2150
+ 0);
2151
+ rb_define_method(re2_cRegexp, "error_arg",
2152
+ RUBY_METHOD_FUNC(re2_regexp_error_arg), 0);
2153
+ rb_define_method(re2_cRegexp, "program_size",
2154
+ RUBY_METHOD_FUNC(re2_regexp_program_size), 0);
2155
+ rb_define_method(re2_cRegexp, "options",
2156
+ RUBY_METHOD_FUNC(re2_regexp_options), 0);
2157
+ rb_define_method(re2_cRegexp, "number_of_capturing_groups",
2158
+ RUBY_METHOD_FUNC(re2_regexp_number_of_capturing_groups), 0);
2159
+ rb_define_method(re2_cRegexp, "named_capturing_groups",
2160
+ RUBY_METHOD_FUNC(re2_regexp_named_capturing_groups), 0);
2161
+ rb_define_method(re2_cRegexp, "match", RUBY_METHOD_FUNC(re2_regexp_match),
2162
+ -1);
2163
+ rb_define_method(re2_cRegexp, "match?", RUBY_METHOD_FUNC(re2_regexp_match_p),
2164
+ 1);
2165
+ rb_define_method(re2_cRegexp, "partial_match?",
2166
+ RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
2167
+ rb_define_method(re2_cRegexp, "=~", RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
2168
+ rb_define_method(re2_cRegexp, "===", RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
2169
+ rb_define_method(re2_cRegexp, "full_match?",
2170
+ RUBY_METHOD_FUNC(re2_regexp_full_match_p), 1);
2171
+ rb_define_method(re2_cRegexp, "scan",
2172
+ RUBY_METHOD_FUNC(re2_regexp_scan), 1);
2173
+ rb_define_method(re2_cRegexp, "to_s", RUBY_METHOD_FUNC(re2_regexp_to_s), 0);
2174
+ rb_define_method(re2_cRegexp, "to_str", RUBY_METHOD_FUNC(re2_regexp_to_s),
2175
+ 0);
2176
+ rb_define_method(re2_cRegexp, "pattern", RUBY_METHOD_FUNC(re2_regexp_to_s),
2177
+ 0);
2178
+ rb_define_method(re2_cRegexp, "source", RUBY_METHOD_FUNC(re2_regexp_to_s),
2179
+ 0);
2180
+ rb_define_method(re2_cRegexp, "inspect",
2181
+ RUBY_METHOD_FUNC(re2_regexp_inspect), 0);
2182
+ rb_define_method(re2_cRegexp, "utf8?", RUBY_METHOD_FUNC(re2_regexp_utf8),
2183
+ 0);
2184
+ rb_define_method(re2_cRegexp, "posix_syntax?",
2185
+ RUBY_METHOD_FUNC(re2_regexp_posix_syntax), 0);
2186
+ rb_define_method(re2_cRegexp, "longest_match?",
2187
+ RUBY_METHOD_FUNC(re2_regexp_longest_match), 0);
2188
+ rb_define_method(re2_cRegexp, "log_errors?",
2189
+ RUBY_METHOD_FUNC(re2_regexp_log_errors), 0);
2190
+ rb_define_method(re2_cRegexp, "max_mem",
2191
+ RUBY_METHOD_FUNC(re2_regexp_max_mem), 0);
2192
+ rb_define_method(re2_cRegexp, "literal?",
2193
+ RUBY_METHOD_FUNC(re2_regexp_literal), 0);
2194
+ rb_define_method(re2_cRegexp, "never_nl?",
2195
+ RUBY_METHOD_FUNC(re2_regexp_never_nl), 0);
2196
+ rb_define_method(re2_cRegexp, "case_sensitive?",
2197
+ RUBY_METHOD_FUNC(re2_regexp_case_sensitive), 0);
2198
+ rb_define_method(re2_cRegexp, "case_insensitive?",
2199
+ RUBY_METHOD_FUNC(re2_regexp_case_insensitive), 0);
2200
+ rb_define_method(re2_cRegexp, "casefold?",
2201
+ RUBY_METHOD_FUNC(re2_regexp_case_insensitive), 0);
2202
+ rb_define_method(re2_cRegexp, "perl_classes?",
2203
+ RUBY_METHOD_FUNC(re2_regexp_perl_classes), 0);
2204
+ rb_define_method(re2_cRegexp, "word_boundary?",
2205
+ RUBY_METHOD_FUNC(re2_regexp_word_boundary), 0);
2206
+ rb_define_method(re2_cRegexp, "one_line?",
2207
+ RUBY_METHOD_FUNC(re2_regexp_one_line), 0);
2208
+
2209
+ rb_define_singleton_method(re2_cSet, "match_raises_errors?",
2210
+ RUBY_METHOD_FUNC(re2_set_match_raises_errors_p), 0);
2211
+ rb_define_method(re2_cSet, "initialize",
2212
+ RUBY_METHOD_FUNC(re2_set_initialize), -1);
2213
+ rb_define_method(re2_cSet, "add", RUBY_METHOD_FUNC(re2_set_add), 1);
2214
+ rb_define_method(re2_cSet, "compile", RUBY_METHOD_FUNC(re2_set_compile), 0);
2215
+ rb_define_method(re2_cSet, "match", RUBY_METHOD_FUNC(re2_set_match), -1);
2216
+
2217
+ rb_define_module_function(re2_mRE2, "Replace",
2218
+ RUBY_METHOD_FUNC(re2_Replace), 3);
2219
+ rb_define_module_function(re2_mRE2, "GlobalReplace",
2220
+ RUBY_METHOD_FUNC(re2_GlobalReplace), 3);
2221
+ rb_define_module_function(re2_mRE2, "QuoteMeta",
2222
+ RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
2223
+ rb_define_singleton_method(re2_cRegexp, "escape",
2224
+ RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
2225
+ rb_define_singleton_method(re2_cRegexp, "quote",
2226
+ RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
2227
+
2228
+ // (see RE2::Regexp#initialize)
2229
+ rb_define_singleton_method(re2_cRegexp, "compile",
2230
+ RUBY_METHOD_FUNC(rb_class_new_instance), -1);
2231
+
2232
+ rb_define_module_function(rb_mKernel, "RE2", RUBY_METHOD_FUNC(re2_re2), -1);
2233
+
2234
+ /* Create the symbols used in options. */
2235
+ id_utf8 = rb_intern("utf8");
2236
+ id_posix_syntax = rb_intern("posix_syntax");
2237
+ id_longest_match = rb_intern("longest_match");
2238
+ id_log_errors = rb_intern("log_errors");
2239
+ id_max_mem = rb_intern("max_mem");
2240
+ id_literal = rb_intern("literal");
2241
+ id_never_nl = rb_intern("never_nl");
2242
+ id_case_sensitive = rb_intern("case_sensitive");
2243
+ id_perl_classes = rb_intern("perl_classes");
2244
+ id_word_boundary = rb_intern("word_boundary");
2245
+ id_one_line = rb_intern("one_line");
2246
+ id_unanchored = rb_intern("unanchored");
2247
+ id_anchor = rb_intern("anchor");
2248
+ id_anchor_start = rb_intern("anchor_start");
2249
+ id_anchor_both = rb_intern("anchor_both");
2250
+ id_exception = rb_intern("exception");
2251
+ id_submatches = rb_intern("submatches");
2252
+ id_startpos = rb_intern("startpos");
2253
+ id_endpos = rb_intern("endpos");
2254
+ }