re2 2.15.0.rc1-x86-linux-gnu

Sign up to get free protection for your applications and to get access to all the features.
data/ext/re2/re2.cc ADDED
@@ -0,0 +1,2254 @@
1
+ /*
2
+ * re2 (https://github.com/mudge/re2)
3
+ * Ruby bindings to RE2, a "fast, safe, thread-friendly alternative to
4
+ * backtracking regular expression engines like those used in PCRE, Perl, and
5
+ * Python".
6
+ *
7
+ * Copyright (c) 2010, Paul Mucur (https://mudge.name)
8
+ * Released under the BSD Licence, please see LICENSE.txt
9
+ */
10
+
11
+ #include <stdint.h>
12
+
13
+ #include <map>
14
+ #include <sstream>
15
+ #include <string>
16
+ #include <vector>
17
+
18
+ #include <re2/re2.h>
19
+ #include <re2/set.h>
20
+ #include <ruby.h>
21
+ #include <ruby/encoding.h>
22
+
23
/* Convert a C/C++ truth value into Ruby's Qtrue/Qfalse. The argument is
 * parenthesised so that low-precedence expressions (e.g. an assignment) are
 * evaluated as a whole before being tested. */
#define BOOL2RUBY(v) ((v) ? Qtrue : Qfalse)
24
+
25
+ typedef struct {
26
+ RE2 *pattern;
27
+ } re2_pattern;
28
+
29
+ typedef struct {
30
+ re2::StringPiece *matches;
31
+ int number_of_matches;
32
+ VALUE regexp, text;
33
+ } re2_matchdata;
34
+
35
+ typedef struct {
36
+ re2::StringPiece *input;
37
+ int number_of_capturing_groups;
38
+ bool eof;
39
+ VALUE regexp, text;
40
+ } re2_scanner;
41
+
42
+ typedef struct {
43
+ RE2::Set *set;
44
+ } re2_set;
45
+
46
+ VALUE re2_mRE2, re2_cRegexp, re2_cMatchData, re2_cScanner, re2_cSet,
47
+ re2_eSetMatchError, re2_eSetUnsupportedError, re2_eRegexpUnsupportedError;
48
+
49
+ /* Symbols used in RE2 options. */
50
+ static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors,
51
+ id_max_mem, id_literal, id_never_nl, id_case_sensitive,
52
+ id_perl_classes, id_word_boundary, id_one_line, id_unanchored,
53
+ id_anchor, id_anchor_start, id_anchor_both, id_exception,
54
+ id_submatches, id_startpos, id_endpos;
55
+
56
+ inline VALUE encoded_str_new(const char *str, long length, RE2::Options::Encoding encoding) {
57
+ if (encoding == RE2::Options::EncodingUTF8) {
58
+ return rb_utf8_str_new(str, length);
59
+ }
60
+
61
+ VALUE string = rb_str_new(str, length);
62
+ rb_enc_associate_index(string, rb_enc_find_index("ISO-8859-1"));
63
+
64
+ return string;
65
+ }
66
+
67
+ static void parse_re2_options(RE2::Options* re2_options, const VALUE options) {
68
+ if (TYPE(options) != T_HASH) {
69
+ rb_raise(rb_eArgError, "options should be a hash");
70
+ }
71
+
72
+ VALUE utf8 = rb_hash_aref(options, ID2SYM(id_utf8));
73
+ if (!NIL_P(utf8)) {
74
+ re2_options->set_encoding(RTEST(utf8) ? RE2::Options::EncodingUTF8 : RE2::Options::EncodingLatin1);
75
+ }
76
+
77
+ VALUE posix_syntax = rb_hash_aref(options, ID2SYM(id_posix_syntax));
78
+ if (!NIL_P(posix_syntax)) {
79
+ re2_options->set_posix_syntax(RTEST(posix_syntax));
80
+ }
81
+
82
+ VALUE longest_match = rb_hash_aref(options, ID2SYM(id_longest_match));
83
+ if (!NIL_P(longest_match)) {
84
+ re2_options->set_longest_match(RTEST(longest_match));
85
+ }
86
+
87
+ VALUE log_errors = rb_hash_aref(options, ID2SYM(id_log_errors));
88
+ if (!NIL_P(log_errors)) {
89
+ re2_options->set_log_errors(RTEST(log_errors));
90
+ }
91
+
92
+ VALUE max_mem = rb_hash_aref(options, ID2SYM(id_max_mem));
93
+ if (!NIL_P(max_mem)) {
94
+ re2_options->set_max_mem(NUM2INT(max_mem));
95
+ }
96
+
97
+ VALUE literal = rb_hash_aref(options, ID2SYM(id_literal));
98
+ if (!NIL_P(literal)) {
99
+ re2_options->set_literal(RTEST(literal));
100
+ }
101
+
102
+ VALUE never_nl = rb_hash_aref(options, ID2SYM(id_never_nl));
103
+ if (!NIL_P(never_nl)) {
104
+ re2_options->set_never_nl(RTEST(never_nl));
105
+ }
106
+
107
+ VALUE case_sensitive = rb_hash_aref(options, ID2SYM(id_case_sensitive));
108
+ if (!NIL_P(case_sensitive)) {
109
+ re2_options->set_case_sensitive(RTEST(case_sensitive));
110
+ }
111
+
112
+ VALUE perl_classes = rb_hash_aref(options, ID2SYM(id_perl_classes));
113
+ if (!NIL_P(perl_classes)) {
114
+ re2_options->set_perl_classes(RTEST(perl_classes));
115
+ }
116
+
117
+ VALUE word_boundary = rb_hash_aref(options, ID2SYM(id_word_boundary));
118
+ if (!NIL_P(word_boundary)) {
119
+ re2_options->set_word_boundary(RTEST(word_boundary));
120
+ }
121
+
122
+ VALUE one_line = rb_hash_aref(options, ID2SYM(id_one_line));
123
+ if (!NIL_P(one_line)) {
124
+ re2_options->set_one_line(RTEST(one_line));
125
+ }
126
+ }
127
+
128
+ /* For compatibility with Ruby < 2.7 */
129
#ifndef HAVE_RB_GC_MARK_MOVABLE
/* No movable marking available: fall back to plain marking and emit no
 * compaction callback entry in the rb_data_type_t tables. */
#define rb_gc_mark_movable(x) rb_gc_mark(x)
#define re2_compact_callback(x)
#else
/* Expands to the callback followed by a comma so it can be dropped straight
 * into the function table initialisers. */
#define re2_compact_callback(x) (x),
#endif
135
+
136
+ static void re2_matchdata_mark(void *ptr) {
137
+ re2_matchdata *m = reinterpret_cast<re2_matchdata *>(ptr);
138
+ rb_gc_mark_movable(m->regexp);
139
+ rb_gc_mark_movable(m->text);
140
+ }
141
+
142
#ifdef HAVE_RB_GC_MARK_MOVABLE
/* GC compaction callback: refresh the stored references after objects move. */
static void re2_matchdata_compact(void *ptr) {
  re2_matchdata *data = static_cast<re2_matchdata *>(ptr);

  data->regexp = rb_gc_location(data->regexp);
  data->text = rb_gc_location(data->text);
}
#endif
149
+
150
+ static void re2_matchdata_free(void *ptr) {
151
+ re2_matchdata *m = reinterpret_cast<re2_matchdata *>(ptr);
152
+ if (m->matches) {
153
+ delete[] m->matches;
154
+ }
155
+ xfree(m);
156
+ }
157
+
158
+ static size_t re2_matchdata_memsize(const void *ptr) {
159
+ const re2_matchdata *m = reinterpret_cast<const re2_matchdata *>(ptr);
160
+ size_t size = sizeof(*m);
161
+ if (m->matches) {
162
+ size += sizeof(*m->matches) * m->number_of_matches;
163
+ }
164
+
165
+ return size;
166
+ }
167
+
168
+ static const rb_data_type_t re2_matchdata_data_type = {
169
+ "RE2::MatchData",
170
+ {
171
+ re2_matchdata_mark,
172
+ re2_matchdata_free,
173
+ re2_matchdata_memsize,
174
+ re2_compact_callback(re2_matchdata_compact)
175
+ },
176
+ 0,
177
+ 0,
178
+ // IMPORTANT: WB_PROTECTED objects must only use the RB_OBJ_WRITE()
179
+ // macro to update VALUE references, as to trigger write barriers.
180
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
181
+ };
182
+
183
+ static void re2_scanner_mark(void *ptr) {
184
+ re2_scanner *s = reinterpret_cast<re2_scanner *>(ptr);
185
+ rb_gc_mark_movable(s->regexp);
186
+ rb_gc_mark_movable(s->text);
187
+ }
188
+
189
#ifdef HAVE_RB_GC_MARK_MOVABLE
/* GC compaction callback: refresh the stored references after objects move. */
static void re2_scanner_compact(void *ptr) {
  re2_scanner *scanner = static_cast<re2_scanner *>(ptr);

  scanner->regexp = rb_gc_location(scanner->regexp);
  scanner->text = rb_gc_location(scanner->text);
}
#endif
196
+
197
+ static void re2_scanner_free(void *ptr) {
198
+ re2_scanner *s = reinterpret_cast<re2_scanner *>(ptr);
199
+ if (s->input) {
200
+ delete s->input;
201
+ }
202
+ xfree(s);
203
+ }
204
+
205
+ static size_t re2_scanner_memsize(const void *ptr) {
206
+ const re2_scanner *s = reinterpret_cast<const re2_scanner *>(ptr);
207
+ size_t size = sizeof(*s);
208
+ if (s->input) {
209
+ size += sizeof(*s->input);
210
+ }
211
+
212
+ return size;
213
+ }
214
+
215
+ static const rb_data_type_t re2_scanner_data_type = {
216
+ "RE2::Scanner",
217
+ {
218
+ re2_scanner_mark,
219
+ re2_scanner_free,
220
+ re2_scanner_memsize,
221
+ re2_compact_callback(re2_scanner_compact)
222
+ },
223
+ 0,
224
+ 0,
225
+ // IMPORTANT: WB_PROTECTED objects must only use the RB_OBJ_WRITE()
226
+ // macro to update VALUE references, as to trigger write barriers.
227
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
228
+ };
229
+
230
+ static void re2_regexp_free(void *ptr) {
231
+ re2_pattern *p = reinterpret_cast<re2_pattern *>(ptr);
232
+ if (p->pattern) {
233
+ delete p->pattern;
234
+ }
235
+ xfree(p);
236
+ }
237
+
238
+ static size_t re2_regexp_memsize(const void *ptr) {
239
+ const re2_pattern *p = reinterpret_cast<const re2_pattern *>(ptr);
240
+ size_t size = sizeof(*p);
241
+ if (p->pattern) {
242
+ size += sizeof(*p->pattern);
243
+ }
244
+
245
+ return size;
246
+ }
247
+
248
+ static const rb_data_type_t re2_regexp_data_type = {
249
+ "RE2::Regexp",
250
+ {
251
+ 0,
252
+ re2_regexp_free,
253
+ re2_regexp_memsize,
254
+ },
255
+ 0,
256
+ 0,
257
+ // IMPORTANT: WB_PROTECTED objects must only use the RB_OBJ_WRITE()
258
+ // macro to update VALUE references, as to trigger write barriers.
259
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
260
+ };
261
+
262
+ static VALUE re2_matchdata_allocate(VALUE klass) {
263
+ re2_matchdata *m;
264
+
265
+ return TypedData_Make_Struct(klass, re2_matchdata, &re2_matchdata_data_type,
266
+ m);
267
+ }
268
+
269
+ static VALUE re2_scanner_allocate(VALUE klass) {
270
+ re2_scanner *c;
271
+
272
+ return TypedData_Make_Struct(klass, re2_scanner, &re2_scanner_data_type, c);
273
+ }
274
+
275
+ /*
276
+ * Returns a frozen copy of the text supplied when matching.
277
+ *
278
+ * If the text was already a frozen string, returns the original.
279
+ *
280
+ * @return [String] a frozen string with the text supplied when matching
281
+ * @example
282
+ * m = RE2::Regexp.new('(\d+)').partial_match("bob 123")
283
+ * m.string #=> "bob 123"
284
+ */
285
+ static VALUE re2_matchdata_string(const VALUE self) {
286
+ re2_matchdata *m;
287
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
288
+
289
+ return m->text;
290
+ }
291
+
292
+ /*
293
+ * Returns the text supplied when incrementally matching with
294
+ * {RE2::Regexp#scan}.
295
+ *
296
+ * @return [String] the original string passed to {RE2::Regexp#scan}
297
+ * @example
298
+ * c = RE2::Regexp.new('(\d+)').scan("foo")
299
+ * c.string #=> "foo"
300
+ */
301
+ static VALUE re2_scanner_string(const VALUE self) {
302
+ re2_scanner *c;
303
+ TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
304
+
305
+ return c->text;
306
+ }
307
+
308
+ /*
309
+ * Returns whether the {RE2::Scanner} has consumed all input or not.
310
+ *
311
+ * @return [Boolean] whether the {RE2::Scanner} has consumed all input or not
312
+ * @example
313
+ * c = RE2::Regexp.new('(\d+)').scan("foo")
314
+ * c.eof? #=> true
315
+ */
316
+ static VALUE re2_scanner_eof(const VALUE self) {
317
+ re2_scanner *c;
318
+ TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
319
+
320
+ return BOOL2RUBY(c->eof);
321
+ }
322
+
323
+ /*
324
+ * Rewind the {RE2::Scanner} to the start of the string.
325
+ *
326
+ * @example
327
+ * s = RE2::Regexp.new('(\d+)').scan("1 2 3")
328
+ * e = s.to_enum
329
+ * e.scan #=> ["1"]
330
+ * e.scan #=> ["2"]
331
+ * s.rewind
332
+ * e.scan #=> ["1"]
333
+ */
334
+ static VALUE re2_scanner_rewind(VALUE self) {
335
+ re2_scanner *c;
336
+ TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
337
+
338
+ delete c->input;
339
+ c->input = new(std::nothrow) re2::StringPiece(
340
+ RSTRING_PTR(c->text), RSTRING_LEN(c->text));
341
+ c->eof = false;
342
+
343
+ return self;
344
+ }
345
+
346
+ /*
347
+ * Scan the given text incrementally for matches using
348
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L447-L463
349
+ * `FindAndConsume`}, returning an array of submatches on each subsequent
350
+ * call. Returns `nil` if no matches are found or an empty array for every
351
+ * match if the pattern has no capturing groups.
352
+ *
353
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
354
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
355
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
356
+ *
357
+ * @return [Array<String>] if the pattern has capturing groups
358
+ * @return [[]] if the pattern does not have capturing groups
359
+ * @return [nil] if no matches are found
360
+ * @example
361
+ * s = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
362
+ * s.scan #=> ["Foo"]
363
+ * s.scan #=> ["bar"]
364
+ */
365
+ static VALUE re2_scanner_scan(VALUE self) {
366
+ re2_pattern *p;
367
+ re2_scanner *c;
368
+
369
+ TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
370
+ TypedData_Get_Struct(c->regexp, re2_pattern, &re2_regexp_data_type, p);
371
+
372
+ std::vector<RE2::Arg> argv(c->number_of_capturing_groups);
373
+ std::vector<RE2::Arg*> args(c->number_of_capturing_groups);
374
+ std::vector<re2::StringPiece> matches(c->number_of_capturing_groups);
375
+
376
+ if (c->eof) {
377
+ return Qnil;
378
+ }
379
+
380
+ re2::StringPiece::size_type original_input_size = c->input->size();
381
+
382
+ for (int i = 0; i < c->number_of_capturing_groups; ++i) {
383
+ argv[i] = &matches[i];
384
+ args[i] = &argv[i];
385
+ }
386
+
387
+ if (RE2::FindAndConsumeN(c->input, *p->pattern, args.data(),
388
+ c->number_of_capturing_groups)) {
389
+ re2::StringPiece::size_type new_input_size = c->input->size();
390
+ bool input_advanced = new_input_size < original_input_size;
391
+
392
+ VALUE result = rb_ary_new2(c->number_of_capturing_groups);
393
+
394
+ for (int i = 0; i < c->number_of_capturing_groups; ++i) {
395
+ if (matches[i].empty()) {
396
+ rb_ary_push(result, Qnil);
397
+ } else {
398
+ rb_ary_push(result, encoded_str_new(matches[i].data(),
399
+ matches[i].size(),
400
+ p->pattern->options().encoding()));
401
+ }
402
+ }
403
+
404
+ /* Check whether we've exhausted the input yet. */
405
+ c->eof = new_input_size == 0;
406
+
407
+ /* If the match didn't advance the input, we need to do this ourselves. */
408
+ if (!input_advanced && new_input_size > 0) {
409
+ c->input->remove_prefix(1);
410
+ }
411
+
412
+ return result;
413
+ } else {
414
+ return Qnil;
415
+ }
416
+ }
417
+
418
+ static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) {
419
+ re2_matchdata *m;
420
+ re2_pattern *p;
421
+
422
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
423
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
424
+
425
+ int id;
426
+
427
+ if (FIXNUM_P(idx)) {
428
+ id = FIX2INT(idx);
429
+ } else if (SYMBOL_P(idx)) {
430
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
431
+ std::map<std::string, int>::const_iterator search = groups.find(rb_id2name(SYM2ID(idx)));
432
+
433
+ if (search != groups.end()) {
434
+ id = search->second;
435
+ } else {
436
+ return NULL;
437
+ }
438
+ } else {
439
+ StringValue(idx);
440
+
441
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
442
+ std::map<std::string, int>::const_iterator search = groups.find(std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)));
443
+
444
+ if (search != groups.end()) {
445
+ id = search->second;
446
+ } else {
447
+ return NULL;
448
+ }
449
+ }
450
+
451
+ if (id >= 0 && id < m->number_of_matches) {
452
+ re2::StringPiece *match = &m->matches[id];
453
+
454
+ if (!match->empty()) {
455
+ return match;
456
+ }
457
+ }
458
+
459
+ return NULL;
460
+ }
461
+
462
+ /*
463
+ * Returns the number of elements in the {RE2::MatchData} (including the
464
+ * overall match, submatches and any `nils`).
465
+ *
466
+ * @return [Integer] the number of elements
467
+ * @example
468
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
469
+ * m.size #=> 2
470
+ * m.length #=> 2
471
+ */
472
+ static VALUE re2_matchdata_size(const VALUE self) {
473
+ re2_matchdata *m;
474
+
475
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
476
+
477
+ return INT2FIX(m->number_of_matches);
478
+ }
479
+
480
+ /*
481
+ * Returns the offset of the start of the nth element of the {RE2::MatchData}.
482
+ *
483
+ * @param [Integer, String, Symbol] n the name or number of the submatch
484
+ * @return [Integer, nil] the offset of the start of the match or `nil` if
485
+ * there is no such submatch
486
+ * @example
487
+ * m = RE2::Regexp.new('ob (\d+)').match("bob 123")
488
+ * m.begin(0) #=> 1
489
+ * m.begin(1) #=> 4
490
+ */
491
+ static VALUE re2_matchdata_begin(const VALUE self, VALUE n) {
492
+ re2_matchdata *m;
493
+
494
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
495
+
496
+ re2::StringPiece *match = re2_matchdata_find_match(n, self);
497
+ if (match == NULL) {
498
+ return Qnil;
499
+ } else {
500
+ long offset = match->data() - RSTRING_PTR(m->text);
501
+
502
+ return LONG2NUM(rb_str_sublen(m->text, offset));
503
+ }
504
+ }
505
+
506
+ /*
507
+ * Returns the offset of the character following the end of the nth element of
508
+ * the {RE2::MatchData}.
509
+ *
510
+ * @param [Integer, String, Symbol] n the name or number of the match
511
+ * @return [Integer, nil] the offset of the character following the end of the
512
+ * match or `nil` if there is no such match
513
+ * @example
514
+ * m = RE2::Regexp.new('ob (\d+) b').match("bob 123 bob")
515
+ * m.end(0) #=> 9
516
+ * m.end(1) #=> 7
517
+ */
518
+ static VALUE re2_matchdata_end(const VALUE self, VALUE n) {
519
+ re2_matchdata *m;
520
+
521
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
522
+
523
+ re2::StringPiece *match = re2_matchdata_find_match(n, self);
524
+ if (match == NULL) {
525
+ return Qnil;
526
+ } else {
527
+ long offset = (match->data() - RSTRING_PTR(m->text)) + match->size();
528
+
529
+ return LONG2NUM(rb_str_sublen(m->text, offset));
530
+ }
531
+ }
532
+
533
+ /*
534
+ * Returns the {RE2::Regexp} used in the match.
535
+ *
536
+ * @return [RE2::Regexp] the regular expression used in the match
537
+ * @example
538
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
539
+ * m.regexp #=> #<RE2::Regexp /(\d+)/>
540
+ */
541
+ static VALUE re2_matchdata_regexp(const VALUE self) {
542
+ re2_matchdata *m;
543
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
544
+
545
+ return m->regexp;
546
+ }
547
+
548
+ /*
549
+ * Returns the {RE2::Regexp} used in the {RE2::Scanner}.
550
+ *
551
+ * @return [RE2::Regexp] the regular expression used in the {RE2::Scanner}
552
+ * @example
553
+ * c = RE2::Regexp.new('(\d+)').scan("bob 123")
554
+ * c.regexp #=> #<RE2::Regexp /(\d+)/>
555
+ */
556
+ static VALUE re2_scanner_regexp(const VALUE self) {
557
+ re2_scanner *c;
558
+ TypedData_Get_Struct(self, re2_scanner, &re2_scanner_data_type, c);
559
+
560
+ return c->regexp;
561
+ }
562
+
563
+ static VALUE re2_regexp_allocate(VALUE klass) {
564
+ re2_pattern *p;
565
+
566
+ return TypedData_Make_Struct(klass, re2_pattern, &re2_regexp_data_type, p);
567
+ }
568
+
569
+ /*
570
+ * Returns the array of matches including the overall match, submatches and any
571
+ * `nil`s.
572
+ *
573
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
574
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
575
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
576
+ *
577
+ * @return [Array<String, nil>] the array of matches
578
+ * @example
579
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
580
+ * m.to_a #=> ["123", "123"]
581
+ */
582
+ static VALUE re2_matchdata_to_a(const VALUE self) {
583
+ re2_matchdata *m;
584
+ re2_pattern *p;
585
+
586
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
587
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
588
+
589
+ VALUE array = rb_ary_new2(m->number_of_matches);
590
+ for (int i = 0; i < m->number_of_matches; ++i) {
591
+ re2::StringPiece *match = &m->matches[i];
592
+
593
+ if (match->empty()) {
594
+ rb_ary_push(array, Qnil);
595
+ } else {
596
+ rb_ary_push(array, encoded_str_new(match->data(), match->size(),
597
+ p->pattern->options().encoding()));
598
+ }
599
+ }
600
+
601
+ return array;
602
+ }
603
+
604
+ static VALUE re2_matchdata_nth_match(int nth, const VALUE self) {
605
+ re2_matchdata *m;
606
+ re2_pattern *p;
607
+
608
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
609
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
610
+
611
+ if (nth < 0 || nth >= m->number_of_matches) {
612
+ return Qnil;
613
+ } else {
614
+ re2::StringPiece *match = &m->matches[nth];
615
+
616
+ if (match->empty()) {
617
+ return Qnil;
618
+ } else {
619
+ return encoded_str_new(match->data(), match->size(),
620
+ p->pattern->options().encoding());
621
+ }
622
+ }
623
+ }
624
+
625
+ static VALUE re2_matchdata_named_match(const std::string &name, const VALUE self) {
626
+ re2_matchdata *m;
627
+ re2_pattern *p;
628
+
629
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
630
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
631
+
632
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
633
+ std::map<std::string, int>::const_iterator search = groups.find(name);
634
+
635
+ if (search != groups.end()) {
636
+ return re2_matchdata_nth_match(search->second, self);
637
+ } else {
638
+ return Qnil;
639
+ }
640
+ }
641
+
642
+ /*
643
+ * Retrieve zero, one or more matches by index or name.
644
+ *
645
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
646
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
647
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
648
+ *
649
+ * @overload [](index)
650
+ * Access a particular match by index.
651
+ *
652
+ * @param [Integer] index the index of the match to fetch
653
+ * @return [String, nil] the specified match or `nil` if it isn't present
654
+ * @example
655
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
656
+ * m[0] #=> "123"
657
+ *
658
+ * @overload [](start, length)
659
+ * Access a range of matches by starting index and length.
660
+ *
661
+ * @param [Integer] start the index from which to start
662
+ * @param [Integer] length the number of elements to fetch
663
+ * @return [Array<String, nil>] the specified matches
664
+ * @example
665
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
666
+ * m[0, 1] #=> ["123"]
667
+ *
668
+ * @overload [](range)
669
+ * Access a range of matches by index.
670
+ *
671
+ * @param [Range] range the range of match indexes to fetch
672
+ * @return [Array<String, nil>] the specified matches
673
+ * @example
674
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
675
+ * m[0..1] #=> ["123", "123"]
676
+ *
677
+ * @overload [](name)
678
+ * Access a particular match by name.
679
+ *
680
+ * @param [String, Symbol] name the name of the match to fetch
681
+ * @return [String, nil] the specific match or `nil` if it isn't present
682
+ * @example
683
+ * m = RE2::Regexp.new('(?P<number>\d+)').match("bob 123")
684
+ * m["number"] #=> "123"
685
+ * m[:number] #=> "123"
686
+ */
687
+ static VALUE re2_matchdata_aref(int argc, VALUE *argv, const VALUE self) {
688
+ VALUE idx, rest;
689
+ rb_scan_args(argc, argv, "11", &idx, &rest);
690
+
691
+ if (TYPE(idx) == T_STRING) {
692
+ return re2_matchdata_named_match(
693
+ std::string(RSTRING_PTR(idx), RSTRING_LEN(idx)), self);
694
+ } else if (SYMBOL_P(idx)) {
695
+ return re2_matchdata_named_match(rb_id2name(SYM2ID(idx)), self);
696
+ } else if (!NIL_P(rest) || !FIXNUM_P(idx) || FIX2INT(idx) < 0) {
697
+ return rb_ary_aref(argc, argv, re2_matchdata_to_a(self));
698
+ } else {
699
+ return re2_matchdata_nth_match(FIX2INT(idx), self);
700
+ }
701
+ }
702
+
703
+ /*
704
+ * Returns the entire matched string.
705
+ *
706
+ * @return [String] the entire matched string
707
+ */
708
+ static VALUE re2_matchdata_to_s(const VALUE self) {
709
+ return re2_matchdata_nth_match(0, self);
710
+ }
711
+
712
+ /*
713
+ * Returns a printable version of the match.
714
+ *
715
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
716
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
717
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
718
+ *
719
+ * @return [String] a printable version of the match
720
+ * @example
721
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
722
+ * m.inspect #=> "#<RE2::MatchData \"123\" 1:\"123\">"
723
+ */
724
+ static VALUE re2_matchdata_inspect(const VALUE self) {
725
+ re2_matchdata *m;
726
+ re2_pattern *p;
727
+
728
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
729
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
730
+
731
+ std::ostringstream output;
732
+ output << "#<RE2::MatchData";
733
+
734
+ for (int i = 0; i < m->number_of_matches; ++i) {
735
+ output << " ";
736
+
737
+ if (i > 0) {
738
+ output << i << ":";
739
+ }
740
+
741
+ VALUE match = re2_matchdata_nth_match(i, self);
742
+
743
+ if (match == Qnil) {
744
+ output << "nil";
745
+ } else {
746
+ output << "\"";
747
+ output.write(RSTRING_PTR(match), RSTRING_LEN(match));
748
+ output << "\"";
749
+ }
750
+ }
751
+
752
+ output << ">";
753
+
754
+ return encoded_str_new(output.str().data(), output.str().length(),
755
+ p->pattern->options().encoding());
756
+ }
757
+
758
+ /*
759
+ * Returns the array of submatches for pattern matching.
760
+ *
761
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
762
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
763
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is
764
+ * undefined).
765
+ *
766
+ * @return [Array<String, nil>] the array of submatches
767
+ * @example
768
+ * m = RE2::Regexp.new('(\d+)').match("bob 123")
769
+ * m.deconstruct #=> ["123"]
770
+ *
771
+ * @example pattern matching
772
+ * case RE2::Regexp.new('(\d+) (\d+)').match("bob 123 456")
773
+ * in x, y
774
+ * puts "Matched #{x} #{y}"
775
+ * else
776
+ * puts "Unrecognised match"
777
+ * end
778
+ */
779
+ static VALUE re2_matchdata_deconstruct(const VALUE self) {
780
+ re2_matchdata *m;
781
+ re2_pattern *p;
782
+
783
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
784
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
785
+
786
+ VALUE array = rb_ary_new2(m->number_of_matches - 1);
787
+ for (int i = 1; i < m->number_of_matches; ++i) {
788
+ re2::StringPiece *match = &m->matches[i];
789
+
790
+ if (match->empty()) {
791
+ rb_ary_push(array, Qnil);
792
+ } else {
793
+ rb_ary_push(array, encoded_str_new(match->data(), match->size(),
794
+ p->pattern->options().encoding()));
795
+ }
796
+ }
797
+
798
+ return array;
799
+ }
800
+
801
+ /*
802
+ * Returns a hash of capturing group names to submatches for pattern matching.
803
+ *
804
+ * As this is used by Ruby's pattern matching, it will return an empty hash if given
805
+ * more keys than there are capturing groups. Given keys will populate the hash in
806
+ * order but an invalid name will cause the hash to be immediately returned.
807
+ *
808
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
809
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
810
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
811
+ *
812
+ * @return [Hash] a hash of capturing group names to submatches
813
+ * @param [Array<Symbol>, nil] keys an array of `Symbol` capturing group names
814
+ * or `nil` to return all names
815
+ * @example
816
+ * m = RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
817
+ * m.deconstruct_keys(nil) #=> {numbers: "123", letters: "abc"}
818
+ * m.deconstruct_keys([:numbers]) #=> {numbers: "123"}
819
+ * m.deconstruct_keys([:fruit]) #=> {}
820
+ * m.deconstruct_keys([:letters, :fruit]) #=> {letters: "abc"}
821
+ *
822
+ * @example pattern matching
823
+ * case RE2::Regexp.new('(?P<numbers>\d+) (?P<letters>[a-zA-Z]+)').match('123 abc')
824
+ * in numbers:, letters:
825
+ * puts "Numbers: #{numbers}, letters: #{letters}"
826
+ * else
827
+ * puts "Unrecognised match"
828
+ * end
829
+ */
830
+ static VALUE re2_matchdata_deconstruct_keys(const VALUE self, const VALUE keys) {
831
+ re2_matchdata *m;
832
+ re2_pattern *p;
833
+
834
+ TypedData_Get_Struct(self, re2_matchdata, &re2_matchdata_data_type, m);
835
+ TypedData_Get_Struct(m->regexp, re2_pattern, &re2_regexp_data_type, p);
836
+
837
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
838
+ VALUE capturing_groups = rb_hash_new();
839
+
840
+ if (NIL_P(keys)) {
841
+ for (std::map<std::string, int>::const_iterator it = groups.begin(); it != groups.end(); ++it) {
842
+ rb_hash_aset(capturing_groups,
843
+ ID2SYM(rb_intern(it->first.data())),
844
+ re2_matchdata_nth_match(it->second, self));
845
+ }
846
+ } else {
847
+ Check_Type(keys, T_ARRAY);
848
+
849
+ if (p->pattern->NumberOfCapturingGroups() >= RARRAY_LEN(keys)) {
850
+ for (int i = 0; i < RARRAY_LEN(keys); ++i) {
851
+ VALUE key = rb_ary_entry(keys, i);
852
+ Check_Type(key, T_SYMBOL);
853
+ const char *name = rb_id2name(SYM2ID(key));
854
+ std::map<std::string, int>::const_iterator search = groups.find(name);
855
+
856
+ if (search != groups.end()) {
857
+ rb_hash_aset(capturing_groups, key, re2_matchdata_nth_match(search->second, self));
858
+ } else {
859
+ break;
860
+ }
861
+ }
862
+ }
863
+ }
864
+
865
+ return capturing_groups;
866
+ }
867
+
868
+ /*
869
+ * Shorthand to compile a new {RE2::Regexp}.
870
+ *
871
+ * @see RE2::Regexp#initialize
872
+ */
873
+ static VALUE re2_re2(int argc, VALUE *argv, VALUE) {
874
+ return rb_class_new_instance(argc, argv, re2_cRegexp);
875
+ }
876
+
877
+ /*
878
+ * Returns a new {RE2::Regexp} object with a compiled version of
879
+ * `pattern` stored inside.
880
+ *
881
+ * @overload initialize(pattern)
882
+ * Returns a new {RE2::Regexp} object with a compiled version of
883
+ * `pattern` stored inside with the default options.
884
+ *
885
+ * @param [String] pattern the pattern to compile
886
+ * @return [RE2::Regexp] a {RE2::Regexp} with the specified pattern
887
+ * @raise [TypeError] if the given pattern can't be coerced to a `String`
888
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled
889
+ * pattern
890
+ *
891
+ * @overload initialize(pattern, options)
892
+ * Returns a new {RE2::Regexp} object with a compiled version of
893
+ * `pattern` stored inside with the specified options.
894
+ *
895
+ * @param [String] pattern the pattern to compile
896
+ * @param [Hash] options the options with which to compile the pattern
897
+ * @option options [Boolean] :utf8 (true) text and pattern are UTF-8; otherwise Latin-1
898
+ * @option options [Boolean] :posix_syntax (false) restrict regexps to POSIX egrep syntax
899
+ * @option options [Boolean] :longest_match (false) search for longest match, not first match
900
+ * @option options [Boolean] :log_errors (true) log syntax and execution errors to ERROR
901
+ * @option options [Integer] :max_mem approx. max memory footprint of RE2
902
+ * @option options [Boolean] :literal (false) interpret string as literal, not regexp
903
+ * @option options [Boolean] :never_nl (false) never match `\n`, even if it is in regexp
904
+ * @option options [Boolean] :case_sensitive (true) match is case-sensitive (regexp can override with `(?i)` unless in `posix_syntax` mode)
905
+ * @option options [Boolean] :perl_classes (false) allow Perl's `\d` `\s` `\w` `\D` `\S` `\W` when in `posix_syntax` mode
906
+ * @option options [Boolean] :word_boundary (false) allow `\b` `\B` (word boundary and not) when in `posix_syntax` mode
907
+ * @option options [Boolean] :one_line (false) `^` and `$` only match beginning and end of text when in `posix_syntax` mode
908
+ * @return [RE2::Regexp] a {RE2::Regexp} with the specified pattern and options
909
+ * @raise [TypeError] if the given pattern can't be coerced to a `String`
910
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
911
+ */
912
+ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
913
+ VALUE pattern, options;
914
+ re2_pattern *p;
915
+
916
+ rb_scan_args(argc, argv, "11", &pattern, &options);
917
+
918
+ /* Ensure pattern is a string. */
919
+ StringValue(pattern);
920
+
921
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
922
+
923
+ if (RTEST(options)) {
924
+ RE2::Options re2_options;
925
+ parse_re2_options(&re2_options, options);
926
+
927
+ p->pattern = new(std::nothrow) RE2(
928
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), re2_options);
929
+ } else {
930
+ p->pattern = new(std::nothrow) RE2(
931
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)));
932
+ }
933
+
934
+ if (p->pattern == 0) {
935
+ rb_raise(rb_eNoMemError, "not enough memory to allocate RE2 object");
936
+ }
937
+
938
+ return self;
939
+ }
940
+
941
+ /*
942
+ * Returns a printable version of the regular expression.
943
+ *
944
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
945
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
946
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is
947
+ * undefined).
948
+ *
949
+ * @return [String] a printable version of the regular expression
950
+ * @example
951
+ * re2 = RE2::Regexp.new("woo?")
952
+ * re2.inspect #=> "#<RE2::Regexp /woo?/>"
953
+ */
954
+ static VALUE re2_regexp_inspect(const VALUE self) {
955
+ re2_pattern *p;
956
+
957
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
958
+
959
+ std::ostringstream output;
960
+
961
+ output << "#<RE2::Regexp /" << p->pattern->pattern() << "/>";
962
+
963
+ return encoded_str_new(output.str().data(), output.str().length(),
964
+ p->pattern->options().encoding());
965
+ }
966
+
967
+ /*
968
+ * Returns a string version of the regular expression.
969
+ *
970
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
971
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
972
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
973
+ *
974
+ * @return [String] a string version of the regular expression
975
+ * @example
976
+ * re2 = RE2::Regexp.new("woo?")
977
+ * re2.to_s #=> "woo?"
978
+ */
979
+ static VALUE re2_regexp_to_s(const VALUE self) {
980
+ re2_pattern *p;
981
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
982
+
983
+ return encoded_str_new(p->pattern->pattern().data(),
984
+ p->pattern->pattern().size(),
985
+ p->pattern->options().encoding());
986
+ }
987
+
988
+ /*
989
+ * Returns whether or not the regular expression was compiled successfully.
990
+ *
991
+ * @return [Boolean] whether or not compilation was successful
992
+ * @example
993
+ * re2 = RE2::Regexp.new("woo?")
994
+ * re2.ok? #=> true
995
+ */
996
+ static VALUE re2_regexp_ok(const VALUE self) {
997
+ re2_pattern *p;
998
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
999
+
1000
+ return BOOL2RUBY(p->pattern->ok());
1001
+ }
1002
+
1003
+ /*
1004
+ * Returns whether or not the regular expression was compiled with the `utf8`
1005
+ * option set to `true`.
1006
+ *
1007
+ * @return [Boolean] the `utf8` option
1008
+ * @example
1009
+ * re2 = RE2::Regexp.new("woo?", utf8: true)
1010
+ * re2.utf8? #=> true
1011
+ */
1012
+ static VALUE re2_regexp_utf8(const VALUE self) {
1013
+ re2_pattern *p;
1014
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1015
+
1016
+ return BOOL2RUBY(p->pattern->options().encoding() == RE2::Options::EncodingUTF8);
1017
+ }
1018
+
1019
+ /*
1020
+ * Returns whether or not the regular expression was compiled with the
1021
+ * `posix_syntax` option set to `true`.
1022
+ *
1023
+ * @return [Boolean] the `posix_syntax` option
1024
+ * @example
1025
+ * re2 = RE2::Regexp.new("woo?", posix_syntax: true)
1026
+ * re2.posix_syntax? #=> true
1027
+ */
1028
+ static VALUE re2_regexp_posix_syntax(const VALUE self) {
1029
+ re2_pattern *p;
1030
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1031
+
1032
+ return BOOL2RUBY(p->pattern->options().posix_syntax());
1033
+ }
1034
+
1035
+ /*
1036
+ * Returns whether or not the regular expression was compiled with the
1037
+ * `longest_match` option set to `true`.
1038
+ *
1039
+ * @return [Boolean] the `longest_match` option
1040
+ * @example
1041
+ * re2 = RE2::Regexp.new("woo?", longest_match: true)
1042
+ * re2.longest_match? #=> true
1043
+ */
1044
+ static VALUE re2_regexp_longest_match(const VALUE self) {
1045
+ re2_pattern *p;
1046
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1047
+
1048
+ return BOOL2RUBY(p->pattern->options().longest_match());
1049
+ }
1050
+
1051
+ /*
1052
+ * Returns whether or not the regular expression was compiled with the
1053
+ * `log_errors` option set to `true`.
1054
+ *
1055
+ * @return [Boolean] the `log_errors` option
1056
+ * @example
1057
+ * re2 = RE2::Regexp.new("woo?", log_errors: true)
1058
+ * re2.log_errors? #=> true
1059
+ */
1060
+ static VALUE re2_regexp_log_errors(const VALUE self) {
1061
+ re2_pattern *p;
1062
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1063
+
1064
+ return BOOL2RUBY(p->pattern->options().log_errors());
1065
+ }
1066
+
1067
+ /*
1068
+ * Returns the `max_mem` setting for the regular expression.
1069
+ *
1070
+ * @return [Integer] the `max_mem` option
1071
+ * @example
1072
+ * re2 = RE2::Regexp.new("woo?", max_mem: 1024)
1073
+ * re2.max_mem #=> 1024
1074
+ */
1075
+ static VALUE re2_regexp_max_mem(const VALUE self) {
1076
+ re2_pattern *p;
1077
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1078
+
1079
+ return INT2FIX(p->pattern->options().max_mem());
1080
+ }
1081
+
1082
+ /*
1083
+ * Returns whether or not the regular expression was compiled with the
1084
+ * `literal` option set to `true`.
1085
+ *
1086
+ * @return [Boolean] the `literal` option
1087
+ * @example
1088
+ * re2 = RE2::Regexp.new("woo?", literal: true)
1089
+ * re2.literal? #=> true
1090
+ */
1091
+ static VALUE re2_regexp_literal(const VALUE self) {
1092
+ re2_pattern *p;
1093
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1094
+
1095
+ return BOOL2RUBY(p->pattern->options().literal());
1096
+ }
1097
+
1098
+ /*
1099
+ * Returns whether or not the regular expression was compiled with the
1100
+ * `never_nl` option set to `true`.
1101
+ *
1102
+ * @return [Boolean] the `never_nl` option
1103
+ * @example
1104
+ * re2 = RE2::Regexp.new("woo?", never_nl: true)
1105
+ * re2.never_nl? #=> true
1106
+ */
1107
+ static VALUE re2_regexp_never_nl(const VALUE self) {
1108
+ re2_pattern *p;
1109
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1110
+
1111
+ return BOOL2RUBY(p->pattern->options().never_nl());
1112
+ }
1113
+
1114
+ /*
1115
+ * Returns whether or not the regular expression was compiled with the
1116
+ * `case_sensitive` option set to `true`.
1117
+ *
1118
+ * @return [Boolean] the `case_sensitive` option
1119
+ * @example
1120
+ * re2 = RE2::Regexp.new("woo?", case_sensitive: true)
1121
+ * re2.case_sensitive? #=> true
1122
+ */
1123
+ static VALUE re2_regexp_case_sensitive(const VALUE self) {
1124
+ re2_pattern *p;
1125
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1126
+
1127
+ return BOOL2RUBY(p->pattern->options().case_sensitive());
1128
+ }
1129
+
1130
+ /*
1131
+ * Returns whether or not the regular expression was compiled with the
1132
+ * `case_sensitive` option set to `false`.
1133
+ *
1134
+ * @return [Boolean] the inverse of the `case_sensitive` option
1135
+ * @example
1136
+ * re2 = RE2::Regexp.new("woo?", case_sensitive: true)
1137
+ * re2.case_insensitive? #=> false
1138
+ * re2.casefold? #=> false
1139
+ */
1140
+ static VALUE re2_regexp_case_insensitive(const VALUE self) {
1141
+ return BOOL2RUBY(re2_regexp_case_sensitive(self) != Qtrue);
1142
+ }
1143
+
1144
+ /*
1145
+ * Returns whether or not the regular expression was compiled with the
1146
+ * `perl_classes` option set to `true`.
1147
+ *
1148
+ * @return [Boolean] the `perl_classes` option
1149
+ * @example
1150
+ * re2 = RE2::Regexp.new("woo?", perl_classes: true)
1151
+ * re2.perl_classes? #=> true
1152
+ */
1153
+ static VALUE re2_regexp_perl_classes(const VALUE self) {
1154
+ re2_pattern *p;
1155
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1156
+
1157
+ return BOOL2RUBY(p->pattern->options().perl_classes());
1158
+ }
1159
+
1160
+ /*
1161
+ * Returns whether or not the regular expression was compiled with the
1162
+ * `word_boundary` option set to `true`.
1163
+ *
1164
+ * @return [Boolean] the `word_boundary` option
1165
+ * @example
1166
+ * re2 = RE2::Regexp.new("woo?", word_boundary: true)
1167
+ * re2.word_boundary? #=> true
1168
+ */
1169
+ static VALUE re2_regexp_word_boundary(const VALUE self) {
1170
+ re2_pattern *p;
1171
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1172
+
1173
+ return BOOL2RUBY(p->pattern->options().word_boundary());
1174
+ }
1175
+
1176
+ /*
1177
+ * Returns whether or not the regular expression was compiled with the
1178
+ * `one_line` option set to `true`.
1179
+ *
1180
+ * @return [Boolean] the `one_line` option
1181
+ * @example
1182
+ * re2 = RE2::Regexp.new("woo?", one_line: true)
1183
+ * re2.one_line? #=> true
1184
+ */
1185
+ static VALUE re2_regexp_one_line(const VALUE self) {
1186
+ re2_pattern *p;
1187
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1188
+
1189
+ return BOOL2RUBY(p->pattern->options().one_line());
1190
+ }
1191
+
1192
+ /*
1193
+ * If the {RE2::Regexp} could not be created properly, returns an error string
1194
+ * otherwise returns `nil`.
1195
+ *
1196
+ * @return [String, nil] the error string or `nil`
1197
+ */
1198
+ static VALUE re2_regexp_error(const VALUE self) {
1199
+ re2_pattern *p;
1200
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1201
+
1202
+ if (p->pattern->ok()) {
1203
+ return Qnil;
1204
+ } else {
1205
+ return rb_str_new(p->pattern->error().data(), p->pattern->error().size());
1206
+ }
1207
+ }
1208
+
1209
+ /*
1210
+ * If the {RE2::Regexp} could not be created properly, returns
1211
+ * the offending portion of the regexp otherwise returns `nil`.
1212
+ *
1213
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1214
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1215
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1216
+ *
1217
+ * @return [String, nil] the offending portion of the regexp or `nil`
1218
+ */
1219
+ static VALUE re2_regexp_error_arg(const VALUE self) {
1220
+ re2_pattern *p;
1221
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1222
+
1223
+ if (p->pattern->ok()) {
1224
+ return Qnil;
1225
+ } else {
1226
+ return encoded_str_new(p->pattern->error_arg().data(),
1227
+ p->pattern->error_arg().size(),
1228
+ p->pattern->options().encoding());
1229
+ }
1230
+ }
1231
+
1232
+ /*
1233
+ * Returns the program size, a very approximate measure
1234
+ * of a regexp's "cost". Larger numbers are more expensive
1235
+ * than smaller numbers.
1236
+ *
1237
+ * @return [Integer] the regexp "cost"
1238
+ */
1239
+ static VALUE re2_regexp_program_size(const VALUE self) {
1240
+ re2_pattern *p;
1241
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1242
+
1243
+ return INT2FIX(p->pattern->ProgramSize());
1244
+ }
1245
+
1246
+ /*
1247
+ * Returns a hash of the options currently set for the {RE2::Regexp}.
1248
+ *
1249
+ * @return [Hash] the options
1250
+ */
1251
+ static VALUE re2_regexp_options(const VALUE self) {
1252
+ re2_pattern *p;
1253
+
1254
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1255
+ VALUE options = rb_hash_new();
1256
+
1257
+ rb_hash_aset(options, ID2SYM(id_utf8),
1258
+ BOOL2RUBY(p->pattern->options().encoding() == RE2::Options::EncodingUTF8));
1259
+
1260
+ rb_hash_aset(options, ID2SYM(id_posix_syntax),
1261
+ BOOL2RUBY(p->pattern->options().posix_syntax()));
1262
+
1263
+ rb_hash_aset(options, ID2SYM(id_longest_match),
1264
+ BOOL2RUBY(p->pattern->options().longest_match()));
1265
+
1266
+ rb_hash_aset(options, ID2SYM(id_log_errors),
1267
+ BOOL2RUBY(p->pattern->options().log_errors()));
1268
+
1269
+ rb_hash_aset(options, ID2SYM(id_max_mem),
1270
+ INT2FIX(p->pattern->options().max_mem()));
1271
+
1272
+ rb_hash_aset(options, ID2SYM(id_literal),
1273
+ BOOL2RUBY(p->pattern->options().literal()));
1274
+
1275
+ rb_hash_aset(options, ID2SYM(id_never_nl),
1276
+ BOOL2RUBY(p->pattern->options().never_nl()));
1277
+
1278
+ rb_hash_aset(options, ID2SYM(id_case_sensitive),
1279
+ BOOL2RUBY(p->pattern->options().case_sensitive()));
1280
+
1281
+ rb_hash_aset(options, ID2SYM(id_perl_classes),
1282
+ BOOL2RUBY(p->pattern->options().perl_classes()));
1283
+
1284
+ rb_hash_aset(options, ID2SYM(id_word_boundary),
1285
+ BOOL2RUBY(p->pattern->options().word_boundary()));
1286
+
1287
+ rb_hash_aset(options, ID2SYM(id_one_line),
1288
+ BOOL2RUBY(p->pattern->options().one_line()));
1289
+
1290
+ /* This is a read-only hash after all... */
1291
+ rb_obj_freeze(options);
1292
+
1293
+ return options;
1294
+ }
1295
+
1296
+ /*
1297
+ * Returns the number of capturing subpatterns, or -1 if the regexp
1298
+ * wasn't valid on construction. The overall match (`$0`) does not
1299
+ * count: if the regexp is `"(a)(b)"`, returns 2.
1300
+ *
1301
+ * @return [Integer] the number of capturing subpatterns
1302
+ */
1303
+ static VALUE re2_regexp_number_of_capturing_groups(const VALUE self) {
1304
+ re2_pattern *p;
1305
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1306
+
1307
+ return INT2FIX(p->pattern->NumberOfCapturingGroups());
1308
+ }
1309
+
1310
+ /*
1311
+ * Returns a hash of names to capturing indices of groups.
1312
+ *
1313
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1314
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1315
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1316
+ *
1317
+ * @return [Hash] a hash of names to capturing indices
1318
+ */
1319
+ static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
1320
+ re2_pattern *p;
1321
+
1322
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1323
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
1324
+ VALUE capturing_groups = rb_hash_new();
1325
+
1326
+ for (std::map<std::string, int>::const_iterator it = groups.begin(); it != groups.end(); ++it) {
1327
+ rb_hash_aset(capturing_groups,
1328
+ encoded_str_new(it->first.data(), it->first.size(),
1329
+ p->pattern->options().encoding()),
1330
+ INT2FIX(it->second));
1331
+ }
1332
+
1333
+ return capturing_groups;
1334
+ }
1335
+
1336
+ /*
1337
+ * General matching: match the pattern against the given `text` using
1338
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L562-L588
1339
+ * `Match`} and return a {RE2::MatchData} instance with the specified number of
1340
+ * submatches (defaults to the total number of capturing groups) or a boolean
1341
+ * (if no submatches are required).
1342
+ *
1343
+ * The number of submatches has a significant impact on performance: requesting
1344
+ * one submatch is much faster than requesting more than one and requesting
1345
+ * zero submatches is faster still.
1346
+ *
1347
+ * @overload match(text)
1348
+ * Returns a {RE2::MatchData} containing the matching pattern and all
1349
+ * submatches resulting from looking for the regexp in `text` if the pattern
1350
+ * contains capturing groups.
1351
+ *
1352
+ * Returns either `true` or `false` indicating whether a successful match was
1353
+ * made if the pattern contains no capturing groups.
1354
+ *
1355
+ * @param [String] text the text to search
1356
+ * @return [RE2::MatchData, nil] if the pattern contains capturing groups
1357
+ * @return [Boolean] if the pattern does not contain capturing groups
1358
+ * @raise [NoMemoryError] if there was not enough memory to allocate the submatches
1359
+ * @raise [TypeError] if given text that cannot be coerced to a `String`
1360
+ * @example Matching with capturing groups
1361
+ * r = RE2::Regexp.new('w(o)(o)')
1362
+ * r.match('woo') #=> #<RE2::MatchData "woo" 1:"o" 2:"o">
1363
+ * @example Matching without capturing groups
1364
+ * r = RE2::Regexp.new('woo')
1365
+ * r.match('woo') #=> true
1366
+ *
1367
+ * @overload match(text, options)
1368
+ * See `match(text)` but with customisable offsets for starting and ending
1369
+ * matches, optional anchoring to the start or both ends of the text and a
1370
+ * specific number of submatches to extract (padded with `nil`s if
1371
+ * necessary).
1372
+ *
1373
+ * @param [String] text the text to search
1374
+ * @param [Hash] options the options with which to perform the match
1375
+ * @option options [Integer] :startpos (0) offset at which to start matching
1376
+ * @option options [Integer] :endpos offset at which to stop matching, defaults to the text length
1377
+ * @option options [Symbol] :anchor (:unanchored) one of :unanchored, :anchor_start, :anchor_both to anchor the match
1378
+ * @option options [Integer] :submatches how many submatches to extract (0 is
1379
+ * fastest), defaults to the number of capturing groups
1380
+ * @return [RE2::MatchData, nil] if extracting any submatches
1381
+ * @return [Boolean] if not extracting any submatches
1382
+ * @raise [ArgumentError] if given a negative number of submatches, invalid
1383
+ * anchor or invalid startpos, endpos pair
1384
+ * @raise [NoMemoryError] if there was not enough memory to allocate the matches
1385
+ * @raise [TypeError] if given non-String text, non-numeric number of
1386
+ * submatches, non-symbol anchor or non-hash options
1387
+ * @raise [RE2::Regexp::UnsupportedError] if given an endpos argument on a
1388
+ * version of RE2 that does not support it
1389
+ * @example Matching with capturing groups
1390
+ * r = RE2::Regexp.new('w(o)(o)')
1391
+ * r.match('woo', submatches: 1) #=> #<RE2::MatchData "woo" 1:"o">
1392
+ * r.match('woo', submatches: 3) #=> #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
1393
+ * r.match('woot', anchor: :anchor_both, submatches: 0)
1394
+ * #=> false
1395
+ * r.match('woot', anchor: :anchor_start, submatches: 0)
1396
+ * #=> true
1397
+ * @example Matching without capturing groups
1398
+ * r = RE2::Regexp.new('wo+')
1399
+ * r.match('woot', anchor: :anchor_both) #=> false
1400
+ * r.match('woot', anchor: :anchor_start) #=> true
1401
+ *
1402
+ * @overload match(text, submatches)
1403
+ * @deprecated Legacy syntax for matching against `text` with a specific
1404
+ * number of submatches to extract. Use `match(text, submatches: n)` instead.
1405
+ *
1406
+ * @param [String] text the text to search
1407
+ * @param [Integer] submatches the number of submatches to extract
1408
+ * @return [RE2::MatchData, nil] if extracting any submatches
1409
+ * @return [Boolean] if not extracting any submatches
1410
+ * @raise [NoMemoryError] if there was not enough memory to allocate the submatches
1411
+ * @raise [TypeError] if given non-numeric number of submatches
1412
+ * @example
1413
+ * r = RE2::Regexp.new('w(o)(o)')
1414
+ * r.match('woo', 0) #=> true
1415
+ * r.match('woo', 1) #=> #<RE2::MatchData "woo" 1:"o">
1416
+ * r.match('woo', 2) #=> #<RE2::MatchData "woo" 1:"o" 2:"o">
1417
+ */
1418
+ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1419
+ re2_pattern *p;
1420
+ re2_matchdata *m;
1421
+ VALUE text, options;
1422
+
1423
+ rb_scan_args(argc, argv, "11", &text, &options);
1424
+
1425
+ /* Ensure text is a string. */
1426
+ StringValue(text);
1427
+
1428
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1429
+
1430
+ int n;
1431
+ int startpos = 0;
1432
+ int endpos = RSTRING_LEN(text);
1433
+ RE2::Anchor anchor = RE2::UNANCHORED;
1434
+
1435
+ if (RTEST(options)) {
1436
+ if (FIXNUM_P(options)) {
1437
+ n = NUM2INT(options);
1438
+
1439
+ if (n < 0) {
1440
+ rb_raise(rb_eArgError, "number of matches should be >= 0");
1441
+ }
1442
+ } else {
1443
+ if (TYPE(options) != T_HASH) {
1444
+ options = rb_Hash(options);
1445
+ }
1446
+
1447
+ VALUE endpos_option = rb_hash_aref(options, ID2SYM(id_endpos));
1448
+ if (!NIL_P(endpos_option)) {
1449
+ #ifdef HAVE_ENDPOS_ARGUMENT
1450
+ Check_Type(endpos_option, T_FIXNUM);
1451
+
1452
+ endpos = NUM2INT(endpos_option);
1453
+
1454
+ if (endpos < 0) {
1455
+ rb_raise(rb_eArgError, "endpos should be >= 0");
1456
+ }
1457
+ #else
1458
+ rb_raise(re2_eRegexpUnsupportedError, "current version of RE2::Match() does not support endpos argument");
1459
+ #endif
1460
+ }
1461
+
1462
+ VALUE anchor_option = rb_hash_aref(options, ID2SYM(id_anchor));
1463
+ if (!NIL_P(anchor_option)) {
1464
+ Check_Type(anchor_option, T_SYMBOL);
1465
+
1466
+ ID id_anchor_option = SYM2ID(anchor_option);
1467
+ if (id_anchor_option == id_unanchored) {
1468
+ anchor = RE2::UNANCHORED;
1469
+ } else if (id_anchor_option == id_anchor_start) {
1470
+ anchor = RE2::ANCHOR_START;
1471
+ } else if (id_anchor_option == id_anchor_both) {
1472
+ anchor = RE2::ANCHOR_BOTH;
1473
+ } else {
1474
+ rb_raise(rb_eArgError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both");
1475
+ }
1476
+ }
1477
+
1478
+ VALUE submatches_option = rb_hash_aref(options, ID2SYM(id_submatches));
1479
+ if (!NIL_P(submatches_option)) {
1480
+ Check_Type(submatches_option, T_FIXNUM);
1481
+
1482
+ n = NUM2INT(submatches_option);
1483
+
1484
+ if (n < 0) {
1485
+ rb_raise(rb_eArgError, "number of matches should be >= 0");
1486
+ }
1487
+ } else {
1488
+ if (!p->pattern->ok()) {
1489
+ return Qnil;
1490
+ }
1491
+
1492
+ n = p->pattern->NumberOfCapturingGroups();
1493
+ }
1494
+
1495
+ VALUE startpos_option = rb_hash_aref(options, ID2SYM(id_startpos));
1496
+ if (!NIL_P(startpos_option)) {
1497
+ Check_Type(startpos_option, T_FIXNUM);
1498
+
1499
+ startpos = NUM2INT(startpos_option);
1500
+
1501
+ if (startpos < 0) {
1502
+ rb_raise(rb_eArgError, "startpos should be >= 0");
1503
+ }
1504
+ }
1505
+ }
1506
+ } else {
1507
+ if (!p->pattern->ok()) {
1508
+ return Qnil;
1509
+ }
1510
+
1511
+ n = p->pattern->NumberOfCapturingGroups();
1512
+ }
1513
+
1514
+ if (startpos > endpos) {
1515
+ rb_raise(rb_eArgError, "startpos should be <= endpos");
1516
+ }
1517
+
1518
+ if (n == 0) {
1519
+ #ifdef HAVE_ENDPOS_ARGUMENT
1520
+ bool matched = p->pattern->Match(
1521
+ re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
1522
+ startpos, endpos, anchor, 0, 0);
1523
+ #else
1524
+ bool matched = p->pattern->Match(
1525
+ re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)),
1526
+ startpos, anchor, 0, 0);
1527
+ #endif
1528
+ return BOOL2RUBY(matched);
1529
+ } else {
1530
+ /* Because match returns the whole match as well. */
1531
+ n += 1;
1532
+
1533
+ VALUE matchdata = rb_class_new_instance(0, 0, re2_cMatchData);
1534
+ TypedData_Get_Struct(matchdata, re2_matchdata, &re2_matchdata_data_type, m);
1535
+ m->matches = new(std::nothrow) re2::StringPiece[n];
1536
+ RB_OBJ_WRITE(matchdata, &m->regexp, self);
1537
+ if (!RTEST(rb_obj_frozen_p(text))) {
1538
+ text = rb_str_freeze(rb_str_dup(text));
1539
+ }
1540
+ RB_OBJ_WRITE(matchdata, &m->text, text);
1541
+
1542
+ if (m->matches == 0) {
1543
+ rb_raise(rb_eNoMemError,
1544
+ "not enough memory to allocate StringPieces for matches");
1545
+ }
1546
+
1547
+ m->number_of_matches = n;
1548
+
1549
+ #ifdef HAVE_ENDPOS_ARGUMENT
1550
+ bool matched = p->pattern->Match(
1551
+ re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)),
1552
+ startpos, endpos, anchor, m->matches, n);
1553
+ #else
1554
+ bool matched = p->pattern->Match(
1555
+ re2::StringPiece(RSTRING_PTR(m->text), RSTRING_LEN(m->text)),
1556
+ startpos, anchor, m->matches, n);
1557
+ #endif
1558
+ if (matched) {
1559
+ return matchdata;
1560
+ } else {
1561
+ return Qnil;
1562
+ }
1563
+ }
1564
+ }
1565
+
1566
+ /*
1567
+ * Returns true if the pattern matches any substring of the given text using
1568
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L413-L427
1569
+ * `PartialMatch`}.
1570
+ *
1571
+ * @return [Boolean] whether the match was successful
1572
+ * @raise [TypeError] if text cannot be coerced to a `String`
1573
+ */
1574
+ static VALUE re2_regexp_match_p(const VALUE self, VALUE text) {
1575
+ re2_pattern *p;
1576
+
1577
+ /* Ensure text is a string. */
1578
+ StringValue(text);
1579
+
1580
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1581
+
1582
+ return BOOL2RUBY(RE2::PartialMatch(
1583
+ re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
1584
+ }
1585
+
1586
+ /*
1587
+ * Returns true if the pattern matches the given text using
1588
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L376-L411
1589
+ * `FullMatch`}.
1590
+ *
1591
+ * @return [Boolean] whether the match was successful
1592
+ * @raise [TypeError] if text cannot be coerced to a `String`
1593
+ */
1594
+ static VALUE re2_regexp_full_match_p(const VALUE self, VALUE text) {
1595
+ re2_pattern *p;
1596
+
1597
+ /* Ensure text is a string. */
1598
+ StringValue(text);
1599
+
1600
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1601
+
1602
+ return BOOL2RUBY(RE2::FullMatch(
1603
+ re2::StringPiece(RSTRING_PTR(text), RSTRING_LEN(text)), *p->pattern));
1604
+ }
1605
+
1606
+ /*
1607
+ * Returns a {RE2::Scanner} for scanning the given text incrementally with
1608
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L447-L463
1609
+ * `FindAndConsume`}.
1610
+ *
1611
+ * @param [String] text the text to scan incrementally
1612
+ * @return [RE2::Scanner] an `Enumerable` {RE2::Scanner} object
1613
+ * @raise [TypeError] if `text` cannot be coerced to a `String`
1614
+ * @example
1615
+ * c = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
1616
+ * #=> #<RE2::Scanner:0x0000000000000001>
1617
+ */
1618
+ static VALUE re2_regexp_scan(const VALUE self, VALUE text) {
1619
+ /* Ensure text is a string. */
1620
+ StringValue(text);
1621
+
1622
+ re2_pattern *p;
1623
+ re2_scanner *c;
1624
+
1625
+ TypedData_Get_Struct(self, re2_pattern, &re2_regexp_data_type, p);
1626
+ VALUE scanner = rb_class_new_instance(0, 0, re2_cScanner);
1627
+ TypedData_Get_Struct(scanner, re2_scanner, &re2_scanner_data_type, c);
1628
+
1629
+ c->input = new(std::nothrow) re2::StringPiece(
1630
+ RSTRING_PTR(text), RSTRING_LEN(text));
1631
+ RB_OBJ_WRITE(scanner, &c->regexp, self);
1632
+ RB_OBJ_WRITE(scanner, &c->text, text);
1633
+
1634
+ if (p->pattern->ok()) {
1635
+ c->number_of_capturing_groups = p->pattern->NumberOfCapturingGroups();
1636
+ } else {
1637
+ c->number_of_capturing_groups = 0;
1638
+ }
1639
+
1640
+ c->eof = false;
1641
+
1642
+ return scanner;
1643
+ }
1644
+
1645
+ /*
1646
+ * Returns whether the underlying RE2 version supports passing an `endpos`
1647
+ * argument to
1648
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L562-L588
1649
+ * Match}. If not, {RE2::Regexp#match} will raise an error if attempting to
1650
+ * pass an `endpos`.
1651
+ *
1652
+ * @return [Boolean] whether the underlying
1653
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L562-L588
1654
+ * Match} has an endpos argument
1655
+ */
1656
+ static VALUE re2_regexp_match_has_endpos_argument_p(VALUE) {
1657
+ #ifdef HAVE_ENDPOS_ARGUMENT
1658
+ return Qtrue;
1659
+ #else
1660
+ return Qfalse;
1661
+ #endif
1662
+ }
1663
+
1664
+ /*
1665
+ * Returns a copy of `str` with the first occurrence of `pattern` replaced with
1666
+ * `rewrite` using
1667
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L465-L480
1668
+ * `Replace`}.
1669
+ *
1670
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1671
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1672
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1673
+ *
1674
+ * @param [String] str the string to modify
1675
+ * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
1676
+ * @param [String] rewrite the string to replace with
1677
+ * @return [String] the resulting string
1678
+ * @raise [TypeError] if the given rewrite or pattern (if not provided as a
1679
+ * {RE2::Regexp}) cannot be coerced to `String`s
1680
+ * @example
1681
+ * RE2.Replace("hello there", "hello", "howdy") #=> "howdy there"
1682
+ * re2 = RE2::Regexp.new("hel+o")
1683
+ * RE2.Replace("hello there", re2, "yo") #=> "yo there"
1684
+ */
1685
+ static VALUE re2_Replace(VALUE, VALUE str, VALUE pattern,
1686
+ VALUE rewrite) {
1687
+ /* Ensure rewrite is a string. */
1688
+ StringValue(rewrite);
1689
+
1690
+ re2_pattern *p;
1691
+
1692
+ /* Take a copy of str so it can be modified in-place by
1693
+ * RE2::Replace.
1694
+ */
1695
+ StringValue(str);
1696
+ std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str));
1697
+
1698
+ /* Do the replacement. */
1699
+ if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1700
+ TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
1701
+ RE2::Replace(&str_as_string, *p->pattern,
1702
+ re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
1703
+
1704
+ return encoded_str_new(str_as_string.data(), str_as_string.size(),
1705
+ p->pattern->options().encoding());
1706
+ } else {
1707
+ /* Ensure pattern is a string. */
1708
+ StringValue(pattern);
1709
+
1710
+ RE2::Replace(&str_as_string,
1711
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
1712
+ re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
1713
+
1714
+ return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
1715
+ }
1716
+ }
1717
+
1718
+ /*
1719
+ * Return a copy of `str` with `pattern` replaced by `rewrite` using
1720
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L482-L497
1721
+ * `GlobalReplace`}.
1722
+ *
1723
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1724
+ * returned in UTF-8 by default or ISO-8859-1 if the `:utf8` option for the
1725
+ * {RE2::Regexp} is set to `false` (any other encoding's behaviour is undefined).
1726
+ *
1727
+ * @param [String] str the string to modify
1728
+ * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
1729
+ * @param [String] rewrite the string to replace with
1730
+ * @raise [TypeError] if the given rewrite or pattern (if not provided as a
1731
+ * {RE2::Regexp}) cannot be coerced to `String`s
1732
+ * @return [String] the resulting string
1733
+ * @example
1734
+ * re2 = RE2::Regexp.new("oo?")
1735
+ * RE2.GlobalReplace("whoops-doops", re2, "e") #=> "wheps-deps"
1736
+ * RE2.GlobalReplace("hello there", "e", "i") #=> "hillo thiri"
1737
+ */
1738
+ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
1739
+ VALUE rewrite) {
1740
+ /* Ensure rewrite is a string. */
1741
+ StringValue(rewrite);
1742
+
1743
+ /* Take a copy of str so it can be modified in-place by
1744
+ * RE2::GlobalReplace.
1745
+ */
1746
+ re2_pattern *p;
1747
+ StringValue(str);
1748
+ std::string str_as_string(RSTRING_PTR(str), RSTRING_LEN(str));
1749
+
1750
+ /* Do the replacement. */
1751
+ if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1752
+ TypedData_Get_Struct(pattern, re2_pattern, &re2_regexp_data_type, p);
1753
+ RE2::GlobalReplace(&str_as_string, *p->pattern,
1754
+ re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
1755
+
1756
+ return encoded_str_new(str_as_string.data(), str_as_string.size(),
1757
+ p->pattern->options().encoding());
1758
+ } else {
1759
+ /* Ensure pattern is a string. */
1760
+ StringValue(pattern);
1761
+
1762
+ RE2::GlobalReplace(&str_as_string,
1763
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)),
1764
+ re2::StringPiece(RSTRING_PTR(rewrite), RSTRING_LEN(rewrite)));
1765
+
1766
+ return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
1767
+ }
1768
+ }
1769
+
1770
+ /*
1771
+ * Returns a version of `str` with all potentially meaningful regexp characters
1772
+ * escaped using
1773
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/re2.h#L512-L518
1774
+ * `QuoteMeta`}. The returned string, used as a regular expression, will
1775
+ * exactly match the original string.
1776
+ *
1777
+ * @param [String] unquoted the unquoted string
1778
+ * @raise [TypeError] if the given unquoted string cannot be coerced to a `String`
1779
+ * @return [String] the escaped string
1780
+ * @example
1781
+ * RE2::Regexp.escape("1.5-2.0?") #=> "1\.5\-2\.0\?"
1782
+ */
1783
+ static VALUE re2_QuoteMeta(VALUE, VALUE unquoted) {
1784
+ StringValue(unquoted);
1785
+
1786
+ std::string quoted_string = RE2::QuoteMeta(
1787
+ re2::StringPiece(RSTRING_PTR(unquoted), RSTRING_LEN(unquoted)));
1788
+
1789
+ return rb_str_new(quoted_string.data(), quoted_string.size());
1790
+ }
1791
+
1792
+ static void re2_set_free(void *ptr) {
1793
+ re2_set *s = reinterpret_cast<re2_set *>(ptr);
1794
+ if (s->set) {
1795
+ delete s->set;
1796
+ }
1797
+ xfree(s);
1798
+ }
1799
+
1800
+ static size_t re2_set_memsize(const void *ptr) {
1801
+ const re2_set *s = reinterpret_cast<const re2_set *>(ptr);
1802
+ size_t size = sizeof(*s);
1803
+ if (s->set) {
1804
+ size += sizeof(*s->set);
1805
+ }
1806
+
1807
+ return size;
1808
+ }
1809
+
1810
+ static const rb_data_type_t re2_set_data_type = {
1811
+ "RE2::Set",
1812
+ {
1813
+ 0,
1814
+ re2_set_free,
1815
+ re2_set_memsize,
1816
+ },
1817
+ 0,
1818
+ 0,
1819
+ // IMPORTANT: WB_PROTECTED objects must only use the RB_OBJ_WRITE()
1820
+ // macro to update VALUE references, as to trigger write barriers.
1821
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
1822
+ };
1823
+
1824
+ static VALUE re2_set_allocate(VALUE klass) {
1825
+ re2_set *s;
1826
+ VALUE result = TypedData_Make_Struct(klass, re2_set, &re2_set_data_type, s);
1827
+
1828
+ return result;
1829
+ }
1830
+
1831
+ /*
1832
+ * Returns a new {RE2::Set} object, a collection of patterns that can be
1833
+ * searched for simultaneously.
1834
+ *
1835
+ * @return [RE2::Set]
1836
+ *
1837
+ * @overload initialize
1838
+ * Returns a new {RE2::Set} object for unanchored patterns with the default
1839
+ * options.
1840
+ *
1841
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
1842
+ * @return [RE2::Set]
1843
+ *
1844
+ * @overload initialize(anchor)
1845
+ * Returns a new {RE2::Set} object for the specified anchor with the default
1846
+ * options.
1847
+ *
1848
+ * @param [Symbol] anchor one of `:unanchored`, `:anchor_start`, `:anchor_both`
1849
+ * @raise [ArgumentError] if anchor is not `:unanchored`, `:anchor_start` or `:anchor_both`
1850
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
1851
+ *
1852
+ * @overload initialize(anchor, options)
1853
+ * Returns a new {RE2::Set} object with the specified options.
1854
+ *
1855
+ * @param [Symbol] anchor one of `:unanchored`, `:anchor_start`, `:anchor_both`
1856
+ * @param [Hash] options the options with which to compile the pattern
1857
+ * @option options [Boolean] :utf8 (true) text and pattern are UTF-8; otherwise Latin-1
1858
+ * @option options [Boolean] :posix_syntax (false) restrict regexps to POSIX egrep syntax
1859
+ * @option options [Boolean] :longest_match (false) search for longest match, not first match
1860
+ * @option options [Boolean] :log_errors (true) log syntax and execution errors to ERROR
1861
+ * @option options [Integer] :max_mem approx. max memory footprint of RE2
1862
+ * @option options [Boolean] :literal (false) interpret string as literal, not regexp
1863
+ * @option options [Boolean] :never_nl (false) never match `\n`, even if it is in regexp
1864
+ * @option options [Boolean] :case_sensitive (true) match is case-sensitive (regexp can override with `(?i)` unless in `posix_syntax` mode)
1865
+ * @option options [Boolean] :perl_classes (false) allow Perl's `\d` `\s` `\w` `\D` `\S` `\W` when in `posix_syntax` mode
1866
+ * @option options [Boolean] :word_boundary (false) allow `\b` `\B` (word boundary and not) when in `posix_syntax` mode
1867
+ * @option options [Boolean] :one_line (false) `^` and `$` only match beginning and end of text when in `posix_syntax` mode
1868
+ * @return [RE2::Set] a {RE2::Set} with the specified anchor and options
1869
+ * @raise [ArgumentError] if `anchor` is not one of the accepted choices
1870
+ * @raise [NoMemoryError] if memory could not be allocated for the compiled pattern
1871
+ */
1872
+ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1873
+ VALUE anchor, options;
1874
+ re2_set *s;
1875
+
1876
+ rb_scan_args(argc, argv, "02", &anchor, &options);
1877
+ TypedData_Get_Struct(self, re2_set, &re2_set_data_type, s);
1878
+
1879
+ RE2::Anchor re2_anchor = RE2::UNANCHORED;
1880
+
1881
+ if (!NIL_P(anchor)) {
1882
+ Check_Type(anchor, T_SYMBOL);
1883
+ ID id_anchor_arg = SYM2ID(anchor);
1884
+ if (id_anchor_arg == id_unanchored) {
1885
+ re2_anchor = RE2::UNANCHORED;
1886
+ } else if (id_anchor_arg == id_anchor_start) {
1887
+ re2_anchor = RE2::ANCHOR_START;
1888
+ } else if (id_anchor_arg == id_anchor_both) {
1889
+ re2_anchor = RE2::ANCHOR_BOTH;
1890
+ } else {
1891
+ rb_raise(rb_eArgError, "anchor should be one of: :unanchored, :anchor_start, :anchor_both");
1892
+ }
1893
+ }
1894
+
1895
+ RE2::Options re2_options;
1896
+
1897
+ if (RTEST(options)) {
1898
+ parse_re2_options(&re2_options, options);
1899
+ }
1900
+
1901
+ s->set = new(std::nothrow) RE2::Set(re2_options, re2_anchor);
1902
+ if (s->set == 0) {
1903
+ rb_raise(rb_eNoMemError, "not enough memory to allocate RE2::Set object");
1904
+ }
1905
+
1906
+ return self;
1907
+ }
1908
+
1909
+ /*
1910
+ * Adds a pattern to the set. Returns the index that will identify the pattern
1911
+ * in the output of {RE2::Set#match}. Cannot be called after {RE2::Set#compile}
1912
+ * has been called.
1913
+ *
1914
+ * @param [String] pattern the regex pattern
1915
+ * @return [Integer] the index of the pattern in the set
1916
+ * @raise [ArgumentError] if called after compile or the pattern is rejected
1917
+ * @example
1918
+ * set = RE2::Set.new
1919
+ * set.add("abc") #=> 0
1920
+ * set.add("def") #=> 1
1921
+ */
1922
+ static VALUE re2_set_add(VALUE self, VALUE pattern) {
1923
+ StringValue(pattern);
1924
+
1925
+ re2_set *s;
1926
+ TypedData_Get_Struct(self, re2_set, &re2_set_data_type, s);
1927
+
1928
+ /* To prevent the memory of the err string leaking when we call rb_raise,
1929
+ * take a copy of it and let it go out of scope.
1930
+ */
1931
+ char msg[100];
1932
+ int index;
1933
+
1934
+ {
1935
+ std::string err;
1936
+ index = s->set->Add(
1937
+ re2::StringPiece(RSTRING_PTR(pattern), RSTRING_LEN(pattern)), &err);
1938
+ strlcpy(msg, err.c_str(), sizeof(msg));
1939
+ }
1940
+
1941
+ if (index < 0) {
1942
+ rb_raise(rb_eArgError, "str rejected by RE2::Set->Add(): %s", msg);
1943
+ }
1944
+
1945
+ return INT2FIX(index);
1946
+ }
1947
+
1948
+ /*
1949
+ * Compiles a {RE2::Set} so it can be used to match against. Must be called
1950
+ * after {RE2::Set#add} and before {RE2::Set#match}.
1951
+ *
1952
+ * @return [Boolean] whether compilation was a success
1953
+ * @example
1954
+ * set = RE2::Set.new
1955
+ * set.add("abc")
1956
+ * set.compile #=> true
1957
+ */
1958
+ static VALUE re2_set_compile(VALUE self) {
1959
+ re2_set *s;
1960
+ TypedData_Get_Struct(self, re2_set, &re2_set_data_type, s);
1961
+
1962
+ return BOOL2RUBY(s->set->Compile());
1963
+ }
1964
+
1965
+ /*
1966
+ * Returns whether the underlying RE2 version outputs error information from
1967
+ * {https://github.com/google/re2/blob/bc0faab533e2b27b85b8ad312abf061e33ed6b5d/re2/set.h#L62-L65
1968
+ * `RE2::Set::Match`}. If not, {RE2::Set#match} will raise an error if attempting to set
1969
+ * its `:exception` option to `true`.
1970
+ *
1971
+ * @return [Boolean] whether the underlying RE2 outputs error information from {RE2::Set} matches
1972
+ */
1973
+ static VALUE re2_set_match_raises_errors_p(VALUE) {
1974
+ #ifdef HAVE_ERROR_INFO_ARGUMENT
1975
+ return Qtrue;
1976
+ #else
1977
+ return Qfalse;
1978
+ #endif
1979
+ }
1980
+
1981
+ /*
1982
+ * Matches the given text against patterns in the set, returning an array of
1983
+ * integer indices of the matching patterns if matched or an empty array if
1984
+ * there are no matches.
1985
+ *
1986
+ * @return [Array<Integer>]
1987
+ *
1988
+ * @overload match(str)
1989
+ * Returns an array of integer indices of patterns matching the given string
1990
+ * (if any). Raises exceptions if there are any errors while matching.
1991
+ *
1992
+ * @param [String] str the text to match against
1993
+ * @return [Array<Integer>] the indices of matching regexps
1994
+ * @raise [MatchError] if an error occurs while matching
1995
+ * @raise [UnsupportedError] if the underlying version of RE2 does not output error information
1996
+ * @example
1997
+ * set = RE2::Set.new
1998
+ * set.add("abc")
1999
+ * set.add("def")
2000
+ * set.compile
2001
+ * set.match("abcdef") #=> [0, 1]
2002
+ *
2003
+ * @overload match(str, options)
2004
+ * Returns an array of integer indices of patterns matching the given string
2005
+ * (if any). Raises exceptions if there are any errors while matching and the
2006
+ * `:exception` option is set to true.
2007
+ *
2008
+ * @param [String] str the text to match against
2009
+ * @param [Hash] options the options with which to match
2010
+ * @option options [Boolean] :exception (true) whether to raise exceptions with RE2's error information (not supported on ABI version 0 of RE2)
2011
+ * @return [Array<Integer>] the indices of matching regexps
2012
+ * @raise [MatchError] if an error occurs while matching
2013
+ * @raise [UnsupportedError] if the underlying version of RE2 does not output error information
2014
+ * @example
2015
+ * set = RE2::Set.new
2016
+ * set.add("abc")
2017
+ * set.add("def")
2018
+ * set.compile
2019
+ * set.match("abcdef", exception: true) #=> [0, 1]
2020
+ */
2021
+ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
2022
+ VALUE str, options;
2023
+ bool raise_exception = true;
2024
+ rb_scan_args(argc, argv, "11", &str, &options);
2025
+
2026
+ StringValue(str);
2027
+ re2_set *s;
2028
+ TypedData_Get_Struct(self, re2_set, &re2_set_data_type, s);
2029
+
2030
+ if (RTEST(options)) {
2031
+ Check_Type(options, T_HASH);
2032
+
2033
+ VALUE exception_option = rb_hash_aref(options, ID2SYM(id_exception));
2034
+ if (!NIL_P(exception_option)) {
2035
+ raise_exception = RTEST(exception_option);
2036
+ }
2037
+ }
2038
+
2039
+ std::vector<int> v;
2040
+
2041
+ if (raise_exception) {
2042
+ #ifdef HAVE_ERROR_INFO_ARGUMENT
2043
+ RE2::Set::ErrorInfo e;
2044
+ bool match_failed = !s->set->Match(
2045
+ re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v, &e);
2046
+ VALUE result = rb_ary_new2(v.size());
2047
+
2048
+ if (match_failed) {
2049
+ switch (e.kind) {
2050
+ case RE2::Set::kNoError:
2051
+ break;
2052
+ case RE2::Set::kNotCompiled:
2053
+ rb_raise(re2_eSetMatchError, "#match must not be called before #compile");
2054
+ case RE2::Set::kOutOfMemory:
2055
+ rb_raise(re2_eSetMatchError, "The DFA ran out of memory");
2056
+ case RE2::Set::kInconsistent:
2057
+ rb_raise(re2_eSetMatchError, "RE2::Prog internal error");
2058
+ default: // Just in case a future version of libre2 adds new ErrorKinds
2059
+ rb_raise(re2_eSetMatchError, "Unknown RE2::Set::ErrorKind: %d", e.kind);
2060
+ }
2061
+ } else {
2062
+ for (std::vector<int>::size_type i = 0; i < v.size(); ++i) {
2063
+ rb_ary_push(result, INT2FIX(v[i]));
2064
+ }
2065
+ }
2066
+
2067
+ return result;
2068
+ #else
2069
+ rb_raise(re2_eSetUnsupportedError, "current version of RE2::Set::Match() does not output error information, :exception option can only be set to false");
2070
+ #endif
2071
+ } else {
2072
+ bool matched = s->set->Match(
2073
+ re2::StringPiece(RSTRING_PTR(str), RSTRING_LEN(str)), &v);
2074
+ VALUE result = rb_ary_new2(v.size());
2075
+
2076
+ if (matched) {
2077
+ for (std::vector<int>::size_type i = 0; i < v.size(); ++i) {
2078
+ rb_ary_push(result, INT2FIX(v[i]));
2079
+ }
2080
+ }
2081
+
2082
+ return result;
2083
+ }
2084
+ }
2085
+
2086
+ extern "C" void Init_re2(void) {
2087
+ re2_mRE2 = rb_define_module("RE2");
2088
+ re2_cRegexp = rb_define_class_under(re2_mRE2, "Regexp", rb_cObject);
2089
+ re2_eRegexpUnsupportedError = rb_define_class_under(re2_cRegexp,
2090
+ "UnsupportedError", rb_const_get(rb_cObject, rb_intern("StandardError")));
2091
+ re2_cMatchData = rb_define_class_under(re2_mRE2, "MatchData", rb_cObject);
2092
+ re2_cScanner = rb_define_class_under(re2_mRE2, "Scanner", rb_cObject);
2093
+ re2_cSet = rb_define_class_under(re2_mRE2, "Set", rb_cObject);
2094
+ re2_eSetMatchError = rb_define_class_under(re2_cSet, "MatchError",
2095
+ rb_const_get(rb_cObject, rb_intern("StandardError")));
2096
+ re2_eSetUnsupportedError = rb_define_class_under(re2_cSet, "UnsupportedError",
2097
+ rb_const_get(rb_cObject, rb_intern("StandardError")));
2098
+
2099
+ rb_define_alloc_func(re2_cRegexp,
2100
+ reinterpret_cast<VALUE (*)(VALUE)>(re2_regexp_allocate));
2101
+ rb_define_alloc_func(re2_cMatchData,
2102
+ reinterpret_cast<VALUE (*)(VALUE)>(re2_matchdata_allocate));
2103
+ rb_define_alloc_func(re2_cScanner,
2104
+ reinterpret_cast<VALUE (*)(VALUE)>(re2_scanner_allocate));
2105
+ rb_define_alloc_func(re2_cSet,
2106
+ reinterpret_cast<VALUE (*)(VALUE)>(re2_set_allocate));
2107
+
2108
+ rb_define_method(re2_cMatchData, "string",
2109
+ RUBY_METHOD_FUNC(re2_matchdata_string), 0);
2110
+ rb_define_method(re2_cMatchData, "regexp",
2111
+ RUBY_METHOD_FUNC(re2_matchdata_regexp), 0);
2112
+ rb_define_method(re2_cMatchData, "to_a",
2113
+ RUBY_METHOD_FUNC(re2_matchdata_to_a), 0);
2114
+ rb_define_method(re2_cMatchData, "size",
2115
+ RUBY_METHOD_FUNC(re2_matchdata_size), 0);
2116
+ rb_define_method(re2_cMatchData, "length",
2117
+ RUBY_METHOD_FUNC(re2_matchdata_size), 0);
2118
+ rb_define_method(re2_cMatchData, "begin",
2119
+ RUBY_METHOD_FUNC(re2_matchdata_begin), 1);
2120
+ rb_define_method(re2_cMatchData, "end",
2121
+ RUBY_METHOD_FUNC(re2_matchdata_end), 1);
2122
+ rb_define_method(re2_cMatchData, "[]", RUBY_METHOD_FUNC(re2_matchdata_aref),
2123
+ -1);
2124
+ rb_define_method(re2_cMatchData, "to_s",
2125
+ RUBY_METHOD_FUNC(re2_matchdata_to_s), 0);
2126
+ rb_define_method(re2_cMatchData, "inspect",
2127
+ RUBY_METHOD_FUNC(re2_matchdata_inspect), 0);
2128
+ rb_define_method(re2_cMatchData, "deconstruct",
2129
+ RUBY_METHOD_FUNC(re2_matchdata_deconstruct), 0);
2130
+ rb_define_method(re2_cMatchData, "deconstruct_keys",
2131
+ RUBY_METHOD_FUNC(re2_matchdata_deconstruct_keys), 1);
2132
+
2133
+ rb_define_method(re2_cScanner, "string",
2134
+ RUBY_METHOD_FUNC(re2_scanner_string), 0);
2135
+ rb_define_method(re2_cScanner, "eof?",
2136
+ RUBY_METHOD_FUNC(re2_scanner_eof), 0);
2137
+ rb_define_method(re2_cScanner, "regexp",
2138
+ RUBY_METHOD_FUNC(re2_scanner_regexp), 0);
2139
+ rb_define_method(re2_cScanner, "scan",
2140
+ RUBY_METHOD_FUNC(re2_scanner_scan), 0);
2141
+ rb_define_method(re2_cScanner, "rewind",
2142
+ RUBY_METHOD_FUNC(re2_scanner_rewind), 0);
2143
+
2144
+ rb_define_singleton_method(re2_cRegexp, "match_has_endpos_argument?",
2145
+ RUBY_METHOD_FUNC(re2_regexp_match_has_endpos_argument_p), 0);
2146
+ rb_define_method(re2_cRegexp, "initialize",
2147
+ RUBY_METHOD_FUNC(re2_regexp_initialize), -1);
2148
+ rb_define_method(re2_cRegexp, "ok?", RUBY_METHOD_FUNC(re2_regexp_ok), 0);
2149
+ rb_define_method(re2_cRegexp, "error", RUBY_METHOD_FUNC(re2_regexp_error),
2150
+ 0);
2151
+ rb_define_method(re2_cRegexp, "error_arg",
2152
+ RUBY_METHOD_FUNC(re2_regexp_error_arg), 0);
2153
+ rb_define_method(re2_cRegexp, "program_size",
2154
+ RUBY_METHOD_FUNC(re2_regexp_program_size), 0);
2155
+ rb_define_method(re2_cRegexp, "options",
2156
+ RUBY_METHOD_FUNC(re2_regexp_options), 0);
2157
+ rb_define_method(re2_cRegexp, "number_of_capturing_groups",
2158
+ RUBY_METHOD_FUNC(re2_regexp_number_of_capturing_groups), 0);
2159
+ rb_define_method(re2_cRegexp, "named_capturing_groups",
2160
+ RUBY_METHOD_FUNC(re2_regexp_named_capturing_groups), 0);
2161
+ rb_define_method(re2_cRegexp, "match", RUBY_METHOD_FUNC(re2_regexp_match),
2162
+ -1);
2163
+ rb_define_method(re2_cRegexp, "match?", RUBY_METHOD_FUNC(re2_regexp_match_p),
2164
+ 1);
2165
+ rb_define_method(re2_cRegexp, "partial_match?",
2166
+ RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
2167
+ rb_define_method(re2_cRegexp, "=~", RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
2168
+ rb_define_method(re2_cRegexp, "===", RUBY_METHOD_FUNC(re2_regexp_match_p), 1);
2169
+ rb_define_method(re2_cRegexp, "full_match?",
2170
+ RUBY_METHOD_FUNC(re2_regexp_full_match_p), 1);
2171
+ rb_define_method(re2_cRegexp, "scan",
2172
+ RUBY_METHOD_FUNC(re2_regexp_scan), 1);
2173
+ rb_define_method(re2_cRegexp, "to_s", RUBY_METHOD_FUNC(re2_regexp_to_s), 0);
2174
+ rb_define_method(re2_cRegexp, "to_str", RUBY_METHOD_FUNC(re2_regexp_to_s),
2175
+ 0);
2176
+ rb_define_method(re2_cRegexp, "pattern", RUBY_METHOD_FUNC(re2_regexp_to_s),
2177
+ 0);
2178
+ rb_define_method(re2_cRegexp, "source", RUBY_METHOD_FUNC(re2_regexp_to_s),
2179
+ 0);
2180
+ rb_define_method(re2_cRegexp, "inspect",
2181
+ RUBY_METHOD_FUNC(re2_regexp_inspect), 0);
2182
+ rb_define_method(re2_cRegexp, "utf8?", RUBY_METHOD_FUNC(re2_regexp_utf8),
2183
+ 0);
2184
+ rb_define_method(re2_cRegexp, "posix_syntax?",
2185
+ RUBY_METHOD_FUNC(re2_regexp_posix_syntax), 0);
2186
+ rb_define_method(re2_cRegexp, "longest_match?",
2187
+ RUBY_METHOD_FUNC(re2_regexp_longest_match), 0);
2188
+ rb_define_method(re2_cRegexp, "log_errors?",
2189
+ RUBY_METHOD_FUNC(re2_regexp_log_errors), 0);
2190
+ rb_define_method(re2_cRegexp, "max_mem",
2191
+ RUBY_METHOD_FUNC(re2_regexp_max_mem), 0);
2192
+ rb_define_method(re2_cRegexp, "literal?",
2193
+ RUBY_METHOD_FUNC(re2_regexp_literal), 0);
2194
+ rb_define_method(re2_cRegexp, "never_nl?",
2195
+ RUBY_METHOD_FUNC(re2_regexp_never_nl), 0);
2196
+ rb_define_method(re2_cRegexp, "case_sensitive?",
2197
+ RUBY_METHOD_FUNC(re2_regexp_case_sensitive), 0);
2198
+ rb_define_method(re2_cRegexp, "case_insensitive?",
2199
+ RUBY_METHOD_FUNC(re2_regexp_case_insensitive), 0);
2200
+ rb_define_method(re2_cRegexp, "casefold?",
2201
+ RUBY_METHOD_FUNC(re2_regexp_case_insensitive), 0);
2202
+ rb_define_method(re2_cRegexp, "perl_classes?",
2203
+ RUBY_METHOD_FUNC(re2_regexp_perl_classes), 0);
2204
+ rb_define_method(re2_cRegexp, "word_boundary?",
2205
+ RUBY_METHOD_FUNC(re2_regexp_word_boundary), 0);
2206
+ rb_define_method(re2_cRegexp, "one_line?",
2207
+ RUBY_METHOD_FUNC(re2_regexp_one_line), 0);
2208
+
2209
+ rb_define_singleton_method(re2_cSet, "match_raises_errors?",
2210
+ RUBY_METHOD_FUNC(re2_set_match_raises_errors_p), 0);
2211
+ rb_define_method(re2_cSet, "initialize",
2212
+ RUBY_METHOD_FUNC(re2_set_initialize), -1);
2213
+ rb_define_method(re2_cSet, "add", RUBY_METHOD_FUNC(re2_set_add), 1);
2214
+ rb_define_method(re2_cSet, "compile", RUBY_METHOD_FUNC(re2_set_compile), 0);
2215
+ rb_define_method(re2_cSet, "match", RUBY_METHOD_FUNC(re2_set_match), -1);
2216
+
2217
+ rb_define_module_function(re2_mRE2, "Replace",
2218
+ RUBY_METHOD_FUNC(re2_Replace), 3);
2219
+ rb_define_module_function(re2_mRE2, "GlobalReplace",
2220
+ RUBY_METHOD_FUNC(re2_GlobalReplace), 3);
2221
+ rb_define_module_function(re2_mRE2, "QuoteMeta",
2222
+ RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
2223
+ rb_define_singleton_method(re2_cRegexp, "escape",
2224
+ RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
2225
+ rb_define_singleton_method(re2_cRegexp, "quote",
2226
+ RUBY_METHOD_FUNC(re2_QuoteMeta), 1);
2227
+
2228
+ // (see RE2::Regexp#initialize)
2229
+ rb_define_singleton_method(re2_cRegexp, "compile",
2230
+ RUBY_METHOD_FUNC(rb_class_new_instance), -1);
2231
+
2232
+ rb_define_module_function(rb_mKernel, "RE2", RUBY_METHOD_FUNC(re2_re2), -1);
2233
+
2234
+ /* Create the symbols used in options. */
2235
+ id_utf8 = rb_intern("utf8");
2236
+ id_posix_syntax = rb_intern("posix_syntax");
2237
+ id_longest_match = rb_intern("longest_match");
2238
+ id_log_errors = rb_intern("log_errors");
2239
+ id_max_mem = rb_intern("max_mem");
2240
+ id_literal = rb_intern("literal");
2241
+ id_never_nl = rb_intern("never_nl");
2242
+ id_case_sensitive = rb_intern("case_sensitive");
2243
+ id_perl_classes = rb_intern("perl_classes");
2244
+ id_word_boundary = rb_intern("word_boundary");
2245
+ id_one_line = rb_intern("one_line");
2246
+ id_unanchored = rb_intern("unanchored");
2247
+ id_anchor = rb_intern("anchor");
2248
+ id_anchor_start = rb_intern("anchor_start");
2249
+ id_anchor_both = rb_intern("anchor_both");
2250
+ id_exception = rb_intern("exception");
2251
+ id_submatches = rb_intern("submatches");
2252
+ id_startpos = rb_intern("startpos");
2253
+ id_endpos = rb_intern("endpos");
2254
+ }