re2 1.7.0 → 2.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/re2/re2.cc CHANGED
@@ -6,76 +6,19 @@
6
6
  * Released under the BSD Licence, please see LICENSE.txt
7
7
  */
8
8
 
9
- #include <ruby.h>
10
- #include <re2/re2.h>
11
- #include <re2/set.h>
12
9
  #include <stdint.h>
13
- #include <string>
10
+
11
+ #include <map>
14
12
  #include <sstream>
13
+ #include <string>
15
14
  #include <vector>
16
- using std::string;
17
- using std::ostringstream;
18
- using std::nothrow;
19
- using std::map;
20
- using std::vector;
21
-
22
- #define BOOL2RUBY(v) (v ? Qtrue : Qfalse)
23
- #define UNUSED(x) ((void)x)
24
15
 
25
- #ifndef RSTRING_LEN
26
- #define RSTRING_LEN(x) (RSTRING(x)->len)
27
- #endif
28
-
29
- #ifndef RSTRING_PTR
30
- #define RSTRING_PTR(x) (RSTRING(x)->ptr)
31
- #endif
32
-
33
- #ifdef HAVE_RUBY_ENCODING_H
34
- #include <ruby/encoding.h>
35
- #define ENCODED_STR_NEW(str, length, encoding) \
36
- ({ \
37
- VALUE _string = rb_str_new(str, length); \
38
- int _enc = rb_enc_find_index(encoding); \
39
- rb_enc_associate_index(_string, _enc); \
40
- _string; \
41
- })
42
- #define ENCODED_STR_NEW2(str, length, str2) \
43
- ({ \
44
- VALUE _string = rb_str_new(str, length); \
45
- int _enc = rb_enc_get_index(str2); \
46
- rb_enc_associate_index(_string, _enc); \
47
- _string; \
48
- })
49
- #else
50
- #define ENCODED_STR_NEW(str, length, encoding) \
51
- rb_str_new((const char *)str, (long)length)
52
- #define ENCODED_STR_NEW2(str, length, str2) \
53
- rb_str_new((const char *)str, (long)length)
54
- #endif
55
-
56
- #ifdef HAVE_RB_STR_SUBLEN
57
- #define ENCODED_STR_SUBLEN(str, offset, encoding) \
58
- LONG2NUM(rb_str_sublen(str, offset))
59
- #else
60
- #ifdef HAVE_RUBY_ENCODING_H
61
- #define ENCODED_STR_SUBLEN(str, offset, encoding) \
62
- ({ \
63
- VALUE _string = ENCODED_STR_NEW(RSTRING_PTR(str), offset, encoding); \
64
- rb_str_length(_string); \
65
- })
66
- #else
67
- #define ENCODED_STR_SUBLEN(str, offset, encoding) \
68
- LONG2NUM(offset)
69
- #endif
70
- #endif
16
+ #include <re2/re2.h>
17
+ #include <re2/set.h>
18
+ #include <ruby.h>
19
+ #include <ruby/encoding.h>
71
20
 
72
- #ifdef HAVE_ENDPOS_ARGUMENT
73
- #define match(pattern, text, startpos, endpos, anchor, match, nmatch) \
74
- (pattern->Match(text, startpos, endpos, anchor, match, nmatch))
75
- #else
76
- #define match(pattern, text, startpos, endpos, anchor, match, nmatch) \
77
- (pattern->Match(text, startpos, anchor, match, nmatch))
78
- #endif
21
+ #define BOOL2RUBY(v) (v ? Qtrue : Qfalse)
79
22
 
80
23
  typedef struct {
81
24
  RE2 *pattern;
@@ -107,95 +50,103 @@ static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors,
107
50
  id_perl_classes, id_word_boundary, id_one_line,
108
51
  id_unanchored, id_anchor_start, id_anchor_both, id_exception;
109
52
 
110
- void parse_re2_options(RE2::Options& re2_options, VALUE options) {
53
+ inline VALUE encoded_str_new(const char *str, long length, RE2::Options::Encoding encoding) {
54
+ if (encoding == RE2::Options::EncodingUTF8) {
55
+ return rb_utf8_str_new(str, length);
56
+ }
57
+
58
+ VALUE string = rb_str_new(str, length);
59
+ rb_enc_associate_index(string, rb_enc_find_index("ISO-8859-1"));
60
+
61
+ return string;
62
+ }
63
+
64
+ static void parse_re2_options(RE2::Options* re2_options, const VALUE options) {
111
65
  if (TYPE(options) != T_HASH) {
112
66
  rb_raise(rb_eArgError, "options should be a hash");
113
67
  }
114
- VALUE utf8, posix_syntax, longest_match, log_errors,
115
- max_mem, literal, never_nl, case_sensitive, perl_classes,
116
- word_boundary, one_line;
117
68
 
118
- utf8 = rb_hash_aref(options, ID2SYM(id_utf8));
69
+ VALUE utf8 = rb_hash_aref(options, ID2SYM(id_utf8));
119
70
  if (!NIL_P(utf8)) {
120
- re2_options.set_encoding(RTEST(utf8) ? RE2::Options::EncodingUTF8 : RE2::Options::EncodingLatin1);
71
+ re2_options->set_encoding(RTEST(utf8) ? RE2::Options::EncodingUTF8 : RE2::Options::EncodingLatin1);
121
72
  }
122
73
 
123
- posix_syntax = rb_hash_aref(options, ID2SYM(id_posix_syntax));
74
+ VALUE posix_syntax = rb_hash_aref(options, ID2SYM(id_posix_syntax));
124
75
  if (!NIL_P(posix_syntax)) {
125
- re2_options.set_posix_syntax(RTEST(posix_syntax));
76
+ re2_options->set_posix_syntax(RTEST(posix_syntax));
126
77
  }
127
78
 
128
- longest_match = rb_hash_aref(options, ID2SYM(id_longest_match));
79
+ VALUE longest_match = rb_hash_aref(options, ID2SYM(id_longest_match));
129
80
  if (!NIL_P(longest_match)) {
130
- re2_options.set_longest_match(RTEST(longest_match));
81
+ re2_options->set_longest_match(RTEST(longest_match));
131
82
  }
132
83
 
133
- log_errors = rb_hash_aref(options, ID2SYM(id_log_errors));
84
+ VALUE log_errors = rb_hash_aref(options, ID2SYM(id_log_errors));
134
85
  if (!NIL_P(log_errors)) {
135
- re2_options.set_log_errors(RTEST(log_errors));
86
+ re2_options->set_log_errors(RTEST(log_errors));
136
87
  }
137
88
 
138
- max_mem = rb_hash_aref(options, ID2SYM(id_max_mem));
89
+ VALUE max_mem = rb_hash_aref(options, ID2SYM(id_max_mem));
139
90
  if (!NIL_P(max_mem)) {
140
- re2_options.set_max_mem(NUM2INT(max_mem));
91
+ re2_options->set_max_mem(NUM2INT(max_mem));
141
92
  }
142
93
 
143
- literal = rb_hash_aref(options, ID2SYM(id_literal));
94
+ VALUE literal = rb_hash_aref(options, ID2SYM(id_literal));
144
95
  if (!NIL_P(literal)) {
145
- re2_options.set_literal(RTEST(literal));
96
+ re2_options->set_literal(RTEST(literal));
146
97
  }
147
98
 
148
- never_nl = rb_hash_aref(options, ID2SYM(id_never_nl));
99
+ VALUE never_nl = rb_hash_aref(options, ID2SYM(id_never_nl));
149
100
  if (!NIL_P(never_nl)) {
150
- re2_options.set_never_nl(RTEST(never_nl));
101
+ re2_options->set_never_nl(RTEST(never_nl));
151
102
  }
152
103
 
153
- case_sensitive = rb_hash_aref(options, ID2SYM(id_case_sensitive));
104
+ VALUE case_sensitive = rb_hash_aref(options, ID2SYM(id_case_sensitive));
154
105
  if (!NIL_P(case_sensitive)) {
155
- re2_options.set_case_sensitive(RTEST(case_sensitive));
106
+ re2_options->set_case_sensitive(RTEST(case_sensitive));
156
107
  }
157
108
 
158
- perl_classes = rb_hash_aref(options, ID2SYM(id_perl_classes));
109
+ VALUE perl_classes = rb_hash_aref(options, ID2SYM(id_perl_classes));
159
110
  if (!NIL_P(perl_classes)) {
160
- re2_options.set_perl_classes(RTEST(perl_classes));
111
+ re2_options->set_perl_classes(RTEST(perl_classes));
161
112
  }
162
113
 
163
- word_boundary = rb_hash_aref(options, ID2SYM(id_word_boundary));
114
+ VALUE word_boundary = rb_hash_aref(options, ID2SYM(id_word_boundary));
164
115
  if (!NIL_P(word_boundary)) {
165
- re2_options.set_word_boundary(RTEST(word_boundary));
116
+ re2_options->set_word_boundary(RTEST(word_boundary));
166
117
  }
167
118
 
168
- one_line = rb_hash_aref(options, ID2SYM(id_one_line));
119
+ VALUE one_line = rb_hash_aref(options, ID2SYM(id_one_line));
169
120
  if (!NIL_P(one_line)) {
170
- re2_options.set_one_line(RTEST(one_line));
121
+ re2_options->set_one_line(RTEST(one_line));
171
122
  }
172
123
  }
173
124
 
174
- void re2_matchdata_mark(re2_matchdata* self) {
125
+ static void re2_matchdata_mark(re2_matchdata* self) {
175
126
  rb_gc_mark(self->regexp);
176
127
  rb_gc_mark(self->text);
177
128
  }
178
129
 
179
- void re2_matchdata_free(re2_matchdata* self) {
130
+ static void re2_matchdata_free(re2_matchdata* self) {
180
131
  if (self->matches) {
181
132
  delete[] self->matches;
182
133
  }
183
134
  free(self);
184
135
  }
185
136
 
186
- void re2_scanner_mark(re2_scanner* self) {
137
+ static void re2_scanner_mark(re2_scanner* self) {
187
138
  rb_gc_mark(self->regexp);
188
139
  rb_gc_mark(self->text);
189
140
  }
190
141
 
191
- void re2_scanner_free(re2_scanner* self) {
142
+ static void re2_scanner_free(re2_scanner* self) {
192
143
  if (self->input) {
193
144
  delete self->input;
194
145
  }
195
146
  free(self);
196
147
  }
197
148
 
198
- void re2_regexp_free(re2_pattern* self) {
149
+ static void re2_regexp_free(re2_pattern* self) {
199
150
  if (self->pattern) {
200
151
  delete self->pattern;
201
152
  }
@@ -204,12 +155,14 @@ void re2_regexp_free(re2_pattern* self) {
204
155
 
205
156
  static VALUE re2_matchdata_allocate(VALUE klass) {
206
157
  re2_matchdata *m;
158
+
207
159
  return Data_Make_Struct(klass, re2_matchdata, re2_matchdata_mark,
208
160
  re2_matchdata_free, m);
209
161
  }
210
162
 
211
163
  static VALUE re2_scanner_allocate(VALUE klass) {
212
164
  re2_scanner *c;
165
+
213
166
  return Data_Make_Struct(klass, re2_scanner, re2_scanner_mark,
214
167
  re2_scanner_free, c);
215
168
  }
@@ -222,7 +175,7 @@ static VALUE re2_scanner_allocate(VALUE klass) {
222
175
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
223
176
  * m.string #=> "bob 123"
224
177
  */
225
- static VALUE re2_matchdata_string(VALUE self) {
178
+ static VALUE re2_matchdata_string(const VALUE self) {
226
179
  re2_matchdata *m;
227
180
  Data_Get_Struct(self, re2_matchdata, m);
228
181
 
@@ -237,7 +190,7 @@ static VALUE re2_matchdata_string(VALUE self) {
237
190
  * c = RE2::Regexp.new('(\d+)').scan("foo")
238
191
  * c.string #=> "foo"
239
192
  */
240
- static VALUE re2_scanner_string(VALUE self) {
193
+ static VALUE re2_scanner_string(const VALUE self) {
241
194
  re2_scanner *c;
242
195
  Data_Get_Struct(self, re2_scanner, c);
243
196
 
@@ -252,7 +205,7 @@ static VALUE re2_scanner_string(VALUE self) {
252
205
  * c = RE2::Regexp.new('(\d+)').scan("foo")
253
206
  * c.eof? #=> true
254
207
  */
255
- static VALUE re2_scanner_eof(VALUE self) {
208
+ static VALUE re2_scanner_eof(const VALUE self) {
256
209
  re2_scanner *c;
257
210
  Data_Get_Struct(self, re2_scanner, c);
258
211
 
@@ -274,7 +227,8 @@ static VALUE re2_scanner_rewind(VALUE self) {
274
227
  re2_scanner *c;
275
228
  Data_Get_Struct(self, re2_scanner, c);
276
229
 
277
- c->input = new(nothrow) re2::StringPiece(StringValuePtr(c->text));
230
+ delete c->input;
231
+ c->input = new(std::nothrow) re2::StringPiece(RSTRING_PTR(c->text));
278
232
  c->eof = false;
279
233
 
280
234
  return self;
@@ -284,6 +238,10 @@ static VALUE re2_scanner_rewind(VALUE self) {
284
238
  * Scan the given text incrementally for matches, returning an array of
285
239
  * matches on each subsequent call. Returns nil if no matches are found.
286
240
  *
241
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
242
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
243
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
244
+ *
287
245
  * @return [Array<String>] the matches.
288
246
  * @example
289
247
  * s = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
@@ -291,45 +249,41 @@ static VALUE re2_scanner_rewind(VALUE self) {
291
249
  * s.scan #=> ["bar"]
292
250
  */
293
251
  static VALUE re2_scanner_scan(VALUE self) {
294
- int i;
295
- size_t original_input_size, new_input_size;
296
- bool input_advanced;
297
252
  re2_pattern *p;
298
253
  re2_scanner *c;
299
- VALUE result;
300
254
 
301
255
  Data_Get_Struct(self, re2_scanner, c);
302
256
  Data_Get_Struct(c->regexp, re2_pattern, p);
303
257
 
304
- vector<RE2::Arg> argv(c->number_of_capturing_groups);
305
- vector<RE2::Arg*> args(c->number_of_capturing_groups);
306
- vector<string> matches(c->number_of_capturing_groups);
258
+ std::vector<RE2::Arg> argv(c->number_of_capturing_groups);
259
+ std::vector<RE2::Arg*> args(c->number_of_capturing_groups);
260
+ std::vector<std::string> matches(c->number_of_capturing_groups);
307
261
 
308
262
  if (c->eof) {
309
263
  return Qnil;
310
264
  }
311
265
 
312
- original_input_size = c->input->size();
266
+ re2::StringPiece::size_type original_input_size = c->input->size();
313
267
 
314
- for (i = 0; i < c->number_of_capturing_groups; i++) {
315
- matches[i] = "";
268
+ for (int i = 0; i < c->number_of_capturing_groups; ++i) {
316
269
  argv[i] = &matches[i];
317
270
  args[i] = &argv[i];
318
271
  }
319
272
 
320
- if (RE2::FindAndConsumeN(c->input, *p->pattern, &args[0],
273
+ if (RE2::FindAndConsumeN(c->input, *p->pattern, args.data(),
321
274
  c->number_of_capturing_groups)) {
322
- result = rb_ary_new2(c->number_of_capturing_groups);
323
- new_input_size = c->input->size();
324
- input_advanced = new_input_size < original_input_size;
275
+ re2::StringPiece::size_type new_input_size = c->input->size();
276
+ bool input_advanced = new_input_size < original_input_size;
277
+
278
+ VALUE result = rb_ary_new2(c->number_of_capturing_groups);
325
279
 
326
- for (i = 0; i < c->number_of_capturing_groups; i++) {
280
+ for (int i = 0; i < c->number_of_capturing_groups; ++i) {
327
281
  if (matches[i].empty()) {
328
282
  rb_ary_push(result, Qnil);
329
283
  } else {
330
- rb_ary_push(result, ENCODED_STR_NEW(matches[i].data(),
284
+ rb_ary_push(result, encoded_str_new(matches[i].data(),
331
285
  matches[i].size(),
332
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1"));
286
+ p->pattern->options().encoding()));
333
287
  }
334
288
  }
335
289
 
@@ -340,47 +294,41 @@ static VALUE re2_scanner_scan(VALUE self) {
340
294
  if (!input_advanced && new_input_size > 0) {
341
295
  c->input->remove_prefix(1);
342
296
  }
297
+
298
+ return result;
343
299
  } else {
344
- result = Qnil;
300
+ return Qnil;
345
301
  }
346
-
347
- return result;
348
302
  }
349
303
 
350
304
  /*
351
305
  * Retrieve a matchdata by index or name.
352
306
  */
353
- re2::StringPiece *re2_matchdata_find_match(VALUE idx, VALUE self) {
354
- int id;
307
+ static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) {
355
308
  re2_matchdata *m;
356
309
  re2_pattern *p;
357
- map<string, int> groups;
358
- string name;
359
- re2::StringPiece *match;
360
310
 
361
311
  Data_Get_Struct(self, re2_matchdata, m);
362
312
  Data_Get_Struct(m->regexp, re2_pattern, p);
363
313
 
314
+ int id;
315
+
364
316
  if (FIXNUM_P(idx)) {
365
317
  id = FIX2INT(idx);
366
318
  } else {
367
- if (SYMBOL_P(idx)) {
368
- name = rb_id2name(SYM2ID(idx));
369
- } else {
370
- name = StringValuePtr(idx);
371
- }
372
-
373
- groups = p->pattern->NamedCapturingGroups();
319
+ const char *name = SYMBOL_P(idx) ? rb_id2name(SYM2ID(idx)) : StringValuePtr(idx);
320
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
321
+ std::map<std::string, int>::const_iterator search = groups.find(name);
374
322
 
375
- if (groups.count(name) == 1) {
376
- id = groups[name];
323
+ if (search != groups.end()) {
324
+ id = search->second;
377
325
  } else {
378
326
  return NULL;
379
327
  }
380
328
  }
381
329
 
382
330
  if (id >= 0 && id < m->number_of_matches) {
383
- match = &m->matches[id];
331
+ re2::StringPiece *match = &m->matches[id];
384
332
 
385
333
  if (!match->empty()) {
386
334
  return match;
@@ -399,7 +347,7 @@ re2::StringPiece *re2_matchdata_find_match(VALUE idx, VALUE self) {
399
347
  * m.size #=> 2
400
348
  * m.length #=> 2
401
349
  */
402
- static VALUE re2_matchdata_size(VALUE self) {
350
+ static VALUE re2_matchdata_size(const VALUE self) {
403
351
  re2_matchdata *m;
404
352
  Data_Get_Struct(self, re2_matchdata, m);
405
353
 
@@ -416,23 +364,18 @@ static VALUE re2_matchdata_size(VALUE self) {
416
364
  * m.begin(0) #=> 1
417
365
  * m.begin(1) #=> 4
418
366
  */
419
- static VALUE re2_matchdata_begin(VALUE self, VALUE n) {
367
+ static VALUE re2_matchdata_begin(const VALUE self, VALUE n) {
420
368
  re2_matchdata *m;
421
- re2_pattern *p;
422
- re2::StringPiece *match;
423
- long offset;
424
369
 
425
370
  Data_Get_Struct(self, re2_matchdata, m);
426
- Data_Get_Struct(m->regexp, re2_pattern, p);
427
371
 
428
- match = re2_matchdata_find_match(n, self);
372
+ re2::StringPiece *match = re2_matchdata_find_match(n, self);
429
373
  if (match == NULL) {
430
374
  return Qnil;
431
375
  } else {
432
- offset = reinterpret_cast<uintptr_t>(match->data()) - reinterpret_cast<uintptr_t>(StringValuePtr(m->text));
376
+ long offset = match->data() - RSTRING_PTR(m->text);
433
377
 
434
- return ENCODED_STR_SUBLEN(StringValue(m->text), offset,
435
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
378
+ return LONG2NUM(rb_str_sublen(m->text, offset));
436
379
  }
437
380
  }
438
381
 
@@ -446,24 +389,18 @@ static VALUE re2_matchdata_begin(VALUE self, VALUE n) {
446
389
  * m.end(0) #=> 9
447
390
  * m.end(1) #=> 7
448
391
  */
449
- static VALUE re2_matchdata_end(VALUE self, VALUE n) {
392
+ static VALUE re2_matchdata_end(const VALUE self, VALUE n) {
450
393
  re2_matchdata *m;
451
- re2_pattern *p;
452
- re2::StringPiece *match;
453
- long offset;
454
394
 
455
395
  Data_Get_Struct(self, re2_matchdata, m);
456
- Data_Get_Struct(m->regexp, re2_pattern, p);
457
-
458
- match = re2_matchdata_find_match(n, self);
459
396
 
397
+ re2::StringPiece *match = re2_matchdata_find_match(n, self);
460
398
  if (match == NULL) {
461
399
  return Qnil;
462
400
  } else {
463
- offset = reinterpret_cast<uintptr_t>(match->data()) - reinterpret_cast<uintptr_t>(StringValuePtr(m->text)) + match->size();
401
+ long offset = (match->data() - RSTRING_PTR(m->text)) + match->size();
464
402
 
465
- return ENCODED_STR_SUBLEN(StringValue(m->text), offset,
466
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
403
+ return LONG2NUM(rb_str_sublen(m->text, offset));
467
404
  }
468
405
  }
469
406
 
@@ -475,9 +412,10 @@ static VALUE re2_matchdata_end(VALUE self, VALUE n) {
475
412
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
476
413
  * m.regexp #=> #<RE2::Regexp /(\d+)/>
477
414
  */
478
- static VALUE re2_matchdata_regexp(VALUE self) {
415
+ static VALUE re2_matchdata_regexp(const VALUE self) {
479
416
  re2_matchdata *m;
480
417
  Data_Get_Struct(self, re2_matchdata, m);
418
+
481
419
  return m->regexp;
482
420
  }
483
421
 
@@ -489,7 +427,7 @@ static VALUE re2_matchdata_regexp(VALUE self) {
489
427
  * c = RE2::Regexp.new('(\d+)').scan("bob 123")
490
428
  * c.regexp #=> #<RE2::Regexp /(\d+)/>
491
429
  */
492
- static VALUE re2_scanner_regexp(VALUE self) {
430
+ static VALUE re2_scanner_regexp(const VALUE self) {
493
431
  re2_scanner *c;
494
432
  Data_Get_Struct(self, re2_scanner, c);
495
433
 
@@ -498,46 +436,47 @@ static VALUE re2_scanner_regexp(VALUE self) {
498
436
 
499
437
  static VALUE re2_regexp_allocate(VALUE klass) {
500
438
  re2_pattern *p;
439
+
501
440
  return Data_Make_Struct(klass, re2_pattern, 0, re2_regexp_free, p);
502
441
  }
503
442
 
504
443
  /*
505
444
  * Returns the array of matches.
506
445
  *
446
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
447
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
448
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
449
+ *
507
450
  * @return [Array<String, nil>] the array of matches
508
451
  * @example
509
452
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
510
453
  * m.to_a #=> ["123", "123"]
511
454
  */
512
- static VALUE re2_matchdata_to_a(VALUE self) {
513
- int i;
455
+ static VALUE re2_matchdata_to_a(const VALUE self) {
514
456
  re2_matchdata *m;
515
457
  re2_pattern *p;
516
- re2::StringPiece *match;
517
- VALUE array;
518
458
 
519
459
  Data_Get_Struct(self, re2_matchdata, m);
520
460
  Data_Get_Struct(m->regexp, re2_pattern, p);
521
461
 
522
- array = rb_ary_new2(m->number_of_matches);
523
- for (i = 0; i < m->number_of_matches; i++) {
524
- match = &m->matches[i];
462
+ VALUE array = rb_ary_new2(m->number_of_matches);
463
+ for (int i = 0; i < m->number_of_matches; ++i) {
464
+ re2::StringPiece *match = &m->matches[i];
525
465
 
526
466
  if (match->empty()) {
527
467
  rb_ary_push(array, Qnil);
528
468
  } else {
529
- rb_ary_push(array, ENCODED_STR_NEW(match->data(), match->size(),
530
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1"));
469
+ rb_ary_push(array, encoded_str_new(match->data(), match->size(),
470
+ p->pattern->options().encoding()));
531
471
  }
532
472
  }
533
473
 
534
474
  return array;
535
475
  }
536
476
 
537
- static VALUE re2_matchdata_nth_match(int nth, VALUE self) {
477
+ static VALUE re2_matchdata_nth_match(int nth, const VALUE self) {
538
478
  re2_matchdata *m;
539
479
  re2_pattern *p;
540
- re2::StringPiece *match;
541
480
 
542
481
  Data_Get_Struct(self, re2_matchdata, m);
543
482
  Data_Get_Struct(m->regexp, re2_pattern, p);
@@ -545,32 +484,29 @@ static VALUE re2_matchdata_nth_match(int nth, VALUE self) {
545
484
  if (nth < 0 || nth >= m->number_of_matches) {
546
485
  return Qnil;
547
486
  } else {
548
- match = &m->matches[nth];
487
+ re2::StringPiece *match = &m->matches[nth];
549
488
 
550
489
  if (match->empty()) {
551
490
  return Qnil;
552
491
  } else {
553
- return ENCODED_STR_NEW(match->data(), match->size(),
554
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
492
+ return encoded_str_new(match->data(), match->size(),
493
+ p->pattern->options().encoding());
555
494
  }
556
495
  }
557
496
  }
558
497
 
559
- static VALUE re2_matchdata_named_match(const char* name, VALUE self) {
560
- int idx;
498
+ static VALUE re2_matchdata_named_match(const char* name, const VALUE self) {
561
499
  re2_matchdata *m;
562
500
  re2_pattern *p;
563
- map<string, int> groups;
564
- string name_as_string(name);
565
501
 
566
502
  Data_Get_Struct(self, re2_matchdata, m);
567
503
  Data_Get_Struct(m->regexp, re2_pattern, p);
568
504
 
569
- groups = p->pattern->NamedCapturingGroups();
505
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
506
+ std::map<std::string, int>::const_iterator search = groups.find(name);
570
507
 
571
- if (groups.count(name_as_string) == 1) {
572
- idx = groups[name_as_string];
573
- return re2_matchdata_nth_match(idx, self);
508
+ if (search != groups.end()) {
509
+ return re2_matchdata_nth_match(search->second, self);
574
510
  } else {
575
511
  return Qnil;
576
512
  }
@@ -579,6 +515,10 @@ static VALUE re2_matchdata_named_match(const char* name, VALUE self) {
579
515
  /*
580
516
  * Retrieve zero, one or more matches by index or name.
581
517
  *
518
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
519
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
520
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
521
+ *
582
522
  * @return [Array<String, nil>, String, Boolean]
583
523
  *
584
524
  * @overload [](index)
@@ -619,12 +559,12 @@ static VALUE re2_matchdata_named_match(const char* name, VALUE self) {
619
559
  * m["number"] #=> "123"
620
560
  * m[:number] #=> "123"
621
561
  */
622
- static VALUE re2_matchdata_aref(int argc, VALUE *argv, VALUE self) {
562
+ static VALUE re2_matchdata_aref(int argc, VALUE *argv, const VALUE self) {
623
563
  VALUE idx, rest;
624
564
  rb_scan_args(argc, argv, "11", &idx, &rest);
625
565
 
626
566
  if (TYPE(idx) == T_STRING) {
627
- return re2_matchdata_named_match(StringValuePtr(idx), self);
567
+ return re2_matchdata_named_match(RSTRING_PTR(idx), self);
628
568
  } else if (SYMBOL_P(idx)) {
629
569
  return re2_matchdata_named_match(rb_id2name(SYM2ID(idx)), self);
630
570
  } else if (!NIL_P(rest) || !FIXNUM_P(idx) || FIX2INT(idx) < 0) {
@@ -639,57 +579,61 @@ static VALUE re2_matchdata_aref(int argc, VALUE *argv, VALUE self) {
639
579
  *
640
580
  * @return [String] the entire matched string
641
581
  */
642
- static VALUE re2_matchdata_to_s(VALUE self) {
582
+ static VALUE re2_matchdata_to_s(const VALUE self) {
643
583
  return re2_matchdata_nth_match(0, self);
644
584
  }
645
585
 
646
586
  /*
647
587
  * Returns a printable version of the match.
648
588
  *
589
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
590
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
591
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
592
+ *
649
593
  * @return [String] a printable version of the match
650
594
  * @example
651
595
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
652
596
  * m.inspect #=> "#<RE2::MatchData \"123\" 1:\"123\">"
653
597
  */
654
- static VALUE re2_matchdata_inspect(VALUE self) {
655
- int i;
598
+ static VALUE re2_matchdata_inspect(const VALUE self) {
656
599
  re2_matchdata *m;
657
600
  re2_pattern *p;
658
- VALUE match, result;
659
- ostringstream output;
660
601
 
661
602
  Data_Get_Struct(self, re2_matchdata, m);
662
603
  Data_Get_Struct(m->regexp, re2_pattern, p);
663
604
 
605
+ std::ostringstream output;
664
606
  output << "#<RE2::MatchData";
665
607
 
666
- for (i = 0; i < m->number_of_matches; i++) {
608
+ for (int i = 0; i < m->number_of_matches; ++i) {
667
609
  output << " ";
668
610
 
669
611
  if (i > 0) {
670
612
  output << i << ":";
671
613
  }
672
614
 
673
- match = re2_matchdata_nth_match(i, self);
615
+ VALUE match = re2_matchdata_nth_match(i, self);
674
616
 
675
617
  if (match == Qnil) {
676
618
  output << "nil";
677
619
  } else {
678
- output << "\"" << StringValuePtr(match) << "\"";
620
+ output << "\"" << RSTRING_PTR(match) << "\"";
679
621
  }
680
622
  }
681
623
 
682
624
  output << ">";
683
625
 
684
- result = ENCODED_STR_NEW(output.str().data(), output.str().length(),
685
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
686
-
687
- return result;
626
+ return encoded_str_new(output.str().data(), output.str().length(),
627
+ p->pattern->options().encoding());
688
628
  }
689
629
 
690
630
  /*
691
631
  * Returns the array of submatches for pattern matching.
692
632
  *
633
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
634
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
635
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
636
+ *
693
637
  * @return [Array<String, nil>] the array of submatches
694
638
  * @example
695
639
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
@@ -703,25 +647,22 @@ static VALUE re2_matchdata_inspect(VALUE self) {
703
647
  * puts "Unrecognised match"
704
648
  * end
705
649
  */
706
- static VALUE re2_matchdata_deconstruct(VALUE self) {
707
- int i;
650
+ static VALUE re2_matchdata_deconstruct(const VALUE self) {
708
651
  re2_matchdata *m;
709
652
  re2_pattern *p;
710
- re2::StringPiece *match;
711
- VALUE array;
712
653
 
713
654
  Data_Get_Struct(self, re2_matchdata, m);
714
655
  Data_Get_Struct(m->regexp, re2_pattern, p);
715
656
 
716
- array = rb_ary_new2(m->number_of_matches - 1);
717
- for (i = 1; i < m->number_of_matches; i++) {
718
- match = &m->matches[i];
657
+ VALUE array = rb_ary_new2(m->number_of_matches - 1);
658
+ for (int i = 1; i < m->number_of_matches; ++i) {
659
+ re2::StringPiece *match = &m->matches[i];
719
660
 
720
661
  if (match->empty()) {
721
662
  rb_ary_push(array, Qnil);
722
663
  } else {
723
- rb_ary_push(array, ENCODED_STR_NEW(match->data(), match->size(),
724
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1"));
664
+ rb_ary_push(array, encoded_str_new(match->data(), match->size(),
665
+ p->pattern->options().encoding()));
725
666
  }
726
667
  }
727
668
 
@@ -735,6 +676,10 @@ static VALUE re2_matchdata_deconstruct(VALUE self) {
735
676
  * more keys than there are capturing groups. Given keys will populate the hash in
736
677
  * order but an invalid name will cause the hash to be immediately returned.
737
678
  *
679
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
680
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
681
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
682
+ *
738
683
  * @return [Hash] a hash of capturing group names to submatches
739
684
  * @param [Array<Symbol>, nil] keys an array of Symbol capturing group names or nil to return all names
740
685
  * @example
@@ -752,40 +697,37 @@ static VALUE re2_matchdata_deconstruct(VALUE self) {
752
697
  * puts "Unrecognised match"
753
698
  * end
754
699
  */
755
- static VALUE re2_matchdata_deconstruct_keys(VALUE self, VALUE keys) {
756
- int i;
757
- VALUE capturing_groups, key;
700
+ static VALUE re2_matchdata_deconstruct_keys(const VALUE self, const VALUE keys) {
758
701
  re2_matchdata *m;
759
702
  re2_pattern *p;
760
- map<string, int> groups;
761
- map<string, int>::iterator iterator;
762
703
 
763
704
  Data_Get_Struct(self, re2_matchdata, m);
764
705
  Data_Get_Struct(m->regexp, re2_pattern, p);
765
706
 
766
- groups = p->pattern->NamedCapturingGroups();
767
- capturing_groups = rb_hash_new();
707
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
708
+ VALUE capturing_groups = rb_hash_new();
768
709
 
769
710
  if (NIL_P(keys)) {
770
- for (iterator = groups.begin(); iterator != groups.end(); iterator++) {
711
+ for (std::map<std::string, int>::const_iterator it = groups.begin(); it != groups.end(); ++it) {
771
712
  rb_hash_aset(capturing_groups,
772
- ID2SYM(rb_intern(iterator->first.data())),
773
- re2_matchdata_nth_match(iterator->second, self));
713
+ ID2SYM(rb_intern(it->first.data())),
714
+ re2_matchdata_nth_match(it->second, self));
774
715
  }
775
716
  } else {
776
717
  Check_Type(keys, T_ARRAY);
777
718
 
778
719
  if (p->pattern->NumberOfCapturingGroups() >= RARRAY_LEN(keys)) {
779
- for (i = 0; i < RARRAY_LEN(keys); i++) {
780
- key = rb_ary_entry(keys, i);
720
+ for (int i = 0; i < RARRAY_LEN(keys); ++i) {
721
+ VALUE key = rb_ary_entry(keys, i);
781
722
  Check_Type(key, T_SYMBOL);
782
- string name(rb_id2name(SYM2ID(key)));
723
+ const char *name = rb_id2name(SYM2ID(key));
724
+ std::map<std::string, int>::const_iterator search = groups.find(name);
783
725
 
784
- if (groups.count(name) == 0) {
726
+ if (search != groups.end()) {
727
+ rb_hash_aset(capturing_groups, key, re2_matchdata_nth_match(search->second, self));
728
+ } else {
785
729
  break;
786
730
  }
787
-
788
- rb_hash_aset(capturing_groups, key, re2_matchdata_nth_match(groups[name], self));
789
731
  }
790
732
  }
791
733
  }
@@ -800,8 +742,7 @@ static VALUE re2_matchdata_deconstruct_keys(VALUE self, VALUE keys) {
800
742
  * @see RE2::Regexp#initialize
801
743
  *
802
744
  */
803
- static VALUE re2_re2(int argc, VALUE *argv, VALUE self) {
804
- UNUSED(self);
745
+ static VALUE re2_re2(int argc, VALUE *argv, VALUE) {
805
746
  return rb_class_new_instance(argc, argv, re2_cRegexp);
806
747
  }
807
748
 
@@ -845,15 +786,19 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
845
786
  re2_pattern *p;
846
787
 
847
788
  rb_scan_args(argc, argv, "11", &pattern, &options);
789
+
790
+ /* Ensure pattern is a string. */
791
+ StringValue(pattern);
792
+
848
793
  Data_Get_Struct(self, re2_pattern, p);
849
794
 
850
795
  if (RTEST(options)) {
851
796
  RE2::Options re2_options;
852
- parse_re2_options(re2_options, options);
797
+ parse_re2_options(&re2_options, options);
853
798
 
854
- p->pattern = new(nothrow) RE2(StringValuePtr(pattern), re2_options);
799
+ p->pattern = new(std::nothrow) RE2(RSTRING_PTR(pattern), re2_options);
855
800
  } else {
856
- p->pattern = new(nothrow) RE2(StringValuePtr(pattern));
801
+ p->pattern = new(std::nothrow) RE2(RSTRING_PTR(pattern));
857
802
  }
858
803
 
859
804
  if (p->pattern == 0) {
@@ -866,40 +811,47 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
866
811
  /*
867
812
  * Returns a printable version of the regular expression +re2+.
868
813
  *
814
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
815
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
816
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
817
+ *
869
818
  * @return [String] a printable version of the regular expression
870
819
  * @example
871
820
  * re2 = RE2::Regexp.new("woo?")
872
821
  * re2.inspect #=> "#<RE2::Regexp /woo?/>"
873
822
  */
874
- static VALUE re2_regexp_inspect(VALUE self) {
823
+ static VALUE re2_regexp_inspect(const VALUE self) {
875
824
  re2_pattern *p;
876
- VALUE result;
877
- ostringstream output;
878
825
 
879
826
  Data_Get_Struct(self, re2_pattern, p);
880
827
 
881
- output << "#<RE2::Regexp /" << p->pattern->pattern() << "/>";
828
+ std::ostringstream output;
882
829
 
883
- result = ENCODED_STR_NEW(output.str().data(), output.str().length(),
884
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
830
+ output << "#<RE2::Regexp /" << p->pattern->pattern() << "/>";
885
831
 
886
- return result;
832
+ return encoded_str_new(output.str().data(), output.str().length(),
833
+ p->pattern->options().encoding());
887
834
  }
888
835
 
889
836
  /*
890
837
  * Returns a string version of the regular expression +re2+.
891
838
  *
839
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
840
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
841
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
842
+ *
892
843
  * @return [String] a string version of the regular expression
893
844
  * @example
894
845
  * re2 = RE2::Regexp.new("woo?")
895
846
  * re2.to_s #=> "woo?"
896
847
  */
897
- static VALUE re2_regexp_to_s(VALUE self) {
848
+ static VALUE re2_regexp_to_s(const VALUE self) {
898
849
  re2_pattern *p;
899
850
  Data_Get_Struct(self, re2_pattern, p);
900
- return ENCODED_STR_NEW(p->pattern->pattern().data(),
851
+
852
+ return encoded_str_new(p->pattern->pattern().data(),
901
853
  p->pattern->pattern().size(),
902
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
854
+ p->pattern->options().encoding());
903
855
  }
904
856
 
905
857
  /*
@@ -911,9 +863,10 @@ static VALUE re2_regexp_to_s(VALUE self) {
911
863
  * re2 = RE2::Regexp.new("woo?")
912
864
  * re2.ok? #=> true
913
865
  */
914
- static VALUE re2_regexp_ok(VALUE self) {
866
+ static VALUE re2_regexp_ok(const VALUE self) {
915
867
  re2_pattern *p;
916
868
  Data_Get_Struct(self, re2_pattern, p);
869
+
917
870
  return BOOL2RUBY(p->pattern->ok());
918
871
  }
919
872
 
@@ -926,9 +879,10 @@ static VALUE re2_regexp_ok(VALUE self) {
926
879
  * re2 = RE2::Regexp.new("woo?", :utf8 => true)
927
880
  * re2.utf8? #=> true
928
881
  */
929
- static VALUE re2_regexp_utf8(VALUE self) {
882
+ static VALUE re2_regexp_utf8(const VALUE self) {
930
883
  re2_pattern *p;
931
884
  Data_Get_Struct(self, re2_pattern, p);
885
+
932
886
  return BOOL2RUBY(p->pattern->options().encoding() == RE2::Options::EncodingUTF8);
933
887
  }
934
888
 
@@ -941,9 +895,10 @@ static VALUE re2_regexp_utf8(VALUE self) {
941
895
  * re2 = RE2::Regexp.new("woo?", :posix_syntax => true)
942
896
  * re2.posix_syntax? #=> true
943
897
  */
944
- static VALUE re2_regexp_posix_syntax(VALUE self) {
898
+ static VALUE re2_regexp_posix_syntax(const VALUE self) {
945
899
  re2_pattern *p;
946
900
  Data_Get_Struct(self, re2_pattern, p);
901
+
947
902
  return BOOL2RUBY(p->pattern->options().posix_syntax());
948
903
  }
949
904
 
@@ -956,9 +911,10 @@ static VALUE re2_regexp_posix_syntax(VALUE self) {
956
911
  * re2 = RE2::Regexp.new("woo?", :longest_match => true)
957
912
  * re2.longest_match? #=> true
958
913
  */
959
- static VALUE re2_regexp_longest_match(VALUE self) {
914
+ static VALUE re2_regexp_longest_match(const VALUE self) {
960
915
  re2_pattern *p;
961
916
  Data_Get_Struct(self, re2_pattern, p);
917
+
962
918
  return BOOL2RUBY(p->pattern->options().longest_match());
963
919
  }
964
920
 
@@ -971,9 +927,10 @@ static VALUE re2_regexp_longest_match(VALUE self) {
971
927
  * re2 = RE2::Regexp.new("woo?", :log_errors => true)
972
928
  * re2.log_errors? #=> true
973
929
  */
974
- static VALUE re2_regexp_log_errors(VALUE self) {
930
+ static VALUE re2_regexp_log_errors(const VALUE self) {
975
931
  re2_pattern *p;
976
932
  Data_Get_Struct(self, re2_pattern, p);
933
+
977
934
  return BOOL2RUBY(p->pattern->options().log_errors());
978
935
  }
979
936
 
@@ -986,9 +943,10 @@ static VALUE re2_regexp_log_errors(VALUE self) {
986
943
  * re2 = RE2::Regexp.new("woo?", :max_mem => 1024)
987
944
  * re2.max_mem #=> 1024
988
945
  */
989
- static VALUE re2_regexp_max_mem(VALUE self) {
946
+ static VALUE re2_regexp_max_mem(const VALUE self) {
990
947
  re2_pattern *p;
991
948
  Data_Get_Struct(self, re2_pattern, p);
949
+
992
950
  return INT2FIX(p->pattern->options().max_mem());
993
951
  }
994
952
 
@@ -1001,9 +959,10 @@ static VALUE re2_regexp_max_mem(VALUE self) {
1001
959
  * re2 = RE2::Regexp.new("woo?", :literal => true)
1002
960
  * re2.literal? #=> true
1003
961
  */
1004
- static VALUE re2_regexp_literal(VALUE self) {
962
+ static VALUE re2_regexp_literal(const VALUE self) {
1005
963
  re2_pattern *p;
1006
964
  Data_Get_Struct(self, re2_pattern, p);
965
+
1007
966
  return BOOL2RUBY(p->pattern->options().literal());
1008
967
  }
1009
968
 
@@ -1016,9 +975,10 @@ static VALUE re2_regexp_literal(VALUE self) {
1016
975
  * re2 = RE2::Regexp.new("woo?", :never_nl => true)
1017
976
  * re2.never_nl? #=> true
1018
977
  */
1019
- static VALUE re2_regexp_never_nl(VALUE self) {
978
+ static VALUE re2_regexp_never_nl(const VALUE self) {
1020
979
  re2_pattern *p;
1021
980
  Data_Get_Struct(self, re2_pattern, p);
981
+
1022
982
  return BOOL2RUBY(p->pattern->options().never_nl());
1023
983
  }
1024
984
 
@@ -1031,9 +991,10 @@ static VALUE re2_regexp_never_nl(VALUE self) {
1031
991
  * re2 = RE2::Regexp.new("woo?", :case_sensitive => true)
1032
992
  * re2.case_sensitive? #=> true
1033
993
  */
1034
- static VALUE re2_regexp_case_sensitive(VALUE self) {
994
+ static VALUE re2_regexp_case_sensitive(const VALUE self) {
1035
995
  re2_pattern *p;
1036
996
  Data_Get_Struct(self, re2_pattern, p);
997
+
1037
998
  return BOOL2RUBY(p->pattern->options().case_sensitive());
1038
999
  }
1039
1000
 
@@ -1047,7 +1008,7 @@ static VALUE re2_regexp_case_sensitive(VALUE self) {
1047
1008
  * re2.case_insensitive? #=> false
1048
1009
  * re2.casefold? #=> false
1049
1010
  */
1050
- static VALUE re2_regexp_case_insensitive(VALUE self) {
1011
+ static VALUE re2_regexp_case_insensitive(const VALUE self) {
1051
1012
  return BOOL2RUBY(re2_regexp_case_sensitive(self) != Qtrue);
1052
1013
  }
1053
1014
 
@@ -1060,9 +1021,10 @@ static VALUE re2_regexp_case_insensitive(VALUE self) {
1060
1021
  * re2 = RE2::Regexp.new("woo?", :perl_classes => true)
1061
1022
  * re2.perl_classes? #=> true
1062
1023
  */
1063
- static VALUE re2_regexp_perl_classes(VALUE self) {
1024
+ static VALUE re2_regexp_perl_classes(const VALUE self) {
1064
1025
  re2_pattern *p;
1065
1026
  Data_Get_Struct(self, re2_pattern, p);
1027
+
1066
1028
  return BOOL2RUBY(p->pattern->options().perl_classes());
1067
1029
  }
1068
1030
 
@@ -1075,9 +1037,10 @@ static VALUE re2_regexp_perl_classes(VALUE self) {
1075
1037
  * re2 = RE2::Regexp.new("woo?", :word_boundary => true)
1076
1038
  * re2.word_boundary? #=> true
1077
1039
  */
1078
- static VALUE re2_regexp_word_boundary(VALUE self) {
1040
+ static VALUE re2_regexp_word_boundary(const VALUE self) {
1079
1041
  re2_pattern *p;
1080
1042
  Data_Get_Struct(self, re2_pattern, p);
1043
+
1081
1044
  return BOOL2RUBY(p->pattern->options().word_boundary());
1082
1045
  }
1083
1046
 
@@ -1090,9 +1053,10 @@ static VALUE re2_regexp_word_boundary(VALUE self) {
1090
1053
  * re2 = RE2::Regexp.new("woo?", :one_line => true)
1091
1054
  * re2.one_line? #=> true
1092
1055
  */
1093
- static VALUE re2_regexp_one_line(VALUE self) {
1056
+ static VALUE re2_regexp_one_line(const VALUE self) {
1094
1057
  re2_pattern *p;
1095
1058
  Data_Get_Struct(self, re2_pattern, p);
1059
+
1096
1060
  return BOOL2RUBY(p->pattern->options().one_line());
1097
1061
  }
1098
1062
 
@@ -1102,9 +1066,10 @@ static VALUE re2_regexp_one_line(VALUE self) {
1102
1066
  *
1103
1067
  * @return [String, nil] the error string or nil
1104
1068
  */
1105
- static VALUE re2_regexp_error(VALUE self) {
1069
+ static VALUE re2_regexp_error(const VALUE self) {
1106
1070
  re2_pattern *p;
1107
1071
  Data_Get_Struct(self, re2_pattern, p);
1072
+
1108
1073
  if (p->pattern->ok()) {
1109
1074
  return Qnil;
1110
1075
  } else {
@@ -1116,17 +1081,22 @@ static VALUE re2_regexp_error(VALUE self) {
1116
1081
  * If the RE2 could not be created properly, returns
1117
1082
  * the offending portion of the regexp otherwise returns nil.
1118
1083
  *
1084
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1085
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1086
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1087
+ *
1119
1088
  * @return [String, nil] the offending portion of the regexp or nil
1120
1089
  */
1121
- static VALUE re2_regexp_error_arg(VALUE self) {
1090
+ static VALUE re2_regexp_error_arg(const VALUE self) {
1122
1091
  re2_pattern *p;
1123
1092
  Data_Get_Struct(self, re2_pattern, p);
1093
+
1124
1094
  if (p->pattern->ok()) {
1125
1095
  return Qnil;
1126
1096
  } else {
1127
- return ENCODED_STR_NEW(p->pattern->error_arg().data(),
1097
+ return encoded_str_new(p->pattern->error_arg().data(),
1128
1098
  p->pattern->error_arg().size(),
1129
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
1099
+ p->pattern->options().encoding());
1130
1100
  }
1131
1101
  }
1132
1102
 
@@ -1137,9 +1107,10 @@ static VALUE re2_regexp_error_arg(VALUE self) {
1137
1107
  *
1138
1108
  * @return [Integer] the regexp "cost"
1139
1109
  */
1140
- static VALUE re2_regexp_program_size(VALUE self) {
1110
+ static VALUE re2_regexp_program_size(const VALUE self) {
1141
1111
  re2_pattern *p;
1142
1112
  Data_Get_Struct(self, re2_pattern, p);
1113
+
1143
1114
  return INT2FIX(p->pattern->ProgramSize());
1144
1115
  }
1145
1116
 
@@ -1149,12 +1120,11 @@ static VALUE re2_regexp_program_size(VALUE self) {
1149
1120
  *
1150
1121
  * @return [Hash] the options
1151
1122
  */
1152
- static VALUE re2_regexp_options(VALUE self) {
1153
- VALUE options;
1123
+ static VALUE re2_regexp_options(const VALUE self) {
1154
1124
  re2_pattern *p;
1155
1125
 
1156
1126
  Data_Get_Struct(self, re2_pattern, p);
1157
- options = rb_hash_new();
1127
+ VALUE options = rb_hash_new();
1158
1128
 
1159
1129
  rb_hash_aset(options, ID2SYM(id_utf8),
1160
1130
  BOOL2RUBY(p->pattern->options().encoding() == RE2::Options::EncodingUTF8));
@@ -1202,33 +1172,34 @@ static VALUE re2_regexp_options(VALUE self) {
1202
1172
  *
1203
1173
  * @return [Integer] the number of capturing subpatterns
1204
1174
  */
1205
- static VALUE re2_regexp_number_of_capturing_groups(VALUE self) {
1175
+ static VALUE re2_regexp_number_of_capturing_groups(const VALUE self) {
1206
1176
  re2_pattern *p;
1207
-
1208
1177
  Data_Get_Struct(self, re2_pattern, p);
1178
+
1209
1179
  return INT2FIX(p->pattern->NumberOfCapturingGroups());
1210
1180
  }
1211
1181
 
1212
1182
  /*
1213
1183
  * Returns a hash of names to capturing indices of groups.
1214
1184
  *
1185
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1186
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1187
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1188
+ *
1215
1189
  * @return [Hash] a hash of names to capturing indices
1216
1190
  */
1217
- static VALUE re2_regexp_named_capturing_groups(VALUE self) {
1218
- VALUE capturing_groups;
1191
+ static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
1219
1192
  re2_pattern *p;
1220
- map<string, int> groups;
1221
- map<string, int>::iterator iterator;
1222
1193
 
1223
1194
  Data_Get_Struct(self, re2_pattern, p);
1224
- groups = p->pattern->NamedCapturingGroups();
1225
- capturing_groups = rb_hash_new();
1195
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
1196
+ VALUE capturing_groups = rb_hash_new();
1226
1197
 
1227
- for (iterator = groups.begin(); iterator != groups.end(); iterator++) {
1198
+ for (std::map<std::string, int>::const_iterator it = groups.begin(); it != groups.end(); ++it) {
1228
1199
  rb_hash_aset(capturing_groups,
1229
- ENCODED_STR_NEW(iterator->first.data(), iterator->first.size(),
1230
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1"),
1231
- INT2FIX(iterator->second));
1200
+ encoded_str_new(it->first.data(), it->first.size(),
1201
+ p->pattern->options().encoding()),
1202
+ INT2FIX(it->second));
1232
1203
  }
1233
1204
 
1234
1205
  return capturing_groups;
@@ -1242,16 +1213,23 @@ static VALUE re2_regexp_named_capturing_groups(VALUE self) {
1242
1213
  * @return [Boolean, RE2::MatchData]
1243
1214
  *
1244
1215
  * @overload match(text)
1245
- * Returns an {RE2::MatchData} containing the matching
1246
- * pattern and all subpatterns resulting from looking for
1247
- * the regexp in +text+.
1216
+ * Returns an {RE2::MatchData} containing the matching pattern and all
1217
+ * subpatterns resulting from looking for the regexp in +text+ if the pattern
1218
+ * contains capturing groups.
1219
+ *
1220
+ * Returns either true or false indicating whether a successful match was
1221
+ * made if the pattern contains no capturing groups.
1248
1222
  *
1249
1223
  * @param [String] text the text to search
1250
- * @return [RE2::MatchData] the matches
1224
+ * @return [RE2::MatchData] if the pattern contains capturing groups
1225
+ * @return [Boolean] if the pattern does not contain capturing groups
1251
1226
  * @raise [NoMemoryError] if there was not enough memory to allocate the matches
1252
- * @example
1227
+ * @example Matching with capturing groups
1253
1228
  * r = RE2::Regexp.new('w(o)(o)')
1254
1229
  * r.match('woo') #=> #<RE2::MatchData "woo" 1:"o" 2:"o">
1230
+ * @example Matching without capturing groups
1231
+ * r = RE2::Regexp.new('woo')
1232
+ * r.match('woo') #=> true
1255
1233
  *
1256
1234
  * @overload match(text, 0)
1257
1235
  * Returns either true or false indicating whether a
@@ -1279,20 +1257,20 @@ static VALUE re2_regexp_named_capturing_groups(VALUE self) {
1279
1257
  * r.match('woo', 1) #=> #<RE2::MatchData "woo" 1:"o">
1280
1258
  * r.match('woo', 3) #=> #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
1281
1259
  */
1282
- static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
1283
- int n;
1284
- bool matched;
1260
+ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1285
1261
  re2_pattern *p;
1286
1262
  re2_matchdata *m;
1287
- VALUE text, number_of_matches, matchdata;
1263
+ VALUE text, number_of_matches;
1288
1264
 
1289
1265
  rb_scan_args(argc, argv, "11", &text, &number_of_matches);
1290
1266
 
1291
1267
  /* Ensure text is a string. */
1292
- text = StringValue(text);
1268
+ StringValue(text);
1293
1269
 
1294
1270
  Data_Get_Struct(self, re2_pattern, p);
1295
1271
 
1272
+ int n;
1273
+
1296
1274
  if (RTEST(number_of_matches)) {
1297
1275
  n = NUM2INT(number_of_matches);
1298
1276
 
@@ -1308,17 +1286,21 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
1308
1286
  }
1309
1287
 
1310
1288
  if (n == 0) {
1311
- matched = match(p->pattern, StringValuePtr(text), 0,
1312
- static_cast<int>(RSTRING_LEN(text)), RE2::UNANCHORED, 0, 0);
1289
+ #ifdef HAVE_ENDPOS_ARGUMENT
1290
+ bool matched = p->pattern->Match(RSTRING_PTR(text), 0,
1291
+ RSTRING_LEN(text), RE2::UNANCHORED, 0, 0);
1292
+ #else
1293
+ bool matched = p->pattern->Match(RSTRING_PTR(text), 0, RE2::UNANCHORED,
1294
+ 0, 0);
1295
+ #endif
1313
1296
  return BOOL2RUBY(matched);
1314
1297
  } else {
1315
-
1316
1298
  /* Because match returns the whole match as well. */
1317
1299
  n += 1;
1318
1300
 
1319
- matchdata = rb_class_new_instance(0, 0, re2_cMatchData);
1301
+ VALUE matchdata = rb_class_new_instance(0, 0, re2_cMatchData);
1320
1302
  Data_Get_Struct(matchdata, re2_matchdata, m);
1321
- m->matches = new(nothrow) re2::StringPiece[n];
1303
+ m->matches = new(std::nothrow) re2::StringPiece[n];
1322
1304
  m->regexp = self;
1323
1305
  m->text = rb_str_dup(text);
1324
1306
  rb_str_freeze(m->text);
@@ -1330,10 +1312,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
1330
1312
 
1331
1313
  m->number_of_matches = n;
1332
1314
 
1333
- matched = match(p->pattern, StringValuePtr(m->text), 0,
1334
- static_cast<int>(RSTRING_LEN(m->text)),
1335
- RE2::UNANCHORED, m->matches, n);
1336
-
1315
+ #ifdef HAVE_ENDPOS_ARGUMENT
1316
+ bool matched = p->pattern->Match(RSTRING_PTR(m->text), 0,
1317
+ RSTRING_LEN(m->text), RE2::UNANCHORED, m->matches, n);
1318
+ #else
1319
+ bool matched = p->pattern->Match(RSTRING_PTR(m->text), 0,
1320
+ RE2::UNANCHORED, m->matches, n);
1321
+ #endif
1337
1322
  if (matched) {
1338
1323
  return matchdata;
1339
1324
  } else {
@@ -1348,10 +1333,8 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
1348
1333
  *
1349
1334
  * @return [Boolean] whether the match was successful
1350
1335
  */
1351
- static VALUE re2_regexp_match_p(VALUE self, VALUE text) {
1352
- VALUE argv[2];
1353
- argv[0] = text;
1354
- argv[1] = INT2FIX(0);
1336
+ static VALUE re2_regexp_match_p(const VALUE self, VALUE text) {
1337
+ VALUE argv[2] = { text, INT2FIX(0) };
1355
1338
 
1356
1339
  return re2_regexp_match(2, argv, self);
1357
1340
  }
@@ -1362,16 +1345,18 @@ static VALUE re2_regexp_match_p(VALUE self, VALUE text) {
1362
1345
  * @example
1363
1346
  * c = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
1364
1347
  */
1365
- static VALUE re2_regexp_scan(VALUE self, VALUE text) {
1348
+ static VALUE re2_regexp_scan(const VALUE self, VALUE text) {
1349
+ /* Ensure text is a string. */
1350
+ StringValue(text);
1351
+
1366
1352
  re2_pattern *p;
1367
1353
  re2_scanner *c;
1368
- VALUE scanner;
1369
1354
 
1370
1355
  Data_Get_Struct(self, re2_pattern, p);
1371
- scanner = rb_class_new_instance(0, 0, re2_cScanner);
1356
+ VALUE scanner = rb_class_new_instance(0, 0, re2_cScanner);
1372
1357
  Data_Get_Struct(scanner, re2_scanner, c);
1373
1358
 
1374
- c->input = new(nothrow) re2::StringPiece(StringValuePtr(text));
1359
+ c->input = new(std::nothrow) re2::StringPiece(RSTRING_PTR(text));
1375
1360
  c->regexp = self;
1376
1361
  c->text = text;
1377
1362
 
@@ -1390,6 +1375,10 @@ static VALUE re2_regexp_scan(VALUE self, VALUE text) {
1390
1375
  * Returns a copy of +str+ with the first occurrence +pattern+
1391
1376
  * replaced with +rewrite+.
1392
1377
  *
1378
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1379
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1380
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1381
+ *
1393
1382
  * @param [String] str the string to modify
1394
1383
  * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
1395
1384
  * @param [String] rewrite the string to replace with
@@ -1399,34 +1388,42 @@ static VALUE re2_regexp_scan(VALUE self, VALUE text) {
1399
1388
  * re2 = RE2::Regexp.new("hel+o")
1400
1389
  * RE2.Replace("hello there", re2, "yo") #=> "yo there"
1401
1390
  */
1402
- static VALUE re2_Replace(VALUE self, VALUE str, VALUE pattern,
1391
+ static VALUE re2_Replace(VALUE, VALUE str, VALUE pattern,
1403
1392
  VALUE rewrite) {
1404
- UNUSED(self);
1393
+ /* Ensure rewrite is a string. */
1394
+ StringValue(rewrite);
1395
+
1405
1396
  re2_pattern *p;
1406
1397
 
1407
- /* Convert all the inputs to be pumped into RE2::Replace. */
1408
- string str_as_string(StringValuePtr(str));
1398
+ /* Take a copy of str so it can be modified in-place by
1399
+ * RE2::Replace.
1400
+ */
1401
+ std::string str_as_string(StringValuePtr(str));
1409
1402
 
1410
1403
  /* Do the replacement. */
1411
1404
  if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1412
1405
  Data_Get_Struct(pattern, re2_pattern, p);
1413
- RE2::Replace(&str_as_string, *p->pattern, StringValuePtr(rewrite));
1406
+ RE2::Replace(&str_as_string, *p->pattern, RSTRING_PTR(rewrite));
1414
1407
 
1415
- return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(),
1416
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
1408
+ return encoded_str_new(str_as_string.data(), str_as_string.size(),
1409
+ p->pattern->options().encoding());
1417
1410
  } else {
1418
- RE2::Replace(&str_as_string, StringValuePtr(pattern),
1419
- StringValuePtr(rewrite));
1411
+ /* Ensure pattern is a string. */
1412
+ StringValue(pattern);
1420
1413
 
1421
- return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(),
1422
- pattern);
1423
- }
1414
+ RE2::Replace(&str_as_string, RSTRING_PTR(pattern), RSTRING_PTR(rewrite));
1424
1415
 
1416
+ return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
1417
+ }
1425
1418
  }
1426
1419
 
1427
1420
  /*
1428
1421
  * Return a copy of +str+ with +pattern+ replaced by +rewrite+.
1429
1422
  *
1423
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1424
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1425
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1426
+ *
1430
1427
  * @param [String] str the string to modify
1431
1428
  * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
1432
1429
  * @param [String] rewrite the string to replace with
@@ -1436,27 +1433,32 @@ static VALUE re2_Replace(VALUE self, VALUE str, VALUE pattern,
1436
1433
  * RE2.GlobalReplace("whoops-doops", re2, "e") #=> "wheps-deps"
1437
1434
  * RE2.GlobalReplace("hello there", "e", "i") #=> "hillo thiri"
1438
1435
  */
1439
- static VALUE re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern,
1436
+ static VALUE re2_GlobalReplace(VALUE, VALUE str, VALUE pattern,
1440
1437
  VALUE rewrite) {
1441
- UNUSED(self);
1438
+ /* Ensure rewrite is a string. */
1439
+ StringValue(rewrite);
1442
1440
 
1443
- /* Convert all the inputs to be pumped into RE2::GlobalReplace. */
1441
+ /* Take a copy of str so it can be modified in-place by
1442
+ * RE2::GlobalReplace.
1443
+ */
1444
1444
  re2_pattern *p;
1445
- string str_as_string(StringValuePtr(str));
1445
+ std::string str_as_string(StringValuePtr(str));
1446
1446
 
1447
1447
  /* Do the replacement. */
1448
1448
  if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1449
1449
  Data_Get_Struct(pattern, re2_pattern, p);
1450
- RE2::GlobalReplace(&str_as_string, *p->pattern, StringValuePtr(rewrite));
1450
+ RE2::GlobalReplace(&str_as_string, *p->pattern, RSTRING_PTR(rewrite));
1451
1451
 
1452
- return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(),
1453
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
1452
+ return encoded_str_new(str_as_string.data(), str_as_string.size(),
1453
+ p->pattern->options().encoding());
1454
1454
  } else {
1455
- RE2::GlobalReplace(&str_as_string, StringValuePtr(pattern),
1456
- StringValuePtr(rewrite));
1455
+ /* Ensure pattern is a string. */
1456
+ StringValue(pattern);
1457
+
1458
+ RE2::GlobalReplace(&str_as_string, RSTRING_PTR(pattern),
1459
+ RSTRING_PTR(rewrite));
1457
1460
 
1458
- return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(),
1459
- pattern);
1461
+ return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
1460
1462
  }
1461
1463
  }
1462
1464
 
@@ -1470,13 +1472,15 @@ static VALUE re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern,
1470
1472
  * @example
1471
1473
  * RE2::Regexp.escape("1.5-2.0?") #=> "1\.5\-2\.0\?"
1472
1474
  */
1473
- static VALUE re2_QuoteMeta(VALUE self, VALUE unquoted) {
1474
- UNUSED(self);
1475
- string quoted_string = RE2::QuoteMeta(StringValuePtr(unquoted));
1475
+ static VALUE re2_QuoteMeta(VALUE, VALUE unquoted) {
1476
+ StringValue(unquoted);
1477
+
1478
+ std::string quoted_string = RE2::QuoteMeta(RSTRING_PTR(unquoted));
1479
+
1476
1480
  return rb_str_new(quoted_string.data(), quoted_string.size());
1477
1481
  }
1478
1482
 
1479
- void re2_set_free(re2_set *self) {
1483
+ static void re2_set_free(re2_set *self) {
1480
1484
  if (self->set) {
1481
1485
  delete self->set;
1482
1486
  }
@@ -1486,6 +1490,7 @@ void re2_set_free(re2_set *self) {
1486
1490
  static VALUE re2_set_allocate(VALUE klass) {
1487
1491
  re2_set *s;
1488
1492
  VALUE result = Data_Make_Struct(klass, re2_set, 0, re2_set_free, s);
1493
+
1489
1494
  return result;
1490
1495
  }
1491
1496
 
@@ -1533,18 +1538,13 @@ static VALUE re2_set_allocate(VALUE klass) {
1533
1538
  static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1534
1539
  VALUE anchor, options;
1535
1540
  re2_set *s;
1536
- RE2::Anchor re2_anchor;
1537
- RE2::Options re2_options;
1538
1541
 
1539
1542
  rb_scan_args(argc, argv, "02", &anchor, &options);
1540
1543
  Data_Get_Struct(self, re2_set, s);
1541
1544
 
1542
- if (RTEST(options)) {
1543
- parse_re2_options(re2_options, options);
1544
- }
1545
- if (NIL_P(anchor)) {
1546
- re2_anchor = RE2::UNANCHORED;
1547
- } else {
1545
+ RE2::Anchor re2_anchor = RE2::UNANCHORED;
1546
+
1547
+ if (!NIL_P(anchor)) {
1548
1548
  Check_Type(anchor, T_SYMBOL);
1549
1549
  ID id_anchor = SYM2ID(anchor);
1550
1550
  if (id_anchor == id_unanchored) {
@@ -1558,7 +1558,13 @@ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1558
1558
  }
1559
1559
  }
1560
1560
 
1561
- s->set = new(nothrow) RE2::Set(re2_options, re2_anchor);
1561
+ RE2::Options re2_options;
1562
+
1563
+ if (RTEST(options)) {
1564
+ parse_re2_options(&re2_options, options);
1565
+ }
1566
+
1567
+ s->set = new(std::nothrow) RE2::Set(re2_options, re2_anchor);
1562
1568
  if (s->set == 0) {
1563
1569
  rb_raise(rb_eNoMemError, "not enough memory to allocate RE2::Set object");
1564
1570
  }
@@ -1579,14 +1585,25 @@ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1579
1585
  * set.add("def") #=> 1
1580
1586
  */
1581
1587
  static VALUE re2_set_add(VALUE self, VALUE pattern) {
1582
- Check_Type(pattern, T_STRING);
1583
- re2::StringPiece regex(RSTRING_PTR(pattern), RSTRING_LEN(pattern));
1584
- std::string err;
1588
+ StringValue(pattern);
1589
+
1585
1590
  re2_set *s;
1586
1591
  Data_Get_Struct(self, re2_set, s);
1587
- int index = s->set->Add(regex, &err);
1592
+
1593
+ /* To prevent the memory of the err string leaking when we call rb_raise,
1594
+ * take a copy of it and let it go out of scope.
1595
+ */
1596
+ char msg[100];
1597
+ int index;
1598
+
1599
+ {
1600
+ std::string err;
1601
+ index = s->set->Add(RSTRING_PTR(pattern), &err);
1602
+ strlcpy(msg, err.c_str(), sizeof(msg));
1603
+ }
1604
+
1588
1605
  if (index < 0) {
1589
- rb_raise(rb_eArgError, "str rejected by RE2::Set->Add(): %s", err.c_str());
1606
+ rb_raise(rb_eArgError, "str rejected by RE2::Set->Add(): %s", msg);
1590
1607
  }
1591
1608
 
1592
1609
  return INT2FIX(index);
@@ -1616,8 +1633,7 @@ static VALUE re2_set_compile(VALUE self) {
1616
1633
  *
1617
1634
  * @return [Bool] whether the underlying re2 outputs error information from Set matches
1618
1635
  */
1619
- static VALUE re2_set_match_raises_errors_p(VALUE self) {
1620
- UNUSED(self);
1636
+ static VALUE re2_set_match_raises_errors_p(VALUE) {
1621
1637
  #ifdef HAVE_ERROR_INFO_ARGUMENT
1622
1638
  return Qtrue;
1623
1639
  #else
@@ -1665,29 +1681,30 @@ static VALUE re2_set_match_raises_errors_p(VALUE self) {
1665
1681
  * set.compile
1666
1682
  * set.match("abcdef", :exception => true) # => [0, 1]
1667
1683
  */
1668
- static VALUE re2_set_match(int argc, VALUE *argv, VALUE self) {
1669
- VALUE str, options, exception_option;
1684
+ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
1685
+ VALUE str, options;
1670
1686
  bool raise_exception = true;
1671
1687
  rb_scan_args(argc, argv, "11", &str, &options);
1672
- Check_Type(str, T_STRING);
1673
- re2::StringPiece data(RSTRING_PTR(str), RSTRING_LEN(str));
1674
- std::vector<int> v;
1688
+
1689
+ StringValue(str);
1675
1690
  re2_set *s;
1676
1691
  Data_Get_Struct(self, re2_set, s);
1677
1692
 
1678
1693
  if (RTEST(options)) {
1679
1694
  Check_Type(options, T_HASH);
1680
1695
 
1681
- exception_option = rb_hash_aref(options, ID2SYM(id_exception));
1696
+ VALUE exception_option = rb_hash_aref(options, ID2SYM(id_exception));
1682
1697
  if (!NIL_P(exception_option)) {
1683
1698
  raise_exception = RTEST(exception_option);
1684
1699
  }
1685
1700
  }
1686
1701
 
1702
+ std::vector<int> v;
1703
+
1687
1704
  if (raise_exception) {
1688
1705
  #ifdef HAVE_ERROR_INFO_ARGUMENT
1689
1706
  RE2::Set::ErrorInfo e;
1690
- bool match_failed = !s->set->Match(data, &v, &e);
1707
+ bool match_failed = !s->set->Match(RSTRING_PTR(str), &v, &e);
1691
1708
  VALUE result = rb_ary_new2(v.size());
1692
1709
 
1693
1710
  if (match_failed) {
@@ -1704,7 +1721,7 @@ static VALUE re2_set_match(int argc, VALUE *argv, VALUE self) {
1704
1721
  rb_raise(re2_eSetMatchError, "Unknown RE2::Set::ErrorKind: %d", e.kind);
1705
1722
  }
1706
1723
  } else {
1707
- for (size_t i = 0; i < v.size(); i++) {
1724
+ for (std::vector<int>::size_type i = 0; i < v.size(); ++i) {
1708
1725
  rb_ary_push(result, INT2FIX(v[i]));
1709
1726
  }
1710
1727
  }
@@ -1714,11 +1731,11 @@ static VALUE re2_set_match(int argc, VALUE *argv, VALUE self) {
1714
1731
  rb_raise(re2_eSetUnsupportedError, "current version of RE2::Set::Match() does not output error information, :exception option can only be set to false");
1715
1732
  #endif
1716
1733
  } else {
1717
- bool matched = s->set->Match(data, &v);
1734
+ bool matched = s->set->Match(RSTRING_PTR(str), &v);
1718
1735
  VALUE result = rb_ary_new2(v.size());
1719
1736
 
1720
1737
  if (matched) {
1721
- for (size_t i = 0; i < v.size(); i++) {
1738
+ for (std::vector<int>::size_type i = 0; i < v.size(); ++i) {
1722
1739
  rb_ary_push(result, INT2FIX(v[i]));
1723
1740
  }
1724
1741
  }
@@ -1727,12 +1744,7 @@ static VALUE re2_set_match(int argc, VALUE *argv, VALUE self) {
1727
1744
  }
1728
1745
  }
1729
1746
 
1730
- /* Forward declare Init_re2 to be called by C code but define it separately so
1731
- * that YARD can parse it.
1732
- */
1733
- extern "C" void Init_re2(void);
1734
-
1735
- void Init_re2(void) {
1747
+ extern "C" void Init_re2(void) {
1736
1748
  re2_mRE2 = rb_define_module("RE2");
1737
1749
  re2_cRegexp = rb_define_class_under(re2_mRE2, "Regexp", rb_cObject);
1738
1750
  re2_cMatchData = rb_define_class_under(re2_mRE2, "MatchData", rb_cObject);
@@ -1868,7 +1880,7 @@ void Init_re2(void) {
1868
1880
  rb_define_singleton_method(re2_cRegexp, "compile",
1869
1881
  RUBY_METHOD_FUNC(rb_class_new_instance), -1);
1870
1882
 
1871
- rb_define_global_function("RE2", RUBY_METHOD_FUNC(re2_re2), -1);
1883
+ rb_define_module_function(rb_mKernel, "RE2", RUBY_METHOD_FUNC(re2_re2), -1);
1872
1884
 
1873
1885
  /* Create the symbols used in options. */
1874
1886
  id_utf8 = rb_intern("utf8");