re2 2.0.0 → 2.1.2

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/ext/re2/re2.cc CHANGED
@@ -6,77 +6,21 @@
6
6
  * Released under the BSD Licence, please see LICENSE.txt
7
7
  */
8
8
 
9
- #include <ruby.h>
10
- #include <re2/re2.h>
11
- #include <re2/set.h>
12
9
  #include <stdint.h>
13
- #include <string>
10
+
11
+ #include <map>
14
12
  #include <sstream>
13
+ #include <string>
15
14
  #include <vector>
16
- using std::string;
17
- using std::ostringstream;
18
- using std::nothrow;
19
- using std::map;
20
- using std::vector;
15
+
16
+ #include <re2/re2.h>
17
+ #include <re2/set.h>
18
+ #include <ruby.h>
19
+ #include <ruby/encoding.h>
21
20
 
22
21
  #define BOOL2RUBY(v) (v ? Qtrue : Qfalse)
23
22
  #define UNUSED(x) ((void)x)
24
23
 
25
- #ifndef RSTRING_LEN
26
- #define RSTRING_LEN(x) (RSTRING(x)->len)
27
- #endif
28
-
29
- #ifndef RSTRING_PTR
30
- #define RSTRING_PTR(x) (RSTRING(x)->ptr)
31
- #endif
32
-
33
- #ifdef HAVE_RUBY_ENCODING_H
34
- #include <ruby/encoding.h>
35
- #define ENCODED_STR_NEW(str, length, encoding) \
36
- ({ \
37
- VALUE _string = rb_str_new(str, length); \
38
- int _enc = rb_enc_find_index(encoding); \
39
- rb_enc_associate_index(_string, _enc); \
40
- _string; \
41
- })
42
- #define ENCODED_STR_NEW2(str, length, str2) \
43
- ({ \
44
- VALUE _string = rb_str_new(str, length); \
45
- int _enc = rb_enc_get_index(str2); \
46
- rb_enc_associate_index(_string, _enc); \
47
- _string; \
48
- })
49
- #else
50
- #define ENCODED_STR_NEW(str, length, encoding) \
51
- rb_str_new((const char *)str, (long)length)
52
- #define ENCODED_STR_NEW2(str, length, str2) \
53
- rb_str_new((const char *)str, (long)length)
54
- #endif
55
-
56
- #ifdef HAVE_RB_STR_SUBLEN
57
- #define ENCODED_STR_SUBLEN(str, offset, encoding) \
58
- LONG2NUM(rb_str_sublen(str, offset))
59
- #else
60
- #ifdef HAVE_RUBY_ENCODING_H
61
- #define ENCODED_STR_SUBLEN(str, offset, encoding) \
62
- ({ \
63
- VALUE _string = ENCODED_STR_NEW(RSTRING_PTR(str), offset, encoding); \
64
- rb_str_length(_string); \
65
- })
66
- #else
67
- #define ENCODED_STR_SUBLEN(str, offset, encoding) \
68
- LONG2NUM(offset)
69
- #endif
70
- #endif
71
-
72
- #ifdef HAVE_ENDPOS_ARGUMENT
73
- #define match(pattern, text, startpos, endpos, anchor, match, nmatch) \
74
- (pattern->Match(text, startpos, endpos, anchor, match, nmatch))
75
- #else
76
- #define match(pattern, text, startpos, endpos, anchor, match, nmatch) \
77
- (pattern->Match(text, startpos, anchor, match, nmatch))
78
- #endif
79
-
80
24
  typedef struct {
81
25
  RE2 *pattern;
82
26
  } re2_pattern;
@@ -107,95 +51,103 @@ static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors,
107
51
  id_perl_classes, id_word_boundary, id_one_line,
108
52
  id_unanchored, id_anchor_start, id_anchor_both, id_exception;
109
53
 
110
- void parse_re2_options(RE2::Options& re2_options, VALUE options) {
54
+ inline VALUE encoded_str_new(const char *str, long length, RE2::Options::Encoding encoding) {
55
+ if (encoding == RE2::Options::EncodingUTF8) {
56
+ return rb_utf8_str_new(str, length);
57
+ }
58
+
59
+ VALUE string = rb_str_new(str, length);
60
+ rb_enc_associate_index(string, rb_enc_find_index("ISO-8859-1"));
61
+
62
+ return string;
63
+ }
64
+
65
+ static void parse_re2_options(RE2::Options* re2_options, const VALUE options) {
111
66
  if (TYPE(options) != T_HASH) {
112
67
  rb_raise(rb_eArgError, "options should be a hash");
113
68
  }
114
- VALUE utf8, posix_syntax, longest_match, log_errors,
115
- max_mem, literal, never_nl, case_sensitive, perl_classes,
116
- word_boundary, one_line;
117
69
 
118
- utf8 = rb_hash_aref(options, ID2SYM(id_utf8));
70
+ VALUE utf8 = rb_hash_aref(options, ID2SYM(id_utf8));
119
71
  if (!NIL_P(utf8)) {
120
- re2_options.set_encoding(RTEST(utf8) ? RE2::Options::EncodingUTF8 : RE2::Options::EncodingLatin1);
72
+ re2_options->set_encoding(RTEST(utf8) ? RE2::Options::EncodingUTF8 : RE2::Options::EncodingLatin1);
121
73
  }
122
74
 
123
- posix_syntax = rb_hash_aref(options, ID2SYM(id_posix_syntax));
75
+ VALUE posix_syntax = rb_hash_aref(options, ID2SYM(id_posix_syntax));
124
76
  if (!NIL_P(posix_syntax)) {
125
- re2_options.set_posix_syntax(RTEST(posix_syntax));
77
+ re2_options->set_posix_syntax(RTEST(posix_syntax));
126
78
  }
127
79
 
128
- longest_match = rb_hash_aref(options, ID2SYM(id_longest_match));
80
+ VALUE longest_match = rb_hash_aref(options, ID2SYM(id_longest_match));
129
81
  if (!NIL_P(longest_match)) {
130
- re2_options.set_longest_match(RTEST(longest_match));
82
+ re2_options->set_longest_match(RTEST(longest_match));
131
83
  }
132
84
 
133
- log_errors = rb_hash_aref(options, ID2SYM(id_log_errors));
85
+ VALUE log_errors = rb_hash_aref(options, ID2SYM(id_log_errors));
134
86
  if (!NIL_P(log_errors)) {
135
- re2_options.set_log_errors(RTEST(log_errors));
87
+ re2_options->set_log_errors(RTEST(log_errors));
136
88
  }
137
89
 
138
- max_mem = rb_hash_aref(options, ID2SYM(id_max_mem));
90
+ VALUE max_mem = rb_hash_aref(options, ID2SYM(id_max_mem));
139
91
  if (!NIL_P(max_mem)) {
140
- re2_options.set_max_mem(NUM2INT(max_mem));
92
+ re2_options->set_max_mem(NUM2INT(max_mem));
141
93
  }
142
94
 
143
- literal = rb_hash_aref(options, ID2SYM(id_literal));
95
+ VALUE literal = rb_hash_aref(options, ID2SYM(id_literal));
144
96
  if (!NIL_P(literal)) {
145
- re2_options.set_literal(RTEST(literal));
97
+ re2_options->set_literal(RTEST(literal));
146
98
  }
147
99
 
148
- never_nl = rb_hash_aref(options, ID2SYM(id_never_nl));
100
+ VALUE never_nl = rb_hash_aref(options, ID2SYM(id_never_nl));
149
101
  if (!NIL_P(never_nl)) {
150
- re2_options.set_never_nl(RTEST(never_nl));
102
+ re2_options->set_never_nl(RTEST(never_nl));
151
103
  }
152
104
 
153
- case_sensitive = rb_hash_aref(options, ID2SYM(id_case_sensitive));
105
+ VALUE case_sensitive = rb_hash_aref(options, ID2SYM(id_case_sensitive));
154
106
  if (!NIL_P(case_sensitive)) {
155
- re2_options.set_case_sensitive(RTEST(case_sensitive));
107
+ re2_options->set_case_sensitive(RTEST(case_sensitive));
156
108
  }
157
109
 
158
- perl_classes = rb_hash_aref(options, ID2SYM(id_perl_classes));
110
+ VALUE perl_classes = rb_hash_aref(options, ID2SYM(id_perl_classes));
159
111
  if (!NIL_P(perl_classes)) {
160
- re2_options.set_perl_classes(RTEST(perl_classes));
112
+ re2_options->set_perl_classes(RTEST(perl_classes));
161
113
  }
162
114
 
163
- word_boundary = rb_hash_aref(options, ID2SYM(id_word_boundary));
115
+ VALUE word_boundary = rb_hash_aref(options, ID2SYM(id_word_boundary));
164
116
  if (!NIL_P(word_boundary)) {
165
- re2_options.set_word_boundary(RTEST(word_boundary));
117
+ re2_options->set_word_boundary(RTEST(word_boundary));
166
118
  }
167
119
 
168
- one_line = rb_hash_aref(options, ID2SYM(id_one_line));
120
+ VALUE one_line = rb_hash_aref(options, ID2SYM(id_one_line));
169
121
  if (!NIL_P(one_line)) {
170
- re2_options.set_one_line(RTEST(one_line));
122
+ re2_options->set_one_line(RTEST(one_line));
171
123
  }
172
124
  }
173
125
 
174
- void re2_matchdata_mark(re2_matchdata* self) {
126
+ static void re2_matchdata_mark(re2_matchdata* self) {
175
127
  rb_gc_mark(self->regexp);
176
128
  rb_gc_mark(self->text);
177
129
  }
178
130
 
179
- void re2_matchdata_free(re2_matchdata* self) {
131
+ static void re2_matchdata_free(re2_matchdata* self) {
180
132
  if (self->matches) {
181
133
  delete[] self->matches;
182
134
  }
183
135
  free(self);
184
136
  }
185
137
 
186
- void re2_scanner_mark(re2_scanner* self) {
138
+ static void re2_scanner_mark(re2_scanner* self) {
187
139
  rb_gc_mark(self->regexp);
188
140
  rb_gc_mark(self->text);
189
141
  }
190
142
 
191
- void re2_scanner_free(re2_scanner* self) {
143
+ static void re2_scanner_free(re2_scanner* self) {
192
144
  if (self->input) {
193
145
  delete self->input;
194
146
  }
195
147
  free(self);
196
148
  }
197
149
 
198
- void re2_regexp_free(re2_pattern* self) {
150
+ static void re2_regexp_free(re2_pattern* self) {
199
151
  if (self->pattern) {
200
152
  delete self->pattern;
201
153
  }
@@ -204,12 +156,14 @@ void re2_regexp_free(re2_pattern* self) {
204
156
 
205
157
  static VALUE re2_matchdata_allocate(VALUE klass) {
206
158
  re2_matchdata *m;
159
+
207
160
  return Data_Make_Struct(klass, re2_matchdata, re2_matchdata_mark,
208
161
  re2_matchdata_free, m);
209
162
  }
210
163
 
211
164
  static VALUE re2_scanner_allocate(VALUE klass) {
212
165
  re2_scanner *c;
166
+
213
167
  return Data_Make_Struct(klass, re2_scanner, re2_scanner_mark,
214
168
  re2_scanner_free, c);
215
169
  }
@@ -222,7 +176,7 @@ static VALUE re2_scanner_allocate(VALUE klass) {
222
176
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
223
177
  * m.string #=> "bob 123"
224
178
  */
225
- static VALUE re2_matchdata_string(VALUE self) {
179
+ static VALUE re2_matchdata_string(const VALUE self) {
226
180
  re2_matchdata *m;
227
181
  Data_Get_Struct(self, re2_matchdata, m);
228
182
 
@@ -237,7 +191,7 @@ static VALUE re2_matchdata_string(VALUE self) {
237
191
  * c = RE2::Regexp.new('(\d+)').scan("foo")
238
192
  * c.string #=> "foo"
239
193
  */
240
- static VALUE re2_scanner_string(VALUE self) {
194
+ static VALUE re2_scanner_string(const VALUE self) {
241
195
  re2_scanner *c;
242
196
  Data_Get_Struct(self, re2_scanner, c);
243
197
 
@@ -252,7 +206,7 @@ static VALUE re2_scanner_string(VALUE self) {
252
206
  * c = RE2::Regexp.new('(\d+)').scan("foo")
253
207
  * c.eof? #=> true
254
208
  */
255
- static VALUE re2_scanner_eof(VALUE self) {
209
+ static VALUE re2_scanner_eof(const VALUE self) {
256
210
  re2_scanner *c;
257
211
  Data_Get_Struct(self, re2_scanner, c);
258
212
 
@@ -274,7 +228,7 @@ static VALUE re2_scanner_rewind(VALUE self) {
274
228
  re2_scanner *c;
275
229
  Data_Get_Struct(self, re2_scanner, c);
276
230
 
277
- c->input = new(nothrow) re2::StringPiece(StringValuePtr(c->text));
231
+ c->input = new(std::nothrow) re2::StringPiece(StringValuePtr(c->text));
278
232
  c->eof = false;
279
233
 
280
234
  return self;
@@ -284,6 +238,10 @@ static VALUE re2_scanner_rewind(VALUE self) {
284
238
  * Scan the given text incrementally for matches, returning an array of
285
239
  * matches on each subsequent call. Returns nil if no matches are found.
286
240
  *
241
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
242
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
243
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
244
+ *
287
245
  * @return [Array<String>] the matches.
288
246
  * @example
289
247
  * s = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
@@ -291,45 +249,41 @@ static VALUE re2_scanner_rewind(VALUE self) {
291
249
  * s.scan #=> ["bar"]
292
250
  */
293
251
  static VALUE re2_scanner_scan(VALUE self) {
294
- int i;
295
- size_t original_input_size, new_input_size;
296
- bool input_advanced;
297
252
  re2_pattern *p;
298
253
  re2_scanner *c;
299
- VALUE result;
300
254
 
301
255
  Data_Get_Struct(self, re2_scanner, c);
302
256
  Data_Get_Struct(c->regexp, re2_pattern, p);
303
257
 
304
- vector<RE2::Arg> argv(c->number_of_capturing_groups);
305
- vector<RE2::Arg*> args(c->number_of_capturing_groups);
306
- vector<string> matches(c->number_of_capturing_groups);
258
+ std::vector<RE2::Arg> argv(c->number_of_capturing_groups);
259
+ std::vector<RE2::Arg*> args(c->number_of_capturing_groups);
260
+ std::vector<std::string> matches(c->number_of_capturing_groups);
307
261
 
308
262
  if (c->eof) {
309
263
  return Qnil;
310
264
  }
311
265
 
312
- original_input_size = c->input->size();
266
+ re2::StringPiece::size_type original_input_size = c->input->size();
313
267
 
314
- for (i = 0; i < c->number_of_capturing_groups; i++) {
315
- matches[i] = "";
268
+ for (int i = 0; i < c->number_of_capturing_groups; ++i) {
316
269
  argv[i] = &matches[i];
317
270
  args[i] = &argv[i];
318
271
  }
319
272
 
320
273
  if (RE2::FindAndConsumeN(c->input, *p->pattern, &args[0],
321
274
  c->number_of_capturing_groups)) {
322
- result = rb_ary_new2(c->number_of_capturing_groups);
323
- new_input_size = c->input->size();
324
- input_advanced = new_input_size < original_input_size;
275
+ re2::StringPiece::size_type new_input_size = c->input->size();
276
+ bool input_advanced = new_input_size < original_input_size;
325
277
 
326
- for (i = 0; i < c->number_of_capturing_groups; i++) {
278
+ VALUE result = rb_ary_new2(c->number_of_capturing_groups);
279
+
280
+ for (int i = 0; i < c->number_of_capturing_groups; ++i) {
327
281
  if (matches[i].empty()) {
328
282
  rb_ary_push(result, Qnil);
329
283
  } else {
330
- rb_ary_push(result, ENCODED_STR_NEW(matches[i].data(),
284
+ rb_ary_push(result, encoded_str_new(matches[i].data(),
331
285
  matches[i].size(),
332
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1"));
286
+ p->pattern->options().encoding()));
333
287
  }
334
288
  }
335
289
 
@@ -340,47 +294,41 @@ static VALUE re2_scanner_scan(VALUE self) {
340
294
  if (!input_advanced && new_input_size > 0) {
341
295
  c->input->remove_prefix(1);
342
296
  }
297
+
298
+ return result;
343
299
  } else {
344
- result = Qnil;
300
+ return Qnil;
345
301
  }
346
-
347
- return result;
348
302
  }
349
303
 
350
304
  /*
351
305
  * Retrieve a matchdata by index or name.
352
306
  */
353
- re2::StringPiece *re2_matchdata_find_match(VALUE idx, VALUE self) {
354
- int id;
307
+ static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) {
355
308
  re2_matchdata *m;
356
309
  re2_pattern *p;
357
- map<string, int> groups;
358
- string name;
359
- re2::StringPiece *match;
360
310
 
361
311
  Data_Get_Struct(self, re2_matchdata, m);
362
312
  Data_Get_Struct(m->regexp, re2_pattern, p);
363
313
 
314
+ int id;
315
+
364
316
  if (FIXNUM_P(idx)) {
365
317
  id = FIX2INT(idx);
366
318
  } else {
367
- if (SYMBOL_P(idx)) {
368
- name = rb_id2name(SYM2ID(idx));
369
- } else {
370
- name = StringValuePtr(idx);
371
- }
372
-
373
- groups = p->pattern->NamedCapturingGroups();
319
+ const char *name = SYMBOL_P(idx) ? rb_id2name(SYM2ID(idx)) : StringValuePtr(idx);
320
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
321
+ std::map<std::string, int>::const_iterator search = groups.find(name);
374
322
 
375
- if (groups.count(name) == 1) {
376
- id = groups[name];
323
+ if (search != groups.end()) {
324
+ id = search->second;
377
325
  } else {
378
326
  return NULL;
379
327
  }
380
328
  }
381
329
 
382
330
  if (id >= 0 && id < m->number_of_matches) {
383
- match = &m->matches[id];
331
+ re2::StringPiece *match = &m->matches[id];
384
332
 
385
333
  if (!match->empty()) {
386
334
  return match;
@@ -399,7 +347,7 @@ re2::StringPiece *re2_matchdata_find_match(VALUE idx, VALUE self) {
399
347
  * m.size #=> 2
400
348
  * m.length #=> 2
401
349
  */
402
- static VALUE re2_matchdata_size(VALUE self) {
350
+ static VALUE re2_matchdata_size(const VALUE self) {
403
351
  re2_matchdata *m;
404
352
  Data_Get_Struct(self, re2_matchdata, m);
405
353
 
@@ -416,23 +364,18 @@ static VALUE re2_matchdata_size(VALUE self) {
416
364
  * m.begin(0) #=> 1
417
365
  * m.begin(1) #=> 4
418
366
  */
419
- static VALUE re2_matchdata_begin(VALUE self, VALUE n) {
367
+ static VALUE re2_matchdata_begin(const VALUE self, VALUE n) {
420
368
  re2_matchdata *m;
421
- re2_pattern *p;
422
- re2::StringPiece *match;
423
- long offset;
424
369
 
425
370
  Data_Get_Struct(self, re2_matchdata, m);
426
- Data_Get_Struct(m->regexp, re2_pattern, p);
427
371
 
428
- match = re2_matchdata_find_match(n, self);
372
+ re2::StringPiece *match = re2_matchdata_find_match(n, self);
429
373
  if (match == NULL) {
430
374
  return Qnil;
431
375
  } else {
432
- offset = reinterpret_cast<uintptr_t>(match->data()) - reinterpret_cast<uintptr_t>(StringValuePtr(m->text));
376
+ long offset = match->data() - StringValuePtr(m->text);
433
377
 
434
- return ENCODED_STR_SUBLEN(StringValue(m->text), offset,
435
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
378
+ return LONG2NUM(rb_str_sublen(StringValue(m->text), offset));
436
379
  }
437
380
  }
438
381
 
@@ -446,24 +389,18 @@ static VALUE re2_matchdata_begin(VALUE self, VALUE n) {
446
389
  * m.end(0) #=> 9
447
390
  * m.end(1) #=> 7
448
391
  */
449
- static VALUE re2_matchdata_end(VALUE self, VALUE n) {
392
+ static VALUE re2_matchdata_end(const VALUE self, VALUE n) {
450
393
  re2_matchdata *m;
451
- re2_pattern *p;
452
- re2::StringPiece *match;
453
- long offset;
454
394
 
455
395
  Data_Get_Struct(self, re2_matchdata, m);
456
- Data_Get_Struct(m->regexp, re2_pattern, p);
457
-
458
- match = re2_matchdata_find_match(n, self);
459
396
 
397
+ re2::StringPiece *match = re2_matchdata_find_match(n, self);
460
398
  if (match == NULL) {
461
399
  return Qnil;
462
400
  } else {
463
- offset = reinterpret_cast<uintptr_t>(match->data()) - reinterpret_cast<uintptr_t>(StringValuePtr(m->text)) + match->size();
401
+ long offset = (match->data() - StringValuePtr(m->text)) + match->size();
464
402
 
465
- return ENCODED_STR_SUBLEN(StringValue(m->text), offset,
466
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
403
+ return LONG2NUM(rb_str_sublen(StringValue(m->text), offset));
467
404
  }
468
405
  }
469
406
 
@@ -475,9 +412,10 @@ static VALUE re2_matchdata_end(VALUE self, VALUE n) {
475
412
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
476
413
  * m.regexp #=> #<RE2::Regexp /(\d+)/>
477
414
  */
478
- static VALUE re2_matchdata_regexp(VALUE self) {
415
+ static VALUE re2_matchdata_regexp(const VALUE self) {
479
416
  re2_matchdata *m;
480
417
  Data_Get_Struct(self, re2_matchdata, m);
418
+
481
419
  return m->regexp;
482
420
  }
483
421
 
@@ -489,7 +427,7 @@ static VALUE re2_matchdata_regexp(VALUE self) {
489
427
  * c = RE2::Regexp.new('(\d+)').scan("bob 123")
490
428
  * c.regexp #=> #<RE2::Regexp /(\d+)/>
491
429
  */
492
- static VALUE re2_scanner_regexp(VALUE self) {
430
+ static VALUE re2_scanner_regexp(const VALUE self) {
493
431
  re2_scanner *c;
494
432
  Data_Get_Struct(self, re2_scanner, c);
495
433
 
@@ -498,46 +436,47 @@ static VALUE re2_scanner_regexp(VALUE self) {
498
436
 
499
437
  static VALUE re2_regexp_allocate(VALUE klass) {
500
438
  re2_pattern *p;
439
+
501
440
  return Data_Make_Struct(klass, re2_pattern, 0, re2_regexp_free, p);
502
441
  }
503
442
 
504
443
  /*
505
444
  * Returns the array of matches.
506
445
  *
446
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
447
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
448
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
449
+ *
507
450
  * @return [Array<String, nil>] the array of matches
508
451
  * @example
509
452
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
510
453
  * m.to_a #=> ["123", "123"]
511
454
  */
512
- static VALUE re2_matchdata_to_a(VALUE self) {
513
- int i;
455
+ static VALUE re2_matchdata_to_a(const VALUE self) {
514
456
  re2_matchdata *m;
515
457
  re2_pattern *p;
516
- re2::StringPiece *match;
517
- VALUE array;
518
458
 
519
459
  Data_Get_Struct(self, re2_matchdata, m);
520
460
  Data_Get_Struct(m->regexp, re2_pattern, p);
521
461
 
522
- array = rb_ary_new2(m->number_of_matches);
523
- for (i = 0; i < m->number_of_matches; i++) {
524
- match = &m->matches[i];
462
+ VALUE array = rb_ary_new2(m->number_of_matches);
463
+ for (int i = 0; i < m->number_of_matches; ++i) {
464
+ re2::StringPiece *match = &m->matches[i];
525
465
 
526
466
  if (match->empty()) {
527
467
  rb_ary_push(array, Qnil);
528
468
  } else {
529
- rb_ary_push(array, ENCODED_STR_NEW(match->data(), match->size(),
530
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1"));
469
+ rb_ary_push(array, encoded_str_new(match->data(), match->size(),
470
+ p->pattern->options().encoding()));
531
471
  }
532
472
  }
533
473
 
534
474
  return array;
535
475
  }
536
476
 
537
- static VALUE re2_matchdata_nth_match(int nth, VALUE self) {
477
+ static VALUE re2_matchdata_nth_match(int nth, const VALUE self) {
538
478
  re2_matchdata *m;
539
479
  re2_pattern *p;
540
- re2::StringPiece *match;
541
480
 
542
481
  Data_Get_Struct(self, re2_matchdata, m);
543
482
  Data_Get_Struct(m->regexp, re2_pattern, p);
@@ -545,32 +484,29 @@ static VALUE re2_matchdata_nth_match(int nth, VALUE self) {
545
484
  if (nth < 0 || nth >= m->number_of_matches) {
546
485
  return Qnil;
547
486
  } else {
548
- match = &m->matches[nth];
487
+ re2::StringPiece *match = &m->matches[nth];
549
488
 
550
489
  if (match->empty()) {
551
490
  return Qnil;
552
491
  } else {
553
- return ENCODED_STR_NEW(match->data(), match->size(),
554
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
492
+ return encoded_str_new(match->data(), match->size(),
493
+ p->pattern->options().encoding());
555
494
  }
556
495
  }
557
496
  }
558
497
 
559
- static VALUE re2_matchdata_named_match(const char* name, VALUE self) {
560
- int idx;
498
+ static VALUE re2_matchdata_named_match(const char* name, const VALUE self) {
561
499
  re2_matchdata *m;
562
500
  re2_pattern *p;
563
- map<string, int> groups;
564
- string name_as_string(name);
565
501
 
566
502
  Data_Get_Struct(self, re2_matchdata, m);
567
503
  Data_Get_Struct(m->regexp, re2_pattern, p);
568
504
 
569
- groups = p->pattern->NamedCapturingGroups();
505
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
506
+ std::map<std::string, int>::const_iterator search = groups.find(name);
570
507
 
571
- if (groups.count(name_as_string) == 1) {
572
- idx = groups[name_as_string];
573
- return re2_matchdata_nth_match(idx, self);
508
+ if (search != groups.end()) {
509
+ return re2_matchdata_nth_match(search->second, self);
574
510
  } else {
575
511
  return Qnil;
576
512
  }
@@ -579,6 +515,10 @@ static VALUE re2_matchdata_named_match(const char* name, VALUE self) {
579
515
  /*
580
516
  * Retrieve zero, one or more matches by index or name.
581
517
  *
518
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
519
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
520
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
521
+ *
582
522
  * @return [Array<String, nil>, String, Boolean]
583
523
  *
584
524
  * @overload [](index)
@@ -619,7 +559,7 @@ static VALUE re2_matchdata_named_match(const char* name, VALUE self) {
619
559
  * m["number"] #=> "123"
620
560
  * m[:number] #=> "123"
621
561
  */
622
- static VALUE re2_matchdata_aref(int argc, VALUE *argv, VALUE self) {
562
+ static VALUE re2_matchdata_aref(int argc, VALUE *argv, const VALUE self) {
623
563
  VALUE idx, rest;
624
564
  rb_scan_args(argc, argv, "11", &idx, &rest);
625
565
 
@@ -639,38 +579,40 @@ static VALUE re2_matchdata_aref(int argc, VALUE *argv, VALUE self) {
639
579
  *
640
580
  * @return [String] the entire matched string
641
581
  */
642
- static VALUE re2_matchdata_to_s(VALUE self) {
582
+ static VALUE re2_matchdata_to_s(const VALUE self) {
643
583
  return re2_matchdata_nth_match(0, self);
644
584
  }
645
585
 
646
586
  /*
647
587
  * Returns a printable version of the match.
648
588
  *
589
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
590
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
591
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
592
+ *
649
593
  * @return [String] a printable version of the match
650
594
  * @example
651
595
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
652
596
  * m.inspect #=> "#<RE2::MatchData \"123\" 1:\"123\">"
653
597
  */
654
- static VALUE re2_matchdata_inspect(VALUE self) {
655
- int i;
598
+ static VALUE re2_matchdata_inspect(const VALUE self) {
656
599
  re2_matchdata *m;
657
600
  re2_pattern *p;
658
- VALUE match, result;
659
- ostringstream output;
660
601
 
661
602
  Data_Get_Struct(self, re2_matchdata, m);
662
603
  Data_Get_Struct(m->regexp, re2_pattern, p);
663
604
 
605
+ std::ostringstream output;
664
606
  output << "#<RE2::MatchData";
665
607
 
666
- for (i = 0; i < m->number_of_matches; i++) {
608
+ for (int i = 0; i < m->number_of_matches; ++i) {
667
609
  output << " ";
668
610
 
669
611
  if (i > 0) {
670
612
  output << i << ":";
671
613
  }
672
614
 
673
- match = re2_matchdata_nth_match(i, self);
615
+ VALUE match = re2_matchdata_nth_match(i, self);
674
616
 
675
617
  if (match == Qnil) {
676
618
  output << "nil";
@@ -681,15 +623,17 @@ static VALUE re2_matchdata_inspect(VALUE self) {
681
623
 
682
624
  output << ">";
683
625
 
684
- result = ENCODED_STR_NEW(output.str().data(), output.str().length(),
685
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
686
-
687
- return result;
626
+ return encoded_str_new(output.str().data(), output.str().length(),
627
+ p->pattern->options().encoding());
688
628
  }
689
629
 
690
630
  /*
691
631
  * Returns the array of submatches for pattern matching.
692
632
  *
633
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
634
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
635
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
636
+ *
693
637
  * @return [Array<String, nil>] the array of submatches
694
638
  * @example
695
639
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
@@ -703,25 +647,22 @@ static VALUE re2_matchdata_inspect(VALUE self) {
703
647
  * puts "Unrecognised match"
704
648
  * end
705
649
  */
706
- static VALUE re2_matchdata_deconstruct(VALUE self) {
707
- int i;
650
+ static VALUE re2_matchdata_deconstruct(const VALUE self) {
708
651
  re2_matchdata *m;
709
652
  re2_pattern *p;
710
- re2::StringPiece *match;
711
- VALUE array;
712
653
 
713
654
  Data_Get_Struct(self, re2_matchdata, m);
714
655
  Data_Get_Struct(m->regexp, re2_pattern, p);
715
656
 
716
- array = rb_ary_new2(m->number_of_matches - 1);
717
- for (i = 1; i < m->number_of_matches; i++) {
718
- match = &m->matches[i];
657
+ VALUE array = rb_ary_new2(m->number_of_matches - 1);
658
+ for (int i = 1; i < m->number_of_matches; ++i) {
659
+ re2::StringPiece *match = &m->matches[i];
719
660
 
720
661
  if (match->empty()) {
721
662
  rb_ary_push(array, Qnil);
722
663
  } else {
723
- rb_ary_push(array, ENCODED_STR_NEW(match->data(), match->size(),
724
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1"));
664
+ rb_ary_push(array, encoded_str_new(match->data(), match->size(),
665
+ p->pattern->options().encoding()));
725
666
  }
726
667
  }
727
668
 
@@ -735,6 +676,10 @@ static VALUE re2_matchdata_deconstruct(VALUE self) {
735
676
  * more keys than there are capturing groups. Given keys will populate the hash in
736
677
  * order but an invalid name will cause the hash to be immediately returned.
737
678
  *
679
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
680
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
681
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
682
+ *
738
683
  * @return [Hash] a hash of capturing group names to submatches
739
684
  * @param [Array<Symbol>, nil] keys an array of Symbol capturing group names or nil to return all names
740
685
  * @example
@@ -752,40 +697,37 @@ static VALUE re2_matchdata_deconstruct(VALUE self) {
752
697
  * puts "Unrecognised match"
753
698
  * end
754
699
  */
755
- static VALUE re2_matchdata_deconstruct_keys(VALUE self, VALUE keys) {
756
- int i;
757
- VALUE capturing_groups, key;
700
+ static VALUE re2_matchdata_deconstruct_keys(const VALUE self, const VALUE keys) {
758
701
  re2_matchdata *m;
759
702
  re2_pattern *p;
760
- map<string, int> groups;
761
- map<string, int>::iterator iterator;
762
703
 
763
704
  Data_Get_Struct(self, re2_matchdata, m);
764
705
  Data_Get_Struct(m->regexp, re2_pattern, p);
765
706
 
766
- groups = p->pattern->NamedCapturingGroups();
767
- capturing_groups = rb_hash_new();
707
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
708
+ VALUE capturing_groups = rb_hash_new();
768
709
 
769
710
  if (NIL_P(keys)) {
770
- for (iterator = groups.begin(); iterator != groups.end(); iterator++) {
711
+ for (std::map<std::string, int>::const_iterator it = groups.begin(); it != groups.end(); ++it) {
771
712
  rb_hash_aset(capturing_groups,
772
- ID2SYM(rb_intern(iterator->first.data())),
773
- re2_matchdata_nth_match(iterator->second, self));
713
+ ID2SYM(rb_intern(it->first.data())),
714
+ re2_matchdata_nth_match(it->second, self));
774
715
  }
775
716
  } else {
776
717
  Check_Type(keys, T_ARRAY);
777
718
 
778
719
  if (p->pattern->NumberOfCapturingGroups() >= RARRAY_LEN(keys)) {
779
- for (i = 0; i < RARRAY_LEN(keys); i++) {
780
- key = rb_ary_entry(keys, i);
720
+ for (int i = 0; i < RARRAY_LEN(keys); ++i) {
721
+ VALUE key = rb_ary_entry(keys, i);
781
722
  Check_Type(key, T_SYMBOL);
782
- string name(rb_id2name(SYM2ID(key)));
723
+ const char *name = rb_id2name(SYM2ID(key));
724
+ std::map<std::string, int>::const_iterator search = groups.find(name);
783
725
 
784
- if (groups.count(name) == 0) {
726
+ if (search != groups.end()) {
727
+ rb_hash_aset(capturing_groups, key, re2_matchdata_nth_match(search->second, self));
728
+ } else {
785
729
  break;
786
730
  }
787
-
788
- rb_hash_aset(capturing_groups, key, re2_matchdata_nth_match(groups[name], self));
789
731
  }
790
732
  }
791
733
  }
@@ -802,6 +744,7 @@ static VALUE re2_matchdata_deconstruct_keys(VALUE self, VALUE keys) {
802
744
  */
803
745
  static VALUE re2_re2(int argc, VALUE *argv, VALUE self) {
804
746
  UNUSED(self);
747
+
805
748
  return rb_class_new_instance(argc, argv, re2_cRegexp);
806
749
  }
807
750
 
@@ -849,11 +792,11 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
849
792
 
850
793
  if (RTEST(options)) {
851
794
  RE2::Options re2_options;
852
- parse_re2_options(re2_options, options);
795
+ parse_re2_options(&re2_options, options);
853
796
 
854
- p->pattern = new(nothrow) RE2(StringValuePtr(pattern), re2_options);
797
+ p->pattern = new(std::nothrow) RE2(StringValuePtr(pattern), re2_options);
855
798
  } else {
856
- p->pattern = new(nothrow) RE2(StringValuePtr(pattern));
799
+ p->pattern = new(std::nothrow) RE2(StringValuePtr(pattern));
857
800
  }
858
801
 
859
802
  if (p->pattern == 0) {
@@ -866,40 +809,47 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
866
809
  /*
867
810
  * Returns a printable version of the regular expression +re2+.
868
811
  *
812
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
813
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
814
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
815
+ *
869
816
  * @return [String] a printable version of the regular expression
870
817
  * @example
871
818
  * re2 = RE2::Regexp.new("woo?")
872
819
  * re2.inspect #=> "#<RE2::Regexp /woo?/>"
873
820
  */
874
- static VALUE re2_regexp_inspect(VALUE self) {
821
+ static VALUE re2_regexp_inspect(const VALUE self) {
875
822
  re2_pattern *p;
876
- VALUE result;
877
- ostringstream output;
878
823
 
879
824
  Data_Get_Struct(self, re2_pattern, p);
880
825
 
881
- output << "#<RE2::Regexp /" << p->pattern->pattern() << "/>";
826
+ std::ostringstream output;
882
827
 
883
- result = ENCODED_STR_NEW(output.str().data(), output.str().length(),
884
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
828
+ output << "#<RE2::Regexp /" << p->pattern->pattern() << "/>";
885
829
 
886
- return result;
830
+ return encoded_str_new(output.str().data(), output.str().length(),
831
+ p->pattern->options().encoding());
887
832
  }
888
833
 
889
834
  /*
890
835
  * Returns a string version of the regular expression +re2+.
891
836
  *
837
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
838
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
839
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
840
+ *
892
841
  * @return [String] a string version of the regular expression
893
842
  * @example
894
843
  * re2 = RE2::Regexp.new("woo?")
895
844
  * re2.to_s #=> "woo?"
896
845
  */
897
- static VALUE re2_regexp_to_s(VALUE self) {
846
+ static VALUE re2_regexp_to_s(const VALUE self) {
898
847
  re2_pattern *p;
899
848
  Data_Get_Struct(self, re2_pattern, p);
900
- return ENCODED_STR_NEW(p->pattern->pattern().data(),
849
+
850
+ return encoded_str_new(p->pattern->pattern().data(),
901
851
  p->pattern->pattern().size(),
902
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
852
+ p->pattern->options().encoding());
903
853
  }
904
854
 
905
855
  /*
@@ -911,9 +861,10 @@ static VALUE re2_regexp_to_s(VALUE self) {
911
861
  * re2 = RE2::Regexp.new("woo?")
912
862
  * re2.ok? #=> true
913
863
  */
914
- static VALUE re2_regexp_ok(VALUE self) {
864
+ static VALUE re2_regexp_ok(const VALUE self) {
915
865
  re2_pattern *p;
916
866
  Data_Get_Struct(self, re2_pattern, p);
867
+
917
868
  return BOOL2RUBY(p->pattern->ok());
918
869
  }
919
870
 
@@ -926,9 +877,10 @@ static VALUE re2_regexp_ok(VALUE self) {
926
877
  * re2 = RE2::Regexp.new("woo?", :utf8 => true)
927
878
  * re2.utf8? #=> true
928
879
  */
929
- static VALUE re2_regexp_utf8(VALUE self) {
880
+ static VALUE re2_regexp_utf8(const VALUE self) {
930
881
  re2_pattern *p;
931
882
  Data_Get_Struct(self, re2_pattern, p);
883
+
932
884
  return BOOL2RUBY(p->pattern->options().encoding() == RE2::Options::EncodingUTF8);
933
885
  }
934
886
 
@@ -941,9 +893,10 @@ static VALUE re2_regexp_utf8(VALUE self) {
941
893
  * re2 = RE2::Regexp.new("woo?", :posix_syntax => true)
942
894
  * re2.posix_syntax? #=> true
943
895
  */
944
- static VALUE re2_regexp_posix_syntax(VALUE self) {
896
+ static VALUE re2_regexp_posix_syntax(const VALUE self) {
945
897
  re2_pattern *p;
946
898
  Data_Get_Struct(self, re2_pattern, p);
899
+
947
900
  return BOOL2RUBY(p->pattern->options().posix_syntax());
948
901
  }
949
902
 
@@ -956,9 +909,10 @@ static VALUE re2_regexp_posix_syntax(VALUE self) {
956
909
  * re2 = RE2::Regexp.new("woo?", :longest_match => true)
957
910
  * re2.longest_match? #=> true
958
911
  */
959
- static VALUE re2_regexp_longest_match(VALUE self) {
912
+ static VALUE re2_regexp_longest_match(const VALUE self) {
960
913
  re2_pattern *p;
961
914
  Data_Get_Struct(self, re2_pattern, p);
915
+
962
916
  return BOOL2RUBY(p->pattern->options().longest_match());
963
917
  }
964
918
 
@@ -971,9 +925,10 @@ static VALUE re2_regexp_longest_match(VALUE self) {
971
925
  * re2 = RE2::Regexp.new("woo?", :log_errors => true)
972
926
  * re2.log_errors? #=> true
973
927
  */
974
- static VALUE re2_regexp_log_errors(VALUE self) {
928
+ static VALUE re2_regexp_log_errors(const VALUE self) {
975
929
  re2_pattern *p;
976
930
  Data_Get_Struct(self, re2_pattern, p);
931
+
977
932
  return BOOL2RUBY(p->pattern->options().log_errors());
978
933
  }
979
934
 
@@ -986,9 +941,10 @@ static VALUE re2_regexp_log_errors(VALUE self) {
986
941
  * re2 = RE2::Regexp.new("woo?", :max_mem => 1024)
987
942
  * re2.max_mem #=> 1024
988
943
  */
989
- static VALUE re2_regexp_max_mem(VALUE self) {
944
+ static VALUE re2_regexp_max_mem(const VALUE self) {
990
945
  re2_pattern *p;
991
946
  Data_Get_Struct(self, re2_pattern, p);
947
+
992
948
  return INT2FIX(p->pattern->options().max_mem());
993
949
  }
994
950
 
@@ -1001,9 +957,10 @@ static VALUE re2_regexp_max_mem(VALUE self) {
1001
957
  * re2 = RE2::Regexp.new("woo?", :literal => true)
1002
958
  * re2.literal? #=> true
1003
959
  */
1004
- static VALUE re2_regexp_literal(VALUE self) {
960
+ static VALUE re2_regexp_literal(const VALUE self) {
1005
961
  re2_pattern *p;
1006
962
  Data_Get_Struct(self, re2_pattern, p);
963
+
1007
964
  return BOOL2RUBY(p->pattern->options().literal());
1008
965
  }
1009
966
 
@@ -1016,9 +973,10 @@ static VALUE re2_regexp_literal(VALUE self) {
1016
973
  * re2 = RE2::Regexp.new("woo?", :never_nl => true)
1017
974
  * re2.never_nl? #=> true
1018
975
  */
1019
- static VALUE re2_regexp_never_nl(VALUE self) {
976
+ static VALUE re2_regexp_never_nl(const VALUE self) {
1020
977
  re2_pattern *p;
1021
978
  Data_Get_Struct(self, re2_pattern, p);
979
+
1022
980
  return BOOL2RUBY(p->pattern->options().never_nl());
1023
981
  }
1024
982
 
@@ -1031,9 +989,10 @@ static VALUE re2_regexp_never_nl(VALUE self) {
1031
989
  * re2 = RE2::Regexp.new("woo?", :case_sensitive => true)
1032
990
  * re2.case_sensitive? #=> true
1033
991
  */
1034
- static VALUE re2_regexp_case_sensitive(VALUE self) {
992
+ static VALUE re2_regexp_case_sensitive(const VALUE self) {
1035
993
  re2_pattern *p;
1036
994
  Data_Get_Struct(self, re2_pattern, p);
995
+
1037
996
  return BOOL2RUBY(p->pattern->options().case_sensitive());
1038
997
  }
1039
998
 
@@ -1047,7 +1006,7 @@ static VALUE re2_regexp_case_sensitive(VALUE self) {
1047
1006
  * re2.case_insensitive? #=> false
1048
1007
  * re2.casefold? #=> false
1049
1008
  */
1050
- static VALUE re2_regexp_case_insensitive(VALUE self) {
1009
+ static VALUE re2_regexp_case_insensitive(const VALUE self) {
1051
1010
  return BOOL2RUBY(re2_regexp_case_sensitive(self) != Qtrue);
1052
1011
  }
1053
1012
 
@@ -1060,9 +1019,10 @@ static VALUE re2_regexp_case_insensitive(VALUE self) {
1060
1019
  * re2 = RE2::Regexp.new("woo?", :perl_classes => true)
1061
1020
  * re2.perl_classes? #=> true
1062
1021
  */
1063
- static VALUE re2_regexp_perl_classes(VALUE self) {
1022
+ static VALUE re2_regexp_perl_classes(const VALUE self) {
1064
1023
  re2_pattern *p;
1065
1024
  Data_Get_Struct(self, re2_pattern, p);
1025
+
1066
1026
  return BOOL2RUBY(p->pattern->options().perl_classes());
1067
1027
  }
1068
1028
 
@@ -1075,9 +1035,10 @@ static VALUE re2_regexp_perl_classes(VALUE self) {
1075
1035
  * re2 = RE2::Regexp.new("woo?", :word_boundary => true)
1076
1036
  * re2.word_boundary? #=> true
1077
1037
  */
1078
- static VALUE re2_regexp_word_boundary(VALUE self) {
1038
+ static VALUE re2_regexp_word_boundary(const VALUE self) {
1079
1039
  re2_pattern *p;
1080
1040
  Data_Get_Struct(self, re2_pattern, p);
1041
+
1081
1042
  return BOOL2RUBY(p->pattern->options().word_boundary());
1082
1043
  }
1083
1044
 
@@ -1090,9 +1051,10 @@ static VALUE re2_regexp_word_boundary(VALUE self) {
1090
1051
  * re2 = RE2::Regexp.new("woo?", :one_line => true)
1091
1052
  * re2.one_line? #=> true
1092
1053
  */
1093
- static VALUE re2_regexp_one_line(VALUE self) {
1054
+ static VALUE re2_regexp_one_line(const VALUE self) {
1094
1055
  re2_pattern *p;
1095
1056
  Data_Get_Struct(self, re2_pattern, p);
1057
+
1096
1058
  return BOOL2RUBY(p->pattern->options().one_line());
1097
1059
  }
1098
1060
 
@@ -1102,9 +1064,10 @@ static VALUE re2_regexp_one_line(VALUE self) {
1102
1064
  *
1103
1065
  * @return [String, nil] the error string or nil
1104
1066
  */
1105
- static VALUE re2_regexp_error(VALUE self) {
1067
+ static VALUE re2_regexp_error(const VALUE self) {
1106
1068
  re2_pattern *p;
1107
1069
  Data_Get_Struct(self, re2_pattern, p);
1070
+
1108
1071
  if (p->pattern->ok()) {
1109
1072
  return Qnil;
1110
1073
  } else {
@@ -1116,17 +1079,22 @@ static VALUE re2_regexp_error(VALUE self) {
1116
1079
  * If the RE2 could not be created properly, returns
1117
1080
  * the offending portion of the regexp; otherwise returns nil.
1118
1081
  *
1082
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1083
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1084
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1085
+ *
1119
1086
  * @return [String, nil] the offending portion of the regexp or nil
1120
1087
  */
1121
- static VALUE re2_regexp_error_arg(VALUE self) {
1088
+ static VALUE re2_regexp_error_arg(const VALUE self) {
1122
1089
  re2_pattern *p;
1123
1090
  Data_Get_Struct(self, re2_pattern, p);
1091
+
1124
1092
  if (p->pattern->ok()) {
1125
1093
  return Qnil;
1126
1094
  } else {
1127
- return ENCODED_STR_NEW(p->pattern->error_arg().data(),
1095
+ return encoded_str_new(p->pattern->error_arg().data(),
1128
1096
  p->pattern->error_arg().size(),
1129
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
1097
+ p->pattern->options().encoding());
1130
1098
  }
1131
1099
  }
1132
1100
 
@@ -1137,9 +1105,10 @@ static VALUE re2_regexp_error_arg(VALUE self) {
1137
1105
  *
1138
1106
  * @return [Integer] the regexp "cost"
1139
1107
  */
1140
- static VALUE re2_regexp_program_size(VALUE self) {
1108
+ static VALUE re2_regexp_program_size(const VALUE self) {
1141
1109
  re2_pattern *p;
1142
1110
  Data_Get_Struct(self, re2_pattern, p);
1111
+
1143
1112
  return INT2FIX(p->pattern->ProgramSize());
1144
1113
  }
1145
1114
 
@@ -1149,12 +1118,11 @@ static VALUE re2_regexp_program_size(VALUE self) {
1149
1118
  *
1150
1119
  * @return [Hash] the options
1151
1120
  */
1152
- static VALUE re2_regexp_options(VALUE self) {
1153
- VALUE options;
1121
+ static VALUE re2_regexp_options(const VALUE self) {
1154
1122
  re2_pattern *p;
1155
1123
 
1156
1124
  Data_Get_Struct(self, re2_pattern, p);
1157
- options = rb_hash_new();
1125
+ VALUE options = rb_hash_new();
1158
1126
 
1159
1127
  rb_hash_aset(options, ID2SYM(id_utf8),
1160
1128
  BOOL2RUBY(p->pattern->options().encoding() == RE2::Options::EncodingUTF8));
@@ -1202,33 +1170,34 @@ static VALUE re2_regexp_options(VALUE self) {
1202
1170
  *
1203
1171
  * @return [Integer] the number of capturing subpatterns
1204
1172
  */
1205
- static VALUE re2_regexp_number_of_capturing_groups(VALUE self) {
1173
+ static VALUE re2_regexp_number_of_capturing_groups(const VALUE self) {
1206
1174
  re2_pattern *p;
1207
-
1208
1175
  Data_Get_Struct(self, re2_pattern, p);
1176
+
1209
1177
  return INT2FIX(p->pattern->NumberOfCapturingGroups());
1210
1178
  }
1211
1179
 
1212
1180
  /*
1213
1181
  * Returns a hash of names to capturing indices of groups.
1214
1182
  *
1183
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1184
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1185
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1186
+ *
1215
1187
  * @return [Hash] a hash of names to capturing indices
1216
1188
  */
1217
- static VALUE re2_regexp_named_capturing_groups(VALUE self) {
1218
- VALUE capturing_groups;
1189
+ static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
1219
1190
  re2_pattern *p;
1220
- map<string, int> groups;
1221
- map<string, int>::iterator iterator;
1222
1191
 
1223
1192
  Data_Get_Struct(self, re2_pattern, p);
1224
- groups = p->pattern->NamedCapturingGroups();
1225
- capturing_groups = rb_hash_new();
1193
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
1194
+ VALUE capturing_groups = rb_hash_new();
1226
1195
 
1227
- for (iterator = groups.begin(); iterator != groups.end(); iterator++) {
1196
+ for (std::map<std::string, int>::const_iterator it = groups.begin(); it != groups.end(); ++it) {
1228
1197
  rb_hash_aset(capturing_groups,
1229
- ENCODED_STR_NEW(iterator->first.data(), iterator->first.size(),
1230
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1"),
1231
- INT2FIX(iterator->second));
1198
+ encoded_str_new(it->first.data(), it->first.size(),
1199
+ p->pattern->options().encoding()),
1200
+ INT2FIX(it->second));
1232
1201
  }
1233
1202
 
1234
1203
  return capturing_groups;
@@ -1242,16 +1211,23 @@ static VALUE re2_regexp_named_capturing_groups(VALUE self) {
1242
1211
  * @return [Boolean, RE2::MatchData]
1243
1212
  *
1244
1213
  * @overload match(text)
1245
- * Returns an {RE2::MatchData} containing the matching
1246
- * pattern and all subpatterns resulting from looking for
1247
- * the regexp in +text+.
1214
+ * Returns an {RE2::MatchData} containing the matching pattern and all
1215
+ * subpatterns resulting from looking for the regexp in +text+ if the pattern
1216
+ * contains capturing groups.
1217
+ *
1218
+ * Returns either true or false indicating whether a successful match was
1219
+ * made if the pattern contains no capturing groups.
1248
1220
  *
1249
1221
  * @param [String] text the text to search
1250
- * @return [RE2::MatchData] the matches
1222
+ * @return [RE2::MatchData] if the pattern contains capturing groups
1223
+ * @return [Boolean] if the pattern does not contain capturing groups
1251
1224
  * @raise [NoMemoryError] if there was not enough memory to allocate the matches
1252
- * @example
1225
+ * @example Matching with capturing groups
1253
1226
  * r = RE2::Regexp.new('w(o)(o)')
1254
1227
  * r.match('woo') #=> #<RE2::MatchData "woo" 1:"o" 2:"o">
1228
+ * @example Matching without capturing groups
1229
+ * r = RE2::Regexp.new('woo')
1230
+ * r.match('woo') #=> true
1255
1231
  *
1256
1232
  * @overload match(text, 0)
1257
1233
  * Returns either true or false indicating whether a
@@ -1279,20 +1255,20 @@ static VALUE re2_regexp_named_capturing_groups(VALUE self) {
1279
1255
  * r.match('woo', 1) #=> #<RE2::MatchData "woo" 1:"o">
1280
1256
  * r.match('woo', 3) #=> #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
1281
1257
  */
1282
- static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
1283
- int n;
1284
- bool matched;
1258
+ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1285
1259
  re2_pattern *p;
1286
1260
  re2_matchdata *m;
1287
- VALUE text, number_of_matches, matchdata;
1261
+ VALUE text, number_of_matches;
1288
1262
 
1289
1263
  rb_scan_args(argc, argv, "11", &text, &number_of_matches);
1290
1264
 
1291
1265
  /* Ensure text is a string. */
1292
- text = StringValue(text);
1266
+ StringValue(text);
1293
1267
 
1294
1268
  Data_Get_Struct(self, re2_pattern, p);
1295
1269
 
1270
+ int n;
1271
+
1296
1272
  if (RTEST(number_of_matches)) {
1297
1273
  n = NUM2INT(number_of_matches);
1298
1274
 
@@ -1308,17 +1284,21 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
1308
1284
  }
1309
1285
 
1310
1286
  if (n == 0) {
1311
- matched = match(p->pattern, StringValuePtr(text), 0,
1312
- static_cast<int>(RSTRING_LEN(text)), RE2::UNANCHORED, 0, 0);
1287
+ #ifdef HAVE_ENDPOS_ARGUMENT
1288
+ bool matched = p->pattern->Match(StringValuePtr(text), 0,
1289
+ RSTRING_LEN(text), RE2::UNANCHORED, 0, 0);
1290
+ #else
1291
+ bool matched = p->pattern->Match(StringValuePtr(text), 0, RE2::UNANCHORED,
1292
+ 0, 0);
1293
+ #endif
1313
1294
  return BOOL2RUBY(matched);
1314
1295
  } else {
1315
-
1316
1296
  /* Because match returns the whole match as well. */
1317
1297
  n += 1;
1318
1298
 
1319
- matchdata = rb_class_new_instance(0, 0, re2_cMatchData);
1299
+ VALUE matchdata = rb_class_new_instance(0, 0, re2_cMatchData);
1320
1300
  Data_Get_Struct(matchdata, re2_matchdata, m);
1321
- m->matches = new(nothrow) re2::StringPiece[n];
1301
+ m->matches = new(std::nothrow) re2::StringPiece[n];
1322
1302
  m->regexp = self;
1323
1303
  m->text = rb_str_dup(text);
1324
1304
  rb_str_freeze(m->text);
@@ -1330,10 +1310,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
1330
1310
 
1331
1311
  m->number_of_matches = n;
1332
1312
 
1333
- matched = match(p->pattern, StringValuePtr(m->text), 0,
1334
- static_cast<int>(RSTRING_LEN(m->text)),
1335
- RE2::UNANCHORED, m->matches, n);
1336
-
1313
+ #ifdef HAVE_ENDPOS_ARGUMENT
1314
+ bool matched = p->pattern->Match(StringValuePtr(m->text), 0,
1315
+ RSTRING_LEN(m->text), RE2::UNANCHORED, m->matches, n);
1316
+ #else
1317
+ bool matched = p->pattern->Match(StringValuePtr(m->text), 0,
1318
+ RE2::UNANCHORED, m->matches, n);
1319
+ #endif
1337
1320
  if (matched) {
1338
1321
  return matchdata;
1339
1322
  } else {
@@ -1348,10 +1331,8 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
1348
1331
  *
1349
1332
  * @return [Boolean] whether the match was successful
1350
1333
  */
1351
- static VALUE re2_regexp_match_p(VALUE self, VALUE text) {
1352
- VALUE argv[2];
1353
- argv[0] = text;
1354
- argv[1] = INT2FIX(0);
1334
+ static VALUE re2_regexp_match_p(const VALUE self, VALUE text) {
1335
+ VALUE argv[2] = { text, INT2FIX(0) };
1355
1336
 
1356
1337
  return re2_regexp_match(2, argv, self);
1357
1338
  }
@@ -1362,16 +1343,15 @@ static VALUE re2_regexp_match_p(VALUE self, VALUE text) {
1362
1343
  * @example
1363
1344
  * c = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
1364
1345
  */
1365
- static VALUE re2_regexp_scan(VALUE self, VALUE text) {
1346
+ static VALUE re2_regexp_scan(const VALUE self, VALUE text) {
1366
1347
  re2_pattern *p;
1367
1348
  re2_scanner *c;
1368
- VALUE scanner;
1369
1349
 
1370
1350
  Data_Get_Struct(self, re2_pattern, p);
1371
- scanner = rb_class_new_instance(0, 0, re2_cScanner);
1351
+ VALUE scanner = rb_class_new_instance(0, 0, re2_cScanner);
1372
1352
  Data_Get_Struct(scanner, re2_scanner, c);
1373
1353
 
1374
- c->input = new(nothrow) re2::StringPiece(StringValuePtr(text));
1354
+ c->input = new(std::nothrow) re2::StringPiece(StringValuePtr(text));
1375
1355
  c->regexp = self;
1376
1356
  c->text = text;
1377
1357
 
@@ -1390,6 +1370,10 @@ static VALUE re2_regexp_scan(VALUE self, VALUE text) {
1390
1370
  * Returns a copy of +str+ with the first occurrence of +pattern+
1391
1371
  * replaced with +rewrite+.
1392
1372
  *
1373
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1374
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1375
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1376
+ *
1393
1377
  * @param [String] str the string to modify
1394
1378
  * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
1395
1379
  * @param [String] rewrite the string to replace with
@@ -1404,29 +1388,33 @@ static VALUE re2_Replace(VALUE self, VALUE str, VALUE pattern,
1404
1388
  UNUSED(self);
1405
1389
  re2_pattern *p;
1406
1390
 
1407
- /* Convert all the inputs to be pumped into RE2::Replace. */
1408
- string str_as_string(StringValuePtr(str));
1391
+ /* Take a copy of str so it can be modified in-place by
1392
+ * RE2::Replace.
1393
+ */
1394
+ std::string str_as_string(StringValuePtr(str));
1409
1395
 
1410
1396
  /* Do the replacement. */
1411
1397
  if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1412
1398
  Data_Get_Struct(pattern, re2_pattern, p);
1413
1399
  RE2::Replace(&str_as_string, *p->pattern, StringValuePtr(rewrite));
1414
1400
 
1415
- return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(),
1416
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
1401
+ return encoded_str_new(str_as_string.data(), str_as_string.size(),
1402
+ p->pattern->options().encoding());
1417
1403
  } else {
1418
1404
  RE2::Replace(&str_as_string, StringValuePtr(pattern),
1419
1405
  StringValuePtr(rewrite));
1420
1406
 
1421
- return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(),
1422
- pattern);
1407
+ return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
1423
1408
  }
1424
-
1425
1409
  }
1426
1410
 
1427
1411
  /*
1428
1412
  * Return a copy of +str+ with +pattern+ replaced by +rewrite+.
1429
1413
  *
1414
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1415
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1416
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1417
+ *
1430
1418
  * @param [String] str the string to modify
1431
1419
  * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
1432
1420
  * @param [String] rewrite the string to replace with
@@ -1440,23 +1428,24 @@ static VALUE re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern,
1440
1428
  VALUE rewrite) {
1441
1429
  UNUSED(self);
1442
1430
 
1443
- /* Convert all the inputs to be pumped into RE2::GlobalReplace. */
1431
+ /* Take a copy of str so it can be modified in-place by
1432
+ * RE2::GlobalReplace.
1433
+ */
1444
1434
  re2_pattern *p;
1445
- string str_as_string(StringValuePtr(str));
1435
+ std::string str_as_string(StringValuePtr(str));
1446
1436
 
1447
1437
  /* Do the replacement. */
1448
1438
  if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1449
1439
  Data_Get_Struct(pattern, re2_pattern, p);
1450
1440
  RE2::GlobalReplace(&str_as_string, *p->pattern, StringValuePtr(rewrite));
1451
1441
 
1452
- return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(),
1453
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
1442
+ return encoded_str_new(str_as_string.data(), str_as_string.size(),
1443
+ p->pattern->options().encoding());
1454
1444
  } else {
1455
1445
  RE2::GlobalReplace(&str_as_string, StringValuePtr(pattern),
1456
1446
  StringValuePtr(rewrite));
1457
1447
 
1458
- return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(),
1459
- pattern);
1448
+ return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
1460
1449
  }
1461
1450
  }
1462
1451
 
@@ -1472,11 +1461,12 @@ static VALUE re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern,
1472
1461
  */
1473
1462
  static VALUE re2_QuoteMeta(VALUE self, VALUE unquoted) {
1474
1463
  UNUSED(self);
1475
- string quoted_string = RE2::QuoteMeta(StringValuePtr(unquoted));
1464
+ std::string quoted_string = RE2::QuoteMeta(StringValuePtr(unquoted));
1465
+
1476
1466
  return rb_str_new(quoted_string.data(), quoted_string.size());
1477
1467
  }
1478
1468
 
1479
- void re2_set_free(re2_set *self) {
1469
+ static void re2_set_free(re2_set *self) {
1480
1470
  if (self->set) {
1481
1471
  delete self->set;
1482
1472
  }
@@ -1486,6 +1476,7 @@ void re2_set_free(re2_set *self) {
1486
1476
  static VALUE re2_set_allocate(VALUE klass) {
1487
1477
  re2_set *s;
1488
1478
  VALUE result = Data_Make_Struct(klass, re2_set, 0, re2_set_free, s);
1479
+
1489
1480
  return result;
1490
1481
  }
1491
1482
 
@@ -1533,18 +1524,16 @@ static VALUE re2_set_allocate(VALUE klass) {
1533
1524
  static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1534
1525
  VALUE anchor, options;
1535
1526
  re2_set *s;
1536
- RE2::Anchor re2_anchor;
1527
+ RE2::Anchor re2_anchor = RE2::UNANCHORED;
1537
1528
  RE2::Options re2_options;
1538
1529
 
1539
1530
  rb_scan_args(argc, argv, "02", &anchor, &options);
1540
1531
  Data_Get_Struct(self, re2_set, s);
1541
1532
 
1542
1533
  if (RTEST(options)) {
1543
- parse_re2_options(re2_options, options);
1534
+ parse_re2_options(&re2_options, options);
1544
1535
  }
1545
- if (NIL_P(anchor)) {
1546
- re2_anchor = RE2::UNANCHORED;
1547
- } else {
1536
+ if (!NIL_P(anchor)) {
1548
1537
  Check_Type(anchor, T_SYMBOL);
1549
1538
  ID id_anchor = SYM2ID(anchor);
1550
1539
  if (id_anchor == id_unanchored) {
@@ -1558,7 +1547,7 @@ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1558
1547
  }
1559
1548
  }
1560
1549
 
1561
- s->set = new(nothrow) RE2::Set(re2_options, re2_anchor);
1550
+ s->set = new(std::nothrow) RE2::Set(re2_options, re2_anchor);
1562
1551
  if (s->set == 0) {
1563
1552
  rb_raise(rb_eNoMemError, "not enough memory to allocate RE2::Set object");
1564
1553
  }
@@ -1579,11 +1568,12 @@ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1579
1568
  * set.add("def") #=> 1
1580
1569
  */
1581
1570
  static VALUE re2_set_add(VALUE self, VALUE pattern) {
1582
- Check_Type(pattern, T_STRING);
1571
+ StringValue(pattern);
1583
1572
  re2::StringPiece regex(RSTRING_PTR(pattern), RSTRING_LEN(pattern));
1584
1573
  std::string err;
1585
1574
  re2_set *s;
1586
1575
  Data_Get_Struct(self, re2_set, s);
1576
+
1587
1577
  int index = s->set->Add(regex, &err);
1588
1578
  if (index < 0) {
1589
1579
  rb_raise(rb_eArgError, "str rejected by RE2::Set->Add(): %s", err.c_str());
@@ -1665,25 +1655,27 @@ static VALUE re2_set_match_raises_errors_p(VALUE self) {
1665
1655
  * set.compile
1666
1656
  * set.match("abcdef", :exception => true) # => [0, 1]
1667
1657
  */
1668
- static VALUE re2_set_match(int argc, VALUE *argv, VALUE self) {
1669
- VALUE str, options, exception_option;
1658
+ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
1659
+ VALUE str, options;
1670
1660
  bool raise_exception = true;
1671
1661
  rb_scan_args(argc, argv, "11", &str, &options);
1672
- Check_Type(str, T_STRING);
1662
+
1663
+ StringValue(str);
1673
1664
  re2::StringPiece data(RSTRING_PTR(str), RSTRING_LEN(str));
1674
- std::vector<int> v;
1675
1665
  re2_set *s;
1676
1666
  Data_Get_Struct(self, re2_set, s);
1677
1667
 
1678
1668
  if (RTEST(options)) {
1679
1669
  Check_Type(options, T_HASH);
1680
1670
 
1681
- exception_option = rb_hash_aref(options, ID2SYM(id_exception));
1671
+ VALUE exception_option = rb_hash_aref(options, ID2SYM(id_exception));
1682
1672
  if (!NIL_P(exception_option)) {
1683
1673
  raise_exception = RTEST(exception_option);
1684
1674
  }
1685
1675
  }
1686
1676
 
1677
+ std::vector<int> v;
1678
+
1687
1679
  if (raise_exception) {
1688
1680
  #ifdef HAVE_ERROR_INFO_ARGUMENT
1689
1681
  RE2::Set::ErrorInfo e;
@@ -1704,7 +1696,7 @@ static VALUE re2_set_match(int argc, VALUE *argv, VALUE self) {
1704
1696
  rb_raise(re2_eSetMatchError, "Unknown RE2::Set::ErrorKind: %d", e.kind);
1705
1697
  }
1706
1698
  } else {
1707
- for (size_t i = 0; i < v.size(); i++) {
1699
+ for (std::vector<int>::size_type i = 0; i < v.size(); ++i) {
1708
1700
  rb_ary_push(result, INT2FIX(v[i]));
1709
1701
  }
1710
1702
  }
@@ -1718,7 +1710,7 @@ static VALUE re2_set_match(int argc, VALUE *argv, VALUE self) {
1718
1710
  VALUE result = rb_ary_new2(v.size());
1719
1711
 
1720
1712
  if (matched) {
1721
- for (size_t i = 0; i < v.size(); i++) {
1713
+ for (std::vector<int>::size_type i = 0; i < v.size(); ++i) {
1722
1714
  rb_ary_push(result, INT2FIX(v[i]));
1723
1715
  }
1724
1716
  }
@@ -1868,7 +1860,7 @@ void Init_re2(void) {
1868
1860
  rb_define_singleton_method(re2_cRegexp, "compile",
1869
1861
  RUBY_METHOD_FUNC(rb_class_new_instance), -1);
1870
1862
 
1871
- rb_define_global_function("RE2", RUBY_METHOD_FUNC(re2_re2), -1);
1863
+ rb_define_module_function(rb_mKernel, "RE2", RUBY_METHOD_FUNC(re2_re2), -1);
1872
1864
 
1873
1865
  /* Create the symbols used in options. */
1874
1866
  id_utf8 = rb_intern("utf8");