re2 2.0.0.beta2-arm-linux → 2.1.1-arm-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/re2/re2.cc CHANGED
@@ -6,77 +6,21 @@
6
6
  * Released under the BSD Licence, please see LICENSE.txt
7
7
  */
8
8
 
9
- #include <ruby.h>
10
- #include <re2/re2.h>
11
- #include <re2/set.h>
12
9
  #include <stdint.h>
13
- #include <string>
10
+
11
+ #include <map>
14
12
  #include <sstream>
13
+ #include <string>
15
14
  #include <vector>
16
- using std::string;
17
- using std::ostringstream;
18
- using std::nothrow;
19
- using std::map;
20
- using std::vector;
15
+
16
+ #include <re2/re2.h>
17
+ #include <re2/set.h>
18
+ #include <ruby.h>
19
+ #include <ruby/encoding.h>
21
20
 
22
21
  #define BOOL2RUBY(v) (v ? Qtrue : Qfalse)
23
22
  #define UNUSED(x) ((void)x)
24
23
 
25
- #ifndef RSTRING_LEN
26
- #define RSTRING_LEN(x) (RSTRING(x)->len)
27
- #endif
28
-
29
- #ifndef RSTRING_PTR
30
- #define RSTRING_PTR(x) (RSTRING(x)->ptr)
31
- #endif
32
-
33
- #ifdef HAVE_RUBY_ENCODING_H
34
- #include <ruby/encoding.h>
35
- #define ENCODED_STR_NEW(str, length, encoding) \
36
- ({ \
37
- VALUE _string = rb_str_new(str, length); \
38
- int _enc = rb_enc_find_index(encoding); \
39
- rb_enc_associate_index(_string, _enc); \
40
- _string; \
41
- })
42
- #define ENCODED_STR_NEW2(str, length, str2) \
43
- ({ \
44
- VALUE _string = rb_str_new(str, length); \
45
- int _enc = rb_enc_get_index(str2); \
46
- rb_enc_associate_index(_string, _enc); \
47
- _string; \
48
- })
49
- #else
50
- #define ENCODED_STR_NEW(str, length, encoding) \
51
- rb_str_new((const char *)str, (long)length)
52
- #define ENCODED_STR_NEW2(str, length, str2) \
53
- rb_str_new((const char *)str, (long)length)
54
- #endif
55
-
56
- #ifdef HAVE_RB_STR_SUBLEN
57
- #define ENCODED_STR_SUBLEN(str, offset, encoding) \
58
- LONG2NUM(rb_str_sublen(str, offset))
59
- #else
60
- #ifdef HAVE_RUBY_ENCODING_H
61
- #define ENCODED_STR_SUBLEN(str, offset, encoding) \
62
- ({ \
63
- VALUE _string = ENCODED_STR_NEW(RSTRING_PTR(str), offset, encoding); \
64
- rb_str_length(_string); \
65
- })
66
- #else
67
- #define ENCODED_STR_SUBLEN(str, offset, encoding) \
68
- LONG2NUM(offset)
69
- #endif
70
- #endif
71
-
72
- #ifdef HAVE_ENDPOS_ARGUMENT
73
- #define match(pattern, text, startpos, endpos, anchor, match, nmatch) \
74
- (pattern->Match(text, startpos, endpos, anchor, match, nmatch))
75
- #else
76
- #define match(pattern, text, startpos, endpos, anchor, match, nmatch) \
77
- (pattern->Match(text, startpos, anchor, match, nmatch))
78
- #endif
79
-
80
24
  typedef struct {
81
25
  RE2 *pattern;
82
26
  } re2_pattern;
@@ -107,95 +51,103 @@ static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors,
107
51
  id_perl_classes, id_word_boundary, id_one_line,
108
52
  id_unanchored, id_anchor_start, id_anchor_both, id_exception;
109
53
 
110
- void parse_re2_options(RE2::Options& re2_options, VALUE options) {
54
+ inline VALUE encoded_str_new(const char *str, long length, RE2::Options::Encoding encoding) {
55
+ if (encoding == RE2::Options::EncodingUTF8) {
56
+ return rb_utf8_str_new(str, length);
57
+ }
58
+
59
+ VALUE string = rb_str_new(str, length);
60
+ rb_enc_associate_index(string, rb_enc_find_index("ISO-8859-1"));
61
+
62
+ return string;
63
+ }
64
+
65
+ static void parse_re2_options(RE2::Options* re2_options, const VALUE options) {
111
66
  if (TYPE(options) != T_HASH) {
112
67
  rb_raise(rb_eArgError, "options should be a hash");
113
68
  }
114
- VALUE utf8, posix_syntax, longest_match, log_errors,
115
- max_mem, literal, never_nl, case_sensitive, perl_classes,
116
- word_boundary, one_line;
117
69
 
118
- utf8 = rb_hash_aref(options, ID2SYM(id_utf8));
70
+ VALUE utf8 = rb_hash_aref(options, ID2SYM(id_utf8));
119
71
  if (!NIL_P(utf8)) {
120
- re2_options.set_encoding(RTEST(utf8) ? RE2::Options::EncodingUTF8 : RE2::Options::EncodingLatin1);
72
+ re2_options->set_encoding(RTEST(utf8) ? RE2::Options::EncodingUTF8 : RE2::Options::EncodingLatin1);
121
73
  }
122
74
 
123
- posix_syntax = rb_hash_aref(options, ID2SYM(id_posix_syntax));
75
+ VALUE posix_syntax = rb_hash_aref(options, ID2SYM(id_posix_syntax));
124
76
  if (!NIL_P(posix_syntax)) {
125
- re2_options.set_posix_syntax(RTEST(posix_syntax));
77
+ re2_options->set_posix_syntax(RTEST(posix_syntax));
126
78
  }
127
79
 
128
- longest_match = rb_hash_aref(options, ID2SYM(id_longest_match));
80
+ VALUE longest_match = rb_hash_aref(options, ID2SYM(id_longest_match));
129
81
  if (!NIL_P(longest_match)) {
130
- re2_options.set_longest_match(RTEST(longest_match));
82
+ re2_options->set_longest_match(RTEST(longest_match));
131
83
  }
132
84
 
133
- log_errors = rb_hash_aref(options, ID2SYM(id_log_errors));
85
+ VALUE log_errors = rb_hash_aref(options, ID2SYM(id_log_errors));
134
86
  if (!NIL_P(log_errors)) {
135
- re2_options.set_log_errors(RTEST(log_errors));
87
+ re2_options->set_log_errors(RTEST(log_errors));
136
88
  }
137
89
 
138
- max_mem = rb_hash_aref(options, ID2SYM(id_max_mem));
90
+ VALUE max_mem = rb_hash_aref(options, ID2SYM(id_max_mem));
139
91
  if (!NIL_P(max_mem)) {
140
- re2_options.set_max_mem(NUM2INT(max_mem));
92
+ re2_options->set_max_mem(NUM2INT(max_mem));
141
93
  }
142
94
 
143
- literal = rb_hash_aref(options, ID2SYM(id_literal));
95
+ VALUE literal = rb_hash_aref(options, ID2SYM(id_literal));
144
96
  if (!NIL_P(literal)) {
145
- re2_options.set_literal(RTEST(literal));
97
+ re2_options->set_literal(RTEST(literal));
146
98
  }
147
99
 
148
- never_nl = rb_hash_aref(options, ID2SYM(id_never_nl));
100
+ VALUE never_nl = rb_hash_aref(options, ID2SYM(id_never_nl));
149
101
  if (!NIL_P(never_nl)) {
150
- re2_options.set_never_nl(RTEST(never_nl));
102
+ re2_options->set_never_nl(RTEST(never_nl));
151
103
  }
152
104
 
153
- case_sensitive = rb_hash_aref(options, ID2SYM(id_case_sensitive));
105
+ VALUE case_sensitive = rb_hash_aref(options, ID2SYM(id_case_sensitive));
154
106
  if (!NIL_P(case_sensitive)) {
155
- re2_options.set_case_sensitive(RTEST(case_sensitive));
107
+ re2_options->set_case_sensitive(RTEST(case_sensitive));
156
108
  }
157
109
 
158
- perl_classes = rb_hash_aref(options, ID2SYM(id_perl_classes));
110
+ VALUE perl_classes = rb_hash_aref(options, ID2SYM(id_perl_classes));
159
111
  if (!NIL_P(perl_classes)) {
160
- re2_options.set_perl_classes(RTEST(perl_classes));
112
+ re2_options->set_perl_classes(RTEST(perl_classes));
161
113
  }
162
114
 
163
- word_boundary = rb_hash_aref(options, ID2SYM(id_word_boundary));
115
+ VALUE word_boundary = rb_hash_aref(options, ID2SYM(id_word_boundary));
164
116
  if (!NIL_P(word_boundary)) {
165
- re2_options.set_word_boundary(RTEST(word_boundary));
117
+ re2_options->set_word_boundary(RTEST(word_boundary));
166
118
  }
167
119
 
168
- one_line = rb_hash_aref(options, ID2SYM(id_one_line));
120
+ VALUE one_line = rb_hash_aref(options, ID2SYM(id_one_line));
169
121
  if (!NIL_P(one_line)) {
170
- re2_options.set_one_line(RTEST(one_line));
122
+ re2_options->set_one_line(RTEST(one_line));
171
123
  }
172
124
  }
173
125
 
174
- void re2_matchdata_mark(re2_matchdata* self) {
126
+ static void re2_matchdata_mark(re2_matchdata* self) {
175
127
  rb_gc_mark(self->regexp);
176
128
  rb_gc_mark(self->text);
177
129
  }
178
130
 
179
- void re2_matchdata_free(re2_matchdata* self) {
131
+ static void re2_matchdata_free(re2_matchdata* self) {
180
132
  if (self->matches) {
181
133
  delete[] self->matches;
182
134
  }
183
135
  free(self);
184
136
  }
185
137
 
186
- void re2_scanner_mark(re2_scanner* self) {
138
+ static void re2_scanner_mark(re2_scanner* self) {
187
139
  rb_gc_mark(self->regexp);
188
140
  rb_gc_mark(self->text);
189
141
  }
190
142
 
191
- void re2_scanner_free(re2_scanner* self) {
143
+ static void re2_scanner_free(re2_scanner* self) {
192
144
  if (self->input) {
193
145
  delete self->input;
194
146
  }
195
147
  free(self);
196
148
  }
197
149
 
198
- void re2_regexp_free(re2_pattern* self) {
150
+ static void re2_regexp_free(re2_pattern* self) {
199
151
  if (self->pattern) {
200
152
  delete self->pattern;
201
153
  }
@@ -204,12 +156,14 @@ void re2_regexp_free(re2_pattern* self) {
204
156
 
205
157
  static VALUE re2_matchdata_allocate(VALUE klass) {
206
158
  re2_matchdata *m;
159
+
207
160
  return Data_Make_Struct(klass, re2_matchdata, re2_matchdata_mark,
208
161
  re2_matchdata_free, m);
209
162
  }
210
163
 
211
164
  static VALUE re2_scanner_allocate(VALUE klass) {
212
165
  re2_scanner *c;
166
+
213
167
  return Data_Make_Struct(klass, re2_scanner, re2_scanner_mark,
214
168
  re2_scanner_free, c);
215
169
  }
@@ -222,7 +176,7 @@ static VALUE re2_scanner_allocate(VALUE klass) {
222
176
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
223
177
  * m.string #=> "bob 123"
224
178
  */
225
- static VALUE re2_matchdata_string(VALUE self) {
179
+ static VALUE re2_matchdata_string(const VALUE self) {
226
180
  re2_matchdata *m;
227
181
  Data_Get_Struct(self, re2_matchdata, m);
228
182
 
@@ -237,7 +191,7 @@ static VALUE re2_matchdata_string(VALUE self) {
237
191
  * c = RE2::Regexp.new('(\d+)').scan("foo")
238
192
  * c.string #=> "foo"
239
193
  */
240
- static VALUE re2_scanner_string(VALUE self) {
194
+ static VALUE re2_scanner_string(const VALUE self) {
241
195
  re2_scanner *c;
242
196
  Data_Get_Struct(self, re2_scanner, c);
243
197
 
@@ -252,7 +206,7 @@ static VALUE re2_scanner_string(VALUE self) {
252
206
  * c = RE2::Regexp.new('(\d+)').scan("foo")
253
207
  * c.eof? #=> true
254
208
  */
255
- static VALUE re2_scanner_eof(VALUE self) {
209
+ static VALUE re2_scanner_eof(const VALUE self) {
256
210
  re2_scanner *c;
257
211
  Data_Get_Struct(self, re2_scanner, c);
258
212
 
@@ -274,7 +228,7 @@ static VALUE re2_scanner_rewind(VALUE self) {
274
228
  re2_scanner *c;
275
229
  Data_Get_Struct(self, re2_scanner, c);
276
230
 
277
- c->input = new(nothrow) re2::StringPiece(StringValuePtr(c->text));
231
+ c->input = new(std::nothrow) re2::StringPiece(StringValuePtr(c->text));
278
232
  c->eof = false;
279
233
 
280
234
  return self;
@@ -284,6 +238,10 @@ static VALUE re2_scanner_rewind(VALUE self) {
284
238
  * Scan the given text incrementally for matches, returning an array of
285
239
  * matches on each subsequent call. Returns nil if no matches are found.
286
240
  *
241
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
242
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
243
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
244
+ *
287
245
  * @return [Array<String>] the matches.
288
246
  * @example
289
247
  * s = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
@@ -291,45 +249,41 @@ static VALUE re2_scanner_rewind(VALUE self) {
291
249
  * s.scan #=> ["bar"]
292
250
  */
293
251
  static VALUE re2_scanner_scan(VALUE self) {
294
- int i;
295
- size_t original_input_size, new_input_size;
296
- bool input_advanced;
297
252
  re2_pattern *p;
298
253
  re2_scanner *c;
299
- VALUE result;
300
254
 
301
255
  Data_Get_Struct(self, re2_scanner, c);
302
256
  Data_Get_Struct(c->regexp, re2_pattern, p);
303
257
 
304
- vector<RE2::Arg> argv(c->number_of_capturing_groups);
305
- vector<RE2::Arg*> args(c->number_of_capturing_groups);
306
- vector<string> matches(c->number_of_capturing_groups);
258
+ std::vector<RE2::Arg> argv(c->number_of_capturing_groups);
259
+ std::vector<RE2::Arg*> args(c->number_of_capturing_groups);
260
+ std::vector<std::string> matches(c->number_of_capturing_groups);
307
261
 
308
262
  if (c->eof) {
309
263
  return Qnil;
310
264
  }
311
265
 
312
- original_input_size = c->input->size();
266
+ re2::StringPiece::size_type original_input_size = c->input->size();
313
267
 
314
- for (i = 0; i < c->number_of_capturing_groups; i++) {
315
- matches[i] = "";
268
+ for (int i = 0; i < c->number_of_capturing_groups; ++i) {
316
269
  argv[i] = &matches[i];
317
270
  args[i] = &argv[i];
318
271
  }
319
272
 
320
273
  if (RE2::FindAndConsumeN(c->input, *p->pattern, &args[0],
321
274
  c->number_of_capturing_groups)) {
322
- result = rb_ary_new2(c->number_of_capturing_groups);
323
- new_input_size = c->input->size();
324
- input_advanced = new_input_size < original_input_size;
275
+ re2::StringPiece::size_type new_input_size = c->input->size();
276
+ bool input_advanced = new_input_size < original_input_size;
325
277
 
326
- for (i = 0; i < c->number_of_capturing_groups; i++) {
278
+ VALUE result = rb_ary_new2(c->number_of_capturing_groups);
279
+
280
+ for (int i = 0; i < c->number_of_capturing_groups; ++i) {
327
281
  if (matches[i].empty()) {
328
282
  rb_ary_push(result, Qnil);
329
283
  } else {
330
- rb_ary_push(result, ENCODED_STR_NEW(matches[i].data(),
284
+ rb_ary_push(result, encoded_str_new(matches[i].data(),
331
285
  matches[i].size(),
332
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1"));
286
+ p->pattern->options().encoding()));
333
287
  }
334
288
  }
335
289
 
@@ -340,47 +294,40 @@ static VALUE re2_scanner_scan(VALUE self) {
340
294
  if (!input_advanced && new_input_size > 0) {
341
295
  c->input->remove_prefix(1);
342
296
  }
297
+
298
+ return result;
343
299
  } else {
344
- result = Qnil;
300
+ return Qnil;
345
301
  }
346
-
347
- return result;
348
302
  }
349
303
 
350
304
  /*
351
305
  * Retrieve a matchdata by index or name.
352
306
  */
353
- re2::StringPiece *re2_matchdata_find_match(VALUE idx, VALUE self) {
354
- int id;
307
+ static re2::StringPiece *re2_matchdata_find_match(VALUE idx, const VALUE self) {
355
308
  re2_matchdata *m;
356
309
  re2_pattern *p;
357
- map<string, int> groups;
358
- string name;
359
- re2::StringPiece *match;
360
310
 
361
311
  Data_Get_Struct(self, re2_matchdata, m);
362
312
  Data_Get_Struct(m->regexp, re2_pattern, p);
363
313
 
314
+ int id;
315
+
364
316
  if (FIXNUM_P(idx)) {
365
317
  id = FIX2INT(idx);
366
318
  } else {
367
- if (SYMBOL_P(idx)) {
368
- name = rb_id2name(SYM2ID(idx));
369
- } else {
370
- name = StringValuePtr(idx);
371
- }
372
-
373
- groups = p->pattern->NamedCapturingGroups();
319
+ const char *name = SYMBOL_P(idx) ? rb_id2name(SYM2ID(idx)) : StringValuePtr(idx);
320
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
374
321
 
375
- if (groups.count(name) == 1) {
376
- id = groups[name];
322
+ if (std::map<std::string, int>::const_iterator search = groups.find(name); search != groups.end()) {
323
+ id = search->second;
377
324
  } else {
378
325
  return NULL;
379
326
  }
380
327
  }
381
328
 
382
329
  if (id >= 0 && id < m->number_of_matches) {
383
- match = &m->matches[id];
330
+ re2::StringPiece *match = &m->matches[id];
384
331
 
385
332
  if (!match->empty()) {
386
333
  return match;
@@ -399,7 +346,7 @@ re2::StringPiece *re2_matchdata_find_match(VALUE idx, VALUE self) {
399
346
  * m.size #=> 2
400
347
  * m.length #=> 2
401
348
  */
402
- static VALUE re2_matchdata_size(VALUE self) {
349
+ static VALUE re2_matchdata_size(const VALUE self) {
403
350
  re2_matchdata *m;
404
351
  Data_Get_Struct(self, re2_matchdata, m);
405
352
 
@@ -416,23 +363,18 @@ static VALUE re2_matchdata_size(VALUE self) {
416
363
  * m.begin(0) #=> 1
417
364
  * m.begin(1) #=> 4
418
365
  */
419
- static VALUE re2_matchdata_begin(VALUE self, VALUE n) {
366
+ static VALUE re2_matchdata_begin(const VALUE self, VALUE n) {
420
367
  re2_matchdata *m;
421
- re2_pattern *p;
422
- re2::StringPiece *match;
423
- long offset;
424
368
 
425
369
  Data_Get_Struct(self, re2_matchdata, m);
426
- Data_Get_Struct(m->regexp, re2_pattern, p);
427
370
 
428
- match = re2_matchdata_find_match(n, self);
371
+ re2::StringPiece *match = re2_matchdata_find_match(n, self);
429
372
  if (match == NULL) {
430
373
  return Qnil;
431
374
  } else {
432
- offset = reinterpret_cast<uintptr_t>(match->data()) - reinterpret_cast<uintptr_t>(StringValuePtr(m->text));
375
+ long offset = match->data() - StringValuePtr(m->text);
433
376
 
434
- return ENCODED_STR_SUBLEN(StringValue(m->text), offset,
435
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
377
+ return LONG2NUM(rb_str_sublen(StringValue(m->text), offset));
436
378
  }
437
379
  }
438
380
 
@@ -446,24 +388,18 @@ static VALUE re2_matchdata_begin(VALUE self, VALUE n) {
446
388
  * m.end(0) #=> 9
447
389
  * m.end(1) #=> 7
448
390
  */
449
- static VALUE re2_matchdata_end(VALUE self, VALUE n) {
391
+ static VALUE re2_matchdata_end(const VALUE self, VALUE n) {
450
392
  re2_matchdata *m;
451
- re2_pattern *p;
452
- re2::StringPiece *match;
453
- long offset;
454
393
 
455
394
  Data_Get_Struct(self, re2_matchdata, m);
456
- Data_Get_Struct(m->regexp, re2_pattern, p);
457
-
458
- match = re2_matchdata_find_match(n, self);
459
395
 
396
+ re2::StringPiece *match = re2_matchdata_find_match(n, self);
460
397
  if (match == NULL) {
461
398
  return Qnil;
462
399
  } else {
463
- offset = reinterpret_cast<uintptr_t>(match->data()) - reinterpret_cast<uintptr_t>(StringValuePtr(m->text)) + match->size();
400
+ long offset = (match->data() - StringValuePtr(m->text)) + match->size();
464
401
 
465
- return ENCODED_STR_SUBLEN(StringValue(m->text), offset,
466
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
402
+ return LONG2NUM(rb_str_sublen(StringValue(m->text), offset));
467
403
  }
468
404
  }
469
405
 
@@ -475,9 +411,10 @@ static VALUE re2_matchdata_end(VALUE self, VALUE n) {
475
411
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
476
412
  * m.regexp #=> #<RE2::Regexp /(\d+)/>
477
413
  */
478
- static VALUE re2_matchdata_regexp(VALUE self) {
414
+ static VALUE re2_matchdata_regexp(const VALUE self) {
479
415
  re2_matchdata *m;
480
416
  Data_Get_Struct(self, re2_matchdata, m);
417
+
481
418
  return m->regexp;
482
419
  }
483
420
 
@@ -489,7 +426,7 @@ static VALUE re2_matchdata_regexp(VALUE self) {
489
426
  * c = RE2::Regexp.new('(\d+)').scan("bob 123")
490
427
  * c.regexp #=> #<RE2::Regexp /(\d+)/>
491
428
  */
492
- static VALUE re2_scanner_regexp(VALUE self) {
429
+ static VALUE re2_scanner_regexp(const VALUE self) {
493
430
  re2_scanner *c;
494
431
  Data_Get_Struct(self, re2_scanner, c);
495
432
 
@@ -498,46 +435,47 @@ static VALUE re2_scanner_regexp(VALUE self) {
498
435
 
499
436
  static VALUE re2_regexp_allocate(VALUE klass) {
500
437
  re2_pattern *p;
438
+
501
439
  return Data_Make_Struct(klass, re2_pattern, 0, re2_regexp_free, p);
502
440
  }
503
441
 
504
442
  /*
505
443
  * Returns the array of matches.
506
444
  *
445
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
446
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
447
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
448
+ *
507
449
  * @return [Array<String, nil>] the array of matches
508
450
  * @example
509
451
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
510
452
  * m.to_a #=> ["123", "123"]
511
453
  */
512
- static VALUE re2_matchdata_to_a(VALUE self) {
513
- int i;
454
+ static VALUE re2_matchdata_to_a(const VALUE self) {
514
455
  re2_matchdata *m;
515
456
  re2_pattern *p;
516
- re2::StringPiece *match;
517
- VALUE array;
518
457
 
519
458
  Data_Get_Struct(self, re2_matchdata, m);
520
459
  Data_Get_Struct(m->regexp, re2_pattern, p);
521
460
 
522
- array = rb_ary_new2(m->number_of_matches);
523
- for (i = 0; i < m->number_of_matches; i++) {
524
- match = &m->matches[i];
461
+ VALUE array = rb_ary_new2(m->number_of_matches);
462
+ for (int i = 0; i < m->number_of_matches; ++i) {
463
+ re2::StringPiece *match = &m->matches[i];
525
464
 
526
465
  if (match->empty()) {
527
466
  rb_ary_push(array, Qnil);
528
467
  } else {
529
- rb_ary_push(array, ENCODED_STR_NEW(match->data(), match->size(),
530
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1"));
468
+ rb_ary_push(array, encoded_str_new(match->data(), match->size(),
469
+ p->pattern->options().encoding()));
531
470
  }
532
471
  }
533
472
 
534
473
  return array;
535
474
  }
536
475
 
537
- static VALUE re2_matchdata_nth_match(int nth, VALUE self) {
476
+ static VALUE re2_matchdata_nth_match(int nth, const VALUE self) {
538
477
  re2_matchdata *m;
539
478
  re2_pattern *p;
540
- re2::StringPiece *match;
541
479
 
542
480
  Data_Get_Struct(self, re2_matchdata, m);
543
481
  Data_Get_Struct(m->regexp, re2_pattern, p);
@@ -545,32 +483,28 @@ static VALUE re2_matchdata_nth_match(int nth, VALUE self) {
545
483
  if (nth < 0 || nth >= m->number_of_matches) {
546
484
  return Qnil;
547
485
  } else {
548
- match = &m->matches[nth];
486
+ re2::StringPiece *match = &m->matches[nth];
549
487
 
550
488
  if (match->empty()) {
551
489
  return Qnil;
552
490
  } else {
553
- return ENCODED_STR_NEW(match->data(), match->size(),
554
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
491
+ return encoded_str_new(match->data(), match->size(),
492
+ p->pattern->options().encoding());
555
493
  }
556
494
  }
557
495
  }
558
496
 
559
- static VALUE re2_matchdata_named_match(const char* name, VALUE self) {
560
- int idx;
497
+ static VALUE re2_matchdata_named_match(const char* name, const VALUE self) {
561
498
  re2_matchdata *m;
562
499
  re2_pattern *p;
563
- map<string, int> groups;
564
- string name_as_string(name);
565
500
 
566
501
  Data_Get_Struct(self, re2_matchdata, m);
567
502
  Data_Get_Struct(m->regexp, re2_pattern, p);
568
503
 
569
- groups = p->pattern->NamedCapturingGroups();
504
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
570
505
 
571
- if (groups.count(name_as_string) == 1) {
572
- idx = groups[name_as_string];
573
- return re2_matchdata_nth_match(idx, self);
506
+ if (std::map<std::string, int>::const_iterator search = groups.find(name); search != groups.end()) {
507
+ return re2_matchdata_nth_match(search->second, self);
574
508
  } else {
575
509
  return Qnil;
576
510
  }
@@ -579,6 +513,10 @@ static VALUE re2_matchdata_named_match(const char* name, VALUE self) {
579
513
  /*
580
514
  * Retrieve zero, one or more matches by index or name.
581
515
  *
516
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
517
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
518
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
519
+ *
582
520
  * @return [Array<String, nil>, String, Boolean]
583
521
  *
584
522
  * @overload [](index)
@@ -619,7 +557,7 @@ static VALUE re2_matchdata_named_match(const char* name, VALUE self) {
619
557
  * m["number"] #=> "123"
620
558
  * m[:number] #=> "123"
621
559
  */
622
- static VALUE re2_matchdata_aref(int argc, VALUE *argv, VALUE self) {
560
+ static VALUE re2_matchdata_aref(int argc, VALUE *argv, const VALUE self) {
623
561
  VALUE idx, rest;
624
562
  rb_scan_args(argc, argv, "11", &idx, &rest);
625
563
 
@@ -639,38 +577,40 @@ static VALUE re2_matchdata_aref(int argc, VALUE *argv, VALUE self) {
639
577
  *
640
578
  * @return [String] the entire matched string
641
579
  */
642
- static VALUE re2_matchdata_to_s(VALUE self) {
580
+ static VALUE re2_matchdata_to_s(const VALUE self) {
643
581
  return re2_matchdata_nth_match(0, self);
644
582
  }
645
583
 
646
584
  /*
647
585
  * Returns a printable version of the match.
648
586
  *
587
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
588
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
589
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
590
+ *
649
591
  * @return [String] a printable version of the match
650
592
  * @example
651
593
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
652
594
  * m.inspect #=> "#<RE2::MatchData \"123\" 1:\"123\">"
653
595
  */
654
- static VALUE re2_matchdata_inspect(VALUE self) {
655
- int i;
596
+ static VALUE re2_matchdata_inspect(const VALUE self) {
656
597
  re2_matchdata *m;
657
598
  re2_pattern *p;
658
- VALUE match, result;
659
- ostringstream output;
660
599
 
661
600
  Data_Get_Struct(self, re2_matchdata, m);
662
601
  Data_Get_Struct(m->regexp, re2_pattern, p);
663
602
 
603
+ std::ostringstream output;
664
604
  output << "#<RE2::MatchData";
665
605
 
666
- for (i = 0; i < m->number_of_matches; i++) {
606
+ for (int i = 0; i < m->number_of_matches; ++i) {
667
607
  output << " ";
668
608
 
669
609
  if (i > 0) {
670
610
  output << i << ":";
671
611
  }
672
612
 
673
- match = re2_matchdata_nth_match(i, self);
613
+ VALUE match = re2_matchdata_nth_match(i, self);
674
614
 
675
615
  if (match == Qnil) {
676
616
  output << "nil";
@@ -681,15 +621,17 @@ static VALUE re2_matchdata_inspect(VALUE self) {
681
621
 
682
622
  output << ">";
683
623
 
684
- result = ENCODED_STR_NEW(output.str().data(), output.str().length(),
685
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
686
-
687
- return result;
624
+ return encoded_str_new(output.str().data(), output.str().length(),
625
+ p->pattern->options().encoding());
688
626
  }
689
627
 
690
628
  /*
691
629
  * Returns the array of submatches for pattern matching.
692
630
  *
631
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
632
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
633
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
634
+ *
693
635
  * @return [Array<String, nil>] the array of submatches
694
636
  * @example
695
637
  * m = RE2::Regexp.new('(\d+)').match("bob 123")
@@ -703,25 +645,22 @@ static VALUE re2_matchdata_inspect(VALUE self) {
703
645
  * puts "Unrecognised match"
704
646
  * end
705
647
  */
706
- static VALUE re2_matchdata_deconstruct(VALUE self) {
707
- int i;
648
+ static VALUE re2_matchdata_deconstruct(const VALUE self) {
708
649
  re2_matchdata *m;
709
650
  re2_pattern *p;
710
- re2::StringPiece *match;
711
- VALUE array;
712
651
 
713
652
  Data_Get_Struct(self, re2_matchdata, m);
714
653
  Data_Get_Struct(m->regexp, re2_pattern, p);
715
654
 
716
- array = rb_ary_new2(m->number_of_matches - 1);
717
- for (i = 1; i < m->number_of_matches; i++) {
718
- match = &m->matches[i];
655
+ VALUE array = rb_ary_new2(m->number_of_matches - 1);
656
+ for (int i = 1; i < m->number_of_matches; ++i) {
657
+ re2::StringPiece *match = &m->matches[i];
719
658
 
720
659
  if (match->empty()) {
721
660
  rb_ary_push(array, Qnil);
722
661
  } else {
723
- rb_ary_push(array, ENCODED_STR_NEW(match->data(), match->size(),
724
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1"));
662
+ rb_ary_push(array, encoded_str_new(match->data(), match->size(),
663
+ p->pattern->options().encoding()));
725
664
  }
726
665
  }
727
666
 
@@ -735,6 +674,10 @@ static VALUE re2_matchdata_deconstruct(VALUE self) {
735
674
  * more keys than there are capturing groups. Given keys will populate the hash in
736
675
  * order but an invalid name will cause the hash to be immediately returned.
737
676
  *
677
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
678
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
679
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
680
+ *
738
681
  * @return [Hash] a hash of capturing group names to submatches
739
682
  * @param [Array<Symbol>, nil] keys an array of Symbol capturing group names or nil to return all names
740
683
  * @example
@@ -752,40 +695,36 @@ static VALUE re2_matchdata_deconstruct(VALUE self) {
752
695
  * puts "Unrecognised match"
753
696
  * end
754
697
  */
755
- static VALUE re2_matchdata_deconstruct_keys(VALUE self, VALUE keys) {
756
- int i;
757
- VALUE capturing_groups, key;
698
+ static VALUE re2_matchdata_deconstruct_keys(const VALUE self, const VALUE keys) {
758
699
  re2_matchdata *m;
759
700
  re2_pattern *p;
760
- map<string, int> groups;
761
- map<string, int>::iterator iterator;
762
701
 
763
702
  Data_Get_Struct(self, re2_matchdata, m);
764
703
  Data_Get_Struct(m->regexp, re2_pattern, p);
765
704
 
766
- groups = p->pattern->NamedCapturingGroups();
767
- capturing_groups = rb_hash_new();
705
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
706
+ VALUE capturing_groups = rb_hash_new();
768
707
 
769
708
  if (NIL_P(keys)) {
770
- for (iterator = groups.begin(); iterator != groups.end(); iterator++) {
709
+ for (std::map<std::string, int>::const_iterator it = groups.begin(); it != groups.end(); ++it) {
771
710
  rb_hash_aset(capturing_groups,
772
- ID2SYM(rb_intern(iterator->first.data())),
773
- re2_matchdata_nth_match(iterator->second, self));
711
+ ID2SYM(rb_intern(it->first.data())),
712
+ re2_matchdata_nth_match(it->second, self));
774
713
  }
775
714
  } else {
776
715
  Check_Type(keys, T_ARRAY);
777
716
 
778
717
  if (p->pattern->NumberOfCapturingGroups() >= RARRAY_LEN(keys)) {
779
- for (i = 0; i < RARRAY_LEN(keys); i++) {
780
- key = rb_ary_entry(keys, i);
718
+ for (int i = 0; i < RARRAY_LEN(keys); ++i) {
719
+ VALUE key = rb_ary_entry(keys, i);
781
720
  Check_Type(key, T_SYMBOL);
782
- string name(rb_id2name(SYM2ID(key)));
721
+ const char *name = rb_id2name(SYM2ID(key));
783
722
 
784
- if (groups.count(name) == 0) {
723
+ if (std::map<std::string, int>::const_iterator search = groups.find(name); search != groups.end()) {
724
+ rb_hash_aset(capturing_groups, key, re2_matchdata_nth_match(search->second, self));
725
+ } else {
785
726
  break;
786
727
  }
787
-
788
- rb_hash_aset(capturing_groups, key, re2_matchdata_nth_match(groups[name], self));
789
728
  }
790
729
  }
791
730
  }
@@ -802,6 +741,7 @@ static VALUE re2_matchdata_deconstruct_keys(VALUE self, VALUE keys) {
802
741
  */
803
742
  static VALUE re2_re2(int argc, VALUE *argv, VALUE self) {
804
743
  UNUSED(self);
744
+
805
745
  return rb_class_new_instance(argc, argv, re2_cRegexp);
806
746
  }
807
747
 
@@ -849,11 +789,11 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
849
789
 
850
790
  if (RTEST(options)) {
851
791
  RE2::Options re2_options;
852
- parse_re2_options(re2_options, options);
792
+ parse_re2_options(&re2_options, options);
853
793
 
854
- p->pattern = new(nothrow) RE2(StringValuePtr(pattern), re2_options);
794
+ p->pattern = new(std::nothrow) RE2(StringValuePtr(pattern), re2_options);
855
795
  } else {
856
- p->pattern = new(nothrow) RE2(StringValuePtr(pattern));
796
+ p->pattern = new(std::nothrow) RE2(StringValuePtr(pattern));
857
797
  }
858
798
 
859
799
  if (p->pattern == 0) {
@@ -866,40 +806,47 @@ static VALUE re2_regexp_initialize(int argc, VALUE *argv, VALUE self) {
866
806
  /*
867
807
  * Returns a printable version of the regular expression +re2+.
868
808
  *
809
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
810
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
811
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
812
+ *
869
813
  * @return [String] a printable version of the regular expression
870
814
  * @example
871
815
  * re2 = RE2::Regexp.new("woo?")
872
816
  * re2.inspect #=> "#<RE2::Regexp /woo?/>"
873
817
  */
874
- static VALUE re2_regexp_inspect(VALUE self) {
818
+ static VALUE re2_regexp_inspect(const VALUE self) {
875
819
  re2_pattern *p;
876
- VALUE result;
877
- ostringstream output;
878
820
 
879
821
  Data_Get_Struct(self, re2_pattern, p);
880
822
 
881
- output << "#<RE2::Regexp /" << p->pattern->pattern() << "/>";
823
+ std::ostringstream output;
882
824
 
883
- result = ENCODED_STR_NEW(output.str().data(), output.str().length(),
884
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
825
+ output << "#<RE2::Regexp /" << p->pattern->pattern() << "/>";
885
826
 
886
- return result;
827
+ return encoded_str_new(output.str().data(), output.str().length(),
828
+ p->pattern->options().encoding());
887
829
  }
888
830
 
889
831
  /*
890
832
  * Returns a string version of the regular expression +re2+.
891
833
  *
834
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
835
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
836
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
837
+ *
892
838
  * @return [String] a string version of the regular expression
893
839
  * @example
894
840
  * re2 = RE2::Regexp.new("woo?")
895
841
  * re2.to_s #=> "woo?"
896
842
  */
897
- static VALUE re2_regexp_to_s(VALUE self) {
843
+ static VALUE re2_regexp_to_s(const VALUE self) {
898
844
  re2_pattern *p;
899
845
  Data_Get_Struct(self, re2_pattern, p);
900
- return ENCODED_STR_NEW(p->pattern->pattern().data(),
846
+
847
+ return encoded_str_new(p->pattern->pattern().data(),
901
848
  p->pattern->pattern().size(),
902
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
849
+ p->pattern->options().encoding());
903
850
  }
904
851
 
905
852
  /*
@@ -911,9 +858,10 @@ static VALUE re2_regexp_to_s(VALUE self) {
911
858
  * re2 = RE2::Regexp.new("woo?")
912
859
  * re2.ok? #=> true
913
860
  */
914
- static VALUE re2_regexp_ok(VALUE self) {
861
+ static VALUE re2_regexp_ok(const VALUE self) {
915
862
  re2_pattern *p;
916
863
  Data_Get_Struct(self, re2_pattern, p);
864
+
917
865
  return BOOL2RUBY(p->pattern->ok());
918
866
  }
919
867
 
@@ -926,9 +874,10 @@ static VALUE re2_regexp_ok(VALUE self) {
926
874
  * re2 = RE2::Regexp.new("woo?", :utf8 => true)
927
875
  * re2.utf8? #=> true
928
876
  */
929
- static VALUE re2_regexp_utf8(VALUE self) {
877
+ static VALUE re2_regexp_utf8(const VALUE self) {
930
878
  re2_pattern *p;
931
879
  Data_Get_Struct(self, re2_pattern, p);
880
+
932
881
  return BOOL2RUBY(p->pattern->options().encoding() == RE2::Options::EncodingUTF8);
933
882
  }
934
883
 
@@ -941,9 +890,10 @@ static VALUE re2_regexp_utf8(VALUE self) {
941
890
  * re2 = RE2::Regexp.new("woo?", :posix_syntax => true)
942
891
  * re2.posix_syntax? #=> true
943
892
  */
944
- static VALUE re2_regexp_posix_syntax(VALUE self) {
893
+ static VALUE re2_regexp_posix_syntax(const VALUE self) {
945
894
  re2_pattern *p;
946
895
  Data_Get_Struct(self, re2_pattern, p);
896
+
947
897
  return BOOL2RUBY(p->pattern->options().posix_syntax());
948
898
  }
949
899
 
@@ -956,9 +906,10 @@ static VALUE re2_regexp_posix_syntax(VALUE self) {
956
906
  * re2 = RE2::Regexp.new("woo?", :longest_match => true)
957
907
  * re2.longest_match? #=> true
958
908
  */
959
- static VALUE re2_regexp_longest_match(VALUE self) {
909
+ static VALUE re2_regexp_longest_match(const VALUE self) {
960
910
  re2_pattern *p;
961
911
  Data_Get_Struct(self, re2_pattern, p);
912
+
962
913
  return BOOL2RUBY(p->pattern->options().longest_match());
963
914
  }
964
915
 
@@ -971,9 +922,10 @@ static VALUE re2_regexp_longest_match(VALUE self) {
971
922
  * re2 = RE2::Regexp.new("woo?", :log_errors => true)
972
923
  * re2.log_errors? #=> true
973
924
  */
974
- static VALUE re2_regexp_log_errors(VALUE self) {
925
+ static VALUE re2_regexp_log_errors(const VALUE self) {
975
926
  re2_pattern *p;
976
927
  Data_Get_Struct(self, re2_pattern, p);
928
+
977
929
  return BOOL2RUBY(p->pattern->options().log_errors());
978
930
  }
979
931
 
@@ -986,9 +938,10 @@ static VALUE re2_regexp_log_errors(VALUE self) {
986
938
  * re2 = RE2::Regexp.new("woo?", :max_mem => 1024)
987
939
  * re2.max_mem #=> 1024
988
940
  */
989
- static VALUE re2_regexp_max_mem(VALUE self) {
941
+ static VALUE re2_regexp_max_mem(const VALUE self) {
990
942
  re2_pattern *p;
991
943
  Data_Get_Struct(self, re2_pattern, p);
944
+
992
945
  return INT2FIX(p->pattern->options().max_mem());
993
946
  }
994
947
 
@@ -1001,9 +954,10 @@ static VALUE re2_regexp_max_mem(VALUE self) {
1001
954
  * re2 = RE2::Regexp.new("woo?", :literal => true)
1002
955
  * re2.literal? #=> true
1003
956
  */
1004
- static VALUE re2_regexp_literal(VALUE self) {
957
+ static VALUE re2_regexp_literal(const VALUE self) {
1005
958
  re2_pattern *p;
1006
959
  Data_Get_Struct(self, re2_pattern, p);
960
+
1007
961
  return BOOL2RUBY(p->pattern->options().literal());
1008
962
  }
1009
963
 
@@ -1016,9 +970,10 @@ static VALUE re2_regexp_literal(VALUE self) {
1016
970
  * re2 = RE2::Regexp.new("woo?", :never_nl => true)
1017
971
  * re2.never_nl? #=> true
1018
972
  */
1019
- static VALUE re2_regexp_never_nl(VALUE self) {
973
+ static VALUE re2_regexp_never_nl(const VALUE self) {
1020
974
  re2_pattern *p;
1021
975
  Data_Get_Struct(self, re2_pattern, p);
976
+
1022
977
  return BOOL2RUBY(p->pattern->options().never_nl());
1023
978
  }
1024
979
 
@@ -1031,9 +986,10 @@ static VALUE re2_regexp_never_nl(VALUE self) {
1031
986
  * re2 = RE2::Regexp.new("woo?", :case_sensitive => true)
1032
987
  * re2.case_sensitive? #=> true
1033
988
  */
1034
- static VALUE re2_regexp_case_sensitive(VALUE self) {
989
+ static VALUE re2_regexp_case_sensitive(const VALUE self) {
1035
990
  re2_pattern *p;
1036
991
  Data_Get_Struct(self, re2_pattern, p);
992
+
1037
993
  return BOOL2RUBY(p->pattern->options().case_sensitive());
1038
994
  }
1039
995
 
@@ -1047,7 +1003,7 @@ static VALUE re2_regexp_case_sensitive(VALUE self) {
1047
1003
  * re2.case_insensitive? #=> false
1048
1004
  * re2.casefold? #=> false
1049
1005
  */
1050
- static VALUE re2_regexp_case_insensitive(VALUE self) {
1006
+ static VALUE re2_regexp_case_insensitive(const VALUE self) {
1051
1007
  return BOOL2RUBY(re2_regexp_case_sensitive(self) != Qtrue);
1052
1008
  }
1053
1009
 
@@ -1060,9 +1016,10 @@ static VALUE re2_regexp_case_insensitive(VALUE self) {
1060
1016
  * re2 = RE2::Regexp.new("woo?", :perl_classes => true)
1061
1017
  * re2.perl_classes? #=> true
1062
1018
  */
1063
- static VALUE re2_regexp_perl_classes(VALUE self) {
1019
+ static VALUE re2_regexp_perl_classes(const VALUE self) {
1064
1020
  re2_pattern *p;
1065
1021
  Data_Get_Struct(self, re2_pattern, p);
1022
+
1066
1023
  return BOOL2RUBY(p->pattern->options().perl_classes());
1067
1024
  }
1068
1025
 
@@ -1075,9 +1032,10 @@ static VALUE re2_regexp_perl_classes(VALUE self) {
1075
1032
  * re2 = RE2::Regexp.new("woo?", :word_boundary => true)
1076
1033
  * re2.word_boundary? #=> true
1077
1034
  */
1078
- static VALUE re2_regexp_word_boundary(VALUE self) {
1035
+ static VALUE re2_regexp_word_boundary(const VALUE self) {
1079
1036
  re2_pattern *p;
1080
1037
  Data_Get_Struct(self, re2_pattern, p);
1038
+
1081
1039
  return BOOL2RUBY(p->pattern->options().word_boundary());
1082
1040
  }
1083
1041
 
@@ -1090,9 +1048,10 @@ static VALUE re2_regexp_word_boundary(VALUE self) {
1090
1048
  * re2 = RE2::Regexp.new("woo?", :one_line => true)
1091
1049
  * re2.one_line? #=> true
1092
1050
  */
1093
- static VALUE re2_regexp_one_line(VALUE self) {
1051
+ static VALUE re2_regexp_one_line(const VALUE self) {
1094
1052
  re2_pattern *p;
1095
1053
  Data_Get_Struct(self, re2_pattern, p);
1054
+
1096
1055
  return BOOL2RUBY(p->pattern->options().one_line());
1097
1056
  }
1098
1057
 
@@ -1102,9 +1061,10 @@ static VALUE re2_regexp_one_line(VALUE self) {
1102
1061
  *
1103
1062
  * @return [String, nil] the error string or nil
1104
1063
  */
1105
- static VALUE re2_regexp_error(VALUE self) {
1064
+ static VALUE re2_regexp_error(const VALUE self) {
1106
1065
  re2_pattern *p;
1107
1066
  Data_Get_Struct(self, re2_pattern, p);
1067
+
1108
1068
  if (p->pattern->ok()) {
1109
1069
  return Qnil;
1110
1070
  } else {
@@ -1116,17 +1076,22 @@ static VALUE re2_regexp_error(VALUE self) {
1116
1076
  * If the RE2 could not be created properly, returns
1117
1077
  * the offending portion of the regexp; otherwise returns nil.
1118
1078
  *
1079
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1080
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1081
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1082
+ *
1119
1083
  * @return [String, nil] the offending portion of the regexp or nil
1120
1084
  */
1121
- static VALUE re2_regexp_error_arg(VALUE self) {
1085
+ static VALUE re2_regexp_error_arg(const VALUE self) {
1122
1086
  re2_pattern *p;
1123
1087
  Data_Get_Struct(self, re2_pattern, p);
1088
+
1124
1089
  if (p->pattern->ok()) {
1125
1090
  return Qnil;
1126
1091
  } else {
1127
- return ENCODED_STR_NEW(p->pattern->error_arg().data(),
1092
+ return encoded_str_new(p->pattern->error_arg().data(),
1128
1093
  p->pattern->error_arg().size(),
1129
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
1094
+ p->pattern->options().encoding());
1130
1095
  }
1131
1096
  }
1132
1097
 
@@ -1137,9 +1102,10 @@ static VALUE re2_regexp_error_arg(VALUE self) {
1137
1102
  *
1138
1103
  * @return [Integer] the regexp "cost"
1139
1104
  */
1140
- static VALUE re2_regexp_program_size(VALUE self) {
1105
+ static VALUE re2_regexp_program_size(const VALUE self) {
1141
1106
  re2_pattern *p;
1142
1107
  Data_Get_Struct(self, re2_pattern, p);
1108
+
1143
1109
  return INT2FIX(p->pattern->ProgramSize());
1144
1110
  }
1145
1111
 
@@ -1149,12 +1115,11 @@ static VALUE re2_regexp_program_size(VALUE self) {
1149
1115
  *
1150
1116
  * @return [Hash] the options
1151
1117
  */
1152
- static VALUE re2_regexp_options(VALUE self) {
1153
- VALUE options;
1118
+ static VALUE re2_regexp_options(const VALUE self) {
1154
1119
  re2_pattern *p;
1155
1120
 
1156
1121
  Data_Get_Struct(self, re2_pattern, p);
1157
- options = rb_hash_new();
1122
+ VALUE options = rb_hash_new();
1158
1123
 
1159
1124
  rb_hash_aset(options, ID2SYM(id_utf8),
1160
1125
  BOOL2RUBY(p->pattern->options().encoding() == RE2::Options::EncodingUTF8));
@@ -1202,33 +1167,34 @@ static VALUE re2_regexp_options(VALUE self) {
1202
1167
  *
1203
1168
  * @return [Integer] the number of capturing subpatterns
1204
1169
  */
1205
- static VALUE re2_regexp_number_of_capturing_groups(VALUE self) {
1170
+ static VALUE re2_regexp_number_of_capturing_groups(const VALUE self) {
1206
1171
  re2_pattern *p;
1207
-
1208
1172
  Data_Get_Struct(self, re2_pattern, p);
1173
+
1209
1174
  return INT2FIX(p->pattern->NumberOfCapturingGroups());
1210
1175
  }
1211
1176
 
1212
1177
  /*
1213
1178
  * Returns a hash of names to capturing indices of groups.
1214
1179
  *
1180
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1181
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1182
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1183
+ *
1215
1184
  * @return [Hash] a hash of names to capturing indices
1216
1185
  */
1217
- static VALUE re2_regexp_named_capturing_groups(VALUE self) {
1218
- VALUE capturing_groups;
1186
+ static VALUE re2_regexp_named_capturing_groups(const VALUE self) {
1219
1187
  re2_pattern *p;
1220
- map<string, int> groups;
1221
- map<string, int>::iterator iterator;
1222
1188
 
1223
1189
  Data_Get_Struct(self, re2_pattern, p);
1224
- groups = p->pattern->NamedCapturingGroups();
1225
- capturing_groups = rb_hash_new();
1190
+ const std::map<std::string, int>& groups = p->pattern->NamedCapturingGroups();
1191
+ VALUE capturing_groups = rb_hash_new();
1226
1192
 
1227
- for (iterator = groups.begin(); iterator != groups.end(); iterator++) {
1193
+ for (std::map<std::string, int>::const_iterator it = groups.begin(); it != groups.end(); ++it) {
1228
1194
  rb_hash_aset(capturing_groups,
1229
- ENCODED_STR_NEW(iterator->first.data(), iterator->first.size(),
1230
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1"),
1231
- INT2FIX(iterator->second));
1195
+ encoded_str_new(it->first.data(), it->first.size(),
1196
+ p->pattern->options().encoding()),
1197
+ INT2FIX(it->second));
1232
1198
  }
1233
1199
 
1234
1200
  return capturing_groups;
@@ -1242,16 +1208,23 @@ static VALUE re2_regexp_named_capturing_groups(VALUE self) {
1242
1208
  * @return [Boolean, RE2::MatchData]
1243
1209
  *
1244
1210
  * @overload match(text)
1245
- * Returns an {RE2::MatchData} containing the matching
1246
- * pattern and all subpatterns resulting from looking for
1247
- * the regexp in +text+.
1211
+ * Returns an {RE2::MatchData} containing the matching pattern and all
1212
+ * subpatterns resulting from looking for the regexp in +text+ if the pattern
1213
+ * contains capturing groups.
1214
+ *
1215
+ * Returns either true or false indicating whether a successful match was
1216
+ * made if the pattern contains no capturing groups.
1248
1217
  *
1249
1218
  * @param [String] text the text to search
1250
- * @return [RE2::MatchData] the matches
1219
+ * @return [RE2::MatchData] if the pattern contains capturing groups
1220
+ * @return [Boolean] if the pattern does not contain capturing groups
1251
1221
  * @raise [NoMemoryError] if there was not enough memory to allocate the matches
1252
- * @example
1222
+ * @example Matching with capturing groups
1253
1223
  * r = RE2::Regexp.new('w(o)(o)')
1254
1224
  * r.match('woo') #=> #<RE2::MatchData "woo" 1:"o" 2:"o">
1225
+ * @example Matching without capturing groups
1226
+ * r = RE2::Regexp.new('woo')
1227
+ * r.match('woo') #=> true
1255
1228
  *
1256
1229
  * @overload match(text, 0)
1257
1230
  * Returns either true or false indicating whether a
@@ -1279,20 +1252,20 @@ static VALUE re2_regexp_named_capturing_groups(VALUE self) {
1279
1252
  * r.match('woo', 1) #=> #<RE2::MatchData "woo" 1:"o">
1280
1253
  * r.match('woo', 3) #=> #<RE2::MatchData "woo" 1:"o" 2:"o" 3:nil>
1281
1254
  */
1282
- static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
1283
- int n;
1284
- bool matched;
1255
+ static VALUE re2_regexp_match(int argc, VALUE *argv, const VALUE self) {
1285
1256
  re2_pattern *p;
1286
1257
  re2_matchdata *m;
1287
- VALUE text, number_of_matches, matchdata;
1258
+ VALUE text, number_of_matches;
1288
1259
 
1289
1260
  rb_scan_args(argc, argv, "11", &text, &number_of_matches);
1290
1261
 
1291
1262
  /* Ensure text is a string. */
1292
- text = StringValue(text);
1263
+ StringValue(text);
1293
1264
 
1294
1265
  Data_Get_Struct(self, re2_pattern, p);
1295
1266
 
1267
+ int n;
1268
+
1296
1269
  if (RTEST(number_of_matches)) {
1297
1270
  n = NUM2INT(number_of_matches);
1298
1271
 
@@ -1308,17 +1281,21 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
1308
1281
  }
1309
1282
 
1310
1283
  if (n == 0) {
1311
- matched = match(p->pattern, StringValuePtr(text), 0,
1312
- static_cast<int>(RSTRING_LEN(text)), RE2::UNANCHORED, 0, 0);
1284
+ #ifdef HAVE_ENDPOS_ARGUMENT
1285
+ bool matched = p->pattern->Match(StringValuePtr(text), 0,
1286
+ RSTRING_LEN(text), RE2::UNANCHORED, 0, 0);
1287
+ #else
1288
+ bool matched = p->pattern->Match(StringValuePtr(text), 0, RE2::UNANCHORED,
1289
+ 0, 0);
1290
+ #endif
1313
1291
  return BOOL2RUBY(matched);
1314
1292
  } else {
1315
-
1316
1293
  /* Because match returns the whole match as well. */
1317
1294
  n += 1;
1318
1295
 
1319
- matchdata = rb_class_new_instance(0, 0, re2_cMatchData);
1296
+ VALUE matchdata = rb_class_new_instance(0, 0, re2_cMatchData);
1320
1297
  Data_Get_Struct(matchdata, re2_matchdata, m);
1321
- m->matches = new(nothrow) re2::StringPiece[n];
1298
+ m->matches = new(std::nothrow) re2::StringPiece[n];
1322
1299
  m->regexp = self;
1323
1300
  m->text = rb_str_dup(text);
1324
1301
  rb_str_freeze(m->text);
@@ -1330,10 +1307,13 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
1330
1307
 
1331
1308
  m->number_of_matches = n;
1332
1309
 
1333
- matched = match(p->pattern, StringValuePtr(m->text), 0,
1334
- static_cast<int>(RSTRING_LEN(m->text)),
1335
- RE2::UNANCHORED, m->matches, n);
1336
-
1310
+ #ifdef HAVE_ENDPOS_ARGUMENT
1311
+ bool matched = p->pattern->Match(StringValuePtr(m->text), 0,
1312
+ RSTRING_LEN(m->text), RE2::UNANCHORED, m->matches, n);
1313
+ #else
1314
+ bool matched = p->pattern->Match(StringValuePtr(m->text), 0,
1315
+ RE2::UNANCHORED, m->matches, n);
1316
+ #endif
1337
1317
  if (matched) {
1338
1318
  return matchdata;
1339
1319
  } else {
@@ -1348,10 +1328,8 @@ static VALUE re2_regexp_match(int argc, VALUE *argv, VALUE self) {
1348
1328
  *
1349
1329
  * @return [Boolean] whether the match was successful
1350
1330
  */
1351
- static VALUE re2_regexp_match_p(VALUE self, VALUE text) {
1352
- VALUE argv[2];
1353
- argv[0] = text;
1354
- argv[1] = INT2FIX(0);
1331
+ static VALUE re2_regexp_match_p(const VALUE self, VALUE text) {
1332
+ VALUE argv[2] = { text, INT2FIX(0) };
1355
1333
 
1356
1334
  return re2_regexp_match(2, argv, self);
1357
1335
  }
@@ -1362,16 +1340,15 @@ static VALUE re2_regexp_match_p(VALUE self, VALUE text) {
1362
1340
  * @example
1363
1341
  * c = RE2::Regexp.new('(\w+)').scan("Foo bar baz")
1364
1342
  */
1365
- static VALUE re2_regexp_scan(VALUE self, VALUE text) {
1343
+ static VALUE re2_regexp_scan(const VALUE self, VALUE text) {
1366
1344
  re2_pattern *p;
1367
1345
  re2_scanner *c;
1368
- VALUE scanner;
1369
1346
 
1370
1347
  Data_Get_Struct(self, re2_pattern, p);
1371
- scanner = rb_class_new_instance(0, 0, re2_cScanner);
1348
+ VALUE scanner = rb_class_new_instance(0, 0, re2_cScanner);
1372
1349
  Data_Get_Struct(scanner, re2_scanner, c);
1373
1350
 
1374
- c->input = new(nothrow) re2::StringPiece(StringValuePtr(text));
1351
+ c->input = new(std::nothrow) re2::StringPiece(StringValuePtr(text));
1375
1352
  c->regexp = self;
1376
1353
  c->text = text;
1377
1354
 
@@ -1390,6 +1367,10 @@ static VALUE re2_regexp_scan(VALUE self, VALUE text) {
1390
1367
  * Returns a copy of +str+ with the first occurrence of +pattern+
1391
1368
  * replaced with +rewrite+.
1392
1369
  *
1370
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1371
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1372
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1373
+ *
1393
1374
  * @param [String] str the string to modify
1394
1375
  * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
1395
1376
  * @param [String] rewrite the string to replace with
@@ -1404,29 +1385,33 @@ static VALUE re2_Replace(VALUE self, VALUE str, VALUE pattern,
1404
1385
  UNUSED(self);
1405
1386
  re2_pattern *p;
1406
1387
 
1407
- /* Convert all the inputs to be pumped into RE2::Replace. */
1408
- string str_as_string(StringValuePtr(str));
1388
+ /* Take a copy of str so it can be modified in-place by
1389
+ * RE2::Replace.
1390
+ */
1391
+ std::string str_as_string(StringValuePtr(str));
1409
1392
 
1410
1393
  /* Do the replacement. */
1411
1394
  if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1412
1395
  Data_Get_Struct(pattern, re2_pattern, p);
1413
1396
  RE2::Replace(&str_as_string, *p->pattern, StringValuePtr(rewrite));
1414
1397
 
1415
- return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(),
1416
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
1398
+ return encoded_str_new(str_as_string.data(), str_as_string.size(),
1399
+ p->pattern->options().encoding());
1417
1400
  } else {
1418
1401
  RE2::Replace(&str_as_string, StringValuePtr(pattern),
1419
1402
  StringValuePtr(rewrite));
1420
1403
 
1421
- return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(),
1422
- pattern);
1404
+ return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
1423
1405
  }
1424
-
1425
1406
  }
1426
1407
 
1427
1408
  /*
1428
1409
  * Return a copy of +str+ with +pattern+ replaced by +rewrite+.
1429
1410
  *
1411
+ * Note RE2 only supports UTF-8 and ISO-8859-1 encoding so strings will be
1412
+ * returned in UTF-8 by default or ISO-8859-1 if the :utf8 option for the
1413
+ * RE2::Regexp is set to false (any other encoding's behaviour is undefined).
1414
+ *
1430
1415
  * @param [String] str the string to modify
1431
1416
  * @param [String, RE2::Regexp] pattern a regexp matching text to be replaced
1432
1417
  * @param [String] rewrite the string to replace with
@@ -1440,23 +1425,24 @@ static VALUE re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern,
1440
1425
  VALUE rewrite) {
1441
1426
  UNUSED(self);
1442
1427
 
1443
- /* Convert all the inputs to be pumped into RE2::GlobalReplace. */
1428
+ /* Take a copy of str so it can be modified in-place by
1429
+ * RE2::GlobalReplace.
1430
+ */
1444
1431
  re2_pattern *p;
1445
- string str_as_string(StringValuePtr(str));
1432
+ std::string str_as_string(StringValuePtr(str));
1446
1433
 
1447
1434
  /* Do the replacement. */
1448
1435
  if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
1449
1436
  Data_Get_Struct(pattern, re2_pattern, p);
1450
1437
  RE2::GlobalReplace(&str_as_string, *p->pattern, StringValuePtr(rewrite));
1451
1438
 
1452
- return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(),
1453
- p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
1439
+ return encoded_str_new(str_as_string.data(), str_as_string.size(),
1440
+ p->pattern->options().encoding());
1454
1441
  } else {
1455
1442
  RE2::GlobalReplace(&str_as_string, StringValuePtr(pattern),
1456
1443
  StringValuePtr(rewrite));
1457
1444
 
1458
- return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(),
1459
- pattern);
1445
+ return encoded_str_new(str_as_string.data(), str_as_string.size(), RE2::Options::EncodingUTF8);
1460
1446
  }
1461
1447
  }
1462
1448
 
@@ -1472,11 +1458,12 @@ static VALUE re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern,
1472
1458
  */
1473
1459
  static VALUE re2_QuoteMeta(VALUE self, VALUE unquoted) {
1474
1460
  UNUSED(self);
1475
- string quoted_string = RE2::QuoteMeta(StringValuePtr(unquoted));
1461
+ std::string quoted_string = RE2::QuoteMeta(StringValuePtr(unquoted));
1462
+
1476
1463
  return rb_str_new(quoted_string.data(), quoted_string.size());
1477
1464
  }
1478
1465
 
1479
- void re2_set_free(re2_set *self) {
1466
+ static void re2_set_free(re2_set *self) {
1480
1467
  if (self->set) {
1481
1468
  delete self->set;
1482
1469
  }
@@ -1486,6 +1473,7 @@ void re2_set_free(re2_set *self) {
1486
1473
  static VALUE re2_set_allocate(VALUE klass) {
1487
1474
  re2_set *s;
1488
1475
  VALUE result = Data_Make_Struct(klass, re2_set, 0, re2_set_free, s);
1476
+
1489
1477
  return result;
1490
1478
  }
1491
1479
 
@@ -1533,18 +1521,16 @@ static VALUE re2_set_allocate(VALUE klass) {
1533
1521
  static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1534
1522
  VALUE anchor, options;
1535
1523
  re2_set *s;
1536
- RE2::Anchor re2_anchor;
1524
+ RE2::Anchor re2_anchor = RE2::UNANCHORED;
1537
1525
  RE2::Options re2_options;
1538
1526
 
1539
1527
  rb_scan_args(argc, argv, "02", &anchor, &options);
1540
1528
  Data_Get_Struct(self, re2_set, s);
1541
1529
 
1542
1530
  if (RTEST(options)) {
1543
- parse_re2_options(re2_options, options);
1531
+ parse_re2_options(&re2_options, options);
1544
1532
  }
1545
- if (NIL_P(anchor)) {
1546
- re2_anchor = RE2::UNANCHORED;
1547
- } else {
1533
+ if (!NIL_P(anchor)) {
1548
1534
  Check_Type(anchor, T_SYMBOL);
1549
1535
  ID id_anchor = SYM2ID(anchor);
1550
1536
  if (id_anchor == id_unanchored) {
@@ -1558,7 +1544,7 @@ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1558
1544
  }
1559
1545
  }
1560
1546
 
1561
- s->set = new(nothrow) RE2::Set(re2_options, re2_anchor);
1547
+ s->set = new(std::nothrow) RE2::Set(re2_options, re2_anchor);
1562
1548
  if (s->set == 0) {
1563
1549
  rb_raise(rb_eNoMemError, "not enough memory to allocate RE2::Set object");
1564
1550
  }
@@ -1579,11 +1565,12 @@ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
1579
1565
  * set.add("def") #=> 1
1580
1566
  */
1581
1567
  static VALUE re2_set_add(VALUE self, VALUE pattern) {
1582
- Check_Type(pattern, T_STRING);
1568
+ StringValue(pattern);
1583
1569
  re2::StringPiece regex(RSTRING_PTR(pattern), RSTRING_LEN(pattern));
1584
1570
  std::string err;
1585
1571
  re2_set *s;
1586
1572
  Data_Get_Struct(self, re2_set, s);
1573
+
1587
1574
  int index = s->set->Add(regex, &err);
1588
1575
  if (index < 0) {
1589
1576
  rb_raise(rb_eArgError, "str rejected by RE2::Set->Add(): %s", err.c_str());
@@ -1665,25 +1652,27 @@ static VALUE re2_set_match_raises_errors_p(VALUE self) {
1665
1652
  * set.compile
1666
1653
  * set.match("abcdef", :exception => true) # => [0, 1]
1667
1654
  */
1668
- static VALUE re2_set_match(int argc, VALUE *argv, VALUE self) {
1669
- VALUE str, options, exception_option;
1655
+ static VALUE re2_set_match(int argc, VALUE *argv, const VALUE self) {
1656
+ VALUE str, options;
1670
1657
  bool raise_exception = true;
1671
1658
  rb_scan_args(argc, argv, "11", &str, &options);
1672
- Check_Type(str, T_STRING);
1659
+
1660
+ StringValue(str);
1673
1661
  re2::StringPiece data(RSTRING_PTR(str), RSTRING_LEN(str));
1674
- std::vector<int> v;
1675
1662
  re2_set *s;
1676
1663
  Data_Get_Struct(self, re2_set, s);
1677
1664
 
1678
1665
  if (RTEST(options)) {
1679
1666
  Check_Type(options, T_HASH);
1680
1667
 
1681
- exception_option = rb_hash_aref(options, ID2SYM(id_exception));
1668
+ VALUE exception_option = rb_hash_aref(options, ID2SYM(id_exception));
1682
1669
  if (!NIL_P(exception_option)) {
1683
1670
  raise_exception = RTEST(exception_option);
1684
1671
  }
1685
1672
  }
1686
1673
 
1674
+ std::vector<int> v;
1675
+
1687
1676
  if (raise_exception) {
1688
1677
  #ifdef HAVE_ERROR_INFO_ARGUMENT
1689
1678
  RE2::Set::ErrorInfo e;
@@ -1704,7 +1693,7 @@ static VALUE re2_set_match(int argc, VALUE *argv, VALUE self) {
1704
1693
  rb_raise(re2_eSetMatchError, "Unknown RE2::Set::ErrorKind: %d", e.kind);
1705
1694
  }
1706
1695
  } else {
1707
- for (size_t i = 0; i < v.size(); i++) {
1696
+ for (std::vector<int>::size_type i = 0; i < v.size(); ++i) {
1708
1697
  rb_ary_push(result, INT2FIX(v[i]));
1709
1698
  }
1710
1699
  }
@@ -1718,7 +1707,7 @@ static VALUE re2_set_match(int argc, VALUE *argv, VALUE self) {
1718
1707
  VALUE result = rb_ary_new2(v.size());
1719
1708
 
1720
1709
  if (matched) {
1721
- for (size_t i = 0; i < v.size(); i++) {
1710
+ for (std::vector<int>::size_type i = 0; i < v.size(); ++i) {
1722
1711
  rb_ary_push(result, INT2FIX(v[i]));
1723
1712
  }
1724
1713
  }
@@ -1868,7 +1857,7 @@ void Init_re2(void) {
1868
1857
  rb_define_singleton_method(re2_cRegexp, "compile",
1869
1858
  RUBY_METHOD_FUNC(rb_class_new_instance), -1);
1870
1859
 
1871
- rb_define_global_function("RE2", RUBY_METHOD_FUNC(re2_re2), -1);
1860
+ rb_define_module_function(rb_mKernel, "RE2", RUBY_METHOD_FUNC(re2_re2), -1);
1872
1861
 
1873
1862
  /* Create the symbols used in options. */
1874
1863
  id_utf8 = rb_intern("utf8");