re2 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ Copyright (c) 2010, Paul Mucur.
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright notice, this
8
+ list of conditions and the following disclaimer.
9
+
10
+ * Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+
14
+ * Neither the name of Paul Mucur, nor the names of its contributors may be
15
+ used to endorse or promote products derived from this software without
16
+ specific prior written permission.
17
+
18
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
+
@@ -0,0 +1,70 @@
1
+ re2
2
+ ===
3
+
4
+ A Ruby binding to [re2][], an "efficient, principled regular expression library".
5
+
6
+ Installation
7
+ ------------
8
+
9
+ You will need [re2][] installed in its default location of /usr/local as well as a C++ compiler such as [gcc][] (on Debian and Ubuntu, this is provided by the [build-essential][] package).
10
+
11
+ If you are using a packaged Ruby distribution, make sure you also have the Ruby header files installed such as those provided by the [ruby-dev][] package on Debian and Ubuntu.
12
+
13
+ You can then install the library via RubyGems: `gem install re2`
14
+
15
+ Usage
16
+ -----
17
+
18
+ You can use re2 as a mostly drop-in replacement for Ruby's own [Regexp][] class:
19
+
20
+ $ irb -rubygems
21
+ > require 're2'
22
+ > r = RE2.compile('w(\d)(\d+)')
23
+ => /w(\d)(\d+)/
24
+ > r.match("w1234")
25
+ => ["w1234", "1", "234"]
26
+ > r =~ "w1234"
27
+ => true
28
+ > r !~ "bob"
29
+ => true
30
+ > r.match("bob")
31
+ => nil
32
+
33
+ Features
34
+ --------
35
+
36
+ * Pre-compiling regular expressions with [`RE2.new(re)`](http://code.google.com/p/re2/source/browse/re2/re2.h#96), `RE2.compile(re)` or `RE2(re)` (including specifying options, e.g. `RE2.new("pattern", :case_sensitive => false)`)
37
+
38
+ * Extracting matches with `re2.match(text)` (and an exact number of matches with `re2.match(text, number_of_matches)` such as `re2.match("123-234", 2)`)
39
+
40
+ * Checking for matches with `re2 =~ text`, `re2 === text` (for use in `case` statements) and `re2 !~ text`
41
+
42
+ * Checking regular expression compilation with `re2.ok?`, `re2.error` and `re2.error_arg`
43
+
44
+ * Checking regular expression "cost" with `re2.program_size`
45
+
46
+ * Checking the options for an expression with `re2.options` or individually with `re2.case_sensitive?`
47
+
48
+ * Performing full matches with [`RE2::FullMatch(text, re)`](http://code.google.com/p/re2/source/browse/re2/re2.h#30)
49
+
50
+ * Performing partial matches with [`RE2::PartialMatch(text, re)`](http://code.google.com/p/re2/source/browse/re2/re2.h#82)
51
+
52
+ * Performing in-place replacement with [`RE2::Replace(str, pattern, replace)`](http://code.google.com/p/re2/source/browse/re2/re2.h#335)
53
+
54
+ * Performing in-place global replacement with [`RE2::GlobalReplace(str, pattern, replace)`](http://code.google.com/p/re2/source/browse/re2/re2.h#352)
55
+
56
+ * Escaping regular expressions with [`RE2::QuoteMeta(unquoted)`](http://code.google.com/p/re2/source/browse/re2/re2.h#377), `RE2.escape(unquoted)` or `RE2.quote(unquoted)`
57
+
58
+ re2.cc should be well-documented so feel free to consult this file to see what can currently be used.
59
+
60
+ Contact
61
+ -------
62
+
63
+ All feedback should go to the mailing list: ruby.re2@librelist.com
64
+
65
+ [re2]: http://code.google.com/p/re2/
66
+ [gcc]: http://gcc.gnu.org/
67
+ [ruby-dev]: http://packages.debian.org/ruby-dev
68
+ [build-essential]: http://packages.debian.org/build-essential
69
+ [Regexp]: http://ruby-doc.org/core/classes/Regexp.html
70
+
@@ -0,0 +1,19 @@
1
+ begin
2
+ require 'rake/extensiontask'
3
+ require 'rake/testtask'
4
+ rescue LoadError
5
+ require 'rubygems'
6
+ require 'rake/extensiontask'
7
+ require 'rake/testtask'
8
+ end
9
+
10
+ Rake::ExtensionTask.new('re2')
11
+
12
+ Rake::TestTask.new do |t|
13
+ t.test_files = FileList["test/*_test.rb"]
14
+ t.verbose = true
15
+ end
16
+
17
+ task :test => :compile
18
+ task :default => :test
19
+
@@ -0,0 +1,18 @@
1
+ # re2 (http://github.com/mudge/re2)
2
+ # Ruby bindings to re2, an "efficient, principled regular expression library"
3
+ #
4
+ # Copyright (c) 2010, Paul Mucur (http://mucur.name)
5
+ # Released under the BSD Licence, please see LICENSE.txt
6
+
7
+ require 'mkmf'
8
+
9
+ incl, lib = dir_config("re2", "/usr/local/include", "/usr/local/lib")
10
+
11
+ $CFLAGS << " -Wall -Wextra -funroll-loops"
12
+
13
+ have_library("stdc++")
14
+ if have_library("re2")
15
+ create_makefile("re2")
16
+ else
17
+ abort "You must have re2 installed and specified with --with-re2-dir, please see http://code.google.com/p/re2/wiki/Install"
18
+ end
@@ -0,0 +1,1026 @@
1
+ /*
2
+ * re2 (http://github.com/mudge/re2)
3
+ * Ruby bindings to re2, an "efficient, principled regular expression library"
4
+ *
5
+ * Copyright (c) 2010, Paul Mucur (http://mucur.name)
6
+ * Released under the BSD Licence, please see LICENSE.txt
7
+ */
8
+
9
+ #include <re2/re2.h>
10
+
11
+ extern "C" {
12
+
13
+ #include <ruby.h>
14
+
15
+ #define BOOL2RUBY(v) (v ? Qtrue : Qfalse)
16
+ #define UNUSED(x) ((void)x)
17
+
18
+ #if !defined(RSTRING_LEN)
19
+ # define RSTRING_LEN(x) (RSTRING(x)->len)
20
+ #endif
21
+
22
+ typedef struct _re2p { /* payload wrapped inside each Ruby RE2 object */
23
+ RE2 *pattern; /* NULL from re2_allocate until re2_initialize assigns it */
24
+ } re2_pattern;
25
+
26
+ VALUE re2_cRE2;
27
+
28
+ /* Symbols used in RE2 options. */
29
+ static ID id_utf8, id_posix_syntax, id_longest_match, id_log_errors,
30
+ id_max_mem, id_literal, id_never_nl, id_case_sensitive,
31
+ id_perl_classes, id_word_boundary, id_one_line;
32
+
33
+ void
34
+ re2_free(re2_pattern* self) /* free function handed to Data_Wrap_Struct */
35
+ {
36
+ delete self->pattern; /* also release the compiled RE2; deleting NULL is a no-op */
37
+ free(self); /* the struct itself came from malloc in re2_allocate */
38
+ }
39
+ static VALUE
40
+ re2_allocate(VALUE klass) /* wraps a fresh, empty re2_pattern in a klass object */
41
+ {
42
+ re2_pattern *wrapper = static_cast<re2_pattern*>(malloc(sizeof(re2_pattern)));
43
+ wrapper->pattern = NULL; /* nothing compiled yet */
44
+ return Data_Wrap_Struct(klass, 0, re2_free, wrapper);
45
+ }
46
+
47
+ /*
48
+ * call-seq:
49
+ * RE2(pattern) -> re2
50
+ * RE2(pattern, options) -> re2
51
+ *
52
+ * Shorthand constructor: passes every argument on to
53
+ * +RE2.new+ and returns the resulting RE2 object.
54
+ */
55
+ static VALUE
56
+ re2_re2(int argc, VALUE *argv, VALUE self)
57
+ {
58
+ (void)self; /* the receiver plays no part here */
59
+ return rb_class_new_instance(argc, argv, re2_cRE2);
60
+ }
61
+
62
+ /*
63
+ * call-seq:
64
+ * RE2.new(pattern) -> re2
65
+ * RE2.new(pattern, options) -> re2
66
+ * RE2.compile(pattern) -> re2
67
+ * RE2.compile(pattern, options) -> re2
68
+ *
69
+ * Returns a new RE2 object with a compiled version of +pattern+
70
+ * stored inside. Raises ArgumentError if +options+ is not a hash.
71
+ *
72
+ * Options can be a hash with the following keys:
73
+ *
74
+ * :utf8 - text and pattern are UTF-8; otherwise
75
+ * Latin-1 (default true)
76
+ *
77
+ * :posix_syntax - restrict regexps to POSIX egrep syntax
78
+ * (default false)
79
+ *
80
+ * :longest_match - search for longest match, not first match
81
+ * (default false)
82
+ *
83
+ * :log_errors - log syntax and execution errors to ERROR
84
+ * (default true)
85
+ *
86
+ * :max_mem - approx. max memory footprint of RE2
87
+ *
88
+ * :literal - interpret string as literal, not regexp
89
+ * (default false)
90
+ *
91
+ * :never_nl - never match \n, even if it is in regexp
92
+ * (default false)
93
+ *
94
+ * :case_sensitive - match is case-sensitive (regexp can override
95
+ * with (?i) unless in posix_syntax mode)
96
+ * (default true)
97
+ *
98
+ * :perl_classes - allow Perl's \d \s \w \D \S \W when in
99
+ * posix_syntax mode (default false)
100
+ *
101
+ * :word_boundary - allow \b \B (word boundary and not) when
102
+ * in posix_syntax mode (default false)
103
+ *
104
+ * :one_line - ^ and $ only match beginning and end of text
105
+ * when in posix_syntax mode (default false)
106
+ */
107
+ static VALUE
108
+ re2_initialize(int argc, VALUE *argv, VALUE self)
109
+ {
110
+ VALUE pattern, options, utf8, posix_syntax, longest_match, log_errors,
111
+ max_mem, literal, never_nl, case_sensitive, perl_classes,
112
+ word_boundary, one_line;
113
+ re2_pattern *p;
114
+ RE2::Options re2_options; /* on the stack: RE2 keeps its own copy, so nothing leaks */
115
+
116
+ rb_scan_args(argc, argv, "11", &pattern, &options);
117
+ Data_Get_Struct(self, re2_pattern, p);
118
+
119
+ if (RTEST(options)) {
120
+ if (TYPE(options) != T_HASH) {
121
+ rb_raise(rb_eArgError, "options should be a hash");
122
+ }
123
+
124
+ /* Only keys present in the hash are applied; a missing key (nil)
125
+ * leaves the default-constructed value of that option untouched. */
126
+ utf8 = rb_hash_aref(options, ID2SYM(id_utf8));
127
+ if (!NIL_P(utf8)) {
128
+ re2_options.set_utf8(RTEST(utf8));
129
+ }
130
+
131
+ posix_syntax = rb_hash_aref(options, ID2SYM(id_posix_syntax));
132
+ if (!NIL_P(posix_syntax)) {
133
+ re2_options.set_posix_syntax(RTEST(posix_syntax));
134
+ }
135
+
136
+ longest_match = rb_hash_aref(options, ID2SYM(id_longest_match));
137
+ if (!NIL_P(longest_match)) {
138
+ re2_options.set_longest_match(RTEST(longest_match));
139
+ }
140
+
141
+ log_errors = rb_hash_aref(options, ID2SYM(id_log_errors));
142
+ if (!NIL_P(log_errors)) {
143
+ re2_options.set_log_errors(RTEST(log_errors));
144
+ }
145
+
146
+ max_mem = rb_hash_aref(options, ID2SYM(id_max_mem));
147
+ if (!NIL_P(max_mem)) {
148
+ re2_options.set_max_mem(NUM2INT(max_mem));
149
+ }
150
+
151
+ literal = rb_hash_aref(options, ID2SYM(id_literal));
152
+ if (!NIL_P(literal)) {
153
+ re2_options.set_literal(RTEST(literal));
154
+ }
155
+
156
+ never_nl = rb_hash_aref(options, ID2SYM(id_never_nl));
157
+ if (!NIL_P(never_nl)) {
158
+ re2_options.set_never_nl(RTEST(never_nl));
159
+ }
160
+
161
+ case_sensitive = rb_hash_aref(options, ID2SYM(id_case_sensitive));
162
+ if (!NIL_P(case_sensitive)) {
163
+ re2_options.set_case_sensitive(RTEST(case_sensitive));
164
+ }
165
+
166
+ perl_classes = rb_hash_aref(options, ID2SYM(id_perl_classes));
167
+ if (!NIL_P(perl_classes)) {
168
+ re2_options.set_perl_classes(RTEST(perl_classes));
169
+ }
170
+
171
+ word_boundary = rb_hash_aref(options, ID2SYM(id_word_boundary));
172
+ if (!NIL_P(word_boundary)) {
173
+ re2_options.set_word_boundary(RTEST(word_boundary));
174
+ }
175
+
176
+ one_line = rb_hash_aref(options, ID2SYM(id_one_line));
177
+ if (!NIL_P(one_line)) {
178
+ re2_options.set_one_line(RTEST(one_line));
179
+ }
180
+
181
+ p->pattern = new RE2(StringValuePtr(pattern), re2_options);
182
+ } else {
183
+ p->pattern = new RE2(StringValuePtr(pattern));
184
+ }
185
+
186
+ return self;
187
+ }
188
+
189
+ /*
190
+ * call-seq:
191
+ * re2.inspect -> string
192
+ *
193
+ * Returns the pattern of +re2+ wrapped in forward slashes,
194
+ * in the style of a regular expression literal.
195
+ *
196
+ * re2 = RE2.new("woo?")
197
+ * re2.inspect #=> "/woo?/"
198
+ */
199
+ static VALUE
200
+ re2_inspect(VALUE self)
201
+ {
202
+ re2_pattern *wrapper;
203
+ VALUE printable;
204
+
205
+ Data_Get_Struct(self, re2_pattern, wrapper);
206
+ printable = rb_str_buf_new(0);
207
+ rb_str_buf_cat2(printable, "/");
208
+ rb_str_buf_cat2(printable, wrapper->pattern->pattern().c_str());
209
+ rb_str_buf_cat2(printable, "/");
210
+ return printable;
211
+ }
212
+
213
+ /*
214
+ * call-seq:
215
+ * re2.to_s -> string
216
+ * re2.to_str -> string
217
+ * re2.pattern -> string
218
+ * re2.source -> string
219
+ *
220
+ * Returns the pattern of the regular expression +re2+ as a
221
+ * plain string, with no surrounding slashes.
222
+ *
223
+ * re2 = RE2.new("woo?")
224
+ * re2.to_s #=> "woo?"
225
+ */
226
+ static VALUE
227
+ re2_to_s(VALUE self)
228
+ {
229
+ re2_pattern *wrapper;
230
+ Data_Get_Struct(self, re2_pattern, wrapper);
231
+ return rb_str_new2(wrapper->pattern->pattern().c_str());
232
+ }
233
+
234
+ /*
235
+ * call-seq:
236
+ * re2.ok? -> true or false
237
+ *
238
+ * Reports whether the regular expression +re2+ was
239
+ * compiled successfully.
240
+ *
241
+ * re2 = RE2.new("woo?")
242
+ * re2.ok? #=> true
243
+ */
244
+ static VALUE
245
+ re2_ok(VALUE self)
246
+ {
247
+ re2_pattern *wrapper;
248
+ Data_Get_Struct(self, re2_pattern, wrapper);
249
+ return wrapper->pattern->ok() ? Qtrue : Qfalse;
250
+ }
251
+
252
+ /*
253
+ * call-seq:
254
+ * re2.utf8? -> true or false
255
+ *
256
+ * Reports the value of the utf8 option that the regular
257
+ * expression +re2+ was compiled with.
258
+ *
259
+ * re2 = RE2.new("woo?", :utf8 => true)
260
+ * re2.utf8? #=> true
261
+ */
262
+ static VALUE
263
+ re2_utf8(VALUE self)
264
+ {
265
+ re2_pattern *wrapper;
266
+ Data_Get_Struct(self, re2_pattern, wrapper);
267
+ return wrapper->pattern->options().utf8() ? Qtrue : Qfalse;
268
+ }
269
+
270
+ /*
271
+ * call-seq:
272
+ * re2.posix_syntax? -> true or false
273
+ *
274
+ * Reports the value of the posix_syntax option that the regular
275
+ * expression +re2+ was compiled with.
276
+ *
277
+ * re2 = RE2.new("woo?", :posix_syntax => true)
278
+ * re2.posix_syntax? #=> true
279
+ */
280
+ static VALUE
281
+ re2_posix_syntax(VALUE self)
282
+ {
283
+ re2_pattern *wrapper;
284
+ Data_Get_Struct(self, re2_pattern, wrapper);
285
+ return wrapper->pattern->options().posix_syntax() ? Qtrue : Qfalse;
286
+ }
287
+
288
+ /*
289
+ * call-seq:
290
+ * re2.longest_match? -> true or false
291
+ *
292
+ * Reports the value of the longest_match option that the regular
293
+ * expression +re2+ was compiled with.
294
+ *
295
+ * re2 = RE2.new("woo?", :longest_match => true)
296
+ * re2.longest_match? #=> true
297
+ */
298
+ static VALUE
299
+ re2_longest_match(VALUE self)
300
+ {
301
+ re2_pattern *wrapper;
302
+ Data_Get_Struct(self, re2_pattern, wrapper);
303
+ return wrapper->pattern->options().longest_match() ? Qtrue : Qfalse;
304
+ }
305
+
306
+ /*
307
+ * call-seq:
308
+ * re2.log_errors? -> true or false
309
+ *
310
+ * Reports the value of the log_errors option that the regular
311
+ * expression +re2+ was compiled with.
312
+ *
313
+ * re2 = RE2.new("woo?", :log_errors => true)
314
+ * re2.log_errors? #=> true
315
+ */
316
+ static VALUE
317
+ re2_log_errors(VALUE self)
318
+ {
319
+ re2_pattern *wrapper;
320
+ Data_Get_Struct(self, re2_pattern, wrapper);
321
+ return wrapper->pattern->options().log_errors() ? Qtrue : Qfalse;
322
+ }
323
+
324
+ /*
325
+ * call-seq:
326
+ * re2.max_mem -> int
327
+ *
328
+ * Returns the max_mem setting for the regular expression
329
+ * +re2+ as an Integer.
330
+ *
331
+ * re2 = RE2.new("woo?", :max_mem => 1024)
332
+ * re2.max_mem #=> 1024
333
+ */
334
+ static VALUE
335
+ re2_max_mem(VALUE self)
336
+ {
337
+ re2_pattern *p;
338
+ Data_Get_Struct(self, re2_pattern, p);
339
+ return INT2NUM(p->pattern->options().max_mem()); /* INT2FIX is only safe within the Fixnum range */
340
+ }
341
+
342
+ /*
343
+ * call-seq:
344
+ * re2.literal? -> true or false
345
+ *
346
+ * Reports the value of the literal option that the regular
347
+ * expression +re2+ was compiled with.
348
+ *
349
+ * re2 = RE2.new("woo?", :literal => true)
350
+ * re2.literal? #=> true
351
+ */
352
+ static VALUE
353
+ re2_literal(VALUE self)
354
+ {
355
+ re2_pattern *wrapper;
356
+ Data_Get_Struct(self, re2_pattern, wrapper);
357
+ return wrapper->pattern->options().literal() ? Qtrue : Qfalse;
358
+ }
359
+
360
+ /*
361
+ * call-seq:
362
+ * re2.never_nl? -> true or false
363
+ *
364
+ * Reports the value of the never_nl option that the regular
365
+ * expression +re2+ was compiled with.
366
+ *
367
+ * re2 = RE2.new("woo?", :never_nl => true)
368
+ * re2.never_nl? #=> true
369
+ */
370
+ static VALUE
371
+ re2_never_nl(VALUE self)
372
+ {
373
+ re2_pattern *wrapper;
374
+ Data_Get_Struct(self, re2_pattern, wrapper);
375
+ return wrapper->pattern->options().never_nl() ? Qtrue : Qfalse;
376
+ }
377
+
378
+ /*
379
+ * call-seq:
380
+ * re2.case_sensitive? -> true or false
381
+ *
382
+ * Reports the value of the case_sensitive option that the regular
383
+ * expression +re2+ was compiled with.
384
+ *
385
+ * re2 = RE2.new("woo?", :case_sensitive => true)
386
+ * re2.case_sensitive? #=> true
387
+ */
388
+ static VALUE
389
+ re2_case_sensitive(VALUE self)
390
+ {
391
+ re2_pattern *wrapper;
392
+ Data_Get_Struct(self, re2_pattern, wrapper);
393
+ return wrapper->pattern->options().case_sensitive() ? Qtrue : Qfalse;
394
+ }
395
+
396
+ /*
397
+ * call-seq:
398
+ * re2.case_insensitive? -> true or false
399
+ * re2.casefold? -> true or false
400
+ *
401
+ * The opposite of +case_sensitive?+: true when the regular
402
+ * expression +re2+ was compiled with case_sensitive set to false.
403
+ *
404
+ * re2 = RE2.new("woo?", :case_sensitive => true)
405
+ * re2.case_insensitive? #=> false
406
+ */
407
+ static VALUE
408
+ re2_case_insensitive(VALUE self)
409
+ {
410
+ return re2_case_sensitive(self) == Qtrue ? Qfalse : Qtrue;
411
+ }
412
+
413
+ /*
414
+ * call-seq:
415
+ * re2.perl_classes? -> true or false
416
+ *
417
+ * Reports the value of the perl_classes option that the regular
418
+ * expression +re2+ was compiled with.
419
+ *
420
+ * re2 = RE2.new("woo?", :perl_classes => true)
421
+ * re2.perl_classes? #=> true
422
+ */
423
+ static VALUE
424
+ re2_perl_classes(VALUE self)
425
+ {
426
+ re2_pattern *wrapper;
427
+ Data_Get_Struct(self, re2_pattern, wrapper);
428
+ return wrapper->pattern->options().perl_classes() ? Qtrue : Qfalse;
429
+ }
430
+
431
+ /*
432
+ * call-seq:
433
+ * re2.word_boundary? -> true or false
434
+ *
435
+ * Reports the value of the word_boundary option that the regular
436
+ * expression +re2+ was compiled with.
437
+ *
438
+ * re2 = RE2.new("woo?", :word_boundary => true)
439
+ * re2.word_boundary? #=> true
440
+ */
441
+ static VALUE
442
+ re2_word_boundary(VALUE self)
443
+ {
444
+ re2_pattern *wrapper;
445
+ Data_Get_Struct(self, re2_pattern, wrapper);
446
+ return wrapper->pattern->options().word_boundary() ? Qtrue : Qfalse;
447
+ }
448
+
449
+ /*
450
+ * call-seq:
451
+ * re2.one_line? -> true or false
452
+ *
453
+ * Reports the value of the one_line option that the regular
454
+ * expression +re2+ was compiled with.
455
+ *
456
+ * re2 = RE2.new("woo?", :one_line => true)
457
+ * re2.one_line? #=> true
458
+ */
459
+ static VALUE
460
+ re2_one_line(VALUE self)
461
+ {
462
+ re2_pattern *wrapper;
463
+ Data_Get_Struct(self, re2_pattern, wrapper);
464
+ return wrapper->pattern->options().one_line() ? Qtrue : Qfalse;
465
+ }
466
+
467
+ /*
468
+ * call-seq:
469
+ * re2.error -> error_str
470
+ *
471
+ * Returns, as a string, the error RE2 recorded for +re2+
472
+ * if it could not be created properly.
473
+ */
474
+ static VALUE
475
+ re2_error(VALUE self)
476
+ {
477
+ re2_pattern *wrapper;
478
+ Data_Get_Struct(self, re2_pattern, wrapper);
479
+ return rb_str_new2(wrapper->pattern->error().c_str());
480
+ }
481
+
482
+ /*
483
+ * call-seq:
484
+ * re2.error_arg -> error_str
485
+ *
486
+ * Returns, as a string, the offending portion of the regexp
487
+ * if +re2+ could not be created properly.
488
+ */
489
+ static VALUE
490
+ re2_error_arg(VALUE self)
491
+ {
492
+ re2_pattern *wrapper;
493
+ Data_Get_Struct(self, re2_pattern, wrapper);
494
+ return rb_str_new2(wrapper->pattern->error_arg().c_str());
495
+ }
496
+
497
+ /*
498
+ * call-seq:
499
+ * re2.program_size -> size
500
+ *
501
+ * Returns the program size of +re2+, a very approximate
502
+ * measure of a regexp's "cost": the larger the number, the
503
+ * more expensive the regexp.
504
+ */
505
+ static VALUE
506
+ re2_program_size(VALUE self)
507
+ {
508
+ re2_pattern *wrapper;
509
+ Data_Get_Struct(self, re2_pattern, wrapper);
510
+ return INT2FIX(wrapper->pattern->ProgramSize());
511
+ }
512
+
513
+ /*
514
+ * call-seq:
515
+ * re2.options -> options_hash
516
+ *
517
+ * Returns a frozen hash, keyed by option symbol, of the
518
+ * options currently set for +re2+.
519
+ */
520
+ static VALUE
521
+ re2_options(VALUE self)
522
+ {
523
+ VALUE options;
524
+ re2_pattern *p;
525
+
526
+ Data_Get_Struct(self, re2_pattern, p);
527
+ options = rb_hash_new();
528
+
529
+ rb_hash_aset(options, ID2SYM(id_utf8),
530
+ BOOL2RUBY(p->pattern->options().utf8()));
531
+
532
+ rb_hash_aset(options, ID2SYM(id_posix_syntax),
533
+ BOOL2RUBY(p->pattern->options().posix_syntax()));
534
+
535
+ rb_hash_aset(options, ID2SYM(id_longest_match),
536
+ BOOL2RUBY(p->pattern->options().longest_match()));
537
+
538
+ rb_hash_aset(options, ID2SYM(id_log_errors),
539
+ BOOL2RUBY(p->pattern->options().log_errors()));
540
+
541
+ rb_hash_aset(options, ID2SYM(id_max_mem), /* INT2NUM: may not fit a Fixnum */
542
+ INT2NUM(p->pattern->options().max_mem()));
543
+
544
+ rb_hash_aset(options, ID2SYM(id_literal),
545
+ BOOL2RUBY(p->pattern->options().literal()));
546
+
547
+ rb_hash_aset(options, ID2SYM(id_never_nl),
548
+ BOOL2RUBY(p->pattern->options().never_nl()));
549
+
550
+ rb_hash_aset(options, ID2SYM(id_case_sensitive),
551
+ BOOL2RUBY(p->pattern->options().case_sensitive()));
552
+
553
+ rb_hash_aset(options, ID2SYM(id_perl_classes),
554
+ BOOL2RUBY(p->pattern->options().perl_classes()));
555
+
556
+ rb_hash_aset(options, ID2SYM(id_word_boundary),
557
+ BOOL2RUBY(p->pattern->options().word_boundary()));
558
+
559
+ rb_hash_aset(options, ID2SYM(id_one_line),
560
+ BOOL2RUBY(p->pattern->options().one_line()));
561
+
562
+ // This is a read-only hash after all...
563
+ OBJ_FREEZE(options);
564
+
565
+ return options;
566
+ }
567
+
568
+ /*
569
+ * call-seq:
570
+ * re2.number_of_capturing_groups -> int
571
+ *
572
+ * Returns how many capturing subpatterns +re2+ has, or -1 if the
573
+ * regexp wasn't valid on construction. The overall match ($0) is
574
+ * not counted: for the regexp "(a)(b)" the result is 2.
575
+ */
576
+ static VALUE
577
+ re2_number_of_capturing_groups(VALUE self)
578
+ {
579
+ re2_pattern *wrapper;
580
+
581
+ Data_Get_Struct(self, re2_pattern, wrapper);
582
+ return INT2FIX(wrapper->pattern->NumberOfCapturingGroups());
583
+ }
584
+
585
+ /*
586
+ * call-seq:
587
+ * re2.match(text) -> [match, match]
588
+ * re2.match(text, 0) -> true or false
589
+ * re2.match(text, num_of_matches) -> [match, match]
590
+ *
591
+ * Looks for the pattern in +re2+ in +text+. Without a second
592
+ * argument, returns an array of the whole match followed by every
593
+ * subpattern, or nil if there is no match. If the second argument
594
+ * is 0, a simple true or false is returned. If it is greater than
595
+ * 0, that many subpatterns are returned after the whole match
596
+ * (empty or missing ones are nil). A negative count raises
597
+ * ArgumentError.
598
+ *
599
+ * r = RE2.new('w(o)(o)')
600
+ * r.match('woo') #=> ["woo", "o", "o"]
601
+ * r.match('woo', 0) #=> true
602
+ * r.match('bob', 0) #=> false
603
+ * r.match('woo', 1) #=> ["woo", "o"]
604
+ */
605
+ static VALUE
606
+ re2_match(int argc, VALUE *argv, VALUE self)
607
+ {
608
+ int n;
609
+ bool matched;
610
+ re2_pattern *p;
611
+ VALUE text, number_of_matches, matches;
612
+ re2::StringPiece *string_matches;
613
+
614
+ rb_scan_args(argc, argv, "11", &text, &number_of_matches);
615
+ Data_Get_Struct(self, re2_pattern, p);
616
+
617
+ if (RTEST(number_of_matches)) {
618
+ n = NUM2INT(number_of_matches);
619
+ if (n < 0) {
620
+ rb_raise(rb_eArgError, "number of matches should be >= 0");
621
+ }
622
+ } else {
623
+ n = p->pattern->NumberOfCapturingGroups();
624
+ }
625
+
626
+ /* A stack object: nothing to free on any of the return paths. */
627
+ re2::StringPiece text_as_string_piece(StringValuePtr(text));
628
+
629
+ if (n == 0) {
630
+ return BOOL2RUBY(p->pattern->Match(text_as_string_piece, 0, RE2::UNANCHORED, 0, 0));
631
+ }
632
+
633
+ /* Because match returns the whole match as well. */
634
+ n += 1;
635
+ string_matches = new re2::StringPiece[n];
636
+ matched = p->pattern->Match(text_as_string_piece, 0, RE2::UNANCHORED, string_matches, n);
637
+
638
+ matches = Qnil;
639
+ if (matched) {
640
+ matches = rb_ary_new();
641
+ for (int i = 0; i < n; i++) {
642
+ if (!string_matches[i].empty()) {
643
+ rb_ary_push(matches, rb_str_new(string_matches[i].data(), string_matches[i].size()));
644
+ } else {
645
+ rb_ary_push(matches, Qnil);
646
+ }
647
+ }
648
+ }
649
+
650
+ /* The Ruby strings above are copies, so the scratch array can go. */
651
+ delete[] string_matches;
652
+ return matches;
653
+ }
654
+
655
+ /*
656
+ * call-seq:
657
+ * re2.match?(text) -> true or false
658
+ * re2 =~ text -> true or false
659
+ *
660
+ * Reports whether +re2+ matches +text+, returning only true
661
+ * or false. Equivalent to +re2.match(text, 0)+.
662
+ */
663
+ static VALUE
664
+ re2_match_query(VALUE self, VALUE text)
665
+ {
666
+ /* Build the (text, 0) argument list and let re2_match do the
667
+ * work; a count of 0 asks it for a plain boolean. */
668
+ VALUE match_args[2] = { text, INT2FIX(0) };
669
+
670
+ return re2_match(2, match_args, self);
671
+ }
672
+
673
+ /*
674
+ * call-seq:
675
+ * re2 !~ text -> true or false
676
+ *
677
+ * Reports whether +re2+ fails to match +text+.
678
+ * Equivalent to +!re2.match(text, 0)+.
679
+ */
680
+ static VALUE
681
+ re2_bang_tilde(VALUE self, VALUE text)
682
+ {
683
+ return re2_match_query(self, text) == Qtrue ? Qfalse : Qtrue;
684
+ }
685
+
686
+ /*
687
+ * call-seq:
688
+ * RE2::FullMatch(text, re) -> true or false
689
+ *
690
+ * Reports whether +re+, given either as a pattern string or
691
+ * as an RE2 object, fully matches +text+.
692
+ *
693
+ * RE2::FullMatch("woo", "wo+") #=> true
694
+ * RE2::FullMatch("woo", "a") #=> false
695
+ * re2 = RE2.new("woo")
696
+ * RE2::FullMatch("woo", re2) #=> true
697
+ */
698
+ static VALUE
699
+ re2_FullMatch(VALUE self, VALUE text, VALUE re)
700
+ {
701
+ re2_pattern *wrapper;
702
+ (void)self;
703
+
704
+ if (!RTEST(rb_obj_is_kind_of(re, re2_cRE2))) {
705
+ /* Not an RE2 object: hand it to RE2 as a pattern string. */
706
+ return BOOL2RUBY(RE2::FullMatch(StringValuePtr(text), StringValuePtr(re)));
707
+ }
708
+
709
+ /* An RE2 object: match against the pattern it wraps. */
710
+ Data_Get_Struct(re, re2_pattern, wrapper);
711
+
712
+ return BOOL2RUBY(RE2::FullMatch(StringValuePtr(text), *wrapper->pattern));
713
+ }
714
+
715
+ /*
716
+ * call-seq:
717
+ * RE2::FullMatchN(text, re) -> array of matches
718
+ *
719
+ * Returns an array of the subpatterns of +re+ captured by a full
720
+ * match of +text+ (empty captures are nil), or nil without a match.
721
+ *
722
+ * RE2::FullMatchN("woo", "w(oo)") #=> ["oo"]
723
+ */
724
+ static VALUE
725
+ re2_FullMatchN(VALUE self, VALUE text, VALUE re)
726
+ {
727
+ UNUSED(self);
728
+ int n;
729
+ bool matched, owns_pattern;
730
+ re2_pattern *p;
731
+ VALUE matches = Qnil;
732
+ RE2 *compiled_pattern;
733
+ RE2::Arg *argv;
734
+ const RE2::Arg **args;
735
+ std::string *string_matches;
736
+
737
+ owns_pattern = !RTEST(rb_obj_is_kind_of(re, re2_cRE2));
738
+ if (owns_pattern) {
739
+ compiled_pattern = new RE2(StringValuePtr(re)); /* freed before returning */
740
+ } else {
741
+ Data_Get_Struct(re, re2_pattern, p);
742
+ compiled_pattern = p->pattern;
743
+ }
744
+
745
+ n = compiled_pattern->NumberOfCapturingGroups();
746
+ if (n < 0) n = 0; /* -1 means an invalid regexp: never size an array with it */
747
+ argv = new RE2::Arg[n];
748
+ args = new const RE2::Arg*[n];
749
+ string_matches = new std::string[n];
750
+ for (int i = 0; i < n; i++) {
751
+ args[i] = &argv[i];
752
+ argv[i] = &string_matches[i];
753
+ }
754
+
755
+ matched = RE2::FullMatchN(StringValuePtr(text), *compiled_pattern, args, n);
756
+ if (matched) {
757
+ matches = rb_ary_new();
758
+ for (int i = 0; i < n; i++) {
759
+ if (!string_matches[i].empty()) {
760
+ rb_ary_push(matches, rb_str_new2(string_matches[i].c_str()));
761
+ } else {
762
+ rb_ary_push(matches, Qnil);
763
+ }
764
+ }
765
+ }
766
+
767
+ delete[] string_matches;
768
+ delete[] args;
769
+ delete[] argv;
770
+ if (owns_pattern) delete compiled_pattern;
771
+ return matches;
772
+ }
773
+
774
+ /*
775
+ * call-seq:
776
+ * RE2::PartialMatchN(text, re) -> array of matches
777
+ *
778
+ * Returns an array of the subpatterns of +re+ captured by a partial
779
+ * match in +text+ (empty captures are nil), or nil without a match.
780
+ *
781
+ * RE2::PartialMatchN("woo", "w(oo)") #=> ["oo"]
782
+ */
783
+ static VALUE
784
+ re2_PartialMatchN(VALUE self, VALUE text, VALUE re)
785
+ {
786
+ UNUSED(self);
787
+ int n;
788
+ bool matched, owns_pattern;
789
+ re2_pattern *p;
790
+ VALUE matches = Qnil;
791
+ RE2 *compiled_pattern;
792
+ RE2::Arg *argv;
793
+ const RE2::Arg **args;
794
+ std::string *string_matches;
795
+
796
+ owns_pattern = !RTEST(rb_obj_is_kind_of(re, re2_cRE2));
797
+ if (owns_pattern) {
798
+ compiled_pattern = new RE2(StringValuePtr(re)); /* freed before returning */
799
+ } else {
800
+ Data_Get_Struct(re, re2_pattern, p);
801
+ compiled_pattern = p->pattern;
802
+ }
803
+
804
+ n = compiled_pattern->NumberOfCapturingGroups();
805
+ if (n < 0) n = 0; /* -1 means an invalid regexp: never size an array with it */
806
+ argv = new RE2::Arg[n];
807
+ args = new const RE2::Arg*[n];
808
+ string_matches = new std::string[n];
809
+ for (int i = 0; i < n; i++) {
810
+ args[i] = &argv[i];
811
+ argv[i] = &string_matches[i];
812
+ }
813
+
814
+ matched = RE2::PartialMatchN(StringValuePtr(text), *compiled_pattern, args, n);
815
+ if (matched) {
816
+ matches = rb_ary_new();
817
+ for (int i = 0; i < n; i++) {
818
+ if (!string_matches[i].empty()) {
819
+ rb_ary_push(matches, rb_str_new2(string_matches[i].c_str()));
820
+ } else {
821
+ rb_ary_push(matches, Qnil);
822
+ }
823
+ }
824
+ }
825
+
826
+ delete[] string_matches;
827
+ delete[] args;
828
+ delete[] argv;
829
+ if (owns_pattern) delete compiled_pattern;
830
+ return matches;
831
+ }
832
+
833
+ /*
834
+ * call-seq:
835
+ * RE2::PartialMatch(text, re) -> true or false
836
+ *
837
+ * Reports whether +re+, given either as a pattern string or
838
+ * as an RE2 object, matches some part of +text+.
839
+ *
840
+ * RE2::PartialMatch("woo", "o+") #=> true
841
+ * RE2::PartialMatch("woo", "a") #=> false
842
+ * re2 = RE2.new("oo?")
843
+ * RE2::PartialMatch("woo", re2) #=> true
844
+ */
845
+ static VALUE
846
+ re2_PartialMatch(VALUE self, VALUE text, VALUE re)
847
+ {
848
+ re2_pattern *wrapper;
849
+ (void)self;
850
+
851
+ if (!RTEST(rb_obj_is_kind_of(re, re2_cRE2))) {
852
+ /* Not an RE2 object: hand it to RE2 as a pattern string. */
853
+ return BOOL2RUBY(RE2::PartialMatch(StringValuePtr(text), StringValuePtr(re)));
854
+ }
855
+
856
+ /* An RE2 object: match against the pattern it wraps. */
857
+ Data_Get_Struct(re, re2_pattern, wrapper);
858
+
859
+ return BOOL2RUBY(RE2::PartialMatch(StringValuePtr(text), *wrapper->pattern));
860
+ }
861
+
862
+ /*
863
+ * call-seq:
864
+ * RE2::Replace(str, pattern, rewrite) -> str
865
+ *
866
+ * Substitutes +rewrite+ for the first occurrence of +pattern+
867
+ * in +str+, modifying +str+ <i>in place</i> and returning it.
868
+ *
869
+ * RE2::Replace("hello there", "hello", "howdy") #=> "howdy there"
870
+ * re2 = RE2.new("hel+o")
871
+ * RE2::Replace("hello there", re2, "yo") #=> "yo there"
872
+ * text = "Good morning"
873
+ * RE2::Replace(text, "morn", "even") #=> "Good evening"
874
+ * text #=> "Good evening"
875
+ */
876
+ static VALUE
877
+ re2_Replace(VALUE self, VALUE str, VALUE pattern, VALUE rewrite)
878
+ {
879
+ re2_pattern *wrapper;
880
+ VALUE rewritten;
881
+ (void)self;
882
+
883
+ /* RE2 rewrites a std::string copy of the text, not the Ruby string. */
884
+ std::string buffer(StringValuePtr(str));
885
+ re2::StringPiece replacement(StringValuePtr(rewrite));
886
+
887
+ if (RTEST(rb_obj_is_kind_of(pattern, re2_cRE2))) {
888
+ Data_Get_Struct(pattern, re2_pattern, wrapper);
889
+ RE2::Replace(&buffer, *wrapper->pattern, replacement);
890
+ } else {
891
+ /* Anything else is handed to RE2 as a pattern string. */
892
+ RE2::Replace(&buffer, StringValuePtr(pattern), replacement);
893
+ }
894
+
895
+ /* Turn the edited copy back into a Ruby string... */
896
+ rewritten = rb_str_new(buffer.data(), buffer.length());
897
+
898
+ /* ...and overwrite all of str with it, so the change is in place. */
899
+ rb_str_update(str, 0, RSTRING_LEN(str), rewritten);
900
+
901
+ return str;
902
+ }
903
+
904
+ /*
905
+ * call-seq:
906
+ * RE2::GlobalReplace(str, pattern, rewrite) -> str
907
+ *
908
+ * Substitutes +rewrite+ for every occurrence of +pattern+
909
+ * in +str+, modifying +str+ <i>in place</i> and returning it.
910
+ *
911
+ * RE2::GlobalReplace("hello there", "e", "i") #=> "hillo thiri"
912
+ * re2 = RE2.new("oo?")
913
+ * RE2::GlobalReplace("whoops-doops", re2, "e") #=> "wheps-deps"
914
+ * text = "Good morning"
915
+ * RE2::GlobalReplace(text, "o", "ee") #=> "Geeeed meerning"
916
+ * text #=> "Geeeed meerning"
917
+ */
918
+ static VALUE
919
+ re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern, VALUE rewrite)
920
+ {
921
+ re2_pattern *wrapper;
922
+ VALUE rewritten;
923
+ (void)self;
924
+
925
+ /* RE2 rewrites a std::string copy of the text, not the Ruby string. */
926
+ std::string buffer(StringValuePtr(str));
927
+ re2::StringPiece replacement(StringValuePtr(rewrite));
928
+
929
+ if (RTEST(rb_obj_is_kind_of(pattern, re2_cRE2))) {
930
+ Data_Get_Struct(pattern, re2_pattern, wrapper);
931
+ RE2::GlobalReplace(&buffer, *wrapper->pattern, replacement);
932
+ } else {
933
+ /* Anything else is handed to RE2 as a pattern string. */
934
+ RE2::GlobalReplace(&buffer, StringValuePtr(pattern), replacement);
935
+ }
936
+
937
+ /* Turn the edited copy back into a Ruby string... */
938
+ rewritten = rb_str_new(buffer.data(), buffer.length());
939
+
940
+ /* ...and overwrite all of str with it, so the change is in place. */
941
+ rb_str_update(str, 0, RSTRING_LEN(str), rewritten);
942
+
943
+ return str;
944
+ }
945
+
946
+ /*
947
+ * call-seq:
948
+ * RE2::QuoteMeta(str) -> str
949
+ * RE2.escape(str) -> str
950
+ * RE2.quote(str) -> str
951
+ *
952
+ * Returns a version of str with all potentially meaningful regexp
953
+ * characters escaped. The returned string, used as a regular
954
+ * expression, will exactly match the original string.
955
+ *
956
+ * RE2::QuoteMeta("1.5-2.0?") #=> "1\.5\-2\.0\?"
957
+ */
958
+ static VALUE
959
+ re2_QuoteMeta(VALUE self, VALUE unquoted)
960
+ {
961
+ UNUSED(self);
962
+ re2::StringPiece unquoted_as_string_piece(StringValuePtr(unquoted));
963
+ return rb_str_new2(RE2::QuoteMeta(unquoted_as_string_piece).c_str());
964
+ }
965
+
966
+ void
967
+ Init_re2()
968
+ {
969
+ re2_cRE2 = rb_define_class("RE2", rb_cObject);
970
+ rb_define_alloc_func(re2_cRE2, (VALUE (*)(VALUE))re2_allocate);
971
+ rb_define_method(re2_cRE2, "initialize", (VALUE (*)(...))re2_initialize, -1);
972
+ rb_define_method(re2_cRE2, "ok?", (VALUE (*)(...))re2_ok, 0);
973
+ rb_define_method(re2_cRE2, "error", (VALUE (*)(...))re2_error, 0);
974
+ rb_define_method(re2_cRE2, "error_arg", (VALUE (*)(...))re2_error_arg, 0);
975
+ rb_define_method(re2_cRE2, "program_size", (VALUE (*)(...))re2_program_size, 0);
976
+ rb_define_method(re2_cRE2, "options", (VALUE (*)(...))re2_options, 0);
977
+ rb_define_method(re2_cRE2, "number_of_capturing_groups", (VALUE (*)(...))re2_number_of_capturing_groups, 0);
978
+ rb_define_method(re2_cRE2, "match", (VALUE (*)(...))re2_match, -1);
979
+ rb_define_method(re2_cRE2, "match?", (VALUE (*)(...))re2_match_query, 1);
980
+ rb_define_method(re2_cRE2, "=~", (VALUE (*)(...))re2_match_query, 1);
981
+ rb_define_method(re2_cRE2, "===", (VALUE (*)(...))re2_match_query, 1);
982
+ rb_define_method(re2_cRE2, "!~", (VALUE (*)(...))re2_bang_tilde, 1);
983
+ rb_define_method(re2_cRE2, "to_s", (VALUE (*)(...))re2_to_s, 0);
984
+ rb_define_method(re2_cRE2, "to_str", (VALUE (*)(...))re2_to_s, 0);
985
+ rb_define_method(re2_cRE2, "pattern", (VALUE (*)(...))re2_to_s, 0);
986
+ rb_define_method(re2_cRE2, "source", (VALUE (*)(...))re2_to_s, 0);
987
+ rb_define_method(re2_cRE2, "inspect", (VALUE (*)(...))re2_inspect, 0);
988
+ rb_define_method(re2_cRE2, "utf8?", (VALUE (*)(...))re2_utf8, 0);
989
+ rb_define_method(re2_cRE2, "posix_syntax?", (VALUE (*)(...))re2_posix_syntax, 0);
990
+ rb_define_method(re2_cRE2, "longest_match?", (VALUE (*)(...))re2_longest_match, 0);
991
+ rb_define_method(re2_cRE2, "log_errors?", (VALUE (*)(...))re2_log_errors, 0);
992
+ rb_define_method(re2_cRE2, "max_mem", (VALUE (*)(...))re2_max_mem, 0);
993
+ rb_define_method(re2_cRE2, "literal?", (VALUE (*)(...))re2_literal, 0);
994
+ rb_define_method(re2_cRE2, "never_nl?", (VALUE (*)(...))re2_never_nl, 0);
995
+ rb_define_method(re2_cRE2, "case_sensitive?", (VALUE (*)(...))re2_case_sensitive, 0);
996
+ rb_define_method(re2_cRE2, "case_insensitive?", (VALUE (*)(...))re2_case_insensitive, 0);
997
+ rb_define_method(re2_cRE2, "casefold?", (VALUE (*)(...))re2_case_insensitive, 0);
998
+ rb_define_method(re2_cRE2, "perl_classes?", (VALUE (*)(...))re2_perl_classes, 0);
999
+ rb_define_method(re2_cRE2, "word_boundary?", (VALUE (*)(...))re2_word_boundary, 0);
1000
+ rb_define_method(re2_cRE2, "one_line?", (VALUE (*)(...))re2_one_line, 0);
1001
+ rb_define_singleton_method(re2_cRE2, "FullMatch", (VALUE (*)(...))re2_FullMatch, 2);
1002
+ rb_define_singleton_method(re2_cRE2, "FullMatchN", (VALUE (*)(...))re2_FullMatchN, 2);
1003
+ rb_define_singleton_method(re2_cRE2, "PartialMatch", (VALUE (*)(...))re2_PartialMatch, 2);
1004
+ rb_define_singleton_method(re2_cRE2, "PartialMatchN", (VALUE (*)(...))re2_PartialMatchN, 2);
1005
+ rb_define_singleton_method(re2_cRE2, "Replace", (VALUE (*)(...))re2_Replace, 3);
1006
+ rb_define_singleton_method(re2_cRE2, "GlobalReplace", (VALUE (*)(...))re2_GlobalReplace, 3);
1007
+ rb_define_singleton_method(re2_cRE2, "QuoteMeta", (VALUE (*)(...))re2_QuoteMeta, 1);
1008
+ rb_define_singleton_method(re2_cRE2, "escape", (VALUE (*)(...))re2_QuoteMeta, 1);
1009
+ rb_define_singleton_method(re2_cRE2, "quote", (VALUE (*)(...))re2_QuoteMeta, 1);
1010
+ rb_define_singleton_method(re2_cRE2, "compile", (VALUE (*)(...))rb_class_new_instance, -1);
1011
+ rb_define_global_function("RE2", (VALUE (*)(...))re2_re2, -1);
1012
+
1013
+ /* Create the symbols used in options. */
1014
+ id_utf8 = rb_intern("utf8");
1015
+ id_posix_syntax = rb_intern("posix_syntax");
1016
+ id_longest_match = rb_intern("longest_match");
1017
+ id_log_errors = rb_intern("log_errors");
1018
+ id_max_mem = rb_intern("max_mem");
1019
+ id_literal = rb_intern("literal");
1020
+ id_never_nl = rb_intern("never_nl");
1021
+ id_case_sensitive = rb_intern("case_sensitive");
1022
+ id_perl_classes = rb_intern("perl_classes");
1023
+ id_word_boundary = rb_intern("word_boundary");
1024
+ id_one_line = rb_intern("one_line");
1025
+ }
1026
+ }