strscan 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1490 @@
1
+ /*
2
+ $Id$
3
+
4
+ Copyright (c) 1999-2006 Minero Aoki
5
+
6
+ This program is free software.
7
+ You can distribute/modify this program under the terms of
8
+ the Ruby License. For details, see the file COPYING.
9
+ */
10
+
11
+ #include "ruby/ruby.h"
12
+ #include "ruby/re.h"
13
+ #include "ruby/encoding.h"
14
+ #include "regint.h"
15
+
16
+ #define STRSCAN_VERSION "0.7.0"
17
+
18
+ /* =======================================================================
19
+ Data Type Definitions
20
+ ======================================================================= */
21
+
22
+ static VALUE StringScanner;
23
+ static VALUE ScanError;
24
+ static ID id_byteslice;
25
+
26
+ struct strscanner
27
+ {
28
+ /* multi-purpose flags */
29
+ unsigned long flags;
30
+ #define FLAG_MATCHED (1 << 0)
31
+
32
+ /* the string to scan */
33
+ VALUE str;
34
+
35
+ /* scan pointers */
36
+ long prev; /* legal only when MATCHED_P(s) */
37
+ long curr; /* always legal */
38
+
39
+ /* the regexp register; legal only when MATCHED_P(s) */
40
+ struct re_registers regs;
41
+
42
+ /* regexp used for last scan */
43
+ VALUE regex;
44
+ };
45
+
46
+ #define MATCHED_P(s) ((s)->flags & FLAG_MATCHED)
47
+ #define MATCHED(s) (s)->flags |= FLAG_MATCHED
48
+ #define CLEAR_MATCH_STATUS(s) (s)->flags &= ~FLAG_MATCHED
49
+
50
+ #define S_PBEG(s) (RSTRING_PTR((s)->str))
51
+ #define S_LEN(s) (RSTRING_LEN((s)->str))
52
+ #define S_PEND(s) (S_PBEG(s) + S_LEN(s))
53
+ #define CURPTR(s) (S_PBEG(s) + (s)->curr)
54
+ #define S_RESTLEN(s) (S_LEN(s) - (s)->curr)
55
+
56
/* True when the scan pointer of scanner (s) is at or past the end of its
 * string.  Uses only its own argument, so it works whatever the caller's
 * variable is named. */
#define EOS_P(s) ((s)->curr >= RSTRING_LEN((s)->str))
57
+
58
+ #define GET_SCANNER(obj,var) do {\
59
+ (var) = check_strscan(obj);\
60
+ if (NIL_P((var)->str)) rb_raise(rb_eArgError, "uninitialized StringScanner object");\
61
+ } while (0)
62
+
63
+ /* =======================================================================
64
+ Function Prototypes
65
+ ======================================================================= */
66
+
67
+ static inline long minl _((const long n, const long x));
68
+ static VALUE infect _((VALUE str, struct strscanner *p));
69
+ static VALUE extract_range _((struct strscanner *p, long beg_i, long end_i));
70
+ static VALUE extract_beg_len _((struct strscanner *p, long beg_i, long len));
71
+
72
+ static struct strscanner *check_strscan _((VALUE obj));
73
+ static void strscan_mark _((void *p));
74
+ static void strscan_free _((void *p));
75
+ static size_t strscan_memsize _((const void *p));
76
+ static VALUE strscan_s_allocate _((VALUE klass));
77
+ static VALUE strscan_initialize _((int argc, VALUE *argv, VALUE self));
78
+ static VALUE strscan_init_copy _((VALUE vself, VALUE vorig));
79
+
80
+ static VALUE strscan_s_mustc _((VALUE self));
81
+ static VALUE strscan_terminate _((VALUE self));
82
+ static VALUE strscan_clear _((VALUE self));
83
+ static VALUE strscan_get_string _((VALUE self));
84
+ static VALUE strscan_set_string _((VALUE self, VALUE str));
85
+ static VALUE strscan_concat _((VALUE self, VALUE str));
86
+ static VALUE strscan_get_pos _((VALUE self));
87
+ static VALUE strscan_set_pos _((VALUE self, VALUE pos));
88
+ static VALUE strscan_do_scan _((VALUE self, VALUE regex,
89
+ int succptr, int getstr, int headonly));
90
+ static VALUE strscan_scan _((VALUE self, VALUE re));
91
+ static VALUE strscan_match_p _((VALUE self, VALUE re));
92
+ static VALUE strscan_skip _((VALUE self, VALUE re));
93
+ static VALUE strscan_check _((VALUE self, VALUE re));
94
+ static VALUE strscan_scan_full _((VALUE self, VALUE re,
95
+ VALUE succp, VALUE getp));
96
+ static VALUE strscan_scan_until _((VALUE self, VALUE re));
97
+ static VALUE strscan_skip_until _((VALUE self, VALUE re));
98
+ static VALUE strscan_check_until _((VALUE self, VALUE re));
99
+ static VALUE strscan_search_full _((VALUE self, VALUE re,
100
+ VALUE succp, VALUE getp));
101
+ static void adjust_registers_to_matched _((struct strscanner *p));
102
+ static VALUE strscan_getch _((VALUE self));
103
+ static VALUE strscan_get_byte _((VALUE self));
104
+ static VALUE strscan_getbyte _((VALUE self));
105
+ static VALUE strscan_peek _((VALUE self, VALUE len));
106
+ static VALUE strscan_peep _((VALUE self, VALUE len));
107
+ static VALUE strscan_unscan _((VALUE self));
108
+ static VALUE strscan_bol_p _((VALUE self));
109
+ static VALUE strscan_eos_p _((VALUE self));
110
+ static VALUE strscan_empty_p _((VALUE self));
111
+ static VALUE strscan_rest_p _((VALUE self));
112
+ static VALUE strscan_matched_p _((VALUE self));
113
+ static VALUE strscan_matched _((VALUE self));
114
+ static VALUE strscan_matched_size _((VALUE self));
115
+ static VALUE strscan_aref _((VALUE self, VALUE idx));
116
+ static VALUE strscan_pre_match _((VALUE self));
117
+ static VALUE strscan_post_match _((VALUE self));
118
+ static VALUE strscan_rest _((VALUE self));
119
+ static VALUE strscan_rest_size _((VALUE self));
120
+
121
+ static VALUE strscan_inspect _((VALUE self));
122
+ static VALUE inspect1 _((struct strscanner *p));
123
+ static VALUE inspect2 _((struct strscanner *p));
124
+
125
+ /* =======================================================================
126
+ Utils
127
+ ======================================================================= */
128
+
129
+ static VALUE
130
+ infect(VALUE str, struct strscanner *p)
131
+ {
132
+ OBJ_INFECT(str, p->str);
133
+ return str;
134
+ }
135
+
136
+ static VALUE
137
+ str_new(struct strscanner *p, const char *ptr, long len)
138
+ {
139
+ VALUE str = rb_str_new(ptr, len);
140
+ rb_enc_copy(str, p->str);
141
+ return str;
142
+ }
143
+
144
/* Returns the smaller of two long values. */
static inline long
minl(const long x, const long y)
{
    if (y <= x) {
        return y;
    }
    return x;
}
149
+
150
+ static VALUE
151
+ extract_range(struct strscanner *p, long beg_i, long end_i)
152
+ {
153
+ if (beg_i > S_LEN(p)) return Qnil;
154
+ end_i = minl(end_i, S_LEN(p));
155
+ return infect(str_new(p, S_PBEG(p) + beg_i, end_i - beg_i), p);
156
+ }
157
+
158
+ static VALUE
159
+ extract_beg_len(struct strscanner *p, long beg_i, long len)
160
+ {
161
+ if (beg_i > S_LEN(p)) return Qnil;
162
+ len = minl(len, S_LEN(p) - beg_i);
163
+ return infect(str_new(p, S_PBEG(p) + beg_i, len), p);
164
+ }
165
+
166
+ /* =======================================================================
167
+ Constructor
168
+ ======================================================================= */
169
+
170
+ static void
171
+ strscan_mark(void *ptr)
172
+ {
173
+ struct strscanner *p = ptr;
174
+ rb_gc_mark(p->str);
175
+ }
176
+
177
+ static void
178
+ strscan_free(void *ptr)
179
+ {
180
+ struct strscanner *p = ptr;
181
+ onig_region_free(&(p->regs), 0);
182
+ ruby_xfree(p);
183
+ }
184
+
185
+ static size_t
186
+ strscan_memsize(const void *ptr)
187
+ {
188
+ const struct strscanner *p = ptr;
189
+ return sizeof(*p) - sizeof(p->regs) + onig_region_memsize(&p->regs);
190
+ }
191
+
192
+ static const rb_data_type_t strscanner_type = {
193
+ "StringScanner",
194
+ {strscan_mark, strscan_free, strscan_memsize},
195
+ 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
196
+ };
197
+
198
+ static VALUE
199
+ strscan_s_allocate(VALUE klass)
200
+ {
201
+ struct strscanner *p;
202
+ VALUE obj = TypedData_Make_Struct(klass, struct strscanner, &strscanner_type, p);
203
+
204
+ CLEAR_MATCH_STATUS(p);
205
+ onig_region_init(&(p->regs));
206
+ p->str = Qnil;
207
+ return obj;
208
+ }
209
+
210
+ /*
211
+ * call-seq: StringScanner.new(string, dup = false)
212
+ *
213
+ * Creates a new StringScanner object to scan over the given +string+.
214
+ * +dup+ argument is obsolete and not used now.
215
+ */
216
+ static VALUE
217
+ strscan_initialize(int argc, VALUE *argv, VALUE self)
218
+ {
219
+ struct strscanner *p;
220
+ VALUE str, need_dup;
221
+
222
+ p = check_strscan(self);
223
+ rb_scan_args(argc, argv, "11", &str, &need_dup);
224
+ StringValue(str);
225
+ p->str = str;
226
+
227
+ return self;
228
+ }
229
+
230
+ static struct strscanner *
231
+ check_strscan(VALUE obj)
232
+ {
233
+ return rb_check_typeddata(obj, &strscanner_type);
234
+ }
235
+
236
+ /*
237
+ * call-seq:
238
+ * dup
239
+ * clone
240
+ *
241
+ * Duplicates a StringScanner object.
242
+ */
243
+ static VALUE
244
+ strscan_init_copy(VALUE vself, VALUE vorig)
245
+ {
246
+ struct strscanner *self, *orig;
247
+
248
+ self = check_strscan(vself);
249
+ orig = check_strscan(vorig);
250
+ if (self != orig) {
251
+ self->flags = orig->flags;
252
+ self->str = orig->str;
253
+ self->prev = orig->prev;
254
+ self->curr = orig->curr;
255
+ if (rb_reg_region_copy(&self->regs, &orig->regs))
256
+ rb_memerror();
257
+ RB_GC_GUARD(vorig);
258
+ }
259
+
260
+ return vself;
261
+ }
262
+
263
+ /* =======================================================================
264
+ Instance Methods
265
+ ======================================================================= */
266
+
267
+ /*
268
+ * call-seq: StringScanner.must_C_version
269
+ *
270
+ * This method is defined for backward compatibility.
271
+ */
272
+ static VALUE
273
+ strscan_s_mustc(VALUE self)
274
+ {
275
+ return self;
276
+ }
277
+
278
+ /*
279
+ * Reset the scan pointer (index 0) and clear matching data.
280
+ */
281
+ static VALUE
282
+ strscan_reset(VALUE self)
283
+ {
284
+ struct strscanner *p;
285
+
286
+ GET_SCANNER(self, p);
287
+ p->curr = 0;
288
+ CLEAR_MATCH_STATUS(p);
289
+ return self;
290
+ }
291
+
292
+ /*
293
+ * call-seq:
294
+ * terminate
295
+ * clear
296
+ *
297
+ * Set the scan pointer to the end of the string and clear matching data.
298
+ */
299
+ static VALUE
300
+ strscan_terminate(VALUE self)
301
+ {
302
+ struct strscanner *p;
303
+
304
+ GET_SCANNER(self, p);
305
+ p->curr = S_LEN(p);
306
+ CLEAR_MATCH_STATUS(p);
307
+ return self;
308
+ }
309
+
310
+ /*
311
+ * Equivalent to #terminate.
312
+ * This method is obsolete; use #terminate instead.
313
+ */
314
+ static VALUE
315
+ strscan_clear(VALUE self)
316
+ {
317
+ rb_warning("StringScanner#clear is obsolete; use #terminate instead");
318
+ return strscan_terminate(self);
319
+ }
320
+
321
+ /*
322
+ * Returns the string being scanned.
323
+ */
324
+ static VALUE
325
+ strscan_get_string(VALUE self)
326
+ {
327
+ struct strscanner *p;
328
+
329
+ GET_SCANNER(self, p);
330
+ return p->str;
331
+ }
332
+
333
+ /*
334
+ * call-seq: string=(str)
335
+ *
336
+ * Changes the string being scanned to +str+ and resets the scanner.
337
+ * Returns +str+.
338
+ */
339
+ static VALUE
340
+ strscan_set_string(VALUE self, VALUE str)
341
+ {
342
+ struct strscanner *p = check_strscan(self);
343
+
344
+ StringValue(str);
345
+ p->str = str;
346
+ p->curr = 0;
347
+ CLEAR_MATCH_STATUS(p);
348
+ return str;
349
+ }
350
+
351
+ /*
352
+ * call-seq:
353
+ * concat(str)
354
+ * <<(str)
355
+ *
356
+ * Appends +str+ to the string being scanned.
357
+ * This method does not affect scan pointer.
358
+ *
359
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
360
+ * s.scan(/Fri /)
361
+ * s << " +1000 GMT"
362
+ * s.string # -> "Fri Dec 12 1975 14:39 +1000 GMT"
363
+ * s.scan(/Dec/) # -> "Dec"
364
+ */
365
+ static VALUE
366
+ strscan_concat(VALUE self, VALUE str)
367
+ {
368
+ struct strscanner *p;
369
+
370
+ GET_SCANNER(self, p);
371
+ StringValue(str);
372
+ rb_str_append(p->str, str);
373
+ return self;
374
+ }
375
+
376
+ /*
377
+ * Returns the byte position of the scan pointer. In the 'reset' position, this
378
+ * value is zero. In the 'terminated' position (i.e. the string is exhausted),
379
+ * this value is the bytesize of the string.
380
+ *
381
+ * In short, it's a 0-based index into bytes of the string.
382
+ *
383
+ * s = StringScanner.new('test string')
384
+ * s.pos # -> 0
385
+ * s.scan_until /str/ # -> "test str"
386
+ * s.pos # -> 8
387
+ * s.terminate # -> #<StringScanner fin>
388
+ * s.pos # -> 11
389
+ */
390
+ static VALUE
391
+ strscan_get_pos(VALUE self)
392
+ {
393
+ struct strscanner *p;
394
+
395
+ GET_SCANNER(self, p);
396
+ return INT2FIX(p->curr);
397
+ }
398
+
399
+ /*
400
+ * Returns the character position of the scan pointer. In the 'reset' position, this
401
+ * value is zero. In the 'terminated' position (i.e. the string is exhausted),
402
+ * this value is the size of the string.
403
+ *
404
+ * In short, it's a 0-based index into the string.
405
+ *
406
+ * s = StringScanner.new("abcädeföghi")
407
+ * s.charpos # -> 0
408
+ * s.scan_until(/ä/) # -> "abcä"
409
+ * s.pos # -> 5
410
+ * s.charpos # -> 4
411
+ */
412
+ static VALUE
413
+ strscan_get_charpos(VALUE self)
414
+ {
415
+ struct strscanner *p;
416
+ VALUE substr;
417
+
418
+ GET_SCANNER(self, p);
419
+
420
+ substr = rb_funcall(p->str, id_byteslice, 2, INT2FIX(0), INT2NUM(p->curr));
421
+
422
+ return rb_str_length(substr);
423
+ }
424
+
425
+ /*
426
+ * call-seq: pos=(n)
427
+ *
428
+ * Set the byte position of the scan pointer.
429
+ *
430
+ * s = StringScanner.new('test string')
431
+ * s.pos = 7 # -> 7
432
+ * s.rest # -> "ring"
433
+ */
434
+ static VALUE
435
+ strscan_set_pos(VALUE self, VALUE v)
436
+ {
437
+ struct strscanner *p;
438
+ long i;
439
+
440
+ GET_SCANNER(self, p);
441
+ i = NUM2INT(v);
442
+ if (i < 0) i += S_LEN(p);
443
+ if (i < 0) rb_raise(rb_eRangeError, "index out of range");
444
+ if (i > S_LEN(p)) rb_raise(rb_eRangeError, "index out of range");
445
+ p->curr = i;
446
+ return INT2NUM(i);
447
+ }
448
+
449
+ static VALUE
450
+ strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly)
451
+ {
452
+ regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
453
+ struct strscanner *p;
454
+ regex_t *re;
455
+ long ret;
456
+ int tmpreg;
457
+
458
+ Check_Type(regex, T_REGEXP);
459
+ GET_SCANNER(self, p);
460
+
461
+ CLEAR_MATCH_STATUS(p);
462
+ if (S_RESTLEN(p) < 0) {
463
+ return Qnil;
464
+ }
465
+
466
+ p->regex = regex;
467
+ re = rb_reg_prepare_re(regex, p->str);
468
+ tmpreg = re != RREGEXP_PTR(regex);
469
+ if (!tmpreg) RREGEXP(regex)->usecnt++;
470
+
471
+ if (headonly) {
472
+ ret = onig_match(re, (UChar* )CURPTR(p),
473
+ (UChar* )(CURPTR(p) + S_RESTLEN(p)),
474
+ (UChar* )CURPTR(p), &(p->regs), ONIG_OPTION_NONE);
475
+ }
476
+ else {
477
+ ret = onig_search(re,
478
+ (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
479
+ (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
480
+ &(p->regs), ONIG_OPTION_NONE);
481
+ }
482
+ if (!tmpreg) RREGEXP(regex)->usecnt--;
483
+ if (tmpreg) {
484
+ if (RREGEXP(regex)->usecnt) {
485
+ onig_free(re);
486
+ }
487
+ else {
488
+ onig_free(RREGEXP_PTR(regex));
489
+ RREGEXP_PTR(regex) = re;
490
+ }
491
+ }
492
+
493
+ if (ret == -2) rb_raise(ScanError, "regexp buffer overflow");
494
+ if (ret < 0) {
495
+ /* not matched */
496
+ return Qnil;
497
+ }
498
+
499
+ MATCHED(p);
500
+ p->prev = p->curr;
501
+ if (succptr) {
502
+ p->curr += p->regs.end[0];
503
+ }
504
+ if (getstr) {
505
+ return extract_beg_len(p, p->prev, p->regs.end[0]);
506
+ }
507
+ else {
508
+ return INT2FIX(p->regs.end[0]);
509
+ }
510
+ }
511
+
512
+ /*
513
+ * call-seq: scan(pattern) => String
514
+ *
515
+ * Tries to match with +pattern+ at the current position. If there's a match,
516
+ * the scanner advances the "scan pointer" and returns the matched string.
517
+ * Otherwise, the scanner returns +nil+.
518
+ *
519
+ * s = StringScanner.new('test string')
520
+ * p s.scan(/\w+/) # -> "test"
521
+ * p s.scan(/\w+/) # -> nil
522
+ * p s.scan(/\s+/) # -> " "
523
+ * p s.scan(/\w+/) # -> "string"
524
+ * p s.scan(/./) # -> nil
525
+ *
526
+ */
527
+ static VALUE
528
+ strscan_scan(VALUE self, VALUE re)
529
+ {
530
+ return strscan_do_scan(self, re, 1, 1, 1);
531
+ }
532
+
533
+ /*
534
+ * call-seq: match?(pattern)
535
+ *
536
+ * Tests whether the given +pattern+ is matched from the current scan pointer.
537
+ * Returns the length of the match, or +nil+. The scan pointer is not advanced.
538
+ *
539
+ * s = StringScanner.new('test string')
540
+ * p s.match?(/\w+/) # -> 4
541
+ * p s.match?(/\w+/) # -> 4
542
+ * p s.match?(/\s+/) # -> nil
543
+ */
544
+ static VALUE
545
+ strscan_match_p(VALUE self, VALUE re)
546
+ {
547
+ return strscan_do_scan(self, re, 0, 0, 1);
548
+ }
549
+
550
+ /*
551
+ * call-seq: skip(pattern)
552
+ *
553
+ * Attempts to skip over the given +pattern+ beginning with the scan pointer.
554
+ * If it matches, the scan pointer is advanced to the end of the match, and the
555
+ * length of the match is returned. Otherwise, +nil+ is returned.
556
+ *
557
+ * It's similar to #scan, but without returning the matched string.
558
+ *
559
+ * s = StringScanner.new('test string')
560
+ * p s.skip(/\w+/) # -> 4
561
+ * p s.skip(/\w+/) # -> nil
562
+ * p s.skip(/\s+/) # -> 1
563
+ * p s.skip(/\w+/) # -> 6
564
+ * p s.skip(/./) # -> nil
565
+ *
566
+ */
567
+ static VALUE
568
+ strscan_skip(VALUE self, VALUE re)
569
+ {
570
+ return strscan_do_scan(self, re, 1, 0, 1);
571
+ }
572
+
573
+ /*
574
+ * call-seq: check(pattern)
575
+ *
576
+ * This returns the value that #scan would return, without advancing the scan
577
+ * pointer. The match register is affected, though.
578
+ *
579
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
580
+ * s.check /Fri/ # -> "Fri"
581
+ * s.pos # -> 0
582
+ * s.matched # -> "Fri"
583
+ * s.check /12/ # -> nil
584
+ * s.matched # -> nil
585
+ *
586
+ * Mnemonic: it "checks" to see whether a #scan will return a value.
587
+ */
588
+ static VALUE
589
+ strscan_check(VALUE self, VALUE re)
590
+ {
591
+ return strscan_do_scan(self, re, 0, 1, 1);
592
+ }
593
+
594
+ /*
595
+ * call-seq: scan_full(pattern, advance_pointer_p, return_string_p)
596
+ *
597
+ * Tests whether the given +pattern+ is matched from the current scan pointer.
598
+ * Advances the scan pointer if +advance_pointer_p+ is true.
599
+ * Returns the matched string if +return_string_p+ is true.
600
+ * The match register is affected.
601
+ *
602
+ * "full" means "#scan with full parameters".
603
+ */
604
+ static VALUE
605
+ strscan_scan_full(VALUE self, VALUE re, VALUE s, VALUE f)
606
+ {
607
+ return strscan_do_scan(self, re, RTEST(s), RTEST(f), 1);
608
+ }
609
+
610
+ /*
611
+ * call-seq: scan_until(pattern)
612
+ *
613
+ * Scans the string _until_ the +pattern+ is matched. Returns the substring up
614
+ * to and including the end of the match, advancing the scan pointer to that
615
+ * location. If there is no match, +nil+ is returned.
616
+ *
617
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
618
+ * s.scan_until(/1/) # -> "Fri Dec 1"
619
+ * s.pre_match # -> "Fri Dec "
620
+ * s.scan_until(/XYZ/) # -> nil
621
+ */
622
+ static VALUE
623
+ strscan_scan_until(VALUE self, VALUE re)
624
+ {
625
+ return strscan_do_scan(self, re, 1, 1, 0);
626
+ }
627
+
628
+ /*
629
+ * call-seq: exist?(pattern)
630
+ *
631
+ * Looks _ahead_ to see if the +pattern+ exists _anywhere_ in the string,
632
+ * without advancing the scan pointer. This predicates whether a #scan_until
633
+ * will return a value.
634
+ *
635
+ * s = StringScanner.new('test string')
636
+ * s.exist? /s/ # -> 3
637
+ * s.scan /test/ # -> "test"
638
+ * s.exist? /s/ # -> 2
639
+ * s.exist? /e/ # -> nil
640
+ */
641
+ static VALUE
642
+ strscan_exist_p(VALUE self, VALUE re)
643
+ {
644
+ return strscan_do_scan(self, re, 0, 0, 0);
645
+ }
646
+
647
+ /*
648
+ * call-seq: skip_until(pattern)
649
+ *
650
+ * Advances the scan pointer until +pattern+ is matched and consumed. Returns
651
+ * the number of bytes advanced, or +nil+ if no match was found.
652
+ *
653
+ * Look ahead to match +pattern+, and advance the scan pointer to the _end_
654
+ * of the match. Return the number of bytes advanced, or +nil+ if the
655
+ * match was unsuccessful.
656
+ *
657
+ * It's similar to #scan_until, but without returning the intervening string.
658
+ *
659
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
660
+ * s.skip_until /12/ # -> 10
661
+ * s #
662
+ */
663
+ static VALUE
664
+ strscan_skip_until(VALUE self, VALUE re)
665
+ {
666
+ return strscan_do_scan(self, re, 1, 0, 0);
667
+ }
668
+
669
+ /*
670
+ * call-seq: check_until(pattern)
671
+ *
672
+ * This returns the value that #scan_until would return, without advancing the
673
+ * scan pointer. The match register is affected, though.
674
+ *
675
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
676
+ * s.check_until /12/ # -> "Fri Dec 12"
677
+ * s.pos # -> 0
678
+ * s.matched # -> "12"
679
+ *
680
+ * Mnemonic: it "checks" to see whether a #scan_until will return a value.
681
+ */
682
+ static VALUE
683
+ strscan_check_until(VALUE self, VALUE re)
684
+ {
685
+ return strscan_do_scan(self, re, 0, 1, 0);
686
+ }
687
+
688
+ /*
689
+ * call-seq: search_full(pattern, advance_pointer_p, return_string_p)
690
+ *
691
+ * Scans the string _until_ the +pattern+ is matched.
692
+ * Advances the scan pointer if +advance_pointer_p+, otherwise not.
693
+ * Returns the matched string if +return_string_p+ is true, otherwise
694
+ * returns the number of bytes advanced.
695
+ * This method does affect the match register.
696
+ */
697
+ static VALUE
698
+ strscan_search_full(VALUE self, VALUE re, VALUE s, VALUE f)
699
+ {
700
+ return strscan_do_scan(self, re, RTEST(s), RTEST(f), 0);
701
+ }
702
+
703
+ static void
704
+ adjust_registers_to_matched(struct strscanner *p)
705
+ {
706
+ onig_region_clear(&(p->regs));
707
+ onig_region_set(&(p->regs), 0, 0, (int)(p->curr - p->prev));
708
+ }
709
+
710
+ /*
711
+ * Scans one character and returns it.
712
+ * This method is multibyte character sensitive.
713
+ *
714
+ * s = StringScanner.new("ab")
715
+ * s.getch # => "a"
716
+ * s.getch # => "b"
717
+ * s.getch # => nil
718
+ *
719
+ * $KCODE = 'EUC'
720
+ * s = StringScanner.new("\244\242")
721
+ * s.getch # => "\244\242" # Japanese hira-kana "A" in EUC-JP
722
+ * s.getch # => nil
723
+ */
724
+ static VALUE
725
+ strscan_getch(VALUE self)
726
+ {
727
+ struct strscanner *p;
728
+ long len;
729
+
730
+ GET_SCANNER(self, p);
731
+ CLEAR_MATCH_STATUS(p);
732
+ if (EOS_P(p))
733
+ return Qnil;
734
+
735
+ len = rb_enc_mbclen(CURPTR(p), S_PEND(p), rb_enc_get(p->str));
736
+ len = minl(len, S_RESTLEN(p));
737
+ p->prev = p->curr;
738
+ p->curr += len;
739
+ MATCHED(p);
740
+ adjust_registers_to_matched(p);
741
+ return extract_range(p, p->prev + p->regs.beg[0],
742
+ p->prev + p->regs.end[0]);
743
+ }
744
+
745
+ /*
746
+ * Scans one byte and returns it.
747
+ * This method is not multibyte character sensitive.
748
+ * See also: #getch.
749
+ *
750
+ * s = StringScanner.new('ab')
751
+ * s.get_byte # => "a"
752
+ * s.get_byte # => "b"
753
+ * s.get_byte # => nil
754
+ *
755
+ * $KCODE = 'EUC'
756
+ * s = StringScanner.new("\244\242")
757
+ * s.get_byte # => "\244"
758
+ * s.get_byte # => "\242"
759
+ * s.get_byte # => nil
760
+ */
761
+ static VALUE
762
+ strscan_get_byte(VALUE self)
763
+ {
764
+ struct strscanner *p;
765
+
766
+ GET_SCANNER(self, p);
767
+ CLEAR_MATCH_STATUS(p);
768
+ if (EOS_P(p))
769
+ return Qnil;
770
+
771
+ p->prev = p->curr;
772
+ p->curr++;
773
+ MATCHED(p);
774
+ adjust_registers_to_matched(p);
775
+ return extract_range(p, p->prev + p->regs.beg[0],
776
+ p->prev + p->regs.end[0]);
777
+ }
778
+
779
+ /*
780
+ * Equivalent to #get_byte.
781
+ * This method is obsolete; use #get_byte instead.
782
+ */
783
+ static VALUE
784
+ strscan_getbyte(VALUE self)
785
+ {
786
+ rb_warning("StringScanner#getbyte is obsolete; use #get_byte instead");
787
+ return strscan_get_byte(self);
788
+ }
789
+
790
+ /*
791
+ * call-seq: peek(len)
792
+ *
793
+ * Extracts a string corresponding to <tt>string[pos,len]</tt>, without
794
+ * advancing the scan pointer.
795
+ *
796
+ * s = StringScanner.new('test string')
797
+ * s.peek(7) # => "test st"
798
+ * s.peek(7) # => "test st"
799
+ *
800
+ */
801
+ static VALUE
802
+ strscan_peek(VALUE self, VALUE vlen)
803
+ {
804
+ struct strscanner *p;
805
+ long len;
806
+
807
+ GET_SCANNER(self, p);
808
+
809
+ len = NUM2LONG(vlen);
810
+ if (EOS_P(p))
811
+ return infect(str_new(p, "", 0), p);
812
+
813
+ len = minl(len, S_RESTLEN(p));
814
+ return extract_beg_len(p, p->curr, len);
815
+ }
816
+
817
+ /*
818
+ * Equivalent to #peek.
819
+ * This method is obsolete; use #peek instead.
820
+ */
821
+ static VALUE
822
+ strscan_peep(VALUE self, VALUE vlen)
823
+ {
824
+ rb_warning("StringScanner#peep is obsolete; use #peek instead");
825
+ return strscan_peek(self, vlen);
826
+ }
827
+
828
+ /*
829
+ * Set the scan pointer to the previous position. Only one previous position is
830
+ * remembered, and it changes with each scanning operation.
831
+ *
832
+ * s = StringScanner.new('test string')
833
+ * s.scan(/\w+/) # => "test"
834
+ * s.unscan
835
+ * s.scan(/../) # => "te"
836
+ * s.scan(/\d/) # => nil
837
+ * s.unscan # ScanError: unscan failed: previous match record not exist
838
+ */
839
+ static VALUE
840
+ strscan_unscan(VALUE self)
841
+ {
842
+ struct strscanner *p;
843
+
844
+ GET_SCANNER(self, p);
845
+ if (! MATCHED_P(p))
846
+ rb_raise(ScanError, "unscan failed: previous match record not exist");
847
+ p->curr = p->prev;
848
+ CLEAR_MATCH_STATUS(p);
849
+ return self;
850
+ }
851
+
852
+ /*
853
+ * Returns +true+ iff the scan pointer is at the beginning of the line.
854
+ *
855
+ * s = StringScanner.new("test\ntest\n")
856
+ * s.bol? # => true
857
+ * s.scan(/te/)
858
+ * s.bol? # => false
859
+ * s.scan(/st\n/)
860
+ * s.bol? # => true
861
+ * s.terminate
862
+ * s.bol? # => true
863
+ */
864
+ static VALUE
865
+ strscan_bol_p(VALUE self)
866
+ {
867
+ struct strscanner *p;
868
+
869
+ GET_SCANNER(self, p);
870
+ if (CURPTR(p) > S_PEND(p)) return Qnil;
871
+ if (p->curr == 0) return Qtrue;
872
+ return (*(CURPTR(p) - 1) == '\n') ? Qtrue : Qfalse;
873
+ }
874
+
875
+ /*
876
+ * Returns +true+ if the scan pointer is at the end of the string.
877
+ *
878
+ * s = StringScanner.new('test string')
879
+ * p s.eos? # => false
880
+ * s.scan(/test/)
881
+ * p s.eos? # => false
882
+ * s.terminate
883
+ * p s.eos? # => true
884
+ */
885
+ static VALUE
886
+ strscan_eos_p(VALUE self)
887
+ {
888
+ struct strscanner *p;
889
+
890
+ GET_SCANNER(self, p);
891
+ return EOS_P(p) ? Qtrue : Qfalse;
892
+ }
893
+
894
+ /*
895
+ * Equivalent to #eos?.
896
+ * This method is obsolete, use #eos? instead.
897
+ */
898
+ static VALUE
899
+ strscan_empty_p(VALUE self)
900
+ {
901
+ rb_warning("StringScanner#empty? is obsolete; use #eos? instead");
902
+ return strscan_eos_p(self);
903
+ }
904
+
905
+ /*
906
+ * Returns true iff there is more data in the string. See #eos?.
907
+ * This method is obsolete; use #eos? instead.
908
+ *
909
+ * s = StringScanner.new('test string')
910
+ * s.eos? # These two
911
+ * s.rest? # are opposites.
912
+ */
913
+ static VALUE
914
+ strscan_rest_p(VALUE self)
915
+ {
916
+ struct strscanner *p;
917
+
918
+ GET_SCANNER(self, p);
919
+ return EOS_P(p) ? Qfalse : Qtrue;
920
+ }
921
+
922
+ /*
923
+ * Returns +true+ iff the last match was successful.
924
+ *
925
+ * s = StringScanner.new('test string')
926
+ * s.match?(/\w+/) # => 4
927
+ * s.matched? # => true
928
+ * s.match?(/\d+/) # => nil
929
+ * s.matched? # => false
930
+ */
931
+ static VALUE
932
+ strscan_matched_p(VALUE self)
933
+ {
934
+ struct strscanner *p;
935
+
936
+ GET_SCANNER(self, p);
937
+ return MATCHED_P(p) ? Qtrue : Qfalse;
938
+ }
939
+
940
+ /*
941
+ * Returns the last matched string.
942
+ *
943
+ * s = StringScanner.new('test string')
944
+ * s.match?(/\w+/) # -> 4
945
+ * s.matched # -> "test"
946
+ */
947
+ static VALUE
948
+ strscan_matched(VALUE self)
949
+ {
950
+ struct strscanner *p;
951
+
952
+ GET_SCANNER(self, p);
953
+ if (! MATCHED_P(p)) return Qnil;
954
+ return extract_range(p, p->prev + p->regs.beg[0],
955
+ p->prev + p->regs.end[0]);
956
+ }
957
+
958
+ /*
959
+ * Returns the size of the most recent match (see #matched), or +nil+ if there
960
+ * was no recent match.
961
+ *
962
+ * s = StringScanner.new('test string')
963
+ * s.check /\w+/ # -> "test"
964
+ * s.matched_size # -> 4
965
+ * s.check /\d+/ # -> nil
966
+ * s.matched_size # -> nil
967
+ */
968
+ static VALUE
969
+ strscan_matched_size(VALUE self)
970
+ {
971
+ struct strscanner *p;
972
+
973
+ GET_SCANNER(self, p);
974
+ if (! MATCHED_P(p)) return Qnil;
975
+ return INT2NUM(p->regs.end[0] - p->regs.beg[0]);
976
+ }
977
+
978
+ static int
979
+ name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end, rb_encoding *enc)
980
+ {
981
+ int num;
982
+
983
+ num = onig_name_to_backref_number(RREGEXP_PTR(regexp),
984
+ (const unsigned char* )name, (const unsigned char* )name_end, regs);
985
+ if (num >= 1) {
986
+ return num;
987
+ }
988
+ else {
989
+ rb_enc_raise(enc, rb_eIndexError, "undefined group name reference: %.*s",
990
+ rb_long2int(name_end - name), name);
991
+ }
992
+
993
+ UNREACHABLE;
994
+ }
995
+
996
+ /*
997
+ * call-seq: [](n)
998
+ *
999
+ * Returns the n-th subgroup in the most recent match.
1000
+ *
1001
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
1002
+ * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 "
1003
+ * s[0] # -> "Fri Dec 12 "
1004
+ * s[1] # -> "Fri"
1005
+ * s[2] # -> "Dec"
1006
+ * s[3] # -> "12"
1007
+ * s.post_match # -> "1975 14:39"
1008
+ * s.pre_match # -> ""
1009
+ *
1010
+ * s.reset
1011
+ * s.scan(/(?<wday>\w+) (?<month>\w+) (?<day>\d+) /) # -> "Fri Dec 12 "
1012
+ * s[0] # -> "Fri Dec 12 "
1013
+ * s[1] # -> "Fri"
1014
+ * s[2] # -> "Dec"
1015
+ * s[3] # -> "12"
1016
+ * s[:wday] # -> "Fri"
1017
+ * s[:month] # -> "Dec"
1018
+ * s[:day] # -> "12"
1019
+ * s.post_match # -> "1975 14:39"
1020
+ * s.pre_match # -> ""
1021
+ */
1022
+ static VALUE
1023
+ strscan_aref(VALUE self, VALUE idx)
1024
+ {
1025
+ const char *name;
1026
+ struct strscanner *p;
1027
+ long i;
1028
+
1029
+ GET_SCANNER(self, p);
1030
+ if (! MATCHED_P(p)) return Qnil;
1031
+
1032
+ switch (TYPE(idx)) {
1033
+ case T_SYMBOL:
1034
+ idx = rb_sym2str(idx);
1035
+ /* fall through */
1036
+ case T_STRING:
1037
+ if (!p->regex) return Qnil;
1038
+ RSTRING_GETMEM(idx, name, i);
1039
+ i = name_to_backref_number(&(p->regs), p->regex, name, name + i, rb_enc_get(idx));
1040
+ break;
1041
+ default:
1042
+ i = NUM2LONG(idx);
1043
+ }
1044
+
1045
+ if (i < 0)
1046
+ i += p->regs.num_regs;
1047
+ if (i < 0) return Qnil;
1048
+ if (i >= p->regs.num_regs) return Qnil;
1049
+ if (p->regs.beg[i] == -1) return Qnil;
1050
+
1051
+ return extract_range(p, p->prev + p->regs.beg[i],
1052
+ p->prev + p->regs.end[i]);
1053
+ }
1054
+
1055
+ /*
1056
+ * call-seq: size
1057
+ *
1058
+ * Returns the amount of subgroups in the most recent match.
1059
+ * The full match counts as a subgroup.
1060
+ *
1061
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
1062
+ * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 "
1063
+ * s.size # -> 4
1064
+ */
1065
+ static VALUE
1066
+ strscan_size(VALUE self)
1067
+ {
1068
+ struct strscanner *p;
1069
+
1070
+ GET_SCANNER(self, p);
1071
+ if (! MATCHED_P(p)) return Qnil;
1072
+ return INT2FIX(p->regs.num_regs);
1073
+ }
1074
+
1075
+ /*
1076
+ * call-seq: captures
1077
+ *
1078
+ * Returns the subgroups in the most recent match (not including the full match).
1079
+ * If nothing was priorly matched, it returns nil.
1080
+ *
1081
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
1082
+ * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 "
1083
+ * s.captures # -> ["Fri", "Dec", "12"]
1084
+ * s.scan(/(\w+) (\w+) (\d+) /) # -> nil
1085
+ * s.captures # -> nil
1086
+ */
1087
+ static VALUE
1088
+ strscan_captures(VALUE self)
1089
+ {
1090
+ struct strscanner *p;
1091
+ int i, num_regs;
1092
+ VALUE new_ary;
1093
+
1094
+ GET_SCANNER(self, p);
1095
+ if (! MATCHED_P(p)) return Qnil;
1096
+
1097
+ num_regs = p->regs.num_regs;
1098
+ new_ary = rb_ary_new2(num_regs);
1099
+
1100
+ for (i = 1; i < num_regs; i++) {
1101
+ VALUE str = extract_range(p, p->prev + p->regs.beg[i],
1102
+ p->prev + p->regs.end[i]);
1103
+ rb_ary_push(new_ary, str);
1104
+ }
1105
+
1106
+ return new_ary;
1107
+ }
1108
+
1109
+ /*
1110
+ * call-seq:
1111
+ * scanner.values_at( i1, i2, ... iN ) -> an_array
1112
+ *
1113
+ * Returns the subgroups in the most recent match at the given indices.
1114
+ * If nothing was priorly matched, it returns nil.
1115
+ *
1116
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
1117
+ * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 "
1118
+ * s.values_at 0, -1, 5, 2 # -> ["Fri Dec 12 ", "12", nil, "Dec"]
1119
+ * s.scan(/(\w+) (\w+) (\d+) /) # -> nil
1120
+ * s.values_at 0, -1, 5, 2 # -> nil
1121
+ */
1122
+
1123
+ static VALUE
1124
+ strscan_values_at(int argc, VALUE *argv, VALUE self)
1125
+ {
1126
+ struct strscanner *p;
1127
+ long i;
1128
+ VALUE new_ary;
1129
+
1130
+ GET_SCANNER(self, p);
1131
+ if (! MATCHED_P(p)) return Qnil;
1132
+
1133
+ new_ary = rb_ary_new2(argc);
1134
+ for (i = 0; i<argc; i++) {
1135
+ rb_ary_push(new_ary, strscan_aref(self, argv[i]));
1136
+ }
1137
+
1138
+ return new_ary;
1139
+ }
1140
+
1141
+ /*
1142
+ * Returns the <i><b>pre</b>-match</i> (in the regular expression sense) of the last scan.
1143
+ *
1144
+ * s = StringScanner.new('test string')
1145
+ * s.scan(/\w+/) # -> "test"
1146
+ * s.scan(/\s+/) # -> " "
1147
+ * s.pre_match # -> "test"
1148
+ * s.post_match # -> "string"
1149
+ */
1150
+ static VALUE
1151
+ strscan_pre_match(VALUE self)
1152
+ {
1153
+ struct strscanner *p;
1154
+
1155
+ GET_SCANNER(self, p);
1156
+ if (! MATCHED_P(p)) return Qnil;
1157
+ return extract_range(p, 0, p->prev + p->regs.beg[0]);
1158
+ }
1159
+
1160
+ /*
1161
+ * Returns the <i><b>post</b>-match</i> (in the regular expression sense) of the last scan.
1162
+ *
1163
+ * s = StringScanner.new('test string')
1164
+ * s.scan(/\w+/) # -> "test"
1165
+ * s.scan(/\s+/) # -> " "
1166
+ * s.pre_match # -> "test"
1167
+ * s.post_match # -> "string"
1168
+ */
1169
+ static VALUE
1170
+ strscan_post_match(VALUE self)
1171
+ {
1172
+ struct strscanner *p;
1173
+
1174
+ GET_SCANNER(self, p);
1175
+ if (! MATCHED_P(p)) return Qnil;
1176
+ return extract_range(p, p->prev + p->regs.end[0], S_LEN(p));
1177
+ }
1178
+
1179
+ /*
1180
+ * Returns the "rest" of the string (i.e. everything after the scan pointer).
1181
+ * If there is no more data (eos? = true), it returns <tt>""</tt>.
1182
+ */
1183
+ static VALUE
1184
+ strscan_rest(VALUE self)
1185
+ {
1186
+ struct strscanner *p;
1187
+
1188
+ GET_SCANNER(self, p);
1189
+ if (EOS_P(p)) {
1190
+ return infect(str_new(p, "", 0), p);
1191
+ }
1192
+ return extract_range(p, p->curr, S_LEN(p));
1193
+ }
1194
+
1195
+ /*
1196
+ * <tt>s.rest_size</tt> is equivalent to <tt>s.rest.size</tt>.
1197
+ */
1198
+ static VALUE
1199
+ strscan_rest_size(VALUE self)
1200
+ {
1201
+ struct strscanner *p;
1202
+ long i;
1203
+
1204
+ GET_SCANNER(self, p);
1205
+ if (EOS_P(p)) {
1206
+ return INT2FIX(0);
1207
+ }
1208
+ i = S_RESTLEN(p);
1209
+ return INT2FIX(i);
1210
+ }
1211
+
1212
+ /*
1213
+ * <tt>s.restsize</tt> is equivalent to <tt>s.rest_size</tt>.
1214
+ * This method is obsolete; use #rest_size instead.
1215
+ */
1216
+ static VALUE
1217
+ strscan_restsize(VALUE self)
1218
+ {
1219
+ rb_warning("StringScanner#restsize is obsolete; use #rest_size instead");
1220
+ return strscan_rest_size(self);
1221
+ }
1222
+
1223
+ #define INSPECT_LENGTH 5
1224
+
1225
+ /*
1226
+ * Returns a string that represents the StringScanner object, showing:
1227
+ * - the current position
1228
+ * - the size of the string
1229
+ * - the characters surrounding the scan pointer
1230
+ *
1231
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
1232
+ * s.inspect # -> '#<StringScanner 0/21 @ "Fri D...">'
1233
+ * s.scan_until /12/ # -> "Fri Dec 12"
1234
+ * s.inspect # -> '#<StringScanner 10/21 "...ec 12" @ " 1975...">'
1235
+ */
1236
+ static VALUE
1237
+ strscan_inspect(VALUE self)
1238
+ {
1239
+ struct strscanner *p;
1240
+ VALUE a, b;
1241
+
1242
+ p = check_strscan(self);
1243
+ if (NIL_P(p->str)) {
1244
+ a = rb_sprintf("#<%"PRIsVALUE" (uninitialized)>", rb_obj_class(self));
1245
+ return infect(a, p);
1246
+ }
1247
+ if (EOS_P(p)) {
1248
+ a = rb_sprintf("#<%"PRIsVALUE" fin>", rb_obj_class(self));
1249
+ return infect(a, p);
1250
+ }
1251
+ if (p->curr == 0) {
1252
+ b = inspect2(p);
1253
+ a = rb_sprintf("#<%"PRIsVALUE" %ld/%ld @ %"PRIsVALUE">",
1254
+ rb_obj_class(self),
1255
+ p->curr, S_LEN(p),
1256
+ b);
1257
+ return infect(a, p);
1258
+ }
1259
+ a = inspect1(p);
1260
+ b = inspect2(p);
1261
+ a = rb_sprintf("#<%"PRIsVALUE" %ld/%ld %"PRIsVALUE" @ %"PRIsVALUE">",
1262
+ rb_obj_class(self),
1263
+ p->curr, S_LEN(p),
1264
+ a, b);
1265
+ return infect(a, p);
1266
+ }
1267
+
1268
+ static VALUE
1269
+ inspect1(struct strscanner *p)
1270
+ {
1271
+ VALUE str;
1272
+ long len;
1273
+
1274
+ if (p->curr == 0) return rb_str_new2("");
1275
+ if (p->curr > INSPECT_LENGTH) {
1276
+ str = rb_str_new_cstr("...");
1277
+ len = INSPECT_LENGTH;
1278
+ }
1279
+ else {
1280
+ str = rb_str_new(0, 0);
1281
+ len = p->curr;
1282
+ }
1283
+ rb_str_cat(str, CURPTR(p) - len, len);
1284
+ return rb_str_dump(str);
1285
+ }
1286
+
1287
+ static VALUE
1288
+ inspect2(struct strscanner *p)
1289
+ {
1290
+ VALUE str;
1291
+ long len;
1292
+
1293
+ if (EOS_P(p)) return rb_str_new2("");
1294
+ len = S_RESTLEN(p);
1295
+ if (len > INSPECT_LENGTH) {
1296
+ str = rb_str_new(CURPTR(p), INSPECT_LENGTH);
1297
+ rb_str_cat2(str, "...");
1298
+ }
1299
+ else {
1300
+ str = rb_str_new(CURPTR(p), len);
1301
+ }
1302
+ return rb_str_dump(str);
1303
+ }
1304
+
1305
+ /* =======================================================================
1306
+ Ruby Interface
1307
+ ======================================================================= */
1308
+
1309
+ /*
1310
+ * Document-class: StringScanner
1311
+ *
1312
+ * StringScanner provides for lexical scanning operations on a String. Here is
1313
+ * an example of its usage:
1314
+ *
1315
+ * s = StringScanner.new('This is an example string')
1316
+ * s.eos? # -> false
1317
+ *
1318
+ * p s.scan(/\w+/) # -> "This"
1319
+ * p s.scan(/\w+/) # -> nil
1320
+ * p s.scan(/\s+/) # -> " "
1321
+ * p s.scan(/\s+/) # -> nil
1322
+ * p s.scan(/\w+/) # -> "is"
1323
+ * s.eos? # -> false
1324
+ *
1325
+ * p s.scan(/\s+/) # -> " "
1326
+ * p s.scan(/\w+/) # -> "an"
1327
+ * p s.scan(/\s+/) # -> " "
1328
+ * p s.scan(/\w+/) # -> "example"
1329
+ * p s.scan(/\s+/) # -> " "
1330
+ * p s.scan(/\w+/) # -> "string"
1331
+ * s.eos? # -> true
1332
+ *
1333
+ * p s.scan(/\s+/) # -> nil
1334
+ * p s.scan(/\w+/) # -> nil
1335
+ *
1336
+ * Scanning a string means remembering the position of a <i>scan pointer</i>,
1337
+ * which is just an index. The point of scanning is to move forward a bit at
1338
+ * a time, so matches are sought after the scan pointer; usually immediately
1339
+ * after it.
1340
+ *
1341
+ * Given the string "test string", here are the pertinent scan pointer
1342
+ * positions:
1343
+ *
1344
+ *   t e s t   s t r i n g
1345
+ *   0 1 2 ...             1
1346
+ *                         0
1347
+ *
1348
+ * When you #scan for a pattern (a regular expression), the match must occur
1349
+ * at the character after the scan pointer. If you use #scan_until, then the
1350
+ * match can occur anywhere after the scan pointer. In both cases, the scan
1351
+ * pointer moves <i>just beyond</i> the last character of the match, ready to
1352
+ * scan again from the next character onwards. This is demonstrated by the
1353
+ * example above.
1354
+ *
1355
+ * == Method Categories
1356
+ *
1357
+ * There are other methods besides the plain scanners. You can look ahead in
1358
+ * the string without actually scanning. You can access the most recent match.
1359
+ * You can modify the string being scanned, reset or terminate the scanner,
1360
+ * find out or change the position of the scan pointer, skip ahead, and so on.
1361
+ *
1362
+ * === Advancing the Scan Pointer
1363
+ *
1364
+ * - #getch
1365
+ * - #get_byte
1366
+ * - #scan
1367
+ * - #scan_until
1368
+ * - #skip
1369
+ * - #skip_until
1370
+ *
1371
+ * === Looking Ahead
1372
+ *
1373
+ * - #check
1374
+ * - #check_until
1375
+ * - #exist?
1376
+ * - #match?
1377
+ * - #peek
1378
+ *
1379
+ * === Finding Where we Are
1380
+ *
1381
+ * - #beginning_of_line? (#bol?)
1382
+ * - #eos?
1383
+ * - #rest?
1384
+ * - #rest_size
1385
+ * - #pos
1386
+ *
1387
+ * === Setting Where we Are
1388
+ *
1389
+ * - #reset
1390
+ * - #terminate
1391
+ * - #pos=
1392
+ *
1393
+ * === Match Data
1394
+ *
1395
+ * - #matched
1396
+ * - #matched?
1397
+ * - #matched_size
1398
+ * - []
1399
+ * - #pre_match
1400
+ * - #post_match
1401
+ *
1402
+ * === Miscellaneous
1403
+ *
1404
+ * - <<
1405
+ * - #concat
1406
+ * - #string
1407
+ * - #string=
1408
+ * - #unscan
1409
+ *
1410
+ * There are aliases to several of the methods.
1411
+ */
1412
+ void
1413
+ Init_strscan(void)
1414
+ {
1415
+ ID id_scanerr = rb_intern("ScanError");
1416
+ VALUE tmp;
1417
+
1418
+ id_byteslice = rb_intern("byteslice");
1419
+
1420
+ StringScanner = rb_define_class("StringScanner", rb_cObject);
1421
+ ScanError = rb_define_class_under(StringScanner, "Error", rb_eStandardError);
1422
+ if (!rb_const_defined(rb_cObject, id_scanerr)) {
1423
+ rb_const_set(rb_cObject, id_scanerr, ScanError);
1424
+ }
1425
+ tmp = rb_str_new2(STRSCAN_VERSION);
1426
+ rb_obj_freeze(tmp);
1427
+ rb_const_set(StringScanner, rb_intern("Version"), tmp);
1428
+ tmp = rb_str_new2("$Id$");
1429
+ rb_obj_freeze(tmp);
1430
+ rb_const_set(StringScanner, rb_intern("Id"), tmp);
1431
+
1432
+ rb_define_alloc_func(StringScanner, strscan_s_allocate);
1433
+ rb_define_private_method(StringScanner, "initialize", strscan_initialize, -1);
1434
+ rb_define_private_method(StringScanner, "initialize_copy", strscan_init_copy, 1);
1435
+ rb_define_singleton_method(StringScanner, "must_C_version", strscan_s_mustc, 0);
1436
+ rb_define_method(StringScanner, "reset", strscan_reset, 0);
1437
+ rb_define_method(StringScanner, "terminate", strscan_terminate, 0);
1438
+ rb_define_method(StringScanner, "clear", strscan_clear, 0);
1439
+ rb_define_method(StringScanner, "string", strscan_get_string, 0);
1440
+ rb_define_method(StringScanner, "string=", strscan_set_string, 1);
1441
+ rb_define_method(StringScanner, "concat", strscan_concat, 1);
1442
+ rb_define_method(StringScanner, "<<", strscan_concat, 1);
1443
+ rb_define_method(StringScanner, "pos", strscan_get_pos, 0);
1444
+ rb_define_method(StringScanner, "pos=", strscan_set_pos, 1);
1445
+ rb_define_method(StringScanner, "charpos", strscan_get_charpos, 0);
1446
+ rb_define_method(StringScanner, "pointer", strscan_get_pos, 0);
1447
+ rb_define_method(StringScanner, "pointer=", strscan_set_pos, 1);
1448
+
1449
+ rb_define_method(StringScanner, "scan", strscan_scan, 1);
1450
+ rb_define_method(StringScanner, "skip", strscan_skip, 1);
1451
+ rb_define_method(StringScanner, "match?", strscan_match_p, 1);
1452
+ rb_define_method(StringScanner, "check", strscan_check, 1);
1453
+ rb_define_method(StringScanner, "scan_full", strscan_scan_full, 3);
1454
+
1455
+ rb_define_method(StringScanner, "scan_until", strscan_scan_until, 1);
1456
+ rb_define_method(StringScanner, "skip_until", strscan_skip_until, 1);
1457
+ rb_define_method(StringScanner, "exist?", strscan_exist_p, 1);
1458
+ rb_define_method(StringScanner, "check_until", strscan_check_until, 1);
1459
+ rb_define_method(StringScanner, "search_full", strscan_search_full, 3);
1460
+
1461
+ rb_define_method(StringScanner, "getch", strscan_getch, 0);
1462
+ rb_define_method(StringScanner, "get_byte", strscan_get_byte, 0);
1463
+ rb_define_method(StringScanner, "getbyte", strscan_getbyte, 0);
1464
+ rb_define_method(StringScanner, "peek", strscan_peek, 1);
1465
+ rb_define_method(StringScanner, "peep", strscan_peep, 1);
1466
+
1467
+ rb_define_method(StringScanner, "unscan", strscan_unscan, 0);
1468
+
1469
+ rb_define_method(StringScanner, "beginning_of_line?", strscan_bol_p, 0);
1470
+ rb_alias(StringScanner, rb_intern("bol?"), rb_intern("beginning_of_line?"));
1471
+ rb_define_method(StringScanner, "eos?", strscan_eos_p, 0);
1472
+ rb_define_method(StringScanner, "empty?", strscan_empty_p, 0);
1473
+ rb_define_method(StringScanner, "rest?", strscan_rest_p, 0);
1474
+
1475
+ rb_define_method(StringScanner, "matched?", strscan_matched_p, 0);
1476
+ rb_define_method(StringScanner, "matched", strscan_matched, 0);
1477
+ rb_define_method(StringScanner, "matched_size", strscan_matched_size, 0);
1478
+ rb_define_method(StringScanner, "[]", strscan_aref, 1);
1479
+ rb_define_method(StringScanner, "pre_match", strscan_pre_match, 0);
1480
+ rb_define_method(StringScanner, "post_match", strscan_post_match, 0);
1481
+ rb_define_method(StringScanner, "size", strscan_size, 0);
1482
+ rb_define_method(StringScanner, "captures", strscan_captures, 0);
1483
+ rb_define_method(StringScanner, "values_at", strscan_values_at, -1);
1484
+
1485
+ rb_define_method(StringScanner, "rest", strscan_rest, 0);
1486
+ rb_define_method(StringScanner, "rest_size", strscan_rest_size, 0);
1487
+ rb_define_method(StringScanner, "restsize", strscan_restsize, 0);
1488
+
1489
+ rb_define_method(StringScanner, "inspect", strscan_inspect, 0);
1490
+ }