strscan 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1490 @@
1
+ /*
2
+ $Id$
3
+
4
+ Copyright (c) 1999-2006 Minero Aoki
5
+
6
+ This program is free software.
7
+ You can distribute/modify this program under the terms of
8
+ the Ruby License. For details, see the file COPYING.
9
+ */
10
+
11
+ #include "ruby/ruby.h"
12
+ #include "ruby/re.h"
13
+ #include "ruby/encoding.h"
14
+ #include "regint.h"
15
+
16
+ #define STRSCAN_VERSION "0.7.0"
17
+
18
+ /* =======================================================================
19
+ Data Type Definitions
20
+ ======================================================================= */
21
+
22
+ static VALUE StringScanner;
23
+ static VALUE ScanError;
24
+ static ID id_byteslice;
25
+
26
+ struct strscanner
27
+ {
28
+ /* multi-purpose flags */
29
+ unsigned long flags;
30
+ #define FLAG_MATCHED (1 << 0)
31
+
32
+ /* the string to scan */
33
+ VALUE str;
34
+
35
+ /* scan pointers */
36
+ long prev; /* legal only when MATCHED_P(s) */
37
+ long curr; /* always legal */
38
+
39
+ /* the regexp register; legal only when MATCHED_P(s) */
40
+ struct re_registers regs;
41
+
42
+ /* regexp used for last scan */
43
+ VALUE regex;
44
+ };
45
+
46
+ #define MATCHED_P(s) ((s)->flags & FLAG_MATCHED)
47
+ #define MATCHED(s) (s)->flags |= FLAG_MATCHED
48
+ #define CLEAR_MATCH_STATUS(s) (s)->flags &= ~FLAG_MATCHED
49
+
50
+ #define S_PBEG(s) (RSTRING_PTR((s)->str))
51
+ #define S_LEN(s) (RSTRING_LEN((s)->str))
52
+ #define S_PEND(s) (S_PBEG(s) + S_LEN(s))
53
+ #define CURPTR(s) (S_PBEG(s) + (s)->curr)
54
+ #define S_RESTLEN(s) (S_LEN(s) - (s)->curr)
55
+
56
/* True when the scan position of scanner `s` is at or past the end of its string.
 * Uses the macro argument throughout, so it works for any scanner expression. */
#define EOS_P(s) ((s)->curr >= RSTRING_LEN((s)->str))
57
+
58
+ #define GET_SCANNER(obj,var) do {\
59
+ (var) = check_strscan(obj);\
60
+ if (NIL_P((var)->str)) rb_raise(rb_eArgError, "uninitialized StringScanner object");\
61
+ } while (0)
62
+
63
+ /* =======================================================================
64
+ Function Prototypes
65
+ ======================================================================= */
66
+
67
+ static inline long minl _((const long n, const long x));
68
+ static VALUE infect _((VALUE str, struct strscanner *p));
69
+ static VALUE extract_range _((struct strscanner *p, long beg_i, long end_i));
70
+ static VALUE extract_beg_len _((struct strscanner *p, long beg_i, long len));
71
+
72
+ static struct strscanner *check_strscan _((VALUE obj));
73
+ static void strscan_mark _((void *p));
74
+ static void strscan_free _((void *p));
75
+ static size_t strscan_memsize _((const void *p));
76
+ static VALUE strscan_s_allocate _((VALUE klass));
77
+ static VALUE strscan_initialize _((int argc, VALUE *argv, VALUE self));
78
+ static VALUE strscan_init_copy _((VALUE vself, VALUE vorig));
79
+
80
+ static VALUE strscan_s_mustc _((VALUE self));
81
+ static VALUE strscan_terminate _((VALUE self));
82
+ static VALUE strscan_clear _((VALUE self));
83
+ static VALUE strscan_get_string _((VALUE self));
84
+ static VALUE strscan_set_string _((VALUE self, VALUE str));
85
+ static VALUE strscan_concat _((VALUE self, VALUE str));
86
+ static VALUE strscan_get_pos _((VALUE self));
87
+ static VALUE strscan_set_pos _((VALUE self, VALUE pos));
88
+ static VALUE strscan_do_scan _((VALUE self, VALUE regex,
89
+ int succptr, int getstr, int headonly));
90
+ static VALUE strscan_scan _((VALUE self, VALUE re));
91
+ static VALUE strscan_match_p _((VALUE self, VALUE re));
92
+ static VALUE strscan_skip _((VALUE self, VALUE re));
93
+ static VALUE strscan_check _((VALUE self, VALUE re));
94
+ static VALUE strscan_scan_full _((VALUE self, VALUE re,
95
+ VALUE succp, VALUE getp));
96
+ static VALUE strscan_scan_until _((VALUE self, VALUE re));
97
+ static VALUE strscan_skip_until _((VALUE self, VALUE re));
98
+ static VALUE strscan_check_until _((VALUE self, VALUE re));
99
+ static VALUE strscan_search_full _((VALUE self, VALUE re,
100
+ VALUE succp, VALUE getp));
101
+ static void adjust_registers_to_matched _((struct strscanner *p));
102
+ static VALUE strscan_getch _((VALUE self));
103
+ static VALUE strscan_get_byte _((VALUE self));
104
+ static VALUE strscan_getbyte _((VALUE self));
105
+ static VALUE strscan_peek _((VALUE self, VALUE len));
106
+ static VALUE strscan_peep _((VALUE self, VALUE len));
107
+ static VALUE strscan_unscan _((VALUE self));
108
+ static VALUE strscan_bol_p _((VALUE self));
109
+ static VALUE strscan_eos_p _((VALUE self));
110
+ static VALUE strscan_empty_p _((VALUE self));
111
+ static VALUE strscan_rest_p _((VALUE self));
112
+ static VALUE strscan_matched_p _((VALUE self));
113
+ static VALUE strscan_matched _((VALUE self));
114
+ static VALUE strscan_matched_size _((VALUE self));
115
+ static VALUE strscan_aref _((VALUE self, VALUE idx));
116
+ static VALUE strscan_pre_match _((VALUE self));
117
+ static VALUE strscan_post_match _((VALUE self));
118
+ static VALUE strscan_rest _((VALUE self));
119
+ static VALUE strscan_rest_size _((VALUE self));
120
+
121
+ static VALUE strscan_inspect _((VALUE self));
122
+ static VALUE inspect1 _((struct strscanner *p));
123
+ static VALUE inspect2 _((struct strscanner *p));
124
+
125
+ /* =======================================================================
126
+ Utils
127
+ ======================================================================= */
128
+
129
+ static VALUE
130
+ infect(VALUE str, struct strscanner *p)
131
+ {
132
+ OBJ_INFECT(str, p->str);
133
+ return str;
134
+ }
135
+
136
+ static VALUE
137
+ str_new(struct strscanner *p, const char *ptr, long len)
138
+ {
139
+ VALUE str = rb_str_new(ptr, len);
140
+ rb_enc_copy(str, p->str);
141
+ return str;
142
+ }
143
+
144
/* Returns the smaller of two longs (y when they are equal). */
static inline long
minl(const long x, const long y)
{
    if (x < y) {
        return x;
    }
    return y;
}
149
+
150
+ static VALUE
151
+ extract_range(struct strscanner *p, long beg_i, long end_i)
152
+ {
153
+ if (beg_i > S_LEN(p)) return Qnil;
154
+ end_i = minl(end_i, S_LEN(p));
155
+ return infect(str_new(p, S_PBEG(p) + beg_i, end_i - beg_i), p);
156
+ }
157
+
158
+ static VALUE
159
+ extract_beg_len(struct strscanner *p, long beg_i, long len)
160
+ {
161
+ if (beg_i > S_LEN(p)) return Qnil;
162
+ len = minl(len, S_LEN(p) - beg_i);
163
+ return infect(str_new(p, S_PBEG(p) + beg_i, len), p);
164
+ }
165
+
166
+ /* =======================================================================
167
+ Constructor
168
+ ======================================================================= */
169
+
170
+ static void
171
+ strscan_mark(void *ptr)
172
+ {
173
+ struct strscanner *p = ptr;
174
+ rb_gc_mark(p->str);
175
+ }
176
+
177
+ static void
178
+ strscan_free(void *ptr)
179
+ {
180
+ struct strscanner *p = ptr;
181
+ onig_region_free(&(p->regs), 0);
182
+ ruby_xfree(p);
183
+ }
184
+
185
+ static size_t
186
+ strscan_memsize(const void *ptr)
187
+ {
188
+ const struct strscanner *p = ptr;
189
+ return sizeof(*p) - sizeof(p->regs) + onig_region_memsize(&p->regs);
190
+ }
191
+
192
+ static const rb_data_type_t strscanner_type = {
193
+ "StringScanner",
194
+ {strscan_mark, strscan_free, strscan_memsize},
195
+ 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
196
+ };
197
+
198
+ static VALUE
199
+ strscan_s_allocate(VALUE klass)
200
+ {
201
+ struct strscanner *p;
202
+ VALUE obj = TypedData_Make_Struct(klass, struct strscanner, &strscanner_type, p);
203
+
204
+ CLEAR_MATCH_STATUS(p);
205
+ onig_region_init(&(p->regs));
206
+ p->str = Qnil;
207
+ return obj;
208
+ }
209
+
210
+ /*
211
+ * call-seq: StringScanner.new(string, dup = false)
212
+ *
213
+ * Creates a new StringScanner object to scan over the given +string+.
214
+ * +dup+ argument is obsolete and not used now.
215
+ */
216
+ static VALUE
217
+ strscan_initialize(int argc, VALUE *argv, VALUE self)
218
+ {
219
+ struct strscanner *p;
220
+ VALUE str, need_dup;
221
+
222
+ p = check_strscan(self);
223
+ rb_scan_args(argc, argv, "11", &str, &need_dup);
224
+ StringValue(str);
225
+ p->str = str;
226
+
227
+ return self;
228
+ }
229
+
230
+ static struct strscanner *
231
+ check_strscan(VALUE obj)
232
+ {
233
+ return rb_check_typeddata(obj, &strscanner_type);
234
+ }
235
+
236
+ /*
237
+ * call-seq:
238
+ * dup
239
+ * clone
240
+ *
241
+ * Duplicates a StringScanner object.
242
+ */
243
+ static VALUE
244
+ strscan_init_copy(VALUE vself, VALUE vorig)
245
+ {
246
+ struct strscanner *self, *orig;
247
+
248
+ self = check_strscan(vself);
249
+ orig = check_strscan(vorig);
250
+ if (self != orig) {
251
+ self->flags = orig->flags;
252
+ self->str = orig->str;
253
+ self->prev = orig->prev;
254
+ self->curr = orig->curr;
255
+ if (rb_reg_region_copy(&self->regs, &orig->regs))
256
+ rb_memerror();
257
+ RB_GC_GUARD(vorig);
258
+ }
259
+
260
+ return vself;
261
+ }
262
+
263
+ /* =======================================================================
264
+ Instance Methods
265
+ ======================================================================= */
266
+
267
+ /*
268
+ * call-seq: StringScanner.must_C_version
269
+ *
270
+ * This method is defined for backward compatibility.
271
+ */
272
+ static VALUE
273
+ strscan_s_mustc(VALUE self)
274
+ {
275
+ return self;
276
+ }
277
+
278
+ /*
279
+ * Reset the scan pointer (index 0) and clear matching data.
280
+ */
281
+ static VALUE
282
+ strscan_reset(VALUE self)
283
+ {
284
+ struct strscanner *p;
285
+
286
+ GET_SCANNER(self, p);
287
+ p->curr = 0;
288
+ CLEAR_MATCH_STATUS(p);
289
+ return self;
290
+ }
291
+
292
+ /*
293
+ * call-seq:
294
+ * terminate
295
+ * clear
296
+ *
297
+ * Set the scan pointer to the end of the string and clear matching data.
298
+ */
299
+ static VALUE
300
+ strscan_terminate(VALUE self)
301
+ {
302
+ struct strscanner *p;
303
+
304
+ GET_SCANNER(self, p);
305
+ p->curr = S_LEN(p);
306
+ CLEAR_MATCH_STATUS(p);
307
+ return self;
308
+ }
309
+
310
+ /*
311
+ * Equivalent to #terminate.
312
+ * This method is obsolete; use #terminate instead.
313
+ */
314
+ static VALUE
315
+ strscan_clear(VALUE self)
316
+ {
317
+ rb_warning("StringScanner#clear is obsolete; use #terminate instead");
318
+ return strscan_terminate(self);
319
+ }
320
+
321
+ /*
322
+ * Returns the string being scanned.
323
+ */
324
+ static VALUE
325
+ strscan_get_string(VALUE self)
326
+ {
327
+ struct strscanner *p;
328
+
329
+ GET_SCANNER(self, p);
330
+ return p->str;
331
+ }
332
+
333
+ /*
334
+ * call-seq: string=(str)
335
+ *
336
+ * Changes the string being scanned to +str+ and resets the scanner.
337
+ * Returns +str+.
338
+ */
339
+ static VALUE
340
+ strscan_set_string(VALUE self, VALUE str)
341
+ {
342
+ struct strscanner *p = check_strscan(self);
343
+
344
+ StringValue(str);
345
+ p->str = str;
346
+ p->curr = 0;
347
+ CLEAR_MATCH_STATUS(p);
348
+ return str;
349
+ }
350
+
351
+ /*
352
+ * call-seq:
353
+ * concat(str)
354
+ * <<(str)
355
+ *
356
+ * Appends +str+ to the string being scanned.
357
+ * This method does not affect scan pointer.
358
+ *
359
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
360
+ * s.scan(/Fri /)
361
+ * s << " +1000 GMT"
362
+ * s.string # -> "Fri Dec 12 1975 14:39 +1000 GMT"
363
+ * s.scan(/Dec/) # -> "Dec"
364
+ */
365
+ static VALUE
366
+ strscan_concat(VALUE self, VALUE str)
367
+ {
368
+ struct strscanner *p;
369
+
370
+ GET_SCANNER(self, p);
371
+ StringValue(str);
372
+ rb_str_append(p->str, str);
373
+ return self;
374
+ }
375
+
376
+ /*
377
+ * Returns the byte position of the scan pointer. In the 'reset' position, this
378
+ * value is zero. In the 'terminated' position (i.e. the string is exhausted),
379
+ * this value is the bytesize of the string.
380
+ *
381
+ * In short, it's a 0-based index into bytes of the string.
382
+ *
383
+ * s = StringScanner.new('test string')
384
+ * s.pos # -> 0
385
+ * s.scan_until /str/ # -> "test str"
386
+ * s.pos # -> 8
387
+ * s.terminate # -> #<StringScanner fin>
388
+ * s.pos # -> 11
389
+ */
390
+ static VALUE
391
+ strscan_get_pos(VALUE self)
392
+ {
393
+ struct strscanner *p;
394
+
395
+ GET_SCANNER(self, p);
396
+ return INT2FIX(p->curr);
397
+ }
398
+
399
+ /*
400
+ * Returns the character position of the scan pointer. In the 'reset' position, this
401
+ * value is zero. In the 'terminated' position (i.e. the string is exhausted),
402
+ * this value is the size of the string.
403
+ *
404
+ * In short, it's a 0-based index into the string.
405
+ *
406
+ * s = StringScanner.new("abcädeföghi")
407
+ * s.charpos # -> 0
408
+ * s.scan_until(/ä/) # -> "abcä"
409
+ * s.pos # -> 5
410
+ * s.charpos # -> 4
411
+ */
412
+ static VALUE
413
+ strscan_get_charpos(VALUE self)
414
+ {
415
+ struct strscanner *p;
416
+ VALUE substr;
417
+
418
+ GET_SCANNER(self, p);
419
+
420
+ substr = rb_funcall(p->str, id_byteslice, 2, INT2FIX(0), INT2NUM(p->curr));
421
+
422
+ return rb_str_length(substr);
423
+ }
424
+
425
+ /*
426
+ * call-seq: pos=(n)
427
+ *
428
+ * Set the byte position of the scan pointer.
429
+ *
430
+ * s = StringScanner.new('test string')
431
+ * s.pos = 7 # -> 7
432
+ * s.rest # -> "ring"
433
+ */
434
+ static VALUE
435
+ strscan_set_pos(VALUE self, VALUE v)
436
+ {
437
+ struct strscanner *p;
438
+ long i;
439
+
440
+ GET_SCANNER(self, p);
441
+ i = NUM2INT(v);
442
+ if (i < 0) i += S_LEN(p);
443
+ if (i < 0) rb_raise(rb_eRangeError, "index out of range");
444
+ if (i > S_LEN(p)) rb_raise(rb_eRangeError, "index out of range");
445
+ p->curr = i;
446
+ return INT2NUM(i);
447
+ }
448
+
449
+ static VALUE
450
+ strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly)
451
+ {
452
+ regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
453
+ struct strscanner *p;
454
+ regex_t *re;
455
+ long ret;
456
+ int tmpreg;
457
+
458
+ Check_Type(regex, T_REGEXP);
459
+ GET_SCANNER(self, p);
460
+
461
+ CLEAR_MATCH_STATUS(p);
462
+ if (S_RESTLEN(p) < 0) {
463
+ return Qnil;
464
+ }
465
+
466
+ p->regex = regex;
467
+ re = rb_reg_prepare_re(regex, p->str);
468
+ tmpreg = re != RREGEXP_PTR(regex);
469
+ if (!tmpreg) RREGEXP(regex)->usecnt++;
470
+
471
+ if (headonly) {
472
+ ret = onig_match(re, (UChar* )CURPTR(p),
473
+ (UChar* )(CURPTR(p) + S_RESTLEN(p)),
474
+ (UChar* )CURPTR(p), &(p->regs), ONIG_OPTION_NONE);
475
+ }
476
+ else {
477
+ ret = onig_search(re,
478
+ (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
479
+ (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
480
+ &(p->regs), ONIG_OPTION_NONE);
481
+ }
482
+ if (!tmpreg) RREGEXP(regex)->usecnt--;
483
+ if (tmpreg) {
484
+ if (RREGEXP(regex)->usecnt) {
485
+ onig_free(re);
486
+ }
487
+ else {
488
+ onig_free(RREGEXP_PTR(regex));
489
+ RREGEXP_PTR(regex) = re;
490
+ }
491
+ }
492
+
493
+ if (ret == -2) rb_raise(ScanError, "regexp buffer overflow");
494
+ if (ret < 0) {
495
+ /* not matched */
496
+ return Qnil;
497
+ }
498
+
499
+ MATCHED(p);
500
+ p->prev = p->curr;
501
+ if (succptr) {
502
+ p->curr += p->regs.end[0];
503
+ }
504
+ if (getstr) {
505
+ return extract_beg_len(p, p->prev, p->regs.end[0]);
506
+ }
507
+ else {
508
+ return INT2FIX(p->regs.end[0]);
509
+ }
510
+ }
511
+
512
+ /*
513
+ * call-seq: scan(pattern) => String
514
+ *
515
+ * Tries to match with +pattern+ at the current position. If there's a match,
516
+ * the scanner advances the "scan pointer" and returns the matched string.
517
+ * Otherwise, the scanner returns +nil+.
518
+ *
519
+ * s = StringScanner.new('test string')
520
+ * p s.scan(/\w+/) # -> "test"
521
+ * p s.scan(/\w+/) # -> nil
522
+ * p s.scan(/\s+/) # -> " "
523
+ * p s.scan(/\w+/) # -> "string"
524
+ * p s.scan(/./) # -> nil
525
+ *
526
+ */
527
+ static VALUE
528
+ strscan_scan(VALUE self, VALUE re)
529
+ {
530
+ return strscan_do_scan(self, re, 1, 1, 1);
531
+ }
532
+
533
+ /*
534
+ * call-seq: match?(pattern)
535
+ *
536
+ * Tests whether the given +pattern+ is matched from the current scan pointer.
537
+ * Returns the length of the match, or +nil+. The scan pointer is not advanced.
538
+ *
539
+ * s = StringScanner.new('test string')
540
+ * p s.match?(/\w+/) # -> 4
541
+ * p s.match?(/\w+/) # -> 4
542
+ * p s.match?(/\s+/) # -> nil
543
+ */
544
+ static VALUE
545
+ strscan_match_p(VALUE self, VALUE re)
546
+ {
547
+ return strscan_do_scan(self, re, 0, 0, 1);
548
+ }
549
+
550
+ /*
551
+ * call-seq: skip(pattern)
552
+ *
553
+ * Attempts to skip over the given +pattern+ beginning with the scan pointer.
554
+ * If it matches, the scan pointer is advanced to the end of the match, and the
555
+ * length of the match is returned. Otherwise, +nil+ is returned.
556
+ *
557
+ * It's similar to #scan, but without returning the matched string.
558
+ *
559
+ * s = StringScanner.new('test string')
560
+ * p s.skip(/\w+/) # -> 4
561
+ * p s.skip(/\w+/) # -> nil
562
+ * p s.skip(/\s+/) # -> 1
563
+ * p s.skip(/\w+/) # -> 6
564
+ * p s.skip(/./) # -> nil
565
+ *
566
+ */
567
+ static VALUE
568
+ strscan_skip(VALUE self, VALUE re)
569
+ {
570
+ return strscan_do_scan(self, re, 1, 0, 1);
571
+ }
572
+
573
+ /*
574
+ * call-seq: check(pattern)
575
+ *
576
+ * This returns the value that #scan would return, without advancing the scan
577
+ * pointer. The match register is affected, though.
578
+ *
579
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
580
+ * s.check /Fri/ # -> "Fri"
581
+ * s.pos # -> 0
582
+ * s.matched # -> "Fri"
583
+ * s.check /12/ # -> nil
584
+ * s.matched # -> nil
585
+ *
586
+ * Mnemonic: it "checks" to see whether a #scan will return a value.
587
+ */
588
+ static VALUE
589
+ strscan_check(VALUE self, VALUE re)
590
+ {
591
+ return strscan_do_scan(self, re, 0, 1, 1);
592
+ }
593
+
594
+ /*
595
+ * call-seq: scan_full(pattern, advance_pointer_p, return_string_p)
596
+ *
597
+ * Tests whether the given +pattern+ is matched from the current scan pointer.
598
+ * Advances the scan pointer if +advance_pointer_p+ is true.
599
+ * Returns the matched string if +return_string_p+ is true.
600
+ * The match register is affected.
601
+ *
602
+ * "full" means "#scan with full parameters".
603
+ */
604
+ static VALUE
605
+ strscan_scan_full(VALUE self, VALUE re, VALUE s, VALUE f)
606
+ {
607
+ return strscan_do_scan(self, re, RTEST(s), RTEST(f), 1);
608
+ }
609
+
610
+ /*
611
+ * call-seq: scan_until(pattern)
612
+ *
613
+ * Scans the string _until_ the +pattern+ is matched. Returns the substring up
614
+ * to and including the end of the match, advancing the scan pointer to that
615
+ * location. If there is no match, +nil+ is returned.
616
+ *
617
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
618
+ * s.scan_until(/1/) # -> "Fri Dec 1"
619
+ * s.pre_match # -> "Fri Dec "
620
+ * s.scan_until(/XYZ/) # -> nil
621
+ */
622
+ static VALUE
623
+ strscan_scan_until(VALUE self, VALUE re)
624
+ {
625
+ return strscan_do_scan(self, re, 1, 1, 0);
626
+ }
627
+
628
+ /*
629
+ * call-seq: exist?(pattern)
630
+ *
631
+ * Looks _ahead_ to see if the +pattern+ exists _anywhere_ in the string,
632
+ * without advancing the scan pointer. This predicates whether a #scan_until
633
+ * will return a value.
634
+ *
635
+ * s = StringScanner.new('test string')
636
+ * s.exist? /s/ # -> 3
637
+ * s.scan /test/ # -> "test"
638
+ * s.exist? /s/ # -> 2
639
+ * s.exist? /e/ # -> nil
640
+ */
641
+ static VALUE
642
+ strscan_exist_p(VALUE self, VALUE re)
643
+ {
644
+ return strscan_do_scan(self, re, 0, 0, 0);
645
+ }
646
+
647
+ /*
648
+ * call-seq: skip_until(pattern)
649
+ *
650
+ * Advances the scan pointer until +pattern+ is matched and consumed. Returns
651
+ * the number of bytes advanced, or +nil+ if no match was found.
652
+ *
653
+ * Look ahead to match +pattern+, and advance the scan pointer to the _end_
654
+ * of the match. Return the number of bytes advanced, or +nil+ if the
655
+ * match was unsuccessful.
656
+ *
657
+ * It's similar to #scan_until, but without returning the intervening string.
658
+ *
659
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
660
+ * s.skip_until /12/ # -> 10
661
+ * s #
662
+ */
663
+ static VALUE
664
+ strscan_skip_until(VALUE self, VALUE re)
665
+ {
666
+ return strscan_do_scan(self, re, 1, 0, 0);
667
+ }
668
+
669
+ /*
670
+ * call-seq: check_until(pattern)
671
+ *
672
+ * This returns the value that #scan_until would return, without advancing the
673
+ * scan pointer. The match register is affected, though.
674
+ *
675
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
676
+ * s.check_until /12/ # -> "Fri Dec 12"
677
+ * s.pos # -> 0
678
+ * s.matched # -> "12"
679
+ *
680
+ * Mnemonic: it "checks" to see whether a #scan_until will return a value.
681
+ */
682
+ static VALUE
683
+ strscan_check_until(VALUE self, VALUE re)
684
+ {
685
+ return strscan_do_scan(self, re, 0, 1, 0);
686
+ }
687
+
688
+ /*
689
+ * call-seq: search_full(pattern, advance_pointer_p, return_string_p)
690
+ *
691
+ * Scans the string _until_ the +pattern+ is matched.
692
+ * Advances the scan pointer if +advance_pointer_p+, otherwise not.
693
+ * Returns the matched string if +return_string_p+ is true, otherwise
694
+ * returns the number of bytes advanced.
695
+ * This method does affect the match register.
696
+ */
697
+ static VALUE
698
+ strscan_search_full(VALUE self, VALUE re, VALUE s, VALUE f)
699
+ {
700
+ return strscan_do_scan(self, re, RTEST(s), RTEST(f), 0);
701
+ }
702
+
703
+ static void
704
+ adjust_registers_to_matched(struct strscanner *p)
705
+ {
706
+ onig_region_clear(&(p->regs));
707
+ onig_region_set(&(p->regs), 0, 0, (int)(p->curr - p->prev));
708
+ }
709
+
710
+ /*
711
+ * Scans one character and returns it.
712
+ * This method is multibyte character sensitive.
713
+ *
714
+ * s = StringScanner.new("ab")
715
+ * s.getch # => "a"
716
+ * s.getch # => "b"
717
+ * s.getch # => nil
718
+ *
719
+ * $KCODE = 'EUC'
720
+ * s = StringScanner.new("\244\242")
721
+ * s.getch # => "\244\242" # Japanese hira-kana "A" in EUC-JP
722
+ * s.getch # => nil
723
+ */
724
+ static VALUE
725
+ strscan_getch(VALUE self)
726
+ {
727
+ struct strscanner *p;
728
+ long len;
729
+
730
+ GET_SCANNER(self, p);
731
+ CLEAR_MATCH_STATUS(p);
732
+ if (EOS_P(p))
733
+ return Qnil;
734
+
735
+ len = rb_enc_mbclen(CURPTR(p), S_PEND(p), rb_enc_get(p->str));
736
+ len = minl(len, S_RESTLEN(p));
737
+ p->prev = p->curr;
738
+ p->curr += len;
739
+ MATCHED(p);
740
+ adjust_registers_to_matched(p);
741
+ return extract_range(p, p->prev + p->regs.beg[0],
742
+ p->prev + p->regs.end[0]);
743
+ }
744
+
745
+ /*
746
+ * Scans one byte and returns it.
747
+ * This method is not multibyte character sensitive.
748
+ * See also: #getch.
749
+ *
750
+ * s = StringScanner.new('ab')
751
+ * s.get_byte # => "a"
752
+ * s.get_byte # => "b"
753
+ * s.get_byte # => nil
754
+ *
755
+ * $KCODE = 'EUC'
756
+ * s = StringScanner.new("\244\242")
757
+ * s.get_byte # => "\244"
758
+ * s.get_byte # => "\242"
759
+ * s.get_byte # => nil
760
+ */
761
+ static VALUE
762
+ strscan_get_byte(VALUE self)
763
+ {
764
+ struct strscanner *p;
765
+
766
+ GET_SCANNER(self, p);
767
+ CLEAR_MATCH_STATUS(p);
768
+ if (EOS_P(p))
769
+ return Qnil;
770
+
771
+ p->prev = p->curr;
772
+ p->curr++;
773
+ MATCHED(p);
774
+ adjust_registers_to_matched(p);
775
+ return extract_range(p, p->prev + p->regs.beg[0],
776
+ p->prev + p->regs.end[0]);
777
+ }
778
+
779
+ /*
780
+ * Equivalent to #get_byte.
781
+ * This method is obsolete; use #get_byte instead.
782
+ */
783
+ static VALUE
784
+ strscan_getbyte(VALUE self)
785
+ {
786
+ rb_warning("StringScanner#getbyte is obsolete; use #get_byte instead");
787
+ return strscan_get_byte(self);
788
+ }
789
+
790
+ /*
791
+ * call-seq: peek(len)
792
+ *
793
+ * Extracts a string corresponding to <tt>string[pos,len]</tt>, without
794
+ * advancing the scan pointer.
795
+ *
796
+ * s = StringScanner.new('test string')
797
+ * s.peek(7) # => "test st"
798
+ * s.peek(7) # => "test st"
799
+ *
800
+ */
801
+ static VALUE
802
+ strscan_peek(VALUE self, VALUE vlen)
803
+ {
804
+ struct strscanner *p;
805
+ long len;
806
+
807
+ GET_SCANNER(self, p);
808
+
809
+ len = NUM2LONG(vlen);
810
+ if (EOS_P(p))
811
+ return infect(str_new(p, "", 0), p);
812
+
813
+ len = minl(len, S_RESTLEN(p));
814
+ return extract_beg_len(p, p->curr, len);
815
+ }
816
+
817
+ /*
818
+ * Equivalent to #peek.
819
+ * This method is obsolete; use #peek instead.
820
+ */
821
+ static VALUE
822
+ strscan_peep(VALUE self, VALUE vlen)
823
+ {
824
+ rb_warning("StringScanner#peep is obsolete; use #peek instead");
825
+ return strscan_peek(self, vlen);
826
+ }
827
+
828
+ /*
829
+ * Set the scan pointer to the previous position. Only one previous position is
830
+ * remembered, and it changes with each scanning operation.
831
+ *
832
+ * s = StringScanner.new('test string')
833
+ * s.scan(/\w+/) # => "test"
834
+ * s.unscan
835
+ * s.scan(/../) # => "te"
836
+ * s.scan(/\d/) # => nil
837
+ * s.unscan # ScanError: unscan failed: previous match record not exist
838
+ */
839
+ static VALUE
840
+ strscan_unscan(VALUE self)
841
+ {
842
+ struct strscanner *p;
843
+
844
+ GET_SCANNER(self, p);
845
+ if (! MATCHED_P(p))
846
+ rb_raise(ScanError, "unscan failed: previous match record not exist");
847
+ p->curr = p->prev;
848
+ CLEAR_MATCH_STATUS(p);
849
+ return self;
850
+ }
851
+
852
+ /*
853
+ * Returns +true+ iff the scan pointer is at the beginning of the line.
854
+ *
855
+ * s = StringScanner.new("test\ntest\n")
856
+ * s.bol? # => true
857
+ * s.scan(/te/)
858
+ * s.bol? # => false
859
+ * s.scan(/st\n/)
860
+ * s.bol? # => true
861
+ * s.terminate
862
+ * s.bol? # => true
863
+ */
864
+ static VALUE
865
+ strscan_bol_p(VALUE self)
866
+ {
867
+ struct strscanner *p;
868
+
869
+ GET_SCANNER(self, p);
870
+ if (CURPTR(p) > S_PEND(p)) return Qnil;
871
+ if (p->curr == 0) return Qtrue;
872
+ return (*(CURPTR(p) - 1) == '\n') ? Qtrue : Qfalse;
873
+ }
874
+
875
+ /*
876
+ * Returns +true+ if the scan pointer is at the end of the string.
877
+ *
878
+ * s = StringScanner.new('test string')
879
+ * p s.eos? # => false
880
+ * s.scan(/test/)
881
+ * p s.eos? # => false
882
+ * s.terminate
883
+ * p s.eos? # => true
884
+ */
885
+ static VALUE
886
+ strscan_eos_p(VALUE self)
887
+ {
888
+ struct strscanner *p;
889
+
890
+ GET_SCANNER(self, p);
891
+ return EOS_P(p) ? Qtrue : Qfalse;
892
+ }
893
+
894
+ /*
895
+ * Equivalent to #eos?.
896
+ * This method is obsolete, use #eos? instead.
897
+ */
898
+ static VALUE
899
+ strscan_empty_p(VALUE self)
900
+ {
901
+ rb_warning("StringScanner#empty? is obsolete; use #eos? instead");
902
+ return strscan_eos_p(self);
903
+ }
904
+
905
+ /*
906
+ * Returns true iff there is more data in the string. See #eos?.
907
+ * This method is obsolete; use #eos? instead.
908
+ *
909
+ * s = StringScanner.new('test string')
910
+ * s.eos? # These two
911
+ * s.rest? # are opposites.
912
+ */
913
+ static VALUE
914
+ strscan_rest_p(VALUE self)
915
+ {
916
+ struct strscanner *p;
917
+
918
+ GET_SCANNER(self, p);
919
+ return EOS_P(p) ? Qfalse : Qtrue;
920
+ }
921
+
922
+ /*
923
+ * Returns +true+ iff the last match was successful.
924
+ *
925
+ * s = StringScanner.new('test string')
926
+ * s.match?(/\w+/) # => 4
927
+ * s.matched? # => true
928
+ * s.match?(/\d+/) # => nil
929
+ * s.matched? # => false
930
+ */
931
+ static VALUE
932
+ strscan_matched_p(VALUE self)
933
+ {
934
+ struct strscanner *p;
935
+
936
+ GET_SCANNER(self, p);
937
+ return MATCHED_P(p) ? Qtrue : Qfalse;
938
+ }
939
+
940
+ /*
941
+ * Returns the last matched string.
942
+ *
943
+ * s = StringScanner.new('test string')
944
+ * s.match?(/\w+/) # -> 4
945
+ * s.matched # -> "test"
946
+ */
947
+ static VALUE
948
+ strscan_matched(VALUE self)
949
+ {
950
+ struct strscanner *p;
951
+
952
+ GET_SCANNER(self, p);
953
+ if (! MATCHED_P(p)) return Qnil;
954
+ return extract_range(p, p->prev + p->regs.beg[0],
955
+ p->prev + p->regs.end[0]);
956
+ }
957
+
958
+ /*
959
+ * Returns the size of the most recent match (see #matched), or +nil+ if there
960
+ * was no recent match.
961
+ *
962
+ * s = StringScanner.new('test string')
963
+ * s.check /\w+/ # -> "test"
964
+ * s.matched_size # -> 4
965
+ * s.check /\d+/ # -> nil
966
+ * s.matched_size # -> nil
967
+ */
968
+ static VALUE
969
+ strscan_matched_size(VALUE self)
970
+ {
971
+ struct strscanner *p;
972
+
973
+ GET_SCANNER(self, p);
974
+ if (! MATCHED_P(p)) return Qnil;
975
+ return INT2NUM(p->regs.end[0] - p->regs.beg[0]);
976
+ }
977
+
978
+ static int
979
+ name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end, rb_encoding *enc)
980
+ {
981
+ int num;
982
+
983
+ num = onig_name_to_backref_number(RREGEXP_PTR(regexp),
984
+ (const unsigned char* )name, (const unsigned char* )name_end, regs);
985
+ if (num >= 1) {
986
+ return num;
987
+ }
988
+ else {
989
+ rb_enc_raise(enc, rb_eIndexError, "undefined group name reference: %.*s",
990
+ rb_long2int(name_end - name), name);
991
+ }
992
+
993
+ UNREACHABLE;
994
+ }
995
+
996
+ /*
997
+ * call-seq: [](n)
998
+ *
999
+ * Returns the n-th subgroup in the most recent match.
1000
+ *
1001
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
1002
+ * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 "
1003
+ * s[0] # -> "Fri Dec 12 "
1004
+ * s[1] # -> "Fri"
1005
+ * s[2] # -> "Dec"
1006
+ * s[3] # -> "12"
1007
+ * s.post_match # -> "1975 14:39"
1008
+ * s.pre_match # -> ""
1009
+ *
1010
+ * s.reset
1011
+ * s.scan(/(?<wday>\w+) (?<month>\w+) (?<day>\d+) /) # -> "Fri Dec 12 "
1012
+ * s[0] # -> "Fri Dec 12 "
1013
+ * s[1] # -> "Fri"
1014
+ * s[2] # -> "Dec"
1015
+ * s[3] # -> "12"
1016
+ * s[:wday] # -> "Fri"
1017
+ * s[:month] # -> "Dec"
1018
+ * s[:day] # -> "12"
1019
+ * s.post_match # -> "1975 14:39"
1020
+ * s.pre_match # -> ""
1021
+ */
1022
+ static VALUE
1023
+ strscan_aref(VALUE self, VALUE idx)
1024
+ {
1025
+ const char *name;
1026
+ struct strscanner *p;
1027
+ long i;
1028
+
1029
+ GET_SCANNER(self, p);
1030
+ if (! MATCHED_P(p)) return Qnil;
1031
+
1032
+ switch (TYPE(idx)) {
1033
+ case T_SYMBOL:
1034
+ idx = rb_sym2str(idx);
1035
+ /* fall through */
1036
+ case T_STRING:
1037
+ if (!p->regex) return Qnil;
1038
+ RSTRING_GETMEM(idx, name, i);
1039
+ i = name_to_backref_number(&(p->regs), p->regex, name, name + i, rb_enc_get(idx));
1040
+ break;
1041
+ default:
1042
+ i = NUM2LONG(idx);
1043
+ }
1044
+
1045
+ if (i < 0)
1046
+ i += p->regs.num_regs;
1047
+ if (i < 0) return Qnil;
1048
+ if (i >= p->regs.num_regs) return Qnil;
1049
+ if (p->regs.beg[i] == -1) return Qnil;
1050
+
1051
+ return extract_range(p, p->prev + p->regs.beg[i],
1052
+ p->prev + p->regs.end[i]);
1053
+ }
1054
+
1055
+ /*
1056
+ * call-seq: size
1057
+ *
1058
+ * Returns the amount of subgroups in the most recent match.
1059
+ * The full match counts as a subgroup.
1060
+ *
1061
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
1062
+ * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 "
1063
+ * s.size # -> 4
1064
+ */
1065
+ static VALUE
1066
+ strscan_size(VALUE self)
1067
+ {
1068
+ struct strscanner *p;
1069
+
1070
+ GET_SCANNER(self, p);
1071
+ if (! MATCHED_P(p)) return Qnil;
1072
+ return INT2FIX(p->regs.num_regs);
1073
+ }
1074
+
1075
+ /*
1076
+ * call-seq: captures
1077
+ *
1078
+ * Returns the subgroups in the most recent match (not including the full match).
1079
+ * If nothing was priorly matched, it returns nil.
1080
+ *
1081
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
1082
+ * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 "
1083
+ * s.captures # -> ["Fri", "Dec", "12"]
1084
+ * s.scan(/(\w+) (\w+) (\d+) /) # -> nil
1085
+ * s.captures # -> nil
1086
+ */
1087
+ static VALUE
1088
+ strscan_captures(VALUE self)
1089
+ {
1090
+ struct strscanner *p;
1091
+ int i, num_regs;
1092
+ VALUE new_ary;
1093
+
1094
+ GET_SCANNER(self, p);
1095
+ if (! MATCHED_P(p)) return Qnil;
1096
+
1097
+ num_regs = p->regs.num_regs;
1098
+ new_ary = rb_ary_new2(num_regs);
1099
+
1100
+ for (i = 1; i < num_regs; i++) {
1101
+ VALUE str = extract_range(p, p->prev + p->regs.beg[i],
1102
+ p->prev + p->regs.end[i]);
1103
+ rb_ary_push(new_ary, str);
1104
+ }
1105
+
1106
+ return new_ary;
1107
+ }
1108
+
1109
+ /*
1110
+ * call-seq:
1111
+ * scanner.values_at( i1, i2, ... iN ) -> an_array
1112
+ *
1113
+ * Returns the subgroups in the most recent match at the given indices.
1114
+ * If nothing was priorly matched, it returns nil.
1115
+ *
1116
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
1117
+ * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 "
1118
+ * s.values_at 0, -1, 5, 2 # -> ["Fri Dec 12 ", "12", nil, "Dec"]
1119
+ * s.scan(/(\w+) (\w+) (\d+) /) # -> nil
1120
+ * s.values_at 0, -1, 5, 2 # -> nil
1121
+ */
1122
+
1123
+ static VALUE
1124
+ strscan_values_at(int argc, VALUE *argv, VALUE self)
1125
+ {
1126
+ struct strscanner *p;
1127
+ long i;
1128
+ VALUE new_ary;
1129
+
1130
+ GET_SCANNER(self, p);
1131
+ if (! MATCHED_P(p)) return Qnil;
1132
+
1133
+ new_ary = rb_ary_new2(argc);
1134
+ for (i = 0; i<argc; i++) {
1135
+ rb_ary_push(new_ary, strscan_aref(self, argv[i]));
1136
+ }
1137
+
1138
+ return new_ary;
1139
+ }
1140
+
1141
+ /*
1142
+ * Returns the <i><b>pre</b>-match</i> (in the regular expression sense) of the last scan.
1143
+ *
1144
+ * s = StringScanner.new('test string')
1145
+ * s.scan(/\w+/) # -> "test"
1146
+ * s.scan(/\s+/) # -> " "
1147
+ * s.pre_match # -> "test"
1148
+ * s.post_match # -> "string"
1149
+ */
1150
+ static VALUE
1151
+ strscan_pre_match(VALUE self)
1152
+ {
1153
+ struct strscanner *p;
1154
+
1155
+ GET_SCANNER(self, p);
1156
+ if (! MATCHED_P(p)) return Qnil;
1157
+ return extract_range(p, 0, p->prev + p->regs.beg[0]);
1158
+ }
1159
+
1160
+ /*
1161
+ * Returns the <i><b>post</b>-match</i> (in the regular expression sense) of the last scan.
1162
+ *
1163
+ * s = StringScanner.new('test string')
1164
+ * s.scan(/\w+/) # -> "test"
1165
+ * s.scan(/\s+/) # -> " "
1166
+ * s.pre_match # -> "test"
1167
+ * s.post_match # -> "string"
1168
+ */
1169
+ static VALUE
1170
+ strscan_post_match(VALUE self)
1171
+ {
1172
+ struct strscanner *p;
1173
+
1174
+ GET_SCANNER(self, p);
1175
+ if (! MATCHED_P(p)) return Qnil;
1176
+ return extract_range(p, p->prev + p->regs.end[0], S_LEN(p));
1177
+ }
1178
+
1179
+ /*
1180
+ * Returns the "rest" of the string (i.e. everything after the scan pointer).
1181
+ * If there is no more data (eos? = true), it returns <tt>""</tt>.
1182
+ */
1183
+ static VALUE
1184
+ strscan_rest(VALUE self)
1185
+ {
1186
+ struct strscanner *p;
1187
+
1188
+ GET_SCANNER(self, p);
1189
+ if (EOS_P(p)) {
1190
+ return infect(str_new(p, "", 0), p);
1191
+ }
1192
+ return extract_range(p, p->curr, S_LEN(p));
1193
+ }
1194
+
1195
+ /*
1196
+ * <tt>s.rest_size</tt> is equivalent to <tt>s.rest.size</tt>.
1197
+ */
1198
+ static VALUE
1199
+ strscan_rest_size(VALUE self)
1200
+ {
1201
+ struct strscanner *p;
1202
+ long i;
1203
+
1204
+ GET_SCANNER(self, p);
1205
+ if (EOS_P(p)) {
1206
+ return INT2FIX(0);
1207
+ }
1208
+ i = S_RESTLEN(p);
1209
+ return INT2FIX(i);
1210
+ }
1211
+
1212
+ /*
1213
+ * <tt>s.restsize</tt> is equivalent to <tt>s.rest_size</tt>.
1214
+ * This method is obsolete; use #rest_size instead.
1215
+ */
1216
+ static VALUE
1217
+ strscan_restsize(VALUE self)
1218
+ {
1219
+ rb_warning("StringScanner#restsize is obsolete; use #rest_size instead");
1220
+ return strscan_rest_size(self);
1221
+ }
1222
+
1223
+ #define INSPECT_LENGTH 5
1224
+
1225
+ /*
1226
+ * Returns a string that represents the StringScanner object, showing:
1227
+ * - the current position
1228
+ * - the size of the string
1229
+ * - the characters surrounding the scan pointer
1230
+ *
1231
+ * s = StringScanner.new("Fri Dec 12 1975 14:39")
1232
+ * s.inspect # -> '#<StringScanner 0/21 @ "Fri D...">'
1233
+ * s.scan_until /12/ # -> "Fri Dec 12"
1234
+ * s.inspect # -> '#<StringScanner 10/21 "...ec 12" @ " 1975...">'
1235
+ */
1236
+ static VALUE
1237
+ strscan_inspect(VALUE self)
1238
+ {
1239
+ struct strscanner *p;
1240
+ VALUE a, b;
1241
+
1242
+ p = check_strscan(self);
1243
+ if (NIL_P(p->str)) {
1244
+ a = rb_sprintf("#<%"PRIsVALUE" (uninitialized)>", rb_obj_class(self));
1245
+ return infect(a, p);
1246
+ }
1247
+ if (EOS_P(p)) {
1248
+ a = rb_sprintf("#<%"PRIsVALUE" fin>", rb_obj_class(self));
1249
+ return infect(a, p);
1250
+ }
1251
+ if (p->curr == 0) {
1252
+ b = inspect2(p);
1253
+ a = rb_sprintf("#<%"PRIsVALUE" %ld/%ld @ %"PRIsVALUE">",
1254
+ rb_obj_class(self),
1255
+ p->curr, S_LEN(p),
1256
+ b);
1257
+ return infect(a, p);
1258
+ }
1259
+ a = inspect1(p);
1260
+ b = inspect2(p);
1261
+ a = rb_sprintf("#<%"PRIsVALUE" %ld/%ld %"PRIsVALUE" @ %"PRIsVALUE">",
1262
+ rb_obj_class(self),
1263
+ p->curr, S_LEN(p),
1264
+ a, b);
1265
+ return infect(a, p);
1266
+ }
1267
+
1268
+ static VALUE
1269
+ inspect1(struct strscanner *p)
1270
+ {
1271
+ VALUE str;
1272
+ long len;
1273
+
1274
+ if (p->curr == 0) return rb_str_new2("");
1275
+ if (p->curr > INSPECT_LENGTH) {
1276
+ str = rb_str_new_cstr("...");
1277
+ len = INSPECT_LENGTH;
1278
+ }
1279
+ else {
1280
+ str = rb_str_new(0, 0);
1281
+ len = p->curr;
1282
+ }
1283
+ rb_str_cat(str, CURPTR(p) - len, len);
1284
+ return rb_str_dump(str);
1285
+ }
1286
+
1287
+ static VALUE
1288
+ inspect2(struct strscanner *p)
1289
+ {
1290
+ VALUE str;
1291
+ long len;
1292
+
1293
+ if (EOS_P(p)) return rb_str_new2("");
1294
+ len = S_RESTLEN(p);
1295
+ if (len > INSPECT_LENGTH) {
1296
+ str = rb_str_new(CURPTR(p), INSPECT_LENGTH);
1297
+ rb_str_cat2(str, "...");
1298
+ }
1299
+ else {
1300
+ str = rb_str_new(CURPTR(p), len);
1301
+ }
1302
+ return rb_str_dump(str);
1303
+ }
1304
+
1305
+ /* =======================================================================
1306
+ Ruby Interface
1307
+ ======================================================================= */
1308
+
1309
+ /*
1310
+ * Document-class: StringScanner
1311
+ *
1312
+ * StringScanner provides for lexical scanning operations on a String. Here is
1313
+ * an example of its usage:
1314
+ *
1315
+ * s = StringScanner.new('This is an example string')
1316
+ * s.eos? # -> false
1317
+ *
1318
+ * p s.scan(/\w+/) # -> "This"
1319
+ * p s.scan(/\w+/) # -> nil
1320
+ * p s.scan(/\s+/) # -> " "
1321
+ * p s.scan(/\s+/) # -> nil
1322
+ * p s.scan(/\w+/) # -> "is"
1323
+ * s.eos? # -> false
1324
+ *
1325
+ * p s.scan(/\s+/) # -> " "
1326
+ * p s.scan(/\w+/) # -> "an"
1327
+ * p s.scan(/\s+/) # -> " "
1328
+ * p s.scan(/\w+/) # -> "example"
1329
+ * p s.scan(/\s+/) # -> " "
1330
+ * p s.scan(/\w+/) # -> "string"
1331
+ * s.eos? # -> true
1332
+ *
1333
+ * p s.scan(/\s+/) # -> nil
1334
+ * p s.scan(/\w+/) # -> nil
1335
+ *
1336
+ * Scanning a string means remembering the position of a <i>scan pointer</i>,
1337
+ * which is just an index. The point of scanning is to move forward a bit at
1338
+ * a time, so matches are sought after the scan pointer; usually immediately
1339
+ * after it.
1340
+ *
1341
+ * Given the string "test string", here are the pertinent scan pointer
1342
+ * positions:
1343
+ *
1344
+ *   t e s t   s t r i n g
1345
+ *   0 1 2 ...           1
1346
+ *                       0
1347
+ *
1348
+ * When you #scan for a pattern (a regular expression), the match must occur
1349
+ * at the character after the scan pointer. If you use #scan_until, then the
1350
+ * match can occur anywhere after the scan pointer. In both cases, the scan
1351
+ * pointer moves <i>just beyond</i> the last character of the match, ready to
1352
+ * scan again from the next character onwards. This is demonstrated by the
1353
+ * example above.
1354
+ *
1355
+ * == Method Categories
1356
+ *
1357
+ * There are other methods besides the plain scanners. You can look ahead in
1358
+ * the string without actually scanning. You can access the most recent match.
1359
+ * You can modify the string being scanned, reset or terminate the scanner,
1360
+ * find out or change the position of the scan pointer, skip ahead, and so on.
1361
+ *
1362
+ * === Advancing the Scan Pointer
1363
+ *
1364
+ * - #getch
1365
+ * - #get_byte
1366
+ * - #scan
1367
+ * - #scan_until
1368
+ * - #skip
1369
+ * - #skip_until
1370
+ *
1371
+ * === Looking Ahead
1372
+ *
1373
+ * - #check
1374
+ * - #check_until
1375
+ * - #exist?
1376
+ * - #match?
1377
+ * - #peek
1378
+ *
1379
+ * === Finding Where we Are
1380
+ *
1381
+ * - #beginning_of_line? (#bol?)
1382
+ * - #eos?
1383
+ * - #rest?
1384
+ * - #rest_size
1385
+ * - #pos
1386
+ *
1387
+ * === Setting Where we Are
1388
+ *
1389
+ * - #reset
1390
+ * - #terminate
1391
+ * - #pos=
1392
+ *
1393
+ * === Match Data
1394
+ *
1395
+ * - #matched
1396
+ * - #matched?
1397
+ * - #matched_size
1398
+ * - []
1399
+ * - #pre_match
1400
+ * - #post_match
1401
+ *
1402
+ * === Miscellaneous
1403
+ *
1404
+ * - <<
1405
+ * - #concat
1406
+ * - #string
1407
+ * - #string=
1408
+ * - #unscan
1409
+ *
1410
+ * There are aliases to several of the methods.
1411
+ */
1412
+ void
1413
+ Init_strscan(void)
1414
+ {
1415
+ ID id_scanerr = rb_intern("ScanError");
1416
+ VALUE tmp;
1417
+
1418
+ id_byteslice = rb_intern("byteslice");
1419
+
1420
+ StringScanner = rb_define_class("StringScanner", rb_cObject);
1421
+ ScanError = rb_define_class_under(StringScanner, "Error", rb_eStandardError);
1422
+ if (!rb_const_defined(rb_cObject, id_scanerr)) {
1423
+ rb_const_set(rb_cObject, id_scanerr, ScanError);
1424
+ }
1425
+ tmp = rb_str_new2(STRSCAN_VERSION);
1426
+ rb_obj_freeze(tmp);
1427
+ rb_const_set(StringScanner, rb_intern("Version"), tmp);
1428
+ tmp = rb_str_new2("$Id$");
1429
+ rb_obj_freeze(tmp);
1430
+ rb_const_set(StringScanner, rb_intern("Id"), tmp);
1431
+
1432
+ rb_define_alloc_func(StringScanner, strscan_s_allocate);
1433
+ rb_define_private_method(StringScanner, "initialize", strscan_initialize, -1);
1434
+ rb_define_private_method(StringScanner, "initialize_copy", strscan_init_copy, 1);
1435
+ rb_define_singleton_method(StringScanner, "must_C_version", strscan_s_mustc, 0);
1436
+ rb_define_method(StringScanner, "reset", strscan_reset, 0);
1437
+ rb_define_method(StringScanner, "terminate", strscan_terminate, 0);
1438
+ rb_define_method(StringScanner, "clear", strscan_clear, 0);
1439
+ rb_define_method(StringScanner, "string", strscan_get_string, 0);
1440
+ rb_define_method(StringScanner, "string=", strscan_set_string, 1);
1441
+ rb_define_method(StringScanner, "concat", strscan_concat, 1);
1442
+ rb_define_method(StringScanner, "<<", strscan_concat, 1);
1443
+ rb_define_method(StringScanner, "pos", strscan_get_pos, 0);
1444
+ rb_define_method(StringScanner, "pos=", strscan_set_pos, 1);
1445
+ rb_define_method(StringScanner, "charpos", strscan_get_charpos, 0);
1446
+ rb_define_method(StringScanner, "pointer", strscan_get_pos, 0);
1447
+ rb_define_method(StringScanner, "pointer=", strscan_set_pos, 1);
1448
+
1449
+ rb_define_method(StringScanner, "scan", strscan_scan, 1);
1450
+ rb_define_method(StringScanner, "skip", strscan_skip, 1);
1451
+ rb_define_method(StringScanner, "match?", strscan_match_p, 1);
1452
+ rb_define_method(StringScanner, "check", strscan_check, 1);
1453
+ rb_define_method(StringScanner, "scan_full", strscan_scan_full, 3);
1454
+
1455
+ rb_define_method(StringScanner, "scan_until", strscan_scan_until, 1);
1456
+ rb_define_method(StringScanner, "skip_until", strscan_skip_until, 1);
1457
+ rb_define_method(StringScanner, "exist?", strscan_exist_p, 1);
1458
+ rb_define_method(StringScanner, "check_until", strscan_check_until, 1);
1459
+ rb_define_method(StringScanner, "search_full", strscan_search_full, 3);
1460
+
1461
+ rb_define_method(StringScanner, "getch", strscan_getch, 0);
1462
+ rb_define_method(StringScanner, "get_byte", strscan_get_byte, 0);
1463
+ rb_define_method(StringScanner, "getbyte", strscan_getbyte, 0);
1464
+ rb_define_method(StringScanner, "peek", strscan_peek, 1);
1465
+ rb_define_method(StringScanner, "peep", strscan_peep, 1);
1466
+
1467
+ rb_define_method(StringScanner, "unscan", strscan_unscan, 0);
1468
+
1469
+ rb_define_method(StringScanner, "beginning_of_line?", strscan_bol_p, 0);
1470
+ rb_alias(StringScanner, rb_intern("bol?"), rb_intern("beginning_of_line?"));
1471
+ rb_define_method(StringScanner, "eos?", strscan_eos_p, 0);
1472
+ rb_define_method(StringScanner, "empty?", strscan_empty_p, 0);
1473
+ rb_define_method(StringScanner, "rest?", strscan_rest_p, 0);
1474
+
1475
+ rb_define_method(StringScanner, "matched?", strscan_matched_p, 0);
1476
+ rb_define_method(StringScanner, "matched", strscan_matched, 0);
1477
+ rb_define_method(StringScanner, "matched_size", strscan_matched_size, 0);
1478
+ rb_define_method(StringScanner, "[]", strscan_aref, 1);
1479
+ rb_define_method(StringScanner, "pre_match", strscan_pre_match, 0);
1480
+ rb_define_method(StringScanner, "post_match", strscan_post_match, 0);
1481
+ rb_define_method(StringScanner, "size", strscan_size, 0);
1482
+ rb_define_method(StringScanner, "captures", strscan_captures, 0);
1483
+ rb_define_method(StringScanner, "values_at", strscan_values_at, -1);
1484
+
1485
+ rb_define_method(StringScanner, "rest", strscan_rest, 0);
1486
+ rb_define_method(StringScanner, "rest_size", strscan_rest_size, 0);
1487
+ rb_define_method(StringScanner, "restsize", strscan_restsize, 0);
1488
+
1489
+ rb_define_method(StringScanner, "inspect", strscan_inspect, 0);
1490
+ }