regexp_parser 0.1.6 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ChangeLog +57 -0
- data/Gemfile +8 -0
- data/LICENSE +1 -1
- data/README.md +225 -206
- data/Rakefile +9 -3
- data/lib/regexp_parser.rb +7 -11
- data/lib/regexp_parser/expression.rb +72 -14
- data/lib/regexp_parser/expression/classes/alternation.rb +3 -16
- data/lib/regexp_parser/expression/classes/conditional.rb +57 -0
- data/lib/regexp_parser/expression/classes/free_space.rb +17 -0
- data/lib/regexp_parser/expression/classes/keep.rb +7 -0
- data/lib/regexp_parser/expression/classes/set.rb +28 -7
- data/lib/regexp_parser/expression/methods/strfregexp.rb +113 -0
- data/lib/regexp_parser/expression/methods/tests.rb +116 -0
- data/lib/regexp_parser/expression/methods/traverse.rb +63 -0
- data/lib/regexp_parser/expression/quantifier.rb +10 -0
- data/lib/regexp_parser/expression/sequence.rb +45 -0
- data/lib/regexp_parser/expression/subexpression.rb +29 -1
- data/lib/regexp_parser/lexer.rb +31 -8
- data/lib/regexp_parser/parser.rb +118 -45
- data/lib/regexp_parser/scanner.rb +1745 -1404
- data/lib/regexp_parser/scanner/property.rl +57 -3
- data/lib/regexp_parser/scanner/scanner.rl +161 -34
- data/lib/regexp_parser/syntax.rb +12 -2
- data/lib/regexp_parser/syntax/ruby/1.9.1.rb +3 -3
- data/lib/regexp_parser/syntax/ruby/1.9.3.rb +2 -7
- data/lib/regexp_parser/syntax/ruby/2.0.0.rb +4 -1
- data/lib/regexp_parser/syntax/ruby/2.1.4.rb +13 -0
- data/lib/regexp_parser/syntax/ruby/2.1.5.rb +13 -0
- data/lib/regexp_parser/syntax/ruby/2.1.rb +2 -2
- data/lib/regexp_parser/syntax/ruby/2.2.0.rb +16 -0
- data/lib/regexp_parser/syntax/ruby/2.2.rb +8 -0
- data/lib/regexp_parser/syntax/tokens.rb +19 -2
- data/lib/regexp_parser/syntax/tokens/conditional.rb +22 -0
- data/lib/regexp_parser/syntax/tokens/keep.rb +14 -0
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +45 -4
- data/lib/regexp_parser/token.rb +23 -8
- data/lib/regexp_parser/version.rb +5 -0
- data/regexp_parser.gemspec +35 -0
- data/test/expression/test_all.rb +6 -1
- data/test/expression/test_base.rb +19 -0
- data/test/expression/test_conditionals.rb +114 -0
- data/test/expression/test_free_space.rb +33 -0
- data/test/expression/test_set.rb +61 -0
- data/test/expression/test_strfregexp.rb +214 -0
- data/test/expression/test_subexpression.rb +24 -0
- data/test/expression/test_tests.rb +99 -0
- data/test/expression/test_to_h.rb +48 -0
- data/test/expression/test_to_s.rb +46 -0
- data/test/expression/test_traverse.rb +164 -0
- data/test/lexer/test_all.rb +16 -3
- data/test/lexer/test_conditionals.rb +101 -0
- data/test/lexer/test_keep.rb +24 -0
- data/test/lexer/test_literals.rb +51 -51
- data/test/lexer/test_nesting.rb +62 -62
- data/test/lexer/test_refcalls.rb +18 -20
- data/test/parser/test_all.rb +18 -3
- data/test/parser/test_alternation.rb +11 -14
- data/test/parser/test_conditionals.rb +148 -0
- data/test/parser/test_escapes.rb +29 -5
- data/test/parser/test_free_space.rb +139 -0
- data/test/parser/test_groups.rb +40 -0
- data/test/parser/test_keep.rb +21 -0
- data/test/scanner/test_all.rb +8 -2
- data/test/scanner/test_conditionals.rb +166 -0
- data/test/scanner/test_escapes.rb +8 -5
- data/test/scanner/test_free_space.rb +133 -0
- data/test/scanner/test_groups.rb +28 -0
- data/test/scanner/test_keep.rb +33 -0
- data/test/scanner/test_properties.rb +4 -0
- data/test/scanner/test_scripts.rb +71 -1
- data/test/syntax/ruby/test_1.9.3.rb +2 -2
- data/test/syntax/ruby/test_2.0.0.rb +38 -0
- data/test/syntax/ruby/test_2.2.0.rb +38 -0
- data/test/syntax/ruby/test_all.rb +1 -8
- data/test/syntax/ruby/test_files.rb +104 -0
- data/test/test_all.rb +2 -1
- data/test/token/test_all.rb +2 -0
- data/test/token/test_token.rb +109 -0
- metadata +75 -21
- data/VERSION.yml +0 -5
- data/lib/regexp_parser/ctype.rb +0 -48
- data/test/syntax/ruby/test_2.x.rb +0 -46
@@ -11,7 +11,6 @@
|
|
11
11
|
'cntrl'i | 'digit'i | 'graph'i | 'lower'i | 'print'i |
|
12
12
|
'punct'i | 'space'i | 'upper'i | 'word'i | 'xdigit'i;
|
13
13
|
|
14
|
-
# TODO: are these case-insensitive?
|
15
14
|
property_name_posix = 'any'i | 'assigned'i | 'newline'i;
|
16
15
|
|
17
16
|
property_name = property_name_unicode | property_name_posix;
|
@@ -39,7 +38,9 @@
|
|
39
38
|
property_age = 'age=1.1'i | 'age=2.0'i | 'age=2.1'i |
|
40
39
|
'age=3.0'i | 'age=3.1'i | 'age=3.2'i |
|
41
40
|
'age=4.0'i | 'age=4.1'i | 'age=5.0'i |
|
42
|
-
'age=5.1'i | 'age=5.2'i | 'age=6.0'i
|
41
|
+
'age=5.1'i | 'age=5.2'i | 'age=6.0'i |
|
42
|
+
'age=6.1'i | 'age=6.2'i | 'age=6.3'i |
|
43
|
+
'age=7.0'i;
|
43
44
|
|
44
45
|
property_script = (alpha | space | '_')+; # everything else
|
45
46
|
|
@@ -222,6 +223,14 @@
|
|
222
223
|
self.emit(type, :age_5_2, text, ts-1, te)
|
223
224
|
when 'age=6.0'
|
224
225
|
self.emit(type, :age_6_0, text, ts-1, te)
|
226
|
+
when 'age=6.1'
|
227
|
+
self.emit(type, :age_6_1, text, ts-1, te)
|
228
|
+
when 'age=6.2'
|
229
|
+
self.emit(type, :age_6_2, text, ts-1, te)
|
230
|
+
when 'age=6.3'
|
231
|
+
self.emit(type, :age_6_3, text, ts-1, te)
|
232
|
+
when 'age=7.0'
|
233
|
+
self.emit(type, :age_7_0, text, ts-1, te)
|
225
234
|
|
226
235
|
# Derived Properties
|
227
236
|
when 'ahex', 'asciihexdigit'
|
@@ -327,8 +336,9 @@
|
|
327
336
|
when 'xidc', 'xidcontinue'
|
328
337
|
self.emit(type, :xid_continue, text, ts-1, te)
|
329
338
|
|
330
|
-
|
331
339
|
# Scripts
|
340
|
+
when 'aghb', 'caucasianalbanian'
|
341
|
+
self.emit(type, :script_caucasian_albanian, text, ts-1, te)
|
332
342
|
when 'arab', 'arabic'
|
333
343
|
self.emit(type, :script_arabic, text, ts-1, te)
|
334
344
|
when 'armi', 'imperialaramaic'
|
@@ -341,6 +351,8 @@
|
|
341
351
|
self.emit(type, :script_balinese, text, ts-1, te)
|
342
352
|
when 'bamu', 'bamum'
|
343
353
|
self.emit(type, :script_bamum, text, ts-1, te)
|
354
|
+
when 'bass', 'bassavah'
|
355
|
+
self.emit(type, :script_bassa_vah, text, ts-1, te)
|
344
356
|
when 'batk', 'batak'
|
345
357
|
self.emit(type, :script_batak, text, ts-1, te)
|
346
358
|
when 'beng', 'bengali'
|
@@ -373,8 +385,12 @@
|
|
373
385
|
self.emit(type, :script_devanagari, text, ts-1, te)
|
374
386
|
when 'dsrt', 'deseret'
|
375
387
|
self.emit(type, :script_deseret, text, ts-1, te)
|
388
|
+
when 'dupl', 'duployan'
|
389
|
+
self.emit(type, :script_duployan, text, ts-1, te)
|
376
390
|
when 'egyp', 'egyptianhieroglyphs'
|
377
391
|
self.emit(type, :script_egyptian_hieroglyphs, text, ts-1, te)
|
392
|
+
when 'elba', 'elbasan'
|
393
|
+
self.emit(type, :script_elbasan, text, ts-1, te)
|
378
394
|
when 'ethi', 'ethiopic'
|
379
395
|
self.emit(type, :script_ethiopic, text, ts-1, te)
|
380
396
|
when 'geor', 'georgian'
|
@@ -383,6 +399,8 @@
|
|
383
399
|
self.emit(type, :script_glagolitic, text, ts-1, te)
|
384
400
|
when 'goth', 'gothic'
|
385
401
|
self.emit(type, :script_gothic, text, ts-1, te)
|
402
|
+
when 'gran', 'grantha'
|
403
|
+
self.emit(type, :script_grantha, text, ts-1, te)
|
386
404
|
when 'grek', 'greek'
|
387
405
|
self.emit(type, :script_greek, text, ts-1, te)
|
388
406
|
when 'gujr', 'gujarati'
|
@@ -399,6 +417,8 @@
|
|
399
417
|
self.emit(type, :script_hebrew, text, ts-1, te)
|
400
418
|
when 'hira', 'hiragana'
|
401
419
|
self.emit(type, :script_hiragana, text, ts-1, te)
|
420
|
+
when 'hmng', 'pahawhhmong'
|
421
|
+
self.emit(type, :script_pahawh_hmong, text, ts-1, te)
|
402
422
|
when 'hrkt', 'katakanaorhiragana'
|
403
423
|
self.emit(type, :script_katakana_or_hiragana, text, ts-1, te)
|
404
424
|
when 'ital', 'olditalic'
|
@@ -413,6 +433,8 @@
|
|
413
433
|
self.emit(type, :script_kharoshthi, text, ts-1, te)
|
414
434
|
when 'khmr', 'khmer'
|
415
435
|
self.emit(type, :script_khmer, text, ts-1, te)
|
436
|
+
when 'khoj', 'khojki'
|
437
|
+
self.emit(type, :script_khojki, text, ts-1, te)
|
416
438
|
when 'knda', 'kannada'
|
417
439
|
self.emit(type, :script_kannada, text, ts-1, te)
|
418
440
|
when 'kthi', 'kaithi'
|
@@ -427,6 +449,8 @@
|
|
427
449
|
self.emit(type, :script_lepcha, text, ts-1, te)
|
428
450
|
when 'limb', 'limbu'
|
429
451
|
self.emit(type, :script_limbu, text, ts-1, te)
|
452
|
+
when 'lina', 'lineara'
|
453
|
+
self.emit(type, :script_linear_a, text, ts-1, te)
|
430
454
|
when 'linb', 'linearb'
|
431
455
|
self.emit(type, :script_linear_b, text, ts-1, te)
|
432
456
|
when 'lisu'
|
@@ -437,14 +461,28 @@
|
|
437
461
|
self.emit(type, :script_lydian, text, ts-1, te)
|
438
462
|
when 'mlym', 'malayalam'
|
439
463
|
self.emit(type, :script_malayalam, text, ts-1, te)
|
464
|
+
when 'mahj', 'mahajani'
|
465
|
+
self.emit(type, :script_mahajani, text, ts-1, te)
|
440
466
|
when 'mand', 'mandaic'
|
441
467
|
self.emit(type, :script_mandaic, text, ts-1, te)
|
468
|
+
when 'mani', 'manichaean'
|
469
|
+
self.emit(type, :script_manichaean, text, ts-1, te)
|
470
|
+
when 'mend', 'mendekikakui'
|
471
|
+
self.emit(type, :script_mende_kikakui, text, ts-1, te)
|
472
|
+
when 'modi'
|
473
|
+
self.emit(type, :script_modi, text, ts-1, te)
|
442
474
|
when 'mong', 'mongolian'
|
443
475
|
self.emit(type, :script_mongolian, text, ts-1, te)
|
476
|
+
when 'mroo', 'mro'
|
477
|
+
self.emit(type, :script_mro, text, ts-1, te)
|
444
478
|
when 'mtei', 'meeteimayek'
|
445
479
|
self.emit(type, :script_meetei_mayek, text, ts-1, te)
|
446
480
|
when 'mymr', 'myanmar'
|
447
481
|
self.emit(type, :script_myanmar, text, ts-1, te)
|
482
|
+
when 'narb', 'oldnortharabian'
|
483
|
+
self.emit(type, :script_old_north_arabian, text, ts-1, te)
|
484
|
+
when 'nbat', 'nabataean'
|
485
|
+
self.emit(type, :script_nabataean, text, ts-1, te)
|
448
486
|
when 'nkoo', 'nko'
|
449
487
|
self.emit(type, :script_nko, text, ts-1, te)
|
450
488
|
when 'ogam', 'ogham'
|
@@ -457,10 +495,18 @@
|
|
457
495
|
self.emit(type, :script_oriya, text, ts-1, te)
|
458
496
|
when 'osma', 'osmanya'
|
459
497
|
self.emit(type, :script_osmanya, text, ts-1, te)
|
498
|
+
when 'palm', 'palmyrene'
|
499
|
+
self.emit(type, :script_palmyrene, text, ts-1, te)
|
500
|
+
when 'pauc', 'paucinhau'
|
501
|
+
self.emit(type, :script_pau_cin_hau, text, ts-1, te)
|
502
|
+
when 'perm', 'oldpermic'
|
503
|
+
self.emit(type, :script_old_permic, text, ts-1, te)
|
460
504
|
when 'phag', 'phagspa'
|
461
505
|
self.emit(type, :script_phags_pa, text, ts-1, te)
|
462
506
|
when 'phli', 'inscriptionalpahlavi'
|
463
507
|
self.emit(type, :script_inscriptional_pahlavi, text, ts-1, te)
|
508
|
+
when 'phlp', 'psalterpahlavi'
|
509
|
+
self.emit(type, :script_psalter_pahlavi, text, ts-1, te)
|
464
510
|
when 'phnx', 'phoenician'
|
465
511
|
self.emit(type, :script_phoenician, text, ts-1, te)
|
466
512
|
when 'prti', 'inscriptionalparthian'
|
@@ -477,6 +523,10 @@
|
|
477
523
|
self.emit(type, :script_saurashtra, text, ts-1, te)
|
478
524
|
when 'shaw', 'shavian'
|
479
525
|
self.emit(type, :script_shavian, text, ts-1, te)
|
526
|
+
when 'sidd', 'siddham'
|
527
|
+
self.emit(type, :script_siddham, text, ts-1, te)
|
528
|
+
when 'sind', 'khudawadi'
|
529
|
+
self.emit(type, :script_khudawadi, text, ts-1, te)
|
480
530
|
when 'sinh', 'sinhala'
|
481
531
|
self.emit(type, :script_sinhala, text, ts-1, te)
|
482
532
|
when 'sund', 'sundanese'
|
@@ -507,10 +557,14 @@
|
|
507
557
|
self.emit(type, :script_thai, text, ts-1, te)
|
508
558
|
when 'tibt', 'tibetan'
|
509
559
|
self.emit(type, :script_tibetan, text, ts-1, te)
|
560
|
+
when 'tirh', 'tirhuta'
|
561
|
+
self.emit(type, :script_tirhuta, text, ts-1, te)
|
510
562
|
when 'ugar', 'ugaritic'
|
511
563
|
self.emit(type, :script_ugaritic, text, ts-1, te)
|
512
564
|
when 'vaii', 'vai'
|
513
565
|
self.emit(type, :script_vai, text, ts-1, te)
|
566
|
+
when 'wara', 'warangciti'
|
567
|
+
self.emit(type, :script_warang_citi, text, ts-1, te)
|
514
568
|
when 'xpeo', 'oldpersian'
|
515
569
|
self.emit(type, :script_old_persian, text, ts-1, te)
|
516
570
|
when 'xsux', 'cuneiform'
|
@@ -20,6 +20,8 @@
|
|
20
20
|
set_close = ']';
|
21
21
|
brackets = set_open | set_close;
|
22
22
|
|
23
|
+
comment = ('#' . [^\n]* . '\n');
|
24
|
+
|
23
25
|
class_name_posix = 'alnum' | 'alpha' | 'blank' |
|
24
26
|
'cntrl' | 'digit' | 'graph' |
|
25
27
|
'lower' | 'print' | 'punct' |
|
@@ -74,6 +76,8 @@
|
|
74
76
|
quantifier_possessive | quantifier_interval;
|
75
77
|
|
76
78
|
|
79
|
+
conditional = '(?(';
|
80
|
+
|
77
81
|
group_comment = '?#' . [^)]+ . group_close;
|
78
82
|
|
79
83
|
group_atomic = '?>';
|
@@ -84,23 +88,28 @@
|
|
84
88
|
assertion_lookbehind = '?<=';
|
85
89
|
assertion_nlookbehind = '?<!';
|
86
90
|
|
87
|
-
group_options = '?' . [\-
|
91
|
+
group_options = '?' . [\-mixdau];
|
88
92
|
|
89
93
|
group_ref = [gk];
|
90
|
-
|
94
|
+
group_name_char = (alnum | '_');
|
95
|
+
group_name_id = (group_name_char . (group_name_char+)?)?;
|
91
96
|
group_number = '-'? . [1-9] . ([0-9]+)?;
|
92
97
|
group_level = [+\-] . [0-9]+;
|
93
98
|
|
94
|
-
|
99
|
+
group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
|
100
|
+
group_lookup = group_name | group_number;
|
95
101
|
|
96
|
-
|
97
|
-
|
102
|
+
group_named = ('?' . group_name );
|
103
|
+
|
104
|
+
group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
|
105
|
+
("'" . group_name_id . group_level? "'"));
|
98
106
|
|
99
107
|
group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
|
100
108
|
("'" . group_number . group_level? "'"));
|
101
109
|
|
102
110
|
group_type = group_atomic | group_passive | group_named;
|
103
111
|
|
112
|
+
|
104
113
|
assertion_type = assertion_lookahead | assertion_nlookahead |
|
105
114
|
assertion_lookbehind | assertion_nlookbehind;
|
106
115
|
|
@@ -133,8 +142,8 @@
|
|
133
142
|
}
|
134
143
|
|
135
144
|
# group (nesting) and set open/close actions
|
136
|
-
action group_opened { group_depth += 1; in_group = true }
|
137
|
-
action group_closed { group_depth -= 1; in_group = group_depth > 0 ? true : false }
|
145
|
+
action group_opened { @group_depth += 1; @in_group = true }
|
146
|
+
action group_closed { @group_depth -= 1; @in_group = @group_depth > 0 ? true : false }
|
138
147
|
|
139
148
|
# Character set scanner, continues consuming characters until it meets the
|
140
149
|
# closing bracket of the set.
|
@@ -410,6 +419,22 @@
|
|
410
419
|
*|;
|
411
420
|
|
412
421
|
|
422
|
+
# conditional expressions scanner
|
423
|
+
# --------------------------------------------------------------------------
|
424
|
+
conditional_expression := |*
|
425
|
+
group_lookup . ')' {
|
426
|
+
text = text(data, ts, te-1).first
|
427
|
+
emit(:conditional, :condition, text, ts, te-1)
|
428
|
+
emit(:conditional, :condition_close, ')', te-1, te)
|
429
|
+
};
|
430
|
+
|
431
|
+
any {
|
432
|
+
fhold;
|
433
|
+
fcall main;
|
434
|
+
};
|
435
|
+
*|;
|
436
|
+
|
437
|
+
|
413
438
|
# Main scanner
|
414
439
|
# --------------------------------------------------------------------------
|
415
440
|
main := |*
|
@@ -421,7 +446,12 @@
|
|
421
446
|
};
|
422
447
|
|
423
448
|
alternation {
|
424
|
-
|
449
|
+
if in_conditional and conditional_stack.length > 0 and
|
450
|
+
conditional_stack.last[1] == @group_depth
|
451
|
+
emit(:conditional, :separator, *text(data, ts, te))
|
452
|
+
else
|
453
|
+
emit(:meta, :alternation, *text(data, ts, te))
|
454
|
+
end
|
425
455
|
};
|
426
456
|
|
427
457
|
# Anchors
|
@@ -434,6 +464,10 @@
|
|
434
464
|
emit(:anchor, :eol, *text(data, ts, te))
|
435
465
|
};
|
436
466
|
|
467
|
+
backslash . 'K' > (backslashed, 4) {
|
468
|
+
emit(:keep, :mark, *text(data, ts, te))
|
469
|
+
};
|
470
|
+
|
437
471
|
backslash . anchor_char > (backslashed, 3) {
|
438
472
|
case text = text(data, ts, te).first
|
439
473
|
when '\\A'; emit(:anchor, :bos, text, ts, te)
|
@@ -481,6 +515,23 @@
|
|
481
515
|
fcall character_set;
|
482
516
|
};
|
483
517
|
|
518
|
+
|
519
|
+
# Conditional expression
|
520
|
+
# (?(condition)Y|N) conditional expression
|
521
|
+
# ------------------------------------------------------------------------
|
522
|
+
conditional {
|
523
|
+
text = text(data, ts, te).first
|
524
|
+
|
525
|
+
in_conditional = true unless in_conditional
|
526
|
+
conditional_depth += 1
|
527
|
+
conditional_stack << [conditional_depth, @group_depth]
|
528
|
+
|
529
|
+
emit(:conditional, :open, text[0..-2], ts, te-1)
|
530
|
+
emit(:conditional, :condition_open, '(', te-1, te)
|
531
|
+
fcall conditional_expression;
|
532
|
+
};
|
533
|
+
|
534
|
+
|
484
535
|
# (?#...) comments: parsed as a single expression, without introducing a
|
485
536
|
# new nesting level. Comments may not include parentheses, escaped or not.
|
486
537
|
# special case for close, action performed on all transitions to get the
|
@@ -491,12 +542,15 @@
|
|
491
542
|
};
|
492
543
|
|
493
544
|
# Expression options:
|
494
|
-
# (?imx-imx) option on/off
|
545
|
+
# (?imxdau-imx) option on/off
|
495
546
|
# i: ignore case
|
496
547
|
# m: multi-line (dot(.) match newline)
|
497
548
|
# x: extended form
|
549
|
+
# d: default class rules (1.9 compatible)
|
550
|
+
# a: ASCII class rules (\s, \w, etc.)
|
551
|
+
# u: Unicode class rules (\s, \w, etc.)
|
498
552
|
#
|
499
|
-
# (?imx-imx:subexp) option on/off for subexp
|
553
|
+
# (?imxdau-imx:subexp) option on/off for subexp
|
500
554
|
# ------------------------------------------------------------------------
|
501
555
|
group_open . group_options >group_opened {
|
502
556
|
p = scan_options(p, data, ts, te)
|
@@ -551,7 +605,29 @@
|
|
551
605
|
};
|
552
606
|
|
553
607
|
group_close @group_closed {
|
554
|
-
|
608
|
+
if in_conditional and conditional_stack.last and
|
609
|
+
conditional_stack.last[1] == (@group_depth + 1)
|
610
|
+
|
611
|
+
emit(:conditional, :close, *text(data, ts, te))
|
612
|
+
conditional_stack.pop
|
613
|
+
|
614
|
+
if conditional_stack.length == 0
|
615
|
+
in_conditional = false
|
616
|
+
end
|
617
|
+
else
|
618
|
+
if @spacing_stack.length > 1 and
|
619
|
+
@spacing_stack.last[1] == (@group_depth + 1)
|
620
|
+
@spacing_stack.pop
|
621
|
+
|
622
|
+
@free_spacing = @spacing_stack.last[0]
|
623
|
+
|
624
|
+
if @spacing_stack.length == 1
|
625
|
+
@in_options = false
|
626
|
+
end
|
627
|
+
end
|
628
|
+
|
629
|
+
emit(:group, :close, *text(data, ts, te))
|
630
|
+
end
|
555
631
|
};
|
556
632
|
|
557
633
|
|
@@ -662,10 +738,26 @@
|
|
662
738
|
fcall escape_sequence;
|
663
739
|
};
|
664
740
|
|
741
|
+
comment {
|
742
|
+
if @free_spacing
|
743
|
+
emit(:free_space, :comment, *text(data, ts, te))
|
744
|
+
else
|
745
|
+
append_literal(data, ts, te)
|
746
|
+
end
|
747
|
+
};
|
748
|
+
|
749
|
+
space+ {
|
750
|
+
if @free_spacing
|
751
|
+
emit(:free_space, :whitespace, *text(data, ts, te))
|
752
|
+
else
|
753
|
+
append_literal(data, ts, te)
|
754
|
+
end
|
755
|
+
};
|
756
|
+
|
665
757
|
# Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
|
666
758
|
# except meta characters.
|
667
759
|
# ------------------------------------------------------------------------
|
668
|
-
ascii_print+ |
|
760
|
+
(ascii_print -- space)+ |
|
669
761
|
ascii_nonprint+ |
|
670
762
|
utf8_2_byte+ |
|
671
763
|
utf8_3_byte+ |
|
@@ -683,11 +775,7 @@ module Regexp::Scanner
|
|
683
775
|
%% write data;
|
684
776
|
|
685
777
|
# General scanner error (catch all)
|
686
|
-
class ScannerError < StandardError
|
687
|
-
def initialize(what)
|
688
|
-
super what
|
689
|
-
end
|
690
|
-
end
|
778
|
+
class ScannerError < StandardError; end
|
691
779
|
|
692
780
|
# Base for all scanner validation errors
|
693
781
|
class ValidationError < StandardError
|
@@ -717,6 +805,13 @@ module Regexp::Scanner
|
|
717
805
|
end
|
718
806
|
end
|
719
807
|
|
808
|
+
# Invalid group option. Used for inline options.
|
809
|
+
class InvalidGroupOption < ValidationError
|
810
|
+
def initialize(option, text)
|
811
|
+
super "Invalid group option #{option} in #{text}"
|
812
|
+
end
|
813
|
+
end
|
814
|
+
|
720
815
|
# Invalid back reference. Used for name and number refs/calls.
|
721
816
|
class InvalidBackrefError < ValidationError
|
722
817
|
def initialize(what, reason)
|
@@ -737,18 +832,29 @@ module Regexp::Scanner
|
|
737
832
|
#
|
738
833
|
# This method may raise errors if a syntax error is encountered.
|
739
834
|
# --------------------------------------------------------------------------
|
740
|
-
def self.scan(
|
835
|
+
def self.scan(input_object, &block)
|
741
836
|
top, stack = 0, []
|
742
837
|
|
743
|
-
|
838
|
+
if input_object.is_a?(Regexp)
|
839
|
+
input = input_object.source
|
840
|
+
@free_spacing = (input_object.options & Regexp::EXTENDED != 0)
|
841
|
+
else
|
842
|
+
input = input_object
|
843
|
+
@free_spacing = false
|
844
|
+
end
|
845
|
+
|
846
|
+
|
744
847
|
data = input.unpack("c*") if input.is_a?(String)
|
745
848
|
eof = data.length
|
746
849
|
|
747
850
|
@tokens = []
|
748
851
|
@block = block_given? ? block : nil
|
749
852
|
|
750
|
-
in_group, group_depth = false, 0
|
853
|
+
@in_group, @group_depth = false, 0
|
854
|
+
@in_options, @spacing_stack = false, [[@free_spacing, 0]]
|
855
|
+
|
751
856
|
in_set, set_depth, set_type = false, 0, :set
|
857
|
+
in_conditional, conditional_depth, conditional_stack = false, 0, []
|
752
858
|
|
753
859
|
%% write init;
|
754
860
|
%% write exec;
|
@@ -759,7 +865,7 @@ module Regexp::Scanner
|
|
759
865
|
end
|
760
866
|
|
761
867
|
raise PrematureEndError.new("(missing group closing paranthesis) "+
|
762
|
-
"[#{in_group}:#{group_depth}]") if in_group
|
868
|
+
"[#{@in_group}:#{@group_depth}]") if @in_group
|
763
869
|
raise PrematureEndError.new("(missing set closing bracket) "+
|
764
870
|
"[#{in_set}:#{set_depth}]") if in_set
|
765
871
|
|
@@ -779,13 +885,19 @@ module Regexp::Scanner
|
|
779
885
|
|
780
886
|
options_char, options_length = true, 0
|
781
887
|
|
782
|
-
# Copy while we have option characters
|
783
|
-
#
|
784
|
-
|
888
|
+
# Copy while we have option characters. There is no maximum length,
|
889
|
+
# as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
|
890
|
+
negative_options = false
|
891
|
+
while options_char
|
785
892
|
if data[te + options_length]
|
786
893
|
c = data[te + options_length].chr
|
787
894
|
|
788
|
-
if c =~ /[-
|
895
|
+
if c =~ /[-mixdau]/
|
896
|
+
negative_options = true if c == '-'
|
897
|
+
|
898
|
+
raise InvalidGroupOption.new(c, text) if negative_options and
|
899
|
+
c =~ /[dau]/
|
900
|
+
|
789
901
|
text << c ; p += 1 ; options_length += 1
|
790
902
|
else
|
791
903
|
options_char = false
|
@@ -801,11 +913,11 @@ module Regexp::Scanner
|
|
801
913
|
if c == ':'
|
802
914
|
# Include the ':' in the options text
|
803
915
|
text << c ; p += 1 ; options_length += 1
|
804
|
-
|
916
|
+
emit_options(text, ts, te + options_length)
|
805
917
|
|
806
918
|
elsif c == ')'
|
807
919
|
# Don't include the closing ')', let group_close handle it.
|
808
|
-
|
920
|
+
emit_options(text, ts, te + options_length)
|
809
921
|
|
810
922
|
else
|
811
923
|
# Plain Regexp reports this as 'undefined group option'
|
@@ -849,6 +961,27 @@ module Regexp::Scanner
|
|
849
961
|
emit(:literal, :literal, text, ts, te)
|
850
962
|
end
|
851
963
|
|
964
|
+
def self.emit_options(text, ts, te)
|
965
|
+
if text =~ /\(\?([mixdau]+)?-?([mix]+)?:/
|
966
|
+
positive, negative = $1, $2
|
967
|
+
|
968
|
+
if positive =~ /x/
|
969
|
+
@free_spacing = true
|
970
|
+
end
|
971
|
+
|
972
|
+
# If the x appears in both, treat it like ruby does, the second cancels
|
973
|
+
# the first.
|
974
|
+
if negative =~ /x/
|
975
|
+
@free_spacing = false
|
976
|
+
end
|
977
|
+
end
|
978
|
+
|
979
|
+
@in_options = true
|
980
|
+
@spacing_stack << [@free_spacing, @group_depth]
|
981
|
+
|
982
|
+
emit(:group, :options, text, ts, te)
|
983
|
+
end
|
984
|
+
|
852
985
|
# Emits an array with the details of the scanned pattern
|
853
986
|
def self.emit(type, token, text, ts, te)
|
854
987
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
@@ -876,13 +1009,7 @@ module Regexp::Scanner
|
|
876
1009
|
error = ValidationError.new('expression')
|
877
1010
|
end
|
878
1011
|
|
879
|
-
|
880
|
-
# errors as warnings or ignore them
|
881
|
-
if false # @@config.validation_warn
|
882
|
-
$stderr.puts error.to_s # unless @@config.validation_ignore
|
883
|
-
else
|
884
|
-
raise error # unless @@config.validation_ignore
|
885
|
-
end
|
1012
|
+
raise error # unless @@config.validation_ignore
|
886
1013
|
end
|
887
1014
|
|
888
1015
|
# Used for references with an empty name or number
|