regexp_parser 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +57 -0
- data/Gemfile +8 -0
- data/LICENSE +1 -1
- data/README.md +225 -206
- data/Rakefile +9 -3
- data/lib/regexp_parser.rb +7 -11
- data/lib/regexp_parser/expression.rb +72 -14
- data/lib/regexp_parser/expression/classes/alternation.rb +3 -16
- data/lib/regexp_parser/expression/classes/conditional.rb +57 -0
- data/lib/regexp_parser/expression/classes/free_space.rb +17 -0
- data/lib/regexp_parser/expression/classes/keep.rb +7 -0
- data/lib/regexp_parser/expression/classes/set.rb +28 -7
- data/lib/regexp_parser/expression/methods/strfregexp.rb +113 -0
- data/lib/regexp_parser/expression/methods/tests.rb +116 -0
- data/lib/regexp_parser/expression/methods/traverse.rb +63 -0
- data/lib/regexp_parser/expression/quantifier.rb +10 -0
- data/lib/regexp_parser/expression/sequence.rb +45 -0
- data/lib/regexp_parser/expression/subexpression.rb +29 -1
- data/lib/regexp_parser/lexer.rb +31 -8
- data/lib/regexp_parser/parser.rb +118 -45
- data/lib/regexp_parser/scanner.rb +1745 -1404
- data/lib/regexp_parser/scanner/property.rl +57 -3
- data/lib/regexp_parser/scanner/scanner.rl +161 -34
- data/lib/regexp_parser/syntax.rb +12 -2
- data/lib/regexp_parser/syntax/ruby/1.9.1.rb +3 -3
- data/lib/regexp_parser/syntax/ruby/1.9.3.rb +2 -7
- data/lib/regexp_parser/syntax/ruby/2.0.0.rb +4 -1
- data/lib/regexp_parser/syntax/ruby/2.1.4.rb +13 -0
- data/lib/regexp_parser/syntax/ruby/2.1.5.rb +13 -0
- data/lib/regexp_parser/syntax/ruby/2.1.rb +2 -2
- data/lib/regexp_parser/syntax/ruby/2.2.0.rb +16 -0
- data/lib/regexp_parser/syntax/ruby/2.2.rb +8 -0
- data/lib/regexp_parser/syntax/tokens.rb +19 -2
- data/lib/regexp_parser/syntax/tokens/conditional.rb +22 -0
- data/lib/regexp_parser/syntax/tokens/keep.rb +14 -0
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +45 -4
- data/lib/regexp_parser/token.rb +23 -8
- data/lib/regexp_parser/version.rb +5 -0
- data/regexp_parser.gemspec +35 -0
- data/test/expression/test_all.rb +6 -1
- data/test/expression/test_base.rb +19 -0
- data/test/expression/test_conditionals.rb +114 -0
- data/test/expression/test_free_space.rb +33 -0
- data/test/expression/test_set.rb +61 -0
- data/test/expression/test_strfregexp.rb +214 -0
- data/test/expression/test_subexpression.rb +24 -0
- data/test/expression/test_tests.rb +99 -0
- data/test/expression/test_to_h.rb +48 -0
- data/test/expression/test_to_s.rb +46 -0
- data/test/expression/test_traverse.rb +164 -0
- data/test/lexer/test_all.rb +16 -3
- data/test/lexer/test_conditionals.rb +101 -0
- data/test/lexer/test_keep.rb +24 -0
- data/test/lexer/test_literals.rb +51 -51
- data/test/lexer/test_nesting.rb +62 -62
- data/test/lexer/test_refcalls.rb +18 -20
- data/test/parser/test_all.rb +18 -3
- data/test/parser/test_alternation.rb +11 -14
- data/test/parser/test_conditionals.rb +148 -0
- data/test/parser/test_escapes.rb +29 -5
- data/test/parser/test_free_space.rb +139 -0
- data/test/parser/test_groups.rb +40 -0
- data/test/parser/test_keep.rb +21 -0
- data/test/scanner/test_all.rb +8 -2
- data/test/scanner/test_conditionals.rb +166 -0
- data/test/scanner/test_escapes.rb +8 -5
- data/test/scanner/test_free_space.rb +133 -0
- data/test/scanner/test_groups.rb +28 -0
- data/test/scanner/test_keep.rb +33 -0
- data/test/scanner/test_properties.rb +4 -0
- data/test/scanner/test_scripts.rb +71 -1
- data/test/syntax/ruby/test_1.9.3.rb +2 -2
- data/test/syntax/ruby/test_2.0.0.rb +38 -0
- data/test/syntax/ruby/test_2.2.0.rb +38 -0
- data/test/syntax/ruby/test_all.rb +1 -8
- data/test/syntax/ruby/test_files.rb +104 -0
- data/test/test_all.rb +2 -1
- data/test/token/test_all.rb +2 -0
- data/test/token/test_token.rb +109 -0
- metadata +75 -21
- data/VERSION.yml +0 -5
- data/lib/regexp_parser/ctype.rb +0 -48
- data/test/syntax/ruby/test_2.x.rb +0 -46
@@ -11,7 +11,6 @@
|
|
11
11
|
'cntrl'i | 'digit'i | 'graph'i | 'lower'i | 'print'i |
|
12
12
|
'punct'i | 'space'i | 'upper'i | 'word'i | 'xdigit'i;
|
13
13
|
|
14
|
-
# TODO: are these case-insensitive?
|
15
14
|
property_name_posix = 'any'i | 'assigned'i | 'newline'i;
|
16
15
|
|
17
16
|
property_name = property_name_unicode | property_name_posix;
|
@@ -39,7 +38,9 @@
|
|
39
38
|
property_age = 'age=1.1'i | 'age=2.0'i | 'age=2.1'i |
|
40
39
|
'age=3.0'i | 'age=3.1'i | 'age=3.2'i |
|
41
40
|
'age=4.0'i | 'age=4.1'i | 'age=5.0'i |
|
42
|
-
'age=5.1'i | 'age=5.2'i | 'age=6.0'i
|
41
|
+
'age=5.1'i | 'age=5.2'i | 'age=6.0'i |
|
42
|
+
'age=6.1'i | 'age=6.2'i | 'age=6.3'i |
|
43
|
+
'age=7.0'i;
|
43
44
|
|
44
45
|
property_script = (alpha | space | '_')+; # everything else
|
45
46
|
|
@@ -222,6 +223,14 @@
|
|
222
223
|
self.emit(type, :age_5_2, text, ts-1, te)
|
223
224
|
when 'age=6.0'
|
224
225
|
self.emit(type, :age_6_0, text, ts-1, te)
|
226
|
+
when 'age=6.1'
|
227
|
+
self.emit(type, :age_6_1, text, ts-1, te)
|
228
|
+
when 'age=6.2'
|
229
|
+
self.emit(type, :age_6_2, text, ts-1, te)
|
230
|
+
when 'age=6.3'
|
231
|
+
self.emit(type, :age_6_3, text, ts-1, te)
|
232
|
+
when 'age=7.0'
|
233
|
+
self.emit(type, :age_7_0, text, ts-1, te)
|
225
234
|
|
226
235
|
# Derived Properties
|
227
236
|
when 'ahex', 'asciihexdigit'
|
@@ -327,8 +336,9 @@
|
|
327
336
|
when 'xidc', 'xidcontinue'
|
328
337
|
self.emit(type, :xid_continue, text, ts-1, te)
|
329
338
|
|
330
|
-
|
331
339
|
# Scripts
|
340
|
+
when 'aghb', 'caucasianalbanian'
|
341
|
+
self.emit(type, :script_caucasian_albanian, text, ts-1, te)
|
332
342
|
when 'arab', 'arabic'
|
333
343
|
self.emit(type, :script_arabic, text, ts-1, te)
|
334
344
|
when 'armi', 'imperialaramaic'
|
@@ -341,6 +351,8 @@
|
|
341
351
|
self.emit(type, :script_balinese, text, ts-1, te)
|
342
352
|
when 'bamu', 'bamum'
|
343
353
|
self.emit(type, :script_bamum, text, ts-1, te)
|
354
|
+
when 'bass', 'bassavah'
|
355
|
+
self.emit(type, :script_bassa_vah, text, ts-1, te)
|
344
356
|
when 'batk', 'batak'
|
345
357
|
self.emit(type, :script_batak, text, ts-1, te)
|
346
358
|
when 'beng', 'bengali'
|
@@ -373,8 +385,12 @@
|
|
373
385
|
self.emit(type, :script_devanagari, text, ts-1, te)
|
374
386
|
when 'dsrt', 'deseret'
|
375
387
|
self.emit(type, :script_deseret, text, ts-1, te)
|
388
|
+
when 'dupl', 'duployan'
|
389
|
+
self.emit(type, :script_duployan, text, ts-1, te)
|
376
390
|
when 'egyp', 'egyptianhieroglyphs'
|
377
391
|
self.emit(type, :script_egyptian_hieroglyphs, text, ts-1, te)
|
392
|
+
when 'elba', 'elbasan'
|
393
|
+
self.emit(type, :script_elbasan, text, ts-1, te)
|
378
394
|
when 'ethi', 'ethiopic'
|
379
395
|
self.emit(type, :script_ethiopic, text, ts-1, te)
|
380
396
|
when 'geor', 'georgian'
|
@@ -383,6 +399,8 @@
|
|
383
399
|
self.emit(type, :script_glagolitic, text, ts-1, te)
|
384
400
|
when 'goth', 'gothic'
|
385
401
|
self.emit(type, :script_gothic, text, ts-1, te)
|
402
|
+
when 'gran', 'grantha'
|
403
|
+
self.emit(type, :script_grantha, text, ts-1, te)
|
386
404
|
when 'grek', 'greek'
|
387
405
|
self.emit(type, :script_greek, text, ts-1, te)
|
388
406
|
when 'gujr', 'gujarati'
|
@@ -399,6 +417,8 @@
|
|
399
417
|
self.emit(type, :script_hebrew, text, ts-1, te)
|
400
418
|
when 'hira', 'hiragana'
|
401
419
|
self.emit(type, :script_hiragana, text, ts-1, te)
|
420
|
+
when 'hmng', 'pahawhhmong'
|
421
|
+
self.emit(type, :script_pahawh_hmong, text, ts-1, te)
|
402
422
|
when 'hrkt', 'katakanaorhiragana'
|
403
423
|
self.emit(type, :script_katakana_or_hiragana, text, ts-1, te)
|
404
424
|
when 'ital', 'olditalic'
|
@@ -413,6 +433,8 @@
|
|
413
433
|
self.emit(type, :script_kharoshthi, text, ts-1, te)
|
414
434
|
when 'khmr', 'khmer'
|
415
435
|
self.emit(type, :script_khmer, text, ts-1, te)
|
436
|
+
when 'khoj', 'khojki'
|
437
|
+
self.emit(type, :script_khojki, text, ts-1, te)
|
416
438
|
when 'knda', 'kannada'
|
417
439
|
self.emit(type, :script_kannada, text, ts-1, te)
|
418
440
|
when 'kthi', 'kaithi'
|
@@ -427,6 +449,8 @@
|
|
427
449
|
self.emit(type, :script_lepcha, text, ts-1, te)
|
428
450
|
when 'limb', 'limbu'
|
429
451
|
self.emit(type, :script_limbu, text, ts-1, te)
|
452
|
+
when 'lina', 'lineara'
|
453
|
+
self.emit(type, :script_linear_a, text, ts-1, te)
|
430
454
|
when 'linb', 'linearb'
|
431
455
|
self.emit(type, :script_linear_b, text, ts-1, te)
|
432
456
|
when 'lisu'
|
@@ -437,14 +461,28 @@
|
|
437
461
|
self.emit(type, :script_lydian, text, ts-1, te)
|
438
462
|
when 'mlym', 'malayalam'
|
439
463
|
self.emit(type, :script_malayalam, text, ts-1, te)
|
464
|
+
when 'mahj', 'mahajani'
|
465
|
+
self.emit(type, :script_mahajani, text, ts-1, te)
|
440
466
|
when 'mand', 'mandaic'
|
441
467
|
self.emit(type, :script_mandaic, text, ts-1, te)
|
468
|
+
when 'mani', 'manichaean'
|
469
|
+
self.emit(type, :script_manichaean, text, ts-1, te)
|
470
|
+
when 'mend', 'mendekikakui'
|
471
|
+
self.emit(type, :script_mende_kikakui, text, ts-1, te)
|
472
|
+
when 'modi'
|
473
|
+
self.emit(type, :script_modi, text, ts-1, te)
|
442
474
|
when 'mong', 'mongolian'
|
443
475
|
self.emit(type, :script_mongolian, text, ts-1, te)
|
476
|
+
when 'mroo', 'mro'
|
477
|
+
self.emit(type, :script_mro, text, ts-1, te)
|
444
478
|
when 'mtei', 'meeteimayek'
|
445
479
|
self.emit(type, :script_meetei_mayek, text, ts-1, te)
|
446
480
|
when 'mymr', 'myanmar'
|
447
481
|
self.emit(type, :script_myanmar, text, ts-1, te)
|
482
|
+
when 'narb', 'oldnortharabian'
|
483
|
+
self.emit(type, :script_old_north_arabian, text, ts-1, te)
|
484
|
+
when 'nbat', 'nabataean'
|
485
|
+
self.emit(type, :script_nabataean, text, ts-1, te)
|
448
486
|
when 'nkoo', 'nko'
|
449
487
|
self.emit(type, :script_nko, text, ts-1, te)
|
450
488
|
when 'ogam', 'ogham'
|
@@ -457,10 +495,18 @@
|
|
457
495
|
self.emit(type, :script_oriya, text, ts-1, te)
|
458
496
|
when 'osma', 'osmanya'
|
459
497
|
self.emit(type, :script_osmanya, text, ts-1, te)
|
498
|
+
when 'palm', 'palmyrene'
|
499
|
+
self.emit(type, :script_palmyrene, text, ts-1, te)
|
500
|
+
when 'pauc', 'paucinhau'
|
501
|
+
self.emit(type, :script_pau_cin_hau, text, ts-1, te)
|
502
|
+
when 'perm', 'oldpermic'
|
503
|
+
self.emit(type, :script_old_permic, text, ts-1, te)
|
460
504
|
when 'phag', 'phagspa'
|
461
505
|
self.emit(type, :script_phags_pa, text, ts-1, te)
|
462
506
|
when 'phli', 'inscriptionalpahlavi'
|
463
507
|
self.emit(type, :script_inscriptional_pahlavi, text, ts-1, te)
|
508
|
+
when 'phlp', 'psalterpahlavi'
|
509
|
+
self.emit(type, :script_psalter_pahlavi, text, ts-1, te)
|
464
510
|
when 'phnx', 'phoenician'
|
465
511
|
self.emit(type, :script_phoenician, text, ts-1, te)
|
466
512
|
when 'prti', 'inscriptionalparthian'
|
@@ -477,6 +523,10 @@
|
|
477
523
|
self.emit(type, :script_saurashtra, text, ts-1, te)
|
478
524
|
when 'shaw', 'shavian'
|
479
525
|
self.emit(type, :script_shavian, text, ts-1, te)
|
526
|
+
when 'sidd', 'siddham'
|
527
|
+
self.emit(type, :script_siddham, text, ts-1, te)
|
528
|
+
when 'sind', 'khudawadi'
|
529
|
+
self.emit(type, :script_khudawadi, text, ts-1, te)
|
480
530
|
when 'sinh', 'sinhala'
|
481
531
|
self.emit(type, :script_sinhala, text, ts-1, te)
|
482
532
|
when 'sund', 'sundanese'
|
@@ -507,10 +557,14 @@
|
|
507
557
|
self.emit(type, :script_thai, text, ts-1, te)
|
508
558
|
when 'tibt', 'tibetan'
|
509
559
|
self.emit(type, :script_tibetan, text, ts-1, te)
|
560
|
+
when 'tirh', 'tirhuta'
|
561
|
+
self.emit(type, :script_tirhuta, text, ts-1, te)
|
510
562
|
when 'ugar', 'ugaritic'
|
511
563
|
self.emit(type, :script_ugaritic, text, ts-1, te)
|
512
564
|
when 'vaii', 'vai'
|
513
565
|
self.emit(type, :script_vai, text, ts-1, te)
|
566
|
+
when 'wara', 'warangciti'
|
567
|
+
self.emit(type, :script_warang_citi, text, ts-1, te)
|
514
568
|
when 'xpeo', 'oldpersian'
|
515
569
|
self.emit(type, :script_old_persian, text, ts-1, te)
|
516
570
|
when 'xsux', 'cuneiform'
|
@@ -20,6 +20,8 @@
|
|
20
20
|
set_close = ']';
|
21
21
|
brackets = set_open | set_close;
|
22
22
|
|
23
|
+
comment = ('#' . [^\n]* . '\n');
|
24
|
+
|
23
25
|
class_name_posix = 'alnum' | 'alpha' | 'blank' |
|
24
26
|
'cntrl' | 'digit' | 'graph' |
|
25
27
|
'lower' | 'print' | 'punct' |
|
@@ -74,6 +76,8 @@
|
|
74
76
|
quantifier_possessive | quantifier_interval;
|
75
77
|
|
76
78
|
|
79
|
+
conditional = '(?(';
|
80
|
+
|
77
81
|
group_comment = '?#' . [^)]+ . group_close;
|
78
82
|
|
79
83
|
group_atomic = '?>';
|
@@ -84,23 +88,28 @@
|
|
84
88
|
assertion_lookbehind = '?<=';
|
85
89
|
assertion_nlookbehind = '?<!';
|
86
90
|
|
87
|
-
group_options = '?' . [\-
|
91
|
+
group_options = '?' . [\-mixdau];
|
88
92
|
|
89
93
|
group_ref = [gk];
|
90
|
-
|
94
|
+
group_name_char = (alnum | '_');
|
95
|
+
group_name_id = (group_name_char . (group_name_char+)?)?;
|
91
96
|
group_number = '-'? . [1-9] . ([0-9]+)?;
|
92
97
|
group_level = [+\-] . [0-9]+;
|
93
98
|
|
94
|
-
|
99
|
+
group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
|
100
|
+
group_lookup = group_name | group_number;
|
95
101
|
|
96
|
-
|
97
|
-
|
102
|
+
group_named = ('?' . group_name );
|
103
|
+
|
104
|
+
group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
|
105
|
+
("'" . group_name_id . group_level? "'"));
|
98
106
|
|
99
107
|
group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
|
100
108
|
("'" . group_number . group_level? "'"));
|
101
109
|
|
102
110
|
group_type = group_atomic | group_passive | group_named;
|
103
111
|
|
112
|
+
|
104
113
|
assertion_type = assertion_lookahead | assertion_nlookahead |
|
105
114
|
assertion_lookbehind | assertion_nlookbehind;
|
106
115
|
|
@@ -133,8 +142,8 @@
|
|
133
142
|
}
|
134
143
|
|
135
144
|
# group (nesting) and set open/close actions
|
136
|
-
action group_opened { group_depth += 1; in_group = true }
|
137
|
-
action group_closed { group_depth -= 1; in_group = group_depth > 0 ? true : false }
|
145
|
+
action group_opened { @group_depth += 1; @in_group = true }
|
146
|
+
action group_closed { @group_depth -= 1; @in_group = @group_depth > 0 ? true : false }
|
138
147
|
|
139
148
|
# Character set scanner, continues consuming characters until it meets the
|
140
149
|
# closing bracket of the set.
|
@@ -410,6 +419,22 @@
|
|
410
419
|
*|;
|
411
420
|
|
412
421
|
|
422
|
+
# conditional expressions scanner
|
423
|
+
# --------------------------------------------------------------------------
|
424
|
+
conditional_expression := |*
|
425
|
+
group_lookup . ')' {
|
426
|
+
text = text(data, ts, te-1).first
|
427
|
+
emit(:conditional, :condition, text, ts, te-1)
|
428
|
+
emit(:conditional, :condition_close, ')', te-1, te)
|
429
|
+
};
|
430
|
+
|
431
|
+
any {
|
432
|
+
fhold;
|
433
|
+
fcall main;
|
434
|
+
};
|
435
|
+
*|;
|
436
|
+
|
437
|
+
|
413
438
|
# Main scanner
|
414
439
|
# --------------------------------------------------------------------------
|
415
440
|
main := |*
|
@@ -421,7 +446,12 @@
|
|
421
446
|
};
|
422
447
|
|
423
448
|
alternation {
|
424
|
-
|
449
|
+
if in_conditional and conditional_stack.length > 0 and
|
450
|
+
conditional_stack.last[1] == @group_depth
|
451
|
+
emit(:conditional, :separator, *text(data, ts, te))
|
452
|
+
else
|
453
|
+
emit(:meta, :alternation, *text(data, ts, te))
|
454
|
+
end
|
425
455
|
};
|
426
456
|
|
427
457
|
# Anchors
|
@@ -434,6 +464,10 @@
|
|
434
464
|
emit(:anchor, :eol, *text(data, ts, te))
|
435
465
|
};
|
436
466
|
|
467
|
+
backslash . 'K' > (backslashed, 4) {
|
468
|
+
emit(:keep, :mark, *text(data, ts, te))
|
469
|
+
};
|
470
|
+
|
437
471
|
backslash . anchor_char > (backslashed, 3) {
|
438
472
|
case text = text(data, ts, te).first
|
439
473
|
when '\\A'; emit(:anchor, :bos, text, ts, te)
|
@@ -481,6 +515,23 @@
|
|
481
515
|
fcall character_set;
|
482
516
|
};
|
483
517
|
|
518
|
+
|
519
|
+
# Conditional expression
|
520
|
+
# (?(condition)Y|N) conditional expression
|
521
|
+
# ------------------------------------------------------------------------
|
522
|
+
conditional {
|
523
|
+
text = text(data, ts, te).first
|
524
|
+
|
525
|
+
in_conditional = true unless in_conditional
|
526
|
+
conditional_depth += 1
|
527
|
+
conditional_stack << [conditional_depth, @group_depth]
|
528
|
+
|
529
|
+
emit(:conditional, :open, text[0..-2], ts, te-1)
|
530
|
+
emit(:conditional, :condition_open, '(', te-1, te)
|
531
|
+
fcall conditional_expression;
|
532
|
+
};
|
533
|
+
|
534
|
+
|
484
535
|
# (?#...) comments: parsed as a single expression, without introducing a
|
485
536
|
# new nesting level. Comments may not include parentheses, escaped or not.
|
486
537
|
# special case for close, action performed on all transitions to get the
|
@@ -491,12 +542,15 @@
|
|
491
542
|
};
|
492
543
|
|
493
544
|
# Expression options:
|
494
|
-
# (?
|
545
|
+
# (?imxdau-imx) option on/off
|
495
546
|
# i: ignore case
|
496
547
|
# m: multi-line (dot(.) match newline)
|
497
548
|
# x: extended form
|
549
|
+
# d: default class rules (1.9 compatible)
|
550
|
+
# a: ASCII class rules (\s, \w, etc.)
|
551
|
+
# u: Unicode class rules (\s, \w, etc.)
|
498
552
|
#
|
499
|
-
# (?
|
553
|
+
# (?imxdau-imx:subexp) option on/off for subexp
|
500
554
|
# ------------------------------------------------------------------------
|
501
555
|
group_open . group_options >group_opened {
|
502
556
|
p = scan_options(p, data, ts, te)
|
@@ -551,7 +605,29 @@
|
|
551
605
|
};
|
552
606
|
|
553
607
|
group_close @group_closed {
|
554
|
-
|
608
|
+
if in_conditional and conditional_stack.last and
|
609
|
+
conditional_stack.last[1] == (@group_depth + 1)
|
610
|
+
|
611
|
+
emit(:conditional, :close, *text(data, ts, te))
|
612
|
+
conditional_stack.pop
|
613
|
+
|
614
|
+
if conditional_stack.length == 0
|
615
|
+
in_conditional = false
|
616
|
+
end
|
617
|
+
else
|
618
|
+
if @spacing_stack.length > 1 and
|
619
|
+
@spacing_stack.last[1] == (@group_depth + 1)
|
620
|
+
@spacing_stack.pop
|
621
|
+
|
622
|
+
@free_spacing = @spacing_stack.last[0]
|
623
|
+
|
624
|
+
if @spacing_stack.length == 1
|
625
|
+
@in_options = false
|
626
|
+
end
|
627
|
+
end
|
628
|
+
|
629
|
+
emit(:group, :close, *text(data, ts, te))
|
630
|
+
end
|
555
631
|
};
|
556
632
|
|
557
633
|
|
@@ -662,10 +738,26 @@
|
|
662
738
|
fcall escape_sequence;
|
663
739
|
};
|
664
740
|
|
741
|
+
comment {
|
742
|
+
if @free_spacing
|
743
|
+
emit(:free_space, :comment, *text(data, ts, te))
|
744
|
+
else
|
745
|
+
append_literal(data, ts, te)
|
746
|
+
end
|
747
|
+
};
|
748
|
+
|
749
|
+
space+ {
|
750
|
+
if @free_spacing
|
751
|
+
emit(:free_space, :whitespace, *text(data, ts, te))
|
752
|
+
else
|
753
|
+
append_literal(data, ts, te)
|
754
|
+
end
|
755
|
+
};
|
756
|
+
|
665
757
|
# Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
|
666
758
|
# except meta characters.
|
667
759
|
# ------------------------------------------------------------------------
|
668
|
-
ascii_print+ |
|
760
|
+
(ascii_print -- space)+ |
|
669
761
|
ascii_nonprint+ |
|
670
762
|
utf8_2_byte+ |
|
671
763
|
utf8_3_byte+ |
|
@@ -683,11 +775,7 @@ module Regexp::Scanner
|
|
683
775
|
%% write data;
|
684
776
|
|
685
777
|
# General scanner error (catch all)
|
686
|
-
class ScannerError < StandardError
|
687
|
-
def initialize(what)
|
688
|
-
super what
|
689
|
-
end
|
690
|
-
end
|
778
|
+
class ScannerError < StandardError; end
|
691
779
|
|
692
780
|
# Base for all scanner validation errors
|
693
781
|
class ValidationError < StandardError
|
@@ -717,6 +805,13 @@ module Regexp::Scanner
|
|
717
805
|
end
|
718
806
|
end
|
719
807
|
|
808
|
+
# Invalid group option. Used for inline options.
|
809
|
+
class InvalidGroupOption < ValidationError
|
810
|
+
def initialize(option, text)
|
811
|
+
super "Invalid group option #{option} in #{text}"
|
812
|
+
end
|
813
|
+
end
|
814
|
+
|
720
815
|
# Invalid back reference. Used for name and number refs/calls.
|
721
816
|
class InvalidBackrefError < ValidationError
|
722
817
|
def initialize(what, reason)
|
@@ -737,18 +832,29 @@ module Regexp::Scanner
|
|
737
832
|
#
|
738
833
|
# This method may raise errors if a syntax error is encountered.
|
739
834
|
# --------------------------------------------------------------------------
|
740
|
-
def self.scan(
|
835
|
+
def self.scan(input_object, &block)
|
741
836
|
top, stack = 0, []
|
742
837
|
|
743
|
-
|
838
|
+
if input_object.is_a?(Regexp)
|
839
|
+
input = input_object.source
|
840
|
+
@free_spacing = (input_object.options & Regexp::EXTENDED != 0)
|
841
|
+
else
|
842
|
+
input = input_object
|
843
|
+
@free_spacing = false
|
844
|
+
end
|
845
|
+
|
846
|
+
|
744
847
|
data = input.unpack("c*") if input.is_a?(String)
|
745
848
|
eof = data.length
|
746
849
|
|
747
850
|
@tokens = []
|
748
851
|
@block = block_given? ? block : nil
|
749
852
|
|
750
|
-
in_group, group_depth = false, 0
|
853
|
+
@in_group, @group_depth = false, 0
|
854
|
+
@in_options, @spacing_stack = false, [[@free_spacing, 0]]
|
855
|
+
|
751
856
|
in_set, set_depth, set_type = false, 0, :set
|
857
|
+
in_conditional, conditional_depth, conditional_stack = false, 0, []
|
752
858
|
|
753
859
|
%% write init;
|
754
860
|
%% write exec;
|
@@ -759,7 +865,7 @@ module Regexp::Scanner
|
|
759
865
|
end
|
760
866
|
|
761
867
|
raise PrematureEndError.new("(missing group closing paranthesis) "+
|
762
|
-
"[#{in_group}:#{group_depth}]") if in_group
|
868
|
+
"[#{@in_group}:#{@group_depth}]") if @in_group
|
763
869
|
raise PrematureEndError.new("(missing set closing bracket) "+
|
764
870
|
"[#{in_set}:#{set_depth}]") if in_set
|
765
871
|
|
@@ -779,13 +885,19 @@ module Regexp::Scanner
|
|
779
885
|
|
780
886
|
options_char, options_length = true, 0
|
781
887
|
|
782
|
-
# Copy while we have option characters
|
783
|
-
#
|
784
|
-
|
888
|
+
# Copy while we have option characters. There is no maximum length,
|
889
|
+
# as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
|
890
|
+
negative_options = false
|
891
|
+
while options_char
|
785
892
|
if data[te + options_length]
|
786
893
|
c = data[te + options_length].chr
|
787
894
|
|
788
|
-
if c =~ /[-
|
895
|
+
if c =~ /[-mixdau]/
|
896
|
+
negative_options = true if c == '-'
|
897
|
+
|
898
|
+
raise InvalidGroupOption.new(c, text) if negative_options and
|
899
|
+
c =~ /[dau]/
|
900
|
+
|
789
901
|
text << c ; p += 1 ; options_length += 1
|
790
902
|
else
|
791
903
|
options_char = false
|
@@ -801,11 +913,11 @@ module Regexp::Scanner
|
|
801
913
|
if c == ':'
|
802
914
|
# Include the ':' in the options text
|
803
915
|
text << c ; p += 1 ; options_length += 1
|
804
|
-
|
916
|
+
emit_options(text, ts, te + options_length)
|
805
917
|
|
806
918
|
elsif c == ')'
|
807
919
|
# Don't include the closing ')', let group_close handle it.
|
808
|
-
|
920
|
+
emit_options(text, ts, te + options_length)
|
809
921
|
|
810
922
|
else
|
811
923
|
# Plain Regexp reports this as 'undefined group option'
|
@@ -849,6 +961,27 @@ module Regexp::Scanner
|
|
849
961
|
emit(:literal, :literal, text, ts, te)
|
850
962
|
end
|
851
963
|
|
964
|
+
def self.emit_options(text, ts, te)
|
965
|
+
if text =~ /\(\?([mixdau]+)?-?([mix]+)?:/
|
966
|
+
positive, negative = $1, $2
|
967
|
+
|
968
|
+
if positive =~ /x/
|
969
|
+
@free_spacing = true
|
970
|
+
end
|
971
|
+
|
972
|
+
# If the x appears in both, treat it like ruby does, the second cancels
|
973
|
+
# the first.
|
974
|
+
if negative =~ /x/
|
975
|
+
@free_spacing = false
|
976
|
+
end
|
977
|
+
end
|
978
|
+
|
979
|
+
@in_options = true
|
980
|
+
@spacing_stack << [@free_spacing, @group_depth]
|
981
|
+
|
982
|
+
emit(:group, :options, text, ts, te)
|
983
|
+
end
|
984
|
+
|
852
985
|
# Emits an array with the details of the scanned pattern
|
853
986
|
def self.emit(type, token, text, ts, te)
|
854
987
|
#puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
|
@@ -876,13 +1009,7 @@ module Regexp::Scanner
|
|
876
1009
|
error = ValidationError.new('expression')
|
877
1010
|
end
|
878
1011
|
|
879
|
-
|
880
|
-
# errors as warnings or ignore them
|
881
|
-
if false # @@config.validation_warn
|
882
|
-
$stderr.puts error.to_s # unless @@config.validation_ignore
|
883
|
-
else
|
884
|
-
raise error # unless @@config.validation_ignore
|
885
|
-
end
|
1012
|
+
raise error # unless @@config.validation_ignore
|
886
1013
|
end
|
887
1014
|
|
888
1015
|
# Used for references with an empty name or number
|