regexp_parser 0.1.6 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (84) hide show
  1. checksums.yaml +4 -4
  2. data/ChangeLog +57 -0
  3. data/Gemfile +8 -0
  4. data/LICENSE +1 -1
  5. data/README.md +225 -206
  6. data/Rakefile +9 -3
  7. data/lib/regexp_parser.rb +7 -11
  8. data/lib/regexp_parser/expression.rb +72 -14
  9. data/lib/regexp_parser/expression/classes/alternation.rb +3 -16
  10. data/lib/regexp_parser/expression/classes/conditional.rb +57 -0
  11. data/lib/regexp_parser/expression/classes/free_space.rb +17 -0
  12. data/lib/regexp_parser/expression/classes/keep.rb +7 -0
  13. data/lib/regexp_parser/expression/classes/set.rb +28 -7
  14. data/lib/regexp_parser/expression/methods/strfregexp.rb +113 -0
  15. data/lib/regexp_parser/expression/methods/tests.rb +116 -0
  16. data/lib/regexp_parser/expression/methods/traverse.rb +63 -0
  17. data/lib/regexp_parser/expression/quantifier.rb +10 -0
  18. data/lib/regexp_parser/expression/sequence.rb +45 -0
  19. data/lib/regexp_parser/expression/subexpression.rb +29 -1
  20. data/lib/regexp_parser/lexer.rb +31 -8
  21. data/lib/regexp_parser/parser.rb +118 -45
  22. data/lib/regexp_parser/scanner.rb +1745 -1404
  23. data/lib/regexp_parser/scanner/property.rl +57 -3
  24. data/lib/regexp_parser/scanner/scanner.rl +161 -34
  25. data/lib/regexp_parser/syntax.rb +12 -2
  26. data/lib/regexp_parser/syntax/ruby/1.9.1.rb +3 -3
  27. data/lib/regexp_parser/syntax/ruby/1.9.3.rb +2 -7
  28. data/lib/regexp_parser/syntax/ruby/2.0.0.rb +4 -1
  29. data/lib/regexp_parser/syntax/ruby/2.1.4.rb +13 -0
  30. data/lib/regexp_parser/syntax/ruby/2.1.5.rb +13 -0
  31. data/lib/regexp_parser/syntax/ruby/2.1.rb +2 -2
  32. data/lib/regexp_parser/syntax/ruby/2.2.0.rb +16 -0
  33. data/lib/regexp_parser/syntax/ruby/2.2.rb +8 -0
  34. data/lib/regexp_parser/syntax/tokens.rb +19 -2
  35. data/lib/regexp_parser/syntax/tokens/conditional.rb +22 -0
  36. data/lib/regexp_parser/syntax/tokens/keep.rb +14 -0
  37. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +45 -4
  38. data/lib/regexp_parser/token.rb +23 -8
  39. data/lib/regexp_parser/version.rb +5 -0
  40. data/regexp_parser.gemspec +35 -0
  41. data/test/expression/test_all.rb +6 -1
  42. data/test/expression/test_base.rb +19 -0
  43. data/test/expression/test_conditionals.rb +114 -0
  44. data/test/expression/test_free_space.rb +33 -0
  45. data/test/expression/test_set.rb +61 -0
  46. data/test/expression/test_strfregexp.rb +214 -0
  47. data/test/expression/test_subexpression.rb +24 -0
  48. data/test/expression/test_tests.rb +99 -0
  49. data/test/expression/test_to_h.rb +48 -0
  50. data/test/expression/test_to_s.rb +46 -0
  51. data/test/expression/test_traverse.rb +164 -0
  52. data/test/lexer/test_all.rb +16 -3
  53. data/test/lexer/test_conditionals.rb +101 -0
  54. data/test/lexer/test_keep.rb +24 -0
  55. data/test/lexer/test_literals.rb +51 -51
  56. data/test/lexer/test_nesting.rb +62 -62
  57. data/test/lexer/test_refcalls.rb +18 -20
  58. data/test/parser/test_all.rb +18 -3
  59. data/test/parser/test_alternation.rb +11 -14
  60. data/test/parser/test_conditionals.rb +148 -0
  61. data/test/parser/test_escapes.rb +29 -5
  62. data/test/parser/test_free_space.rb +139 -0
  63. data/test/parser/test_groups.rb +40 -0
  64. data/test/parser/test_keep.rb +21 -0
  65. data/test/scanner/test_all.rb +8 -2
  66. data/test/scanner/test_conditionals.rb +166 -0
  67. data/test/scanner/test_escapes.rb +8 -5
  68. data/test/scanner/test_free_space.rb +133 -0
  69. data/test/scanner/test_groups.rb +28 -0
  70. data/test/scanner/test_keep.rb +33 -0
  71. data/test/scanner/test_properties.rb +4 -0
  72. data/test/scanner/test_scripts.rb +71 -1
  73. data/test/syntax/ruby/test_1.9.3.rb +2 -2
  74. data/test/syntax/ruby/test_2.0.0.rb +38 -0
  75. data/test/syntax/ruby/test_2.2.0.rb +38 -0
  76. data/test/syntax/ruby/test_all.rb +1 -8
  77. data/test/syntax/ruby/test_files.rb +104 -0
  78. data/test/test_all.rb +2 -1
  79. data/test/token/test_all.rb +2 -0
  80. data/test/token/test_token.rb +109 -0
  81. metadata +75 -21
  82. data/VERSION.yml +0 -5
  83. data/lib/regexp_parser/ctype.rb +0 -48
  84. data/test/syntax/ruby/test_2.x.rb +0 -46
@@ -11,7 +11,6 @@
11
11
  'cntrl'i | 'digit'i | 'graph'i | 'lower'i | 'print'i |
12
12
  'punct'i | 'space'i | 'upper'i | 'word'i | 'xdigit'i;
13
13
 
14
- # TODO: are these case-insensitive?
15
14
  property_name_posix = 'any'i | 'assigned'i | 'newline'i;
16
15
 
17
16
  property_name = property_name_unicode | property_name_posix;
@@ -39,7 +38,9 @@
39
38
  property_age = 'age=1.1'i | 'age=2.0'i | 'age=2.1'i |
40
39
  'age=3.0'i | 'age=3.1'i | 'age=3.2'i |
41
40
  'age=4.0'i | 'age=4.1'i | 'age=5.0'i |
42
- 'age=5.1'i | 'age=5.2'i | 'age=6.0'i;
41
+ 'age=5.1'i | 'age=5.2'i | 'age=6.0'i |
42
+ 'age=6.1'i | 'age=6.2'i | 'age=6.3'i |
43
+ 'age=7.0'i;
43
44
 
44
45
  property_script = (alpha | space | '_')+; # everything else
45
46
 
@@ -222,6 +223,14 @@
222
223
  self.emit(type, :age_5_2, text, ts-1, te)
223
224
  when 'age=6.0'
224
225
  self.emit(type, :age_6_0, text, ts-1, te)
226
+ when 'age=6.1'
227
+ self.emit(type, :age_6_1, text, ts-1, te)
228
+ when 'age=6.2'
229
+ self.emit(type, :age_6_2, text, ts-1, te)
230
+ when 'age=6.3'
231
+ self.emit(type, :age_6_3, text, ts-1, te)
232
+ when 'age=7.0'
233
+ self.emit(type, :age_7_0, text, ts-1, te)
225
234
 
226
235
  # Derived Properties
227
236
  when 'ahex', 'asciihexdigit'
@@ -327,8 +336,9 @@
327
336
  when 'xidc', 'xidcontinue'
328
337
  self.emit(type, :xid_continue, text, ts-1, te)
329
338
 
330
-
331
339
  # Scripts
340
+ when 'aghb', 'caucasianalbanian'
341
+ self.emit(type, :script_caucasian_albanian, text, ts-1, te)
332
342
  when 'arab', 'arabic'
333
343
  self.emit(type, :script_arabic, text, ts-1, te)
334
344
  when 'armi', 'imperialaramaic'
@@ -341,6 +351,8 @@
341
351
  self.emit(type, :script_balinese, text, ts-1, te)
342
352
  when 'bamu', 'bamum'
343
353
  self.emit(type, :script_bamum, text, ts-1, te)
354
+ when 'bass', 'bassavah'
355
+ self.emit(type, :script_bassa_vah, text, ts-1, te)
344
356
  when 'batk', 'batak'
345
357
  self.emit(type, :script_batak, text, ts-1, te)
346
358
  when 'beng', 'bengali'
@@ -373,8 +385,12 @@
373
385
  self.emit(type, :script_devanagari, text, ts-1, te)
374
386
  when 'dsrt', 'deseret'
375
387
  self.emit(type, :script_deseret, text, ts-1, te)
388
+ when 'dupl', 'duployan'
389
+ self.emit(type, :script_duployan, text, ts-1, te)
376
390
  when 'egyp', 'egyptianhieroglyphs'
377
391
  self.emit(type, :script_egyptian_hieroglyphs, text, ts-1, te)
392
+ when 'elba', 'elbasan'
393
+ self.emit(type, :script_elbasan, text, ts-1, te)
378
394
  when 'ethi', 'ethiopic'
379
395
  self.emit(type, :script_ethiopic, text, ts-1, te)
380
396
  when 'geor', 'georgian'
@@ -383,6 +399,8 @@
383
399
  self.emit(type, :script_glagolitic, text, ts-1, te)
384
400
  when 'goth', 'gothic'
385
401
  self.emit(type, :script_gothic, text, ts-1, te)
402
+ when 'gran', 'grantha'
403
+ self.emit(type, :script_grantha, text, ts-1, te)
386
404
  when 'grek', 'greek'
387
405
  self.emit(type, :script_greek, text, ts-1, te)
388
406
  when 'gujr', 'gujarati'
@@ -399,6 +417,8 @@
399
417
  self.emit(type, :script_hebrew, text, ts-1, te)
400
418
  when 'hira', 'hiragana'
401
419
  self.emit(type, :script_hiragana, text, ts-1, te)
420
+ when 'hmng', 'pahawhhmong'
421
+ self.emit(type, :script_pahawh_hmong, text, ts-1, te)
402
422
  when 'hrkt', 'katakanaorhiragana'
403
423
  self.emit(type, :script_katakana_or_hiragana, text, ts-1, te)
404
424
  when 'ital', 'olditalic'
@@ -413,6 +433,8 @@
413
433
  self.emit(type, :script_kharoshthi, text, ts-1, te)
414
434
  when 'khmr', 'khmer'
415
435
  self.emit(type, :script_khmer, text, ts-1, te)
436
+ when 'khoj', 'khojki'
437
+ self.emit(type, :script_khojki, text, ts-1, te)
416
438
  when 'knda', 'kannada'
417
439
  self.emit(type, :script_kannada, text, ts-1, te)
418
440
  when 'kthi', 'kaithi'
@@ -427,6 +449,8 @@
427
449
  self.emit(type, :script_lepcha, text, ts-1, te)
428
450
  when 'limb', 'limbu'
429
451
  self.emit(type, :script_limbu, text, ts-1, te)
452
+ when 'lina', 'lineara'
453
+ self.emit(type, :script_linear_a, text, ts-1, te)
430
454
  when 'linb', 'linearb'
431
455
  self.emit(type, :script_linear_b, text, ts-1, te)
432
456
  when 'lisu'
@@ -437,14 +461,28 @@
437
461
  self.emit(type, :script_lydian, text, ts-1, te)
438
462
  when 'mlym', 'malayalam'
439
463
  self.emit(type, :script_malayalam, text, ts-1, te)
464
+ when 'mahj', 'mahajani'
465
+ self.emit(type, :script_mahajani, text, ts-1, te)
440
466
  when 'mand', 'mandaic'
441
467
  self.emit(type, :script_mandaic, text, ts-1, te)
468
+ when 'mani', 'manichaean'
469
+ self.emit(type, :script_manichaean, text, ts-1, te)
470
+ when 'mend', 'mendekikakui'
471
+ self.emit(type, :script_mende_kikakui, text, ts-1, te)
472
+ when 'modi'
473
+ self.emit(type, :script_modi, text, ts-1, te)
442
474
  when 'mong', 'mongolian'
443
475
  self.emit(type, :script_mongolian, text, ts-1, te)
476
+ when 'mroo', 'mro'
477
+ self.emit(type, :script_mro, text, ts-1, te)
444
478
  when 'mtei', 'meeteimayek'
445
479
  self.emit(type, :script_meetei_mayek, text, ts-1, te)
446
480
  when 'mymr', 'myanmar'
447
481
  self.emit(type, :script_myanmar, text, ts-1, te)
482
+ when 'narb', 'oldnortharabian'
483
+ self.emit(type, :script_old_north_arabian, text, ts-1, te)
484
+ when 'nbat', 'nabataean'
485
+ self.emit(type, :script_nabataean, text, ts-1, te)
448
486
  when 'nkoo', 'nko'
449
487
  self.emit(type, :script_nko, text, ts-1, te)
450
488
  when 'ogam', 'ogham'
@@ -457,10 +495,18 @@
457
495
  self.emit(type, :script_oriya, text, ts-1, te)
458
496
  when 'osma', 'osmanya'
459
497
  self.emit(type, :script_osmanya, text, ts-1, te)
498
+ when 'palm', 'palmyrene'
499
+ self.emit(type, :script_palmyrene, text, ts-1, te)
500
+ when 'pauc', 'paucinhau'
501
+ self.emit(type, :script_pau_cin_hau, text, ts-1, te)
502
+ when 'perm', 'oldpermic'
503
+ self.emit(type, :script_old_permic, text, ts-1, te)
460
504
  when 'phag', 'phagspa'
461
505
  self.emit(type, :script_phags_pa, text, ts-1, te)
462
506
  when 'phli', 'inscriptionalpahlavi'
463
507
  self.emit(type, :script_inscriptional_pahlavi, text, ts-1, te)
508
+ when 'phlp', 'psalterpahlavi'
509
+ self.emit(type, :script_psalter_pahlavi, text, ts-1, te)
464
510
  when 'phnx', 'phoenician'
465
511
  self.emit(type, :script_phoenician, text, ts-1, te)
466
512
  when 'prti', 'inscriptionalparthian'
@@ -477,6 +523,10 @@
477
523
  self.emit(type, :script_saurashtra, text, ts-1, te)
478
524
  when 'shaw', 'shavian'
479
525
  self.emit(type, :script_shavian, text, ts-1, te)
526
+ when 'sidd', 'siddham'
527
+ self.emit(type, :script_siddham, text, ts-1, te)
528
+ when 'sind', 'khudawadi'
529
+ self.emit(type, :script_khudawadi, text, ts-1, te)
480
530
  when 'sinh', 'sinhala'
481
531
  self.emit(type, :script_sinhala, text, ts-1, te)
482
532
  when 'sund', 'sundanese'
@@ -507,10 +557,14 @@
507
557
  self.emit(type, :script_thai, text, ts-1, te)
508
558
  when 'tibt', 'tibetan'
509
559
  self.emit(type, :script_tibetan, text, ts-1, te)
560
+ when 'tirh', 'tirhuta'
561
+ self.emit(type, :script_tirhuta, text, ts-1, te)
510
562
  when 'ugar', 'ugaritic'
511
563
  self.emit(type, :script_ugaritic, text, ts-1, te)
512
564
  when 'vaii', 'vai'
513
565
  self.emit(type, :script_vai, text, ts-1, te)
566
+ when 'wara', 'warangciti'
567
+ self.emit(type, :script_warang_citi, text, ts-1, te)
514
568
  when 'xpeo', 'oldpersian'
515
569
  self.emit(type, :script_old_persian, text, ts-1, te)
516
570
  when 'xsux', 'cuneiform'
@@ -20,6 +20,8 @@
20
20
  set_close = ']';
21
21
  brackets = set_open | set_close;
22
22
 
23
+ comment = ('#' . [^\n]* . '\n');
24
+
23
25
  class_name_posix = 'alnum' | 'alpha' | 'blank' |
24
26
  'cntrl' | 'digit' | 'graph' |
25
27
  'lower' | 'print' | 'punct' |
@@ -74,6 +76,8 @@
74
76
  quantifier_possessive | quantifier_interval;
75
77
 
76
78
 
79
+ conditional = '(?(';
80
+
77
81
  group_comment = '?#' . [^)]+ . group_close;
78
82
 
79
83
  group_atomic = '?>';
@@ -84,23 +88,28 @@
84
88
  assertion_lookbehind = '?<=';
85
89
  assertion_nlookbehind = '?<!';
86
90
 
87
- group_options = '?' . [\-mix];
91
+ group_options = '?' . [\-mixdau];
88
92
 
89
93
  group_ref = [gk];
90
- group_name = (alnum . (alnum+)?)?;
94
+ group_name_char = (alnum | '_');
95
+ group_name_id = (group_name_char . (group_name_char+)?)?;
91
96
  group_number = '-'? . [1-9] . ([0-9]+)?;
92
97
  group_level = [+\-] . [0-9]+;
93
98
 
94
- group_named = ('?<' . group_name . '>') | ("?'" . group_name . "'");
99
+ group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
100
+ group_lookup = group_name | group_number;
95
101
 
96
- group_name_ref = group_ref . (('<' . group_name . group_level? '>') |
97
- ("'" . group_name . group_level? "'"));
102
+ group_named = ('?' . group_name );
103
+
104
+ group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
105
+ ("'" . group_name_id . group_level? "'"));
98
106
 
99
107
  group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
100
108
  ("'" . group_number . group_level? "'"));
101
109
 
102
110
  group_type = group_atomic | group_passive | group_named;
103
111
 
112
+
104
113
  assertion_type = assertion_lookahead | assertion_nlookahead |
105
114
  assertion_lookbehind | assertion_nlookbehind;
106
115
 
@@ -133,8 +142,8 @@
133
142
  }
134
143
 
135
144
  # group (nesting) and set open/close actions
136
- action group_opened { group_depth += 1; in_group = true }
137
- action group_closed { group_depth -= 1; in_group = group_depth > 0 ? true : false }
145
+ action group_opened { @group_depth += 1; @in_group = true }
146
+ action group_closed { @group_depth -= 1; @in_group = @group_depth > 0 ? true : false }
138
147
 
139
148
  # Character set scanner, continues consuming characters until it meets the
140
149
  # closing bracket of the set.
@@ -410,6 +419,22 @@
410
419
  *|;
411
420
 
412
421
 
422
+ # conditional expressions scanner
423
+ # --------------------------------------------------------------------------
424
+ conditional_expression := |*
425
+ group_lookup . ')' {
426
+ text = text(data, ts, te-1).first
427
+ emit(:conditional, :condition, text, ts, te-1)
428
+ emit(:conditional, :condition_close, ')', te-1, te)
429
+ };
430
+
431
+ any {
432
+ fhold;
433
+ fcall main;
434
+ };
435
+ *|;
436
+
437
+
413
438
  # Main scanner
414
439
  # --------------------------------------------------------------------------
415
440
  main := |*
@@ -421,7 +446,12 @@
421
446
  };
422
447
 
423
448
  alternation {
424
- emit(:meta, :alternation, *text(data, ts, te))
449
+ if in_conditional and conditional_stack.length > 0 and
450
+ conditional_stack.last[1] == @group_depth
451
+ emit(:conditional, :separator, *text(data, ts, te))
452
+ else
453
+ emit(:meta, :alternation, *text(data, ts, te))
454
+ end
425
455
  };
426
456
 
427
457
  # Anchors
@@ -434,6 +464,10 @@
434
464
  emit(:anchor, :eol, *text(data, ts, te))
435
465
  };
436
466
 
467
+ backslash . 'K' > (backslashed, 4) {
468
+ emit(:keep, :mark, *text(data, ts, te))
469
+ };
470
+
437
471
  backslash . anchor_char > (backslashed, 3) {
438
472
  case text = text(data, ts, te).first
439
473
  when '\\A'; emit(:anchor, :bos, text, ts, te)
@@ -481,6 +515,23 @@
481
515
  fcall character_set;
482
516
  };
483
517
 
518
+
519
+ # Conditional expression
520
+ # (?(condition)Y|N) conditional expression
521
+ # ------------------------------------------------------------------------
522
+ conditional {
523
+ text = text(data, ts, te).first
524
+
525
+ in_conditional = true unless in_conditional
526
+ conditional_depth += 1
527
+ conditional_stack << [conditional_depth, @group_depth]
528
+
529
+ emit(:conditional, :open, text[0..-2], ts, te-1)
530
+ emit(:conditional, :condition_open, '(', te-1, te)
531
+ fcall conditional_expression;
532
+ };
533
+
534
+
484
535
  # (?#...) comments: parsed as a single expression, without introducing a
485
536
  # new nesting level. Comments may not include parentheses, escaped or not.
486
537
  # special case for close, action performed on all transitions to get the
@@ -491,12 +542,15 @@
491
542
  };
492
543
 
493
544
  # Expression options:
494
- # (?imx-imx) option on/off
545
+ # (?imxdau-imx) option on/off
495
546
  # i: ignore case
496
547
  # m: multi-line (dot(.) match newline)
497
548
  # x: extended form
549
+ # d: default class rules (1.9 compatible)
550
+ # a: ASCII class rules (\s, \w, etc.)
551
+ # u: Unicode class rules (\s, \w, etc.)
498
552
  #
499
- # (?imx-imx:subexp) option on/off for subexp
553
+ # (?imxdau-imx:subexp) option on/off for subexp
500
554
  # ------------------------------------------------------------------------
501
555
  group_open . group_options >group_opened {
502
556
  p = scan_options(p, data, ts, te)
@@ -551,7 +605,29 @@
551
605
  };
552
606
 
553
607
  group_close @group_closed {
554
- emit(:group, :close, *text(data, ts, te))
608
+ if in_conditional and conditional_stack.last and
609
+ conditional_stack.last[1] == (@group_depth + 1)
610
+
611
+ emit(:conditional, :close, *text(data, ts, te))
612
+ conditional_stack.pop
613
+
614
+ if conditional_stack.length == 0
615
+ in_conditional = false
616
+ end
617
+ else
618
+ if @spacing_stack.length > 1 and
619
+ @spacing_stack.last[1] == (@group_depth + 1)
620
+ @spacing_stack.pop
621
+
622
+ @free_spacing = @spacing_stack.last[0]
623
+
624
+ if @spacing_stack.length == 1
625
+ @in_options = false
626
+ end
627
+ end
628
+
629
+ emit(:group, :close, *text(data, ts, te))
630
+ end
555
631
  };
556
632
 
557
633
 
@@ -662,10 +738,26 @@
662
738
  fcall escape_sequence;
663
739
  };
664
740
 
741
+ comment {
742
+ if @free_spacing
743
+ emit(:free_space, :comment, *text(data, ts, te))
744
+ else
745
+ append_literal(data, ts, te)
746
+ end
747
+ };
748
+
749
+ space+ {
750
+ if @free_spacing
751
+ emit(:free_space, :whitespace, *text(data, ts, te))
752
+ else
753
+ append_literal(data, ts, te)
754
+ end
755
+ };
756
+
665
757
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
666
758
  # except meta characters.
667
759
  # ------------------------------------------------------------------------
668
- ascii_print+ |
760
+ (ascii_print -- space)+ |
669
761
  ascii_nonprint+ |
670
762
  utf8_2_byte+ |
671
763
  utf8_3_byte+ |
@@ -683,11 +775,7 @@ module Regexp::Scanner
683
775
  %% write data;
684
776
 
685
777
  # General scanner error (catch all)
686
- class ScannerError < StandardError
687
- def initialize(what)
688
- super what
689
- end
690
- end
778
+ class ScannerError < StandardError; end
691
779
 
692
780
  # Base for all scanner validation errors
693
781
  class ValidationError < StandardError
@@ -717,6 +805,13 @@ module Regexp::Scanner
717
805
  end
718
806
  end
719
807
 
808
+ # Invalid group option. Used for inline options.
809
+ class InvalidGroupOption < ValidationError
810
+ def initialize(option, text)
811
+ super "Invalid group option #{option} in #{text}"
812
+ end
813
+ end
814
+
720
815
  # Invalid back reference. Used for named and numbered refs/calls.
721
816
  class InvalidBackrefError < ValidationError
722
817
  def initialize(what, reason)
@@ -737,18 +832,29 @@ module Regexp::Scanner
737
832
  #
738
833
  # This method may raise errors if a syntax error is encountered.
739
834
  # --------------------------------------------------------------------------
740
- def self.scan(input, &block)
835
+ def self.scan(input_object, &block)
741
836
  top, stack = 0, []
742
837
 
743
- input = input.source if input.is_a?(Regexp)
838
+ if input_object.is_a?(Regexp)
839
+ input = input_object.source
840
+ @free_spacing = (input_object.options & Regexp::EXTENDED != 0)
841
+ else
842
+ input = input_object
843
+ @free_spacing = false
844
+ end
845
+
846
+
744
847
  data = input.unpack("c*") if input.is_a?(String)
745
848
  eof = data.length
746
849
 
747
850
  @tokens = []
748
851
  @block = block_given? ? block : nil
749
852
 
750
- in_group, group_depth = false, 0
853
+ @in_group, @group_depth = false, 0
854
+ @in_options, @spacing_stack = false, [[@free_spacing, 0]]
855
+
751
856
  in_set, set_depth, set_type = false, 0, :set
857
+ in_conditional, conditional_depth, conditional_stack = false, 0, []
752
858
 
753
859
  %% write init;
754
860
  %% write exec;
@@ -759,7 +865,7 @@ module Regexp::Scanner
759
865
  end
760
866
 
761
867
  raise PrematureEndError.new("(missing group closing paranthesis) "+
762
- "[#{in_group}:#{group_depth}]") if in_group
868
+ "[#{@in_group}:#{@group_depth}]") if @in_group
763
869
  raise PrematureEndError.new("(missing set closing bracket) "+
764
870
  "[#{in_set}:#{set_depth}]") if in_set
765
871
 
@@ -779,13 +885,19 @@ module Regexp::Scanner
779
885
 
780
886
  options_char, options_length = true, 0
781
887
 
782
- # Copy while we have option characters, the maximum is 7, for (?mix-mix,
783
- # even though it doesn't make sense it is possible.
784
- while options_char and options_length < 7
888
+ # Copy while we have option characters. There is no maximum length,
889
+ # as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
890
+ negative_options = false
891
+ while options_char
785
892
  if data[te + options_length]
786
893
  c = data[te + options_length].chr
787
894
 
788
- if c =~ /[-mix]/
895
+ if c =~ /[-mixdau]/
896
+ negative_options = true if c == '-'
897
+
898
+ raise InvalidGroupOption.new(c, text) if negative_options and
899
+ c =~ /[dau]/
900
+
789
901
  text << c ; p += 1 ; options_length += 1
790
902
  else
791
903
  options_char = false
@@ -801,11 +913,11 @@ module Regexp::Scanner
801
913
  if c == ':'
802
914
  # Include the ':' in the options text
803
915
  text << c ; p += 1 ; options_length += 1
804
- emit(:group, :options, text, ts, te + options_length)
916
+ emit_options(text, ts, te + options_length)
805
917
 
806
918
  elsif c == ')'
807
919
  # Don't include the closing ')', let group_close handle it.
808
- emit(:group, :options, text, ts, te + options_length)
920
+ emit_options(text, ts, te + options_length)
809
921
 
810
922
  else
811
923
  # Plain Regexp reports this as 'undefined group option'
@@ -849,6 +961,27 @@ module Regexp::Scanner
849
961
  emit(:literal, :literal, text, ts, te)
850
962
  end
851
963
 
964
+ def self.emit_options(text, ts, te)
965
+ if text =~ /\(\?([mixdau]+)?-?([mix]+)?:/
966
+ positive, negative = $1, $2
967
+
968
+ if positive =~ /x/
969
+ @free_spacing = true
970
+ end
971
+
972
+ # If the x appears in both, treat it like ruby does, the second cancels
973
+ # the first.
974
+ if negative =~ /x/
975
+ @free_spacing = false
976
+ end
977
+ end
978
+
979
+ @in_options = true
980
+ @spacing_stack << [@free_spacing, @group_depth]
981
+
982
+ emit(:group, :options, text, ts, te)
983
+ end
984
+
852
985
  # Emits an array with the details of the scanned pattern
853
986
  def self.emit(type, token, text, ts, te)
854
987
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
@@ -876,13 +1009,7 @@ module Regexp::Scanner
876
1009
  error = ValidationError.new('expression')
877
1010
  end
878
1011
 
879
- # TODO: configuration option to treat scanner level validation
880
- # errors as warnings or ignore them
881
- if false # @@config.validation_warn
882
- $stderr.puts error.to_s # unless @@config.validation_ignore
883
- else
884
- raise error # unless @@config.validation_ignore
885
- end
1012
+ raise error # unless @@config.validation_ignore
886
1013
  end
887
1014
 
888
1015
  # Used for references with an empty name or number