regexp_parser 0.1.6 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. checksums.yaml +4 -4
  2. data/ChangeLog +57 -0
  3. data/Gemfile +8 -0
  4. data/LICENSE +1 -1
  5. data/README.md +225 -206
  6. data/Rakefile +9 -3
  7. data/lib/regexp_parser.rb +7 -11
  8. data/lib/regexp_parser/expression.rb +72 -14
  9. data/lib/regexp_parser/expression/classes/alternation.rb +3 -16
  10. data/lib/regexp_parser/expression/classes/conditional.rb +57 -0
  11. data/lib/regexp_parser/expression/classes/free_space.rb +17 -0
  12. data/lib/regexp_parser/expression/classes/keep.rb +7 -0
  13. data/lib/regexp_parser/expression/classes/set.rb +28 -7
  14. data/lib/regexp_parser/expression/methods/strfregexp.rb +113 -0
  15. data/lib/regexp_parser/expression/methods/tests.rb +116 -0
  16. data/lib/regexp_parser/expression/methods/traverse.rb +63 -0
  17. data/lib/regexp_parser/expression/quantifier.rb +10 -0
  18. data/lib/regexp_parser/expression/sequence.rb +45 -0
  19. data/lib/regexp_parser/expression/subexpression.rb +29 -1
  20. data/lib/regexp_parser/lexer.rb +31 -8
  21. data/lib/regexp_parser/parser.rb +118 -45
  22. data/lib/regexp_parser/scanner.rb +1745 -1404
  23. data/lib/regexp_parser/scanner/property.rl +57 -3
  24. data/lib/regexp_parser/scanner/scanner.rl +161 -34
  25. data/lib/regexp_parser/syntax.rb +12 -2
  26. data/lib/regexp_parser/syntax/ruby/1.9.1.rb +3 -3
  27. data/lib/regexp_parser/syntax/ruby/1.9.3.rb +2 -7
  28. data/lib/regexp_parser/syntax/ruby/2.0.0.rb +4 -1
  29. data/lib/regexp_parser/syntax/ruby/2.1.4.rb +13 -0
  30. data/lib/regexp_parser/syntax/ruby/2.1.5.rb +13 -0
  31. data/lib/regexp_parser/syntax/ruby/2.1.rb +2 -2
  32. data/lib/regexp_parser/syntax/ruby/2.2.0.rb +16 -0
  33. data/lib/regexp_parser/syntax/ruby/2.2.rb +8 -0
  34. data/lib/regexp_parser/syntax/tokens.rb +19 -2
  35. data/lib/regexp_parser/syntax/tokens/conditional.rb +22 -0
  36. data/lib/regexp_parser/syntax/tokens/keep.rb +14 -0
  37. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +45 -4
  38. data/lib/regexp_parser/token.rb +23 -8
  39. data/lib/regexp_parser/version.rb +5 -0
  40. data/regexp_parser.gemspec +35 -0
  41. data/test/expression/test_all.rb +6 -1
  42. data/test/expression/test_base.rb +19 -0
  43. data/test/expression/test_conditionals.rb +114 -0
  44. data/test/expression/test_free_space.rb +33 -0
  45. data/test/expression/test_set.rb +61 -0
  46. data/test/expression/test_strfregexp.rb +214 -0
  47. data/test/expression/test_subexpression.rb +24 -0
  48. data/test/expression/test_tests.rb +99 -0
  49. data/test/expression/test_to_h.rb +48 -0
  50. data/test/expression/test_to_s.rb +46 -0
  51. data/test/expression/test_traverse.rb +164 -0
  52. data/test/lexer/test_all.rb +16 -3
  53. data/test/lexer/test_conditionals.rb +101 -0
  54. data/test/lexer/test_keep.rb +24 -0
  55. data/test/lexer/test_literals.rb +51 -51
  56. data/test/lexer/test_nesting.rb +62 -62
  57. data/test/lexer/test_refcalls.rb +18 -20
  58. data/test/parser/test_all.rb +18 -3
  59. data/test/parser/test_alternation.rb +11 -14
  60. data/test/parser/test_conditionals.rb +148 -0
  61. data/test/parser/test_escapes.rb +29 -5
  62. data/test/parser/test_free_space.rb +139 -0
  63. data/test/parser/test_groups.rb +40 -0
  64. data/test/parser/test_keep.rb +21 -0
  65. data/test/scanner/test_all.rb +8 -2
  66. data/test/scanner/test_conditionals.rb +166 -0
  67. data/test/scanner/test_escapes.rb +8 -5
  68. data/test/scanner/test_free_space.rb +133 -0
  69. data/test/scanner/test_groups.rb +28 -0
  70. data/test/scanner/test_keep.rb +33 -0
  71. data/test/scanner/test_properties.rb +4 -0
  72. data/test/scanner/test_scripts.rb +71 -1
  73. data/test/syntax/ruby/test_1.9.3.rb +2 -2
  74. data/test/syntax/ruby/test_2.0.0.rb +38 -0
  75. data/test/syntax/ruby/test_2.2.0.rb +38 -0
  76. data/test/syntax/ruby/test_all.rb +1 -8
  77. data/test/syntax/ruby/test_files.rb +104 -0
  78. data/test/test_all.rb +2 -1
  79. data/test/token/test_all.rb +2 -0
  80. data/test/token/test_token.rb +109 -0
  81. metadata +75 -21
  82. data/VERSION.yml +0 -5
  83. data/lib/regexp_parser/ctype.rb +0 -48
  84. data/test/syntax/ruby/test_2.x.rb +0 -46
@@ -11,7 +11,6 @@
11
11
  'cntrl'i | 'digit'i | 'graph'i | 'lower'i | 'print'i |
12
12
  'punct'i | 'space'i | 'upper'i | 'word'i | 'xdigit'i;
13
13
 
14
- # TODO: are these case-insensitive?
15
14
  property_name_posix = 'any'i | 'assigned'i | 'newline'i;
16
15
 
17
16
  property_name = property_name_unicode | property_name_posix;
@@ -39,7 +38,9 @@
39
38
  property_age = 'age=1.1'i | 'age=2.0'i | 'age=2.1'i |
40
39
  'age=3.0'i | 'age=3.1'i | 'age=3.2'i |
41
40
  'age=4.0'i | 'age=4.1'i | 'age=5.0'i |
42
- 'age=5.1'i | 'age=5.2'i | 'age=6.0'i;
41
+ 'age=5.1'i | 'age=5.2'i | 'age=6.0'i |
42
+ 'age=6.1'i | 'age=6.2'i | 'age=6.3'i |
43
+ 'age=7.0'i;
43
44
 
44
45
  property_script = (alpha | space | '_')+; # everything else
45
46
 
@@ -222,6 +223,14 @@
222
223
  self.emit(type, :age_5_2, text, ts-1, te)
223
224
  when 'age=6.0'
224
225
  self.emit(type, :age_6_0, text, ts-1, te)
226
+ when 'age=6.1'
227
+ self.emit(type, :age_6_1, text, ts-1, te)
228
+ when 'age=6.2'
229
+ self.emit(type, :age_6_2, text, ts-1, te)
230
+ when 'age=6.3'
231
+ self.emit(type, :age_6_3, text, ts-1, te)
232
+ when 'age=7.0'
233
+ self.emit(type, :age_7_0, text, ts-1, te)
225
234
 
226
235
  # Derived Properties
227
236
  when 'ahex', 'asciihexdigit'
@@ -327,8 +336,9 @@
327
336
  when 'xidc', 'xidcontinue'
328
337
  self.emit(type, :xid_continue, text, ts-1, te)
329
338
 
330
-
331
339
  # Scripts
340
+ when 'aghb', 'caucasianalbanian'
341
+ self.emit(type, :script_caucasian_albanian, text, ts-1, te)
332
342
  when 'arab', 'arabic'
333
343
  self.emit(type, :script_arabic, text, ts-1, te)
334
344
  when 'armi', 'imperialaramaic'
@@ -341,6 +351,8 @@
341
351
  self.emit(type, :script_balinese, text, ts-1, te)
342
352
  when 'bamu', 'bamum'
343
353
  self.emit(type, :script_bamum, text, ts-1, te)
354
+ when 'bass', 'bassavah'
355
+ self.emit(type, :script_bassa_vah, text, ts-1, te)
344
356
  when 'batk', 'batak'
345
357
  self.emit(type, :script_batak, text, ts-1, te)
346
358
  when 'beng', 'bengali'
@@ -373,8 +385,12 @@
373
385
  self.emit(type, :script_devanagari, text, ts-1, te)
374
386
  when 'dsrt', 'deseret'
375
387
  self.emit(type, :script_deseret, text, ts-1, te)
388
+ when 'dupl', 'duployan'
389
+ self.emit(type, :script_duployan, text, ts-1, te)
376
390
  when 'egyp', 'egyptianhieroglyphs'
377
391
  self.emit(type, :script_egyptian_hieroglyphs, text, ts-1, te)
392
+ when 'elba', 'elbasan'
393
+ self.emit(type, :script_elbasan, text, ts-1, te)
378
394
  when 'ethi', 'ethiopic'
379
395
  self.emit(type, :script_ethiopic, text, ts-1, te)
380
396
  when 'geor', 'georgian'
@@ -383,6 +399,8 @@
383
399
  self.emit(type, :script_glagolitic, text, ts-1, te)
384
400
  when 'goth', 'gothic'
385
401
  self.emit(type, :script_gothic, text, ts-1, te)
402
+ when 'gran', 'grantha'
403
+ self.emit(type, :script_grantha, text, ts-1, te)
386
404
  when 'grek', 'greek'
387
405
  self.emit(type, :script_greek, text, ts-1, te)
388
406
  when 'gujr', 'gujarati'
@@ -399,6 +417,8 @@
399
417
  self.emit(type, :script_hebrew, text, ts-1, te)
400
418
  when 'hira', 'hiragana'
401
419
  self.emit(type, :script_hiragana, text, ts-1, te)
420
+ when 'hmng', 'pahawhhmong'
421
+ self.emit(type, :script_pahawh_hmong, text, ts-1, te)
402
422
  when 'hrkt', 'katakanaorhiragana'
403
423
  self.emit(type, :script_katakana_or_hiragana, text, ts-1, te)
404
424
  when 'ital', 'olditalic'
@@ -413,6 +433,8 @@
413
433
  self.emit(type, :script_kharoshthi, text, ts-1, te)
414
434
  when 'khmr', 'khmer'
415
435
  self.emit(type, :script_khmer, text, ts-1, te)
436
+ when 'khoj', 'khojki'
437
+ self.emit(type, :script_khojki, text, ts-1, te)
416
438
  when 'knda', 'kannada'
417
439
  self.emit(type, :script_kannada, text, ts-1, te)
418
440
  when 'kthi', 'kaithi'
@@ -427,6 +449,8 @@
427
449
  self.emit(type, :script_lepcha, text, ts-1, te)
428
450
  when 'limb', 'limbu'
429
451
  self.emit(type, :script_limbu, text, ts-1, te)
452
+ when 'lina', 'lineara'
453
+ self.emit(type, :script_linear_a, text, ts-1, te)
430
454
  when 'linb', 'linearb'
431
455
  self.emit(type, :script_linear_b, text, ts-1, te)
432
456
  when 'lisu'
@@ -437,14 +461,28 @@
437
461
  self.emit(type, :script_lydian, text, ts-1, te)
438
462
  when 'mlym', 'malayalam'
439
463
  self.emit(type, :script_malayalam, text, ts-1, te)
464
+ when 'mahj', 'mahajani'
465
+ self.emit(type, :script_mahajani, text, ts-1, te)
440
466
  when 'mand', 'mandaic'
441
467
  self.emit(type, :script_mandaic, text, ts-1, te)
468
+ when 'mani', 'manichaean'
469
+ self.emit(type, :script_manichaean, text, ts-1, te)
470
+ when 'mend', 'mendekikakui'
471
+ self.emit(type, :script_mende_kikakui, text, ts-1, te)
472
+ when 'modi'
473
+ self.emit(type, :script_modi, text, ts-1, te)
442
474
  when 'mong', 'mongolian'
443
475
  self.emit(type, :script_mongolian, text, ts-1, te)
476
+ when 'mroo', 'mro'
477
+ self.emit(type, :script_mro, text, ts-1, te)
444
478
  when 'mtei', 'meeteimayek'
445
479
  self.emit(type, :script_meetei_mayek, text, ts-1, te)
446
480
  when 'mymr', 'myanmar'
447
481
  self.emit(type, :script_myanmar, text, ts-1, te)
482
+ when 'narb', 'oldnortharabian'
483
+ self.emit(type, :script_old_north_arabian, text, ts-1, te)
484
+ when 'nbat', 'nabataean'
485
+ self.emit(type, :script_nabataean, text, ts-1, te)
448
486
  when 'nkoo', 'nko'
449
487
  self.emit(type, :script_nko, text, ts-1, te)
450
488
  when 'ogam', 'ogham'
@@ -457,10 +495,18 @@
457
495
  self.emit(type, :script_oriya, text, ts-1, te)
458
496
  when 'osma', 'osmanya'
459
497
  self.emit(type, :script_osmanya, text, ts-1, te)
498
+ when 'palm', 'palmyrene'
499
+ self.emit(type, :script_palmyrene, text, ts-1, te)
500
+ when 'pauc', 'paucinhau'
501
+ self.emit(type, :script_pau_cin_hau, text, ts-1, te)
502
+ when 'perm', 'oldpermic'
503
+ self.emit(type, :script_old_permic, text, ts-1, te)
460
504
  when 'phag', 'phagspa'
461
505
  self.emit(type, :script_phags_pa, text, ts-1, te)
462
506
  when 'phli', 'inscriptionalpahlavi'
463
507
  self.emit(type, :script_inscriptional_pahlavi, text, ts-1, te)
508
+ when 'phlp', 'psalterpahlavi'
509
+ self.emit(type, :script_psalter_pahlavi, text, ts-1, te)
464
510
  when 'phnx', 'phoenician'
465
511
  self.emit(type, :script_phoenician, text, ts-1, te)
466
512
  when 'prti', 'inscriptionalparthian'
@@ -477,6 +523,10 @@
477
523
  self.emit(type, :script_saurashtra, text, ts-1, te)
478
524
  when 'shaw', 'shavian'
479
525
  self.emit(type, :script_shavian, text, ts-1, te)
526
+ when 'sidd', 'siddham'
527
+ self.emit(type, :script_siddham, text, ts-1, te)
528
+ when 'sind', 'khudawadi'
529
+ self.emit(type, :script_khudawadi, text, ts-1, te)
480
530
  when 'sinh', 'sinhala'
481
531
  self.emit(type, :script_sinhala, text, ts-1, te)
482
532
  when 'sund', 'sundanese'
@@ -507,10 +557,14 @@
507
557
  self.emit(type, :script_thai, text, ts-1, te)
508
558
  when 'tibt', 'tibetan'
509
559
  self.emit(type, :script_tibetan, text, ts-1, te)
560
+ when 'tirh', 'tirhuta'
561
+ self.emit(type, :script_tirhuta, text, ts-1, te)
510
562
  when 'ugar', 'ugaritic'
511
563
  self.emit(type, :script_ugaritic, text, ts-1, te)
512
564
  when 'vaii', 'vai'
513
565
  self.emit(type, :script_vai, text, ts-1, te)
566
+ when 'wara', 'warangciti'
567
+ self.emit(type, :script_warang_citi, text, ts-1, te)
514
568
  when 'xpeo', 'oldpersian'
515
569
  self.emit(type, :script_old_persian, text, ts-1, te)
516
570
  when 'xsux', 'cuneiform'
@@ -20,6 +20,8 @@
20
20
  set_close = ']';
21
21
  brackets = set_open | set_close;
22
22
 
23
+ comment = ('#' . [^\n]* . '\n');
24
+
23
25
  class_name_posix = 'alnum' | 'alpha' | 'blank' |
24
26
  'cntrl' | 'digit' | 'graph' |
25
27
  'lower' | 'print' | 'punct' |
@@ -74,6 +76,8 @@
74
76
  quantifier_possessive | quantifier_interval;
75
77
 
76
78
 
79
+ conditional = '(?(';
80
+
77
81
  group_comment = '?#' . [^)]+ . group_close;
78
82
 
79
83
  group_atomic = '?>';
@@ -84,23 +88,28 @@
84
88
  assertion_lookbehind = '?<=';
85
89
  assertion_nlookbehind = '?<!';
86
90
 
87
- group_options = '?' . [\-mix];
91
+ group_options = '?' . [\-mixdau];
88
92
 
89
93
  group_ref = [gk];
90
- group_name = (alnum . (alnum+)?)?;
94
+ group_name_char = (alnum | '_');
95
+ group_name_id = (group_name_char . (group_name_char+)?)?;
91
96
  group_number = '-'? . [1-9] . ([0-9]+)?;
92
97
  group_level = [+\-] . [0-9]+;
93
98
 
94
- group_named = ('?<' . group_name . '>') | ("?'" . group_name . "'");
99
+ group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
100
+ group_lookup = group_name | group_number;
95
101
 
96
- group_name_ref = group_ref . (('<' . group_name . group_level? '>') |
97
- ("'" . group_name . group_level? "'"));
102
+ group_named = ('?' . group_name );
103
+
104
+ group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
105
+ ("'" . group_name_id . group_level? "'"));
98
106
 
99
107
  group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
100
108
  ("'" . group_number . group_level? "'"));
101
109
 
102
110
  group_type = group_atomic | group_passive | group_named;
103
111
 
112
+
104
113
  assertion_type = assertion_lookahead | assertion_nlookahead |
105
114
  assertion_lookbehind | assertion_nlookbehind;
106
115
 
@@ -133,8 +142,8 @@
133
142
  }
134
143
 
135
144
  # group (nesting) and set open/close actions
136
- action group_opened { group_depth += 1; in_group = true }
137
- action group_closed { group_depth -= 1; in_group = group_depth > 0 ? true : false }
145
+ action group_opened { @group_depth += 1; @in_group = true }
146
+ action group_closed { @group_depth -= 1; @in_group = @group_depth > 0 ? true : false }
138
147
 
139
148
  # Character set scanner, continues consuming characters until it meets the
140
149
  # closing bracket of the set.
@@ -410,6 +419,22 @@
410
419
  *|;
411
420
 
412
421
 
422
+ # conditional expressions scanner
423
+ # --------------------------------------------------------------------------
424
+ conditional_expression := |*
425
+ group_lookup . ')' {
426
+ text = text(data, ts, te-1).first
427
+ emit(:conditional, :condition, text, ts, te-1)
428
+ emit(:conditional, :condition_close, ')', te-1, te)
429
+ };
430
+
431
+ any {
432
+ fhold;
433
+ fcall main;
434
+ };
435
+ *|;
436
+
437
+
413
438
  # Main scanner
414
439
  # --------------------------------------------------------------------------
415
440
  main := |*
@@ -421,7 +446,12 @@
421
446
  };
422
447
 
423
448
  alternation {
424
- emit(:meta, :alternation, *text(data, ts, te))
449
+ if in_conditional and conditional_stack.length > 0 and
450
+ conditional_stack.last[1] == @group_depth
451
+ emit(:conditional, :separator, *text(data, ts, te))
452
+ else
453
+ emit(:meta, :alternation, *text(data, ts, te))
454
+ end
425
455
  };
426
456
 
427
457
  # Anchors
@@ -434,6 +464,10 @@
434
464
  emit(:anchor, :eol, *text(data, ts, te))
435
465
  };
436
466
 
467
+ backslash . 'K' > (backslashed, 4) {
468
+ emit(:keep, :mark, *text(data, ts, te))
469
+ };
470
+
437
471
  backslash . anchor_char > (backslashed, 3) {
438
472
  case text = text(data, ts, te).first
439
473
  when '\\A'; emit(:anchor, :bos, text, ts, te)
@@ -481,6 +515,23 @@
481
515
  fcall character_set;
482
516
  };
483
517
 
518
+
519
+ # Conditional expression
520
+ # (?(condition)Y|N) conditional expression
521
+ # ------------------------------------------------------------------------
522
+ conditional {
523
+ text = text(data, ts, te).first
524
+
525
+ in_conditional = true unless in_conditional
526
+ conditional_depth += 1
527
+ conditional_stack << [conditional_depth, @group_depth]
528
+
529
+ emit(:conditional, :open, text[0..-2], ts, te-1)
530
+ emit(:conditional, :condition_open, '(', te-1, te)
531
+ fcall conditional_expression;
532
+ };
533
+
534
+
484
535
  # (?#...) comments: parsed as a single expression, without introducing a
485
536
  # new nesting level. Comments may not include parentheses, escaped or not.
486
537
  # special case for close, action performed on all transitions to get the
@@ -491,12 +542,15 @@
491
542
  };
492
543
 
493
544
  # Expression options:
494
- # (?imx-imx) option on/off
545
+ # (?imxdau-imx) option on/off
495
546
  # i: ignore case
496
547
  # m: multi-line (dot(.) match newline)
497
548
  # x: extended form
549
+ # d: default class rules (1.9 compatible)
550
+ # a: ASCII class rules (\s, \w, etc.)
551
+ # u: Unicode class rules (\s, \w, etc.)
498
552
  #
499
- # (?imx-imx:subexp) option on/off for subexp
553
+ # (?imxdau-imx:subexp) option on/off for subexp
500
554
  # ------------------------------------------------------------------------
501
555
  group_open . group_options >group_opened {
502
556
  p = scan_options(p, data, ts, te)
@@ -551,7 +605,29 @@
551
605
  };
552
606
 
553
607
  group_close @group_closed {
554
- emit(:group, :close, *text(data, ts, te))
608
+ if in_conditional and conditional_stack.last and
609
+ conditional_stack.last[1] == (@group_depth + 1)
610
+
611
+ emit(:conditional, :close, *text(data, ts, te))
612
+ conditional_stack.pop
613
+
614
+ if conditional_stack.length == 0
615
+ in_conditional = false
616
+ end
617
+ else
618
+ if @spacing_stack.length > 1 and
619
+ @spacing_stack.last[1] == (@group_depth + 1)
620
+ @spacing_stack.pop
621
+
622
+ @free_spacing = @spacing_stack.last[0]
623
+
624
+ if @spacing_stack.length == 1
625
+ @in_options = false
626
+ end
627
+ end
628
+
629
+ emit(:group, :close, *text(data, ts, te))
630
+ end
555
631
  };
556
632
 
557
633
 
@@ -662,10 +738,26 @@
662
738
  fcall escape_sequence;
663
739
  };
664
740
 
741
+ comment {
742
+ if @free_spacing
743
+ emit(:free_space, :comment, *text(data, ts, te))
744
+ else
745
+ append_literal(data, ts, te)
746
+ end
747
+ };
748
+
749
+ space+ {
750
+ if @free_spacing
751
+ emit(:free_space, :whitespace, *text(data, ts, te))
752
+ else
753
+ append_literal(data, ts, te)
754
+ end
755
+ };
756
+
665
757
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
666
758
  # except meta characters.
667
759
  # ------------------------------------------------------------------------
668
- ascii_print+ |
760
+ (ascii_print -- space)+ |
669
761
  ascii_nonprint+ |
670
762
  utf8_2_byte+ |
671
763
  utf8_3_byte+ |
@@ -683,11 +775,7 @@ module Regexp::Scanner
683
775
  %% write data;
684
776
 
685
777
  # General scanner error (catch all)
686
- class ScannerError < StandardError
687
- def initialize(what)
688
- super what
689
- end
690
- end
778
+ class ScannerError < StandardError; end
691
779
 
692
780
  # Base for all scanner validation errors
693
781
  class ValidationError < StandardError
@@ -717,6 +805,13 @@ module Regexp::Scanner
717
805
  end
718
806
  end
719
807
 
808
+ # Invalid group option. Used for inline options.
809
+ class InvalidGroupOption < ValidationError
810
+ def initialize(option, text)
811
+ super "Invalid group option #{option} in #{text}"
812
+ end
813
+ end
814
+
720
815
  # Invalid back reference. Used for name and number refs/calls.
721
816
  class InvalidBackrefError < ValidationError
722
817
  def initialize(what, reason)
@@ -737,18 +832,29 @@ module Regexp::Scanner
737
832
  #
738
833
  # This method may raise errors if a syntax error is encountered.
739
834
  # --------------------------------------------------------------------------
740
- def self.scan(input, &block)
835
+ def self.scan(input_object, &block)
741
836
  top, stack = 0, []
742
837
 
743
- input = input.source if input.is_a?(Regexp)
838
+ if input_object.is_a?(Regexp)
839
+ input = input_object.source
840
+ @free_spacing = (input_object.options & Regexp::EXTENDED != 0)
841
+ else
842
+ input = input_object
843
+ @free_spacing = false
844
+ end
845
+
846
+
744
847
  data = input.unpack("c*") if input.is_a?(String)
745
848
  eof = data.length
746
849
 
747
850
  @tokens = []
748
851
  @block = block_given? ? block : nil
749
852
 
750
- in_group, group_depth = false, 0
853
+ @in_group, @group_depth = false, 0
854
+ @in_options, @spacing_stack = false, [[@free_spacing, 0]]
855
+
751
856
  in_set, set_depth, set_type = false, 0, :set
857
+ in_conditional, conditional_depth, conditional_stack = false, 0, []
752
858
 
753
859
  %% write init;
754
860
  %% write exec;
@@ -759,7 +865,7 @@ module Regexp::Scanner
759
865
  end
760
866
 
761
867
  raise PrematureEndError.new("(missing group closing paranthesis) "+
762
- "[#{in_group}:#{group_depth}]") if in_group
868
+ "[#{@in_group}:#{@group_depth}]") if @in_group
763
869
  raise PrematureEndError.new("(missing set closing bracket) "+
764
870
  "[#{in_set}:#{set_depth}]") if in_set
765
871
 
@@ -779,13 +885,19 @@ module Regexp::Scanner
779
885
 
780
886
  options_char, options_length = true, 0
781
887
 
782
- # Copy while we have option characters, the maximum is 7, for (?mix-mix,
783
- # even though it doesn't make sense it is possible.
784
- while options_char and options_length < 7
888
+ # Copy while we have option characters. There is no maximum length,
889
+ # as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
890
+ negative_options = false
891
+ while options_char
785
892
  if data[te + options_length]
786
893
  c = data[te + options_length].chr
787
894
 
788
- if c =~ /[-mix]/
895
+ if c =~ /[-mixdau]/
896
+ negative_options = true if c == '-'
897
+
898
+ raise InvalidGroupOption.new(c, text) if negative_options and
899
+ c =~ /[dau]/
900
+
789
901
  text << c ; p += 1 ; options_length += 1
790
902
  else
791
903
  options_char = false
@@ -801,11 +913,11 @@ module Regexp::Scanner
801
913
  if c == ':'
802
914
  # Include the ':' in the options text
803
915
  text << c ; p += 1 ; options_length += 1
804
- emit(:group, :options, text, ts, te + options_length)
916
+ emit_options(text, ts, te + options_length)
805
917
 
806
918
  elsif c == ')'
807
919
  # Don't include the closing ')', let group_close handle it.
808
- emit(:group, :options, text, ts, te + options_length)
920
+ emit_options(text, ts, te + options_length)
809
921
 
810
922
  else
811
923
  # Plain Regexp reports this as 'undefined group option'
@@ -849,6 +961,27 @@ module Regexp::Scanner
849
961
  emit(:literal, :literal, text, ts, te)
850
962
  end
851
963
 
964
+ def self.emit_options(text, ts, te)
965
+ if text =~ /\(\?([mixdau]+)?-?([mix]+)?:/
966
+ positive, negative = $1, $2
967
+
968
+ if positive =~ /x/
969
+ @free_spacing = true
970
+ end
971
+
972
+ # If the x appears in both, treat it like ruby does, the second cancels
973
+ # the first.
974
+ if negative =~ /x/
975
+ @free_spacing = false
976
+ end
977
+ end
978
+
979
+ @in_options = true
980
+ @spacing_stack << [@free_spacing, @group_depth]
981
+
982
+ emit(:group, :options, text, ts, te)
983
+ end
984
+
852
985
  # Emits an array with the details of the scanned pattern
853
986
  def self.emit(type, token, text, ts, te)
854
987
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
@@ -876,13 +1009,7 @@ module Regexp::Scanner
876
1009
  error = ValidationError.new('expression')
877
1010
  end
878
1011
 
879
- # TODO: configuration option to treat scanner level validation
880
- # errors as warnings or ignore them
881
- if false # @@config.validation_warn
882
- $stderr.puts error.to_s # unless @@config.validation_ignore
883
- else
884
- raise error # unless @@config.validation_ignore
885
- end
1012
+ raise error # unless @@config.validation_ignore
886
1013
  end
887
1014
 
888
1015
  # Used for references with an empty name or number