regexp_parser 1.5.1 → 1.8.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,6 +6,8 @@ adlam: adlam
6
6
  age=1.1: age=1.1
7
7
  age=10.0: age=10.0
8
8
  age=11.0: age=11.0
9
+ age=12.0: age=12.0
10
+ age=12.1: age=12.1
9
11
  age=2.0: age=2.0
10
12
  age=2.1: age=2.1
11
13
  age=3.0: age=3.0
@@ -64,7 +66,6 @@ changeswhenuppercased: changes_when_uppercased
64
66
  cherokee: cherokee
65
67
  closepunctuation: close_punctuation
66
68
  cntrl: cntrl
67
- combiningmark: combining_mark
68
69
  common: common
69
70
  connectorpunctuation: connector_punctuation
70
71
  control: control
@@ -86,6 +87,7 @@ dogra: dogra
86
87
  duployan: duployan
87
88
  egyptianhieroglyphs: egyptian_hieroglyphs
88
89
  elbasan: elbasan
90
+ elymaic: elymaic
89
91
  emoji: emoji
90
92
  emojicomponent: emoji_component
91
93
  emojimodifier: emoji_modifier
@@ -206,8 +208,10 @@ indogra: in_dogra
206
208
  indominotiles: in_domino_tiles
207
209
  induployan: in_duployan
208
210
  inearlydynasticcuneiform: in_early_dynastic_cuneiform
211
+ inegyptianhieroglyphformatcontrols: in_egyptian_hieroglyph_format_controls
209
212
  inegyptianhieroglyphs: in_egyptian_hieroglyphs
210
213
  inelbasan: in_elbasan
214
+ inelymaic: in_elymaic
211
215
  inemoticons: in_emoticons
212
216
  inenclosedalphanumerics: in_enclosed_alphanumerics
213
217
  inenclosedalphanumericsupplement: in_enclosed_alphanumeric_supplement
@@ -322,12 +326,14 @@ inmyanmar: in_myanmar
322
326
  inmyanmarextendeda: in_myanmar_extended_a
323
327
  inmyanmarextendedb: in_myanmar_extended_b
324
328
  innabataean: in_nabataean
329
+ innandinagari: in_nandinagari
325
330
  innewa: in_newa
326
331
  innewtailue: in_new_tai_lue
327
332
  innko: in_nko
328
333
  innoblock: in_no_block
329
334
  innumberforms: in_number_forms
330
335
  innushu: in_nushu
336
+ innyiakengpuachuehmong: in_nyiakeng_puachue_hmong
331
337
  inogham: in_ogham
332
338
  inolchiki: in_ol_chiki
333
339
  inoldhungarian: in_old_hungarian
@@ -343,6 +349,7 @@ inoriya: in_oriya
343
349
  inornamentaldingbats: in_ornamental_dingbats
344
350
  inosage: in_osage
345
351
  inosmanya: in_osmanya
352
+ inottomansiyaqnumbers: in_ottoman_siyaq_numbers
346
353
  inpahawhhmong: in_pahawh_hmong
347
354
  inpalmyrene: in_palmyrene
348
355
  inpaucinhau: in_pau_cin_hau
@@ -368,6 +375,7 @@ insiddham: in_siddham
368
375
  insinhala: in_sinhala
369
376
  insinhalaarchaicnumbers: in_sinhala_archaic_numbers
370
377
  insmallformvariants: in_small_form_variants
378
+ insmallkanaextension: in_small_kana_extension
371
379
  insogdian: in_sogdian
372
380
  insorasompeng: in_sora_sompeng
373
381
  insoyombo: in_soyombo
@@ -386,6 +394,7 @@ insupplementaryprivateuseareaa: in_supplementary_private_use_area_a
386
394
  insupplementaryprivateuseareab: in_supplementary_private_use_area_b
387
395
  insuttonsignwriting: in_sutton_signwriting
388
396
  insylotinagri: in_syloti_nagri
397
+ insymbolsandpictographsextendeda: in_symbols_and_pictographs_extended_a
389
398
  insyriac: in_syriac
390
399
  insyriacsupplement: in_syriac_supplement
391
400
  intagalog: in_tagalog
@@ -397,6 +406,7 @@ intaiviet: in_tai_viet
397
406
  intaixuanjingsymbols: in_tai_xuan_jing_symbols
398
407
  intakri: in_takri
399
408
  intamil: in_tamil
409
+ intamilsupplement: in_tamil_supplement
400
410
  intangut: in_tangut
401
411
  intangutcomponents: in_tangut_components
402
412
  intelugu: in_telugu
@@ -414,6 +424,7 @@ invariationselectors: in_variation_selectors
414
424
  invariationselectorssupplement: in_variation_selectors_supplement
415
425
  invedicextensions: in_vedic_extensions
416
426
  inverticalforms: in_vertical_forms
427
+ inwancho: in_wancho
417
428
  inwarangciti: in_warang_citi
418
429
  inyijinghexagramsymbols: in_yijing_hexagram_symbols
419
430
  inyiradicals: in_yi_radicals
@@ -469,6 +480,7 @@ mro: mro
469
480
  multani: multani
470
481
  myanmar: myanmar
471
482
  nabataean: nabataean
483
+ nandinagari: nandinagari
472
484
  newa: newa
473
485
  newline: newline
474
486
  newtailue: new_tai_lue
@@ -477,6 +489,7 @@ noncharactercodepoint: noncharacter_code_point
477
489
  nonspacingmark: nonspacing_mark
478
490
  number: number
479
491
  nushu: nushu
492
+ nyiakengpuachuehmong: nyiakeng_puachue_hmong
480
493
  ogham: ogham
481
494
  olchiki: ol_chiki
482
495
  oldhungarian: old_hungarian
@@ -569,6 +582,7 @@ uppercase: uppercase
569
582
  uppercaseletter: uppercase_letter
570
583
  vai: vai
571
584
  variationselector: variation_selector
585
+ wancho: wancho
572
586
  warangciti: warang_citi
573
587
  whitespace: white_space
574
588
  word: word
@@ -31,6 +31,7 @@ cher: cherokee
31
31
  ci: case_ignorable
32
32
  cn: unassigned
33
33
  co: private_use
34
+ combiningmark: mark
34
35
  copt: coptic
35
36
  cprt: cypriot
36
37
  cs: surrogate
@@ -49,6 +50,7 @@ dsrt: deseret
49
50
  dupl: duployan
50
51
  egyp: egyptian_hieroglyphs
51
52
  elba: elbasan
53
+ elym: elymaic
52
54
  ethi: ethiopic
53
55
  ext: extender
54
56
  geor: georgian
@@ -72,6 +74,7 @@ hex: hex_digit
72
74
  hira: hiragana
73
75
  hluw: anatolian_hieroglyphs
74
76
  hmng: pahawh_hmong
77
+ hmnp: nyiakeng_puachue_hmong
75
78
  hung: old_hungarian
76
79
  idc: id_continue
77
80
  ideo: ideographic
@@ -125,6 +128,7 @@ mtei: meetei_mayek
125
128
  mult: multani
126
129
  mymr: myanmar
127
130
  n: number
131
+ nand: nandinagari
128
132
  narb: old_north_arabian
129
133
  nbat: nabataean
130
134
  nchar: noncharacter_code_point
@@ -216,6 +220,7 @@ uideo: unified_ideograph
216
220
  vaii: vai
217
221
  vs: variation_selector
218
222
  wara: warang_citi
223
+ wcho: wancho
219
224
  wspace: white_space
220
225
  xidc: xid_continue
221
226
  xids: xid_start
@@ -21,7 +21,7 @@
21
21
  set_close = ']';
22
22
  brackets = set_open | set_close;
23
23
 
24
- comment = ('#' . [^\n]* . '\n');
24
+ comment = ('#' . [^\n]* . '\n'?);
25
25
 
26
26
  class_name_posix = 'alnum' | 'alpha' | 'blank' |
27
27
  'cntrl' | 'digit' | 'graph' |
@@ -62,13 +62,17 @@
62
62
  quantifier_possessive = '?+' | '*+' | '++';
63
63
  quantifier_mode = '?' | '+';
64
64
 
65
- quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
66
- range_close . quantifier_mode?;
65
+ quantity_exact = (digit+);
66
+ quantity_minimum = (digit+) . ',';
67
+ quantity_maximum = ',' . (digit+);
68
+ quantity_range = (digit+) . ',' . (digit+);
69
+ quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
70
+ quantity_maximum | quantity_range ) . range_close .
71
+ quantifier_mode?;
67
72
 
68
73
  quantifiers = quantifier_greedy | quantifier_reluctant |
69
74
  quantifier_possessive | quantifier_interval;
70
75
 
71
-
72
76
  conditional = '(?(';
73
77
 
74
78
  group_comment = '?#' . [^)]* . group_close;
@@ -114,7 +118,9 @@
114
118
  curlies | parantheses | brackets |
115
119
  line_anchor | quantifier_greedy;
116
120
 
117
- ascii_print = ((0x20..0x7e) - meta_char);
121
+ literal_delimiters = ']' | '}';
122
+
123
+ ascii_print = ((0x20..0x7e) - meta_char - '#');
118
124
  ascii_nonprint = (0x01..0x1f | 0x7f);
119
125
 
120
126
  utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
@@ -122,7 +128,7 @@
122
128
  utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
123
129
 
124
130
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
125
- group_ref | keep_mark | [xucCM];
131
+ keep_mark | [xucCM];
126
132
 
127
133
  non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
128
134
  multi_codepoint_char_type | [0-9cCM];
@@ -417,6 +423,10 @@
417
423
  end
418
424
  };
419
425
 
426
+ literal_delimiters {
427
+ append_literal(data, ts, te)
428
+ };
429
+
420
430
  # Character sets
421
431
  # ------------------------------------------------------------------------
422
432
  set_open >set_opened {
@@ -620,10 +630,15 @@
620
630
  end
621
631
  };
622
632
 
623
- quantifier_interval @err(premature_end_error) {
633
+ quantifier_interval {
624
634
  emit(:quantifier, :interval, *text(data, ts, te))
625
635
  };
626
636
 
637
+ # Catch unmatched curly braces as literals
638
+ range_open {
639
+ append_literal(data, ts, te)
640
+ };
641
+
627
642
  # Escaped sequences
628
643
  # ------------------------------------------------------------------------
629
644
  backslash > (backslashed, 1) {
@@ -634,7 +649,9 @@
634
649
  if free_spacing
635
650
  emit(:free_space, :comment, *text(data, ts, te))
636
651
  else
637
- append_literal(data, ts, te)
652
+ # consume only the pound sign (#) and backtrack to do regular scanning
653
+ append_literal(data, ts, ts + 1)
654
+ fexec ts + 1;
638
655
  end
639
656
  };
640
657
 
@@ -722,21 +739,16 @@ class Regexp::Scanner
722
739
  #
723
740
  # This method may raise errors if a syntax error is encountered.
724
741
  # --------------------------------------------------------------------------
725
- def self.scan(input_object, &block)
726
- new.scan(input_object, &block)
742
+ def self.scan(input_object, options: nil, &block)
743
+ new.scan(input_object, options: options, &block)
727
744
  end
728
745
 
729
- def scan(input_object, &block)
746
+ def scan(input_object, options: nil, &block)
730
747
  self.literal = nil
731
748
  stack = []
732
749
 
733
- if input_object.is_a?(Regexp)
734
- input = input_object.source
735
- self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
736
- else
737
- input = input_object
738
- self.free_spacing = false
739
- end
750
+ input = input_object.is_a?(Regexp) ? input_object.source : input_object
751
+ self.free_spacing = free_spacing?(input_object, options)
740
752
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
741
753
 
742
754
  data = input.unpack("c*") if input.is_a?(String)
@@ -802,6 +814,18 @@ class Regexp::Scanner
802
814
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
803
815
  :group_depth, :set_depth, :conditional_stack
804
816
 
817
+ def free_spacing?(input_object, options)
818
+ if options && !input_object.is_a?(String)
819
+ raise ArgumentError, 'options cannot be supplied unless scanning a String'
820
+ end
821
+
822
+ options = input_object.options if input_object.is_a?(::Regexp)
823
+
824
+ return false unless options
825
+
826
+ options & Regexp::EXTENDED != 0
827
+ end
828
+
805
829
  def in_group?
806
830
  group_depth > 0
807
831
  end
@@ -53,6 +53,10 @@ module Regexp::Syntax
53
53
 
54
54
  Age_V2_6_0 = [:'age=11.0']
55
55
 
56
+ Age_V2_6_2 = [:'age=12.0']
57
+
58
+ Age_V2_6_3 = [:'age=12.1']
59
+
56
60
  Age = all[:Age_V]
57
61
 
58
62
  Derived_V1_9_0 = [
@@ -297,6 +301,18 @@ module Regexp::Syntax
297
301
  :sogdian,
298
302
  ]
299
303
 
304
+ Script_V2_6_2 = [
305
+ :egyptian_hieroglyph_format_controls,
306
+ :elymaic,
307
+ :nandinagari,
308
+ :nyiakeng_puachue_hmong,
309
+ :ottoman_siyaq_numbers,
310
+ :small_kana_extension,
311
+ :symbols_and_pictographs_extended_a,
312
+ :tamil_supplement,
313
+ :wancho,
314
+ ]
315
+
300
316
  Script = all[:Script_V]
301
317
 
302
318
  UnicodeBlock_V1_9_0 = [
@@ -612,6 +628,18 @@ module Regexp::Syntax
612
628
  :in_sogdian,
613
629
  ]
614
630
 
631
+ UnicodeBlock_V2_6_2 = [
632
+ :in_egyptian_hieroglyph_format_controls,
633
+ :in_elymaic,
634
+ :in_nandinagari,
635
+ :in_nyiakeng_puachue_hmong,
636
+ :in_ottoman_siyaq_numbers,
637
+ :in_small_kana_extension,
638
+ :in_symbols_and_pictographs_extended_a,
639
+ :in_tamil_supplement,
640
+ :in_wancho,
641
+ ]
642
+
615
643
  UnicodeBlock = all[:UnicodeBlock_V]
616
644
 
617
645
  Emoji_V2_5_0 = [
@@ -632,6 +660,8 @@ module Regexp::Syntax
632
660
  V2_4_0 = all[:V2_4_0]
633
661
  V2_5_0 = all[:V2_5_0]
634
662
  V2_6_0 = all[:V2_6_0]
663
+ V2_6_2 = all[:V2_6_2]
664
+ V2_6_3 = all[:V2_6_3]
635
665
 
636
666
  All = all[/^V\d+_\d+_\d+$/]
637
667
 
@@ -0,0 +1,10 @@
1
+ module Regexp::Syntax
2
+ class V2_6_2 < Regexp::Syntax::V2_6_0
3
+ def initialize
4
+ super
5
+
6
+ implements :property, UnicodeProperty::V2_6_2
7
+ implements :nonproperty, UnicodeProperty::V2_6_2
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,10 @@
1
+ module Regexp::Syntax
2
+ class V2_6_3 < Regexp::Syntax::V2_6_2
3
+ def initialize
4
+ super
5
+
6
+ implements :property, UnicodeProperty::V2_6_3
7
+ implements :nonproperty, UnicodeProperty::V2_6_3
8
+ end
9
+ end
10
+ end
@@ -1,5 +1,5 @@
1
1
  class Regexp
2
2
  class Parser
3
- VERSION = '1.5.1'
3
+ VERSION = '1.8.1'
4
4
  end
5
5
  end
@@ -32,5 +32,5 @@ Gem::Specification.new do |gem|
32
32
 
33
33
  gem.platform = Gem::Platform::RUBY
34
34
 
35
- gem.required_ruby_version = '>= 1.9.1'
35
+ gem.required_ruby_version = '>= 2.0.0'
36
36
  end
@@ -120,6 +120,13 @@ RSpec.describe(Regexp::MatchLength) do
120
120
  expect { result.next }.to raise_error(StopIteration)
121
121
  end
122
122
 
123
+ it 'is aware of limit option even if called without a block' do
124
+ result = ML.of(/a?/).each(limit: 1)
125
+ expect(result).to be_a(Enumerator)
126
+ expect(result.next).to eq 0
127
+ expect { result.next }.to raise_error(StopIteration)
128
+ end
129
+
123
130
  it 'is limited to 1000 iterations in case there are infinite match lengths' do
124
131
  expect(ML.of(/a*/).first(3000).size).to eq 1000
125
132
  end
@@ -39,6 +39,17 @@ RSpec.describe('Subexpression#traverse') do
39
39
  expect(visits).to eq 9
40
40
  end
41
41
 
42
+ specify('Subexpression#traverse without a block') do
43
+ root = RP.parse(/abc/)
44
+ enum = root.traverse
45
+
46
+ expect(enum).to be_a(Enumerator)
47
+ event, expr, idx = enum.next
48
+ expect(event).to eq(:visit)
49
+ expect(expr).to be_a(Regexp::Expression::Literal)
50
+ expect(idx).to eq(0)
51
+ end
52
+
42
53
  specify('Subexpression#walk alias') do
43
54
  root = RP.parse(/abc/)
44
55
 
@@ -81,6 +92,16 @@ RSpec.describe('Subexpression#traverse') do
81
92
  expect(indices).to eq [0, 0, 1, 0, 2]
82
93
  end
83
94
 
95
+ specify('Subexpression#each_expression without a block') do
96
+ root = RP.parse(/abc/)
97
+ enum = root.each_expression
98
+
99
+ expect(enum).to be_a(Enumerator)
100
+ expr, idx = enum.next
101
+ expect(expr).to be_a(Regexp::Expression::Literal)
102
+ expect(idx).to eq(0)
103
+ end
104
+
84
105
  specify('Subexpression#flat_map without block') do
85
106
  root = RP.parse(/a(b([c-e]+))?/)
86
107
 
@@ -85,44 +85,44 @@ RSpec.describe('Expression#options') do
85
85
  .and change { exp.unicode_classes? }.from(false).to(true)
86
86
  end
87
87
 
88
- RSpec.shared_examples '#options' do |regexp, klass, at: []|
88
+ RSpec.shared_examples '#options' do |regexp, path, klass|
89
89
  it "works for expression class #{klass}" do
90
- exp = RP.parse(/#{regexp.source}/i).dig(*at)
90
+ exp = RP.parse(/#{regexp.source}/i).dig(*path)
91
91
  expect(exp).to be_a(klass)
92
92
  expect(exp).to be_i
93
93
  expect(exp).not_to be_x
94
94
  end
95
95
  end
96
96
 
97
- include_examples '#options', //, Root
98
- include_examples '#options', /a/, Literal, at: [0]
99
- include_examples '#options', /\A/, Anchor::Base, at: [0]
100
- include_examples '#options', /\d/, CharacterType::Base, at: [0]
101
- include_examples '#options', /\n/, EscapeSequence::Base, at: [0]
102
- include_examples '#options', /\K/, Keep::Mark, at: [0]
103
- include_examples '#options', /./, CharacterType::Any, at: [0]
104
- include_examples '#options', /(a)/, Group::Base, at: [0]
105
- include_examples '#options', /(a)/, Literal, at: [0, 0]
106
- include_examples '#options', /(?=a)/, Assertion::Base, at: [0]
107
- include_examples '#options', /(?=a)/, Literal, at: [0, 0]
108
- include_examples '#options', /(a|b)/, Group::Base, at: [0]
109
- include_examples '#options', /(a|b)/, Alternation, at: [0, 0]
110
- include_examples '#options', /(a|b)/, Alternative, at: [0, 0, 0]
111
- include_examples '#options', /(a|b)/, Literal, at: [0, 0, 0, 0]
112
- include_examples '#options', /(a)\1/, Backreference::Base, at: [1]
113
- include_examples '#options', /(a)\k<1>/, Backreference::Number, at: [1]
114
- include_examples '#options', /(a)\g<1>/, Backreference::NumberCall, at: [1]
115
- include_examples '#options', /[a]/, CharacterSet, at: [0]
116
- include_examples '#options', /[a]/, Literal, at: [0, 0]
117
- include_examples '#options', /[a-z]/, CharacterSet::Range, at: [0, 0]
118
- include_examples '#options', /[a-z]/, Literal, at: [0, 0, 0]
119
- include_examples '#options', /[a&&z]/, CharacterSet::Intersection, at: [0, 0]
120
- include_examples '#options', /[a&&z]/, CharacterSet::IntersectedSequence, at: [0, 0, 0]
121
- include_examples '#options', /[a&&z]/, Literal, at: [0, 0, 0, 0]
122
- include_examples '#options', /[[:ascii:]]/, PosixClass, at: [0, 0]
123
- include_examples '#options', /\p{word}/, UnicodeProperty::Base, at: [0]
124
- include_examples '#options', /(a)(?(1)b|c)/, Conditional::Expression, at: [1]
125
- include_examples '#options', /(a)(?(1)b|c)/, Conditional::Condition, at: [1, 0]
126
- include_examples '#options', /(a)(?(1)b|c)/, Conditional::Branch, at: [1, 1]
127
- include_examples '#options', /(a)(?(1)b|c)/, Literal, at: [1, 1, 0]
97
+ include_examples '#options', //, [], Root
98
+ include_examples '#options', /a/, [0], Literal
99
+ include_examples '#options', /\A/, [0], Anchor::Base
100
+ include_examples '#options', /\d/, [0], CharacterType::Base
101
+ include_examples '#options', /\n/, [0], EscapeSequence::Base
102
+ include_examples '#options', /\K/, [0], Keep::Mark
103
+ include_examples '#options', /./, [0], CharacterType::Any
104
+ include_examples '#options', /(a)/, [0], Group::Base
105
+ include_examples '#options', /(a)/, [0, 0], Literal
106
+ include_examples '#options', /(?=a)/, [0], Assertion::Base
107
+ include_examples '#options', /(?=a)/, [0, 0], Literal
108
+ include_examples '#options', /(a|b)/, [0], Group::Base
109
+ include_examples '#options', /(a|b)/, [0, 0], Alternation
110
+ include_examples '#options', /(a|b)/, [0, 0, 0], Alternative
111
+ include_examples '#options', /(a|b)/, [0, 0, 0, 0], Literal
112
+ include_examples '#options', /(a)\1/, [1], Backreference::Base
113
+ include_examples '#options', /(a)\k<1>/, [1], Backreference::Number
114
+ include_examples '#options', /(a)\g<1>/, [1], Backreference::NumberCall
115
+ include_examples '#options', /[a]/, [0], CharacterSet
116
+ include_examples '#options', /[a]/, [0, 0], Literal
117
+ include_examples '#options', /[a-z]/, [0, 0], CharacterSet::Range
118
+ include_examples '#options', /[a-z]/, [0, 0, 0], Literal
119
+ include_examples '#options', /[a&&z]/, [0, 0], CharacterSet::Intersection
120
+ include_examples '#options', /[a&&z]/, [0, 0, 0], CharacterSet::IntersectedSequence
121
+ include_examples '#options', /[a&&z]/, [0, 0, 0, 0], Literal
122
+ include_examples '#options', /[[:ascii:]]/, [0, 0], PosixClass
123
+ include_examples '#options', /\p{word}/, [0], UnicodeProperty::Base
124
+ include_examples '#options', /(a)(?(1)b|c)/, [1], Conditional::Expression
125
+ include_examples '#options', /(a)(?(1)b|c)/, [1, 0], Conditional::Condition
126
+ include_examples '#options', /(a)(?(1)b|c)/, [1, 1], Conditional::Branch
127
+ include_examples '#options', /(a)(?(1)b|c)/, [1, 1, 0], Literal
128
128
  end