regexp_parser 1.5.1 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,6 +6,8 @@ adlam: adlam
6
6
  age=1.1: age=1.1
7
7
  age=10.0: age=10.0
8
8
  age=11.0: age=11.0
9
+ age=12.0: age=12.0
10
+ age=12.1: age=12.1
9
11
  age=2.0: age=2.0
10
12
  age=2.1: age=2.1
11
13
  age=3.0: age=3.0
@@ -64,7 +66,6 @@ changeswhenuppercased: changes_when_uppercased
64
66
  cherokee: cherokee
65
67
  closepunctuation: close_punctuation
66
68
  cntrl: cntrl
67
- combiningmark: combining_mark
68
69
  common: common
69
70
  connectorpunctuation: connector_punctuation
70
71
  control: control
@@ -86,6 +87,7 @@ dogra: dogra
86
87
  duployan: duployan
87
88
  egyptianhieroglyphs: egyptian_hieroglyphs
88
89
  elbasan: elbasan
90
+ elymaic: elymaic
89
91
  emoji: emoji
90
92
  emojicomponent: emoji_component
91
93
  emojimodifier: emoji_modifier
@@ -206,8 +208,10 @@ indogra: in_dogra
206
208
  indominotiles: in_domino_tiles
207
209
  induployan: in_duployan
208
210
  inearlydynasticcuneiform: in_early_dynastic_cuneiform
211
+ inegyptianhieroglyphformatcontrols: in_egyptian_hieroglyph_format_controls
209
212
  inegyptianhieroglyphs: in_egyptian_hieroglyphs
210
213
  inelbasan: in_elbasan
214
+ inelymaic: in_elymaic
211
215
  inemoticons: in_emoticons
212
216
  inenclosedalphanumerics: in_enclosed_alphanumerics
213
217
  inenclosedalphanumericsupplement: in_enclosed_alphanumeric_supplement
@@ -322,12 +326,14 @@ inmyanmar: in_myanmar
322
326
  inmyanmarextendeda: in_myanmar_extended_a
323
327
  inmyanmarextendedb: in_myanmar_extended_b
324
328
  innabataean: in_nabataean
329
+ innandinagari: in_nandinagari
325
330
  innewa: in_newa
326
331
  innewtailue: in_new_tai_lue
327
332
  innko: in_nko
328
333
  innoblock: in_no_block
329
334
  innumberforms: in_number_forms
330
335
  innushu: in_nushu
336
+ innyiakengpuachuehmong: in_nyiakeng_puachue_hmong
331
337
  inogham: in_ogham
332
338
  inolchiki: in_ol_chiki
333
339
  inoldhungarian: in_old_hungarian
@@ -343,6 +349,7 @@ inoriya: in_oriya
343
349
  inornamentaldingbats: in_ornamental_dingbats
344
350
  inosage: in_osage
345
351
  inosmanya: in_osmanya
352
+ inottomansiyaqnumbers: in_ottoman_siyaq_numbers
346
353
  inpahawhhmong: in_pahawh_hmong
347
354
  inpalmyrene: in_palmyrene
348
355
  inpaucinhau: in_pau_cin_hau
@@ -368,6 +375,7 @@ insiddham: in_siddham
368
375
  insinhala: in_sinhala
369
376
  insinhalaarchaicnumbers: in_sinhala_archaic_numbers
370
377
  insmallformvariants: in_small_form_variants
378
+ insmallkanaextension: in_small_kana_extension
371
379
  insogdian: in_sogdian
372
380
  insorasompeng: in_sora_sompeng
373
381
  insoyombo: in_soyombo
@@ -386,6 +394,7 @@ insupplementaryprivateuseareaa: in_supplementary_private_use_area_a
386
394
  insupplementaryprivateuseareab: in_supplementary_private_use_area_b
387
395
  insuttonsignwriting: in_sutton_signwriting
388
396
  insylotinagri: in_syloti_nagri
397
+ insymbolsandpictographsextendeda: in_symbols_and_pictographs_extended_a
389
398
  insyriac: in_syriac
390
399
  insyriacsupplement: in_syriac_supplement
391
400
  intagalog: in_tagalog
@@ -397,6 +406,7 @@ intaiviet: in_tai_viet
397
406
  intaixuanjingsymbols: in_tai_xuan_jing_symbols
398
407
  intakri: in_takri
399
408
  intamil: in_tamil
409
+ intamilsupplement: in_tamil_supplement
400
410
  intangut: in_tangut
401
411
  intangutcomponents: in_tangut_components
402
412
  intelugu: in_telugu
@@ -414,6 +424,7 @@ invariationselectors: in_variation_selectors
414
424
  invariationselectorssupplement: in_variation_selectors_supplement
415
425
  invedicextensions: in_vedic_extensions
416
426
  inverticalforms: in_vertical_forms
427
+ inwancho: in_wancho
417
428
  inwarangciti: in_warang_citi
418
429
  inyijinghexagramsymbols: in_yijing_hexagram_symbols
419
430
  inyiradicals: in_yi_radicals
@@ -469,6 +480,7 @@ mro: mro
469
480
  multani: multani
470
481
  myanmar: myanmar
471
482
  nabataean: nabataean
483
+ nandinagari: nandinagari
472
484
  newa: newa
473
485
  newline: newline
474
486
  newtailue: new_tai_lue
@@ -477,6 +489,7 @@ noncharactercodepoint: noncharacter_code_point
477
489
  nonspacingmark: nonspacing_mark
478
490
  number: number
479
491
  nushu: nushu
492
+ nyiakengpuachuehmong: nyiakeng_puachue_hmong
480
493
  ogham: ogham
481
494
  olchiki: ol_chiki
482
495
  oldhungarian: old_hungarian
@@ -569,6 +582,7 @@ uppercase: uppercase
569
582
  uppercaseletter: uppercase_letter
570
583
  vai: vai
571
584
  variationselector: variation_selector
585
+ wancho: wancho
572
586
  warangciti: warang_citi
573
587
  whitespace: white_space
574
588
  word: word
@@ -31,6 +31,7 @@ cher: cherokee
31
31
  ci: case_ignorable
32
32
  cn: unassigned
33
33
  co: private_use
34
+ combiningmark: mark
34
35
  copt: coptic
35
36
  cprt: cypriot
36
37
  cs: surrogate
@@ -49,6 +50,7 @@ dsrt: deseret
49
50
  dupl: duployan
50
51
  egyp: egyptian_hieroglyphs
51
52
  elba: elbasan
53
+ elym: elymaic
52
54
  ethi: ethiopic
53
55
  ext: extender
54
56
  geor: georgian
@@ -72,6 +74,7 @@ hex: hex_digit
72
74
  hira: hiragana
73
75
  hluw: anatolian_hieroglyphs
74
76
  hmng: pahawh_hmong
77
+ hmnp: nyiakeng_puachue_hmong
75
78
  hung: old_hungarian
76
79
  idc: id_continue
77
80
  ideo: ideographic
@@ -125,6 +128,7 @@ mtei: meetei_mayek
125
128
  mult: multani
126
129
  mymr: myanmar
127
130
  n: number
131
+ nand: nandinagari
128
132
  narb: old_north_arabian
129
133
  nbat: nabataean
130
134
  nchar: noncharacter_code_point
@@ -216,6 +220,7 @@ uideo: unified_ideograph
216
220
  vaii: vai
217
221
  vs: variation_selector
218
222
  wara: warang_citi
223
+ wcho: wancho
219
224
  wspace: white_space
220
225
  xidc: xid_continue
221
226
  xids: xid_start
@@ -21,7 +21,7 @@
21
21
  set_close = ']';
22
22
  brackets = set_open | set_close;
23
23
 
24
- comment = ('#' . [^\n]* . '\n');
24
+ comment = ('#' . [^\n]* . '\n'?);
25
25
 
26
26
  class_name_posix = 'alnum' | 'alpha' | 'blank' |
27
27
  'cntrl' | 'digit' | 'graph' |
@@ -62,13 +62,17 @@
62
62
  quantifier_possessive = '?+' | '*+' | '++';
63
63
  quantifier_mode = '?' | '+';
64
64
 
65
- quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
66
- range_close . quantifier_mode?;
65
+ quantity_exact = (digit+);
66
+ quantity_minimum = (digit+) . ',';
67
+ quantity_maximum = ',' . (digit+);
68
+ quantity_range = (digit+) . ',' . (digit+);
69
+ quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
70
+ quantity_maximum | quantity_range ) . range_close .
71
+ quantifier_mode?;
67
72
 
68
73
  quantifiers = quantifier_greedy | quantifier_reluctant |
69
74
  quantifier_possessive | quantifier_interval;
70
75
 
71
-
72
76
  conditional = '(?(';
73
77
 
74
78
  group_comment = '?#' . [^)]* . group_close;
@@ -114,7 +118,9 @@
114
118
  curlies | parantheses | brackets |
115
119
  line_anchor | quantifier_greedy;
116
120
 
117
- ascii_print = ((0x20..0x7e) - meta_char);
121
+ literal_delimiters = ']' | '}';
122
+
123
+ ascii_print = ((0x20..0x7e) - meta_char - '#');
118
124
  ascii_nonprint = (0x01..0x1f | 0x7f);
119
125
 
120
126
  utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
@@ -122,7 +128,7 @@
122
128
  utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
123
129
 
124
130
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
125
- group_ref | keep_mark | [xucCM];
131
+ keep_mark | [xucCM];
126
132
 
127
133
  non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
128
134
  multi_codepoint_char_type | [0-9cCM];
@@ -417,6 +423,10 @@
417
423
  end
418
424
  };
419
425
 
426
+ literal_delimiters {
427
+ append_literal(data, ts, te)
428
+ };
429
+
420
430
  # Character sets
421
431
  # ------------------------------------------------------------------------
422
432
  set_open >set_opened {
@@ -620,10 +630,15 @@
620
630
  end
621
631
  };
622
632
 
623
- quantifier_interval @err(premature_end_error) {
633
+ quantifier_interval {
624
634
  emit(:quantifier, :interval, *text(data, ts, te))
625
635
  };
626
636
 
637
+ # Catch unmatched curly braces as literals
638
+ range_open {
639
+ append_literal(data, ts, te)
640
+ };
641
+
627
642
  # Escaped sequences
628
643
  # ------------------------------------------------------------------------
629
644
  backslash > (backslashed, 1) {
@@ -634,7 +649,9 @@
634
649
  if free_spacing
635
650
  emit(:free_space, :comment, *text(data, ts, te))
636
651
  else
637
- append_literal(data, ts, te)
652
+ # consume only the pound sign (#) and backtrack to do regular scanning
653
+ append_literal(data, ts, ts + 1)
654
+ fexec ts + 1;
638
655
  end
639
656
  };
640
657
 
@@ -722,21 +739,16 @@ class Regexp::Scanner
722
739
  #
723
740
  # This method may raise errors if a syntax error is encountered.
724
741
  # --------------------------------------------------------------------------
725
- def self.scan(input_object, &block)
726
- new.scan(input_object, &block)
742
+ def self.scan(input_object, options: nil, &block)
743
+ new.scan(input_object, options: options, &block)
727
744
  end
728
745
 
729
- def scan(input_object, &block)
746
+ def scan(input_object, options: nil, &block)
730
747
  self.literal = nil
731
748
  stack = []
732
749
 
733
- if input_object.is_a?(Regexp)
734
- input = input_object.source
735
- self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
736
- else
737
- input = input_object
738
- self.free_spacing = false
739
- end
750
+ input = input_object.is_a?(Regexp) ? input_object.source : input_object
751
+ self.free_spacing = free_spacing?(input_object, options)
740
752
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
741
753
 
742
754
  data = input.unpack("c*") if input.is_a?(String)
@@ -802,6 +814,18 @@ class Regexp::Scanner
802
814
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
803
815
  :group_depth, :set_depth, :conditional_stack
804
816
 
817
+ def free_spacing?(input_object, options)
818
+ if options && !input_object.is_a?(String)
819
+ raise ArgumentError, 'options cannot be supplied unless scanning a String'
820
+ end
821
+
822
+ options = input_object.options if input_object.is_a?(::Regexp)
823
+
824
+ return false unless options
825
+
826
+ options & Regexp::EXTENDED != 0
827
+ end
828
+
805
829
  def in_group?
806
830
  group_depth > 0
807
831
  end
@@ -53,6 +53,10 @@ module Regexp::Syntax
53
53
 
54
54
  Age_V2_6_0 = [:'age=11.0']
55
55
 
56
+ Age_V2_6_2 = [:'age=12.0']
57
+
58
+ Age_V2_6_3 = [:'age=12.1']
59
+
56
60
  Age = all[:Age_V]
57
61
 
58
62
  Derived_V1_9_0 = [
@@ -297,6 +301,18 @@ module Regexp::Syntax
297
301
  :sogdian,
298
302
  ]
299
303
 
304
+ Script_V2_6_2 = [
305
+ :egyptian_hieroglyph_format_controls,
306
+ :elymaic,
307
+ :nandinagari,
308
+ :nyiakeng_puachue_hmong,
309
+ :ottoman_siyaq_numbers,
310
+ :small_kana_extension,
311
+ :symbols_and_pictographs_extended_a,
312
+ :tamil_supplement,
313
+ :wancho,
314
+ ]
315
+
300
316
  Script = all[:Script_V]
301
317
 
302
318
  UnicodeBlock_V1_9_0 = [
@@ -612,6 +628,18 @@ module Regexp::Syntax
612
628
  :in_sogdian,
613
629
  ]
614
630
 
631
+ UnicodeBlock_V2_6_2 = [
632
+ :in_egyptian_hieroglyph_format_controls,
633
+ :in_elymaic,
634
+ :in_nandinagari,
635
+ :in_nyiakeng_puachue_hmong,
636
+ :in_ottoman_siyaq_numbers,
637
+ :in_small_kana_extension,
638
+ :in_symbols_and_pictographs_extended_a,
639
+ :in_tamil_supplement,
640
+ :in_wancho,
641
+ ]
642
+
615
643
  UnicodeBlock = all[:UnicodeBlock_V]
616
644
 
617
645
  Emoji_V2_5_0 = [
@@ -632,6 +660,8 @@ module Regexp::Syntax
632
660
  V2_4_0 = all[:V2_4_0]
633
661
  V2_5_0 = all[:V2_5_0]
634
662
  V2_6_0 = all[:V2_6_0]
663
+ V2_6_2 = all[:V2_6_2]
664
+ V2_6_3 = all[:V2_6_3]
635
665
 
636
666
  All = all[/^V\d+_\d+_\d+$/]
637
667
 
@@ -0,0 +1,10 @@
1
+ module Regexp::Syntax
2
+ class V2_6_2 < Regexp::Syntax::V2_6_0
3
+ def initialize
4
+ super
5
+
6
+ implements :property, UnicodeProperty::V2_6_2
7
+ implements :nonproperty, UnicodeProperty::V2_6_2
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,10 @@
1
+ module Regexp::Syntax
2
+ class V2_6_3 < Regexp::Syntax::V2_6_2
3
+ def initialize
4
+ super
5
+
6
+ implements :property, UnicodeProperty::V2_6_3
7
+ implements :nonproperty, UnicodeProperty::V2_6_3
8
+ end
9
+ end
10
+ end
@@ -1,5 +1,5 @@
1
1
  class Regexp
2
2
  class Parser
3
- VERSION = '1.5.1'
3
+ VERSION = '1.8.1'
4
4
  end
5
5
  end
@@ -32,5 +32,5 @@ Gem::Specification.new do |gem|
32
32
 
33
33
  gem.platform = Gem::Platform::RUBY
34
34
 
35
- gem.required_ruby_version = '>= 1.9.1'
35
+ gem.required_ruby_version = '>= 2.0.0'
36
36
  end
@@ -120,6 +120,13 @@ RSpec.describe(Regexp::MatchLength) do
120
120
  expect { result.next }.to raise_error(StopIteration)
121
121
  end
122
122
 
123
+ it 'is aware of limit option even if called without a block' do
124
+ result = ML.of(/a?/).each(limit: 1)
125
+ expect(result).to be_a(Enumerator)
126
+ expect(result.next).to eq 0
127
+ expect { result.next }.to raise_error(StopIteration)
128
+ end
129
+
123
130
  it 'is limited to 1000 iterations in case there are infinite match lengths' do
124
131
  expect(ML.of(/a*/).first(3000).size).to eq 1000
125
132
  end
@@ -39,6 +39,17 @@ RSpec.describe('Subexpression#traverse') do
39
39
  expect(visits).to eq 9
40
40
  end
41
41
 
42
+ specify('Subexpression#traverse without a block') do
43
+ root = RP.parse(/abc/)
44
+ enum = root.traverse
45
+
46
+ expect(enum).to be_a(Enumerator)
47
+ event, expr, idx = enum.next
48
+ expect(event).to eq(:visit)
49
+ expect(expr).to be_a(Regexp::Expression::Literal)
50
+ expect(idx).to eq(0)
51
+ end
52
+
42
53
  specify('Subexpression#walk alias') do
43
54
  root = RP.parse(/abc/)
44
55
 
@@ -81,6 +92,16 @@ RSpec.describe('Subexpression#traverse') do
81
92
  expect(indices).to eq [0, 0, 1, 0, 2]
82
93
  end
83
94
 
95
+ specify('Subexpression#each_expression without a block') do
96
+ root = RP.parse(/abc/)
97
+ enum = root.each_expression
98
+
99
+ expect(enum).to be_a(Enumerator)
100
+ expr, idx = enum.next
101
+ expect(expr).to be_a(Regexp::Expression::Literal)
102
+ expect(idx).to eq(0)
103
+ end
104
+
84
105
  specify('Subexpression#flat_map without block') do
85
106
  root = RP.parse(/a(b([c-e]+))?/)
86
107
 
@@ -85,44 +85,44 @@ RSpec.describe('Expression#options') do
85
85
  .and change { exp.unicode_classes? }.from(false).to(true)
86
86
  end
87
87
 
88
- RSpec.shared_examples '#options' do |regexp, klass, at: []|
88
+ RSpec.shared_examples '#options' do |regexp, path, klass|
89
89
  it "works for expression class #{klass}" do
90
- exp = RP.parse(/#{regexp.source}/i).dig(*at)
90
+ exp = RP.parse(/#{regexp.source}/i).dig(*path)
91
91
  expect(exp).to be_a(klass)
92
92
  expect(exp).to be_i
93
93
  expect(exp).not_to be_x
94
94
  end
95
95
  end
96
96
 
97
- include_examples '#options', //, Root
98
- include_examples '#options', /a/, Literal, at: [0]
99
- include_examples '#options', /\A/, Anchor::Base, at: [0]
100
- include_examples '#options', /\d/, CharacterType::Base, at: [0]
101
- include_examples '#options', /\n/, EscapeSequence::Base, at: [0]
102
- include_examples '#options', /\K/, Keep::Mark, at: [0]
103
- include_examples '#options', /./, CharacterType::Any, at: [0]
104
- include_examples '#options', /(a)/, Group::Base, at: [0]
105
- include_examples '#options', /(a)/, Literal, at: [0, 0]
106
- include_examples '#options', /(?=a)/, Assertion::Base, at: [0]
107
- include_examples '#options', /(?=a)/, Literal, at: [0, 0]
108
- include_examples '#options', /(a|b)/, Group::Base, at: [0]
109
- include_examples '#options', /(a|b)/, Alternation, at: [0, 0]
110
- include_examples '#options', /(a|b)/, Alternative, at: [0, 0, 0]
111
- include_examples '#options', /(a|b)/, Literal, at: [0, 0, 0, 0]
112
- include_examples '#options', /(a)\1/, Backreference::Base, at: [1]
113
- include_examples '#options', /(a)\k<1>/, Backreference::Number, at: [1]
114
- include_examples '#options', /(a)\g<1>/, Backreference::NumberCall, at: [1]
115
- include_examples '#options', /[a]/, CharacterSet, at: [0]
116
- include_examples '#options', /[a]/, Literal, at: [0, 0]
117
- include_examples '#options', /[a-z]/, CharacterSet::Range, at: [0, 0]
118
- include_examples '#options', /[a-z]/, Literal, at: [0, 0, 0]
119
- include_examples '#options', /[a&&z]/, CharacterSet::Intersection, at: [0, 0]
120
- include_examples '#options', /[a&&z]/, CharacterSet::IntersectedSequence, at: [0, 0, 0]
121
- include_examples '#options', /[a&&z]/, Literal, at: [0, 0, 0, 0]
122
- include_examples '#options', /[[:ascii:]]/, PosixClass, at: [0, 0]
123
- include_examples '#options', /\p{word}/, UnicodeProperty::Base, at: [0]
124
- include_examples '#options', /(a)(?(1)b|c)/, Conditional::Expression, at: [1]
125
- include_examples '#options', /(a)(?(1)b|c)/, Conditional::Condition, at: [1, 0]
126
- include_examples '#options', /(a)(?(1)b|c)/, Conditional::Branch, at: [1, 1]
127
- include_examples '#options', /(a)(?(1)b|c)/, Literal, at: [1, 1, 0]
97
+ include_examples '#options', //, [], Root
98
+ include_examples '#options', /a/, [0], Literal
99
+ include_examples '#options', /\A/, [0], Anchor::Base
100
+ include_examples '#options', /\d/, [0], CharacterType::Base
101
+ include_examples '#options', /\n/, [0], EscapeSequence::Base
102
+ include_examples '#options', /\K/, [0], Keep::Mark
103
+ include_examples '#options', /./, [0], CharacterType::Any
104
+ include_examples '#options', /(a)/, [0], Group::Base
105
+ include_examples '#options', /(a)/, [0, 0], Literal
106
+ include_examples '#options', /(?=a)/, [0], Assertion::Base
107
+ include_examples '#options', /(?=a)/, [0, 0], Literal
108
+ include_examples '#options', /(a|b)/, [0], Group::Base
109
+ include_examples '#options', /(a|b)/, [0, 0], Alternation
110
+ include_examples '#options', /(a|b)/, [0, 0, 0], Alternative
111
+ include_examples '#options', /(a|b)/, [0, 0, 0, 0], Literal
112
+ include_examples '#options', /(a)\1/, [1], Backreference::Base
113
+ include_examples '#options', /(a)\k<1>/, [1], Backreference::Number
114
+ include_examples '#options', /(a)\g<1>/, [1], Backreference::NumberCall
115
+ include_examples '#options', /[a]/, [0], CharacterSet
116
+ include_examples '#options', /[a]/, [0, 0], Literal
117
+ include_examples '#options', /[a-z]/, [0, 0], CharacterSet::Range
118
+ include_examples '#options', /[a-z]/, [0, 0, 0], Literal
119
+ include_examples '#options', /[a&&z]/, [0, 0], CharacterSet::Intersection
120
+ include_examples '#options', /[a&&z]/, [0, 0, 0], CharacterSet::IntersectedSequence
121
+ include_examples '#options', /[a&&z]/, [0, 0, 0, 0], Literal
122
+ include_examples '#options', /[[:ascii:]]/, [0, 0], PosixClass
123
+ include_examples '#options', /\p{word}/, [0], UnicodeProperty::Base
124
+ include_examples '#options', /(a)(?(1)b|c)/, [1], Conditional::Expression
125
+ include_examples '#options', /(a)(?(1)b|c)/, [1, 0], Conditional::Condition
126
+ include_examples '#options', /(a)(?(1)b|c)/, [1, 1], Conditional::Branch
127
+ include_examples '#options', /(a)(?(1)b|c)/, [1, 1, 0], Literal
128
128
  end