regexp_parser 2.2.1 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +70 -6
  3. data/Gemfile +2 -1
  4. data/README.md +23 -9
  5. data/Rakefile +1 -56
  6. data/lib/regexp_parser/error.rb +1 -1
  7. data/lib/regexp_parser/expression/base.rb +9 -57
  8. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -2
  9. data/lib/regexp_parser/expression/classes/character_set.rb +2 -2
  10. data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
  11. data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
  12. data/lib/regexp_parser/expression/classes/group.rb +6 -6
  13. data/lib/regexp_parser/expression/methods/tests.rb +10 -1
  14. data/lib/regexp_parser/expression/quantifier.rb +40 -23
  15. data/lib/regexp_parser/expression/sequence.rb +2 -2
  16. data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
  17. data/lib/regexp_parser/expression/shared.rb +81 -0
  18. data/lib/regexp_parser/expression/subexpression.rb +11 -7
  19. data/lib/regexp_parser/expression.rb +1 -0
  20. data/lib/regexp_parser/lexer.rb +1 -1
  21. data/lib/regexp_parser/parser.rb +12 -60
  22. data/lib/regexp_parser/scanner/properties/long.csv +18 -0
  23. data/lib/regexp_parser/scanner/properties/short.csv +4 -0
  24. data/lib/regexp_parser/scanner/property.rl +1 -1
  25. data/lib/regexp_parser/scanner/scanner.rl +42 -31
  26. data/lib/regexp_parser/scanner.rb +729 -797
  27. data/lib/regexp_parser/syntax/any.rb +2 -5
  28. data/lib/regexp_parser/syntax/base.rb +91 -64
  29. data/lib/regexp_parser/syntax/token/quantifier.rb +4 -4
  30. data/lib/regexp_parser/syntax/token/unicode_property.rb +26 -5
  31. data/lib/regexp_parser/syntax/version_lookup.rb +20 -29
  32. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  33. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  34. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  35. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  36. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  37. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  38. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  39. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  40. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  41. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  42. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  43. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  44. data/lib/regexp_parser/syntax/versions/3.1.0.rb +3 -9
  45. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  46. data/lib/regexp_parser/syntax/versions.rb +1 -1
  47. data/lib/regexp_parser/version.rb +1 -1
  48. metadata +4 -2
@@ -0,0 +1,81 @@
1
+ module Regexp::Expression
2
+ module Shared
3
+ def self.included(mod)
4
+ mod.class_eval do
5
+ attr_accessor :type, :token, :text, :ts, :te,
6
+ :level, :set_level, :conditional_level,
7
+ :options, :quantifier
8
+
9
+ attr_reader :nesting_level
10
+ end
11
+ end
12
+
13
+ def init_from_token_and_options(token, options = {})
14
+ self.type = token.type
15
+ self.token = token.token
16
+ self.text = token.text
17
+ self.ts = token.ts
18
+ self.te = token.te
19
+ self.level = token.level
20
+ self.set_level = token.set_level
21
+ self.conditional_level = token.conditional_level
22
+ self.nesting_level = 0
23
+ self.options = options || {}
24
+ end
25
+ private :init_from_token_and_options
26
+
27
+ def initialize_copy(orig)
28
+ self.text = orig.text.dup if orig.text
29
+ self.options = orig.options.dup if orig.options
30
+ self.quantifier = orig.quantifier.clone if orig.quantifier
31
+ super
32
+ end
33
+
34
+ def starts_at
35
+ ts
36
+ end
37
+
38
+ def base_length
39
+ to_s(:base).length
40
+ end
41
+
42
+ def full_length
43
+ to_s.length
44
+ end
45
+
46
+ def to_s(format = :full)
47
+ "#{parts.join}#{quantifier_affix(format)}"
48
+ end
49
+ alias :to_str :to_s
50
+
51
+ def parts
52
+ [text.dup]
53
+ end
54
+
55
+ def quantifier_affix(expression_format)
56
+ quantifier.to_s if quantified? && expression_format != :base
57
+ end
58
+
59
+ def quantified?
60
+ !quantifier.nil?
61
+ end
62
+
63
+ def offset
64
+ [starts_at, full_length]
65
+ end
66
+
67
+ def coded_offset
68
+ '@%d+%d' % offset
69
+ end
70
+
71
+ def terminal?
72
+ !respond_to?(:expressions)
73
+ end
74
+
75
+ def nesting_level=(lvl)
76
+ @nesting_level = lvl
77
+ quantifier && quantifier.nesting_level = lvl
78
+ terminal? || each { |subexp| subexp.nesting_level = lvl + 1 }
79
+ end
80
+ end
81
+ end
@@ -5,9 +5,8 @@ module Regexp::Expression
5
5
  attr_accessor :expressions
6
6
 
7
7
  def initialize(token, options = {})
8
- super
9
-
10
8
  self.expressions = []
9
+ super
11
10
  end
12
11
 
13
12
  # Override base method to clone the expressions as well.
@@ -43,16 +42,21 @@ module Regexp::Expression
43
42
  ts + to_s.length
44
43
  end
45
44
 
46
- def to_s(format = :full)
47
- # Note: the format does not get passed down to subexpressions.
48
- "#{expressions.join}#{quantifier_affix(format)}"
45
+ def parts
46
+ expressions
49
47
  end
50
48
 
51
49
  def to_h
52
- attributes.merge({
50
+ attributes.merge(
53
51
  text: to_s(:base),
54
52
  expressions: expressions.map(&:to_h)
55
- })
53
+ )
54
+ end
55
+
56
+ private
57
+
58
+ def intersperse(expressions, separator)
59
+ expressions.flat_map { |exp| [exp, separator] }.slice(0...-1)
56
60
  end
57
61
  end
58
62
  end
@@ -1,5 +1,6 @@
1
1
  require 'regexp_parser/error'
2
2
 
3
+ require 'regexp_parser/expression/shared'
3
4
  require 'regexp_parser/expression/base'
4
5
  require 'regexp_parser/expression/quantifier'
5
6
  require 'regexp_parser/expression/subexpression'
@@ -18,7 +18,7 @@ class Regexp::Lexer
18
18
  end
19
19
 
20
20
  def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
21
- syntax = Regexp::Syntax.new(syntax)
21
+ syntax = Regexp::Syntax.for(syntax)
22
22
 
23
23
  self.tokens = []
24
24
  self.nesting = 0
@@ -39,6 +39,9 @@ class Regexp::Parser
39
39
  parse_token(token)
40
40
  end
41
41
 
42
+ # Trigger recursive setting of #nesting_level, which reflects how deep
43
+ # a node is in the tree. Do this at the end to account for tree rewrites.
44
+ root.nesting_level = 0
42
45
  assign_referenced_expressions
43
46
 
44
47
  if block_given?
@@ -286,17 +289,9 @@ class Regexp::Parser
286
289
  def nest(exp)
287
290
  nesting.push(exp)
288
291
  node << exp
289
- update_transplanted_subtree(exp, node)
290
292
  self.node = exp
291
293
  end
292
294
 
293
- # subtrees are transplanted to build Alternations, Intersections, Ranges
294
- def update_transplanted_subtree(exp, new_parent)
295
- exp.nesting_level = new_parent.nesting_level + 1
296
- exp.respond_to?(:each) &&
297
- exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
298
- end
299
-
300
295
  def escape(token)
301
296
  case token.token
302
297
 
@@ -483,7 +478,7 @@ class Regexp::Parser
483
478
  new_token = Regexp::Token.new(
484
479
  :group,
485
480
  :passive,
486
- '', # text
481
+ '', # text (none because this group is implicit)
487
482
  target_node.ts,
488
483
  nil, # te (unused)
489
484
  target_node.level,
@@ -493,66 +488,23 @@ class Regexp::Parser
493
488
  new_group = Group::Passive.new(new_token, active_opts)
494
489
  new_group.implicit = true
495
490
  new_group << target_node
496
- increase_level(target_node)
491
+ increase_group_level(target_node)
497
492
  node.expressions[node.expressions.index(target_node)] = new_group
498
493
  target_node = new_group
499
494
  end
500
495
 
501
- case token.token
502
- when :zero_or_one
503
- target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
504
- when :zero_or_one_reluctant
505
- target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
506
- when :zero_or_one_possessive
507
- target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
508
-
509
- when :zero_or_more
510
- target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
511
- when :zero_or_more_reluctant
512
- target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
513
- when :zero_or_more_possessive
514
- target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
515
-
516
- when :one_or_more
517
- target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
518
- when :one_or_more_reluctant
519
- target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
520
- when :one_or_more_possessive
521
- target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
522
-
523
- when :interval
524
- interval(target_node, token)
525
-
526
- else
496
+ unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
497
+ (?:_greedy|_reluctant|_possessive)?\z/x
527
498
  raise UnknownTokenError.new('Quantifier', token)
528
499
  end
500
+
501
+ target_node.quantify(token, active_opts)
529
502
  end
530
503
 
531
- def increase_level(exp)
504
+ def increase_group_level(exp)
532
505
  exp.level += 1
533
- exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
534
- end
535
-
536
- def interval(target_node, token)
537
- text = token.text
538
- mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
539
- case mchr
540
- when '?'
541
- range_text = text[0...-1]
542
- mode = :reluctant
543
- when '+'
544
- range_text = text[0...-1]
545
- mode = :possessive
546
- else
547
- range_text = text
548
- mode = :greedy
549
- end
550
-
551
- range = range_text.gsub(/\{|\}/, '').split(',', 2)
552
- min = range[0].empty? ? 0 : range[0]
553
- max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
554
-
555
- target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
506
+ exp.quantifier.level += 1 if exp.quantifier
507
+ exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
556
508
  end
557
509
 
558
510
  def set(token)
@@ -6,6 +6,7 @@ age=11.0,age=11.0
6
6
  age=12.0,age=12.0
7
7
  age=12.1,age=12.1
8
8
  age=13.0,age=13.0
9
+ age=14.0,age=14.0
9
10
  age=2.0,age=2.0
10
11
  age=2.1,age=2.1
11
12
  age=3.0,age=3.0
@@ -72,6 +73,7 @@ coptic,coptic
72
73
  cuneiform,cuneiform
73
74
  currencysymbol,currency_symbol
74
75
  cypriot,cypriot
76
+ cyprominoan,cypro_minoan
75
77
  cyrillic,cyrillic
76
78
  dash,dash
77
79
  dashpunctuation,dash_punctuation
@@ -136,6 +138,7 @@ inancientgreeknumbers,in_ancient_greek_numbers
136
138
  inancientsymbols,in_ancient_symbols
137
139
  inarabic,in_arabic
138
140
  inarabicextendeda,in_arabic_extended_a
141
+ inarabicextendedb,in_arabic_extended_b
139
142
  inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols
140
143
  inarabicpresentationformsa,in_arabic_presentation_forms_a
141
144
  inarabicpresentationformsb,in_arabic_presentation_forms_b
@@ -197,6 +200,7 @@ incuneiform,in_cuneiform
197
200
  incuneiformnumbersandpunctuation,in_cuneiform_numbers_and_punctuation
198
201
  incurrencysymbols,in_currency_symbols
199
202
  incypriotsyllabary,in_cypriot_syllabary
203
+ incyprominoan,in_cypro_minoan
200
204
  incyrillic,in_cyrillic
201
205
  incyrillicextendeda,in_cyrillic_extended_a
202
206
  incyrillicextendedb,in_cyrillic_extended_b
@@ -223,6 +227,7 @@ inenclosedideographicsupplement,in_enclosed_ideographic_supplement
223
227
  inethiopic,in_ethiopic
224
228
  inethiopicextended,in_ethiopic_extended
225
229
  inethiopicextendeda,in_ethiopic_extended_a
230
+ inethiopicextendedb,in_ethiopic_extended_b
226
231
  inethiopicsupplement,in_ethiopic_supplement
227
232
  ingeneralpunctuation,in_general_punctuation
228
233
  ingeometricshapes,in_geometric_shapes
@@ -264,6 +269,7 @@ initialpunctuation,initial_punctuation
264
269
  injavanese,in_javanese
265
270
  inkaithi,in_kaithi
266
271
  inkanaextendeda,in_kana_extended_a
272
+ inkanaextendedb,in_kana_extended_b
267
273
  inkanasupplement,in_kana_supplement
268
274
  inkanbun,in_kanbun
269
275
  inkangxiradicals,in_kangxi_radicals
@@ -285,6 +291,8 @@ inlatinextendedb,in_latin_extended_b
285
291
  inlatinextendedc,in_latin_extended_c
286
292
  inlatinextendedd,in_latin_extended_d
287
293
  inlatinextendede,in_latin_extended_e
294
+ inlatinextendedf,in_latin_extended_f
295
+ inlatinextendedg,in_latin_extended_g
288
296
  inlepcha,in_lepcha
289
297
  inletterlikesymbols,in_letterlike_symbols
290
298
  inlimbu,in_limbu
@@ -349,6 +357,7 @@ inoldpersian,in_old_persian
349
357
  inoldsogdian,in_old_sogdian
350
358
  inoldsoutharabian,in_old_south_arabian
351
359
  inoldturkic,in_old_turkic
360
+ inolduyghur,in_old_uyghur
352
361
  inopticalcharacterrecognition,in_optical_character_recognition
353
362
  inoriya,in_oriya
354
363
  inornamentaldingbats,in_ornamental_dingbats
@@ -413,6 +422,7 @@ intaixuanjingsymbols,in_tai_xuan_jing_symbols
413
422
  intakri,in_takri
414
423
  intamil,in_tamil
415
424
  intamilsupplement,in_tamil_supplement
425
+ intangsa,in_tangsa
416
426
  intangut,in_tangut
417
427
  intangutcomponents,in_tangut_components
418
428
  intangutsupplement,in_tangut_supplement
@@ -422,15 +432,18 @@ inthai,in_thai
422
432
  intibetan,in_tibetan
423
433
  intifinagh,in_tifinagh
424
434
  intirhuta,in_tirhuta
435
+ intoto,in_toto
425
436
  intransportandmapsymbols,in_transport_and_map_symbols
426
437
  inugaritic,in_ugaritic
427
438
  inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics
428
439
  inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended
440
+ inunifiedcanadianaboriginalsyllabicsextendeda,in_unified_canadian_aboriginal_syllabics_extended_a
429
441
  invai,in_vai
430
442
  invariationselectors,in_variation_selectors
431
443
  invariationselectorssupplement,in_variation_selectors_supplement
432
444
  invedicextensions,in_vedic_extensions
433
445
  inverticalforms,in_vertical_forms
446
+ invithkuqi,in_vithkuqi
434
447
  inwancho,in_wancho
435
448
  inwarangciti,in_warang_citi
436
449
  inyezidi,in_yezidi
@@ -438,6 +451,7 @@ inyijinghexagramsymbols,in_yijing_hexagram_symbols
438
451
  inyiradicals,in_yi_radicals
439
452
  inyisyllables,in_yi_syllables
440
453
  inzanabazarsquare,in_zanabazar_square
454
+ inznamennymusicalnotation,in_znamenny_musical_notation
441
455
  javanese,javanese
442
456
  joincontrol,join_control
443
457
  kaithi,kaithi
@@ -509,6 +523,7 @@ oldpersian,old_persian
509
523
  oldsogdian,old_sogdian
510
524
  oldsoutharabian,old_south_arabian
511
525
  oldturkic,old_turkic
526
+ olduyghur,old_uyghur
512
527
  openpunctuation,open_punctuation
513
528
  oriya,oriya
514
529
  osage,osage
@@ -573,6 +588,7 @@ taitham,tai_tham
573
588
  taiviet,tai_viet
574
589
  takri,takri
575
590
  tamil,tamil
591
+ tangsa,tangsa
576
592
  tangut,tangut
577
593
  telugu,telugu
578
594
  terminalpunctuation,terminal_punctuation
@@ -582,6 +598,7 @@ tibetan,tibetan
582
598
  tifinagh,tifinagh
583
599
  tirhuta,tirhuta
584
600
  titlecaseletter,titlecase_letter
601
+ toto,toto
585
602
  ugaritic,ugaritic
586
603
  unassigned,unassigned
587
604
  unifiedideograph,unified_ideograph
@@ -591,6 +608,7 @@ uppercase,uppercase
591
608
  uppercaseletter,uppercase_letter
592
609
  vai,vai
593
610
  variationselector,variation_selector
611
+ vithkuqi,vithkuqi
594
612
  wancho,wancho
595
613
  warangciti,warang_citi
596
614
  whitespace,white_space
@@ -31,6 +31,7 @@ cn,unassigned
31
31
  co,private_use
32
32
  combiningmark,mark
33
33
  copt,coptic
34
+ cpmn,cypro_minoan
34
35
  cprt,cypriot
35
36
  cs,surrogate
36
37
  cwcf,changes_when_casefolded
@@ -154,6 +155,7 @@ orkh,old_turkic
154
155
  orya,oriya
155
156
  osge,osage
156
157
  osma,osmanya
158
+ ougr,old_uyghur
157
159
  oupper,other_uppercase
158
160
  p,punctuation
159
161
  palm,palmyrene
@@ -219,9 +221,11 @@ tglg,tagalog
219
221
  thaa,thaana
220
222
  tibt,tibetan
221
223
  tirh,tirhuta
224
+ tnsa,tangsa
222
225
  ugar,ugaritic
223
226
  uideo,unified_ideograph
224
227
  vaii,vai
228
+ vith,vithkuqi
225
229
  vs,variation_selector
226
230
  wara,warang_citi
227
231
  wcho,wancho
@@ -20,7 +20,7 @@
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
21
21
 
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
- raise UnknownUnicodePropertyError.new(name) unless token
23
+ validation_error(:property, name) unless token
24
24
 
25
25
  self.emit(type, token.to_sym, text)
26
26
 
@@ -28,13 +28,7 @@
28
28
 
29
29
  comment = ('#' . [^\n]* . '\n'?);
30
30
 
31
- class_name_posix = 'alnum' | 'alpha' | 'blank' |
32
- 'cntrl' | 'digit' | 'graph' |
33
- 'lower' | 'print' | 'punct' |
34
- 'space' | 'upper' | 'xdigit' |
35
- 'word' | 'ascii';
36
-
37
- class_posix = ('[:' . '^'? . class_name_posix . ':]');
31
+ class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
38
32
 
39
33
 
40
34
  # these are not supported in ruby at the moment
@@ -74,8 +68,7 @@
74
68
  quantity_maximum = ',' . (digit+);
75
69
  quantity_range = (digit+) . ',' . (digit+);
76
70
  quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
77
- quantity_maximum | quantity_range ) . range_close .
78
- quantifier_mode?;
71
+ quantity_maximum | quantity_range ) . range_close;
79
72
 
80
73
  quantifiers = quantifier_greedy | quantifier_reluctant |
81
74
  quantifier_possessive | quantifier_interval;
@@ -223,24 +216,28 @@
223
216
  fcall character_set;
224
217
  };
225
218
 
226
- class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
219
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
227
220
  text = copy(data, ts, te)
228
221
 
229
222
  type = :posixclass
230
223
  class_name = text[2..-3]
231
- if class_name[0].chr == '^'
224
+ if class_name[0] == '^'
232
225
  class_name = class_name[1..-1]
233
226
  type = :nonposixclass
234
227
  end
235
228
 
229
+ unless self.class.posix_classes.include?(class_name)
230
+ validation_error(:posix_class, text)
231
+ end
232
+
236
233
  emit(type, class_name.to_sym, text)
237
234
  };
238
235
 
239
236
  # These are not supported in ruby at the moment. Enable them if they are.
240
- # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
237
+ # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
238
  # emit(:set, :collation, copy(data, ts, te))
242
239
  # };
243
- # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
240
+ # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
244
241
  # emit(:set, :equivalent, copy(data, ts, te))
245
242
  # };
246
243
 
@@ -323,7 +320,7 @@
323
320
 
324
321
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
325
322
  text = copy(data, ts-1, te)
326
- if text[2].chr == '{'
323
+ if text[2] == '{'
327
324
  emit(:escape, :codepoint_list, text)
328
325
  else
329
326
  emit(:escape, :codepoint, text)
@@ -419,12 +416,12 @@
419
416
 
420
417
  backslash . anchor_char > (backslashed, 3) {
421
418
  case text = copy(data, ts, te)
422
- when '\\A'; emit(:anchor, :bos, text)
423
- when '\\z'; emit(:anchor, :eos, text)
424
- when '\\Z'; emit(:anchor, :eos_ob_eol, text)
425
- when '\\b'; emit(:anchor, :word_boundary, text)
426
- when '\\B'; emit(:anchor, :nonword_boundary, text)
427
- when '\\G'; emit(:anchor, :match_start, text)
419
+ when '\A'; emit(:anchor, :bos, text)
420
+ when '\z'; emit(:anchor, :eos, text)
421
+ when '\Z'; emit(:anchor, :eos_ob_eol, text)
422
+ when '\b'; emit(:anchor, :word_boundary, text)
423
+ when '\B'; emit(:anchor, :nonword_boundary, text)
424
+ when '\G'; emit(:anchor, :match_start, text)
428
425
  end
429
426
  };
430
427
 
@@ -477,7 +474,7 @@
477
474
  group_open . group_options >group_opened {
478
475
  text = copy(data, ts, te)
479
476
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
480
- raise InvalidGroupOption.new($1 || "-#{$2}", text)
477
+ validation_error(:group_option, $1 || "-#{$2}", text)
481
478
  end
482
479
  emit_options(text)
483
480
  };
@@ -605,7 +602,7 @@
605
602
  end
606
603
  };
607
604
 
608
- quantifier_interval {
605
+ quantifier_interval {
609
606
  emit(:quantifier, :interval, copy(data, ts, te))
610
607
  };
611
608
 
@@ -686,6 +683,7 @@ class Regexp::Scanner
686
683
  end
687
684
 
688
685
  # Invalid groupOption. Used for inline options.
686
+ # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
689
687
  class InvalidGroupOption < ValidationError
690
688
  def initialize(option, text)
691
689
  super "Invalid group option #{option} in #{text}"
@@ -706,6 +704,13 @@ class Regexp::Scanner
706
704
  end
707
705
  end
708
706
 
707
+ # The POSIX class name was not recognized by the scanner.
708
+ class UnknownPosixClassError < ValidationError
709
+ def initialize(text)
710
+ super "Unknown POSIX class #{text}"
711
+ end
712
+ end
713
+
709
714
  # Scans the given regular expression text, or Regexp object and collects the
710
715
  # emitted token into an array that gets returned at the end. If a block is
711
716
  # given, it gets called for each emitted token.
@@ -771,6 +776,11 @@ class Regexp::Scanner
771
776
  File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
772
777
  end
773
778
 
779
+ def self.posix_classes
780
+ %w[alnum alpha ascii blank cntrl digit graph
781
+ lower print punct space upper word xdigit]
782
+ end
783
+
774
784
  # Emits an array with the details of the scanned pattern
775
785
  def emit(type, token, text)
776
786
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
@@ -873,15 +883,16 @@ class Regexp::Scanner
873
883
 
874
884
  # Centralizes and unifies the handling of validation related
875
885
  # errors.
876
- def validation_error(type, what, reason)
877
- case type
878
- when :group
879
- error = InvalidGroupError.new(what, reason)
880
- when :backref
881
- error = InvalidBackrefError.new(what, reason)
882
- when :sequence
883
- error = InvalidSequenceError.new(what, reason)
884
- end
886
+ def validation_error(type, what, reason = nil)
887
+ error =
888
+ case type
889
+ when :backref then InvalidBackrefError.new(what, reason)
890
+ when :group then InvalidGroupError.new(what, reason)
891
+ when :group_option then InvalidGroupOption.new(what, reason)
892
+ when :posix_class then UnknownPosixClassError.new(what)
893
+ when :property then UnknownUnicodePropertyError.new(what)
894
+ when :sequence then InvalidSequenceError.new(what, reason)
895
+ end
885
896
 
886
897
  raise error # unless @@config.validation_ignore
887
898
  end