regexp_parser 2.8.1 → 2.11.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. checksums.yaml +4 -4
  2. data/Gemfile +6 -4
  3. data/LICENSE +1 -1
  4. data/Rakefile +5 -3
  5. data/lib/regexp_parser/error.rb +2 -0
  6. data/lib/regexp_parser/expression/base.rb +2 -0
  7. data/lib/regexp_parser/expression/classes/alternation.rb +2 -0
  8. data/lib/regexp_parser/expression/classes/anchor.rb +2 -0
  9. data/lib/regexp_parser/expression/classes/backreference.rb +3 -20
  10. data/lib/regexp_parser/expression/classes/character_set/intersection.rb +2 -0
  11. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -0
  12. data/lib/regexp_parser/expression/classes/character_set.rb +3 -4
  13. data/lib/regexp_parser/expression/classes/character_type.rb +2 -0
  14. data/lib/regexp_parser/expression/classes/conditional.rb +2 -14
  15. data/lib/regexp_parser/expression/classes/escape_sequence.rb +26 -95
  16. data/lib/regexp_parser/expression/classes/free_space.rb +2 -0
  17. data/lib/regexp_parser/expression/classes/group.rb +2 -0
  18. data/lib/regexp_parser/expression/classes/keep.rb +3 -1
  19. data/lib/regexp_parser/expression/classes/literal.rb +2 -0
  20. data/lib/regexp_parser/expression/classes/posix_class.rb +2 -4
  21. data/lib/regexp_parser/expression/classes/root.rb +2 -0
  22. data/lib/regexp_parser/expression/classes/unicode_property.rb +8 -9
  23. data/lib/regexp_parser/expression/methods/construct.rb +2 -0
  24. data/lib/regexp_parser/expression/methods/escape_sequence_char.rb +7 -0
  25. data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +76 -0
  26. data/lib/regexp_parser/expression/methods/human_name.rb +2 -0
  27. data/lib/regexp_parser/expression/methods/match.rb +2 -0
  28. data/lib/regexp_parser/expression/methods/match_length.rb +2 -0
  29. data/lib/regexp_parser/expression/methods/negative.rb +22 -0
  30. data/lib/regexp_parser/expression/methods/options.rb +2 -0
  31. data/lib/regexp_parser/expression/methods/parts.rb +2 -0
  32. data/lib/regexp_parser/expression/methods/printing.rb +2 -0
  33. data/lib/regexp_parser/expression/methods/referenced_expressions.rb +30 -0
  34. data/lib/regexp_parser/expression/methods/strfregexp.rb +2 -0
  35. data/lib/regexp_parser/expression/methods/tests.rb +2 -0
  36. data/lib/regexp_parser/expression/methods/traverse.rb +2 -0
  37. data/lib/regexp_parser/expression/quantifier.rb +3 -1
  38. data/lib/regexp_parser/expression/sequence.rb +2 -0
  39. data/lib/regexp_parser/expression/sequence_operation.rb +2 -0
  40. data/lib/regexp_parser/expression/shared.rb +6 -3
  41. data/lib/regexp_parser/expression/subexpression.rb +2 -0
  42. data/lib/regexp_parser/expression.rb +39 -33
  43. data/lib/regexp_parser/lexer.rb +2 -0
  44. data/lib/regexp_parser/parser.rb +16 -9
  45. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +2 -0
  46. data/lib/regexp_parser/scanner/errors/scanner_error.rb +3 -1
  47. data/lib/regexp_parser/scanner/errors/validation_error.rb +2 -0
  48. data/lib/regexp_parser/scanner/properties/long.csv +37 -0
  49. data/lib/regexp_parser/scanner/properties/short.csv +9 -0
  50. data/lib/regexp_parser/scanner/scanner.rl +62 -18
  51. data/lib/regexp_parser/scanner.rb +1041 -936
  52. data/lib/regexp_parser/syntax/any.rb +2 -0
  53. data/lib/regexp_parser/syntax/base.rb +2 -0
  54. data/lib/regexp_parser/syntax/token/anchor.rb +5 -3
  55. data/lib/regexp_parser/syntax/token/assertion.rb +4 -2
  56. data/lib/regexp_parser/syntax/token/backreference.rb +8 -6
  57. data/lib/regexp_parser/syntax/token/character_set.rb +3 -1
  58. data/lib/regexp_parser/syntax/token/character_type.rb +6 -4
  59. data/lib/regexp_parser/syntax/token/conditional.rb +5 -3
  60. data/lib/regexp_parser/syntax/token/escape.rb +9 -7
  61. data/lib/regexp_parser/syntax/token/group.rb +8 -6
  62. data/lib/regexp_parser/syntax/token/keep.rb +3 -1
  63. data/lib/regexp_parser/syntax/token/meta.rb +4 -2
  64. data/lib/regexp_parser/syntax/token/posix_class.rb +4 -2
  65. data/lib/regexp_parser/syntax/token/quantifier.rb +8 -6
  66. data/lib/regexp_parser/syntax/token/unicode_property.rb +79 -46
  67. data/lib/regexp_parser/syntax/token/virtual.rb +5 -3
  68. data/lib/regexp_parser/syntax/token.rb +18 -16
  69. data/lib/regexp_parser/syntax/version_lookup.rb +4 -2
  70. data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -0
  71. data/lib/regexp_parser/syntax/versions/1.9.1.rb +2 -0
  72. data/lib/regexp_parser/syntax/versions/1.9.3.rb +2 -0
  73. data/lib/regexp_parser/syntax/versions/2.0.0.rb +2 -0
  74. data/lib/regexp_parser/syntax/versions/2.2.0.rb +2 -0
  75. data/lib/regexp_parser/syntax/versions/2.3.0.rb +2 -0
  76. data/lib/regexp_parser/syntax/versions/2.4.0.rb +2 -0
  77. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -0
  78. data/lib/regexp_parser/syntax/versions/2.5.0.rb +2 -0
  79. data/lib/regexp_parser/syntax/versions/2.6.0.rb +2 -0
  80. data/lib/regexp_parser/syntax/versions/2.6.2.rb +2 -0
  81. data/lib/regexp_parser/syntax/versions/2.6.3.rb +2 -0
  82. data/lib/regexp_parser/syntax/versions/3.1.0.rb +2 -0
  83. data/lib/regexp_parser/syntax/versions/3.2.0.rb +2 -0
  84. data/lib/regexp_parser/syntax/versions/3.5.0.rb +4 -0
  85. data/lib/regexp_parser/syntax/versions.rb +3 -1
  86. data/lib/regexp_parser/syntax.rb +3 -1
  87. data/lib/regexp_parser/token.rb +2 -0
  88. data/lib/regexp_parser/version.rb +3 -1
  89. data/lib/regexp_parser.rb +8 -6
  90. data/regexp_parser.gemspec +7 -5
  91. metadata +12 -11
  92. data/CHANGELOG.md +0 -691
  93. data/README.md +0 -506
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Regexp::Expression
2
4
  module Shared
3
5
  module ClassMethods; end # filled in ./methods/*.rb
@@ -70,11 +72,12 @@ module Regexp::Expression
70
72
  # lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
71
73
  #
72
74
  def to_s(format = :full)
73
- base = parts.each_with_object(''.dup) do |part, buff|
75
+ base = ''.dup
76
+ parts.each do |part|
74
77
  if part.instance_of?(String)
75
- buff << part
78
+ base << part
76
79
  elsif !part.custom_to_s_handling
77
- buff << part.to_s(:original)
80
+ base << part.to_s(:original)
78
81
  end
79
82
  end
80
83
  "#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Regexp::Expression
2
4
  class Subexpression < Regexp::Expression::Base
3
5
  include Enumerable
@@ -1,36 +1,42 @@
1
- require 'regexp_parser/error'
1
+ # frozen_string_literal: true
2
2
 
3
- require 'regexp_parser/expression/shared'
4
- require 'regexp_parser/expression/base'
5
- require 'regexp_parser/expression/quantifier'
6
- require 'regexp_parser/expression/subexpression'
7
- require 'regexp_parser/expression/sequence'
8
- require 'regexp_parser/expression/sequence_operation'
3
+ require_relative 'error'
9
4
 
10
- require 'regexp_parser/expression/classes/alternation'
11
- require 'regexp_parser/expression/classes/anchor'
12
- require 'regexp_parser/expression/classes/backreference'
13
- require 'regexp_parser/expression/classes/character_set'
14
- require 'regexp_parser/expression/classes/character_set/intersection'
15
- require 'regexp_parser/expression/classes/character_set/range'
16
- require 'regexp_parser/expression/classes/character_type'
17
- require 'regexp_parser/expression/classes/conditional'
18
- require 'regexp_parser/expression/classes/escape_sequence'
19
- require 'regexp_parser/expression/classes/free_space'
20
- require 'regexp_parser/expression/classes/group'
21
- require 'regexp_parser/expression/classes/keep'
22
- require 'regexp_parser/expression/classes/literal'
23
- require 'regexp_parser/expression/classes/posix_class'
24
- require 'regexp_parser/expression/classes/root'
25
- require 'regexp_parser/expression/classes/unicode_property'
5
+ require_relative 'expression/shared'
6
+ require_relative 'expression/base'
7
+ require_relative 'expression/quantifier'
8
+ require_relative 'expression/subexpression'
9
+ require_relative 'expression/sequence'
10
+ require_relative 'expression/sequence_operation'
26
11
 
27
- require 'regexp_parser/expression/methods/construct'
28
- require 'regexp_parser/expression/methods/human_name'
29
- require 'regexp_parser/expression/methods/match'
30
- require 'regexp_parser/expression/methods/match_length'
31
- require 'regexp_parser/expression/methods/options'
32
- require 'regexp_parser/expression/methods/parts'
33
- require 'regexp_parser/expression/methods/printing'
34
- require 'regexp_parser/expression/methods/strfregexp'
35
- require 'regexp_parser/expression/methods/tests'
36
- require 'regexp_parser/expression/methods/traverse'
12
+ require_relative 'expression/classes/alternation'
13
+ require_relative 'expression/classes/anchor'
14
+ require_relative 'expression/classes/backreference'
15
+ require_relative 'expression/classes/character_set'
16
+ require_relative 'expression/classes/character_set/intersection'
17
+ require_relative 'expression/classes/character_set/range'
18
+ require_relative 'expression/classes/character_type'
19
+ require_relative 'expression/classes/conditional'
20
+ require_relative 'expression/classes/escape_sequence'
21
+ require_relative 'expression/classes/free_space'
22
+ require_relative 'expression/classes/group'
23
+ require_relative 'expression/classes/keep'
24
+ require_relative 'expression/classes/literal'
25
+ require_relative 'expression/classes/posix_class'
26
+ require_relative 'expression/classes/root'
27
+ require_relative 'expression/classes/unicode_property'
28
+
29
+ require_relative 'expression/methods/construct'
30
+ require_relative 'expression/methods/escape_sequence_char'
31
+ require_relative 'expression/methods/escape_sequence_codepoint'
32
+ require_relative 'expression/methods/human_name'
33
+ require_relative 'expression/methods/match'
34
+ require_relative 'expression/methods/match_length'
35
+ require_relative 'expression/methods/negative'
36
+ require_relative 'expression/methods/options'
37
+ require_relative 'expression/methods/parts'
38
+ require_relative 'expression/methods/printing'
39
+ require_relative 'expression/methods/referenced_expressions'
40
+ require_relative 'expression/methods/strfregexp'
41
+ require_relative 'expression/methods/tests'
42
+ require_relative 'expression/methods/traverse'
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # A very thin wrapper around the scanner that breaks quantified literal runs,
2
4
  # collects emitted tokens into an array, calculates their nesting depth, and
3
5
  # normalizes tokens for the parser, and checks if they are implemented by the
@@ -1,5 +1,7 @@
1
- require 'regexp_parser/error'
2
- require 'regexp_parser/expression'
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'error'
4
+ require_relative 'expression'
3
5
 
4
6
  class Regexp::Parser
5
7
  include Regexp::Expression
@@ -319,6 +321,7 @@ class Regexp::Parser
319
321
  when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
320
322
  when :hex; node << EscapeSequence::Hex.new(token, active_opts)
321
323
  when :octal; node << EscapeSequence::Octal.new(token, active_opts)
324
+ when :utf8_hex; node << EscapeSequence::UTF8Hex.new(token, active_opts)
322
325
 
323
326
  when :control
324
327
  if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
@@ -467,6 +470,7 @@ class Regexp::Parser
467
470
  when *UPTokens::Age; node << UP::Age.new(token, active_opts)
468
471
  when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
469
472
  when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
473
+ when *UPTokens::Enumerated; node << UP::Enumerated.new(token, active_opts)
470
474
  when *UPTokens::Script; node << UP::Script.new(token, active_opts)
471
475
  when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
472
476
 
@@ -574,21 +578,24 @@ class Regexp::Parser
574
578
  options_stack.last
575
579
  end
576
580
 
577
- # Assigns referenced expressions to refering expressions, e.g. if there is
581
+ # Assigns referenced expressions to referring expressions, e.g. if there is
578
582
  # an instance of Backreference::Number, its #referenced_expression is set to
579
583
  # the instance of Group::Capture that it refers to via its number.
580
584
  def assign_referenced_expressions
581
- # find all referencable and refering expressions
582
- targets = { 0 => root }
585
+ # find all referenceable and referring expressions
586
+ targets = { 0 => [root] }
583
587
  referrers = []
584
588
  root.each_expression do |exp|
585
- exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
586
- referrers << exp if exp.referential?
589
+ if exp.referential?
590
+ referrers << exp
591
+ elsif exp.is_a?(Group::Capture)
592
+ (targets[exp.identifier] ||= []) << exp
593
+ end
587
594
  end
588
- # assign reference expression to refering expressions
595
+ # assign referenced expressions to referring expressions
589
596
  # (in a second iteration because there might be forward references)
590
597
  referrers.each do |exp|
591
- exp.referenced_expression = targets[exp.reference] ||
598
+ exp.referenced_expressions = targets[exp.reference] ||
592
599
  raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
593
600
  end
594
601
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Regexp::Scanner
2
4
  # Unexpected end of pattern
3
5
  class PrematureEndError < ScannerError
@@ -1,4 +1,6 @@
1
- require 'regexp_parser/error'
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../../regexp_parser/error'
2
4
 
3
5
  class Regexp::Scanner
4
6
  # General scanner error (catch all)
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Regexp::Scanner
2
4
  # Base for all scanner validation errors
3
5
  class ValidationError < ScannerError
@@ -8,6 +8,8 @@ age=12.1,age=12.1
8
8
  age=13.0,age=13.0
9
9
  age=14.0,age=14.0
10
10
  age=15.0,age=15.0
11
+ age=15.1,age=15.1
12
+ age=16.0,age=16.0
11
13
  age=2.0,age=2.0
12
14
  age=2.1,age=2.1
13
15
  age=3.0,age=3.0
@@ -102,18 +104,33 @@ extendedpictographic,extended_pictographic
102
104
  extender,extender
103
105
  finalpunctuation,final_punctuation
104
106
  format,format
107
+ garay,garay
105
108
  georgian,georgian
106
109
  glagolitic,glagolitic
107
110
  gothic,gothic
108
111
  grantha,grantha
109
112
  graph,graph
110
113
  graphemebase,grapheme_base
114
+ graphemeclusterbreak=control,grapheme_cluster_break=control
115
+ graphemeclusterbreak=cr,grapheme_cluster_break=cr
116
+ graphemeclusterbreak=extend,grapheme_cluster_break=extend
117
+ graphemeclusterbreak=l,grapheme_cluster_break=l
118
+ graphemeclusterbreak=lf,grapheme_cluster_break=lf
119
+ graphemeclusterbreak=lv,grapheme_cluster_break=lv
120
+ graphemeclusterbreak=lvt,grapheme_cluster_break=lvt
121
+ graphemeclusterbreak=prepend,grapheme_cluster_break=prepend
122
+ graphemeclusterbreak=regionalindicator,grapheme_cluster_break=regional_indicator
123
+ graphemeclusterbreak=spacingmark,grapheme_cluster_break=spacingmark
124
+ graphemeclusterbreak=t,grapheme_cluster_break=t
125
+ graphemeclusterbreak=v,grapheme_cluster_break=v
126
+ graphemeclusterbreak=zwj,grapheme_cluster_break=zwj
111
127
  graphemeextend,grapheme_extend
112
128
  graphemelink,grapheme_link
113
129
  greek,greek
114
130
  gujarati,gujarati
115
131
  gunjalagondi,gunjala_gondi
116
132
  gurmukhi,gurmukhi
133
+ gurungkhema,gurung_khema
117
134
  han,han
118
135
  hangul,hangul
119
136
  hanifirohingya,hanifi_rohingya
@@ -123,11 +140,14 @@ hebrew,hebrew
123
140
  hexdigit,hex_digit
124
141
  hiragana,hiragana
125
142
  hyphen,hyphen
143
+ idcompatmathcontinue,id_compat_math_continue
144
+ idcompatmathstart,id_compat_math_start
126
145
  idcontinue,id_continue
127
146
  ideographic,ideographic
128
147
  idsbinaryoperator,ids_binary_operator
129
148
  idstart,id_start
130
149
  idstrinaryoperator,ids_trinary_operator
150
+ idsunaryoperator,ids_unary_operator
131
151
  imperialaramaic,imperial_aramaic
132
152
  inadlam,in_adlam
133
153
  inaegeannumbers,in_aegean_numbers
@@ -190,6 +210,7 @@ incjkunifiedideographsextensione,in_cjk_unified_ideographs_extension_e
190
210
  incjkunifiedideographsextensionf,in_cjk_unified_ideographs_extension_f
191
211
  incjkunifiedideographsextensiong,in_cjk_unified_ideographs_extension_g
192
212
  incjkunifiedideographsextensionh,in_cjk_unified_ideographs_extension_h
213
+ incjkunifiedideographsextensioni,in_cjk_unified_ideographs_extension_i
193
214
  incombiningdiacriticalmarks,in_combining_diacritical_marks
194
215
  incombiningdiacriticalmarksextended,in_combining_diacritical_marks_extended
195
216
  incombiningdiacriticalmarksforsymbols,in_combining_diacritical_marks_for_symbols
@@ -223,6 +244,7 @@ induployan,in_duployan
223
244
  inearlydynasticcuneiform,in_early_dynastic_cuneiform
224
245
  inegyptianhieroglyphformatcontrols,in_egyptian_hieroglyph_format_controls
225
246
  inegyptianhieroglyphs,in_egyptian_hieroglyphs
247
+ inegyptianhieroglyphsextendeda,in_egyptian_hieroglyphs_extended_a
226
248
  inelbasan,in_elbasan
227
249
  inelymaic,in_elymaic
228
250
  inemoticons,in_emoticons
@@ -235,6 +257,7 @@ inethiopicextended,in_ethiopic_extended
235
257
  inethiopicextendeda,in_ethiopic_extended_a
236
258
  inethiopicextendedb,in_ethiopic_extended_b
237
259
  inethiopicsupplement,in_ethiopic_supplement
260
+ ingaray,in_garay
238
261
  ingeneralpunctuation,in_general_punctuation
239
262
  ingeometricshapes,in_geometric_shapes
240
263
  ingeometricshapesextended,in_geometric_shapes_extended
@@ -250,6 +273,7 @@ ingreekextended,in_greek_extended
250
273
  ingujarati,in_gujarati
251
274
  ingunjalagondi,in_gunjala_gondi
252
275
  ingurmukhi,in_gurmukhi
276
+ ingurungkhema,in_gurung_khema
253
277
  inhalfwidthandfullwidthforms,in_halfwidth_and_fullwidth_forms
254
278
  inhangulcompatibilityjamo,in_hangul_compatibility_jamo
255
279
  inhanguljamo,in_hangul_jamo
@@ -291,6 +315,7 @@ inkhmer,in_khmer
291
315
  inkhmersymbols,in_khmer_symbols
292
316
  inkhojki,in_khojki
293
317
  inkhudawadi,in_khudawadi
318
+ inkiratrai,in_kirat_rai
294
319
  inlao,in_lao
295
320
  inlatin1supplement,in_latin_1_supplement
296
321
  inlatinextendeda,in_latin_extended_a
@@ -346,6 +371,7 @@ inmusicalsymbols,in_musical_symbols
346
371
  inmyanmar,in_myanmar
347
372
  inmyanmarextendeda,in_myanmar_extended_a
348
373
  inmyanmarextendedb,in_myanmar_extended_b
374
+ inmyanmarextendedc,in_myanmar_extended_c
349
375
  innabataean,in_nabataean
350
376
  innagmundari,in_nag_mundari
351
377
  innandinagari,in_nandinagari
@@ -367,6 +393,7 @@ inoldsogdian,in_old_sogdian
367
393
  inoldsoutharabian,in_old_south_arabian
368
394
  inoldturkic,in_old_turkic
369
395
  inolduyghur,in_old_uyghur
396
+ inolonal,in_ol_onal
370
397
  inopticalcharacterrecognition,in_optical_character_recognition
371
398
  inoriya,in_oriya
372
399
  inornamentaldingbats,in_ornamental_dingbats
@@ -406,6 +433,7 @@ inspacingmodifierletters,in_spacing_modifier_letters
406
433
  inspecials,in_specials
407
434
  insundanese,in_sundanese
408
435
  insundanesesupplement,in_sundanese_supplement
436
+ insunuwar,in_sunuwar
409
437
  insuperscriptsandsubscripts,in_superscripts_and_subscripts
410
438
  insupplementalarrowsa,in_supplemental_arrows_a
411
439
  insupplementalarrowsb,in_supplemental_arrows_b
@@ -419,6 +447,7 @@ insuttonsignwriting,in_sutton_signwriting
419
447
  insylotinagri,in_syloti_nagri
420
448
  insymbolsandpictographsextendeda,in_symbols_and_pictographs_extended_a
421
449
  insymbolsforlegacycomputing,in_symbols_for_legacy_computing
450
+ insymbolsforlegacycomputingsupplement,in_symbols_for_legacy_computing_supplement
422
451
  insyriac,in_syriac
423
452
  insyriacsupplement,in_syriac_supplement
424
453
  intagalog,in_tagalog
@@ -441,8 +470,10 @@ inthai,in_thai
441
470
  intibetan,in_tibetan
442
471
  intifinagh,in_tifinagh
443
472
  intirhuta,in_tirhuta
473
+ intodhri,in_todhri
444
474
  intoto,in_toto
445
475
  intransportandmapsymbols,in_transport_and_map_symbols
476
+ intulutigalari,in_tulu_tigalari
446
477
  inugaritic,in_ugaritic
447
478
  inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics
448
479
  inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended
@@ -473,6 +504,7 @@ khitansmallscript,khitan_small_script
473
504
  khmer,khmer
474
505
  khojki,khojki
475
506
  khudawadi,khudawadi
507
+ kiratrai,kirat_rai
476
508
  lao,lao
477
509
  latin,latin
478
510
  lepcha,lepcha
@@ -506,6 +538,7 @@ meroiticcursive,meroitic_cursive
506
538
  meroitichieroglyphs,meroitic_hieroglyphs
507
539
  miao,miao
508
540
  modi,modi
541
+ modifiercombiningmark,modifier_combining_mark
509
542
  modifierletter,modifier_letter
510
543
  modifiersymbol,modifier_symbol
511
544
  mongolian,mongolian
@@ -535,6 +568,7 @@ oldsogdian,old_sogdian
535
568
  oldsoutharabian,old_south_arabian
536
569
  oldturkic,old_turkic
537
570
  olduyghur,old_uyghur
571
+ olonal,ol_onal
538
572
  openpunctuation,open_punctuation
539
573
  oriya,oriya
540
574
  osage,osage
@@ -588,6 +622,7 @@ space,space
588
622
  spaceseparator,space_separator
589
623
  spacingmark,spacing_mark
590
624
  sundanese,sundanese
625
+ sunuwar,sunuwar
591
626
  surrogate,surrogate
592
627
  sylotinagri,syloti_nagri
593
628
  symbol,symbol
@@ -609,7 +644,9 @@ tibetan,tibetan
609
644
  tifinagh,tifinagh
610
645
  tirhuta,tirhuta
611
646
  titlecaseletter,titlecase_letter
647
+ todhri,todhri
612
648
  toto,toto
649
+ tulutigalari,tulu_tigalari
613
650
  ugaritic,ugaritic
614
651
  unassigned,unassigned
615
652
  unifiedideograph,unified_ideograph
@@ -58,6 +58,7 @@ epres,emoji_presentation
58
58
  ethi,ethiopic
59
59
  ext,extender
60
60
  extpict,extended_pictographic
61
+ gara,garay
61
62
  geor,georgian
62
63
  glag,glagolitic
63
64
  gong,gunjala_gondi
@@ -69,6 +70,7 @@ grek,greek
69
70
  grext,grapheme_extend
70
71
  grlink,grapheme_link
71
72
  gujr,gujarati
73
+ gukh,gurung_khema
72
74
  guru,gurmukhi
73
75
  hang,hangul
74
76
  hani,han
@@ -86,6 +88,7 @@ ideo,ideographic
86
88
  ids,id_start
87
89
  idsb,ids_binary_operator
88
90
  idst,ids_trinary_operator
91
+ idsu,ids_unary_operator
89
92
  ital,old_italic
90
93
  java,javanese
91
94
  joinc,join_control
@@ -96,6 +99,7 @@ khmr,khmer
96
99
  khoj,khojki
97
100
  kits,khitan_small_script
98
101
  knda,kannada
102
+ krai,kirat_rai
99
103
  kthi,kaithi
100
104
  l,letter
101
105
  lana,tai_tham
@@ -121,6 +125,7 @@ mand,mandaic
121
125
  mani,manichaean
122
126
  marc,marchen
123
127
  mc,spacing_mark
128
+ mcm,modifier_combining_mark
124
129
  me,enclosing_mark
125
130
  medf,medefaidrin
126
131
  mend,mende_kikakui
@@ -153,6 +158,7 @@ oids,other_id_start
153
158
  olck,ol_chiki
154
159
  olower,other_lowercase
155
160
  omath,other_math
161
+ onao,ol_onal
156
162
  orkh,old_turkic
157
163
  orya,oriya
158
164
  osge,osage
@@ -207,6 +213,7 @@ sora,sora_sompeng
207
213
  soyo,soyombo
208
214
  sterm,sentence_terminal
209
215
  sund,sundanese
216
+ sunu,sunuwar
210
217
  sylo,syloti_nagri
211
218
  syrc,syriac
212
219
  tagb,tagbanwa
@@ -224,6 +231,8 @@ thaa,thaana
224
231
  tibt,tibetan
225
232
  tirh,tirhuta
226
233
  tnsa,tangsa
234
+ todr,todhri
235
+ tutg,tulu_tigalari
227
236
  ugar,ugaritic
228
237
  uideo,unified_ideograph
229
238
  vaii,vai
@@ -37,7 +37,8 @@
37
37
  octal_sequence = [0-7]{1,3};
38
38
 
39
39
  hex_sequence = 'x' . xdigit{1,2};
40
- hex_sequence_err = 'x' . [^0-9a-fA-F{];
40
+ hex_sequence_err = 'x' . [^0-9A-Fa-f];
41
+ high_hex_sequence = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
41
42
 
42
43
  codepoint_single = 'u' . xdigit{4};
43
44
  codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
@@ -78,8 +79,8 @@
78
79
  # try to treat every other group head as options group, like Ruby
79
80
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
80
81
 
81
- group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
82
- group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
82
+ group_name_id_ab = ([^!=0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
83
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
83
84
  group_number = '-'? . [0-9]+;
84
85
  group_level = [+\-] . [0-9]+;
85
86
 
@@ -210,7 +211,7 @@
210
211
  type = :nonposixclass
211
212
  end
212
213
 
213
- unless self.class.posix_classes.include?(class_name)
214
+ unless POSIX_CLASSES[class_name]
214
215
  raise ValidationError.for(:posix_class, text)
215
216
  end
216
217
 
@@ -246,7 +247,7 @@
246
247
  # Treat all remaining escapes - those not supported in sets - as literal.
247
248
  # (This currently includes \^, \-, \&, \:, although these could potentially
248
249
  # be meta chars when not escaped, depending on their position in the set.)
249
- any > (escaped_set_alpha, 1) {
250
+ (any | utf8_multibyte) > (escaped_set_alpha, 1) {
250
251
  emit(:escape, :literal, copy(data, ts-1, te))
251
252
  fret;
252
253
  };
@@ -256,9 +257,21 @@
256
257
  # escape sequence scanner
257
258
  # --------------------------------------------------------------------------
258
259
  escape_sequence := |*
259
- [1-9] {
260
+ [1-9] . [0-9]* {
260
261
  text = copy(data, ts-1, te)
261
- emit(:backref, :number, text)
262
+
263
+ # If not enough groups have been opened, there is a fallback to either an
264
+ # octal or literal interpretation for 2+ digit numerical escapes.
265
+ digits = text[1..-1]
266
+ if digits.size == 1 || digits.to_i <= capturing_group_count
267
+ emit(:backref, :number, text)
268
+ elsif digits =~ /\A[0-7]{2,}\z/
269
+ emit(:escape, :octal, text)
270
+ else
271
+ emit(:escape, :literal, text[0..1])
272
+ emit(:literal, :literal, text[2..-1])
273
+ end
274
+
262
275
  fret;
263
276
  };
264
277
 
@@ -267,6 +280,13 @@
267
280
  fret;
268
281
  };
269
282
 
283
+ [8-9] . [0-9] { # special case, emits two tokens
284
+ text = copy(data, ts-1, te)
285
+ emit(:escape, :literal, text[0, 2])
286
+ emit(:literal, :literal, text[2])
287
+ fret;
288
+ };
289
+
270
290
  meta_char {
271
291
  case text = copy(data, ts-1, te)
272
292
  when '\.'; emit(:escape, :dot, text)
@@ -314,6 +334,16 @@
314
334
  fret;
315
335
  };
316
336
 
337
+ high_hex_sequence > (escaped_alpha, 5) {
338
+ text = copy(data, ts-1, te)
339
+ if regexp_encoding == Encoding::BINARY
340
+ text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
341
+ else
342
+ emit(:escape, :utf8_hex, text)
343
+ end
344
+ fret;
345
+ };
346
+
317
347
  hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
318
348
  emit(:escape, :hex, copy(data, ts-1, te))
319
349
  fret;
@@ -357,6 +387,7 @@
357
387
  conditional_expression := |*
358
388
  group_lookup . ')' {
359
389
  text = copy(data, ts, te-1)
390
+ text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
360
391
  emit(:conditional, :condition, text)
361
392
  emit(:conditional, :condition_close, ')')
362
393
  };
@@ -506,6 +537,7 @@
506
537
  };
507
538
 
508
539
  group_open @group_opened {
540
+ self.capturing_group_count = capturing_group_count + 1
509
541
  text = copy(data, ts, te)
510
542
  emit(:group, :capture, text)
511
543
  };
@@ -534,13 +566,13 @@
534
566
  case text = copy(data, ts, te)
535
567
  when /^\\k(.)[^0-9\-][^+\-]*['>]$/
536
568
  emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
537
- when /^\\k(.)[1-9]\d*['>]$/
569
+ when /^\\k(.)0*[1-9]\d*['>]$/
538
570
  emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
539
- when /^\\k(.)-[1-9]\d*['>]$/
571
+ when /^\\k(.)-0*[1-9]\d*['>]$/
540
572
  emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
541
573
  when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
542
574
  emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
543
- when /^\\k(.)-?[1-9]\d*[+\-]\d+['>]$/
575
+ when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
544
576
  emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
545
577
  else
546
578
  raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
@@ -553,9 +585,9 @@
553
585
  case text = copy(data, ts, te)
554
586
  when /^\\g(.)[^0-9+\-].*['>]$/
555
587
  emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
556
- when /^\\g(.)\d+['>]$/
588
+ when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
557
589
  emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
558
- when /^\\g(.)[+-]\d+/
590
+ when /^\\g(.)[+-]0*[1-9]\d*/
559
591
  emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
560
592
  else
561
593
  raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
@@ -632,9 +664,9 @@
632
664
  *|;
633
665
  }%%
634
666
 
635
- require 'regexp_parser/scanner/errors/scanner_error'
636
- require 'regexp_parser/scanner/errors/premature_end_error'
637
- require 'regexp_parser/scanner/errors/validation_error'
667
+ require_relative 'scanner/errors/scanner_error'
668
+ require_relative 'scanner/errors/premature_end_error'
669
+ require_relative 'scanner/errors/validation_error'
638
670
 
639
671
  class Regexp::Scanner
640
672
  # Scans the given regular expression text, or Regexp object and collects the
@@ -654,6 +686,7 @@ class Regexp::Scanner
654
686
 
655
687
  input = input_object.is_a?(Regexp) ? input_object.source : input_object
656
688
  self.free_spacing = free_spacing?(input_object, options)
689
+ self.regexp_encoding = extract_encoding(input_object, options)
657
690
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
658
691
 
659
692
  data = input.unpack("c*")
@@ -664,6 +697,7 @@ class Regexp::Scanner
664
697
 
665
698
  self.set_depth = 0
666
699
  self.group_depth = 0
700
+ self.capturing_group_count = 0
667
701
  self.conditional_stack = []
668
702
  self.char_pos = 0
669
703
 
@@ -703,10 +737,11 @@ class Regexp::Scanner
703
737
  File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
704
738
  end
705
739
 
706
- def self.posix_classes
740
+ # Use each_with_object for required_ruby_version >= 2.2, or #to_h for >= 2.6
741
+ POSIX_CLASSES =
707
742
  %w[alnum alpha ascii blank cntrl digit graph
708
743
  lower print punct space upper word xdigit]
709
- end
744
+ .inject({}) { |o, e| o.merge(e => true) }.freeze
710
745
 
711
746
  # Emits an array with the details of the scanned pattern
712
747
  def emit(type, token, text)
@@ -734,16 +769,25 @@ class Regexp::Scanner
734
769
  end
735
770
  end
736
771
 
737
- attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
772
+ attr_accessor :capturing_group_count, :literal_run # only public for #||= to work on ruby <= 2.5
738
773
 
739
774
  private
740
775
 
741
776
  attr_accessor :block,
742
777
  :collect_tokens, :tokens, :prev_token,
743
778
  :free_spacing, :spacing_stack,
779
+ :regexp_encoding,
744
780
  :group_depth, :set_depth, :conditional_stack,
745
781
  :char_pos
746
782
 
783
+ def extract_encoding(input_object, options)
784
+ if input_object.is_a?(::Regexp)
785
+ input_object.encoding
786
+ elsif options && (options & Regexp::NOENCODING)
787
+ Encoding::BINARY
788
+ end
789
+ end
790
+
747
791
  def free_spacing?(input_object, options)
748
792
  if options && !input_object.is_a?(String)
749
793
  raise ArgumentError, 'options cannot be supplied unless scanning a String'