regexp_parser 2.10.0 → 2.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -1
  3. data/LICENSE +1 -1
  4. data/Rakefile +5 -3
  5. data/lib/regexp_parser/error.rb +2 -0
  6. data/lib/regexp_parser/expression/base.rb +2 -0
  7. data/lib/regexp_parser/expression/classes/alternation.rb +2 -0
  8. data/lib/regexp_parser/expression/classes/anchor.rb +2 -0
  9. data/lib/regexp_parser/expression/classes/backreference.rb +2 -0
  10. data/lib/regexp_parser/expression/classes/character_set/intersection.rb +2 -0
  11. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -0
  12. data/lib/regexp_parser/expression/classes/character_set.rb +2 -0
  13. data/lib/regexp_parser/expression/classes/character_type.rb +2 -0
  14. data/lib/regexp_parser/expression/classes/conditional.rb +2 -0
  15. data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -0
  16. data/lib/regexp_parser/expression/classes/free_space.rb +2 -0
  17. data/lib/regexp_parser/expression/classes/group.rb +2 -0
  18. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  19. data/lib/regexp_parser/expression/classes/literal.rb +2 -0
  20. data/lib/regexp_parser/expression/classes/posix_class.rb +2 -0
  21. data/lib/regexp_parser/expression/classes/root.rb +2 -0
  22. data/lib/regexp_parser/expression/classes/unicode_property.rb +2 -0
  23. data/lib/regexp_parser/expression/methods/construct.rb +2 -0
  24. data/lib/regexp_parser/expression/methods/escape_sequence_char.rb +2 -0
  25. data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +8 -0
  26. data/lib/regexp_parser/expression/methods/human_name.rb +2 -0
  27. data/lib/regexp_parser/expression/methods/match.rb +2 -0
  28. data/lib/regexp_parser/expression/methods/match_length.rb +2 -0
  29. data/lib/regexp_parser/expression/methods/negative.rb +2 -0
  30. data/lib/regexp_parser/expression/methods/options.rb +2 -0
  31. data/lib/regexp_parser/expression/methods/parts.rb +2 -0
  32. data/lib/regexp_parser/expression/methods/printing.rb +2 -0
  33. data/lib/regexp_parser/expression/methods/referenced_expressions.rb +2 -0
  34. data/lib/regexp_parser/expression/methods/strfregexp.rb +2 -0
  35. data/lib/regexp_parser/expression/methods/tests.rb +2 -0
  36. data/lib/regexp_parser/expression/methods/traverse.rb +2 -0
  37. data/lib/regexp_parser/expression/quantifier.rb +3 -1
  38. data/lib/regexp_parser/expression/sequence.rb +2 -0
  39. data/lib/regexp_parser/expression/sequence_operation.rb +2 -0
  40. data/lib/regexp_parser/expression/shared.rb +6 -3
  41. data/lib/regexp_parser/expression/subexpression.rb +2 -0
  42. data/lib/regexp_parser/expression.rb +2 -0
  43. data/lib/regexp_parser/lexer.rb +2 -0
  44. data/lib/regexp_parser/parser.rb +3 -0
  45. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +2 -0
  46. data/lib/regexp_parser/scanner/errors/scanner_error.rb +2 -0
  47. data/lib/regexp_parser/scanner/errors/validation_error.rb +2 -0
  48. data/lib/regexp_parser/scanner/properties/long.csv +32 -0
  49. data/lib/regexp_parser/scanner/properties/short.csv +12 -0
  50. data/lib/regexp_parser/scanner/scanner.rl +44 -8
  51. data/lib/regexp_parser/scanner.rb +786 -656
  52. data/lib/regexp_parser/syntax/any.rb +2 -0
  53. data/lib/regexp_parser/syntax/base.rb +2 -0
  54. data/lib/regexp_parser/syntax/token/anchor.rb +5 -3
  55. data/lib/regexp_parser/syntax/token/assertion.rb +4 -2
  56. data/lib/regexp_parser/syntax/token/backreference.rb +8 -6
  57. data/lib/regexp_parser/syntax/token/character_set.rb +3 -1
  58. data/lib/regexp_parser/syntax/token/character_type.rb +6 -4
  59. data/lib/regexp_parser/syntax/token/conditional.rb +5 -3
  60. data/lib/regexp_parser/syntax/token/escape.rb +9 -7
  61. data/lib/regexp_parser/syntax/token/group.rb +8 -6
  62. data/lib/regexp_parser/syntax/token/keep.rb +3 -1
  63. data/lib/regexp_parser/syntax/token/meta.rb +4 -2
  64. data/lib/regexp_parser/syntax/token/posix_class.rb +4 -2
  65. data/lib/regexp_parser/syntax/token/quantifier.rb +8 -6
  66. data/lib/regexp_parser/syntax/token/unicode_property.rb +134 -74
  67. data/lib/regexp_parser/syntax/token/virtual.rb +5 -3
  68. data/lib/regexp_parser/syntax/token.rb +5 -3
  69. data/lib/regexp_parser/syntax/version_lookup.rb +4 -2
  70. data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -0
  71. data/lib/regexp_parser/syntax/versions/1.9.1.rb +2 -0
  72. data/lib/regexp_parser/syntax/versions/1.9.3.rb +2 -0
  73. data/lib/regexp_parser/syntax/versions/2.0.0.rb +2 -0
  74. data/lib/regexp_parser/syntax/versions/2.2.0.rb +2 -0
  75. data/lib/regexp_parser/syntax/versions/2.3.0.rb +2 -0
  76. data/lib/regexp_parser/syntax/versions/2.4.0.rb +2 -0
  77. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -0
  78. data/lib/regexp_parser/syntax/versions/2.5.0.rb +2 -0
  79. data/lib/regexp_parser/syntax/versions/2.6.0.rb +2 -0
  80. data/lib/regexp_parser/syntax/versions/2.6.2.rb +2 -0
  81. data/lib/regexp_parser/syntax/versions/2.6.3.rb +2 -0
  82. data/lib/regexp_parser/syntax/versions/3.1.0.rb +2 -0
  83. data/lib/regexp_parser/syntax/versions/3.2.0.rb +2 -0
  84. data/lib/regexp_parser/syntax/versions/3.5.0.rb +4 -0
  85. data/lib/regexp_parser/syntax/versions/4.0.0.rb +4 -0
  86. data/lib/regexp_parser/syntax/versions.rb +2 -0
  87. data/lib/regexp_parser/syntax.rb +2 -0
  88. data/lib/regexp_parser/token.rb +2 -0
  89. data/lib/regexp_parser/version.rb +3 -1
  90. data/lib/regexp_parser.rb +2 -0
  91. data/regexp_parser.gemspec +2 -0
  92. metadata +5 -6
@@ -11,6 +11,7 @@ bamu,bamum
11
11
  bass,bassa_vah
12
12
  batk,batak
13
13
  beng,bengali
14
+ berf,beria_erfe
14
15
  bhks,bhaiksuki
15
16
  bidic,bidi_control
16
17
  bopo,bopomofo
@@ -58,6 +59,7 @@ epres,emoji_presentation
58
59
  ethi,ethiopic
59
60
  ext,extender
60
61
  extpict,extended_pictographic
62
+ gara,garay
61
63
  geor,georgian
62
64
  glag,glagolitic
63
65
  gong,gunjala_gondi
@@ -69,6 +71,7 @@ grek,greek
69
71
  grext,grapheme_extend
70
72
  grlink,grapheme_link
71
73
  gujr,gujarati
74
+ gukh,gurung_khema
72
75
  guru,gurmukhi
73
76
  hang,hangul
74
77
  hani,han
@@ -97,6 +100,7 @@ khmr,khmer
97
100
  khoj,khojki
98
101
  kits,khitan_small_script
99
102
  knda,kannada
103
+ krai,kirat_rai
100
104
  kthi,kaithi
101
105
  l,letter
102
106
  lana,tai_tham
@@ -122,6 +126,7 @@ mand,mandaic
122
126
  mani,manichaean
123
127
  marc,marchen
124
128
  mc,spacing_mark
129
+ mcm,modifier_combining_mark
125
130
  me,enclosing_mark
126
131
  medf,medefaidrin
127
132
  mend,mende_kikakui
@@ -154,6 +159,7 @@ oids,other_id_start
154
159
  olck,ol_chiki
155
160
  olower,other_lowercase
156
161
  omath,other_math
162
+ onao,ol_onal
157
163
  orkh,old_turkic
158
164
  orya,oriya
159
165
  osge,osage
@@ -197,6 +203,7 @@ sgnw,signwriting
197
203
  shaw,shavian
198
204
  shrd,sharada
199
205
  sidd,siddham
206
+ sidt,sidetic
200
207
  sind,khudawadi
201
208
  sinh,sinhala
202
209
  sk,modifier_symbol
@@ -208,6 +215,7 @@ sora,sora_sompeng
208
215
  soyo,soyombo
209
216
  sterm,sentence_terminal
210
217
  sund,sundanese
218
+ sunu,sunuwar
211
219
  sylo,syloti_nagri
212
220
  syrc,syriac
213
221
  tagb,tagbanwa
@@ -217,6 +225,7 @@ talu,new_tai_lue
217
225
  taml,tamil
218
226
  tang,tangut
219
227
  tavt,tai_viet
228
+ tayo,tai_yo
220
229
  telu,telugu
221
230
  term,terminal_punctuation
222
231
  tfng,tifinagh
@@ -225,6 +234,9 @@ thaa,thaana
225
234
  tibt,tibetan
226
235
  tirh,tirhuta
227
236
  tnsa,tangsa
237
+ todr,todhri
238
+ tols,tolong_siki
239
+ tutg,tulu_tigalari
228
240
  ugar,ugaritic
229
241
  uideo,unified_ideograph
230
242
  vaii,vai
@@ -37,7 +37,8 @@
37
37
  octal_sequence = [0-7]{1,3};
38
38
 
39
39
  hex_sequence = 'x' . xdigit{1,2};
40
- hex_sequence_err = 'x' . [^0-9a-fA-F{];
40
+ hex_sequence_err = 'x' . [^0-9A-Fa-f];
41
+ high_hex_sequence = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
41
42
 
42
43
  codepoint_single = 'u' . xdigit{4};
43
44
  codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
@@ -210,7 +211,7 @@
210
211
  type = :nonposixclass
211
212
  end
212
213
 
213
- unless self.class.posix_classes.include?(class_name)
214
+ unless POSIX_CLASSES[class_name]
214
215
  raise ValidationError.for(:posix_class, text)
215
216
  end
216
217
 
@@ -246,7 +247,7 @@
246
247
  # Treat all remaining escapes - those not supported in sets - as literal.
247
248
  # (This currently includes \^, \-, \&, \:, although these could potentially
248
249
  # be meta chars when not escaped, depending on their position in the set.)
249
- any > (escaped_set_alpha, 1) {
250
+ (any | utf8_multibyte) > (escaped_set_alpha, 1) {
250
251
  emit(:escape, :literal, copy(data, ts-1, te))
251
252
  fret;
252
253
  };
@@ -256,9 +257,21 @@
256
257
  # escape sequence scanner
257
258
  # --------------------------------------------------------------------------
258
259
  escape_sequence := |*
259
- [1-9] {
260
+ [1-9] . [0-9]* {
260
261
  text = copy(data, ts-1, te)
261
- emit(:backref, :number, text)
262
+
263
+ # If not enough groups have been opened, there is a fallback to either an
264
+ # octal or literal interpretation for 2+ digit numerical escapes.
265
+ digits = text[1..-1]
266
+ if digits.size == 1 || digits.to_i <= capturing_group_count
267
+ emit(:backref, :number, text)
268
+ elsif digits =~ /\A[0-7]{2,}\z/
269
+ emit(:escape, :octal, text)
270
+ else
271
+ emit(:escape, :literal, text[0..1])
272
+ emit(:literal, :literal, text[2..-1])
273
+ end
274
+
262
275
  fret;
263
276
  };
264
277
 
@@ -321,6 +334,16 @@
321
334
  fret;
322
335
  };
323
336
 
337
+ high_hex_sequence > (escaped_alpha, 5) {
338
+ text = copy(data, ts-1, te)
339
+ if regexp_encoding == Encoding::BINARY
340
+ text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
341
+ else
342
+ emit(:escape, :utf8_hex, text)
343
+ end
344
+ fret;
345
+ };
346
+
324
347
  hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
325
348
  emit(:escape, :hex, copy(data, ts-1, te))
326
349
  fret;
@@ -514,6 +537,7 @@
514
537
  };
515
538
 
516
539
  group_open @group_opened {
540
+ self.capturing_group_count = capturing_group_count + 1
517
541
  text = copy(data, ts, te)
518
542
  emit(:group, :capture, text)
519
543
  };
@@ -662,6 +686,7 @@ class Regexp::Scanner
662
686
 
663
687
  input = input_object.is_a?(Regexp) ? input_object.source : input_object
664
688
  self.free_spacing = free_spacing?(input_object, options)
689
+ self.regexp_encoding = extract_encoding(input_object, options)
665
690
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
666
691
 
667
692
  data = input.unpack("c*")
@@ -672,6 +697,7 @@ class Regexp::Scanner
672
697
 
673
698
  self.set_depth = 0
674
699
  self.group_depth = 0
700
+ self.capturing_group_count = 0
675
701
  self.conditional_stack = []
676
702
  self.char_pos = 0
677
703
 
@@ -711,10 +737,11 @@ class Regexp::Scanner
711
737
  File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
712
738
  end
713
739
 
714
- def self.posix_classes
740
+ # Use each_with_object for required_ruby_version >= 2.2, or #to_h for >= 2.6
741
+ POSIX_CLASSES =
715
742
  %w[alnum alpha ascii blank cntrl digit graph
716
743
  lower print punct space upper word xdigit]
717
- end
744
+ .inject({}) { |o, e| o.merge(e => true) }.freeze
718
745
 
719
746
  # Emits an array with the details of the scanned pattern
720
747
  def emit(type, token, text)
@@ -742,16 +769,25 @@ class Regexp::Scanner
742
769
  end
743
770
  end
744
771
 
745
- attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
772
+ attr_accessor :capturing_group_count, :literal_run # only public for #||= to work on ruby <= 2.5
746
773
 
747
774
  private
748
775
 
749
776
  attr_accessor :block,
750
777
  :collect_tokens, :tokens, :prev_token,
751
778
  :free_spacing, :spacing_stack,
779
+ :regexp_encoding,
752
780
  :group_depth, :set_depth, :conditional_stack,
753
781
  :char_pos
754
782
 
783
+ def extract_encoding(input_object, options)
784
+ if input_object.is_a?(::Regexp)
785
+ input_object.encoding
786
+ elsif options && (options & Regexp::NOENCODING)
787
+ Encoding::BINARY
788
+ end
789
+ end
790
+
755
791
  def free_spacing?(input_object, options)
756
792
  if options && !input_object.is_a?(String)
757
793
  raise ArgumentError, 'options cannot be supplied unless scanning a String'