regexp_parser 2.10.0 → 2.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f6ed5457d89738fa1076cf3875cd2d009973f02857ea68e055ef3ef74a78dc91
4
- data.tar.gz: d67eb5f0cb37ad106574b2ae327eefcfc13c9d585cddec6661898f4d8166ebcc
3
+ metadata.gz: d7598b7311a82778cbcb493188dad178ce93c8478e420cd9e2382732ee90d4e1
4
+ data.tar.gz: 60a8399981030bdef025cf9657e043a5ccac93adeee62a589a8adb41ec460664
5
5
  SHA512:
6
- metadata.gz: 6b8adbc3c4707fc4c823456ae1d7547f17568802de03008a17fef18a5f95af08b0e42d48ccdfab25a740603a58ab89c036d70cec94405701201e5a5af51ce392
7
- data.tar.gz: 9bea98a42ab64a9b45ddc5564cd077d7eb6d2ddc293844759bb8001aa9fefd8aa26b0e03fff7a286ccde9f7aeacacda9fbb187fe04082749d3c2605e0cece7b9
6
+ metadata.gz: a7ac06fda5f76d4497b8f01d1e724917d009f7c9ea10befcf03a801af8e769b52619433a22cc997cf584b03e1ca9e6ced257f5fc07e327c966f5c25714d2d0b4
7
+ data.tar.gz: 3d3f89a383bb63208a41801ea059bfc407ff2e88d657d23b0f13740d418335ad47c9f5174bc1d5b7f06841d7a461828c57efa1f97f8bc1b9b42e255959bd18cf
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2010, 2012-2024, Ammar Ali
1
+ Copyright (c) 2010, 2012-2025, Ammar Ali
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person
4
4
  obtaining a copy of this software and associated documentation
data/Rakefile CHANGED
@@ -14,10 +14,10 @@ RSpec::Core::RakeTask.new(:spec)
14
14
  task :default => [:'test:full']
15
15
 
16
16
  namespace :test do
17
- task full: [:'ragel:rb', :spec]
17
+ task full: [:ragel, :spec]
18
18
  end
19
19
 
20
20
  # Add ragel task as a prerequisite for building the gem to ensure that the
21
21
  # latest scanner code is generated and included in the build.
22
- desc "Runs ragel:rb before building the gem"
23
- task :build => ['ragel:rb']
22
+ desc "Runs ragel before building the gem"
23
+ task build: :ragel
@@ -18,6 +18,7 @@ module Regexp::Expression
18
18
  Codepoint = Class.new(Base) # e.g. \u000A
19
19
 
20
20
  CodepointList = Class.new(Base) # e.g. \u{A B}
21
+ UTF8Hex = Class.new(Base) # e.g. \xE2\x82\xAC
21
22
 
22
23
  AbstractMetaControlSequence = Class.new(Base)
23
24
  Control = Class.new(AbstractMetaControlSequence) # e.g. \cB
@@ -15,6 +15,12 @@ module Regexp::Expression::EscapeSequence
15
15
  Hex.class_eval { def codepoint; text[/\h+/].hex end }
16
16
  Codepoint.class_eval { def codepoint; text[/\h+/].hex end }
17
17
 
18
+ UTF8Hex.class_eval do
19
+ def codepoint
20
+ text.scan(/\h+/).map(&:hex).pack('C*').force_encoding('utf-8').ord
21
+ end
22
+ end
23
+
18
24
  CodepointList.class_eval do
19
25
  # Maybe this should be a unique top-level expression class?
20
26
  def char
@@ -70,11 +70,12 @@ module Regexp::Expression
70
70
  # lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
71
71
  #
72
72
  def to_s(format = :full)
73
- base = parts.each_with_object(''.dup) do |part, buff|
73
+ base = ''.dup
74
+ parts.each do |part|
74
75
  if part.instance_of?(String)
75
- buff << part
76
+ base << part
76
77
  elsif !part.custom_to_s_handling
77
- buff << part.to_s(:original)
78
+ base << part.to_s(:original)
78
79
  end
79
80
  end
80
81
  "#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
@@ -319,6 +319,7 @@ class Regexp::Parser
319
319
  when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
320
320
  when :hex; node << EscapeSequence::Hex.new(token, active_opts)
321
321
  when :octal; node << EscapeSequence::Octal.new(token, active_opts)
322
+ when :utf8_hex; node << EscapeSequence::UTF8Hex.new(token, active_opts)
322
323
 
323
324
  when :control
324
325
  if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
@@ -9,6 +9,7 @@ age=13.0,age=13.0
9
9
  age=14.0,age=14.0
10
10
  age=15.0,age=15.0
11
11
  age=15.1,age=15.1
12
+ age=16.0,age=16.0
12
13
  age=2.0,age=2.0
13
14
  age=2.1,age=2.1
14
15
  age=3.0,age=3.0
@@ -103,6 +104,7 @@ extendedpictographic,extended_pictographic
103
104
  extender,extender
104
105
  finalpunctuation,final_punctuation
105
106
  format,format
107
+ garay,garay
106
108
  georgian,georgian
107
109
  glagolitic,glagolitic
108
110
  gothic,gothic
@@ -128,6 +130,7 @@ greek,greek
128
130
  gujarati,gujarati
129
131
  gunjalagondi,gunjala_gondi
130
132
  gurmukhi,gurmukhi
133
+ gurungkhema,gurung_khema
131
134
  han,han
132
135
  hangul,hangul
133
136
  hanifirohingya,hanifi_rohingya
@@ -241,6 +244,7 @@ induployan,in_duployan
241
244
  inearlydynasticcuneiform,in_early_dynastic_cuneiform
242
245
  inegyptianhieroglyphformatcontrols,in_egyptian_hieroglyph_format_controls
243
246
  inegyptianhieroglyphs,in_egyptian_hieroglyphs
247
+ inegyptianhieroglyphsextendeda,in_egyptian_hieroglyphs_extended_a
244
248
  inelbasan,in_elbasan
245
249
  inelymaic,in_elymaic
246
250
  inemoticons,in_emoticons
@@ -253,6 +257,7 @@ inethiopicextended,in_ethiopic_extended
253
257
  inethiopicextendeda,in_ethiopic_extended_a
254
258
  inethiopicextendedb,in_ethiopic_extended_b
255
259
  inethiopicsupplement,in_ethiopic_supplement
260
+ ingaray,in_garay
256
261
  ingeneralpunctuation,in_general_punctuation
257
262
  ingeometricshapes,in_geometric_shapes
258
263
  ingeometricshapesextended,in_geometric_shapes_extended
@@ -268,6 +273,7 @@ ingreekextended,in_greek_extended
268
273
  ingujarati,in_gujarati
269
274
  ingunjalagondi,in_gunjala_gondi
270
275
  ingurmukhi,in_gurmukhi
276
+ ingurungkhema,in_gurung_khema
271
277
  inhalfwidthandfullwidthforms,in_halfwidth_and_fullwidth_forms
272
278
  inhangulcompatibilityjamo,in_hangul_compatibility_jamo
273
279
  inhanguljamo,in_hangul_jamo
@@ -309,6 +315,7 @@ inkhmer,in_khmer
309
315
  inkhmersymbols,in_khmer_symbols
310
316
  inkhojki,in_khojki
311
317
  inkhudawadi,in_khudawadi
318
+ inkiratrai,in_kirat_rai
312
319
  inlao,in_lao
313
320
  inlatin1supplement,in_latin_1_supplement
314
321
  inlatinextendeda,in_latin_extended_a
@@ -364,6 +371,7 @@ inmusicalsymbols,in_musical_symbols
364
371
  inmyanmar,in_myanmar
365
372
  inmyanmarextendeda,in_myanmar_extended_a
366
373
  inmyanmarextendedb,in_myanmar_extended_b
374
+ inmyanmarextendedc,in_myanmar_extended_c
367
375
  innabataean,in_nabataean
368
376
  innagmundari,in_nag_mundari
369
377
  innandinagari,in_nandinagari
@@ -385,6 +393,7 @@ inoldsogdian,in_old_sogdian
385
393
  inoldsoutharabian,in_old_south_arabian
386
394
  inoldturkic,in_old_turkic
387
395
  inolduyghur,in_old_uyghur
396
+ inolonal,in_ol_onal
388
397
  inopticalcharacterrecognition,in_optical_character_recognition
389
398
  inoriya,in_oriya
390
399
  inornamentaldingbats,in_ornamental_dingbats
@@ -424,6 +433,7 @@ inspacingmodifierletters,in_spacing_modifier_letters
424
433
  inspecials,in_specials
425
434
  insundanese,in_sundanese
426
435
  insundanesesupplement,in_sundanese_supplement
436
+ insunuwar,in_sunuwar
427
437
  insuperscriptsandsubscripts,in_superscripts_and_subscripts
428
438
  insupplementalarrowsa,in_supplemental_arrows_a
429
439
  insupplementalarrowsb,in_supplemental_arrows_b
@@ -437,6 +447,7 @@ insuttonsignwriting,in_sutton_signwriting
437
447
  insylotinagri,in_syloti_nagri
438
448
  insymbolsandpictographsextendeda,in_symbols_and_pictographs_extended_a
439
449
  insymbolsforlegacycomputing,in_symbols_for_legacy_computing
450
+ insymbolsforlegacycomputingsupplement,in_symbols_for_legacy_computing_supplement
440
451
  insyriac,in_syriac
441
452
  insyriacsupplement,in_syriac_supplement
442
453
  intagalog,in_tagalog
@@ -459,8 +470,10 @@ inthai,in_thai
459
470
  intibetan,in_tibetan
460
471
  intifinagh,in_tifinagh
461
472
  intirhuta,in_tirhuta
473
+ intodhri,in_todhri
462
474
  intoto,in_toto
463
475
  intransportandmapsymbols,in_transport_and_map_symbols
476
+ intulutigalari,in_tulu_tigalari
464
477
  inugaritic,in_ugaritic
465
478
  inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics
466
479
  inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended
@@ -491,6 +504,7 @@ khitansmallscript,khitan_small_script
491
504
  khmer,khmer
492
505
  khojki,khojki
493
506
  khudawadi,khudawadi
507
+ kiratrai,kirat_rai
494
508
  lao,lao
495
509
  latin,latin
496
510
  lepcha,lepcha
@@ -524,6 +538,7 @@ meroiticcursive,meroitic_cursive
524
538
  meroitichieroglyphs,meroitic_hieroglyphs
525
539
  miao,miao
526
540
  modi,modi
541
+ modifiercombiningmark,modifier_combining_mark
527
542
  modifierletter,modifier_letter
528
543
  modifiersymbol,modifier_symbol
529
544
  mongolian,mongolian
@@ -553,6 +568,7 @@ oldsogdian,old_sogdian
553
568
  oldsoutharabian,old_south_arabian
554
569
  oldturkic,old_turkic
555
570
  olduyghur,old_uyghur
571
+ olonal,ol_onal
556
572
  openpunctuation,open_punctuation
557
573
  oriya,oriya
558
574
  osage,osage
@@ -606,6 +622,7 @@ space,space
606
622
  spaceseparator,space_separator
607
623
  spacingmark,spacing_mark
608
624
  sundanese,sundanese
625
+ sunuwar,sunuwar
609
626
  surrogate,surrogate
610
627
  sylotinagri,syloti_nagri
611
628
  symbol,symbol
@@ -627,7 +644,9 @@ tibetan,tibetan
627
644
  tifinagh,tifinagh
628
645
  tirhuta,tirhuta
629
646
  titlecaseletter,titlecase_letter
647
+ todhri,todhri
630
648
  toto,toto
649
+ tulutigalari,tulu_tigalari
631
650
  ugaritic,ugaritic
632
651
  unassigned,unassigned
633
652
  unifiedideograph,unified_ideograph
@@ -58,6 +58,7 @@ epres,emoji_presentation
58
58
  ethi,ethiopic
59
59
  ext,extender
60
60
  extpict,extended_pictographic
61
+ gara,garay
61
62
  geor,georgian
62
63
  glag,glagolitic
63
64
  gong,gunjala_gondi
@@ -69,6 +70,7 @@ grek,greek
69
70
  grext,grapheme_extend
70
71
  grlink,grapheme_link
71
72
  gujr,gujarati
73
+ gukh,gurung_khema
72
74
  guru,gurmukhi
73
75
  hang,hangul
74
76
  hani,han
@@ -97,6 +99,7 @@ khmr,khmer
97
99
  khoj,khojki
98
100
  kits,khitan_small_script
99
101
  knda,kannada
102
+ krai,kirat_rai
100
103
  kthi,kaithi
101
104
  l,letter
102
105
  lana,tai_tham
@@ -122,6 +125,7 @@ mand,mandaic
122
125
  mani,manichaean
123
126
  marc,marchen
124
127
  mc,spacing_mark
128
+ mcm,modifier_combining_mark
125
129
  me,enclosing_mark
126
130
  medf,medefaidrin
127
131
  mend,mende_kikakui
@@ -154,6 +158,7 @@ oids,other_id_start
154
158
  olck,ol_chiki
155
159
  olower,other_lowercase
156
160
  omath,other_math
161
+ onao,ol_onal
157
162
  orkh,old_turkic
158
163
  orya,oriya
159
164
  osge,osage
@@ -208,6 +213,7 @@ sora,sora_sompeng
208
213
  soyo,soyombo
209
214
  sterm,sentence_terminal
210
215
  sund,sundanese
216
+ sunu,sunuwar
211
217
  sylo,syloti_nagri
212
218
  syrc,syriac
213
219
  tagb,tagbanwa
@@ -225,6 +231,8 @@ thaa,thaana
225
231
  tibt,tibetan
226
232
  tirh,tirhuta
227
233
  tnsa,tangsa
234
+ todr,todhri
235
+ tutg,tulu_tigalari
228
236
  ugar,ugaritic
229
237
  uideo,unified_ideograph
230
238
  vaii,vai
@@ -37,7 +37,8 @@
37
37
  octal_sequence = [0-7]{1,3};
38
38
 
39
39
  hex_sequence = 'x' . xdigit{1,2};
40
- hex_sequence_err = 'x' . [^0-9a-fA-F{];
40
+ hex_sequence_err = 'x' . [^0-9A-Fa-f];
41
+ high_hex_sequence = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
41
42
 
42
43
  codepoint_single = 'u' . xdigit{4};
43
44
  codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
@@ -210,7 +211,7 @@
210
211
  type = :nonposixclass
211
212
  end
212
213
 
213
- unless self.class.posix_classes.include?(class_name)
214
+ unless POSIX_CLASSES[class_name]
214
215
  raise ValidationError.for(:posix_class, text)
215
216
  end
216
217
 
@@ -256,9 +257,21 @@
256
257
  # escape sequence scanner
257
258
  # --------------------------------------------------------------------------
258
259
  escape_sequence := |*
259
- [1-9] {
260
+ [1-9] . [0-9]* {
260
261
  text = copy(data, ts-1, te)
261
- emit(:backref, :number, text)
262
+
263
+ # If not enough groups have been opened, there is a fallback to either an
264
+ # octal or literal interpretation for 2+ digit numerical escapes.
265
+ digits = text[1..-1]
266
+ if digits.size == 1 || digits.to_i <= capturing_group_count
267
+ emit(:backref, :number, text)
268
+ elsif digits =~ /\A[0-7]{2,}\z/
269
+ emit(:escape, :octal, text)
270
+ else
271
+ emit(:escape, :literal, text[0..1])
272
+ emit(:literal, :literal, text[2..-1])
273
+ end
274
+
262
275
  fret;
263
276
  };
264
277
 
@@ -321,6 +334,16 @@
321
334
  fret;
322
335
  };
323
336
 
337
+ high_hex_sequence > (escaped_alpha, 5) {
338
+ text = copy(data, ts-1, te)
339
+ if regexp_encoding == Encoding::BINARY
340
+ text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
341
+ else
342
+ emit(:escape, :utf8_hex, text)
343
+ end
344
+ fret;
345
+ };
346
+
324
347
  hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
325
348
  emit(:escape, :hex, copy(data, ts-1, te))
326
349
  fret;
@@ -514,6 +537,7 @@
514
537
  };
515
538
 
516
539
  group_open @group_opened {
540
+ self.capturing_group_count = capturing_group_count + 1
517
541
  text = copy(data, ts, te)
518
542
  emit(:group, :capture, text)
519
543
  };
@@ -662,6 +686,7 @@ class Regexp::Scanner
662
686
 
663
687
  input = input_object.is_a?(Regexp) ? input_object.source : input_object
664
688
  self.free_spacing = free_spacing?(input_object, options)
689
+ self.regexp_encoding = input_object.encoding if input_object.is_a?(::Regexp)
665
690
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
666
691
 
667
692
  data = input.unpack("c*")
@@ -672,6 +697,7 @@ class Regexp::Scanner
672
697
 
673
698
  self.set_depth = 0
674
699
  self.group_depth = 0
700
+ self.capturing_group_count = 0
675
701
  self.conditional_stack = []
676
702
  self.char_pos = 0
677
703
 
@@ -711,10 +737,11 @@ class Regexp::Scanner
711
737
  File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
712
738
  end
713
739
 
714
- def self.posix_classes
740
+ # Use each_with_object for required_ruby_version >= 2.2, or #to_h for >= 2.6
741
+ POSIX_CLASSES =
715
742
  %w[alnum alpha ascii blank cntrl digit graph
716
743
  lower print punct space upper word xdigit]
717
- end
744
+ .inject({}) { |o, e| o.merge(e => true) }.freeze
718
745
 
719
746
  # Emits an array with the details of the scanned pattern
720
747
  def emit(type, token, text)
@@ -742,13 +769,14 @@ class Regexp::Scanner
742
769
  end
743
770
  end
744
771
 
745
- attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
772
+ attr_accessor :capturing_group_count, :literal_run # only public for #||= to work on ruby <= 2.5
746
773
 
747
774
  private
748
775
 
749
776
  attr_accessor :block,
750
777
  :collect_tokens, :tokens, :prev_token,
751
778
  :free_spacing, :spacing_stack,
779
+ :regexp_encoding,
752
780
  :group_depth, :set_depth, :conditional_stack,
753
781
  :char_pos
754
782