RubyGems - regexp_parser - Versions diffs - 2.10.0 → 2.11.1 - Mend

regexp_parser 2.10.0 → 2.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

checksums.yaml +4 -4
data/LICENSE +1 -1
data/Rakefile +3 -3
data/lib/regexp_parser/expression/classes/escape_sequence.rb +1 -0
data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +6 -0
data/lib/regexp_parser/expression/shared.rb +4 -3
data/lib/regexp_parser/parser.rb +1 -0
data/lib/regexp_parser/scanner/properties/long.csv +19 -0
data/lib/regexp_parser/scanner/properties/short.csv +8 -0
data/lib/regexp_parser/scanner/scanner.rl +35 -7
data/lib/regexp_parser/scanner.rb +500 -470
data/lib/regexp_parser/syntax/token/escape.rb +1 -1
data/lib/regexp_parser/syntax/token/unicode_property.rb +13 -0
data/lib/regexp_parser/syntax/versions/3.5.0.rb +4 -0
data/lib/regexp_parser/version.rb +1 -1
metadata +4 -6

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f6ed5457d89738fa1076cf3875cd2d009973f02857ea68e055ef3ef74a78dc91
-  data.tar.gz: d67eb5f0cb37ad106574b2ae327eefcfc13c9d585cddec6661898f4d8166ebcc
+  metadata.gz: d7598b7311a82778cbcb493188dad178ce93c8478e420cd9e2382732ee90d4e1
+  data.tar.gz: 60a8399981030bdef025cf9657e043a5ccac93adeee62a589a8adb41ec460664
 SHA512:
-  metadata.gz: 6b8adbc3c4707fc4c823456ae1d7547f17568802de03008a17fef18a5f95af08b0e42d48ccdfab25a740603a58ab89c036d70cec94405701201e5a5af51ce392
-  data.tar.gz: 9bea98a42ab64a9b45ddc5564cd077d7eb6d2ddc293844759bb8001aa9fefd8aa26b0e03fff7a286ccde9f7aeacacda9fbb187fe04082749d3c2605e0cece7b9
+  metadata.gz: a7ac06fda5f76d4497b8f01d1e724917d009f7c9ea10befcf03a801af8e769b52619433a22cc997cf584b03e1ca9e6ced257f5fc07e327c966f5c25714d2d0b4
+  data.tar.gz: 3d3f89a383bb63208a41801ea059bfc407ff2e88d657d23b0f13740d418335ad47c9f5174bc1d5b7f06841d7a461828c57efa1f97f8bc1b9b42e255959bd18cf

data/LICENSE CHANGED Viewed

@@ -1,4 +1,4 @@
-Copyright (c) 2010, 2012-2024,  Ammar Ali
+Copyright (c) 2010, 2012-2025,  Ammar Ali
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation

data/Rakefile CHANGED Viewed

@@ -14,10 +14,10 @@ RSpec::Core::RakeTask.new(:spec)
 task :default => [:'test:full']
 namespace :test do
-  task full: [:'ragel:rb', :spec]
+  task full: [:ragel, :spec]
 end
 # Add ragel task as a prerequisite for building the gem to ensure that the
 # latest scanner code is generated and included in the build.
-desc "Runs ragel:rb before building the gem"
-task :build => ['ragel:rb']
+desc "Runs ragel before building the gem"
+task build: :ragel

data/lib/regexp_parser/expression/classes/escape_sequence.rb CHANGED Viewed

@@ -18,6 +18,7 @@ module Regexp::Expression
     Codepoint   = Class.new(Base) # e.g. \u000A
     CodepointList = Class.new(Base) # e.g. \u{A B}
+    UTF8Hex       = Class.new(Base) # e.g. \xE2\x82\xAC
     AbstractMetaControlSequence = Class.new(Base)
     Control                     = Class.new(AbstractMetaControlSequence) # e.g. \cB

data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb CHANGED Viewed

@@ -15,6 +15,12 @@ module Regexp::Expression::EscapeSequence
   Hex.class_eval         { def codepoint; text[/\h+/].hex end }
   Codepoint.class_eval   { def codepoint; text[/\h+/].hex end }
+  UTF8Hex.class_eval do
+    def codepoint
+      text.scan(/\h+/).map(&:hex).pack('C*').force_encoding('utf-8').ord
+    end
+  end
   CodepointList.class_eval do
     # Maybe this should be a unique top-level expression class?
     def char

data/lib/regexp_parser/expression/shared.rb CHANGED Viewed

@@ -70,11 +70,12 @@ module Regexp::Expression
     # lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
     #
     def to_s(format = :full)
-      base = parts.each_with_object(''.dup) do |part, buff|
+      base = ''.dup
+      parts.each do |part|
         if part.instance_of?(String)
-          buff << part
+          base << part
         elsif !part.custom_to_s_handling
-          buff << part.to_s(:original)
+          base << part.to_s(:original)
         end
       end
       "#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"

data/lib/regexp_parser/parser.rb CHANGED Viewed

@@ -319,6 +319,7 @@ class Regexp::Parser
     when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
     when :hex;            node << EscapeSequence::Hex.new(token, active_opts)
     when :octal;          node << EscapeSequence::Octal.new(token, active_opts)
+    when :utf8_hex;       node << EscapeSequence::UTF8Hex.new(token, active_opts)
     when :control
       if token.text =~ /\A(?:\\C-\\M|\\c\\M)/

data/lib/regexp_parser/scanner/properties/long.csv CHANGED Viewed

@@ -9,6 +9,7 @@ age=13.0,age=13.0
 age=14.0,age=14.0
 age=15.0,age=15.0
 age=15.1,age=15.1
+age=16.0,age=16.0
 age=2.0,age=2.0
 age=2.1,age=2.1
 age=3.0,age=3.0
@@ -103,6 +104,7 @@ extendedpictographic,extended_pictographic
 extender,extender
 finalpunctuation,final_punctuation
 format,format
+garay,garay
 georgian,georgian
 glagolitic,glagolitic
 gothic,gothic
@@ -128,6 +130,7 @@ greek,greek
 gujarati,gujarati
 gunjalagondi,gunjala_gondi
 gurmukhi,gurmukhi
+gurungkhema,gurung_khema
 han,han
 hangul,hangul
 hanifirohingya,hanifi_rohingya
@@ -241,6 +244,7 @@ induployan,in_duployan
 inearlydynasticcuneiform,in_early_dynastic_cuneiform
 inegyptianhieroglyphformatcontrols,in_egyptian_hieroglyph_format_controls
 inegyptianhieroglyphs,in_egyptian_hieroglyphs
+inegyptianhieroglyphsextendeda,in_egyptian_hieroglyphs_extended_a
 inelbasan,in_elbasan
 inelymaic,in_elymaic
 inemoticons,in_emoticons
@@ -253,6 +257,7 @@ inethiopicextended,in_ethiopic_extended
 inethiopicextendeda,in_ethiopic_extended_a
 inethiopicextendedb,in_ethiopic_extended_b
 inethiopicsupplement,in_ethiopic_supplement
+ingaray,in_garay
 ingeneralpunctuation,in_general_punctuation
 ingeometricshapes,in_geometric_shapes
 ingeometricshapesextended,in_geometric_shapes_extended
@@ -268,6 +273,7 @@ ingreekextended,in_greek_extended
 ingujarati,in_gujarati
 ingunjalagondi,in_gunjala_gondi
 ingurmukhi,in_gurmukhi
+ingurungkhema,in_gurung_khema
 inhalfwidthandfullwidthforms,in_halfwidth_and_fullwidth_forms
 inhangulcompatibilityjamo,in_hangul_compatibility_jamo
 inhanguljamo,in_hangul_jamo
@@ -309,6 +315,7 @@ inkhmer,in_khmer
 inkhmersymbols,in_khmer_symbols
 inkhojki,in_khojki
 inkhudawadi,in_khudawadi
+inkiratrai,in_kirat_rai
 inlao,in_lao
 inlatin1supplement,in_latin_1_supplement
 inlatinextendeda,in_latin_extended_a
@@ -364,6 +371,7 @@ inmusicalsymbols,in_musical_symbols
 inmyanmar,in_myanmar
 inmyanmarextendeda,in_myanmar_extended_a
 inmyanmarextendedb,in_myanmar_extended_b
+inmyanmarextendedc,in_myanmar_extended_c
 innabataean,in_nabataean
 innagmundari,in_nag_mundari
 innandinagari,in_nandinagari
@@ -385,6 +393,7 @@ inoldsogdian,in_old_sogdian
 inoldsoutharabian,in_old_south_arabian
 inoldturkic,in_old_turkic
 inolduyghur,in_old_uyghur
+inolonal,in_ol_onal
 inopticalcharacterrecognition,in_optical_character_recognition
 inoriya,in_oriya
 inornamentaldingbats,in_ornamental_dingbats
@@ -424,6 +433,7 @@ inspacingmodifierletters,in_spacing_modifier_letters
 inspecials,in_specials
 insundanese,in_sundanese
 insundanesesupplement,in_sundanese_supplement
+insunuwar,in_sunuwar
 insuperscriptsandsubscripts,in_superscripts_and_subscripts
 insupplementalarrowsa,in_supplemental_arrows_a
 insupplementalarrowsb,in_supplemental_arrows_b
@@ -437,6 +447,7 @@ insuttonsignwriting,in_sutton_signwriting
 insylotinagri,in_syloti_nagri
 insymbolsandpictographsextendeda,in_symbols_and_pictographs_extended_a
 insymbolsforlegacycomputing,in_symbols_for_legacy_computing
+insymbolsforlegacycomputingsupplement,in_symbols_for_legacy_computing_supplement
 insyriac,in_syriac
 insyriacsupplement,in_syriac_supplement
 intagalog,in_tagalog
@@ -459,8 +470,10 @@ inthai,in_thai
 intibetan,in_tibetan
 intifinagh,in_tifinagh
 intirhuta,in_tirhuta
+intodhri,in_todhri
 intoto,in_toto
 intransportandmapsymbols,in_transport_and_map_symbols
+intulutigalari,in_tulu_tigalari
 inugaritic,in_ugaritic
 inunifiedcanadianaboriginalsyllabics,in_unified_canadian_aboriginal_syllabics
 inunifiedcanadianaboriginalsyllabicsextended,in_unified_canadian_aboriginal_syllabics_extended
@@ -491,6 +504,7 @@ khitansmallscript,khitan_small_script
 khmer,khmer
 khojki,khojki
 khudawadi,khudawadi
+kiratrai,kirat_rai
 lao,lao
 latin,latin
 lepcha,lepcha
@@ -524,6 +538,7 @@ meroiticcursive,meroitic_cursive
 meroitichieroglyphs,meroitic_hieroglyphs
 miao,miao
 modi,modi
+modifiercombiningmark,modifier_combining_mark
 modifierletter,modifier_letter
 modifiersymbol,modifier_symbol
 mongolian,mongolian
@@ -553,6 +568,7 @@ oldsogdian,old_sogdian
 oldsoutharabian,old_south_arabian
 oldturkic,old_turkic
 olduyghur,old_uyghur
+olonal,ol_onal
 openpunctuation,open_punctuation
 oriya,oriya
 osage,osage
@@ -606,6 +622,7 @@ space,space
 spaceseparator,space_separator
 spacingmark,spacing_mark
 sundanese,sundanese
+sunuwar,sunuwar
 surrogate,surrogate
 sylotinagri,syloti_nagri
 symbol,symbol
@@ -627,7 +644,9 @@ tibetan,tibetan
 tifinagh,tifinagh
 tirhuta,tirhuta
 titlecaseletter,titlecase_letter
+todhri,todhri
 toto,toto
+tulutigalari,tulu_tigalari
 ugaritic,ugaritic
 unassigned,unassigned
 unifiedideograph,unified_ideograph

data/lib/regexp_parser/scanner/properties/short.csv CHANGED Viewed

@@ -58,6 +58,7 @@ epres,emoji_presentation
 ethi,ethiopic
 ext,extender
 extpict,extended_pictographic
+gara,garay
 geor,georgian
 glag,glagolitic
 gong,gunjala_gondi
@@ -69,6 +70,7 @@ grek,greek
 grext,grapheme_extend
 grlink,grapheme_link
 gujr,gujarati
+gukh,gurung_khema
 guru,gurmukhi
 hang,hangul
 hani,han
@@ -97,6 +99,7 @@ khmr,khmer
 khoj,khojki
 kits,khitan_small_script
 knda,kannada
+krai,kirat_rai
 kthi,kaithi
 l,letter
 lana,tai_tham
@@ -122,6 +125,7 @@ mand,mandaic
 mani,manichaean
 marc,marchen
 mc,spacing_mark
+mcm,modifier_combining_mark
 me,enclosing_mark
 medf,medefaidrin
 mend,mende_kikakui
@@ -154,6 +158,7 @@ oids,other_id_start
 olck,ol_chiki
 olower,other_lowercase
 omath,other_math
+onao,ol_onal
 orkh,old_turkic
 orya,oriya
 osge,osage
@@ -208,6 +213,7 @@ sora,sora_sompeng
 soyo,soyombo
 sterm,sentence_terminal
 sund,sundanese
+sunu,sunuwar
 sylo,syloti_nagri
 syrc,syriac
 tagb,tagbanwa
@@ -225,6 +231,8 @@ thaa,thaana
 tibt,tibetan
 tirh,tirhuta
 tnsa,tangsa
+todr,todhri
+tutg,tulu_tigalari
 ugar,ugaritic
 uideo,unified_ideograph
 vaii,vai

data/lib/regexp_parser/scanner/scanner.rl CHANGED Viewed

@@ -37,7 +37,8 @@
   octal_sequence        = [0-7]{1,3};
   hex_sequence          = 'x' . xdigit{1,2};
-  hex_sequence_err      = 'x' . [^0-9a-fA-F{];
+  hex_sequence_err      = 'x' . [^0-9A-Fa-f];
+  high_hex_sequence     = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
   codepoint_single      = 'u' . xdigit{4};
   codepoint_list        = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
@@ -210,7 +211,7 @@
         type = :nonposixclass
       end
-      unless self.class.posix_classes.include?(class_name)
+      unless POSIX_CLASSES[class_name]
         raise ValidationError.for(:posix_class, text)
       end
@@ -256,9 +257,21 @@
   # escape sequence scanner
   # --------------------------------------------------------------------------
   escape_sequence := |*
-    [1-9] {
+    [1-9] . [0-9]* {
       text = copy(data, ts-1, te)
-      emit(:backref, :number, text)
+      # If not enough groups have been opened, there is a fallback to either an
+      # octal or literal interpretation for 2+ digit numerical escapes.
+      digits = text[1..-1]
+      if digits.size == 1 || digits.to_i <= capturing_group_count
+        emit(:backref, :number, text)
+      elsif digits =~ /\A[0-7]{2,}\z/
+        emit(:escape, :octal, text)
+      else
+        emit(:escape, :literal, text[0..1])
+        emit(:literal, :literal, text[2..-1])
+      end
       fret;
     };
@@ -321,6 +334,16 @@
       fret;
     };
+    high_hex_sequence > (escaped_alpha, 5) {
+      text = copy(data, ts-1, te)
+      if regexp_encoding == Encoding::BINARY
+        text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
+      else
+        emit(:escape, :utf8_hex, text)
+      end
+      fret;
+    };
     hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
       emit(:escape, :hex, copy(data, ts-1, te))
       fret;
@@ -514,6 +537,7 @@
     };
     group_open @group_opened {
+      self.capturing_group_count = capturing_group_count + 1
       text = copy(data, ts, te)
       emit(:group, :capture, text)
     };
@@ -662,6 +686,7 @@ class Regexp::Scanner
     input = input_object.is_a?(Regexp) ? input_object.source : input_object
     self.free_spacing = free_spacing?(input_object, options)
+    self.regexp_encoding = input_object.encoding if input_object.is_a?(::Regexp)
     self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
     data  = input.unpack("c*")
@@ -672,6 +697,7 @@ class Regexp::Scanner
     self.set_depth = 0
     self.group_depth = 0
+    self.capturing_group_count = 0
     self.conditional_stack = []
     self.char_pos = 0
@@ -711,10 +737,11 @@ class Regexp::Scanner
     File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
   end
-  def self.posix_classes
+  # Use each_with_object for required_ruby_version >= 2.2, or #to_h for >= 2.6
+  POSIX_CLASSES =
     %w[alnum alpha ascii blank cntrl digit graph
        lower print punct space upper word xdigit]
-  end
+      .inject({}) { |o, e| o.merge(e => true) }.freeze
   # Emits an array with the details of the scanned pattern
   def emit(type, token, text)
@@ -742,13 +769,14 @@ class Regexp::Scanner
     end
   end
-  attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
+  attr_accessor :capturing_group_count, :literal_run # only public for #||= to work on ruby <= 2.5
   private
   attr_accessor :block,
                 :collect_tokens, :tokens, :prev_token,
                 :free_spacing, :spacing_stack,
+                :regexp_encoding,
                 :group_depth, :set_depth, :conditional_stack,
                 :char_pos