RubyGems - regexp_parser - Versions diffs - 2.0.2 → 2.2.0 - Mend

regexp_parser 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79)

checksums.yaml +4 -4
data/CHANGELOG.md +54 -0
data/Gemfile +5 -1
data/README.md +15 -21
data/Rakefile +11 -17
data/lib/regexp_parser/error.rb +4 -0
data/lib/regexp_parser/expression/base.rb +123 -0
data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +5 -0
data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -1
data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +0 -0
data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +1 -0
data/lib/regexp_parser/expression/classes/free_space.rb +1 -3
data/lib/regexp_parser/expression/classes/group.rb +6 -1
data/lib/regexp_parser/expression/classes/literal.rb +1 -5
data/lib/regexp_parser/expression/classes/property.rb +1 -3
data/lib/regexp_parser/expression/classes/root.rb +0 -1
data/lib/regexp_parser/expression/classes/type.rb +0 -2
data/lib/regexp_parser/expression/quantifier.rb +2 -2
data/lib/regexp_parser/expression/sequence.rb +3 -10
data/lib/regexp_parser/expression/subexpression.rb +1 -2
data/lib/regexp_parser/expression.rb +7 -130
data/lib/regexp_parser/lexer.rb +7 -5
data/lib/regexp_parser/parser.rb +282 -334
data/lib/regexp_parser/scanner/properties/long.yml +13 -0
data/lib/regexp_parser/scanner/properties/short.yml +9 -1
data/lib/regexp_parser/scanner/scanner.rl +64 -87
data/lib/regexp_parser/scanner.rb +1024 -1073
data/lib/regexp_parser/syntax/any.rb +2 -4
data/lib/regexp_parser/syntax/base.rb +10 -10
data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
data/lib/regexp_parser/syntax/{tokens/backref.rb → token/backreference.rb} +6 -5
data/lib/regexp_parser/syntax/{tokens → token}/character_set.rb +2 -2
data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
data/lib/regexp_parser/syntax/token/escape.rb +31 -0
data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
data/lib/regexp_parser/syntax/{tokens → token}/meta.rb +2 -2
data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
data/lib/regexp_parser/syntax/token/unicode_property.rb +696 -0
data/lib/regexp_parser/syntax/token.rb +45 -0
data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
data/lib/regexp_parser/syntax/versions/1.8.6.rb +1 -1
data/lib/regexp_parser/syntax/versions/3.1.0.rb +10 -0
data/lib/regexp_parser/syntax.rb +8 -6
data/lib/regexp_parser/token.rb +9 -20
data/lib/regexp_parser/version.rb +1 -1
data/lib/regexp_parser.rb +0 -2
data/spec/expression/clone_spec.rb +36 -4
data/spec/expression/free_space_spec.rb +2 -2
data/spec/expression/methods/match_length_spec.rb +2 -2
data/spec/lexer/nesting_spec.rb +2 -2
data/spec/lexer/refcalls_spec.rb +5 -0
data/spec/parser/all_spec.rb +2 -2
data/spec/parser/escapes_spec.rb +43 -31
data/spec/parser/properties_spec.rb +6 -4
data/spec/parser/refcalls_spec.rb +5 -0
data/spec/parser/set/ranges_spec.rb +26 -16
data/spec/scanner/escapes_spec.rb +29 -20
data/spec/scanner/refcalls_spec.rb +19 -0
data/spec/scanner/sets_spec.rb +66 -23
data/spec/spec_helper.rb +13 -1
data/spec/support/capturing_stderr.rb +9 -0
data/spec/syntax/versions/1.8.6_spec.rb +2 -2
data/spec/syntax/versions/2.0.0_spec.rb +2 -2
data/spec/syntax/versions/aliases_spec.rb +1 -0
metadata +27 -26
data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
data/lib/regexp_parser/syntax/tokens.rb +0 -45
data/spec/support/runner.rb +0 -42
data/spec/support/warning_extractor.rb +0 -60

data/lib/regexp_parser/scanner/properties/long.yml CHANGED

@@ -8,6 +8,7 @@ age=10.0: age=10.0
 age=11.0: age=11.0
 age=12.0: age=12.0
 age=12.1: age=12.1
+age=13.0: age=13.0
 age=2.0: age=2.0
 age=2.1: age=2.1
 age=3.0: age=3.0
@@ -64,6 +65,7 @@ changeswhenlowercased: changes_when_lowercased
 changeswhentitlecased: changes_when_titlecased
 changeswhenuppercased: changes_when_uppercased
 cherokee: cherokee
+chorasmian: chorasmian
 closepunctuation: close_punctuation
 cntrl: cntrl
 common: common
@@ -83,6 +85,7 @@ deseret: deseret
 devanagari: devanagari
 diacritic: diacritic
 digit: digit
+divesakuru: dives_akuru
 dogra: dogra
 duployan: duployan
 egyptianhieroglyphs: egyptian_hieroglyphs
@@ -167,6 +170,7 @@ incham: in_cham
 incherokee: in_cherokee
 incherokeesupplement: in_cherokee_supplement
 inchesssymbols: in_chess_symbols
+inchorasmian: in_chorasmian
 incjkcompatibility: in_cjk_compatibility
 incjkcompatibilityforms: in_cjk_compatibility_forms
 incjkcompatibilityideographs: in_cjk_compatibility_ideographs
@@ -181,6 +185,7 @@ incjkunifiedideographsextensionc: in_cjk_unified_ideographs_extension_c
 incjkunifiedideographsextensiond: in_cjk_unified_ideographs_extension_d
 incjkunifiedideographsextensione: in_cjk_unified_ideographs_extension_e
 incjkunifiedideographsextensionf: in_cjk_unified_ideographs_extension_f
+incjkunifiedideographsextensiong: in_cjk_unified_ideographs_extension_g
 incombiningdiacriticalmarks: in_combining_diacritical_marks
 incombiningdiacriticalmarksextended: in_combining_diacritical_marks_extended
 incombiningdiacriticalmarksforsymbols: in_combining_diacritical_marks_for_symbols
@@ -204,6 +209,7 @@ indeseret: in_deseret
 indevanagari: in_devanagari
 indevanagariextended: in_devanagari_extended
 indingbats: in_dingbats
+indivesakuru: in_dives_akuru
 indogra: in_dogra
 indominotiles: in_domino_tiles
 induployan: in_duployan
@@ -269,6 +275,7 @@ inkatakana: in_katakana
 inkatakanaphoneticextensions: in_katakana_phonetic_extensions
 inkayahli: in_kayah_li
 inkharoshthi: in_kharoshthi
+inkhitansmallscript: in_khitan_small_script
 inkhmer: in_khmer
 inkhmersymbols: in_khmer_symbols
 inkhojki: in_khojki
@@ -288,6 +295,7 @@ inlineara: in_linear_a
 inlinearbideograms: in_linear_b_ideograms
 inlinearbsyllabary: in_linear_b_syllabary
 inlisu: in_lisu
+inlisusupplement: in_lisu_supplement
 inlowsurrogates: in_low_surrogates
 inlycian: in_lycian
 inlydian: in_lydian
@@ -395,6 +403,7 @@ insupplementaryprivateuseareab: in_supplementary_private_use_area_b
 insuttonsignwriting: in_sutton_signwriting
 insylotinagri: in_syloti_nagri
 insymbolsandpictographsextendeda: in_symbols_and_pictographs_extended_a
+insymbolsforlegacycomputing: in_symbols_for_legacy_computing
 insyriac: in_syriac
 insyriacsupplement: in_syriac_supplement
 intagalog: in_tagalog
@@ -409,6 +418,7 @@ intamil: in_tamil
 intamilsupplement: in_tamil_supplement
 intangut: in_tangut
 intangutcomponents: in_tangut_components
+intangutsupplement: in_tangut_supplement
 intelugu: in_telugu
 inthaana: in_thaana
 inthai: in_thai
@@ -426,6 +436,7 @@ invedicextensions: in_vedic_extensions
 inverticalforms: in_vertical_forms
 inwancho: in_wancho
 inwarangciti: in_warang_citi
+inyezidi: in_yezidi
 inyijinghexagramsymbols: in_yijing_hexagram_symbols
 inyiradicals: in_yi_radicals
 inyisyllables: in_yi_syllables
@@ -437,6 +448,7 @@ kannada: kannada
 katakana: katakana
 kayahli: kayah_li
 kharoshthi: kharoshthi
+khitansmallscript: khitan_small_script
 khmer: khmer
 khojki: khojki
 khudawadi: khudawadi
@@ -590,5 +602,6 @@ xdigit: xdigit
 xidcontinue: xid_continue
 xidstart: xid_start
 xposixpunct: xposixpunct
+yezidi: yezidi
 yi: yi
 zanabazarsquare: zanabazar_square

data/lib/regexp_parser/scanner/properties/short.yml CHANGED

@@ -28,6 +28,7 @@ cari: carian
 cc: control
 cf: format
 cher: cherokee
+chrs: chorasmian
 ci: case_ignorable
 cn: unassigned
 co: private_use
@@ -45,12 +46,17 @@ dep: deprecated
 deva: devanagari
 di: default_ignorable_code_point
 dia: diacritic
+diak: dives_akuru
 dogr: dogra
 dsrt: deseret
 dupl: duployan
+ebase: emoji_modifier_base
+ecomp: emoji_component
 egyp: egyptian_hieroglyphs
 elba: elbasan
 elym: elymaic
+emod: emoji_modifier
+epres: emoji_presentation
 ethi: ethiopic
 ext: extender
 geor: georgian
@@ -89,6 +95,7 @@ kana: katakana
 khar: kharoshthi
 khmr: khmer
 khoj: khojki
+kits: khitan_small_script
 knda: kannada
 kthi: kaithi
 l: letter
@@ -127,7 +134,7 @@ mroo: mro
 mtei: meetei_mayek
 mult: multani
 mymr: myanmar
-n: number
+"n": number
 nand: nandinagari
 narb: old_north_arabian
 nbat: nabataean
@@ -226,6 +233,7 @@ xidc: xid_continue
 xids: xid_start
 xpeo: old_persian
 xsux: cuneiform
+yezi: yezidi
 yiii: yi
 z: separator
 zanb: zanabazar_square

data/lib/regexp_parser/scanner/scanner.rl CHANGED

@@ -20,7 +20,7 @@
   group_open            = '(';
   group_close           = ')';
-  parantheses           = group_open | group_close;
+  parentheses           = group_open | group_close;
   set_open              = '[';
   set_close             = ']';
@@ -37,7 +37,7 @@
   class_posix           = ('[:' . '^'? . class_name_posix . ':]');
-  # these are not supported in ruby, and need verification
+  # these are not supported in ruby at the moment
   collating_sequence    = '[.' . (alpha | [\-])+ . '.]';
   character_equivalent  = '[=' . alpha . '=]';
@@ -58,6 +58,8 @@
   meta_sequence         = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
+  sequence_char         = [CMcux];
   zero_or_one           = '?' | '??' | '?+';
   zero_or_more          = '*' | '*?' | '*+';
   one_or_more           = '+' | '+?' | '++';
@@ -106,11 +108,15 @@
   group_named           = ('?' . group_name );
-  group_name_ref        = group_ref . (('<' . group_name_id_ab? . group_level? '>') |
-                                       ("'" . group_name_id_sq? . group_level? "'"));
+  group_name_backref    = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
+                                 ("'" . group_name_id_sq? . group_level? "'"));
+  group_name_call       = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
+                                 ("'" . group_name_id_sq? . group_level? "'"));
-  group_number_ref      = group_ref . (('<' . group_number . group_level? '>') |
-                                       ("'" . group_number . group_level? "'"));
+  group_number_backref  = 'k' . (('<' . group_number . group_level? '>') |
+                                 ("'" . group_number . group_level? "'"));
+  group_number_call     = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
+                                 ("'" . ((group_number . group_level?) | '0') "'"));
   group_type            = group_atomic | group_passive | group_absence | group_named;
@@ -121,7 +127,7 @@
   # characters that 'break' a literal
   meta_char             = dot | backslash | alternation |
-                          curlies | parantheses | brackets |
+                          curlies | parentheses | brackets |
                           line_anchor | quantifier_greedy;
   literal_delimiters    = ']' | '}';
@@ -130,10 +136,12 @@
   ascii_nonprint        = (0x01..0x1f | 0x7f);
   non_literal_escape    = char_type_char | anchor_char | escaped_ascii |
-                          keep_mark | [xucCM];
+                          keep_mark | sequence_char;
+  # escapes that also work within a character set
+  set_escape            = backslash | brackets | escaped_ascii | property_char |
+                          sequence_char | single_codepoint_char_type;
-  non_set_escape        = (anchor_char - 'b') | group_ref | keep_mark |
-                          multi_codepoint_char_type | [0-9cCM];
   # EOF error, used where it can be detected
   action premature_end_error {
@@ -228,13 +236,13 @@
       emit(type, class_name.to_sym, text)
     };
-    collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
-      emit(:set, :collation, copy(data, ts, te))
-    };
-    character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
-      emit(:set, :equivalent, copy(data, ts, te))
-    };
+    # These are not supported in ruby at the moment. Enable them if they are.
+    # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
+    #   emit(:set, :collation, copy(data, ts, te))
+    # };
+    # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error)  {
+    #   emit(:set, :equivalent, copy(data, ts, te))
+    # };
     meta_char > (set_meta, 1) {
       emit(:literal, :literal, copy(data, ts, te))
@@ -249,16 +257,16 @@
   # set escapes scanner
   # --------------------------------------------------------------------------
   set_escape_sequence := |*
-    non_set_escape > (escaped_set_alpha, 2) {
-      emit(:escape, :literal, copy(data, ts-1, te))
-      fret;
-    };
-    any > (escaped_set_alpha, 1) {
+    set_escape > (escaped_set_alpha, 2) {
       fhold;
       fnext character_set;
       fcall escape_sequence;
     };
+    any > (escaped_set_alpha, 1) {
+      emit(:escape, :literal, copy(data, ts-1, te))
+      fret;
+    };
   *|;
@@ -538,67 +546,35 @@
     # Group backreference, named and numbered
     # ------------------------------------------------------------------------
-    backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
+    backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
       case text = copy(data, ts, te)
-      when /^\\([gk])(<>|'')/ # angle brackets
-        validation_error(:backref, 'ref/call', 'ref ID is empty')
-      # TODO: finer quirks of choosing recursive or non-recursive refs/calls.
-      # e.g.: `a-1` is a valid group id: 'aa'[/(?<a-1>a)\g<a-1>/] # => 'aa'
-      when /^\\([gk])<[^\p{digit}+\->][^>+\-]*>/ # angle-brackets
-        if $1 == 'k'
-          emit(:backref, :name_ref_ab, text)
-        else
-          emit(:backref, :name_call_ab, text)
-        end
-      when /^\\([gk])'[^\p{digit}+\-'][^'+\-]*'/ # single quotes
-        if $1 == 'k'
-          emit(:backref, :name_ref_sq, text)
-        else
-          emit(:backref, :name_call_sq, text)
-        end
-      when /^\\([gk])<\d+>/ # angle-brackets
-        if $1 == 'k'
-          emit(:backref, :number_ref_ab, text)
-        else
-          emit(:backref, :number_call_ab, text)
-        end
-      when /^\\([gk])'\d+'/ # single quotes
-        if $1 == 'k'
-          emit(:backref, :number_ref_sq, text)
-        else
-          emit(:backref, :number_call_sq, text)
-        end
-      when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
-        if $1 == 'k'
-          emit(:backref, :number_rel_ref_ab, text)
-        else
-          emit(:backref, :number_rel_call_ab, text)
-        end
-      when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
-        if $1 == 'k'
-          emit(:backref, :number_rel_ref_sq, text)
-        else
-          emit(:backref, :number_rel_call_sq, text)
-        end
-      when /^\\k<[^\p{digit}+\->][^>]*[+\-]\d+>/ # angle-brackets
-        emit(:backref, :name_recursion_ref_ab, text)
-      when /^\\k'[^\p{digit}+\-'][^']*[+\-]\d+'/ # single-quotes
-        emit(:backref, :name_recursion_ref_sq, text)
-      when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
-        emit(:backref, :number_recursion_ref_ab, text)
-      when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
-        emit(:backref, :number_recursion_ref_sq, text)
+      when /^\\k(<>|'')/
+        validation_error(:backref, 'backreference', 'ref ID is empty')
+      when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
+        emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
+      when /^\\k(.)\d+\D$/
+        emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
+      when /^\\k(.)-\d+\D$/
+        emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
+      when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
+        emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
+      when /^\\k(.)-?\d+[+\-]\d+\D$/
+        emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
+      end
+    };
+    # Group call, named and numbered
+    # ------------------------------------------------------------------------
+    backslash . (group_name_call | group_number_call) > (backslashed, 4) {
+      case text = copy(data, ts, te)
+      when /^\\g(<>|'')/
+        validation_error(:backref, 'subexpression call', 'ref ID is empty')
+      when /^\\g(.)[^\p{digit}+\->][^+\-]*/
+        emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
+      when /^\\g(.)\d+\D$/
+        emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
+      when /^\\g(.)[+-]\d+/
+        emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
       end
     };
@@ -675,12 +651,14 @@
 # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
 # This file was generated from lib/regexp_parser/scanner/scanner.rl
+require 'regexp_parser/error'
 class Regexp::Scanner
   # General scanner error (catch all)
-  class ScannerError < StandardError; end
+  class ScannerError < Regexp::Parser::Error; end
   # Base for all scanner validation errors
-  class ValidationError < StandardError
+  class ValidationError < Regexp::Parser::Error
     def initialize(reason)
       super reason
     end
@@ -782,14 +760,13 @@ class Regexp::Scanner
   # lazy-load property maps when first needed
   require 'yaml'
-  PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
   def self.short_prop_map
-    @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
+    @short_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/short.yml")
   end
   def self.long_prop_map
-    @long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
+    @long_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/long.yml")
   end
   # Emits an array with the details of the scanned pattern