RubyGems - regexp-examples - Versions diffs - 1.1.0 → 1.1.2 - Mend

regexp-examples 1.1.0 → 1.1.2

Files changed (23) hide show

checksums.yaml +4 -4
data/README.md +9 -9
data/Rakefile +3 -3
data/db/unicode_ranges_2.0.pstore +0 -0
data/db/unicode_ranges_2.1.pstore +0 -0
data/db/unicode_ranges_2.2.pstore +0 -0
data/lib/{regexp-examples/core_extensions → core_extensions}/regexp/examples.rb +3 -3
data/lib/regexp-examples.rb +11 -2
data/lib/regexp-examples/backreferences.rb +3 -4
data/lib/regexp-examples/chargroup_parser.rb +14 -14
data/lib/regexp-examples/constants.rb +5 -156
data/lib/regexp-examples/groups.rb +20 -12
data/lib/regexp-examples/helpers.rb +5 -5
data/lib/regexp-examples/parser.rb +52 -42
data/lib/regexp-examples/repeaters.rb +5 -5
data/lib/regexp-examples/unicode_char_ranges.rb +45 -0
data/lib/regexp-examples/version.rb +1 -1
data/regexp-examples.gemspec +4 -4
data/scripts/unicode_lister.rb +34 -150
data/spec/regexp-examples_spec.rb +81 -59
data/spec/regexp-random_example_spec.rb +2 -2
data/spec/spec_helper.rb +1 -1
metadata +8 -4

data/lib/regexp-examples/helpers.rb CHANGED Viewed

@@ -8,7 +8,7 @@ module RegexpExamples
   def self.permutations_of_strings(arrays_of_strings)
     first = arrays_of_strings.shift
     return first if arrays_of_strings.empty?
-    first.product( permutations_of_strings(arrays_of_strings) ).map do |result|
+    first.product(permutations_of_strings(arrays_of_strings)).map do |result|
       join_preserving_capture_groups(result)
     end
   end
@@ -16,8 +16,8 @@ module RegexpExamples
   def self.join_preserving_capture_groups(result)
     result.flatten!
     subgroups = result
-      .map(&:all_subgroups)
-      .flatten
+                .map(&:all_subgroups)
+                .flatten
     # Only save the LAST group from repeated capture groups, e.g. /([ab]){2}/
     subgroups.delete_if do |subgroup|
@@ -35,12 +35,12 @@ module RegexpExamples
   end
   private
   def self.generic_map_result(repeaters, method)
     repeaters
-      .map {|repeater| repeater.public_send(method)}
+      .map { |repeater| repeater.public_send(method) }
       .instance_eval do |partial_results|
         RegexpExamples.permutations_of_strings(partial_results)
       end
   end
 end

data/lib/regexp-examples/parser.rb CHANGED Viewed

@@ -15,9 +15,7 @@ module RegexpExamples
       repeaters = []
       until end_of_regexp
         group = parse_group(repeaters)
-        if group.is_a? OrGroup
-          return [OneTimeRepeater.new(group)]
-        end
+        return [OneTimeRepeater.new(group)] if group.is_a? OrGroup
         @current_position += 1
         repeaters << parse_repeater(group)
       end
@@ -101,12 +99,16 @@ module RegexpExamples
       @current_position += 1
       case
       when rest_of_string =~ /\A(\d{1,3})/
-        @current_position += ($1.length - 1) # In case of 10+ backrefs!
-        group = parse_backreference_group($1)
+        @current_position += (Regexp.last_match(1).length - 1) # In case of 10+ backrefs!
+        group = parse_backreference_group(Regexp.last_match(1))
       when rest_of_string =~ /\Ak['<]([\w-]+)['>]/ # Named capture group
-        @current_position += ($1.length + 2)
-        # Check for RELATIVE group number, e.g. /(a)(b)(c)(d) \k<-2>/
-        group_id = ($1.to_i < 0) ? (@num_groups + $1.to_i + 1) : $1
+        @current_position += (Regexp.last_match(1).length + 2)
+        group_id = if Regexp.last_match(1).to_i < 0
+                     # RELATIVE group number, e.g. /(a)(b)(c)(d) \k<-2>/
+                     @num_groups + Regexp.last_match(1).to_i + 1
+                   else
+                     Regexp.last_match(1)
+                   end
         group = parse_backreference_group(group_id)
       when BackslashCharMap.keys.include?(next_char)
         group = CharGroup.new(
@@ -114,32 +116,39 @@ module RegexpExamples
           @ignorecase
         )
       when rest_of_string =~ /\A(c|C-)(.)/ # Control character
-        @current_position += $1.length
-        group = parse_single_char_group( parse_control_character($2) )
+        @current_position += Regexp.last_match(1).length
+        group = parse_single_char_group(parse_control_character(Regexp.last_match(2)))
       when rest_of_string =~ /\Ax(\h{1,2})/ # Escape sequence
-        @current_position += $1.length
-        group = parse_single_char_group( parse_escape_sequence($1) )
+        @current_position += Regexp.last_match(1).length
+        group = parse_single_char_group(parse_escape_sequence(Regexp.last_match(1)))
       when rest_of_string =~ /\Au(\h{4}|\{\h{1,4}\})/ # Unicode sequence
-        @current_position += $1.length
-        sequence = $1.match(/\h{1,4}/)[0] # Strip off "{" and "}"
-        group = parse_single_char_group( parse_unicode_sequence(sequence) )
+        @current_position += Regexp.last_match(1).length
+        sequence = Regexp.last_match(1).match(/\h{1,4}/)[0] # Strip off "{" and "}"
+        group = parse_single_char_group(parse_unicode_sequence(sequence))
       when rest_of_string =~ /\A(p)\{(\^?)([^}]+)\}/i # Named properties
-        @current_position += ($2.length + $3.length + 2)
-        is_negative = ($1 == "P") ^ ($2 == "^") # Beware of double negatives! E.g. /\P{^Space}/
+        @current_position += (Regexp.last_match(2).length + # 0 or 1, if '^' is present
+                              Regexp.last_match(3).length + # Length of the property name
+                              2) # Length of opening and closing brackets (always 2)
+        # Beware of double negatives! E.g. /\P{^Space}/
+        is_negative = (Regexp.last_match(1) == 'P') ^ (Regexp.last_match(2) == '^')
         group = CharGroup.new(
           if is_negative
-            CharSets::Any.dup - NamedPropertyCharMap[$3.downcase]
+            CharSets::Any.dup - NamedPropertyCharMap[Regexp.last_match(3).downcase]
           else
-            NamedPropertyCharMap[$3.downcase]
+            NamedPropertyCharMap[Regexp.last_match(3).downcase]
           end,
           @ignorecase
         )
       when next_char == 'K' # Keep (special lookbehind that CAN be supported safely!)
         group = PlaceHolderGroup.new
       when next_char == 'R' # Linebreak
-        group = CharGroup.new(["\r\n", "\n", "\v", "\f", "\r"], @ignorecase) # A bit hacky...
+        group = CharGroup.new(
+          ["\r\n", "\n", "\v", "\f", "\r"],
+          @ignorecase
+        ) # Using "\r\n" as one character is a little bit hacky...
       when next_char == 'g' # Subexpression call
-        raise IllegalSyntaxError, "Subexpression calls (\\g) cannot be supported, as they are not regular"
+        fail IllegalSyntaxError,
+          'Subexpression calls (\\g) cannot be supported, as they are not regular'
       when next_char =~ /[bB]/ # Anchors
         raise_anchors_exception!
       when next_char =~ /[AG]/ # Start of string
@@ -155,7 +164,7 @@ module RegexpExamples
           raise_anchors_exception!
         end
       else
-        group = parse_single_char_group( next_char )
+        group = parse_single_char_group(next_char)
       end
       group
     end
@@ -193,7 +202,7 @@ module RegexpExamples
             comment_group = rest_of_string.match(/.*?[^\\](?:\\{2})*\)/)[0]
             @current_position += comment_group.length
           when match[2] =~ /\A(?=[mix-]+)([mix]*)-?([mix]*)/ # e.g. /(?i-mx)/
-            regexp_options_toggle($1, $2)
+            regexp_options_toggle(Regexp.last_match(1), Regexp.last_match(2))
             @num_groups -= 1 # Toggle "groups" should not increase backref group count
             @current_position += $&.length + 1
             if next_char == ':' # e.g. /(?i:subexpr)/
@@ -202,9 +211,11 @@ module RegexpExamples
               return PlaceHolderGroup.new
             end
           when %w(! =).include?(match[2]) # e.g. /(?=lookahead)/, /(?!neglookahead)/
-            raise IllegalSyntaxError, "Lookaheads are not regular; cannot generate examples"
+            fail IllegalSyntaxError,
+              'Lookaheads are not regular; cannot generate examples'
           when %w(! =).include?(match[3]) # e.g. /(?<=lookbehind)/, /(?<!neglookbehind)/
-            raise IllegalSyntaxError, "Lookbehinds are not regular; cannot generate examples"
+            fail IllegalSyntaxError,
+              'Lookbehinds are not regular; cannot generate examples'
           else # e.g. /(?<name>namedgroup)/
             @current_position += (match[3].length + 3)
             group_id = match[3]
@@ -226,12 +237,12 @@ module RegexpExamples
     end
     def regexp_options_toggle(on, off)
-      @ignorecase = true if (on.include? "i")
-      @ignorecase = false if (off.include? "i")
-      @multiline = true if (on.include? "m")
-      @multiline = false if (off.include? "m")
-      @extended = true if (on.include? "x")
-      @extended = false if (off.include? "x")
+      @ignorecase = true if on.include? 'i'
+      @ignorecase = false if off.include? 'i'
+      @multiline = true if on.include? 'm'
+      @multiline = false if off.include? 'm'
+      @extended = true if on.include? 'x'
+      @extended = false if off.include? 'x'
     end
     def parse_char_group
@@ -252,7 +263,6 @@ module RegexpExamples
       OrGroup.new(left_repeaters, right_repeaters)
     end
     def parse_single_char_group(char)
       SingleCharGroup.new(char, @ignorecase)
     end
@@ -310,17 +320,18 @@ module RegexpExamples
     end
     def parse_reluctant_or_possessive_range_repeater(repeater, min, has_comma, max)
-        # .{1}? should be equivalent to (?:.{1})?, i.e. NOT a "non-greedy quantifier"
-        if min && !has_comma && !max && next_char == "?"
-          repeater = parse_question_mark_repeater(repeater)
-        else
-          parse_reluctant_or_possessive_repeater
-        end
-        repeater
+      # .{1}? should be equivalent to (?:.{1})?, i.e. NOT a "non-greedy quantifier"
+      if min && !has_comma && !max && next_char == '?'
+        repeater = parse_question_mark_repeater(repeater)
+      else
+        parse_reluctant_or_possessive_repeater
+      end
+      repeater
     end
     def raise_anchors_exception!
-      raise IllegalSyntaxError, "Anchors ('#{next_char}') cannot be supported, as they are not regular"
+      fail IllegalSyntaxError,
+        "Anchors ('#{next_char}') cannot be supported, as they are not regular"
     end
     def parse_one_time_repeater(group)
@@ -336,8 +347,7 @@ module RegexpExamples
     end
     def end_of_regexp
-      next_char == ")" || @current_position >= regexp_string.length
+      next_char == ')' || @current_position >= regexp_string.length
     end
   end
 end

data/lib/regexp-examples/repeaters.rb CHANGED Viewed

@@ -6,11 +6,11 @@ module RegexpExamples
     end
     def result
-      group_results = group.result[0 .. RegexpExamples.MaxGroupResults-1]
+      group_results = group.result.first(RegexpExamples.MaxGroupResults)
       results = []
       min_repeats.upto(max_repeats) do |repeats|
         if repeats.zero?
-          results << [ GroupResult.new('') ]
+          results << [GroupResult.new('')]
         else
           results << RegexpExamples.permutations_of_strings(
             [group_results] * repeats
@@ -23,8 +23,8 @@ module RegexpExamples
     def random_result
       result = []
       rand(min_repeats..max_repeats).times { result << group.random_result }
-      result << [ GroupResult.new('') ] if result.empty? # in case of 0.times
-      RegexpExamples::permutations_of_strings(result)
+      result << [GroupResult.new('')] if result.empty? # in case of 0.times
+      RegexpExamples.permutations_of_strings(result)
     end
   end
@@ -74,9 +74,9 @@ module RegexpExamples
     end
     private
     def smallest(x, y)
       (x < y) ? x : y
     end
   end
 end

data/lib/regexp-examples/unicode_char_ranges.rb ADDED Viewed

@@ -0,0 +1,45 @@
+require 'pstore'
+module RegexpExamples
+  class UnicodeCharRanges
+    # These values were generated by: scripts/unicode_lister.rb
+    # Note: Only the first 128 results are listed, for performance.
+    # Also, some groups seem to have no matches (weird!)
+    # (Don't care about ruby micro version number)
+    STORE_FILENAME = "unicode_ranges_#{RUBY_VERSION[0..2]}.pstore"
+    attr_reader :range_store
+    def initialize(filename = STORE_FILENAME)
+      @range_store = PStore.new(File.expand_path("../../../db/#{filename}", __FILE__))
+    end
+    def get(key)
+      range_store.transaction(true) do
+        ranges_to_unicode(range_store[key])
+      end
+    end
+    alias_method :[], :get
+    private
+    # TODO: Document example input/output of this method
+    # It's pretty simple, but this code is a little confusing!!
+    def ranges_to_unicode(ranges)
+      result = []
+      ranges.each do |range|
+        if range.is_a? Fixnum # Small hack to increase data compression
+          result << hex_to_unicode(range.to_s(16))
+        else
+          range.each { |num| result << hex_to_unicode(num.to_s(16)) }
+        end
+      end
+      result
+    end
+    def hex_to_unicode(hex)
+      eval("?\\u{#{hex}}")
+    end
+  end
+end

data/lib/regexp-examples/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module RegexpExamples
-  VERSION = '1.1.0'
+  VERSION = '1.1.2'
 end

data/regexp-examples.gemspec CHANGED Viewed

@@ -1,4 +1,4 @@
-require File.expand_path("../lib/regexp-examples/version", __FILE__)
+require File.expand_path('../lib/regexp-examples/version', __FILE__)
 Gem::Specification.new do |s|
   s.name             = 'regexp-examples'
@@ -11,11 +11,11 @@ Gem::Specification.new do |s|
   s.files            = `git ls-files -z`.split("\x0")
   s.executables      = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
   s.test_files       = s.files.grep(%r{^(test|spec|features)/})
-  s.require_paths    = ["lib"]
+  s.require_paths    = ['lib']
   s.homepage         =
     'http://rubygems.org/gems/regexp-examples'
-  s.add_development_dependency "bundler", "~> 1.7"
-  s.add_development_dependency "rake", "~> 10.0"
+  s.add_development_dependency 'bundler', '~> 1.7'
+  s.add_development_dependency 'rake', '~> 10.0'
   s.license          = 'MIT'
   s.required_ruby_version = '>= 2.0.0'
 end

data/scripts/unicode_lister.rb CHANGED Viewed

@@ -1,180 +1,64 @@
+require 'pstore'
+require_relative '../lib/regexp-examples/unicode_char_ranges'
 # A script to generate lists of all unicode characters
 # that match all named group/character properties regexps.
 # For use in e.g. /\p{Arabic}/.examples
 # To (re-)generate this list, simply run this file!
 # > ruby scripts/unicode_lister.rb
-OutputFilename = 'unicode_result'
 # Taken from ruby documentation:
 # http://ruby-doc.org//core-2.2.0/Regexp.html#class-Regexp-label-Character+Properties
 NamedGroups = %w(
-  Alnum
-  Alpha
-  Blank
-  Cntrl
-  Digit
-  Graph
-  Lower
-  Print
-  Punct
-  Space
-  Upper
-  XDigit
-  Word
-  ASCII
-  Any
-  Assigned
+  Alnum Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word ASCII Any Assigned
-  L
-  Ll
-  Lm
-  Lo
-  Lt
-  Lu
-  M
-  Mn
-  Mc
-  Me
-  N
-  Nd
-  Nl
-  No
-  P
-  Pc
-  Pd
-  Ps
-  Pe
-  Pi
-  Pf
-  Po
-  S
-  Sm
-  Sc
-  Sk
-  So
-  Z
-  Zs
-  Zl
-  Zp
-  C
-  Cc
-  Cf
-  Cn
-  Co
-  Cs
+  L Ll Lm Lo Lt Lu M Mn Mc Me N Nd Nl No P Pc Pd Ps Pe Pi Pf Po S Sm Sc Sk So Z Zs Zl Zp C Cc Cf Cn Co Cs
-  Arabic
-  Armenian
-  Balinese
-  Bengali
-  Bopomofo
-  Braille
-  Buginese
-  Buhid
-  Canadian_Aboriginal
-  Carian
-  Cham
-  Cherokee
-  Common
-  Coptic
-  Cuneiform
-  Cypriot
-  Cyrillic
-  Deseret
-  Devanagari
-  Ethiopic
-  Georgian
-  Glagolitic
-  Gothic
-  Greek
-  Gujarati
-  Gurmukhi
-  Han
-  Hangul
-  Hanunoo
-  Hebrew
-  Hiragana
-  Inherited
-  Kannada
-  Katakana
-  Kayah_Li
-  Kharoshthi
-  Khmer
-  Lao
-  Latin
-  Lepcha
-  Limbu
-  Linear_B
-  Lycian
-  Lydian
-  Malayalam
-  Mongolian
-  Myanmar
-  New_Tai_Lue
-  Nko
-  Ogham
-  Ol_Chiki
-  Old_Italic
-  Old_Persian
-  Oriya
-  Osmanya
-  Phags_Pa
-  Phoenician
-  Rejang
-  Runic
-  Saurashtra
-  Shavian
-  Sinhala
-  Sundanese
-  Syloti_Nagri
-  Syriac
-  Tagalog
-  Tagbanwa
-  Tai_Le
-  Tamil
-  Telugu
-  Thaana
-  Thai
-  Tibetan
-  Tifinagh
-  Ugaritic
-  Vai
-  Yi
+  Arabic Armenian Balinese Bengali Bopomofo Braille Buginese Buhid Canadian_Aboriginal Carian Cham Cherokee
+  Common Coptic Cuneiform Cypriot Cyrillic Deseret Devanagari Ethiopic Georgian Glagolitic Gothic Greek
+  Gujarati Gurmukhi Han Hangul Hanunoo Hebrew Hiragana Inherited Kannada Katakana Kayah_Li Kharoshthi Khmer
+  Lao Latin Lepcha Limbu Linear_B Lycian Lydian Malayalam Mongolian Myanmar New_Tai_Lue Nko Ogham Ol_Chiki
+  Old_Italic Old_Persian Oriya Osmanya Phags_Pa Phoenician Rejang Runic Saurashtra Shavian Sinhala Sundanese
+  Syloti_Nagri Syriac Tagalog Tagbanwa Tai_Le Tamil Telugu Thaana Thai Tibetan Tifinagh Ugaritic Vai Yi
 )
-# Note: For some reason, a character encoding-related exception gets raised
-# when I do `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")` in the range: 55296..57343
-# This means my calculation is MISSING results in the range: 55296..65535
-# However, for the sake of performance, I'm also being "lazy" and only calculating/saving
-# the first 128 matches anyway!
-# If anyone ever cares about this (I doubt it), I'll look into fixing/improving it.
+# Note: The range 55296..57343 (U+D800..U+DFFF) holds the UTF-16 surrogate code
+# points, which are reserved values that are not legal unicode characters.
+# I.e. a character encoding-related exception gets raised when you do:
+# `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")`
+# TODO: Add a link to somewhere that explains this better.
-# Example input: [1, 2, 3, 4, 6, 7, 12, 14] (Array)
-# Example output: "1..4, 6..7, 12, 14" (String)
+# "Compresses" the values in an array by using ranges.
+# Example input: [1, 2, 3, 4, 6, 7, 12, 14]
+# Example output: [1..4, 6..7, 12, 14]
 def calculate_ranges(matching_codes)
-  return "" if matching_codes.empty?
+  return [] if matching_codes.empty?
   first = matching_codes.shift
-  matching_codes.inject([first..first]) do |r,x|
+  matching_codes.inject([first..first]) do |r, x|
     if r.last.last.succ != x
       r << (x..x) # Start new range
     else
       r[0..-2] << (r.last.first..x) # Update last range
     end
   end
-    .map { |range| range.size == 1 ? range.first : range}
-    .join(", ")
+    .map { |range| range.size == 1 ? range.first : range } # Replace `int..int` with `int`
 end
 count = 0
-File.open(OutputFilename, 'w') do |f|
+filename = RegexpExamples::UnicodeCharRanges::STORE_FILENAME
+store = PStore.new(filename)
+store.transaction do
   NamedGroups.each do |name|
-  count += 1
-    matching_codes = (0..55295).lazy.select { |x| /\p{#{name}}/ =~ eval("?\\u{#{x.to_s(16)}}") }.first(128)
-    f.puts "'#{name.downcase}' => ranges_to_unicode(#{calculate_ranges(matching_codes)}),"
+    count += 1
+    # Only generating first 128 matches, for performance...
+    # (I have tried this with generating ALL examples, and it makes the ruby gem
+    # painfully slow and bloated... Especially the test suite.)
+    matching_codes = [(0..55_295), (57_344..65_535)].map(&:to_a).flatten.lazy
+                     .select { |x| /\p{#{name}}/ =~ eval("?\\u{#{x.to_s(16)}}") }
+                     .first(128)
+    store[name.downcase] = calculate_ranges(matching_codes)
     puts "(#{count}/#{NamedGroups.length}) Finished property: #{name}"
   end
-  puts "*"*50
-  puts "Finished! Result stored in: #{OutputFilename}"
+  puts '*' * 50
+  puts "Finished! Result stored in: ./db/#{filename}"
 end