regexp_optimized_union 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/regexp_optimized_union.rb +76 -15
- data/readme.md +2 -1
- metadata +2 -2
| @@ -1,3 +1,14 @@ | |
| 1 | 
            +
            # Compatibility shim for Ruby versions older than 1.9.
            if RUBY_VERSION < '1.9'
              require 'enumerator'
              class String
                # Only add #ord when String instances do not already respond to it.
                # method_defined? inspects the class's instance methods; a bare
                # `defined?(ord)` in a class body is resolved against the String class
                # object itself, so it can never see an instance-level #ord.
                unless method_defined?(:ord)
                  # Returns the first byte of the string as an Integer.
                  def ord
                    unpack('C').first
                  end
                end
              end
            end
         | 
| 11 | 
            +
             | 
| 1 12 | 
             
            class Regexp
         | 
| 2 13 | 
             
              # trie for optimization
         | 
| 3 14 | 
             
              class OptimizeTrie < Hash
         | 
| @@ -14,7 +25,7 @@ class Regexp | |
| 14 25 | 
             
                def single_char?
         | 
| 15 26 | 
             
                  size == 1 and values[0].empty?
         | 
| 16 27 | 
             
                end
         | 
| 17 | 
            -
             | 
| 28 | 
            +
             | 
| 18 29 | 
             
                # prereq: single_branch?
         | 
| 19 30 | 
             
                def to_chars
         | 
| 20 31 | 
             
                  if empty?
         | 
| @@ -23,7 +34,7 @@ class Regexp | |
| 23 34 | 
             
                    [keys[0], *values[0].to_chars]
         | 
| 24 35 | 
             
                  end
         | 
| 25 36 | 
             
                end
         | 
| 26 | 
            -
             | 
| 37 | 
            +
             | 
| 27 38 | 
             
                # prereq: opt_suffix
         | 
| 28 39 | 
             
                # returns: regexp src
         | 
| 29 40 | 
             
                def extract_common_suffix
         | 
| @@ -46,12 +57,12 @@ class Regexp | |
| 46 57 | 
             
                      break
         | 
| 47 58 | 
             
                    end
         | 
| 48 59 | 
             
                  end
         | 
| 49 | 
            -
             | 
| 60 | 
            +
             | 
| 50 61 | 
             
                  if common_size
         | 
| 51 62 | 
             
                    common = branches[0].take(common_size).reverse.join
         | 
| 52 63 | 
             
                    if branches.all?{|b| b.size == common_size + 1 }
         | 
| 53 | 
            -
                      diff = branches.map | 
| 54 | 
            -
                      " | 
| 64 | 
            +
                      diff = build_char_group(branches.map &:last)
         | 
| 65 | 
            +
                      "#{diff}#{common}"
         | 
| 55 66 | 
             
                    else
         | 
| 56 67 | 
             
                      diff = branches.map do |b|
         | 
| 57 68 | 
             
                        b.drop(common_size).reverse.join
         | 
| @@ -61,25 +72,67 @@ class Regexp | |
| 61 72 | 
             
                  end
         | 
| 62 73 | 
             
                end
         | 
| 63 74 |  | 
| 75 | 
            +
                # Builds the source of a regexp character class (e.g. "[a-df]") from
                # a list of single characters.
                #
                # chars   - Array of one-character Strings.
                # returns - the lone element unchanged when exactly one char is given,
                #           otherwise a bracketed character class String.
                #
                # Runs of 4 or more contiguous character codes are collapsed into a range
                # ("a-d"); shorter runs are listed char by char. Characters with a special
                # meaning inside a class (\ [ ] ^ -) are backslash-escaped. Every emitted
                # character is taken from the input itself instead of being rebuilt with
                # Integer#chr, so characters outside the single-byte range are preserved.
                def build_char_group chars
                  return chars.first if chars.size == 1

                  if RUBY_VERSION < '1.9'
                    # multi-byte chars are kept out of the range logic on old rubies
                    # and appended verbatim at the end of the class
                    chars, mb_chars = chars.partition{|c| c.bytesize == 1}
                  else
                    mb_chars = []
                  end

                  # character code => literal to emit inside the class (escaped if needed)
                  literals = {}
                  chars.each do |c|
                    literals[c.ord] = c.gsub(/[\\\[\]\^\-]/){|m| "\\#{m}"}
                  end

                  # merge sorted codes into runs of contiguous values
                  groups = []
                  literals.keys.sort.each do |code|
                    if groups.last and code == groups.last.end + 1
                      groups[-1] = groups.last.begin..code
                    else
                      groups << (code..code)
                    end
                  end

                  groups.map! do |range|
                    # only apply range to >= 4 contiguous chars
                    if range.end >= range.begin + 3
                      "#{literals[range.begin]}-#{literals[range.end]}"
                    else
                      range.map{|code| literals[code]}.join
                    end
                  end

                  "[#{groups.join}#{mb_chars.join}]"
                end
         | 
| 109 | 
            +
             | 
| 64 110 | 
             
                def to_re_src
         | 
| 65 111 | 
             
                  return '' if empty?
         | 
| 66 | 
            -
             | 
| 112 | 
            +
             | 
| 67 113 | 
             
                  res = extract_common_suffix if opt_suffix
         | 
| 114 | 
            +
                  char_group = false
         | 
| 68 115 | 
             
                  if !res
         | 
| 69 116 | 
             
                    can_be_branched = true
         | 
| 70 | 
            -
                     | 
| 117 | 
            +
                    branches = map do |key, value|
         | 
| 71 118 | 
             
                      "#{key}#{value.to_re_src}"
         | 
| 72 | 
            -
                    end | 
| 119 | 
            +
                    end
         | 
| 120 | 
            +
                    if branches.all?{|b| b.bytesize == 1}
         | 
| 121 | 
            +
                      char_group = true
         | 
| 122 | 
            +
                      res = build_char_group branches
         | 
| 123 | 
            +
                    else
         | 
| 124 | 
            +
                      res = branches.join '|'
         | 
| 125 | 
            +
                    end
         | 
| 73 126 | 
             
                  end
         | 
| 74 | 
            -
             | 
| 127 | 
            +
             | 
| 75 128 | 
             
                  if opt_maybe
         | 
| 76 | 
            -
                    if single_char?
         | 
| 129 | 
            +
                    if char_group or single_char?
         | 
| 77 130 | 
             
                      "#{res}?"
         | 
| 78 131 | 
             
                    else
         | 
| 79 132 | 
             
                      "(?:#{res})?"
         | 
| 80 133 | 
             
                    end
         | 
| 81 134 | 
             
                  else
         | 
| 82 | 
            -
                    if can_be_branched and size > 1 and parent
         | 
| 135 | 
            +
                    if can_be_branched and size > 1 and parent and !char_group
         | 
| 83 136 | 
             
                      "(?:#{res})"
         | 
| 84 137 | 
             
                    else
         | 
| 85 138 | 
             
                      res
         | 
| @@ -125,8 +178,11 @@ class Regexp | |
| 125 178 | 
             
            end
         | 
| 126 179 |  | 
| 127 180 | 
             
            if __FILE__ == $PROGRAM_NAME
         | 
| 181 | 
            +
              # NOTE test will fail under ruby 1.8.7 due to hash order, but results should be identical
         | 
| 182 | 
            +
              success = true
         | 
| 128 183 | 
             
              {
         | 
| 129 184 | 
             
                %w[]                        => //,
         | 
| 185 | 
            +
                %w[a b c d f]               => /[a-df]/,
         | 
| 130 186 | 
             
                %w[foo]                     => /foo/,
         | 
| 131 187 | 
             
                %w[foo bar]                 => /foo|bar/,
         | 
| 132 188 | 
             
                %w[foo foob bar]            => /foob?|bar/,
         | 
| @@ -134,17 +190,22 @@ if __FILE__ == $PROGRAM_NAME | |
| 134 190 | 
             
                %w[bazfoo bazfoobar bazbar] => /baz(?:foo(?:bar)?|bar)/,
         | 
| 135 191 | 
             
                %w[fooabar foobbar]         => /foo[ab]bar/,
         | 
| 136 192 | 
             
                %w[fooabar foobazbar]       => /foo(?:a|baz)bar/,
         | 
| 137 | 
            -
                %w[foobar fooabar foogabar] => /foo(?:|a|ga)bar | 
| 193 | 
            +
                %w[foobar fooabar foogabar] => /foo(?:|a|ga)bar/,
         | 
| 194 | 
            +
                %w[vax vcx vbx vdx]         => /v[a-d]x/,
         | 
| 195 | 
            +
                %w[vax vcx vbx]             => /v[abc]x/,
         | 
| 196 | 
            +
                %w[xa xc xb x]              => /x[abc]?/
         | 
| 138 197 | 
             
              }.each do |a, r|
         | 
| 139 198 | 
             
                l = Regexp.optimized_union a
         | 
| 140 199 | 
             
                a.each do |s|
         | 
| 141 200 | 
             
                  if l.match(s).offset(0) != [0, s.size]
         | 
| 142 | 
            -
                     | 
| 201 | 
            +
                    success = false
         | 
| 202 | 
            +
                    puts "#{l.inspect} from #{a.inspect} not match #{s.inspect}"
         | 
| 143 203 | 
             
                  end
         | 
| 144 204 | 
             
                end
         | 
| 145 205 | 
             
                if r != l
         | 
| 146 | 
            -
                   | 
| 206 | 
            +
                  success = false
         | 
| 207 | 
            +
                  puts "expected #{r} from #{a.inspect} but got #{l}"
         | 
| 147 208 | 
             
                end
         | 
| 148 209 | 
             
              end
         | 
| 149 | 
            -
              puts 'test success!'
         | 
| 210 | 
            +
              puts 'test success!' if success
         | 
| 150 211 | 
             
            end
         | 
    
        data/readme.md
    CHANGED
    
    | @@ -1,4 +1,4 @@ | |
| 1 | 
            -
            `Regexp.optimized_union(word_list, regexp_options)` generates optimized regexp for matching union of word list. | 
| 1 | 
            +
            `Regexp.optimized_union(word_list, regexp_options)` generates optimized regexp for matching union of word list.
         | 
| 2 2 |  | 
| 3 3 | 
             
            ### Install:
         | 
| 4 4 |  | 
| @@ -24,6 +24,7 @@ Regexp.optimized_union(%w[foobar fooabar foogabar]) #=> /foo(?:|a|ga)bar/ | |
| 24 24 |  | 
| 25 25 | 
             
            - Treed common prefix extraction.
         | 
| 26 26 | 
             
            - Common suffix aggregation.
         | 
| 27 | 
            +
            - If a char group contains 4 or more contiguous chars, they are collapsed into a char range.
         | 
| 27 28 | 
             
            - Optional leaf to `?`.
         | 
| 28 29 |  | 
| 29 30 | 
             
            Mostly the same as described in http://search.cpan.org/~dankogai/Regexp-Optimizer-0.15/lib/Regexp/List.pm#IMPLEMENTATION
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: regexp_optimized_union
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.1. | 
| 4 | 
            +
              version: 0.1.2
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
            platform: ruby
         | 
| 7 7 | 
             
            authors:
         | 
| @@ -9,7 +9,7 @@ authors: | |
| 9 9 | 
             
            autorequire: 
         | 
| 10 10 | 
             
            bindir: bin
         | 
| 11 11 | 
             
            cert_chain: []
         | 
| 12 | 
            -
            date: 2012-11- | 
| 12 | 
            +
            date: 2012-11-14 00:00:00.000000000 Z
         | 
| 13 13 | 
             
            dependencies: []
         | 
| 14 14 | 
             
            description: Regexp.optimized_union(word_list, regexp_options) generates optimized
         | 
| 15 15 | 
             
              regexp for matching union of word list
         |