Text 1.1.2 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metadata +51 -67
- data/README.rdoc +0 -28
- data/lib/text.rb +0 -6
- data/lib/text/double_metaphone.rb +0 -356
- data/lib/text/figlet.rb +0 -17
- data/lib/text/figlet/font.rb +0 -117
- data/lib/text/figlet/smusher.rb +0 -64
- data/lib/text/figlet/typesetter.rb +0 -68
- data/lib/text/levenshtein.rb +0 -65
- data/lib/text/metaphone.rb +0 -97
- data/lib/text/porter_stemming.rb +0 -171
- data/lib/text/soundex.rb +0 -61
- data/rakefile.rb +0 -44
- data/test/data/big.flf +0 -2204
- data/test/data/big.txt +0 -8
- data/test/data/chunky.flf +0 -512
- data/test/data/chunky.txt +0 -5
- data/test/data/double_metaphone.csv +0 -1218
- data/test/data/metaphone.txt +0 -51
- data/test/data/metaphone_buggy.txt +0 -52
- data/test/data/porter_stemming_input.txt +0 -23531
- data/test/data/porter_stemming_output.txt +0 -23531
- data/test/preamble.rb +0 -10
- data/test/test_double_metaphone.rb +0 -23
- data/test/test_figlet.rb +0 -17
- data/test/test_levenshtein.rb +0 -80
- data/test/test_metaphone.rb +0 -39
- data/test/test_porter_stemming.rb +0 -16
- data/test/test_soundex.rb +0 -27
    
        data/lib/text/figlet.rb
    DELETED
    
    | @@ -1,17 +0,0 @@ | |
| 1 | 
            -
            #
         | 
| 2 | 
            -
            # Ruby implementation of the Figlet program (http://www.figlet.org/).
         | 
| 3 | 
            -
            #
         | 
| 4 | 
            -
            # Author: Tim Fletcher (twoggle@gmail.com)
         | 
| 5 | 
            -
            #
         | 
| 6 | 
            -
            # Usage:
         | 
| 7 | 
            -
            #
         | 
| 8 | 
            -
            #   big_font = Text::Figlet::Font.new('big.flf')
         | 
| 9 | 
            -
            #   
         | 
| 10 | 
            -
            #   figlet = Text::Figlet::Typesetter.new(big_font)
         | 
| 11 | 
            -
            #   
         | 
| 12 | 
            -
            #   puts figlet['hello world']
         | 
| 13 | 
            -
            #
         | 
| 14 | 
            -
            #
         | 
| 15 | 
            -
            require 'text/figlet/font'
         | 
| 16 | 
            -
            require 'text/figlet/smusher'
         | 
| 17 | 
            -
            require 'text/figlet/typesetter'
         | 
    
        data/lib/text/figlet/font.rb
    DELETED
    
    | @@ -1,117 +0,0 @@ | |
module Text
module Figlet

  # Raised when a file does not start with the "flf2a" FIGfont signature
  # (this includes empty files and files whose first line is blank).
  class UnknownFontFormat < StandardError
  end

  # An in-memory FIGfont (.flf file).
  #
  # Glyphs are stored in a Hash keyed by Integer character code. Each glyph is
  # an Array of +height+ Strings, one per row, each terminated by a NUL byte
  # ("\x00") that marks the right-hand edge of the glyph.
  class Font
    attr_reader :height, :hard_blank, :old_layout

    # Reads the font stored at +filename+.
    #
    # filename::    path of the .flf file; it is opened in binary mode.
    # load_german:: when true (the default) the seven glyphs that follow the
    #               ASCII block replace codes 91-93 and 123-126; when false
    #               those glyphs are skipped.
    #
    # Raises UnknownFontFormat if the header signature is missing.
    def initialize(filename, load_german = true)
      @load_german = load_german
      @characters = {}

      # Block form so the handle is closed even when parsing raises.
      File.open(filename, 'rb') do |file|
        parse_header file.gets
        load_comments file
        load_ascii_characters file
        load_german_characters file
        load_extended_characters file
      end
    end

    # Returns the glyph rows stored for character code +char+, or nil.
    def [](char)
      @characters[char]
    end

    # True if a glyph is stored for character code +char+.
    def has_char?(char)
      @characters.has_key? char
    end

    # True if the header's print-direction field is 1.
    def right_to_left?
      @right_to_left
    end


    private

    # Parses the space-separated header line. +line+ may be nil (empty file).
    def parse_header(line)
      header = line.to_s.strip.split(/ /)
      signature = header.shift.to_s

      raise UnknownFontFormat if 'flf2a' != signature[0, 5]

      # The hard blank is the last character of the signature field.
      @hard_blank = signature[-1, 1]
      @height = header.shift.to_i
      @baseline = header.shift
      @max_length = header.shift
      @old_layout = header.shift.to_i
      @comment_count = header.shift.to_i
      # A missing field converts to 0 and therefore means left-to-right.
      @right_to_left = header.shift.to_i == 1
    end

    # Skips the comment lines announced by the header.
    def load_comments(file)
      @comment_count.times { file.gets }
    end

    # Loads the glyphs for codes 32..126; stops quietly on a truncated file
    # instead of storing a false placeholder.
    def load_ascii_characters(file)
      (32..126).each do |code|
        glyph = load_char(file)
        return unless glyph
        @characters[code] = glyph
      end
    end

    # Loads (or skips) the seven glyphs that follow the ASCII block.
    def load_german_characters(file)
      [91, 92, 93, 123, 124, 125, 126].each do |code|
        if @load_german
          glyph = load_char(file)
          return unless glyph
          @characters[code] = glyph
        else
          skip_char file
        end
      end
    end

    # Loads code-tagged glyphs until end of file. Each glyph is preceded by a
    # line whose first space-separated field is the character code.
    def load_extended_characters(file)
      until file.eof?
        tag = file.gets.strip.split(/ /).first
        # A blank line yields no fields at all.
        next if tag.nil? || tag.empty?

        if /^\-0x/i =~ tag # comment
          skip_char file
        else
          glyph = load_char(file)
          return unless glyph
          @characters[parse_char_code(tag)] = glyph
        end
      end
    end

    # Converts a code tag to an Integer: "0x.." is hexadecimal, a leading "0"
    # (or "-0") is octal, plain digits are decimal. Anything else is returned
    # unchanged so that it cannot collide with a numeric code.
    def parse_char_code(tag)
      if /^0x/i =~ tag
        tag[2..-1].hex
      elsif ('0' == tag[0, 1] && '0' != tag) || '-0' == tag[0, 2]
        tag.oct
      elsif /\A-?\d+\z/ =~ tag
        tag.to_i
      else
        tag
      end
    end

    # Reads one glyph (+height+ rows). Returns false if the file ends before
    # the glyph is complete.
    def load_char(file)
      rows = []
      @height.times do
        return false if file.eof?
        row = strip_endmarks(file.gets.rstrip)
        # NUL marks the right edge of the glyph.
        row << "\x00"
        rows << row
      end
      rows
    end

    # Removes the trailing run of endmark characters (whatever character the
    # row ends with) and leaves earlier occurrences of it untouched.
    def strip_endmarks(row)
      endmark = row[-1, 1]
      return row if endmark.nil?
      row = row[0...-1] while row[-1, 1] == endmark
      row
    end

    # Discards one glyph (+height+ rows), stopping early at end of file.
    def skip_char(file)
      @height.times do
        return if file.eof?
        file.gets
      end
    end

  end

end # module Figlet
end # module Text
    
        data/lib/text/figlet/smusher.rb
    DELETED
    
    | @@ -1,64 +0,0 @@ | |
module Text
module Figlet

  # Applies the FIGfont "controlled smushing" rules to the column where two
  # adjacent glyphs meet. The meeting point is the NUL byte ("\x00") that
  # terminates each glyph row.
  #
  # The font object must respond to +height+, +hard_blank+ and +old_layout+.
  class Smusher

    # Lookup tables, keyed by the old_layout bit(s) of the rules using them.
    SYMBOLS = {
      24 => '|/\\[]{}()<>',
      8 => {'[' => ']', ']' => '[', '{' => '}', '}' => '{', '(' => ')', ')' => '('},
      16 => {"/\\" => '|', "\\/" => 'Y', '><' => 'X'}
    }.freeze

    # Hierarchy classes for rule 3, lowest first. A character of a later class
    # replaces one of an earlier class; members of one class never smush here.
    HIERARCHY_CLASSES = ['|', '/\\', '[]', '{}', '()', '<>'].freeze

    def initialize(font)
      @font = font
    end

    # Smushes the rows of +result+ (an Array of Strings) in place.
    # If any row could be smushed, every row is closed up by one column.
    def [](result)
      todo = false

      @font.height.times do |j|
        result[j] = result[j].sub(pattern) do
          todo, replacement = callback(todo, Regexp.last_match(1), Regexp.last_match(2))
          replacement
        end
      end
      @font.height.times do |j|
        result[j] = if todo
          result[j].sub(/\s\x00(?!$)|\x00\s/, '').sub(/\x00(?!$)/, '')
        else
          result[j].sub(/\x00(?!$)/, '')
        end
      end
    end

    # Matches two visible characters (not hard blank, NUL or whitespace)
    # separated by a glyph boundary.
    def pattern
      @pattern ||= begin
        # Escaped so that a hard blank such as "\\" or "]" cannot corrupt the
        # character class.
        blank = Regexp.escape(@font.hard_blank.to_s)
        /([^#{blank}\x00\s])\x00([^#{blank}\x00\s])/
      end
    end

    # Returns the rule lookup tables.
    def symbols
      SYMBOLS
    end

    # True if bit(s) +n+ are set in the font's old_layout value.
    def old_layout?(n)
      @font.old_layout & n > 0
    end

    # Decides how characters +a+ (left) and +b+ (right) combine.
    # Returns [true, char] if a rule applies, otherwise [s, "a\x00b"], i.e. the
    # incoming flag +s+ and the unchanged pair.
    def callback(s, a, b)
      smushed = smush_pair(a, b)
      if smushed
        return true, smushed
      else
        return s, "#{a}\00#{b}"
      end
    end

    private

    # Returns the character that replaces the pair, or nil if no enabled rule
    # applies. Rules are tried in order of their layout bit: 1, 2, 4, 8, 16, 32.
    def smush_pair(a, b)
      # Rule 1: equal characters.
      return a if old_layout?(1) && a == b

      # Rule 2: an underscore is replaced by the other (hierarchy) character.
      if old_layout?(2)
        return b if '_' == a && symbols[24].include?(b)
        return a if '_' == b && symbols[24].include?(a)
      end

      # Rule 3: the character of the higher class wins; characters of the same
      # class are left for the rules below.
      if old_layout?(4)
        left, right = hierarchy_rank(a), hierarchy_rank(b)
        return (right > left ? b : a) if left && right && left != right
      end

      # Rule 4: opposing brackets become a vertical bar.
      return '|' if old_layout?(8) && symbols[8][b] == a

      # Rule 5: "/\" => "|", "\/" => "Y", "><" => "X".
      combined = a + b
      return symbols[16][combined] if old_layout?(16) && symbols[16].has_key?(combined)

      # Rule 6: two hard blanks.
      return @font.hard_blank if old_layout?(32) && a == b && @font.hard_blank == a

      nil
    end

    # Index of the hierarchy class containing +char+, or nil.
    def hierarchy_rank(char)
      HIERARCHY_CLASSES.index { |members| members.include?(char) }
    end

  end

end # module Figlet
end # module Text
| @@ -1,68 +0,0 @@ | |
module Text
module Figlet

  # Renders a string as FIGlet text using a font object that responds to
  # +height+, +hard_blank+, +old_layout+, +right_to_left?+, +has_char?+ and +[]+.
  class Typesetter

    # font::    the font to render with.
    # options:: optional Hash; :smush => false disables smushing
    #           (smushing is on by default).
    def initialize(font, options = nil)
      @font = font
      @options = options || {}
      @smush = @options.has_key?(:smush) ? @options[:smush] : true
    end

    # Returns +str+ rendered as a multi-line String.
    #
    # Characters are looked up by Integer code point. A character the font
    # lacks is replaced by the glyph for code 0 if the font has one, otherwise
    # it is skipped.
    def [](str)
      result = []
      char_codes(str).each_with_index do |char, i|
        unless @font.has_char?(char)
          next unless @font.has_char?(0)
          char = 0
        end
        append_glyph result, @font[char]
        # A negative old_layout means no kerning or smushing at all.
        if @font.old_layout > -1 && i > 0
          kern result
          smush[result] if @smush
        end
      end
      # Drop the glyph-boundary markers and turn hard blanks into spaces.
      return result.join("\n").gsub(/\0/, '').gsub(@font.hard_blank, ' ')
    end


    private

    # Integer code points of +str+. Font glyphs are keyed by Integer, so a
    # one-character String (what str[i] yields on Ruby >= 1.9) never matches.
    # Falls back to raw bytes when +str+ is not valid UTF-8.
    def char_codes(str)
      str.unpack('U*')
    rescue ArgumentError
      str.unpack('C*')
    end

    # Adds +glyph+ to every row of +result+, on the side required by the
    # font's print direction.
    def append_glyph(result, glyph)
      @font.height.times do |j|
        line = glyph[j]
        if result[j].nil?
          result[j] = line
        else
          result[j] = @font.right_to_left? ? (line + result[j]) : (result[j] + line)
        end
      end
    end

    # Closes up the newest glyph boundary: finds the narrowest run of
    # whitespace around the boundary marker over all rows and removes all but
    # one column of it from every row.
    def kern(result)
      diff = -1
      @font.height.times do |j|
        if match = /\S(\s*\x00\s*)\S/.match(result[j])
          len = match[1].length
          diff = (diff == -1 ? len : min(diff, len))
        end
      end
      diff -= 1
      return unless diff > 0

      @font.height.times do |j|
        if match = /\x00(\s{0,#{diff}})/.match(result[j])
          b = diff - match[1].length
          # "\0" here is a literal NUL (the boundary marker), not a backreference.
          result[j] = result[j].sub(/\s{0,#{b}}\x00\s{#{match[1].length}}/, "\0")
        end
      end
    end

    def min(a, b)
      a > b ? b : a
    end

    # Lazily built Smusher for this font.
    def smush
      @smusher ||= Smusher.new(@font)
    end

  end

end # module Figlet
end # module Text
    
        data/lib/text/levenshtein.rb
    DELETED
    
    | @@ -1,65 +0,0 @@ | |
| 1 | 
            -
            #
         | 
| 2 | 
            -
            # Levenshtein distance algorithm implementation for Ruby, with UTF-8 support.
         | 
| 3 | 
            -
            #
         | 
| 4 | 
            -
            # The Levenshtein distance is a measure of how similar two strings s and t are,
         | 
| 5 | 
            -
            # calculated as the number of deletions/insertions/substitutions needed to
         | 
| 6 | 
            -
            # transform s into t. The greater the distance, the more the strings differ.
         | 
| 7 | 
            -
            #
         | 
| 8 | 
            -
            # The Levenshtein distance is also sometimes referred to as the
         | 
| 9 | 
            -
            # easier-to-pronounce-and-spell 'edit distance'.
         | 
| 10 | 
            -
            #
         | 
| 11 | 
            -
            # Author: Paul Battley (pbattley@gmail.com)
         | 
| 12 | 
            -
            #
         | 
| 13 | 
            -
             | 
module Text # :nodoc:
module Levenshtein

  # Calculate the Levenshtein distance between two strings +str1+ and +str2+:
  # the number of single-character insertions, deletions and substitutions
  # needed to turn one into the other.
  #
  # On Ruby versions whose strings carry an encoding, each string is
  # transcoded to UTF-8 and compared code point by code point; a string that
  # cannot be transcoded or decoded (e.g. arbitrary binary data) is compared
  # byte by byte. On older Rubies the strings are treated as UTF-8 if $KCODE
  # is set appropriately (i.e. 'u') and as bytes otherwise.
  #
  # When using Unicode text, be aware that this algorithm does not perform
  # normalisation. If there is a possibility of different normalised forms
  # being used, normalisation should be performed beforehand.
  #
  def distance(str1, str2)
    s = code_units(str1)
    t = code_units(str2)
    n = s.length
    m = t.length
    return m if (0 == n)
    return n if (0 == m)

    # d holds the previous row of the edit-distance matrix and is updated in
    # place; e is the cell to the left in the current row.
    d = (0..m).to_a
    x = nil

    (0...n).each do |i|
      e = i+1
      (0...m).each do |j|
        cost = (s[i] == t[j]) ? 0 : 1
        x = [
          d[j+1] + 1, # insertion
          e + 1,      # deletion
          d[j] + cost # substitution
        ].min
        d[j] = e
        e = x
      end
      d[m] = x
    end

    return x
  end

private

  # Returns the Array of Integers that +str+ is compared by: code points
  # where the text can be decoded, raw bytes otherwise.
  def code_units(str)
    if str.respond_to?(:encoding)
      begin
        str.encode(Encoding::UTF_8).unpack('U*')
      rescue EncodingError, ArgumentError
        # Not transcodable, or malformed UTF-8: compare the raw bytes.
        str.unpack('C*')
      end
    elsif $KCODE =~ /^U/i
      str.unpack('U*')
    else
      str.unpack('C*')
    end
  end

  extend self
end
end
    
        data/lib/text/metaphone.rb
    DELETED
    
    | @@ -1,97 +0,0 @@ | |
| 1 | 
            -
            # 
         | 
| 2 | 
            -
            # An implementation of the Metaphone phonetic coding system in Ruby.
         | 
| 3 | 
            -
            # 
         | 
| 4 | 
            -
            # Metaphone encodes names into a phonetic form such that similar-sounding names
         | 
| 5 | 
            -
            # have the same or similar Metaphone encodings.
         | 
| 6 | 
            -
            # 
         | 
| 7 | 
            -
            # The original system was described by Lawrence Philips in Computer Language
         | 
| 8 | 
            -
            # Vol. 7 No. 12, December 1990, pp 39-43.
         | 
| 9 | 
            -
            # 
         | 
| 10 | 
            -
            # As there are multiple implementations of Metaphone, each with their own
         | 
| 11 | 
            -
            # quirks, I have based this on my interpretation of the algorithm specification.
         | 
| 12 | 
            -
            # Even LP's original BASIC implementation appears to contain bugs (specifically
         | 
| 13 | 
            -
            # with the handling of CC and MB), when compared to his explanation of the
         | 
| 14 | 
            -
            # algorithm.
         | 
| 15 | 
            -
            # 
         | 
| 16 | 
            -
            # I have also compared this implementation with that found in PHP's standard
         | 
| 17 | 
            -
            # library, which appears to mimic the behaviour of LP's original BASIC
         | 
| 18 | 
            -
            # implementation. For compatibility, these rules can also be used by passing
         | 
| 19 | 
            -
            # :buggy=>true to the methods.
         | 
| 20 | 
            -
            # 
         | 
| 21 | 
            -
            # Author: Paul Battley (pbattley@gmail.com)
         | 
| 22 | 
            -
            #
         | 
| 23 | 
            -
             | 
module Text # :nodoc:
module Metaphone

  module Rules # :nodoc:all

    # Metaphone rules.  These are simply applied in order, each to the whole
    # word. Input is lower case and replacements are upper case, so a
    # character that has already been encoded is never matched again.
    #
    # The table is frozen: it is shared by every call and must not be edited
    # at run time.
    #
    STANDARD = [
      # Regexp, replacement
      [ /([bcdfhjklmnpqrstvwxyz])\1+/,
                         '\1' ],  # Remove doubled consonants except g.
                                  # [PHP] remove c from regexp.
      [ /^ae/,            'E' ],
      [ /^[gkp]n/,        'N' ],
      [ /^wr/,            'R' ],
      [ /^x/,             'S' ],
      [ /^wh/,            'W' ],
      [ /mb$/,            'M' ],  # [PHP] remove $ from regexp.
      [ /(?!^)sch/,      'SK' ],
      [ /th/,             '0' ],
      [ /t?ch|sh/,        'X' ],
      [ /c(?=ia)/,        'X' ],
      [ /[st](?=i[ao])/,  'X' ],
      [ /s?c(?=[iey])/,   'S' ],
      [ /[cq]/,           'K' ],
      [ /dg(?=[iey])/,    'J' ],
      [ /d/,              'T' ],
      [ /g(?=h[^aeiou])/, ''  ],
      [ /gn(ed)?/,        'N' ],
      [ /([^g]|^)g(?=[iey])/,
                        '\1J' ],
      [ /g+/,             'K' ],
      [ /ph/,             'F' ],
      [ /([aeiou])h(?=\b|[^aeiou])/,
                         '\1' ],
      [ /[wy](?![aeiou])/, '' ],
      [ /z/,              'S' ],
      [ /v/,              'F' ],
      [ /(?!^)[aeiou]+/,  ''  ],
    ].each { |rule| rule.freeze }.freeze

    # The rules for the 'buggy' alternate implementation used by PHP etc.
    # Identical to STANDARD except for the two entries marked [PHP] above:
    # doubled c is kept, and mb becomes M anywhere in the word.
    #
    buggy = STANDARD.dup
    buggy[0] = [ /([bdfhjklmnpqrstvwxyz])\1+/, '\1' ].freeze
    buggy[6] = [ /mb/, 'M' ].freeze
    BUGGY = buggy.freeze
  end

  # Returns the Metaphone representation of a string. If the string contains
  # multiple words, each word in turn is converted into its Metaphone
  # representation and the results are joined with single spaces. Note that
  # only the letters A-Z are supported, so any language-specific processing
  # should be done beforehand.
  #
  # If the :buggy option is set, alternate 'buggy' rules are used.
  #
  def metaphone(str, options={})
    return str.strip.split(/\s+/).map { |w| metaphone_word(w, options) }.join(' ')
  end

private

  # Encodes a single word: lower-cases it, drops everything except a-z,
  # applies the selected rule table in order and upper-cases the result.
  def metaphone_word(w, options={})
    # Normalise case and remove non-ASCII
    s = w.downcase.gsub(/[^a-z]/, '')
    # Apply the Metaphone rules
    rules = options[:buggy] ? Rules::BUGGY : Rules::STANDARD
    rules.each { |rx, rep| s.gsub!(rx, rep) }
    return s.upcase
  end

  extend self

end
end
    
        data/lib/text/porter_stemming.rb
    DELETED
    
    | @@ -1,171 +0,0 @@ | |
| 1 | 
            -
            #
         | 
| 2 | 
            -
            # This is the Porter Stemming algorithm, ported to Ruby from the
         | 
| 3 | 
            -
            # version coded up in Perl.  It's easy to follow against the rules
         | 
| 4 | 
            -
            # in the original paper in:
         | 
| 5 | 
            -
            #
         | 
| 6 | 
            -
            #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
         | 
| 7 | 
            -
            #   no. 3, pp. 130-137.
         | 
| 8 | 
            -
            #
         | 
| 9 | 
            -
            # Taken from http://www.tartarus.org/~martin/PorterStemmer (Public Domain)
         | 
| 10 | 
            -
            #
         | 
| 11 | 
            -
            module Text # :nodoc:
            # Porter stemming: reduces an English word to its stem by stripping
            # suffixes in a fixed sequence of steps (1a, 1b, 1c, 2, 3, 4, 5).
            #
            # The only public entry point is PorterStemming.stem(word). The patterns
            # only recognise lower-case letters; the input is not case-folded here.
            module PorterStemming

              # Step 2 suffix => replacement, applied when the remaining stem has m > 0.
              STEP_2_LIST = {
                'ational' => 'ate', 'tional' => 'tion', 'enci' => 'ence', 'anci' => 'ance',
                'izer' => 'ize', 'bli' => 'ble',
                'alli' => 'al', 'entli' => 'ent', 'eli' => 'e', 'ousli' => 'ous',
                'ization' => 'ize', 'ation' => 'ate',
                'ator' => 'ate', 'alism' => 'al', 'iveness' => 'ive', 'fulness' => 'ful',
                'ousness' => 'ous', 'aliti' => 'al',
                'iviti' => 'ive', 'biliti' => 'ble', 'logi' => 'log'
              }.freeze

              # Step 3 suffix => replacement, applied when the remaining stem has m > 0.
              STEP_3_LIST = {
                'icate' => 'ic', 'ative' => '', 'alize' => 'al', 'iciti' => 'ic',
                'ical' => 'ic', 'ful' => '', 'ness' => ''
              }.freeze

              # Matches any step 2 suffix at the end of the word; $1 is the suffix.
              SUFFIX_1_REGEXP = /(
                                ational  |
                                tional   |
                                enci     |
                                anci     |
                                izer     |
                                bli      |
                                alli     |
                                entli    |
                                eli      |
                                ousli    |
                                ization  |
                                ation    |
                                ator     |
                                alism    |
                                iveness  |
                                fulness  |
                                ousness  |
                                aliti    |
                                iviti    |
                                biliti   |
                                logi)$/x

              # Matches any step 4 suffix at the end of the word.
              SUFFIX_2_REGEXP = /(
                                  al       |
                                  ance     |
                                  ence     |
                                  er       |
                                  ic       |
                                  able     |
                                  ible     |
                                  ant      |
                                  ement    |
                                  ment     |
                                  ent      |
                                  ou       |
                                  ism      |
                                  ate      |
                                  iti      |
                                  ous      |
                                  ive      |
                                  ize)$/x

              # Matches any step 3 suffix at the end of the word; $1 is the suffix.
              SUFFIX_3_REGEXP = /(icate|ative|alize|iciti|ical|ful|ness)$/

              C = "[^aeiou]".freeze             # consonant
              V = "[aeiouy]".freeze             # vowel
              CC = "#{C}(?>[^aeiouy]*)".freeze  # consonant sequence
              VV = "#{V}(?>[aeiou]*)".freeze    # vowel sequence

              MGR0 = /^(#{CC})?#{VV}#{CC}/                # [cc]vvcc... is m>0
              MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/       # [cc]vvcc[vv] is m=1
              MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/      # [cc]vvccvvcc... is m>1
              VOWEL_IN_STEM   = /^(#{CC})?#{V}/           # vowel in stem

              # Consonant sequence, one vowel, then a single final consonant other
              # than w, x or y (used by steps 1b and 5).
              CVC = /^#{CC}#{V}[^aeiouwxy]$/

              # Returns the Porter stem of +word+.
              #
              # +word+ is duplicated and converted with #to_str, so the caller's
              # object is never modified. Words shorter than three characters are
              # returned as they are.
              def self.stem(word)
                # make a copy of the given object and convert it to a string.
                word = word.dup.to_str

                return word if word.length < 3

                # map initial y to Y so that the patterns never treat it as vowel
                word[0] = 'Y' if word[0] == ?y

                word = step_1a(word)
                word = step_1b(word)
                word = step_1c(word)
                word = step_2(word)
                word = step_3(word)
                word = step_4(word)
                word = step_5(word)

                # and turn initial Y back to y
                word[0] = 'y' if word[0] == ?Y

                word
              end

              # Step 1a: plurals ("sses" -> "ss", "ies" -> "i", trailing "s" dropped
              # unless preceded by another "s").
              def self.step_1a(word)
                return $` + $1 if word =~ /(ss|i)es$/ || word =~ /([^s])s$/
                word
              end

              # Step 1b: "eed" -> "ee" when the stem has m > 0; otherwise strip
              # "ed"/"ing" when the stem contains a vowel, then tidy the stem.
              def self.step_1b(word)
                if word =~ /eed$/
                  return ($` =~ MGR0) ? word.chop : word
                end
                return word unless word =~ /(ed|ing)$/

                stem = $`
                return word unless stem =~ VOWEL_IN_STEM

                case stem
                when /(at|bl|iz)$/       then stem + "e"   # restore the final e
                when /([^aeiouylsz])\1$/ then stem.chop    # undouble the consonant
                when CVC                 then stem + "e"   # short stem: restore e
                else stem
                end
              end

              # Step 1c: final "y" -> "i" when the stem contains a vowel.
              def self.step_1c(word)
                return word unless word =~ /y$/
                stem = $`
                (stem =~ VOWEL_IN_STEM) ? stem + "i" : word
              end

              # Step 2: replace a STEP_2_LIST suffix when the stem has m > 0.
              def self.step_2(word)
                return word unless word =~ SUFFIX_1_REGEXP
                stem, suffix = $`, $1
                (stem =~ MGR0) ? stem + STEP_2_LIST[suffix] : word
              end

              # Step 3: replace a STEP_3_LIST suffix when the stem has m > 0.
              def self.step_3(word)
                return word unless word =~ SUFFIX_3_REGEXP
                stem, suffix = $`, $1
                (stem =~ MGR0) ? stem + STEP_3_LIST[suffix] : word
              end

              # Step 4: drop a SUFFIX_2_REGEXP suffix, or "ion" after s/t, when the
              # remaining stem has m > 1. The "ion" rule is only tried when no
              # SUFFIX_2_REGEXP suffix matched at all.
              def self.step_4(word)
                if word =~ SUFFIX_2_REGEXP
                  stem = $`
                elsif word =~ /(s|t)(ion)$/
                  stem = $` + $1
                else
                  return word
                end
                (stem =~ MGR1) ? stem : word
              end

              # Step 5: drop a final "e" (m > 1, or m = 1 and the stem is not a short
              # consonant-vowel-consonant), then reduce a final "ll" to "l" when m > 1.
              def self.step_5(word)
                if word =~ /e$/
                  stem = $`
                  if (stem =~ MGR1) || (stem =~ MEQ1 && stem !~ CVC)
                    word = stem
                  end
                end

                (word =~ /ll$/ && word =~ MGR1) ? word.chop : word
              end

              private_class_method :step_1a, :step_1b, :step_1c,
                                   :step_2, :step_3, :step_4, :step_5

            end
            end
         |