RubyGems - text - Versions diffs - 0.1.13 - Mend

text 0.1.13

Files changed (30)

data/README.rdoc +28 -0
data/Rakefile +48 -0
data/lib/text.rb +7 -0
data/lib/text/double_metaphone.rb +356 -0
data/lib/text/figlet.rb +17 -0
data/lib/text/figlet/font.rb +117 -0
data/lib/text/figlet/smusher.rb +64 -0
data/lib/text/figlet/typesetter.rb +68 -0
data/lib/text/levenshtein.rb +65 -0
data/lib/text/metaphone.rb +97 -0
data/lib/text/porter_stemming.rb +171 -0
data/lib/text/soundex.rb +61 -0
data/lib/text/version.rb +9 -0
data/test/data/big.flf +2204 -0
data/test/data/big.txt +8 -0
data/test/data/chunky.flf +512 -0
data/test/data/chunky.txt +5 -0
data/test/data/double_metaphone.csv +1218 -0
data/test/data/metaphone.txt +51 -0
data/test/data/metaphone_buggy.txt +52 -0
data/test/data/porter_stemming_input.txt +23531 -0
data/test/data/porter_stemming_output.txt +23531 -0
data/test/preamble.rb +10 -0
data/test/test_double_metaphone.rb +23 -0
data/test/test_figlet.rb +17 -0
data/test/test_levenshtein.rb +80 -0
data/test/test_metaphone.rb +39 -0
data/test/test_porter_stemming.rb +16 -0
data/test/test_soundex.rb +27 -0
metadata +85 -0

@@ -0,0 +1,117 @@
module Text
module Figlet
  # Raised by Font#initialize when the file does not begin with the "flf2a"
  # FIGfont signature (this includes an empty file).
  class UnknownFontFormat < StandardError
  end

  # A FIGlet font read from a FIGfont version 2 (.flf) file.
  #
  # Glyphs are stored by Integer character code.  Each glyph is an Array of
  # +height+ row Strings; every row has its end marks removed and a single
  # NUL ("\x00") appended, which the typesetter uses to locate the junction
  # between adjacent glyphs.
  class Font
    attr_reader :height, :hard_blank, :old_layout

    # Reads the font stored in +filename+.
    #
    # filename::    path of the .flf file (opened in binary mode).
    # load_german:: when true (the default) the seven glyphs that follow the
    #               ASCII block replace the glyphs for codes 91-93 and
    #               123-126; when false they are read past and discarded.
    #
    # Raises UnknownFontFormat if the header line is missing or does not
    # start with "flf2a".  The file is closed even when loading fails.
    def initialize(filename, load_german = true)
      @load_german = load_german
      @characters = {}
      # Block form guarantees the handle is released if parsing raises.
      File.open(filename, 'rb') do |file|
        load_header file
        load_comments file
        load_ascii_characters file
        load_german_characters file
        load_extended_characters file
      end
    end

    # Returns the glyph rows stored for character code +char+, or nil.
    def [](char)
      @characters[char]
    end

    # True if a glyph entry exists for character code +char+.
    def has_char?(char)
      @characters.has_key? char
    end

    # True if the header's print-direction field is 1.
    def right_to_left?
      @right_to_left
    end

    private

    # Parses the space-separated header line:
    #   flf2a<hardblank> height baseline max_length old_layout comment_lines [direction]
    def load_header(file)
      header = file.gets.to_s.strip.split(/ /)
      raise UnknownFontFormat if header.empty? || 'flf2a' != header[0][0, 5]
      @hard_blank = header.shift[-1, 1]
      @height = header.shift.to_i
      @baseline = header.shift
      @max_length = header.shift
      @old_layout = header.shift.to_i
      @comment_count = header.shift.to_i
      # A missing direction field (nil.to_i == 0) means left-to-right.
      @right_to_left = header.shift.to_i == 1
    end

    # Reads past the comment lines announced by the header.
    def load_comments(file)
      @comment_count.times { file.gets }
    end

    # Loads the glyphs for codes 32..126, in file order.
    def load_ascii_characters(file)
      (32..126).each { |code| @characters[code] = load_char(file) }
    end

    # Loads (or skips) the seven glyphs that follow the ASCII block.
    # Stops quietly if the file ends early.
    def load_german_characters(file)
      [91, 92, 93, 123, 124, 125, 126].each do |code|
        if @load_german
          glyph = load_char(file)
          return unless glyph
          @characters[code] = glyph
        else
          skip_char file
        end
      end
    end

    # Loads the code-tagged glyphs that make up the rest of the file.  Each
    # one is introduced by a line whose first token is the character code.
    def load_extended_characters(file)
      until file.eof?
        token = file.gets.strip.split(/ /).first
        # A blank line yields no token at all; just move on.
        next if token.nil? || token.empty?
        if /^\-0x/i =~ token
          # Negative hexadecimal codes are not loaded; discard the glyph.
          skip_char file
          next
        end
        glyph = load_char(file)
        return unless glyph
        @characters[parse_char_code(token)] = glyph
      end
    end

    # Converts a code token to an Integer: "0x..." is hexadecimal, a leading
    # "0" means octal, anything else made of digits is decimal.  A token that
    # is not a number is returned unchanged.
    def parse_char_code(token)
      case token
      when /\A-?0x[0-9a-f]+\z/i then token.hex
      when /\A-?0[0-7]+\z/      then token.oct
      when /\A-?\d+\z/          then token.to_i
      else token
      end
    end

    # Reads one glyph (+height+ rows).  Returns false if the file ends before
    # the glyph is complete.
    def load_char(file)
      rows = []
      @height.times do
        return false if file.eof?
        line = file.gets.rstrip
        unless line.empty?
          # The last character of a row is its end mark.  Only the trailing
          # run of end marks is removed, so the same character may still be
          # used inside the glyph.
          endmark = line[-1, 1]
          line.chop! while line[-1, 1] == endmark
        end
        line << "\x00"
        rows << line
      end
      rows
    end

    # Reads past one glyph without storing it.
    def skip_char(file)
      @height.times do
        return if file.eof?
        file.gets
      end
    end
  end
end # module Figlet
end # module Text

data/lib/text/figlet/smusher.rb ADDED

@@ -0,0 +1,64 @@
module Text
module Figlet
  # Applies the FIGfont "controlled smushing" rules to the junction between
  # the glyph just appended to each output row and the text before it.  The
  # junction is the NUL ("\x00") that is not at the end of the row.
  #
  # The font object must respond to +height+, +hard_blank+ and +old_layout+.
  class Smusher
    # Lookup tables, keyed the same way the +symbols+ method has always
    # exposed them:
    #   24 - characters taking part in the underscore and hierarchy rules,
    #        ordered by hierarchy class: |  /\  []  {}  ()  <>
    #   8  - opposite bracket pairs
    #   16 - "big X" pairs and their replacements
    SYMBOLS = {
      24 => '|/\\[]{}()<>'.freeze,
      8 => {'[' => ']', ']' => '[', '{' => '}', '}' => '{', '(' => ')', ')' => '('}.freeze,
      16 => {"/\\" => '|', "\\/" => 'Y', '><' => 'X'}.freeze
    }.freeze

    def initialize(font)
      @font = font
    end

    # Smushes the junction of every row of +result+ in place (one row per
    # font line).  If any row was smushed, one blank next to the junction is
    # also removed from rows that could not be smushed.  The junction NUL is
    # removed from every row.
    def [](result)
      smushed = false
      @font.height.times do |j|
        result[j] = result[j].sub(pattern) do
          smushed, replacement = callback(smushed, $1, $2)
          replacement
        end
      end
      @font.height.times do |j|
        row = result[j]
        row = row.sub(/\s\x00(?!$)|\x00\s/, '') if smushed
        result[j] = row.sub(/\x00(?!$)/, '')
      end
    end

    # Regexp matching two visible characters separated by the junction NUL.
    # Hard blanks, NULs and whitespace never take part in smushing.
    def pattern
      @pattern ||= begin
        # Escaped so that a hard blank such as "]" or "\" cannot corrupt the
        # character class.
        visible = "[^#{Regexp.escape(@font.hard_blank)}\\x00\\s]"
        /(#{visible})\x00(#{visible})/
      end
    end

    # The smushing lookup tables (see SYMBOLS).
    def symbols
      SYMBOLS
    end

    # True if bit +n+ of the font's old_layout value is set.
    def old_layout?(n)
      @font.old_layout & n > 0
    end

    # Decides how the characters +a+ (left) and +b+ (right) combine.
    #
    # Returns [true, replacement] when an enabled rule applies, otherwise
    # [s, "a\x00b"], i.e. the incoming flag +s+ and the untouched junction.
    # Rules are tried in this order: equal character (1), underscore (2),
    # hierarchy (4), opposite pair (8), big X (16), hard blank (32).
    def callback(s, a, b)
      combined = a + b
      if old_layout?(1) && a == b
        [true, a]
      elsif old_layout?(2) && (kept = underscore_smush(a, b))
        [true, kept]
      elsif old_layout?(4) && (kept = hierarchy_smush(a, b))
        [true, kept]
      elsif old_layout?(8) && symbols[8].has_key?(b) && symbols[8][b] == a
        [true, '|']
      elsif old_layout?(16) && symbols[16].has_key?(combined)
        [true, symbols[16][combined]]
      elsif old_layout?(32) && a == b && @font.hard_blank == a
        [true, @font.hard_blank]
      else
        [s, "#{a}\x00#{b}"]
      end
    end

    private

    # Underscore rule: an underscore is replaced by the hierarchy character
    # it meets, whichever side the underscore is on.  Returns nil if the rule
    # does not apply.
    def underscore_smush(a, b)
      return b if '_' == a && symbols[24].include?(b)
      return a if '_' == b && symbols[24].include?(a)
      nil
    end

    # Hierarchy rule: when both characters belong to DIFFERENT classes, the
    # one from the later class wins.  Returns nil if either character is not
    # a hierarchy character or both are in the same class.
    def hierarchy_smush(a, b)
      left = symbols[24].index(a)
      right = symbols[24].index(b)
      return nil unless left && right
      # "|" is a class on its own; every following pair of characters in the
      # table forms one class.
      left_class = (left + 1) / 2
      right_class = (right + 1) / 2
      return nil if left_class == right_class
      right_class > left_class ? b : a
    end
  end
end # module Figlet
end # module Text

data/lib/text/figlet/typesetter.rb ADDED

@@ -0,0 +1,68 @@
module Text
module Figlet
  # Renders a string as FIGlet text using a font object.
  #
  # The font must respond to +height+, +hard_blank+, +old_layout+,
  # +right_to_left?+, +has_char?(code)+ and +[](code)+, where +code+ is an
  # Integer character code and the glyph is an Array of row Strings, each
  # ending in a NUL ("\x00") junction marker.
  class Typesetter
    # font::    the font to render with.
    # options:: optional Hash; <tt>:smush => false</tt> disables smushing
    #           (kerning is still applied).  Smushing is on by default.
    def initialize(font, options = nil)
      @font = font
      @options = options || {}
      @smush = @options.fetch(:smush, true)
    end

    # Returns +str+ rendered as a multi-line String (rows joined by "\n").
    #
    # Characters missing from the font are replaced by the glyph for code 0
    # if the font has one, and dropped otherwise.  Junction markers are
    # removed and hard blanks become spaces in the returned text.
    def [](str)
      result = []
      char_codes(str).each_with_index do |char, i|
        unless @font.has_char?(char)
          next unless @font.has_char?(0)
          char = 0
        end
        glyph = @font[char]
        next unless glyph
        append glyph, result
        # A negative old_layout means full width: no kerning, no smushing.
        next unless @font.old_layout > -1 && i > 0
        kern result
        if @smush
          smush[result]
        else
          # The smusher normally consumes the junction marker.  Without it
          # the marker has to be dropped here, or the next glyph would be
          # kerned against this stale junction instead of its own.
          drop_junction result
        end
      end
      result.join("\n").gsub(/\0/, '').gsub(@font.hard_blank, ' ')
    end

    private

    # Integer codes for the characters of +str+: code points when the string
    # is valid in its encoding, raw bytes otherwise.  Font glyphs are keyed
    # by Integer, so the one-character Strings returned by String#[] on
    # Ruby 1.9+ would never match.
    def char_codes(str)
      if str.respond_to?(:valid_encoding?) && str.valid_encoding?
        str.codepoints.to_a
      else
        str.unpack('C*')
      end
    end

    # Adds each row of +glyph+ to the matching row of +result+, on the side
    # given by the font's print direction.
    def append(glyph, result)
      @font.height.times do |j|
        line = glyph[j]
        result[j] = if result[j].nil?
          line
        elsif @font.right_to_left?
          line + result[j]
        else
          result[j] + line
        end
      end
    end

    # Moves the newest glyph towards the existing text by the smallest gap
    # found on any row, so that the two touch on at least one row.
    def kern(result)
      gaps = []
      @font.height.times do |j|
        match = /\S(\s*\x00\s*)\S/.match(result[j])
        gaps << match[1].length if match
      end
      # Each gap length includes the junction marker itself, hence the -1.
      diff = (gaps.min || -1) - 1
      return unless diff > 0
      @font.height.times do |j|
        match = /\x00(\s{0,#{diff}})/.match(result[j])
        next unless match
        after = match[1].length
        before = diff - after
        result[j] = result[j].sub(/\s{0,#{before}}\x00\s{#{after}}/, "\0")
      end
    end

    # Removes the junction marker (the NUL that is not at the end of the row)
    # from every row.
    def drop_junction(result)
      @font.height.times do |j|
        result[j] = result[j].sub(/\x00(?!$)/, '')
      end
    end

    # Lazily built Smusher for the font.
    def smush
      @smusher ||= Smusher.new(@font)
    end
  end
end # module Figlet
end # module Text

data/lib/text/levenshtein.rb ADDED

@@ -0,0 +1,65 @@
#
# Levenshtein distance algorithm implementation for Ruby, with Unicode support.
#
# The Levenshtein distance is a measure of how similar two strings s and t are,
# calculated as the number of deletions/insertions/substitutions needed to
# transform s into t. The greater the distance, the more the strings differ.
#
# The Levenshtein distance is also sometimes referred to as the
# easier-to-pronounce-and-spell 'edit distance'.
#
# Author: Paul Battley (pbattley@gmail.com)
#
module Text # :nodoc:
module Levenshtein
  # Calculate the Levenshtein distance between two strings +str1+ and +str2+.
  #
  # Each string is compared character by character (by code point) when its
  # bytes are valid in its own encoding, and byte by byte otherwise.  The
  # global $KCODE, which no longer has any effect on Ruby 1.9 and later, is
  # not consulted.
  #
  # When using Unicode text, be aware that this algorithm does not perform normalisation.
  # If there is a possibility of different normalised forms being used, normalisation
  # should be performed beforehand.
  #
  # Returns the distance as an Integer (0 for equal strings).
  #
  def distance(str1, str2)
    s = code_units(str1)
    t = code_units(str2)
    n = s.length
    m = t.length
    return m if n.zero?
    return n if m.zero?

    # d holds the previous row of the edit-distance matrix; it is overwritten
    # cell by cell while the current row is computed, so only one row is kept.
    d = (0..m).to_a
    x = nil

    n.times do |i|
      e = i + 1 # value of the cell to the left in the current row
      m.times do |j|
        cost = (s[i] == t[j]) ? 0 : 1
        x = [
          d[j + 1] + 1, # cell above
          e + 1,        # cell to the left
          d[j] + cost   # diagonal: substitution (free when equal)
        ].min
        d[j] = e
        e = x
      end
      d[m] = x
    end

    x
  end

private

  # Array of Integers representing +str+: code points when the string is
  # valid in its encoding, raw bytes otherwise.
  def code_units(str)
    str.valid_encoding? ? str.codepoints.to_a : str.unpack('C*')
  end

  extend self
end
end

data/lib/text/metaphone.rb ADDED

@@ -0,0 +1,97 @@
#
# An implementation of the Metaphone phonetic coding system in Ruby.
#
# Metaphone encodes names into a phonetic form such that similar-sounding names
# have the same or similar Metaphone encodings.
#
# The original system was described by Lawrence Philips in Computer Language
# Vol. 7 No. 12, December 1990, pp 39-43.
#
# As there are multiple implementations of Metaphone, each with their own
# quirks, I have based this on my interpretation of the algorithm specification.
# Even LP's original BASIC implementation appears to contain bugs (specifically
# with the handling of CC and MB), when compared to his explanation of the
# algorithm.
#
# I have also compared this implementation with that found in PHP's standard
# library, which appears to mimic the behaviour of LP's original BASIC
# implementation. For compatibility, these rules can also be used by passing
# :buggy=>true to the methods.
#
# Author: Paul Battley (pbattley@gmail.com)
#
module Text # :nodoc:
module Metaphone
  module Rules # :nodoc:all
    # Metaphone rules.  These are simply applied in order, each one to the
    # whole word.  Replacements are written in upper case (or as the digit 0)
    # and the patterns only match lower case, so text that has already been
    # encoded is never rewritten by a later rule.
    #
    STANDARD = [
      # Regexp, replacement
      [ /([bcdfhjklmnpqrstvwxyz])\1+/,
                         '\1' ],  # Remove doubled consonants except g.
                                  # [PHP] remove c from regexp.
      [ /^ae/,            'E' ],
      [ /^[gkp]n/,        'N' ],
      [ /^wr/,            'R' ],
      [ /^x/,             'S' ],
      [ /^wh/,            'W' ],
      [ /mb$/,            'M' ],  # [PHP] remove $ from regexp.
      [ /(?!^)sch/,      'SK' ],
      [ /th/,             '0' ],
      [ /t?ch|sh/,        'X' ],
      [ /c(?=ia)/,        'X' ],
      [ /[st](?=i[ao])/,  'X' ],
      [ /s?c(?=[iey])/,   'S' ],
      [ /[cq]/,           'K' ],
      [ /dg(?=[iey])/,    'J' ],
      [ /d/,              'T' ],
      [ /g(?=h[^aeiou])/, ''  ],
      [ /gn(ed)?/,        'N' ],
      [ /([^g]|^)g(?=[iey])/,
                        '\1J' ],
      [ /g+/,             'K' ],
      [ /ph/,             'F' ],
      [ /([aeiou])h(?=\b|[^aeiou])/,
                         '\1' ],
      [ /[wy](?![aeiou])/, '' ],
      [ /z/,              'S' ],
      [ /v/,              'F' ],
      [ /(?!^)[aeiou]+/,  ''  ],
    ].each { |rule| rule.freeze }.freeze

    # The rules for the 'buggy' alternate implementation used by PHP etc.
    # Identical to STANDARD except for the two rules marked [PHP] above:
    # doubled 'c' is kept, and 'mb' becomes 'M' anywhere in the word.
    #
    BUGGY = STANDARD.dup.tap { |rules|
      rules[0] = [ /([bdfhjklmnpqrstvwxyz])\1+/, '\1' ].freeze
      rules[6] = [ /mb/, 'M' ].freeze
    }.freeze
  end

  # Returns the Metaphone representation of a string. If the string contains
  # multiple words, each word in turn is converted into its Metaphone
  # representation, and the results are joined with single spaces. Note that
  # only the letters A-Z are supported, so any language-specific processing
  # should be done beforehand.
  #
  # If the :buggy option is set, alternate 'buggy' rules are used.
  #
  def metaphone(str, options={})
    words = str.strip.split(/\s+/)
    words.map { |word| metaphone_word(word, options) }.join(' ')
  end

private

  # Encodes a single word.  Everything except the letters a-z is discarded
  # after lower-casing, then the selected rule set is applied in order.
  def metaphone_word(w, options={})
    # Normalise case and remove non-ASCII
    encoded = w.downcase.gsub(/[^a-z]/, '')

    # Apply the Metaphone rules
    rules = options[:buggy] ? Rules::BUGGY : Rules::STANDARD
    rules.each { |pattern, replacement| encoded.gsub!(pattern, replacement) }
    encoded.upcase
  end

  extend self
end
end

data/lib/text/porter_stemming.rb ADDED

@@ -0,0 +1,171 @@
#
# This is the Porter Stemming algorithm, ported to Ruby from the
# version coded up in Perl.  It's easy to follow against the rules
# in the original paper in:
#
#   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
#   no. 3, pp 130-137,
#
# Taken from http://www.tartarus.org/~martin/PorterStemmer (Public Domain)
#
module Text # :nodoc:
module PorterStemming
  # Step 2 suffix => replacement.
  STEP_2_LIST = {
    'ational' => 'ate', 'tional' => 'tion', 'enci' => 'ence', 'anci' => 'ance',
    'izer' => 'ize', 'bli' => 'ble',
    'alli' => 'al', 'entli' => 'ent', 'eli' => 'e', 'ousli' => 'ous',
    'ization' => 'ize', 'ation' => 'ate',
    'ator' => 'ate', 'alism' => 'al', 'iveness' => 'ive', 'fulness' => 'ful',
    'ousness' => 'ous', 'aliti' => 'al',
    'iviti' => 'ive', 'biliti' => 'ble', 'logi' => 'log'
  }.freeze

  # Step 3 suffix => replacement.
  STEP_3_LIST = {
    'icate' => 'ic', 'ative' => '', 'alize' => 'al', 'iciti' => 'ic',
    'ical' => 'ic', 'ful' => '', 'ness' => ''
  }.freeze

  # Suffixes handled by step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x

  # Suffixes handled by step 3 (the keys of STEP_3_LIST).
  STEP_3_REGEXP = /(icate|ative|alize|iciti|ical|ful|ness)$/

  # Suffixes removed outright by step 4.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x

  C = "[^aeiou]".freeze             # consonant
  V = "[aeiouy]".freeze             # vowel
  CC = "#{C}(?>[^aeiouy]*)".freeze  # consonant sequence
  VV = "#{V}(?>[aeiou]*)".freeze    # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/             # vowel in stem
  SHORT_STEM = /^#{CC}#{V}[^aeiouwxy]$/       # cc-v-c, last c not w, x or y

  # Returns the Porter stem of +word+ as a new String; the argument is not
  # modified.  Words shorter than three characters are returned as they are.
  # The patterns only know lower-case letters, so the word should be
  # lower-cased beforehand.
  def self.stem(word)
    # make a copy of the given object and convert it to a string.
    word = word.dup.to_str

    return word if word.length < 3

    # now map initial y to Y so that the patterns never treat it as vowel
    word[0] = 'Y' if word[0, 1] == 'y'

    word = step_1a(word)
    word = step_1b(word)
    word = step_1c(word)
    word = replace_suffix(word, SUFFIX_1_REGEXP, STEP_2_LIST) # Step 2
    word = replace_suffix(word, STEP_3_REGEXP, STEP_3_LIST)   # Step 3
    word = step_4(word)
    word = step_5(word)

    # and turn initial Y back to y
    word[0] = 'y' if word[0, 1] == 'Y'

    word
  end

  # Step 1a: plurals ("sses" -> "ss", "ies" -> "i", "s" -> "" unless the
  # word ends in "ss").
  def self.step_1a(word)
    match = /(ss|i)es$/.match(word) || /([^s])s$/.match(word)
    match ? match.pre_match + match[1] : word
  end

  # Step 1b: "eed" -> "ee" when m>0; otherwise strip "ed"/"ing" if the
  # remaining stem contains a vowel, then tidy up the stem's ending.
  def self.step_1b(word)
    if (match = /eed$/.match(word))
      return match.pre_match =~ MGR0 ? word.chop : word
    end

    match = /(ed|ing)$/.match(word)
    return word unless match && match.pre_match =~ VOWEL_IN_STEM

    stem = match.pre_match
    case stem
    when /(at|bl|iz)$/       then stem + 'e'
    when /([^aeiouylsz])\1$/ then stem.chop  # undouble the final consonant
    when SHORT_STEM          then stem + 'e'
    else stem
    end
  end

  # Step 1c: final "y" -> "i" when the stem contains a vowel.
  def self.step_1c(word)
    match = /y$/.match(word)
    return word unless match && match.pre_match =~ VOWEL_IN_STEM
    match.pre_match + 'i'
  end

  # Steps 2 and 3: if +word+ ends in a suffix matched by +pattern+ and the
  # stem before it has m>0, swap the suffix for its entry in +table+.
  def self.replace_suffix(word, pattern, table)
    match = pattern.match(word)
    return word unless match && match.pre_match =~ MGR0
    match.pre_match + table[match[1]]
  end

  # Step 4: drop a SUFFIX_2_REGEXP suffix, or the "ion" of "sion"/"tion",
  # when what is left has m>1.  The "ion" case is only considered if no
  # SUFFIX_2_REGEXP suffix matched at all.
  def self.step_4(word)
    if (match = SUFFIX_2_REGEXP.match(word))
      stem = match.pre_match
    elsif (match = /(s|t)(ion)$/.match(word))
      stem = match.pre_match + match[1]
    else
      return word
    end
    stem =~ MGR1 ? stem : word
  end

  # Step 5: drop a final "e" (m>1, or m=1 and the stem is not a short
  # cc-v-c stem), then reduce a final "ll" to "l" when m>1.
  def self.step_5(word)
    if (match = /e$/.match(word))
      stem = match.pre_match
      if (stem =~ MGR1) || (stem =~ MEQ1 && stem !~ SHORT_STEM)
        word = stem
      end
    end
    (word =~ /ll$/ && word =~ MGR1) ? word.chop : word
  end

  private_class_method :step_1a, :step_1b, :step_1c, :replace_suffix,
                       :step_4, :step_5
end
end