RubyGems - treat - Versions diffs - 0.1.1 → 0.1.2 - Mend

treat 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81) hide show

data/INSTALL +1 -0
data/README +3 -0
data/TODO +14 -26
data/bin/INFO +1 -1
data/lib/treat/buildable.rb +10 -11
data/lib/treat/categories.rb +8 -6
data/lib/treat/category.rb +7 -2
data/lib/treat/delegatable.rb +64 -56
data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
data/lib/treat/detectors/language/language_detector.rb +2 -1
data/lib/treat/detectors/language/what_language.rb +2 -2
data/lib/treat/detectors.rb +3 -0
data/lib/treat/entities/entity.rb +1 -1
data/lib/treat/entities.rb +9 -10
data/lib/treat/exception.rb +3 -1
data/lib/treat/extractors/named_entity/abner.rb +1 -1
data/lib/treat/extractors/named_entity/stanford.rb +2 -2
data/lib/treat/extractors/time/chronic.rb +2 -2
data/lib/treat/extractors/time/nickel.rb +2 -2
data/lib/treat/extractors/topic_words/lda.rb +2 -2
data/lib/treat/extractors.rb +12 -9
data/lib/treat/feature.rb +6 -1
data/lib/treat/formatters/cleaners/html.rb +1 -1
data/lib/treat/formatters.rb +8 -8
data/lib/treat/group.rb +11 -10
data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
data/lib/treat/inflectors.rb +8 -21
data/lib/treat/kernel.rb +120 -0
data/lib/treat/languages/arabic.rb +14 -0
data/lib/treat/languages/categories.rb +5 -0
data/lib/treat/languages/chinese.rb +12 -0
data/lib/treat/languages/english/categories.rb +23 -0
data/lib/treat/{resources → languages/english}/tags.rb +127 -184
data/lib/treat/languages/english.rb +33 -0
data/lib/treat/languages/french.rb +17 -0
data/lib/treat/languages/german.rb +17 -0
data/lib/treat/languages/italian.rb +14 -0
data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
data/lib/treat/languages/xinhua.rb +12 -0
data/lib/treat/languages.rb +91 -0
data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
data/lib/treat/lexicalizers/tag/brill.rb +2 -1
data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
data/lib/treat/lexicalizers.rb +1 -1
data/lib/treat/object.rb +6 -0
data/lib/treat/processors/parsers/enju.rb +3 -2
data/lib/treat/processors/parsers/stanford.rb +15 -12
data/lib/treat/processors/segmenters/punkt.rb +1 -1
data/lib/treat/processors/segmenters/stanford.rb +7 -5
data/lib/treat/processors/segmenters/tactful.rb +1 -1
data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
data/lib/treat/processors/tokenizers/stanford.rb +7 -5
data/lib/treat/visitable.rb +2 -1
data/lib/treat.rb +105 -54
data/test/tc_entity.rb +5 -0
data/test/tc_resources.rb +5 -5
data/test/tc_treat.rb +1 -2
data/test/tests.rb +2 -1
metadata +63 -64
data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
data/lib/treat/resources/categories.rb +0 -18
data/lib/treat/resources/delegates.rb +0 -96
data/lib/treat/resources/dependencies.rb +0 -0
data/lib/treat/resources/edges.rb +0 -8
data/lib/treat/resources/formats.rb +0 -23
data/lib/treat/resources/languages.rb +0 -86
data/lib/treat/resources.rb +0 -10
data/lib/treat/utilities.rb +0 -127

data/lib/treat/group.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 module Treat
   module Group
+    # Modify the extended class.
     def self.extended(group)
       group.module_eval do
         class << self
@@ -75,22 +76,22 @@ module Treat
       end
       @@list[mod]
     end
-    # Set inherit to false by default.
-    def const_get(const)
-      super(const, false)
-    end
+    # Get constants in this module, excluding those
+    # defined by parent modules.
+    def const_get(const); super(const, false); end
     # Autoload the algorithms.
     def const_missing(const)
       bits = self.ancestors[0].to_s.split('::')
       bits.collect! { |bit| ucc(bit) }
-      file = bits.join('/') + "/#{ucc(const)}"          # Fix
-      #if not File.readable?(file + '.rb')
-      #  raise Treat::Exception,
-      #  "File '#{file}.rb' corresponding to requested delegate "+
-      #  "#{self}::#{const} does not exist."
+      file = bits.join('/') + "/#{ucc(const)}"
+      if not File.readable?("#{Treat.lib}/#{file}.rb")
+        raise Treat::Exception,
+        "File '#{file}.rb' corresponding to requested delegate "+
+        "#{self}::#{const} does not exist."
+      else
         require file
         const_get(const)
-      #end
+      end
     end
   end
 end

data/lib/treat/inflectors/cardinal_words/linguistics.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module Treat
   module Inflectors
     module CardinalWords
       class Linguistics
-        silently { require 'linguistics' }
+        silence_warnings { require 'linguistics' }
         #
         # Options:
         #
@@ -32,12 +32,12 @@ module Treat
           begin
             l = entity.language.to_s.upcase
             delegate = nil
-            silently { delegate = ::Linguistics.const_get(l) }
+            silence_warnings { delegate = ::Linguistics.const_get(l) }
           rescue RuntimeError
             raise "Ruby Linguistics does not have a module " +
             " installed for the #{entity.language} language."
           end
-          silently { delegate.numwords(entity.to_s, options) }
+          silence_warnings { delegate.numwords(entity.to_s, options) }
         end
       end
     end

data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb RENAMED Viewed

@@ -1,23 +1,23 @@
 module Treat
   module Inflectors
-    module Conjugators
+    module Conjugations
       class Linguistics
-        silently { require 'linguistics' }
+        silence_warnings { require 'linguistics' }
         def self.conjugate(entity, parameters)
           begin
             l = entity.language.to_s.upcase
             delegate = nil
-            silently { delegate = ::Linguistics.const_get(l) }
+            silence_warnings { delegate = ::Linguistics.const_get(l) }
           rescue RuntimeError
             raise "Ruby Linguistics does not have a module " +
             " installed for the #{entity.language} language."
           end
           if parameters[:mode] == :infinitive
-            silently { delegate.infinitive(entity.to_s) }
+            silence_warnings { delegate.infinitive(entity.to_s) }
           elsif parameters[:mode] == :participle && parameters[:tense] == :present
-            silently { delegate.present_participle(entity.to_s) }
+            silence_warnings { delegate.present_participle(entity.to_s) }
           elsif parameters[:count] == :plural && parameters.size == 1
-            silently { delegate.plural_verb(entity.to_s) }
+            silence_warnings { delegate.plural_verb(entity.to_s) }
           else
             raise Treat::Exception,
             'This combination of modes, tenses, persons ' +

data/lib/treat/inflectors/{declensors → declensions}/en.rb RENAMED Viewed

@@ -1,8 +1,8 @@
-silently { require 'english' }
+silence_warnings { require 'english' }
 module Treat
    module Inflectors
-      module Declensors
+      module Declensions
          module En
             def self.declense(entity, options)
                string = entity.to_s

data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb RENAMED Viewed

@@ -1,7 +1,7 @@
 module Treat
   module Inflectors
-    module Declensors
-      silently { require 'linguistics' }
+    module Declensions
+      silence_warnings { require 'linguistics' }
       # Obtain word declensions in English using the
       # ruby 'linguistics' gem.
       class Linguistics
@@ -9,7 +9,7 @@ module Treat
           begin
             l = entity.language.to_s.upcase
             delegate = nil
-            silently { delegate = ::Linguistics.const_get(l) }
+            silence_warnings { delegate = ::Linguistics.const_get(l) }
           rescue RuntimeError
             raise "Ruby Linguistics does not have a module " +
             " installed for the #{entity.language} language."
@@ -18,9 +18,9 @@ module Treat
           if options[:count] == :plural
             if entity.has?(:category) &&
               [:noun, :adjective, :verb].include?(entity.category)
-              silently { delegate.send(:"plural_#{entity.category}", string) }
+              silence_warnings { delegate.send(:"plural_#{entity.category}", string) }
             else
-              silently { delegate.plural(string) }
+              silence_warnings { delegate.plural(string) }
             end
           end
         end

data/lib/treat/inflectors/ordinal_words/linguistics.rb CHANGED Viewed

@@ -2,18 +2,18 @@ module Treat
   module Inflectors
     module OrdinalWords
       class Linguistics
-        silently { require 'linguistics' }
+        silence_warnings { require 'linguistics' }
         def self.ordinal_words(number, options = {})
           begin
             l = number.language.to_s.upcase
             delegate = nil
-            silently { delegate = ::Linguistics.const_get(l) }
+            silence_warnings { delegate = ::Linguistics.const_get(l) }
           rescue RuntimeError
-            lang = Treat::Resources::Language.describe(number.language)
+            lang = Treat::Languages.describe(number.language)
             raise "Ruby Linguistics does not have a module " +
             " installed for the #{lang} language."
           end
-          silently { delegate.ordinate(number.to_s) }
+          silence_warnings { delegate.ordinate(number.to_s) }
         end
       end
     end

data/lib/treat/inflectors/{stemmers → stem}/porter.rb RENAMED Viewed

@@ -1,6 +1,6 @@
 module Treat
   module Inflectors
-    module Stemmers
+    module Stem
       # Stem a word using a native Ruby implementation of the
       # Porter stemming algorithm, ported to Ruby from the
       # version coded up in Perl.

data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb RENAMED Viewed

@@ -1,6 +1,6 @@
 module Treat
   module Inflectors
-    module Stemmers
+    module Stem
       # Stems words using the 'ruby-stemmer' gem, which
       # wraps a C version of the Porter stemming algorithm.
       #
@@ -9,13 +9,13 @@ module Treat
       # Program, Vol. 14, no. 3, pp 130-137,
       # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
       class PorterC
-        silently { require 'lingua/stemmer' }
+        silence_warnings { require 'lingua/stemmer' }
         ::LinguaStemmer = ::Lingua
         Object.instance_eval { remove_const :Lingua }
         # Stem the word using the Porter C algorithm.
         # Options: none.
         def self.stem(word, options = {})
-          silently { ::LinguaStemmer.stemmer(word.to_s) }
+          silence_warnings { ::LinguaStemmer.stemmer(word.to_s) }
         end
       end
     end

data/lib/treat/inflectors/{stemmers → stem}/uea.rb RENAMED Viewed

@@ -1,6 +1,6 @@
 module Treat
   module Inflectors
-    module Stemmers
+    module Stem
       # Stems a word using the UEA algorithm, implemented
       # by the 'uea-stemmer' gem.
       #
@@ -15,13 +15,13 @@ module Treat
       #   http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
       class UEA
         # Require the 'uea-stemmer' gem.
-        silently { require 'uea-stemmer' }
+        silence_warnings { require 'uea-stemmer' }
         # Keep only one copy of the stemmer.
         @@stemmer = nil
         # Stems a word using the UEA algorithm, implemented
         # by the 'uea-stemmer' gem.
         def self.stem(entity, options = {})
-          @@stemmer ||= silently { ::UEAStemmer.new }
+          @@stemmer ||= silence_warnings { ::UEAStemmer.new }
           @@stemmer.stem(entity.to_s).strip
         end
       end

data/lib/treat/inflectors.rb CHANGED Viewed

@@ -1,44 +1,31 @@
 module Treat
   # Algorithms to retrieve the inflections of a word.
-  # Stemmers return the stem (not root form) of a word.
-  # Taggers return the part of speech tag of a word.
-  # Inflectors allow to retrieve the different inflections of a
-  # noun (declensions), a verb (conjugations). Lexicons return,
-  # among other things, the gloss or synset of a word.
   module Inflectors
-    # Lemmatizers return the root form of a word.
-    module Lemmatizers
+    # Return the stem (*not root form*) of a word.
+    module Stem
       extend Group
       self.type = :annotator
       self.targets = [:word]
     end
-    # Stemmers return the stem (*not root form*) of a word.
-    module Stemmers
+    # Retrieve the different declensions of a noun (singular, plural).
+    module Declensions
       extend Group
       self.type = :annotator
       self.targets = [:word]
     end
-    # Declensors allow to retrieve the different declensions of a
-    # noun (singular, plural).
-    module Declensors
+    # Retrieve the different conjugations of a word.
+    module Conjugations
       extend Group
       self.type = :annotator
       self.targets = [:word]
     end
-    # Conjugators allow to retrieve the different conjugations of
-    # a word.
-    module Conjugators
-      extend Group
-      self.type = :annotator
-      self.targets = [:word]
-    end
-    # Cardinal retrieve the full text description of a number.
+    # Retrieve the full text description of a cardinal number.
     module CardinalWords
       extend Group
       self.type = :annotator
       self.targets = [:number]
     end
-    # Ordinal retrieve the ordinal form of numbers.
+    # Retrieve the full text description of an ordinal number.
     module OrdinalWords
       extend Group
       self.type = :annotator

data/lib/treat/kernel.rb ADDED Viewed

@@ -0,0 +1,120 @@
+# Extends the core Kernel module to provide
+# easy access to utility functions used across
+# the library.
+module Kernel
+  require 'fileutils'
+  require 'tempfile'
+  # A list of acronyms used in class names within
+  # the program. These do not CamelCase; they
+  # CAMELCase.
+  Acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF', 'GOCR'].join('|')
+  # A cache to optimize camel casing.
+  @@cc_cache = {}
+  # A cache to optimize un camel casing.
+  @@ucc_cache = {}
+  # Returns the platform we are running on.
+  def platform
+    RUBY_PLATFORM.split("-")[1]
+  end
+  # Runs a block of code without warnings.
+  def silence_warnings(&block)
+    warn_level = $VERBOSE
+    $VERBOSE = nil
+    result = block.call
+    $VERBOSE = warn_level
+    result
+  end
+  # Runs a block of code while blocking
+  # stdout. Currently not implemented.
+  def silence_streams(*streams)
+    yield
+  end
+  # Create a temporary file which is deleted
+  # after execution of the block.
+  def create_temp_file(ext, value = nil, &block)
+    tmp = Tempfile.new(['', ".#{ext.to_s}"], Treat.tmp)
+    tmp.puts(value) if value
+    block.call(tmp.path)
+  end
+  # Convert un_camel_case to CamelCase.
+  def camel_case(o_phrase)
+    phrase = o_phrase.to_s.dup
+    return @@cc_cache[o_phrase] if @@cc_cache[o_phrase]
+    phrase.gsub!(/#{Acronyms.downcase}[^a-z]+/) { |a| a.upcase }
+    phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
+    phrase.gsub!('_', '')
+    @@cc_cache[o_phrase] = phrase
+    phrase
+  end
+  alias :cc :camel_case
+  # Convert CamelCase to un_camel_case.
+  def un_camel_case(o_phrase)
+    phrase = o_phrase.to_s.dup
+    return @@ucc_cache[o_phrase] if @@ucc_cache[o_phrase]
+    phrase.gsub!(/#{Acronyms}/) { |a| a.downcase.capitalize }
+    phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase  }
+    phrase = phrase[1..-1] if phrase[0] == '_'
+    @@ucc_cache[o_phrase] = phrase
+    phrase
+  end
+  alias :ucc :un_camel_case
+  # Retrieve the Class from a Module::Class.
+  def class_name(n); n.to_s.split('::')[-1]; end
+  alias :cl :class_name
+  # Search the list to see if there are words similar to #name
+  # in the #list. If yes, return a string saying "Perhaps you
+  # meant ... ?" with the names.
+  def did_you_mean?(list, name)
+    msg = ''
+    sugg = []
+    list.each do |element|
+      l = levenshtein(element,name)
+      if  l > 0 && l < 2
+        sugg << element
+      end
+    end
+    unless sugg.empty?
+      if sugg.size == 1
+        msg += " Perhaps you meant '#{sugg[0]}' ?"
+      else
+        sugg_quote = sugg[0..-2].map {|x| '\'' + x + '\''}
+        msg += " Perhaps you meant #{sugg_quote.join(', ')}," +
+        " or '#{sugg[-1]}' ?"
+      end
+    end
+    msg
+  end
+  alias :dym? :did_you_mean?
+  # Return the name of the method that called the method
+  # that calls this method.
+  def caller_method(n = 3)
+    at = caller(n).first
+    /^(.+?):(\d+)(?::in `(.*)')?/ =~ at
+    :"#{Regexp.last_match[3]}"
+  end
+  alias :cm :caller_method
+  # Return the Levenshtein distance between two strings,
+  # taking into account the costs of insertion, deletion,
+  # and substitution. Stolen from:
+  # http://ruby-snippets.heroku.com/string/levenshtein-distance
+  # Used by did_you_mean?
+  def levenshtein(first, other, ins=1, del=1, sub=1)
+    return nil if first.nil? || other.nil?
+    dm = []
+    dm[0] = (0..first.length).collect { |i| i * ins}
+    fill = [0] * (first.length - 1)
+    for i in 1..other.length
+      dm[i] = [i * del, fill.flatten]
+    end
+    for i in 1..other.length
+      for j in 1..first.length
+        dm[i][j] = [
+          dm[i-1][j-1] + (first[i-1] == other[i-1] ? 0 : sub),
+          dm[i][j-1] + ins,
+          dm[i-1][j] + del
+        ].min
+      end
+    end
+    dm[other.length][first.length]
+  end
+end

data/lib/treat/languages/arabic.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module Treat
+  module Languages
+    class Arabic
+      Extractors = {}
+      Inflectors = {}
+      Lexicalizers = {
+        tag: [:stanford]
+      }
+      Processors = {
+        parsers: [:stanford]
+      }
+    end
+  end
+end

data/lib/treat/languages/categories.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module Treat
+  module Languages
+  end
+end

data/lib/treat/languages/chinese.rb ADDED Viewed

@@ -0,0 +1,12 @@
+module Treat
+  module Languages
+    class Chinese
+      Extractors = {}
+      Inflectors = {}
+      Lexicalizers = {
+        tag: [:stanford]
+      }
+      Processors = {}
+    end
+  end
+end

data/lib/treat/languages/english/categories.rb ADDED Viewed

@@ -0,0 +1,23 @@
+module Treat
+  module Languages
+    class English
+      # A list of all possible word categories.
+      Categories = [
+        :adjective, :adverb, :noun, :verb, :interjection,
+        :clitic, :coverb, :conjunction, :determiner, :particle,
+        :preposition, :pronoun, :number, :symbol, :punctuation,
+        :complementizer
+      ]
+      wttc = {}
+      Treat::Languages::English::AlignedWordTags.each_slice(2) do |desc, tags|
+        category = desc.gsub(',', ' ,').split(' ')[0].downcase.intern
+        wttc[tags[0]] ||= {}; wttc[tags[1]] ||= {} ;wttc[tags[2]] ||= {}
+        wttc[tags[0]][:claws_5] = category
+        wttc[tags[1]][:brown] = category
+        wttc[tags[2]][:penn] = category
+      end
+      # A hash converting word tags to word categories.
+      WordTagToCategory = wttc
+    end
+  end
+end