RubyGems - treat - Versions diffs - 0.1.2 → 0.1.3 - Mend

treat 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (100) hide show

data/LICENSE +7 -8
data/TODO +16 -13
data/examples/keywords.rb +89 -1
data/lib/treat/buildable.rb +1 -8
data/lib/treat/categories.rb +3 -4
data/lib/treat/category.rb +1 -1
data/lib/treat/delegatable.rb +1 -1
data/lib/treat/detectors/encoding/native.rb +5 -0
data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
data/lib/treat/detectors/language/language_detector.rb +4 -0
data/lib/treat/detectors/language/what_language.rb +4 -4
data/lib/treat/detectors.rb +1 -1
data/lib/treat/entities/entity.rb +5 -3
data/lib/treat/entities/tokens.rb +14 -5
data/lib/treat/entities/zones.rb +4 -0
data/lib/treat/entities.rb +7 -5
data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
data/lib/treat/extractors/time/chronic.rb +8 -0
data/lib/treat/extractors/time/native.rb +6 -0
data/lib/treat/extractors/time/nickel.rb +31 -23
data/lib/treat/extractors/topic_words/lda.rb +21 -16
data/lib/treat/extractors/topics/reuters.rb +6 -4
data/lib/treat/extractors.rb +7 -7
data/lib/treat/formatters/readers/abw.rb +32 -0
data/lib/treat/formatters/readers/autoselect.rb +13 -11
data/lib/treat/formatters/readers/doc.rb +13 -0
data/lib/treat/formatters/readers/gocr.rb +2 -0
data/lib/treat/formatters/readers/html.rb +21 -1
data/lib/treat/formatters/readers/ocropus.rb +3 -3
data/lib/treat/formatters/readers/odt.rb +41 -0
data/lib/treat/formatters/readers/pdf.rb +5 -2
data/lib/treat/formatters/readers/txt.rb +2 -0
data/lib/treat/formatters/serializers/xml.rb +3 -2
data/lib/treat/formatters/serializers/yaml.rb +2 -0
data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
data/lib/treat/formatters/unserializers/xml.rb +6 -1
data/lib/treat/formatters/unserializers/yaml.rb +5 -1
data/lib/treat/formatters/visualizers/dot.rb +35 -37
data/lib/treat/formatters/visualizers/html.rb +1 -0
data/lib/treat/formatters/visualizers/inspect.rb +4 -0
data/lib/treat/formatters/visualizers/short_value.rb +18 -3
data/lib/treat/formatters/visualizers/standoff.rb +11 -6
data/lib/treat/formatters/visualizers/tree.rb +5 -1
data/lib/treat/formatters/visualizers/txt.rb +6 -1
data/lib/treat/formatters.rb +1 -1
data/lib/treat/group.rb +4 -3
data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
data/lib/treat/inflectors/stem/porter.rb +6 -2
data/lib/treat/inflectors/stem/porter_c.rb +4 -1
data/lib/treat/inflectors/stem/uea.rb +4 -4
data/lib/treat/languages/english/tags.rb +16 -0
data/lib/treat/languages/english.rb +4 -1
data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
data/lib/treat/lexicalizers/tag/brill.rb +3 -11
data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
data/lib/treat/lexicalizers.rb +0 -2
data/lib/treat/processors/chunkers/txt.rb +4 -4
data/lib/treat/processors/parsers/enju.rb +3 -17
data/lib/treat/processors/parsers/stanford.rb +4 -0
data/lib/treat/processors/segmenters/punkt.rb +1 -0
data/lib/treat/processors/segmenters/stanford.rb +4 -0
data/lib/treat/processors/segmenters/tactful.rb +4 -1
data/lib/treat/processors/tokenizers/punkt.rb +1 -2
data/lib/treat/processors/tokenizers/stanford.rb +4 -0
data/lib/treat/processors/tokenizers/tactful.rb +1 -1
data/lib/treat/processors.rb +4 -4
data/lib/treat/proxies.rb +18 -11
data/lib/treat/registrable.rb +12 -5
data/lib/treat/sugar.rb +8 -3
data/lib/treat/tree.rb +10 -3
data/lib/treat.rb +55 -55
data/test/tc_entity.rb +7 -7
data/test/tc_extractors.rb +6 -4
data/test/tc_formatters.rb +0 -4
data/test/tests.rb +2 -0
data/test/texts.rb +4 -4
metadata +48 -56
data/examples/texts/bugged_out.txt +0 -26
data/examples/texts/half_cocked_basel.txt +0 -16
data/examples/texts/hedge_funds.txt +0 -24
data/examples/texts/hose_and_dry.txt +0 -19
data/examples/texts/hungarys_troubles.txt +0 -46
data/examples/texts/indias_slowdown.txt +0 -15
data/examples/texts/merkozy_rides_again.txt +0 -24
data/examples/texts/prada_is_not_walmart.txt +0 -9
data/examples/texts/republican_nomination.txt +0 -26
data/examples/texts/to_infinity_and_beyond.txt +0 -15
data/lib/treat/entities/text.rb +0 -7
data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
data/lib/treat/formatters/cleaners/html.rb +0 -17

data/lib/treat/formatters/visualizers/dot.rb CHANGED Viewed

@@ -2,62 +2,54 @@ module Treat
   module Formatters
     module Visualizers
       class Dot
-        # Border colors to use for different POS tags.
-        BorderColors = {
-          :verb => "#00AABB",
-          :noun => "#FAD4A7",
-          :adverb => '#103585',
-          :adjective => '#D21D54'
-        }
+        DefaultOptions = {colors: {}, :features => :all}
         # Create the top-most graph structure
         # and delegate the creation of the graph
         # nodes to to_dot.
         def self.visualize(entity, options = {})
+          options = DefaultOptions.merge(options)
           string = "graph {"
-          string << self.to_dot(entity)
+          string << self.to_dot(entity, options)
           string << "\n}"
         end
         # dot -Tpdf test4.dot > test4.pdf
-        def self.to_dot(entity)
+        def self.to_dot(entity, options)
+          # Id
           string = ''
-          if entity.is_leaf?
-            if entity.is_a?(Treat::Entities::Word)
-              label = "label=\"#{entity.value} (#{entity.tag})\","
-              label << "color=\"#{BorderColors[entity.cat]}\","
-            else
-              label = "label=\"#{entity.value.inspect[1..-2]}\","
-            end
+          label = ''
+          string = "\n#{entity.id} ["
+          # Value
+          if entity.is_a?(Treat::Entities::Token)
+            label = entity.to_s
           else
-            if entity.class < Entities::Constituent
-              label = "label=\"#{entity.tag}\","
-              # label << "color=\"#{BorderColors[entity.tag]}\","
-            else
-              label = "label=\"#{cc(cl(entity.class))}\","
+            label = entity.type.to_s.capitalize + " "
+            if entity.is_leaf?
+              label = entity.short_value.gsub(' [...]', " [...] \\n")
             end
           end
-          string << "\n#{entity.id} ["
+          # Features
           if entity.has_features?
-            string << label
-            entity.features.each_pair do |feature, value|
-              if value.is_a?(Treat::Entities::Entity)
-                string << "#{feature}=\"#{value.id}\","
-              else
-                string << "#{feature}=\"#{value}\","
+            unless options[:features] == :none
+              label << "\\n"
+              entity.features.each do |feature, value|
+                if options[:features] == :all ||
+                  options[:features].include?(feature)
+                  if value.is_a?(Treat::Entities::Entity)
+                    label << "\\n#{feature}=\\\"*#{value.id}\\\","
+                  else
+                    label << "\\n#{feature}=\\\"#{value}\\\","
+                  end
+                end
               end
             end
-            string = string[0..-2]
-            string << "]"
-          else
-            string << "#{label[0..-2]}]"
           end
+          label = label[0..-2] if label[-1] == ','
+          string << "label=\"#{label}\"]"
+          # Parent-child relationships.
           if entity.has_parent?
             string << "\n#{entity.parent.id} -- #{entity.id};"
           end
-          if entity.has_children?
-            entity.each do |child|
-              string << self.to_dot(child)
-            end
-          end
+          # Edges.
           if entity.has_edges?
             entity.edges.each_pair do |target, type|
               string << "\n#{entity.id} -- #{target}"
@@ -65,6 +57,12 @@ module Treat
               string << "arrowhead=\"odiamond\"]"
             end
           end
+          # Recurse.
+          if entity.has_children?
+            entity.each do |child|
+              string << self.to_dot(child, options)
+            end
+          end
           string
         end
       end

data/lib/treat/formatters/visualizers/html.rb CHANGED Viewed

@@ -5,6 +5,7 @@ module Treat
       class HTML
         # Not implemented yet.
         def self.visualize(entity, options = {})
+          raise 'Not implemented yet.'
         end
       end
     end

data/lib/treat/formatters/visualizers/inspect.rb CHANGED Viewed

@@ -1,7 +1,11 @@
 module Treat
   module Formatters
     module Visualizers
+      # Handles the call to inspect.
       class Inspect
+        # Return a terminal-friendly visualization of an entity.
+        #
+        # Options: none.
         def self.visualize(entity, options = {})
           s = "#{entity.class.to_s.split('::')[-1]} (#{entity.id.to_s})"
           unless caller_method == :inspect

data/lib/treat/formatters/visualizers/short_value.rb CHANGED Viewed

@@ -2,11 +2,26 @@ module Treat
   module Formatters
     module Visualizers
       class ShortValue
+        # Default options for the visualizer.
+        DefaultOptions = { max_words: 6, max_length: 30 }
+        # Returns the text value of an entity, shortened
+        # with [...] if the value is longer than :max_words
+        # or longer than :max_length.
+        #
+        # Options:
+        # - (Integer) :max_words => the maximum number
+        # of words in an entity before it is shortened.
+        # - (Integer) :max_length => the maximum number
+        # of characters in an entity before it is shortened.
         def self.visualize(entity, options = {})
-          options[:max_length] ||= 6
+          options = DefaultOptions.merge(options)
           words = entity.to_s.split(' ')
-          return entity.to_s if words.size < options[:max_length]
-          words[0..2].join(' ') + ' [...] ' + words[-3..-1].join(' ')
+          if words.size < options[:max_words] ||
+            entity.to_s.length < options[:max_length]
+            entity.to_s
+          else
+            words[0..2].join(' ') + ' [...] ' + words[-3..-1].join(' ')
+          end
         end
       end
     end

data/lib/treat/formatters/visualizers/standoff.rb CHANGED Viewed

@@ -5,24 +5,29 @@ module Treat
       # an entity in standoff format; for example:
       # (S (NP John) (VP has (VP come))).
       class Standoff
-        Recurse = Proc.new do |entity, options|
+        # Default options for the visualizer.
+        DefaultOptions = { indent: 0 }
+        # A lambda to recursively visualize the children
+        # of an entity.
+        Recurse = lambda do |entity, options|
           v = ''
           entity.each { |child| v += visualize(child, options) }
           v
         end
         # Visualize the entity using standoff notation.
-        # This can only be called on sentences, as it
-        # is not a suitable format to represent larger
-        # entity.
+        # This can only be called on sentences and smaller
+        # entities, as it is not a suitable format to
+        # represent larger entities.
         def self.visualize(entity, options = {})
-          options = {:indent => 0} if options.empty?
+          options = DefaultOptions.merge(options)
           value = '';  spaces = ''
           options[:indent].times { spaces << '   '}
           options[:indent] += 1
           if entity.is_a?(Treat::Entities::Token)
             value += "#{spaces}(#{entity.tag} #{entity.value})"
           elsif entity.is_a?(Treat::Entities::Constituent)
-            value += ("#{spaces}(#{entity.tag}\n" +
+            tag = entity.has?(:tag) ? entity.tag : ''
+            value += ("#{spaces}(#{tag}\n" +
             "#{Recurse.call(entity, options)})\n")
           elsif entity.is_a?(Treat::Entities::Sentence)
             value += ("#{spaces}(S\n" +

data/lib/treat/formatters/visualizers/tree.rb CHANGED Viewed

@@ -1,11 +1,15 @@
 module Treat
   module Formatters
     module Visualizers
+      # This class generates an ASCII representation
+      # of a tree of entities.
       class Tree
+        # Default options for the visualizer.
+        DefaultOptions = { indent: 0 }
         # Obtain a plain text tree representation
         # of the entity.
         def self.visualize(entity, options = {})
-          options = {:indent => 0} if options.empty?
+          options = DefaultOptions.merge(options)
           string = ''
           if entity.has_children?
             spacer = '--'

data/lib/treat/formatters/visualizers/txt.rb CHANGED Viewed

@@ -3,10 +3,15 @@ module Treat
     module Visualizers
       # Creates a plain text visualization of an entity.
       class Txt
+        # The default options for the visualizer.
+        DefaultOptions = { sep: ' ' }
         # Obtain a plain text visualization of the entity,
         # with no additional information.
+        #
+        # Options:
+        # (String) :sep => the separator to use between words.
         def self.visualize(entity, options = {})
-          options[:sep] = ' '
+          options = DefaultOptions.merge(options)
           return entity.value if !entity.has_children?
           value = ''
           entity.each do |child|

data/lib/treat/formatters.rb CHANGED Viewed

@@ -34,7 +34,7 @@ module Treat
     # Cleaners strip a text from its mark up.
     module Cleaners
       extend Group
-      self.type = :annotator
+      self.type = :transformer
       self.targets = [:document]
       self.default = :html
     end

data/lib/treat/group.rb CHANGED Viewed

@@ -61,14 +61,15 @@ module Treat
       end
       is_target
     end
+    # Cache the list of adaptors to improve performance.
+    @@list = {}
     # Populates once the list of the adaptors in the group
     # by crawling the filesystem.
-    @@list = {}
     def list
       mod = ucc(cl(self))
       if @@list[mod].nil?
         @@list[mod] = []
-        dirs = Dir["#{File.dirname(__FILE__)}/*/#{mod}/*.rb"]     # Fix
+        dirs = Dir.glob("#{Treat.lib}/treat/*/#{mod}/*.rb")
         dirs.each do |file|
           @@list[mod] <<
           :"#{file.split('/')[-1][0..-4]}"
@@ -79,7 +80,7 @@ module Treat
     # Get constants in this module, excluding those
     # defined by parent modules.
     def const_get(const); super(const, false); end
-    # Autoload the algorithms.
+    # Lazy load the classes in the group.
     def const_missing(const)
       bits = self.ancestors[0].to_s.split('::')
       bits.collect! { |bit| ucc(bit) }

data/lib/treat/inflectors/cardinal_words/linguistics.rb CHANGED Viewed

@@ -1,43 +1,40 @@
 module Treat
   module Inflectors
     module CardinalWords
+      # This class is a wrapper for the functions included
+      # in the 'linguistics' gem that describe a
+      # number in words in cardinal form.
+      #
+      # Project website: http://deveiate.org/projects/Linguistics/
       class Linguistics
+        # Require the 'linguistics' gem.
         silence_warnings { require 'linguistics' }
+        # Return the description of a cardinal number in words.
         #
         # Options:
         #
-        # :group => Controls how many numbers at a time are
+        # - :group => Controls how many numbers at a time are
         # grouped together. Valid values are 0 (normal grouping),
         # 1 (single-digit grouping, e.g., “one, two, three, four”),
         # 2 (double-digit grouping, e.g., “twelve, thirty-four”, or
         # 3 (triple-digit grouping, e.g., “one twenty-three, four”).
-        # :comma => Set the character/s used to separate word groups.
+        # - :comma => Set the character/s used to separate word groups.
         # Defaults to ", ".
-        # :and => Set the word and/or characters used where ' and '
+        # - :and => Set the word and/or characters used where ' and '
         # (the default) is normally used. Setting :and to ' ', for
         # example, will cause 2556 to be returned as “two-thousand,
         # five hundred fifty-six” instead of “two-thousand, five
         # hundred and fifty-six”.
-        # :zero => Set the word used to represent the numeral 0 in
+        # - :zero => Set the word used to represent the numeral 0 in
         # the result. 'zero' is the default.
-        # :decimal => Set the translation of any decimal points in
+        # - :decimal => Set the translation of any decimal points in
         # the number; the default is 'point'.
-        # :asArray If set to a true value, the number will be returned
+        # - :asArray If set to a true value, the number will be returned
         # as an array of word groups instead of a String.
         #
         # More specific options when using :type => :ordinal:
-        #
-        #
         def self.cardinal_words(entity, options = {})
-          begin
-            l = entity.language.to_s.upcase
-            delegate = nil
-            silence_warnings { delegate = ::Linguistics.const_get(l) }
-          rescue RuntimeError
-            raise "Ruby Linguistics does not have a module " +
-            " installed for the #{entity.language} language."
-          end
-          silence_warnings { delegate.numwords(entity.to_s, options) }
+          silence_warnings { ::Linguistics::EN.numwords(entity.to_s, options) }
         end
       end
     end

data/lib/treat/inflectors/conjugations/linguistics.rb CHANGED Viewed

@@ -1,15 +1,28 @@
 module Treat
   module Inflectors
     module Conjugations
+      # This class is a wrapper for the functions included
+      # in the 'linguistics' gem that conjugate verbs.
+      #
+      # Project website: http://deveiate.org/projects/Linguistics/
       class Linguistics
         silence_warnings { require 'linguistics' }
-        def self.conjugate(entity, parameters)
+        # Conjugate a verb using ruby linguistics with the specified
+        # mode, tense, count and person.
+        #
+        # Options:
+        #
+        # - (Symbol) :mode => :infinitive, :indicative, :subjunctive, :participle
+        # - (Symbol) :tense => :past, :present, :future
+        # - (Symbol) :count => :singular, :plural
+        # - (Symbol) :person => :first, :second, :third
+        def self.conjugations(entity, parameters)
           begin
             l = entity.language.to_s.upcase
             delegate = nil
             silence_warnings { delegate = ::Linguistics.const_get(l) }
           rescue RuntimeError
-            raise "Ruby Linguistics does not have a module " +
+            raise "Ruby Linguistics does not have a module " +
             " installed for the #{entity.language} language."
           end
           if parameters[:mode] == :infinitive
@@ -27,4 +40,4 @@ module Treat
       end
     end
   end
-end
+end

data/lib/treat/inflectors/declensions/linguistics.rb CHANGED Viewed

@@ -1,24 +1,35 @@
 module Treat
   module Inflectors
     module Declensions
-      silence_warnings { require 'linguistics' }
-      # Obtain word declensions in English using the
-      # ruby 'linguistics' gem.
+      # This class is a wrapper for the functions included
+      # in the 'linguistics' gem that return the
+      # declensions of a word.
+      #
+      # Project website: http://deveiate.org/projects/Linguistics/
       class Linguistics
-        def self.declense(entity, options = {})
+        # Require Ruby Linguistics
+        silence_warnings { require 'linguistics' }
+        # Retrieve a declension of a word using the 'linguistics' gem.
+        #
+        # Options:
+        #
+        # - (Identifier) :count => :singular, :plural
+        def self.declensions(entity, options = {})
           begin
             l = entity.language.to_s.upcase
             delegate = nil
             silence_warnings { delegate = ::Linguistics.const_get(l) }
           rescue RuntimeError
-            raise "Ruby Linguistics does not have a module " +
+            raise "Ruby Linguistics does not have a module " +
             " installed for the #{entity.language} language."
           end
           string = entity.to_s
           if options[:count] == :plural
             if entity.has?(:category) &&
               [:noun, :adjective, :verb].include?(entity.category)
-              silence_warnings { delegate.send(:"plural_#{entity.category}", string) }
+              silence_warnings do
+                delegate.send(:"plural_#{entity.category}", string)
+              end
             else
               silence_warnings { delegate.plural(string) }
             end

data/lib/treat/inflectors/ordinal_words/linguistics.rb CHANGED Viewed

@@ -1,19 +1,18 @@
 module Treat
   module Inflectors
     module OrdinalWords
+      # This class is a wrapper for the functions included
+      # in the 'linguistics' gem that describe a
+      # number in words in ordinal form.
+      #
+      # Project website: http://deveiate.org/projects/Linguistics/
       class Linguistics
+        # Require Ruby Linguistics.
         silence_warnings { require 'linguistics' }
+        # Describe a number in words in ordinal form, using the
+        # 'linguistics' gem.
         def self.ordinal_words(number, options = {})
-          begin
-            l = number.language.to_s.upcase
-            delegate = nil
-            silence_warnings { delegate = ::Linguistics.const_get(l) }
-          rescue RuntimeError
-            lang = Treat::Languages.describe(number.language)
-            raise "Ruby Linguistics does not have a module " +
-            " installed for the #{lang} language."
-          end
-          silence_warnings { delegate.ordinate(number.to_s) }
+          silence_warnings { ::Linguistics::EN.ordinate(number.to_s) }
         end
       end
     end

data/lib/treat/inflectors/stem/porter.rb CHANGED Viewed

@@ -2,16 +2,20 @@ module Treat
   module Inflectors
     module Stem
       # Stem a word using a native Ruby implementation of the
-      # Porter stemming algorithm, ported to Ruby from the
-      # version coded up in Perl.
+      # Porter stemming algorithm, ported to Ruby from a
+      # version coded up in Perl. This is a simplified
+      # implementation; for a true and fast Porter stemmer,
+      # see Treat::Inflectors::Stem::PorterC.
       #
       # Authored by Ray Pereda (raypereda@hotmail.com).
+      # Unknown license.
       #
       # Original paper: Porter, 1980. An algorithm for suffix stripping,
       # Program, Vol. 14, no. 3, pp 130-137,
       # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
       class Porter
         # Returns the stem of a word using a native Porter stemmer.
+        #
         # Options: none.
         def self.stem(word, options = {})
           # Copy the word and convert it to a string.

data/lib/treat/inflectors/stem/porter_c.rb CHANGED Viewed

@@ -9,10 +9,13 @@ module Treat
       # Program, Vol. 14, no. 3, pp 130-137,
       # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
       class PorterC
+        # Require the 'ruby-stemmer' gem.
         silence_warnings { require 'lingua/stemmer' }
+        # Remove a conflict between this gem and the 'engtagger' gem.
         ::LinguaStemmer = ::Lingua
         Object.instance_eval { remove_const :Lingua }
-        # Stem the word using the Porter C algorithm.
+        # Stem the word using a full-blown Porter stemmer in C.
+        #
         # Options: none.
         def self.stem(word, options = {})
           silence_warnings { ::LinguaStemmer.stemmer(word.to_s) }

data/lib/treat/inflectors/stem/uea.rb CHANGED Viewed

@@ -9,10 +9,10 @@ module Treat
       # groups of rules: the first to clean the tokens, and
       # the second to alter suffixes."
       #
-      #   Project website: https://github.com/ealdent/uea-stemmer
-      #   Original paper: Jenkins, Marie-Claire, Smith, Dan,
-      #   Conservative stemming for search and indexing, 2005.
-      #   http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
+      # Project website: https://github.com/ealdent/uea-stemmer
+      # Original paper: Jenkins, Marie-Claire, Smith, Dan,
+      # Conservative stemming for search and indexing, 2005.
+      # http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
       class UEA
         # Require the 'uea-stemmer' gem.
         silence_warnings { require 'uea-stemmer' }

data/lib/treat/languages/english/tags.rb CHANGED Viewed

@@ -183,6 +183,22 @@ module Treat
         ['PRT',	'Particle'],
         ['S',	'Sentence']
       ]
+      # Maps Enju categories to Treat categories.
+      EnjuCatToCategory = {
+        'ADJ' => :adjective,
+        'ADV' => :adverb,
+        'CONJ' => :conjunction,
+        'COOD' => :conjunction,
+        'C' => :complementizer,
+        'D' => :determiner,
+        'N' => :noun,
+        'P' => :preposition,
+        'PN' => :punctuation,
+        'SC' => :conjunction,
+        'V' => :verb,
+        'PRT' => :particle
+      }
       # Description of the xcat in the Enju output specification.
       EnjuXCatDescription = [

data/lib/treat/languages/english.rb CHANGED Viewed

@@ -1,8 +1,10 @@
 module Treat
   module Languages
     class English
       require 'treat/languages/english/tags'
       require 'treat/languages/english/categories'
       Extractors = {
         time: [:chronic],
         topics: [:reuters],
@@ -11,7 +13,7 @@ module Treat
       }
       Processors = {
         chunkers: [:txt],
-        parsers: [:enju, :stanford],
+        parsers: [:stanford, :enju],
         segmenters: [:tactful, :punkt, :stanford],
         tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
       }
@@ -28,6 +30,7 @@ module Treat
         ordinal_words: [:linguistics],
         cardinal_words: [:linguistics]
       }
     end
   end
 end

data/lib/treat/lexicalizers/category/from_tag.rb CHANGED Viewed

@@ -4,13 +4,12 @@ module Treat
       # A class that detects the category of a word from its tag,
       # using the default tagger for the language of the entity.
       class FromTag
-        DefaultOptions = { tagger: nil }
         # Find the category of the current entity.
+        #
         # Options:
-        # :tagger => (Symbol) force the use of a tagger.
-        # :tag_to_cat => (Hash) a list of categories for each possible tag.
+        #
+        # - (Symbol) :tagger => force the use of a tagger.
         def self.category(entity, options = {})
-          options = DefaultOptions.merge(options)
           tag = options[:tagger].nil? ? entity.tag : entity.tag(options[:tagger])
           lang = Treat::Languages.get(entity.language)
           cat = lang::WordTagToCategory[tag]
@@ -21,6 +20,7 @@ module Treat
             if cat.size == 1
               return cat[0]
             else
+              entity.set :tag_set, :penn
               if entity.has?(:tag_set)
                 if cat[entity.tag_set]
                   return cat[entity.tag_set]

data/lib/treat/lexicalizers/linkages/naive.rb CHANGED Viewed

@@ -27,7 +27,7 @@ module Treat
         end
         # Return the subject of the sentence|verb.
         def self.subject(entity, options)
-          verb = entity.category == :verb ?
+          verb = (entity.has?(:category) && entity.category == :verb) ?
           main_verb(entity) : entity.main_verb
           args = []
           main_verb.edges.each_pair do |id,edge|
@@ -37,7 +37,7 @@ module Treat
         end
         # Return the object of the sentence|verb.
         def self.object(entity, options)
-          verb = entity.category == :verb ?
+          verb = (entity.has?(:category) && entity.category == :verb) ?
           main_verb(entity) : entity.main_verb
           if verb.voice == 'passive'
             return
@@ -50,7 +50,7 @@ module Treat
         end
         # Find the main verb (shallowest verb in the tree).
         def self.main_verb(entity, options)
-          verbs = entity.words_with_cat(:verb)
+          verbs = entity.verbs
           if verbs.empty?
             return
           end

data/lib/treat/lexicalizers/tag/brill.rb CHANGED Viewed

@@ -52,24 +52,16 @@ module Treat
         @@tagger = nil
         # Hold the user-set options
         @@options = {}
-        # Hold the default options.
-        DefaultOptions =  {
-          lexicon: nil,
-          lexical_rules: nil,
-          contextual_rules: nil
-        }
         # Tag words using a native Brill tagger.
         #
-        # Available options:
+        # Options:
+        #
         # :lexicon => String (Lexicon file to use)
         # :lexical_rules => String (Lexical rule file to use)
         # :contextual_rules => String (Contextual rules file to use)
         def self.tag(entity, options = {})
           # Reinitialize the tagger if the options have changed.
-          if options != @@options
-            @@options = DefaultOptions.merge(options)
-            @@tagger = nil # Reset the tagger
-          end
+          @@tagger = nil if options != @@options
           # Create the tagger if necessary
           @@tagger ||= ::Brill::Tagger.new(options[:lexicon],
           options[:lexical_rules], options[:contextual_rules])

data/lib/treat/lexicalizers/tag/lingua.rb CHANGED Viewed

@@ -24,9 +24,8 @@ module Treat
         @@options = {}
         # Hold the default options.
         DefaultOptions =  {
-          unknown_word_tag: '?',
-          relax: false,
-          debug: false
+          unknown_word_tag: 'FW',
+          relax: false
         }
         # Tag the word using a probabilistic model taking
         # into account known words found in a lexicon and
@@ -34,11 +33,10 @@ module Treat
         #
         # Options:
         #
-        #   :relax => (Boolean) Relax the Hidden Markov Model:
+        # - (Boolean) :relax => Relax the Hidden Markov Model:
         #   this may improve accuracy for uncommon words,
         #   particularly words used polysemously.
-        #   :debug => (Boolean) Print debug messages.
-        #   :unknown_word_tag => (String) Tag for unknown words.
+        # - (String) :unknown_word_tag => Tag for unknown words.
         def self.tag(entity, options = {})
           # Reinitialize the tagger if the options have changed.
           if options != @@options