RubyGems - treat - Versions diffs - 0.1.1 - Mend

treat 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (147) hide show

data/INSTALL +0 -0
data/LICENSE +28 -0
data/README +0 -0
data/TODO +67 -0
data/bin/INFO +1 -0
data/examples/benchmark.rb +81 -0
data/examples/keywords.rb +60 -0
data/examples/texts/bugged_out.txt +26 -0
data/examples/texts/half_cocked_basel.txt +16 -0
data/examples/texts/hedge_funds.txt +24 -0
data/examples/texts/hose_and_dry.txt +19 -0
data/examples/texts/hungarys_troubles.txt +46 -0
data/examples/texts/indias_slowdown.txt +15 -0
data/examples/texts/merkozy_rides_again.txt +24 -0
data/examples/texts/prada_is_not_walmart.txt +9 -0
data/examples/texts/republican_nomination.txt +26 -0
data/examples/texts/to_infinity_and_beyond.txt +15 -0
data/lib/treat.rb +91 -0
data/lib/treat/buildable.rb +115 -0
data/lib/treat/categories.rb +29 -0
data/lib/treat/category.rb +28 -0
data/lib/treat/delegatable.rb +90 -0
data/lib/treat/detectors.rb +28 -0
data/lib/treat/detectors/encoding/native.rb +12 -0
data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
data/lib/treat/detectors/format/file.rb +36 -0
data/lib/treat/detectors/language/language_detector.rb +19 -0
data/lib/treat/detectors/language/what_language.rb +29 -0
data/lib/treat/entities.rb +52 -0
data/lib/treat/entities/collection.rb +19 -0
data/lib/treat/entities/constituents.rb +15 -0
data/lib/treat/entities/document.rb +11 -0
data/lib/treat/entities/entity.rb +242 -0
data/lib/treat/entities/sentence.rb +8 -0
data/lib/treat/entities/text.rb +7 -0
data/lib/treat/entities/tokens.rb +37 -0
data/lib/treat/entities/zones.rb +17 -0
data/lib/treat/exception.rb +5 -0
data/lib/treat/extractors.rb +41 -0
data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
data/lib/treat/extractors/named_entity/abner.rb +20 -0
data/lib/treat/extractors/named_entity/stanford.rb +174 -0
data/lib/treat/extractors/statistics/frequency.rb +22 -0
data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
data/lib/treat/extractors/statistics/position_in.rb +13 -0
data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
data/lib/treat/extractors/time/chronic.rb +12 -0
data/lib/treat/extractors/time/native.rb +12 -0
data/lib/treat/extractors/time/nickel.rb +45 -0
data/lib/treat/extractors/topic_words/lda.rb +71 -0
data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
data/lib/treat/extractors/topics/reuters.rb +91 -0
data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
data/lib/treat/feature.rb +53 -0
data/lib/treat/formatters.rb +44 -0
data/lib/treat/formatters/cleaners/html.rb +17 -0
data/lib/treat/formatters/readers/autoselect.rb +35 -0
data/lib/treat/formatters/readers/gocr.rb +24 -0
data/lib/treat/formatters/readers/html.rb +13 -0
data/lib/treat/formatters/readers/ocropus.rb +31 -0
data/lib/treat/formatters/readers/pdf.rb +17 -0
data/lib/treat/formatters/readers/txt.rb +15 -0
data/lib/treat/formatters/serializers/xml.rb +48 -0
data/lib/treat/formatters/serializers/yaml.rb +15 -0
data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
data/lib/treat/formatters/unserializers/xml.rb +79 -0
data/lib/treat/formatters/unserializers/yaml.rb +15 -0
data/lib/treat/formatters/visualizers/dot.rb +73 -0
data/lib/treat/formatters/visualizers/html.rb +12 -0
data/lib/treat/formatters/visualizers/inspect.rb +16 -0
data/lib/treat/formatters/visualizers/short_value.rb +14 -0
data/lib/treat/formatters/visualizers/standoff.rb +41 -0
data/lib/treat/formatters/visualizers/tree.rb +28 -0
data/lib/treat/formatters/visualizers/txt.rb +31 -0
data/lib/treat/group.rb +96 -0
data/lib/treat/inflectors.rb +50 -0
data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
data/lib/treat/inflectors/declensors/en.rb +18 -0
data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
data/lib/treat/inflectors/stemmers/porter.rb +158 -0
data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
data/lib/treat/inflectors/stemmers/uea.rb +30 -0
data/lib/treat/lexicalizers.rb +49 -0
data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
data/lib/treat/lexicalizers/tag/brill.rb +101 -0
data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
data/lib/treat/processors.rb +45 -0
data/lib/treat/processors/chunkers/txt.rb +27 -0
data/lib/treat/processors/parsers/enju.rb +214 -0
data/lib/treat/processors/parsers/stanford.rb +60 -0
data/lib/treat/processors/segmenters/punkt.rb +48 -0
data/lib/treat/processors/segmenters/stanford.rb +45 -0
data/lib/treat/processors/segmenters/tactful.rb +34 -0
data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
data/lib/treat/processors/tokenizers/perl.rb +96 -0
data/lib/treat/processors/tokenizers/punkt.rb +42 -0
data/lib/treat/processors/tokenizers/stanford.rb +33 -0
data/lib/treat/processors/tokenizers/tactful.rb +59 -0
data/lib/treat/proxies.rb +66 -0
data/lib/treat/registrable.rb +26 -0
data/lib/treat/resources.rb +10 -0
data/lib/treat/resources/categories.rb +18 -0
data/lib/treat/resources/delegates.rb +96 -0
data/lib/treat/resources/dependencies.rb +0 -0
data/lib/treat/resources/edges.rb +8 -0
data/lib/treat/resources/formats.rb +23 -0
data/lib/treat/resources/languages.rb +86 -0
data/lib/treat/resources/languages.txt +504 -0
data/lib/treat/resources/tags.rb +393 -0
data/lib/treat/sugar.rb +43 -0
data/lib/treat/tree.rb +174 -0
data/lib/treat/utilities.rb +127 -0
data/lib/treat/visitable.rb +27 -0
data/test/profile.rb +2 -0
data/test/tc_detectors.rb +27 -0
data/test/tc_entity.rb +105 -0
data/test/tc_extractors.rb +48 -0
data/test/tc_formatters.rb +46 -0
data/test/tc_inflectors.rb +39 -0
data/test/tc_lexicalizers.rb +39 -0
data/test/tc_processors.rb +36 -0
data/test/tc_resources.rb +27 -0
data/test/tc_treat.rb +64 -0
data/test/tc_tree.rb +60 -0
data/test/tests.rb +19 -0
data/test/texts.rb +20 -0
data/test/texts/english/long.html +24 -0
data/test/texts/english/long.txt +22 -0
data/test/texts/english/medium.txt +5 -0
data/test/texts/english/short.txt +3 -0
metadata +412 -0

data/lib/treat/formatters/unserializers/yaml.rb ADDED

@@ -0,0 +1,15 @@
+module Treat
+  module Formatters
+    module Unserializers
+      # Restores the content of a document from a YAML file.
+      class YAML
+        # Psych is the YAML parser used to load the file.
+        require 'psych'
+        # Read the file located at document.file, parse it with
+        # Psych and append the parsed object to the document
+        # using <<. Returns the document that was passed in.
+        # The options hash is accepted but never consulted.
+        # NOTE(review): depending on the Psych version, Psych.load
+        # may instantiate arbitrary objects - confirm that only
+        # trusted files reach this method.
+        def self.unserialize(document, options = {})
+          yaml = File.read(document.file)
+          parsed = ::Psych.load(yaml)
+          document << parsed
+          document
+        end
+      end
+    end
+  end
+end

data/lib/treat/formatters/visualizers/dot.rb ADDED

@@ -0,0 +1,73 @@
+module Treat
+  module Formatters
+    module Visualizers
+      # Renders an entity and its descendants as a Graphviz
+      # DOT graph description.
+      class Dot
+        # Border colors to use for different POS tags.
+        BorderColors = {
+          :verb => "#00AABB",
+          :noun => "#FAD4A7",
+          :adverb => '#103585',
+          :adjective => '#D21D54'
+        }
+        # Wrap the statements produced by to_dot in a top-level
+        # "graph { ... }" declaration and return the result as a
+        # string. The options hash is not used. The output can be
+        # rendered with Graphviz, for example:
+        # dot -Tpdf test4.dot > test4.pdf
+        def self.visualize(entity, options = {})
+          "graph {" + self.to_dot(entity) + "\n}"
+        end
+        # Return the DOT statements for the entity followed,
+        # recursively, by those of its children. In order: the
+        # node declaration (label plus one attribute per feature),
+        # the link to the parent, the children, and one labelled
+        # link per entry of entity.edges.
+        # cc and cl are helpers defined outside this file.
+        def self.to_dot(entity)
+          label =
+            if entity.is_leaf?
+              if entity.is_a?(Treat::Entities::Word)
+                # Words show their value and tag, and are colored
+                # according to their category.
+                "label=\"#{entity.value} (#{entity.tag})\"," +
+                "color=\"#{BorderColors[entity.cat]}\","
+              else
+                # inspect[1..-2] drops the surrounding quotes while
+                # keeping the escaping done by inspect.
+                "label=\"#{entity.value.inspect[1..-2]}\","
+              end
+            elsif entity.class < Entities::Constituent
+              "label=\"#{entity.tag}\","
+            else
+              "label=\"#{cc(cl(entity.class))}\","
+            end
+          dot = "\n#{entity.id} ["
+          attributes = label
+          if entity.has_features?
+            entity.features.each_pair do |name, content|
+              # Features pointing to another entity are shown by id.
+              shown = content.is_a?(Treat::Entities::Entity) ?
+                content.id : content
+              attributes += "#{name}=\"#{shown}\","
+            end
+          end
+          # Every attribute ends with a comma; drop the last one.
+          dot << attributes[0..-2] << "]"
+          if entity.has_parent?
+            dot << "\n#{entity.parent.id} -- #{entity.id};"
+          end
+          if entity.has_children?
+            entity.each { |child| dot << self.to_dot(child) }
+          end
+          if entity.has_edges?
+            entity.edges.each_pair do |target, type|
+              dot << "\n#{entity.id} -- #{target}" <<
+                "[label=#{type},dir=forward," <<
+                "arrowhead=\"odiamond\"]"
+            end
+          end
+          dot
+        end
+      end
+    end
+  end
+end

data/lib/treat/formatters/visualizers/html.rb ADDED

@@ -0,0 +1,12 @@
+module Treat
+  module Formatters
+    module Visualizers
+      # Placeholder for an HTML visualizer.
+      # This class is not implemented yet.
+      class HTML
+        # Not implemented yet: the body is empty, so the method
+        # returns nil whatever entity and options it receives.
+        def self.visualize(entity, options = {})
+        end
+      end
+    end
+  end
+end

data/lib/treat/formatters/visualizers/inspect.rb ADDED

@@ -0,0 +1,16 @@
+module Treat
+  module Formatters
+    module Visualizers
+      # Builds the one-line inspection string of an entity.
+      class Inspect
+        # Return "<Class> (<id>)", where <Class> is the entity's
+        # class name without its namespace. Unless caller_method
+        # returns :inspect, the inspected short value, features
+        # and edges of the entity are appended, separated by
+        # pipes. The options hash is not used.
+        # caller_method is a helper defined outside this file;
+        # presumably it names the calling method - confirm.
+        def self.visualize(entity, options = {})
+          header = "#{entity.class.to_s.split('::').last} (#{entity.id})"
+          return header if caller_method == :inspect
+          header + "  | #{entity.short_value.inspect}" +
+          "  |  #{entity.features.inspect}" +
+          "  | #{entity.edges.inspect}"
+        end
+      end
+    end
+  end
+end

data/lib/treat/formatters/visualizers/short_value.rb ADDED

@@ -0,0 +1,14 @@
+module Treat
+  module Formatters
+    module Visualizers
+      # Produces an abbreviated, single-line rendering of an entity.
+      class ShortValue
+        # Return entity.to_s, abbreviated to its first three and
+        # last three words around ' [...] ' when it is long.
+        #
+        # Options:
+        #
+        # :max_length => texts with fewer words than this are
+        # returned unchanged (defaults to 6).
+        #
+        # The options hash is only read, never modified.
+        def self.visualize(entity, options = {})
+          max_length = options[:max_length] || 6
+          edge = 3
+          text = entity.to_s
+          words = text.split(' ')
+          # With fewer than 2 * edge words the head and the tail
+          # would overlap (or the tail slice would be nil), so the
+          # text is returned whole even for a small :max_length.
+          if words.size < max_length || words.size < 2 * edge
+            return text
+          end
+          words.first(edge).join(' ') + ' [...] ' +
+          words.last(edge).join(' ')
+        end
+      end
+    end
+  end
+end

data/lib/treat/formatters/visualizers/standoff.rb ADDED

@@ -0,0 +1,41 @@
+module Treat
+  module Formatters
+    module Visualizers
+      # This class allows the visualization of
+      # an entity in standoff format; for example:
+      # (S (NP John) (VP has (VP come))).
+      class Standoff
+        # Concatenate the visualizations of all the children
+        # of an entity, passing the same options to each.
+        Recurse = Proc.new do |entity, options|
+          v = ''
+          entity.each { |child| v << visualize(child, options) }
+          v
+        end
+        # Visualize the entity using standoff notation.
+        # This can only be called on tokens, constituents and
+        # sentences, as it is not a suitable format to represent
+        # larger entities; a RuntimeError is raised otherwise.
+        #
+        # Options:
+        #
+        # :indent => nesting level of the entity; each level adds
+        # three spaces of indentation (defaults to 0).
+        #
+        # The options hash is never modified: children receive a
+        # copy with :indent incremented, so a hash that lacks the
+        # :indent key is handled like an empty one.
+        def self.visualize(entity, options = {})
+          indent = options[:indent] || 0
+          spaces = '   ' * indent
+          nested = options.merge(:indent => indent + 1)
+          value =
+            if entity.is_a?(Treat::Entities::Token)
+              "#{spaces}(#{entity.tag} #{entity.value})"
+            elsif entity.is_a?(Treat::Entities::Constituent)
+              "#{spaces}(#{entity.tag}\n" +
+              "#{Recurse.call(entity, nested)})\n"
+            elsif entity.is_a?(Treat::Entities::Sentence)
+              "#{spaces}(S\n" +
+              "#{Recurse.call(entity, nested)})\n"
+            else
+              raise 'Standoff format is unsuitable to represent' +
+              ' entities larger than sentences.'
+            end
+          # Pull a closing parenthesis left alone on its line
+          # back onto the line of the parenthesis before it.
+          value.gsub(")\n)", "))")
+        end
+      end
+    end
+  end
+end

data/lib/treat/formatters/visualizers/tree.rb ADDED

@@ -0,0 +1,28 @@
+module Treat
+  module Formatters
+    module Visualizers
+      # Draws an entity and its descendants as a plain text tree.
+      class Tree
+        # Obtain a plain text tree representation
+        # of the entity. An entity without children is shown
+        # as "> <inspect>"; an entity with children is shown as
+        # "+ <inspect>" followed by one "+--" branch per child.
+        #
+        # Options:
+        #
+        # :indent => nesting level of the entity; each level adds
+        # three spaces of indentation (defaults to 0).
+        #
+        # The options hash is never modified: children receive a
+        # copy with :indent incremented, so a hash that lacks the
+        # :indent key is handled like an empty one.
+        def self.visualize(entity, options = {})
+          return '> ' + entity.inspect unless entity.has_children?
+          indent = options[:indent] || 0
+          spaces = '   ' * indent
+          nested = options.merge(:indent => indent + 1)
+          string = "+ #{entity.inspect}\n#{spaces}|"
+          entity.children.each do |child|
+            string << "\n" << spaces << '+--' <<
+              self.visualize(child, nested)
+          end
+          string
+        end
+      end
+    end
+  end
+end

data/lib/treat/formatters/visualizers/txt.rb ADDED

@@ -0,0 +1,31 @@
+module Treat
+  module Formatters
+    module Visualizers
+      # Creates a plain text visualization of an entity.
+      class Txt
+        # Obtain a plain text visualization of the entity,
+        # with no additional information. An entity without
+        # children is rendered as its value. Otherwise the values
+        # of its tokens (and of any child with a non-empty value)
+        # are joined, each followed by the separator; children
+        # with an empty value are rendered recursively.
+        # Note that options[:sep] is always overwritten with a
+        # single space in the hash that was passed in.
+        def self.visualize(entity, options = {})
+          options[:sep] = ' '
+          return entity.value unless entity.has_children?
+          text = ''
+          entity.each do |child|
+            unless child.is_a?(Treat::Entities::Token) || child.value != ''
+              text += visualize(child, options)
+              next
+            end
+            # Punctuation symbols and clitics 'stick' to the
+            # previous token: drop the space that precedes them.
+            sticky = child.is_a?(Treat::Entities::Punctuation) ||
+              child.is_a?(Treat::Entities::Clitic)
+            text.strip! if sticky
+            text += child.value + options[:sep]
+          end
+          text
+        end
+      end
+    end
+  end
+end

data/lib/treat/group.rb ADDED

@@ -0,0 +1,96 @@
+module Treat
+  # Behaviour shared by every group of algorithms (for example
+  # Inflectors::Stemmers). A module becomes a group by calling
+  # `extend Group`, which gives it the accessors and the lookup
+  # methods defined below.
+  # The helpers cc, ucc and cl used here are defined outside this
+  # file; presumably they camel-case a name, un-camel-case a name
+  # and strip the namespace of a class - confirm.
+  module Group
+    # Hook run when a module extends Group: installs the type,
+    # default and targets accessors, and the resolver of the
+    # group's method name, on the extending module.
+    def self.extended(group)
+      group.module_eval do
+        class << self
+          attr_accessor :type, :default, :targets
+        end
+        # Return the method corresponding to the group, as a
+        # symbol (memoized in @method). The name is derived from
+        # the un-camel-cased name of the group:
+        # - ending in -ers after k, t, m, d, g or n: 'ers' is
+        #   removed, and a doubled final consonant is reduced
+        #   (stemmers -> stem, chunkers -> chunk);
+        # - any other ending in -ers: 'rs' is removed
+        #   (tokenizers -> tokenize);
+        # - ending in -ors: 'ors' is replaced by 'e'
+        #   (conjugators -> conjugate);
+        # - otherwise the name is used as is (encoding ->
+        #   encoding, tag -> tag).
+        @method = nil
+        def self.method
+          return @method if @method
+          m = ucc(cl(self))
+          if m[-3..-1] == 'ers'
+            if ['k', 't', 'm', 'd', 'g', 'n'].include? m[-4]
+              n = m[0..-4]
+              n = n[0..-2] if n[-1] == n[-2]
+            else
+              n = m[0..-3]
+            end
+          elsif m[-3..-1] == 'ors'
+            n = m[0..-4] + 'e'
+          else
+            n = m
+          end
+          @method = :"#{n}"
+        end
+      end
+    end
+    # Create a new algorithm within the group: a class named
+    # after class_name is defined inside the group, with an
+    # instance method (named after the group's method) that
+    # calls the supplied block with the entity.
+    # The block is captured by a closure, so each algorithm
+    # keeps its own block. It used to be stored in a class
+    # variable, which is resolved lexically and was therefore
+    # shared: every algorithm ran the last block registered.
+    def add(class_name, &block)
+      class_name = :"#{cc(class_name)}"
+      klass = self.const_set(class_name, Class.new)
+      klass.send(:define_method, self.method) do |entity|
+        block.call(entity)
+      end
+    end
+    # Boolean - does the group have the supplied class
+    # included in its targets? A class matches when it is one
+    # of the target entity classes or a subclass of one.
+    # The strict parameter is currently not used.
+    def has_target?(target, strict = false)
+      self.targets.any? do |entity_type|
+        entity_type = Entities.const_get(cc(entity_type))
+        target < entity_type || entity_type == target
+      end
+    end
+    # Populates once the list of the adaptors in the group
+    # by crawling the filesystem: every .rb file found in a
+    # directory named after the group, one level below the
+    # directory of this file, contributes its base name as a
+    # symbol. The result is cached per group name in a hash
+    # shared by all groups.
+    # NOTE(review): the original author flagged the glob below
+    # with a 'Fix' marker; the reason was not recorded.
+    @@list = {}
+    def list
+      mod = ucc(cl(self))
+      @@list[mod] ||=
+        Dir["#{File.dirname(__FILE__)}/*/#{mod}/*.rb"].map do |file|
+          :"#{File.basename(file, '.rb')}"
+        end
+    end
+    # Set inherit to false by default, so that only constants
+    # defined directly in the group are looked up.
+    def const_get(const)
+      super(const, false)
+    end
+    # Autoload the algorithms: require the file whose path is
+    # the un-camel-cased namespace of the group followed by the
+    # un-camel-cased constant name, then look the constant up
+    # again. No check is made that the file exists, so a
+    # missing file surfaces as the LoadError raised by require.
+    # NOTE(review): the original author flagged the path
+    # computation with a 'Fix' marker; the reason was not recorded.
+    def const_missing(const)
+      bits = self.ancestors[0].to_s.split('::')
+      bits.collect! { |bit| ucc(bit) }
+      file = bits.join('/') + "/#{ucc(const)}"
+      require file
+      const_get(const)
+    end
+  end
+end

data/lib/treat/inflectors.rb ADDED

@@ -0,0 +1,50 @@
+module Treat
+  # Category grouping the algorithms that retrieve inflected or
+  # reduced forms of words and numbers. Each nested module below
+  # extends Group and declares its type (:annotator) and the
+  # types of entities it targets.
+  # NOTE(review): an earlier version of this comment also described
+  # taggers and lexicons, which are not defined in this module.
+  module Inflectors
+    # Lemmatizers return the root form of a word.
+    module Lemmatizers
+      extend Group
+      self.type = :annotator
+      self.targets = [:word]
+    end
+    # Stemmers return the stem (*not root form*) of a word.
+    module Stemmers
+      extend Group
+      self.type = :annotator
+      self.targets = [:word]
+    end
+    # Declensors retrieve the different declensions of a
+    # noun (singular, plural).
+    module Declensors
+      extend Group
+      self.type = :annotator
+      self.targets = [:word]
+    end
+    # Conjugators retrieve the different conjugations of
+    # a word.
+    module Conjugators
+      extend Group
+      self.type = :annotator
+      self.targets = [:word]
+    end
+    # CardinalWords retrieve the full text description of a number.
+    module CardinalWords
+      extend Group
+      self.type = :annotator
+      self.targets = [:number]
+    end
+    # OrdinalWords retrieve the ordinal form of a number.
+    module OrdinalWords
+      extend Group
+      self.type = :annotator
+      self.targets = [:number]
+    end
+    # Extended last, once all the groups above are defined.
+    extend Treat::Category
+  end
+end

data/lib/treat/inflectors/cardinal_words/linguistics.rb ADDED

@@ -0,0 +1,45 @@
+module Treat
+  module Inflectors
+    module CardinalWords
+      # Spells numbers out in words by delegating to the
+      # Ruby Linguistics library.
+      class Linguistics
+        # silently is a helper defined outside this file;
+        # presumably it mutes warnings while the block runs.
+        silently { require 'linguistics' }
+        # Return the description in words of the number held by
+        # the entity, as computed by the numwords method of the
+        # Linguistics module for the entity's language. A
+        # RuntimeError raised while looking that module up is
+        # replaced by one stating that the language is missing.
+        #
+        # The options are passed on to numwords unchanged:
+        #
+        # :group => Controls how many numbers at a time are
+        # grouped together. Valid values are 0 (normal grouping),
+        # 1 (single-digit grouping, e.g., “one, two, three, four”),
+        # 2 (double-digit grouping, e.g., “twelve, thirty-four”), or
+        # 3 (triple-digit grouping, e.g., “one twenty-three, four”).
+        # :comma => Set the character/s used to separate word groups.
+        # Defaults to ", ".
+        # :and => Set the word and/or characters used where ' and '
+        # (the default) is normally used. Setting :and to ' ', for
+        # example, will cause 2556 to be returned as “two-thousand,
+        # five hundred fifty-six” instead of “two-thousand, five
+        # hundred and fifty-six”.
+        # :zero => Set the word used to represent the numeral 0 in
+        # the result. 'zero' is the default.
+        # :decimal => Set the translation of any decimal points in
+        # the number; the default is 'point'.
+        # :asArray => If set to a true value, the number will be
+        # returned as an array of word groups instead of a String.
+        def self.cardinal_words(entity, options = {})
+          delegate = nil
+          begin
+            code = entity.language.to_s.upcase
+            silently { delegate = ::Linguistics.const_get(code) }
+          rescue RuntimeError
+            raise "Ruby Linguistics does not have a module " +
+            " installed for the #{entity.language} language."
+          end
+          number = entity.to_s
+          silently { delegate.numwords(number, options) }
+        end
+      end
+    end
+  end
+end

data/lib/treat/inflectors/conjugators/linguistics.rb ADDED

@@ -0,0 +1,30 @@
+module Treat
+  module Inflectors
+    module Conjugators
+      # Conjugates verbs by delegating to the Ruby
+      # Linguistics library.
+      class Linguistics
+        # silently is a helper defined outside this file;
+        # presumably it mutes warnings while the block runs.
+        silently { require 'linguistics' }
+        # Conjugate the entity using the Linguistics module for
+        # the entity's language. The parameters select the form:
+        #
+        # :mode => :infinitive returns the infinitive;
+        # :mode => :participle with :tense => :present returns
+        # the present participle;
+        # :count => :plural, given as the only parameter, returns
+        # the plural form of the verb.
+        #
+        # Any other combination raises a Treat::Exception. A
+        # RuntimeError raised while looking the language module
+        # up is replaced by one stating that it is missing.
+        def self.conjugate(entity, parameters)
+          delegate = nil
+          begin
+            code = entity.language.to_s.upcase
+            silently { delegate = ::Linguistics.const_get(code) }
+          rescue RuntimeError
+            raise "Ruby Linguistics does not have a module " +
+            " installed for the #{entity.language} language."
+          end
+          # Pick the name of the delegate method to call.
+          form =
+            if parameters[:mode] == :infinitive
+              :infinitive
+            elsif parameters[:mode] == :participle &&
+              parameters[:tense] == :present
+              :present_participle
+            elsif parameters[:count] == :plural && parameters.size == 1
+              :plural_verb
+            end
+          unless form
+            raise Treat::Exception,
+            'This combination of modes, tenses, persons ' +
+            'and/or counts is not presently supported.'
+          end
+          silently { delegate.public_send(form, entity.to_s) }
+        end
+      end
+    end
+  end
+end