RubyGems - treat - Versions diffs - 0.1.1 - Mend

treat 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (147) hide show

data/INSTALL +0 -0
data/LICENSE +28 -0
data/README +0 -0
data/TODO +67 -0
data/bin/INFO +1 -0
data/examples/benchmark.rb +81 -0
data/examples/keywords.rb +60 -0
data/examples/texts/bugged_out.txt +26 -0
data/examples/texts/half_cocked_basel.txt +16 -0
data/examples/texts/hedge_funds.txt +24 -0
data/examples/texts/hose_and_dry.txt +19 -0
data/examples/texts/hungarys_troubles.txt +46 -0
data/examples/texts/indias_slowdown.txt +15 -0
data/examples/texts/merkozy_rides_again.txt +24 -0
data/examples/texts/prada_is_not_walmart.txt +9 -0
data/examples/texts/republican_nomination.txt +26 -0
data/examples/texts/to_infinity_and_beyond.txt +15 -0
data/lib/treat.rb +91 -0
data/lib/treat/buildable.rb +115 -0
data/lib/treat/categories.rb +29 -0
data/lib/treat/category.rb +28 -0
data/lib/treat/delegatable.rb +90 -0
data/lib/treat/detectors.rb +28 -0
data/lib/treat/detectors/encoding/native.rb +12 -0
data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
data/lib/treat/detectors/format/file.rb +36 -0
data/lib/treat/detectors/language/language_detector.rb +19 -0
data/lib/treat/detectors/language/what_language.rb +29 -0
data/lib/treat/entities.rb +52 -0
data/lib/treat/entities/collection.rb +19 -0
data/lib/treat/entities/constituents.rb +15 -0
data/lib/treat/entities/document.rb +11 -0
data/lib/treat/entities/entity.rb +242 -0
data/lib/treat/entities/sentence.rb +8 -0
data/lib/treat/entities/text.rb +7 -0
data/lib/treat/entities/tokens.rb +37 -0
data/lib/treat/entities/zones.rb +17 -0
data/lib/treat/exception.rb +5 -0
data/lib/treat/extractors.rb +41 -0
data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
data/lib/treat/extractors/named_entity/abner.rb +20 -0
data/lib/treat/extractors/named_entity/stanford.rb +174 -0
data/lib/treat/extractors/statistics/frequency.rb +22 -0
data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
data/lib/treat/extractors/statistics/position_in.rb +13 -0
data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
data/lib/treat/extractors/time/chronic.rb +12 -0
data/lib/treat/extractors/time/native.rb +12 -0
data/lib/treat/extractors/time/nickel.rb +45 -0
data/lib/treat/extractors/topic_words/lda.rb +71 -0
data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
data/lib/treat/extractors/topics/reuters.rb +91 -0
data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
data/lib/treat/feature.rb +53 -0
data/lib/treat/formatters.rb +44 -0
data/lib/treat/formatters/cleaners/html.rb +17 -0
data/lib/treat/formatters/readers/autoselect.rb +35 -0
data/lib/treat/formatters/readers/gocr.rb +24 -0
data/lib/treat/formatters/readers/html.rb +13 -0
data/lib/treat/formatters/readers/ocropus.rb +31 -0
data/lib/treat/formatters/readers/pdf.rb +17 -0
data/lib/treat/formatters/readers/txt.rb +15 -0
data/lib/treat/formatters/serializers/xml.rb +48 -0
data/lib/treat/formatters/serializers/yaml.rb +15 -0
data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
data/lib/treat/formatters/unserializers/xml.rb +79 -0
data/lib/treat/formatters/unserializers/yaml.rb +15 -0
data/lib/treat/formatters/visualizers/dot.rb +73 -0
data/lib/treat/formatters/visualizers/html.rb +12 -0
data/lib/treat/formatters/visualizers/inspect.rb +16 -0
data/lib/treat/formatters/visualizers/short_value.rb +14 -0
data/lib/treat/formatters/visualizers/standoff.rb +41 -0
data/lib/treat/formatters/visualizers/tree.rb +28 -0
data/lib/treat/formatters/visualizers/txt.rb +31 -0
data/lib/treat/group.rb +96 -0
data/lib/treat/inflectors.rb +50 -0
data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
data/lib/treat/inflectors/declensors/en.rb +18 -0
data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
data/lib/treat/inflectors/stemmers/porter.rb +158 -0
data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
data/lib/treat/inflectors/stemmers/uea.rb +30 -0
data/lib/treat/lexicalizers.rb +49 -0
data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
data/lib/treat/lexicalizers/tag/brill.rb +101 -0
data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
data/lib/treat/processors.rb +45 -0
data/lib/treat/processors/chunkers/txt.rb +27 -0
data/lib/treat/processors/parsers/enju.rb +214 -0
data/lib/treat/processors/parsers/stanford.rb +60 -0
data/lib/treat/processors/segmenters/punkt.rb +48 -0
data/lib/treat/processors/segmenters/stanford.rb +45 -0
data/lib/treat/processors/segmenters/tactful.rb +34 -0
data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
data/lib/treat/processors/tokenizers/perl.rb +96 -0
data/lib/treat/processors/tokenizers/punkt.rb +42 -0
data/lib/treat/processors/tokenizers/stanford.rb +33 -0
data/lib/treat/processors/tokenizers/tactful.rb +59 -0
data/lib/treat/proxies.rb +66 -0
data/lib/treat/registrable.rb +26 -0
data/lib/treat/resources.rb +10 -0
data/lib/treat/resources/categories.rb +18 -0
data/lib/treat/resources/delegates.rb +96 -0
data/lib/treat/resources/dependencies.rb +0 -0
data/lib/treat/resources/edges.rb +8 -0
data/lib/treat/resources/formats.rb +23 -0
data/lib/treat/resources/languages.rb +86 -0
data/lib/treat/resources/languages.txt +504 -0
data/lib/treat/resources/tags.rb +393 -0
data/lib/treat/sugar.rb +43 -0
data/lib/treat/tree.rb +174 -0
data/lib/treat/utilities.rb +127 -0
data/lib/treat/visitable.rb +27 -0
data/test/profile.rb +2 -0
data/test/tc_detectors.rb +27 -0
data/test/tc_entity.rb +105 -0
data/test/tc_extractors.rb +48 -0
data/test/tc_formatters.rb +46 -0
data/test/tc_inflectors.rb +39 -0
data/test/tc_lexicalizers.rb +39 -0
data/test/tc_processors.rb +36 -0
data/test/tc_resources.rb +27 -0
data/test/tc_treat.rb +64 -0
data/test/tc_tree.rb +60 -0
data/test/tests.rb +19 -0
data/test/texts.rb +20 -0
data/test/texts/english/long.html +24 -0
data/test/texts/english/long.txt +22 -0
data/test/texts/english/medium.txt +5 -0
data/test/texts/english/short.txt +3 -0
metadata +412 -0

data/lib/treat/detectors/language/language_detector.rb ADDED

@@ -0,0 +1,19 @@
+module Treat
+  module Detectors
+    module Language
+      class LanguageDetector
+        def self.language(entity, options = {})
+          if Treat.detect_language == false
+            return Treat.default_language
+          else
+            dlvl = Treat.language_detection_level
+            if (Entities.rank(entity.type) < Entities.rank(dlvl)) &&
+               entity.has_parent?
+                return entity.ancestor_with_type(dlvl).language
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/treat/detectors/language/what_language.rb ADDED

@@ -0,0 +1,29 @@
+module Treat
+  module Detectors
+    module Language
+      # Require the 'whatlanguage' gem.
+      silently { require 'whatlanguage'  }
+      # Adaptor for the 'whatlanguage' gem, which
+      # performs probabilistic language detection.
+      class WhatLanguage < LanguageDetector
+        # Keep only once instance of the gem class.
+        @@wl = nil
+        # Detect the language of an entity using the
+        # 'whatlanguage' gem. Return an identifier
+        # corresponding to the ISO-639-2 code for the
+        # language.
+        def self.language(entity, options = {})
+          predetection = super(entity, options)
+          return predetection if predetection
+          @@wl ||= ::WhatLanguage.new(:all)
+          all = @@wl.process_text(entity.to_s)
+          lang = {}
+          all.each do |k,v|
+            lang[Treat::Resources::Languages.find(k)] = v
+          end
+          Treat::Feature.new(lang).best
+        end
+      end
+    end
+  end
+end

data/lib/treat/entities.rb ADDED

@@ -0,0 +1,52 @@
+module Treat
+  # Abstract and concrete structures extending the
+  # Tree::Node class to represent textual entities:
+  #
+  # - Collection
+  # - Document
+  # - Text
+  # - Zone (a Section, Title, Paragraph, or List)
+  # - Sentence
+  # - Constituent (a Phrase or Clause)
+  # - Token (a Word, Number, Punctuation, or Symbol).
+  module Entities
+    # Require Entity first, since the other classes
+    # extend this class.
+    require 'treat/entities/entity'
+    require 'treat/entities/collection'
+    require 'treat/entities/document'
+    require 'treat/entities/text'
+    require 'treat/entities/zones'
+    require 'treat/entities/sentence'
+    require 'treat/entities/constituents'
+    require 'treat/entities/tokens'
+    # Make the constants buildable.
+    constants.each do |entity|
+      define_singleton_method(entity) do |value='', id=nil|
+        const_get(entity).build(value, id)
+      end
+    end
+    # Provide a list of defined entity types,
+    # as non-camel case identifiers.
+    @@list = []
+    def self.list
+      return @@list unless @@list.empty?
+      self.constants.each do |constant|
+        @@list << :"#{ucc(constant)}"
+      end
+      @@list
+    end
+    # Return the 'z-order' for hierarchical
+    # comparison of entity types.
+    def self.rank(type)
+      klass = Entities.const_get(cc(type))
+      return 6 if klass == Collection || klass < Collection
+      return 5 if klass == Document || klass < Document
+      return 4 if klass == Text || klass < Text
+      return 3 if klass == Zone || klass < Zone
+      return 2 if klass == Sentence || klass < Sentence
+      return 1 if klass == Constituent || klass < Constituent
+      return 0 if klass == Token || klass < Token
+    end
+  end
+end

data/lib/treat/entities/collection.rb ADDED

@@ -0,0 +1,19 @@
+module Treat
+  module Entities
+    # Represents a collection of texts.
+    class Collection < Entity
+      # Initialize the collection with a folder
+      # containing the texts of the collection.
+      def initialize(folder = nil, id = nil)
+        super('', id)
+        if folder
+          set :folder, folder
+          Dir.glob("#{folder}/*").each do |f|
+            next if FileTest.directory?(f)
+            self << Document.new(f)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/treat/entities/constituents.rb ADDED

@@ -0,0 +1,15 @@
+module Treat
+  module Entities
+    # Represents any syntactic constituent
+    # of a sentence.
+    class Constituent < Entity
+    end
+    # Represents a phrase inside a sentence
+    # or by itself.
+    class Phrase < Constituent
+    end
+    # Represents a clause inside a sentence.
+    class Clause < Constituent
+    end
+  end
+end

data/lib/treat/entities/document.rb ADDED

@@ -0,0 +1,11 @@
+module Treat
+  module Entities
+    # Represents a document.
+    class Document < Entity
+      def initialize(file, id = nil)
+        super('', id)
+        set :file, file
+      end
+    end
+  end
+end

data/lib/treat/entities/entity.rb ADDED

@@ -0,0 +1,242 @@
+require 'treat/tree'
+require 'treat/feature'
+require 'treat/delegatable'
+require 'treat/visitable'
+require 'treat/registrable'
+require 'treat/buildable'
+module Treat
+  module Entities
+    class Entity < Tree::Node
+      # Implements support for #register
+      include Registrable
+      # Implement support for #accept.
+      include Visitable
+      # Implement support for #self.add_delegators
+      extend Delegatable
+      # Implement support for #self.from_*
+      extend Buildable
+      # Build an entity from +file_or_value+, optionally
+      # giving it the identifier +id+. Both arguments are
+      # forwarded unchanged to #from_anything, which is
+      # not defined in this class; it is presumably
+      # supplied by the Buildable module extended above
+      # (TODO confirm).
+      def self.build(file_or_value = '', id = nil)
+        from_anything(file_or_value, id)
+      end
+      # Initialize the entity with its value and
+      # (optionally) a unique identifier. By default,
+      # the object_id will be used as id. Also initialize
+      # the token registry in the root node.
+      def initialize(value = '', id = nil)
+        id ||= object_id
+        super(value, id)
+      end
+      # Return a lowercase identifier representing the
+      # type of entity (e.g. :word, :token, etc.)
+      def type; :"#{cl(self.class).downcase}"; end
+      # Catch missing methods to support method-like
+      # access to features (e.g. entity.cat instead of
+      # entity.features[:cat]) and to support magic
+      # methods (see #parse_magic_method). If the
+      # feature does not exist
+      def method_missing(sym, *args, &block)
+        return self.build(*args) if sym == nil
+        if !@features[sym]
+          r = parse_magic_method(sym, *args, &block)
+          if r == :no_magic
+            begin
+              super(sym, *args, &block)
+            rescue NoMethodError
+              # Check...
+              if Categories.have_method?(sym)
+                msg = "Method #{sym} cannot be called on a #{type}."
+              else
+                msg = "Method #{sym} does not exist."
+                msg += did_you_mean?(Category.methods, sym)
+              end
+              raise Treat::Exception, msg
+            end
+          else
+            r
+          end
+        else
+          @features[sym]
+        end
+      end
+      # Parse "magic methods", which allow the following
+      # syntaxes to be used (where 'word' can be replaced
+      # by any entity type, e.g. token, zone, etc.):
+      #
+      # - each_word : iterate over each entity of type word.
+      # - words: return an array of words in the entity.
+      # - word: return the first word in the entity.
+      # - word_count: return the number of words in the entity.
+      # - words_with_*(value) (where  is an arbitrary feature):
+      #   return the words that have the given feature.
+      # - word_with_*(value) : return the first word with
+      #   the feature specified by * in value.
+      #
+      # Also provides magical methods for types of words:
+      #
+      # - each_noun:
+      # - nouns:
+      # - noun:
+      # - noun_count:
+      # - nouns_with_*(value)
+      # - noun_with_*(value)
+      #
+      # Note that repetition of code in this method
+      # (instead of method chaining) is intentional
+      # and aims to reduce the number of method
+      # dispatches done by Ruby to improve performance.
+      def parse_magic_method(sym, *args, &block)
+        @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
+        @@cats_regexp ||= "(#{Treat::Resources::Categories::List.join('|')})"
+        method = sym.to_s =~ /entities/ ?
+        sym.to_s.gsub('entities', 'entitys'):
+        method = sym.to_s
+        a = []
+        if method =~ /^parent_#{@@entities_regexp}$/           # Optimize all
+          self.class.send(:define_method, "parent_#{$1}") do
+            ancestor_with_types(:"#{$1}")
+          end
+          ancestor_with_types(:"#{$1}")
+        elsif method =~ /^each_#{@@entities_regexp}$/
+          each_entity(:"#{$1}") { |entity| yield entity }
+        elsif method =~ /^#{@@entities_regexp}s$/
+          each_entity(:"#{$1}") { |e| a << e }
+          a
+        elsif method =~ /^#{@@entities_regexp}$/
+          each_entity(:"#{$1}") { |e| a << e }
+          first_but_warn(a, $1)
+        elsif method =~ /^#{@@entities_regexp}_count$/
+          i = 0
+          each_entity(:"#{$1}") { |e| i += 1 }
+          i
+        elsif method =~ /^#{@@entities_regexp}s_with_([a-z]*)$/
+          each_entity(:"#{$1}") do |e|
+            a << e if e.has?(:"#{$2}") &&
+            e.send(:"#{$2}") == args[0]
+          end
+          a
+        elsif method =~ /^#{@@entities_regexp}s_with_([a-z]*)$/
+          each_entity(:"#{$1}") do |e|
+            a << e if e.has?(:"#{$2}") &&
+            e.send(:"#{$2}") == args[0]
+          end
+          first_but_warn(a, $1)
+        elsif method =~ /^each_with_([a-z]*)$/
+          each_entity do |e|
+            yield e if e.has?(:"#{$2}") &&
+            e.send(:"#{$2}") == args[0]
+          end
+        elsif method =~ /^each_#{@@cats_regexp}$/
+          each_entity(:word) { |e| yield e if e.cat == :"#{$1}" }
+        elsif method =~ /^#{@@cats_regexp}s$/
+          each_entity(:word) { |e| a << e if e.cat == :"#{$1}" }
+          a
+        elsif method =~ /^#{@@cats_regexp}$/
+          each_entity(:word) { |e| a << e if e.cat == :"#{$1}" }
+          first_but_warn(a, $1)
+        elsif method =~ /^#{@@cats_regexp}_count$/
+          i = 0
+          each_entity(:word) { |e| i += 1 if e.cat == :"#{$1}" }
+          i
+        elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
+          each_entity(:word) do |e|
+            a << e if e.cat == :"#{$1}" &&
+            e.has?(:"#{$2}") && e.send(:"#{$2}") == args[0]
+          end
+          a
+        elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
+          each_entity(:word) do |e|
+            a << e if e.cat == :"#{$1}" &&
+            e.has?(:"#{$2}") && e.send(:"#{$2}") == args[0]
+          end
+          first_but_warn(a, $1)
+        else
+          :no_magic
+        end
+      end
+      # Add an entity to the current entity.
+      # Registers the entity in the root node
+      # token registry if the entity is a leaf.
+      #
+      # @see Treat::Registrable
+      def <<(entities, clear_parent = true)
+        entities = [entities] unless entities.is_a? Array
+        entities.each do |entity|
+          register_token(entity) if entity.is_leaf?
+        end
+        super(entities)
+        @parent.value = '' if has_parent?
+        entities[0]
+      end
+      # Yields each entity of any of the supplied
+      # types in the children tree of this Entity.
+      # Note that this function is recursive, unlike
+      # #each. It does not yield the top element being
+      # recursed.
+      def each_entity(*types)
+        yield self if match_types(self, types)
+        if has_children?
+          @children.each do |child|
+            child.each_entity(*types) { |y| yield y }
+          end
+        end
+      end
+      # Returns the first ancestor of this
+      # entity that has the given type.
+      def ancestor_with_types(*types)
+        ancestor = @parent
+        while not match_types(ancestor, types)
+          return nil unless ancestor.has_parent?
+          ancestor = ancestor.parent
+        end
+        match_types(ancestor, types) ? ancestor : nil
+      end
+      alias :ancestor_with_type :ancestor_with_types
+      # Return the entity's string value in plain text format.
+      def to_string; @value; end
+      # An alias for #to_string.
+      def to_s; visualize(:txt); end
+      alias :to_str :to_s
+      # Return an informative string representation of the entity.
+      def inspect; visualize(:inspect); end
+      # Print out an ASCII representation of the tree.
+      def print_tree; puts visualize(:tree); end
+      # Return a shortened value of the entity's string value using [...].
+      def short_value(ml = 6); visualize(:short_value, :max_length => ml); end
+      # Convenience functions. Convenience decorators.
+      def frequency_of(word); statistics(:frequency_of, value: word); end
+      private
+      # Return the first element in the array, warning if not
+      # the only one in the array. Used for magic methods: e.g.,
+      # the magic method "word" if called on a sentence
+      # with many words, Treat will return the first word
+      # but warn the user.
+      def first_but_warn(array, type)
+        if array.size > 1
+          warn "Warning: requested one #{type}, but" +
+          " there are many #{type}s in the given entity."
+        end
+        array[0]
+      end
+      # Cache a list of the type => class relationships.
+      @@type_classes = {}
+      # Returns true if the node is of the same type or
+      # is a subtype of of one of the specified entity types,
+      # which are supplied as identifiers rather than classes.
+      def match_types(node, entity_types)
+        entity_types.each do |type|
+          @@type_classes[type] ||= Entities.const_get(cc(type))
+          return true if node.is_a? @@type_classes[type]
+        end
+        false
+      end
+    end
+  end
+end

data/lib/treat/entities/sentence.rb ADDED

@@ -0,0 +1,8 @@
+module Treat
+  module Entities
+    # Represents a sentence.
+    class Sentence < Entity
+      def subject(l = nil, o = {}); link(l, o.merge({:linkage => :subject})); end
+    end
+  end
+end

data/lib/treat/entities/text.rb ADDED

@@ -0,0 +1,7 @@
+module Treat
+  module Entities
+    # Represents a text.
+    class Text < Entity
+    end
+  end
+end

data/lib/treat/entities/tokens.rb ADDED

@@ -0,0 +1,37 @@
+module Treat
+  module Entities
+    # Represents a terminal element in the text structure.
+    class Token < Entity
+      # All tokens are leafs.
+      def is_leaf?; true; end
+      def frequency; self.set :frequency, statistics(:frequency); end
+    end
+    # Represents a word.
+    class Word < Token
+      def infinitive(conjugator = nil); conjugate(conjugator, :mode => :infinitive); end
+      def present_participle(conjugator = nil); conjugate(conjugator, :tense => :present, :mode => :participle); end
+      def plural(declensor = nil); declense(declensor, :count => :plural); end
+      def singular(declensor = nil); declense(declensor, :count => :singular); end
+    end
+    class Clitic < Token
+    end
+    # Represents a number.
+    class Number < Token
+      # Convert the number to an integer.
+      def to_i; to_s.to_i; end
+      # Convert the number to a float.
+      def to_f; to_s.to_f; end
+    end
+    # Represents a punctuation sign.
+    class Punctuation < Token
+    end
+    # Represents a character that is neither
+    # alphabetical, numerical or a punctuation
+    # character (e.g. @#$%&*).
+    class Symbol < Token
+    end
+    # Represents an entity of unknown type.
+    class Unknown < Token
+    end
+  end
+end