RubyGems - treat - Versions diffs - 0.1.2 → 0.1.3 - Mend

treat 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (100)

data/LICENSE +7 -8
data/TODO +16 -13
data/examples/keywords.rb +89 -1
data/lib/treat/buildable.rb +1 -8
data/lib/treat/categories.rb +3 -4
data/lib/treat/category.rb +1 -1
data/lib/treat/delegatable.rb +1 -1
data/lib/treat/detectors/encoding/native.rb +5 -0
data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
data/lib/treat/detectors/language/language_detector.rb +4 -0
data/lib/treat/detectors/language/what_language.rb +4 -4
data/lib/treat/detectors.rb +1 -1
data/lib/treat/entities/entity.rb +5 -3
data/lib/treat/entities/tokens.rb +14 -5
data/lib/treat/entities/zones.rb +4 -0
data/lib/treat/entities.rb +7 -5
data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
data/lib/treat/extractors/time/chronic.rb +8 -0
data/lib/treat/extractors/time/native.rb +6 -0
data/lib/treat/extractors/time/nickel.rb +31 -23
data/lib/treat/extractors/topic_words/lda.rb +21 -16
data/lib/treat/extractors/topics/reuters.rb +6 -4
data/lib/treat/extractors.rb +7 -7
data/lib/treat/formatters/readers/abw.rb +32 -0
data/lib/treat/formatters/readers/autoselect.rb +13 -11
data/lib/treat/formatters/readers/doc.rb +13 -0
data/lib/treat/formatters/readers/gocr.rb +2 -0
data/lib/treat/formatters/readers/html.rb +21 -1
data/lib/treat/formatters/readers/ocropus.rb +3 -3
data/lib/treat/formatters/readers/odt.rb +41 -0
data/lib/treat/formatters/readers/pdf.rb +5 -2
data/lib/treat/formatters/readers/txt.rb +2 -0
data/lib/treat/formatters/serializers/xml.rb +3 -2
data/lib/treat/formatters/serializers/yaml.rb +2 -0
data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
data/lib/treat/formatters/unserializers/xml.rb +6 -1
data/lib/treat/formatters/unserializers/yaml.rb +5 -1
data/lib/treat/formatters/visualizers/dot.rb +35 -37
data/lib/treat/formatters/visualizers/html.rb +1 -0
data/lib/treat/formatters/visualizers/inspect.rb +4 -0
data/lib/treat/formatters/visualizers/short_value.rb +18 -3
data/lib/treat/formatters/visualizers/standoff.rb +11 -6
data/lib/treat/formatters/visualizers/tree.rb +5 -1
data/lib/treat/formatters/visualizers/txt.rb +6 -1
data/lib/treat/formatters.rb +1 -1
data/lib/treat/group.rb +4 -3
data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
data/lib/treat/inflectors/stem/porter.rb +6 -2
data/lib/treat/inflectors/stem/porter_c.rb +4 -1
data/lib/treat/inflectors/stem/uea.rb +4 -4
data/lib/treat/languages/english/tags.rb +16 -0
data/lib/treat/languages/english.rb +4 -1
data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
data/lib/treat/lexicalizers/tag/brill.rb +3 -11
data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
data/lib/treat/lexicalizers.rb +0 -2
data/lib/treat/processors/chunkers/txt.rb +4 -4
data/lib/treat/processors/parsers/enju.rb +3 -17
data/lib/treat/processors/parsers/stanford.rb +4 -0
data/lib/treat/processors/segmenters/punkt.rb +1 -0
data/lib/treat/processors/segmenters/stanford.rb +4 -0
data/lib/treat/processors/segmenters/tactful.rb +4 -1
data/lib/treat/processors/tokenizers/punkt.rb +1 -2
data/lib/treat/processors/tokenizers/stanford.rb +4 -0
data/lib/treat/processors/tokenizers/tactful.rb +1 -1
data/lib/treat/processors.rb +4 -4
data/lib/treat/proxies.rb +18 -11
data/lib/treat/registrable.rb +12 -5
data/lib/treat/sugar.rb +8 -3
data/lib/treat/tree.rb +10 -3
data/lib/treat.rb +55 -55
data/test/tc_entity.rb +7 -7
data/test/tc_extractors.rb +6 -4
data/test/tc_formatters.rb +0 -4
data/test/tests.rb +2 -0
data/test/texts.rb +4 -4
metadata +48 -56
data/examples/texts/bugged_out.txt +0 -26
data/examples/texts/half_cocked_basel.txt +0 -16
data/examples/texts/hedge_funds.txt +0 -24
data/examples/texts/hose_and_dry.txt +0 -19
data/examples/texts/hungarys_troubles.txt +0 -46
data/examples/texts/indias_slowdown.txt +0 -15
data/examples/texts/merkozy_rides_again.txt +0 -24
data/examples/texts/prada_is_not_walmart.txt +0 -9
data/examples/texts/republican_nomination.txt +0 -26
data/examples/texts/to_infinity_and_beyond.txt +0 -15
data/lib/treat/entities/text.rb +0 -7
data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
data/lib/treat/formatters/cleaners/html.rb +0 -17

data/LICENSE CHANGED Viewed

@@ -18,11 +18,10 @@ Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2012.
 Non-trivial amount of code has been incorporated and modified from
 other libraries, specifically for the following files:
- - processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
- - processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
- - Inflectors/lemmatizers/e_lemma.rb - Utiyama Masao (GPL license)
- - processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
- - processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license),
- - extractors/topics/reuters.rb - Mark Watson (GPL license)
- - inflectors/stemmers/porter.rb - Ray Pereda (No license information)
- - tree.rb - Partyl based on work by
+- processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
+- processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
+- processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
+- processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license)
+- extractors/topics/reuters.rb - Mark Watson (GPL license)
+- inflectors/stemmers/porter.rb - Ray Pereda (No license information)
+- tree.rb - Partly based on work by Anupam Sengupta (Creative Commons Attribution-ShareAlike Unported v. 3.0)

data/TODO CHANGED Viewed

@@ -1,25 +1,26 @@
 ## Urgent
-- Linkers
+- Linkers + documentation
 - Check taggers for context
 - Stanford dependencies parse
-- Enju: test
 - Ocropus => use better function
 - Optimize magic methods... is_token? type methods, phrase categories.
-- Move statistics?
+- Move statistics
 - Synset class move
 - general procedure for options, check that user doesn't want to change options...
-- Languages: dependencies vs. edges, PTB function tags
-- Check for # Fix everywhere
-- Check paths; parse bin paths
-- Ferret, Natural Inputs
-- Use consistently delegate
-- Text becomes section
-- Remove top level
+- Ferret, Spider
 - Loading multiple JARs
+- Linguistics loader, stanford loader
 - Tokenized sentences are not parsed
-- Documentation
-- Remove feature
+- Dot colors
+- Fix encoders
+- Fix Punkt segmenter training text.
+- Mark Watson's text extractor
+- Statistics position in
+- Fix documentation antiword, Graphviz, # encoding: utf-8
+- Shortcut methods.. pre/postprocessors
+- Only Phrase..
+- Frequency in
 ## Eventually
@@ -52,4 +53,6 @@
 - String type detector for other languages
 - Automatic benchmark
 - Raspell spell checker
-- Multithreading
+- Multithreading
+- Mark Watson's Java NLP utility to identify proper nouns (human names and places) in text
+- FastTag a Java fast part of speech tagger.

data/examples/keywords.rb CHANGED Viewed

@@ -57,4 +57,92 @@ c.each_document do |d|
     end
   end
-end
+end
+Treat.edulcorate
+Treat.bin = '/ruby/nat/bin'
+c = Collection 'economist'
+c.each_document { |doc| doc.chunk.segment.tokenize }
+topic_words = c.topic_words(
+  :lda,
+  :topics => 5,
+  :words_per_topic => 5,
+  :iterations => 20
+)
+keywords = c.keywords(
+  :topics_frequency,
+  :topic_words => topic_words,
+  :tf_idf_threshold => 180
+)
+puts keywords.inspect
+abort
+c = Phrase 'a test clause'
+c.parse
+puts c.visualize(:tree)
+puts c.visualize(:inspect)
+puts c.visualize(:short_value)
+puts c.visualize(:standoff)
+puts c.visualize(:tree)
+c.serialize(:yaml).save('test.yml')
+c.serialize(:xml).save('test.xml')
+d = Phrase 'test.yml'
+d.print_tree
+d = Phrase 'test.xml'
+d.print_tree
+puts d.words[0].position_in_parent
+abort
+w = Word 'running'
+puts w.stem(:porter_c)
+puts w.stem(:porter)
+puts w.stem(:uea)
+w = Word 'run'
+puts w.infinitive(:linguistics)
+puts w.present_participle(:linguistics)
+puts w.plural(:linguistics)
+w = Word 'table'
+puts w.synonyms.inspect
+puts w.antonyms.inspect
+puts w.hyponyms.inspect
+puts w.hypernyms.inspect
+n = Number 2
+puts n.ordinal_words(:linguistics)
+puts n.cardinal_words(:linguistics)
+s = Sentence 'A sentence to parse.'
+s.dup.parse(:enju).print_tree
+s.dup.parse(:stanford).print_tree
+s = Sentence 'A sentence to tokenize'
+s.dup.tokenize(:macintyre).print_tree
+s.dup.tokenize(:multilingual).print_tree
+s.dup.tokenize(:perl).print_tree
+s.dup.tokenize(:punkt).print_tree
+s.dup.tokenize(:stanford).print_tree
+s.dup.tokenize(:tactful).print_tree
+=begin
+c = Collection 'economist'
+# c.each_document { |d| d.chunk.segment.tokenize }
+c.documents[0].chunk.segment
+c.sentences[0].parse(:enju)
+c.each_word { |word| word.stem }
+c.visualize(:dot, features: [:tag]).save('test.dot')
+=end

data/lib/treat/buildable.rb CHANGED Viewed

@@ -24,9 +24,8 @@ module Treat
         "Cannot create a document or collection from " +
         "a string (need a readable file/folder)."
       end
-      string = string.to_s
       dot = string.count('.') + string.count('!') + string.count('?')
-      return Treat::Entities::Text.new(string) if dot > 1 ||
+      return Treat::Entities::Section.new(string) if dot > 1 ||
       (string.count("\n") > 0 && dot == 1)
       return Treat::Entities::Sentence.new(string) if dot == 1 && string.size > 5
       if string.count(' ') == 0
@@ -99,12 +98,6 @@ module Treat
       d.read
     end
     def from_serialized_file(file)
-      unless [Treat::Entities::Document,
-        Treat::Entities::Collection].include?(self)
-        raise Treat::Exception,
-        "Cannot create something else than a " +
-        "document from raw file '#{file}'."
-      end
       d = Treat::Entities::Document.new(file)
       d.unserialize
       d.children[0].set_as_root!

data/lib/treat/categories.rb CHANGED Viewed

@@ -1,17 +1,16 @@
 module Treat
   # This module keeps track of all categories that
-  # exist and the methods they implement, and is
-  # responsible for including the categories.
+  # exist and the methods they implement.
   module Categories
-    # A list of categories.
     class << self; attr_accessor :list; end
+    # Array - list of all categories.
     self.list = []
     # Boolean - does any of the categories have
     # a method that corresponds to sym?
     def self.have_method?(sym); methods.include?(sym); end
     # Cache the list of methods once it has been computed.
     @@methods = []
-    # Provide a list of all methods implemented
+    # Array - provide a list of all methods implemented
     # by all Treat categories.
     def self.methods
       return @@methods unless @@methods.empty?

data/lib/treat/category.rb CHANGED Viewed

@@ -12,7 +12,7 @@ module Treat
         groups.each do |group|
           group = const_get(group)
           group.targets.each do |entity_type|
-            entity = Entities.const_get(cc(entity_type))
+            entity = Treat::Entities.const_get(cc(entity_type))
             entity.class_eval { add_delegators group }
           end
         end

data/lib/treat/delegatable.rb CHANGED Viewed

@@ -46,7 +46,7 @@ module Treat
         delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
         result = entity.accept(group, delegate_klass, m, options)
         if decorator
-          result = group.send(decorator, self, result)
+          result = group.send(decorator, entity, result)
         end
         if group.type == :annotator
           f = decorator.nil? ? m : decorator

data/lib/treat/detectors/encoding/native.rb CHANGED Viewed

@@ -1,7 +1,12 @@
 module Treat
   module Detectors
     module Encoding
+      # A wrapper class for Ruby's native encoding detector.
       class Native
+        # Return the encoding of the entity according
+        # to the Ruby interpreter.
+        #
+        # Options: none.
         def self.encoding(entity, options={})
           entity.value.encoding.name.
           gsub('-', '_').downcase.intern

data/lib/treat/detectors/encoding/r_chardet19.rb CHANGED Viewed

@@ -6,9 +6,8 @@ module Treat
       # A wrapper for the 'rchardet19' gem, which
       # detects the encoding of a file.
       class RChardet19
-        # Returns an Encoding object representing
-        # the encoding of the supplied entity's
-        # text value.
+        # Returns the encoding of the entity according
+        # to the 'rchardet19' gem.
         #
         # Options: none.
         def self.encoding(entity, options={})

data/lib/treat/detectors/language/language_detector.rb CHANGED Viewed

@@ -1,6 +1,10 @@
 module Treat
   module Detectors
     module Language
+      # A generic language detector, which is called before
+      # any language detector and ensures that configuration
+      # options concerning language are enforced (e.g. returns
+      # the default language when Treat.detect_language is false).
       class LanguageDetector
         def self.language(entity, options = {})
           if Treat.detect_language == false

data/lib/treat/detectors/language/what_language.rb CHANGED Viewed

@@ -7,7 +7,7 @@ module Treat
       # performs probabilistic language detection.
       class WhatLanguage < LanguageDetector
         # Keep only once instance of the gem class.
-        @@wl = nil
+        @@detector = nil
         # Detect the language of an entity using the
         # 'whatlanguage' gem. Return an identifier
         # corresponding to the ISO-639-2 code for the
@@ -15,10 +15,10 @@ module Treat
         def self.language(entity, options = {})
           predetection = super(entity, options)
           return predetection if predetection
-          @@wl ||= ::WhatLanguage.new(:all)
-          all = @@wl.process_text(entity.to_s)
+          @@detector ||= ::WhatLanguage.new(:possibilities)
+          possibilities = @@detector.process_text(entity.to_s)
           lang = {}
-          all.each do |k,v|
+          possibilities.each do |k,v|
             lang[Treat::Languages.find(k)] = v
           end
           Treat::Feature.new(lang).best

data/lib/treat/detectors.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module Treat
   # Detectors detect a specific meta-information about
   # an entity, such as encoding, format and language.
   #
-  # Detectors are language-independent, and thus they
+  # Detectors are language-independent, and thus there
   # are default algorithms specified for each of them.
   module Detectors
     # Group for algorithms that detect encoding.

data/lib/treat/entities/entity.rb CHANGED Viewed

@@ -43,7 +43,7 @@ module Treat
       # feature does not exist
       def method_missing(sym, *args, &block)
         return self.build(*args) if sym == nil
-        if !@features[sym]
+        if !@features.has_key?(sym)
           r = parse_magic_method(sym, *args, &block)
           if r == :no_magic
             begin
@@ -168,7 +168,10 @@ module Treat
       def <<(entities, clear_parent = true)
         entities = [entities] unless entities.is_a? Array
         entities.each do |entity|
-          register_token(entity) if entity.is_leaf?
+          if entity.is_a?(Treat::Entities::Token) ||
+            entity.is_a?(Treat::Entities::Constituent)
+              register_token(entity) unless entity.value == ''
+          end
         end
         super(entities)
         @parent.value = '' if has_parent?
@@ -211,7 +214,6 @@ module Treat
       def short_value(ml = 6); visualize(:short_value, :max_length => ml); end
       # Convenience functions. Convenience decorators.
       def frequency_of(word); statistics(:frequency_of, value: word); end
       private
       # Return the first element in the array, warning if not
       # the only one in the array. Used for magic methods: e.g.,

data/lib/treat/entities/tokens.rb CHANGED Viewed

@@ -4,15 +4,24 @@ module Treat
     class Token < Entity
       # All tokens are leafs.
       def is_leaf?; true; end
-      def frequency; self.set :frequency, statistics(:frequency); end
+      # Convenience function for statistics.
+      def frequency; statistics(:frequency_in); end
+      def frequency_in(type); statistics(:frequency_in, type: type); end
+      def position_in(type); statistics(:position_in_parent); end
+      def tf_idf; statistics(:tf_idf); end
     end
     # Represents a word.
     class Word < Token
-      def infinitive(conjugator = nil); conjugate(conjugator, :mode => :infinitive); end
-      def present_participle(conjugator = nil); conjugate(conjugator, :tense => :present, :mode => :participle); end
-      def plural(declensor = nil); declense(declensor, :count => :plural); end
-      def singular(declensor = nil); declense(declensor, :count => :singular); end
+      # Convenience function for conjugations.
+      def infinitive(conjugator = nil); conjugations(conjugator, :mode => :infinitive); end
+      # Convenience function for conjugations.
+      def present_participle(conjugator = nil); conjugations(conjugator, :tense => :present, :mode => :participle); end
+      # Convenience function for declensions.
+      def plural(declensor = nil); declensions(declensor, :count => :plural); end
+      # Convenience function for declensions.
+      def singular(declensor = nil); declensions(declensor, :count => :singular); end
     end
+    # Represents a clitic ('s).
     class Clitic < Token
     end
     # Represents a number.

data/lib/treat/entities/zones.rb CHANGED Viewed

@@ -13,5 +13,9 @@ module Treat
     # Represents a list.
     class List < Zone
     end
+    # Represents a section, usually with a title
+    # and at least one paragraph.
+    class Section < Zone
+    end
   end
 end

data/lib/treat/entities.rb CHANGED Viewed

@@ -14,7 +14,6 @@ module Treat
     # Then require all possible entities.
     require 'treat/entities/collection'
     require 'treat/entities/document'
-    require 'treat/entities/text'
     require 'treat/entities/zones'
     require 'treat/entities/sentence'
     require 'treat/entities/constituents'
@@ -25,9 +24,11 @@ module Treat
         const_get(entity).build(value, id)
       end
     end
+    # Cache a list of defined entity types to
+    # improve performance.
+    @@list = []
     # Provide a list of defined entity types,
     # as non-camel case identifiers.
-    @@list = []
     def self.list
       return @@list unless @@list.empty?
       self.constants.each do |constant|
@@ -35,16 +36,17 @@ module Treat
       end
       @@list
     end
-    # Return the 'z-order' for hierarchical
-    # comparison of entity types.
+    # Return the hierarchy level of the entity
+    # class, the minimum being a Token and the
+    # maximum being a Collection.
     def self.rank(type)
       klass = Entities.const_get(cc(type))
       compare = lambda { |a,b| a == b || a < b }
       return 0 if compare.call(klass, Token)
       return 1 if compare.call(klass, Constituent)
       return 2 if compare.call(klass, Sentence)
+      return 3 if compare.call(klass, Zone)
       return 4 if compare.call(klass, Document)
-      return 3 if compare.call(klass, Section)
       return 5 if compare.call(klass, Collection)
     end
   end

data/lib/treat/extractors/keywords/topics_frequency.rb ADDED Viewed

@@ -0,0 +1,40 @@
+module Treat
+  module Extractors
+    module Keywords
+      class TopicsFrequency
+        DefaultOptions = {tf_idf_threshold: 180, topic_words: nil}
+        def self.keywords(entity, options = {})
+          options = DefaultOptions.merge(options)
+          unless options[:topic_words]
+            raise Treat::Exception, "You must supply topic words."
+          end
+          if Treat::Entities.rank(entity.type) <
+            Treat::Entities.rank(:sentence)
+            raise Treat::Exception, 'Cannot get the key ' +
+            'sentences of an entity smaller than a sentence.'
+          else
+            find_keywords(entity, options)
+          end
+        end
+        def self.find_keywords(entity, options)
+          keywords = []
+          entity.each_word do |word|
+            found = false
+            options[:topic_words].each do |i, topic_words|
+              next if keywords.include?(word.value)
+              if topic_words.include?(word.value)
+                found = true
+                tf_idf = word.tf_idf
+                if tf_idf < options[:tf_idf_threshold]
+                  keywords << word.value
+                  word.set :is_keyword?, found
+                end
+              end
+            end
+          end
+          keywords
+        end
+      end
+    end
+  end
+end

data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} RENAMED Viewed

@@ -1,15 +1,16 @@
 module Treat
   module Extractors
     module Statistics
-      class Frequency
-        # Find the frequency of a given string value.
+      class FrequencyIn
+        DefaultOptions = {type: nil}
         def self.statistics(entity, options={})
+          options = DefaultOptions.merge(options)
           if entity.is_leaf?
             w = entity.value.downcase
-            if entity.token_registry[:value][w].nil?
+            if entity.token_registry(options[:type])[:value][w].nil?
               0
             else
-              entity.token_registry[:value][w].size
+              entity.token_registry(options[:type])[:value][w].size
             end
           else
             raise Treat::Exception,

data/lib/treat/extractors/statistics/frequency_of.rb CHANGED Viewed

@@ -5,11 +5,9 @@ module Treat
         # Find the frequency of a given string value.
         def self.statistics(entity, options = {})
           w = options[:value]
-          if entity.token_registry[:value][w].nil?
-            0
-          else
-            entity.token_registry[:value][w].size
-          end
+          raise Treat::Exception, "Must supply a non-nil value." unless w
+          entity.token_registry[:value][w].nil? ? 0 :
+          entity.token_registry[:value][w].size
         end
       end
     end

data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} RENAMED Viewed

@@ -1,11 +1,12 @@
 module Treat
   module Extractors
     module Statistics
-      class PositionIn
+      class PositionInParent
         # Find the position of the current entity
         # inside the parent entity with type entity_type.
-        def self.statistics(entity)
-          raise Treat::Exception, 'Could you implement this?'
+        # Not implemented.
+        def self.statistics(entity, options = {})
+          entity.parent.children.index(entity)
         end
       end
     end

data/lib/treat/extractors/statistics/tf_idf.rb ADDED Viewed

@@ -0,0 +1,36 @@
+module Treat
+  module Extractors
+    module Statistics
+      # "The term count in the given document is simply the
+      # number of times a given term appears in that document.
+      # This count is usually normalized to prevent a bias
+      # towards longer documents (which may have a higher
+      # term count regardless of the actual importance of
+      # that term in the document) to give a measure of the
+      # importance of the term t within the particular document d.
+      # Thus we have the term frequency tf(t,d), defined in the
+      # simplest case as the occurrence count of a term in a document.
+      #
+      # The inverse document frequency is a measure of the general
+      # importance of the term (obtained by dividing the total number
+      # of documents by the number of documents containing the term,
+      # and then taking the logarithm of that quotient)."
+      #
+      # (From Wikipedia)
+      class TfIdf
+        DefaultOptions = { type: nil }
+        def self.statistics(entity, options={})
+          tf = entity.frequency_in(:document)
+          tf = tf / entity.root.word_count
+          d = entity.root.document_count
+          i = 0
+          entity.root.each_document do |document|
+            i += 1 if document.frequency_of(entity.value)
+          end
+          idf = ::Math.log(d.to_f/(i.to_f + 1)).abs
+          tf.to_f/idf.to_f
+        end
+      end
+    end
+  end
+end

data/lib/treat/extractors/statistics/transition_matrix.rb CHANGED Viewed

@@ -1,23 +1,23 @@
 module Treat
   module Extractors
     module Statistics
+      # Experimental algorithm to generate transition matrices.
       class TransitionMatrix
+        DefaultOptions = {
+          normalize: true,
+          features: [:tag],
+          condition: lambda { |e| true },
+          entity_types: [:word],
+          relationships: [:parent, :right, :children]
+        }
         # Find the transition matrix.
         def self.statistics(entity, options={})
-          normalize = options[:normalize] || true
-          features = options[:features] || [:tag]
-          condition = options[:condition] || lambda { |e| true }
-          entity_types = options[:entity_types] ? options[:entity_types] :
-          [options[:entity_type]]
-          relationships = options[:relationships] ||
-          [:parent, :left, :right, :children]
+          options = DefaultOptions.merge(options)
           # Create lambdas to generate the arrays.
-          empty_prototype = {}; features.each { |f| empty_prototype[f] = {} }
+          empty_prototype = {}; options[:features].each { |f| empty_prototype[f] = {} }
           empty = lambda { Marshal.load(Marshal.dump(empty_prototype)) }
-          empty2_prototype = {}; relationships.each { |r| empty2_prototype[r] = empty.call }
+          empty2_prototype = {}; options[:relationships].each { |r| empty2_prototype[r] = empty.call }
           empty2 = lambda { Marshal.load(Marshal.dump(empty2_prototype)) }
           # Deep (recursive) merger.
@@ -27,24 +27,25 @@ module Treat
           # Master matrix.
           mm = nil
+          tm = empty.call
-          entity.each_entity(*entity_types) do |target|
-            next unless condition.call(target)
+          entity.each_entity(*options[:entity_types]) do |target|
+            next unless options[:condition].call(target)
             # Initialize the empty transition matrix.
-            tm = empty.call
             # Calculate the transition probabilities.
-            features.each do |f1|
+            options[:features].each do |f1|
               v1 = target.send(f1)
               tm[f1][v1] = empty2.call
-              relationships.each do |relationship|
+              options[:relationships].each do |relationship|
                 tm[f1][v1][relationship] = empty.call
-                features.each do |f2|
+                options[:features].each do |f2|
                   relatives = target.send(relationship)
                   relatives = [relatives] unless relatives.is_a? Array
                   relatives.each do |relative|
@@ -55,9 +56,9 @@ module Treat
                       tm[f1][v1][relationship][f2][v2] += 1.0
                     end
                   end
                   tm[f1][v1][:edge] = empty.call
                   target.edges.each do |id, edge_type|
                     s = target.ancestor_with_type :sentence
                     if s
@@ -68,14 +69,13 @@ module Treat
                       tm[f1][v1][:edge][f2][v2] += 1.0
                     end
                   end
                 end
               end
             end
-            mm = mm ? mm.merge(tm, &merger) : tm
           end
-          if normalize
+          mm = mm ? mm.merge(tm, &merger) : tm
+          if options[:normalize]
             normalize(mm)
           else
             mm