RubyGems - treat - Versions diffs - 0.2.5 → 1.0.0 - Mend

treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (242) hide show

data/LICENSE +3 -3
data/README.md +33 -0
data/files/INFO +1 -0
data/lib/treat.rb +40 -105
data/lib/treat/ai.rb +12 -0
data/lib/treat/ai/classifiers/id3.rb +27 -0
data/lib/treat/categories.rb +82 -35
data/lib/treat/categorizable.rb +44 -0
data/lib/treat/classification.rb +61 -0
data/lib/treat/configurable.rb +115 -0
data/lib/treat/data_set.rb +42 -0
data/lib/treat/dependencies.rb +24 -0
data/lib/treat/downloader.rb +87 -0
data/lib/treat/entities.rb +68 -66
data/lib/treat/entities/abilities.rb +10 -0
data/lib/treat/entities/abilities/buildable.rb +327 -0
data/lib/treat/entities/abilities/checkable.rb +31 -0
data/lib/treat/entities/abilities/copyable.rb +45 -0
data/lib/treat/entities/abilities/countable.rb +51 -0
data/lib/treat/entities/abilities/debuggable.rb +83 -0
data/lib/treat/entities/abilities/delegatable.rb +123 -0
data/lib/treat/entities/abilities/doable.rb +62 -0
data/lib/treat/entities/abilities/exportable.rb +11 -0
data/lib/treat/entities/abilities/iterable.rb +115 -0
data/lib/treat/entities/abilities/magical.rb +83 -0
data/lib/treat/entities/abilities/registrable.rb +74 -0
data/lib/treat/entities/abilities/stringable.rb +91 -0
data/lib/treat/entities/entities.rb +104 -0
data/lib/treat/entities/entity.rb +122 -245
data/lib/treat/exception.rb +4 -4
data/lib/treat/extractors.rb +77 -80
data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
data/lib/treat/extractors/language/what_language.rb +50 -45
data/lib/treat/extractors/name_tag/stanford.rb +55 -0
data/lib/treat/extractors/tf_idf/native.rb +87 -0
data/lib/treat/extractors/time/chronic.rb +55 -0
data/lib/treat/extractors/time/nickel.rb +86 -62
data/lib/treat/extractors/time/ruby.rb +53 -0
data/lib/treat/extractors/topic_words/lda.rb +67 -58
data/lib/treat/extractors/topics/reuters.rb +100 -87
data/lib/treat/formatters.rb +39 -35
data/lib/treat/formatters/readers/abw.rb +49 -29
data/lib/treat/formatters/readers/autoselect.rb +37 -33
data/lib/treat/formatters/readers/doc.rb +19 -13
data/lib/treat/formatters/readers/html.rb +52 -30
data/lib/treat/formatters/readers/image.rb +41 -40
data/lib/treat/formatters/readers/odt.rb +59 -45
data/lib/treat/formatters/readers/pdf.rb +28 -25
data/lib/treat/formatters/readers/txt.rb +12 -15
data/lib/treat/formatters/readers/xml.rb +73 -36
data/lib/treat/formatters/serializers/xml.rb +80 -79
data/lib/treat/formatters/serializers/yaml.rb +19 -18
data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
data/lib/treat/formatters/unserializers/xml.rb +94 -99
data/lib/treat/formatters/unserializers/yaml.rb +20 -19
data/lib/treat/formatters/visualizers/dot.rb +132 -132
data/lib/treat/formatters/visualizers/standoff.rb +52 -44
data/lib/treat/formatters/visualizers/tree.rb +26 -29
data/lib/treat/groupable.rb +153 -0
data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
data/lib/treat/inflectors.rb +50 -45
data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
data/lib/treat/inflectors/declensors/active_support.rb +31 -0
data/lib/treat/inflectors/declensors/english.rb +38 -0
data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
data/lib/treat/inflectors/stemmers/porter.rb +160 -0
data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
data/lib/treat/inflectors/stemmers/uea.rb +28 -0
data/lib/treat/installer.rb +308 -0
data/lib/treat/kernel.rb +105 -27
data/lib/treat/languages.rb +122 -88
data/lib/treat/languages/arabic.rb +15 -15
data/lib/treat/languages/chinese.rb +15 -15
data/lib/treat/languages/dutch.rb +15 -15
data/lib/treat/languages/english.rb +61 -62
data/lib/treat/languages/french.rb +19 -19
data/lib/treat/languages/german.rb +20 -20
data/lib/treat/languages/greek.rb +15 -15
data/lib/treat/languages/italian.rb +16 -16
data/lib/treat/languages/polish.rb +15 -15
data/lib/treat/languages/portuguese.rb +15 -15
data/lib/treat/languages/russian.rb +15 -15
data/lib/treat/languages/spanish.rb +16 -16
data/lib/treat/languages/swedish.rb +16 -16
data/lib/treat/lexicalizers.rb +34 -55
data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
data/lib/treat/linguistics.rb +9 -0
data/lib/treat/linguistics/categories.rb +11 -0
data/lib/treat/linguistics/tags.rb +422 -0
data/lib/treat/loaders/linguistics.rb +30 -0
data/lib/treat/loaders/stanford.rb +27 -0
data/lib/treat/object.rb +1 -0
data/lib/treat/processors.rb +37 -44
data/lib/treat/processors/chunkers/autoselect.rb +16 -0
data/lib/treat/processors/chunkers/html.rb +71 -0
data/lib/treat/processors/chunkers/txt.rb +18 -24
data/lib/treat/processors/parsers/enju.rb +253 -208
data/lib/treat/processors/parsers/stanford.rb +130 -131
data/lib/treat/processors/segmenters/punkt.rb +79 -45
data/lib/treat/processors/segmenters/stanford.rb +46 -48
data/lib/treat/processors/segmenters/tactful.rb +43 -36
data/lib/treat/processors/tokenizers/perl.rb +124 -92
data/lib/treat/processors/tokenizers/ptb.rb +81 -0
data/lib/treat/processors/tokenizers/punkt.rb +48 -42
data/lib/treat/processors/tokenizers/stanford.rb +39 -38
data/lib/treat/processors/tokenizers/tactful.rb +64 -55
data/lib/treat/proxies.rb +52 -35
data/lib/treat/retrievers.rb +26 -16
data/lib/treat/retrievers/indexers/ferret.rb +47 -26
data/lib/treat/retrievers/searchers/ferret.rb +69 -50
data/lib/treat/tree.rb +241 -183
data/spec/collection.rb +123 -0
data/spec/document.rb +93 -0
data/spec/entity.rb +408 -0
data/spec/languages.rb +25 -0
data/spec/phrase.rb +146 -0
data/spec/samples/mathematicians/archimedes.abw +34 -0
data/spec/samples/mathematicians/euler.html +21 -0
data/spec/samples/mathematicians/gauss.pdf +0 -0
data/spec/samples/mathematicians/leibniz.txt +13 -0
data/spec/samples/mathematicians/newton.doc +0 -0
data/spec/sandbox.rb +5 -0
data/spec/token.rb +109 -0
data/spec/treat.rb +52 -0
data/spec/tree.rb +117 -0
data/spec/word.rb +110 -0
data/spec/zone.rb +66 -0
data/tmp/INFO +1 -1
metadata +100 -201
data/INSTALL +0 -1
data/README +0 -3
data/TODO +0 -28
data/lib/economist/half_cocked_basel.txt +0 -16
data/lib/economist/hungarys_troubles.txt +0 -46
data/lib/economist/indias_slowdown.txt +0 -15
data/lib/economist/merkozy_rides_again.txt +0 -24
data/lib/economist/prada_is_not_walmart.txt +0 -9
data/lib/economist/to_infinity_and_beyond.txt +0 -15
data/lib/ferret/_11.cfs +0 -0
data/lib/ferret/_14.cfs +0 -0
data/lib/ferret/_p.cfs +0 -0
data/lib/ferret/_s.cfs +0 -0
data/lib/ferret/_v.cfs +0 -0
data/lib/ferret/_y.cfs +0 -0
data/lib/ferret/segments +0 -0
data/lib/ferret/segments_15 +0 -0
data/lib/treat/buildable.rb +0 -157
data/lib/treat/category.rb +0 -33
data/lib/treat/delegatable.rb +0 -116
data/lib/treat/doable.rb +0 -45
data/lib/treat/entities/collection.rb +0 -14
data/lib/treat/entities/document.rb +0 -12
data/lib/treat/entities/phrases.rb +0 -17
data/lib/treat/entities/tokens.rb +0 -61
data/lib/treat/entities/zones.rb +0 -41
data/lib/treat/extractors/coreferences/stanford.rb +0 -69
data/lib/treat/extractors/date/chronic.rb +0 -32
data/lib/treat/extractors/date/ruby.rb +0 -25
data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
data/lib/treat/extractors/language/language_extractor.rb +0 -27
data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
data/lib/treat/extractors/roles/naive.rb +0 -73
data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
data/lib/treat/extractors/statistics/position_in.rb +0 -14
data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
data/lib/treat/feature.rb +0 -58
data/lib/treat/features.rb +0 -7
data/lib/treat/formatters/visualizers/short_value.rb +0 -29
data/lib/treat/formatters/visualizers/txt.rb +0 -45
data/lib/treat/group.rb +0 -106
data/lib/treat/helpers/linguistics_loader.rb +0 -18
data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
data/lib/treat/inflectors/declensions/english.rb +0 -319
data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
data/lib/treat/inflectors/stem/porter.rb +0 -162
data/lib/treat/inflectors/stem/porter_c.rb +0 -26
data/lib/treat/inflectors/stem/uea.rb +0 -30
data/lib/treat/install.rb +0 -59
data/lib/treat/languages/tags.rb +0 -377
data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
data/lib/treat/lexicalizers/tag/brill.rb +0 -91
data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
data/lib/treat/registrable.rb +0 -28
data/lib/treat/sugar.rb +0 -50
data/lib/treat/viewable.rb +0 -29
data/lib/treat/visitable.rb +0 -28
data/test/profile.rb +0 -2
data/test/tc_entity.rb +0 -117
data/test/tc_extractors.rb +0 -73
data/test/tc_formatters.rb +0 -41
data/test/tc_inflectors.rb +0 -34
data/test/tc_lexicalizers.rb +0 -32
data/test/tc_processors.rb +0 -50
data/test/tc_resources.rb +0 -22
data/test/tc_treat.rb +0 -60
data/test/tc_tree.rb +0 -60
data/test/tests.rb +0 -20
data/test/texts.rb +0 -19
data/test/texts/english/half_cocked_basel.txt +0 -16
data/test/texts/english/hose_and_dry.doc +0 -0
data/test/texts/english/hungarys_troubles.abw +0 -70
data/test/texts/english/long.html +0 -24
data/test/texts/english/long.txt +0 -22
data/test/texts/english/medium.txt +0 -5
data/test/texts/english/republican_nomination.pdf +0 -0
data/test/texts/english/saving_the_euro.odt +0 -0
data/test/texts/english/short.txt +0 -3
data/test/texts/english/zero_sum.html +0 -111

data/lib/treat/extractors/time/chronic.rb ADDED Viewed

@@ -0,0 +1,55 @@
+# A wrapper for the 'chronic' gem, which parses
+# date information.
+#
+# Project website: http://chronic.rubyforge.org/
+class Treat::Extractors::Time::Chronic
+  # Require the 'chronic' gem.
+  silence_warnings { require 'chronic' }
+  # Require the Ruby DateTime module
+  require 'date'
+  # Return the date information contained within
+  # the entity by parsing it with the 'chronic' gem.
+  #
+  # Options: none.
+  def self.time(entity, options = {})
+    s = entity.to_s
+    return if s =~ /^[0-9]+$/
+    time = nil
+    silence_warnings do
+      time = ::Chronic.parse(s, {:guess => true})
+    end
+    if entity.has_parent? && remove_time_from_ancestors(entity, time)
+      nil
+    else
+      time
+    end
+  end
+  # Keeps the lowest-level time annotations that do
+  # not conflict with a higher time annotation.
+  # Returns true if the entity conflicts with a
+  # higher-level time annotation.
+  def self.remove_time_from_ancestors(entity, time)
+    entity.ancestors_with_type(:phrase).each do |a|
+      next if !a.has?(:time)
+      unless a.get(:time) == time
+        return true
+      end
+      a.unset(:time)
+    end
+    false
+  end
+end

data/lib/treat/extractors/time/nickel.rb CHANGED Viewed

@@ -1,71 +1,95 @@
-module Treat
-  module Extractors
-    module Time
-      # A wrapper for the 'nickel' gem, which parses
-      # times and dates and supplies additional information
-      # concerning these. The additional information supplied
-      # that this class annotates entities with is:
-      #
-      # - time_recurrence: frequency of recurrence in words*.
-      # - time_recurrence_interval: frequency of recurrence in days.
-      # - start_time: a DateTime object representing the beginning of
-      #   an event.
-      # - end_time: a DateTime object representing the end of an event.
-      #
-      # Examples of values for time_recurrence are:
-      #
-      # - single: "lunch with megan tomorrow at noon"
-      # - daily: "Art exhibit until March 1st"
-      # - weekly: "math class every wed from 8-11am"
-      # - daymonthly: "open bar at joes the first friday of every month"
-      # - datemonthly: "pay credit card bill on the 22nd of each month"
-      #
-      # Project website: http://naturalinputs.com/
-      class Nickel
-        require 'date'
-        silence_warnings { require 'nickel' }
-        # Extract time information from a bit of text.
-        def self.time(entity, options = {})
-          return nil if entity.to_s.strip == ''
-          n = nil
-          silence_warnings { n = ::Nickel.parse(entity.to_s.strip) }
-          occ = n.occurrences[0]
-          return nil unless occ
+# A wrapper for the 'nickel' gem, which parses
+# times and dates and supplies additional information
+# concerning these. The additional information supplied
+# that this class annotates entities with is:
+#
+# - time_recurrence: frequency of recurrence in words*.
+# - time_recurrence_interval: frequency of recurrence in days.
+# - start_time: a DateTime object representing the beginning of
+#   an event.
+# - end_time: a DateTime object representing the end of an event.
+#
+# Examples of values for time_recurrence are:
+#
+# - single: "lunch with megan tomorrow at noon"
+# - daily: "Art exhibit until March 1st"
+# - weekly: "math class every wed from 8-11am"
+# - daymonthly: "open bar at joes the first friday of every month"
+# - datemonthly: "pay credit card bill on the 22nd of each month"
+#
+# Project website: http://naturalinputs.com/
+class Treat::Extractors::Time::Nickel
-          rec = occ.type.to_s.gsub('single', 'once').intern
-          time_recurrence = rec
-          interval = occ.interval ? occ.interval : :none
-          time_recurrence_interval = interval
+  require 'date'
-          s = [occ.start_date, occ.start_time]
-          ds = [s[0].year, s[0].month, s[0].day] if s[0]
-          ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
+  silence_warnings { require 'nickel' }
-          e = [occ.end_date, occ.end_time]
-          de = [e[0].year, e[0].month, e[0].day] if e[0]
-          te = [e[1].hour, e[1].minute, e[1].second] if e[1]
+  # Extract time information from a bit of text.
+  def self.time(entity, options = {})
+    s = entity.to_s
+    return if s =~ /^[0-9]+$/
+    n = nil
+    begin
+      silence_warnings { n = ::Nickel.parse(s.to_s.strip) }
+    rescue
+      return
+    end
+    occ = n.occurrences[0]
+    return unless occ
+    rec = occ.type.to_s.gsub('single', 'once').intern
+    time_recurrence = rec
+    interval = occ.interval ?
+    occ.interval : :none
+    time_recurrence_interval = interval
+    s = [occ.start_date, occ.start_time]
+    ds = [s[0].year, s[0].month, s[0].day] if s[0]
+    ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
-          start_time = ::DateTime.civil(*ds) if ds && !ts
-          start_time = ::DateTime.civil(*ds, *ts) if ds && ts
-          end_time = ::DateTime.civil(*de) if de && !te
-          end_time = ::DateTime.civil(*de, *te) if de && te
+    e = [occ.end_date, occ.end_time]
+    de = [e[0].year, e[0].month, e[0].day] if e[0]
+    te = [e[1].hour, e[1].minute, e[1].second] if e[1]
-          time = Treat::Features::Time.new(             # Fix - time message.
-          start_time, end_time, time_recurrence,
-          time_recurrence_interval
-          )
+    start_time = ::DateTime.civil(*ds) if ds && !ts
+    start_time = ::DateTime.civil(*ds, *ts) if ds && ts
+    end_time = ::DateTime.civil(*de) if de && !te
+    end_time = ::DateTime.civil(*de, *te) if de && te
+    return unless start_time
-          # Keeps the lowest-level time annotations
-          # that do not conflict with the highest-level
-          # time annotation.
-          entity.ancestors_with_type(:phrase).each do |a|
-            unless a.id == entity.id || a.children[0].size == 0
-              a.unset(:time)
-            end
-          end
-          time
-        end
-      end
+    if entity.has_parent? &&
+      remove_time_from_ancestors(entity, start_time)
+      nil
+    else
+      entity.set :time_recurrence,
+      time_recurrence
+      entity.set :time_recurrence_interval,
+      time_recurrence_interval
+      entity.set :end_time, end_time if end_time
+      start_time
     end
   end
+  # Keeps the lowest-level time annotations that do
+  # not conflict with a higher time annotation.
+  # Returns true if the entity conflicts with a
+  # higher-level time annotation.
+  def self.remove_time_from_ancestors(entity, time)
+    entity.ancestors_with_type(:phrase).each do |a|
+      next if !a.has?(:time)
+      return false unless a.get(:time).to_s == time.to_s
+      a.unset(:time, :time_recurrence,
+      :time_recurrence_interval, :end_time)
+    end
+    true
+  end
 end

data/lib/treat/extractors/time/ruby.rb ADDED Viewed

@@ -0,0 +1,53 @@
+# A wrapper for Ruby's native date/time parsing.
+class Treat::Extractors::Time::Ruby
+  # Require Ruby's date module.
+  require 'date'
+  # Return a DateTime object representing the date/time
+  # contained within the entity, using Ruby's native
+  # date/time parser. This extractor is suitable for the
+  # detection of well-structured dates and times, such as
+  # 2011/02/03 5:00.
+  #
+  # Options: none.
+  def self.time(entity, options = {})
+    s = entity.to_s
+    return if s =~ /^[0-9]+$/
+    begin
+      time = ::DateTime.parse(s)
+      if  entity.has_parent? &&
+        remove_time_from_ancestors(entity, time)
+        nil
+      else
+        time
+      end
+    rescue
+      nil
+    end
+  end
+  # Keeps the lowest-level time annotations that do
+  # not conflict with a higher time annotation.
+  # Returns true if the entity conflicts with a
+  # higher-level time annotation.
+  def self.remove_time_from_ancestors(entity, time)
+    entity.ancestors_with_type(:phrase).each do |a|
+      next if !a.has?(:time)
+      unless a.get(:time) == time
+        return true
+      end
+      a.unset(:time)
+    end
+    false
+  end
+end

data/lib/treat/extractors/topic_words/lda.rb CHANGED Viewed

@@ -1,63 +1,72 @@
-module Treat
-  module Extractors
-    module TopicWords
-      # An adapter for the 'lda-ruby' gem, which clusters
-      # documents into topics based on Latent Dirichlet
-      # Allocation.
-      #
-      # Original paper:
-      # Blei, David M., Ng, Andrew Y., and Jordan, Michael
-      # I. 2003. Latent dirichlet allocation. Journal of
-      # Machine Learning Research. 3 (Mar. 2003), 993-1022.
-      #
-      # Project website: https://github.com/ealdent/lda-ruby
-      class LDA
-        # Require the lda-ruby gem.
-        silence_warnings { require 'lda-ruby' }
-        # Monkey patch the TextCorpus class to call it without
-        # having to create any files.
-        Lda::TextCorpus.class_eval do
-          # Ruby, Y U NO SHUT UP!
-          silence_warnings { undef :initialize }
-          # Redefine initialize to take in an array of sections
-          def initialize(sections)
-            super(nil)
-            sections.each do |section|
-              add_document(Lda::TextDocument.new(self, section))
-            end
-          end
-        end
-        # Default options for the LDA algorithm.
-        DefaultOptions = {
-          :num_topics => 20,
-          :words_per_topic => 10,
-          :iterations => 20
-        }
-        # Retrieve the topic words of a collection.
-        def self.topic_words(collection, options = {})
-          options = DefaultOptions.merge(options)
-          # Create a corpus with the collection
-          sections = collection.sections.collect do |t|
-            t.to_s.encode('UTF-8', :invalid => :replace,
-            :undef => :replace, :replace => "?")            # Fix
-          end
-          corpus = Lda::TextCorpus.new(sections)
+# An adapter for the 'lda-ruby' gem, which clusters
+# documents into topics based on Latent Dirichlet
+# Allocation.
+#
+# Original paper:
+# Blei, David M., Ng, Andrew Y., and Jordan, Michael
+# I. 2003. Latent dirichlet allocation. Journal of
+# Machine Learning Research. 3 (Mar. 2003), 993-1022.
+#
+# Project website: https://github.com/ealdent/lda-ruby
+module Treat::Extractors::TopicWords::LDA
-          # Create an Lda object for training
-          lda = Lda::Lda.new(corpus)
-          lda.num_topics = options[:num_topics]
-          lda.max_iter = options[:iterations]
-          # Run the EM algorithm using random starting points
-          silence_stdout { lda.em('random') }
-          # Load the vocabulary.
-          if options[:vocabulary]
-            lda.load_vocabulary(options[:vocabulary])
-          end
-          # Get the topic words.
-          lda.top_words(options[:words_per_topic])
-        end
+  # Require the lda-ruby gem.
+  silence_warnings { require 'lda-ruby' }
+  # Monkey patch the TextCorpus class to
+  # call it without having to create any files.
+  Lda::TextCorpus.class_eval do
+    # Ruby, Y U NO SHUT UP!
+    silence_warnings { undef :initialize }
+    # Redefine initialize to take in an
+    # array of sections.
+    def initialize(sections)
+      super(nil)
+      sections.each do |section|
+        add_document(
+        Lda::TextDocument.new(self, section))
       end
     end
   end
+  # Default options for the LDA algorithm.
+  DefaultOptions = {
+    :num_topics => 20,
+    :words_per_topic => 10,
+    :iterations => 20,
+    :vocabulary => nil
+  }
+  # Retrieve the topic words of a collection.
+  def self.topic_words(collection, options = {})
+    options = DefaultOptions.merge(options)
+    docs = collection.documents.map { |d| d.to_s }
+    # Create a corpus with the collection
+    corpus = Lda::TextCorpus.new(docs)
+    # Create an Lda object for training
+    lda = Lda::Lda.new(corpus)
+    lda.num_topics = options[:num_topics]
+    lda.max_iter = options[:iterations]
+    # Run the EM algorithm using random
+    # starting points
+    silence_stdout do
+      lda.em('random')
+    end
+    # Load the vocabulary.
+    if options[:vocabulary]
+      lda.load_vocabulary(options[:vocabulary])
+    end
+    # Get the topic words.
+    lda.top_words(
+    options[:words_per_topic]
+    ).values
+  end
 end

data/lib/treat/extractors/topics/reuters.rb CHANGED Viewed

@@ -1,92 +1,105 @@
-module Treat
-  module Extractors
-    module Topics
-      # A Ruby Part text categorizer that was trained
-      # using the Reuters news story corpus.  Version 0.1
-      #
-      # Copyright 2005 Mark Watson.  All rights reserved.
-      # This software is released under the GPL.
-      # Rewrite for inclusion in Treat by Louis Mullie (2011).
-      #
-      # Original project website: http://www.markwatson.com/opensource/
-      class Reuters
-        # Require the Nokogiri XML parser.
-        require 'nokogiri'
-        # Hashes to hold the topics.
-        @@industry = {}
-        @@region = {}
-        @@topics = {}
-        # Get the topic of the text.
-        #
-        # Options: none.
-        def self.topics(text, options = {})
-          stems = []
-          @@reduce = 0
-          unless text.words.size > 0
-            raise Treat::Exception,
-            "Annotator 'topics' requires processor 'tokenize'."
-          end
-          text.words.collect! do |tok|
-            stem = tok.stem.downcase
-            val = tok.value.downcase
-            stems << stem
-            unless stem == val
-              stems << val
-            end
-          end
-          get_topics
-          score_words(@@industry, stems) +
-          score_words(@@region, stems) +
-          score_words(@@topics, stems)
-          #Treat::Feature.new(topics)
-        end
-        # Read the topics from the XML files.
-        def self.get_topics
-          return unless @@industry.size == 0
-          @@industry = read_xml(Treat.lib + '/treat/extractors/topics/reuters/industry.xml')
-          @@region = read_xml(Treat.lib + '/treat/extractors/topics/reuters/region.xml')
-          @@topics = read_xml(Treat.lib + '/treat/extractors/topics/reuters/topics.xml')
-        end
-        def self.read_xml(file_name)
-          hash = {}
-          doc = Nokogiri::XML(File.read(file_name))
-          doc.root.children.each do |category|
-            cat = category["cat"]
-            next if cat.nil?
-            cat = cat.downcase
-            hash[cat] ||= {}
-            hash[cat][category["name"]] =
-            category["score"].to_f
-          end
-          hash
-        end
-        def self.score_words(hash, word_list)
-          category_names = hash.keys
-          count_hash = {}
-          category_names.each do |cat_name|
-            cat_name = cat_name.downcase
-            count_hash[cat_name] ||= 0
-            word_list.each do |word|
-              unless hash[cat_name][word].nil?
-                count_hash[cat_name] +=
-                hash[cat_name][word]
-              end
-            end
-          end
-          count_hash = best_of_hash(count_hash)
-          count_hash.keys
-        end
-        def self.best_of_hash(hash, cutoff = 0.0, scale = 1.0)
-          ret = {}
-          hash.keys.each do |key|
-            if hash[key] > cutoff
-              ret[key] = hash[key] * scale
-              ret[key] = ret[key].round(2)
-            end
-          end
-          ret
+# A Ruby text categorizer that was trained using
+# the Reuters news story corpus.
+#
+# Copyright 2005 Mark Watson. All rights reserved.
+# Rewrite for inclusion in Treat by Louis Mullie (2011).
+#
+# Original project website:
+# http://www.markwatson.com/opensource/
+module Treat::Extractors::Topics::Reuters
+  # Require the Nokogiri XML parser.
+  require 'nokogiri'
+  # Hashes to hold the topics.
+  @@industry = {}
+  @@region = {}
+  @@topics = {}
+  # Get the general topic of the text using
+  # a Reuters-trained model.
+  #
+  # Options: none.
+  def self.topics(text, options = {})
+    stems = []
+    @@reduce = 0
+    unless text.words.size > 0
+      raise Treat::Exception,
+      "Annotator 'topics' requires " +
+      "processor 'tokenize'."
+    end
+    text.words.collect! do |tok|
+      stem = tok.stem.downcase
+      val = tok.value.downcase
+      stems << stem
+      unless stem == val
+        stems << val
+      end
+    end
+    get_topics
+    score_words(@@industry, stems) +
+    score_words(@@region, stems) +
+    score_words(@@topics, stems)
+    #Treat::Feature.new(topics)
+  end
+  # Read the topics from the XML files.
+  def self.get_topics
+    return unless @@industry.size == 0
+    @@industry = read_xml(Treat.models +
+    'reuters/industry.xml')
+    @@region = read_xml(Treat.models +
+    'reuters/region.xml')
+    @@topics = read_xml(Treat.models +
+    'reuters/topics.xml')
+  end
+  # Read an XML file and populate a
+  # hash of topics.
+  def self.read_xml(file_name)
+    hash = {}
+    doc = Nokogiri::XML(File.read(file_name))
+    doc.root.children.each do |category|
+      cat = category["cat"]
+      next if cat.nil?
+      cat = cat.downcase
+      hash[cat] ||= {}
+      hash[cat][category["name"]] =
+      category["score"].to_f
+    end
+    hash
+  end
+  # Score the words by adding the scores
+  # of each word occurrence.
+  def self.score_words(hash, word_list)
+    category_names = hash.keys
+    count_hash = {}
+    category_names.each do |cat_name|
+      cat_name = cat_name.downcase
+      count_hash[cat_name] ||= 0
+      word_list.each do |word|
+        unless hash[cat_name][word].nil?
+          count_hash[cat_name] +=
+          hash[cat_name][word]
         end
       end
     end
+    count_hash = best_of_hash(count_hash)
+    count_hash.keys
+  end
+  # Retrieve the words with the scores above
+  # cutoff inside the hash of scored words.
+  def self.best_of_hash(hash, cutoff = 0.0, scale = 1.0)
+    ret = {}
+    hash.keys.each do |key|
+      if hash[key] > cutoff
+        ret[key] = hash[key] * scale
+        ret[key] = ret[key].round(2)
+      end
+    end
+    ret
   end
 end