RubyGems - treat - Versions diffs - 0.1.4 → 0.2.0 - Mend

treat 0.1.4 → 0.2.0

Files changed (160)

data/LICENSE +4 -4
data/TODO +21 -54
data/lib/economist/half_cocked_basel.txt +16 -0
data/lib/economist/hose_and_dry.doc +0 -0
data/lib/economist/hungarys_troubles.abw +70 -0
data/lib/economist/republican_nomination.pdf +0 -0
data/lib/economist/saving_the_euro.odt +0 -0
data/lib/economist/to_infinity_and_beyond.txt +15 -0
data/lib/economist/zero_sum.html +91 -0
data/lib/treat.rb +58 -72
data/lib/treat/buildable.rb +59 -15
data/lib/treat/categories.rb +26 -14
data/lib/treat/category.rb +2 -2
data/lib/treat/delegatable.rb +65 -48
data/lib/treat/doable.rb +44 -0
data/lib/treat/entities.rb +34 -14
data/lib/treat/entities/collection.rb +2 -0
data/lib/treat/entities/document.rb +3 -2
data/lib/treat/entities/entity.rb +105 -90
data/lib/treat/entities/phrases.rb +17 -0
data/lib/treat/entities/tokens.rb +28 -13
data/lib/treat/entities/zones.rb +20 -0
data/lib/treat/extractors.rb +49 -11
data/lib/treat/extractors/coreferences/stanford.rb +68 -0
data/lib/treat/extractors/date/chronic.rb +32 -0
data/lib/treat/extractors/date/ruby.rb +25 -0
data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
data/lib/treat/extractors/language/what_language.rb +49 -0
data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
data/lib/treat/extractors/roles/naive.rb +73 -0
data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
data/lib/treat/extractors/time/nickel.rb +30 -12
data/lib/treat/extractors/topic_words/lda.rb +9 -9
data/lib/treat/extractors/topics/reuters.rb +14 -15
data/lib/treat/extractors/topics/reuters/region.xml +1 -0
data/lib/treat/features.rb +7 -0
data/lib/treat/formatters/readers/abw.rb +6 -1
data/lib/treat/formatters/readers/autoselect.rb +5 -6
data/lib/treat/formatters/readers/doc.rb +3 -1
data/lib/treat/formatters/readers/html.rb +1 -1
data/lib/treat/formatters/readers/image.rb +43 -0
data/lib/treat/formatters/readers/odt.rb +1 -2
data/lib/treat/formatters/readers/pdf.rb +9 -1
data/lib/treat/formatters/readers/xml.rb +40 -0
data/lib/treat/formatters/serializers/xml.rb +50 -14
data/lib/treat/formatters/serializers/yaml.rb +7 -2
data/lib/treat/formatters/unserializers/xml.rb +33 -7
data/lib/treat/formatters/visualizers/dot.rb +90 -20
data/lib/treat/formatters/visualizers/short_value.rb +2 -2
data/lib/treat/formatters/visualizers/standoff.rb +2 -2
data/lib/treat/formatters/visualizers/tree.rb +1 -1
data/lib/treat/formatters/visualizers/txt.rb +13 -4
data/lib/treat/group.rb +16 -10
data/lib/treat/helpers/linguistics_loader.rb +18 -0
data/lib/treat/inflectors.rb +10 -0
data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
data/lib/treat/inflectors/declensions/english.rb +319 -0
data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
data/lib/treat/install.rb +59 -0
data/lib/treat/kernel.rb +18 -8
data/lib/treat/languages.rb +18 -11
data/lib/treat/languages/arabic.rb +4 -2
data/lib/treat/languages/chinese.rb +6 -2
data/lib/treat/languages/dutch.rb +16 -0
data/lib/treat/languages/english.rb +47 -19
data/lib/treat/languages/french.rb +8 -5
data/lib/treat/languages/german.rb +9 -6
data/lib/treat/languages/greek.rb +16 -0
data/lib/treat/languages/italian.rb +6 -3
data/lib/treat/languages/polish.rb +16 -0
data/lib/treat/languages/portuguese.rb +16 -0
data/lib/treat/languages/russian.rb +16 -0
data/lib/treat/languages/spanish.rb +16 -0
data/lib/treat/languages/swedish.rb +16 -0
data/lib/treat/languages/tags.rb +377 -0
data/lib/treat/lexicalizers.rb +34 -23
data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
data/lib/treat/lexicalizers/tag/brill.rb +35 -40
data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
data/lib/treat/processors.rb +8 -8
data/lib/treat/processors/chunkers/txt.rb +4 -4
data/lib/treat/processors/parsers/enju.rb +114 -99
data/lib/treat/processors/parsers/stanford.rb +109 -41
data/lib/treat/processors/segmenters/punkt.rb +17 -18
data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
data/lib/treat/processors/segmenters/stanford.rb +38 -37
data/lib/treat/processors/segmenters/tactful.rb +5 -4
data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
data/lib/treat/processors/tokenizers/perl.rb +2 -2
data/lib/treat/processors/tokenizers/punkt.rb +6 -2
data/lib/treat/processors/tokenizers/stanford.rb +25 -24
data/lib/treat/processors/tokenizers/tactful.rb +1 -2
data/lib/treat/proxies.rb +2 -35
data/lib/treat/registrable.rb +17 -22
data/lib/treat/sugar.rb +11 -11
data/lib/treat/tree.rb +27 -17
data/lib/treat/viewable.rb +29 -0
data/lib/treat/visitable.rb +1 -1
data/test/tc_entity.rb +56 -49
data/test/tc_extractors.rb +41 -18
data/test/tc_formatters.rb +7 -8
data/test/tc_inflectors.rb +19 -24
data/test/tc_lexicalizers.rb +12 -19
data/test/tc_processors.rb +26 -12
data/test/tc_resources.rb +2 -7
data/test/tc_treat.rb +20 -22
data/test/tc_tree.rb +4 -4
data/test/tests.rb +3 -5
data/test/texts.rb +13 -14
data/tmp/INFO +1 -0
metadata +78 -158
data/bin/INFO +0 -1
data/examples/benchmark.rb +0 -81
data/examples/keywords.rb +0 -148
data/lib/treat/detectors.rb +0 -31
data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
data/lib/treat/detectors/format/file.rb +0 -36
data/lib/treat/detectors/language/what_language.rb +0 -29
data/lib/treat/entities/constituents.rb +0 -15
data/lib/treat/entities/sentence.rb +0 -8
data/lib/treat/extractors/named_entity/abner.rb +0 -20
data/lib/treat/extractors/named_entity/stanford.rb +0 -174
data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
data/lib/treat/extractors/time/chronic.rb +0 -20
data/lib/treat/extractors/time/native.rb +0 -18
data/lib/treat/formatters/readers/gocr.rb +0 -26
data/lib/treat/formatters/readers/ocropus.rb +0 -31
data/lib/treat/formatters/visualizers/html.rb +0 -13
data/lib/treat/formatters/visualizers/inspect.rb +0 -20
data/lib/treat/inflectors/declensions/en.rb +0 -18
data/lib/treat/languages/categories.rb +0 -5
data/lib/treat/languages/english/categories.rb +0 -23
data/lib/treat/languages/english/tags.rb +0 -352
data/lib/treat/languages/xinhua.rb +0 -12
data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
data/lib/treat/string.rb +0 -5
data/test/tc_detectors.rb +0 -26

data/lib/treat/extractors/statistics/frequency_in.rb CHANGED

@@ -2,20 +2,13 @@ module Treat
   module Extractors
     module Statistics
       class FrequencyIn
-        DefaultOptions = {type: nil}
-        def self.statistics(entity, options={})
+        DefaultOptions = { :parent => nil }
+        # Find the frequency of a given string value.
+        def self.statistics(entity, options = {})
           options = DefaultOptions.merge(options)
-          if entity.is_leaf?
-            w = entity.value.downcase
-            if entity.token_registry(options[:type])[:value][w].nil?
-              0
-            else
-              entity.token_registry(options[:type])[:value][w].size
-            end
-          else
-            raise Treat::Exception,
-            'Cannot get the frequency of a non-terminal entity.'
-          end
+          tr = entity.token_registry(options[:parent])
+          tv = tr[:value][entity.value]
+          tv ? tv.size : 1
         end
       end
     end

data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} RENAMED

@@ -6,7 +6,7 @@ module Treat
         # inside the parent entity with type entity_type.
         # Not implemented.
         def self.statistics(entity, options = {})
-          entity.parent.children.index(entity)
+          entity.parent.children.index(entity)      ## Fix - ancestor_w_type
         end
       end
     end

data/lib/treat/extractors/statistics/tf_idf.rb CHANGED

@@ -1,34 +1,102 @@
 module Treat
   module Extractors
     module Statistics
-      # "The term count in the given document is simply the
-      # number of times a given term appears in that document.
-      # This count is usually normalized to prevent a bias
-      # towards longer documents (which may have a higher
-      # term count regardless of the actual importance of
-      # that term in the document) to give a measure of the
-      # importance of the term t within the particular document d.
-      # Thus we have the term frequency tf(t,d), defined in the
+      # "The term count in the given document is simply the
+      # number of times a given term appears in that document.
+      # This count is usually normalized to prevent a bias
+      # towards longer documents (which may have a higher
+      # term count regardless of the actual importance of
+      # that term in the document) to give a measure of the
+      # importance of the term t within the particular document d.
+      # Thus we have the term frequency tf(t,d), defined in the
       # simplest case as the occurrence count of a term in a document.
-      #
-      # The inverse document frequency is a measure of the general
-      # importance of the term (obtained by dividing the total number
-      # of documents by the number of documents containing the term,
+      #
+      # The inverse document frequency is a measure of the general
+      # importance of the term (obtained by dividing the total number
+      # of documents by the number of documents containing the term,
       # and then taking the logarithm of that quotient)."
       #
       # (From Wikipedia)
       class TfIdf
-        DefaultOptions = { type: nil }
+        DefaultOptions = {
+          :tf => :natural,
+          :idf => :logarithm,
+          :remove_common_words => true,
+          :precision => 4
+        }
+        Algorithms = {
+          :tf => {
+            :natural => lambda { |tf| tf },
+            :logarithm => lambda { |tf| Math.log(1 + tf) },
+            :sqrt =>lambda { |tf| Math.sqrt(tf) }
+          },
+          :idf => {
+            :logarithm => lambda { |n,df| Math.log(n/(1 + df)) },
+            :none => lambda { |n,idf| 1 }
+          }
+        }
+        # Optimization caches for tf idf.
+        @@n = {} # Number of documents in the collection (n).
+        @@df= {} # Number of documents that have a given value (document count).
+        @@f = {} # Number of times a word appears in a given document (term count).
+        @@wc = {} # Number of words in a given document (word count).
+        @@cw = {} # Common words to filter out.
         def self.statistics(entity, options={})
-          tf = entity.frequency_in(:document)
-          tf = tf / entity.root.word_count
-          d = entity.root.document_count
-          i = 0
-          entity.root.each_document do |document|
-            i += 1 if document.frequency_of(entity.value)
+          l = Treat::Languages.get(entity.language)
+          if l.const_defined?(:CommonWords)
+            @@cw[entity.language] = l.const_get(:CommonWords)
+            return 0 if @@cw[entity.language].include?(entity.value)
+          end
+          return 0 if entity.value.length <= 2
+          options = DefaultOptions.merge(options)
+          lambdas = options.partition do |k,v|
+            [:tf, :idf, :normalization].include?(k)
+          end[0]
+          lambdas.each do |opt,val|
+            if opt.is_a?(Symbol)
+              if Algorithms[opt][val]
+                options[opt] = Algorithms[opt][val]
+              else
+                raise Treat::Exception,
+                "The specified algorithm '#{val}' "+
+                "to calculate #{opt} does not exist."
+              end
+            end
+          end
+          collection = entity.parent_collection
+          document = entity.parent_document
+          dc = collection.document_count
+          if !collection || !document
+            raise Treat::Exception,
+            "Tf*Idf requires a collection with documents."
+          end
+          val = entity.value.downcase
+          @@n[collection.id] = dc if @@n[collection.id].nil?
+          @@df[collection.id] ||= {}
+          if @@df[collection.id][val].nil?
+            df = 0
+            collection.each_document do |doc|
+               @@f[doc.id] ||= {}
+              if @@f[doc.id][val].nil?
+                @@f[doc.id][val] =
+                doc.token_registry[:value][val] ?
+                doc.token_registry[:value][val].size : 0
+              end
+              df += 1 if @@f[doc.id][val] > 0
+            end
+            @@df[collection.id][val] = df
+          end
+          f = @@f[document.id][entity.value].to_f
+          df = @@df[collection.id][entity.value].to_f
+          tf = options[:tf].call(f).to_f
+          if options[:normalize_word_count]
+            @@wc[document.id] ||= document.word_count
+            tf /= @@wc[document.id]
           end
-          idf = ::Math.log(d.to_f/(i.to_f + 1)).abs
-          tf.to_f/idf.to_f
+          n = @@n[collection.id].to_f
+          idf = options[:idf].call(n, df)
+          tf_idf = tf * idf
+          tf_idf.abs.round(options[:precision])
         end
       end
     end

data/lib/treat/extractors/statistics/transition_matrix.rb CHANGED

@@ -4,11 +4,11 @@ module Treat
       # Experimental algorithm to generate transition matrices.
       class TransitionMatrix
         DefaultOptions = {
-          normalize: true,
-          features: [:tag],
-          condition: lambda { |e| true },
-          entity_types: [:word],
-          relationships: [:parent, :right, :children]
+          :normalize => true,
+          :features => [:tag],
+          :condition => lambda { |e| true },
+          :entity_types => [:word],
+          :relationships => [:parent, :right, :children]
         }
         # Find the transition matrix.
         def self.statistics(entity, options={})
@@ -34,7 +34,7 @@ module Treat
             next unless options[:condition].call(target)
             # Initialize the empty transition matrix.
             # Calculate the transition probabilities.
             options[:features].each do |f1|
@@ -57,16 +57,16 @@ module Treat
                     end
                   end
-                  tm[f1][v1][:edge] = empty.call
+                  tm[f1][v1][:dependency] = empty.call
-                  target.edges.each do |id, edge_type|
+                  target.dependencies.each do |dependency|
                     s = target.ancestor_with_type :sentence
                     if s
-                      x = s.find(id)
+                      x = s.find(dependency.target)
                       next unless relative.has?(f2)
                       v2 = x.send(f2)
-                      tm[f1][v1][:edge][f2][v2] ||= 0.0
-                      tm[f1][v1][:edge][f2][v2] += 1.0
+                      tm[f1][v1][:dependency][f2][v2] ||= 0.0
+                      tm[f1][v1][:dependency][f2][v2] += 1.0
                     end
                   end

data/lib/treat/extractors/statistics/transition_probability.rb CHANGED

@@ -34,14 +34,14 @@ module Treat
                 end
               end
-              entity.edges.each do |id, edge|
+              entity.dependencies.each do |dependency|
                 s = entity.ancestor_with_type :sentence
                 if s
-                  x = s.find(id)
+                  x = s.find(dependency.target)
                   next unless h.has?(f2)
                   v2 = x.send(f2)
-                  if tm[f1][v1][:edge][f2][v2]
-                    score += tm[f1][v1][:edge][f2][v2]
+                  if tm[f1][v1][:dependency][f2][v2]
+                    score += tm[f1][v1][:dependency][f2][v2]
                     count += 1
                   end
                 end

data/lib/treat/extractors/time/nickel.rb CHANGED

@@ -1,7 +1,7 @@
 module Treat
   module Extractors
     module Time
-      # A wrapper for the 'nickel' gem, which parses
+      # A wrapper for the 'nickel' gem, which parses
       # times and dates and supplies additional information
       # concerning these. The additional information supplied
       # that this class annotates entities with is:
@@ -11,7 +11,7 @@ module Treat
       # - start_time: a DateTime object representing the beginning of
       #   an event.
       # - end_time: a DateTime object representing the end of an event.
-      #
+      #
       # Examples of values for time_recurrence are:
       #
       # - single: "lunch with megan tomorrow at noon"
@@ -19,33 +19,51 @@ module Treat
       # - weekly: "math class every wed from 8-11am"
       # - daymonthly: "open bar at joes the first friday of every month"
       # - datemonthly: "pay credit card bill on the 22nd of each month"
-      #
+      #
       # Project website: http://naturalinputs.com/
-      module Nickel
+      class Nickel
         require 'date'
         silence_warnings { require 'nickel' }
         # Extract time information from a bit of text.
         def self.time(entity, options = {})
-          n = silence_warnings { ::Nickel.parse(entity.to_s) }
+          return nil if entity.to_s.strip == ''
+          n = nil
+          silence_warnings { n = ::Nickel.parse(entity.to_s.strip) }
           occ = n.occurrences[0]
+          return nil unless occ
           rec = occ.type.to_s.gsub('single', 'once').intern
-          entity.set :time_recurrence, rec
+          time_recurrence = rec
           interval = occ.interval ? occ.interval : :none
-          entity.set :time_recurrence_interval, interval
+          time_recurrence_interval = interval
           s = [occ.start_date, occ.start_time]
           ds = [s[0].year, s[0].month, s[0].day] if s[0]
-          #ts = [s[1].hour, s[1].min, s[1].sec] if s[1]
+          ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
           e = [occ.end_date, occ.end_time]
           de = [e[0].year, e[0].month, e[0].day] if e[0]
-          #te = [e[1].hour, e[1].min, e[1].sec] if e[1]
+          te = [e[1].hour, e[1].minute, e[1].second] if e[1]
+          start_time = ::DateTime.civil(*ds) if ds && !ts
+          start_time = ::DateTime.civil(*ds, *ts) if ds && ts
+          end_time = ::DateTime.civil(*de) if de && !te
+          end_time = ::DateTime.civil(*de, *te) if de && te
-          entity.set :start_time, ::DateTime.civil(*ds) if ds
-          entity.set :end_time, ::DateTime.civil(*de) if de
+          time = Treat::Features::Time.new(             # Fix - time message.
+          start_time, end_time, time_recurrence,
+          time_recurrence_interval
+          )
-          entity.start_time
+          # Keeps the lowest-level time annotations
+          # that do not conflict with the highest-level
+          # time annotation.
+          entity.ancestors_with_type(:phrase).each do |a|
+            unless a.id == entity.id || a.children[0].size == 0
+              a.unset(:time)
+            end
+          end
+          time
         end
       end
     end

data/lib/treat/extractors/topic_words/lda.rb CHANGED

@@ -29,26 +29,26 @@ module Treat
         end
         # Default options for the LDA algorithm.
         DefaultOptions = {
-          topics: 20,
-          words_per_topic: 10,
-          iterations: 20
+          :num_topics => 20,
+          :words_per_topic => 10,
+          :iterations => 20
         }
         # Retrieve the topic words of a collection.
         def self.topic_words(collection, options = {})
           options = DefaultOptions.merge(options)
           # Create a corpus with the collection
           sections = collection.sections.collect do |t|
-            t.to_s.encode_compliant('UTF-8')    # fix
+            t.to_s.encode('UTF-8', :invalid => :replace,
+            :undef => :replace, :replace => "?")            # Fix
           end
           corpus = Lda::TextCorpus.new(sections)
           # Create an Lda object for training
           lda = Lda::Lda.new(corpus)
-          lda.num_topics = options[:topics]
+          lda.num_topics = options[:num_topics]
           lda.max_iter = options[:iterations]
           # Run the EM algorithm using random starting points
-          silence_streams(STDOUT, STDERR) { lda.em('random') }
+          silence_stdout { lda.em('random') }
           # Load the vocabulary.
           if options[:vocabulary]
             lda.load_vocabulary(options[:vocabulary])
@@ -57,8 +57,8 @@ module Treat
           # Get the topic words and annotate the section.
           topic_words = lda.top_words(options[:words_per_topic])
-          topic_words.each do |i, words|
-            collection.each_word do |word|
+          collection.each_word do |word|
+            topic_words.each do |i, words|
               if words.include?(word)
                 word.set :is_topic_word?, true
                 word.set :topic_id, i

data/lib/treat/extractors/topics/reuters.rb CHANGED

@@ -22,24 +22,27 @@ module Treat
         def self.topics(text, options = {})
           stems = []
           @@reduce = 0
-          text.to_s.tokenize.words.collect! do |tok|
+          unless text.words.size > 0
+            raise Treat::Exception,
+            "Annotator 'topics' requires processor 'tokenize'."
+          end
+          text.words.collect! do |tok|
             stem = tok.stem.downcase
             val = tok.value.downcase
             stems << stem
             unless stem == val
               stems << val
-              @@reduce += 1
             end
           end
           get_topics
-          topics = score_words(@@industry, stems)
-          topics = topics.merge(score_words(@@region, stems))
-          topics = topics.merge(score_words(@@topics, stems))
+          score_words(@@industry, stems) +
+          score_words(@@region, stems) +
+          score_words(@@topics, stems)
           #Treat::Feature.new(topics)
         end
         # Read the topics from the XML files.
         def self.get_topics
-          return unless @@industry.empty?
+          return unless @@industry.size == 0
           @@industry = read_xml(Treat.lib + '/treat/extractors/topics/reuters/industry.xml')
           @@region = read_xml(Treat.lib + '/treat/extractors/topics/reuters/region.xml')
           @@topics = read_xml(Treat.lib + '/treat/extractors/topics/reuters/topics.xml')
@@ -65,21 +68,17 @@ module Treat
             count_hash[cat_name] ||= 0
             word_list.each do |word|
               unless hash[cat_name][word].nil?
-                count_hash[cat_name] =
-                count_hash[cat_name] +
+                count_hash[cat_name] +=
                 hash[cat_name][word]
               end
             end
           end
-          count_hash = best_of_hash(count_hash,
-          (word_list.size.to_f - @@reduce.to_f)  / 250.0,
-          100.0 / (1 + word_list.size.to_f - @@reduce.to_f))
-          count_hash
+          count_hash = best_of_hash(count_hash)
+          count_hash.keys
         end
-        def self.best_of_hash(hash, cutoff = 1, scale = 1)
-          cutoff = 1 if cutoff == 0
+        def self.best_of_hash(hash, cutoff = 0.0, scale = 1.0)
           ret = {}
-          hash.keys.each() do |key|
+          hash.keys.each do |key|
             if hash[key] > cutoff
               ret[key] = hash[key] * scale
               ret[key] = ret[key].round(2)

data/lib/treat/extractors/topics/reuters/region.xml CHANGED

@@ -13437,6 +13437,7 @@
  <word cat="CANADA" name="nuinsco" score="1.000000" />
  <word cat="CANADA" name="noverco" score="1.000000" />
  <word cat="CANADA" name="enscor" score="1.000000" />
+ <word cat="CANADA" name="ottawa" score="1.000000" />
  <word cat="CANADA" name="winnipegg" score="1.000000" />
  <word cat="CANADA" name="mantadoc" score="1.000000" />
  <word cat="CANADA" name="canmar" score="1.000000" />

data/lib/treat/features.rb ADDED

@@ -0,0 +1,7 @@
+module Treat
+  module Features
+    Time = Struct.new(:start, :end, :recurrence, :recurrence_interval)
+    Roles = Struct.new(:subject, :verb, :object, :patient, :agent)
+    Date = Struct.new(:year, :month, :day)
+  end
+end

data/lib/treat/formatters/readers/abw.rb CHANGED

@@ -18,7 +18,12 @@ module Treat
           end
           def text(s)
             if s != 'AbiWord' && s != 'application/x-abiword'
-              @plain_text << s if s.strip.length > 0
+              s.strip!
+              if s.length > 0
+                s += ' '
+                s += "\n\n" if s.length < 60
+              end
+              @plain_text << s
             end
           end
         end

data/lib/treat/formatters/readers/autoselect.rb CHANGED

@@ -6,10 +6,8 @@ module Treat
       # the appropriate reader based on the file
       # extension of the supplied document.
       class Autoselect
-        # A list of image extensions that should be routed to OCR.
+        # A list of image extensions that should be routed to Ocropus.
         ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
-        # Default options.
-        DefaultOptions = {:ocr => :ocropus}
         # Select the appropriate reader based on the format
         # of the filename in document.
         #
@@ -17,19 +15,20 @@ module Treat
         #
         # - :ocr_engine => :ocropus or :gocr (the OCR engine to use).
         def self.read(document, options)
-          options = DefaultOptions.merge(options)
           ext = document.file.split('.')[-1]
-          reader = ImageExtensions.include?(ext) ? 'ocropus' : ext
+          reader = ImageExtensions.include?(ext) ? 'image' : ext
           reader = 'html' if reader == 'htm'
           reader = 'yaml' if reader == 'yml'
           begin
             r = Treat::Formatters::Readers.const_get(cc(reader))
-          rescue NameError => e
+          rescue NameError
             puts e.message
             raise Treat::Exception,
             "Cannot find a reader for format: '#{ext}'."
           end
           document = r.read(document, options)
+          document.set :encoding, document.to_s.encoding.to_s.downcase
+          document
         end
       end
     end