RubyGems - treat - Versions diffs - 1.0.0 → 1.0.1 - Mend

treat 1.0.0 → 1.0.1

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (13)

data/README.md +1 -1
data/lib/treat.rb +1 -1
data/lib/treat/ai/classifiers/id3.rb +6 -4
data/lib/treat/classification.rb +10 -11
data/lib/treat/entities/abilities/buildable.rb +2 -3
data/lib/treat/entities/abilities/countable.rb +7 -5
data/lib/treat/entities/abilities/iterable.rb +9 -0
data/lib/treat/entities/abilities/magical.rb +10 -5
data/lib/treat/extractors/keywords/tf_idf.rb +21 -18
data/lib/treat/loaders/stanford.rb +1 -1
data/spec/document.rb +13 -0
data/spec/sandbox.rb +114 -3
metadata +2 -2

data/README.md CHANGED

@@ -19,7 +19,7 @@ Treat is a toolkit for natural language processing and computational linguistics
 **Resources**
-* Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/master/frames).
+* Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/frames).
 * See how to [install Treat](https://github.com/louismullie/treat/wiki/Installing-Treat).
 * Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Using-Treat).
 * Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing-to-Treat).

data/lib/treat.rb CHANGED

@@ -10,7 +10,7 @@ module Treat
   end
   # The current version of Treat.
-  VERSION = "1.0.0"
+  VERSION = "1.0.1"
   # Add methods to handle syntactic sugar,
   # language configuration options, and paths.

data/lib/treat/ai/classifiers/id3.rb CHANGED

@@ -5,22 +5,24 @@ class Treat::AI::Classifiers::ID3
   @@classifiers = {}
   def self.classify(entity, options = {})
     set = options[:training]
     cl = set.classification
     if !@@classifiers[cl]
       dec_tree = DecisionTree::ID3Tree.new(
-      set.labels, set.items,
-      cl.default, :continuous)
+      set.labels.map { |l| l.to_s }, set.items,
+      cl.default, cl.mode)
       dec_tree.train
     else
       dec_tree = @@classifiers[cl]
     end
+    cl.export_item(entity, false).inspect
     dec_tree.predict(
       cl.export_item(entity, false)
-    )[0]
+    )
   end

data/lib/treat/classification.rb CHANGED

@@ -4,18 +4,19 @@ class Treat::Classification
   attr_reader :features
   attr_reader :question
   attr_reader :labels
+  attr_reader :mode
   attr_reader :default
-  def initialize(type_or_types, feature_or_features, question, default = false)
+  def initialize(type_or_types, feature_or_features,
+    question, default = false, mode = :continuous)
     @types, @features,
     @question, @default =
     [*type_or_types],
     [*feature_or_features],
     question, default
+    @mode = mode
     @labels = []
     @features.each do |cmd|
       if cmd.is_a?(Array)
         @labels << cmd[0]
@@ -23,11 +24,9 @@ class Treat::Classification
         @labels << cmd
       end
     end
   end
   def export_item(e, include_question = true)
     line = []
     @features.each do |cmd|
@@ -46,16 +45,16 @@ class Treat::Classification
       end
     end
-    begin
-      if include_question
-        line << e.send(@question)
+    if include_question
+      if e.has?(@question)
+        line << e.get(@question)
+      else
+        line << @default
       end
-    rescue Treat::Exception
-      line << @default
     end
     line[-1] = '' if line[-1].nil?
     line
   end
 end

data/lib/treat/entities/abilities/buildable.rb CHANGED

@@ -95,7 +95,7 @@ module Treat::Entities::Abilities::Buildable
     f = Treat::Downloader.download(
     uri.scheme, uri.host, path, file)
-    options[:_default_format] = :html
+    options[:default_to] ||= :html
     e = from_file(f, options)
     e.set :url, url
@@ -170,9 +170,8 @@ module Treat::Entities::Abilities::Buildable
       "point to a readable file."
     end
-    dflt = options[:_default_format]
     fmt = Treat::Formatters::Readers::Autoselect.
-    detect_format(file, dflt)
+    detect_format(file, options[:default_to])
     options[:_format] = fmt
     if fmt == :yaml || fmt == :yml ||

data/lib/treat/entities/abilities/countable.rb CHANGED

@@ -3,27 +3,29 @@ module Treat::Entities::Abilities::Countable
   # Find the position of the current entity
   # inside the parent entity, starting at 1.
   def position
     unless has_parent?
       raise Treat::Exception,
       "No parent to get position in."
     end
     parent.children.index(self) + 1
   end
+  # Find the position of this entity from
+  # the end of the parent entity.
+  def position_from_end
+    p = position
+    parent.size - p
+  end
   # Find the frequency of the entity in
   # the supplied parent or in the root
   # node if nil.
   def frequency_in(parent_type = nil)
     unless parent_type
       root.registry[:value][id]
     end
     registry(parent_type)[:value][value]
   end
   # Get the frequency of this entity's

data/lib/treat/entities/abilities/iterable.rb CHANGED

@@ -99,6 +99,15 @@ module Treat::Entities::Abilities::Iterable
   alias :ancestors_with_type :ancestors_with_types
+  # Number of children that have a given feature.
+  def num_children_with_feature(feature)
+    i = 0
+    each do |c|
+      i += 1 if c.has?(feature)
+    end
+    i
+  end
   # Return the first element in the array, warning if not
   # the only one in the array. Used for magic methods: e.g.,
   # the magic method "word" if called on a sentence with many

data/lib/treat/entities/abilities/magical.rb CHANGED

@@ -35,6 +35,9 @@ module Treat::Entities::Abilities::Magical
       entities_with_type($1.intern)
     elsif method =~ /^#{@@entities_regexp}$/
       first_but_warn(entities_with_type($1.intern), $1)
+    elsif method =~ /^first_#{@@entities_regexp}$/
+      e = entities_with_type($1.intern)
+      e ? e[0] : nil
     elsif method =~ /^parent_#{@@entities_regexp}$/
       ancestor_with_type($1.intern)
     elsif method =~ /^each_#{@@entities_regexp}$/
@@ -59,21 +62,23 @@ module Treat::Entities::Abilities::Magical
       entities_with_category($1.intern)
     elsif method =~ /^#{@@cats_regexp}$/
      first_but_warn(entities_with_category($1.intern), $1)
+    elsif method =~ /^first_#{@@cats_regexp}$/
+     e = entities_with_category($1.intern)
+     e ? e[0] : nil
     elsif method =~ /^#{@@cats_regexp}_count$/
       entities_with_category($1.intern).size
+    elsif method =~ /^(.*)_count$/
+      num_children_with_feature($1.intern)
     elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
       entities_with_feature($2.intern, args[0], $1)
     elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
       first_but_warn(entities_with_feature(
       $2.intern, args[0], $1.intern), $1)
-    elsif method =~ /^([a-z]*)_of_first_#{@@entities_regexp}$/
-      f = send(:"#{$2}s".intern).first
+    elsif method =~ /^([a-z]*)_of_(.*)$/
+      f = send($2.intern)
       f ? f.send($1.intern) : nil
     elsif method =~ /^frequency_in_#{@@entities_regexp}$/
       frequency_in($1.intern)
-      # first_word
-      # tag_of_first_verb
-      # tag_of_title
     else
       return :no_magic
     end

data/lib/treat/extractors/keywords/tf_idf.rb CHANGED

@@ -13,9 +13,9 @@ class Treat::Extractors::Keywords::TfIdf
     options = DefaultOptions.merge(options)
     tf_idfs = {}
     entity.each_word do |word|
-      word.check_has(:tf_idf, false)
-      tf_idfs[word] ||= word.get(:tf_idf)
+      tf_idfs[word] ||= word.tf_idf
     end
     tf_idfs = tf_idfs.
@@ -32,29 +32,32 @@ class Treat::Extractors::Keywords::TfIdf
       w = word[0].to_s
       next if keywords.include?(w)
-      entity.each_word_with_value(w) do |w2|
-        ps = w2.parent_phrase
-        if ps.has?(:keyword_count)
-          ps.set :keyword_count,
-          ps.keyword_count + 1
-        else
-          ps.set :keyword_count, 1
-        end
-        ps.set :keyword_density,
-        (ps.keyword_count / ps.size)
-      end
       break if i > options[:number]
       keywords << w
       i += 1
     end
+    entity.each_word do |word|
+      if keywords.include?(word.to_s)
+        word.set :is_keyword?, true
+        pp = entity.parent_phrase
+        next unless pp
+        if pp.has? :keyword_count
+          pp.set :keyword_count,
+          pp.keyword_count + 1
+        else
+          pp.set :keyword_count, 1
+        end
+      else
+        word.set :is_keyword?, false
+      end
+    end
     keywords
   end
 end

data/lib/treat/loaders/stanford.rb CHANGED

@@ -19,7 +19,7 @@ class Treat::Loaders
     StanfordCoreNLP.log_file =
     NULL_DEVICE if Treat.silence
-    StanfordCoreNLP.init
+    StanfordCoreNLP.bind
     @@loaded = true
   end

data/spec/document.rb CHANGED

@@ -36,11 +36,24 @@ describe Treat::Entities::Document do
         "a document with the contents of the file" do
           url = 'http://www.rubyinside.com/nethttp-cheat-sheet-2940.html'
           d = Treat::Entities::Document.build(url)
+          d.format.should eql :html
+          d.print_tree
           d.should be_an_instance_of Treat::Entities::Document
           d.to_s.index('Rubyist').should_not eql nil
         end
       end
+      context "when supplied with a url with no file extension" do
+        it "downloads the file the URL points to and opens " +
+        "a document with the contents of the file, assuming " +
+        "the downloaded file to be in HTML format" do
+          url = 'http://www.economist.com/node/21552208'
+          d = Treat::Entities::Document.build(url)
+          d.should be_an_instance_of Treat::Entities::Document
+          d.to_s.index('Ronnie Lupe').should_not eql nil
+        end
+      end
       context "when called with anything else than a " +
       "readable file name or url" do

data/spec/sandbox.rb CHANGED

@@ -1,5 +1,116 @@
+=begin
 require_relative '../lib/treat'
-c = Collection (Treat.spec + 'samples/mathematicians')
-c.do :chunk, :segment, :tokenize, :tf_idf, :keywords
-c.visualize :dot, :file => 'test2.dot', :remove_types => [:paragraph]
+def extract(sentences, n)
+  sentences.to_a.values_at(
+  *sentences.values
+  .each_with_index
+  .sort.reverse
+  .map(&:last)
+  .sort.take(n))
+  .map(&:first)
+end
+describe "#summarize" do
+  it "provides a summary of the text" do
+    Treat.debug = true
+    Treat.silence = true
+    context = Treat::DataSet.open('economist-context.yml')
+    content = Treat::DataSet.open('economist-content.yml')
+    c = Collection (Treat.spec + 'economist')
+    c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
+    c.each_document do |d|
+       sentences = {}
+      d.each_sentence do |sentence|
+        cx = sentence.classify(:training => context)
+        ct = sentence.classify(:training => content)
+        sentences[sentence] = cx[1] + ct[1]
+      end
+      puts
+      puts d.titles[0].to_s
+      puts
+      puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
+    end
+    c.serialize file: 'economist-coll.yaml'
+    c = Collection (Treat.spec + 'economist')
+    c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
+    # Topic word count ? Synonyms of keywords ?
+    # Time expressions?
+    classify_content = Treat::Classification.new(
+      :phrase,
+      [:word_count, :number_count,
+      :keyword_count, :name_tag_count],
+      :has_key_content?
+    )
+    classify_context = Treat::Classification.new(
+      :phrase,
+      [:position,
+      :position_from_end,
+      :type_of_parent_zone,
+      :value_of_first_word,
+      :tag_of_first_word
+      ],
+      :has_key_context?,
+      false,
+      :discrete
+    )
+    c.each_sentence do |s|
+      puts s.to_s
+      a = STDIN.gets.to_s.strip
+      if a == ''
+        s.set :has_key_content?, false
+        s.set :has_key_context?, false
+      else
+        s.set :has_key_content?, true
+        s.set :has_key_context?, true
+      end
+    end
+    context = c.export(classify_context)
+    content = c.export(classify_content)
+    context.save('economist-context.yml')
+    content.save('economist-content.yml')
+    context = Treat::DataSet.open('economist-context.yml')
+    content = Treat::DataSet.open('economist-content.yml')
+    c.each_document do |d|
+      sentences = {}
+      d.each_sentence do |sentence|
+        cx = sentence.classify(:training => context)
+        ct = sentence.classify(:training => content)
+        sentences[sentence] = cx[1] + ct[1]
+      end
+      puts
+      puts d.titles[0].to_s
+      puts
+      puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
+    end
+  end
+end
+=end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: treat
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.0.1
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-05 00:00:00.000000000 Z
+date: 2012-04-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rubyzip