RubyGems - treat - Versions diffs - 1.0.1 → 1.0.2 - Mend

treat 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

data/lib/treat/ai/classifiers/id3.rb +1 -2
data/lib/treat/ai/classifiers/mlp.rb +30 -0
data/lib/treat/ai.rb +1 -1
data/lib/treat/classification.rb +11 -8
data/lib/treat/data_set.rb +9 -0
data/lib/treat/dependencies.rb +2 -1
data/lib/treat/entities/abilities/buildable.rb +1 -1
data/lib/treat/entities/abilities/countable.rb +1 -1
data/lib/treat/entities/abilities/iterable.rb +3 -5
data/lib/treat/entities/abilities/registrable.rb +10 -0
data/lib/treat/entities/entity.rb +0 -10
data/lib/treat/extractors/keywords/tf_idf.rb +2 -9
data/lib/treat/formatters/readers/autoselect.rb +6 -11
data/lib/treat/formatters/serializers/mongo.rb +64 -0
data/lib/treat/formatters/serializers/xml.rb +10 -4
data/lib/treat/formatters/unserializers/xml.rb +6 -0
data/lib/treat/kernel.rb +1 -1
data/lib/treat/processors/tokenizers/perl.rb +4 -0
data/lib/treat.rb +1 -1
data/spec/entity.rb +47 -12
data/spec/sandbox.rb +16 -108
metadata +6 -5

data/lib/treat/ai/classifiers/id3.rb CHANGED Viewed

@@ -14,12 +14,11 @@ class Treat::AI::Classifiers::ID3
       set.labels.map { |l| l.to_s }, set.items,
       cl.default, cl.mode)
       dec_tree.train
+      @@classifiers[cl] = dec_tree
     else
       dec_tree = @@classifiers[cl]
     end
-    cl.export_item(entity, false).inspect
     dec_tree.predict(
       cl.export_item(entity, false)
     )

data/lib/treat/ai/classifiers/mlp.rb ADDED Viewed

@@ -0,0 +1,30 @@
+# Currently, this MLP is limited to 1 output.
+class Treat::AI::Classifiers::MLP
+  require 'ai4r'
+  @@mlps = {}
+  def self.classify(entity, options = {})
+    set = options[:training]
+    cl = set.classification
+    if !@@mlps[cl]
+      net = Ai4r::NeuralNetwork::
+      Backpropagation.new([cl.labels.size, 3, 1])
+      set.items.each do |item|
+        inputs = item[0..-2]
+        outputs = [item[-1]]
+        net.train(inputs, outputs)
+      end
+      @@mlps[cl] = net
+    else
+      net = @@mlps[cl]
+    end
+    net.eval(cl.export_item(entity, false))[0]
+  end
+end

data/lib/treat/ai.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module Treat::AI
   module Classifiers
     extend Treat::Groupable
-    self.type = :annotator
+    self.type = :computer
     self.targets = [:entity]
     self.default = :id3
   end

data/lib/treat/classification.rb CHANGED Viewed

@@ -27,21 +27,25 @@ class Treat::Classification
   end
   def export_item(e, include_question = true)
     line = []
     @features.each do |cmd|
+      dflt = nil
       begin
         if cmd.is_a?(Array)
-          line << cmd[1].call(e)
+          if cmd.size == 3
+            r = cmd[1].call(e)
+            dflt = cmd[2]
+            line << (r ? r : dflt)
+          elsif cmd.size == 2
+            r = e.send(cmd[0])
+            dflt = cmd[1]
+            line << (r ? r : dflt)
+          end
         else
           line << e.send(cmd)
         end
-      rescue Treat::Exception
-        dflt = (
-        (cmd.is_a?(Array) && cmd[2]) ?
-        cmd[2] : nil
-        )
-        line << dflt
       end
     end
@@ -53,7 +57,6 @@ class Treat::Classification
       end
     end
-    line[-1] = '' if line[-1].nil?
     line
   end

data/lib/treat/data_set.rb CHANGED Viewed

@@ -39,4 +39,13 @@ class Treat::DataSet
     end
   end
+  def to_ai4r
+    Ai4r::Data::DataSet.new(
+      :data_items => items,
+      :data_labels => (
+        labels.map { |l| l.to_s } +
+        [classification.question.to_s]
+    ))
+  end
 end

data/lib/treat/dependencies.rb CHANGED Viewed

@@ -11,7 +11,8 @@ class Treat::Dependencies
     ['linguistics', '>= 1.0.9', 'retrieve the inflection of nouns, verbs and numbers in English'],
     ['punkt-segmenter', '>= 0.9.1', 'segment texts into sentences'],
     ['chronic', '>= 0.6.7', 'detect date and time in text'],
-    ['decisiontree', '>= 0.3.0', 'perform decision tree classification of text entities']
+    ['decisiontree', '>= 0.3.0', 'perform decision tree classification of text entities'],
+    ['ai4r', '>= 1.11', 'perform different kinds of classification tasks on text entities']
   ]
   Binary = [

data/lib/treat/entities/abilities/buildable.rb CHANGED Viewed

@@ -98,7 +98,7 @@ module Treat::Entities::Abilities::Buildable
     options[:default_to] ||= :html
     e = from_file(f, options)
-    e.set :url, url
+    e.set :url, uri.to_s
     e
   end

data/lib/treat/entities/abilities/countable.rb CHANGED Viewed

@@ -14,7 +14,7 @@ module Treat::Entities::Abilities::Countable
   # the end of the parent entity.
   def position_from_end
     p = position
-    parent.size - p
+    parent.children.size - p
   end
   # Find the frequency of the entity in

data/lib/treat/entities/abilities/iterable.rb CHANGED Viewed

@@ -30,10 +30,8 @@ module Treat::Entities::Abilities::Iterable
     a = []
     type = :entity unless type
     each_entity(type) do |e|
-      a << e if (e.has?(feature) &&
-      e.features[feature] == value) ||
-      ([:id, :value, :type].include?(feature) &&
-      e.send(feature) == value)
+      r = e.send(feature)
+      a << e if r == value
     end
     a
   end
@@ -51,7 +49,7 @@ module Treat::Entities::Abilities::Iterable
   # Returns an array of the entities with the given
   # category.
   def entities_with_category(category, type = nil)
-    entities_with_feature(:category, type)
+    entities_with_feature(:category, category, type)
   end
   # Returns the first ancestor of this entity

data/lib/treat/entities/abilities/registrable.rb CHANGED Viewed

@@ -5,6 +5,16 @@ module Treat::Entities::Abilities::Registrable
   # Registers a token in the @registry hash.
   def register(entity)
+    unless @registry
+      @count = 0
+      @registry = {
+        :value => {},
+        :position => {},
+        :type => {},
+        :id => {}
+      }
+    end
     if entity.is_a?(Treat::Entities::Token) ||
       entity.is_a?(Treat::Entities::Phrase)
       val = entity.to_s.downcase

data/lib/treat/entities/entity.rb CHANGED Viewed

@@ -64,18 +64,8 @@ module Treat::Entities
       super(value, id)
       @type = :entity if self == Entity
       @type ||= ucc(cl(self.class)).intern
-      unless is_a?(Treat::Entities::Token)
-        @count = 0
-        @registry = {
-          :id => {},
-          :value => {},
-          :type => {},
-          :position => {}
-        }
-      end
     end
     # Add an entity to the current entity.
     # Registers the entity in the root node
     # token registry if the entity is a leaf.

data/lib/treat/extractors/keywords/tf_idf.rb CHANGED Viewed

@@ -41,17 +41,10 @@ class Treat::Extractors::Keywords::TfIdf
     entity.each_word do |word|
       if keywords.include?(word.to_s)
-        word.set :is_keyword?, true
+        word.set :keyword, true
         pp = entity.parent_phrase
-        next unless pp
-        if pp.has? :keyword_count
-          pp.set :keyword_count,
-          pp.keyword_count + 1
-        else
-          pp.set :keyword_count, 1
-        end
       else
-        word.set :is_keyword?, false
+        word.set :keyword, false
       end
     end

data/lib/treat/formatters/readers/autoselect.rb CHANGED Viewed

@@ -15,21 +15,16 @@ class Treat::Formatters::Readers::Autoselect
     document.read(detect_format(document.file, options[:default_to]))
   end
-  def self.detect_format(filename, default_to = DefaultOptions[:default_to])
+  def self.detect_format(filename, default_to = nil)
+    default_to ||= DefaultOptions[:default_to]
     ext = filename.scan(ExtensionRegexp)
-    ext = (ext.is_a?(Array) && ext[0] && ext[0][0]) ?
-    ext[0][0] : ''
-    format =
-    ImageExtensions.include?(ext) ?
-    'image' : ext
-    # Humanize extensions.
+    ext = (ext.is_a?(Array) && ext[0] && ext[0][0]) ? ext[0][0] : ''
+    format = ImageExtensions.include?(ext) ? 'image' : ext
     format = 'html' if format == 'htm'
     format = 'yaml' if format == 'yml'
-    format = default_to if format == ''
+    format = default_to if format.to_s == ''
     format.intern

data/lib/treat/formatters/serializers/mongo.rb ADDED Viewed

@@ -0,0 +1,64 @@
+# Stores an entity in a Mongo collection.
+class Treat::Formatters::Serializers::Mongo
+  # Reauire the Mongo DB
+  require 'mongo'
+  # Serialize an entity tree in XML format.
+  #
+  # Options:
+  # - (String) :file => a file to write to.
+  def self.serialize(entity, options = {})
+    unless options[:database]
+      raise Treat::Exception,
+      'Must supply the database name.'
+    end
+    @@conn ||= Mongo::Connection.new
+    @@db ||= @@conn[options[:database]]
+    path = []
+    entity.each_ancestor do |ancestor|
+      path << [ancestor.type, ancestor.id]
+    end
+    path = path.reverse
+    target = @@db
+    path.each do |type_id|
+      coll = @@db[type_id[0]][type_id[1]]
+    end
+    # Store path
+    Treat::Entities.list.each do |type|
+      type = entity.type.to_s
+      type = (type == 'entity') ? 'entities' : (type + 's')
+      doc = coll[type]
+      features = {}
+      features['id'] = entity.id
+      features['value'] = entity.value
+      entity.features.each_pair do |feature, value|
+        if value.is_a? Treat::Entities::Entity
+          value = value.id
+        elsif value.is_a?(Array) || value.is_a?(Hash)
+          value = value.inspect
+        else
+          value = value.to_s
+        end
+        features[feature.to_s] = value
+      end
+      doc.insert(features)
+    end
+  end
+end

data/lib/treat/formatters/serializers/xml.rb CHANGED Viewed

@@ -8,12 +8,14 @@ class Treat::Formatters::Serializers::XML
   # Options:
   # - (String) :file => a file to write to.
   def self.serialize(entity, options = {})
-    options = options.merge({:indent => 0}) if options[:indent].nil?
+    if options[:indent].nil?
+      options = options.merge({:indent => 0})
+    end
     indent = options[:indent]
     if options[:indent] == 0
       enc = entity.to_s.encoding.to_s.downcase
-      string = "<?xml version=\"1.0\" encoding=\"#{enc}\" standalone=\"no\" ?>\n<treat>\n"
+      string = "<?xml version=\"1.0\" " +
+      "encoding=\"#{enc}\" ?>\n<treat>\n"
     else
       string = ''
     end
@@ -26,20 +28,24 @@ class Treat::Formatters::Serializers::XML
         if value.is_a? Treat::Entities::Entity
           attributes << "#{feature}='#{value.id}' "
         else
+          value = value.inspect if value.is_a?(Symbol)
           attributes << "#{feature}='#{escape(value)}' "
         end
       end
+      ############ To be refactored
       unless entity.dependencies.empty?
         attributes << "dependencies='"
         a = []
         entity.dependencies.each do |dependency|
-          a << ("{target: #{dependency.target}, type: #{dependency.type}, " +
+          a << ("{target: #{dependency.target}, "+
+          "type: #{dependency.type}, " +
           "directed: #{dependency.directed}, " +
           "direction: #{dependency.direction}}" )
         end
         # Structs.
         attributes << a.join(',') + "'"
       end
+      ############ End of ugly code
     end
     tag = entity.class.to_s.split('::')[-1].downcase
     string += "#{spaces}<#{tag}#{attributes}>"

data/lib/treat/formatters/unserializers/xml.rb CHANGED Viewed

@@ -36,6 +36,7 @@ module Treat::Formatters::Unserializers::XML
       id = nil; value = ''
       attributes = {}
       dependencies = []
       unless xml_reader.attributes.size == 0
         xml_reader.attributes.each_pair do |k,v|
           if k == 'id'
@@ -64,6 +65,11 @@ module Treat::Formatters::Unserializers::XML
           elsif k == 'value'
             value = v
           else
+            v = v[1..-1].intern if v[0] == ':'
+            v = v.to_i if v =~ /^[0-9]*$/
+            v = v.to_f if v =~ /^[0-9\.]*$/
+            v = false if v == 'false'
+            v = true if v == 'true'
             attributes[k.intern] = v
           end
         end

data/lib/treat/kernel.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module Kernel
   # A list of acronyms used in class names within
   # the program. These do not CamelCase; they
   # CAMELCase.
-  Acronyms = %w[xml html txt odt abw doc yaml uea lda pdf ptb dot ai id3 svo]
+  Acronyms = %w[xml html txt odt abw doc yaml uea lda pdf ptb dot ai id3 svo mlp]
   # A cache to optimize camel casing.
   @@cc_cache = {}

data/lib/treat/processors/tokenizers/perl.rb CHANGED Viewed

@@ -43,6 +43,8 @@ module Treat::Processors::Tokenizers::Perl
     # Replace all decimal points by ^^
     Treat::Helpers::DecimalPointEscaper.escape!(text)
+=begin
     # Translate some common extended ascii
     # characters to quotes
@@ -60,6 +62,8 @@ module Treat::Processors::Tokenizers::Perl
     text.gsub!(/\"(?=\s)/," '' ")
     #s{\"} { `` }g;
     text.gsub!(/\"(?=\s)/," `` ")
+=end
     # Isolate ellipses
     # s{\.\.\.}   { ... }g;
     text.gsub!(/\.\.\./,' ... ')

data/lib/treat.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module Treat
   end
   # The current version of Treat.
-  VERSION = "1.0.1"
+  VERSION = "1.0.2"
   # Add methods to handle syntactic sugar,
   # language configuration options, and paths.

data/spec/entity.rb CHANGED Viewed

@@ -113,7 +113,7 @@ describe Treat::Entities::Entity do
   describe "Exportable" do
     context "when supplied with a classification to export" do
-      classification = Treat::Classification.new(:word, :tag, :is_keyword?)
+      classification = Treat::Classification.new(:word, :tag, :is_keyword)
       it "returns a data set with the exported features" do
         ds = @sentence.export(classification)
         ds.classification.should eql classification
@@ -316,13 +316,14 @@ describe Treat::Entities::Entity do
   describe "Formatters" do
+    before do
+      @serializers = [:xml, :yaml] # Treat::Languages::All::Serializers
+      @txt = "The story of the fox. The quick brown fox jumped over the lazy dog."
+    end
     describe "#serialize" do
-      before :all do
-        @serializers = [:xml, :yaml] # Treat::Languages::All::Serializers
-        @txt = "The story of the fox. The quick brown fox jumped over the lazy dog."
-      end
       context "when called with a file to save to" do
         it "serializes a document to the supplied format" do
@@ -332,24 +333,58 @@ describe Treat::Entities::Entity do
             s = Treat::Entities::Paragraph.new(@txt)
             s.do(:segment, :tokenize)
             s.serialize(ser, :file => f)
+            File.delete(f)
+          end
+        end
+      end
+    end
+    describe "#unserialize" do
+      context "when called with a serialized file" do
+        it "reconstitutes the original entity" do
+          @serializers.each do |ser|
+            f = Treat.spec + 'test.' + ser.to_s
+            s = Treat::Entities::Paragraph.new(@txt)
+            s.set :test_int, 9
+            s.set :test_float, 9.9
+            s.set :test_string, 'hello'
+            s.set :test_sym, :hello
+            s.set :test_bool, false
+            s.do(:segment, :tokenize)
+            s.serialize(ser, :file => f)
             d = Treat::Entities::Document.build(f)
+            d.test_int.should eql 9
+            d.test_float.should eql 9.9
+            d.test_string.should eql 'hello'
+            d.test_sym.should eql :hello
+            d.test_bool.should eql false
             d.to_s.should eql @txt
             d.size.should eql s.size
             d.token_count.should eql s.token_count
             d.tokens[0].id.should eql s.tokens[0].id
             File.delete(f)
           end
         end
       end
     end
-    describe "#unserialize" do
-    end
   end
   describe "Extractors" do

data/spec/sandbox.rb CHANGED Viewed

@@ -1,116 +1,24 @@
-=begin
 require_relative '../lib/treat'
-def extract(sentences, n)
-  sentences.to_a.values_at(
-  *sentences.values
-  .each_with_index
-  .sort.reverse
-  .map(&:last)
-  .sort.take(n))
-  .map(&:first)
-end
+s = Sentence "Barack Obama was killed last night."
+s.tokenize
-describe "#summarize" do
+puts s.word_with_position(2).inspect
-  it "provides a summary of the text" do
-    Treat.debug = true
-    Treat.silence = true
+s.word_with_position(2).set :highlighted, 1
-    context = Treat::DataSet.open('economist-context.yml')
-    content = Treat::DataSet.open('economist-content.yml')
-    c = Collection (Treat.spec + 'economist')
-    c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
-    c.each_document do |d|
-       sentences = {}
-      d.each_sentence do |sentence|
-        cx = sentence.classify(:training => context)
-        ct = sentence.classify(:training => content)
-        sentences[sentence] = cx[1] + ct[1]
-      end
+cl = Treat::Classification.new(
+  :word,
+  [[:position, 0]],
+  :highlighted,
+  0
+)
-      puts
-      puts d.titles[0].to_s
-      puts
-      puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
-    end
+data_set = s.export(cl)
-    c.serialize file: 'economist-coll.yaml'
+s2 = Sentence ''
+w = Word 'Hello'
+s2 << w
+w.set :position, 2
-    c = Collection (Treat.spec + 'economist')
-    c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
-    # Topic word count ? Synonyms of keywords ?
-    # Time expressions?
-    classify_content = Treat::Classification.new(
-      :phrase,
-      [:word_count, :number_count,
-      :keyword_count, :name_tag_count],
-      :has_key_content?
-    )
-    classify_context = Treat::Classification.new(
-      :phrase,
-      [:position,
-      :position_from_end,
-      :type_of_parent_zone,
-      :value_of_first_word,
-      :tag_of_first_word
-      ],
-      :has_key_context?,
-      false,
-      :discrete
-    )
-    c.each_sentence do |s|
-      puts s.to_s
-      a = STDIN.gets.to_s.strip
-      if a == ''
-        s.set :has_key_content?, false
-        s.set :has_key_context?, false
-      else
-        s.set :has_key_content?, true
-        s.set :has_key_context?, true
-      end
-    end
-    context = c.export(classify_context)
-    content = c.export(classify_content)
-    context.save('economist-context.yml')
-    content.save('economist-content.yml')
-    context = Treat::DataSet.open('economist-context.yml')
-    content = Treat::DataSet.open('economist-content.yml')
-    c.each_document do |d|
-      sentences = {}
-      d.each_sentence do |sentence|
-        cx = sentence.classify(:training => context)
-        ct = sentence.classify(:training => content)
-        sentences[sentence] = cx[1] + ct[1]
-      end
-      puts
-      puts d.titles[0].to_s
-      puts
-      puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
-    end
-  end
-end
-=end
+puts w.classify(:mlp, :training => data_set).inspect

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: treat
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.2
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-07 00:00:00.000000000 Z
+date: 2012-04-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rubyzip
@@ -75,8 +75,7 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: 0.9.2
-description: ! ' Treat is a Ruby toolkit for text retrieval, information extraction
-  and natural language processing. '
+description: ! ' Treat is a full-fledged natural language processing toolkit for Ruby. '
 email:
 - louis.mullie@gmail.com
 executables: []
@@ -84,6 +83,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/treat/ai/classifiers/id3.rb
+- lib/treat/ai/classifiers/mlp.rb
 - lib/treat/ai.rb
 - lib/treat/categories.rb
 - lib/treat/categorizable.rb
@@ -128,6 +128,7 @@ files:
 - lib/treat/formatters/readers/pdf.rb
 - lib/treat/formatters/readers/txt.rb
 - lib/treat/formatters/readers/xml.rb
+- lib/treat/formatters/serializers/mongo.rb
 - lib/treat/formatters/serializers/xml.rb
 - lib/treat/formatters/serializers/yaml.rb
 - lib/treat/formatters/unserializers/autoselect.rb
@@ -244,5 +245,5 @@ rubyforge_project:
 rubygems_version: 1.8.21
 signing_key:
 specification_version: 3
-summary: A text retrieval, extraction and annotation toolkit for Ruby.
+summary: Text Retrieval, Extraction and Annotation Toolkit.
 test_files: []