treat 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,12 +14,11 @@ class Treat::AI::Classifiers::ID3
14
14
  set.labels.map { |l| l.to_s }, set.items,
15
15
  cl.default, cl.mode)
16
16
  dec_tree.train
17
+ @@classifiers[cl] = dec_tree
17
18
  else
18
19
  dec_tree = @@classifiers[cl]
19
20
  end
20
21
 
21
- cl.export_item(entity, false).inspect
22
-
23
22
  dec_tree.predict(
24
23
  cl.export_item(entity, false)
25
24
  )
@@ -0,0 +1,30 @@
1
+ # Currently, this MLP is limited to 1 output.
2
+ class Treat::AI::Classifiers::MLP
3
+
4
+ require 'ai4r'
5
+
6
+ @@mlps = {}
7
+
8
+ def self.classify(entity, options = {})
9
+
10
+ set = options[:training]
11
+ cl = set.classification
12
+
13
+ if !@@mlps[cl]
14
+ net = Ai4r::NeuralNetwork::
15
+ Backpropagation.new([cl.labels.size, 3, 1])
16
+ set.items.each do |item|
17
+ inputs = item[0..-2]
18
+ outputs = [item[-1]]
19
+ net.train(inputs, outputs)
20
+ end
21
+ @@mlps[cl] = net
22
+ else
23
+ net = @@mlps[cl]
24
+ end
25
+
26
+ net.eval(cl.export_item(entity, false))[0]
27
+
28
+ end
29
+
30
+ end
data/lib/treat/ai.rb CHANGED
@@ -2,7 +2,7 @@ module Treat::AI
2
2
 
3
3
  module Classifiers
4
4
  extend Treat::Groupable
5
- self.type = :annotator
5
+ self.type = :computer
6
6
  self.targets = [:entity]
7
7
  self.default = :id3
8
8
  end
@@ -27,21 +27,25 @@ class Treat::Classification
27
27
  end
28
28
 
29
29
  def export_item(e, include_question = true)
30
+
30
31
  line = []
31
32
 
32
33
  @features.each do |cmd|
34
+ dflt = nil
33
35
  begin
34
36
  if cmd.is_a?(Array)
35
- line << cmd[1].call(e)
37
+ if cmd.size == 3
38
+ r = cmd[1].call(e)
39
+ dflt = cmd[2]
40
+ line << (r ? r : dflt)
41
+ elsif cmd.size == 2
42
+ r = e.send(cmd[0])
43
+ dflt = cmd[1]
44
+ line << (r ? r : dflt)
45
+ end
36
46
  else
37
47
  line << e.send(cmd)
38
48
  end
39
- rescue Treat::Exception
40
- dflt = (
41
- (cmd.is_a?(Array) && cmd[2]) ?
42
- cmd[2] : nil
43
- )
44
- line << dflt
45
49
  end
46
50
  end
47
51
 
@@ -53,7 +57,6 @@ class Treat::Classification
53
57
  end
54
58
  end
55
59
 
56
- line[-1] = '' if line[-1].nil?
57
60
  line
58
61
  end
59
62
 
@@ -39,4 +39,13 @@ class Treat::DataSet
39
39
  end
40
40
  end
41
41
 
42
+ def to_ai4r
43
+ Ai4r::Data::DataSet.new(
44
+ :data_items => items,
45
+ :data_labels => (
46
+ labels.map { |l| l.to_s } +
47
+ [classification.question.to_s]
48
+ ))
49
+ end
50
+
42
51
  end
@@ -11,7 +11,8 @@ class Treat::Dependencies
11
11
  ['linguistics', '>= 1.0.9', 'retrieve the inflection of nouns, verbs and numbers in English'],
12
12
  ['punkt-segmenter', '>= 0.9.1', 'segment texts into sentences'],
13
13
  ['chronic', '>= 0.6.7', 'detect date and time in text'],
14
- ['decisiontree', '>= 0.3.0', 'perform decision tree classification of text entities']
14
+ ['decisiontree', '>= 0.3.0', 'perform decision tree classification of text entities'],
15
+ ['ai4r', '>= 1.11', 'perform different kinds of classification tasks on text entities']
15
16
  ]
16
17
 
17
18
  Binary = [
@@ -98,7 +98,7 @@ module Treat::Entities::Abilities::Buildable
98
98
  options[:default_to] ||= :html
99
99
 
100
100
  e = from_file(f, options)
101
- e.set :url, url
101
+ e.set :url, uri.to_s
102
102
  e
103
103
 
104
104
  end
@@ -14,7 +14,7 @@ module Treat::Entities::Abilities::Countable
14
14
  # the end of the parent entity.
15
15
  def position_from_end
16
16
  p = position
17
- parent.size - p
17
+ parent.children.size - p
18
18
  end
19
19
 
20
20
  # Find the frequency of the entity in
@@ -30,10 +30,8 @@ module Treat::Entities::Abilities::Iterable
30
30
  a = []
31
31
  type = :entity unless type
32
32
  each_entity(type) do |e|
33
- a << e if (e.has?(feature) &&
34
- e.features[feature] == value) ||
35
- ([:id, :value, :type].include?(feature) &&
36
- e.send(feature) == value)
33
+ r = e.send(feature)
34
+ a << e if r == value
37
35
  end
38
36
  a
39
37
  end
@@ -51,7 +49,7 @@ module Treat::Entities::Abilities::Iterable
51
49
  # Returns an array of the entities with the given
52
50
  # category.
53
51
  def entities_with_category(category, type = nil)
54
- entities_with_feature(:category, type)
52
+ entities_with_feature(:category, category, type)
55
53
  end
56
54
 
57
55
  # Returns the first ancestor of this entity
@@ -5,6 +5,16 @@ module Treat::Entities::Abilities::Registrable
5
5
  # Registers a token in the @registry hash.
6
6
  def register(entity)
7
7
 
8
+ unless @registry
9
+ @count = 0
10
+ @registry = {
11
+ :value => {},
12
+ :position => {},
13
+ :type => {},
14
+ :id => {}
15
+ }
16
+ end
17
+
8
18
  if entity.is_a?(Treat::Entities::Token) ||
9
19
  entity.is_a?(Treat::Entities::Phrase)
10
20
  val = entity.to_s.downcase
@@ -64,18 +64,8 @@ module Treat::Entities
64
64
  super(value, id)
65
65
  @type = :entity if self == Entity
66
66
  @type ||= ucc(cl(self.class)).intern
67
- unless is_a?(Treat::Entities::Token)
68
- @count = 0
69
- @registry = {
70
- :id => {},
71
- :value => {},
72
- :type => {},
73
- :position => {}
74
- }
75
- end
76
67
  end
77
68
 
78
-
79
69
  # Add an entity to the current entity.
80
70
  # Registers the entity in the root node
81
71
  # token registry if the entity is a leaf.
@@ -41,17 +41,10 @@ class Treat::Extractors::Keywords::TfIdf
41
41
  entity.each_word do |word|
42
42
 
43
43
  if keywords.include?(word.to_s)
44
- word.set :is_keyword?, true
44
+ word.set :keyword, true
45
45
  pp = entity.parent_phrase
46
- next unless pp
47
- if pp.has? :keyword_count
48
- pp.set :keyword_count,
49
- pp.keyword_count + 1
50
- else
51
- pp.set :keyword_count, 1
52
- end
53
46
  else
54
- word.set :is_keyword?, false
47
+ word.set :keyword, false
55
48
  end
56
49
 
57
50
  end
@@ -15,21 +15,16 @@ class Treat::Formatters::Readers::Autoselect
15
15
  document.read(detect_format(document.file, options[:default_to]))
16
16
  end
17
17
 
18
- def self.detect_format(filename, default_to = DefaultOptions[:default_to])
19
-
18
+ def self.detect_format(filename, default_to = nil)
19
+ default_to ||= DefaultOptions[:default_to]
20
20
  ext = filename.scan(ExtensionRegexp)
21
- ext = (ext.is_a?(Array) && ext[0] && ext[0][0]) ?
22
- ext[0][0] : ''
23
-
24
- format =
25
- ImageExtensions.include?(ext) ?
26
- 'image' : ext
27
-
28
- # Humanize extensions.
21
+ ext = (ext.is_a?(Array) && ext[0] && ext[0][0]) ? ext[0][0] : ''
22
+
23
+ format = ImageExtensions.include?(ext) ? 'image' : ext
29
24
  format = 'html' if format == 'htm'
30
25
  format = 'yaml' if format == 'yml'
31
26
 
32
- format = default_to if format == ''
27
+ format = default_to if format.to_s == ''
33
28
 
34
29
  format.intern
35
30
 
@@ -0,0 +1,64 @@
1
+ # Stores an entity in a Mongo collection.
2
+ class Treat::Formatters::Serializers::Mongo
3
+
4
+ # Require the Mongo DB
5
+ require 'mongo'
6
+
7
+ # Serialize an entity into a Mongo database.
8
+ #
9
+ # Options:
10
+ # - (String) :database => the name of the database to write to (required).
11
+ def self.serialize(entity, options = {})
12
+
13
+ unless options[:database]
14
+ raise Treat::Exception,
15
+ 'Must supply the database name.'
16
+ end
17
+
18
+ @@conn ||= Mongo::Connection.new
19
+ @@db ||= @@conn[options[:database]]
20
+
21
+ path = []
22
+
23
+ entity.each_ancestor do |ancestor|
24
+ path << [ancestor.type, ancestor.id]
25
+ end
26
+
27
+ path = path.reverse
28
+
29
+ target = @@db
30
+
31
+ path.each do |type_id|
32
+ coll = @@db[type_id[0]][type_id[1]]
33
+ end
34
+
35
+ # Store path
36
+
37
+ Treat::Entities.list.each do |type|
38
+
39
+ type = entity.type.to_s
40
+ type = (type == 'entity') ? 'entities' : (type + 's')
41
+ doc = coll[type]
42
+
43
+ features = {}
44
+ features['id'] = entity.id
45
+ features['value'] = entity.value
46
+
47
+ entity.features.each_pair do |feature, value|
48
+ if value.is_a? Treat::Entities::Entity
49
+ value = value.id
50
+ elsif value.is_a?(Array) || value.is_a?(Hash)
51
+ value = value.inspect
52
+ else
53
+ value = value.to_s
54
+ end
55
+ features[feature.to_s] = value
56
+ end
57
+
58
+ doc.insert(features)
59
+
60
+ end
61
+
62
+ end
63
+
64
+ end
@@ -8,12 +8,14 @@ class Treat::Formatters::Serializers::XML
8
8
  # Options:
9
9
  # - (String) :file => a file to write to.
10
10
  def self.serialize(entity, options = {})
11
-
12
- options = options.merge({:indent => 0}) if options[:indent].nil?
11
+ if options[:indent].nil?
12
+ options = options.merge({:indent => 0})
13
+ end
13
14
  indent = options[:indent]
14
15
  if options[:indent] == 0
15
16
  enc = entity.to_s.encoding.to_s.downcase
16
- string = "<?xml version=\"1.0\" encoding=\"#{enc}\" standalone=\"no\" ?>\n<treat>\n"
17
+ string = "<?xml version=\"1.0\" " +
18
+ "encoding=\"#{enc}\" ?>\n<treat>\n"
17
19
  else
18
20
  string = ''
19
21
  end
@@ -26,20 +28,24 @@ class Treat::Formatters::Serializers::XML
26
28
  if value.is_a? Treat::Entities::Entity
27
29
  attributes << "#{feature}='#{value.id}' "
28
30
  else
31
+ value = value.inspect if value.is_a?(Symbol)
29
32
  attributes << "#{feature}='#{escape(value)}' "
30
33
  end
31
34
  end
35
+ ############ To be refactored
32
36
  unless entity.dependencies.empty?
33
37
  attributes << "dependencies='"
34
38
  a = []
35
39
  entity.dependencies.each do |dependency|
36
- a << ("{target: #{dependency.target}, type: #{dependency.type}, " +
40
+ a << ("{target: #{dependency.target}, "+
41
+ "type: #{dependency.type}, " +
37
42
  "directed: #{dependency.directed}, " +
38
43
  "direction: #{dependency.direction}}" )
39
44
  end
40
45
  # Structs.
41
46
  attributes << a.join(',') + "'"
42
47
  end
48
+ ############ End of ugly code
43
49
  end
44
50
  tag = entity.class.to_s.split('::')[-1].downcase
45
51
  string += "#{spaces}<#{tag}#{attributes}>"
@@ -36,6 +36,7 @@ module Treat::Formatters::Unserializers::XML
36
36
  id = nil; value = ''
37
37
  attributes = {}
38
38
  dependencies = []
39
+
39
40
  unless xml_reader.attributes.size == 0
40
41
  xml_reader.attributes.each_pair do |k,v|
41
42
  if k == 'id'
@@ -64,6 +65,11 @@ module Treat::Formatters::Unserializers::XML
64
65
  elsif k == 'value'
65
66
  value = v
66
67
  else
68
+ v = v[1..-1].intern if v[0] == ':'
69
+ v = v.to_i if v =~ /^[0-9]*$/
70
+ v = v.to_f if v =~ /^[0-9\.]*$/
71
+ v = false if v == 'false'
72
+ v = true if v == 'true'
67
73
  attributes[k.intern] = v
68
74
  end
69
75
  end
data/lib/treat/kernel.rb CHANGED
@@ -10,7 +10,7 @@ module Kernel
10
10
  # A list of acronyms used in class names within
11
11
  # the program. These do not CamelCase; they
12
12
  # CAMELCase.
13
- Acronyms = %w[xml html txt odt abw doc yaml uea lda pdf ptb dot ai id3 svo]
13
+ Acronyms = %w[xml html txt odt abw doc yaml uea lda pdf ptb dot ai id3 svo mlp]
14
14
 
15
15
  # A cache to optimize camel casing.
16
16
  @@cc_cache = {}
@@ -43,6 +43,8 @@ module Treat::Processors::Tokenizers::Perl
43
43
 
44
44
  # Replace all decimal points by ^^
45
45
  Treat::Helpers::DecimalPointEscaper.escape!(text)
46
+
47
+ =begin
46
48
 
47
49
  # Translate some common extended ascii
48
50
  # characters to quotes
@@ -60,6 +62,8 @@ module Treat::Processors::Tokenizers::Perl
60
62
  text.gsub!(/\"(?=\s)/," '' ")
61
63
  #s{\"} { `` }g;
62
64
  text.gsub!(/\"(?=\s)/," `` ")
65
+ =end
66
+
63
67
  # Isolate ellipses
64
68
  # s{\.\.\.} { ... }g;
65
69
  text.gsub!(/\.\.\./,' ... ')
data/lib/treat.rb CHANGED
@@ -10,7 +10,7 @@ module Treat
10
10
  end
11
11
 
12
12
  # The current version of Treat.
13
- VERSION = "1.0.1"
13
+ VERSION = "1.0.2"
14
14
 
15
15
  # Add methods to handle syntactic sugar,
16
16
  # language configuration options, and paths.
data/spec/entity.rb CHANGED
@@ -113,7 +113,7 @@ describe Treat::Entities::Entity do
113
113
  describe "Exportable" do
114
114
 
115
115
  context "when supplied with a classification to export" do
116
- classification = Treat::Classification.new(:word, :tag, :is_keyword?)
116
+ classification = Treat::Classification.new(:word, :tag, :is_keyword)
117
117
  it "returns a data set with the exported features" do
118
118
  ds = @sentence.export(classification)
119
119
  ds.classification.should eql classification
@@ -316,13 +316,14 @@ describe Treat::Entities::Entity do
316
316
 
317
317
  describe "Formatters" do
318
318
 
319
+
320
+ before do
321
+ @serializers = [:xml, :yaml] # Treat::Languages::All::Serializers
322
+ @txt = "The story of the fox. The quick brown fox jumped over the lazy dog."
323
+ end
324
+
319
325
  describe "#serialize" do
320
-
321
- before :all do
322
- @serializers = [:xml, :yaml] # Treat::Languages::All::Serializers
323
- @txt = "The story of the fox. The quick brown fox jumped over the lazy dog."
324
- end
325
-
326
+
326
327
  context "when called with a file to save to" do
327
328
 
328
329
  it "serializes a document to the supplied format" do
@@ -332,24 +333,58 @@ describe Treat::Entities::Entity do
332
333
  s = Treat::Entities::Paragraph.new(@txt)
333
334
  s.do(:segment, :tokenize)
334
335
  s.serialize(ser, :file => f)
336
+ File.delete(f)
337
+ end
338
+
339
+ end
340
+
341
+ end
342
+
343
+ end
344
+
345
+ describe "#unserialize" do
346
+
347
+ context "when called with a serialized file" do
348
+
349
+ it "reconstitutes the original entity" do
350
+ @serializers.each do |ser|
351
+
352
+ f = Treat.spec + 'test.' + ser.to_s
353
+ s = Treat::Entities::Paragraph.new(@txt)
354
+
355
+ s.set :test_int, 9
356
+ s.set :test_float, 9.9
357
+ s.set :test_string, 'hello'
358
+ s.set :test_sym, :hello
359
+ s.set :test_bool, false
360
+
361
+ s.do(:segment, :tokenize)
362
+
363
+ s.serialize(ser, :file => f)
364
+
335
365
  d = Treat::Entities::Document.build(f)
366
+
367
+ d.test_int.should eql 9
368
+ d.test_float.should eql 9.9
369
+ d.test_string.should eql 'hello'
370
+ d.test_sym.should eql :hello
371
+ d.test_bool.should eql false
372
+
336
373
  d.to_s.should eql @txt
337
374
  d.size.should eql s.size
375
+
338
376
  d.token_count.should eql s.token_count
339
377
  d.tokens[0].id.should eql s.tokens[0].id
378
+
340
379
  File.delete(f)
341
380
  end
342
-
381
+
343
382
  end
344
383
 
345
384
  end
346
385
 
347
386
  end
348
387
 
349
- describe "#unserialize" do
350
-
351
- end
352
-
353
388
  end
354
389
 
355
390
  describe "Extractors" do
data/spec/sandbox.rb CHANGED
@@ -1,116 +1,24 @@
1
- =begin
2
1
  require_relative '../lib/treat'
3
2
 
4
- def extract(sentences, n)
5
- sentences.to_a.values_at(
6
- *sentences.values
7
- .each_with_index
8
- .sort.reverse
9
- .map(&:last)
10
- .sort.take(n))
11
- .map(&:first)
12
- end
3
+ s = Sentence "Barack Obama was killed last night."
4
+ s.tokenize
13
5
 
14
- describe "#summarize" do
6
+ puts s.word_with_position(2).inspect
15
7
 
16
- it "provides a summary of the text" do
17
-
18
- Treat.debug = true
19
- Treat.silence = true
8
+ s.word_with_position(2).set :highlighted, 1
20
9
 
21
-
22
- context = Treat::DataSet.open('economist-context.yml')
23
- content = Treat::DataSet.open('economist-content.yml')
24
-
25
- c = Collection (Treat.spec + 'economist')
26
- c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
27
-
28
- c.each_document do |d|
29
-
30
- sentences = {}
31
-
32
- d.each_sentence do |sentence|
33
- cx = sentence.classify(:training => context)
34
- ct = sentence.classify(:training => content)
35
- sentences[sentence] = cx[1] + ct[1]
36
- end
10
+ cl = Treat::Classification.new(
11
+ :word,
12
+ [[:position, 0]],
13
+ :highlighted,
14
+ 0
15
+ )
37
16
 
38
- puts
39
- puts d.titles[0].to_s
40
- puts
41
-
42
- puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
43
-
44
- end
17
+ data_set = s.export(cl)
45
18
 
46
- c.serialize file: 'economist-coll.yaml'
19
+ s2 = Sentence ''
20
+ w = Word 'Hello'
21
+ s2 << w
22
+ w.set :position, 2
47
23
 
48
- c = Collection (Treat.spec + 'economist')
49
- c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
50
-
51
- # Topic word count ? Synonyms of keywords ?
52
- # Time expressions?
53
- classify_content = Treat::Classification.new(
54
- :phrase,
55
- [:word_count, :number_count,
56
- :keyword_count, :name_tag_count],
57
- :has_key_content?
58
- )
59
-
60
- classify_context = Treat::Classification.new(
61
- :phrase,
62
- [:position,
63
- :position_from_end,
64
- :type_of_parent_zone,
65
- :value_of_first_word,
66
- :tag_of_first_word
67
- ],
68
- :has_key_context?,
69
- false,
70
- :discrete
71
- )
72
-
73
- c.each_sentence do |s|
74
- puts s.to_s
75
- a = STDIN.gets.to_s.strip
76
- if a == ''
77
- s.set :has_key_content?, false
78
- s.set :has_key_context?, false
79
- else
80
- s.set :has_key_content?, true
81
- s.set :has_key_context?, true
82
- end
83
- end
84
-
85
- context = c.export(classify_context)
86
- content = c.export(classify_content)
87
-
88
- context.save('economist-context.yml')
89
- content.save('economist-content.yml')
90
-
91
- context = Treat::DataSet.open('economist-context.yml')
92
- content = Treat::DataSet.open('economist-content.yml')
93
-
94
- c.each_document do |d|
95
-
96
- sentences = {}
97
-
98
- d.each_sentence do |sentence|
99
- cx = sentence.classify(:training => context)
100
- ct = sentence.classify(:training => content)
101
- sentences[sentence] = cx[1] + ct[1]
102
- end
103
-
104
- puts
105
- puts d.titles[0].to_s
106
- puts
107
-
108
- puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
109
-
110
- end
111
-
112
-
113
- end
114
-
115
- end
116
- =end
24
+ puts w.classify(:mlp, :training => data_set).inspect
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: treat
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-07 00:00:00.000000000 Z
12
+ date: 2012-04-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rubyzip
@@ -75,8 +75,7 @@ dependencies:
75
75
  - - ! '>='
76
76
  - !ruby/object:Gem::Version
77
77
  version: 0.9.2
78
- description: ! ' Treat is a Ruby toolkit for text retrieval, information extraction
79
- and natural language processing. '
78
+ description: ! ' Treat is a full-fledged natural language processing toolkit for Ruby. '
80
79
  email:
81
80
  - louis.mullie@gmail.com
82
81
  executables: []
@@ -84,6 +83,7 @@ extensions: []
84
83
  extra_rdoc_files: []
85
84
  files:
86
85
  - lib/treat/ai/classifiers/id3.rb
86
+ - lib/treat/ai/classifiers/mlp.rb
87
87
  - lib/treat/ai.rb
88
88
  - lib/treat/categories.rb
89
89
  - lib/treat/categorizable.rb
@@ -128,6 +128,7 @@ files:
128
128
  - lib/treat/formatters/readers/pdf.rb
129
129
  - lib/treat/formatters/readers/txt.rb
130
130
  - lib/treat/formatters/readers/xml.rb
131
+ - lib/treat/formatters/serializers/mongo.rb
131
132
  - lib/treat/formatters/serializers/xml.rb
132
133
  - lib/treat/formatters/serializers/yaml.rb
133
134
  - lib/treat/formatters/unserializers/autoselect.rb
@@ -244,5 +245,5 @@ rubyforge_project:
244
245
  rubygems_version: 1.8.21
245
246
  signing_key:
246
247
  specification_version: 3
247
- summary: A text retrieval, extraction and annotation toolkit for Ruby.
248
+ summary: Text Retrieval, Extraction and Annotation Toolkit.
248
249
  test_files: []