treat 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -14,12 +14,11 @@ class Treat::AI::Classifiers::ID3
14
14
  set.labels.map { |l| l.to_s }, set.items,
15
15
  cl.default, cl.mode)
16
16
  dec_tree.train
17
+ @@classifiers[cl] = dec_tree
17
18
  else
18
19
  dec_tree = @@classifiers[cl]
19
20
  end
20
21
 
21
- cl.export_item(entity, false).inspect
22
-
23
22
  dec_tree.predict(
24
23
  cl.export_item(entity, false)
25
24
  )
@@ -0,0 +1,30 @@
1
+ # Currently, this MLP is limited to 1 output.
2
+ class Treat::AI::Classifiers::MLP
3
+
4
+ require 'ai4r'
5
+
6
+ @@mlps = {}
7
+
8
+ def self.classify(entity, options = {})
9
+
10
+ set = options[:training]
11
+ cl = set.classification
12
+
13
+ if !@@mlps[cl]
14
+ net = Ai4r::NeuralNetwork::
15
+ Backpropagation.new([cl.labels.size, 3, 1])
16
+ set.items.each do |item|
17
+ inputs = item[0..-2]
18
+ outputs = [item[-1]]
19
+ net.train(inputs, outputs)
20
+ end
21
+ @@mlps[cl] = net
22
+ else
23
+ net = @@mlps[cl]
24
+ end
25
+
26
+ net.eval(cl.export_item(entity, false))[0]
27
+
28
+ end
29
+
30
+ end
data/lib/treat/ai.rb CHANGED
@@ -2,7 +2,7 @@ module Treat::AI
2
2
 
3
3
  module Classifiers
4
4
  extend Treat::Groupable
5
- self.type = :annotator
5
+ self.type = :computer
6
6
  self.targets = [:entity]
7
7
  self.default = :id3
8
8
  end
@@ -27,21 +27,25 @@ class Treat::Classification
27
27
  end
28
28
 
29
29
  def export_item(e, include_question = true)
30
+
30
31
  line = []
31
32
 
32
33
  @features.each do |cmd|
34
+ dflt = nil
33
35
  begin
34
36
  if cmd.is_a?(Array)
35
- line << cmd[1].call(e)
37
+ if cmd.size == 3
38
+ r = cmd[1].call(e)
39
+ dflt = cmd[2]
40
+ line << (r ? r : dflt)
41
+ elsif cmd.size == 2
42
+ r = e.send(cmd[0])
43
+ dflt = cmd[1]
44
+ line << (r ? r : dflt)
45
+ end
36
46
  else
37
47
  line << e.send(cmd)
38
48
  end
39
- rescue Treat::Exception
40
- dflt = (
41
- (cmd.is_a?(Array) && cmd[2]) ?
42
- cmd[2] : nil
43
- )
44
- line << dflt
45
49
  end
46
50
  end
47
51
 
@@ -53,7 +57,6 @@ class Treat::Classification
53
57
  end
54
58
  end
55
59
 
56
- line[-1] = '' if line[-1].nil?
57
60
  line
58
61
  end
59
62
 
@@ -39,4 +39,13 @@ class Treat::DataSet
39
39
  end
40
40
  end
41
41
 
42
+ def to_ai4r
43
+ Ai4r::Data::DataSet.new(
44
+ :data_items => items,
45
+ :data_labels => (
46
+ labels.map { |l| l.to_s } +
47
+ [classification.question.to_s]
48
+ ))
49
+ end
50
+
42
51
  end
@@ -11,7 +11,8 @@ class Treat::Dependencies
11
11
  ['linguistics', '>= 1.0.9', 'retrieve the inflection of nouns, verbs and numbers in English'],
12
12
  ['punkt-segmenter', '>= 0.9.1', 'segment texts into sentences'],
13
13
  ['chronic', '>= 0.6.7', 'detect date and time in text'],
14
- ['decisiontree', '>= 0.3.0', 'perform decision tree classification of text entities']
14
+ ['decisiontree', '>= 0.3.0', 'perform decision tree classification of text entities'],
15
+ ['ai4r', '>= 1.11', 'perform different kinds of classification tasks on text entities']
15
16
  ]
16
17
 
17
18
  Binary = [
@@ -98,7 +98,7 @@ module Treat::Entities::Abilities::Buildable
98
98
  options[:default_to] ||= :html
99
99
 
100
100
  e = from_file(f, options)
101
- e.set :url, url
101
+ e.set :url, uri.to_s
102
102
  e
103
103
 
104
104
  end
@@ -14,7 +14,7 @@ module Treat::Entities::Abilities::Countable
14
14
  # the end of the parent entity.
15
15
  def position_from_end
16
16
  p = position
17
- parent.size - p
17
+ parent.children.size - p
18
18
  end
19
19
 
20
20
  # Find the frequency of the entity in
@@ -30,10 +30,8 @@ module Treat::Entities::Abilities::Iterable
30
30
  a = []
31
31
  type = :entity unless type
32
32
  each_entity(type) do |e|
33
- a << e if (e.has?(feature) &&
34
- e.features[feature] == value) ||
35
- ([:id, :value, :type].include?(feature) &&
36
- e.send(feature) == value)
33
+ r = e.send(feature)
34
+ a << e if r == value
37
35
  end
38
36
  a
39
37
  end
@@ -51,7 +49,7 @@ module Treat::Entities::Abilities::Iterable
51
49
  # Returns an array of the entities with the given
52
50
  # category.
53
51
  def entities_with_category(category, type = nil)
54
- entities_with_feature(:category, type)
52
+ entities_with_feature(:category, category, type)
55
53
  end
56
54
 
57
55
  # Returns the first ancestor of this entity
@@ -5,6 +5,16 @@ module Treat::Entities::Abilities::Registrable
5
5
  # Registers a token in the @registry hash.
6
6
  def register(entity)
7
7
 
8
+ unless @registry
9
+ @count = 0
10
+ @registry = {
11
+ :value => {},
12
+ :position => {},
13
+ :type => {},
14
+ :id => {}
15
+ }
16
+ end
17
+
8
18
  if entity.is_a?(Treat::Entities::Token) ||
9
19
  entity.is_a?(Treat::Entities::Phrase)
10
20
  val = entity.to_s.downcase
@@ -64,18 +64,8 @@ module Treat::Entities
64
64
  super(value, id)
65
65
  @type = :entity if self == Entity
66
66
  @type ||= ucc(cl(self.class)).intern
67
- unless is_a?(Treat::Entities::Token)
68
- @count = 0
69
- @registry = {
70
- :id => {},
71
- :value => {},
72
- :type => {},
73
- :position => {}
74
- }
75
- end
76
67
  end
77
68
 
78
-
79
69
  # Add an entity to the current entity.
80
70
  # Registers the entity in the root node
81
71
  # token registry if the entity is a leaf.
@@ -41,17 +41,10 @@ class Treat::Extractors::Keywords::TfIdf
41
41
  entity.each_word do |word|
42
42
 
43
43
  if keywords.include?(word.to_s)
44
- word.set :is_keyword?, true
44
+ word.set :keyword, true
45
45
  pp = entity.parent_phrase
46
- next unless pp
47
- if pp.has? :keyword_count
48
- pp.set :keyword_count,
49
- pp.keyword_count + 1
50
- else
51
- pp.set :keyword_count, 1
52
- end
53
46
  else
54
- word.set :is_keyword?, false
47
+ word.set :keyword, false
55
48
  end
56
49
 
57
50
  end
@@ -15,21 +15,16 @@ class Treat::Formatters::Readers::Autoselect
15
15
  document.read(detect_format(document.file, options[:default_to]))
16
16
  end
17
17
 
18
- def self.detect_format(filename, default_to = DefaultOptions[:default_to])
19
-
18
+ def self.detect_format(filename, default_to = nil)
19
+ default_to ||= DefaultOptions[:default_to]
20
20
  ext = filename.scan(ExtensionRegexp)
21
- ext = (ext.is_a?(Array) && ext[0] && ext[0][0]) ?
22
- ext[0][0] : ''
23
-
24
- format =
25
- ImageExtensions.include?(ext) ?
26
- 'image' : ext
27
-
28
- # Humanize extensions.
21
+ ext = (ext.is_a?(Array) && ext[0] && ext[0][0]) ? ext[0][0] : ''
22
+
23
+ format = ImageExtensions.include?(ext) ? 'image' : ext
29
24
  format = 'html' if format == 'htm'
30
25
  format = 'yaml' if format == 'yml'
31
26
 
32
- format = default_to if format == ''
27
+ format = default_to if format.to_s == ''
33
28
 
34
29
  format.intern
35
30
 
@@ -0,0 +1,64 @@
1
+ # Stores an entity in a Mongo collection.
2
+ class Treat::Formatters::Serializers::Mongo
3
+
4
+ # Require the Mongo DB
5
+ require 'mongo'
6
+
7
+ # Serialize an entity tree to a Mongo database.
8
+ #
9
+ # Options:
10
+ # - (String) :database => the name of the database to write to (required).
11
+ def self.serialize(entity, options = {})
12
+
13
+ unless options[:database]
14
+ raise Treat::Exception,
15
+ 'Must supply the database name.'
16
+ end
17
+
18
+ @@conn ||= Mongo::Connection.new
19
+ @@db ||= @@conn[options[:database]]
20
+
21
+ path = []
22
+
23
+ entity.each_ancestor do |ancestor|
24
+ path << [ancestor.type, ancestor.id]
25
+ end
26
+
27
+ path = path.reverse
28
+
29
+ target = @@db
30
+
31
+ path.each do |type_id|
32
+ coll = @@db[type_id[0]][type_id[1]]
33
+ end
34
+
35
+ # Store path
36
+
37
+ Treat::Entities.list.each do |type|
38
+
39
+ type = entity.type.to_s
40
+ type = (type == 'entity') ? 'entities' : (type + 's')
41
+ doc = coll[type]
42
+
43
+ features = {}
44
+ features['id'] = entity.id
45
+ features['value'] = entity.value
46
+
47
+ entity.features.each_pair do |feature, value|
48
+ if value.is_a? Treat::Entities::Entity
49
+ value = value.id
50
+ elsif value.is_a?(Array) || value.is_a?(Hash)
51
+ value = value.inspect
52
+ else
53
+ value = value.to_s
54
+ end
55
+ features[feature.to_s] = value
56
+ end
57
+
58
+ doc.insert(features)
59
+
60
+ end
61
+
62
+ end
63
+
64
+ end
@@ -8,12 +8,14 @@ class Treat::Formatters::Serializers::XML
8
8
  # Options:
9
9
  # - (String) :file => a file to write to.
10
10
  def self.serialize(entity, options = {})
11
-
12
- options = options.merge({:indent => 0}) if options[:indent].nil?
11
+ if options[:indent].nil?
12
+ options = options.merge({:indent => 0})
13
+ end
13
14
  indent = options[:indent]
14
15
  if options[:indent] == 0
15
16
  enc = entity.to_s.encoding.to_s.downcase
16
- string = "<?xml version=\"1.0\" encoding=\"#{enc}\" standalone=\"no\" ?>\n<treat>\n"
17
+ string = "<?xml version=\"1.0\" " +
18
+ "encoding=\"#{enc}\" ?>\n<treat>\n"
17
19
  else
18
20
  string = ''
19
21
  end
@@ -26,20 +28,24 @@ class Treat::Formatters::Serializers::XML
26
28
  if value.is_a? Treat::Entities::Entity
27
29
  attributes << "#{feature}='#{value.id}' "
28
30
  else
31
+ value = value.inspect if value.is_a?(Symbol)
29
32
  attributes << "#{feature}='#{escape(value)}' "
30
33
  end
31
34
  end
35
+ ############ To be refactored
32
36
  unless entity.dependencies.empty?
33
37
  attributes << "dependencies='"
34
38
  a = []
35
39
  entity.dependencies.each do |dependency|
36
- a << ("{target: #{dependency.target}, type: #{dependency.type}, " +
40
+ a << ("{target: #{dependency.target}, "+
41
+ "type: #{dependency.type}, " +
37
42
  "directed: #{dependency.directed}, " +
38
43
  "direction: #{dependency.direction}}" )
39
44
  end
40
45
  # Structs.
41
46
  attributes << a.join(',') + "'"
42
47
  end
48
+ ############ End of ugly code
43
49
  end
44
50
  tag = entity.class.to_s.split('::')[-1].downcase
45
51
  string += "#{spaces}<#{tag}#{attributes}>"
@@ -36,6 +36,7 @@ module Treat::Formatters::Unserializers::XML
36
36
  id = nil; value = ''
37
37
  attributes = {}
38
38
  dependencies = []
39
+
39
40
  unless xml_reader.attributes.size == 0
40
41
  xml_reader.attributes.each_pair do |k,v|
41
42
  if k == 'id'
@@ -64,6 +65,11 @@ module Treat::Formatters::Unserializers::XML
64
65
  elsif k == 'value'
65
66
  value = v
66
67
  else
68
+ v = v[1..-1].intern if v[0] == ':'
69
+ v = v.to_i if v =~ /^[0-9]*$/
70
+ v = v.to_f if v =~ /^[0-9\.]*$/
71
+ v = false if v == 'false'
72
+ v = true if v == 'true'
67
73
  attributes[k.intern] = v
68
74
  end
69
75
  end
data/lib/treat/kernel.rb CHANGED
@@ -10,7 +10,7 @@ module Kernel
10
10
  # A list of acronyms used in class names within
11
11
  # the program. These do not CamelCase; they
12
12
  # CAMELCase.
13
- Acronyms = %w[xml html txt odt abw doc yaml uea lda pdf ptb dot ai id3 svo]
13
+ Acronyms = %w[xml html txt odt abw doc yaml uea lda pdf ptb dot ai id3 svo mlp]
14
14
 
15
15
  # A cache to optimize camel casing.
16
16
  @@cc_cache = {}
@@ -43,6 +43,8 @@ module Treat::Processors::Tokenizers::Perl
43
43
 
44
44
  # Replace all decimal points by ^^
45
45
  Treat::Helpers::DecimalPointEscaper.escape!(text)
46
+
47
+ =begin
46
48
 
47
49
  # Translate some common extended ascii
48
50
  # characters to quotes
@@ -60,6 +62,8 @@ module Treat::Processors::Tokenizers::Perl
60
62
  text.gsub!(/\"(?=\s)/," '' ")
61
63
  #s{\"} { `` }g;
62
64
  text.gsub!(/\"(?=\s)/," `` ")
65
+ =end
66
+
63
67
  # Isolate ellipses
64
68
  # s{\.\.\.} { ... }g;
65
69
  text.gsub!(/\.\.\./,' ... ')
data/lib/treat.rb CHANGED
@@ -10,7 +10,7 @@ module Treat
10
10
  end
11
11
 
12
12
  # The current version of Treat.
13
- VERSION = "1.0.1"
13
+ VERSION = "1.0.2"
14
14
 
15
15
  # Add methods to handle syntactic sugar,
16
16
  # language configuration options, and paths.
data/spec/entity.rb CHANGED
@@ -113,7 +113,7 @@ describe Treat::Entities::Entity do
113
113
  describe "Exportable" do
114
114
 
115
115
  context "when supplied with a classification to export" do
116
- classification = Treat::Classification.new(:word, :tag, :is_keyword?)
116
+ classification = Treat::Classification.new(:word, :tag, :is_keyword)
117
117
  it "returns a data set with the exported features" do
118
118
  ds = @sentence.export(classification)
119
119
  ds.classification.should eql classification
@@ -316,13 +316,14 @@ describe Treat::Entities::Entity do
316
316
 
317
317
  describe "Formatters" do
318
318
 
319
+
320
+ before do
321
+ @serializers = [:xml, :yaml] # Treat::Languages::All::Serializers
322
+ @txt = "The story of the fox. The quick brown fox jumped over the lazy dog."
323
+ end
324
+
319
325
  describe "#serialize" do
320
-
321
- before :all do
322
- @serializers = [:xml, :yaml] # Treat::Languages::All::Serializers
323
- @txt = "The story of the fox. The quick brown fox jumped over the lazy dog."
324
- end
325
-
326
+
326
327
  context "when called with a file to save to" do
327
328
 
328
329
  it "serializes a document to the supplied format" do
@@ -332,24 +333,58 @@ describe Treat::Entities::Entity do
332
333
  s = Treat::Entities::Paragraph.new(@txt)
333
334
  s.do(:segment, :tokenize)
334
335
  s.serialize(ser, :file => f)
336
+ File.delete(f)
337
+ end
338
+
339
+ end
340
+
341
+ end
342
+
343
+ end
344
+
345
+ describe "#unserialize" do
346
+
347
+ context "when called with a serialized file" do
348
+
349
+ it "reconstitutes the original entity" do
350
+ @serializers.each do |ser|
351
+
352
+ f = Treat.spec + 'test.' + ser.to_s
353
+ s = Treat::Entities::Paragraph.new(@txt)
354
+
355
+ s.set :test_int, 9
356
+ s.set :test_float, 9.9
357
+ s.set :test_string, 'hello'
358
+ s.set :test_sym, :hello
359
+ s.set :test_bool, false
360
+
361
+ s.do(:segment, :tokenize)
362
+
363
+ s.serialize(ser, :file => f)
364
+
335
365
  d = Treat::Entities::Document.build(f)
366
+
367
+ d.test_int.should eql 9
368
+ d.test_float.should eql 9.9
369
+ d.test_string.should eql 'hello'
370
+ d.test_sym.should eql :hello
371
+ d.test_bool.should eql false
372
+
336
373
  d.to_s.should eql @txt
337
374
  d.size.should eql s.size
375
+
338
376
  d.token_count.should eql s.token_count
339
377
  d.tokens[0].id.should eql s.tokens[0].id
378
+
340
379
  File.delete(f)
341
380
  end
342
-
381
+
343
382
  end
344
383
 
345
384
  end
346
385
 
347
386
  end
348
387
 
349
- describe "#unserialize" do
350
-
351
- end
352
-
353
388
  end
354
389
 
355
390
  describe "Extractors" do
data/spec/sandbox.rb CHANGED
@@ -1,116 +1,24 @@
1
- =begin
2
1
  require_relative '../lib/treat'
3
2
 
4
- def extract(sentences, n)
5
- sentences.to_a.values_at(
6
- *sentences.values
7
- .each_with_index
8
- .sort.reverse
9
- .map(&:last)
10
- .sort.take(n))
11
- .map(&:first)
12
- end
3
+ s = Sentence "Barack Obama was killed last night."
4
+ s.tokenize
13
5
 
14
- describe "#summarize" do
6
+ puts s.word_with_position(2).inspect
15
7
 
16
- it "provides a summary of the text" do
17
-
18
- Treat.debug = true
19
- Treat.silence = true
8
+ s.word_with_position(2).set :highlighted, 1
20
9
 
21
-
22
- context = Treat::DataSet.open('economist-context.yml')
23
- content = Treat::DataSet.open('economist-content.yml')
24
-
25
- c = Collection (Treat.spec + 'economist')
26
- c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
27
-
28
- c.each_document do |d|
29
-
30
- sentences = {}
31
-
32
- d.each_sentence do |sentence|
33
- cx = sentence.classify(:training => context)
34
- ct = sentence.classify(:training => content)
35
- sentences[sentence] = cx[1] + ct[1]
36
- end
10
+ cl = Treat::Classification.new(
11
+ :word,
12
+ [[:position, 0]],
13
+ :highlighted,
14
+ 0
15
+ )
37
16
 
38
- puts
39
- puts d.titles[0].to_s
40
- puts
41
-
42
- puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
43
-
44
- end
17
+ data_set = s.export(cl)
45
18
 
46
- c.serialize file: 'economist-coll.yaml'
19
+ s2 = Sentence ''
20
+ w = Word 'Hello'
21
+ s2 << w
22
+ w.set :position, 2
47
23
 
48
- c = Collection (Treat.spec + 'economist')
49
- c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
50
-
51
- # Topic word count ? Synonyms of keywords ?
52
- # Time expressions?
53
- classify_content = Treat::Classification.new(
54
- :phrase,
55
- [:word_count, :number_count,
56
- :keyword_count, :name_tag_count],
57
- :has_key_content?
58
- )
59
-
60
- classify_context = Treat::Classification.new(
61
- :phrase,
62
- [:position,
63
- :position_from_end,
64
- :type_of_parent_zone,
65
- :value_of_first_word,
66
- :tag_of_first_word
67
- ],
68
- :has_key_context?,
69
- false,
70
- :discrete
71
- )
72
-
73
- c.each_sentence do |s|
74
- puts s.to_s
75
- a = STDIN.gets.to_s.strip
76
- if a == ''
77
- s.set :has_key_content?, false
78
- s.set :has_key_context?, false
79
- else
80
- s.set :has_key_content?, true
81
- s.set :has_key_context?, true
82
- end
83
- end
84
-
85
- context = c.export(classify_context)
86
- content = c.export(classify_content)
87
-
88
- context.save('economist-context.yml')
89
- content.save('economist-content.yml')
90
-
91
- context = Treat::DataSet.open('economist-context.yml')
92
- content = Treat::DataSet.open('economist-content.yml')
93
-
94
- c.each_document do |d|
95
-
96
- sentences = {}
97
-
98
- d.each_sentence do |sentence|
99
- cx = sentence.classify(:training => context)
100
- ct = sentence.classify(:training => content)
101
- sentences[sentence] = cx[1] + ct[1]
102
- end
103
-
104
- puts
105
- puts d.titles[0].to_s
106
- puts
107
-
108
- puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
109
-
110
- end
111
-
112
-
113
- end
114
-
115
- end
116
- =end
24
+ puts w.classify(:mlp, :training => data_set).inspect
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: treat
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-07 00:00:00.000000000 Z
12
+ date: 2012-04-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rubyzip
@@ -75,8 +75,7 @@ dependencies:
75
75
  - - ! '>='
76
76
  - !ruby/object:Gem::Version
77
77
  version: 0.9.2
78
- description: ! ' Treat is a Ruby toolkit for text retrieval, information extraction
79
- and natural language processing. '
78
+ description: ! ' Treat is a full-fledged natural language processing toolkit for Ruby. '
80
79
  email:
81
80
  - louis.mullie@gmail.com
82
81
  executables: []
@@ -84,6 +83,7 @@ extensions: []
84
83
  extra_rdoc_files: []
85
84
  files:
86
85
  - lib/treat/ai/classifiers/id3.rb
86
+ - lib/treat/ai/classifiers/mlp.rb
87
87
  - lib/treat/ai.rb
88
88
  - lib/treat/categories.rb
89
89
  - lib/treat/categorizable.rb
@@ -128,6 +128,7 @@ files:
128
128
  - lib/treat/formatters/readers/pdf.rb
129
129
  - lib/treat/formatters/readers/txt.rb
130
130
  - lib/treat/formatters/readers/xml.rb
131
+ - lib/treat/formatters/serializers/mongo.rb
131
132
  - lib/treat/formatters/serializers/xml.rb
132
133
  - lib/treat/formatters/serializers/yaml.rb
133
134
  - lib/treat/formatters/unserializers/autoselect.rb
@@ -244,5 +245,5 @@ rubyforge_project:
244
245
  rubygems_version: 1.8.21
245
246
  signing_key:
246
247
  specification_version: 3
247
- summary: A text retrieval, extraction and annotation toolkit for Ruby.
248
+ summary: Text Retrieval, Extraction and Annotation Toolkit.
248
249
  test_files: []