treat 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/treat/ai/classifiers/id3.rb +1 -2
- data/lib/treat/ai/classifiers/mlp.rb +30 -0
- data/lib/treat/ai.rb +1 -1
- data/lib/treat/classification.rb +11 -8
- data/lib/treat/data_set.rb +9 -0
- data/lib/treat/dependencies.rb +2 -1
- data/lib/treat/entities/abilities/buildable.rb +1 -1
- data/lib/treat/entities/abilities/countable.rb +1 -1
- data/lib/treat/entities/abilities/iterable.rb +3 -5
- data/lib/treat/entities/abilities/registrable.rb +10 -0
- data/lib/treat/entities/entity.rb +0 -10
- data/lib/treat/extractors/keywords/tf_idf.rb +2 -9
- data/lib/treat/formatters/readers/autoselect.rb +6 -11
- data/lib/treat/formatters/serializers/mongo.rb +64 -0
- data/lib/treat/formatters/serializers/xml.rb +10 -4
- data/lib/treat/formatters/unserializers/xml.rb +6 -0
- data/lib/treat/kernel.rb +1 -1
- data/lib/treat/processors/tokenizers/perl.rb +4 -0
- data/lib/treat.rb +1 -1
- data/spec/entity.rb +47 -12
- data/spec/sandbox.rb +16 -108
- metadata +6 -5
@@ -14,12 +14,11 @@ class Treat::AI::Classifiers::ID3
|
|
14
14
|
set.labels.map { |l| l.to_s }, set.items,
|
15
15
|
cl.default, cl.mode)
|
16
16
|
dec_tree.train
|
17
|
+
@@classifiers[cl] = dec_tree
|
17
18
|
else
|
18
19
|
dec_tree = @@classifiers[cl]
|
19
20
|
end
|
20
21
|
|
21
|
-
cl.export_item(entity, false).inspect
|
22
|
-
|
23
22
|
dec_tree.predict(
|
24
23
|
cl.export_item(entity, false)
|
25
24
|
)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Currently, this MLP is limited to 1 output.
|
2
|
+
class Treat::AI::Classifiers::MLP
|
3
|
+
|
4
|
+
require 'ai4r'
|
5
|
+
|
6
|
+
@@mlps = {}
|
7
|
+
|
8
|
+
def self.classify(entity, options = {})
|
9
|
+
|
10
|
+
set = options[:training]
|
11
|
+
cl = set.classification
|
12
|
+
|
13
|
+
if !@@mlps[cl]
|
14
|
+
net = Ai4r::NeuralNetwork::
|
15
|
+
Backpropagation.new([cl.labels.size, 3, 1])
|
16
|
+
set.items.each do |item|
|
17
|
+
inputs = item[0..-2]
|
18
|
+
outputs = [item[-1]]
|
19
|
+
net.train(inputs, outputs)
|
20
|
+
end
|
21
|
+
@@mlps[cl] = net
|
22
|
+
else
|
23
|
+
net = @@mlps[cl]
|
24
|
+
end
|
25
|
+
|
26
|
+
net.eval(cl.export_item(entity, false))[0]
|
27
|
+
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
data/lib/treat/ai.rb
CHANGED
data/lib/treat/classification.rb
CHANGED
@@ -27,21 +27,25 @@ class Treat::Classification
|
|
27
27
|
end
|
28
28
|
|
29
29
|
def export_item(e, include_question = true)
|
30
|
+
|
30
31
|
line = []
|
31
32
|
|
32
33
|
@features.each do |cmd|
|
34
|
+
dflt = nil
|
33
35
|
begin
|
34
36
|
if cmd.is_a?(Array)
|
35
|
-
|
37
|
+
if cmd.size == 3
|
38
|
+
r = cmd[1].call(e)
|
39
|
+
dflt = cmd[2]
|
40
|
+
line << (r ? r : dflt)
|
41
|
+
elsif cmd.size == 2
|
42
|
+
r = e.send(cmd[0])
|
43
|
+
dflt = cmd[1]
|
44
|
+
line << (r ? r : dflt)
|
45
|
+
end
|
36
46
|
else
|
37
47
|
line << e.send(cmd)
|
38
48
|
end
|
39
|
-
rescue Treat::Exception
|
40
|
-
dflt = (
|
41
|
-
(cmd.is_a?(Array) && cmd[2]) ?
|
42
|
-
cmd[2] : nil
|
43
|
-
)
|
44
|
-
line << dflt
|
45
49
|
end
|
46
50
|
end
|
47
51
|
|
@@ -53,7 +57,6 @@ class Treat::Classification
|
|
53
57
|
end
|
54
58
|
end
|
55
59
|
|
56
|
-
line[-1] = '' if line[-1].nil?
|
57
60
|
line
|
58
61
|
end
|
59
62
|
|
data/lib/treat/data_set.rb
CHANGED
data/lib/treat/dependencies.rb
CHANGED
@@ -11,7 +11,8 @@ class Treat::Dependencies
|
|
11
11
|
['linguistics', '>= 1.0.9', 'retrieve the inflection of nouns, verbs and numbers in English'],
|
12
12
|
['punkt-segmenter', '>= 0.9.1', 'segment texts into sentences'],
|
13
13
|
['chronic', '>= 0.6.7', 'detect date and time in text'],
|
14
|
-
['decisiontree', '>= 0.3.0', 'perform decision tree classification of text entities']
|
14
|
+
['decisiontree', '>= 0.3.0', 'perform decision tree classification of text entities'],
|
15
|
+
['ai4r', '>= 1.11', 'perform different kinds of classification tasks on text entities']
|
15
16
|
]
|
16
17
|
|
17
18
|
Binary = [
|
@@ -30,10 +30,8 @@ module Treat::Entities::Abilities::Iterable
|
|
30
30
|
a = []
|
31
31
|
type = :entity unless type
|
32
32
|
each_entity(type) do |e|
|
33
|
-
|
34
|
-
e
|
35
|
-
([:id, :value, :type].include?(feature) &&
|
36
|
-
e.send(feature) == value)
|
33
|
+
r = e.send(feature)
|
34
|
+
a << e if r == value
|
37
35
|
end
|
38
36
|
a
|
39
37
|
end
|
@@ -51,7 +49,7 @@ module Treat::Entities::Abilities::Iterable
|
|
51
49
|
# Returns an array of the entities with the given
|
52
50
|
# category.
|
53
51
|
def entities_with_category(category, type = nil)
|
54
|
-
entities_with_feature(:category, type)
|
52
|
+
entities_with_feature(:category, category, type)
|
55
53
|
end
|
56
54
|
|
57
55
|
# Returns the first ancestor of this entity
|
@@ -5,6 +5,16 @@ module Treat::Entities::Abilities::Registrable
|
|
5
5
|
# Registers a token in the @registry hash.
|
6
6
|
def register(entity)
|
7
7
|
|
8
|
+
unless @registry
|
9
|
+
@count = 0
|
10
|
+
@registry = {
|
11
|
+
:value => {},
|
12
|
+
:position => {},
|
13
|
+
:type => {},
|
14
|
+
:id => {}
|
15
|
+
}
|
16
|
+
end
|
17
|
+
|
8
18
|
if entity.is_a?(Treat::Entities::Token) ||
|
9
19
|
entity.is_a?(Treat::Entities::Phrase)
|
10
20
|
val = entity.to_s.downcase
|
@@ -64,18 +64,8 @@ module Treat::Entities
|
|
64
64
|
super(value, id)
|
65
65
|
@type = :entity if self == Entity
|
66
66
|
@type ||= ucc(cl(self.class)).intern
|
67
|
-
unless is_a?(Treat::Entities::Token)
|
68
|
-
@count = 0
|
69
|
-
@registry = {
|
70
|
-
:id => {},
|
71
|
-
:value => {},
|
72
|
-
:type => {},
|
73
|
-
:position => {}
|
74
|
-
}
|
75
|
-
end
|
76
67
|
end
|
77
68
|
|
78
|
-
|
79
69
|
# Add an entity to the current entity.
|
80
70
|
# Registers the entity in the root node
|
81
71
|
# token registry if the entity is a leaf.
|
@@ -41,17 +41,10 @@ class Treat::Extractors::Keywords::TfIdf
|
|
41
41
|
entity.each_word do |word|
|
42
42
|
|
43
43
|
if keywords.include?(word.to_s)
|
44
|
-
word.set :
|
44
|
+
word.set :keyword, true
|
45
45
|
pp = entity.parent_phrase
|
46
|
-
next unless pp
|
47
|
-
if pp.has? :keyword_count
|
48
|
-
pp.set :keyword_count,
|
49
|
-
pp.keyword_count + 1
|
50
|
-
else
|
51
|
-
pp.set :keyword_count, 1
|
52
|
-
end
|
53
46
|
else
|
54
|
-
word.set :
|
47
|
+
word.set :keyword, false
|
55
48
|
end
|
56
49
|
|
57
50
|
end
|
@@ -15,21 +15,16 @@ class Treat::Formatters::Readers::Autoselect
|
|
15
15
|
document.read(detect_format(document.file, options[:default_to]))
|
16
16
|
end
|
17
17
|
|
18
|
-
def self.detect_format(filename, default_to =
|
19
|
-
|
18
|
+
def self.detect_format(filename, default_to = nil)
|
19
|
+
default_to ||= DefaultOptions[:default_to]
|
20
20
|
ext = filename.scan(ExtensionRegexp)
|
21
|
-
ext = (ext.is_a?(Array) && ext[0] && ext[0][0]) ?
|
22
|
-
|
23
|
-
|
24
|
-
format =
|
25
|
-
ImageExtensions.include?(ext) ?
|
26
|
-
'image' : ext
|
27
|
-
|
28
|
-
# Humanize extensions.
|
21
|
+
ext = (ext.is_a?(Array) && ext[0] && ext[0][0]) ? ext[0][0] : ''
|
22
|
+
|
23
|
+
format = ImageExtensions.include?(ext) ? 'image' : ext
|
29
24
|
format = 'html' if format == 'htm'
|
30
25
|
format = 'yaml' if format == 'yml'
|
31
26
|
|
32
|
-
format = default_to if format == ''
|
27
|
+
format = default_to if format.to_s == ''
|
33
28
|
|
34
29
|
format.intern
|
35
30
|
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# Stores an entity in a Mongo collection.
|
2
|
+
class Treat::Formatters::Serializers::Mongo
|
3
|
+
|
4
|
+
# Require the Mongo DB driver.
|
5
|
+
require 'mongo'
|
6
|
+
|
7
|
+
# Serialize an entity for storage in a Mongo database.
|
8
|
+
#
|
9
|
+
# Options:
|
10
|
+
# - (String) :database => the name of the Mongo database to use (required).
|
11
|
+
def self.serialize(entity, options = {})
|
12
|
+
|
13
|
+
unless options[:database]
|
14
|
+
raise Treat::Exception,
|
15
|
+
'Must supply the database name.'
|
16
|
+
end
|
17
|
+
|
18
|
+
@@conn ||= Mongo::Connection.new
|
19
|
+
@@db ||= @@conn[options[:database]]
|
20
|
+
|
21
|
+
path = []
|
22
|
+
|
23
|
+
entity.each_ancestor do |ancestor|
|
24
|
+
path << [ancestor.type, ancestor.id]
|
25
|
+
end
|
26
|
+
|
27
|
+
path = path.reverse
|
28
|
+
|
29
|
+
target = @@db
|
30
|
+
|
31
|
+
path.each do |type_id|
|
32
|
+
coll = @@db[type_id[0]][type_id[1]]
|
33
|
+
end
|
34
|
+
|
35
|
+
# Store path
|
36
|
+
|
37
|
+
Treat::Entities.list.each do |type|
|
38
|
+
|
39
|
+
type = entity.type.to_s
|
40
|
+
type = (type == 'entity') ? 'entities' : (type + 's')
|
41
|
+
doc = coll[type]
|
42
|
+
|
43
|
+
features = {}
|
44
|
+
features['id'] = entity.id
|
45
|
+
features['value'] = entity.value
|
46
|
+
|
47
|
+
entity.features.each_pair do |feature, value|
|
48
|
+
if value.is_a? Treat::Entities::Entity
|
49
|
+
value = value.id
|
50
|
+
elsif value.is_a?(Array) || value.is_a?(Hash)
|
51
|
+
value = value.inspect
|
52
|
+
else
|
53
|
+
value = value.to_s
|
54
|
+
end
|
55
|
+
features[feature.to_s] = value
|
56
|
+
end
|
57
|
+
|
58
|
+
doc.insert(features)
|
59
|
+
|
60
|
+
end
|
61
|
+
|
62
|
+
end
|
63
|
+
|
64
|
+
end
|
@@ -8,12 +8,14 @@ class Treat::Formatters::Serializers::XML
|
|
8
8
|
# Options:
|
9
9
|
# - (String) :file => a file to write to.
|
10
10
|
def self.serialize(entity, options = {})
|
11
|
-
|
12
|
-
|
11
|
+
if options[:indent].nil?
|
12
|
+
options = options.merge({:indent => 0})
|
13
|
+
end
|
13
14
|
indent = options[:indent]
|
14
15
|
if options[:indent] == 0
|
15
16
|
enc = entity.to_s.encoding.to_s.downcase
|
16
|
-
string = "<?xml version=\"1.0\"
|
17
|
+
string = "<?xml version=\"1.0\" " +
|
18
|
+
"encoding=\"#{enc}\" ?>\n<treat>\n"
|
17
19
|
else
|
18
20
|
string = ''
|
19
21
|
end
|
@@ -26,20 +28,24 @@ class Treat::Formatters::Serializers::XML
|
|
26
28
|
if value.is_a? Treat::Entities::Entity
|
27
29
|
attributes << "#{feature}='#{value.id}' "
|
28
30
|
else
|
31
|
+
value = value.inspect if value.is_a?(Symbol)
|
29
32
|
attributes << "#{feature}='#{escape(value)}' "
|
30
33
|
end
|
31
34
|
end
|
35
|
+
############ To be refactored
|
32
36
|
unless entity.dependencies.empty?
|
33
37
|
attributes << "dependencies='"
|
34
38
|
a = []
|
35
39
|
entity.dependencies.each do |dependency|
|
36
|
-
a << ("{target: #{dependency.target},
|
40
|
+
a << ("{target: #{dependency.target}, "+
|
41
|
+
"type: #{dependency.type}, " +
|
37
42
|
"directed: #{dependency.directed}, " +
|
38
43
|
"direction: #{dependency.direction}}" )
|
39
44
|
end
|
40
45
|
# Structs.
|
41
46
|
attributes << a.join(',') + "'"
|
42
47
|
end
|
48
|
+
############ End of ugly code
|
43
49
|
end
|
44
50
|
tag = entity.class.to_s.split('::')[-1].downcase
|
45
51
|
string += "#{spaces}<#{tag}#{attributes}>"
|
@@ -36,6 +36,7 @@ module Treat::Formatters::Unserializers::XML
|
|
36
36
|
id = nil; value = ''
|
37
37
|
attributes = {}
|
38
38
|
dependencies = []
|
39
|
+
|
39
40
|
unless xml_reader.attributes.size == 0
|
40
41
|
xml_reader.attributes.each_pair do |k,v|
|
41
42
|
if k == 'id'
|
@@ -64,6 +65,11 @@ module Treat::Formatters::Unserializers::XML
|
|
64
65
|
elsif k == 'value'
|
65
66
|
value = v
|
66
67
|
else
|
68
|
+
v = v[1..-1].intern if v[0] == ':'
|
69
|
+
v = v.to_i if v =~ /^[0-9]*$/
|
70
|
+
v = v.to_f if v =~ /^[0-9\.]*$/
|
71
|
+
v = false if v == 'false'
|
72
|
+
v = true if v == 'true'
|
67
73
|
attributes[k.intern] = v
|
68
74
|
end
|
69
75
|
end
|
data/lib/treat/kernel.rb
CHANGED
@@ -10,7 +10,7 @@ module Kernel
|
|
10
10
|
# A list of acronyms used in class names within
|
11
11
|
# the program. These do not CamelCase; they
|
12
12
|
# CAMELCase.
|
13
|
-
Acronyms = %w[xml html txt odt abw doc yaml uea lda pdf ptb dot ai id3 svo]
|
13
|
+
Acronyms = %w[xml html txt odt abw doc yaml uea lda pdf ptb dot ai id3 svo mlp]
|
14
14
|
|
15
15
|
# A cache to optimize camel casing.
|
16
16
|
@@cc_cache = {}
|
@@ -43,6 +43,8 @@ module Treat::Processors::Tokenizers::Perl
|
|
43
43
|
|
44
44
|
# Replace all decimal points by ^^
|
45
45
|
Treat::Helpers::DecimalPointEscaper.escape!(text)
|
46
|
+
|
47
|
+
=begin
|
46
48
|
|
47
49
|
# Translate some common extended ascii
|
48
50
|
# characters to quotes
|
@@ -60,6 +62,8 @@ module Treat::Processors::Tokenizers::Perl
|
|
60
62
|
text.gsub!(/\"(?=\s)/," '' ")
|
61
63
|
#s{\"} { `` }g;
|
62
64
|
text.gsub!(/\"(?=\s)/," `` ")
|
65
|
+
=end
|
66
|
+
|
63
67
|
# Isolate ellipses
|
64
68
|
# s{\.\.\.} { ... }g;
|
65
69
|
text.gsub!(/\.\.\./,' ... ')
|
data/lib/treat.rb
CHANGED
data/spec/entity.rb
CHANGED
@@ -113,7 +113,7 @@ describe Treat::Entities::Entity do
|
|
113
113
|
describe "Exportable" do
|
114
114
|
|
115
115
|
context "when supplied with a classification to export" do
|
116
|
-
classification = Treat::Classification.new(:word, :tag, :is_keyword
|
116
|
+
classification = Treat::Classification.new(:word, :tag, :is_keyword)
|
117
117
|
it "returns a data set with the exported features" do
|
118
118
|
ds = @sentence.export(classification)
|
119
119
|
ds.classification.should eql classification
|
@@ -316,13 +316,14 @@ describe Treat::Entities::Entity do
|
|
316
316
|
|
317
317
|
describe "Formatters" do
|
318
318
|
|
319
|
+
|
320
|
+
before do
|
321
|
+
@serializers = [:xml, :yaml] # Treat::Languages::All::Serializers
|
322
|
+
@txt = "The story of the fox. The quick brown fox jumped over the lazy dog."
|
323
|
+
end
|
324
|
+
|
319
325
|
describe "#serialize" do
|
320
|
-
|
321
|
-
before :all do
|
322
|
-
@serializers = [:xml, :yaml] # Treat::Languages::All::Serializers
|
323
|
-
@txt = "The story of the fox. The quick brown fox jumped over the lazy dog."
|
324
|
-
end
|
325
|
-
|
326
|
+
|
326
327
|
context "when called with a file to save to" do
|
327
328
|
|
328
329
|
it "serializes a document to the supplied format" do
|
@@ -332,24 +333,58 @@ describe Treat::Entities::Entity do
|
|
332
333
|
s = Treat::Entities::Paragraph.new(@txt)
|
333
334
|
s.do(:segment, :tokenize)
|
334
335
|
s.serialize(ser, :file => f)
|
336
|
+
File.delete(f)
|
337
|
+
end
|
338
|
+
|
339
|
+
end
|
340
|
+
|
341
|
+
end
|
342
|
+
|
343
|
+
end
|
344
|
+
|
345
|
+
describe "#unserialize" do
|
346
|
+
|
347
|
+
context "when called with a serialized file" do
|
348
|
+
|
349
|
+
it "reconstitutes the original entity" do
|
350
|
+
@serializers.each do |ser|
|
351
|
+
|
352
|
+
f = Treat.spec + 'test.' + ser.to_s
|
353
|
+
s = Treat::Entities::Paragraph.new(@txt)
|
354
|
+
|
355
|
+
s.set :test_int, 9
|
356
|
+
s.set :test_float, 9.9
|
357
|
+
s.set :test_string, 'hello'
|
358
|
+
s.set :test_sym, :hello
|
359
|
+
s.set :test_bool, false
|
360
|
+
|
361
|
+
s.do(:segment, :tokenize)
|
362
|
+
|
363
|
+
s.serialize(ser, :file => f)
|
364
|
+
|
335
365
|
d = Treat::Entities::Document.build(f)
|
366
|
+
|
367
|
+
d.test_int.should eql 9
|
368
|
+
d.test_float.should eql 9.9
|
369
|
+
d.test_string.should eql 'hello'
|
370
|
+
d.test_sym.should eql :hello
|
371
|
+
d.test_bool.should eql false
|
372
|
+
|
336
373
|
d.to_s.should eql @txt
|
337
374
|
d.size.should eql s.size
|
375
|
+
|
338
376
|
d.token_count.should eql s.token_count
|
339
377
|
d.tokens[0].id.should eql s.tokens[0].id
|
378
|
+
|
340
379
|
File.delete(f)
|
341
380
|
end
|
342
|
-
|
381
|
+
|
343
382
|
end
|
344
383
|
|
345
384
|
end
|
346
385
|
|
347
386
|
end
|
348
387
|
|
349
|
-
describe "#unserialize" do
|
350
|
-
|
351
|
-
end
|
352
|
-
|
353
388
|
end
|
354
389
|
|
355
390
|
describe "Extractors" do
|
data/spec/sandbox.rb
CHANGED
@@ -1,116 +1,24 @@
|
|
1
|
-
=begin
|
2
1
|
require_relative '../lib/treat'
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
*sentences.values
|
7
|
-
.each_with_index
|
8
|
-
.sort.reverse
|
9
|
-
.map(&:last)
|
10
|
-
.sort.take(n))
|
11
|
-
.map(&:first)
|
12
|
-
end
|
3
|
+
s = Sentence "Barack Obama was killed last night."
|
4
|
+
s.tokenize
|
13
5
|
|
14
|
-
|
6
|
+
puts s.word_with_position(2).inspect
|
15
7
|
|
16
|
-
|
17
|
-
|
18
|
-
Treat.debug = true
|
19
|
-
Treat.silence = true
|
8
|
+
s.word_with_position(2).set :highlighted, 1
|
20
9
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
c.each_document do |d|
|
29
|
-
|
30
|
-
sentences = {}
|
31
|
-
|
32
|
-
d.each_sentence do |sentence|
|
33
|
-
cx = sentence.classify(:training => context)
|
34
|
-
ct = sentence.classify(:training => content)
|
35
|
-
sentences[sentence] = cx[1] + ct[1]
|
36
|
-
end
|
10
|
+
cl = Treat::Classification.new(
|
11
|
+
:word,
|
12
|
+
[[:position, 0]],
|
13
|
+
:highlighted,
|
14
|
+
0
|
15
|
+
)
|
37
16
|
|
38
|
-
|
39
|
-
puts d.titles[0].to_s
|
40
|
-
puts
|
41
|
-
|
42
|
-
puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
|
43
|
-
|
44
|
-
end
|
17
|
+
data_set = s.export(cl)
|
45
18
|
|
46
|
-
|
19
|
+
s2 = Sentence ''
|
20
|
+
w = Word 'Hello'
|
21
|
+
s2 << w
|
22
|
+
w.set :position, 2
|
47
23
|
|
48
|
-
|
49
|
-
c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
|
50
|
-
|
51
|
-
# Topic word count ? Synonyms of keywords ?
|
52
|
-
# Time expressions?
|
53
|
-
classify_content = Treat::Classification.new(
|
54
|
-
:phrase,
|
55
|
-
[:word_count, :number_count,
|
56
|
-
:keyword_count, :name_tag_count],
|
57
|
-
:has_key_content?
|
58
|
-
)
|
59
|
-
|
60
|
-
classify_context = Treat::Classification.new(
|
61
|
-
:phrase,
|
62
|
-
[:position,
|
63
|
-
:position_from_end,
|
64
|
-
:type_of_parent_zone,
|
65
|
-
:value_of_first_word,
|
66
|
-
:tag_of_first_word
|
67
|
-
],
|
68
|
-
:has_key_context?,
|
69
|
-
false,
|
70
|
-
:discrete
|
71
|
-
)
|
72
|
-
|
73
|
-
c.each_sentence do |s|
|
74
|
-
puts s.to_s
|
75
|
-
a = STDIN.gets.to_s.strip
|
76
|
-
if a == ''
|
77
|
-
s.set :has_key_content?, false
|
78
|
-
s.set :has_key_context?, false
|
79
|
-
else
|
80
|
-
s.set :has_key_content?, true
|
81
|
-
s.set :has_key_context?, true
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
context = c.export(classify_context)
|
86
|
-
content = c.export(classify_content)
|
87
|
-
|
88
|
-
context.save('economist-context.yml')
|
89
|
-
content.save('economist-content.yml')
|
90
|
-
|
91
|
-
context = Treat::DataSet.open('economist-context.yml')
|
92
|
-
content = Treat::DataSet.open('economist-content.yml')
|
93
|
-
|
94
|
-
c.each_document do |d|
|
95
|
-
|
96
|
-
sentences = {}
|
97
|
-
|
98
|
-
d.each_sentence do |sentence|
|
99
|
-
cx = sentence.classify(:training => context)
|
100
|
-
ct = sentence.classify(:training => content)
|
101
|
-
sentences[sentence] = cx[1] + ct[1]
|
102
|
-
end
|
103
|
-
|
104
|
-
puts
|
105
|
-
puts d.titles[0].to_s
|
106
|
-
puts
|
107
|
-
|
108
|
-
puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
|
109
|
-
|
110
|
-
end
|
111
|
-
|
112
|
-
|
113
|
-
end
|
114
|
-
|
115
|
-
end
|
116
|
-
=end
|
24
|
+
puts w.classify(:mlp, :training => data_set).inspect
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: treat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rubyzip
|
@@ -75,8 +75,7 @@ dependencies:
|
|
75
75
|
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: 0.9.2
|
78
|
-
description: ! ' Treat is a
|
79
|
-
and natural language processing. '
|
78
|
+
description: ! ' Treat is a full-fledged natural language processing toolkit for Ruby. '
|
80
79
|
email:
|
81
80
|
- louis.mullie@gmail.com
|
82
81
|
executables: []
|
@@ -84,6 +83,7 @@ extensions: []
|
|
84
83
|
extra_rdoc_files: []
|
85
84
|
files:
|
86
85
|
- lib/treat/ai/classifiers/id3.rb
|
86
|
+
- lib/treat/ai/classifiers/mlp.rb
|
87
87
|
- lib/treat/ai.rb
|
88
88
|
- lib/treat/categories.rb
|
89
89
|
- lib/treat/categorizable.rb
|
@@ -128,6 +128,7 @@ files:
|
|
128
128
|
- lib/treat/formatters/readers/pdf.rb
|
129
129
|
- lib/treat/formatters/readers/txt.rb
|
130
130
|
- lib/treat/formatters/readers/xml.rb
|
131
|
+
- lib/treat/formatters/serializers/mongo.rb
|
131
132
|
- lib/treat/formatters/serializers/xml.rb
|
132
133
|
- lib/treat/formatters/serializers/yaml.rb
|
133
134
|
- lib/treat/formatters/unserializers/autoselect.rb
|
@@ -244,5 +245,5 @@ rubyforge_project:
|
|
244
245
|
rubygems_version: 1.8.21
|
245
246
|
signing_key:
|
246
247
|
specification_version: 3
|
247
|
-
summary:
|
248
|
+
summary: Text Retrieval, Extraction and Annotation Toolkit.
|
248
249
|
test_files: []
|