treat 1.1.2 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +1 -1
- data/lib/treat/config/core/acronyms.rb +2 -1
- data/lib/treat/config/libraries/punkt.rb +1 -0
- data/lib/treat/config/libraries/reuters.rb +1 -0
- data/lib/treat/core/data_set.rb +125 -66
- data/lib/treat/core/export.rb +59 -0
- data/lib/treat/core/problem.rb +101 -18
- data/lib/treat/core/question.rb +23 -7
- data/lib/treat/entities/abilities/iterable.rb +7 -3
- data/lib/treat/entities/abilities/stringable.rb +5 -5
- data/lib/treat/entities/collection.rb +10 -6
- data/lib/treat/entities/entity.rb +1 -1
- data/lib/treat/helpers/objtohash.rb +8 -0
- data/lib/treat/loaders/stanford.rb +10 -8
- data/lib/treat/version.rb +1 -1
- data/lib/treat/workers/formatters/serializers/mongo.rb +2 -2
- data/lib/treat/workers/formatters/serializers/xml.rb +7 -7
- data/lib/treat/workers/formatters/unserializers/mongo.rb +16 -8
- data/lib/treat/workers/formatters/unserializers/xml.rb +5 -5
- data/lib/treat/workers/formatters/visualizers/dot.rb +7 -7
- data/lib/treat/workers/learners/classifiers/id3.rb +4 -3
- data/lib/treat/workers/learners/classifiers/linear.rb +53 -0
- data/lib/treat/workers/learners/classifiers/mlp.rb +5 -5
- data/lib/treat/workers/learners/classifiers/svm.rb +31 -0
- data/lib/treat/workers/lexicalizers/taggers/stanford.rb +4 -2
- data/lib/treat/workers/processors/parsers/enju.rb +17 -17
- data/lib/treat/workers/processors/segmenters/punkt.rb +3 -1
- data/spec/collection.rb +3 -3
- data/spec/core.rb +430 -21
- data/spec/document.rb +1 -1
- data/spec/entity.rb +2 -8
- data/spec/helper.rb +34 -0
- data/spec/phrase.rb +1 -1
- data/spec/sandbox.rb +31 -8
- data/spec/token.rb +1 -1
- data/spec/treat.rb +1 -1
- data/spec/word.rb +1 -1
- data/spec/zone.rb +1 -1
- metadata +9 -8
- data/files/3_2_release_notes.html +0 -766
- data/files/bc-monty-robinson-sentencing.html +0 -1569
- data/files/syria-aleppo-clashes.html +0 -1376
- data/lib/treat/core/feature.rb +0 -42
- data/lib/treat/core/node.rb +0 -251
- data/spec/node.rb +0 -117
@@ -33,14 +33,14 @@ class Treat::Workers::Formatters::Serializers::XML
|
|
33
33
|
end
|
34
34
|
end
|
35
35
|
############ To be refactored
|
36
|
-
unless entity.
|
37
|
-
attributes << "
|
36
|
+
unless entity.edges.empty?
|
37
|
+
attributes << "edges='"
|
38
38
|
a = []
|
39
|
-
entity.
|
40
|
-
a << ("{target: #{
|
41
|
-
"type: #{
|
42
|
-
"directed: #{
|
43
|
-
"direction: #{
|
39
|
+
entity.edges.each do |edge|
|
40
|
+
a << ("{target: #{edge.target}, "+
|
41
|
+
"type: #{edge.type}, " +
|
42
|
+
"directed: #{edge.directed}, " +
|
43
|
+
"direction: #{edge.direction}}" )
|
44
44
|
end
|
45
45
|
# Structs.
|
46
46
|
attributes << a.join(',') + "'"
|
@@ -22,7 +22,7 @@ module Treat::Workers::Formatters::Unserializers::Mongo
|
|
22
22
|
entity.type.to_s.capitalize.intern).superclass).downcase
|
23
23
|
supertype = entity.type.to_s if supertype == 'entity'
|
24
24
|
supertypes = supertype + 's'
|
25
|
-
|
25
|
+
supertypes = 'documents' if entity.type == :collection
|
26
26
|
coll = @@database.collection(supertypes)
|
27
27
|
records = coll.find(selector).to_a
|
28
28
|
|
@@ -30,16 +30,24 @@ module Treat::Workers::Formatters::Unserializers::Mongo
|
|
30
30
|
raise Treat::Exception,
|
31
31
|
"Couldn't find any records using " +
|
32
32
|
"selector #{selector.inspect}."
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
33
|
+
end
|
34
|
+
|
35
|
+
if entity.type == :document
|
36
|
+
if records.size == 1
|
37
|
+
self.do_unserialize(
|
38
|
+
records.first, options)
|
39
|
+
else
|
40
|
+
raise Treat::Exception,
|
41
|
+
"More than one document matched" +
|
42
|
+
"your selector #{selector.inspect}."
|
43
|
+
end
|
44
|
+
elsif entity.type == :collection
|
45
|
+
collection = Treat::Entities::Collection.new
|
38
46
|
records.each do |record|
|
39
|
-
|
47
|
+
collection << self.
|
40
48
|
do_unserialize(record, options)
|
41
49
|
end
|
42
|
-
|
50
|
+
collection
|
43
51
|
end
|
44
52
|
|
45
53
|
end
|
@@ -35,13 +35,13 @@ module Treat::Workers::Formatters::Unserializers::XML
|
|
35
35
|
|
36
36
|
id = nil; value = ''
|
37
37
|
attributes = {}
|
38
|
-
|
38
|
+
edges = []
|
39
39
|
|
40
40
|
unless xml_reader.attributes.size == 0
|
41
41
|
xml_reader.attributes.each_pair do |k,v|
|
42
42
|
if k == 'id'
|
43
43
|
id = v.to_i
|
44
|
-
elsif k == '
|
44
|
+
elsif k == 'edges'
|
45
45
|
a = v.split('--')
|
46
46
|
a.each do |b|
|
47
47
|
c = b.split(';')
|
@@ -54,7 +54,7 @@ module Treat::Workers::Formatters::Unserializers::XML
|
|
54
54
|
end
|
55
55
|
|
56
56
|
target, type, directed, direction = *vals
|
57
|
-
|
57
|
+
edges << [
|
58
58
|
target.to_i,
|
59
59
|
type,
|
60
60
|
(directed == 'true' ? true : false),
|
@@ -87,8 +87,8 @@ module Treat::Workers::Formatters::Unserializers::XML
|
|
87
87
|
end
|
88
88
|
current_element.features = attributes
|
89
89
|
current_element.features = attributes
|
90
|
-
|
91
|
-
target, type, directed, direction = *
|
90
|
+
edges.each do |edge|
|
91
|
+
target, type, directed, direction = *edge
|
92
92
|
current_element.link(target, type, directed, direction)
|
93
93
|
end
|
94
94
|
else
|
@@ -106,18 +106,18 @@ class Treat::Workers::Formatters::Visualizers::DOT
|
|
106
106
|
string << "\n#{entity.parent.id} -- #{entity.id};"
|
107
107
|
end
|
108
108
|
end
|
109
|
-
#
|
110
|
-
if entity.
|
111
|
-
entity.
|
109
|
+
# edges.
|
110
|
+
if entity.has_edges?
|
111
|
+
entity.edges.each do |edge|
|
112
112
|
dir = ''
|
113
|
-
if
|
114
|
-
dir =
|
113
|
+
if edge.directed == true
|
114
|
+
dir = edge.direction == 1 ? 'forward' : 'back'
|
115
115
|
dir = ",dir=#{dir}"
|
116
116
|
else
|
117
117
|
dir = ",dir=both"
|
118
118
|
end
|
119
|
-
string << "\n#{entity.id} -- #{
|
120
|
-
string << "[label=#{
|
119
|
+
string << "\n#{entity.id} -- #{edge.target}"
|
120
|
+
string << "[label=#{edge.type}#{dir}]"
|
121
121
|
end
|
122
122
|
end
|
123
123
|
# Recurse.
|
@@ -11,8 +11,9 @@ class Treat::Workers::Learners::Classifiers::ID3
|
|
11
11
|
|
12
12
|
if !@@classifiers[cl]
|
13
13
|
dec_tree = DecisionTree::ID3Tree.new(
|
14
|
-
cl.
|
15
|
-
|
14
|
+
cl.feature_labels.map { |l| l.to_s },
|
15
|
+
set.items.map { |i| i[:features]},
|
16
|
+
cl.question.default, cl.question.type)
|
16
17
|
dec_tree.train
|
17
18
|
@@classifiers[cl] = dec_tree
|
18
19
|
else
|
@@ -20,7 +21,7 @@ class Treat::Workers::Learners::Classifiers::ID3
|
|
20
21
|
dec_tree.graph('testingbitch')
|
21
22
|
end
|
22
23
|
dec_tree.predict(
|
23
|
-
cl.
|
24
|
+
cl.export_features(entity, false)
|
24
25
|
)
|
25
26
|
end
|
26
27
|
|
@@ -0,0 +1,53 @@
|
|
1
|
+
class Treat::Workers::Learners::Classifiers::Linear
|
2
|
+
|
3
|
+
require 'linear'
|
4
|
+
|
5
|
+
@@classifiers = {}
|
6
|
+
|
7
|
+
DefaultOptions = {
|
8
|
+
bias: 1,
|
9
|
+
eps: 0.1,
|
10
|
+
solver_type: MCSVM_CS
|
11
|
+
}
|
12
|
+
|
13
|
+
def self.classify(entity, options = {})
|
14
|
+
|
15
|
+
options = DefaultOptions.merge(options)
|
16
|
+
set = options[:training]
|
17
|
+
problem = set.problem
|
18
|
+
|
19
|
+
if !@@classifiers[problem]
|
20
|
+
labels = problem.question.labels
|
21
|
+
unless labels
|
22
|
+
raise Treat::Exception,
|
23
|
+
"LibLinear requires that you provide the possible " +
|
24
|
+
"labels to assign to classification items when " +
|
25
|
+
"specifying the question."
|
26
|
+
end
|
27
|
+
param = LParameter.new
|
28
|
+
param.solver_type = options[:solver_type]
|
29
|
+
param.eps = options[:eps]
|
30
|
+
bias = options[:bias]
|
31
|
+
data = set.items.map do |item|
|
32
|
+
self.array_to_hash(item[:features])
|
33
|
+
end
|
34
|
+
prob = LProblem.new(labels, data, bias)
|
35
|
+
@@classifiers[problem] =
|
36
|
+
LModel.new(prob, param)
|
37
|
+
end
|
38
|
+
|
39
|
+
@@classifiers[problem].predict(
|
40
|
+
self.array_to_hash(problem.
|
41
|
+
export_features(entity, false)))
|
42
|
+
|
43
|
+
end
|
44
|
+
|
45
|
+
def self.array_to_hash(array)
|
46
|
+
hash = {}
|
47
|
+
0.upto(array.length - 1) do |i|
|
48
|
+
hash[i] = array[i]
|
49
|
+
end
|
50
|
+
hash
|
51
|
+
end
|
52
|
+
|
53
|
+
end
|
@@ -11,11 +11,11 @@ class Treat::Workers::Learners::Classifiers::MLP
|
|
11
11
|
cl = set.problem
|
12
12
|
|
13
13
|
if !@@mlps[cl]
|
14
|
-
net = Ai4r::NeuralNetwork::
|
15
|
-
|
14
|
+
net = Ai4r::NeuralNetwork::Backpropagation.new(
|
15
|
+
[cl.feature_labels.size, 3, 1])
|
16
16
|
set.items.each do |item|
|
17
|
-
inputs = item[0..-2]
|
18
|
-
outputs = [item[-1]]
|
17
|
+
inputs = item[:features][0..-2]
|
18
|
+
outputs = [item[:features][-1]]
|
19
19
|
net.train(inputs, outputs)
|
20
20
|
end
|
21
21
|
@@mlps[cl] = net
|
@@ -23,7 +23,7 @@ class Treat::Workers::Learners::Classifiers::MLP
|
|
23
23
|
net = @@mlps[cl]
|
24
24
|
end
|
25
25
|
|
26
|
-
net.eval(cl.
|
26
|
+
net.eval(cl.export_features(entity, false))[0]
|
27
27
|
|
28
28
|
end
|
29
29
|
|
@@ -0,0 +1,31 @@
|
|
1
|
+
class Treat::Workers::Learners::Classifiers::SVM
|
2
|
+
|
3
|
+
require 'svm'
|
4
|
+
|
5
|
+
@@classifiers = {}
|
6
|
+
|
7
|
+
def self.classify(entity, options = {})
|
8
|
+
|
9
|
+
set = options[:training]
|
10
|
+
problem = set.problem
|
11
|
+
|
12
|
+
if !@@classifiers[problem]
|
13
|
+
labels = problem.question.labels
|
14
|
+
unless labels
|
15
|
+
raise Treat::Exception,
|
16
|
+
"LibSVM requires that you provide the possible " +
|
17
|
+
"labels to assign to classification items when " +
|
18
|
+
"specifying the question."
|
19
|
+
end
|
20
|
+
data = set.items.map { |item| item[:features] }
|
21
|
+
prob = Problem.new(labels, data)
|
22
|
+
param = Parameter.new(:kernel_type => LINEAR, :C => 10)
|
23
|
+
@@classifiers[problem] = Model.new(prob, param)
|
24
|
+
end
|
25
|
+
|
26
|
+
@@classifiers[problem].predict_probability(
|
27
|
+
problem.export_features(entity, false))[0]
|
28
|
+
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
@@ -53,8 +53,10 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
|
|
53
53
|
def self.init_tagger(language)
|
54
54
|
Treat::Loaders::Stanford.load(language)
|
55
55
|
model = StanfordCoreNLP::Config::Models[:pos][language]
|
56
|
-
|
57
|
-
|
56
|
+
model_path = Treat.libraries.stanford.model_path ||
|
57
|
+
Treat.paths.models + 'stanford/'
|
58
|
+
model = model_path + StanfordCoreNLP::
|
59
|
+
Config::ModelFolders[:pos] + model
|
58
60
|
@@taggers[language] ||=
|
59
61
|
StanfordCoreNLP::MaxentTagger.new(model)
|
60
62
|
end
|
@@ -38,7 +38,7 @@ module Treat::Workers::Processors::Parsers::Enju
|
|
38
38
|
val = entity.to_s
|
39
39
|
|
40
40
|
@@id_table = {}
|
41
|
-
@@
|
41
|
+
@@edges_table = {}
|
42
42
|
|
43
43
|
stdin, stdout = proc
|
44
44
|
text, remove_last = valid_text(val)
|
@@ -63,7 +63,7 @@ module Treat::Workers::Processors::Parsers::Enju
|
|
63
63
|
end
|
64
64
|
|
65
65
|
link_heads(entity)
|
66
|
-
|
66
|
+
add_edges(entity)
|
67
67
|
end
|
68
68
|
|
69
69
|
# Return the process running Enju.
|
@@ -102,12 +102,12 @@ module Treat::Workers::Processors::Parsers::Enju
|
|
102
102
|
pd = cd
|
103
103
|
next
|
104
104
|
end
|
105
|
-
# Get and format attributes and
|
105
|
+
# Get and format attributes and edges.
|
106
106
|
attributes = reader.attributes
|
107
107
|
id = attributes.delete('id')
|
108
|
-
new_attr = {};
|
108
|
+
new_attr = {}; edges = {}
|
109
109
|
unless attributes.size == 0
|
110
|
-
new_attr,
|
110
|
+
new_attr, edges =
|
111
111
|
cleanup_attributes(reader.name, attributes)
|
112
112
|
end
|
113
113
|
# Create the appropriate entity for the
|
@@ -117,17 +117,17 @@ module Treat::Workers::Processors::Parsers::Enju
|
|
117
117
|
when 'sentence'
|
118
118
|
entity = Treat::Entities::Sentence.new('')
|
119
119
|
@@id_table[id] = entity.id
|
120
|
-
@@
|
120
|
+
@@edges_table[entity.id] = edges
|
121
121
|
entity.features = new_attr
|
122
122
|
when 'cons'
|
123
123
|
entity = entity <<
|
124
124
|
Treat::Entities::Phrase.new('')
|
125
125
|
@@id_table[id] = entity.id
|
126
|
-
@@
|
126
|
+
@@edges_table[entity.id] = edges
|
127
127
|
entity.features = new_attr
|
128
128
|
when 'tok'
|
129
129
|
tmp_attributes = new_attr
|
130
|
-
|
130
|
+
tmp_edges = edges
|
131
131
|
else
|
132
132
|
current_value = reader.value.gsub(/\s+/, "")
|
133
133
|
unless current_value.size == 0
|
@@ -136,7 +136,7 @@ module Treat::Workers::Processors::Parsers::Enju
|
|
136
136
|
if entity.is_a?(Treat::Entities::Word)
|
137
137
|
entity.features = tmp_attributes
|
138
138
|
@@id_table[id] = entity.id
|
139
|
-
@@
|
139
|
+
@@edges_table[entity.id] = tmp_edges
|
140
140
|
else
|
141
141
|
# Do something useful here
|
142
142
|
entity.set :tag, 'SYM'
|
@@ -179,15 +179,15 @@ module Treat::Workers::Processors::Parsers::Enju
|
|
179
179
|
end
|
180
180
|
end
|
181
181
|
|
182
|
-
# Add
|
183
|
-
def self.
|
182
|
+
# Add edges a posteriori to a parsed entity.
|
183
|
+
def self.add_edges(entity2)
|
184
184
|
|
185
185
|
entity2.each_entity(:word, :phrase) do |entity|
|
186
|
-
@@
|
187
|
-
next if
|
186
|
+
@@edges_table.each_pair do |id, edges|
|
187
|
+
next if edges.nil?
|
188
188
|
entity = entity2.root.find(id)
|
189
189
|
next if entity.nil?
|
190
|
-
|
190
|
+
edges.each_pair do |argument, type|
|
191
191
|
# Skip this argument if we
|
192
192
|
# don't know the target node.
|
193
193
|
next if argument == 'unk'
|
@@ -205,7 +205,7 @@ module Treat::Workers::Processors::Parsers::Enju
|
|
205
205
|
def self.cleanup_attributes(name, attributes)
|
206
206
|
|
207
207
|
new_attr = {}
|
208
|
-
|
208
|
+
edges = {}
|
209
209
|
pred = attributes.delete('pred')
|
210
210
|
|
211
211
|
attributes.each_pair do |attribute2, value|
|
@@ -214,7 +214,7 @@ module Treat::Workers::Processors::Parsers::Enju
|
|
214
214
|
|
215
215
|
if attribute == 'arg1' ||
|
216
216
|
attribute == 'arg2'
|
217
|
-
|
217
|
+
edges[value] = pred
|
218
218
|
next
|
219
219
|
end
|
220
220
|
|
@@ -256,7 +256,7 @@ module Treat::Workers::Processors::Parsers::Enju
|
|
256
256
|
new_attr.delete :base
|
257
257
|
end
|
258
258
|
|
259
|
-
return new_attr,
|
259
|
+
return new_attr, edges
|
260
260
|
|
261
261
|
end
|
262
262
|
|
@@ -63,7 +63,9 @@ module Treat::Workers::Processors::Segmenters::Punkt
|
|
63
63
|
if options[:model]
|
64
64
|
model = options[:model]
|
65
65
|
else
|
66
|
-
|
66
|
+
model_path = Treat.libraries.punkt.model_path ||
|
67
|
+
Treat.paths.models + 'punkt/'
|
68
|
+
model = model_path + "#{lang}.yaml"
|
67
69
|
unless File.readable?(model)
|
68
70
|
raise Treat::Exception,
|
69
71
|
"Could not get the language model " +
|
data/spec/collection.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require_relative '
|
1
|
+
require_relative 'helper'
|
2
2
|
|
3
3
|
describe Treat::Entities::Collection do
|
4
4
|
|
@@ -29,7 +29,7 @@ describe Treat::Entities::Collection do
|
|
29
29
|
f = Treat.paths.spec + 'samples/test'
|
30
30
|
c = Treat::Entities::Collection.build(f)
|
31
31
|
c << Treat::Entities::Document.new
|
32
|
-
c.size.should eql
|
32
|
+
c.size.should eql 1
|
33
33
|
FileUtils.rm_rf(f)
|
34
34
|
end
|
35
35
|
end
|
@@ -45,7 +45,7 @@ describe Treat::Entities::Collection do
|
|
45
45
|
it "recursively searches the folder for " +
|
46
46
|
"files and opens them into a collection of documents" do
|
47
47
|
collection = Treat::Entities::Collection.build(@file)
|
48
|
-
collection.size.should eql
|
48
|
+
collection.size.should eql 5
|
49
49
|
end
|
50
50
|
|
51
51
|
end
|