treat 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +1 -1
- data/lib/treat/config/core/acronyms.rb +2 -1
- data/lib/treat/config/libraries/punkt.rb +1 -0
- data/lib/treat/config/libraries/reuters.rb +1 -0
- data/lib/treat/core/data_set.rb +125 -66
- data/lib/treat/core/export.rb +59 -0
- data/lib/treat/core/problem.rb +101 -18
- data/lib/treat/core/question.rb +23 -7
- data/lib/treat/entities/abilities/iterable.rb +7 -3
- data/lib/treat/entities/abilities/stringable.rb +5 -5
- data/lib/treat/entities/collection.rb +10 -6
- data/lib/treat/entities/entity.rb +1 -1
- data/lib/treat/helpers/objtohash.rb +8 -0
- data/lib/treat/loaders/stanford.rb +10 -8
- data/lib/treat/version.rb +1 -1
- data/lib/treat/workers/formatters/serializers/mongo.rb +2 -2
- data/lib/treat/workers/formatters/serializers/xml.rb +7 -7
- data/lib/treat/workers/formatters/unserializers/mongo.rb +16 -8
- data/lib/treat/workers/formatters/unserializers/xml.rb +5 -5
- data/lib/treat/workers/formatters/visualizers/dot.rb +7 -7
- data/lib/treat/workers/learners/classifiers/id3.rb +4 -3
- data/lib/treat/workers/learners/classifiers/linear.rb +53 -0
- data/lib/treat/workers/learners/classifiers/mlp.rb +5 -5
- data/lib/treat/workers/learners/classifiers/svm.rb +31 -0
- data/lib/treat/workers/lexicalizers/taggers/stanford.rb +4 -2
- data/lib/treat/workers/processors/parsers/enju.rb +17 -17
- data/lib/treat/workers/processors/segmenters/punkt.rb +3 -1
- data/spec/collection.rb +3 -3
- data/spec/core.rb +430 -21
- data/spec/document.rb +1 -1
- data/spec/entity.rb +2 -8
- data/spec/helper.rb +34 -0
- data/spec/phrase.rb +1 -1
- data/spec/sandbox.rb +31 -8
- data/spec/token.rb +1 -1
- data/spec/treat.rb +1 -1
- data/spec/word.rb +1 -1
- data/spec/zone.rb +1 -1
- metadata +9 -8
- data/files/3_2_release_notes.html +0 -766
- data/files/bc-monty-robinson-sentencing.html +0 -1569
- data/files/syria-aleppo-clashes.html +0 -1376
- data/lib/treat/core/feature.rb +0 -42
- data/lib/treat/core/node.rb +0 -251
- data/spec/node.rb +0 -117
data/lib/treat/workers/formatters/serializers/xml.rb CHANGED

@@ -33,14 +33,14 @@ class Treat::Workers::Formatters::Serializers::XML
           end
         end
         ############ To be refactored
-        unless entity.
-          attributes << "
+        unless entity.edges.empty?
+          attributes << "edges='"
           a = []
-          entity.
-            a << ("{target: #{
-            "type: #{
-            "directed: #{
-            "direction: #{
+          entity.edges.each do |edge|
+            a << ("{target: #{edge.target}, "+
+            "type: #{edge.type}, " +
+            "directed: #{edge.directed}, " +
+            "direction: #{edge.direction}}" )
          end
          # Structs.
          attributes << a.join(',') + "'"
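The remaining hunks in this release follow the same rename: an entity's graph links are now exposed as `edges`, each carrying `target`, `type`, `directed` and `direction` fields, and the serializers, unserializers, DOT visualizer and Enju parser below are all updated to read and write that structure.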
data/lib/treat/workers/formatters/unserializers/mongo.rb CHANGED

@@ -22,7 +22,7 @@ module Treat::Workers::Formatters::Unserializers::Mongo
       entity.type.to_s.capitalize.intern).superclass).downcase
     supertype = entity.type.to_s if supertype == 'entity'
     supertypes = supertype + 's'
-
+    supertypes = 'documents' if entity.type == :collection
     coll = @@database.collection(supertypes)
     records = coll.find(selector).to_a

@@ -30,16 +30,24 @@ module Treat::Workers::Formatters::Unserializers::Mongo
       raise Treat::Exception,
       "Couldn't find any records using " +
       "selector #{selector.inspect}."
-
-
-
-
-
+    end
+
+    if entity.type == :document
+      if records.size == 1
+        self.do_unserialize(
+        records.first, options)
+      else
+        raise Treat::Exception,
+        "More than one document matched" +
+        "your selector #{selector.inspect}."
+      end
+    elsif entity.type == :collection
+      collection = Treat::Entities::Collection.new
       records.each do |record|
-
+        collection << self.
        do_unserialize(record, options)
      end
-
+      collection
    end

  end
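In short, the Mongo unserializer now treats `:document` and `:collection` lookups differently: a document selector must match exactly one record (otherwise a Treat::Exception is raised), while a collection selector unserializes every matching record into a fresh Treat::Entities::Collection.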
data/lib/treat/workers/formatters/unserializers/xml.rb CHANGED

@@ -35,13 +35,13 @@ module Treat::Workers::Formatters::Unserializers::XML

     id = nil; value = ''
     attributes = {}
-
+    edges = []

     unless xml_reader.attributes.size == 0
       xml_reader.attributes.each_pair do |k,v|
         if k == 'id'
           id = v.to_i
-        elsif k == '
+        elsif k == 'edges'
           a = v.split('--')
           a.each do |b|
             c = b.split(';')

@@ -54,7 +54,7 @@ module Treat::Workers::Formatters::Unserializers::XML
           end

           target, type, directed, direction = *vals
-
+          edges << [
             target.to_i,
             type,
             (directed == 'true' ? true : false),

@@ -87,8 +87,8 @@ module Treat::Workers::Formatters::Unserializers::XML
       end
       current_element.features = attributes
       current_element.features = attributes
-
-      target, type, directed, direction = *
+      edges.each do |edge|
+        target, type, directed, direction = *edge
         current_element.link(target, type, directed, direction)
       end
     else
data/lib/treat/workers/formatters/visualizers/dot.rb CHANGED

@@ -106,18 +106,18 @@ class Treat::Workers::Formatters::Visualizers::DOT
         string << "\n#{entity.parent.id} -- #{entity.id};"
       end
     end
-    #
-    if entity.
-      entity.
+    # edges.
+    if entity.has_edges?
+      entity.edges.each do |edge|
         dir = ''
-        if
-          dir =
+        if edge.directed == true
+          dir = edge.direction == 1 ? 'forward' : 'back'
           dir = ",dir=#{dir}"
         else
           dir = ",dir=both"
         end
-        string << "\n#{entity.id} -- #{
-        string << "[label=#{
+        string << "\n#{entity.id} -- #{edge.target}"
+        string << "[label=#{edge.type}#{dir}]"
       end
     end
     # Recurse.
data/lib/treat/workers/learners/classifiers/id3.rb CHANGED

@@ -11,8 +11,9 @@ class Treat::Workers::Learners::Classifiers::ID3

     if !@@classifiers[cl]
       dec_tree = DecisionTree::ID3Tree.new(
-        cl.
-
+        cl.feature_labels.map { |l| l.to_s },
+        set.items.map { |i| i[:features]},
+        cl.question.default, cl.question.type)
       dec_tree.train
       @@classifiers[cl] = dec_tree
     else

@@ -20,7 +21,7 @@ class Treat::Workers::Learners::Classifiers::ID3
       dec_tree.graph('testingbitch')
     end
     dec_tree.predict(
-      cl.
+      cl.export_features(entity, false)
     )
   end

data/lib/treat/workers/learners/classifiers/linear.rb ADDED

@@ -0,0 +1,53 @@
+class Treat::Workers::Learners::Classifiers::Linear
+
+  require 'linear'
+
+  @@classifiers = {}
+
+  DefaultOptions = {
+    bias: 1,
+    eps: 0.1,
+    solver_type: MCSVM_CS
+  }
+
+  def self.classify(entity, options = {})
+
+    options = DefaultOptions.merge(options)
+    set = options[:training]
+    problem = set.problem
+
+    if !@@classifiers[problem]
+      labels = problem.question.labels
+      unless labels
+        raise Treat::Exception,
+        "LibLinear requires that you provide the possible " +
+        "labels to assign to classification items when " +
+        "specifying the question."
+      end
+      param = LParameter.new
+      param.solver_type = options[:solver_type]
+      param.eps = options[:eps]
+      bias = options[:bias]
+      data = set.items.map do |item|
+        self.array_to_hash(item[:features])
+      end
+      prob = LProblem.new(labels, data, bias)
+      @@classifiers[problem] =
+      LModel.new(prob, param)
+    end
+
+    @@classifiers[problem].predict(
+    self.array_to_hash(problem.
+    export_features(entity, false)))
+
+  end
+
+  def self.array_to_hash(array)
+    hash = {}
+    0.upto(array.length - 1) do |i|
+      hash[i] = array[i]
+    end
+    hash
+  end
+
+end
data/lib/treat/workers/learners/classifiers/mlp.rb CHANGED

@@ -11,11 +11,11 @@ class Treat::Workers::Learners::Classifiers::MLP
     cl = set.problem

     if !@@mlps[cl]
-      net = Ai4r::NeuralNetwork::
-
+      net = Ai4r::NeuralNetwork::Backpropagation.new(
+      [cl.feature_labels.size, 3, 1])
       set.items.each do |item|
-        inputs = item[0..-2]
-        outputs = [item[-1]]
+        inputs = item[:features][0..-2]
+        outputs = [item[:features][-1]]
         net.train(inputs, outputs)
       end
       @@mlps[cl] = net

@@ -23,7 +23,7 @@ class Treat::Workers::Learners::Classifiers::MLP
       net = @@mlps[cl]
     end

-    net.eval(cl.
+    net.eval(cl.export_features(entity, false))[0]

   end

data/lib/treat/workers/learners/classifiers/svm.rb ADDED

@@ -0,0 +1,31 @@
+class Treat::Workers::Learners::Classifiers::SVM
+
+  require 'svm'
+
+  @@classifiers = {}
+
+  def self.classify(entity, options = {})
+
+    set = options[:training]
+    problem = set.problem
+
+    if !@@classifiers[problem]
+      labels = problem.question.labels
+      unless labels
+        raise Treat::Exception,
+        "LibSVM requires that you provide the possible " +
+        "labels to assign to classification items when " +
+        "specifying the question."
+      end
+      data = set.items.map { |item| item[:features] }
+      prob = Problem.new(labels, data)
+      param = Parameter.new(:kernel_type => LINEAR, :C => 10)
+      @@classifiers[problem] = Model.new(prob, param)
+    end
+
+    @@classifiers[problem].predict_probability(
+    problem.export_features(entity, false))[0]
+
+  end
+
+end
data/lib/treat/workers/lexicalizers/taggers/stanford.rb CHANGED

@@ -53,8 +53,10 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
   def self.init_tagger(language)
     Treat::Loaders::Stanford.load(language)
     model = StanfordCoreNLP::Config::Models[:pos][language]
-
-
+    model_path = Treat.libraries.stanford.model_path ||
+    Treat.paths.models + 'stanford/'
+    model = model_path + StanfordCoreNLP::
+    Config::ModelFolders[:pos] + model
     @@taggers[language] ||=
     StanfordCoreNLP::MaxentTagger.new(model)
   end
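The tagger now resolves its POS model from `Treat.libraries.stanford.model_path`, falling back to `Treat.paths.models + 'stanford/'`. A hypothetical configuration (the setting name comes from the hunk above; the directory is made up):

    # Point Treat at a custom Stanford model directory before tagging.
    Treat.libraries.stanford.model_path = '/opt/stanford-models/'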
data/lib/treat/workers/processors/parsers/enju.rb CHANGED

@@ -38,7 +38,7 @@ module Treat::Workers::Processors::Parsers::Enju
     val = entity.to_s

     @@id_table = {}
-    @@
+    @@edges_table = {}

     stdin, stdout = proc
     text, remove_last = valid_text(val)

@@ -63,7 +63,7 @@ module Treat::Workers::Processors::Parsers::Enju
     end

     link_heads(entity)
-
+    add_edges(entity)
   end

   # Return the process running Enju.

@@ -102,12 +102,12 @@ module Treat::Workers::Processors::Parsers::Enju
         pd = cd
         next
       end
-      # Get and format attributes and
+      # Get and format attributes and edges.
       attributes = reader.attributes
       id = attributes.delete('id')
-      new_attr = {};
+      new_attr = {}; edges = {}
       unless attributes.size == 0
-        new_attr,
+        new_attr, edges =
         cleanup_attributes(reader.name, attributes)
       end
       # Create the appropriate entity for the

@@ -117,17 +117,17 @@ module Treat::Workers::Processors::Parsers::Enju
       when 'sentence'
         entity = Treat::Entities::Sentence.new('')
         @@id_table[id] = entity.id
-        @@
+        @@edges_table[entity.id] = edges
         entity.features = new_attr
       when 'cons'
         entity = entity <<
         Treat::Entities::Phrase.new('')
         @@id_table[id] = entity.id
-        @@
+        @@edges_table[entity.id] = edges
         entity.features = new_attr
       when 'tok'
         tmp_attributes = new_attr
-
+        tmp_edges = edges
       else
         current_value = reader.value.gsub(/\s+/, "")
         unless current_value.size == 0

@@ -136,7 +136,7 @@ module Treat::Workers::Processors::Parsers::Enju
           if entity.is_a?(Treat::Entities::Word)
             entity.features = tmp_attributes
             @@id_table[id] = entity.id
-            @@
+            @@edges_table[entity.id] = tmp_edges
           else
             # Do something useful here
             entity.set :tag, 'SYM'

@@ -179,15 +179,15 @@ module Treat::Workers::Processors::Parsers::Enju
     end
   end

-  # Add
-  def self.
+  # Add edges a posteriori to a parsed entity.
+  def self.add_edges(entity2)

     entity2.each_entity(:word, :phrase) do |entity|
-      @@
-      next if
+      @@edges_table.each_pair do |id, edges|
+        next if edges.nil?
         entity = entity2.root.find(id)
         next if entity.nil?
-
+        edges.each_pair do |argument, type|
           # Skip this argument if we
           # don't know the target node.
           next if argument == 'unk'

@@ -205,7 +205,7 @@ module Treat::Workers::Processors::Parsers::Enju
   def self.cleanup_attributes(name, attributes)

     new_attr = {}
-
+    edges = {}
     pred = attributes.delete('pred')

     attributes.each_pair do |attribute2, value|

@@ -214,7 +214,7 @@ module Treat::Workers::Processors::Parsers::Enju

       if attribute == 'arg1' ||
       attribute == 'arg2'
-
+        edges[value] = pred
         next
       end

@@ -256,7 +256,7 @@ module Treat::Workers::Processors::Parsers::Enju
       new_attr.delete :base
     end

-    return new_attr,
+    return new_attr, edges

   end

data/lib/treat/workers/processors/segmenters/punkt.rb CHANGED

@@ -63,7 +63,9 @@ module Treat::Workers::Processors::Segmenters::Punkt
     if options[:model]
       model = options[:model]
     else
-
+      model_path = Treat.libraries.punkt.model_path ||
+      Treat.paths.models + 'punkt/'
+      model = model_path + "#{lang}.yaml"
       unless File.readable?(model)
         raise Treat::Exception,
         "Could not get the language model " +
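Like the Stanford tagger above, the Punkt segmenter now looks up its trained model under `Treat.libraries.punkt.model_path`, defaulting to `Treat.paths.models + 'punkt/'` and expecting a `#{lang}.yaml` file there. A hypothetical override (the path is made up):

    # Use Punkt models stored outside the default models directory.
    Treat.libraries.punkt.model_path = '/usr/local/share/punkt/'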
data/spec/collection.rb CHANGED

@@ -1,4 +1,4 @@
-require_relative '
+require_relative 'helper'

 describe Treat::Entities::Collection do

@@ -29,7 +29,7 @@ describe Treat::Entities::Collection do
     f = Treat.paths.spec + 'samples/test'
     c = Treat::Entities::Collection.build(f)
     c << Treat::Entities::Document.new
-    c.size.should eql
+    c.size.should eql 1
     FileUtils.rm_rf(f)
   end
 end

@@ -45,7 +45,7 @@ describe Treat::Entities::Collection do
   it "recursively searches the folder for " +
   "files and opens them into a collection of documents" do
     collection = Treat::Entities::Collection.build(@file)
-    collection.size.should eql
+    collection.size.should eql 5
   end

 end