rbbt-dm 1.1.48 → 1.1.53
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/network/paths.rb +1 -1
- data/lib/rbbt/tensorflow.rb +43 -0
- data/lib/rbbt/vector/model.rb +164 -65
- data/lib/rbbt/vector/model/spaCy.rb +76 -0
- data/lib/rbbt/vector/model/svm.rb +1 -1
- data/lib/rbbt/vector/model/tensorflow.rb +55 -0
- data/share/spaCy/cpu/textcat_accuracy.conf +86 -0
- data/share/spaCy/cpu/textcat_efficiency.conf +78 -0
- data/share/spaCy/gpu/textcat_accuracy.conf +84 -0
- data/share/spaCy/gpu/textcat_efficiency.conf +73 -0
- data/test/rbbt/network/test_paths.rb +1 -1
- data/test/rbbt/vector/model/test_spaCy.rb +121 -0
- data/test/rbbt/vector/model/test_tensorflow.rb +57 -0
- data/test/rbbt/vector/test_model.rb +354 -0
- metadata +15 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b00fd271a576dd8e92f3e24e863ce59128c39edba34c14c75b3f0414f45e2ccf
|
4
|
+
data.tar.gz: 19a2825592b122ab485abaffa432cf19a71afc6f9fc30d6e7a63793fc70de914
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b7b327a6de2ea159266ae41f38a8139e84552ece97b94bdab809dd26474be59b4a62456628257e788384e2cd2d5ea3c6d4f915dfbca1fb24fcad283d75c539ad
|
7
|
+
data.tar.gz: 53581538c5d4ac0a9ff7acda9039565f608caa04e5bfa5721f1e9efc29fb69ff7e1ff88de0379c8ca9be217feb4c273d44fefc5d30f901dcd6c723e0db28abab
|
data/lib/rbbt/network/paths.rb
CHANGED
@@ -32,7 +32,7 @@ module Paths
|
|
32
32
|
if end_node
|
33
33
|
end_node = end_node.select{|n| parents.keys.include? n}.first unless String === end_node
|
34
34
|
return nil if not parents.include? end_node
|
35
|
-
extract_path(parents, start_node,
|
35
|
+
extract_path(parents, start_node, end_node)
|
36
36
|
else
|
37
37
|
parents
|
38
38
|
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'rbbt/util/python'
|
2
|
+
|
3
|
+
module RbbtTensorflow
|
4
|
+
|
5
|
+
def self.init
|
6
|
+
RbbtPython.run do
|
7
|
+
pyimport "tensorflow", as: "tf"
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.test
|
12
|
+
|
13
|
+
mod = x_test = y_test = nil
|
14
|
+
RbbtPython.run do
|
15
|
+
|
16
|
+
mnist_db = tf.keras.datasets.mnist
|
17
|
+
|
18
|
+
(x_train, y_train), (x_test, y_test) = mnist_db.load_data()
|
19
|
+
x_train, x_test = x_train / 255.0, x_test / 255.0
|
20
|
+
|
21
|
+
mod = tf.keras.models.Sequential.new([
|
22
|
+
tf.keras.layers.Flatten.new(input_shape: [28, 28]),
|
23
|
+
tf.keras.layers.Dense.new(128, activation:'relu'),
|
24
|
+
tf.keras.layers.Dropout.new(0.2),
|
25
|
+
tf.keras.layers.Dense.new(10, activation:'softmax')
|
26
|
+
])
|
27
|
+
mod.compile(optimizer='adam',
|
28
|
+
loss='sparse_categorical_crossentropy',
|
29
|
+
metrics=['accuracy'])
|
30
|
+
mod.fit(x_train, y_train, epochs:1)
|
31
|
+
mod
|
32
|
+
end
|
33
|
+
|
34
|
+
RbbtPython.run do
|
35
|
+
mod.evaluate(x_test, y_test, verbose:2)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
if __FILE__ == $0
|
41
|
+
RbbtTensorflow.init
|
42
|
+
RbbtTensorflow.test
|
43
|
+
end
|
data/lib/rbbt/vector/model.rb
CHANGED
@@ -46,10 +46,10 @@ save(model, file='#{model_file}')
|
|
46
46
|
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
|
47
47
|
load(file="#{model_file}");
|
48
48
|
#{code}
|
49
|
-
cat(paste(label, sep="\\n"));
|
49
|
+
cat(paste(label, sep="\\n", collapse="\\n"));
|
50
50
|
EOF
|
51
|
-
|
52
|
-
res =
|
51
|
+
txt = io.read
|
52
|
+
res = txt.sub(/WARNING: .*?\n/s,'').split(/\s+/).collect{|l| l.to_f}
|
53
53
|
|
54
54
|
if list
|
55
55
|
res
|
@@ -60,13 +60,51 @@ cat(paste(label, sep="\\n"));
|
|
60
60
|
end
|
61
61
|
end
|
62
62
|
|
63
|
+
def __load_method(file)
|
64
|
+
code = Open.read(file)
|
65
|
+
code.sub!(/.*Proc\.new/, "Proc.new")
|
66
|
+
instance_eval code, file
|
67
|
+
end
|
68
|
+
|
63
69
|
def initialize(directory, extract_features = nil, train_model = nil, eval_model = nil)
|
64
70
|
@directory = directory
|
65
71
|
FileUtils.mkdir_p @directory unless File.exists? @directory
|
72
|
+
|
66
73
|
@model_file = File.join(@directory, "model")
|
67
|
-
|
68
|
-
|
69
|
-
|
74
|
+
@extract_features_file = File.join(@directory, "features")
|
75
|
+
@train_model_file = File.join(@directory, "train_model")
|
76
|
+
@eval_model_file = File.join(@directory, "eval_model")
|
77
|
+
@train_model_file_R = File.join(@directory, "train_model.R")
|
78
|
+
@eval_model_file_R = File.join(@directory, "eval_model.R")
|
79
|
+
|
80
|
+
if extract_features.nil?
|
81
|
+
if File.exists?(@extract_features_file)
|
82
|
+
@extract_features = __load_method @extract_features_file
|
83
|
+
end
|
84
|
+
else
|
85
|
+
@extract_features = extract_features
|
86
|
+
end
|
87
|
+
|
88
|
+
if train_model.nil?
|
89
|
+
if File.exists?(@train_model_file)
|
90
|
+
@train_model = __load_method @train_model_file
|
91
|
+
elsif File.exists?(@train_model_file_R)
|
92
|
+
@train_model = Open.read(@train_model_file_R)
|
93
|
+
end
|
94
|
+
else
|
95
|
+
@train_model = train_model
|
96
|
+
end
|
97
|
+
|
98
|
+
if eval_model.nil?
|
99
|
+
if File.exists?(@eval_model_file)
|
100
|
+
@eval_model = __load_method @eval_model_file
|
101
|
+
elsif File.exists?(@eval_model_file_R)
|
102
|
+
@eval_model = Open.read(@eval_model_file_R)
|
103
|
+
end
|
104
|
+
else
|
105
|
+
@eval_model = eval_model
|
106
|
+
end
|
107
|
+
|
70
108
|
@features = []
|
71
109
|
@labels = []
|
72
110
|
end
|
@@ -77,8 +115,47 @@ cat(paste(label, sep="\\n"));
|
|
77
115
|
end
|
78
116
|
|
79
117
|
def add(element, label = nil)
|
80
|
-
|
81
|
-
@
|
118
|
+
features = @extract_features ? extract_features.call(element) : element
|
119
|
+
@features << features
|
120
|
+
@labels << label
|
121
|
+
end
|
122
|
+
|
123
|
+
def add_list(elements, labels = nil)
|
124
|
+
if @extract_features.nil? || @extract_features.arity == 1
|
125
|
+
elements.zip(labels || [nil]).each do |elem,label|
|
126
|
+
add(elem, label)
|
127
|
+
end
|
128
|
+
else
|
129
|
+
features = @extract_features.call(nil, elements)
|
130
|
+
@features.concat features
|
131
|
+
@labels.concat labels if labels
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
def save_models
|
136
|
+
require 'method_source'
|
137
|
+
|
138
|
+
case
|
139
|
+
when Proc === train_model
|
140
|
+
begin
|
141
|
+
Open.write(@train_model_file, train_model.source)
|
142
|
+
rescue
|
143
|
+
end
|
144
|
+
when String === train_model
|
145
|
+
Open.write(@train_model_file_R, @train_model)
|
146
|
+
end
|
147
|
+
|
148
|
+
Open.write(@extract_features_file, @extract_features.source) if @extract_features
|
149
|
+
|
150
|
+
case
|
151
|
+
when Proc === eval_model
|
152
|
+
begin
|
153
|
+
Open.write(@eval_model_file, eval_model.source)
|
154
|
+
rescue
|
155
|
+
end
|
156
|
+
when String === eval_model
|
157
|
+
Open.write(@eval_model_file_R, eval_model)
|
158
|
+
end
|
82
159
|
end
|
83
160
|
|
84
161
|
def train
|
@@ -88,6 +165,7 @@ cat(paste(label, sep="\\n"));
|
|
88
165
|
when String === train_model
|
89
166
|
VectorModel.R_train(@model_file, @features, @labels, train_model)
|
90
167
|
end
|
168
|
+
save_models
|
91
169
|
end
|
92
170
|
|
93
171
|
def run(code)
|
@@ -96,99 +174,120 @@ cat(paste(label, sep="\\n"));
|
|
96
174
|
|
97
175
|
def eval(element)
|
98
176
|
case
|
99
|
-
when Proc === eval_model
|
100
|
-
eval_model.call(@model_file, extract_features.call(element), false)
|
101
|
-
when String === eval_model
|
102
|
-
VectorModel.R_eval(@model_file, extract_features.call(element), false, eval_model)
|
177
|
+
when Proc === @eval_model
|
178
|
+
@eval_model.call(@model_file, @extract_features.call(element), false)
|
179
|
+
when String === @eval_model
|
180
|
+
VectorModel.R_eval(@model_file, @extract_features.call(element), false, eval_model)
|
103
181
|
end
|
104
182
|
end
|
105
183
|
|
106
184
|
def eval_list(elements, extract = true)
|
185
|
+
|
186
|
+
if extract && ! @extract_features.nil?
|
187
|
+
features = if @extract_features.arity == 1
|
188
|
+
elements.collect{|element| @extract_features.call(element) }
|
189
|
+
else
|
190
|
+
@extract_features.call(nil, elements)
|
191
|
+
end
|
192
|
+
else
|
193
|
+
features = elements
|
194
|
+
end
|
195
|
+
|
107
196
|
case
|
108
197
|
when Proc === eval_model
|
109
|
-
eval_model.call(@model_file,
|
198
|
+
eval_model.call(@model_file, features, true)
|
110
199
|
when String === eval_model
|
111
|
-
|
200
|
+
VectorModel.R_eval(@model_file, features, true, eval_model)
|
112
201
|
end
|
113
202
|
end
|
114
203
|
|
115
|
-
def cross_validation(folds = 10)
|
116
|
-
|
117
|
-
|
118
|
-
|
204
|
+
#def cross_validation(folds = 10)
|
205
|
+
# saved_features = @features
|
206
|
+
# saved_labels = @labels
|
207
|
+
# seq = (0..features.length - 1).to_a
|
119
208
|
|
120
|
-
|
209
|
+
# chunk_size = features.length / folds
|
121
210
|
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
211
|
+
# acc = []
|
212
|
+
# folds.times do
|
213
|
+
# seq = seq.shuffle
|
214
|
+
# eval_chunk = seq[0..chunk_size]
|
215
|
+
# train_chunk = seq[chunk_size.. -1]
|
127
216
|
|
128
|
-
|
129
|
-
|
217
|
+
# eval_features = @features.values_at *eval_chunk
|
218
|
+
# eval_labels = @labels.values_at *eval_chunk
|
130
219
|
|
131
|
-
|
132
|
-
|
220
|
+
# @features = @features.values_at *train_chunk
|
221
|
+
# @labels = @labels.values_at *train_chunk
|
133
222
|
|
134
|
-
|
135
|
-
|
223
|
+
# train
|
224
|
+
# predictions = eval_list eval_features, false
|
136
225
|
|
137
|
-
|
226
|
+
# acc << predictions.zip(eval_labels).collect{|pred,lab| pred - lab < 0.5 ? 1 : 0}.inject(0){|acc,e| acc +=e} / chunk_size
|
138
227
|
|
139
|
-
|
140
|
-
|
141
|
-
|
228
|
+
# @features = saved_features
|
229
|
+
# @labels = saved_labels
|
230
|
+
# end
|
142
231
|
|
143
|
-
|
144
|
-
end
|
232
|
+
# acc
|
233
|
+
#end
|
145
234
|
|
146
235
|
def cross_validation(folds = 10)
|
147
236
|
|
148
237
|
res = TSV.setup({}, "Fold~TP,TN,FP,FN,P,R,F1#:type=:list")
|
149
238
|
|
150
|
-
|
151
|
-
|
239
|
+
orig_features = @features
|
240
|
+
orig_labels = @labels
|
152
241
|
|
153
|
-
|
242
|
+
begin
|
243
|
+
feature_folds = Misc.divide(@features, folds)
|
244
|
+
labels_folds = Misc.divide(@labels, folds)
|
154
245
|
|
155
|
-
|
156
|
-
train_set = feature_folds.values_at(*((0..9).to_a - [fix])).inject([]){|acc,e| acc += e; acc}
|
246
|
+
folds.times do |fix|
|
157
247
|
|
158
|
-
|
159
|
-
train_labels = labels_folds.values_at(*((0..9).to_a - [fix])).flatten
|
248
|
+
rest = (0..(folds-1)).to_a - [fix]
|
160
249
|
|
161
|
-
|
250
|
+
test_set = feature_folds[fix]
|
251
|
+
train_set = feature_folds.values_at(*rest).inject([]){|acc,e| acc += e; acc}
|
162
252
|
|
163
|
-
|
164
|
-
|
165
|
-
self.train
|
166
|
-
predictions = self.eval_list test_set, false
|
253
|
+
test_labels = labels_folds[fix]
|
254
|
+
train_labels = labels_folds.values_at(*rest).flatten
|
167
255
|
|
168
|
-
|
169
|
-
gs = gs.to_i
|
170
|
-
pred = pred > 0.5 ? 1 : 0
|
171
|
-
tp += 1 if gs == pred && gs == 1
|
172
|
-
tn += 1 if gs == pred && gs == 0
|
173
|
-
fp += 1 if gs == 0 && pred == 1
|
174
|
-
fn += 1 if gs == 1 && pred == 0
|
175
|
-
end
|
256
|
+
tp, fp, tn, fn, pr, re, f1 = [0, 0, 0, 0, nil, nil, nil]
|
176
257
|
|
177
|
-
|
178
|
-
|
258
|
+
@features = train_set
|
259
|
+
@labels = train_labels
|
260
|
+
self.train
|
261
|
+
predictions = self.eval_list test_set, false
|
179
262
|
|
180
|
-
|
181
|
-
re = tp.to_f / p
|
263
|
+
raise "Number of predictions (#{predictions.length}) and test labels (#{test_labels.length}) do not match" if predictions.length != test_labels.length
|
182
264
|
|
183
|
-
|
265
|
+
test_labels.zip(predictions).each do |gs,pred|
|
266
|
+
gs = gs.to_i
|
267
|
+
pred = pred > 0.5 ? 1 : 0
|
268
|
+
tp += 1 if gs == pred && gs == 1
|
269
|
+
tn += 1 if gs == pred && gs == 0
|
270
|
+
fp += 1 if gs == 0 && pred == 1
|
271
|
+
fn += 1 if gs == 1 && pred == 0
|
272
|
+
end
|
184
273
|
|
185
|
-
|
274
|
+
p = tp + fn
|
275
|
+
pp = tp + fp
|
186
276
|
|
187
|
-
|
277
|
+
pr = tp.to_f / pp
|
278
|
+
re = tp.to_f / p
|
188
279
|
|
189
|
-
|
190
|
-
|
280
|
+
f1 = (2.0 * tp) / (2.0 * tp + fp + fn)
|
281
|
+
|
282
|
+
Log.debug "CV Fold #{fix} P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
|
191
283
|
|
284
|
+
res[fix] = [tp,tn,fp,fn,pr,re,f1]
|
285
|
+
end
|
286
|
+
ensure
|
287
|
+
@features = orig_features
|
288
|
+
@labels = orig_labels
|
289
|
+
end
|
290
|
+
self.train
|
192
291
|
res
|
193
292
|
end
|
194
293
|
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
require 'rbbt/vector/model'
|
2
|
+
require 'rbbt/nlp/spaCy'
|
3
|
+
|
4
|
+
class SpaCyModel < VectorModel
|
5
|
+
attr_accessor :config
|
6
|
+
|
7
|
+
def spacy(&block)
|
8
|
+
RbbtPython.run "spacy" do
|
9
|
+
RbbtPython.module_eval(&block)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
def initialize(dir, config, lang = 'en_core_web_md')
|
14
|
+
@config = case
|
15
|
+
when Path === config
|
16
|
+
config.read
|
17
|
+
when Misc.is_filename?(config)
|
18
|
+
Open.read(config)
|
19
|
+
when (Misc.is_filename?(config, false) && Rbbt.share.spaCy.cpu[config].exists?)
|
20
|
+
Rbbt.share.spaCy.cpu[config].read
|
21
|
+
when (Misc.is_filename?(config, false) && Rbbt.share.spaCy[config].exists?)
|
22
|
+
Rbbt.share.spaCy[config].read
|
23
|
+
else
|
24
|
+
config
|
25
|
+
end
|
26
|
+
@lang = lang
|
27
|
+
|
28
|
+
super(dir)
|
29
|
+
|
30
|
+
@train_model = Proc.new do |file, features, labels|
|
31
|
+
texts = features
|
32
|
+
docs = []
|
33
|
+
tmpconfig = File.join(file, 'config')
|
34
|
+
tmptrain = File.join(file, 'train.spacy')
|
35
|
+
SpaCy.config(@config, tmpconfig)
|
36
|
+
spacy do
|
37
|
+
nlp = SpaCy.nlp(lang)
|
38
|
+
docs = []
|
39
|
+
RbbtPython.iterate nlp.pipe(texts.zip(labels), as_tuples: true), :bar => "Training documents into spacy format" do |doc,label|
|
40
|
+
if %w(1 true pos).include?(label.to_s.downcase)
|
41
|
+
doc.cats["positive"] = 1
|
42
|
+
doc.cats["negative"] = 0
|
43
|
+
else
|
44
|
+
doc.cats["positive"] = 0
|
45
|
+
doc.cats["negative"] = 1
|
46
|
+
end
|
47
|
+
docs << doc
|
48
|
+
end
|
49
|
+
|
50
|
+
doc_bin = spacy.tokens.DocBin.new(docs: docs)
|
51
|
+
doc_bin.to_disk(tmptrain)
|
52
|
+
end
|
53
|
+
|
54
|
+
gpu = Rbbt::Config.get('gpu_id', :spacy, :spacy_train)
|
55
|
+
CMD.cmd_log(:spacy, "train #{tmpconfig} --output #{file} --paths.train #{tmptrain} --paths.dev #{tmptrain}", "--gpu-id" => gpu)
|
56
|
+
end
|
57
|
+
|
58
|
+
@eval_model = Proc.new do |file, features|
|
59
|
+
texts = features
|
60
|
+
|
61
|
+
docs = []
|
62
|
+
spacy do
|
63
|
+
nlp = spacy.load("#{file}/model-best")
|
64
|
+
|
65
|
+
Log::ProgressBar.with_bar texts.length, :desc => "Evaluating documents" do |bar|
|
66
|
+
texts.collect do |text|
|
67
|
+
cats = nlp.(text).cats
|
68
|
+
bar.tick
|
69
|
+
cats['positive'] > cats['negative'] ? 1 : 0
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|