rbbt-dm 1.1.50 → 1.1.54
- checksums.yaml +4 -4
- data/lib/rbbt/tensorflow.rb +1 -1
- data/lib/rbbt/vector/model/random_forest.rb +26 -0
- data/lib/rbbt/vector/model/spaCy.rb +8 -4
- data/lib/rbbt/vector/model/svm.rb +3 -3
- data/lib/rbbt/vector/model/tensorflow.rb +1 -1
- data/lib/rbbt/vector/model.rb +137 -48
- data/share/spaCy/cpu/textcat_accuracy.conf +86 -0
- data/share/spaCy/cpu/textcat_efficiency.conf +78 -0
- data/share/spaCy/gpu/textcat_accuracy.conf +84 -0
- data/share/spaCy/gpu/textcat_efficiency.conf +73 -0
- data/test/rbbt/vector/model/test_spaCy.rb +34 -1
- data/test/rbbt/vector/model/test_svm.rb +3 -3
- data/test/rbbt/vector/model/test_tensorflow.rb +3 -3
- data/test/rbbt/vector/test_model.rb +162 -77
- metadata +7 -2
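The main API change in this release is that VectorModel gains optional feature names and factor levels, which are persisted alongside the model and forwarded to the R training and evaluation scripts. A minimal sketch of the new usage, following the patterns exercised in data/test/rbbt/vector/test_model.rb below (the directory and data here are made up):

require 'rbbt/vector/model'

model = VectorModel.new("/tmp/vector_model")        # hypothetical directory
model.names = %w(Var1 Var2 Var3 Factor)             # column names for the R data.frame (saved to feature_names)
model.factor_levels = {"Factor" => %w(f1 f2)}       # explicit factor levels (saved as YAML to levels)

model.extract_features = Proc.new{|element,list|
  if element
    element.split(";")
  elsif list
    list.collect{|e| e.split(";") }
  end
}

model.train_model =<<-EOF
rbbt.require('randomForest')
model = randomForest(as.factor(label) ~ ., data = features)
EOF

model.eval_model =<<-EOF
rbbt.require('randomForest')
label = predict(model, features);
EOF

{"0;1;0;f1" => "0", "1;0;0;f1" => "0", "0;1;1;f2" => "1", "1;0;1;f2" => "1"}.each do |features, label|
  model.add features, label
end

model.train                  # names and factor levels are passed through to R_train
model.eval "1;1;1;f2"        # => predicted label, e.g. "1"
model.cross_validation(2)    # per-fold TP,TN,FP,FN,P,R,F1 (averaged P,R,F1 when more than two classes)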
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 40ee19cdf9fd742bfa844428a16e61f398f76d53ca0bfdda5499f5bb03db1c2b
+  data.tar.gz: 7be309bf582ce9e547cf316a36b88ad3043fb0bdc9cd5d5590d507f0f78a7a71
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 801e3c9a8541b3b87b12b961c31c4932c17b36db6c9d65dd9a88ef1adb5811188d79570099cf96a1dea3d71d6323edfe2707fd46ebe9e9faa96f38ade84684eb
+  data.tar.gz: e1955147d51d34595cf169e818da48584cf9f1597dd2a714d91de88cf90846e9845f3147715ea95769cfb453adf170879c926e02419ff1437b95968cf9bf9ffc
data/lib/rbbt/tensorflow.rb
CHANGED

data/lib/rbbt/vector/model/random_forest.rb
ADDED

@@ -0,0 +1,26 @@
+require 'rbbt/vector/model'
+class RFModel < VectorModel
+  def initialize(dir)
+    super(dir)
+
+    @extract_features = Proc.new{|element|
+      element
+    }
+
+    @train_model =<<-EOF
+rbbt.require("randomForest");
+model = randomForest(as.factor(label) ~ ., data = features);
+    EOF
+
+    @eval_model =<<-EOF
+rbbt.require("randomForest");
+pred = names(model$forest$xlevels)
+for (p in pred) {
+  if (class(features[[p]]) == "factor") {
+    features[[p]] = factor(features[[p]], levels=model$forest$xlevels[[p]])
+  }
+}
+label = predict(model, features);
+    EOF
+  end
+end
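RFModel above is a thin VectorModel subclass whose embedded R scripts call randomForest; its default extract_features passes feature vectors through unchanged. A usage sketch, assuming the rbbt R helpers and the randomForest package are installed (directory and data are made up):

require 'rbbt/vector/model/random_forest'

model = RFModel.new("/tmp/rf_model")        # hypothetical model directory
model.names = %w(Var1 Var2 Var3)            # optional: column names for the R data.frame

model.add [0, 1, 1], "1"
model.add [1, 0, 1], "1"
model.add [0, 0, 0], "0"
model.add [1, 0, 0], "0"

model.train                                 # runs randomForest(as.factor(label) ~ ., data = features) in R
model.eval [1, 1, 1]                        # => predicted label as a string, e.g. "1"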

data/lib/rbbt/vector/model/spaCy.rb
CHANGED

@@ -51,7 +51,8 @@ class SpaCyModel < VectorModel
         doc_bin.to_disk(tmptrain)
       end
 
-
+      gpu = Rbbt::Config.get('gpu_id', :spacy, :spacy_train)
+      CMD.cmd_log(:spacy, "train #{tmpconfig} --output #{file} --paths.train #{tmptrain} --paths.dev #{tmptrain}", "--gpu-id" => gpu)
     end
 
     @eval_model = Proc.new do |file, features|

@@ -61,9 +62,12 @@ class SpaCyModel < VectorModel
       spacy do
         nlp = spacy.load("#{file}/model-best")
 
-        texts.
-
-
+        Log::ProgressBar.with_bar texts.length, :desc => "Evaluating documents" do |bar|
+          texts.collect do |text|
+            cats = nlp.(text).cats
+            bar.tick
+            cats['positive'] > cats['negative'] ? 1 : 0
+          end
         end
       end
     end

data/lib/rbbt/vector/model/svm.rb
CHANGED

@@ -8,12 +8,12 @@ class SVMModel < VectorModel
     }
 
     @train_model =<<-EOF
-
-model = svm(as.factor(
+rbbt.require('e1071');
+model = svm(as.factor(label) ~ ., data = features);
     EOF
 
     @eval_model =<<-EOF
-
+rbbt.require('e1071');
 label = predict(model, features);
     EOF
   end

data/lib/rbbt/vector/model/tensorflow.rb
CHANGED

@@ -32,7 +32,7 @@ class TensorFlowModel < VectorModel
     end
     @graph ||= keras_graph
     @graph.compile(**@compile_options)
-    @graph.fit(features, labels, :epochs => @epochs, :verbose =>
+    @graph.fit(features, labels, :epochs => @epochs, :verbose => true)
     @graph.save(file)
   end
|
data/lib/rbbt/vector/model.rb
CHANGED
@@ -2,54 +2,85 @@ require 'rbbt/util/R'
 
 class VectorModel
   attr_accessor :directory, :model_file, :extract_features, :train_model, :eval_model
-  attr_accessor :features, :labels
+  attr_accessor :features, :names, :labels, :factor_levels
 
-  def self.R_run(model_file, features, labels, code)
+  def self.R_run(model_file, features, labels, code, names = nil, factor_levels = nil)
    TmpFile.with_file do |feature_file|
      Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
-      Open.write(feature_file + '.
+      Open.write(feature_file + '.label', labels * "\n" + "\n")
+      Open.write(feature_file + '.names', names * "\n" + "\n") if names
+
+
+      what = case labels.first
+             when Numeric, Integer, Float
+               'numeric()'
+             else
+               'character()'
+             end
 
      R.run <<-EOF
-features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=
-
-
+features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
+#{"names(features) = make.names(readLines('#{feature_file + '.names'}'))" if names }
+#{ factor_levels.collect do |name,levels|
+  "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
+end * "\n" if factor_levels }
+labels = scan("#{ feature_file }.label", what=#{what});
+features = cbind(features, label = labels);
 #{code}
 EOF
    end
  end
 
-  def self.R_train(model_file, features, labels, code)
+  def self.R_train(model_file, features, labels, code, names = nil, factor_levels = nil)
    TmpFile.with_file do |feature_file|
      Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
-      Open.write(feature_file + '.
+      Open.write(feature_file + '.label', labels * "\n" + "\n")
+      Open.write(feature_file + '.names', names * "\n" + "\n") if names
+
+      what = case labels.first
+             when Numeric, Integer, Float
+               'numeric()'
+             else
+               'character()'
+             end
 
      R.run <<-EOF
-features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=
-labels = scan("#{ feature_file }.
-features =
+features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
+labels = scan("#{ feature_file }.label", what=#{what});
+#{"names(features) = make.names(readLines('#{feature_file + '.names'}'))" if names }
+features = cbind(features, label = labels);
+#{ factor_levels.collect do |name,levels|
+  "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
+end * "\n" if factor_levels }
 #{code}
 save(model, file='#{model_file}')
 EOF
    end
  end
 
-  def self.R_eval(model_file, features, list, code)
+  def self.R_eval(model_file, features, list, code, names = nil, factor_levels = nil)
    TmpFile.with_file do |feature_file|
+      if list
+        Open.write(feature_file, features.collect{|feat| feat * "\t"} * "\n" + "\n")
+      else
+        Open.write(feature_file, features * "\t" + "\n")
+      end
+      Open.write(feature_file + '.names', names * "\n" + "\n") if names
+
      TmpFile.with_file do |results|
-        if list
-          Open.write(feature_file, features.collect{|feat| feat * "\t"} * "\n" + "\n")
-        else
-          Open.write(feature_file, features * "\t" + "\n")
-        end
 
        io = R.run <<-EOF
-features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=
+features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
+#{"names(features) = make.names(readLines('#{feature_file + '.names'}'))" if names }
+#{ factor_levels.collect do |name,levels|
+  "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
+end * "\n" if factor_levels }
 load(file="#{model_file}");
 #{code}
 cat(paste(label, sep="\\n", collapse="\\n"));
 EOF
        txt = io.read
-        res = txt.sub(/WARNING: .*?\n/s,'').split(/\s+/)
+        res = txt.sub(/WARNING: .*?\n/s,'').split(/\s+/)
 
        if list
          res

@@ -66,7 +97,7 @@ cat(paste(label, sep="\\n", collapse="\\n"));
    instance_eval code, file
  end
 
-  def initialize(directory, extract_features = nil, train_model = nil, eval_model = nil)
+  def initialize(directory, extract_features = nil, train_model = nil, eval_model = nil, names = nil, factor_levels = nil)
    @directory = directory
    FileUtils.mkdir_p @directory unless File.exists? @directory
 

@@ -76,6 +107,8 @@ cat(paste(label, sep="\\n", collapse="\\n"));
    @eval_model_file = File.join(@directory, "eval_model")
    @train_model_file_R = File.join(@directory, "train_model.R")
    @eval_model_file_R = File.join(@directory, "eval_model.R")
+    @names_file = File.join(@directory, "feature_names")
+    @levels_file = File.join(@directory, "levels")
 
    if extract_features.nil?
      if File.exists?(@extract_features_file)

@@ -105,6 +138,22 @@ cat(paste(label, sep="\\n", collapse="\\n"));
      @eval_model = eval_model
    end
 
+    if names.nil?
+      if File.exists?(@names_file)
+        @names = Open.read(@names_file).split("\n")
+      end
+    else
+      @extract_features = names
+    end
+
+    if factor_levels.nil?
+      if File.exists?(@levels_file)
+        @factor_levels = YAML.load(Open.read(@levels_file))
+      end
+    else
+      @factor_levels = factor_levels
+    end
+
    @features = []
    @labels = []
  end

@@ -156,28 +205,31 @@ cat(paste(label, sep="\\n", collapse="\\n"));
    when String === eval_model
      Open.write(@eval_model_file_R, eval_model)
    end
+
+    Open.write(@levels_file, @factor_levels.to_yaml) if @factor_levels
+    Open.write(@names_file, @names * "\n" + "\n") if @names
  end
 
  def train
    case
    when Proc === train_model
-      train_model.call(@model_file, @features, @labels)
+      train_model.call(@model_file, @features, @labels, @names, @factor_levels)
    when String === train_model
-      VectorModel.R_train(@model_file, @features, @labels, train_model)
+      VectorModel.R_train(@model_file, @features, @labels, train_model, @names, @factor_levels)
    end
    save_models
  end
 
  def run(code)
-    VectorModel.R_run(@model_file, @features, @labels, code)
+    VectorModel.R_run(@model_file, @features, @labels, code, @names, @factor_levels)
  end
 
  def eval(element)
    case
    when Proc === @eval_model
-      @eval_model.call(@model_file, @extract_features.call(element), false)
+      @eval_model.call(@model_file, @extract_features.call(element), false, nil, @names, @factor_levels)
    when String === @eval_model
-      VectorModel.R_eval(@model_file, @extract_features.call(element), false, eval_model)
+      VectorModel.R_eval(@model_file, @extract_features.call(element), false, eval_model, @names, @factor_levels)
    end
  end
 

@@ -195,9 +247,9 @@ cat(paste(label, sep="\\n", collapse="\\n"));
 
    case
    when Proc === eval_model
-      eval_model.call(@model_file, features, true)
+      eval_model.call(@model_file, features, true, nil, @names, @factor_levels)
    when String === eval_model
-      VectorModel.R_eval(@model_file, features, true, eval_model)
+      VectorModel.R_eval(@model_file, features, true, eval_model, @names, @factor_levels)
    end
  end
 

@@ -231,14 +283,61 @@ cat(paste(label, sep="\\n", collapse="\\n"));
 
  # acc
  #end
+ #
+
+ def self.f1_metrics(test, predicted, good_label = nil)
+   tp, tn, fp, fn, pr, re, f1 = [0, 0, 0, 0, nil, nil, nil]
+
+   labels = (test + predicted).uniq
+
+   if labels.length == 2 || good_label
+     good_label = labels.uniq.select{|l| l.to_s == "true"}.first if good_label.nil?
+     good_label = labels.uniq.select{|l| l.to_s == "1"}.first if good_label.nil?
+     good_label = labels.uniq.sort.first if good_label.nil?
+
+     test.zip(predicted).each do |gs,pred|
+       gs = gs.to_s
+       pred = pred.to_s
+
+       tp += 1 if gs == pred && gs == good_label
+       tn += 1 if gs == pred && gs != good_label
+       fp += 1 if gs != good_label && pred == good_label
+       fn += 1 if gs == good_label && pred != good_label
+     end
+
+     p = tp + fn
+     pp = tp + fp
 
-
+     pr = tp.to_f / pp
+     re = tp.to_f / p
 
-
+     f1 = (2.0 * tp) / (2.0 * tp + fp + fn)
+
+     [tp, tn, fp, fn, pr, re, f1]
+   else
+     num = labels.length
+     acc = []
+     labels.each do |good_label|
+       values = VectorModel.f1_metrics(test, predicted, good_label)
+       acc << values
+     end
+     Misc.zip_fields(acc).collect{|s| Misc.mean(s)}
+   end
+ end
+
+ def cross_validation(folds = 10, good_label = nil)
 
   orig_features = @features
   orig_labels = @labels
 
+  multiclass = @labels.uniq.length > 2
+
+  if multiclass
+    res = TSV.setup({}, "Fold~P,R,F1#:type=:list")
+  else
+    res = TSV.setup({}, "Fold~TP,TN,FP,FN,P,R,F1#:type=:list")
+  end
+
   begin
    feature_folds = Misc.divide(@features, folds)
    labels_folds = Misc.divide(@labels, folds)

@@ -253,8 +352,6 @@ cat(paste(label, sep="\\n", collapse="\\n"));
    test_labels = labels_folds[fix]
    train_labels = labels_folds.values_at(*rest).flatten
 
-    tp, fp, tn, fn, pr, re, f1 = [0, 0, 0, 0, nil, nil, nil]
-
    @features = train_set
    @labels = train_labels
    self.train

@@ -262,26 +359,18 @@ cat(paste(label, sep="\\n", collapse="\\n"));
 
    raise "Number of predictions (#{predictions.length}) and test labels (#{test_labels.length}) do not match" if predictions.length != test_labels.length
 
-    test_labels.
-      gs = gs.to_i
-      pred = pred > 0.5 ? 1 : 0
-      tp += 1 if gs == pred && gs == 1
-      tn += 1 if gs == pred && gs == 0
-      fp += 1 if gs == 0 && pred == 1
-      fn += 1 if gs == 1 && pred == 0
-    end
-
-    p = tp + fn
-    pp = tp + fp
-
-    pr = tp.to_f / pp
-    re = tp.to_f / p
+    different_labels = test_labels.uniq
 
-
+    tp, tn, fp, fn, pr, re, f1 = VectorModel.f1_metrics(test_labels, predictions, good_label)
 
-
+    if multiclass
+      Log.low "Multi-class CV Fold #{fix} - Average P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1}"
+      res[fix] = [pr,re,f1]
+    else
+      Log.low "CV Fold #{fix} P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
+      res[fix] = [tp,tn,fp,fn,pr,re,f1]
+    end
 
-    res[fix] = [tp,tn,fp,fn,pr,re,f1]
   end
  ensure
   @features = orig_features
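The new VectorModel.f1_metrics helper used by cross_validation returns [tp, tn, fp, fn, precision, recall, F1] for two classes, and averages P, R and F1 over the per-class values otherwise. A small worked example with made-up label vectors:

test      = %w(1 1 0 0 1)
predicted = %w(1 0 0 0 1)

tp, tn, fp, fn, pr, re, f1 = VectorModel.f1_metrics(test, predicted)
# tp = 2, tn = 2, fp = 0, fn = 1
# precision = tp / (tp + fp) = 1.0
# recall    = tp / (tp + fn) = 2.0 / 3
# F1        = 2 * tp / (2 * tp + fp + fn) = 0.8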

data/share/spaCy/cpu/textcat_accuracy.conf
ADDED

@@ -0,0 +1,86 @@
+# This is an auto-generated partial config. To use it with 'spacy train'
+# you can run spacy init fill-config to auto-fill all default settings:
+# python -m spacy init fill-config ./base_config.cfg ./config.cfg
+[paths]
+train = null
+dev = null
+
+[system]
+gpu_allocator = null
+
+[nlp]
+lang = "en"
+pipeline = ["tok2vec","textcat"]
+batch_size = 1000
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v2"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v2"
+width = ${components.tok2vec.model.encode.width}
+attrs = ["ORTH", "SHAPE"]
+rows = [5000, 2500]
+include_static_vectors = true
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = 256
+depth = 8
+window_size = 1
+maxout_pieces = 3
+
+[components.textcat]
+factory = "textcat"
+
+[components.textcat.model]
+@architectures = "spacy.TextCatEnsemble.v2"
+nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = true
+ngram_size = 1
+no_output_layer = false
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 2000
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
+[training]
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[initialize]
+vectors = "en_core_web_lg"

data/share/spaCy/cpu/textcat_efficiency.conf
ADDED

@@ -0,0 +1,78 @@
+# This is an auto-generated partial config. To use it with 'spacy train'
+# you can run spacy init fill-config to auto-fill all default settings:
+# python -m spacy init fill-config ./base_config.cfg ./config.cfg
+[paths]
+train = null
+dev = null
+
+[system]
+gpu_allocator = null
+
+[nlp]
+lang = "en"
+pipeline = ["tok2vec","textcat"]
+batch_size = 1000
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v2"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v2"
+width = ${components.tok2vec.model.encode.width}
+attrs = ["ORTH", "SHAPE"]
+rows = [5000, 2500]
+include_static_vectors = false
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = 96
+depth = 4
+window_size = 1
+maxout_pieces = 3
+
+[components.textcat]
+factory = "textcat"
+
+[components.textcat.model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = true
+ngram_size = 1
+no_output_layer = false
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 2000
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
+[training]
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[initialize]
+vectors = null

data/share/spaCy/gpu/textcat_accuracy.conf
ADDED

@@ -0,0 +1,84 @@
+# This is an auto-generated partial config. To use it with 'spacy train'
+# you can run spacy init fill-config to auto-fill all default settings:
+# python -m spacy init fill-config ./base_config.cfg ./config.cfg
+[paths]
+train = null
+dev = null
+
+[system]
+gpu_allocator = "pytorch"
+
+[nlp]
+lang = "en"
+pipeline = ["transformer","textcat"]
+batch_size = 128
+
+[components]
+
+[components.transformer]
+factory = "transformer"
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "emilyalsentzer/Bio_ClinicalBERT"
+tokenizer_config = {"use_fast": true}
+
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+
+[components.textcat]
+factory = "textcat"
+
+[components.textcat.model]
+@architectures = "spacy.TextCatEnsemble.v2"
+nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = true
+ngram_size = 1
+no_output_layer = false
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 500
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 5e-5
+
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+
+[initialize]
+vectors = null

data/share/spaCy/gpu/textcat_efficiency.conf
ADDED

@@ -0,0 +1,73 @@
+# This is an auto-generated partial config. To use it with 'spacy train'
+# you can run spacy init fill-config to auto-fill all default settings:
+# python -m spacy init fill-config ./base_config.cfg ./config.cfg
+[paths]
+train = null
+dev = null
+
+[system]
+gpu_allocator = "pytorch"
+
+[nlp]
+lang = "en"
+pipeline = ["transformer","textcat"]
+batch_size = 128
+
+[components]
+
+[components.transformer]
+factory = "transformer"
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "roberta-base"
+tokenizer_config = {"use_fast": true}
+
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+
+[components.textcat]
+factory = "textcat"
+
+[components.textcat.model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = true
+ngram_size = 1
+no_output_layer = false
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 500
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 5e-5
+
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+
+[initialize]
+vectors = null
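The four files above are the spaCy textcat training configs shipped under share/spaCy. In the tests below a config is selected with a relative path such as "gpu/textcat_accuracy.conf" as the second argument to SpaCyModel.new (presumably resolved against the shared spaCy directory), and the GPU id used for training is read from Rbbt::Config key 'gpu_id' under :spacy / :spacy_train. A sketch following that pattern (directory and texts are made up):

require 'rbbt/vector/model/spaCy'

model = SpaCyModel.new(
  "/tmp/spacy_model",             # hypothetical model directory
  "cpu/textcat_efficiency.conf"   # or "gpu/textcat_accuracy.conf", etc.
)

model.add "great fit, would buy again", '1'
model.add "the fabric felt cheap and thin", '0'

model.cross_validation                # trains and evaluates across folds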

data/test/rbbt/vector/model/test_spaCy.rb
CHANGED

@@ -37,7 +37,7 @@ class TestSpaCyModel < Test::Unit::TestCase
      model.cross_validation
    end
 
-  def
+  def test_svm_spacy
 
    require 'rbbt/tsv/csv'
    url = "https://raw.githubusercontent.com/hanzhang0420/Women-Clothing-E-commerce/master/Womens%20Clothing%20E-Commerce%20Reviews.csv"

@@ -84,5 +84,38 @@ class TestSpaCyModel < Test::Unit::TestCase
    end
  end
 
+  def test_spyCy_trf
+    TmpFile.with_file() do |dir|
+      Log.severity = 0
+      FileUtils.mkdir_p dir
+
+      model = SpaCyModel.new(
+        dir,
+        "gpu/textcat_accuracy.conf"
+      )
+
+
+      require 'rbbt/tsv/csv'
+      url = "https://raw.githubusercontent.com/hanzhang0420/Women-Clothing-E-commerce/master/Womens%20Clothing%20E-Commerce%20Reviews.csv"
+      tsv = TSV.csv(Open.open(url))
+      tsv = tsv.reorder("Review Text", ["Recommended IND"]).to_single
+
+      good = tsv.select("Recommended IND" => '1')
+      bad = tsv.select("Recommended IND" => '0')
+
+      gsize = 2000
+      bsize = 500
+      good.keys[0..gsize-1].each do |text|
+        next if text.nil? || text.empty?
+        model.add text, '1'
+      end
+
+      bad.keys[0..bsize-1].each do |text|
+        model.add text, '0'
+      end
+
+      model.cross_validation
+    end
+  end
 end
|

data/test/rbbt/vector/model/test_svm.rb
CHANGED

@@ -33,10 +33,10 @@ class TestSVMModel < Test::Unit::TestCase
 
      model.train
 
-      assert model.eval("1;1;1")
-      assert model.eval("0;0;0")
+      assert model.eval("1;1;1") == "1"
+      assert model.eval("0;0;0") == "0"
 
-      assert_equal [true, false], model.eval_list(%w(1;1;1 0;0;0)).collect{|v| v
+      assert_equal [true, false], model.eval_list(%w(1;1;1 0;0;0)).collect{|v| v == "1"}
    end
  end
|

data/test/rbbt/vector/model/test_tensorflow.rb
CHANGED

@@ -4,12 +4,13 @@ require 'rbbt/vector/model/tensorflow'
 class TestTensorflowModel < Test::Unit::TestCase
 
   def test_keras
+    Log.severity = 0
    TmpFile.with_file() do |dir|
      FileUtils.mkdir_p dir
 
      model = TensorFlowModel.new(
        dir,
-        optimizer:'adam',
+        optimizer: 'adam',
        loss: 'sparse_categorical_crossentropy',
        metrics: ['accuracy']
      )

@@ -42,6 +43,7 @@ class TestTensorflowModel < Test::Unit::TestCase
 
      predictions = model.eval_list x_test.tolist()
      sum = 0
+
      predictions.zip(y_test.tolist()).each do |pred,label|
        sum += 1 if label.to_i == pred
      end

@@ -49,8 +51,6 @@ class TestTensorflowModel < Test::Unit::TestCase
      end
 
      assert sum.to_f / predictions.length > 0.7
-
-
    end
  end
 end

data/test/rbbt/vector/test_model.rb
CHANGED

@@ -211,7 +211,7 @@ cat(label, file="#{results}");
    end
  end
 
-  def
+  def test_model_save
    text =<<-EOF
 1 0;1;1
 1 1;0;1

@@ -243,9 +243,9 @@ cat(label, file="#{results}");
        R.run <<-EOF
 features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
 labels = scan("#{ feature_file }.class", what=numeric());
-features = cbind(features,
+features = cbind(features, label = labels);
 rbbt.require('e1071')
-model = svm(
+model = svm(label ~ ., data = features)
 save(model, file="#{ model_file }");
 EOF
      end

@@ -276,12 +276,18 @@ cat(label, file="#{results}");
 
      model.train
 
+      model = VectorModel.new(dir)
+      pairs = text.split(/\n/).collect do |line|
+        label, features = line.split(" ")
+        model.add features, label
+      end
+
      assert model.eval("1;1;1").to_f > 0.5
      assert model.eval("0;0;0").to_f < 0.5
    end
  end
 
-  def
+  def test_model_name
    text =<<-EOF
 1 0;1;1
 1 1;0;1

@@ -298,50 +304,31 @@ cat(label, file="#{results}");
      FileUtils.mkdir_p dir
      model = VectorModel.new(dir)
 
-      model.
-        element.split(";")
-      }
+      model.names = %w(Var1 Var2 Var3)
 
-      model.
-
-
-
-
-features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
-labels = scan("#{ feature_file }.class", what=numeric());
-features = cbind(features, class = labels);
-rbbt.require('e1071')
-model = svm(class ~ ., data = features)
-save(model, file="#{ model_file }");
-EOF
+      model.extract_features = Proc.new{|element,list|
+        if element
+          element.split(";")
+        elsif list
+          list.collect{|e| e.split(";") }
        end
      }
 
-      model.
-
-
-
-
-
+      model.train_model =<<-EOF
+rbbt.require('e1071')
+model = svm(as.factor(label) ~ Var1 + Var2, data = features)
+      EOF
+
+      model.eval_model = <<-EOF
 library(e1071)
-load(file="#{ model_file }")
 label = predict(model, features);
-
-EOF
-        ).read
-        Open.read(results)
-      end
-    end
-
-      }
+      EOF
 
      pairs = text.split(/\n/).collect do |line|
        label, features = line.split(" ")
-
+        model.add features, label
      end
 
-      model.add_list(*Misc.zip_fields(pairs))
-
      model.train
 
      assert model.eval("1;1;1").to_f > 0.5

@@ -349,23 +336,25 @@ cat(label, file="#{results}");
    end
  end
 
-  def
+  def test_model_cv
    text =<<-EOF
-
-
-
-1
-1 1;1;
-
-
-
-
+0 0;1;0;0
+0 1;0;0;0
+0 0;1;0;0
+0 1;0;0;0
+1 0;1;1;0
+1 1;0;1;0
+1 1;1;1;0
+1 0;1;1;0
+1 1;1;1;0
 EOF
 
    TmpFile.with_file() do |dir|
      FileUtils.mkdir_p dir
      model = VectorModel.new(dir)
 
+      model.names = %w(Var1 Var2 Var3 Var4)
+
      model.extract_features = Proc.new{|element,list|
        if element
          element.split(";")

@@ -374,55 +363,151 @@ cat(label, file="#{results}");
        end
      }
 
-      model.train_model
-
-
-
-        R.run <<-EOF
-features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
-labels = scan("#{ feature_file }.class", what=numeric());
-features = cbind(features, class = labels);
-rbbt.require('e1071')
-model = svm(class ~ ., data = features)
-save(model, file="#{ model_file }");
-EOF
-      end
-      }
+      model.train_model =<<-EOF
+rbbt.require('randomForest')
+model = randomForest(as.factor(label) ~ ., data = features)
+      EOF
 
-      model.eval_model =
-
-      TmpFile.with_file do |results|
-        Open.write(feature_file, features * "\t")
-        puts R.run(<<-EOF
-features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
-library(e1071)
-load(file="#{ model_file }")
+      model.eval_model = <<-EOF
+rbbt.require('randomForest')
 label = predict(model, features);
-
-
-
-
-
-
+      EOF
+
+      pairs = text.split(/\n/).collect do |line|
+        label, features = line.split(" ")
+        model.add features, label
+      end
+
+      model.train
+
+      assert_equal "0", model.eval("1;1;0;0")
+      assert_equal "1", model.eval("1;1;1;0")
+
+      Log.with_severity 1 do
+        model.cross_validation(2)
+      end
 
+    end
+  end
+
+  def test_model_mclass
+    text =<<-EOF
+0 0;1;0;0
+0 1;0;0;0
+0 0;1;0;0
+0 1;0;0;0
+1 0;1;1;0
+1 1;0;1;0
+1 1;1;1;0
+1 0;1;1;0
+1 1;1;1;0
+2 0;1;0;1
+2 1;0;0;1
+2 1;1;0;1
+2 0;1;0;1
+2 1;1;0;1
+EOF
+
+    TmpFile.with_file() do |dir|
+      FileUtils.mkdir_p dir
+      model = VectorModel.new(dir)
+
+      model.names = %w(Var1 Var2 Var3 Var4)
+
+      model.extract_features = Proc.new{|element,list|
+        if element
+          element.split(";")
+        elsif list
+          list.collect{|e| e.split(";") }
+        end
      }
 
+      model.train_model =<<-EOF
+rbbt.require('randomForest')
+model = randomForest(as.factor(label) ~ ., data = features)
+      EOF
+
+      model.eval_model = <<-EOF
+rbbt.require('randomForest')
+label = predict(model, features);
+      EOF
+
      pairs = text.split(/\n/).collect do |line|
        label, features = line.split(" ")
        model.add features, label
      end
 
      model.train
+
+      assert_equal "0", model.eval("1;1;0;0")
+      assert_equal "1", model.eval("1;1;1;0")
+      assert_equal "2", model.eval("1;1;0;1")
+
+      Log.with_severity 1 do
+        model.cross_validation(2)
+      end
 
+    end
+  end
+
+  def test_model_factor_levels
+    text =<<-EOF
+0 0;1;0;f1
+0 1;0;0;f1
+0 0;1;0;f1
+0 1;0;0;f1
+1 0;1;1;f2
+1 1;0;1;f2
+1 1;1;1;f2
+1 0;1;1;f2
+1 1;1;1;f2
+EOF
+
+    TmpFile.with_file() do |dir|
+      FileUtils.mkdir_p dir
      model = VectorModel.new(dir)
+
+      model.names = %w(Var1 Var2 Var3 Factor)
+
+      model.extract_features = Proc.new{|element,list|
+        if element
+          element.split(";")
+        elsif list
+          list.collect{|e| e.split(";") }
+        end
+      }
+
+      model.train_model =<<-EOF
+rbbt.require('randomForest')
+model = randomForest(as.factor(label) ~ ., data = features)
+      EOF
+
+      model.eval_model = <<-EOF
+rbbt.require('randomForest')
+label = predict(model, features);
+      EOF
+
      pairs = text.split(/\n/).collect do |line|
        label, features = line.split(" ")
        model.add features, label
      end
 
-
-
+      Log.with_severity 0 do
+        model.train
+        model.cross_validation(2)
+
+        assert_raise do
+          assert_equal "0", model.eval("1;1;0;f1")
+        end
+
+        model.factor_levels = {"Factor" => %w(f1 f2)}
+        model.train
+        model = VectorModel.new(dir)
+        assert_equal "1", model.eval("1;1;1;f2")
+      end
+
    end
  end
 
+
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-dm
 version: !ruby/object:Gem::Version
-  version: 1.1.
+  version: 1.1.54
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-
+date: 2021-12-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util

@@ -107,12 +107,17 @@ files:
 - lib/rbbt/statistics/rank_product.rb
 - lib/rbbt/tensorflow.rb
 - lib/rbbt/vector/model.rb
+- lib/rbbt/vector/model/random_forest.rb
 - lib/rbbt/vector/model/spaCy.rb
 - lib/rbbt/vector/model/svm.rb
 - lib/rbbt/vector/model/tensorflow.rb
 - share/R/MA.R
 - share/R/barcode.R
 - share/R/heatmap.3.R
+- share/spaCy/cpu/textcat_accuracy.conf
+- share/spaCy/cpu/textcat_efficiency.conf
+- share/spaCy/gpu/textcat_accuracy.conf
+- share/spaCy/gpu/textcat_efficiency.conf
 - test/rbbt/matrix/test_barcode.rb
 - test/rbbt/network/test_paths.rb
 - test/rbbt/statistics/test_fdr.rb