rbbt-dm 1.1.56 → 1.1.59
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/lib/rbbt/vector/model/random_forest.rb +11 -1
- data/lib/rbbt/vector/model/spaCy.rb +27 -19
- data/lib/rbbt/vector/model/svm.rb +3 -3
- data/lib/rbbt/vector/model/util.rb +12 -0
- data/lib/rbbt/vector/model.rb +29 -4
- data/share/spaCy/cpu/textcat_multilabel_accuracy.conf +86 -0
- data/share/spaCy/cpu/textcat_multilabel_efficiency.conf +78 -0
- data/share/spaCy/gpu/textcat_multilabel_accuracy.conf +84 -0
- data/share/spaCy/gpu/textcat_multilabel_efficiency.conf +73 -0
- data/test/rbbt/vector/model/test_spaCy.rb +13 -6
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1e001607266948a5221118c15d1fc95ed4266b0f8880b2fa628350d429ed3f7d
|
4
|
+
data.tar.gz: 1d56618e3039e1d99c8183aace2ae20e8cd3dafce0d574b5dbd49ce4f5a1ee14
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8d324c664257cb142ae7363de776ea7b6e367cd14c22026018c00de335bc3e35be428d00dad6d84a61c3f0874057612d1379e6839b1cea6fc312ea5d8e9a699
|
7
|
+
data.tar.gz: b2e52024a63f3105ac88ca1b471df0b69fe91237a1e3fa70185fc519e0740421c58755eb3560003c9f4e4f60b6479bf449fca7596684e3badba46e4ec242feee
|
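data/LICENSE
CHANGED
(diff hunk for this file is not present in this extract)
data/lib/rbbt/vector/model/random_forest.rb
CHANGED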
@@ -16,11 +16,21 @@ model = randomForest(as.factor(label) ~ ., data = features);
|
|
16
16
|
rbbt.require("randomForest");
|
17
17
|
pred = names(model$forest$xlevels)
|
18
18
|
for (p in pred) {
|
19
|
-
if (
|
19
|
+
if (is.factor(features[[p]])) {
|
20
20
|
features[[p]] = factor(features[[p]], levels=model$forest$xlevels[[p]])
|
21
21
|
}
|
22
22
|
}
|
23
23
|
label = predict(model, features);
|
24
24
|
EOF
|
25
25
|
end
|
26
|
+
|
27
|
+
def importance
|
28
|
+
TmpFile.with_file do |tmp|
|
29
|
+
tsv = R.run <<-EOF
|
30
|
+
load(file="#{model_file}");
|
31
|
+
rbbt.tsv.write('#{tmp}', model$importance)
|
32
|
+
EOF
|
33
|
+
TSV.open(tmp)
|
34
|
+
end
|
35
|
+
end
|
26
36
|
end
|
data/lib/rbbt/vector/model/spaCy.rb
CHANGED
@@ -4,13 +4,13 @@ require 'rbbt/nlp/spaCy'
|
|
4
4
|
class SpaCyModel < VectorModel
|
5
5
|
attr_accessor :config
|
6
6
|
|
7
|
-
def spacy(&block)
|
7
|
+
def self.spacy(&block)
|
8
8
|
RbbtPython.run "spacy" do
|
9
9
|
RbbtPython.module_eval(&block)
|
10
10
|
end
|
11
11
|
end
|
12
12
|
|
13
|
-
def initialize(dir, config, lang = 'en_core_web_md')
|
13
|
+
def initialize(dir, config, categories = %w(positive negative), lang = 'en_core_web_md')
|
14
14
|
@config = case
|
15
15
|
when Path === config
|
16
16
|
config.read
|
@@ -30,20 +30,21 @@ class SpaCyModel < VectorModel
|
|
30
30
|
@train_model = Proc.new do |file, features, labels|
|
31
31
|
texts = features
|
32
32
|
docs = []
|
33
|
+
unique_labels = labels.uniq
|
33
34
|
tmpconfig = File.join(file, 'config')
|
34
35
|
tmptrain = File.join(file, 'train.spacy')
|
35
36
|
SpaCy.config(@config, tmpconfig)
|
36
|
-
|
37
|
+
|
38
|
+
bar = bar(features.length, "Training documents into spacy format")
|
39
|
+
SpaCyModel.spacy do
|
37
40
|
nlp = SpaCy.nlp(lang)
|
38
41
|
docs = []
|
39
|
-
RbbtPython.iterate nlp.pipe(texts.zip(labels), as_tuples: true), :bar =>
|
40
|
-
|
41
|
-
|
42
|
-
doc.cats[
|
43
|
-
else
|
44
|
-
doc.cats["positive"] = 0
|
45
|
-
doc.cats["negative"] = 1
|
42
|
+
RbbtPython.iterate nlp.pipe(texts.zip(labels), as_tuples: true), :bar => bar do |doc,label|
|
43
|
+
unique_labels.each do |other_label|
|
44
|
+
next if other_label == label
|
45
|
+
doc.cats[other_label] = false
|
46
46
|
end
|
47
|
+
doc.cats[label] = true
|
47
48
|
docs << doc
|
48
49
|
end
|
49
50
|
|
@@ -51,24 +52,31 @@ class SpaCyModel < VectorModel
|
|
51
52
|
doc_bin.to_disk(tmptrain)
|
52
53
|
end
|
53
54
|
|
54
|
-
gpu = Rbbt::Config.get('gpu_id', :spacy, :spacy_train)
|
55
|
+
gpu = Rbbt::Config.get('gpu_id', :spacy, :spacy_train, :default => 0)
|
55
56
|
CMD.cmd_log(:spacy, "train #{tmpconfig} --output #{file} --paths.train #{tmptrain} --paths.dev #{tmptrain}", "--gpu-id" => gpu)
|
56
57
|
end
|
57
58
|
|
58
|
-
@eval_model = Proc.new do |file, features|
|
59
|
+
@eval_model = Proc.new do |file, features,list|
|
59
60
|
texts = features
|
61
|
+
texts = [texts] unless list
|
60
62
|
|
61
63
|
docs = []
|
62
|
-
|
64
|
+
bar = bar(features.length, "Evaluating model")
|
65
|
+
SpaCyModel.spacy do
|
63
66
|
nlp = spacy.load("#{file}/model-best")
|
64
67
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
bar.tick
|
69
|
-
cats['positive'] > cats['negative'] ? 1 : 0
|
70
|
-
end
|
68
|
+
docs = nlp.pipe(texts)
|
69
|
+
RbbtPython.collect docs, :bar => bar do |d|
|
70
|
+
d.cats.sort_by{|l,v| v.to_f || 0 }.last.first
|
71
71
|
end
|
72
|
+
#nlp.(docs).cats.collect{|cats| cats.sort_by{|l,v| v.to_f }.last.first }
|
73
|
+
#Log::ProgressBar.with_bar texts.length, :desc => "Evaluating documents" do |bar|
|
74
|
+
# texts.collect do |text|
|
75
|
+
# cats = nlp.(text).cats
|
76
|
+
# bar.tick
|
77
|
+
# cats.sort_by{|l,v| v.to_f }.last.first
|
78
|
+
# end
|
79
|
+
#end
|
72
80
|
end
|
73
81
|
end
|
74
82
|
end
|
data/lib/rbbt/vector/model/svm.rb
CHANGED
@@ -3,16 +3,16 @@ class SVMModel < VectorModel
|
|
3
3
|
def initialize(dir)
|
4
4
|
super(dir)
|
5
5
|
|
6
|
-
@extract_features
|
6
|
+
@extract_features ||= Proc.new{|element|
|
7
7
|
element
|
8
8
|
}
|
9
9
|
|
10
|
-
@train_model
|
10
|
+
@train_model ||=<<-EOF
|
11
11
|
rbbt.require('e1071');
|
12
12
|
model = svm(as.factor(label) ~ ., data = features);
|
13
13
|
EOF
|
14
14
|
|
15
|
-
@eval_model
|
15
|
+
@eval_model ||=<<-EOF
|
16
16
|
rbbt.require('e1071');
|
17
17
|
label = predict(model, features);
|
18
18
|
EOF
|
data/lib/rbbt/vector/model.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'rbbt/util/R'
|
2
|
+
require 'rbbt/vector/model/util'
|
2
3
|
|
3
4
|
class VectorModel
|
4
5
|
attr_accessor :directory, :model_file, :extract_features, :train_model, :eval_model
|
@@ -53,6 +54,13 @@ features = cbind(features, label = labels);
|
|
53
54
|
"features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
|
54
55
|
end * "\n" if factor_levels }
|
55
56
|
#{code}
|
57
|
+
# Save used factor levels
|
58
|
+
factor_levels = c()
|
59
|
+
for (c in names(features)){
|
60
|
+
if (is.factor(features[[c]]))
|
61
|
+
factor_levels[c] = paste(levels(features[[c]]), collapse="\t")
|
62
|
+
}
|
63
|
+
rbbt.tsv.write("#{model_file}.factor_levels", factor_levels, names=c('Levels'), type='flat')
|
56
64
|
save(model, file='#{model_file}')
|
57
65
|
EOF
|
58
66
|
end
|
@@ -150,6 +158,9 @@ cat(paste(label, sep="\\n", collapse="\\n"));
|
|
150
158
|
if File.exists?(@levels_file)
|
151
159
|
@factor_levels = YAML.load(Open.read(@levels_file))
|
152
160
|
end
|
161
|
+
if File.exists?(@model_file + '.factor_levels')
|
162
|
+
@factor_levels = TSV.open(@model_file + '.factor_levels')
|
163
|
+
end
|
153
164
|
else
|
154
165
|
@factor_levels = factor_levels
|
155
166
|
end
|
@@ -320,6 +331,8 @@ cat(paste(label, sep="\\n", collapse="\\n"));
|
|
320
331
|
acc = []
|
321
332
|
labels.each do |good_label|
|
322
333
|
values = VectorModel.f1_metrics(test, predicted, good_label)
|
334
|
+
tp, tn, fp, fn, pr, re, f1 = values
|
335
|
+
Log.debug "Partial CV #{good_label} - P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
|
323
336
|
acc << values
|
324
337
|
end
|
325
338
|
Misc.zip_fields(acc).collect{|s| Misc.mean(s)}
|
@@ -340,12 +353,21 @@ cat(paste(label, sep="\\n", collapse="\\n"));
|
|
340
353
|
end
|
341
354
|
|
342
355
|
begin
|
343
|
-
|
344
|
-
|
356
|
+
if folds == 1
|
357
|
+
feature_folds = [@features]
|
358
|
+
labels_folds = [@labels]
|
359
|
+
else
|
360
|
+
feature_folds = Misc.divide(@features, folds)
|
361
|
+
labels_folds = Misc.divide(@labels, folds)
|
362
|
+
end
|
345
363
|
|
346
364
|
folds.times do |fix|
|
347
365
|
|
348
|
-
|
366
|
+
if folds == 1
|
367
|
+
rest = [fix]
|
368
|
+
else
|
369
|
+
rest = (0..(folds-1)).to_a - [fix]
|
370
|
+
end
|
349
371
|
|
350
372
|
test_set = feature_folds[fix]
|
351
373
|
train_set = feature_folds.values_at(*rest).inject([]){|acc,e| acc += e; acc}
|
@@ -355,6 +377,7 @@ cat(paste(label, sep="\\n", collapse="\\n"));
|
|
355
377
|
|
356
378
|
@features = train_set
|
357
379
|
@labels = train_labels
|
380
|
+
|
358
381
|
self.train
|
359
382
|
predictions = self.eval_list test_set, false
|
360
383
|
|
@@ -362,6 +385,8 @@ cat(paste(label, sep="\\n", collapse="\\n"));
|
|
362
385
|
|
363
386
|
different_labels = test_labels.uniq
|
364
387
|
|
388
|
+
Log.debug do "Accuracy Fold #{fix}: #{(100 * test_labels.zip(predictions).select{|t,p| t == p }.length.to_f / test_labels.length).round(2)}%" end
|
389
|
+
|
365
390
|
tp, tn, fp, fn, pr, re, f1 = VectorModel.f1_metrics(test_labels, predictions, good_label)
|
366
391
|
|
367
392
|
if multiclass
|
@@ -377,7 +402,7 @@ cat(paste(label, sep="\\n", collapse="\\n"));
|
|
377
402
|
@features = orig_features
|
378
403
|
@labels = orig_labels
|
379
404
|
end
|
380
|
-
self.train
|
405
|
+
self.train unless folds == 1
|
381
406
|
res
|
382
407
|
end
|
383
408
|
end
|
data/share/spaCy/cpu/textcat_multilabel_accuracy.conf
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
# This is an auto-generated partial config. To use it with 'spacy train'
|
2
|
+
# you can run spacy init fill-config to auto-fill all default settings:
|
3
|
+
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
|
4
|
+
[paths]
|
5
|
+
train = null
|
6
|
+
dev = null
|
7
|
+
|
8
|
+
[system]
|
9
|
+
gpu_allocator = null
|
10
|
+
|
11
|
+
[nlp]
|
12
|
+
lang = "en"
|
13
|
+
pipeline = ["tok2vec","textcat_multilabel"]
|
14
|
+
batch_size = 1000
|
15
|
+
|
16
|
+
[components]
|
17
|
+
|
18
|
+
[components.tok2vec]
|
19
|
+
factory = "tok2vec"
|
20
|
+
|
21
|
+
[components.tok2vec.model]
|
22
|
+
@architectures = "spacy.Tok2Vec.v2"
|
23
|
+
|
24
|
+
[components.tok2vec.model.embed]
|
25
|
+
@architectures = "spacy.MultiHashEmbed.v2"
|
26
|
+
width = ${components.tok2vec.model.encode.width}
|
27
|
+
attrs = ["ORTH", "SHAPE"]
|
28
|
+
rows = [5000, 2500]
|
29
|
+
include_static_vectors = true
|
30
|
+
|
31
|
+
[components.tok2vec.model.encode]
|
32
|
+
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
33
|
+
width = 256
|
34
|
+
depth = 8
|
35
|
+
window_size = 1
|
36
|
+
maxout_pieces = 3
|
37
|
+
|
38
|
+
[components.textcat_multilabel]
|
39
|
+
factory = "textcat_multilabel"
|
40
|
+
|
41
|
+
[components.textcat_multilabel.model]
|
42
|
+
@architectures = "spacy.TextCatEnsemble.v2"
|
43
|
+
nO = null
|
44
|
+
|
45
|
+
[components.textcat_multilabel.model.tok2vec]
|
46
|
+
@architectures = "spacy.Tok2VecListener.v1"
|
47
|
+
width = ${components.tok2vec.model.encode.width}
|
48
|
+
|
49
|
+
[components.textcat_multilabel.model.linear_model]
|
50
|
+
@architectures = "spacy.TextCatBOW.v1"
|
51
|
+
exclusive_classes = true
|
52
|
+
ngram_size = 1
|
53
|
+
no_output_layer = false
|
54
|
+
|
55
|
+
[corpora]
|
56
|
+
|
57
|
+
[corpora.train]
|
58
|
+
@readers = "spacy.Corpus.v1"
|
59
|
+
path = ${paths.train}
|
60
|
+
max_length = 2000
|
61
|
+
|
62
|
+
[corpora.dev]
|
63
|
+
@readers = "spacy.Corpus.v1"
|
64
|
+
path = ${paths.dev}
|
65
|
+
max_length = 0
|
66
|
+
|
67
|
+
[training]
|
68
|
+
dev_corpus = "corpora.dev"
|
69
|
+
train_corpus = "corpora.train"
|
70
|
+
|
71
|
+
[training.optimizer]
|
72
|
+
@optimizers = "Adam.v1"
|
73
|
+
|
74
|
+
[training.batcher]
|
75
|
+
@batchers = "spacy.batch_by_words.v1"
|
76
|
+
discard_oversize = false
|
77
|
+
tolerance = 0.2
|
78
|
+
|
79
|
+
[training.batcher.size]
|
80
|
+
@schedules = "compounding.v1"
|
81
|
+
start = 100
|
82
|
+
stop = 1000
|
83
|
+
compound = 1.001
|
84
|
+
|
85
|
+
[initialize]
|
86
|
+
vectors = "en_core_web_lg"
|
data/share/spaCy/cpu/textcat_multilabel_efficiency.conf
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# This is an auto-generated partial config. To use it with 'spacy train'
|
2
|
+
# you can run spacy init fill-config to auto-fill all default settings:
|
3
|
+
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
|
4
|
+
[paths]
|
5
|
+
train = null
|
6
|
+
dev = null
|
7
|
+
|
8
|
+
[system]
|
9
|
+
gpu_allocator = null
|
10
|
+
|
11
|
+
[nlp]
|
12
|
+
lang = "en"
|
13
|
+
pipeline = ["tok2vec","textcat_multilabel"]
|
14
|
+
batch_size = 1000
|
15
|
+
|
16
|
+
[components]
|
17
|
+
|
18
|
+
[components.tok2vec]
|
19
|
+
factory = "tok2vec"
|
20
|
+
|
21
|
+
[components.tok2vec.model]
|
22
|
+
@architectures = "spacy.Tok2Vec.v2"
|
23
|
+
|
24
|
+
[components.tok2vec.model.embed]
|
25
|
+
@architectures = "spacy.MultiHashEmbed.v2"
|
26
|
+
width = ${components.tok2vec.model.encode.width}
|
27
|
+
attrs = ["ORTH", "SHAPE"]
|
28
|
+
rows = [5000, 2500]
|
29
|
+
include_static_vectors = false
|
30
|
+
|
31
|
+
[components.tok2vec.model.encode]
|
32
|
+
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
33
|
+
width = 96
|
34
|
+
depth = 4
|
35
|
+
window_size = 1
|
36
|
+
maxout_pieces = 3
|
37
|
+
|
38
|
+
[components.textcat_multilabel]
|
39
|
+
factory = "textcat_multilabel"
|
40
|
+
|
41
|
+
[components.textcat_multilabel.model]
|
42
|
+
@architectures = "spacy.TextCatBOW.v1"
|
43
|
+
exclusive_classes = true
|
44
|
+
ngram_size = 1
|
45
|
+
no_output_layer = false
|
46
|
+
|
47
|
+
[corpora]
|
48
|
+
|
49
|
+
[corpora.train]
|
50
|
+
@readers = "spacy.Corpus.v1"
|
51
|
+
path = ${paths.train}
|
52
|
+
max_length = 2000
|
53
|
+
|
54
|
+
[corpora.dev]
|
55
|
+
@readers = "spacy.Corpus.v1"
|
56
|
+
path = ${paths.dev}
|
57
|
+
max_length = 0
|
58
|
+
|
59
|
+
[training]
|
60
|
+
dev_corpus = "corpora.dev"
|
61
|
+
train_corpus = "corpora.train"
|
62
|
+
|
63
|
+
[training.optimizer]
|
64
|
+
@optimizers = "Adam.v1"
|
65
|
+
|
66
|
+
[training.batcher]
|
67
|
+
@batchers = "spacy.batch_by_words.v1"
|
68
|
+
discard_oversize = false
|
69
|
+
tolerance = 0.2
|
70
|
+
|
71
|
+
[training.batcher.size]
|
72
|
+
@schedules = "compounding.v1"
|
73
|
+
start = 100
|
74
|
+
stop = 1000
|
75
|
+
compound = 1.001
|
76
|
+
|
77
|
+
[initialize]
|
78
|
+
vectors = null
|
data/share/spaCy/gpu/textcat_multilabel_accuracy.conf
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
# This is an auto-generated partial config. To use it with 'spacy train'
|
2
|
+
# you can run spacy init fill-config to auto-fill all default settings:
|
3
|
+
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
|
4
|
+
[paths]
|
5
|
+
train = null
|
6
|
+
dev = null
|
7
|
+
|
8
|
+
[system]
|
9
|
+
gpu_allocator = "pytorch"
|
10
|
+
|
11
|
+
[nlp]
|
12
|
+
lang = "en"
|
13
|
+
pipeline = ["transformer","textcat_multilabel"]
|
14
|
+
batch_size = 128
|
15
|
+
|
16
|
+
[components]
|
17
|
+
|
18
|
+
[components.transformer]
|
19
|
+
factory = "transformer"
|
20
|
+
|
21
|
+
[components.transformer.model]
|
22
|
+
@architectures = "spacy-transformers.TransformerModel.v1"
|
23
|
+
name = "emilyalsentzer/Bio_ClinicalBERT"
|
24
|
+
tokenizer_config = {"use_fast": true}
|
25
|
+
|
26
|
+
[components.transformer.model.get_spans]
|
27
|
+
@span_getters = "spacy-transformers.strided_spans.v1"
|
28
|
+
window = 128
|
29
|
+
stride = 96
|
30
|
+
|
31
|
+
[components.textcat_multilabel]
|
32
|
+
factory = "textcat_multilabel"
|
33
|
+
|
34
|
+
[components.textcat_multilabel.model]
|
35
|
+
@architectures = "spacy.TextCatEnsemble.v2"
|
36
|
+
nO = null
|
37
|
+
|
38
|
+
[components.textcat_multilabel.model.tok2vec]
|
39
|
+
@architectures = "spacy-transformers.TransformerListener.v1"
|
40
|
+
grad_factor = 1.0
|
41
|
+
|
42
|
+
[components.textcat_multilabel.model.tok2vec.pooling]
|
43
|
+
@layers = "reduce_mean.v1"
|
44
|
+
|
45
|
+
[components.textcat_multilabel.model.linear_model]
|
46
|
+
@architectures = "spacy.TextCatBOW.v1"
|
47
|
+
exclusive_classes = true
|
48
|
+
ngram_size = 1
|
49
|
+
no_output_layer = false
|
50
|
+
|
51
|
+
[corpora]
|
52
|
+
|
53
|
+
[corpora.train]
|
54
|
+
@readers = "spacy.Corpus.v1"
|
55
|
+
path = ${paths.train}
|
56
|
+
max_length = 500
|
57
|
+
|
58
|
+
[corpora.dev]
|
59
|
+
@readers = "spacy.Corpus.v1"
|
60
|
+
path = ${paths.dev}
|
61
|
+
max_length = 0
|
62
|
+
|
63
|
+
[training]
|
64
|
+
accumulate_gradient = 3
|
65
|
+
dev_corpus = "corpora.dev"
|
66
|
+
train_corpus = "corpora.train"
|
67
|
+
|
68
|
+
[training.optimizer]
|
69
|
+
@optimizers = "Adam.v1"
|
70
|
+
|
71
|
+
[training.optimizer.learn_rate]
|
72
|
+
@schedules = "warmup_linear.v1"
|
73
|
+
warmup_steps = 250
|
74
|
+
total_steps = 20000
|
75
|
+
initial_rate = 5e-5
|
76
|
+
|
77
|
+
[training.batcher]
|
78
|
+
@batchers = "spacy.batch_by_padded.v1"
|
79
|
+
discard_oversize = true
|
80
|
+
size = 2000
|
81
|
+
buffer = 256
|
82
|
+
|
83
|
+
[initialize]
|
84
|
+
vectors = null
|
data/share/spaCy/gpu/textcat_multilabel_efficiency.conf
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
# This is an auto-generated partial config. To use it with 'spacy train'
|
2
|
+
# you can run spacy init fill-config to auto-fill all default settings:
|
3
|
+
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
|
4
|
+
[paths]
|
5
|
+
train = null
|
6
|
+
dev = null
|
7
|
+
|
8
|
+
[system]
|
9
|
+
gpu_allocator = "pytorch"
|
10
|
+
|
11
|
+
[nlp]
|
12
|
+
lang = "en"
|
13
|
+
pipeline = ["transformer","textcat_multilabel"]
|
14
|
+
batch_size = 128
|
15
|
+
|
16
|
+
[components]
|
17
|
+
|
18
|
+
[components.transformer]
|
19
|
+
factory = "transformer"
|
20
|
+
|
21
|
+
[components.transformer.model]
|
22
|
+
@architectures = "spacy-transformers.TransformerModel.v1"
|
23
|
+
name = "roberta-base"
|
24
|
+
tokenizer_config = {"use_fast": true}
|
25
|
+
|
26
|
+
[components.transformer.model.get_spans]
|
27
|
+
@span_getters = "spacy-transformers.strided_spans.v1"
|
28
|
+
window = 128
|
29
|
+
stride = 96
|
30
|
+
|
31
|
+
[components.textcat_multilabel]
|
32
|
+
factory = "textcat_multilabel"
|
33
|
+
|
34
|
+
[components.textcat_multilabel.model]
|
35
|
+
@architectures = "spacy.TextCatBOW.v1"
|
36
|
+
exclusive_classes = true
|
37
|
+
ngram_size = 1
|
38
|
+
no_output_layer = false
|
39
|
+
|
40
|
+
[corpora]
|
41
|
+
|
42
|
+
[corpora.train]
|
43
|
+
@readers = "spacy.Corpus.v1"
|
44
|
+
path = ${paths.train}
|
45
|
+
max_length = 500
|
46
|
+
|
47
|
+
[corpora.dev]
|
48
|
+
@readers = "spacy.Corpus.v1"
|
49
|
+
path = ${paths.dev}
|
50
|
+
max_length = 0
|
51
|
+
|
52
|
+
[training]
|
53
|
+
accumulate_gradient = 3
|
54
|
+
dev_corpus = "corpora.dev"
|
55
|
+
train_corpus = "corpora.train"
|
56
|
+
|
57
|
+
[training.optimizer]
|
58
|
+
@optimizers = "Adam.v1"
|
59
|
+
|
60
|
+
[training.optimizer.learn_rate]
|
61
|
+
@schedules = "warmup_linear.v1"
|
62
|
+
warmup_steps = 250
|
63
|
+
total_steps = 20000
|
64
|
+
initial_rate = 5e-5
|
65
|
+
|
66
|
+
[training.batcher]
|
67
|
+
@batchers = "spacy.batch_by_padded.v1"
|
68
|
+
discard_oversize = true
|
69
|
+
size = 2000
|
70
|
+
buffer = 256
|
71
|
+
|
72
|
+
[initialize]
|
73
|
+
vectors = null
|
data/test/rbbt/vector/model/test_spaCy.rb
CHANGED
@@ -23,18 +23,23 @@ class TestSpaCyModel < Test::Unit::TestCase
|
|
23
23
|
good = tsv.select("Recommended IND" => '1')
|
24
24
|
bad = tsv.select("Recommended IND" => '0')
|
25
25
|
|
26
|
-
gsize =
|
27
|
-
bsize =
|
26
|
+
gsize = 200
|
27
|
+
bsize = 50
|
28
28
|
good.keys[0..gsize-1].each do |text|
|
29
29
|
next if text.nil? || text.empty?
|
30
|
-
model.add text, '
|
30
|
+
model.add text, 'good'
|
31
31
|
end
|
32
32
|
|
33
33
|
bad.keys[0..bsize-1].each do |text|
|
34
|
-
model.add text, '
|
34
|
+
model.add text, 'bad'
|
35
35
|
end
|
36
36
|
|
37
|
-
model.cross_validation
|
37
|
+
model.cross_validation 1
|
38
|
+
|
39
|
+
model = VectorModel.new dir
|
40
|
+
|
41
|
+
assert Misc.counts(model.eval_list(good.keys[0..50]))['good'] > 40
|
42
|
+
assert Misc.counts(model.eval_list(bad.keys[0..50]))['bad'] > 40
|
38
43
|
end
|
39
44
|
|
40
45
|
def test_svm_spacy
|
@@ -91,14 +96,16 @@ class TestSpaCyModel < Test::Unit::TestCase
|
|
91
96
|
|
92
97
|
model = SpaCyModel.new(
|
93
98
|
dir,
|
94
|
-
"
|
99
|
+
"cpu/textcat_efficiency.conf"
|
95
100
|
)
|
96
101
|
|
97
102
|
|
103
|
+
Rbbt::Config.set 'gpu_id', nil, :spacy
|
98
104
|
require 'rbbt/tsv/csv'
|
99
105
|
url = "https://raw.githubusercontent.com/hanzhang0420/Women-Clothing-E-commerce/master/Womens%20Clothing%20E-Commerce%20Reviews.csv"
|
100
106
|
tsv = TSV.csv(Open.open(url))
|
101
107
|
tsv = tsv.reorder("Review Text", ["Recommended IND"]).to_single
|
108
|
+
tsv = tsv.subset(tsv.keys.sample(100))
|
102
109
|
|
103
110
|
good = tsv.select("Recommended IND" => '1')
|
104
111
|
bad = tsv.select("Recommended IND" => '0')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-dm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.59
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-07-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -111,13 +111,18 @@ files:
|
|
111
111
|
- lib/rbbt/vector/model/spaCy.rb
|
112
112
|
- lib/rbbt/vector/model/svm.rb
|
113
113
|
- lib/rbbt/vector/model/tensorflow.rb
|
114
|
+
- lib/rbbt/vector/model/util.rb
|
114
115
|
- share/R/MA.R
|
115
116
|
- share/R/barcode.R
|
116
117
|
- share/R/heatmap.3.R
|
117
118
|
- share/spaCy/cpu/textcat_accuracy.conf
|
118
119
|
- share/spaCy/cpu/textcat_efficiency.conf
|
120
|
+
- share/spaCy/cpu/textcat_multilabel_accuracy.conf
|
121
|
+
- share/spaCy/cpu/textcat_multilabel_efficiency.conf
|
119
122
|
- share/spaCy/gpu/textcat_accuracy.conf
|
120
123
|
- share/spaCy/gpu/textcat_efficiency.conf
|
124
|
+
- share/spaCy/gpu/textcat_multilabel_accuracy.conf
|
125
|
+
- share/spaCy/gpu/textcat_multilabel_efficiency.conf
|
121
126
|
- test/rbbt/matrix/test_barcode.rb
|
122
127
|
- test/rbbt/network/test_paths.rb
|
123
128
|
- test/rbbt/statistics/test_fdr.rb
|