rbbt-dm 1.1.48 → 1.1.53
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/network/paths.rb +1 -1
- data/lib/rbbt/tensorflow.rb +43 -0
- data/lib/rbbt/vector/model.rb +164 -65
- data/lib/rbbt/vector/model/spaCy.rb +76 -0
- data/lib/rbbt/vector/model/svm.rb +1 -1
- data/lib/rbbt/vector/model/tensorflow.rb +55 -0
- data/share/spaCy/cpu/textcat_accuracy.conf +86 -0
- data/share/spaCy/cpu/textcat_efficiency.conf +78 -0
- data/share/spaCy/gpu/textcat_accuracy.conf +84 -0
- data/share/spaCy/gpu/textcat_efficiency.conf +73 -0
- data/test/rbbt/network/test_paths.rb +1 -1
- data/test/rbbt/vector/model/test_spaCy.rb +121 -0
- data/test/rbbt/vector/model/test_tensorflow.rb +57 -0
- data/test/rbbt/vector/test_model.rb +354 -0
- metadata +15 -4
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'rbbt/vector/model'
|
2
|
+
require 'rbbt/tensorflow'
|
3
|
+
|
4
|
+
# VectorModel whose train and eval procs drive a Keras graph through
# RbbtPython.
#
# Training converts the features and labels to tensors, compiles the graph
# with +compile_options+, fits it for +epochs+ epochs and saves it to the
# model file. Evaluation reuses the in-memory graph, or loads it from the
# model file when there is none, and turns every prediction row into a label.
class TensorFlowModel < VectorModel
  attr_accessor :graph, :epochs, :compile_options

  # Runs +block+ through RbbtPython.module_eval inside
  # RbbtPython.run "tensorflow".
  #
  # Because of module_eval, +self+ inside +block+ is RbbtPython: instance
  # variables written there do NOT belong to this model. Local variables of
  # the caller are still reachable through the closure.
  def tensorflow(&block)
    RbbtPython.run "tensorflow" do
      RbbtPython.module_eval(&block)
    end
  end

  # Same as #tensorflow, but nested inside
  # RbbtPython.run "tensorflow.keras", as: 'keras' as well.
  # The same +self+ caveat applies to +block+.
  def keras(&block)
    RbbtPython.run "tensorflow.keras", as: 'keras' do
      RbbtPython.run "tensorflow" do
        RbbtPython.module_eval(&block)
      end
    end
  end

  # dir             - directory handed to VectorModel#initialize.
  # graph           - optional graph object; when nil it is obtained from
  #                   #keras_graph at training time or loaded from the model
  #                   file at evaluation time.
  # epochs          - value passed as :epochs to the graph's fit call.
  # compile_options - keyword options forwarded to the graph's compile call.
  def initialize(dir, graph = nil, epochs = 3, **compile_options)
    @graph = graph
    @epochs = epochs
    @compile_options = compile_options

    super(dir)

    @train_model = Proc.new do |file, features, labels|
      # The block reassigns the proc's own locals through the closure.
      tensorflow do
        features = tensorflow.convert_to_tensor(features)
        labels = tensorflow.convert_to_tensor(labels)
      end
      @graph ||= keras_graph
      @graph.compile(**@compile_options)
      @graph.fit(features, labels, :epochs => @epochs, :verbose => false)
      @graph.save(file)
    end

    @eval_model = Proc.new do |file, features|
      tensorflow do
        features = tensorflow.convert_to_tensor(features)
      end

      # Resolve the graph here, where self is still the model. Doing the
      # `@graph ||=` inside the keras block would set the instance variable
      # on RbbtPython instead, ignoring this model's graph and sharing one
      # cached graph between every model in the process.
      @graph ||= keras { keras.models.load_model(file) }
      graph = @graph

      keras do
        predictions = graph.predict(features, :verbose => false).tolist()
        # Several scores per row: the label is the index of the highest one.
        # A single score per row: that value is returned unchanged.
        predictions.collect { |row| row.length > 1 ? row.index(row.max) : row.first }
      end
    end
  end

  # Evaluates +block+ through #keras and stores its result as the graph.
  # NOTE(review): without a block, nil reaches RbbtPython.module_eval, which
  # is expected to raise ArgumentError -- confirm callers always build or
  # pass a graph before training.
  def keras_graph(&block)
    @graph = keras(&block)
  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# This is an auto-generated partial config. To use it with 'spacy train'
|
2
|
+
# you can run spacy init fill-config to auto-fill all default settings:
|
3
|
+
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
|
4
|
+
[paths]
|
5
|
+
train = null
|
6
|
+
dev = null
|
7
|
+
|
8
|
+
[system]
|
9
|
+
gpu_allocator = null
|
10
|
+
|
11
|
+
[nlp]
|
12
|
+
lang = "en"
|
13
|
+
pipeline = ["tok2vec","textcat"]
|
14
|
+
batch_size = 1000
|
15
|
+
|
16
|
+
[components]
|
17
|
+
|
18
|
+
[components.tok2vec]
|
19
|
+
factory = "tok2vec"
|
20
|
+
|
21
|
+
[components.tok2vec.model]
|
22
|
+
@architectures = "spacy.Tok2Vec.v2"
|
23
|
+
|
24
|
+
[components.tok2vec.model.embed]
|
25
|
+
@architectures = "spacy.MultiHashEmbed.v2"
|
26
|
+
width = ${components.tok2vec.model.encode.width}
|
27
|
+
attrs = ["ORTH", "SHAPE"]
|
28
|
+
rows = [5000, 2500]
|
29
|
+
include_static_vectors = true
|
30
|
+
|
31
|
+
[components.tok2vec.model.encode]
|
32
|
+
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
33
|
+
width = 256
|
34
|
+
depth = 8
|
35
|
+
window_size = 1
|
36
|
+
maxout_pieces = 3
|
37
|
+
|
38
|
+
[components.textcat]
|
39
|
+
factory = "textcat"
|
40
|
+
|
41
|
+
[components.textcat.model]
|
42
|
+
@architectures = "spacy.TextCatEnsemble.v2"
|
43
|
+
nO = null
|
44
|
+
|
45
|
+
[components.textcat.model.tok2vec]
|
46
|
+
@architectures = "spacy.Tok2VecListener.v1"
|
47
|
+
width = ${components.tok2vec.model.encode.width}
|
48
|
+
|
49
|
+
[components.textcat.model.linear_model]
|
50
|
+
@architectures = "spacy.TextCatBOW.v1"
|
51
|
+
exclusive_classes = true
|
52
|
+
ngram_size = 1
|
53
|
+
no_output_layer = false
|
54
|
+
|
55
|
+
[corpora]
|
56
|
+
|
57
|
+
[corpora.train]
|
58
|
+
@readers = "spacy.Corpus.v1"
|
59
|
+
path = ${paths.train}
|
60
|
+
max_length = 2000
|
61
|
+
|
62
|
+
[corpora.dev]
|
63
|
+
@readers = "spacy.Corpus.v1"
|
64
|
+
path = ${paths.dev}
|
65
|
+
max_length = 0
|
66
|
+
|
67
|
+
[training]
|
68
|
+
dev_corpus = "corpora.dev"
|
69
|
+
train_corpus = "corpora.train"
|
70
|
+
|
71
|
+
[training.optimizer]
|
72
|
+
@optimizers = "Adam.v1"
|
73
|
+
|
74
|
+
[training.batcher]
|
75
|
+
@batchers = "spacy.batch_by_words.v1"
|
76
|
+
discard_oversize = false
|
77
|
+
tolerance = 0.2
|
78
|
+
|
79
|
+
[training.batcher.size]
|
80
|
+
@schedules = "compounding.v1"
|
81
|
+
start = 100
|
82
|
+
stop = 1000
|
83
|
+
compound = 1.001
|
84
|
+
|
85
|
+
[initialize]
|
86
|
+
vectors = "en_core_web_lg"
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# This is an auto-generated partial config. To use it with 'spacy train'
|
2
|
+
# you can run spacy init fill-config to auto-fill all default settings:
|
3
|
+
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
|
4
|
+
[paths]
|
5
|
+
train = null
|
6
|
+
dev = null
|
7
|
+
|
8
|
+
[system]
|
9
|
+
gpu_allocator = null
|
10
|
+
|
11
|
+
[nlp]
|
12
|
+
lang = "en"
|
13
|
+
pipeline = ["tok2vec","textcat"]
|
14
|
+
batch_size = 1000
|
15
|
+
|
16
|
+
[components]
|
17
|
+
|
18
|
+
[components.tok2vec]
|
19
|
+
factory = "tok2vec"
|
20
|
+
|
21
|
+
[components.tok2vec.model]
|
22
|
+
@architectures = "spacy.Tok2Vec.v2"
|
23
|
+
|
24
|
+
[components.tok2vec.model.embed]
|
25
|
+
@architectures = "spacy.MultiHashEmbed.v2"
|
26
|
+
width = ${components.tok2vec.model.encode.width}
|
27
|
+
attrs = ["ORTH", "SHAPE"]
|
28
|
+
rows = [5000, 2500]
|
29
|
+
include_static_vectors = false
|
30
|
+
|
31
|
+
[components.tok2vec.model.encode]
|
32
|
+
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
33
|
+
width = 96
|
34
|
+
depth = 4
|
35
|
+
window_size = 1
|
36
|
+
maxout_pieces = 3
|
37
|
+
|
38
|
+
[components.textcat]
|
39
|
+
factory = "textcat"
|
40
|
+
|
41
|
+
[components.textcat.model]
|
42
|
+
@architectures = "spacy.TextCatBOW.v1"
|
43
|
+
exclusive_classes = true
|
44
|
+
ngram_size = 1
|
45
|
+
no_output_layer = false
|
46
|
+
|
47
|
+
[corpora]
|
48
|
+
|
49
|
+
[corpora.train]
|
50
|
+
@readers = "spacy.Corpus.v1"
|
51
|
+
path = ${paths.train}
|
52
|
+
max_length = 2000
|
53
|
+
|
54
|
+
[corpora.dev]
|
55
|
+
@readers = "spacy.Corpus.v1"
|
56
|
+
path = ${paths.dev}
|
57
|
+
max_length = 0
|
58
|
+
|
59
|
+
[training]
|
60
|
+
dev_corpus = "corpora.dev"
|
61
|
+
train_corpus = "corpora.train"
|
62
|
+
|
63
|
+
[training.optimizer]
|
64
|
+
@optimizers = "Adam.v1"
|
65
|
+
|
66
|
+
[training.batcher]
|
67
|
+
@batchers = "spacy.batch_by_words.v1"
|
68
|
+
discard_oversize = false
|
69
|
+
tolerance = 0.2
|
70
|
+
|
71
|
+
[training.batcher.size]
|
72
|
+
@schedules = "compounding.v1"
|
73
|
+
start = 100
|
74
|
+
stop = 1000
|
75
|
+
compound = 1.001
|
76
|
+
|
77
|
+
[initialize]
|
78
|
+
vectors = null
|
@@ -0,0 +1,84 @@
|
|
1
|
+
# This is an auto-generated partial config. To use it with 'spacy train'
|
2
|
+
# you can run spacy init fill-config to auto-fill all default settings:
|
3
|
+
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
|
4
|
+
[paths]
|
5
|
+
train = null
|
6
|
+
dev = null
|
7
|
+
|
8
|
+
[system]
|
9
|
+
gpu_allocator = "pytorch"
|
10
|
+
|
11
|
+
[nlp]
|
12
|
+
lang = "en"
|
13
|
+
pipeline = ["transformer","textcat"]
|
14
|
+
batch_size = 128
|
15
|
+
|
16
|
+
[components]
|
17
|
+
|
18
|
+
[components.transformer]
|
19
|
+
factory = "transformer"
|
20
|
+
|
21
|
+
[components.transformer.model]
|
22
|
+
@architectures = "spacy-transformers.TransformerModel.v1"
|
23
|
+
name = "emilyalsentzer/Bio_ClinicalBERT"
|
24
|
+
tokenizer_config = {"use_fast": true}
|
25
|
+
|
26
|
+
[components.transformer.model.get_spans]
|
27
|
+
@span_getters = "spacy-transformers.strided_spans.v1"
|
28
|
+
window = 128
|
29
|
+
stride = 96
|
30
|
+
|
31
|
+
[components.textcat]
|
32
|
+
factory = "textcat"
|
33
|
+
|
34
|
+
[components.textcat.model]
|
35
|
+
@architectures = "spacy.TextCatEnsemble.v2"
|
36
|
+
nO = null
|
37
|
+
|
38
|
+
[components.textcat.model.tok2vec]
|
39
|
+
@architectures = "spacy-transformers.TransformerListener.v1"
|
40
|
+
grad_factor = 1.0
|
41
|
+
|
42
|
+
[components.textcat.model.tok2vec.pooling]
|
43
|
+
@layers = "reduce_mean.v1"
|
44
|
+
|
45
|
+
[components.textcat.model.linear_model]
|
46
|
+
@architectures = "spacy.TextCatBOW.v1"
|
47
|
+
exclusive_classes = true
|
48
|
+
ngram_size = 1
|
49
|
+
no_output_layer = false
|
50
|
+
|
51
|
+
[corpora]
|
52
|
+
|
53
|
+
[corpora.train]
|
54
|
+
@readers = "spacy.Corpus.v1"
|
55
|
+
path = ${paths.train}
|
56
|
+
max_length = 500
|
57
|
+
|
58
|
+
[corpora.dev]
|
59
|
+
@readers = "spacy.Corpus.v1"
|
60
|
+
path = ${paths.dev}
|
61
|
+
max_length = 0
|
62
|
+
|
63
|
+
[training]
|
64
|
+
accumulate_gradient = 3
|
65
|
+
dev_corpus = "corpora.dev"
|
66
|
+
train_corpus = "corpora.train"
|
67
|
+
|
68
|
+
[training.optimizer]
|
69
|
+
@optimizers = "Adam.v1"
|
70
|
+
|
71
|
+
[training.optimizer.learn_rate]
|
72
|
+
@schedules = "warmup_linear.v1"
|
73
|
+
warmup_steps = 250
|
74
|
+
total_steps = 20000
|
75
|
+
initial_rate = 5e-5
|
76
|
+
|
77
|
+
[training.batcher]
|
78
|
+
@batchers = "spacy.batch_by_padded.v1"
|
79
|
+
discard_oversize = true
|
80
|
+
size = 2000
|
81
|
+
buffer = 256
|
82
|
+
|
83
|
+
[initialize]
|
84
|
+
vectors = null
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# This is an auto-generated partial config. To use it with 'spacy train'
|
2
|
+
# you can run spacy init fill-config to auto-fill all default settings:
|
3
|
+
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
|
4
|
+
[paths]
|
5
|
+
train = null
|
6
|
+
dev = null
|
7
|
+
|
8
|
+
[system]
|
9
|
+
gpu_allocator = "pytorch"
|
10
|
+
|
11
|
+
[nlp]
|
12
|
+
lang = "en"
|
13
|
+
pipeline = ["transformer","textcat"]
|
14
|
+
batch_size = 128
|
15
|
+
|
16
|
+
[components]
|
17
|
+
|
18
|
+
[components.transformer]
|
19
|
+
factory = "transformer"
|
20
|
+
|
21
|
+
[components.transformer.model]
|
22
|
+
@architectures = "spacy-transformers.TransformerModel.v1"
|
23
|
+
name = "roberta-base"
|
24
|
+
tokenizer_config = {"use_fast": true}
|
25
|
+
|
26
|
+
[components.transformer.model.get_spans]
|
27
|
+
@span_getters = "spacy-transformers.strided_spans.v1"
|
28
|
+
window = 128
|
29
|
+
stride = 96
|
30
|
+
|
31
|
+
[components.textcat]
|
32
|
+
factory = "textcat"
|
33
|
+
|
34
|
+
[components.textcat.model]
|
35
|
+
@architectures = "spacy.TextCatBOW.v1"
|
36
|
+
exclusive_classes = true
|
37
|
+
ngram_size = 1
|
38
|
+
no_output_layer = false
|
39
|
+
|
40
|
+
[corpora]
|
41
|
+
|
42
|
+
[corpora.train]
|
43
|
+
@readers = "spacy.Corpus.v1"
|
44
|
+
path = ${paths.train}
|
45
|
+
max_length = 500
|
46
|
+
|
47
|
+
[corpora.dev]
|
48
|
+
@readers = "spacy.Corpus.v1"
|
49
|
+
path = ${paths.dev}
|
50
|
+
max_length = 0
|
51
|
+
|
52
|
+
[training]
|
53
|
+
accumulate_gradient = 3
|
54
|
+
dev_corpus = "corpora.dev"
|
55
|
+
train_corpus = "corpora.train"
|
56
|
+
|
57
|
+
[training.optimizer]
|
58
|
+
@optimizers = "Adam.v1"
|
59
|
+
|
60
|
+
[training.optimizer.learn_rate]
|
61
|
+
@schedules = "warmup_linear.v1"
|
62
|
+
warmup_steps = 250
|
63
|
+
total_steps = 20000
|
64
|
+
initial_rate = 5e-5
|
65
|
+
|
66
|
+
[training.batcher]
|
67
|
+
@batchers = "spacy.batch_by_padded.v1"
|
68
|
+
discard_oversize = true
|
69
|
+
size = 2000
|
70
|
+
buffer = 256
|
71
|
+
|
72
|
+
[initialize]
|
73
|
+
vectors = null
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/vector/model/spaCy'
|
3
|
+
require 'rbbt/vector/model/svm'
|
4
|
+
|
5
|
+
# Cross-validation smoke tests for SpaCyModel, and for an SVMModel fed with
# averaged spaCy token vectors, on a public clothing-review dataset.
#
# These tests download the dataset over the network and need the Python
# spaCy stack that the chosen configuration/pipeline refers to.
class TestSpaCyModel < Test::Unit::TestCase
  REVIEWS_URL = "https://raw.githubusercontent.com/hanzhang0420/Women-Clothing-E-commerce/master/Womens%20Clothing%20E-Commerce%20Reviews.csv"

  # Upper bounds on the number of recommended ('1') and not recommended ('0')
  # review texts added to each model.
  GOOD_SAMPLE_SIZE = 2000
  BAD_SAMPLE_SIZE = 500

  def test_spyCy
    cross_validate_spacy_config "cpu/textcat_efficiency.conf"
  end

  def test_svm_spacy
    # The model needs a working directory of its own; without this block
    # `dir` was an undefined name.
    TmpFile.with_file() do |dir|
      FileUtils.mkdir_p dir

      good, bad = review_samples

      model = SVMModel.new(
        dir
      )

      nlp = RbbtPython.run "spacy" do
        spacy.load('en_core_web_md')
      end

      # Feature vector of a text: element-wise mean of its token vectors.
      model.extract_features = Proc.new do |text|
        vs = RbbtPython.run do
          RbbtPython.collect nlp.(text).__iter__ do |token|
            token.vector.tolist()
          end
        end
        length = vs.length

        # Element-wise sum of all token vectors (nil when there are none).
        v = vs.inject { |acc, ev| acc.zip(ev).collect { |a, b| a + b } }

        v.collect { |e| e / length }
      end

      TSV.traverse good, :type => :array, :bar => true do |text|
        next if text.nil? || text.empty?
        model.add text, '1'
      end

      TSV.traverse bad, :type => :array, :bar => true do |text|
        next if text.nil? || text.empty?
        model.add text, '0'
      end

      model.cross_validation
    end
  end

  def test_spyCy_trf
    cross_validate_spacy_config "gpu/textcat_accuracy.conf"
  end

  private

  # Downloads the review dataset and returns [good, bad]: the first
  # GOOD_SAMPLE_SIZE review texts flagged '1' and the first BAD_SAMPLE_SIZE
  # flagged '0' in the "Recommended IND" column.
  def review_samples
    require 'rbbt/tsv/csv'

    tsv = TSV.csv(Open.open(REVIEWS_URL))
    tsv = tsv.reorder("Review Text", ["Recommended IND"]).to_single

    good = tsv.select("Recommended IND" => '1').keys.first(GOOD_SAMPLE_SIZE)
    bad = tsv.select("Recommended IND" => '0').keys.first(BAD_SAMPLE_SIZE)

    [good, bad]
  end

  # Builds a SpaCyModel from +config+ in a temporary directory, feeds it the
  # labelled review samples and runs its cross validation.
  def cross_validate_spacy_config(config)
    TmpFile.with_file() do |dir|
      Log.severity = 0
      FileUtils.mkdir_p dir

      model = SpaCyModel.new(
        dir,
        config
      )

      good, bad = review_samples

      good.each do |text|
        next if text.nil? || text.empty?
        model.add text, '1'
      end

      bad.each do |text|
        next if text.nil? || text.empty?
        model.add text, '0'
      end

      model.cross_validation
    end
  end
end
|
121
|
+
|