rbbt-dm 1.2.6 → 1.2.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/matrix/barcode.rb +2 -2
- data/lib/rbbt/matrix/differential.rb +3 -3
- data/lib/rbbt/matrix/knowledge_base.rb +1 -1
- data/lib/rbbt/plots/bar.rb +1 -1
- data/lib/rbbt/stan.rb +1 -1
- data/lib/rbbt/statistics/hypergeometric.rb +2 -1
- data/lib/rbbt/vector/model/huggingface/masked_lm.rb +50 -0
- data/lib/rbbt/vector/model/huggingface.rb +57 -38
- data/lib/rbbt/vector/model/pytorch_lightning.rb +35 -0
- data/lib/rbbt/vector/model/random_forest.rb +1 -1
- data/lib/rbbt/vector/model/spaCy.rb +8 -14
- data/lib/rbbt/vector/model/tensorflow.rb +6 -5
- data/lib/rbbt/vector/model/torch.rb +37 -0
- data/lib/rbbt/vector/model/util.rb +18 -0
- data/lib/rbbt/vector/model.rb +100 -56
- data/python/rbbt_dm/__init__.py +48 -1
- data/python/rbbt_dm/atcold/__init__.py +0 -0
- data/python/rbbt_dm/atcold/plot_lib.py +141 -0
- data/python/rbbt_dm/atcold/spiral.py +27 -0
- data/python/rbbt_dm/huggingface.py +57 -26
- data/python/rbbt_dm/language_model.py +70 -0
- data/python/rbbt_dm/util.py +30 -0
- data/share/spaCy/gpu/textcat_accuracy.conf +2 -1
- data/test/rbbt/vector/model/huggingface/test_masked_lm.rb +41 -0
- data/test/rbbt/vector/model/test_huggingface.rb +258 -27
- data/test/rbbt/vector/model/test_pytorch_lightning.rb +83 -0
- data/test/rbbt/vector/model/test_spaCy.rb +1 -1
- data/test/rbbt/vector/model/test_tensorflow.rb +3 -0
- data/test/rbbt/vector/test_model.rb +25 -26
- data/test/test_helper.rb +13 -0
- metadata +26 -16
- data/lib/rbbt/tensorflow.rb +0 -43
- data/lib/rbbt/vector/model/huggingface.old.rb +0 -160
@@ -3,7 +3,7 @@ require 'rbbt/vector/model/huggingface'
|
|
3
3
|
|
4
4
|
class TestHuggingface < Test::Unit::TestCase
|
5
5
|
|
6
|
-
def
|
6
|
+
def _test_options
|
7
7
|
TmpFile.with_file do |dir|
|
8
8
|
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
|
9
9
|
task = "SequenceClassification"
|
@@ -11,20 +11,20 @@ class TestHuggingface < Test::Unit::TestCase
|
|
11
11
|
model = HuggingfaceModel.new task, checkpoint, dir, :class_labels => %w(bad good)
|
12
12
|
iii model.eval "This is dog"
|
13
13
|
iii model.eval "This is cat"
|
14
|
-
iii model.
|
14
|
+
iii model.eval_list(["This is dog", "This is cat"])
|
15
15
|
|
16
16
|
model = VectorModel.new dir
|
17
|
-
iii model.
|
17
|
+
iii model.eval_list(["This is dog", "This is cat"])
|
18
18
|
end
|
19
19
|
end
|
20
20
|
|
21
|
-
def
|
21
|
+
def _test_pipeline
|
22
22
|
require 'rbbt/util/python'
|
23
23
|
model = VectorModel.new
|
24
24
|
model.post_process do |elements|
|
25
25
|
elements.collect{|e| e['label'] }
|
26
26
|
end
|
27
|
-
model.eval_model do |
|
27
|
+
model.eval_model do |elements|
|
28
28
|
RbbtPython.run :transformers do
|
29
29
|
classifier ||= transformers.pipeline("sentiment-analysis")
|
30
30
|
classifier.call(elements)
|
@@ -33,21 +33,53 @@ class TestHuggingface < Test::Unit::TestCase
|
|
33
33
|
|
34
34
|
assert_equal ["POSITIVE"], model.eval("I've been waiting for a HuggingFace course my whole life.")
|
35
35
|
end
|
36
|
+
|
37
|
+
def _test_tokenizer_size
|
38
|
+
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
|
39
|
+
tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_tokenizer,
|
40
|
+
"MaskedLM", checkpoint, :max_length => 5, :model_max_length => 5)
|
41
|
+
assert_equal 5, tokenizer.call("This is a sentence that has several words", truncation: true, max_length: 5)["input_ids"].__len__
|
42
|
+
assert_equal 5, tokenizer.call("This is a sentence that has several words", truncation: true)["input_ids"].__len__
|
43
|
+
end
|
36
44
|
|
37
|
-
def
|
45
|
+
def _test_sst_eval
|
38
46
|
TmpFile.with_file do |dir|
|
39
47
|
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
|
40
48
|
|
41
|
-
model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
|
49
|
+
model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, :tokenizer_args => {:max_length => 16}
|
42
50
|
|
43
51
|
model.model_options[:class_labels] = ["Bad", "Good"]
|
44
52
|
|
45
|
-
assert_equal
|
53
|
+
assert_equal "Bad", model.eval("This is dog")
|
54
|
+
assert_equal ["Bad", "Good"], model.eval_list(["This is dog", "This is cat"])
|
46
55
|
end
|
47
56
|
end
|
48
57
|
|
49
58
|
|
50
59
|
def test_sst_train
|
60
|
+
TmpFile.with_file do |dir|
|
61
|
+
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
|
62
|
+
|
63
|
+
model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, max_length: 128
|
64
|
+
|
65
|
+
model.model_options[:class_labels] = %w(Bad Good)
|
66
|
+
|
67
|
+
assert_equal ["Bad", "Good"], model.eval_list(["This is dog", "This is cat"])
|
68
|
+
|
69
|
+
100.times do
|
70
|
+
model.add "Dog is good", "Good"
|
71
|
+
end
|
72
|
+
|
73
|
+
model.train
|
74
|
+
|
75
|
+
assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
|
76
|
+
|
77
|
+
model = VectorModel.new dir
|
78
|
+
assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
def _test_sst_train_with_labels
|
51
83
|
TmpFile.with_file do |dir|
|
52
84
|
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
|
53
85
|
|
@@ -55,28 +87,29 @@ class TestHuggingface < Test::Unit::TestCase
|
|
55
87
|
|
56
88
|
model.model_options[:class_labels] = %w(Bad Good)
|
57
89
|
|
58
|
-
assert_equal ["Bad", "Good"], model.
|
90
|
+
assert_equal ["Bad", "Good"], model.eval_list(["This is dog", "This is cat"])
|
59
91
|
|
60
92
|
100.times do
|
61
|
-
model.add "Dog is good",
|
93
|
+
model.add "Dog is good", "Good"
|
62
94
|
end
|
63
95
|
|
64
96
|
model.train
|
65
97
|
|
66
|
-
assert_equal ["Good", "Good"], model.
|
98
|
+
assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
|
67
99
|
|
68
100
|
model = VectorModel.new dir
|
69
|
-
assert_equal ["Good", "Good"], model.
|
101
|
+
assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
|
70
102
|
end
|
71
103
|
end
|
72
104
|
|
73
|
-
|
105
|
+
|
106
|
+
def _test_sst_train_no_save
|
74
107
|
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
|
75
108
|
|
76
109
|
model = HuggingfaceModel.new "SequenceClassification", checkpoint
|
77
110
|
model.model_options[:class_labels] = ["Bad", "Good"]
|
78
111
|
|
79
|
-
assert_equal ["Bad", "Good"], model.
|
112
|
+
assert_equal ["Bad", "Good"], model.eval_list(["This is dog", "This is cat"])
|
80
113
|
|
81
114
|
100.times do
|
82
115
|
model.add "Dog is good", 1
|
@@ -84,48 +117,50 @@ class TestHuggingface < Test::Unit::TestCase
|
|
84
117
|
|
85
118
|
model.train
|
86
119
|
|
87
|
-
assert_equal ["Good", "Good"], model.
|
120
|
+
assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
|
88
121
|
end
|
89
122
|
|
90
|
-
def
|
123
|
+
def _test_sst_train_save_and_load
|
91
124
|
TmpFile.with_file do |dir|
|
92
125
|
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
|
93
126
|
|
94
127
|
model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
|
95
128
|
model.model_options[:class_labels] = ["Bad", "Good"]
|
96
129
|
|
97
|
-
assert_equal ["Bad", "Good"], model.
|
130
|
+
assert_equal ["Bad", "Good"], model.eval_list(["This is dog", "This is cat"])
|
98
131
|
|
99
132
|
100.times do
|
100
|
-
model.add "Dog is good",
|
133
|
+
model.add "Dog is good", "Good"
|
101
134
|
end
|
102
135
|
|
103
136
|
model.train
|
104
137
|
|
105
138
|
model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
|
106
139
|
|
107
|
-
assert_equal ["Good", "Good"], model.
|
140
|
+
assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
|
108
141
|
|
109
|
-
|
142
|
+
model_path = model.model_path
|
110
143
|
|
111
|
-
model = HuggingfaceModel.new "SequenceClassification",
|
144
|
+
model = HuggingfaceModel.new "SequenceClassification", model_path
|
112
145
|
model.model_options[:class_labels] = ["Bad", "Good"]
|
113
146
|
|
114
|
-
assert_equal ["Good", "Good"], model.
|
147
|
+
assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
|
115
148
|
|
116
149
|
model = VectorModel.new dir
|
117
150
|
|
118
|
-
assert_equal "Good", model.
|
151
|
+
assert_equal "Good", model.eval_list("This is dog")
|
119
152
|
|
120
153
|
end
|
121
154
|
end
|
122
155
|
|
123
|
-
def
|
156
|
+
def _test_sst_stress_test
|
124
157
|
TmpFile.with_file do |dir|
|
125
158
|
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
|
126
159
|
|
127
160
|
model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
|
128
161
|
|
162
|
+
assert_equal 0, model.eval("This is dog")
|
163
|
+
|
129
164
|
100.times do
|
130
165
|
model.add "Dog is good", 1
|
131
166
|
model.add "Cat is bad", 0
|
@@ -136,18 +171,214 @@ class TestHuggingface < Test::Unit::TestCase
|
|
136
171
|
end
|
137
172
|
|
138
173
|
Misc.benchmark 1000 do
|
139
|
-
model.
|
174
|
+
model.eval_list(["This is good", "This is terrible", "This is dog", "This is cat", "Very different stuff", "Dog is bad", "Cat is good"])
|
140
175
|
end
|
141
176
|
end
|
142
177
|
end
|
143
178
|
|
144
|
-
def
|
179
|
+
def _test_mask_eval
|
145
180
|
checkpoint = "bert-base-uncased"
|
146
181
|
|
147
182
|
model = HuggingfaceModel.new "MaskedLM", checkpoint
|
148
|
-
assert_equal 3, model.
|
183
|
+
assert_equal 3, model.eval_list(["Paris is the [MASK] of the France.", "The [MASK] worked very hard all the time.", "The [MASK] arrested the dangerous [MASK]."]).
|
149
184
|
reject{|v| v.empty?}.length
|
150
185
|
end
|
151
186
|
|
187
|
+
def _test_mask_eval_tokenizer
|
188
|
+
checkpoint = "bert-base-uncased"
|
189
|
+
|
190
|
+
model = HuggingfaceModel.new "MaskedLM", checkpoint
|
191
|
+
|
192
|
+
mod, tokenizer = model.init
|
193
|
+
|
194
|
+
orig = tokenizer.call("Hi [GENE]")["input_ids"]
|
195
|
+
tokenizer.add_tokens(["[GENE]"])
|
196
|
+
mod.resize_token_embeddings(tokenizer.__len__)
|
197
|
+
new = tokenizer.call("Hi [GENE]")["input_ids"]
|
198
|
+
|
199
|
+
assert orig.length > new.length
|
200
|
+
end
|
201
|
+
|
202
|
+
|
203
|
+
def _test_custom_class
|
204
|
+
TmpFile.with_file do |dir|
|
205
|
+
Open.write File.join(dir, "mypkg/__init__.py"), ""
|
206
|
+
|
207
|
+
Open.write File.join(dir, "mypkg/mymodel.py"), <<~EOF
|
208
|
+
|
209
|
+
# Esta clase es igual que la de RobertaForTokenClassification
|
210
|
+
# Importamos los métodos necesarios
|
211
|
+
import torch.nn as nn
|
212
|
+
from transformers import RobertaConfig
|
213
|
+
from transformers.modeling_outputs import TokenClassifierOutput
|
214
|
+
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel
|
215
|
+
|
216
|
+
# Creamos una clase que herede de RobertaPreTrainedModel
|
217
|
+
class RobertaForTokenClassification_NER(RobertaPreTrainedModel):
|
218
|
+
config_class = RobertaConfig
|
219
|
+
|
220
|
+
def __init__(self, config):
|
221
|
+
# Se usa para inicializar el modelo Roberta
|
222
|
+
super().__init__(config)
|
223
|
+
# Numero de etiquetas que se van a clasificar (sería el número de etiquetas del corpus*2)
|
224
|
+
# Una correspondiente a la etiqueta I y otra a la B.
|
225
|
+
self.num_labels = config.num_labels
|
226
|
+
# No incorporamos pooling layer para devolver los hidden states de cada token (no sólo el CLS)
|
227
|
+
self.roberta = RobertaModel(config, add_pooling_layer=False)
|
228
|
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
229
|
+
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
230
|
+
self.init_weights()
|
231
|
+
|
232
|
+
def forward(self, input_ids = None, attention_mask = None, token_type_ids = None, labels = None,
|
233
|
+
**kwargs):
|
234
|
+
# Obtenemos una codificación del input (los hidden states)
|
235
|
+
outputs = self.roberta(input_ids, attention_mask = attention_mask,
|
236
|
+
token_type_ids = token_type_ids, **kwargs)
|
237
|
+
|
238
|
+
# A la salida de los hidden states le aplicamos la capa de dropout
|
239
|
+
sequence_output = self.dropout(outputs[0])
|
240
|
+
# Y posteriormente la capa de clasificación.
|
241
|
+
logits = self.classifier(sequence_output)
|
242
|
+
# Si labels tiene algún valor (lo que se hará durante el proceso de entrenamiento), se calculan las Loss
|
243
|
+
# para justar los pesos en el backprop.
|
244
|
+
loss = None
|
245
|
+
if labels is not None:
|
246
|
+
loss_fct = nn.CrossEntropyLoss()
|
247
|
+
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
248
|
+
|
249
|
+
return TokenClassifierOutput(loss=loss, logits=logits,
|
250
|
+
hidden_states=outputs.hidden_states,
|
251
|
+
attentions=outputs.attentions)
|
252
|
+
EOF
|
253
|
+
|
254
|
+
RbbtPython.add_path dir
|
255
|
+
|
256
|
+
biomedical_roberta = "PlanTL-GOB-ES/bsc-bio-ehr-es-cantemist"
|
257
|
+
model = HuggingfaceModel.new "mypkg.mymodel:RobertaForTokenClassification_NER", biomedical_roberta
|
258
|
+
|
259
|
+
model.post_process do |result,is_list|
|
260
|
+
if is_list
|
261
|
+
RbbtPython.numpy2ruby result.predictions
|
262
|
+
else
|
263
|
+
result["logits"][0]
|
264
|
+
end
|
265
|
+
end
|
266
|
+
|
267
|
+
texto = "El paciente tiene un cáncer del pulmon"
|
268
|
+
assert model.eval(texto)[5][1] > 0
|
269
|
+
end
|
270
|
+
end
|
271
|
+
|
272
|
+
def _test_sst_train_word_embeddings
|
273
|
+
TmpFile.with_file do |dir|
|
274
|
+
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
|
275
|
+
|
276
|
+
model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
|
277
|
+
model.model_options[:class_labels] = %w(Bad Good)
|
278
|
+
|
279
|
+
mod, tokenizer = model.init
|
280
|
+
|
281
|
+
orig = HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings')
|
282
|
+
orig = RbbtPython.numpy2ruby(orig.cpu.detach.numpy)
|
283
|
+
|
284
|
+
100.times do
|
285
|
+
model.add "Dog is good", "Good"
|
286
|
+
end
|
287
|
+
|
288
|
+
model.train
|
289
|
+
|
290
|
+
new = HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings')
|
291
|
+
new = RbbtPython.numpy2ruby(new.cpu.detach.numpy)
|
292
|
+
|
293
|
+
diff = []
|
294
|
+
new.each_with_index do |row,i|
|
295
|
+
diff << i if row != orig[i]
|
296
|
+
end
|
297
|
+
|
298
|
+
assert diff.length > 0
|
299
|
+
end
|
300
|
+
end
|
301
|
+
|
302
|
+
def _test_sst_freeze_word_embeddings
|
303
|
+
TmpFile.with_file do |dir|
|
304
|
+
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
|
305
|
+
|
306
|
+
model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
|
307
|
+
model.model_options[:class_labels] = %w(Bad Good)
|
308
|
+
|
309
|
+
mod, tokenizer = model.init
|
310
|
+
|
311
|
+
layer = HuggingfaceModel.freeze_layer(mod, 'distilbert')
|
312
|
+
|
313
|
+
orig = HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings')
|
314
|
+
orig = RbbtPython.numpy2ruby(orig.cpu.detach.numpy)
|
315
|
+
|
316
|
+
100.times do
|
317
|
+
model.add "Dog is good", "Good"
|
318
|
+
end
|
319
|
+
|
320
|
+
model.train
|
321
|
+
|
322
|
+
new = HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings')
|
323
|
+
new = RbbtPython.numpy2ruby(new.cpu.detach.numpy)
|
324
|
+
|
325
|
+
diff = []
|
326
|
+
new.each_with_index do |row,i|
|
327
|
+
diff << i if row != orig[i]
|
328
|
+
end
|
329
|
+
|
330
|
+
assert diff.length == 0
|
331
|
+
end
|
332
|
+
end
|
333
|
+
|
334
|
+
def _test_sst_save_word_embeddings
|
335
|
+
TmpFile.with_file do |dir|
|
336
|
+
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
|
337
|
+
|
338
|
+
model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
|
339
|
+
model.model_options[:class_labels] = %w(Bad Good)
|
340
|
+
|
341
|
+
mod, tokenizer = model.init
|
342
|
+
|
343
|
+
100.times do
|
344
|
+
model.add "Dog is good", "Good"
|
345
|
+
end
|
346
|
+
|
347
|
+
model.train
|
348
|
+
|
349
|
+
orig = RbbtPython.numpy2ruby(
|
350
|
+
HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings').cpu.detach.numpy)
|
351
|
+
|
352
|
+
model = HuggingfaceModel.new "MaskedLM", checkpoint, dir
|
353
|
+
|
354
|
+
mod, tokenizer = model.init
|
355
|
+
|
356
|
+
new = RbbtPython.numpy2ruby(
|
357
|
+
HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings').cpu.detach.numpy)
|
358
|
+
|
359
|
+
|
360
|
+
diff = []
|
361
|
+
new.each_with_index do |row,i|
|
362
|
+
diff << i if row != orig[i]
|
363
|
+
end
|
364
|
+
|
365
|
+
assert diff.length == 0
|
366
|
+
|
367
|
+
model = HuggingfaceModel.new "MaskedLM", checkpoint
|
368
|
+
|
369
|
+
mod, tokenizer = model.init
|
370
|
+
|
371
|
+
new = RbbtPython.numpy2ruby(
|
372
|
+
HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings').cpu.detach.numpy)
|
373
|
+
|
374
|
+
|
375
|
+
diff = []
|
376
|
+
new.each_with_index do |row,i|
|
377
|
+
diff << i if row != orig[i]
|
378
|
+
end
|
379
|
+
|
380
|
+
assert diff.length > 0
|
381
|
+
end
|
382
|
+
end
|
152
383
|
end
|
153
384
|
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/vector/model/pytorch_lightning'
|
3
|
+
|
4
|
+
class TestPytorchLightning < Test::Unit::TestCase
|
5
|
+
def test_clustering
|
6
|
+
nsamples = 10
|
7
|
+
ngenes = 10000
|
8
|
+
samples = nsamples.times.collect{|i| "Sample-#{i}" }
|
9
|
+
data = TSV.setup({}, :key_field => "Gene", :fields => samples + ["cluster"], :type => :list, :cast => :to_f)
|
10
|
+
|
11
|
+
profiles = []
|
12
|
+
p0 = 3
|
13
|
+
p1 = 7
|
14
|
+
profiles[0] = nsamples.times.collect{ rand() + p0 }
|
15
|
+
profiles[1] = nsamples.times.collect{ rand() + p1 }
|
16
|
+
|
17
|
+
ngenes.times do |genen|
|
18
|
+
gene = "Gene-#{genen}"
|
19
|
+
cluster = genen % 2
|
20
|
+
values = profiles[cluster].collect do |m|
|
21
|
+
rand() + m
|
22
|
+
end
|
23
|
+
data[gene] = values + [cluster]
|
24
|
+
end
|
25
|
+
|
26
|
+
python = <<~EOF
|
27
|
+
import torch
|
28
|
+
from torch import nn
|
29
|
+
from torch.nn import functional as F
|
30
|
+
from torch.utils.data import DataLoader
|
31
|
+
from torch.utils.data import random_split
|
32
|
+
from torchvision.datasets import MNIST
|
33
|
+
from torchvision import transforms
|
34
|
+
import pytorch_lightning as pl
|
35
|
+
|
36
|
+
class TestPytorchLightningModel(pl.LightningModule):
|
37
|
+
def __init__(self, input_size=10, internal_dim=1):
|
38
|
+
super().__init__()
|
39
|
+
self.model = nn.Tanh()
|
40
|
+
|
41
|
+
def configure_optimizers(self):
|
42
|
+
optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
|
43
|
+
return optimizer
|
44
|
+
|
45
|
+
@torch.cuda.amp.autocast(True)
|
46
|
+
def forward(self, x):
|
47
|
+
x = x.to(self.dtype)
|
48
|
+
return self.model(x).squeeze()
|
49
|
+
|
50
|
+
@torch.cuda.amp.autocast(True)
|
51
|
+
def training_step(self, train_batch, batch_idx):
|
52
|
+
x, y = train_batch
|
53
|
+
x = x.to(self.dtype)
|
54
|
+
y = y.to(self.dtype)
|
55
|
+
y_hat = self.model(x).squeeze()
|
56
|
+
loss = F.mse_loss(y, y_hat)
|
57
|
+
self.log('train_loss', loss)
|
58
|
+
return loss
|
59
|
+
|
60
|
+
@torch.cuda.amp.custom_fwd(cast_inputs=torch.float64)
|
61
|
+
def validation_step(self, val_batch, batch_idx):
|
62
|
+
x, y = train_batch
|
63
|
+
y_hat = self.model(x)
|
64
|
+
loss = F.mse_loss(y, y_hat)
|
65
|
+
self.log('val_loss', loss)
|
66
|
+
|
67
|
+
EOF
|
68
|
+
|
69
|
+
with_python(python) do |pkg|
|
70
|
+
model = PytorchLightningModel.new pkg , "TestPytorchLightningModel", nil, model_args: {internal_dim: 1}
|
71
|
+
TmpFile.with_file(data.to_s) do |data_file|
|
72
|
+
ds = RbbtPython.call_method "rbbt_dm", :tsv, filename: data_file
|
73
|
+
model.loader = RbbtPython.class_new_obj("torch.utils.data", :DataLoader, dataset: ds, batch_size: 64)
|
74
|
+
model.trainer = RbbtPython.class_new_obj("pytorch_lightning", "Trainer", gpus: 1, max_epochs: 5, precision: 16)
|
75
|
+
end
|
76
|
+
model.train
|
77
|
+
encoding = model.eval_list(data.values.collect{|v| v[0..-2] }).detach().cpu().numpy()
|
78
|
+
iii encoding[0..10]
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
end
|
83
|
+
|
@@ -100,7 +100,7 @@ class TestSpaCyModel < Test::Unit::TestCase
|
|
100
100
|
)
|
101
101
|
|
102
102
|
|
103
|
-
Rbbt::Config.set 'gpu_id',
|
103
|
+
Rbbt::Config.set 'gpu_id', 0, :spacy
|
104
104
|
require 'rbbt/tsv/csv'
|
105
105
|
url = "https://raw.githubusercontent.com/hanzhang0420/Women-Clothing-E-commerce/master/Womens%20Clothing%20E-Commerce%20Reviews.csv"
|
106
106
|
tsv = TSV.csv(Open.open(url))
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/vector/model/tensorflow'
|
3
|
+
require 'rbbt/util/python'
|
3
4
|
|
4
5
|
class TestTensorflowModel < Test::Unit::TestCase
|
5
6
|
|
@@ -10,6 +11,7 @@ class TestTensorflowModel < Test::Unit::TestCase
|
|
10
11
|
|
11
12
|
model = TensorFlowModel.new(
|
12
13
|
dir,
|
14
|
+
jit_compile: true,
|
13
15
|
optimizer: 'adam',
|
14
16
|
loss: 'sparse_categorical_crossentropy',
|
15
17
|
metrics: ['accuracy']
|
@@ -53,5 +55,6 @@ class TestTensorflowModel < Test::Unit::TestCase
|
|
53
55
|
assert sum.to_f / predictions.length > 0.7
|
54
56
|
end
|
55
57
|
end
|
58
|
+
|
56
59
|
end
|
57
60
|
|
@@ -26,7 +26,7 @@ class TestVectorModel < Test::Unit::TestCase
|
|
26
26
|
element.split(";")
|
27
27
|
}
|
28
28
|
|
29
|
-
model.train_model = Proc.new{|
|
29
|
+
model.train_model = Proc.new{|features,labels|
|
30
30
|
TmpFile.with_file do |feature_file|
|
31
31
|
Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
|
32
32
|
Open.write(feature_file + '.class', labels * "\n")
|
@@ -36,23 +36,23 @@ labels = scan("#{ feature_file }.class", what=numeric());
|
|
36
36
|
features = cbind(features, class = labels);
|
37
37
|
rbbt.require('e1071')
|
38
38
|
model = svm(class ~ ., data = features)
|
39
|
-
save(model, file="#{
|
39
|
+
save(model, file="#{ @model_path }");
|
40
40
|
EOF
|
41
41
|
end
|
42
42
|
}
|
43
43
|
|
44
|
-
model.eval_model = Proc.new{|
|
44
|
+
model.eval_model = Proc.new{|features|
|
45
45
|
TmpFile.with_file do |feature_file|
|
46
46
|
TmpFile.with_file do |results|
|
47
47
|
Open.write(feature_file, features * "\t")
|
48
|
-
|
48
|
+
R.run <<-EOF
|
49
49
|
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
|
50
50
|
library(e1071)
|
51
|
-
load(file="#{
|
51
|
+
load(file="#{ @model_path }")
|
52
52
|
label = predict(model, features);
|
53
53
|
cat(label, file="#{results}");
|
54
54
|
EOF
|
55
|
-
|
55
|
+
|
56
56
|
Open.read(results)
|
57
57
|
end
|
58
58
|
end
|
@@ -96,7 +96,7 @@ cat(label, file="#{results}");
|
|
96
96
|
end
|
97
97
|
}
|
98
98
|
|
99
|
-
model.train_model = Proc.new{|
|
99
|
+
model.train_model = Proc.new{|features,labels|
|
100
100
|
TmpFile.with_file do |feature_file|
|
101
101
|
Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
|
102
102
|
Open.write(feature_file + '.class', labels * "\n")
|
@@ -106,23 +106,23 @@ labels = scan("#{ feature_file }.class", what=numeric());
|
|
106
106
|
features = cbind(features, class = labels);
|
107
107
|
rbbt.require('e1071')
|
108
108
|
model = svm(class ~ ., data = features)
|
109
|
-
save(model, file="#{
|
109
|
+
save(model, file="#{ @model_path }");
|
110
110
|
EOF
|
111
111
|
end
|
112
112
|
}
|
113
113
|
|
114
|
-
model.eval_model = Proc.new{|
|
114
|
+
model.eval_model = Proc.new{|features|
|
115
115
|
TmpFile.with_file do |feature_file|
|
116
116
|
TmpFile.with_file do |results|
|
117
117
|
Open.write(feature_file, features * "\t")
|
118
|
-
|
118
|
+
R.run <<-EOF
|
119
119
|
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
|
120
120
|
library(e1071)
|
121
|
-
load(file="#{
|
121
|
+
load(file="#{ @model_path }")
|
122
122
|
label = predict(model, features);
|
123
123
|
cat(label, file="#{results}");
|
124
124
|
EOF
|
125
|
-
|
125
|
+
|
126
126
|
Open.read(results)
|
127
127
|
end
|
128
128
|
end
|
@@ -164,7 +164,7 @@ cat(label, file="#{results}");
|
|
164
164
|
element.split(";")
|
165
165
|
}
|
166
166
|
|
167
|
-
model.train_model = Proc.new{|
|
167
|
+
model.train_model = Proc.new{|features,labels|
|
168
168
|
TmpFile.with_file do |feature_file|
|
169
169
|
Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
|
170
170
|
Open.write(feature_file + '.class', labels * "\n")
|
@@ -174,23 +174,23 @@ labels = scan("#{ feature_file }.class", what=numeric());
|
|
174
174
|
features = cbind(features, class = labels);
|
175
175
|
rbbt.require('e1071')
|
176
176
|
model = svm(class ~ ., data = features)
|
177
|
-
save(model, file="#{
|
177
|
+
save(model, file="#{ @model_path }");
|
178
178
|
EOF
|
179
179
|
end
|
180
180
|
}
|
181
181
|
|
182
|
-
model.eval_model = Proc.new{|
|
182
|
+
model.eval_model = Proc.new{|features|
|
183
183
|
TmpFile.with_file do |feature_file|
|
184
184
|
TmpFile.with_file do |results|
|
185
185
|
Open.write(feature_file, features * "\t")
|
186
|
-
|
186
|
+
R.run <<-EOF
|
187
187
|
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
|
188
188
|
library(e1071)
|
189
|
-
load(file="#{
|
189
|
+
load(file="#{ @model_path }")
|
190
190
|
label = predict(model, features);
|
191
191
|
cat(label, file="#{results}");
|
192
192
|
EOF
|
193
|
-
|
193
|
+
|
194
194
|
Open.read(results)
|
195
195
|
end
|
196
196
|
end
|
@@ -236,7 +236,7 @@ cat(label, file="#{results}");
|
|
236
236
|
end
|
237
237
|
}
|
238
238
|
|
239
|
-
model.train_model = Proc.new{|
|
239
|
+
model.train_model = Proc.new{|features,labels|
|
240
240
|
TmpFile.with_file do |feature_file|
|
241
241
|
Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
|
242
242
|
Open.write(feature_file + '.class', labels * "\n")
|
@@ -246,23 +246,23 @@ labels = scan("#{ feature_file }.class", what=numeric());
|
|
246
246
|
features = cbind(features, label = labels);
|
247
247
|
rbbt.require('e1071')
|
248
248
|
model = svm(label ~ ., data = features)
|
249
|
-
save(model, file="#{
|
249
|
+
save(model, file="#{ @model_path }");
|
250
250
|
EOF
|
251
251
|
end
|
252
252
|
}
|
253
253
|
|
254
|
-
model.eval_model = Proc.new{|
|
254
|
+
model.eval_model = Proc.new{|features|
|
255
255
|
TmpFile.with_file do |feature_file|
|
256
256
|
TmpFile.with_file do |results|
|
257
257
|
Open.write(feature_file, features * "\t")
|
258
|
-
|
258
|
+
R.run <<-EOF
|
259
259
|
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
|
260
260
|
library(e1071)
|
261
|
-
load(file="#{
|
261
|
+
load(file="#{ @model_path }")
|
262
262
|
label = predict(model, features);
|
263
263
|
cat(label, file="#{results}");
|
264
264
|
EOF
|
265
|
-
|
265
|
+
|
266
266
|
Open.read(results)
|
267
267
|
end
|
268
268
|
end
|
@@ -282,7 +282,6 @@ cat(label, file="#{results}");
|
|
282
282
|
model.add features, label
|
283
283
|
end
|
284
284
|
|
285
|
-
iii model.eval("1;1;1")
|
286
285
|
assert model.eval("1;1;1").to_f > 0.5
|
287
286
|
assert model.eval("0;0;0").to_f < 0.5
|
288
287
|
end
|
@@ -515,7 +514,7 @@ label = predict(model, features);
|
|
515
514
|
TmpFile.with_file do |dir|
|
516
515
|
model = VectorModel.new dir
|
517
516
|
|
518
|
-
model.eval_model do |
|
517
|
+
model.eval_model do |elements|
|
519
518
|
elements = [elements] unless Array === elements
|
520
519
|
RbbtPython.binding_run do
|
521
520
|
pyimport :torch
|