rbbt-dm 1.2.7 → 1.2.9
This diff compares the contents of publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/rbbt/matrix/barcode.rb +2 -2
- data/lib/rbbt/matrix/differential.rb +3 -3
- data/lib/rbbt/matrix/knowledge_base.rb +1 -1
- data/lib/rbbt/plots/bar.rb +1 -1
- data/lib/rbbt/stan.rb +1 -1
- data/lib/rbbt/statistics/hypergeometric.rb +2 -1
- data/lib/rbbt/vector/model/huggingface/masked_lm.rb +50 -0
- data/lib/rbbt/vector/model/huggingface.rb +57 -38
- data/lib/rbbt/vector/model/pytorch_lightning.rb +35 -0
- data/lib/rbbt/vector/model/random_forest.rb +1 -1
- data/lib/rbbt/vector/model/spaCy.rb +8 -6
- data/lib/rbbt/vector/model/tensorflow.rb +6 -5
- data/lib/rbbt/vector/model/torch.rb +37 -0
- data/lib/rbbt/vector/model.rb +82 -52
- data/python/rbbt_dm/__init__.py +48 -1
- data/python/rbbt_dm/atcold/__init__.py +0 -0
- data/python/rbbt_dm/atcold/plot_lib.py +141 -0
- data/python/rbbt_dm/atcold/spiral.py +27 -0
- data/python/rbbt_dm/huggingface.py +57 -26
- data/python/rbbt_dm/language_model.py +70 -0
- data/python/rbbt_dm/util.py +30 -0
- data/share/spaCy/gpu/textcat_accuracy.conf +2 -1
- data/test/rbbt/vector/model/huggingface/test_masked_lm.rb +41 -0
- data/test/rbbt/vector/model/test_huggingface.rb +258 -27
- data/test/rbbt/vector/model/test_pytorch_lightning.rb +83 -0
- data/test/rbbt/vector/model/test_spaCy.rb +1 -1
- data/test/rbbt/vector/model/test_tensorflow.rb +3 -0
- data/test/rbbt/vector/test_model.rb +25 -26
- data/test/test_helper.rb +13 -0
- metadata +26 -16
- data/lib/rbbt/tensorflow.rb +0 -43
- data/lib/rbbt/vector/model/huggingface.old.rb +0 -160
data/python/rbbt_dm/language_model.py (new file)
@@ -0,0 +1,70 @@
+def group_texts(examples):
+    # Concatenate all texts.
+    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    # We drop the small remainder; we could pad instead if the model supported
+    # it. Customize this part to your needs.
+    total_length = (total_length // block_size) * block_size
+    # Split by chunks of max_len.
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+def whole_word_masking_data_collator(features):
+    from transformers import default_data_collator
+    for feature in features:
+        word_ids = feature.pop("word_ids")
+
+        # Create a map between words and corresponding token indices
+        mapping = collections.defaultdict(list)
+        current_word_index = -1
+        current_word = None
+        for idx, word_id in enumerate(word_ids):
+            if word_id is not None:
+                if word_id != current_word:
+                    current_word = word_id
+                    current_word_index += 1
+                mapping[current_word_index].append(idx)
+
+        # Randomly mask words
+        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
+        input_ids = feature["input_ids"]
+        labels = feature["labels"]
+        new_labels = [-100] * len(labels)
+        for word_id in np.where(mask)[0]:
+            word_id = word_id.item()
+            for idx in mapping[word_id]:
+                new_labels[idx] = labels[idx]
+                input_ids[idx] = tokenizer.mask_token_id
+        feature["labels"] = new_labels
+
+    return default_data_collator(features)
+
+if __name__ == "__main__2":
+
+    from transformers import AutoModelForMaskedLM
+    from transformers import AutoTokenizer
+    import torch
+
+    model_checkpoint = "distilbert-base-uncased"
+    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+    text = "This is a great [MASK]."
+
+    inputs = tokenizer(text, return_tensors="pt")
+    token_logits = model(**inputs).logits
+    # Find the location of [MASK] and extract its logits
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+    # Pick the [MASK] candidates with the highest logits
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+    for token in top_5_tokens:
+        print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
+
+
+
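Note that `group_texts` and `whole_word_masking_data_collator` use module-level names (`block_size`, `collections`, `np`, `wwm_probability`, `tokenizer`) that this new file never defines, so a caller has to bind them first. A minimal, hypothetical usage sketch, assuming the file is importable as `rbbt_dm.language_model`:

```python
# Bind the free names on the module before calling its functions.
import collections

import numpy as np
from transformers import AutoTokenizer

import rbbt_dm.language_model as lm

lm.collections = collections
lm.np = np
lm.block_size = 2                      # chunk length used by group_texts
lm.wwm_probability = 0.2               # fraction of words to mask
lm.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# group_texts expects a batch dict of token-id lists (e.g. a tokenized
# datasets batch); it concatenates them and re-splits into fixed blocks.
batch = {"input_ids": [[101, 2023, 102], [101, 2003, 102]],
         "attention_mask": [[1, 1, 1], [1, 1, 1]]}
chunks = lm.group_texts(batch)
# chunks["input_ids"] == [[101, 2023], [102, 101], [2003, 102]],
# and chunks["labels"] is a copy of chunks["input_ids"]
```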
data/python/rbbt_dm/util.py (new file)
@@ -0,0 +1,30 @@
+import random
+import torch
+import numpy
+
+def set_seed(seed):
+    """
+    Set seed in several backends
+    """
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+def deterministic():
+    """
+    Ensure that all operations are deterministic on GPU (if used) for
+    reproducibility
+    """
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+def device():
+    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+
+def data_directory():
+    from pathlib import Path
+    print(Path.home())
+
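A minimal sketch, assuming `rbbt_dm` is on the Python path, of combining these helpers to make a run reproducible:

```python
import torch

from rbbt_dm.util import set_seed, deterministic, device

set_seed(42)      # seeds random, numpy and torch (CUDA included when present)
deterministic()   # force deterministic cuDNN kernels
dev = device()    # cuda:0 if available, else cpu

x = torch.randn(4, 4, device=dev)  # identical values on every run
```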
data/share/spaCy/gpu/textcat_accuracy.conf
@@ -20,7 +20,8 @@ factory = "transformer"
 
 [components.transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v1"
-name = "
+name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+#name = "emilyalsentzer/Bio_ClinicalBERT"
 tokenizer_config = {"use_fast": true}
 
 [components.transformer.model.get_spans]
data/test/rbbt/vector/model/huggingface/test_masked_lm.rb (new file)
@@ -0,0 +1,41 @@
+require File.join(File.expand_path(File.dirname(__FILE__)),'../../../..', 'test_helper.rb')
+require 'rbbt/vector/model/huggingface/masked_lm'
+
+class TestMaskedLM < Test::Unit::TestCase
+  def test_train_new_word
+    TmpFile.with_file do |dir|
+
+      checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+      mlm = MaskedLMModel.new checkpoint, dir, tokenizer_args: {max_length: 16, model_max_length: 16}
+
+      mod, tokenizer = mlm.init
+      if tokenizer.vocab["[GENE]"].nil?
+        tokenizer.add_tokens("[GENE]")
+        mod.resize_token_embeddings(tokenizer.__len__)
+      end
+
+      100.times do
+        mlm.add "This [GENE] is [MASK] on tumor cells.", %w(expressed)
+        mlm.add "This [MASK] is expressed.", %w([GENE])
+      end
+
+      assert_equal "protein", mlm.eval(["This [MASK] is expressed."])
+
+      mlm.train
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = MaskedLMModel.new checkpoint, dir, :max_length => 16
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = VectorModel.new dir
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+    end
+  end
+end
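The test extends the vocabulary with a `[GENE]` token before fine-tuning. For reference, a sketch of the equivalent raw `transformers` calls the wrapper presumably makes:

```python
from transformers import AutoModelForMaskedLM, AutoTokenizer

checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

if "[GENE]" not in tokenizer.get_vocab():
    tokenizer.add_tokens(["[GENE]"])
    # Grow the embedding matrix so the new token id has a row
    model.resize_token_embeddings(len(tokenizer))
```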
data/test/rbbt/vector/model/test_huggingface.rb
@@ -3,7 +3,7 @@ require 'rbbt/vector/model/huggingface'
 
 class TestHuggingface < Test::Unit::TestCase
 
-  def
+  def _test_options
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
       task = "SequenceClassification"
@@ -11,20 +11,20 @@ class TestHuggingface < Test::Unit::TestCase
       model = HuggingfaceModel.new task, checkpoint, dir, :class_labels => %w(bad good)
       iii model.eval "This is dog"
       iii model.eval "This is cat"
-      iii model.
+      iii model.eval_list(["This is dog", "This is cat"])
 
       model = VectorModel.new dir
-      iii model.
+      iii model.eval_list(["This is dog", "This is cat"])
     end
   end
 
-  def
+  def _test_pipeline
     require 'rbbt/util/python'
     model = VectorModel.new
     model.post_process do |elements|
       elements.collect{|e| e['label'] }
     end
-    model.eval_model do |
+    model.eval_model do |elements|
       RbbtPython.run :transformers do
         classifier ||= transformers.pipeline("sentiment-analysis")
         classifier.call(elements)
@@ -33,21 +33,53 @@ class TestHuggingface < Test::Unit::TestCase
 
     assert_equal ["POSITIVE"], model.eval("I've been waiting for a HuggingFace course my whole life.")
   end
+
+  def _test_tokenizer_size
+    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+    tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_tokenizer,
+      "MaskedLM", checkpoint, :max_length => 5, :model_max_length => 5)
+    assert_equal 5, tokenizer.call("This is a sentence that has several words", truncation: true, max_length: 5)["input_ids"].__len__
+    assert_equal 5, tokenizer.call("This is a sentence that has several words", truncation: true)["input_ids"].__len__
+  end
 
-  def
+  def _test_sst_eval
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
 
-      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
+      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, :tokenizer_args => {:max_length => 16}
 
       model.model_options[:class_labels] = ["Bad", "Good"]
 
-      assert_equal
+      assert_equal "Bad", model.eval("This is dog")
+      assert_equal ["Bad", "Good"], model.eval_list(["This is dog", "This is cat"])
     end
   end
 
 
   def test_sst_train
+    TmpFile.with_file do |dir|
+      checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+
+      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, max_length: 128
+
+      model.model_options[:class_labels] = %w(Bad Good)
+
+      assert_equal ["Bad", "Good"], model.eval_list(["This is dog", "This is cat"])
+
+      100.times do
+        model.add "Dog is good", "Good"
+      end
+
+      model.train
+
+      assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
+
+      model = VectorModel.new dir
+      assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
+    end
+  end
+
+  def _test_sst_train_with_labels
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
 
@@ -55,28 +87,29 @@ class TestHuggingface < Test::Unit::TestCase
 
       model.model_options[:class_labels] = %w(Bad Good)
 
-      assert_equal ["Bad", "Good"], model.
+      assert_equal ["Bad", "Good"], model.eval_list(["This is dog", "This is cat"])
 
       100.times do
-        model.add "Dog is good",
+        model.add "Dog is good", "Good"
       end
 
       model.train
 
-      assert_equal ["Good", "Good"], model.
+      assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
 
       model = VectorModel.new dir
-      assert_equal ["Good", "Good"], model.
+      assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
     end
   end
 
-
+
+  def _test_sst_train_no_save
     checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
 
     model = HuggingfaceModel.new "SequenceClassification", checkpoint
     model.model_options[:class_labels] = ["Bad", "Good"]
 
-    assert_equal ["Bad", "Good"], model.
+    assert_equal ["Bad", "Good"], model.eval_list(["This is dog", "This is cat"])
 
     100.times do
       model.add "Dog is good", 1
@@ -84,48 +117,50 @@ class TestHuggingface < Test::Unit::TestCase
 
     model.train
 
-    assert_equal ["Good", "Good"], model.
+    assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
   end
 
-  def
+  def _test_sst_train_save_and_load
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
 
       model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
       model.model_options[:class_labels] = ["Bad", "Good"]
 
-      assert_equal ["Bad", "Good"], model.
+      assert_equal ["Bad", "Good"], model.eval_list(["This is dog", "This is cat"])
 
       100.times do
-        model.add "Dog is good",
+        model.add "Dog is good", "Good"
       end
 
       model.train
 
      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
 
-      assert_equal ["Good", "Good"], model.
+      assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
 
-
+      model_path = model.model_path
 
-      model = HuggingfaceModel.new "SequenceClassification",
+      model = HuggingfaceModel.new "SequenceClassification", model_path
       model.model_options[:class_labels] = ["Bad", "Good"]
 
-      assert_equal ["Good", "Good"], model.
+      assert_equal ["Good", "Good"], model.eval_list(["This is dog", "This is cat"])
 
       model = VectorModel.new dir
 
-      assert_equal "Good", model.
+      assert_equal "Good", model.eval_list("This is dog")
 
    end
  end
 
-  def
+  def _test_sst_stress_test
    TmpFile.with_file do |dir|
      checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
 
      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
 
+      assert_equal 0, model.eval("This is dog")
+
      100.times do
        model.add "Dog is good", 1
        model.add "Cat is bad", 0
@@ -136,18 +171,214 @@ class TestHuggingface < Test::Unit::TestCase
       end
 
       Misc.benchmark 1000 do
-        model.
+        model.eval_list(["This is good", "This is terrible", "This is dog", "This is cat", "Very different stuff", "Dog is bad", "Cat is good"])
       end
     end
   end
 
-  def
+  def _test_mask_eval
     checkpoint = "bert-base-uncased"
 
     model = HuggingfaceModel.new "MaskedLM", checkpoint
-    assert_equal 3, model.
+    assert_equal 3, model.eval_list(["Paris is the [MASK] of the France.", "The [MASK] worked very hard all the time.", "The [MASK] arrested the dangerous [MASK]."]).
       reject{|v| v.empty?}.length
   end
 
+  def _test_mask_eval_tokenizer
+    checkpoint = "bert-base-uncased"
+
+    model = HuggingfaceModel.new "MaskedLM", checkpoint
+
+    mod, tokenizer = model.init
+
+    orig = tokenizer.call("Hi [GENE]")["input_ids"]
+    tokenizer.add_tokens(["[GENE]"])
+    mod.resize_token_embeddings(tokenizer.__len__)
+    new = tokenizer.call("Hi [GENE]")["input_ids"]
+
+    assert orig.length > new.length
+  end
+
+
+  def _test_custom_class
+    TmpFile.with_file do |dir|
+      Open.write File.join(dir, "mypkg/__init__.py"), ""
+
+      Open.write File.join(dir, "mypkg/mymodel.py"), <<~EOF
+
+        # This class is the same as RobertaForTokenClassification
+        # Import the required methods
+        import torch.nn as nn
+        from transformers import RobertaConfig
+        from transformers.modeling_outputs import TokenClassifierOutput
+        from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel
+
+        # Create a class that inherits from RobertaPreTrainedModel
+        class RobertaForTokenClassification_NER(RobertaPreTrainedModel):
+            config_class = RobertaConfig
+
+            def __init__(self, config):
+                # Used to initialize the Roberta model
+                super().__init__(config)
+                # Number of labels to classify (the number of corpus labels * 2,
+                # one for the I tag and one for the B tag)
+                self.num_labels = config.num_labels
+                # No pooling layer, so the hidden states of every token are returned (not just CLS)
+                self.roberta = RobertaModel(config, add_pooling_layer=False)
+                self.dropout = nn.Dropout(config.hidden_dropout_prob)
+                self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+                self.init_weights()
+
+            def forward(self, input_ids = None, attention_mask = None, token_type_ids = None, labels = None,
+                        **kwargs):
+                # Encode the input (the hidden states)
+                outputs = self.roberta(input_ids, attention_mask = attention_mask,
+                                       token_type_ids = token_type_ids, **kwargs)
+
+                # Apply the dropout layer to the hidden states
+                sequence_output = self.dropout(outputs[0])
+                # And then the classification layer
+                logits = self.classifier(sequence_output)
+                # If labels has a value (as it does during training), compute the loss
+                # used to adjust the weights during backprop
+                loss = None
+                if labels is not None:
+                    loss_fct = nn.CrossEntropyLoss()
+                    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+                return TokenClassifierOutput(loss=loss, logits=logits,
+                                             hidden_states=outputs.hidden_states,
+                                             attentions=outputs.attentions)
+      EOF
+
+      RbbtPython.add_path dir
+
+      biomedical_roberta = "PlanTL-GOB-ES/bsc-bio-ehr-es-cantemist"
+      model = HuggingfaceModel.new "mypkg.mymodel:RobertaForTokenClassification_NER", biomedical_roberta
+
+      model.post_process do |result,is_list|
+        if is_list
+          RbbtPython.numpy2ruby result.predictions
+        else
+          result["logits"][0]
+        end
+      end
+
+      texto = "El paciente tiene un cáncer del pulmon"
+      assert model.eval(texto)[5][1] > 0
+    end
+  end
+
+  def _test_sst_train_word_embeddings
+    TmpFile.with_file do |dir|
+      checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+
+      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
+      model.model_options[:class_labels] = %w(Bad Good)
+
+      mod, tokenizer = model.init
+
+      orig = HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings')
+      orig = RbbtPython.numpy2ruby(orig.cpu.detach.numpy)
+
+      100.times do
+        model.add "Dog is good", "Good"
+      end
+
+      model.train
+
+      new = HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings')
+      new = RbbtPython.numpy2ruby(new.cpu.detach.numpy)
+
+      diff = []
+      new.each_with_index do |row,i|
+        diff << i if row != orig[i]
+      end
+
+      assert diff.length > 0
+    end
+  end
+
+  def _test_sst_freeze_word_embeddings
+    TmpFile.with_file do |dir|
+      checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+
+      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
+      model.model_options[:class_labels] = %w(Bad Good)
+
+      mod, tokenizer = model.init
+
+      layer = HuggingfaceModel.freeze_layer(mod, 'distilbert')
+
+      orig = HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings')
+      orig = RbbtPython.numpy2ruby(orig.cpu.detach.numpy)
+
+      100.times do
+        model.add "Dog is good", "Good"
+      end
+
+      model.train
+
+      new = HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings')
+      new = RbbtPython.numpy2ruby(new.cpu.detach.numpy)
+
+      diff = []
+      new.each_with_index do |row,i|
+        diff << i if row != orig[i]
+      end
+
+      assert diff.length == 0
+    end
+  end
+
+  def _test_sst_save_word_embeddings
+    TmpFile.with_file do |dir|
+      checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+
+      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
+      model.model_options[:class_labels] = %w(Bad Good)
+
+      mod, tokenizer = model.init
+
+      100.times do
+        model.add "Dog is good", "Good"
+      end
+
+      model.train
+
+      orig = RbbtPython.numpy2ruby(
+        HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings').cpu.detach.numpy)
+
+      model = HuggingfaceModel.new "MaskedLM", checkpoint, dir
+
+      mod, tokenizer = model.init
+
+      new = RbbtPython.numpy2ruby(
+        HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings').cpu.detach.numpy)
+
+
+      diff = []
+      new.each_with_index do |row,i|
+        diff << i if row != orig[i]
+      end
+
+      assert diff.length == 0
+
+      model = HuggingfaceModel.new "MaskedLM", checkpoint
+
+      mod, tokenizer = model.init
+
+      new = RbbtPython.numpy2ruby(
+        HuggingfaceModel.get_weights(mod, 'distilbert.embeddings.word_embeddings').cpu.detach.numpy)
+
+
+      diff = []
+      new.each_with_index do |row,i|
+        diff << i if row != orig[i]
+      end
+
+      assert diff.length > 0
+    end
+  end
 end
 
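`_test_sst_freeze_word_embeddings` relies on `HuggingfaceModel.freeze_layer`; in plain PyTorch the same effect is achieved by clearing `requires_grad` on the sub-module's parameters. A sketch, using the same `distilbert` layer prefix as the test:

```python
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english")

for param in model.distilbert.parameters():
    param.requires_grad = False  # excluded from gradient updates during training
```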
data/test/rbbt/vector/model/test_pytorch_lightning.rb (new file)
@@ -0,0 +1,83 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
+require 'rbbt/vector/model/pytorch_lightning'
+
+class TestPytorchLightning < Test::Unit::TestCase
+  def test_clustering
+    nsamples = 10
+    ngenes = 10000
+    samples = nsamples.times.collect{|i| "Sample-#{i}" }
+    data = TSV.setup({}, :key_field => "Gene", :fields => samples + ["cluster"], :type => :list, :cast => :to_f)
+
+    profiles = []
+    p0 = 3
+    p1 = 7
+    profiles[0] = nsamples.times.collect{ rand() + p0 }
+    profiles[1] = nsamples.times.collect{ rand() + p1 }
+
+    ngenes.times do |genen|
+      gene = "Gene-#{genen}"
+      cluster = genen % 2
+      values = profiles[cluster].collect do |m|
+        rand() + m
+      end
+      data[gene] = values + [cluster]
+    end
+
+    python = <<~EOF
+      import torch
+      from torch import nn
+      from torch.nn import functional as F
+      from torch.utils.data import DataLoader
+      from torch.utils.data import random_split
+      from torchvision.datasets import MNIST
+      from torchvision import transforms
+      import pytorch_lightning as pl
+
+      class TestPytorchLightningModel(pl.LightningModule):
+          def __init__(self, input_size=10, internal_dim=1):
+              super().__init__()
+              self.model = nn.Tanh()
+
+          def configure_optimizers(self):
+              optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
+              return optimizer
+
+          @torch.cuda.amp.autocast(True)
+          def forward(self, x):
+              x = x.to(self.dtype)
+              return self.model(x).squeeze()
+
+          @torch.cuda.amp.autocast(True)
+          def training_step(self, train_batch, batch_idx):
+              x, y = train_batch
+              x = x.to(self.dtype)
+              y = y.to(self.dtype)
+              y_hat = self.model(x).squeeze()
+              loss = F.mse_loss(y, y_hat)
+              self.log('train_loss', loss)
+              return loss
+
+          @torch.cuda.amp.custom_fwd(cast_inputs=torch.float64)
+          def validation_step(self, val_batch, batch_idx):
+              x, y = val_batch
+              y_hat = self.model(x)
+              loss = F.mse_loss(y, y_hat)
+              self.log('val_loss', loss)
+
+    EOF
+
+    with_python(python) do |pkg|
+      model = PytorchLightningModel.new pkg, "TestPytorchLightningModel", nil, model_args: {internal_dim: 1}
+      TmpFile.with_file(data.to_s) do |data_file|
+        ds = RbbtPython.call_method "rbbt_dm", :tsv, filename: data_file
+        model.loader = RbbtPython.class_new_obj("torch.utils.data", :DataLoader, dataset: ds, batch_size: 64)
+        model.trainer = RbbtPython.class_new_obj("pytorch_lightning", "Trainer", gpus: 1, max_epochs: 5, precision: 16)
+      end
+      model.train
+      encoding = model.eval_list(data.values.collect{|v| v[0..-2] }).detach().cpu().numpy()
+      iii encoding[0..10]
+    end
+  end
+
+end
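For reference, a sketch of the plain pytorch-lightning calls the Ruby wrapper appears to drive. `TestPytorchLightningModel` is the class from the heredoc above; the random dataset is a hypothetical stand-in, and the `Trainer` arguments mirror the test (they assume an available GPU and the pytorch-lightning version this gem targets):

```python
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader, TensorDataset

# Stand-in data: 100 rows of 10 features with a scalar target each
ds = TensorDataset(torch.randn(100, 10), torch.randint(0, 2, (100,)))
loader = DataLoader(ds, batch_size=64)

model = TestPytorchLightningModel(internal_dim=1)
trainer = pl.Trainer(gpus=1, max_epochs=5, precision=16)  # as in the test
trainer.fit(model, loader)

encoding = model(torch.randn(5, 10))  # forward pass, analogous to eval_list
```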
data/test/rbbt/vector/model/test_spaCy.rb
@@ -100,7 +100,7 @@ class TestSpaCyModel < Test::Unit::TestCase
     )
 
 
-    Rbbt::Config.set 'gpu_id',
+    Rbbt::Config.set 'gpu_id', 0, :spacy
     require 'rbbt/tsv/csv'
     url = "https://raw.githubusercontent.com/hanzhang0420/Women-Clothing-E-commerce/master/Womens%20Clothing%20E-Commerce%20Reviews.csv"
     tsv = TSV.csv(Open.open(url))
data/test/rbbt/vector/model/test_tensorflow.rb
@@ -1,5 +1,6 @@
 require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
 require 'rbbt/vector/model/tensorflow'
+require 'rbbt/util/python'
 
 class TestTensorflowModel < Test::Unit::TestCase
 
@@ -10,6 +11,7 @@ class TestTensorflowModel < Test::Unit::TestCase
 
   model = TensorFlowModel.new(
     dir,
+    jit_compile: true,
     optimizer: 'adam',
     loss: 'sparse_categorical_crossentropy',
     metrics: ['accuracy']
@@ -53,5 +55,6 @@ class TestTensorflowModel < Test::Unit::TestCase
     assert sum.to_f / predictions.length > 0.7
   end
 end
+
 end
 
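The new `jit_compile: true` option presumably maps onto Keras' XLA flag of the same name. A sketch of the underlying call, with a hypothetical one-layer model:

```python
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(10, activation='softmax')])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'],
              jit_compile=True)  # compile the train/eval step with XLA
```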