rbbt-dm 1.2.6 → 1.2.9
- checksums.yaml +4 -4
- data/lib/rbbt/matrix/barcode.rb +2 -2
- data/lib/rbbt/matrix/differential.rb +3 -3
- data/lib/rbbt/matrix/knowledge_base.rb +1 -1
- data/lib/rbbt/plots/bar.rb +1 -1
- data/lib/rbbt/stan.rb +1 -1
- data/lib/rbbt/statistics/hypergeometric.rb +2 -1
- data/lib/rbbt/vector/model/huggingface/masked_lm.rb +50 -0
- data/lib/rbbt/vector/model/huggingface.rb +57 -38
- data/lib/rbbt/vector/model/pytorch_lightning.rb +35 -0
- data/lib/rbbt/vector/model/random_forest.rb +1 -1
- data/lib/rbbt/vector/model/spaCy.rb +8 -14
- data/lib/rbbt/vector/model/tensorflow.rb +6 -5
- data/lib/rbbt/vector/model/torch.rb +37 -0
- data/lib/rbbt/vector/model/util.rb +18 -0
- data/lib/rbbt/vector/model.rb +100 -56
- data/python/rbbt_dm/__init__.py +48 -1
- data/python/rbbt_dm/atcold/__init__.py +0 -0
- data/python/rbbt_dm/atcold/plot_lib.py +141 -0
- data/python/rbbt_dm/atcold/spiral.py +27 -0
- data/python/rbbt_dm/huggingface.py +57 -26
- data/python/rbbt_dm/language_model.py +70 -0
- data/python/rbbt_dm/util.py +30 -0
- data/share/spaCy/gpu/textcat_accuracy.conf +2 -1
- data/test/rbbt/vector/model/huggingface/test_masked_lm.rb +41 -0
- data/test/rbbt/vector/model/test_huggingface.rb +258 -27
- data/test/rbbt/vector/model/test_pytorch_lightning.rb +83 -0
- data/test/rbbt/vector/model/test_spaCy.rb +1 -1
- data/test/rbbt/vector/model/test_tensorflow.rb +3 -0
- data/test/rbbt/vector/test_model.rb +25 -26
- data/test/test_helper.rb +13 -0
- metadata +26 -16
- data/lib/rbbt/tensorflow.rb +0 -43
- data/lib/rbbt/vector/model/huggingface.old.rb +0 -160
--- a/data/python/rbbt_dm/huggingface.py
+++ b/data/python/rbbt_dm/huggingface.py
@@ -1,32 +1,41 @@
 #{{{ LOAD MODEL
 
 def import_module_class(module, class_name):
-    exec(f"from {module} import {class_name}")
+    if (not module == None):
+        exec(f"from {module} import {class_name}")
     return eval(class_name)
 
-def load_model(task, checkpoint):
-    class_name = 'AutoModelFor' + task
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+def load_model(task, checkpoint, **kwargs):
+    if (":" in task):
+        module, class_name = task.split(":")
+        if (task == None):
+            module, class_name = None, module
+        return import_module_class(module, class_name).from_pretrained(checkpoint, **kwargs)
+    else:
+        class_name = 'AutoModelFor' + task
+        return import_module_class('transformers', class_name).from_pretrained(checkpoint)
 
-def load_tokenizer(task, checkpoint):
+def load_tokenizer(task, checkpoint, **kwargs):
     class_name = 'AutoTokenizer'
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+    return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
 
 def load_model_and_tokenizer(task, checkpoint):
     model = load_model(task, checkpoint)
     tokenizer = load_tokenizer(task, checkpoint)
     return model, tokenizer
 
-
-
-
-
-
-
-f
-
-
-
+# Not used
+
+#def load_model_and_tokenizer_from_directory(directory):
+#    import os
+#    import json
+#    options_file = os.path.join(directory, 'options.json')
+#    f = open(options_file, "r")
+#    options = json.load(f.read())
+#    f.close()
+#    task = options["task"]
+#    checkpoint = options["checkpoint"]
+#    return load_model_and_tokenizer(task, checkpoint)
 
 #{{{ SIMPLE EVALUATE
 
@@ -51,21 +60,36 @@ def load_tsv(tsv_file):
     from datasets import load_dataset
     return load_dataset('csv', data_files=[tsv_file], sep="\t")
 
+def load_json(json_file):
+    from datasets import load_dataset
+    return load_dataset('json', data_files=[json_file])
+
+def tokenize_dataset(tokenizer, dataset):
+    return dataset.map(lambda subset: subset if ("input_ids" in subset.keys()) else tokenizer(subset["text"], truncation=True), batched=True)
+
 def tsv_dataset(tokenizer, tsv_file):
     dataset = load_tsv(tsv_file)
-    tokenized_dataset = dataset.map(lambda subset: tokenizer(subset["text"], truncation=True), batched=True)
-    return tokenized_dataset
+    return tokenize_dataset(tokenizer, dataset)
+
+def json_dataset(tokenizer, json_file):
+    dataset = load_json(json_file)
+    return tokenize_dataset(tokenizer, dataset)
 
 def training_args(*args, **kwargs):
     from transformers import TrainingArguments
     training_args = TrainingArguments(*args, **kwargs)
     return training_args
 
-
-def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
+def train_model(model, tokenizer, training_args, dataset, class_weights=None, **kwargs):
     from transformers import Trainer
 
-    tokenized_dataset = tsv_dataset(tokenizer, tsv_file)
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     if (not class_weights == None):
         import torch
@@ -86,7 +110,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
     else:
 
@@ -94,7 +119,8 @@
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
 
     trainer.train()
@@ -124,10 +150,16 @@ def find_tokens_in_input(dataset, token_ids):
     return position_rows
 
 
-def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
+def predict_model(model, tokenizer, training_args, dataset, locate_tokens = None):
    from transformers import Trainer
 
-    tokenized_dataset = tsv_dataset(tokenizer, tsv_file)
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     trainer = Trainer(
         model,
@@ -143,4 +175,3 @@ def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
     else:
         return result
 
-
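
Taken together, these hunks change the public surface of the helpers: load_model understands an explicit "module:ClassName" task (importing the named class directly instead of composing an AutoModelFor* name), load_model and load_tokenizer forward extra keyword arguments to from_pretrained (note that on the plain-task branch of load_model the kwargs are currently not forwarded), and train_model/predict_model accept a .json path, a .tsv path, or an already-tokenized dataset object, forwarding extra kwargs to the Trainer. A hypothetical usage sketch follows; the rbbt_dm.huggingface import path, file names, and argument values are assumptions for illustration, not from the gem:

    # Hypothetical usage sketch of the updated helpers (paths/values assumed)
    from rbbt_dm.huggingface import load_model, load_tokenizer, training_args, train_model

    checkpoint = "distilbert-base-uncased"

    # Plain task name: resolves to transformers.AutoModelForSequenceClassification
    model = load_model("SequenceClassification", checkpoint)
    tokenizer = load_tokenizer("SequenceClassification", checkpoint)

    # Explicit "module:ClassName" task: the class is imported as given,
    # and keyword arguments reach from_pretrained on this branch
    model = load_model("transformers:AutoModelForSequenceClassification",
                       checkpoint, num_labels=2)

    args = training_args("./work")  # args/kwargs go straight to TrainingArguments

    # dataset may be a .json path, a .tsv path, or a datasets.Dataset;
    # class_weights selects the weighted-loss Trainer branch, and any extra
    # kwargs (e.g. a data_collator) are forwarded to the Trainer
    train_model(model, tokenizer, args, "train.json", class_weights=[1.0, 5.0])
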
--- /dev/null
+++ b/data/python/rbbt_dm/language_model.py
@@ -0,0 +1,70 @@
+def group_texts(examples):
+    # Concatenate all texts.
+    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+    # customize this part to your needs.
+    total_length = (total_length // block_size) * block_size
+    # Split by chunks of max_len.
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+def whole_word_masking_data_collator(features):
+    from transformers import default_data_collator
+    for feature in features:
+        word_ids = feature.pop("word_ids")
+
+        # Create a map between words and corresponding token indices
+        mapping = collections.defaultdict(list)
+        current_word_index = -1
+        current_word = None
+        for idx, word_id in enumerate(word_ids):
+            if word_id is not None:
+                if word_id != current_word:
+                    current_word = word_id
+                    current_word_index += 1
+                mapping[current_word_index].append(idx)
+
+        # Randomly mask words
+        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
+        input_ids = feature["input_ids"]
+        labels = feature["labels"]
+        new_labels = [-100] * len(labels)
+        for word_id in np.where(mask)[0]:
+            word_id = word_id.item()
+            for idx in mapping[word_id]:
+                new_labels[idx] = labels[idx]
+                input_ids[idx] = tokenizer.mask_token_id
+        feature["labels"] = new_labels
+
+    return default_data_collator(features)
+
+if __name__ == "__main__2":
+
+    from transformers import AutoModelForMaskedLM
+    from transformers import AutoTokenizer
+    import torch
+
+    model_checkpoint = "distilbert-base-uncased"
+    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+    text = "This is a great [MASK]."
+
+    inputs = tokenizer(text, return_tensors="pt")
+    token_logits = model(**inputs).logits
+    # Find the location of [MASK] and extract its logits
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+    # Pick the [MASK] candidates with the highest logits
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+    for token in top_5_tokens:
+        print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
+
+
+
--- /dev/null
+++ b/data/python/rbbt_dm/util.py
@@ -0,0 +1,30 @@
+import random
+import torch
+import numpy
+
+def set_seed(seed):
+    """
+    Set seed in several backends
+    """
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+def deterministic():
+    """
+    Ensure that all operations are deterministic on GPU (if used) for
+    reproducibility
+    """
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+def device():
+    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+
+def data_directory():
+    from pathlib import Path
+    print(Path.home())
+
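
A short usage sketch for these helpers; the rbbt_dm.util import path is assumed from the file's location in the gem. Note that data_directory as written prints Path.home() rather than returning a path:

    import torch
    from rbbt_dm.util import set_seed, deterministic, device  # assumed module path

    set_seed(42)     # seeds python's random, numpy and torch (plus CUDA if present)
    deterministic()  # prefers reproducible cuDNN kernels over autotuned ones
    dev = device()   # torch.device("cuda:0") when available, else CPU

    x = torch.randn(4, 4, device=dev)
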
--- a/data/share/spaCy/gpu/textcat_accuracy.conf
+++ b/data/share/spaCy/gpu/textcat_accuracy.conf
@@ -20,7 +20,8 @@ factory = "transformer"
 
 [components.transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v1"
-name = "
+name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+#name = "emilyalsentzer/Bio_ClinicalBERT"
 tokenizer_config = {"use_fast": true}
 
 [components.transformer.model.get_spans]
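
This pins the transformer backbone of the GPU textcat config to PubMedBERT, keeping Bio_ClinicalBERT as a commented-out alternative. A config like this is consumed by spaCy 3's training entry point; a hedged sketch, assuming spaCy >= 3.2 and an illustrative output path (neither is part of the gem):

    # Hypothetical: train the textcat pipeline from this config on GPU 0
    # (equivalent to: python -m spacy train textcat_accuracy.conf --gpu-id 0)
    from spacy.cli.train import train

    train("data/share/spaCy/gpu/textcat_accuracy.conf",
          output_path="./textcat_model", use_gpu=0)
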
--- /dev/null
+++ b/data/test/rbbt/vector/model/huggingface/test_masked_lm.rb
@@ -0,0 +1,41 @@
+require File.join(File.expand_path(File.dirname(__FILE__)),'../../../..', 'test_helper.rb')
+require 'rbbt/vector/model/huggingface/masked_lm'
+
+class TestMaskedLM < Test::Unit::TestCase
+  def test_train_new_word
+    TmpFile.with_file do |dir|
+
+      checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+      mlm = MaskedLMModel.new checkpoint, dir, tokenizer_args: {max_length: 16, model_max_length: 16}
+
+      mod, tokenizer = mlm.init
+      if tokenizer.vocab["[GENE]"].nil?
+        tokenizer.add_tokens("[GENE]")
+        mod.resize_token_embeddings(tokenizer.__len__)
+      end
+
+      100.times do
+        mlm.add "This [GENE] is [MASK] on tumor cells.", %w(expressed)
+        mlm.add "This [MASK] is expressed.", %w([GENE])
+      end
+
+      assert_equal "protein", mlm.eval(["This [MASK] is expressed."])
+
+      mlm.train
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = MaskedLMModel.new checkpoint, dir, :max_length => 16
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = VectorModel.new dir
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+    end
+  end
+end