rbbt-dm 1.2.7 → 1.2.10
- checksums.yaml +4 -4
- data/lib/rbbt/matrix/barcode.rb +2 -2
- data/lib/rbbt/matrix/differential.rb +3 -3
- data/lib/rbbt/matrix/knowledge_base.rb +1 -1
- data/lib/rbbt/plots/bar.rb +1 -1
- data/lib/rbbt/stan.rb +1 -1
- data/lib/rbbt/statistics/hypergeometric.rb +2 -1
- data/lib/rbbt/vector/model/huggingface/masked_lm.rb +50 -0
- data/lib/rbbt/vector/model/huggingface.rb +39 -52
- data/lib/rbbt/vector/model/python.rb +33 -0
- data/lib/rbbt/vector/model/pytorch_lightning.rb +31 -0
- data/lib/rbbt/vector/model/random_forest.rb +1 -1
- data/lib/rbbt/vector/model/spaCy.rb +8 -6
- data/lib/rbbt/vector/model/tensorflow.rb +6 -5
- data/lib/rbbt/vector/model/torch/dataloader.rb +58 -0
- data/lib/rbbt/vector/model/torch/helpers.rb +52 -0
- data/lib/rbbt/vector/model/torch/introspection.rb +31 -0
- data/lib/rbbt/vector/model/torch/load_and_save.rb +30 -0
- data/lib/rbbt/vector/model/torch.rb +71 -0
- data/lib/rbbt/vector/model.rb +84 -54
- data/python/rbbt_dm/__init__.py +31 -1
- data/python/rbbt_dm/atcold/__init__.py +0 -0
- data/python/rbbt_dm/atcold/plot_lib.py +141 -0
- data/python/rbbt_dm/atcold/spiral.py +27 -0
- data/python/rbbt_dm/huggingface.py +64 -28
- data/python/rbbt_dm/language_model.py +70 -0
- data/python/rbbt_dm/util.py +32 -0
- data/share/spaCy/gpu/textcat_accuracy.conf +2 -1
- data/test/rbbt/vector/model/huggingface/test_masked_lm.rb +41 -0
- data/test/rbbt/vector/model/test_huggingface.rb +258 -27
- data/test/rbbt/vector/model/test_python.rb +31 -0
- data/test/rbbt/vector/model/test_pytorch_lightning.rb +97 -0
- data/test/rbbt/vector/model/test_spaCy.rb +1 -1
- data/test/rbbt/vector/model/test_tensorflow.rb +3 -0
- data/test/rbbt/vector/model/test_torch.rb +61 -0
- data/test/rbbt/vector/test_model.rb +25 -26
- data/test/test_helper.rb +13 -0
- metadata +35 -16
- data/lib/rbbt/tensorflow.rb +0 -43
- data/lib/rbbt/vector/model/huggingface.old.rb +0 -160
data/python/rbbt_dm/huggingface.py

@@ -1,32 +1,43 @@
 #{{{ LOAD MODEL
+import datasets
+import rbbt
 
 def import_module_class(module, class_name):
-  exec(f"from {module} import {class_name}")
+  if (not module == None):
+    exec(f"from {module} import {class_name}")
   return eval(class_name)
 
-def load_model(task, checkpoint):
-  class_name = 'AutoModelFor' + task
-  return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+def load_model(task, checkpoint, **kwargs):
+  if (":" in task):
+    module, class_name = task.split(":")
+    if (task == None):
+      module, class_name = None, module
+    return import_module_class(module, class_name).from_pretrained(checkpoint, **kwargs)
+  else:
+    class_name = 'AutoModelFor' + task
+    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
 
-def load_tokenizer(task, checkpoint):
+def load_tokenizer(task, checkpoint, **kwargs):
   class_name = 'AutoTokenizer'
-  return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+  return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
 
 def load_model_and_tokenizer(task, checkpoint):
   model = load_model(task, checkpoint)
   tokenizer = load_tokenizer(task, checkpoint)
   return model, tokenizer
 
-def load_model_and_tokenizer_from_directory(directory):
-  import os
-  import json
-  options_file = os.path.join(directory, 'options.json')
-  f = open(options_file, "r")
-  options = json.load(f.read())
-  f.close()
-  task = options["task"]
-  checkpoint = options["checkpoint"]
-  return load_model_and_tokenizer(task, checkpoint)
+# Not used
+
+#def load_model_and_tokenizer_from_directory(directory):
+#  import os
+#  import json
+#  options_file = os.path.join(directory, 'options.json')
+#  f = open(options_file, "r")
+#  options = json.load(f.read())
+#  f.close()
+#  task = options["task"]
+#  checkpoint = options["checkpoint"]
+#  return load_model_and_tokenizer(task, checkpoint)
 
 #{{{ SIMPLE EVALUATE
 
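The reworked loaders accept either a bare transformers Auto-class suffix ("SequenceClassification" becomes AutoModelForSequenceClassification) or a fully qualified "module:Class" task; the "module:Class" branch and load_tokenizer also forward **kwargs to from_pretrained. A minimal usage sketch (checkpoint name illustrative, assumes the transformers package is installed):

    from rbbt_dm.huggingface import load_model, load_tokenizer

    checkpoint = "distilbert-base-uncased"

    # Bare task suffix: resolves to transformers.AutoModelForSequenceClassification
    model = load_model("SequenceClassification", checkpoint)

    # "module:Class" task: loads any class that exposes from_pretrained
    mlm = load_model("transformers:AutoModelForMaskedLM", checkpoint)

    # The task argument is ignored for tokenizers; kwargs reach AutoTokenizer
    tokenizer = load_tokenizer(None, checkpoint, model_max_length=128)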
@@ -48,24 +59,42 @@ def eval_model(model, tokenizer, texts, return_logits = True):
 #{{{ TRAIN AND PREDICT
 
 def load_tsv(tsv_file):
-
-
+  tsv = rbbt.tsv(tsv_file)
+  print(tsv)
+  ds = datasets.Dataset.from_pandas(tsv)
+  d = datasets.DatasetDict()
+  d["train"] = ds
+  return d
+
+def load_json(json_file):
+  return datasets.load_dataset('json', data_files=[json_file])
+
+def tokenize_dataset(tokenizer, dataset):
+  return dataset.map(lambda subset: subset if ("input_ids" in subset.keys()) else tokenizer(subset["text"], truncation=True), batched=True)
 
 def tsv_dataset(tokenizer, tsv_file):
   dataset = load_tsv(tsv_file)
-
-
+  return tokenize_dataset(tokenizer, dataset)
+
+def json_dataset(tokenizer, json_file):
+  dataset = load_json(json_file)
+  return tokenize_dataset(tokenizer, dataset)
 
 def training_args(*args, **kwargs):
   from transformers import TrainingArguments
   training_args = TrainingArguments(*args, **kwargs)
   return training_args
 
-
-def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
+def train_model(model, tokenizer, training_args, dataset, class_weights=None, **kwargs):
   from transformers import Trainer
 
-
+  if (isinstance(dataset, str)):
+    if (dataset.endswith('.json')):
+      tokenized_dataset = json_dataset(tokenizer, dataset)
+    else:
+      tokenized_dataset = tsv_dataset(tokenizer, dataset)
+  else:
+    tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
   if (not class_weights == None):
     import torch
@@ -86,7 +115,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
         model,
         training_args,
         train_dataset = tokenized_dataset["train"],
-        tokenizer = tokenizer
+        tokenizer = tokenizer,
+        **kwargs
     )
   else:
 
@@ -94,7 +124,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
         model,
         training_args,
         train_dataset = tokenized_dataset["train"],
-        tokenizer = tokenizer
+        tokenizer = tokenizer,
+        **kwargs
     )
 
   trainer.train()
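train_model now dispatches on its dataset argument: a string ending in .json is loaded as a JSON dataset, any other string is read as a TSV file, and anything else is treated as an already-built datasets object that only needs tokenizing; extra keyword arguments are forwarded to the Trainer. A hedged sketch (file names and options illustrative; model and tokenizer as loaded above):

    import datasets
    from rbbt_dm.huggingface import training_args, train_model

    args = training_args("model.output", num_train_epochs=1)

    train_model(model, tokenizer, args, "examples.json")  # JSON path
    train_model(model, tokenizer, args, "examples.tsv")   # any other string: TSV path

    ds = datasets.DatasetDict(train=datasets.Dataset.from_dict(
        {"text": ["positive example"], "label": [1]}))
    train_model(model, tokenizer, args, ds)               # in-memory dataset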
@@ -124,10 +155,16 @@ def find_tokens_in_input(dataset, token_ids):
   return position_rows
 
 
-def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
+def predict_model(model, tokenizer, training_args, dataset, locate_tokens = None):
   from transformers import Trainer
 
-
+  if (isinstance(dataset, str)):
+    if (dataset.endswith('.json')):
+      tokenized_dataset = json_dataset(tokenizer, dataset)
+    else:
+      tokenized_dataset = tsv_dataset(tokenizer, dataset)
+  else:
+    tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
   trainer = Trainer(
       model,
@@ -143,4 +180,3 @@ def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
   else:
     return result
 
-
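predict_model applies the same dataset dispatch before running prediction. A hedged sketch, reusing args from the sketch above; the locate_tokens form is inferred from find_tokens_in_input, which scans each input row for the given token ids:

    result = predict_model(model, tokenizer, args, "examples.tsv")

    # Presumably: also report the positions of [MASK] tokens in each input
    masked = predict_model(model, tokenizer, args, "examples.json",
                           locate_tokens=[tokenizer.mask_token_id])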
data/python/rbbt_dm/language_model.py

@@ -0,0 +1,70 @@
+def group_texts(examples):
+    # Concatenate all texts.
+    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+    # customize this part to your needs.
+    total_length = (total_length // block_size) * block_size
+    # Split by chunks of max_len.
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+def whole_word_masking_data_collator(features):
+    from transformers import default_data_collator
+    for feature in features:
+        word_ids = feature.pop("word_ids")
+
+        # Create a map between words and corresponding token indices
+        mapping = collections.defaultdict(list)
+        current_word_index = -1
+        current_word = None
+        for idx, word_id in enumerate(word_ids):
+            if word_id is not None:
+                if word_id != current_word:
+                    current_word = word_id
+                    current_word_index += 1
+                mapping[current_word_index].append(idx)
+
+        # Randomly mask words
+        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
+        input_ids = feature["input_ids"]
+        labels = feature["labels"]
+        new_labels = [-100] * len(labels)
+        for word_id in np.where(mask)[0]:
+            word_id = word_id.item()
+            for idx in mapping[word_id]:
+                new_labels[idx] = labels[idx]
+                input_ids[idx] = tokenizer.mask_token_id
+        feature["labels"] = new_labels
+
+    return default_data_collator(features)
+
+if __name__ == "__main__2":
+
+    from transformers import AutoModelForMaskedLM
+    from transformers import AutoTokenizer
+    import torch
+
+    model_checkpoint = "distilbert-base-uncased"
+    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+    text = "This is a great [MASK]."
+
+    inputs = tokenizer(text, return_tensors="pt")
+    token_logits = model(**inputs).logits
+    # Find the location of [MASK] and extract its logits
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+    # Pick the [MASK] candidates with the highest logits
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+    for token in top_5_tokens:
+        print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
+
+
+
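group_texts and whole_word_masking_data_collator follow the Hugging Face course recipe for masked-LM fine-tuning and read module-level names that are never defined in this file: block_size, wwm_probability, tokenizer, plus the collections and np modules; the collator also expects each feature to carry a word_ids column. A hedged wiring sketch (values illustrative; assumes data/python is importable, as rbbt arranges when it drives these modules):

    import collections
    import numpy as np
    from transformers import AutoTokenizer
    import rbbt_dm.language_model as lm

    # Inject the globals the two functions rely on
    lm.collections = collections
    lm.np = np
    lm.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    lm.block_size = 128        # chunk length used by group_texts
    lm.wwm_probability = 0.2   # fraction of words the collator masks

    # chunked = tokenized_dataset.map(lm.group_texts, batched=True)
    # trainer = Trainer(..., data_collator=lm.whole_word_masking_data_collator)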
data/python/rbbt_dm/util.py

@@ -0,0 +1,32 @@
+import random
+import torch
+import numpy
+
+def set_seed(seed):
+    """
+    Set seed in several backends
+    """
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+def deterministic():
+    """
+    Ensure that all operations are deterministic on GPU (if used) for
+    reproducibility
+    """
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+def device():
+    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+
+def data_directory():
+    from pathlib import Path
+    print(Path.home())
+
+def model_device(model):
+    return next(model.parameters()).device
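A quick usage sketch of the new helpers (seed value arbitrary):

    import torch
    from rbbt_dm.util import set_seed, deterministic, device, model_device

    set_seed(31415)   # seeds random, numpy and torch (and CUDA when present)
    deterministic()   # disables cuDNN autotuning in favor of reproducible kernels

    layer = torch.nn.Linear(2, 2).to(device())  # cuda:0 when a GPU is visible, else cpu
    print(model_device(layer))                  # device of the first parameter

Note that data_directory only prints the home directory and returns None.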
data/share/spaCy/gpu/textcat_accuracy.conf

@@ -20,7 +20,8 @@ factory = "transformer"
 
 [components.transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v1"
-name = "
+name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+#name = "emilyalsentzer/Bio_ClinicalBERT"
 tokenizer_config = {"use_fast": true}
 
 [components.transformer.model.get_spans]
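The config pins the transformer backbone to PubMedBERT, keeping Bio_ClinicalBERT as a commented-out alternative. Presumably the file is consumed by the standard spaCy v3 CLI, along the lines of: python -m spacy train data/share/spaCy/gpu/textcat_accuracy.conf --gpu-id 0 --output output/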
data/test/rbbt/vector/model/huggingface/test_masked_lm.rb

@@ -0,0 +1,41 @@
+require File.join(File.expand_path(File.dirname(__FILE__)),'../../../..', 'test_helper.rb')
+require 'rbbt/vector/model/huggingface/masked_lm'
+
+class TestMaskedLM < Test::Unit::TestCase
+  def test_train_new_word
+    TmpFile.with_file do |dir|
+
+      checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+      mlm = MaskedLMModel.new checkpoint, dir, tokenizer_args: {max_length: 16, model_max_length: 16}
+
+      mod, tokenizer = mlm.init
+      if tokenizer.vocab["[GENE]"].nil?
+        tokenizer.add_tokens("[GENE]")
+        mod.resize_token_embeddings(tokenizer.__len__)
+      end
+
+      100.times do
+        mlm.add "This [GENE] is [MASK] on tumor cells.", %w(expressed)
+        mlm.add "This [MASK] is expressed.", %w([GENE])
+      end
+
+      assert_equal "protein", mlm.eval(["This [MASK] is expressed."])
+
+      mlm.train
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = MaskedLMModel.new checkpoint, dir, :max_length => 16
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = VectorModel.new dir
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+    end
+  end
+end