rbbt-dm 1.2.7 → 1.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/matrix/barcode.rb +2 -2
- data/lib/rbbt/matrix/differential.rb +3 -3
- data/lib/rbbt/matrix/knowledge_base.rb +1 -1
- data/lib/rbbt/plots/bar.rb +1 -1
- data/lib/rbbt/stan.rb +1 -1
- data/lib/rbbt/statistics/hypergeometric.rb +2 -1
- data/lib/rbbt/vector/model/huggingface/masked_lm.rb +50 -0
- data/lib/rbbt/vector/model/huggingface.rb +39 -52
- data/lib/rbbt/vector/model/python.rb +33 -0
- data/lib/rbbt/vector/model/pytorch_lightning.rb +31 -0
- data/lib/rbbt/vector/model/random_forest.rb +1 -1
- data/lib/rbbt/vector/model/spaCy.rb +8 -6
- data/lib/rbbt/vector/model/tensorflow.rb +6 -5
- data/lib/rbbt/vector/model/torch/dataloader.rb +58 -0
- data/lib/rbbt/vector/model/torch/helpers.rb +52 -0
- data/lib/rbbt/vector/model/torch/introspection.rb +31 -0
- data/lib/rbbt/vector/model/torch/load_and_save.rb +30 -0
- data/lib/rbbt/vector/model/torch.rb +71 -0
- data/lib/rbbt/vector/model.rb +84 -54
- data/python/rbbt_dm/__init__.py +31 -1
- data/python/rbbt_dm/atcold/__init__.py +0 -0
- data/python/rbbt_dm/atcold/plot_lib.py +141 -0
- data/python/rbbt_dm/atcold/spiral.py +27 -0
- data/python/rbbt_dm/huggingface.py +64 -28
- data/python/rbbt_dm/language_model.py +70 -0
- data/python/rbbt_dm/util.py +32 -0
- data/share/spaCy/gpu/textcat_accuracy.conf +2 -1
- data/test/rbbt/vector/model/huggingface/test_masked_lm.rb +41 -0
- data/test/rbbt/vector/model/test_huggingface.rb +258 -27
- data/test/rbbt/vector/model/test_python.rb +31 -0
- data/test/rbbt/vector/model/test_pytorch_lightning.rb +97 -0
- data/test/rbbt/vector/model/test_spaCy.rb +1 -1
- data/test/rbbt/vector/model/test_tensorflow.rb +3 -0
- data/test/rbbt/vector/model/test_torch.rb +61 -0
- data/test/rbbt/vector/test_model.rb +25 -26
- data/test/test_helper.rb +13 -0
- metadata +35 -16
- data/lib/rbbt/tensorflow.rb +0 -43
- data/lib/rbbt/vector/model/huggingface.old.rb +0 -160
data/python/rbbt_dm/huggingface.py

@@ -1,32 +1,43 @@
 #{{{ LOAD MODEL
+import datasets
+import rbbt
 
 def import_module_class(module, class_name):
-    exec(f"from {module} import {class_name}")
+    if (not module == None):
+        exec(f"from {module} import {class_name}")
     return eval(class_name)
 
-def load_model(task, checkpoint):
-    class_name = 'AutoModelFor' + task
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+def load_model(task, checkpoint, **kwargs):
+    if (":" in task):
+        module, class_name = task.split(":")
+        if (task == None):
+            module, class_name = None, module
+        return import_module_class(module, class_name).from_pretrained(checkpoint, **kwargs)
+    else:
+        class_name = 'AutoModelFor' + task
+        return import_module_class('transformers', class_name).from_pretrained(checkpoint)
 
-def load_tokenizer(task, checkpoint):
+def load_tokenizer(task, checkpoint, **kwargs):
     class_name = 'AutoTokenizer'
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+    return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
 
 def load_model_and_tokenizer(task, checkpoint):
     model = load_model(task, checkpoint)
     tokenizer = load_tokenizer(task, checkpoint)
     return model, tokenizer
 
-def load_model_and_tokenizer_from_directory(directory):
-    import os
-    import json
-    options_file = os.path.join(directory, 'options.json')
-    f = open(options_file, "r")
-    options = json.load(f.read())
-    f.close()
-    task = options["task"]
-    checkpoint = options["checkpoint"]
-    return load_model_and_tokenizer(task, checkpoint)
+# Not used
+
+#def load_model_and_tokenizer_from_directory(directory):
+#  import os
+#  import json
+#  options_file = os.path.join(directory, 'options.json')
+#  f = open(options_file, "r")
+#  options = json.load(f.read())
+#  f.close()
+#  task = options["task"]
+#  checkpoint = options["checkpoint"]
+#  return load_model_and_tokenizer(task, checkpoint)
 
 #{{{ SIMPLE EVALUATE
 
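For orientation, the reworked loader accepts either a plain `AutoModelFor*` task suffix or an explicit `module:ClassName` pair. A minimal sketch of both call styles, assuming the module above imports as `rbbt_dm.huggingface` (checkpoint name illustrative, not from the diff):

```python
from rbbt_dm.huggingface import load_model, load_tokenizer  # assumed import path

# Plain task: resolved as transformers.AutoModelForSequenceClassification
model = load_model("SequenceClassification", "distilbert-base-uncased")

# "module:ClassName" task: the class is imported dynamically from the named module
mlm = load_model("transformers:AutoModelForMaskedLM", "distilbert-base-uncased")

# The tokenizer loader ignores the task and now forwards extra kwargs to from_pretrained
tokenizer = load_tokenizer(None, "distilbert-base-uncased", model_max_length=128)
```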
@@ -48,24 +59,42 @@ def eval_model(model, tokenizer, texts, return_logits = True):
 #{{{ TRAIN AND PREDICT
 
 def load_tsv(tsv_file):
-
-
+    tsv = rbbt.tsv(tsv_file)
+    print(tsv)
+    ds = datasets.Dataset.from_pandas(tsv)
+    d = datasets.DatasetDict()
+    d["train"] = ds
+    return d
+
+def load_json(json_file):
+    return datasets.load_dataset('json', data_files=[json_file])
+
+def tokenize_dataset(tokenizer, dataset):
+    return dataset.map(lambda subset: subset if ("input_ids" in subset.keys()) else tokenizer(subset["text"], truncation=True), batched=True)
 
 def tsv_dataset(tokenizer, tsv_file):
     dataset = load_tsv(tsv_file)
-
-
+    return tokenize_dataset(tokenizer, dataset)
+
+def json_dataset(tokenizer, json_file):
+    dataset = load_json(json_file)
+    return tokenize_dataset(tokenizer, dataset)
 
 def training_args(*args, **kwargs):
     from transformers import TrainingArguments
     training_args = TrainingArguments(*args, **kwargs)
     return training_args
 
-
-def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
+def train_model(model, tokenizer, training_args, dataset, class_weights=None, **kwargs):
     from transformers import Trainer
 
-
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     if (not class_weights == None):
         import torch
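The new loaders dispatch on the argument: string paths are routed by extension, anything else is treated as an already-loaded `datasets` object and only tokenized. A sketch, assuming a tokenizer from `load_tokenizer` and files with a `text` column (file names illustrative):

```python
import datasets

d_tsv  = tsv_dataset(tokenizer, "labels.tsv")    # rbbt.tsv -> pandas -> Dataset, then tokenized
d_json = json_dataset(tokenizer, "labels.json")  # datasets.load_dataset('json', ...), then tokenized

# In-memory datasets skip the file loaders and go straight to tokenize_dataset
d_mem = tokenize_dataset(tokenizer, datasets.DatasetDict(
    {"train": datasets.Dataset.from_dict({"text": ["a good day"], "label": [1]})}))
```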
@@ -86,7 +115,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
     else:
 
@@ -94,7 +124,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
 
     trainer.train()
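Both `Trainer` constructions now forward `**kwargs`, so any extra `Trainer` option can be injected by the caller. A sketch passing a custom collator (the collator name refers to `language_model.py` further below; the output directory is illustrative):

```python
train_model(model, tokenizer, training_args("output_dir"),
            "sentences.json",
            data_collator=whole_word_masking_data_collator)  # forwarded into Trainer(...)
```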
@@ -124,10 +155,16 @@ def find_tokens_in_input(dataset, token_ids):
     return position_rows
 
 
-def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
+def predict_model(model, tokenizer, training_args, dataset, locate_tokens = None):
     from transformers import Trainer
 
-
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     trainer = Trainer(
         model,
@@ -143,4 +180,3 @@ def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
     else:
         return result
 
-
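`predict_model` gains the same string-versus-dataset dispatch as `train_model`; a minimal call might look like this (file name illustrative):

```python
result = predict_model(model, tokenizer, training_args("output_dir"), "unlabeled.tsv")
# result holds the trainer predictions (the `return result` branch above); passing
# locate_tokens instead applies find_tokens_in_input to the tokenized dataset
```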
data/python/rbbt_dm/language_model.py

@@ -0,0 +1,70 @@
+def group_texts(examples):
+    # Concatenate all texts.
+    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+    # customize this part to your needs.
+    total_length = (total_length // block_size) * block_size
+    # Split by chunks of max_len.
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+def whole_word_masking_data_collator(features):
+    from transformers import default_data_collator
+    for feature in features:
+        word_ids = feature.pop("word_ids")
+
+        # Create a map between words and corresponding token indices
+        mapping = collections.defaultdict(list)
+        current_word_index = -1
+        current_word = None
+        for idx, word_id in enumerate(word_ids):
+            if word_id is not None:
+                if word_id != current_word:
+                    current_word = word_id
+                    current_word_index += 1
+                mapping[current_word_index].append(idx)
+
+        # Randomly mask words
+        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
+        input_ids = feature["input_ids"]
+        labels = feature["labels"]
+        new_labels = [-100] * len(labels)
+        for word_id in np.where(mask)[0]:
+            word_id = word_id.item()
+            for idx in mapping[word_id]:
+                new_labels[idx] = labels[idx]
+                input_ids[idx] = tokenizer.mask_token_id
+        feature["labels"] = new_labels
+
+    return default_data_collator(features)
+
+if __name__ == "__main__2":
+
+    from transformers import AutoModelForMaskedLM
+    from transformers import AutoTokenizer
+    import torch
+
+    model_checkpoint = "distilbert-base-uncased"
+    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+    text = "This is a great [MASK]."
+
+    inputs = tokenizer(text, return_tensors="pt")
+    token_logits = model(**inputs).logits
+    # Find the location of [MASK] and extract its logits
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+    # Pick the [MASK] candidates with the highest logits
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+    for token in top_5_tokens:
+        print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
+
+
+
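Note that `group_texts` and `whole_word_masking_data_collator` read several module-level names this file never defines (`block_size`, `wwm_probability`, `tokenizer`, `np`, `collections`); callers are evidently expected to bind them first. A sketch of plausible values, not part of the diff:

```python
import collections                  # used by whole_word_masking_data_collator
import numpy as np                  # used for the random masking draw
from transformers import AutoTokenizer

block_size = 128                    # chunk length consumed by group_texts
wwm_probability = 0.2               # fraction of whole words to mask
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
```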
data/python/rbbt_dm/util.py

@@ -0,0 +1,32 @@
+import random
+import torch
+import numpy
+
+def set_seed(seed):
+    """
+    Set seed in several backends
+    """
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+def deterministic():
+    """
+    Ensure that all operations are deterministic on GPU (if used) for
+    reproducibility
+    """
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+def device():
+    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+
+def data_directory():
+    from pathlib import Path
+    print(Path.home())
+
+def model_device(model):
+    return next(model.parameters()).device
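Typical use of these helpers for a reproducible run, assuming the module imports as `rbbt_dm.util`:

```python
import torch
from rbbt_dm.util import set_seed, deterministic, device, model_device  # assumed import path

set_seed(42)       # seeds random, numpy and torch (plus CUDA when available)
deterministic()    # pins cuDNN to deterministic kernels

model = torch.nn.Linear(4, 2).to(device())  # any torch model; cuda:0 if available
print(model_device(model))                  # reports the device the model lives on
```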
data/share/spaCy/gpu/textcat_accuracy.conf

@@ -20,7 +20,8 @@ factory = "transformer"
 
 [components.transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v1"
-name = "emilyalsentzer/Bio_ClinicalBERT"
+name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+#name = "emilyalsentzer/Bio_ClinicalBERT"
 tokenizer_config = {"use_fast": true}
 
 [components.transformer.model.get_spans]
data/test/rbbt/vector/model/huggingface/test_masked_lm.rb

@@ -0,0 +1,41 @@
+require File.join(File.expand_path(File.dirname(__FILE__)),'../../../..', 'test_helper.rb')
+require 'rbbt/vector/model/huggingface/masked_lm'
+
+class TestMaskedLM < Test::Unit::TestCase
+  def test_train_new_word
+    TmpFile.with_file do |dir|
+
+      checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+      mlm = MaskedLMModel.new checkpoint, dir, tokenizer_args: {max_length: 16, model_max_length: 16}
+
+      mod, tokenizer = mlm.init
+      if tokenizer.vocab["[GENE]"].nil?
+        tokenizer.add_tokens("[GENE]")
+        mod.resize_token_embeddings(tokenizer.__len__)
+      end
+
+      100.times do
+        mlm.add "This [GENE] is [MASK] on tumor cells.", %w(expressed)
+        mlm.add "This [MASK] is expressed.", %w([GENE])
+      end
+
+      assert_equal "protein", mlm.eval(["This [MASK] is expressed."])
+
+      mlm.train
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = MaskedLMModel.new checkpoint, dir, :max_length => 16
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = VectorModel.new dir
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+    end
+  end
+end