rbbt-dm 1.2.6 → 1.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/matrix/barcode.rb +2 -2
- data/lib/rbbt/matrix/differential.rb +3 -3
- data/lib/rbbt/matrix/knowledge_base.rb +1 -1
- data/lib/rbbt/plots/bar.rb +1 -1
- data/lib/rbbt/stan.rb +1 -1
- data/lib/rbbt/statistics/hypergeometric.rb +2 -1
- data/lib/rbbt/vector/model/huggingface/masked_lm.rb +50 -0
- data/lib/rbbt/vector/model/huggingface.rb +57 -38
- data/lib/rbbt/vector/model/pytorch_lightning.rb +35 -0
- data/lib/rbbt/vector/model/random_forest.rb +1 -1
- data/lib/rbbt/vector/model/spaCy.rb +8 -14
- data/lib/rbbt/vector/model/tensorflow.rb +6 -5
- data/lib/rbbt/vector/model/torch.rb +37 -0
- data/lib/rbbt/vector/model/util.rb +18 -0
- data/lib/rbbt/vector/model.rb +100 -56
- data/python/rbbt_dm/__init__.py +48 -1
- data/python/rbbt_dm/atcold/__init__.py +0 -0
- data/python/rbbt_dm/atcold/plot_lib.py +141 -0
- data/python/rbbt_dm/atcold/spiral.py +27 -0
- data/python/rbbt_dm/huggingface.py +57 -26
- data/python/rbbt_dm/language_model.py +70 -0
- data/python/rbbt_dm/util.py +30 -0
- data/share/spaCy/gpu/textcat_accuracy.conf +2 -1
- data/test/rbbt/vector/model/huggingface/test_masked_lm.rb +41 -0
- data/test/rbbt/vector/model/test_huggingface.rb +258 -27
- data/test/rbbt/vector/model/test_pytorch_lightning.rb +83 -0
- data/test/rbbt/vector/model/test_spaCy.rb +1 -1
- data/test/rbbt/vector/model/test_tensorflow.rb +3 -0
- data/test/rbbt/vector/test_model.rb +25 -26
- data/test/test_helper.rb +13 -0
- metadata +26 -16
- data/lib/rbbt/tensorflow.rb +0 -43
- data/lib/rbbt/vector/model/huggingface.old.rb +0 -160
data/python/rbbt_dm/huggingface.py

@@ -1,32 +1,41 @@
 #{{{ LOAD MODEL
 
 def import_module_class(module, class_name):
-    exec(f"from {module} import {class_name}")
+    if (not module == None):
+        exec(f"from {module} import {class_name}")
     return eval(class_name)
 
-def load_model(task, checkpoint):
-    class_name = 'AutoModelFor' + task
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+def load_model(task, checkpoint, **kwargs):
+    if (":" in task):
+        module, class_name = task.split(":")
+        if (task == None):
+            module, class_name = None, module
+        return import_module_class(module, class_name).from_pretrained(checkpoint, **kwargs)
+    else:
+        class_name = 'AutoModelFor' + task
+        return import_module_class('transformers', class_name).from_pretrained(checkpoint)
 
-def load_tokenizer(task, checkpoint):
+def load_tokenizer(task, checkpoint, **kwargs):
     class_name = 'AutoTokenizer'
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+    return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
 
 def load_model_and_tokenizer(task, checkpoint):
     model = load_model(task, checkpoint)
     tokenizer = load_tokenizer(task, checkpoint)
     return model, tokenizer
 
-def load_model_and_tokenizer_from_directory(directory):
-    import os
-    import json
-    options_file = os.path.join(directory, 'options.json')
-    f = open(options_file, "r")
-    options = json.load(f.read())
-    f.close()
-    task = options["task"]
-    checkpoint = options["checkpoint"]
-    return load_model_and_tokenizer(task, checkpoint)
+# Not used
+
+#def load_model_and_tokenizer_from_directory(directory):
+#    import os
+#    import json
+#    options_file = os.path.join(directory, 'options.json')
+#    f = open(options_file, "r")
+#    options = json.load(f.read())
+#    f.close()
+#    task = options["task"]
+#    checkpoint = options["checkpoint"]
+#    return load_model_and_tokenizer(task, checkpoint)
 
 #{{{ SIMPLE EVALUATE
 
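For orientation (not part of the diff): the reworked load_model accepts either a bare task name, which resolves to a transformers Auto class, or a "module:ClassName" pair, and only the latter branch forwards extra keyword arguments to from_pretrained. A minimal sketch, assuming the transformers library is installed:

    # Bare task name: resolves to transformers.AutoModelForSequenceClassification
    model = load_model("SequenceClassification", "distilbert-base-uncased")

    # "module:ClassName" task: imports ClassName from module and forwards
    # extra keyword arguments to from_pretrained
    model = load_model("transformers:AutoModelForSequenceClassification",
                       "distilbert-base-uncased", num_labels=3)

    # load_tokenizer ignores the task and forwards kwargs the same way
    tokenizer = load_tokenizer(None, "distilbert-base-uncased", model_max_length=128)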
@@ -51,21 +60,36 @@ def load_tsv(tsv_file):
     from datasets import load_dataset
     return load_dataset('csv', data_files=[tsv_file], sep="\t")
 
+def load_json(json_file):
+    from datasets import load_dataset
+    return load_dataset('json', data_files=[json_file])
+
+def tokenize_dataset(tokenizer, dataset):
+    return dataset.map(lambda subset: subset if ("input_ids" in subset.keys()) else tokenizer(subset["text"], truncation=True), batched=True)
+
 def tsv_dataset(tokenizer, tsv_file):
     dataset = load_tsv(tsv_file)
-    tokenized_dataset = dataset.map(lambda subset: subset if ("input_ids" in subset.keys()) else tokenizer(subset["text"], truncation=True), batched=True)
-    return tokenized_dataset
+    return tokenize_dataset(tokenizer, dataset)
+
+def json_dataset(tokenizer, json_file):
+    dataset = load_json(json_file)
+    return tokenize_dataset(tokenizer, dataset)
 
 def training_args(*args, **kwargs):
     from transformers import TrainingArguments
     training_args = TrainingArguments(*args, **kwargs)
     return training_args
 
-
-def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
+def train_model(model, tokenizer, training_args, dataset, class_weights=None, **kwargs):
     from transformers import Trainer
 
-    tokenized_dataset = tsv_dataset(tokenizer, tsv_file)
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     if (not class_weights == None):
         import torch

@@ -86,7 +110,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
     else:
 
@@ -94,7 +119,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
 
     trainer.train()

@@ -124,10 +150,16 @@ def find_tokens_in_input(dataset, token_ids):
     return position_rows
 
 
-def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
+def predict_model(model, tokenizer, training_args, dataset, locate_tokens = None):
     from transformers import Trainer
 
-    tokenized_dataset = tsv_dataset(tokenizer, tsv_file)
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     trainer = Trainer(
         model,

@@ -143,4 +175,3 @@ def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
     else:
         return result
 
-
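A hypothetical usage sketch for the generalized train_model (assuming transformers and datasets are installed; names like tmp_output and examples.tsv are placeholders): the dataset argument may now be a .json path, a .tsv path, or an already loaded dataset, and extra keyword arguments reach the Trainer.

    model, tokenizer = load_model_and_tokenizer("SequenceClassification", "distilbert-base-uncased")
    args = training_args("tmp_output", num_train_epochs=1)

    # From a file: the extension selects json_dataset or tsv_dataset
    train_model(model, tokenizer, args, "examples.tsv")

    # From an in-memory DatasetDict with a "train" split; tokenize_dataset
    # leaves rows that already carry "input_ids" untouched
    from datasets import Dataset
    ds = Dataset.from_dict({"text": ["good", "bad"], "label": [1, 0]}).train_test_split(test_size=0.5)
    train_model(model, tokenizer, args, ds)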
data/python/rbbt_dm/language_model.py

@@ -0,0 +1,70 @@
+def group_texts(examples):
+    # Concatenate all texts.
+    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+    # customize this part to your needs.
+    total_length = (total_length // block_size) * block_size
+    # Split by chunks of max_len.
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+def whole_word_masking_data_collator(features):
+    from transformers import default_data_collator
+    for feature in features:
+        word_ids = feature.pop("word_ids")
+
+        # Create a map between words and corresponding token indices
+        mapping = collections.defaultdict(list)
+        current_word_index = -1
+        current_word = None
+        for idx, word_id in enumerate(word_ids):
+            if word_id is not None:
+                if word_id != current_word:
+                    current_word = word_id
+                    current_word_index += 1
+                mapping[current_word_index].append(idx)
+
+        # Randomly mask words
+        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
+        input_ids = feature["input_ids"]
+        labels = feature["labels"]
+        new_labels = [-100] * len(labels)
+        for word_id in np.where(mask)[0]:
+            word_id = word_id.item()
+            for idx in mapping[word_id]:
+                new_labels[idx] = labels[idx]
+                input_ids[idx] = tokenizer.mask_token_id
+        feature["labels"] = new_labels
+
+    return default_data_collator(features)
+
+if __name__ == "__main__2":
+
+    from transformers import AutoModelForMaskedLM
+    from transformers import AutoTokenizer
+    import torch
+
+    model_checkpoint = "distilbert-base-uncased"
+    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+    text = "This is a great [MASK]."
+
+    inputs = tokenizer(text, return_tensors="pt")
+    token_logits = model(**inputs).logits
+    # Find the location of [MASK] and extract its logits
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+    # Pick the [MASK] candidates with the highest logits
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+    for token in top_5_tokens:
+        print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
+
+
+
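Worth noting (editor's gloss, not in the diff): group_texts and whole_word_masking_data_collator close over names this file never defines — block_size, wwm_probability, tokenizer, np and collections — so a caller must supply them, roughly as follows (a sketch in the spirit of the HuggingFace course material these helpers mirror):

    import collections
    import numpy as np
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    block_size = 128        # chunk length used by group_texts
    wwm_probability = 0.2   # fraction of whole words masked per example

    # Typical flow: tokenize keeping word_ids, regroup into fixed-size
    # chunks, then collate training batches with whole-word masking:
    # lm_dataset = tokenized_dataset.map(group_texts, batched=True)
    # trainer = Trainer(..., data_collator=whole_word_masking_data_collator)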
data/python/rbbt_dm/util.py

@@ -0,0 +1,30 @@
+import random
+import torch
+import numpy
+
+def set_seed(seed):
+    """
+    Set seed in several backends
+    """
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+def deterministic():
+    """
+    Ensure that all operations are deterministic on GPU (if used) for
+    reproducibility
+    """
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+def device():
+    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+
+def data_directory():
+    from pathlib import Path
+    print(Path.home())
+
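These utilities are meant to be called once at the top of a training script; a sketch, assuming the module is importable as rbbt_dm.util (note that data_directory as written only prints Path.home() and returns nothing):

    from rbbt_dm.util import set_seed, deterministic, device

    set_seed(42)     # seeds random, numpy and torch (CPU and all GPUs)
    deterministic()  # force deterministic cuDNN kernels; disables autotuning
    dev = device()   # "cuda:0" when available, otherwise "cpu"
    # a torch module would then be placed with model.to(dev)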
data/share/spaCy/gpu/textcat_accuracy.conf

@@ -20,7 +20,8 @@ factory = "transformer"
 
 [components.transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v1"
-name = "emilyalsentzer/Bio_ClinicalBERT"
+name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+#name = "emilyalsentzer/Bio_ClinicalBERT"
 tokenizer_config = {"use_fast": true}
 
 [components.transformer.model.get_spans]
data/test/rbbt/vector/model/huggingface/test_masked_lm.rb

@@ -0,0 +1,41 @@
+require File.join(File.expand_path(File.dirname(__FILE__)),'../../../..', 'test_helper.rb')
+require 'rbbt/vector/model/huggingface/masked_lm'
+
+class TestMaskedLM < Test::Unit::TestCase
+  def test_train_new_word
+    TmpFile.with_file do |dir|
+
+      checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+      mlm = MaskedLMModel.new checkpoint, dir, tokenizer_args: {max_length: 16, model_max_length: 16}
+
+      mod, tokenizer = mlm.init
+      if tokenizer.vocab["[GENE]"].nil?
+        tokenizer.add_tokens("[GENE]")
+        mod.resize_token_embeddings(tokenizer.__len__)
+      end
+
+      100.times do
+        mlm.add "This [GENE] is [MASK] on tumor cells.", %w(expressed)
+        mlm.add "This [MASK] is expressed.", %w([GENE])
+      end
+
+      assert_equal "protein", mlm.eval(["This [MASK] is expressed."])
+
+      mlm.train
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = MaskedLMModel.new checkpoint, dir, :max_length => 16
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = VectorModel.new dir
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+    end
+  end
+end