rbbt-dm 1.2.6 → 1.2.9

Files changed (34)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/matrix/barcode.rb +2 -2
  3. data/lib/rbbt/matrix/differential.rb +3 -3
  4. data/lib/rbbt/matrix/knowledge_base.rb +1 -1
  5. data/lib/rbbt/plots/bar.rb +1 -1
  6. data/lib/rbbt/stan.rb +1 -1
  7. data/lib/rbbt/statistics/hypergeometric.rb +2 -1
  8. data/lib/rbbt/vector/model/huggingface/masked_lm.rb +50 -0
  9. data/lib/rbbt/vector/model/huggingface.rb +57 -38
  10. data/lib/rbbt/vector/model/pytorch_lightning.rb +35 -0
  11. data/lib/rbbt/vector/model/random_forest.rb +1 -1
  12. data/lib/rbbt/vector/model/spaCy.rb +8 -14
  13. data/lib/rbbt/vector/model/tensorflow.rb +6 -5
  14. data/lib/rbbt/vector/model/torch.rb +37 -0
  15. data/lib/rbbt/vector/model/util.rb +18 -0
  16. data/lib/rbbt/vector/model.rb +100 -56
  17. data/python/rbbt_dm/__init__.py +48 -1
  18. data/python/rbbt_dm/atcold/__init__.py +0 -0
  19. data/python/rbbt_dm/atcold/plot_lib.py +141 -0
  20. data/python/rbbt_dm/atcold/spiral.py +27 -0
  21. data/python/rbbt_dm/huggingface.py +57 -26
  22. data/python/rbbt_dm/language_model.py +70 -0
  23. data/python/rbbt_dm/util.py +30 -0
  24. data/share/spaCy/gpu/textcat_accuracy.conf +2 -1
  25. data/test/rbbt/vector/model/huggingface/test_masked_lm.rb +41 -0
  26. data/test/rbbt/vector/model/test_huggingface.rb +258 -27
  27. data/test/rbbt/vector/model/test_pytorch_lightning.rb +83 -0
  28. data/test/rbbt/vector/model/test_spaCy.rb +1 -1
  29. data/test/rbbt/vector/model/test_tensorflow.rb +3 -0
  30. data/test/rbbt/vector/test_model.rb +25 -26
  31. data/test/test_helper.rb +13 -0
  32. metadata +26 -16
  33. data/lib/rbbt/tensorflow.rb +0 -43
  34. data/lib/rbbt/vector/model/huggingface.old.rb +0 -160
data/python/rbbt_dm/huggingface.py
@@ -1,32 +1,41 @@
 #{{{ LOAD MODEL
 
 def import_module_class(module, class_name):
-    exec(f"from {module} import {class_name}")
+    if (not module == None):
+        exec(f"from {module} import {class_name}")
     return eval(class_name)
 
-def load_model(task, checkpoint):
-    class_name = 'AutoModelFor' + task
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+def load_model(task, checkpoint, **kwargs):
+    if (":" in task):
+        module, class_name = task.split(":")
+        if (task == None):
+            module, class_name = None, module
+        return import_module_class(module, class_name).from_pretrained(checkpoint, **kwargs)
+    else:
+        class_name = 'AutoModelFor' + task
+        return import_module_class('transformers', class_name).from_pretrained(checkpoint)
 
-def load_tokenizer(task, checkpoint):
+def load_tokenizer(task, checkpoint, **kwargs):
     class_name = 'AutoTokenizer'
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+    return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
 
 def load_model_and_tokenizer(task, checkpoint):
     model = load_model(task, checkpoint)
     tokenizer = load_tokenizer(task, checkpoint)
     return model, tokenizer
 
-def load_model_and_tokenizer_from_directory(directory):
-    import os
-    import json
-    options_file = os.path.join(directory, 'options.json')
-    f = open(options_file, "r")
-    options = json.load(f.read())
-    f.close()
-    task = options["task"]
-    checkpoint = options["checkpoint"]
-    return load_model_and_tokenizer(task, checkpoint)
+# Not used
+
+#def load_model_and_tokenizer_from_directory(directory):
+#    import os
+#    import json
+#    options_file = os.path.join(directory, 'options.json')
+#    f = open(options_file, "r")
+#    options = json.load(f.read())
+#    f.close()
+#    task = options["task"]
+#    checkpoint = options["checkpoint"]
+#    return load_model_and_tokenizer(task, checkpoint)
 
 #{{{ SIMPLE EVALUATE
 
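In practice, load_model now accepts either a transformers AutoModelFor* task suffix or an explicit module:ClassName pair, forwarding **kwargs to from_pretrained in the latter case. A minimal sketch of the two call styles (the checkpoint and class names here are illustrative assumptions, not taken from the diff):

    # Task suffix: resolved to transformers.AutoModelForSequenceClassification
    model = load_model("SequenceClassification", "distilbert-base-uncased")

    # Explicit module:ClassName pair; extra keyword arguments reach from_pretrained
    model = load_model("transformers:DistilBertForMaskedLM",
                       "distilbert-base-uncased")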
@@ -51,21 +60,36 @@ def load_tsv(tsv_file):
     from datasets import load_dataset
     return load_dataset('csv', data_files=[tsv_file], sep="\t")
 
+def load_json(json_file):
+    from datasets import load_dataset
+    return load_dataset('json', data_files=[json_file])
+
+def tokenize_dataset(tokenizer, dataset):
+    return dataset.map(lambda subset: subset if ("input_ids" in subset.keys()) else tokenizer(subset["text"], truncation=True), batched=True)
+
 def tsv_dataset(tokenizer, tsv_file):
     dataset = load_tsv(tsv_file)
-    tokenized_dataset = dataset.map(lambda example: tokenizer(example["text"], truncation=True), batched=True)
-    return tokenized_dataset
+    return tokenize_dataset(tokenizer, dataset)
+
+def json_dataset(tokenizer, json_file):
+    dataset = load_json(json_file)
+    return tokenize_dataset(tokenizer, dataset)
 
 def training_args(*args, **kwargs):
     from transformers import TrainingArguments
     training_args = TrainingArguments(*args, **kwargs)
     return training_args
 
-
-def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
+def train_model(model, tokenizer, training_args, dataset, class_weights=None, **kwargs):
     from transformers import Trainer
 
-    tokenized_dataset = tsv_dataset(tokenizer, tsv_file)
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     if (not class_weights == None):
         import torch
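With this change train_model no longer takes only a TSV path: the dataset argument can be a .json or .tsv file name, or an already-loaded datasets object, and extra keyword arguments pass through to the Trainer. A hedged sketch of the three input forms (the model, file names, and labels are placeholders, not values from the diff):

    from datasets import Dataset, DatasetDict
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    args = training_args("tmp_output", num_train_epochs=1)

    train_model(model, tokenizer, args, "examples.tsv")   # TSV path  -> tsv_dataset
    train_model(model, tokenizer, args, "examples.json")  # JSON path -> json_dataset

    in_memory = DatasetDict(train=Dataset.from_dict(
        {"text": ["positive example", "negative example"], "label": [1, 0]}))
    train_model(model, tokenizer, args, in_memory)        # datasets object -> tokenize_dataset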
@@ -86,7 +110,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
     else:
 
@@ -94,7 +119,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
 
     trainer.train()
@@ -124,10 +150,16 @@ def find_tokens_in_input(dataset, token_ids):
     return position_rows
 
 
-def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
+def predict_model(model, tokenizer, training_args, dataset, locate_tokens = None):
     from transformers import Trainer
 
-    tokenized_dataset = tsv_dataset(tokenizer, tsv_file)
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     trainer = Trainer(
         model,
@@ -143,4 +175,3 @@ def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = Non
     else:
         return result
 
-
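predict_model gains the same dispatch, so prediction runs on the same three input forms. A hedged sketch continuing the placeholders from the train_model example above; exactly what is returned when locate_tokens is set is determined by find_tokens_in_input (token positions) and is not spelled out in this diff:

    result = predict_model(model, tokenizer, args, "examples.json")

    # Optionally locate specific token ids in the tokenized inputs
    mask_id = tokenizer.convert_tokens_to_ids("[MASK]")
    located = predict_model(model, tokenizer, args, in_memory,
                            locate_tokens=[mask_id])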
data/python/rbbt_dm/language_model.py (new file)
@@ -0,0 +1,70 @@
+def group_texts(examples):
+    # Concatenate all texts.
+    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    # We drop the small remainder; padding could be added instead if the
+    # model supported it. Customize this part to your needs.
+    total_length = (total_length // block_size) * block_size
+    # Split by chunks of max_len.
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+def whole_word_masking_data_collator(features):
+    from transformers import default_data_collator
+    for feature in features:
+        word_ids = feature.pop("word_ids")
+
+        # Create a map between words and corresponding token indices
+        mapping = collections.defaultdict(list)
+        current_word_index = -1
+        current_word = None
+        for idx, word_id in enumerate(word_ids):
+            if word_id is not None:
+                if word_id != current_word:
+                    current_word = word_id
+                    current_word_index += 1
+                mapping[current_word_index].append(idx)
+
+        # Randomly mask words
+        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
+        input_ids = feature["input_ids"]
+        labels = feature["labels"]
+        new_labels = [-100] * len(labels)
+        for word_id in np.where(mask)[0]:
+            word_id = word_id.item()
+            for idx in mapping[word_id]:
+                new_labels[idx] = labels[idx]
+                input_ids[idx] = tokenizer.mask_token_id
+        feature["labels"] = new_labels
+
+    return default_data_collator(features)
+
+if __name__ == "__main__2":
+
+    from transformers import AutoModelForMaskedLM
+    from transformers import AutoTokenizer
+    import torch
+
+    model_checkpoint = "distilbert-base-uncased"
+    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+    text = "This is a great [MASK]."
+
+    inputs = tokenizer(text, return_tensors="pt")
+    token_logits = model(**inputs).logits
+    # Find the location of [MASK] and extract its logits
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+    # Pick the [MASK] candidates with the highest logits
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+    for token in top_5_tokens:
+        print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
+
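Note that group_texts and whole_word_masking_data_collator read several module-level names (block_size, wwm_probability, tokenizer, plus collections and numpy as np) that this file never defines; the caller is expected to inject them. A hedged sketch of the wiring, following the Hugging Face masked-LM recipe this code mirrors (all names and values are assumptions for illustration):

    import collections
    import numpy as np
    from datasets import Dataset
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    block_size = 128        # chunk length used by group_texts
    wwm_probability = 0.2   # fraction of whole words masked by the collator

    corpus = Dataset.from_dict({"text": ["first document ...", "second document ..."]})
    tokenized = corpus.map(lambda batch: tokenizer(batch["text"]),
                           batched=True, remove_columns=["text"])
    lm_chunks = tokenized.map(group_texts, batched=True)
    # whole_word_masking_data_collator additionally expects a "word_ids"
    # column on each feature, produced from the fast tokenizer's word_ids().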
data/python/rbbt_dm/util.py (new file)
@@ -0,0 +1,30 @@
+import random
+import torch
+import numpy
+
+def set_seed(seed):
+    """
+    Set seed in several backends
+    """
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+def deterministic():
+    """
+    Ensure that all operations are deterministic on GPU (if used) for
+    reproducibility
+    """
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+def device():
+    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+
+def data_directory():
+    from pathlib import Path
+    print(Path.home())
+
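A hedged usage sketch for these helpers; the import path is an assumption based on the file's location under data/python/rbbt_dm:

    import torch
    from rbbt_dm.util import set_seed, deterministic, device

    set_seed(42)     # seeds random, numpy and torch (CUDA included when present)
    deterministic()  # disables cuDNN autotuning in favour of reproducible kernels
    layer = torch.nn.Linear(4, 2).to(device())  # cuda:0 if available, else cpu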
data/share/spaCy/gpu/textcat_accuracy.conf
@@ -20,7 +20,8 @@ factory = "transformer"
 
 [components.transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v1"
-name = "emilyalsentzer/Bio_ClinicalBERT"
+name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+#name = "emilyalsentzer/Bio_ClinicalBERT"
 tokenizer_config = {"use_fast": true}
 
 [components.transformer.model.get_spans]
data/test/rbbt/vector/model/huggingface/test_masked_lm.rb (new file)
@@ -0,0 +1,41 @@
+require File.join(File.expand_path(File.dirname(__FILE__)),'../../../..', 'test_helper.rb')
+require 'rbbt/vector/model/huggingface/masked_lm'
+
+class TestMaskedLM < Test::Unit::TestCase
+  def test_train_new_word
+    TmpFile.with_file do |dir|
+
+      checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+      mlm = MaskedLMModel.new checkpoint, dir, tokenizer_args: {max_length: 16, model_max_length: 16}
+
+      mod, tokenizer = mlm.init
+      if tokenizer.vocab["[GENE]"].nil?
+        tokenizer.add_tokens("[GENE]")
+        mod.resize_token_embeddings(tokenizer.__len__)
+      end
+
+      100.times do
+        mlm.add "This [GENE] is [MASK] on tumor cells.", %w(expressed)
+        mlm.add "This [MASK] is expressed.", %w([GENE])
+      end
+
+      assert_equal "protein", mlm.eval(["This [MASK] is expressed."])
+
+      mlm.train
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = MaskedLMModel.new checkpoint, dir, :max_length => 16
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = VectorModel.new dir
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+    end
+  end
+end