rbbt-dm 1.2.7 → 1.2.10

Files changed (40)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/matrix/barcode.rb +2 -2
  3. data/lib/rbbt/matrix/differential.rb +3 -3
  4. data/lib/rbbt/matrix/knowledge_base.rb +1 -1
  5. data/lib/rbbt/plots/bar.rb +1 -1
  6. data/lib/rbbt/stan.rb +1 -1
  7. data/lib/rbbt/statistics/hypergeometric.rb +2 -1
  8. data/lib/rbbt/vector/model/huggingface/masked_lm.rb +50 -0
  9. data/lib/rbbt/vector/model/huggingface.rb +39 -52
  10. data/lib/rbbt/vector/model/python.rb +33 -0
  11. data/lib/rbbt/vector/model/pytorch_lightning.rb +31 -0
  12. data/lib/rbbt/vector/model/random_forest.rb +1 -1
  13. data/lib/rbbt/vector/model/spaCy.rb +8 -6
  14. data/lib/rbbt/vector/model/tensorflow.rb +6 -5
  15. data/lib/rbbt/vector/model/torch/dataloader.rb +58 -0
  16. data/lib/rbbt/vector/model/torch/helpers.rb +52 -0
  17. data/lib/rbbt/vector/model/torch/introspection.rb +31 -0
  18. data/lib/rbbt/vector/model/torch/load_and_save.rb +30 -0
  19. data/lib/rbbt/vector/model/torch.rb +71 -0
  20. data/lib/rbbt/vector/model.rb +84 -54
  21. data/python/rbbt_dm/__init__.py +31 -1
  22. data/python/rbbt_dm/atcold/__init__.py +0 -0
  23. data/python/rbbt_dm/atcold/plot_lib.py +141 -0
  24. data/python/rbbt_dm/atcold/spiral.py +27 -0
  25. data/python/rbbt_dm/huggingface.py +64 -28
  26. data/python/rbbt_dm/language_model.py +70 -0
  27. data/python/rbbt_dm/util.py +32 -0
  28. data/share/spaCy/gpu/textcat_accuracy.conf +2 -1
  29. data/test/rbbt/vector/model/huggingface/test_masked_lm.rb +41 -0
  30. data/test/rbbt/vector/model/test_huggingface.rb +258 -27
  31. data/test/rbbt/vector/model/test_python.rb +31 -0
  32. data/test/rbbt/vector/model/test_pytorch_lightning.rb +97 -0
  33. data/test/rbbt/vector/model/test_spaCy.rb +1 -1
  34. data/test/rbbt/vector/model/test_tensorflow.rb +3 -0
  35. data/test/rbbt/vector/model/test_torch.rb +61 -0
  36. data/test/rbbt/vector/test_model.rb +25 -26
  37. data/test/test_helper.rb +13 -0
  38. metadata +35 -16
  39. data/lib/rbbt/tensorflow.rb +0 -43
  40. data/lib/rbbt/vector/model/huggingface.old.rb +0 -160
data/python/rbbt_dm/huggingface.py
@@ -1,32 +1,43 @@
 #{{{ LOAD MODEL
+import datasets
+import rbbt
 
 def import_module_class(module, class_name):
-    exec(f"from {module} import {class_name}")
+    if (not module == None):
+        exec(f"from {module} import {class_name}")
     return eval(class_name)
 
-def load_model(task, checkpoint):
-    class_name = 'AutoModelFor' + task
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+def load_model(task, checkpoint, **kwargs):
+    if (":" in task):
+        module, class_name = task.split(":")
+        if (task == None):
+            module, class_name = None, module
+        return import_module_class(module, class_name).from_pretrained(checkpoint, **kwargs)
+    else:
+        class_name = 'AutoModelFor' + task
+        return import_module_class('transformers', class_name).from_pretrained(checkpoint)
 
-def load_tokenizer(task, checkpoint):
+def load_tokenizer(task, checkpoint, **kwargs):
     class_name = 'AutoTokenizer'
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+    return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
 
 def load_model_and_tokenizer(task, checkpoint):
     model = load_model(task, checkpoint)
     tokenizer = load_tokenizer(task, checkpoint)
     return model, tokenizer
 
-def load_model_and_tokenizer_from_directory(directory):
-    import os
-    import json
-    options_file = os.path.join(directory, 'options.json')
-    f = open(options_file, "r")
-    options = json.load(f.read())
-    f.close()
-    task = options["task"]
-    checkpoint = options["checkpoint"]
-    return load_model_and_tokenizer(task, checkpoint)
+# Not used
+
+#def load_model_and_tokenizer_from_directory(directory):
+#    import os
+#    import json
+#    options_file = os.path.join(directory, 'options.json')
+#    f = open(options_file, "r")
+#    options = json.load(f.read())
+#    f.close()
+#    task = options["task"]
+#    checkpoint = options["checkpoint"]
+#    return load_model_and_tokenizer(task, checkpoint)
 
 #{{{ SIMPLE EVALUATE
 
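The reworked load_model dispatches on the task string: a bare transformers auto-class suffix resolves through the corresponding AutoModelFor class, while a module:ClassName pair imports an arbitrary class and calls its from_pretrained. A minimal sketch of the two call styles (checkpoint, module, and class names are illustrative, not from this release):

    # Auto class: resolves to transformers.AutoModelForSequenceClassification
    model = load_model("SequenceClassification", "distilbert-base-uncased")

    # Explicit class: imports MyModel from my_package.models, then from_pretrained
    model = load_model("my_package.models:MyModel", "/path/to/checkpoint")
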
@@ -48,24 +59,42 @@ def eval_model(model, tokenizer, texts, return_logits = True):
 #{{{ TRAIN AND PREDICT
 
 def load_tsv(tsv_file):
-    from datasets import load_dataset
-    return load_dataset('csv', data_files=[tsv_file], sep="\t")
+    tsv = rbbt.tsv(tsv_file)
+    print(tsv)
+    ds = datasets.Dataset.from_pandas(tsv)
+    d = datasets.DatasetDict()
+    d["train"] = ds
+    return d
+
+def load_json(json_file):
+    return datasets.load_dataset('json', data_files=[json_file])
+
+def tokenize_dataset(tokenizer, dataset):
+    return dataset.map(lambda subset: subset if ("input_ids" in subset.keys()) else tokenizer(subset["text"], truncation=True), batched=True)
 
 def tsv_dataset(tokenizer, tsv_file):
     dataset = load_tsv(tsv_file)
-    tokenized_dataset = dataset.map(lambda example: tokenizer(example["text"], truncation=True, max_length=512) , batched=True)
-    return tokenized_dataset
+    return tokenize_dataset(tokenizer, dataset)
+
+def json_dataset(tokenizer, json_file):
+    dataset = load_json(json_file)
+    return tokenize_dataset(tokenizer, dataset)
 
 def training_args(*args, **kwargs):
     from transformers import TrainingArguments
     training_args = TrainingArguments(*args, **kwargs)
     return training_args
 
-
-def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
+def train_model(model, tokenizer, training_args, dataset, class_weights=None, **kwargs):
     from transformers import Trainer
 
-    tokenized_dataset = tsv_dataset(tokenizer, tsv_file)
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     if (not class_weights == None):
         import torch
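train_model now dispatches on its dataset argument: a string ending in .json is loaded with json_dataset, any other string is treated as an rbbt TSV file, and anything else is passed straight to tokenize_dataset as an in-memory datasets object. A sketch of the three entry points (file names, the output directory, and dataset_dict are illustrative):

    args = training_args("training_output", num_train_epochs=1)

    train_model(model, tokenizer, args, "examples.json")  # JSON file via datasets.load_dataset
    train_model(model, tokenizer, args, "examples.tsv")   # TSV file via rbbt.tsv
    train_model(model, tokenizer, args, dataset_dict)     # datasets.DatasetDict built elsewhere
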
@@ -86,7 +115,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
     else:
 
@@ -94,7 +124,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
 
     trainer.train()
@@ -124,10 +155,16 @@ def find_tokens_in_input(dataset, token_ids):
     return position_rows
 
 
-def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
+def predict_model(model, tokenizer, training_args, dataset, locate_tokens = None):
     from transformers import Trainer
 
-    tokenized_dataset = tsv_dataset(tokenizer, tsv_file)
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     trainer = Trainer(
         model,
@@ -143,4 +180,3 @@ def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
     else:
         return result
 
-
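predict_model applies the same string/object dispatch, and locate_tokens is forwarded to find_tokens_in_input so the positions of particular token ids can be recovered from the tokenized input. A sketch, assuming locate_tokens takes the token ids to search for and that args and the file name are as in the training example above:

    mask_id = tokenizer.mask_token_id
    result = predict_model(model, tokenizer, args, "examples.tsv", locate_tokens=[mask_id])
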
data/python/rbbt_dm/language_model.py (new file)
@@ -0,0 +1,70 @@
+def group_texts(examples):
+    # Concatenate all texts.
+    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+    # customize this part to your needs.
+    total_length = (total_length // block_size) * block_size
+    # Split by chunks of max_len.
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+def whole_word_masking_data_collator(features):
+    from transformers import default_data_collator
+    for feature in features:
+        word_ids = feature.pop("word_ids")
+
+        # Create a map between words and corresponding token indices
+        mapping = collections.defaultdict(list)
+        current_word_index = -1
+        current_word = None
+        for idx, word_id in enumerate(word_ids):
+            if word_id is not None:
+                if word_id != current_word:
+                    current_word = word_id
+                    current_word_index += 1
+                mapping[current_word_index].append(idx)
+
+        # Randomly mask words
+        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
+        input_ids = feature["input_ids"]
+        labels = feature["labels"]
+        new_labels = [-100] * len(labels)
+        for word_id in np.where(mask)[0]:
+            word_id = word_id.item()
+            for idx in mapping[word_id]:
+                new_labels[idx] = labels[idx]
+                input_ids[idx] = tokenizer.mask_token_id
+        feature["labels"] = new_labels
+
+    return default_data_collator(features)
+
+if __name__ == "__main__2":
+
+    from transformers import AutoModelForMaskedLM
+    from transformers import AutoTokenizer
+    import torch
+
+    model_checkpoint = "distilbert-base-uncased"
+    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+    text = "This is a great [MASK]."
+
+    inputs = tokenizer(text, return_tensors="pt")
+    token_logits = model(**inputs).logits
+    # Find the location of [MASK] and extract its logits
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+    # Pick the [MASK] candidates with the highest logits
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+    for token in top_5_tokens:
+        print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
+
+
+
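group_texts and whole_word_masking_data_collator follow the Hugging Face masked-language-modeling recipe and read several names from module scope (block_size, wwm_probability, tokenizer, plus collections and np) that the caller must bind first; the collator also expects each feature to carry word_ids from a fast tokenizer. A sketch of the expected setup, with illustrative values and a hypothetical "text" column:

    import collections
    import numpy as np
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    block_size = 128        # chunk length used by group_texts
    wwm_probability = 0.15  # fraction of whole words to mask

    # Tokenize a batched dataset and keep per-example word_ids for the collator
    def tokenize_with_word_ids(examples):
        result = tokenizer(examples["text"])
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
        return result
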
data/python/rbbt_dm/util.py (new file)
@@ -0,0 +1,32 @@
+import random
+import torch
+import numpy
+
+def set_seed(seed):
+    """
+    Set seed in several backends
+    """
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+def deterministic():
+    """
+    Ensure that all operations are deterministic on GPU (if used) for
+    reproducibility
+    """
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+def device():
+    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+
+def data_directory():
+    from pathlib import Path
+    print(Path.home())
+
+def model_device(model):
+    return next(model.parameters()).device
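The util.py helpers are typically called together at the start of a run; a minimal usage sketch, assuming the rbbt_dm package is on the Python path and model is defined elsewhere:

    from rbbt_dm import util

    util.set_seed(42)     # seed random, numpy and torch (plus CUDA when present)
    util.deterministic()  # force deterministic cuDNN kernels
    dev = util.device()   # cuda:0 if available, otherwise cpu
    model = model.to(dev) # model_device(model) reads the placement back
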
data/share/spaCy/gpu/textcat_accuracy.conf
@@ -20,7 +20,8 @@ factory = "transformer"
 
 [components.transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v1"
-name = "emilyalsentzer/Bio_ClinicalBERT"
+name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+#name = "emilyalsentzer/Bio_ClinicalBERT"
 tokenizer_config = {"use_fast": true}
 
 [components.transformer.model.get_spans]
data/test/rbbt/vector/model/huggingface/test_masked_lm.rb (new file)
@@ -0,0 +1,41 @@
+require File.join(File.expand_path(File.dirname(__FILE__)),'../../../..', 'test_helper.rb')
+require 'rbbt/vector/model/huggingface/masked_lm'
+
+class TestMaskedLM < Test::Unit::TestCase
+  def test_train_new_word
+    TmpFile.with_file do |dir|
+
+      checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+      mlm = MaskedLMModel.new checkpoint, dir, tokenizer_args: {max_length: 16, model_max_length: 16}
+
+      mod, tokenizer = mlm.init
+      if tokenizer.vocab["[GENE]"].nil?
+        tokenizer.add_tokens("[GENE]")
+        mod.resize_token_embeddings(tokenizer.__len__)
+      end
+
+      100.times do
+        mlm.add "This [GENE] is [MASK] on tumor cells.", %w(expressed)
+        mlm.add "This [MASK] is expressed.", %w([GENE])
+      end
+
+      assert_equal "protein", mlm.eval(["This [MASK] is expressed."])
+
+      mlm.train
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = MaskedLMModel.new checkpoint, dir, :max_length => 16
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = VectorModel.new dir
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+    end
+  end
+end