rbbt-dm 1.2.7 → 1.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/matrix/barcode.rb +2 -2
  3. data/lib/rbbt/matrix/differential.rb +3 -3
  4. data/lib/rbbt/matrix/knowledge_base.rb +1 -1
  5. data/lib/rbbt/plots/bar.rb +1 -1
  6. data/lib/rbbt/stan.rb +1 -1
  7. data/lib/rbbt/statistics/hypergeometric.rb +2 -1
  8. data/lib/rbbt/vector/model/huggingface/masked_lm.rb +50 -0
  9. data/lib/rbbt/vector/model/huggingface.rb +39 -52
  10. data/lib/rbbt/vector/model/python.rb +33 -0
  11. data/lib/rbbt/vector/model/pytorch_lightning.rb +31 -0
  12. data/lib/rbbt/vector/model/random_forest.rb +1 -1
  13. data/lib/rbbt/vector/model/spaCy.rb +8 -6
  14. data/lib/rbbt/vector/model/tensorflow.rb +6 -5
  15. data/lib/rbbt/vector/model/torch/dataloader.rb +58 -0
  16. data/lib/rbbt/vector/model/torch/helpers.rb +52 -0
  17. data/lib/rbbt/vector/model/torch/introspection.rb +31 -0
  18. data/lib/rbbt/vector/model/torch/load_and_save.rb +30 -0
  19. data/lib/rbbt/vector/model/torch.rb +71 -0
  20. data/lib/rbbt/vector/model.rb +84 -54
  21. data/python/rbbt_dm/__init__.py +31 -1
  22. data/python/rbbt_dm/atcold/__init__.py +0 -0
  23. data/python/rbbt_dm/atcold/plot_lib.py +141 -0
  24. data/python/rbbt_dm/atcold/spiral.py +27 -0
  25. data/python/rbbt_dm/huggingface.py +64 -28
  26. data/python/rbbt_dm/language_model.py +70 -0
  27. data/python/rbbt_dm/util.py +32 -0
  28. data/share/spaCy/gpu/textcat_accuracy.conf +2 -1
  29. data/test/rbbt/vector/model/huggingface/test_masked_lm.rb +41 -0
  30. data/test/rbbt/vector/model/test_huggingface.rb +258 -27
  31. data/test/rbbt/vector/model/test_python.rb +31 -0
  32. data/test/rbbt/vector/model/test_pytorch_lightning.rb +97 -0
  33. data/test/rbbt/vector/model/test_spaCy.rb +1 -1
  34. data/test/rbbt/vector/model/test_tensorflow.rb +3 -0
  35. data/test/rbbt/vector/model/test_torch.rb +61 -0
  36. data/test/rbbt/vector/test_model.rb +25 -26
  37. data/test/test_helper.rb +13 -0
  38. metadata +35 -16
  39. data/lib/rbbt/tensorflow.rb +0 -43
  40. data/lib/rbbt/vector/model/huggingface.old.rb +0 -160
data/python/rbbt_dm/huggingface.py

@@ -1,32 +1,43 @@
 #{{{ LOAD MODEL
+import datasets
+import rbbt
 
 def import_module_class(module, class_name):
-    exec(f"from {module} import {class_name}")
+    if (not module == None):
+        exec(f"from {module} import {class_name}")
     return eval(class_name)
 
-def load_model(task, checkpoint):
-    class_name = 'AutoModelFor' + task
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+def load_model(task, checkpoint, **kwargs):
+    if (":" in task):
+        module, class_name = task.split(":")
+        if (task == None):
+            module, class_name = None, module
+        return import_module_class(module, class_name).from_pretrained(checkpoint, **kwargs)
+    else:
+        class_name = 'AutoModelFor' + task
+        return import_module_class('transformers', class_name).from_pretrained(checkpoint)
 
-def load_tokenizer(task, checkpoint):
+def load_tokenizer(task, checkpoint, **kwargs):
     class_name = 'AutoTokenizer'
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+    return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
 
 def load_model_and_tokenizer(task, checkpoint):
     model = load_model(task, checkpoint)
     tokenizer = load_tokenizer(task, checkpoint)
     return model, tokenizer
 
-def load_model_and_tokenizer_from_directory(directory):
-    import os
-    import json
-    options_file = os.path.join(directory, 'options.json')
-    f = open(options_file, "r")
-    options = json.load(f.read())
-    f.close()
-    task = options["task"]
-    checkpoint = options["checkpoint"]
-    return load_model_and_tokenizer(task, checkpoint)
+# Not used
+
+#def load_model_and_tokenizer_from_directory(directory):
+#    import os
+#    import json
+#    options_file = os.path.join(directory, 'options.json')
+#    f = open(options_file, "r")
+#    options = json.load(f.read())
+#    f.close()
+#    task = options["task"]
+#    checkpoint = options["checkpoint"]
+#    return load_model_and_tokenizer(task, checkpoint)
 
 #{{{ SIMPLE EVALUATE
 
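The reworked load_model accepts either a transformers AutoModelFor* task suffix or an explicit "module:ClassName" specifier, so model classes outside the Auto* families can be loaded, and keyword arguments are now forwarded to from_pretrained. A minimal usage sketch; the checkpoint names below are illustrative, not taken from the diff:

    # Task suffix: resolves to transformers.AutoModelForSequenceClassification
    model = load_model("SequenceClassification", "bert-base-uncased")

    # Explicit module and class name
    model = load_model("transformers:BertForMaskedLM", "bert-base-uncased")

    # Tokenizer kwargs reach AutoTokenizer.from_pretrained (task is ignored)
    tokenizer = load_tokenizer(None, "bert-base-uncased", model_max_length=128)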
@@ -48,24 +59,42 @@ def eval_model(model, tokenizer, texts, return_logits = True):
 #{{{ TRAIN AND PREDICT
 
 def load_tsv(tsv_file):
-    from datasets import load_dataset
-    return load_dataset('csv', data_files=[tsv_file], sep="\t")
+    tsv = rbbt.tsv(tsv_file)
+    print(tsv)
+    ds = datasets.Dataset.from_pandas(tsv)
+    d = datasets.DatasetDict()
+    d["train"] = ds
+    return d
+
+def load_json(json_file):
+    return datasets.load_dataset('json', data_files=[json_file])
+
+def tokenize_dataset(tokenizer, dataset):
+    return dataset.map(lambda subset: subset if ("input_ids" in subset.keys()) else tokenizer(subset["text"], truncation=True), batched=True)
 
 def tsv_dataset(tokenizer, tsv_file):
     dataset = load_tsv(tsv_file)
-    tokenized_dataset = dataset.map(lambda example: tokenizer(example["text"], truncation=True, max_length=512) , batched=True)
-    return tokenized_dataset
+    return tokenize_dataset(tokenizer, dataset)
+
+def json_dataset(tokenizer, json_file):
+    dataset = load_json(json_file)
+    return tokenize_dataset(tokenizer, dataset)
 
 def training_args(*args, **kwargs):
     from transformers import TrainingArguments
     training_args = TrainingArguments(*args, **kwargs)
     return training_args
 
-
-def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
+def train_model(model, tokenizer, training_args, dataset, class_weights=None, **kwargs):
     from transformers import Trainer
 
-    tokenized_dataset = tsv_dataset(tokenizer, tsv_file)
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     if (not class_weights == None):
         import torch
@@ -86,7 +115,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
     else:
 
@@ -94,7 +124,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
 
     trainer.train()
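train_model now dispatches on its dataset argument: a path ending in .json, any other string treated as a TSV path, or an in-memory datasets object. Extra keyword arguments are forwarded to the Trainer. A hedged sketch of the call forms; the file names, dataset_dict, my_collator, and the TrainingArguments values are made up for illustration:

    args = training_args("/tmp/output", num_train_epochs=1)

    train_model(model, tokenizer, args, "examples.json")  # JSON file
    train_model(model, tokenizer, args, "examples.tsv")   # TSV file, loaded through rbbt.tsv
    train_model(model, tokenizer, args, dataset_dict,     # prebuilt DatasetDict with a "train" split
                data_collator=my_collator)                # **kwargs reach the Trainer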
@@ -124,10 +155,16 @@ def find_tokens_in_input(dataset, token_ids):
     return position_rows
 
 
-def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
+def predict_model(model, tokenizer, training_args, dataset, locate_tokens = None):
     from transformers import Trainer
 
-    tokenized_dataset = tsv_dataset(tokenizer, tsv_file)
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     trainer = Trainer(
         model,
@@ -143,4 +180,3 @@ def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
     else:
         return result
 
-
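predict_model gains the same dataset dispatch, so evaluation accepts the same JSON, TSV, or datasets inputs used for training. A one-line sketch with a placeholder file name:

    result = predict_model(model, tokenizer, args, "examples.tsv")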
data/python/rbbt_dm/language_model.py (new file)

@@ -0,0 +1,70 @@
+def group_texts(examples):
+    # Concatenate all texts.
+    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+    # customize this part to your needs.
+    total_length = (total_length // block_size) * block_size
+    # Split by chunks of max_len.
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+def whole_word_masking_data_collator(features):
+    from transformers import default_data_collator
+    for feature in features:
+        word_ids = feature.pop("word_ids")
+
+        # Create a map between words and corresponding token indices
+        mapping = collections.defaultdict(list)
+        current_word_index = -1
+        current_word = None
+        for idx, word_id in enumerate(word_ids):
+            if word_id is not None:
+                if word_id != current_word:
+                    current_word = word_id
+                    current_word_index += 1
+                mapping[current_word_index].append(idx)
+
+        # Randomly mask words
+        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
+        input_ids = feature["input_ids"]
+        labels = feature["labels"]
+        new_labels = [-100] * len(labels)
+        for word_id in np.where(mask)[0]:
+            word_id = word_id.item()
+            for idx in mapping[word_id]:
+                new_labels[idx] = labels[idx]
+                input_ids[idx] = tokenizer.mask_token_id
+        feature["labels"] = new_labels
+
+    return default_data_collator(features)
+
+if __name__ == "__main__2":
+
+    from transformers import AutoModelForMaskedLM
+    from transformers import AutoTokenizer
+    import torch
+
+    model_checkpoint = "distilbert-base-uncased"
+    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+    text = "This is a great [MASK]."
+
+    inputs = tokenizer(text, return_tensors="pt")
+    token_logits = model(**inputs).logits
+    # Find the location of [MASK] and extract its logits
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+    # Pick the [MASK] candidates with the highest logits
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+    for token in top_5_tokens:
+        print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
+
+
+
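group_texts and whole_word_masking_data_collator follow the Hugging Face masked-language-modeling recipe and read several module-level names (block_size, wwm_probability, tokenizer, plus the collections and numpy imports) that this file does not define; callers must bind them first. A sketch of the bindings the code appears to assume, where tokenized_dataset stands in for a dataset already tokenized with the same tokenizer:

    import collections            # used by whole_word_masking_data_collator
    import numpy as np            # np.random.binomial / np.where

    block_size = 128              # chunk length used by group_texts
    wwm_probability = 0.2         # fraction of words to mask
    # tokenizer: a loaded transformers tokenizer, also expected in module scope

    lm_dataset = tokenized_dataset.map(group_texts, batched=True)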
data/python/rbbt_dm/util.py (new file)

@@ -0,0 +1,32 @@
+import random
+import torch
+import numpy
+
+def set_seed(seed):
+    """
+    Set seed in several backends
+    """
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+def deterministic():
+    """
+    Ensure that all operations are deterministic on GPU (if used) for
+    reproducibility
+    """
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+def device():
+    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+
+def data_directory():
+    from pathlib import Path
+    print(Path.home())
+
+def model_device(model):
+    return next(model.parameters()).device
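These helpers bundle the usual PyTorch reproducibility boilerplate. A usage sketch, assuming the module is importable as rbbt_dm.util (the import path is inferred from the file location, not stated in the diff):

    import torch
    from rbbt_dm import util

    util.set_seed(42)       # seeds random, numpy and torch (CUDA too when available)
    util.deterministic()    # reproducible cuDNN kernels, at some speed cost
    model = torch.nn.Linear(2, 2).to(util.device())
    print(util.model_device(model))   # cuda:0 if available, else cpu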
data/share/spaCy/gpu/textcat_accuracy.conf

@@ -20,7 +20,8 @@ factory = "transformer"
 
 [components.transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v1"
-name = "emilyalsentzer/Bio_ClinicalBERT"
+name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+#name = "emilyalsentzer/Bio_ClinicalBERT"
 tokenizer_config = {"use_fast": true}
 
 [components.transformer.model.get_spans]
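The GPU textcat configuration swaps its transformer backbone from Bio_ClinicalBERT to PubMedBERT. Assuming the standard spaCy 3 workflow (the output path below is a placeholder), a config like this would be trained with:

    python -m spacy train textcat_accuracy.conf --output ./textcat_model --gpu-id 0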
data/test/rbbt/vector/model/huggingface/test_masked_lm.rb (new file)

@@ -0,0 +1,41 @@
+require File.join(File.expand_path(File.dirname(__FILE__)),'../../../..', 'test_helper.rb')
+require 'rbbt/vector/model/huggingface/masked_lm'
+
+class TestMaskedLM < Test::Unit::TestCase
+  def test_train_new_word
+    TmpFile.with_file do |dir|
+
+      checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+      mlm = MaskedLMModel.new checkpoint, dir, tokenizer_args: {max_length: 16, model_max_length: 16}
+
+      mod, tokenizer = mlm.init
+      if tokenizer.vocab["[GENE]"].nil?
+        tokenizer.add_tokens("[GENE]")
+        mod.resize_token_embeddings(tokenizer.__len__)
+      end
+
+      100.times do
+        mlm.add "This [GENE] is [MASK] on tumor cells.", %w(expressed)
+        mlm.add "This [MASK] is expressed.", %w([GENE])
+      end
+
+      assert_equal "protein", mlm.eval(["This [MASK] is expressed."])
+
+      mlm.train
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = MaskedLMModel.new checkpoint, dir, :max_length => 16
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = VectorModel.new dir
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+    end
+  end
+end