rbbt-dm 1.2.6 → 1.2.9

Files changed (34)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/matrix/barcode.rb +2 -2
  3. data/lib/rbbt/matrix/differential.rb +3 -3
  4. data/lib/rbbt/matrix/knowledge_base.rb +1 -1
  5. data/lib/rbbt/plots/bar.rb +1 -1
  6. data/lib/rbbt/stan.rb +1 -1
  7. data/lib/rbbt/statistics/hypergeometric.rb +2 -1
  8. data/lib/rbbt/vector/model/huggingface/masked_lm.rb +50 -0
  9. data/lib/rbbt/vector/model/huggingface.rb +57 -38
  10. data/lib/rbbt/vector/model/pytorch_lightning.rb +35 -0
  11. data/lib/rbbt/vector/model/random_forest.rb +1 -1
  12. data/lib/rbbt/vector/model/spaCy.rb +8 -14
  13. data/lib/rbbt/vector/model/tensorflow.rb +6 -5
  14. data/lib/rbbt/vector/model/torch.rb +37 -0
  15. data/lib/rbbt/vector/model/util.rb +18 -0
  16. data/lib/rbbt/vector/model.rb +100 -56
  17. data/python/rbbt_dm/__init__.py +48 -1
  18. data/python/rbbt_dm/atcold/__init__.py +0 -0
  19. data/python/rbbt_dm/atcold/plot_lib.py +141 -0
  20. data/python/rbbt_dm/atcold/spiral.py +27 -0
  21. data/python/rbbt_dm/huggingface.py +57 -26
  22. data/python/rbbt_dm/language_model.py +70 -0
  23. data/python/rbbt_dm/util.py +30 -0
  24. data/share/spaCy/gpu/textcat_accuracy.conf +2 -1
  25. data/test/rbbt/vector/model/huggingface/test_masked_lm.rb +41 -0
  26. data/test/rbbt/vector/model/test_huggingface.rb +258 -27
  27. data/test/rbbt/vector/model/test_pytorch_lightning.rb +83 -0
  28. data/test/rbbt/vector/model/test_spaCy.rb +1 -1
  29. data/test/rbbt/vector/model/test_tensorflow.rb +3 -0
  30. data/test/rbbt/vector/test_model.rb +25 -26
  31. data/test/test_helper.rb +13 -0
  32. metadata +26 -16
  33. data/lib/rbbt/tensorflow.rb +0 -43
  34. data/lib/rbbt/vector/model/huggingface.old.rb +0 -160
data/python/rbbt_dm/huggingface.py
@@ -1,32 +1,41 @@
 #{{{ LOAD MODEL
 
 def import_module_class(module, class_name):
-    exec(f"from {module} import {class_name}")
+    if (not module == None):
+        exec(f"from {module} import {class_name}")
     return eval(class_name)
 
-def load_model(task, checkpoint):
-    class_name = 'AutoModelFor' + task
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+def load_model(task, checkpoint, **kwargs):
+    if (":" in task):
+        module, class_name = task.split(":")
+        if (task == None):
+            module, class_name = None, module
+        return import_module_class(module, class_name).from_pretrained(checkpoint, **kwargs)
+    else:
+        class_name = 'AutoModelFor' + task
+        return import_module_class('transformers', class_name).from_pretrained(checkpoint)
 
-def load_tokenizer(task, checkpoint):
+def load_tokenizer(task, checkpoint, **kwargs):
     class_name = 'AutoTokenizer'
-    return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+    return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
 
 def load_model_and_tokenizer(task, checkpoint):
     model = load_model(task, checkpoint)
     tokenizer = load_tokenizer(task, checkpoint)
     return model, tokenizer
 
-def load_model_and_tokenizer_from_directory(directory):
-    import os
-    import json
-    options_file = os.path.join(directory, 'options.json')
-    f = open(options_file, "r")
-    options = json.load(f.read())
-    f.close()
-    task = options["task"]
-    checkpoint = options["checkpoint"]
-    return load_model_and_tokenizer(task, checkpoint)
+# Not used
+
+#def load_model_and_tokenizer_from_directory(directory):
+#    import os
+#    import json
+#    options_file = os.path.join(directory, 'options.json')
+#    f = open(options_file, "r")
+#    options = json.load(f.read())
+#    f.close()
+#    task = options["task"]
+#    checkpoint = options["checkpoint"]
+#    return load_model_and_tokenizer(task, checkpoint)
 
 #{{{ SIMPLE EVALUATE
 
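The reworked `load_model` accepts a task string of the form `module:ClassName` in addition to the `AutoModelFor*` shorthand, and forwards keyword arguments to `from_pretrained` (only on the explicit path; the shorthand branch still drops them). Note that the inner `if (task == None)` guard is unreachable, since that branch only runs when `task` contains a colon; it presumably meant to handle an empty module part. A minimal usage sketch, with illustrative checkpoint names and keyword arguments:

    # Hedged sketch; checkpoints and kwargs are placeholders, not from the diff,
    # and the module's functions are assumed to be in scope.
    # AutoModel shorthand: resolves to transformers.AutoModelForSequenceClassification.
    model = load_model("SequenceClassification", "distilbert-base-uncased")

    # Explicit module:ClassName form: imports BertForMaskedLM from transformers
    # and passes the extra keyword argument through to from_pretrained.
    model = load_model("transformers:BertForMaskedLM", "bert-base-uncased", output_attentions=True)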
@@ -51,21 +60,36 @@ def load_tsv(tsv_file):
     from datasets import load_dataset
     return load_dataset('csv', data_files=[tsv_file], sep="\t")
 
+def load_json(json_file):
+    from datasets import load_dataset
+    return load_dataset('json', data_files=[json_file])
+
+def tokenize_dataset(tokenizer, dataset):
+    return dataset.map(lambda subset: subset if ("input_ids" in subset.keys()) else tokenizer(subset["text"], truncation=True), batched=True)
+
 def tsv_dataset(tokenizer, tsv_file):
     dataset = load_tsv(tsv_file)
-    tokenized_dataset = dataset.map(lambda example: tokenizer(example["text"], truncation=True) , batched=True)
-    return tokenized_dataset
+    return tokenize_dataset(tokenizer, dataset)
+
+def json_dataset(tokenizer, json_file):
+    dataset = load_json(json_file)
+    return tokenize_dataset(tokenizer, dataset)
 
 def training_args(*args, **kwargs):
     from transformers import TrainingArguments
     training_args = TrainingArguments(*args, **kwargs)
     return training_args
 
-
-def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
+def train_model(model, tokenizer, training_args, dataset, class_weights=None, **kwargs):
     from transformers import Trainer
 
-    tokenized_dataset = tsv_dataset(tokenizer, tsv_file)
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     if (not class_weights == None):
         import torch
@@ -86,7 +110,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
     else:
 
@@ -94,7 +119,8 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
             model,
             training_args,
             train_dataset = tokenized_dataset["train"],
-            tokenizer = tokenizer
+            tokenizer = tokenizer,
+            **kwargs
         )
 
     trainer.train()
@@ -124,10 +150,16 @@ def find_tokens_in_input(dataset, token_ids):
     return position_rows
 
 
-def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
+def predict_model(model, tokenizer, training_args, dataset, locate_tokens = None):
     from transformers import Trainer
 
-    tokenized_dataset = tsv_dataset(tokenizer, tsv_file)
+    if (isinstance(dataset, str)):
+        if (dataset.endswith('.json')):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
 
     trainer = Trainer(
         model,
@@ -143,4 +175,3 @@ def predict_model(model, tokenizer, training_args, tsv_file, locate_tokens = None):
     else:
         return result
 
-
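Taken together, `train_model` and `predict_model` now dispatch on the `dataset` argument: a string ending in `.json` goes through `json_dataset`, any other string is treated as a TSV path, and a non-string is assumed to be an already-loaded dataset and handed to `tokenize_dataset` (which skips tokenization when `input_ids` are already present). A hedged sketch of the three entry points; paths and arguments are placeholders:

    # Illustrative only; file paths and the output directory are placeholders.
    model, tokenizer = load_model_and_tokenizer("SequenceClassification", "distilbert-base-uncased")
    args = training_args("output", num_train_epochs=1)

    train_model(model, tokenizer, args, "examples.tsv")   # TSV path        -> tsv_dataset
    train_model(model, tokenizer, args, "examples.json")  # JSON path       -> json_dataset
    train_model(model, tokenizer, args, some_dataset)     # datasets object -> tokenize_dataset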
data/python/rbbt_dm/language_model.py (new file)
@@ -0,0 +1,70 @@
+def group_texts(examples):
+    # Concatenate all texts.
+    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+    # customize this part to your needs.
+    total_length = (total_length // block_size) * block_size
+    # Split by chunks of max_len.
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+def whole_word_masking_data_collator(features):
+    from transformers import default_data_collator
+    for feature in features:
+        word_ids = feature.pop("word_ids")
+
+        # Create a map between words and corresponding token indices
+        mapping = collections.defaultdict(list)
+        current_word_index = -1
+        current_word = None
+        for idx, word_id in enumerate(word_ids):
+            if word_id is not None:
+                if word_id != current_word:
+                    current_word = word_id
+                    current_word_index += 1
+                mapping[current_word_index].append(idx)
+
+        # Randomly mask words
+        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
+        input_ids = feature["input_ids"]
+        labels = feature["labels"]
+        new_labels = [-100] * len(labels)
+        for word_id in np.where(mask)[0]:
+            word_id = word_id.item()
+            for idx in mapping[word_id]:
+                new_labels[idx] = labels[idx]
+                input_ids[idx] = tokenizer.mask_token_id
+        feature["labels"] = new_labels
+
+    return default_data_collator(features)
+
+if __name__ == "__main__2":
+
+    from transformers import AutoModelForMaskedLM
+    from transformers import AutoTokenizer
+    import torch
+
+    model_checkpoint = "distilbert-base-uncased"
+    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+    text = "This is a great [MASK]."
+
+    inputs = tokenizer(text, return_tensors="pt")
+    token_logits = model(**inputs).logits
+    # Find the location of [MASK] and extract its logits
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+    # Pick the [MASK] candidates with the highest logits
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+    for token in top_5_tokens:
+        print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
+
+
+
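The two helpers follow the familiar Hugging Face whole-word-masking recipe, but rely on several names not defined in the file as shown: `block_size`, `wwm_probability`, `tokenizer`, `np`, and `collections` must exist at module scope before they run (and the `__main__2` guard keeps the demo block from ever executing as a script). A sketch of the bindings the collator appears to assume; values are illustrative:

    # Assumed module-level bindings; not present in the shipped diff itself.
    import collections
    import numpy as np
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    block_size = 128          # chunk length used by group_texts
    wwm_probability = 0.2     # per-word masking probability

    # Each feature passed to whole_word_masking_data_collator must also carry a
    # "word_ids" entry, e.g. saved from tokenizer(...).word_ids() during mapping.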
data/python/rbbt_dm/util.py (new file)
@@ -0,0 +1,30 @@
+import random
+import torch
+import numpy
+
+def set_seed(seed):
+    """
+    Set seed in several backends
+    """
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+def deterministic():
+    """
+    Ensure that all operations are deterministic on GPU (if used) for
+    reproducibility
+    """
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+def device():
+    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+
+def data_directory():
+    from pathlib import Path
+    print(Path.home())
+
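The new `util.py` centralizes reproducibility setup (`data_directory` is apparently a stub that only prints the home path). A typical call sequence; the import path is assumed from the file's location in the package:

    from rbbt_dm.util import set_seed, deterministic, device  # assumed import path

    set_seed(42)        # seeds random, numpy and torch (incl. CUDA when available)
    deterministic()     # forces deterministic cuDNN kernels
    model.to(device())  # 'model' stands in for any torch.nn.Module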
data/share/spaCy/gpu/textcat_accuracy.conf
@@ -20,7 +20,8 @@ factory = "transformer"
 
 [components.transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v1"
-name = "emilyalsentzer/Bio_ClinicalBERT"
+name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+#name = "emilyalsentzer/Bio_ClinicalBERT"
 tokenizer_config = {"use_fast": true}
 
 [components.transformer.model.get_spans]
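The GPU textcat config swaps its transformer backbone from Bio_ClinicalBERT to PubMedBERT, keeping the old name as a comment. Both are Hugging Face checkpoints, so the new name can be sanity-checked outside spaCy; a hedged sketch, not part of the package:

    # Quick check that the new checkpoint resolves (illustrative only).
    from transformers import AutoModel, AutoTokenizer

    name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
    tokenizer = AutoTokenizer.from_pretrained(name, use_fast=True)  # matches tokenizer_config
    model = AutoModel.from_pretrained(name)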
data/test/rbbt/vector/model/huggingface/test_masked_lm.rb (new file)
@@ -0,0 +1,41 @@
+require File.join(File.expand_path(File.dirname(__FILE__)),'../../../..', 'test_helper.rb')
+require 'rbbt/vector/model/huggingface/masked_lm'
+
+class TestMaskedLM < Test::Unit::TestCase
+  def test_train_new_word
+    TmpFile.with_file do |dir|
+
+      checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
+      mlm = MaskedLMModel.new checkpoint, dir, tokenizer_args: {max_length: 16, model_max_length: 16}
+
+      mod, tokenizer = mlm.init
+      if tokenizer.vocab["[GENE]"].nil?
+        tokenizer.add_tokens("[GENE]")
+        mod.resize_token_embeddings(tokenizer.__len__)
+      end
+
+      100.times do
+        mlm.add "This [GENE] is [MASK] on tumor cells.", %w(expressed)
+        mlm.add "This [MASK] is expressed.", %w([GENE])
+      end
+
+      assert_equal "protein", mlm.eval(["This [MASK] is expressed."])
+
+      mlm.train
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = MaskedLMModel.new checkpoint, dir, :max_length => 16
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+      mlm = VectorModel.new dir
+
+      assert_equal "[GENE]", mlm.eval(["This [MASK] is expressed."])
+      assert_equal "expressed", mlm.eval(["This [GENE] is [MASK] in tumor cells."])
+
+    end
+  end
+end