rbbt-dm 1.3.1 → 1.3.2
- checksums.yaml +4 -4
- data/lib/rbbt/matrix/differential.rb +1 -1
- data/lib/rbbt/matrix.rb +14 -3
- data/lib/rbbt/stan.rb +3 -4
- data/lib/rbbt/statistics/fdr.rb +2 -5
- data/lib/rbbt/statistics/hypergeometric.rb +12 -8
- data/lib/rbbt/vector/model/huggingface.rb +53 -43
- data/lib/rbbt/vector/model/python.rb +1 -1
- data/lib/rbbt/vector/model/pytorch_lightning.rb +38 -9
- data/lib/rbbt/vector/model/torch/dataloader.rb +1 -1
- data/lib/rbbt/vector/model/torch/helpers.rb +19 -1
- data/lib/rbbt/vector/model/torch/introspection.rb +7 -6
- data/lib/rbbt/vector/model/torch/load_and_save.rb +6 -0
- data/lib/rbbt/vector/model/torch.rb +45 -22
- data/lib/rbbt/vector/model.rb +22 -5
- data/python/rbbt_dm/__init__.py +4 -0
- data/python/rbbt_dm/huggingface.py +6 -3
- data/share/R/MA.R +5 -2
- data/test/rbbt/statistics/test_fdr.rb +1 -3
- data/test/rbbt/statistics/test_hypergeometric.rb +5 -0
- data/test/rbbt/vector/model/test_huggingface.rb +23 -9
- data/test/rbbt/vector/model/test_pytorch_lightning.rb +5 -1
- data/test/rbbt/vector/model/test_torch.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 904f77b8390128686b8cf153e517aff21394bd43548b8116e0d28188924a833e
+  data.tar.gz: d05f1712851cb5c552cfedac2166abb45d508cad6abbf493b41f5becde0e570c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b762389ed54ce7a91da87258f2ee856d04b4c1fef73894ac6c0e6219423967bfb42b89430f952b62d609a38c1acd2935ca4d424c35215fcc987e8af24f1fde3d
+  data.tar.gz: d3248d9996ff5f1298203d7d69595cb8f0a0dd037a4a666f0631d106b785f3401f56e5bc0ef2883e32766d1792ece1e8807a0c6d3bd7e5ecd174fca8fc698dc3
data/lib/rbbt/matrix.rb
CHANGED
@@ -12,6 +12,7 @@ class RbbtMatrix
 
   attr_accessor :data_file, :labels, :value_type, :format, :organism, :identifiers
   def initialize(data_file, labels = nil, value_type = nil, format = nil, organism=nil, identifiers=nil)
+    data_file = data_file.find if Path === data_file
    @data_file = data_file
    @labels = labels
    @value_type = value_type || 'count'
@@ -42,7 +43,7 @@ class RbbtMatrix
   end
 
   def samples
-    @samples ||= TSV.parse_header(@data_file)
+    @samples ||= TSV.parse_header(@data_file)[:fields]
   end
 
   def subsets=(subsets)
@@ -181,9 +182,14 @@ class RbbtMatrix
 
     identifiers = [identifiers, @identifiers, data.identifiers, Organism.identifiers(organism)].flatten.compact.uniq
 
-      data.change_key("Ensembl Gene ID", :identifiers => identifiers.reverse) do |v|
+      new_data = data.change_key("Ensembl Gene ID", :identifiers => identifiers.reverse) do |v|
        Misc.mean(v.compact)
      end
+
+      new_data.delete ""
+      new_data.delete nil
+
+      new_data
    end
    subsets = self.subsets
    matrix = RbbtMatrix.new file, labels, value_type, "Ensembl Gene ID", organism
@@ -202,9 +208,14 @@ class RbbtMatrix
 
     identifiers = [identifiers, @identifiers, data.identifiers, Organism.identifiers(organism)].flatten.compact.uniq
 
-      data.change_key("Associated Gene Name", :identifiers => identifiers.reverse) do |v|
+      new_data = data.change_key("Associated Gene Name", :identifiers => identifiers.reverse) do |v|
        Misc.mean(v.compact)
      end
+
+      new_data.delete ""
+      new_data.delete nil
+
+      new_data
    end
    subsets = self.subsets
    matrix = RbbtMatrix.new file, labels, value_type, "Associated Gene Name", organism
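A note on the `samples` fix above: `TSV.parse_header` returns the full parsed header, and `[:fields]` extracts just the column names, which is what `samples` should yield. A minimal sketch, assuming a hypothetical expression file whose header row is `#Gene<TAB>Sample1<TAB>Sample2`:

    require 'rbbt/matrix'

    matrix = RbbtMatrix.new "expression.tsv"
    matrix.samples  # => ["Sample1", "Sample2"], rather than the whole parsed header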
data/lib/rbbt/stan.rb
CHANGED
@@ -1,5 +1,4 @@
 require 'rbbt/util/R'
-require 'mkfifo'
 
 module STAN
 
@@ -88,7 +87,7 @@ data{
   end
 
   def self.exec(data, model, input_directory, parameter_chains, sample_file, debug = FALSE, stan_options = {})
-    stan_options =
+    stan_options = IndiferentHash.add_defaults stan_options, :iter => 1000, :warmup => 500, :chains => 1, :seed => 2887, :refresh => 1200
 
     data = {} if data.nil?
 
@@ -123,7 +122,7 @@ print(fit)
 
   def self.stream_chain(data, model, directory = nil, options = {})
     options, directory = directory, nil if Hash === directory
-    debug =
+    debug = IndiferentHash.process_options options, :debug
 
     if directory.nil?
       directory = TmpFile.tmp_file
@@ -178,7 +177,7 @@ print(fit)
   end
 
   def self.run(data, model, directory, options = {})
-    debug =
+    debug = IndiferentHash.process_options options, :debug
 
     input_directory = File.join(directory, 'inputs')
 
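Both `debug =` fixes use `IndiferentHash.process_options`, which in rbbt-util pops the named key out of the options hash and returns its value, so `:debug` no longer leaks into the options passed further down. A toy illustration under that assumption:

    options = IndiferentHash.setup(:debug => true, :iter => 2000)
    debug   = IndiferentHash.process_options options, :debug
    debug    # => true
    options  # => { :iter => 2000 }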
data/lib/rbbt/statistics/fdr.rb
CHANGED
@@ -172,7 +172,8 @@ module FDR
       values << p[1]
     }
 
-
+    iii RUBY_VERSION[0]
+    if RUBY_VERSION[0] == "2" || RUBY_VERSION[0] == "3"
       new_values = FDR.adjust(values)
       keys.zip(new_values).each do |k,v|
         vs = data[k]
@@ -195,8 +196,4 @@ module FDR
     data.unnamed = unnamed if unnamed
   end
 end
-
 end
-
-
-
data/lib/rbbt/statistics/hypergeometric.rb
CHANGED
@@ -64,10 +64,10 @@ double lBinom(double n, double k)
   builder.c_singleton <<-EOC
 /**
 * * Compute the Hypergeometric accumulated value.
-* * @param total =>
-* * @param support =>
-* * @param list =>
-* * @param found =>
+* * @param total => Balls in urn
+* * @param support => Positive balls in urn
+* * @param list => Drawn balls
+* * @param found => Positive drawn balls
 * * @return The result
 * */
 //pvalues[annotation] = Hypergeometric.hypergeometric(tsv_size, counts[annotation], total, count)
@@ -102,10 +102,13 @@ double hypergeometric_c(double total, double support, double list, double found)
 EOC
   end
 
-  def self.
-    #RSRuby.instance.phyper(count - 1, positive, negative, total, false).to_f
+  def self.hypergeometric_R(count, positive, negative, total)
     R.eval("phyper(#{ count } - 1, #{ positive }, #{ negative }, #{ total }, lower.tail=FALSE)").to_f
   end
+
+  def self.hypergeometric(count, positive, negative, total)
+    hypergeometric_c(positive + negative, positive, total, count)
+  end
 end
 
 module TSV
@@ -260,7 +263,8 @@ module TSV
       elems = elems.collect{|elem| rename.include?(elem)? rename[elem] : elem }.compact.uniq if rename
       count = elems.length
       next if count < options[:min_support] or not counts.include? annotation
-      pvalues[annotation] = Hypergeometric.hypergeometric(count, counts[annotation], tsv_size - counts[annotation], total)
+      #pvalues[annotation] = Hypergeometric.hypergeometric(count, counts[annotation], tsv_size - counts[annotation], total)
+      pvalues[annotation] = Hypergeometric.hypergeometric_c(tsv_size, counts[annotation], total, count)
     end
 
     pvalues = FDR.adjust_hash! pvalues if options[:fdr]
@@ -268,7 +272,7 @@ module TSV
     pvalues.delete_if{|k, pvalue| pvalue > options[:cutoff] } if options[:cutoff]
 
     if add_keys
-      tsv = TSV.setup(pvalues.keys.collect{|k| k.dup}, :key_field => fields, :fields => [], :type => :double)
+      tsv = TSV.setup(pvalues.keys.collect{|k| k.dup }, :key_field => fields, :fields => [], :type => :double)
 
       tsv.add_field 'p-value' do |annot, values|
         [pvalues[annot]]
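The argument conventions differ between the two entry points: the C function is `hypergeometric_c(total, support, list, found)` in urn terms, while the new Ruby wrapper is `hypergeometric(count, positive, negative, total)` and reorders before delegating. A quick sanity check mirroring the new unit test further down:

    require 'rbbt/statistics/hypergeometric'

    # Urn of 2 balls, 1 positive; draw 1 ball and find 1 positive: p = 0.5
    Hypergeometric.hypergeometric_c(2, 1, 1, 1).round(2)  # => 0.5

    # The wrapper delegates as hypergeometric_c(positive + negative, positive, total, count)
    Hypergeometric.hypergeometric(1, 1, 1, 1)             # same call as above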
data/lib/rbbt/vector/model/huggingface.rb
CHANGED
@@ -18,18 +18,23 @@ class HuggingfaceModel < TorchModel
 
     checkpoint = checkpoint.find if Path === checkpoint
 
-    @model_options =
+    @model_options[:tokenizer_options] = @model_options.delete(:tokenizer_args) if @model_options.include?(:tokenizer_args)
+    tokenizer_args = IndiferentHash.pull_keys @model_options, :tokenizer
+    @model_options[:tokenizer_args] = tokenizer_args
+
+    @model_options[:task] = task if task
+    @model_options[:checkpoint] = checkpoint if checkpoint
 
     init_model do
       checkpoint = @model_path && File.directory?(@model_path) ? @model_path : @model_options[:checkpoint]
 
       model = RbbtPython.call_method("rbbt_dm.huggingface", :load_model,
-        @model_options[:task], checkpoint, **(IndiferentHash.setup(model_options
+        @model_options[:task], checkpoint, **(IndiferentHash.setup(@model_options.except(:training_args, :tokenizer_args, :task, :checkpoint, :class_labels))))
 
-      tokenizer_checkpoint = @model_options[:
+      tokenizer_checkpoint = @model_options[:tokenizer_args][:checkpoint] || checkpoint
 
       tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_tokenizer,
-
+        tokenizer_checkpoint, **(IndiferentHash.setup(@model_options[:tokenizer_args])))
 
       [model, tokenizer]
     end
@@ -57,7 +62,7 @@ class HuggingfaceModel < TorchModel
     end
 
     dataset_file = TorchModel.text_dataset(tsv_file, texts)
-    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir,
+    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, {})
 
     begin
       RbbtPython.call_method("rbbt_dm.huggingface", :predict_model, model, tokenizer, training_args_obj, dataset_file, locate_tokens)
@@ -82,7 +87,7 @@ class HuggingfaceModel < TorchModel
       checkpoint_dir = File.join(tmpdir, 'checkpoints')
     end
 
-    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir,
+    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, training_args)
     dataset_file = HuggingfaceModel.text_dataset(tsv_file, texts, labels, @model_options[:class_labels])
 
     RbbtPython.call_method("rbbt_dm.huggingface", :train_model, model, tokenizer, training_args_obj, dataset_file, @model_options[:class_weights])
@@ -107,43 +112,47 @@ class HuggingfaceModel < TorchModel
       predictions = result["logits"]
     end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if @model_options[:return_logits]
+      result = RbbtPython.numpy2ruby(predictions)
+    else
+      task, class_labels, locate_tokens = @model_options.values_at :task, :class_labels, :locate_tokens
+      result = case task
+               when "SequenceClassification"
+                 RbbtPython.collect(predictions) do |logits|
+                   logits = RbbtPython.numpy2ruby logits
+                   best_class = logits.index logits.max
+                   best_class = class_labels[best_class] if class_labels
+                   best_class
+                 end
+               when "MaskedLM"
+                 all_token_positions = token_positions.to_a
+
+                 i = 0
+                 RbbtPython.collect(predictions) do |item_logits|
+                   item_token_positions = all_token_positions[i]
+                   i += 1
+
+                   item_logits = RbbtPython.numpy2ruby(item_logits)
+                   item_masks = item_token_positions.collect do |token_positions|
+
+                     best = item_logits.values_at(*token_positions).collect do |logits|
+                       best_token, best_score = nil
+                       logits.each_with_index do |v,i|
+                         if best_score.nil? || v > best_score
+                           best_token, best_score = i, v
+                         end
+                       end
+                       best_token
+                     end
+
+                     best.collect{|b| tokenizer.decode(b) } * "|"
+                   end
+                   Array === locate_tokens ? item_masks : item_masks.first
+                 end
+               else
+                 predictions
+               end
+    end
 
     (! is_list || single) && Array === result ? result.first : result
   end
@@ -155,6 +164,7 @@ class HuggingfaceModel < TorchModel
   def reset_model
     @model, @tokenizer = nil
     Open.rm_rf @model_path
+    Open.rm_rf TorchModel.model_architecture(model_path)
     init
   end
 end
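With the option handling above, tokenizer settings can be passed either as an explicit `tokenizer_args` hash or as `tokenizer_`-prefixed keys that `IndiferentHash.pull_keys` gathers, and `return_logits` skips the class-label mapping. A sketch based on the tests further down (the logit values are illustrative):

    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir,
      :tokenizer_args => { :max_length => 16 }, :class_labels => %w(Bad Good)

    model.eval "This is dog"            # => "Bad" or "Good"

    model.model_options[:return_logits] = true
    model.eval "This is dog"            # => raw logits, e.g. [2.3, -1.9]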
data/lib/rbbt/vector/model/pytorch_lightning.rb
CHANGED
@@ -7,25 +7,54 @@ class PytorchLightningModel < TorchModel
 
     train_model do |features,labels|
       model = init
-
+      train_loader = self.loader
       val_loader = self.val_loader
-      if
-
-
-
-
+      if train_loader.nil?
+        batch_size ||= model_options[:training_args][:batch_size]
+        batch_size ||= model_options[:batch_size]
+        batch_size ||= 1
+
+        shuffle = model_options[:training_args][:shuffle]
+        shuffle = true if shuffle.nil?
+
+        num_workers = Rbbt.config(:num_workers, :dataloader, :default => 2)
+        train_loader = RbbtPython.run :torch do
+          dataset = features.zip(labels).collect{|f,l| [torch.tensor(f), l] }
+          torch.utils.data.DataLoader.call(dataset, batch_size: batch_size, shuffle: shuffle, num_workers: num_workers.to_i)
        end
      end
-      trainer.fit(model,
+      trainer.fit(model, train_loader, val_loader)
       TorchModel.save_architecture(model, model_path) if @directory
       TorchModel.save_state(model, model_path) if @directory
     end
+
+    eval_model do |features,list=false|
+      model = init
+      eval_loader = self.loader
+      if list
+        if eval_loader.nil?
+          batch_size ||= model_options[:batch_size]
+          batch_size ||= model_options[:training_args][:batch_size]
+          batch_size ||= 1
+
+          num_workers = Rbbt.config(:num_workers, :dataloader, :default => 2)
+          eval_loader = RbbtPython.run :torch do
+            dataset = torch.tensor(features)
+            torch.utils.data.DataLoader.call(dataset, batch_size: batch_size, num_workers: num_workers.to_i)
+          end
+        end
+        trainer.predict(model, eval_loader).inject([]){|acc,res| acc.concat RbbtPython.numpy2ruby(res[1])}
+      else
+        model.call(torch.tensor(features))
+      end
+    end
   end
 
   def trainer
     @trainer ||= begin
-
-
+      trainer_args = {default_root_dir: File.join(@directory, 'checkpoints')}.
+        merge(model_options[:training_args].except(:batch_size))
+      RbbtPython.class_new_obj("pytorch_lightning", "Trainer", trainer_args)
     end
   end
 end
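The new `eval_model` block routes list evaluation through `trainer.predict` with a DataLoader and calls the model directly for a single item; the updated test below exercises both paths on an already-initialized model:

    model.eval([10.0])                               # single item: direct model.call
    res = model.eval_list([[10.0], [11.2], [14.3]])  # batched via trainer.predict
    RbbtPython.numpy2ruby(res).length                # => 3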
data/lib/rbbt/vector/model/torch/dataloader.rb
CHANGED
@@ -42,7 +42,7 @@ class TorchModel
   end
 
   def self.text_dataset(tsv_dataset_file, elements, labels = nil, class_labels = nil)
-    elements = elements.compact.collect{|e| e.gsub("\n", ' ') }
+    elements = elements.compact.collect{|e| e.gsub("\n", ' ').gsub('"', '\'') }
     tsv = feature_tsv(elements, labels, class_labels)
     tsv.fields[0] = "text"
     if labels.nil?
data/lib/rbbt/vector/model/torch/helpers.rb
CHANGED
@@ -3,9 +3,27 @@ class TorchModel
     def to_ruby
       RbbtPython.numpy2ruby(self)
     end
+
+    def to_ruby!
+      r = self.to_ruby
+      self.del
+      r
+    end
+
+    def length
+      PyCall.len(self)
+    end
+
     def self.setup(obj)
       obj.extend Tensor
     end
+
+    def del
+      self.detach
+      self.grad = nil
+      self.storage.resize_ 0
+      self.to("cpu")
+    end
   end
 
   def self.init_python
@@ -46,7 +64,7 @@ class TorchModel
   end
 
   def self.tensor(obj, device, dtype)
-    RbbtPython.torch.tensor(obj, dtype: dtype, device: device)
+    TorchModel::Tensor.setup(RbbtPython.torch.tensor(obj, dtype: dtype, device: device))
   end
 
 end
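Since `TorchModel.tensor` now wraps its result with `Tensor.setup`, the new helpers are available on every tensor it returns: `length` goes through `PyCall.len`, and `to_ruby!` converts to nested Ruby arrays before releasing the tensor with `del` (detach, drop the gradient, shrink storage, move to CPU). For example, with `device` and `dtype` resolved as in the eval loop:

    tensor = TorchModel.tensor([[1.0, 2.0]], device, dtype)
    tensor.length   # => 1
    tensor.to_ruby! # => [[1.0, 2.0]], and the tensor is freed afterwards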
data/lib/rbbt/vector/model/torch/introspection.rb
CHANGED
@@ -13,19 +13,20 @@ class TorchModel
   end
   def get_weights(...); TorchModel.get_weights(model, ...); end
 
-  def self.freeze(layer)
+  def self.freeze(layer, requires_grad=false)
     begin
-      PyCall.getattr(layer, :weight).requires_grad =
+      PyCall.getattr(layer, :weight).requires_grad = requires_grad
     rescue
     end
     RbbtPython.iterate(layer.children) do |layer|
-      freeze(layer)
+      freeze(layer, requires_grad)
     end
   end
-
+
+  def self.freeze_layer(model, layer, requires_grad = false)
     layer = get_layer(model, layer)
-    freeze(layer)
+    freeze(layer, requires_grad)
   end
-  def freeze_layer(...); TorchModel.freeze_layer(model, ...); end
 
+  def freeze_layer(...); TorchModel.freeze_layer(model, ...); end
 end
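Threading `requires_grad` through `freeze` means the same helpers can now unfreeze a subtree as well as freeze it. Assuming a layer path that `get_layer` resolves (the name here is hypothetical):

    model.freeze_layer "encoder.layer.0"        # requires_grad = false on all children
    model.freeze_layer "encoder.layer.0", true  # turn gradients back on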
data/lib/rbbt/vector/model/torch/load_and_save.rb
CHANGED
@@ -27,4 +27,10 @@ class TorchModel
     Log.debug "Loading model architecture from #{model_architecture}"
     RbbtPython.torch.load(model_architecture)
   end
+
+  def reset_model
+    @trainer = @model = nil
+    Open.rm_rf model_path
+    Open.rm_rf TorchModel.model_architecture(model_path)
+  end
 end
data/lib/rbbt/vector/model/torch.rb
CHANGED
@@ -2,47 +2,37 @@ require_relative 'python'
 
 class TorchModel < PythonModel
 
-  attr_accessor :criterion, :optimizer
+  attr_accessor :criterion, :optimizer
 
   def initialize(...)
     TorchModel.init_python
     super(...)
-    @training_args = model_options[:training_args] || {}
 
+    @model_options[:training_options] = @model_options.delete(:training_args) if @model_options.include?(:training_args)
+    training_args = IndiferentHash.pull_keys(@model_options, :training) || {}
+    @model_options[:training_args] = training_args
     init_model do
       model = TorchModel.load_architecture(model_path)
       if model.nil?
         RbbtPython.add_path @directory
-        RbbtPython.
+        RbbtPython.process_paths
+        RbbtPython.class_new_obj(@python_module, @python_class, **model_options.except(:training_args, :batch_size))
       else
         TorchModel.load_state(model, model_path)
       end
     end
 
-    eval_model do |features,list=false|
-      init
-      @device ||= TorchModel.device(model_options)
-      @dtype ||= TorchModel.dtype(model_options)
-      model.to(@device)
-
-      tensor = list ? TorchModel.tensor(features, @device, @dtype) : TorchModel.tensor([features], @device, @dtype)
-
-      loss, res = model.call(tensor)
-
-      res = loss if res.nil?
-
-      res = TorchModel::Tensor.setup(list ? res : res[0])
-
-      res
-    end
-
     train_model do |features,labels|
       init
       @device ||= TorchModel.device(model_options)
       @dtype ||= TorchModel.dtype(model_options)
       model.to(@device)
-      @optimizer ||= TorchModel.optimizer(model, training_args)
-
+      @optimizer ||= TorchModel.optimizer(model, model_options[:training_args] || {})
+
+      epochs = model_options[:training_args][:epochs] || 3
+      batch_size = model_options[:batch_size]
+      batch_size ||= model_options[:training_args][:batch_size]
+      batch_size ||= 1
 
       inputs = TorchModel.tensor(features, @device, @dtype)
       #target = TorchModel.tensor(labels.collect{|v| [v] }, @device, @dtype)
@@ -63,6 +53,39 @@ class TorchModel < PythonModel
       TorchModel.save_architecture(model, model_path) if @directory
       TorchModel.save_state(model, model_path) if @directory
     end
+
+    eval_model do |features,list=false|
+      init
+      @device ||= TorchModel.device(model_options)
+      @dtype ||= TorchModel.dtype(model_options)
+      model.to(@device)
+      model.eval
+
+      features = [features] unless list
+
+      batch_size = model_options[:batch_size]
+      batch_size ||= model_options[:training_args][:batch_size]
+      batch_size ||= 1
+
+      res = Misc.chunk(features, batch_size).inject(nil) do |acc,batch|
+        tensor = TorchModel.tensor(batch, @device, @dtype)
+
+        loss, chunk_res = model.call(tensor)
+        tensor.del
+
+        chunk_res = loss if chunk_res.nil?
+
+        TorchModel::Tensor.setup(chunk_res)
+        acc = acc.nil? ? chunk_res.to_ruby! : acc + chunk_res.to_ruby!
+
+        acc
+      end
+
+      res = TorchModel::Tensor.setup(list ? res : res[0])
+
+      res
+    end
+
   end
 end
 require_relative 'torch/helpers'
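The constructor now normalizes training options: an explicit `training_args` hash is renamed to `training_options` so that `IndiferentHash.pull_keys`, which gathers keys by prefix, folds it back in together with any `training_`-prefixed keys. A toy illustration, assuming the usual rbbt-util `pull_keys` semantics of stripping the prefix:

    opts = IndiferentHash.setup(:training_epochs => 5, :training_batch_size => 32, :batch_size => 8)
    training_args = IndiferentHash.pull_keys opts, :training
    training_args  # => { "epochs" => 5, "batch_size" => 32 } (assumed)
    opts           # => { "batch_size" => 8 }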
data/lib/rbbt/vector/model.rb
CHANGED
@@ -3,7 +3,7 @@ require 'rbbt/vector/model/util'
 require 'rbbt/util/python'
 
 RbbtPython.add_path Rbbt.python.find(:lib)
-RbbtPython.init_rbbt
+#RbbtPython.init_rbbt
 
 class VectorModel
   attr_accessor :directory, :model_path, :extract_features, :init_model, :train_model, :eval_model, :post_process, :balance
@@ -166,8 +166,9 @@ cat(paste(label, sep="\\n", collapse="\\n"));
     @options_file = File.join(@directory, "options.json")
 
     if File.exist?(@options_file)
-
-      IndiferentHash.setup(
+      file_options = JSON.parse(Open.read(@options_file))
+      IndiferentHash.setup(file_options)
+      @model_options = file_options.deep_merge(@model_options)
     end
   end
 
@@ -254,8 +255,24 @@ cat(paste(label, sep="\\n", collapse="\\n"));
 
   def add_list(elements, labels = nil)
     if @extract_features.nil? || @extract_features.arity == 1
-
-
+      case labels
+      when nil
+        elements.each do |elem|
+          add(elem)
+        end
+      when Array
+        elements.zip(labels).each do |elem,label|
+          add(elem, label)
+        end
+      when Hash
+        elements.each do |elem|
+          label = labels[elem]
+          add(elem, label)
+        end
+      else
+        elements.each do |elem|
+          add(elem, labels)
+        end
      end
    else
      features = self.instance_exec(nil, elements, &@extract_features)
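`add_list` now dispatches on the labels argument, exactly as the new case statement reads: nil adds unlabeled elements, an Array is zipped pairwise, a Hash is looked up per element, and anything else is used as a single label for every element:

    model.add_list %w(doc1 doc2)                                     # unlabeled
    model.add_list %w(doc1 doc2), %w(good bad)                       # parallel array
    model.add_list %w(doc1 doc2), "doc1" => "good", "doc2" => "bad"  # hash lookup
    model.add_list %w(doc1 doc2), "good"                             # one label for all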
data/python/rbbt_dm/__init__.py
CHANGED
@@ -27,5 +27,9 @@ def tsv_dataset(filename, *args, **kwargs):
 def tsv(*args, **kwargs):
     return tsv_dataset(*args, **kwargs)
 
+def tsv_loader(*args, **kwargs):
+    dataset = tsv(*args, kwargs)
+    return torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)
+
 def data_dir():
     return rbbt.path('var/rbbt_dm/data')
data/python/rbbt_dm/huggingface.py
CHANGED
@@ -15,15 +15,15 @@ def load_model(task, checkpoint, **kwargs):
         return import_module_class(module, class_name).from_pretrained(checkpoint, **kwargs)
     else:
         class_name = 'AutoModelFor' + task
-        return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+        return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
 
-def load_tokenizer(
+def load_tokenizer(checkpoint, **kwargs):
     class_name = 'AutoTokenizer'
     return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
 
 def load_model_and_tokenizer(task, checkpoint):
     model = load_model(task, checkpoint)
-    tokenizer = load_tokenizer(
+    tokenizer = load_tokenizer(checkpoint)
     return model, tokenizer
 
 # Not used
@@ -88,6 +88,9 @@ def training_args(*args, **kwargs):
 def train_model(model, tokenizer, training_args, dataset, class_weights=None, **kwargs):
     from transformers import Trainer
 
+    # Note: Parameters need to be made contiguous. I'm not sure why they weren't
+    for param in model.parameters(): param.data = param.data.contiguous()
+
     if (isinstance(dataset, str)):
         if (dataset.endswith('.json')):
             tokenized_dataset = json_dataset(tokenizer, dataset)
data/share/R/MA.R
CHANGED
@@ -99,6 +99,8 @@ rbbt.dm.matrix.differential.limma <- function(data, main, contrast=NULL, log2=NU
   }
 
   if (log2){
+    full_rows = apply(is.na(data), 1, sum) == 0
+    data = data[full_rows,]
     cutoff <- 1
     drop <- which(apply(data, 1, max) < cutoff)
     min = min(data[data != -Inf])
@@ -182,10 +184,11 @@ rbbt.dm.matrix.differential <- function(file, main, contrast = NULL, type = 'lim
     contrast <- make.names(contrast);
   }
 
-  if (is.null(type) || type == 'limma')
+  if (is.null(type) || type == 'limma'){
     result = rbbt.dm.matrix.differential.limma(data, main, contrast, log2, two.channel, eBayes.trend)
-  else
+  }else{
     result = rbbt.dm.matrix.differential.DESeq(data, main, contrast)
+  }
 
   if (is.null(outfile)){
     return(result);
data/test/rbbt/statistics/test_fdr.rb
CHANGED
@@ -32,8 +32,6 @@ class TestFDR < Test::Unit::TestCase
     assert_equal(clean(@r_adj), clean(FDR.adjust_native(@values)))
     assert_equal(clean(FDR.adjust_fast(@values)), clean(FDR.adjust_native(@values)))
 
-    assert_equal(clean(@r_adj), clean(FDR.adjust_fast_self(copy(@values)))) if RUBY_VERSION[0]
+    assert_equal(clean(@r_adj), clean(FDR.adjust_fast_self(copy(@values)))) if RUBY_VERSION[0].to_i < 2
   end
 end
-
-
data/test/rbbt/statistics/test_hypergeometric.rb
CHANGED
@@ -4,6 +4,11 @@ require 'test/unit'
 
 class TestHypergeometric < Test::Unit::TestCase
 
+  def test_hypergeometric_c
+    assert_equal Hypergeometric.hypergeometric_c(2, 1, 1, 1).round(2), 0.5
+    assert_equal Hypergeometric.hypergeometric_c(10, 1, 1, 1).round(2), 0.1
+  end
+
   def test_hypergeometric
     assert Hypergeometric.hypergeometric(100, 20, 15, 13) < 0.0005
   end
data/test/rbbt/vector/model/test_huggingface.rb
CHANGED
@@ -9,12 +9,8 @@ class TestHuggingface < Test::Unit::TestCase
     task = "SequenceClassification"
 
     model = HuggingfaceModel.new task, checkpoint, dir, :class_labels => %w(bad good)
-    iii model.eval "This is dog"
-    iii model.eval "This is cat"
-    iii model.eval_list(["This is dog", "This is cat"])
 
     model = VectorModel.new dir
-    iii model.eval_list(["This is dog", "This is cat"])
   end
 end
 
@@ -42,7 +38,7 @@ class TestHuggingface < Test::Unit::TestCase
     assert_equal 5, tokenizer.call("This is a sentence that has several words", truncation: true)["input_ids"].__len__
   end
 
-  def
+  def _test_sst_eval
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
 
@@ -55,12 +51,29 @@ class TestHuggingface < Test::Unit::TestCase
     end
   end
 
+  def _test_sst_logits
+    TmpFile.with_file do |dir|
+      checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+
+      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, :tokenizer_args => {:max_length => 16}
+
+      model.model_options[:class_labels] = ["Bad", "Good"]
+      model.model_options[:return_logits] = true
+
+      logits = model.eval("This is dog")
+      assert logits[0] > logits[1]
+      logits = model.eval_list(["This is dog", "This is cat"])
+      assert logits[0][0] > logits[0][1]
+      assert logits[1][0] < logits[1][1]
+    end
+  end
+
 
-  def
+  def test_sst_train
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
 
-      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, max_length: 128
+      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, tokenizer_args:{max_length: 128}, tokenizer_padding: true, tokenizer_truncation: true
 
       model.model_options[:class_labels] = %w(Bad Good)
 
@@ -148,12 +161,12 @@ class TestHuggingface < Test::Unit::TestCase
 
     model = VectorModel.new dir
 
-    assert_equal "Good", model.eval_list("This is dog")
+    assert_equal ["Good"], model.eval_list(["This is dog"])
 
   end
 end
 
-  def
+  def __test_sst_stress_test
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
 
@@ -252,6 +265,7 @@ class RobertaForTokenClassification_NER(RobertaPreTrainedModel):
     EOF
 
     RbbtPython.add_path dir
+    RbbtPython.process_paths
 
     biomedical_roberta = "PlanTL-GOB-ES/bsc-bio-ehr-es-cantemist"
     model = HuggingfaceModel.new "mypkg.mymodel:RobertaForTokenClassification_NER", biomedical_roberta
data/test/rbbt/vector/model/test_pytorch_lightning.rb
CHANGED
@@ -88,9 +88,13 @@ class TestPytorchLightningModel(pl.LightningModule):
     res = model.eval_list([[10.0], [11.2], [14.3]])
     assert_equal 3, RbbtPython.numpy2ruby(res).length
 
+    orig_res = res
     model = VectorModel.new dir
     model.init
-
+    res = model.eval([10.0])
+    res = model.eval_list([[10.0], [11.2], [14.3]])
+    assert_equal 3, RbbtPython.numpy2ruby(res).length
+    assert_equal orig_res, res
   end
 end
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-dm
 version: !ruby/object:Gem::Version
-  version: 1.3.1
+  version: 1.3.2
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2025-01-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -174,7 +174,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.5.
+rubygems_version: 3.5.23
 signing_key:
 specification_version: 4
 summary: Data-mining and statistics