rbbt-dm 1.3.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/matrix/differential.rb +1 -1
- data/lib/rbbt/matrix.rb +14 -3
- data/lib/rbbt/stan.rb +3 -4
- data/lib/rbbt/statistics/fdr.rb +2 -5
- data/lib/rbbt/statistics/hypergeometric.rb +12 -8
- data/lib/rbbt/vector/model/huggingface.rb +64 -43
- data/lib/rbbt/vector/model/python.rb +1 -1
- data/lib/rbbt/vector/model/pytorch_lightning.rb +38 -9
- data/lib/rbbt/vector/model/torch/dataloader.rb +3 -4
- data/lib/rbbt/vector/model/torch/helpers.rb +19 -1
- data/lib/rbbt/vector/model/torch/introspection.rb +7 -6
- data/lib/rbbt/vector/model/torch/load_and_save.rb +6 -0
- data/lib/rbbt/vector/model/torch.rb +45 -22
- data/lib/rbbt/vector/model.rb +27 -7
- data/python/rbbt_dm/__init__.py +4 -0
- data/python/rbbt_dm/huggingface.py +6 -3
- data/share/R/MA.R +7 -3
- data/test/rbbt/statistics/test_fdr.rb +1 -3
- data/test/rbbt/statistics/test_hypergeometric.rb +5 -0
- data/test/rbbt/vector/model/test_huggingface.rb +23 -9
- data/test/rbbt/vector/model/test_pytorch_lightning.rb +5 -1
- data/test/rbbt/vector/model/test_torch.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 904f77b8390128686b8cf153e517aff21394bd43548b8116e0d28188924a833e
+  data.tar.gz: d05f1712851cb5c552cfedac2166abb45d508cad6abbf493b41f5becde0e570c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b762389ed54ce7a91da87258f2ee856d04b4c1fef73894ac6c0e6219423967bfb42b89430f952b62d609a38c1acd2935ca4d424c35215fcc987e8af24f1fde3d
+  data.tar.gz: d3248d9996ff5f1298203d7d69595cb8f0a0dd037a4a666f0631d106b785f3401f56e5bc0ef2883e32766d1792ece1e8807a0c6d3bd7e5ecd174fca8fc698dc3
data/lib/rbbt/matrix.rb
CHANGED
@@ -12,6 +12,7 @@ class RbbtMatrix

   attr_accessor :data_file, :labels, :value_type, :format, :organism, :identifiers
   def initialize(data_file, labels = nil, value_type = nil, format = nil, organism=nil, identifiers=nil)
+    data_file = data_file.find if Path === data_file
     @data_file = data_file
     @labels = labels
     @value_type = value_type || 'count'
@@ -42,7 +43,7 @@ class RbbtMatrix
   end

   def samples
-    @samples ||= TSV.parse_header(@data_file)
+    @samples ||= TSV.parse_header(@data_file)[:fields]
   end

   def subsets=(subsets)
@@ -181,9 +182,14 @@ class RbbtMatrix

       identifiers = [identifiers, @identifiers, data.identifiers, Organism.identifiers(organism)].flatten.compact.uniq

-      data.change_key("Ensembl Gene ID", :identifiers => identifiers.reverse) do |v|
+      new_data = data.change_key("Ensembl Gene ID", :identifiers => identifiers.reverse) do |v|
         Misc.mean(v.compact)
       end
+
+      new_data.delete ""
+      new_data.delete nil
+
+      new_data
     end
     subsets = self.subsets
     matrix = RbbtMatrix.new file, labels, value_type, "Ensembl Gene ID", organism
@@ -202,9 +208,14 @@ class RbbtMatrix

       identifiers = [identifiers, @identifiers, data.identifiers, Organism.identifiers(organism)].flatten.compact.uniq

-      data.change_key("Associated Gene Name", :identifiers => identifiers.reverse) do |v|
+      new_data = data.change_key("Associated Gene Name", :identifiers => identifiers.reverse) do |v|
         Misc.mean(v.compact)
       end
+
+      new_data.delete ""
+      new_data.delete nil
+
+      new_data
     end
     subsets = self.subsets
     matrix = RbbtMatrix.new file, labels, value_type, "Associated Gene Name", organism
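Note: `RbbtMatrix#initialize` now resolves rbbt `Path` objects and `samples` returns just the header fields. A minimal sketch of the new behavior, assuming a TSV file whose columns are sample names (the file path here is hypothetical):

    matrix = RbbtMatrix.new Rbbt.data["expression.tsv"], nil, 'counts'
    matrix.samples  # => the column (sample) names, no longer the full parsed-header object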
data/lib/rbbt/stan.rb
CHANGED
@@ -1,5 +1,4 @@
 require 'rbbt/util/R'
-require 'mkfifo'

 module STAN

@@ -88,7 +87,7 @@ data{
   end

   def self.exec(data, model, input_directory, parameter_chains, sample_file, debug = FALSE, stan_options = {})
-    stan_options =
+    stan_options = IndiferentHash.add_defaults stan_options, :iter => 1000, :warmup => 500, :chains => 1, :seed => 2887, :refresh => 1200

     data = {} if data.nil?

@@ -123,7 +122,7 @@ print(fit)

   def self.stream_chain(data, model, directory = nil, options = {})
     options, directory = directory, nil if Hash === directory
-    debug =
+    debug = IndiferentHash.process_options options, :debug

     if directory.nil?
       directory = TmpFile.tmp_file
@@ -178,7 +177,7 @@ print(fit)
   end

   def self.run(data, model, directory, options = {})
-    debug =
+    debug = IndiferentHash.process_options options, :debug

     input_directory = File.join(directory, 'inputs')

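The rewritten lines rely on two rbbt-util helpers; a hedged sketch of their assumed semantics:

    options = {:debug => true, :iter => 2000}
    debug = IndiferentHash.process_options options, :debug  # => true; :debug is removed from options
    options = IndiferentHash.add_defaults options, :iter => 1000, :chains => 1
    options[:iter]    # => 2000 (existing values win)
    options[:chains]  # => 1    (missing default filled in)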
data/lib/rbbt/statistics/fdr.rb
CHANGED
@@ -172,7 +172,8 @@ module FDR
       values << p[1]
     }

-
+    iii RUBY_VERSION[0]
+    if RUBY_VERSION[0] == "2" || RUBY_VERSION[0] == "3"
     new_values = FDR.adjust(values)
     keys.zip(new_values).each do |k,v|
       vs = data[k]
@@ -195,8 +196,4 @@ module FDR
     data.unnamed = unnamed if unnamed
   end
 end
-
 end
-
-
-

data/lib/rbbt/statistics/hypergeometric.rb
CHANGED
@@ -64,10 +64,10 @@ double lBinom(double n, double k)
     builder.c_singleton <<-EOC
 /**
 * * Compute the Hypergeometric accumulated value.
-* * @param total =>
-* * @param support =>
-* * @param list =>
-* * @param found =>
+* * @param total => Balls in urn
+* * @param support => Positive balls in urn
+* * @param list => Drawn balls
+* * @param found => Positive drawn balls
 * * @return The result
 * */
 //pvalues[annotation] = Hypergeometric.hypergeometric(tsv_size, counts[annotation], total, count)
@@ -102,10 +102,13 @@ double hypergeometric_c(double total, double support, double list, double found)
 EOC
   end

-  def self.
-    #RSRuby.instance.phyper(count - 1, positive, negative, total, false).to_f
+  def self.hypergeometric_R(count, positive, negative, total)
     R.eval("phyper(#{ count } - 1, #{ positive }, #{ negative }, #{ total }, lower.tail=FALSE)").to_f
   end
+
+  def self.hypergeometric(count, positive, negative, total)
+    hypergeometric_c(positive + negative, positive, total, count)
+  end
 end

 module TSV
@@ -260,7 +263,8 @@ module TSV
       elems = elems.collect{|elem| rename.include?(elem)? rename[elem] : elem }.compact.uniq if rename
       count = elems.length
       next if count < options[:min_support] or not counts.include? annotation
-      pvalues[annotation] = Hypergeometric.hypergeometric(count, counts[annotation], tsv_size - counts[annotation], total)
+      #pvalues[annotation] = Hypergeometric.hypergeometric(count, counts[annotation], tsv_size - counts[annotation], total)
+      pvalues[annotation] = Hypergeometric.hypergeometric_c(tsv_size, counts[annotation], total, count)
     end

     pvalues = FDR.adjust_hash! pvalues if options[:fdr]
@@ -268,7 +272,7 @@ module TSV
     pvalues.delete_if{|k, pvalue| pvalue > options[:cutoff] } if options[:cutoff]

     if add_keys
-      tsv = TSV.setup(pvalues.keys.collect{|k| k.dup}, :key_field => fields, :fields => [], :type => :double)
+      tsv = TSV.setup(pvalues.keys.collect{|k| k.dup }, :key_field => fields, :fields => [], :type => :double)

       tsv.add_field 'p-value' do |annot, values|
         [pvalues[annot]]
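The C helper and the Ruby wrapper take their arguments in different orders; a sketch using the signatures above, with values from the new test added further down:

    # hypergeometric_c(total, support, list, found): chance of at least `found`
    # positives when drawing `list` balls from an urn with `support` positives among `total`
    Hypergeometric.hypergeometric_c(2, 1, 1, 1)  # => 0.5
    # The new Ruby wrapper reorders (count, positive, negative, total) before delegating:
    Hypergeometric.hypergeometric(1, 1, 1, 1)    # same call as the line above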
data/lib/rbbt/vector/model/huggingface.rb
CHANGED
@@ -2,23 +2,39 @@ require 'rbbt/vector/model/torch'

 class HuggingfaceModel < TorchModel

+  attr_accessor :tokenizer
+  def init
+    @model, @tokenizer = self.instance_exec(&@init_model) if @model.nil?
+    [@model, @tokenizer]
+  end
+
+  def tokenizer
+    init
+    @tokenizer
+  end
+
   def initialize(task, checkpoint, dir = nil, model_options = {})
     super(dir, nil, model_options)

     checkpoint = checkpoint.find if Path === checkpoint

-    @model_options =
+    @model_options[:tokenizer_options] = @model_options.delete(:tokenizer_args) if @model_options.include?(:tokenizer_args)
+    tokenizer_args = IndiferentHash.pull_keys @model_options, :tokenizer
+    @model_options[:tokenizer_args] = tokenizer_args
+
+    @model_options[:task] = task if task
+    @model_options[:checkpoint] = checkpoint if checkpoint

     init_model do
       checkpoint = @model_path && File.directory?(@model_path) ? @model_path : @model_options[:checkpoint]

       model = RbbtPython.call_method("rbbt_dm.huggingface", :load_model,
-        @model_options[:task], checkpoint, **(IndiferentHash.setup(model_options
+        @model_options[:task], checkpoint, **(IndiferentHash.setup(@model_options.except(:training_args, :tokenizer_args, :task, :checkpoint, :class_labels))))

-      tokenizer_checkpoint = @model_options[:
+      tokenizer_checkpoint = @model_options[:tokenizer_args][:checkpoint] || checkpoint

       tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_tokenizer,
-
+        tokenizer_checkpoint, **(IndiferentHash.setup(@model_options[:tokenizer_args])))

       [model, tokenizer]
     end
@@ -46,7 +62,7 @@ class HuggingfaceModel < TorchModel
     end

     dataset_file = TorchModel.text_dataset(tsv_file, texts)
-    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir,
+    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, {})

     begin
       RbbtPython.call_method("rbbt_dm.huggingface", :predict_model, model, tokenizer, training_args_obj, dataset_file, locate_tokens)
@@ -71,7 +87,7 @@ class HuggingfaceModel < TorchModel
       checkpoint_dir = File.join(tmpdir, 'checkpoints')
     end

-    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir,
+    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, training_args)
     dataset_file = HuggingfaceModel.text_dataset(tsv_file, texts, labels, @model_options[:class_labels])

     RbbtPython.call_method("rbbt_dm.huggingface", :train_model, model, tokenizer, training_args_obj, dataset_file, @model_options[:class_weights])
@@ -96,43 +112,47 @@ class HuggingfaceModel < TorchModel
       predictions = result["logits"]
     end

-    [37 removed lines whose content was not preserved in this extract]
+    if @model_options[:return_logits]
+      result = RbbtPython.numpy2ruby(predictions)
+    else
+      task, class_labels, locate_tokens = @model_options.values_at :task, :class_labels, :locate_tokens
+      result = case task
+               when "SequenceClassification"
+                 RbbtPython.collect(predictions) do |logits|
+                   logits = RbbtPython.numpy2ruby logits
+                   best_class = logits.index logits.max
+                   best_class = class_labels[best_class] if class_labels
+                   best_class
+                 end
+               when "MaskedLM"
+                 all_token_positions = token_positions.to_a
+
+                 i = 0
+                 RbbtPython.collect(predictions) do |item_logits|
+                   item_token_positions = all_token_positions[i]
+                   i += 1
+
+                   item_logits = RbbtPython.numpy2ruby(item_logits)
+                   item_masks = item_token_positions.collect do |token_positions|
+
+                     best = item_logits.values_at(*token_positions).collect do |logits|
+                       best_token, best_score = nil
+                       logits.each_with_index do |v,i|
+                         if best_score.nil? || v > best_score
+                           best_token, best_score = i, v
+                         end
+                       end
+                       best_token
+                     end
+
+                     best.collect{|b| tokenizer.decode(b) } * "|"
+                   end
+                   Array === locate_tokens ? item_masks : item_masks.first
+                 end
+               else
+                 predictions
+               end
+    end

     (! is_list || single) && Array === result ? result.first : result
   end
@@ -144,6 +164,7 @@ class HuggingfaceModel < TorchModel
   def reset_model
     @model, @tokenizer = nil
     Open.rm_rf @model_path
+    Open.rm_rf TorchModel.model_architecture(model_path)
     init
   end
 end
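With these changes any tokenizer option is routed into `@model_options[:tokenizer_args]`, and `:return_logits` skips label decoding. A hedged usage sketch mirroring the tests further down:

    model = HuggingfaceModel.new "SequenceClassification",
      "distilbert-base-uncased-finetuned-sst-2-english", dir,
      :tokenizer_args => {:max_length => 16}, :class_labels => %w(Bad Good)
    model.model_options[:return_logits] = true
    logits = model.eval_list(["This is dog", "This is cat"])  # one row of raw logits per text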
data/lib/rbbt/vector/model/pytorch_lightning.rb
CHANGED
@@ -7,25 +7,54 @@ class PytorchLightningModel < TorchModel

     train_model do |features,labels|
       model = init
-
+      train_loader = self.loader
       val_loader = self.val_loader
-      if
-
-
-
-
+      if train_loader.nil?
+        batch_size ||= model_options[:training_args][:batch_size]
+        batch_size ||= model_options[:batch_size]
+        batch_size ||= 1
+
+        shuffle = model_options[:training_args][:shuffle]
+        shuffle = true if shuffle.nil?
+
+        num_workers = Rbbt.config(:num_workers, :dataloader, :default => 2)
+        train_loader = RbbtPython.run :torch do
+          dataset = features.zip(labels).collect{|f,l| [torch.tensor(f), l] }
+          torch.utils.data.DataLoader.call(dataset, batch_size: batch_size, shuffle: shuffle, num_workers: num_workers.to_i)
         end
       end
-      trainer.fit(model,
+      trainer.fit(model, train_loader, val_loader)
       TorchModel.save_architecture(model, model_path) if @directory
       TorchModel.save_state(model, model_path) if @directory
     end
+
+    eval_model do |features,list=false|
+      model = init
+      eval_loader = self.loader
+      if list
+        if eval_loader.nil?
+          batch_size ||= model_options[:batch_size]
+          batch_size ||= model_options[:training_args][:batch_size]
+          batch_size ||= 1
+
+          num_workers = Rbbt.config(:num_workers, :dataloader, :default => 2)
+          eval_loader = RbbtPython.run :torch do
+            dataset = torch.tensor(features)
+            torch.utils.data.DataLoader.call(dataset, batch_size: batch_size, num_workers: num_workers.to_i)
+          end
+        end
+        trainer.predict(model, eval_loader).inject([]){|acc,res| acc.concat RbbtPython.numpy2ruby(res[1])}
+      else
+        model.call(torch.tensor(features))
+      end
+    end
   end

   def trainer
     @trainer ||= begin
-
-
+      trainer_args = {default_root_dir: File.join(@directory, 'checkpoints')}.
+        merge(model_options[:training_args].except(:batch_size))
+      RbbtPython.class_new_obj("pytorch_lightning", "Trainer", trainer_args)
     end
   end
 end
data/lib/rbbt/vector/model/torch/dataloader.rb
CHANGED
@@ -42,13 +42,12 @@ class TorchModel
   end

   def self.text_dataset(tsv_dataset_file, elements, labels = nil, class_labels = nil)
-    elements = elements.collect{|e| e.gsub("\n", ' ') }
+    elements = elements.compact.collect{|e| e.gsub("\n", ' ').gsub('"', '\'') }
     tsv = feature_tsv(elements, labels, class_labels)
+    tsv.fields[0] = "text"
     if labels.nil?
-      tsv
-      tsv.type = :single
+      tsv = tsv.to_single
     else
-      tsv.fields[0] = "text"
       tsv.type = :list
     end
     Open.write(tsv_dataset_file, tsv.to_s)
data/lib/rbbt/vector/model/torch/helpers.rb
CHANGED
@@ -3,9 +3,27 @@ class TorchModel
     def to_ruby
       RbbtPython.numpy2ruby(self)
     end
+
+    def to_ruby!
+      r = self.to_ruby
+      self.del
+      r
+    end
+
+    def length
+      PyCall.len(self)
+    end
+
     def self.setup(obj)
       obj.extend Tensor
     end
+
+    def del
+      self.detach
+      self.grad = nil
+      self.storage.resize_ 0
+      self.to("cpu")
+    end
   end

   def self.init_python
@@ -46,7 +64,7 @@ class TorchModel
   end

   def self.tensor(obj, device, dtype)
-    RbbtPython.torch.tensor(obj, dtype: dtype, device: device)
+    TorchModel::Tensor.setup(RbbtPython.torch.tensor(obj, dtype: dtype, device: device))
   end

 end
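A sketch of the new `Tensor` helpers in use; the dtype constant is an assumption (any torch dtype reachable through `RbbtPython` would do):

    tensor = TorchModel.tensor([1.0, 2.0], "cpu", RbbtPython.torch.float)
    values = tensor.to_ruby!  # numpy2ruby conversion, then #del releases grad and storage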
data/lib/rbbt/vector/model/torch/introspection.rb
CHANGED
@@ -13,19 +13,20 @@ class TorchModel
   end
   def get_weights(...); TorchModel.get_weights(model, ...); end

-  def self.freeze(layer)
+  def self.freeze(layer, requires_grad=false)
     begin
-      PyCall.getattr(layer, :weight).requires_grad =
+      PyCall.getattr(layer, :weight).requires_grad = requires_grad
     rescue
     end
     RbbtPython.iterate(layer.children) do |layer|
-      freeze(layer)
+      freeze(layer, requires_grad)
     end
   end
-
+
+  def self.freeze_layer(model, layer, requires_grad = false)
     layer = get_layer(model, layer)
-    freeze(layer)
+    freeze(layer, requires_grad)
   end
-  def freeze_layer(...); TorchModel.freeze_layer(model, ...); end

+  def freeze_layer(...); TorchModel.freeze_layer(model, ...); end
 end
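The `requires_grad` flag now recurses through `layer.children`, so a subtree can be frozen and later unfrozen. A hedged sketch (the layer name is hypothetical):

    model.freeze_layer 'encoder'        # requires_grad defaults to false: freeze
    model.freeze_layer 'encoder', true  # pass true to re-enable gradients on the same subtree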
data/lib/rbbt/vector/model/torch/load_and_save.rb
CHANGED
@@ -27,4 +27,10 @@ class TorchModel
     Log.debug "Loading model architecture from #{model_architecture}"
     RbbtPython.torch.load(model_architecture)
   end
+
+  def reset_model
+    @trainer = @model = nil
+    Open.rm_rf model_path
+    Open.rm_rf TorchModel.model_architecture(model_path)
+  end
 end
data/lib/rbbt/vector/model/torch.rb
CHANGED
@@ -2,47 +2,37 @@ require_relative 'python'

 class TorchModel < PythonModel

-  attr_accessor :
+  attr_accessor :criterion, :optimizer

   def initialize(...)
     TorchModel.init_python
     super(...)
-    @training_args = model_options[:training_args] || {}

+    @model_options[:training_options] = @model_options.delete(:training_args) if @model_options.include?(:training_args)
+    training_args = IndiferentHash.pull_keys(@model_options, :training) || {}
+    @model_options[:training_args] = training_args
     init_model do
       model = TorchModel.load_architecture(model_path)
       if model.nil?
         RbbtPython.add_path @directory
-        RbbtPython.
+        RbbtPython.process_paths
+        RbbtPython.class_new_obj(@python_module, @python_class, **model_options.except(:training_args, :batch_size))
       else
         TorchModel.load_state(model, model_path)
       end
     end

-    eval_model do |features,list=false|
-      init
-      @device ||= TorchModel.device(model_options)
-      @dtype ||= TorchModel.dtype(model_options)
-      model.to(@device)
-
-      tensor = list ? TorchModel.tensor(features, @device, @dtype) : TorchModel.tensor([features], @device, @dtype)
-
-      loss, res = model.call(tensor)
-
-      res = loss if res.nil?
-
-      res = TorchModel::Tensor.setup(list ? res : res[0])
-
-      res
-    end
-
     train_model do |features,labels|
       init
       @device ||= TorchModel.device(model_options)
       @dtype ||= TorchModel.dtype(model_options)
       model.to(@device)
-      @optimizer ||= TorchModel.optimizer(model, training_args)
-
+      @optimizer ||= TorchModel.optimizer(model, model_options[:training_args] || {})
+
+      epochs = model_options[:training_args][:epochs] || 3
+      batch_size = model_options[:batch_size]
+      batch_size ||= model_options[:training_args][:batch_size]
+      batch_size ||= 1

       inputs = TorchModel.tensor(features, @device, @dtype)
       #target = TorchModel.tensor(labels.collect{|v| [v] }, @device, @dtype)
@@ -63,6 +53,39 @@ class TorchModel < PythonModel
       TorchModel.save_architecture(model, model_path) if @directory
       TorchModel.save_state(model, model_path) if @directory
     end
+
+    eval_model do |features,list=false|
+      init
+      @device ||= TorchModel.device(model_options)
+      @dtype ||= TorchModel.dtype(model_options)
+      model.to(@device)
+      model.eval
+
+      features = [features] unless list
+
+      batch_size = model_options[:batch_size]
+      batch_size ||= model_options[:training_args][:batch_size]
+      batch_size ||= 1
+
+      res = Misc.chunk(features, batch_size).inject(nil) do |acc,batch|
+        tensor = TorchModel.tensor(batch, @device, @dtype)
+
+        loss, chunk_res = model.call(tensor)
+        tensor.del
+
+        chunk_res = loss if chunk_res.nil?
+
+        TorchModel::Tensor.setup(chunk_res)
+        acc = acc.nil? ? chunk_res.to_ruby! : acc + chunk_res.to_ruby!
+
+        acc
+      end
+
+      res = TorchModel::Tensor.setup(list ? res : res[0])
+
+      res
+    end
+
   end
 end
 require_relative 'torch/helpers'
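Training options now use a `training_` prefix collected into `@model_options[:training_args]`; a hedged sketch of the assumed `IndiferentHash.pull_keys` behavior:

    opts = IndiferentHash.setup(:training_epochs => 5, :training_batch_size => 2, :other => 1)
    IndiferentHash.pull_keys opts, :training  # => {"epochs" => 5, "batch_size" => 2}
    opts                                      # => {:other => 1}; the training_* keys are consumed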
data/lib/rbbt/vector/model.rb
CHANGED
@@ -3,7 +3,7 @@ require 'rbbt/vector/model/util'
 require 'rbbt/util/python'

 RbbtPython.add_path Rbbt.python.find(:lib)
-RbbtPython.init_rbbt
+#RbbtPython.init_rbbt

 class VectorModel
   attr_accessor :directory, :model_path, :extract_features, :init_model, :train_model, :eval_model, :post_process, :balance
@@ -166,8 +166,9 @@ cat(paste(label, sep="\\n", collapse="\\n"));
     @options_file = File.join(@directory, "options.json")

     if File.exist?(@options_file)
-
-      IndiferentHash.setup(
+      file_options = JSON.parse(Open.read(@options_file))
+      IndiferentHash.setup(file_options)
+      @model_options = file_options.deep_merge(@model_options)
     end
   end

@@ -254,8 +255,24 @@ cat(paste(label, sep="\\n", collapse="\\n"));

   def add_list(elements, labels = nil)
     if @extract_features.nil? || @extract_features.arity == 1
-
-
+      case labels
+      when nil
+        elements.each do |elem|
+          add(elem)
+        end
+      when Array
+        elements.zip(labels).each do |elem,label|
+          add(elem, label)
+        end
+      when Hash
+        elements.each do |elem|
+          label = labels[elem]
+          add(elem, label)
+        end
+      else
+        elements.each do |elem|
+          add(elem, labels)
+        end
       end
     else
       features = self.instance_exec(nil, elements, &@extract_features)
@@ -482,8 +499,11 @@ cat(paste(label, sep="\\n", collapse="\\n"));
       @labels = orig_labels
     end unless folds == -1

-
-
+    if folds != 1
+      self.reset_model if self.respond_to? :reset_model
+      self.train
+    end
+
     res
   end
 end
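`add_list` now accepts labels in any of the four shapes handled by the case statement above; sketched usage:

    model.add_list %w(a b)                               # no labels
    model.add_list %w(a b), %w(good bad)                 # parallel Array
    model.add_list %w(a b), "a" => "good", "b" => "bad"  # Hash keyed by element
    model.add_list %w(a b), "good"                       # one label for every element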
data/python/rbbt_dm/__init__.py
CHANGED
@@ -27,5 +27,9 @@ def tsv_dataset(filename, *args, **kwargs):
 def tsv(*args, **kwargs):
     return tsv_dataset(*args, **kwargs)

+def tsv_loader(*args, **kwargs):
+    dataset = tsv(*args, kwargs)
+    return torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)
+
 def data_dir():
     return rbbt.path('var/rbbt_dm/data')
data/python/rbbt_dm/huggingface.py
CHANGED
@@ -15,15 +15,15 @@ def load_model(task, checkpoint, **kwargs):
         return import_module_class(module, class_name).from_pretrained(checkpoint, **kwargs)
     else:
         class_name = 'AutoModelFor' + task
-        return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+        return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)

-def load_tokenizer(
+def load_tokenizer(checkpoint, **kwargs):
     class_name = 'AutoTokenizer'
     return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)

 def load_model_and_tokenizer(task, checkpoint):
     model = load_model(task, checkpoint)
-    tokenizer = load_tokenizer(
+    tokenizer = load_tokenizer(checkpoint)
     return model, tokenizer

 # Not used
@@ -88,6 +88,9 @@ def training_args(*args, **kwargs):
 def train_model(model, tokenizer, training_args, dataset, class_weights=None, **kwargs):
     from transformers import Trainer

+    # Note: Parameters need to be made contiguous. I'm not sure why they weren't
+    for param in model.parameters(): param.data = param.data.contiguous()
+
     if (isinstance(dataset, str)):
         if (dataset.endswith('.json')):
             tokenized_dataset = json_dataset(tokenizer, dataset)
data/share/R/MA.R
CHANGED
@@ -99,6 +99,8 @@ rbbt.dm.matrix.differential.limma <- function(data, main, contrast=NULL, log2=NU
     }

     if (log2){
+        full_rows = apply(is.na(data), 1, sum) == 0
+        data = data[full_rows,]
        cutoff <- 1
        drop <- which(apply(data, 1, max) < cutoff)
        min = min(data[data != -Inf])
@@ -106,7 +108,8 @@ rbbt.dm.matrix.differential.limma <- function(data, main, contrast=NULL, log2=NU
        data <- DGEList(data)
        data <- calcNormFactors(data)
        data = cpm(data, log=TRUE, prior.count=3)
-
+        if (length(drop) > 0)
+            data <- data[-drop,]
     }else{
        data[data == 0] = NA
        good.rows = apply(is.na(data),1,sum) != dim(data)[2]
@@ -181,10 +184,11 @@ rbbt.dm.matrix.differential <- function(file, main, contrast = NULL, type = 'lim
        contrast <- make.names(contrast);
     }

-    if (type == 'limma')
+    if (is.null(type) || type == 'limma'){
        result = rbbt.dm.matrix.differential.limma(data, main, contrast, log2, two.channel, eBayes.trend)
-    else
+    }else{
        result = rbbt.dm.matrix.differential.DESeq(data, main, contrast)
+    }

     if (is.null(outfile)){
        return(result);
data/test/rbbt/statistics/test_fdr.rb
CHANGED
@@ -32,8 +32,6 @@ class TestFDR < Test::Unit::TestCase
     assert_equal(clean(@r_adj), clean(FDR.adjust_native(@values)))
     assert_equal(clean(FDR.adjust_fast(@values)), clean(FDR.adjust_native(@values)))

-    assert_equal(clean(@r_adj), clean(FDR.adjust_fast_self(copy(@values)))) if RUBY_VERSION[0]
+    assert_equal(clean(@r_adj), clean(FDR.adjust_fast_self(copy(@values)))) if RUBY_VERSION[0].to_i < 2
   end
 end
-
-
data/test/rbbt/statistics/test_hypergeometric.rb
CHANGED
@@ -4,6 +4,11 @@ require 'test/unit'

 class TestHypergeometric < Test::Unit::TestCase

+  def test_hypergeometric_c
+    assert_equal Hypergeometric.hypergeometric_c(2, 1, 1, 1).round(2), 0.5
+    assert_equal Hypergeometric.hypergeometric_c(10, 1, 1, 1).round(2), 0.1
+  end
+
   def test_hypergeometric
     assert Hypergeometric.hypergeometric(100, 20, 15, 13) < 0.0005
   end
data/test/rbbt/vector/model/test_huggingface.rb
CHANGED
@@ -9,12 +9,8 @@ class TestHuggingface < Test::Unit::TestCase
     task = "SequenceClassification"

     model = HuggingfaceModel.new task, checkpoint, dir, :class_labels => %w(bad good)
-    iii model.eval "This is dog"
-    iii model.eval "This is cat"
-    iii model.eval_list(["This is dog", "This is cat"])

     model = VectorModel.new dir
-    iii model.eval_list(["This is dog", "This is cat"])
   end
 end

@@ -42,7 +38,7 @@ class TestHuggingface < Test::Unit::TestCase
     assert_equal 5, tokenizer.call("This is a sentence that has several words", truncation: true)["input_ids"].__len__
   end

-  def
+  def _test_sst_eval
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

@@ -55,12 +51,29 @@ class TestHuggingface < Test::Unit::TestCase
     end
   end

+  def _test_sst_logits
+    TmpFile.with_file do |dir|
+      checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+
+      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, :tokenizer_args => {:max_length => 16}
+
+      model.model_options[:class_labels] = ["Bad", "Good"]
+      model.model_options[:return_logits] = true
+
+      logits = model.eval("This is dog")
+      assert logits[0] > logits[1]
+      logits = model.eval_list(["This is dog", "This is cat"])
+      assert logits[0][0] > logits[0][1]
+      assert logits[1][0] < logits[1][1]
+    end
+  end
+

-  def
+  def test_sst_train
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

-      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, max_length: 128
+      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, tokenizer_args:{max_length: 128}, tokenizer_padding: true, tokenizer_truncation: true

       model.model_options[:class_labels] = %w(Bad Good)

@@ -148,12 +161,12 @@ class TestHuggingface < Test::Unit::TestCase

     model = VectorModel.new dir

-    assert_equal "Good", model.eval_list("This is dog")
+    assert_equal ["Good"], model.eval_list(["This is dog"])

   end
 end

-  def
+  def __test_sst_stress_test
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

@@ -252,6 +265,7 @@ class RobertaForTokenClassification_NER(RobertaPreTrainedModel):
 EOF

     RbbtPython.add_path dir
+    RbbtPython.process_paths

     biomedical_roberta = "PlanTL-GOB-ES/bsc-bio-ehr-es-cantemist"
     model = HuggingfaceModel.new "mypkg.mymodel:RobertaForTokenClassification_NER", biomedical_roberta
data/test/rbbt/vector/model/test_pytorch_lightning.rb
CHANGED
@@ -88,9 +88,13 @@ class TestPytorchLightningModel(pl.LightningModule):
       res = model.eval_list([[10.0], [11.2], [14.3]])
       assert_equal 3, RbbtPython.numpy2ruby(res).length

+      orig_res = res
       model = VectorModel.new dir
       model.init
-
+      res = model.eval([10.0])
+      res = model.eval_list([[10.0], [11.2], [14.3]])
+      assert_equal 3, RbbtPython.numpy2ruby(res).length
+      assert_equal orig_res, res
     end
   end
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-dm
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.3.2
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2025-01-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -174,7 +174,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.5.
+rubygems_version: 3.5.23
 signing_key:
 specification_version: 4
 summary: Data-mining and statistics