rbbt-dm 1.3.0 → 1.3.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f9b8071884e4e9d7a8c04f175fe262aad9e2b77911dca787a957a5c5f797fb9b
4
- data.tar.gz: 1c7334d62036d3ae07b7f625b310f401b5078022f909be34cd78bb66c5b2af06
3
+ metadata.gz: 904f77b8390128686b8cf153e517aff21394bd43548b8116e0d28188924a833e
4
+ data.tar.gz: d05f1712851cb5c552cfedac2166abb45d508cad6abbf493b41f5becde0e570c
5
5
  SHA512:
6
- metadata.gz: 22c73d01543e93a2a7b10ecaa88db9a663b35c8264b6d0e5e9d4b00096f34955250105dec4787242529c594c1a959feb23a4b5cd46298850eee7a813dc551d0f
7
- data.tar.gz: 545663b2ee93dd0e6e6b54e353cb3bfafab9001c7031b42e7f895fb95ea85ffb6c1dcdb54bb671ee5cace49561cca018212e25ee43592b457e4e1abe83277076
6
+ metadata.gz: b762389ed54ce7a91da87258f2ee856d04b4c1fef73894ac6c0e6219423967bfb42b89430f952b62d609a38c1acd2935ca4d424c35215fcc987e8af24f1fde3d
7
+ data.tar.gz: d3248d9996ff5f1298203d7d69595cb8f0a0dd037a4a666f0631d106b785f3401f56e5bc0ef2883e32766d1792ece1e8807a0c6d3bd7e5ecd174fca8fc698dc3
@@ -64,7 +64,7 @@ data = rbbt.dm.matrix.differential(#{ R.ruby2R data_file },
64
64
  )
65
65
  EOS
66
66
 
67
- R.run(cmd, :monitor => true)
67
+ R.run(cmd, :monitor => true)
68
68
  end
69
69
  end
70
70
  end
data/lib/rbbt/matrix.rb CHANGED
@@ -12,6 +12,7 @@ class RbbtMatrix
12
12
 
13
13
  attr_accessor :data_file, :labels, :value_type, :format, :organism, :identifiers
14
14
  def initialize(data_file, labels = nil, value_type = nil, format = nil, organism=nil, identifiers=nil)
15
+ data_file = data_file.find if Path === data_file
15
16
  @data_file = data_file
16
17
  @labels = labels
17
18
  @value_type = value_type || 'count'
@@ -42,7 +43,7 @@ class RbbtMatrix
42
43
  end
43
44
 
44
45
  def samples
45
- @samples ||= TSV.parse_header(@data_file).fields
46
+ @samples ||= TSV.parse_header(@data_file)[:fields]
46
47
  end
47
48
 
48
49
  def subsets=(subsets)
@@ -181,9 +182,14 @@ class RbbtMatrix
181
182
 
182
183
  identifiers = [identifiers, @identifiers, data.identifiers, Organism.identifiers(organism)].flatten.compact.uniq
183
184
 
184
- data.change_key("Ensembl Gene ID", :identifiers => identifiers.reverse) do |v|
185
+ new_data = data.change_key("Ensembl Gene ID", :identifiers => identifiers.reverse) do |v|
185
186
  Misc.mean(v.compact)
186
187
  end
188
+
189
+ new_data.delete ""
190
+ new_data.delete nil
191
+
192
+ new_data
187
193
  end
188
194
  subsets = self.subsets
189
195
  matrix = RbbtMatrix.new file, labels, value_type, "Ensembl Gene ID", organism
@@ -202,9 +208,14 @@ class RbbtMatrix
202
208
 
203
209
  identifiers = [identifiers, @identifiers, data.identifiers, Organism.identifiers(organism)].flatten.compact.uniq
204
210
 
205
- data.change_key("Associated Gene Name", :identifiers => identifiers.reverse) do |v|
211
+ new_data = data.change_key("Associated Gene Name", :identifiers => identifiers.reverse) do |v|
206
212
  Misc.mean(v.compact)
207
213
  end
214
+
215
+ new_data.delete ""
216
+ new_data.delete nil
217
+
218
+ new_data
208
219
  end
209
220
  subsets = self.subsets
210
221
  matrix = RbbtMatrix.new file, labels, value_type, "Associated Gene Name", organism
data/lib/rbbt/stan.rb CHANGED
@@ -1,5 +1,4 @@
1
1
  require 'rbbt/util/R'
2
- require 'mkfifo'
3
2
 
4
3
  module STAN
5
4
 
@@ -88,7 +87,7 @@ data{
88
87
  end
89
88
 
90
89
  def self.exec(data, model, input_directory, parameter_chains, sample_file, debug = FALSE, stan_options = {})
91
- stan_options = Misc.add_defaults stan_options, :iter => 1000, :warmup => 500, :chains => 1, :seed => 2887, :refresh => 1200
90
+ stan_options = IndiferentHash.add_defaults stan_options, :iter => 1000, :warmup => 500, :chains => 1, :seed => 2887, :refresh => 1200
92
91
 
93
92
  data = {} if data.nil?
94
93
 
@@ -123,7 +122,7 @@ print(fit)
123
122
 
124
123
  def self.stream_chain(data, model, directory = nil, options = {})
125
124
  options, directory = directory, nil if Hash === directory
126
- debug = Misc.process_options options, :debug
125
+ debug = IndiferentHash.process_options options, :debug
127
126
 
128
127
  if directory.nil?
129
128
  directory = TmpFile.tmp_file
@@ -178,7 +177,7 @@ print(fit)
178
177
  end
179
178
 
180
179
  def self.run(data, model, directory, options = {})
181
- debug = Misc.process_options options, :debug
180
+ debug = IndiferentHash.process_options options, :debug
182
181
 
183
182
  input_directory = File.join(directory, 'inputs')
184
183
 
@@ -172,7 +172,8 @@ module FDR
172
172
  values << p[1]
173
173
  }
174
174
 
175
- if RUBY_VERSION[0] == "2"
175
+ iii RUBY_VERSION[0]
176
+ if RUBY_VERSION[0] == "2" || RUBY_VERSION[0] == "3"
176
177
  new_values = FDR.adjust(values)
177
178
  keys.zip(new_values).each do |k,v|
178
179
  vs = data[k]
@@ -195,8 +196,4 @@ module FDR
195
196
  data.unnamed = unnamed if unnamed
196
197
  end
197
198
  end
198
-
199
199
  end
200
-
201
-
202
-
@@ -64,10 +64,10 @@ double lBinom(double n, double k)
64
64
  builder.c_singleton <<-EOC
65
65
  /**
66
66
  * * Compute the Hypergeometric accumulated value.
67
- * * @param total => total size
68
- * * @param support => total support
69
- * * @param list => selected list size
70
- * * @param found => support
67
+ * * @param total => Balls in urn
68
+ * * @param support => Positive balls in urn
69
+ * * @param list => Drawn balls
70
+ * * @param found => Positive drawn balls
71
71
  * * @return The result
72
72
  * */
73
73
  //pvalues[annotation] = Hypergeometric.hypergeometric(tsv_size, counts[annotation], total, count)
@@ -102,10 +102,13 @@ double hypergeometric_c(double total, double support, double list, double found)
102
102
  EOC
103
103
  end
104
104
 
105
- def self.hypergeometric(count, positive, negative, total)
106
- #RSRuby.instance.phyper(count - 1, positive, negative, total, false).to_f
105
+ def self.hypergeometric_R(count, positive, negative, total)
107
106
  R.eval("phyper(#{ count } - 1, #{ positive }, #{ negative }, #{ total }, lower.tail=FALSE)").to_f
108
107
  end
108
+
109
+ def self.hypergeometric(count, positive, negative, total)
110
+ hypergeometric_c(positive + negative, positive, total, count)
111
+ end
109
112
  end
110
113
 
111
114
  module TSV
@@ -260,7 +263,8 @@ module TSV
260
263
  elems = elems.collect{|elem| rename.include?(elem)? rename[elem] : elem }.compact.uniq if rename
261
264
  count = elems.length
262
265
  next if count < options[:min_support] or not counts.include? annotation
263
- pvalues[annotation] = Hypergeometric.hypergeometric(count, counts[annotation], tsv_size - counts[annotation], total)
266
+ #pvalues[annotation] = Hypergeometric.hypergeometric(count, counts[annotation], tsv_size - counts[annotation], total)
267
+ pvalues[annotation] = Hypergeometric.hypergeometric_c(tsv_size, counts[annotation], total, count)
264
268
  end
265
269
 
266
270
  pvalues = FDR.adjust_hash! pvalues if options[:fdr]
@@ -268,7 +272,7 @@ module TSV
268
272
  pvalues.delete_if{|k, pvalue| pvalue > options[:cutoff] } if options[:cutoff]
269
273
 
270
274
  if add_keys
271
- tsv = TSV.setup(pvalues.keys.collect{|k| k.dup}, :key_field => fields, :fields => [], :type => :double)
275
+ tsv = TSV.setup(pvalues.keys.collect{|k| k.dup }, :key_field => fields, :fields => [], :type => :double)
272
276
 
273
277
  tsv.add_field 'p-value' do |annot, values|
274
278
  [pvalues[annot]]
@@ -2,23 +2,39 @@ require 'rbbt/vector/model/torch'
2
2
 
3
3
  class HuggingfaceModel < TorchModel
4
4
 
5
+ attr_accessor :tokenizer
6
+ def init
7
+ @model, @tokenizer = self.instance_exec(&@init_model) if @model.nil?
8
+ [@model, @tokenizer]
9
+ end
10
+
11
+ def tokenizer
12
+ init
13
+ @tokenizer
14
+ end
15
+
5
16
  def initialize(task, checkpoint, dir = nil, model_options = {})
6
17
  super(dir, nil, model_options)
7
18
 
8
19
  checkpoint = checkpoint.find if Path === checkpoint
9
20
 
10
- @model_options = Misc.add_defaults @model_options, :task => task, :checkpoint => checkpoint
21
+ @model_options[:tokenizer_options] = @model_options.delete(:tokenizer_args) if @model_options.include?(:tokenizer_args)
22
+ tokenizer_args = IndiferentHash.pull_keys @model_options, :tokenizer
23
+ @model_options[:tokenizer_args] = tokenizer_args
24
+
25
+ @model_options[:task] = task if task
26
+ @model_options[:checkpoint] = checkpoint if checkpoint
11
27
 
12
28
  init_model do
13
29
  checkpoint = @model_path && File.directory?(@model_path) ? @model_path : @model_options[:checkpoint]
14
30
 
15
31
  model = RbbtPython.call_method("rbbt_dm.huggingface", :load_model,
16
- @model_options[:task], checkpoint, **(IndiferentHash.setup(model_options[:model_args]) || {}))
32
+ @model_options[:task], checkpoint, **(IndiferentHash.setup(@model_options.except(:training_args, :tokenizer_args, :task, :checkpoint, :class_labels))))
17
33
 
18
- tokenizer_checkpoint = @model_options[:tokenizer_checkpoint] || checkpoint
34
+ tokenizer_checkpoint = @model_options[:tokenizer_args][:checkpoint] || checkpoint
19
35
 
20
36
  tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_tokenizer,
21
- @model_options[:task], tokenizer_checkpoint, **(IndiferentHash.setup(model_options[:tokenizer_args]) || {}))
37
+ tokenizer_checkpoint, **(IndiferentHash.setup(@model_options[:tokenizer_args])))
22
38
 
23
39
  [model, tokenizer]
24
40
  end
@@ -46,7 +62,7 @@ class HuggingfaceModel < TorchModel
46
62
  end
47
63
 
48
64
  dataset_file = TorchModel.text_dataset(tsv_file, texts)
49
- training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])
65
+ training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, {})
50
66
 
51
67
  begin
52
68
  RbbtPython.call_method("rbbt_dm.huggingface", :predict_model, model, tokenizer, training_args_obj, dataset_file, locate_tokens)
@@ -71,7 +87,7 @@ class HuggingfaceModel < TorchModel
71
87
  checkpoint_dir = File.join(tmpdir, 'checkpoints')
72
88
  end
73
89
 
74
- training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])
90
+ training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, training_args)
75
91
  dataset_file = HuggingfaceModel.text_dataset(tsv_file, texts, labels, @model_options[:class_labels])
76
92
 
77
93
  RbbtPython.call_method("rbbt_dm.huggingface", :train_model, model, tokenizer, training_args_obj, dataset_file, @model_options[:class_weights])
@@ -96,43 +112,47 @@ class HuggingfaceModel < TorchModel
96
112
  predictions = result["logits"]
97
113
  end
98
114
 
99
- task, class_labels, locate_tokens = @model_options.values_at :task, :class_labels, :locate_tokens
100
- result = case task
101
- when "SequenceClassification"
102
- RbbtPython.collect(predictions) do |logits|
103
- logits = RbbtPython.numpy2ruby logits
104
- best_class = logits.index logits.max
105
- best_class = class_labels[best_class] if class_labels
106
- best_class
107
- end
108
- when "MaskedLM"
109
- all_token_positions = token_positions.to_a
110
-
111
- i = 0
112
- RbbtPython.collect(predictions) do |item_logits|
113
- item_token_positions = all_token_positions[i]
114
- i += 1
115
-
116
- item_logits = RbbtPython.numpy2ruby(item_logits)
117
- item_masks = item_token_positions.collect do |token_positions|
118
-
119
- best = item_logits.values_at(*token_positions).collect do |logits|
120
- best_token, best_score = nil
121
- logits.each_with_index do |v,i|
122
- if best_score.nil? || v > best_score
123
- best_token, best_score = i, v
124
- end
125
- end
126
- best_token
127
- end
128
-
129
- best.collect{|b| tokenizer.decode(b) } * "|"
130
- end
131
- Array === locate_tokens ? item_masks : item_masks.first
132
- end
133
- else
134
- predictions
135
- end
115
+ if @model_options[:return_logits]
116
+ result = RbbtPython.numpy2ruby(predictions)
117
+ else
118
+ task, class_labels, locate_tokens = @model_options.values_at :task, :class_labels, :locate_tokens
119
+ result = case task
120
+ when "SequenceClassification"
121
+ RbbtPython.collect(predictions) do |logits|
122
+ logits = RbbtPython.numpy2ruby logits
123
+ best_class = logits.index logits.max
124
+ best_class = class_labels[best_class] if class_labels
125
+ best_class
126
+ end
127
+ when "MaskedLM"
128
+ all_token_positions = token_positions.to_a
129
+
130
+ i = 0
131
+ RbbtPython.collect(predictions) do |item_logits|
132
+ item_token_positions = all_token_positions[i]
133
+ i += 1
134
+
135
+ item_logits = RbbtPython.numpy2ruby(item_logits)
136
+ item_masks = item_token_positions.collect do |token_positions|
137
+
138
+ best = item_logits.values_at(*token_positions).collect do |logits|
139
+ best_token, best_score = nil
140
+ logits.each_with_index do |v,i|
141
+ if best_score.nil? || v > best_score
142
+ best_token, best_score = i, v
143
+ end
144
+ end
145
+ best_token
146
+ end
147
+
148
+ best.collect{|b| tokenizer.decode(b) } * "|"
149
+ end
150
+ Array === locate_tokens ? item_masks : item_masks.first
151
+ end
152
+ else
153
+ predictions
154
+ end
155
+ end
136
156
 
137
157
  (! is_list || single) && Array === result ? result.first : result
138
158
  end
@@ -144,6 +164,7 @@ class HuggingfaceModel < TorchModel
144
164
  def reset_model
145
165
  @model, @tokenizer = nil
146
166
  Open.rm_rf @model_path
167
+ Open.rm_rf TorchModel.model_architecture(model_path)
147
168
  init
148
169
  end
149
170
  end
@@ -2,7 +2,7 @@ require 'rbbt/vector/model'
2
2
  require 'rbbt/util/python'
3
3
 
4
4
  RbbtPython.add_path Rbbt.python.find(:lib)
5
- RbbtPython.init_rbbt
5
+ #RbbtPython.init_rbbt
6
6
 
7
7
  class PythonModel < VectorModel
8
8
  attr_accessor :python_class, :python_module
@@ -7,25 +7,54 @@ class PytorchLightningModel < TorchModel
7
7
 
8
8
  train_model do |features,labels|
9
9
  model = init
10
- loader = self.loader
10
+ train_loader = self.loader
11
11
  val_loader = self.val_loader
12
- if (features && features.any?) && loader.nil?
13
- TmpFile.with_file do |tsv_dataset_file|
14
- TorchModel.feature_dataset(tsv_dataset_file, features, labels)
15
- RbbtPython.pyimport :rbbt_dm
16
- loader = RbbtPython.rbbt_dm.tsv(tsv_dataset_file)
12
+ if train_loader.nil?
13
+ batch_size ||= model_options[:training_args][:batch_size]
14
+ batch_size ||= model_options[:batch_size]
15
+ batch_size ||= 1
16
+
17
+ shuffle = model_options[:training_args][:shuffle]
18
+ shuffle = true if shuffle.nil?
19
+
20
+ num_workers = Rbbt.config(:num_workers, :dataloader, :default => 2)
21
+ train_loader = RbbtPython.run :torch do
22
+ dataset = features.zip(labels).collect{|f,l| [torch.tensor(f), l] }
23
+ torch.utils.data.DataLoader.call(dataset, batch_size: batch_size, shuffle: shuffle, num_workers: num_workers.to_i)
17
24
  end
18
25
  end
19
- trainer.fit(model, loader, val_loader)
26
+ trainer.fit(model, train_loader, val_loader)
20
27
  TorchModel.save_architecture(model, model_path) if @directory
21
28
  TorchModel.save_state(model, model_path) if @directory
22
29
  end
30
+
31
+ eval_model do |features,list=false|
32
+ model = init
33
+ eval_loader = self.loader
34
+ if list
35
+ if eval_loader.nil?
36
+ batch_size ||= model_options[:batch_size]
37
+ batch_size ||= model_options[:training_args][:batch_size]
38
+ batch_size ||= 1
39
+
40
+ num_workers = Rbbt.config(:num_workers, :dataloader, :default => 2)
41
+ eval_loader = RbbtPython.run :torch do
42
+ dataset = torch.tensor(features)
43
+ torch.utils.data.DataLoader.call(dataset, batch_size: batch_size, num_workers: num_workers.to_i)
44
+ end
45
+ end
46
+ trainer.predict(model, eval_loader).inject([]){|acc,res| acc.concat RbbtPython.numpy2ruby(res[1])}
47
+ else
48
+ model.call(torch.tensor(features))
49
+ end
50
+ end
23
51
  end
24
52
 
25
53
  def trainer
26
54
  @trainer ||= begin
27
- options = @model_options[:training_args] || @model_options[:trainer_args]
28
- RbbtPython.class_new_obj("pytorch_lightning", "Trainer", options || {})
55
+ trainer_args = {default_root_dir: File.join(@directory, 'checkpoints')}.
56
+ merge(model_options[:training_args].except(:batch_size))
57
+ RbbtPython.class_new_obj("pytorch_lightning", "Trainer", trainer_args)
29
58
  end
30
59
  end
31
60
  end
@@ -42,13 +42,12 @@ class TorchModel
42
42
  end
43
43
 
44
44
  def self.text_dataset(tsv_dataset_file, elements, labels = nil, class_labels = nil)
45
- elements = elements.collect{|e| e.gsub("\n", ' ') }
45
+ elements = elements.compact.collect{|e| e.gsub("\n", ' ').gsub('"', '\'') }
46
46
  tsv = feature_tsv(elements, labels, class_labels)
47
+ tsv.fields[0] = "text"
47
48
  if labels.nil?
48
- tsv.fields[0] = "text"
49
- tsv.type = :single
49
+ tsv = tsv.to_single
50
50
  else
51
- tsv.fields[0] = "text"
52
51
  tsv.type = :list
53
52
  end
54
53
  Open.write(tsv_dataset_file, tsv.to_s)
@@ -3,9 +3,27 @@ class TorchModel
3
3
  def to_ruby
4
4
  RbbtPython.numpy2ruby(self)
5
5
  end
6
+
7
+ def to_ruby!
8
+ r = self.to_ruby
9
+ self.del
10
+ r
11
+ end
12
+
13
+ def length
14
+ PyCall.len(self)
15
+ end
16
+
6
17
  def self.setup(obj)
7
18
  obj.extend Tensor
8
19
  end
20
+
21
+ def del
22
+ self.detach
23
+ self.grad = nil
24
+ self.storage.resize_ 0
25
+ self.to("cpu")
26
+ end
9
27
  end
10
28
 
11
29
  def self.init_python
@@ -46,7 +64,7 @@ class TorchModel
46
64
  end
47
65
 
48
66
  def self.tensor(obj, device, dtype)
49
- RbbtPython.torch.tensor(obj, dtype: dtype, device: device)
67
+ TorchModel::Tensor.setup(RbbtPython.torch.tensor(obj, dtype: dtype, device: device))
50
68
  end
51
69
 
52
70
  end
@@ -13,19 +13,20 @@ class TorchModel
13
13
  end
14
14
  def get_weights(...); TorchModel.get_weights(model, ...); end
15
15
 
16
- def self.freeze(layer)
16
+ def self.freeze(layer, requires_grad=false)
17
17
  begin
18
- PyCall.getattr(layer, :weight).requires_grad = false
18
+ PyCall.getattr(layer, :weight).requires_grad = requires_grad
19
19
  rescue
20
20
  end
21
21
  RbbtPython.iterate(layer.children) do |layer|
22
- freeze(layer)
22
+ freeze(layer, requires_grad)
23
23
  end
24
24
  end
25
- def self.freeze_layer(model, layer)
25
+
26
+ def self.freeze_layer(model, layer, requires_grad = false)
26
27
  layer = get_layer(model, layer)
27
- freeze(layer)
28
+ freeze(layer, requires_grad)
28
29
  end
29
- def freeze_layer(...); TorchModel.freeze_layer(model, ...); end
30
30
 
31
+ def freeze_layer(...); TorchModel.freeze_layer(model, ...); end
31
32
  end
@@ -27,4 +27,10 @@ class TorchModel
27
27
  Log.debug "Loading model architecture from #{model_architecture}"
28
28
  RbbtPython.torch.load(model_architecture)
29
29
  end
30
+
31
+ def reset_model
32
+ @trainer = @model = nil
33
+ Open.rm_rf model_path
34
+ Open.rm_rf TorchModel.model_architecture(model_path)
35
+ end
30
36
  end
@@ -2,47 +2,37 @@ require_relative 'python'
2
2
 
3
3
  class TorchModel < PythonModel
4
4
 
5
- attr_accessor :model, :criterion, :optimizer, :training_args
5
+ attr_accessor :criterion, :optimizer
6
6
 
7
7
  def initialize(...)
8
8
  TorchModel.init_python
9
9
  super(...)
10
- @training_args = model_options[:training_args] || {}
11
10
 
11
+ @model_options[:training_options] = @model_options.delete(:training_args) if @model_options.include?(:training_args)
12
+ training_args = IndiferentHash.pull_keys(@model_options, :training) || {}
13
+ @model_options[:training_args] = training_args
12
14
  init_model do
13
15
  model = TorchModel.load_architecture(model_path)
14
16
  if model.nil?
15
17
  RbbtPython.add_path @directory
16
- RbbtPython.class_new_obj(@python_module, @python_class, **model_options)
18
+ RbbtPython.process_paths
19
+ RbbtPython.class_new_obj(@python_module, @python_class, **model_options.except(:training_args, :batch_size))
17
20
  else
18
21
  TorchModel.load_state(model, model_path)
19
22
  end
20
23
  end
21
24
 
22
- eval_model do |features,list=false|
23
- init
24
- @device ||= TorchModel.device(model_options)
25
- @dtype ||= TorchModel.dtype(model_options)
26
- model.to(@device)
27
-
28
- tensor = list ? TorchModel.tensor(features, @device, @dtype) : TorchModel.tensor([features], @device, @dtype)
29
-
30
- loss, res = model.call(tensor)
31
-
32
- res = loss if res.nil?
33
-
34
- res = TorchModel::Tensor.setup(list ? res : res[0])
35
-
36
- res
37
- end
38
-
39
25
  train_model do |features,labels|
40
26
  init
41
27
  @device ||= TorchModel.device(model_options)
42
28
  @dtype ||= TorchModel.dtype(model_options)
43
29
  model.to(@device)
44
- @optimizer ||= TorchModel.optimizer(model, training_args)
45
- epochs = training_args[:epochs] || 3
30
+ @optimizer ||= TorchModel.optimizer(model, model_options[:training_args] || {})
31
+
32
+ epochs = model_options[:training_args][:epochs] || 3
33
+ batch_size = model_options[:batch_size]
34
+ batch_size ||= model_options[:training_args][:batch_size]
35
+ batch_size ||= 1
46
36
 
47
37
  inputs = TorchModel.tensor(features, @device, @dtype)
48
38
  #target = TorchModel.tensor(labels.collect{|v| [v] }, @device, @dtype)
@@ -63,6 +53,39 @@ class TorchModel < PythonModel
63
53
  TorchModel.save_architecture(model, model_path) if @directory
64
54
  TorchModel.save_state(model, model_path) if @directory
65
55
  end
56
+
57
+ eval_model do |features,list=false|
58
+ init
59
+ @device ||= TorchModel.device(model_options)
60
+ @dtype ||= TorchModel.dtype(model_options)
61
+ model.to(@device)
62
+ model.eval
63
+
64
+ features = [features] unless list
65
+
66
+ batch_size = model_options[:batch_size]
67
+ batch_size ||= model_options[:training_args][:batch_size]
68
+ batch_size ||= 1
69
+
70
+ res = Misc.chunk(features, batch_size).inject(nil) do |acc,batch|
71
+ tensor = TorchModel.tensor(batch, @device, @dtype)
72
+
73
+ loss, chunk_res = model.call(tensor)
74
+ tensor.del
75
+
76
+ chunk_res = loss if chunk_res.nil?
77
+
78
+ TorchModel::Tensor.setup(chunk_res)
79
+ acc = acc.nil? ? chunk_res.to_ruby! : acc + chunk_res.to_ruby!
80
+
81
+ acc
82
+ end
83
+
84
+ res = TorchModel::Tensor.setup(list ? res : res[0])
85
+
86
+ res
87
+ end
88
+
66
89
  end
67
90
  end
68
91
  require_relative 'torch/helpers'
@@ -3,7 +3,7 @@ require 'rbbt/vector/model/util'
3
3
  require 'rbbt/util/python'
4
4
 
5
5
  RbbtPython.add_path Rbbt.python.find(:lib)
6
- RbbtPython.init_rbbt
6
+ #RbbtPython.init_rbbt
7
7
 
8
8
  class VectorModel
9
9
  attr_accessor :directory, :model_path, :extract_features, :init_model, :train_model, :eval_model, :post_process, :balance
@@ -166,8 +166,9 @@ cat(paste(label, sep="\\n", collapse="\\n"));
166
166
  @options_file = File.join(@directory, "options.json")
167
167
 
168
168
  if File.exist?(@options_file)
169
- @model_options = JSON.parse(Open.read(@options_file)).merge(@model_options || {})
170
- IndiferentHash.setup(@model_options)
169
+ file_options = JSON.parse(Open.read(@options_file))
170
+ IndiferentHash.setup(file_options)
171
+ @model_options = file_options.deep_merge(@model_options)
171
172
  end
172
173
  end
173
174
 
@@ -254,8 +255,24 @@ cat(paste(label, sep="\\n", collapse="\\n"));
254
255
 
255
256
  def add_list(elements, labels = nil)
256
257
  if @extract_features.nil? || @extract_features.arity == 1
257
- elements.zip(labels || [nil]).each do |elem,label|
258
- add(elem, label)
258
+ case labels
259
+ when nil
260
+ elements.each do |elem|
261
+ add(elem)
262
+ end
263
+ when Array
264
+ elements.zip(labels).each do |elem,label|
265
+ add(elem, label)
266
+ end
267
+ when Hash
268
+ elements.each do |elem|
269
+ label = labels[elem]
270
+ add(elem, label)
271
+ end
272
+ else
273
+ elements.each do |elem|
274
+ add(elem, labels)
275
+ end
259
276
  end
260
277
  else
261
278
  features = self.instance_exec(nil, elements, &@extract_features)
@@ -482,8 +499,11 @@ cat(paste(label, sep="\\n", collapse="\\n"));
482
499
  @labels = orig_labels
483
500
  end unless folds == -1
484
501
 
485
- self.reset_model if self.respond_to? :reset_model
486
- self.train unless folds == 1
502
+ if folds != 1
503
+ self.reset_model if self.respond_to? :reset_model
504
+ self.train
505
+ end
506
+
487
507
  res
488
508
  end
489
509
  end
@@ -27,5 +27,9 @@ def tsv_dataset(filename, *args, **kwargs):
27
27
  def tsv(*args, **kwargs):
28
28
  return tsv_dataset(*args, **kwargs)
29
29
 
30
+ def tsv_loader(*args, **kwargs):
31
+ dataset = tsv(*args, kwargs)
32
+ return torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)
33
+
30
34
  def data_dir():
31
35
  return rbbt.path('var/rbbt_dm/data')
@@ -15,15 +15,15 @@ def load_model(task, checkpoint, **kwargs):
15
15
  return import_module_class(module, class_name).from_pretrained(checkpoint, **kwargs)
16
16
  else:
17
17
  class_name = 'AutoModelFor' + task
18
- return import_module_class('transformers', class_name).from_pretrained(checkpoint)
18
+ return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
19
19
 
20
- def load_tokenizer(task, checkpoint, **kwargs):
20
+ def load_tokenizer(checkpoint, **kwargs):
21
21
  class_name = 'AutoTokenizer'
22
22
  return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
23
23
 
24
24
  def load_model_and_tokenizer(task, checkpoint):
25
25
  model = load_model(task, checkpoint)
26
- tokenizer = load_tokenizer(task, checkpoint)
26
+ tokenizer = load_tokenizer(checkpoint)
27
27
  return model, tokenizer
28
28
 
29
29
  # Not used
@@ -88,6 +88,9 @@ def training_args(*args, **kwargs):
88
88
  def train_model(model, tokenizer, training_args, dataset, class_weights=None, **kwargs):
89
89
  from transformers import Trainer
90
90
 
91
+ # Note: Parameters need to be made contiguous. I'm not sure why they weren't
92
+ for param in model.parameters(): param.data = param.data.contiguous()
93
+
91
94
  if (isinstance(dataset, str)):
92
95
  if (dataset.endswith('.json')):
93
96
  tokenized_dataset = json_dataset(tokenizer, dataset)
data/share/R/MA.R CHANGED
@@ -99,6 +99,8 @@ rbbt.dm.matrix.differential.limma <- function(data, main, contrast=NULL, log2=NU
99
99
  }
100
100
 
101
101
  if (log2){
102
+ full_rows = apply(is.na(data), 1, sum) == 0
103
+ data = data[full_rows,]
102
104
  cutoff <- 1
103
105
  drop <- which(apply(data, 1, max) < cutoff)
104
106
  min = min(data[data != -Inf])
@@ -106,7 +108,8 @@ rbbt.dm.matrix.differential.limma <- function(data, main, contrast=NULL, log2=NU
106
108
  data <- DGEList(data)
107
109
  data <- calcNormFactors(data)
108
110
  data = cpm(data, log=TRUE, prior.count=3)
109
- data <- data[-drop,]
111
+ if (length(drop) > 0)
112
+ data <- data[-drop,]
110
113
  }else{
111
114
  data[data == 0] = NA
112
115
  good.rows = apply(is.na(data),1,sum) != dim(data)[2]
@@ -181,10 +184,11 @@ rbbt.dm.matrix.differential <- function(file, main, contrast = NULL, type = 'lim
181
184
  contrast <- make.names(contrast);
182
185
  }
183
186
 
184
- if (type == 'limma')
187
+ if (is.null(type) || type == 'limma'){
185
188
  result = rbbt.dm.matrix.differential.limma(data, main, contrast, log2, two.channel, eBayes.trend)
186
- else
189
+ }else{
187
190
  result = rbbt.dm.matrix.differential.DESeq(data, main, contrast)
191
+ }
188
192
 
189
193
  if (is.null(outfile)){
190
194
  return(result);
@@ -32,8 +32,6 @@ class TestFDR < Test::Unit::TestCase
32
32
  assert_equal(clean(@r_adj), clean(FDR.adjust_native(@values)))
33
33
  assert_equal(clean(FDR.adjust_fast(@values)), clean(FDR.adjust_native(@values)))
34
34
 
35
- assert_equal(clean(@r_adj), clean(FDR.adjust_fast_self(copy(@values)))) if RUBY_VERSION[0] != "2"
35
+ assert_equal(clean(@r_adj), clean(FDR.adjust_fast_self(copy(@values)))) if RUBY_VERSION[0].to_i < 2
36
36
  end
37
37
  end
38
-
39
-
@@ -4,6 +4,11 @@ require 'test/unit'
4
4
 
5
5
  class TestHypergeometric < Test::Unit::TestCase
6
6
 
7
+ def test_hypergeometric_c
8
+ assert_equal Hypergeometric.hypergeometric_c(2, 1, 1, 1).round(2), 0.5
9
+ assert_equal Hypergeometric.hypergeometric_c(10, 1, 1, 1).round(2), 0.1
10
+ end
11
+
7
12
  def test_hypergeometric
8
13
  assert Hypergeometric.hypergeometric(100, 20, 15, 13) < 0.0005
9
14
  end
@@ -9,12 +9,8 @@ class TestHuggingface < Test::Unit::TestCase
9
9
  task = "SequenceClassification"
10
10
 
11
11
  model = HuggingfaceModel.new task, checkpoint, dir, :class_labels => %w(bad good)
12
- iii model.eval "This is dog"
13
- iii model.eval "This is cat"
14
- iii model.eval_list(["This is dog", "This is cat"])
15
12
 
16
13
  model = VectorModel.new dir
17
- iii model.eval_list(["This is dog", "This is cat"])
18
14
  end
19
15
  end
20
16
 
@@ -42,7 +38,7 @@ class TestHuggingface < Test::Unit::TestCase
42
38
  assert_equal 5, tokenizer.call("This is a sentence that has several words", truncation: true)["input_ids"].__len__
43
39
  end
44
40
 
45
- def test_sst_eval
41
+ def _test_sst_eval
46
42
  TmpFile.with_file do |dir|
47
43
  checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
48
44
 
@@ -55,12 +51,29 @@ class TestHuggingface < Test::Unit::TestCase
55
51
  end
56
52
  end
57
53
 
54
+ def _test_sst_logits
55
+ TmpFile.with_file do |dir|
56
+ checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
57
+
58
+ model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, :tokenizer_args => {:max_length => 16}
59
+
60
+ model.model_options[:class_labels] = ["Bad", "Good"]
61
+ model.model_options[:return_logits] = true
62
+
63
+ logits = model.eval("This is dog")
64
+ assert logits[0] > logits[1]
65
+ logits = model.eval_list(["This is dog", "This is cat"])
66
+ assert logits[0][0] > logits[0][1]
67
+ assert logits[1][0] < logits[1][1]
68
+ end
69
+ end
70
+
58
71
 
59
- def _test_sst_train
72
+ def test_sst_train
60
73
  TmpFile.with_file do |dir|
61
74
  checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
62
75
 
63
- model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, max_length: 128
76
+ model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, tokenizer_args:{max_length: 128}, tokenizer_padding: true, tokenizer_truncation: true
64
77
 
65
78
  model.model_options[:class_labels] = %w(Bad Good)
66
79
 
@@ -148,12 +161,12 @@ class TestHuggingface < Test::Unit::TestCase
148
161
 
149
162
  model = VectorModel.new dir
150
163
 
151
- assert_equal "Good", model.eval_list("This is dog")
164
+ assert_equal ["Good"], model.eval_list(["This is dog"])
152
165
 
153
166
  end
154
167
  end
155
168
 
156
- def _test_sst_stress_test
169
+ def __test_sst_stress_test
157
170
  TmpFile.with_file do |dir|
158
171
  checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
159
172
 
@@ -252,6 +265,7 @@ class RobertaForTokenClassification_NER(RobertaPreTrainedModel):
252
265
  EOF
253
266
 
254
267
  RbbtPython.add_path dir
268
+ RbbtPython.process_paths
255
269
 
256
270
  biomedical_roberta = "PlanTL-GOB-ES/bsc-bio-ehr-es-cantemist"
257
271
  model = HuggingfaceModel.new "mypkg.mymodel:RobertaForTokenClassification_NER", biomedical_roberta
@@ -88,9 +88,13 @@ class TestPytorchLightningModel(pl.LightningModule):
88
88
  res = model.eval_list([[10.0], [11.2], [14.3]])
89
89
  assert_equal 3, RbbtPython.numpy2ruby(res).length
90
90
 
91
+ orig_res = res
91
92
  model = VectorModel.new dir
92
93
  model.init
93
-
94
+ res = model.eval([10.0])
95
+ res = model.eval_list([[10.0], [11.2], [14.3]])
96
+ assert_equal 3, RbbtPython.numpy2ruby(res).length
97
+ assert_equal orig_res, res
94
98
  end
95
99
  end
96
100
  end
@@ -26,7 +26,7 @@ class TestTorch < Test::Unit::TestCase
26
26
  model.add 5.0, [10.0]
27
27
  model.add 10.0, [20.0]
28
28
 
29
- model.training_args[:epochs] = 1000
29
+ model.model_options[:training_args][:epochs] = 1000
30
30
  model.train
31
31
 
32
32
  w = model.get_weights.to_ruby.first.first
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-dm
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-12-21 00:00:00.000000000 Z
11
+ date: 2025-01-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -174,7 +174,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
174
174
  - !ruby/object:Gem::Version
175
175
  version: '0'
176
176
  requirements: []
177
- rubygems_version: 3.5.0.dev
177
+ rubygems_version: 3.5.23
178
178
  signing_key:
179
179
  specification_version: 4
180
180
  summary: Data-mining and statistics