rbbt-dm 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f9b8071884e4e9d7a8c04f175fe262aad9e2b77911dca787a957a5c5f797fb9b
-  data.tar.gz: 1c7334d62036d3ae07b7f625b310f401b5078022f909be34cd78bb66c5b2af06
+  metadata.gz: 904f77b8390128686b8cf153e517aff21394bd43548b8116e0d28188924a833e
+  data.tar.gz: d05f1712851cb5c552cfedac2166abb45d508cad6abbf493b41f5becde0e570c
 SHA512:
-  metadata.gz: 22c73d01543e93a2a7b10ecaa88db9a663b35c8264b6d0e5e9d4b00096f34955250105dec4787242529c594c1a959feb23a4b5cd46298850eee7a813dc551d0f
-  data.tar.gz: 545663b2ee93dd0e6e6b54e353cb3bfafab9001c7031b42e7f895fb95ea85ffb6c1dcdb54bb671ee5cace49561cca018212e25ee43592b457e4e1abe83277076
+  metadata.gz: b762389ed54ce7a91da87258f2ee856d04b4c1fef73894ac6c0e6219423967bfb42b89430f952b62d609a38c1acd2935ca4d424c35215fcc987e8af24f1fde3d
+  data.tar.gz: d3248d9996ff5f1298203d7d69595cb8f0a0dd037a4a666f0631d106b785f3401f56e5bc0ef2883e32766d1792ece1e8807a0c6d3bd7e5ecd174fca8fc698dc3
@@ -64,7 +64,7 @@ data = rbbt.dm.matrix.differential(#{ R.ruby2R data_file },
 )
 EOS
 
-    R.run(cmd, :monitor => true)
+    R.run(cmd, :monitor => true)
   end
 end
 end
data/lib/rbbt/matrix.rb CHANGED
@@ -12,6 +12,7 @@ class RbbtMatrix
 
   attr_accessor :data_file, :labels, :value_type, :format, :organism, :identifiers
   def initialize(data_file, labels = nil, value_type = nil, format = nil, organism=nil, identifiers=nil)
+    data_file = data_file.find if Path === data_file
     @data_file = data_file
     @labels = labels
     @value_type = value_type || 'count'
@@ -42,7 +43,7 @@ class RbbtMatrix
   end
 
   def samples
-    @samples ||= TSV.parse_header(@data_file).fields
+    @samples ||= TSV.parse_header(@data_file)[:fields]
   end
 
   def subsets=(subsets)
@@ -181,9 +182,14 @@ class RbbtMatrix
 
     identifiers = [identifiers, @identifiers, data.identifiers, Organism.identifiers(organism)].flatten.compact.uniq
 
-    data.change_key("Ensembl Gene ID", :identifiers => identifiers.reverse) do |v|
+    new_data = data.change_key("Ensembl Gene ID", :identifiers => identifiers.reverse) do |v|
       Misc.mean(v.compact)
     end
+
+    new_data.delete ""
+    new_data.delete nil
+
+    new_data
   end
   subsets = self.subsets
   matrix = RbbtMatrix.new file, labels, value_type, "Ensembl Gene ID", organism
@@ -202,9 +208,14 @@ class RbbtMatrix
 
     identifiers = [identifiers, @identifiers, data.identifiers, Organism.identifiers(organism)].flatten.compact.uniq
 
-    data.change_key("Associated Gene Name", :identifiers => identifiers.reverse) do |v|
+    new_data = data.change_key("Associated Gene Name", :identifiers => identifiers.reverse) do |v|
      Misc.mean(v.compact)
     end
+
+    new_data.delete ""
+    new_data.delete nil
+
+    new_data
   end
   subsets = self.subsets
   matrix = RbbtMatrix.new file, labels, value_type, "Associated Gene Name", organism
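
Context for the change_key edits above: change_key returns a fresh TSV rather than mutating the receiver, and source keys that fail to translate collapse into empty buckets, which would otherwise survive as a bogus "" gene. A minimal sketch with hypothetical probe data (TSV.setup, change_key and Misc.mean are the rbbt-util helpers already used here):

    data = TSV.setup({ "probeA" => ["1"], "probeB" => ["3"], "probeX" => ["5"] },
                     :key_field => "Probe ID", :fields => ["value"], :type => :list)
    # probeA and probeB translate to the same gene and are averaged by the block;
    # probeX translates to nothing and lands under "", which is now deleted.
    new_data = data.change_key("Ensembl Gene ID", :identifiers => identifiers) do |values|
      Misc.mean(values.compact)
    end
    new_data.delete ""
    new_data.delete nil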
data/lib/rbbt/stan.rb CHANGED
@@ -1,5 +1,4 @@
 require 'rbbt/util/R'
-require 'mkfifo'
 
 module STAN
 
@@ -88,7 +87,7 @@ data{
   end
 
   def self.exec(data, model, input_directory, parameter_chains, sample_file, debug = FALSE, stan_options = {})
-    stan_options = Misc.add_defaults stan_options, :iter => 1000, :warmup => 500, :chains => 1, :seed => 2887, :refresh => 1200
+    stan_options = IndiferentHash.add_defaults stan_options, :iter => 1000, :warmup => 500, :chains => 1, :seed => 2887, :refresh => 1200
 
     data = {} if data.nil?
 
@@ -123,7 +122,7 @@ print(fit)
 
   def self.stream_chain(data, model, directory = nil, options = {})
     options, directory = directory, nil if Hash === directory
-    debug = Misc.process_options options, :debug
+    debug = IndiferentHash.process_options options, :debug
 
     if directory.nil?
       directory = TmpFile.tmp_file
@@ -178,7 +177,7 @@ print(fit)
   end
 
   def self.run(data, model, directory, options = {})
-    debug = Misc.process_options options, :debug
+    debug = IndiferentHash.process_options options, :debug
 
     input_directory = File.join(directory, 'inputs')
 
@@ -172,7 +172,8 @@ module FDR
     values << p[1]
   }
 
-  if RUBY_VERSION[0] == "2"
+  iii RUBY_VERSION[0]
+  if RUBY_VERSION[0] == "2" || RUBY_VERSION[0] == "3"
     new_values = FDR.adjust(values)
     keys.zip(new_values).each do |k,v|
       vs = data[k]
@@ -195,8 +196,4 @@ module FDR
     data.unnamed = unnamed if unnamed
   end
 end
-
 end
-
-
-
@@ -64,10 +64,10 @@ double lBinom(double n, double k)
 builder.c_singleton <<-EOC
 /**
 * * Compute the Hypergeometric accumulated value.
-* * @param total => total size
-* * @param support => total support
-* * @param list => selected list size
-* * @param found => support
+* * @param total => Balls in urn
+* * @param support => Positive balls in urn
+* * @param list => Drawn balls
+* * @param found => Positive drawn balls
 * * @return The result
 * */
 //pvalues[annotation] = Hypergeometric.hypergeometric(tsv_size, counts[annotation], total, count)
@@ -102,10 +102,13 @@ double hypergeometric_c(double total, double support, double list, double found)
   EOC
 end
 
-  def self.hypergeometric(count, positive, negative, total)
-    #RSRuby.instance.phyper(count - 1, positive, negative, total, false).to_f
+  def self.hypergeometric_R(count, positive, negative, total)
     R.eval("phyper(#{ count } - 1, #{ positive }, #{ negative }, #{ total }, lower.tail=FALSE)").to_f
   end
+
+  def self.hypergeometric(count, positive, negative, total)
+    hypergeometric_c(positive + negative, positive, total, count)
+  end
 end
 
 module TSV
@@ -260,7 +263,8 @@ module TSV
       elems = elems.collect{|elem| rename.include?(elem)? rename[elem] : elem }.compact.uniq if rename
       count = elems.length
       next if count < options[:min_support] or not counts.include? annotation
-      pvalues[annotation] = Hypergeometric.hypergeometric(count, counts[annotation], tsv_size - counts[annotation], total)
+      #pvalues[annotation] = Hypergeometric.hypergeometric(count, counts[annotation], tsv_size - counts[annotation], total)
+      pvalues[annotation] = Hypergeometric.hypergeometric_c(tsv_size, counts[annotation], total, count)
     end
 
     pvalues = FDR.adjust_hash! pvalues if options[:fdr]
@@ -268,7 +272,7 @@ module TSV
     pvalues.delete_if{|k, pvalue| pvalue > options[:cutoff] } if options[:cutoff]
 
     if add_keys
-      tsv = TSV.setup(pvalues.keys.collect{|k| k.dup}, :key_field => fields, :fields => [], :type => :double)
+      tsv = TSV.setup(pvalues.keys.collect{|k| k.dup }, :key_field => fields, :fields => [], :type => :double)
 
       tsv.add_field 'p-value' do |annot, values|
        [pvalues[annot]]
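
In the urn notation adopted by the revised doc comment, hypergeometric_c takes (total, support, list, found): balls in the urn, positive balls, balls drawn, and positives found, returning the accumulated upper-tail probability. The R-backed version survives as hypergeometric_R, while the Ruby-facing wrapper maps its old signature onto the C implementation. A sketch of both call styles, using values asserted in the test suite further down:

    # Urn of 2 balls, 1 positive; draw 1 and find 1 positive: p = 0.5
    Hypergeometric.hypergeometric_c(2, 1, 1, 1)    # => 0.5
    Hypergeometric.hypergeometric_c(10, 1, 1, 1)   # => 0.1

    # Old-style signature, now delegating to the C implementation:
    # hypergeometric(count, positive, negative, total) ==
    #   hypergeometric_c(positive + negative, positive, total, count)
    Hypergeometric.hypergeometric(1, 1, 1, 1)      # => 0.5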
@@ -2,23 +2,39 @@ require 'rbbt/vector/model/torch'
 
 class HuggingfaceModel < TorchModel
 
+  attr_accessor :tokenizer
+  def init
+    @model, @tokenizer = self.instance_exec(&@init_model) if @model.nil?
+    [@model, @tokenizer]
+  end
+
+  def tokenizer
+    init
+    @tokenizer
+  end
+
   def initialize(task, checkpoint, dir = nil, model_options = {})
     super(dir, nil, model_options)
 
     checkpoint = checkpoint.find if Path === checkpoint
 
-    @model_options = Misc.add_defaults @model_options, :task => task, :checkpoint => checkpoint
+    @model_options[:tokenizer_options] = @model_options.delete(:tokenizer_args) if @model_options.include?(:tokenizer_args)
+    tokenizer_args = IndiferentHash.pull_keys @model_options, :tokenizer
+    @model_options[:tokenizer_args] = tokenizer_args
+
+    @model_options[:task] = task if task
+    @model_options[:checkpoint] = checkpoint if checkpoint
 
     init_model do
       checkpoint = @model_path && File.directory?(@model_path) ? @model_path : @model_options[:checkpoint]
 
       model = RbbtPython.call_method("rbbt_dm.huggingface", :load_model,
-        @model_options[:task], checkpoint, **(IndiferentHash.setup(model_options[:model_args]) || {}))
+        @model_options[:task], checkpoint, **(IndiferentHash.setup(@model_options.except(:training_args, :tokenizer_args, :task, :checkpoint, :class_labels))))
 
-      tokenizer_checkpoint = @model_options[:tokenizer_checkpoint] || checkpoint
+      tokenizer_checkpoint = @model_options[:tokenizer_args][:checkpoint] || checkpoint
 
       tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_tokenizer,
-        @model_options[:task], tokenizer_checkpoint, **(IndiferentHash.setup(model_options[:tokenizer_args]) || {}))
+        tokenizer_checkpoint, **(IndiferentHash.setup(@model_options[:tokenizer_args])))
 
       [model, tokenizer]
     end
@@ -46,7 +62,7 @@ class HuggingfaceModel < TorchModel
     end
 
     dataset_file = TorchModel.text_dataset(tsv_file, texts)
-    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])
+    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, {})
 
     begin
       RbbtPython.call_method("rbbt_dm.huggingface", :predict_model, model, tokenizer, training_args_obj, dataset_file, locate_tokens)
@@ -71,7 +87,7 @@ class HuggingfaceModel < TorchModel
       checkpoint_dir = File.join(tmpdir, 'checkpoints')
     end
 
-    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])
+    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, training_args)
     dataset_file = HuggingfaceModel.text_dataset(tsv_file, texts, labels, @model_options[:class_labels])
 
     RbbtPython.call_method("rbbt_dm.huggingface", :train_model, model, tokenizer, training_args_obj, dataset_file, @model_options[:class_weights])
@@ -96,43 +112,47 @@ class HuggingfaceModel < TorchModel
       predictions = result["logits"]
     end
 
-    task, class_labels, locate_tokens = @model_options.values_at :task, :class_labels, :locate_tokens
-    result = case task
-             when "SequenceClassification"
-               RbbtPython.collect(predictions) do |logits|
-                 logits = RbbtPython.numpy2ruby logits
-                 best_class = logits.index logits.max
-                 best_class = class_labels[best_class] if class_labels
-                 best_class
-               end
-             when "MaskedLM"
-               all_token_positions = token_positions.to_a
-
-               i = 0
-               RbbtPython.collect(predictions) do |item_logits|
-                 item_token_positions = all_token_positions[i]
-                 i += 1
-
-                 item_logits = RbbtPython.numpy2ruby(item_logits)
-                 item_masks = item_token_positions.collect do |token_positions|
-
-                   best = item_logits.values_at(*token_positions).collect do |logits|
-                     best_token, best_score = nil
-                     logits.each_with_index do |v,i|
-                       if best_score.nil? || v > best_score
-                         best_token, best_score = i, v
-                       end
-                     end
-                     best_token
-                   end
-
-                   best.collect{|b| tokenizer.decode(b) } * "|"
-                 end
-                 Array === locate_tokens ? item_masks : item_masks.first
-               end
-             else
-               predictions
-             end
+    if @model_options[:return_logits]
+      result = RbbtPython.numpy2ruby(predictions)
+    else
+      task, class_labels, locate_tokens = @model_options.values_at :task, :class_labels, :locate_tokens
+      result = case task
+               when "SequenceClassification"
+                 RbbtPython.collect(predictions) do |logits|
+                   logits = RbbtPython.numpy2ruby logits
+                   best_class = logits.index logits.max
+                   best_class = class_labels[best_class] if class_labels
+                   best_class
+                 end
+               when "MaskedLM"
+                 all_token_positions = token_positions.to_a
+
+                 i = 0
+                 RbbtPython.collect(predictions) do |item_logits|
+                   item_token_positions = all_token_positions[i]
+                   i += 1
+
+                   item_logits = RbbtPython.numpy2ruby(item_logits)
+                   item_masks = item_token_positions.collect do |token_positions|
+
+                     best = item_logits.values_at(*token_positions).collect do |logits|
+                       best_token, best_score = nil
+                       logits.each_with_index do |v,i|
+                         if best_score.nil? || v > best_score
+                           best_token, best_score = i, v
+                         end
+                       end
+                       best_token
+                     end
+
+                     best.collect{|b| tokenizer.decode(b) } * "|"
+                   end
+                   Array === locate_tokens ? item_masks : item_masks.first
+                 end
+               else
+                 predictions
+               end
+    end
 
     (! is_list || single) && Array === result ? result.first : result
   end
@@ -144,6 +164,7 @@ class HuggingfaceModel < TorchModel
   def reset_model
     @model, @tokenizer = nil
     Open.rm_rf @model_path
+    Open.rm_rf TorchModel.model_architecture(model_path)
     init
   end
 end
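
Judging from the IndiferentHash.pull_keys call in the new constructor, tokenizer options can be given either as an explicit :tokenizer_args hash or as top-level keys carrying a tokenizer_ prefix, while the remaining top-level options (minus task, checkpoint, training args and class labels) are forwarded to the HuggingFace from_pretrained call. A sketch of the resulting styles (illustrative; the prefixed form mirrors the updated tests below):

    # Explicit hash, as before:
    model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir,
      :tokenizer_args => { :max_length => 16 }

    # Prefixed keys, gathered under model_options[:tokenizer_args]:
    model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir,
      tokenizer_max_length: 128, tokenizer_padding: true, tokenizer_truncation: true

    # Raw logits instead of decoded classes (the new :return_logits branch in eval):
    model.model_options[:return_logits] = true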
@@ -2,7 +2,7 @@ require 'rbbt/vector/model'
 require 'rbbt/util/python'
 
 RbbtPython.add_path Rbbt.python.find(:lib)
-RbbtPython.init_rbbt
+#RbbtPython.init_rbbt
 
 class PythonModel < VectorModel
   attr_accessor :python_class, :python_module
@@ -7,25 +7,54 @@ class PytorchLightningModel < TorchModel
 
   train_model do |features,labels|
     model = init
-    loader = self.loader
+    train_loader = self.loader
     val_loader = self.val_loader
-    if (features && features.any?) && loader.nil?
-      TmpFile.with_file do |tsv_dataset_file|
-        TorchModel.feature_dataset(tsv_dataset_file, features, labels)
-        RbbtPython.pyimport :rbbt_dm
-        loader = RbbtPython.rbbt_dm.tsv(tsv_dataset_file)
+    if train_loader.nil?
+      batch_size ||= model_options[:training_args][:batch_size]
+      batch_size ||= model_options[:batch_size]
+      batch_size ||= 1
+
+      shuffle = model_options[:training_args][:shuffle]
+      shuffle = true if shuffle.nil?
+
+      num_workers = Rbbt.config(:num_workers, :dataloader, :default => 2)
+      train_loader = RbbtPython.run :torch do
+        dataset = features.zip(labels).collect{|f,l| [torch.tensor(f), l] }
+        torch.utils.data.DataLoader.call(dataset, batch_size: batch_size, shuffle: shuffle, num_workers: num_workers.to_i)
       end
     end
-    trainer.fit(model, loader, val_loader)
+    trainer.fit(model, train_loader, val_loader)
     TorchModel.save_architecture(model, model_path) if @directory
     TorchModel.save_state(model, model_path) if @directory
   end
+
+  eval_model do |features,list=false|
+    model = init
+    eval_loader = self.loader
+    if list
+      if eval_loader.nil?
+        batch_size ||= model_options[:batch_size]
+        batch_size ||= model_options[:training_args][:batch_size]
+        batch_size ||= 1
+
+        num_workers = Rbbt.config(:num_workers, :dataloader, :default => 2)
+        eval_loader = RbbtPython.run :torch do
+          dataset = torch.tensor(features)
+          torch.utils.data.DataLoader.call(dataset, batch_size: batch_size, num_workers: num_workers.to_i)
+        end
+      end
+      trainer.predict(model, eval_loader).inject([]){|acc,res| acc.concat RbbtPython.numpy2ruby(res[1])}
+    else
+      model.call(torch.tensor(features))
+    end
+  end
   end
 
   def trainer
     @trainer ||= begin
-      options = @model_options[:training_args] || @model_options[:trainer_args]
-      RbbtPython.class_new_obj("pytorch_lightning", "Trainer", options || {})
+      trainer_args = {default_root_dir: File.join(@directory, 'checkpoints')}.
+        merge(model_options[:training_args].except(:batch_size))
+      RbbtPython.class_new_obj("pytorch_lightning", "Trainer", trainer_args)
     end
   end
 end
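
Both blocks above fall back to assembling a torch DataLoader when none is injected, so batching is now driven entirely by model options. A sketch of the knobs the new code reads (names as used in the hunk; illustrative, not an API guarantee):

    model.model_options[:training_args][:batch_size] = 8   # then :batch_size, then 1
    model.model_options[:training_args][:shuffle] = false  # training only; defaults to true
    # DataLoader worker count comes from the rbbt config system:
    #   Rbbt.config(:num_workers, :dataloader, :default => 2)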
@@ -42,13 +42,12 @@ class TorchModel
   end
 
   def self.text_dataset(tsv_dataset_file, elements, labels = nil, class_labels = nil)
-    elements = elements.collect{|e| e.gsub("\n", ' ') }
+    elements = elements.compact.collect{|e| e.gsub("\n", ' ').gsub('"', '\'') }
     tsv = feature_tsv(elements, labels, class_labels)
+    tsv.fields[0] = "text"
     if labels.nil?
-      tsv.fields[0] = "text"
-      tsv.type = :single
+      tsv = tsv.to_single
     else
-      tsv.fields[0] = "text"
       tsv.type = :list
     end
     Open.write(tsv_dataset_file, tsv.to_s)
@@ -3,9 +3,27 @@ class TorchModel
   def to_ruby
     RbbtPython.numpy2ruby(self)
   end
+
+  def to_ruby!
+    r = self.to_ruby
+    self.del
+    r
+  end
+
+  def length
+    PyCall.len(self)
+  end
+
   def self.setup(obj)
     obj.extend Tensor
   end
+
+  def del
+    self.detach
+    self.grad = nil
+    self.storage.resize_ 0
+    self.to("cpu")
+  end
 end
 
 def self.init_python
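
The new helpers exist to release tensor memory eagerly: del detaches the tensor, clears its gradient, shrinks its storage to zero and moves it to the CPU, while to_ruby! converts to Ruby structures and then frees the source. A sketch of the intended pattern, which the chunked eval loop later in this diff follows (device and dtype assumed set up as elsewhere in this file):

    tensor = TorchModel.tensor(batch, device, dtype)  # now arrives Tensor-extended
    values = tensor.to_ruby!                          # nested Ruby arrays; storage released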
@@ -46,7 +64,7 @@ class TorchModel
   end
 
   def self.tensor(obj, device, dtype)
-    RbbtPython.torch.tensor(obj, dtype: dtype, device: device)
+    TorchModel::Tensor.setup(RbbtPython.torch.tensor(obj, dtype: dtype, device: device))
   end
 
 end
@@ -13,19 +13,20 @@ class TorchModel
   end
   def get_weights(...); TorchModel.get_weights(model, ...); end
 
-  def self.freeze(layer)
+  def self.freeze(layer, requires_grad=false)
     begin
-      PyCall.getattr(layer, :weight).requires_grad = false
+      PyCall.getattr(layer, :weight).requires_grad = requires_grad
     rescue
     end
     RbbtPython.iterate(layer.children) do |layer|
-      freeze(layer)
+      freeze(layer, requires_grad)
     end
   end
-  def self.freeze_layer(model, layer)
+
+  def self.freeze_layer(model, layer, requires_grad = false)
     layer = get_layer(model, layer)
-    freeze(layer)
+    freeze(layer, requires_grad)
   end
-  def freeze_layer(...); TorchModel.freeze_layer(model, ...); end
 
+  def freeze_layer(...); TorchModel.freeze_layer(model, ...); end
 end
@@ -27,4 +27,10 @@ class TorchModel
     Log.debug "Loading model architecture from #{model_architecture}"
     RbbtPython.torch.load(model_architecture)
   end
+
+  def reset_model
+    @trainer = @model = nil
+    Open.rm_rf model_path
+    Open.rm_rf TorchModel.model_architecture(model_path)
+  end
 end
@@ -2,47 +2,37 @@ require_relative 'python'
 
 class TorchModel < PythonModel
 
-  attr_accessor :model, :criterion, :optimizer, :training_args
+  attr_accessor :criterion, :optimizer
 
   def initialize(...)
     TorchModel.init_python
     super(...)
-    @training_args = model_options[:training_args] || {}
 
+    @model_options[:training_options] = @model_options.delete(:training_args) if @model_options.include?(:training_args)
+    training_args = IndiferentHash.pull_keys(@model_options, :training) || {}
+    @model_options[:training_args] = training_args
     init_model do
       model = TorchModel.load_architecture(model_path)
       if model.nil?
         RbbtPython.add_path @directory
-        RbbtPython.class_new_obj(@python_module, @python_class, **model_options)
+        RbbtPython.process_paths
+        RbbtPython.class_new_obj(@python_module, @python_class, **model_options.except(:training_args, :batch_size))
       else
         TorchModel.load_state(model, model_path)
       end
     end
 
-    eval_model do |features,list=false|
-      init
-      @device ||= TorchModel.device(model_options)
-      @dtype ||= TorchModel.dtype(model_options)
-      model.to(@device)
-
-      tensor = list ? TorchModel.tensor(features, @device, @dtype) : TorchModel.tensor([features], @device, @dtype)
-
-      loss, res = model.call(tensor)
-
-      res = loss if res.nil?
-
-      res = TorchModel::Tensor.setup(list ? res : res[0])
-
-      res
-    end
-
     train_model do |features,labels|
       init
       @device ||= TorchModel.device(model_options)
       @dtype ||= TorchModel.dtype(model_options)
       model.to(@device)
-      @optimizer ||= TorchModel.optimizer(model, training_args)
-      epochs = training_args[:epochs] || 3
+      @optimizer ||= TorchModel.optimizer(model, model_options[:training_args] || {})
+
+      epochs = model_options[:training_args][:epochs] || 3
+      batch_size = model_options[:batch_size]
+      batch_size ||= model_options[:training_args][:batch_size]
+      batch_size ||= 1
 
       inputs = TorchModel.tensor(features, @device, @dtype)
       #target = TorchModel.tensor(labels.collect{|v| [v] }, @device, @dtype)
@@ -63,6 +53,39 @@ class TorchModel < PythonModel
       TorchModel.save_architecture(model, model_path) if @directory
       TorchModel.save_state(model, model_path) if @directory
     end
+
+    eval_model do |features,list=false|
+      init
+      @device ||= TorchModel.device(model_options)
+      @dtype ||= TorchModel.dtype(model_options)
+      model.to(@device)
+      model.eval
+
+      features = [features] unless list
+
+      batch_size = model_options[:batch_size]
+      batch_size ||= model_options[:training_args][:batch_size]
+      batch_size ||= 1
+
+      res = Misc.chunk(features, batch_size).inject(nil) do |acc,batch|
+        tensor = TorchModel.tensor(batch, @device, @dtype)
+
+        loss, chunk_res = model.call(tensor)
+        tensor.del
+
+        chunk_res = loss if chunk_res.nil?
+
+        TorchModel::Tensor.setup(chunk_res)
+        acc = acc.nil? ? chunk_res.to_ruby! : acc + chunk_res.to_ruby!
+
+        acc
+      end
+
+      res = TorchModel::Tensor.setup(list ? res : res[0])
+
+      res
+    end
+
   end
 end
 require_relative 'torch/helpers'
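
With this restructuring TorchModel drops its separate training_args accessor: options prefixed training_ are pulled under model_options[:training_args] at construction, and both train and the new chunked eval read batch_size from the options. The TestTorch change at the bottom of this diff shows the new access path; a short sketch:

    model.model_options[:training_args][:epochs] = 1000  # was model.training_args[:epochs]
    model.model_options[:batch_size] = 8                 # consulted by train and eval alike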
@@ -3,7 +3,7 @@ require 'rbbt/vector/model/util'
 require 'rbbt/util/python'
 
 RbbtPython.add_path Rbbt.python.find(:lib)
-RbbtPython.init_rbbt
+#RbbtPython.init_rbbt
 
 class VectorModel
   attr_accessor :directory, :model_path, :extract_features, :init_model, :train_model, :eval_model, :post_process, :balance
@@ -166,8 +166,9 @@ cat(paste(label, sep="\\n", collapse="\\n"));
     @options_file = File.join(@directory, "options.json")
 
     if File.exist?(@options_file)
-      @model_options = JSON.parse(Open.read(@options_file)).merge(@model_options || {})
-      IndiferentHash.setup(@model_options)
+      file_options = JSON.parse(Open.read(@options_file))
+      IndiferentHash.setup(file_options)
+      @model_options = file_options.deep_merge(@model_options)
     end
   end
 
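
Persisted options are now folded in with deep_merge, so runtime options override the options file at nested keys too, instead of replacing whole nested hashes. A sketch of the difference, assuming the usual deep_merge semantics where the argument wins recursively:

    file_options = { "training_args" => { "epochs" => 3, "batch_size" => 2 } }
    runtime      = { "training_args" => { "epochs" => 10 } }

    file_options.merge(runtime)       # batch_size lost: nested hash replaced wholesale
    file_options.deep_merge(runtime)  # => {"training_args"=>{"epochs"=>10, "batch_size"=>2}}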
@@ -254,8 +255,24 @@ cat(paste(label, sep="\\n", collapse="\\n"));
 
   def add_list(elements, labels = nil)
     if @extract_features.nil? || @extract_features.arity == 1
-      elements.zip(labels || [nil]).each do |elem,label|
-        add(elem, label)
+      case labels
+      when nil
+        elements.each do |elem|
+          add(elem)
+        end
+      when Array
+        elements.zip(labels).each do |elem,label|
+          add(elem, label)
+        end
+      when Hash
+        elements.each do |elem|
+          label = labels[elem]
+          add(elem, label)
+        end
+      else
+        elements.each do |elem|
+          add(elem, labels)
+        end
       end
     else
       features = self.instance_exec(nil, elements, &@extract_features)
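
add_list previously assumed positional Array labels; it now dispatches on the label container, adding Hash lookup and scalar broadcast. The accepted shapes, sketched:

    model.add_list %w(one two)                                    # no labels
    model.add_list %w(one two), %w(good bad)                      # Array: paired by position
    model.add_list %w(one two), "one" => "good", "two" => "bad"   # Hash: looked up per element
    model.add_list %w(one two), "good"                            # anything else: applied to all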
@@ -482,8 +499,11 @@ cat(paste(label, sep="\\n", collapse="\\n"));
     @labels = orig_labels
   end unless folds == -1
 
-    self.reset_model if self.respond_to? :reset_model
-    self.train unless folds == 1
+    if folds != 1
+      self.reset_model if self.respond_to? :reset_model
+      self.train
+    end
+
     res
   end
 end
@@ -27,5 +27,9 @@ def tsv_dataset(filename, *args, **kwargs):
 def tsv(*args, **kwargs):
     return tsv_dataset(*args, **kwargs)
 
+def tsv_loader(*args, **kwargs):
+    dataset = tsv(*args, kwargs)
+    return torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)
+
 def data_dir():
     return rbbt.path('var/rbbt_dm/data')
@@ -15,15 +15,15 @@ def load_model(task, checkpoint, **kwargs):
         return import_module_class(module, class_name).from_pretrained(checkpoint, **kwargs)
     else:
         class_name = 'AutoModelFor' + task
-        return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+        return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
 
-def load_tokenizer(task, checkpoint, **kwargs):
+def load_tokenizer(checkpoint, **kwargs):
     class_name = 'AutoTokenizer'
     return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)
 
 def load_model_and_tokenizer(task, checkpoint):
     model = load_model(task, checkpoint)
-    tokenizer = load_tokenizer(task, checkpoint)
+    tokenizer = load_tokenizer(checkpoint)
     return model, tokenizer
 
 # Not used
@@ -88,6 +88,9 @@ def training_args(*args, **kwargs):
 def train_model(model, tokenizer, training_args, dataset, class_weights=None, **kwargs):
     from transformers import Trainer
 
+    # Note: Parameters need to be made contiguous. I'm not sure why they weren't
+    for param in model.parameters(): param.data = param.data.contiguous()
+
     if (isinstance(dataset, str)):
         if (dataset.endswith('.json')):
             tokenized_dataset = json_dataset(tokenizer, dataset)
data/share/R/MA.R CHANGED
@@ -99,6 +99,8 @@ rbbt.dm.matrix.differential.limma <- function(data, main, contrast=NULL, log2=NU
 }
 
 if (log2){
+    full_rows = apply(is.na(data), 1, sum) == 0
+    data = data[full_rows,]
     cutoff <- 1
     drop <- which(apply(data, 1, max) < cutoff)
     min = min(data[data != -Inf])
@@ -106,7 +108,8 @@ rbbt.dm.matrix.differential.limma <- function(data, main, contrast=NULL, log2=NU
     data <- DGEList(data)
     data <- calcNormFactors(data)
     data = cpm(data, log=TRUE, prior.count=3)
-    data <- data[-drop,]
+    if (length(drop) > 0)
+        data <- data[-drop,]
 }else{
     data[data == 0] = NA
     good.rows = apply(is.na(data),1,sum) != dim(data)[2]
@@ -181,10 +184,11 @@ rbbt.dm.matrix.differential <- function(file, main, contrast = NULL, type = 'lim
         contrast <- make.names(contrast);
     }
 
-    if (type == 'limma')
+    if (is.null(type) || type == 'limma'){
         result = rbbt.dm.matrix.differential.limma(data, main, contrast, log2, two.channel, eBayes.trend)
-    else
+    }else{
         result = rbbt.dm.matrix.differential.DESeq(data, main, contrast)
+    }
 
     if (is.null(outfile)){
         return(result);
@@ -32,8 +32,6 @@ class TestFDR < Test::Unit::TestCase
     assert_equal(clean(@r_adj), clean(FDR.adjust_native(@values)))
     assert_equal(clean(FDR.adjust_fast(@values)), clean(FDR.adjust_native(@values)))
 
-    assert_equal(clean(@r_adj), clean(FDR.adjust_fast_self(copy(@values)))) if RUBY_VERSION[0] != "2"
+    assert_equal(clean(@r_adj), clean(FDR.adjust_fast_self(copy(@values)))) if RUBY_VERSION[0].to_i < 2
   end
 end
-
-
@@ -4,6 +4,11 @@ require 'test/unit'
 
 class TestHypergeometric < Test::Unit::TestCase
 
+  def test_hypergeometric_c
+    assert_equal Hypergeometric.hypergeometric_c(2, 1, 1, 1).round(2), 0.5
+    assert_equal Hypergeometric.hypergeometric_c(10, 1, 1, 1).round(2), 0.1
+  end
+
   def test_hypergeometric
     assert Hypergeometric.hypergeometric(100, 20, 15, 13) < 0.0005
   end
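
The expected values in the new test_hypergeometric_c follow from the upper-tail hypergeometric probability with a single draw: for an urn of N balls with K positives and one ball drawn,

    P(X \ge 1) = \frac{\binom{K}{1}\binom{N-K}{0}}{\binom{N}{1}} = \frac{K}{N}

so (N, K) = (2, 1) gives 0.5 and (10, 1) gives 0.1, matching the assertions.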
@@ -9,12 +9,8 @@ class TestHuggingface < Test::Unit::TestCase
       task = "SequenceClassification"
 
       model = HuggingfaceModel.new task, checkpoint, dir, :class_labels => %w(bad good)
-      iii model.eval "This is dog"
-      iii model.eval "This is cat"
-      iii model.eval_list(["This is dog", "This is cat"])
 
       model = VectorModel.new dir
-      iii model.eval_list(["This is dog", "This is cat"])
     end
   end
 
@@ -42,7 +38,7 @@ class TestHuggingface < Test::Unit::TestCase
     assert_equal 5, tokenizer.call("This is a sentence that has several words", truncation: true)["input_ids"].__len__
   end
 
-  def test_sst_eval
+  def _test_sst_eval
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
 
@@ -55,12 +51,29 @@ class TestHuggingface < Test::Unit::TestCase
     end
   end
 
+  def _test_sst_logits
+    TmpFile.with_file do |dir|
+      checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+
+      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, :tokenizer_args => {:max_length => 16}
+
+      model.model_options[:class_labels] = ["Bad", "Good"]
+      model.model_options[:return_logits] = true
+
+      logits = model.eval("This is dog")
+      assert logits[0] > logits[1]
+      logits = model.eval_list(["This is dog", "This is cat"])
+      assert logits[0][0] > logits[0][1]
+      assert logits[1][0] < logits[1][1]
+    end
+  end
+
 
-  def _test_sst_train
+  def test_sst_train
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
 
-      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, max_length: 128
+      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, tokenizer_args:{max_length: 128}, tokenizer_padding: true, tokenizer_truncation: true
 
       model.model_options[:class_labels] = %w(Bad Good)
 
@@ -148,12 +161,12 @@ class TestHuggingface < Test::Unit::TestCase
 
       model = VectorModel.new dir
 
-      assert_equal "Good", model.eval_list("This is dog")
+      assert_equal ["Good"], model.eval_list(["This is dog"])
 
     end
   end
 
-  def _test_sst_stress_test
+  def __test_sst_stress_test
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
 
@@ -252,6 +265,7 @@ class RobertaForTokenClassification_NER(RobertaPreTrainedModel):
       EOF
 
       RbbtPython.add_path dir
+      RbbtPython.process_paths
 
       biomedical_roberta = "PlanTL-GOB-ES/bsc-bio-ehr-es-cantemist"
       model = HuggingfaceModel.new "mypkg.mymodel:RobertaForTokenClassification_NER", biomedical_roberta
@@ -88,9 +88,13 @@ class TestPytorchLightningModel(pl.LightningModule):
       res = model.eval_list([[10.0], [11.2], [14.3]])
       assert_equal 3, RbbtPython.numpy2ruby(res).length
 
+      orig_res = res
       model = VectorModel.new dir
       model.init
-
+      res = model.eval([10.0])
+      res = model.eval_list([[10.0], [11.2], [14.3]])
+      assert_equal 3, RbbtPython.numpy2ruby(res).length
+      assert_equal orig_res, res
     end
   end
 end
@@ -26,7 +26,7 @@ class TestTorch < Test::Unit::TestCase
     model.add 5.0, [10.0]
     model.add 10.0, [20.0]
 
-    model.training_args[:epochs] = 1000
+    model.model_options[:training_args][:epochs] = 1000
     model.train
 
     w = model.get_weights.to_ruby.first.first
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-dm
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.3.2
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-12-21 00:00:00.000000000 Z
+date: 2025-01-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -174,7 +174,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.5.0.dev
+rubygems_version: 3.5.23
 signing_key:
 specification_version: 4
 summary: Data-mining and statistics