rbbt-dm 1.3.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/matrix/differential.rb +1 -1
- data/lib/rbbt/matrix.rb +14 -3
- data/lib/rbbt/stan.rb +3 -4
- data/lib/rbbt/statistics/fdr.rb +2 -5
- data/lib/rbbt/statistics/hypergeometric.rb +12 -8
- data/lib/rbbt/vector/model/huggingface.rb +64 -43
- data/lib/rbbt/vector/model/python.rb +1 -1
- data/lib/rbbt/vector/model/pytorch_lightning.rb +38 -9
- data/lib/rbbt/vector/model/torch/dataloader.rb +3 -4
- data/lib/rbbt/vector/model/torch/helpers.rb +19 -1
- data/lib/rbbt/vector/model/torch/introspection.rb +7 -6
- data/lib/rbbt/vector/model/torch/load_and_save.rb +6 -0
- data/lib/rbbt/vector/model/torch.rb +45 -22
- data/lib/rbbt/vector/model.rb +27 -7
- data/python/rbbt_dm/__init__.py +4 -0
- data/python/rbbt_dm/huggingface.py +6 -3
- data/share/R/MA.R +7 -3
- data/test/rbbt/statistics/test_fdr.rb +1 -3
- data/test/rbbt/statistics/test_hypergeometric.rb +5 -0
- data/test/rbbt/vector/model/test_huggingface.rb +23 -9
- data/test/rbbt/vector/model/test_pytorch_lightning.rb +5 -1
- data/test/rbbt/vector/model/test_torch.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 904f77b8390128686b8cf153e517aff21394bd43548b8116e0d28188924a833e
+  data.tar.gz: d05f1712851cb5c552cfedac2166abb45d508cad6abbf493b41f5becde0e570c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b762389ed54ce7a91da87258f2ee856d04b4c1fef73894ac6c0e6219423967bfb42b89430f952b62d609a38c1acd2935ca4d424c35215fcc987e8af24f1fde3d
+  data.tar.gz: d3248d9996ff5f1298203d7d69595cb8f0a0dd037a4a666f0631d106b785f3401f56e5bc0ef2883e32766d1792ece1e8807a0c6d3bd7e5ecd174fca8fc698dc3
data/lib/rbbt/matrix.rb
CHANGED
@@ -12,6 +12,7 @@ class RbbtMatrix

   attr_accessor :data_file, :labels, :value_type, :format, :organism, :identifiers
   def initialize(data_file, labels = nil, value_type = nil, format = nil, organism=nil, identifiers=nil)
+    data_file = data_file.find if Path === data_file
     @data_file = data_file
     @labels = labels
     @value_type = value_type || 'count'
@@ -42,7 +43,7 @@ class RbbtMatrix
   end

   def samples
-    @samples ||= TSV.parse_header(@data_file)
+    @samples ||= TSV.parse_header(@data_file)[:fields]
   end

   def subsets=(subsets)
@@ -181,9 +182,14 @@ class RbbtMatrix

       identifiers = [identifiers, @identifiers, data.identifiers, Organism.identifiers(organism)].flatten.compact.uniq

-      data.change_key("Ensembl Gene ID", :identifiers => identifiers.reverse) do |v|
+      new_data = data.change_key("Ensembl Gene ID", :identifiers => identifiers.reverse) do |v|
         Misc.mean(v.compact)
       end
+
+      new_data.delete ""
+      new_data.delete nil
+
+      new_data
     end
     subsets = self.subsets
     matrix = RbbtMatrix.new file, labels, value_type, "Ensembl Gene ID", organism
@@ -202,9 +208,14 @@ class RbbtMatrix

       identifiers = [identifiers, @identifiers, data.identifiers, Organism.identifiers(organism)].flatten.compact.uniq

-      data.change_key("Associated Gene Name", :identifiers => identifiers.reverse) do |v|
+      new_data = data.change_key("Associated Gene Name", :identifiers => identifiers.reverse) do |v|
         Misc.mean(v.compact)
       end
+
+      new_data.delete ""
+      new_data.delete nil
+
+      new_data
     end
     subsets = self.subsets
     matrix = RbbtMatrix.new file, labels, value_type, "Associated Gene Name", organism
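Note: `RbbtMatrix#initialize` now resolves rbbt `Path` objects and `samples` returns just the header fields. A minimal sketch of the new behavior, assuming a TSV file whose columns are sample names (the file path here is hypothetical):

    matrix = RbbtMatrix.new Rbbt.data["expression.tsv"], nil, 'counts'
    matrix.samples  # => the column (sample) names, no longer the full parsed-header object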
data/lib/rbbt/stan.rb
CHANGED
@@ -1,5 +1,4 @@
 require 'rbbt/util/R'
-require 'mkfifo'

 module STAN

@@ -88,7 +87,7 @@ data{
   end

   def self.exec(data, model, input_directory, parameter_chains, sample_file, debug = FALSE, stan_options = {})
-    stan_options =
+    stan_options = IndiferentHash.add_defaults stan_options, :iter => 1000, :warmup => 500, :chains => 1, :seed => 2887, :refresh => 1200

     data = {} if data.nil?

@@ -123,7 +122,7 @@ print(fit)

   def self.stream_chain(data, model, directory = nil, options = {})
     options, directory = directory, nil if Hash === directory
-    debug =
+    debug = IndiferentHash.process_options options, :debug

     if directory.nil?
       directory = TmpFile.tmp_file
@@ -178,7 +177,7 @@ print(fit)
   end

   def self.run(data, model, directory, options = {})
-    debug =
+    debug = IndiferentHash.process_options options, :debug

     input_directory = File.join(directory, 'inputs')

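The rewritten lines rely on two rbbt-util helpers; a hedged sketch of their assumed semantics:

    options = {:debug => true, :iter => 2000}
    debug = IndiferentHash.process_options options, :debug  # => true; :debug is removed from options
    options = IndiferentHash.add_defaults options, :iter => 1000, :chains => 1
    options[:iter]    # => 2000 (existing values win)
    options[:chains]  # => 1    (missing default filled in)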
data/lib/rbbt/statistics/fdr.rb
CHANGED
@@ -172,7 +172,8 @@ module FDR
       values << p[1]
     }

-
+    iii RUBY_VERSION[0]
+    if RUBY_VERSION[0] == "2" || RUBY_VERSION[0] == "3"
     new_values = FDR.adjust(values)
     keys.zip(new_values).each do |k,v|
       vs = data[k]
@@ -195,8 +196,4 @@ module FDR
     data.unnamed = unnamed if unnamed
   end
 end
-
 end
-
-
-

data/lib/rbbt/statistics/hypergeometric.rb
CHANGED
@@ -64,10 +64,10 @@ double lBinom(double n, double k)
     builder.c_singleton <<-EOC
 /**
 * * Compute the Hypergeometric accumulated value.
-* * @param total =>
-* * @param support =>
-* * @param list =>
-* * @param found =>
+* * @param total => Balls in urn
+* * @param support => Positive balls in urn
+* * @param list => Drawn balls
+* * @param found => Positive drawn balls
 * * @return The result
 * */
 //pvalues[annotation] = Hypergeometric.hypergeometric(tsv_size, counts[annotation], total, count)
@@ -102,10 +102,13 @@ double hypergeometric_c(double total, double support, double list, double found)
 EOC
   end

-  def self.
-    #RSRuby.instance.phyper(count - 1, positive, negative, total, false).to_f
+  def self.hypergeometric_R(count, positive, negative, total)
     R.eval("phyper(#{ count } - 1, #{ positive }, #{ negative }, #{ total }, lower.tail=FALSE)").to_f
   end
+
+  def self.hypergeometric(count, positive, negative, total)
+    hypergeometric_c(positive + negative, positive, total, count)
+  end
 end

 module TSV
@@ -260,7 +263,8 @@ module TSV
       elems = elems.collect{|elem| rename.include?(elem)? rename[elem] : elem }.compact.uniq if rename
       count = elems.length
       next if count < options[:min_support] or not counts.include? annotation
-      pvalues[annotation] = Hypergeometric.hypergeometric(count, counts[annotation], tsv_size - counts[annotation], total)
+      #pvalues[annotation] = Hypergeometric.hypergeometric(count, counts[annotation], tsv_size - counts[annotation], total)
+      pvalues[annotation] = Hypergeometric.hypergeometric_c(tsv_size, counts[annotation], total, count)
     end

     pvalues = FDR.adjust_hash! pvalues if options[:fdr]
@@ -268,7 +272,7 @@ module TSV
     pvalues.delete_if{|k, pvalue| pvalue > options[:cutoff] } if options[:cutoff]

     if add_keys
-      tsv = TSV.setup(pvalues.keys.collect{|k| k.dup}, :key_field => fields, :fields => [], :type => :double)
+      tsv = TSV.setup(pvalues.keys.collect{|k| k.dup }, :key_field => fields, :fields => [], :type => :double)

       tsv.add_field 'p-value' do |annot, values|
         [pvalues[annot]]
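The C helper and the Ruby wrapper take their arguments in different orders; a sketch using the signatures above, with values from the new test added further down:

    # hypergeometric_c(total, support, list, found): chance of at least `found`
    # positives when drawing `list` balls from an urn with `support` positives among `total`
    Hypergeometric.hypergeometric_c(2, 1, 1, 1)  # => 0.5
    # The new Ruby wrapper reorders (count, positive, negative, total) before delegating:
    Hypergeometric.hypergeometric(1, 1, 1, 1)    # same call as the line above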
data/lib/rbbt/vector/model/huggingface.rb
CHANGED
@@ -2,23 +2,39 @@ require 'rbbt/vector/model/torch'

 class HuggingfaceModel < TorchModel

+  attr_accessor :tokenizer
+  def init
+    @model, @tokenizer = self.instance_exec(&@init_model) if @model.nil?
+    [@model, @tokenizer]
+  end
+
+  def tokenizer
+    init
+    @tokenizer
+  end
+
   def initialize(task, checkpoint, dir = nil, model_options = {})
     super(dir, nil, model_options)

     checkpoint = checkpoint.find if Path === checkpoint

-    @model_options =
+    @model_options[:tokenizer_options] = @model_options.delete(:tokenizer_args) if @model_options.include?(:tokenizer_args)
+    tokenizer_args = IndiferentHash.pull_keys @model_options, :tokenizer
+    @model_options[:tokenizer_args] = tokenizer_args
+
+    @model_options[:task] = task if task
+    @model_options[:checkpoint] = checkpoint if checkpoint

     init_model do
       checkpoint = @model_path && File.directory?(@model_path) ? @model_path : @model_options[:checkpoint]

       model = RbbtPython.call_method("rbbt_dm.huggingface", :load_model,
-        @model_options[:task], checkpoint, **(IndiferentHash.setup(model_options
+        @model_options[:task], checkpoint, **(IndiferentHash.setup(@model_options.except(:training_args, :tokenizer_args, :task, :checkpoint, :class_labels))))

-      tokenizer_checkpoint = @model_options[:
+      tokenizer_checkpoint = @model_options[:tokenizer_args][:checkpoint] || checkpoint

       tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_tokenizer,
-
+        tokenizer_checkpoint, **(IndiferentHash.setup(@model_options[:tokenizer_args])))

       [model, tokenizer]
     end
@@ -46,7 +62,7 @@ class HuggingfaceModel < TorchModel
     end

     dataset_file = TorchModel.text_dataset(tsv_file, texts)
-    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir,
+    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, {})

     begin
       RbbtPython.call_method("rbbt_dm.huggingface", :predict_model, model, tokenizer, training_args_obj, dataset_file, locate_tokens)
@@ -71,7 +87,7 @@ class HuggingfaceModel < TorchModel
       checkpoint_dir = File.join(tmpdir, 'checkpoints')
     end

-    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir,
+    training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, training_args)
     dataset_file = HuggingfaceModel.text_dataset(tsv_file, texts, labels, @model_options[:class_labels])

     RbbtPython.call_method("rbbt_dm.huggingface", :train_model, model, tokenizer, training_args_obj, dataset_file, @model_options[:class_weights])
@@ -96,43 +112,47 @@ class HuggingfaceModel < TorchModel
       predictions = result["logits"]
     end

-    [37 removed lines whose content was not preserved in this extract]
+    if @model_options[:return_logits]
+      result = RbbtPython.numpy2ruby(predictions)
+    else
+      task, class_labels, locate_tokens = @model_options.values_at :task, :class_labels, :locate_tokens
+      result = case task
+               when "SequenceClassification"
+                 RbbtPython.collect(predictions) do |logits|
+                   logits = RbbtPython.numpy2ruby logits
+                   best_class = logits.index logits.max
+                   best_class = class_labels[best_class] if class_labels
+                   best_class
+                 end
+               when "MaskedLM"
+                 all_token_positions = token_positions.to_a
+
+                 i = 0
+                 RbbtPython.collect(predictions) do |item_logits|
+                   item_token_positions = all_token_positions[i]
+                   i += 1
+
+                   item_logits = RbbtPython.numpy2ruby(item_logits)
+                   item_masks = item_token_positions.collect do |token_positions|
+
+                     best = item_logits.values_at(*token_positions).collect do |logits|
+                       best_token, best_score = nil
+                       logits.each_with_index do |v,i|
+                         if best_score.nil? || v > best_score
+                           best_token, best_score = i, v
+                         end
+                       end
+                       best_token
+                     end
+
+                     best.collect{|b| tokenizer.decode(b) } * "|"
+                   end
+                   Array === locate_tokens ? item_masks : item_masks.first
+                 end
+               else
+                 predictions
+               end
+    end

     (! is_list || single) && Array === result ? result.first : result
   end
@@ -144,6 +164,7 @@ class HuggingfaceModel < TorchModel
   def reset_model
     @model, @tokenizer = nil
     Open.rm_rf @model_path
+    Open.rm_rf TorchModel.model_architecture(model_path)
     init
   end
 end
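With these changes any tokenizer option is routed into `@model_options[:tokenizer_args]`, and `:return_logits` skips label decoding. A hedged usage sketch mirroring the tests further down:

    model = HuggingfaceModel.new "SequenceClassification",
      "distilbert-base-uncased-finetuned-sst-2-english", dir,
      :tokenizer_args => {:max_length => 16}, :class_labels => %w(Bad Good)
    model.model_options[:return_logits] = true
    logits = model.eval_list(["This is dog", "This is cat"])  # one row of raw logits per text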
data/lib/rbbt/vector/model/pytorch_lightning.rb
CHANGED
@@ -7,25 +7,54 @@ class PytorchLightningModel < TorchModel

     train_model do |features,labels|
       model = init
-
+      train_loader = self.loader
       val_loader = self.val_loader
-      if
-
-
-
-
+      if train_loader.nil?
+        batch_size ||= model_options[:training_args][:batch_size]
+        batch_size ||= model_options[:batch_size]
+        batch_size ||= 1
+
+        shuffle = model_options[:training_args][:shuffle]
+        shuffle = true if shuffle.nil?
+
+        num_workers = Rbbt.config(:num_workers, :dataloader, :default => 2)
+        train_loader = RbbtPython.run :torch do
+          dataset = features.zip(labels).collect{|f,l| [torch.tensor(f), l] }
+          torch.utils.data.DataLoader.call(dataset, batch_size: batch_size, shuffle: shuffle, num_workers: num_workers.to_i)
         end
       end
-      trainer.fit(model,
+      trainer.fit(model, train_loader, val_loader)
       TorchModel.save_architecture(model, model_path) if @directory
       TorchModel.save_state(model, model_path) if @directory
     end
+
+    eval_model do |features,list=false|
+      model = init
+      eval_loader = self.loader
+      if list
+        if eval_loader.nil?
+          batch_size ||= model_options[:batch_size]
+          batch_size ||= model_options[:training_args][:batch_size]
+          batch_size ||= 1
+
+          num_workers = Rbbt.config(:num_workers, :dataloader, :default => 2)
+          eval_loader = RbbtPython.run :torch do
+            dataset = torch.tensor(features)
+            torch.utils.data.DataLoader.call(dataset, batch_size: batch_size, num_workers: num_workers.to_i)
+          end
+        end
+        trainer.predict(model, eval_loader).inject([]){|acc,res| acc.concat RbbtPython.numpy2ruby(res[1])}
+      else
+        model.call(torch.tensor(features))
+      end
+    end
   end

   def trainer
     @trainer ||= begin
-
-
+      trainer_args = {default_root_dir: File.join(@directory, 'checkpoints')}.
+        merge(model_options[:training_args].except(:batch_size))
+      RbbtPython.class_new_obj("pytorch_lightning", "Trainer", trainer_args)
     end
   end
 end
data/lib/rbbt/vector/model/torch/dataloader.rb
CHANGED
@@ -42,13 +42,12 @@ class TorchModel
   end

   def self.text_dataset(tsv_dataset_file, elements, labels = nil, class_labels = nil)
-    elements = elements.collect{|e| e.gsub("\n", ' ') }
+    elements = elements.compact.collect{|e| e.gsub("\n", ' ').gsub('"', '\'') }
     tsv = feature_tsv(elements, labels, class_labels)
+    tsv.fields[0] = "text"
     if labels.nil?
-      tsv
-      tsv.type = :single
+      tsv = tsv.to_single
     else
-      tsv.fields[0] = "text"
       tsv.type = :list
     end
     Open.write(tsv_dataset_file, tsv.to_s)
data/lib/rbbt/vector/model/torch/helpers.rb
CHANGED
@@ -3,9 +3,27 @@ class TorchModel
     def to_ruby
       RbbtPython.numpy2ruby(self)
     end
+
+    def to_ruby!
+      r = self.to_ruby
+      self.del
+      r
+    end
+
+    def length
+      PyCall.len(self)
+    end
+
     def self.setup(obj)
       obj.extend Tensor
     end
+
+    def del
+      self.detach
+      self.grad = nil
+      self.storage.resize_ 0
+      self.to("cpu")
+    end
   end

   def self.init_python
@@ -46,7 +64,7 @@ class TorchModel
   end

   def self.tensor(obj, device, dtype)
-    RbbtPython.torch.tensor(obj, dtype: dtype, device: device)
+    TorchModel::Tensor.setup(RbbtPython.torch.tensor(obj, dtype: dtype, device: device))
   end

 end
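A sketch of the new `Tensor` helpers in use; the dtype constant is an assumption (any torch dtype reachable through `RbbtPython` would do):

    tensor = TorchModel.tensor([1.0, 2.0], "cpu", RbbtPython.torch.float)
    values = tensor.to_ruby!  # numpy2ruby conversion, then #del releases grad and storage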
data/lib/rbbt/vector/model/torch/introspection.rb
CHANGED
@@ -13,19 +13,20 @@ class TorchModel
   end
   def get_weights(...); TorchModel.get_weights(model, ...); end

-  def self.freeze(layer)
+  def self.freeze(layer, requires_grad=false)
     begin
-      PyCall.getattr(layer, :weight).requires_grad =
+      PyCall.getattr(layer, :weight).requires_grad = requires_grad
     rescue
     end
     RbbtPython.iterate(layer.children) do |layer|
-      freeze(layer)
+      freeze(layer, requires_grad)
     end
   end
-
+
+  def self.freeze_layer(model, layer, requires_grad = false)
     layer = get_layer(model, layer)
-    freeze(layer)
+    freeze(layer, requires_grad)
   end
-  def freeze_layer(...); TorchModel.freeze_layer(model, ...); end

+  def freeze_layer(...); TorchModel.freeze_layer(model, ...); end
 end
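The `requires_grad` flag now recurses through `layer.children`, so a subtree can be frozen and later unfrozen. A hedged sketch (the layer name is hypothetical):

    model.freeze_layer 'encoder'        # requires_grad defaults to false: freeze
    model.freeze_layer 'encoder', true  # pass true to re-enable gradients on the same subtree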
data/lib/rbbt/vector/model/torch/load_and_save.rb
CHANGED
@@ -27,4 +27,10 @@ class TorchModel
     Log.debug "Loading model architecture from #{model_architecture}"
     RbbtPython.torch.load(model_architecture)
   end
+
+  def reset_model
+    @trainer = @model = nil
+    Open.rm_rf model_path
+    Open.rm_rf TorchModel.model_architecture(model_path)
+  end
 end
data/lib/rbbt/vector/model/torch.rb
CHANGED
@@ -2,47 +2,37 @@ require_relative 'python'

 class TorchModel < PythonModel

-  attr_accessor :
+  attr_accessor :criterion, :optimizer

   def initialize(...)
     TorchModel.init_python
     super(...)
-    @training_args = model_options[:training_args] || {}

+    @model_options[:training_options] = @model_options.delete(:training_args) if @model_options.include?(:training_args)
+    training_args = IndiferentHash.pull_keys(@model_options, :training) || {}
+    @model_options[:training_args] = training_args
     init_model do
       model = TorchModel.load_architecture(model_path)
       if model.nil?
         RbbtPython.add_path @directory
-        RbbtPython.
+        RbbtPython.process_paths
+        RbbtPython.class_new_obj(@python_module, @python_class, **model_options.except(:training_args, :batch_size))
       else
         TorchModel.load_state(model, model_path)
       end
     end

-    eval_model do |features,list=false|
-      init
-      @device ||= TorchModel.device(model_options)
-      @dtype ||= TorchModel.dtype(model_options)
-      model.to(@device)
-
-      tensor = list ? TorchModel.tensor(features, @device, @dtype) : TorchModel.tensor([features], @device, @dtype)
-
-      loss, res = model.call(tensor)
-
-      res = loss if res.nil?
-
-      res = TorchModel::Tensor.setup(list ? res : res[0])
-
-      res
-    end
-
     train_model do |features,labels|
       init
       @device ||= TorchModel.device(model_options)
       @dtype ||= TorchModel.dtype(model_options)
       model.to(@device)
-      @optimizer ||= TorchModel.optimizer(model, training_args)
-
+      @optimizer ||= TorchModel.optimizer(model, model_options[:training_args] || {})
+
+      epochs = model_options[:training_args][:epochs] || 3
+      batch_size = model_options[:batch_size]
+      batch_size ||= model_options[:training_args][:batch_size]
+      batch_size ||= 1

       inputs = TorchModel.tensor(features, @device, @dtype)
       #target = TorchModel.tensor(labels.collect{|v| [v] }, @device, @dtype)
@@ -63,6 +53,39 @@ class TorchModel < PythonModel
       TorchModel.save_architecture(model, model_path) if @directory
       TorchModel.save_state(model, model_path) if @directory
     end
+
+    eval_model do |features,list=false|
+      init
+      @device ||= TorchModel.device(model_options)
+      @dtype ||= TorchModel.dtype(model_options)
+      model.to(@device)
+      model.eval
+
+      features = [features] unless list
+
+      batch_size = model_options[:batch_size]
+      batch_size ||= model_options[:training_args][:batch_size]
+      batch_size ||= 1
+
+      res = Misc.chunk(features, batch_size).inject(nil) do |acc,batch|
+        tensor = TorchModel.tensor(batch, @device, @dtype)
+
+        loss, chunk_res = model.call(tensor)
+        tensor.del
+
+        chunk_res = loss if chunk_res.nil?
+
+        TorchModel::Tensor.setup(chunk_res)
+        acc = acc.nil? ? chunk_res.to_ruby! : acc + chunk_res.to_ruby!
+
+        acc
+      end
+
+      res = TorchModel::Tensor.setup(list ? res : res[0])
+
+      res
+    end
+
   end
 end
 require_relative 'torch/helpers'
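Training options now use a `training_` prefix collected into `@model_options[:training_args]`; a hedged sketch of the assumed `IndiferentHash.pull_keys` behavior:

    opts = IndiferentHash.setup(:training_epochs => 5, :training_batch_size => 2, :other => 1)
    IndiferentHash.pull_keys opts, :training  # => {"epochs" => 5, "batch_size" => 2}
    opts                                      # => {:other => 1}; the training_* keys are consumed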
data/lib/rbbt/vector/model.rb
CHANGED
@@ -3,7 +3,7 @@ require 'rbbt/vector/model/util'
 require 'rbbt/util/python'

 RbbtPython.add_path Rbbt.python.find(:lib)
-RbbtPython.init_rbbt
+#RbbtPython.init_rbbt

 class VectorModel
   attr_accessor :directory, :model_path, :extract_features, :init_model, :train_model, :eval_model, :post_process, :balance
@@ -166,8 +166,9 @@ cat(paste(label, sep="\\n", collapse="\\n"));
     @options_file = File.join(@directory, "options.json")

     if File.exist?(@options_file)
-
-      IndiferentHash.setup(
+      file_options = JSON.parse(Open.read(@options_file))
+      IndiferentHash.setup(file_options)
+      @model_options = file_options.deep_merge(@model_options)
     end
   end

@@ -254,8 +255,24 @@ cat(paste(label, sep="\\n", collapse="\\n"));

   def add_list(elements, labels = nil)
     if @extract_features.nil? || @extract_features.arity == 1
-
-
+      case labels
+      when nil
+        elements.each do |elem|
+          add(elem)
+        end
+      when Array
+        elements.zip(labels).each do |elem,label|
+          add(elem, label)
+        end
+      when Hash
+        elements.each do |elem|
+          label = labels[elem]
+          add(elem, label)
+        end
+      else
+        elements.each do |elem|
+          add(elem, labels)
+        end
       end
     else
       features = self.instance_exec(nil, elements, &@extract_features)
@@ -482,8 +499,11 @@ cat(paste(label, sep="\\n", collapse="\\n"));
       @labels = orig_labels
     end unless folds == -1

-
-
+    if folds != 1
+      self.reset_model if self.respond_to? :reset_model
+      self.train
+    end
+
     res
   end
 end
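`add_list` now accepts labels in any of the four shapes handled by the case statement above; sketched usage:

    model.add_list %w(a b)                               # no labels
    model.add_list %w(a b), %w(good bad)                 # parallel Array
    model.add_list %w(a b), "a" => "good", "b" => "bad"  # Hash keyed by element
    model.add_list %w(a b), "good"                       # one label for every element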
data/python/rbbt_dm/__init__.py
CHANGED
@@ -27,5 +27,9 @@ def tsv_dataset(filename, *args, **kwargs):
 def tsv(*args, **kwargs):
     return tsv_dataset(*args, **kwargs)

+def tsv_loader(*args, **kwargs):
+    dataset = tsv(*args, kwargs)
+    return torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)
+
 def data_dir():
     return rbbt.path('var/rbbt_dm/data')
data/python/rbbt_dm/huggingface.py
CHANGED
@@ -15,15 +15,15 @@ def load_model(task, checkpoint, **kwargs):
         return import_module_class(module, class_name).from_pretrained(checkpoint, **kwargs)
     else:
         class_name = 'AutoModelFor' + task
-        return import_module_class('transformers', class_name).from_pretrained(checkpoint)
+        return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)

-def load_tokenizer(
+def load_tokenizer(checkpoint, **kwargs):
     class_name = 'AutoTokenizer'
     return import_module_class('transformers', class_name).from_pretrained(checkpoint, **kwargs)

 def load_model_and_tokenizer(task, checkpoint):
     model = load_model(task, checkpoint)
-    tokenizer = load_tokenizer(
+    tokenizer = load_tokenizer(checkpoint)
     return model, tokenizer

 # Not used
@@ -88,6 +88,9 @@ def training_args(*args, **kwargs):
 def train_model(model, tokenizer, training_args, dataset, class_weights=None, **kwargs):
     from transformers import Trainer

+    # Note: Parameters need to be made contiguous. I'm not sure why they weren't
+    for param in model.parameters(): param.data = param.data.contiguous()
+
     if (isinstance(dataset, str)):
         if (dataset.endswith('.json')):
             tokenized_dataset = json_dataset(tokenizer, dataset)
data/share/R/MA.R
CHANGED
@@ -99,6 +99,8 @@ rbbt.dm.matrix.differential.limma <- function(data, main, contrast=NULL, log2=NU
     }

     if (log2){
+        full_rows = apply(is.na(data), 1, sum) == 0
+        data = data[full_rows,]
        cutoff <- 1
        drop <- which(apply(data, 1, max) < cutoff)
        min = min(data[data != -Inf])
@@ -106,7 +108,8 @@ rbbt.dm.matrix.differential.limma <- function(data, main, contrast=NULL, log2=NU
        data <- DGEList(data)
        data <- calcNormFactors(data)
        data = cpm(data, log=TRUE, prior.count=3)
-
+        if (length(drop) > 0)
+            data <- data[-drop,]
     }else{
        data[data == 0] = NA
        good.rows = apply(is.na(data),1,sum) != dim(data)[2]
@@ -181,10 +184,11 @@ rbbt.dm.matrix.differential <- function(file, main, contrast = NULL, type = 'lim
        contrast <- make.names(contrast);
     }

-    if (type == 'limma')
+    if (is.null(type) || type == 'limma'){
        result = rbbt.dm.matrix.differential.limma(data, main, contrast, log2, two.channel, eBayes.trend)
-    else
+    }else{
        result = rbbt.dm.matrix.differential.DESeq(data, main, contrast)
+    }

     if (is.null(outfile)){
        return(result);
data/test/rbbt/statistics/test_fdr.rb
CHANGED
@@ -32,8 +32,6 @@ class TestFDR < Test::Unit::TestCase
     assert_equal(clean(@r_adj), clean(FDR.adjust_native(@values)))
     assert_equal(clean(FDR.adjust_fast(@values)), clean(FDR.adjust_native(@values)))

-    assert_equal(clean(@r_adj), clean(FDR.adjust_fast_self(copy(@values)))) if RUBY_VERSION[0]
+    assert_equal(clean(@r_adj), clean(FDR.adjust_fast_self(copy(@values)))) if RUBY_VERSION[0].to_i < 2
   end
 end
-
-
data/test/rbbt/statistics/test_hypergeometric.rb
CHANGED
@@ -4,6 +4,11 @@ require 'test/unit'

 class TestHypergeometric < Test::Unit::TestCase

+  def test_hypergeometric_c
+    assert_equal Hypergeometric.hypergeometric_c(2, 1, 1, 1).round(2), 0.5
+    assert_equal Hypergeometric.hypergeometric_c(10, 1, 1, 1).round(2), 0.1
+  end
+
   def test_hypergeometric
     assert Hypergeometric.hypergeometric(100, 20, 15, 13) < 0.0005
   end
data/test/rbbt/vector/model/test_huggingface.rb
CHANGED
@@ -9,12 +9,8 @@ class TestHuggingface < Test::Unit::TestCase
     task = "SequenceClassification"

     model = HuggingfaceModel.new task, checkpoint, dir, :class_labels => %w(bad good)
-    iii model.eval "This is dog"
-    iii model.eval "This is cat"
-    iii model.eval_list(["This is dog", "This is cat"])

     model = VectorModel.new dir
-    iii model.eval_list(["This is dog", "This is cat"])
   end
 end

@@ -42,7 +38,7 @@ class TestHuggingface < Test::Unit::TestCase
     assert_equal 5, tokenizer.call("This is a sentence that has several words", truncation: true)["input_ids"].__len__
   end

-  def
+  def _test_sst_eval
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

@@ -55,12 +51,29 @@ class TestHuggingface < Test::Unit::TestCase
     end
   end

+  def _test_sst_logits
+    TmpFile.with_file do |dir|
+      checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+
+      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, :tokenizer_args => {:max_length => 16}
+
+      model.model_options[:class_labels] = ["Bad", "Good"]
+      model.model_options[:return_logits] = true
+
+      logits = model.eval("This is dog")
+      assert logits[0] > logits[1]
+      logits = model.eval_list(["This is dog", "This is cat"])
+      assert logits[0][0] > logits[0][1]
+      assert logits[1][0] < logits[1][1]
+    end
+  end
+

-  def
+  def test_sst_train
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

-      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, max_length: 128
+      model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir, tokenizer_args:{max_length: 128}, tokenizer_padding: true, tokenizer_truncation: true

       model.model_options[:class_labels] = %w(Bad Good)

@@ -148,12 +161,12 @@ class TestHuggingface < Test::Unit::TestCase

     model = VectorModel.new dir

-    assert_equal "Good", model.eval_list("This is dog")
+    assert_equal ["Good"], model.eval_list(["This is dog"])

   end
 end

-  def
+  def __test_sst_stress_test
     TmpFile.with_file do |dir|
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

@@ -252,6 +265,7 @@ class RobertaForTokenClassification_NER(RobertaPreTrainedModel):
 EOF

     RbbtPython.add_path dir
+    RbbtPython.process_paths

     biomedical_roberta = "PlanTL-GOB-ES/bsc-bio-ehr-es-cantemist"
     model = HuggingfaceModel.new "mypkg.mymodel:RobertaForTokenClassification_NER", biomedical_roberta
data/test/rbbt/vector/model/test_pytorch_lightning.rb
CHANGED
@@ -88,9 +88,13 @@ class TestPytorchLightningModel(pl.LightningModule):
       res = model.eval_list([[10.0], [11.2], [14.3]])
       assert_equal 3, RbbtPython.numpy2ruby(res).length

+      orig_res = res
       model = VectorModel.new dir
       model.init
-
+      res = model.eval([10.0])
+      res = model.eval_list([[10.0], [11.2], [14.3]])
+      assert_equal 3, RbbtPython.numpy2ruby(res).length
+      assert_equal orig_res, res
     end
   end
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-dm
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.3.2
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2025-01-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -174,7 +174,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.5.
+rubygems_version: 3.5.23
 signing_key:
 specification_version: 4
 summary: Data-mining and statistics