RubyGems - rbbt-dm - Versions diffs - 1.2.6 → 1.2.9 - Mend

rbbt-dm 1.2.6 → 1.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

checksums.yaml +4 -4
data/lib/rbbt/matrix/barcode.rb +2 -2
data/lib/rbbt/matrix/differential.rb +3 -3
data/lib/rbbt/matrix/knowledge_base.rb +1 -1
data/lib/rbbt/plots/bar.rb +1 -1
data/lib/rbbt/stan.rb +1 -1
data/lib/rbbt/statistics/hypergeometric.rb +2 -1
data/lib/rbbt/vector/model/huggingface/masked_lm.rb +50 -0
data/lib/rbbt/vector/model/huggingface.rb +57 -38
data/lib/rbbt/vector/model/pytorch_lightning.rb +35 -0
data/lib/rbbt/vector/model/random_forest.rb +1 -1
data/lib/rbbt/vector/model/spaCy.rb +8 -14
data/lib/rbbt/vector/model/tensorflow.rb +6 -5
data/lib/rbbt/vector/model/torch.rb +37 -0
data/lib/rbbt/vector/model/util.rb +18 -0
data/lib/rbbt/vector/model.rb +100 -56
data/python/rbbt_dm/__init__.py +48 -1
data/python/rbbt_dm/atcold/__init__.py +0 -0
data/python/rbbt_dm/atcold/plot_lib.py +141 -0
data/python/rbbt_dm/atcold/spiral.py +27 -0
data/python/rbbt_dm/huggingface.py +57 -26
data/python/rbbt_dm/language_model.py +70 -0
data/python/rbbt_dm/util.py +30 -0
data/share/spaCy/gpu/textcat_accuracy.conf +2 -1
data/test/rbbt/vector/model/huggingface/test_masked_lm.rb +41 -0
data/test/rbbt/vector/model/test_huggingface.rb +258 -27
data/test/rbbt/vector/model/test_pytorch_lightning.rb +83 -0
data/test/rbbt/vector/model/test_spaCy.rb +1 -1
data/test/rbbt/vector/model/test_tensorflow.rb +3 -0
data/test/rbbt/vector/test_model.rb +25 -26
data/test/test_helper.rb +13 -0
metadata +26 -16
data/lib/rbbt/tensorflow.rb +0 -43
data/lib/rbbt/vector/model/huggingface.old.rb +0 -160

data/test/test_helper.rb CHANGED Viewed

@@ -19,4 +19,17 @@ class Test::Unit::TestCase
   def datafile_test(file)
     Test::Unit::TestCase.datafile_test(file)
   end
+  def with_python(code, &block)
+    TmpFile.with_file do |dir|
+      pkg = "pkg#{rand(100)}"
+      Open.write File.join(dir, "#{pkg}/__init__.py"), code
+      RbbtPython.add_path dir
+      Misc.in_dir dir do
+        yield pkg
+      end
+    end
+  end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-dm
 version: !ruby/object:Gem::Version
-  version: 1.2.6
+  version: 1.2.9
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-02-08 00:00:00.000000000 Z
+date: 2023-08-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -105,17 +105,23 @@ files:
 - lib/rbbt/statistics/hypergeometric.rb
 - lib/rbbt/statistics/random_walk.rb
 - lib/rbbt/statistics/rank_product.rb
-- lib/rbbt/tensorflow.rb
 - lib/rbbt/vector/model.rb
-- lib/rbbt/vector/model/huggingface.old.rb
 - lib/rbbt/vector/model/huggingface.rb
+- lib/rbbt/vector/model/huggingface/masked_lm.rb
+- lib/rbbt/vector/model/pytorch_lightning.rb
 - lib/rbbt/vector/model/random_forest.rb
 - lib/rbbt/vector/model/spaCy.rb
 - lib/rbbt/vector/model/svm.rb
 - lib/rbbt/vector/model/tensorflow.rb
+- lib/rbbt/vector/model/torch.rb
 - lib/rbbt/vector/model/util.rb
 - python/rbbt_dm/__init__.py
+- python/rbbt_dm/atcold/__init__.py
+- python/rbbt_dm/atcold/plot_lib.py
+- python/rbbt_dm/atcold/spiral.py
 - python/rbbt_dm/huggingface.py
+- python/rbbt_dm/language_model.py
+- python/rbbt_dm/util.py
 - share/R/MA.R
 - share/R/barcode.R
 - share/R/heatmap.3.R
@@ -135,7 +141,9 @@ files:
 - test/rbbt/statistics/test_random_walk.rb
 - test/rbbt/test_ml_task.rb
 - test/rbbt/test_stan.rb
+- test/rbbt/vector/model/huggingface/test_masked_lm.rb
 - test/rbbt/vector/model/test_huggingface.rb
+- test/rbbt/vector/model/test_pytorch_lightning.rb
 - test/rbbt/vector/model/test_spaCy.rb
 - test/rbbt/vector/model/test_svm.rb
 - test/rbbt/vector/model/test_tensorflow.rb
@@ -159,22 +167,24 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.1.2
+rubygems_version: 3.4.19
 signing_key:
 specification_version: 4
 summary: Data-mining and statistics
 test_files:
-- test/test_helper.rb
-- test/rbbt/vector/test_model.rb
-- test/rbbt/vector/model/test_huggingface.rb
-- test/rbbt/vector/model/test_tensorflow.rb
-- test/rbbt/vector/model/test_spaCy.rb
-- test/rbbt/vector/model/test_svm.rb
-- test/rbbt/statistics/test_random_walk.rb
-- test/rbbt/statistics/test_fisher.rb
+- test/rbbt/matrix/test_barcode.rb
+- test/rbbt/network/test_paths.rb
 - test/rbbt/statistics/test_fdr.rb
+- test/rbbt/statistics/test_fisher.rb
 - test/rbbt/statistics/test_hypergeometric.rb
-- test/rbbt/test_stan.rb
-- test/rbbt/matrix/test_barcode.rb
+- test/rbbt/statistics/test_random_walk.rb
 - test/rbbt/test_ml_task.rb
-- test/rbbt/network/test_paths.rb
+- test/rbbt/test_stan.rb
+- test/rbbt/vector/model/huggingface/test_masked_lm.rb
+- test/rbbt/vector/model/test_huggingface.rb
+- test/rbbt/vector/model/test_pytorch_lightning.rb
+- test/rbbt/vector/model/test_spaCy.rb
+- test/rbbt/vector/model/test_svm.rb
+- test/rbbt/vector/model/test_tensorflow.rb
+- test/rbbt/vector/test_model.rb
+- test/test_helper.rb

data/lib/rbbt/tensorflow.rb DELETED Viewed

@@ -1,43 +0,0 @@
-require 'rbbt/util/python'
-module RbbtTensorflow
-  def self.init
-    RbbtPython.run do
-      pyimport "tensorflow", as: "tf"
-    end
-  end
-  def self.test
-    mod = x_test = y_test = nil
-    RbbtPython.run do
-      mnist_db = tf.keras.datasets.mnist
-      (x_train, y_train), (x_test, y_test) = mnist_db.load_data()
-      x_train, x_test = x_train / 255.0, x_test / 255.0
-      mod = tf.keras.models.Sequential.new([
-        tf.keras.layers.Flatten.new(input_shape: [28, 28]),
-        tf.keras.layers.Dense.new(128, activation:'relu'),
-        tf.keras.layers.Dropout.new(0.2),
-        tf.keras.layers.Dense.new(10, activation:'softmax')
-      ])
-      mod.compile(optimizer='adam',
-                  loss='sparse_categorical_crossentropy',
-                  metrics=['accuracy'])
-      mod.fit(x_train, y_train, epochs:3)
-      mod
-    end
-    RbbtPython.run do
-      mod.evaluate(x_test,  y_test, verbose:2)
-    end
-  end
-end
-if __FILE__ == $0
-  RbbtTensorflow.init
-  RbbtTensorflow.test
-end

data/lib/rbbt/vector/model/huggingface.old.rb DELETED Viewed

@@ -1,160 +0,0 @@
-require 'rbbt/vector/model'
-require 'rbbt/util/python'
-RbbtPython.add_path Rbbt.python.find(:lib)
-RbbtPython.init_rbbt
-class HuggingfaceModel < VectorModel
-  attr_accessor :checkpoint, :task, :locate_tokens, :class_labels, :class_weights, :training_args
-  def self.tsv_dataset(tsv_dataset_file, elements, labels = nil)
-    if labels
-      Open.write(tsv_dataset_file) do |ffile|
-        ffile.puts ["label", "text"].flatten * "\t"
-        elements.zip(labels).each do |element,label|
-          ffile.puts [label, element].flatten * "\t"
-        end
-      end
-    else
-      Open.write(tsv_dataset_file) do |ffile|
-        ffile.puts ["text"].flatten * "\t"
-        elements.each{|element| ffile.puts element }
-      end
-    end
-    tsv_dataset_file
-  end
-  def self.call_method(name, *args)
-    RbbtPython.import_method("rbbt_dm.huggingface", name).call(*args)
-  end
-  def call_method(name, *args)
-    HuggingfaceModel.call_method(name, *args)
-  end
-  #def input_tsv_file
-  #  File.join(@directory, 'dataset.tsv') if @directory
-  #end
-  #def checkpoint_dir
-  #  File.join(@directory, 'checkpoints') if @directory
-  #end
-  def self.run_model(model, tokenizer, elements, labels = nil, training_args = {}, class_weights = nil)
-    TmpFile.with_file do |tmpfile|
-      tsv_file = File.join(tmpfile, 'dataset.tsv')
-      if training_args
-        training_args = training_args.dup
-        checkpoint_dir = training_args.delete(:checkpoint_dir)
-      end
-      checkpoint_dir = File.join(tmpfile, 'checkpoints')
-      Open.mkdir File.dirname(tsv_file)
-      Open.mkdir File.dirname(checkpoint_dir)
-      if labels
-        training_args_obj = call_method(:training_args, checkpoint_dir, **training_args)
-        call_method(:train_model, model, tokenizer, training_args_obj, tsv_dataset(tsv_file, elements, labels), class_weights)
-      else
-        locate_tokens, training_args = training_args, {}
-        if Array === elements
-          training_args_obj = call_method(:training_args, checkpoint_dir)
-          call_method(:predict_model, model, tokenizer, training_args_obj, tsv_dataset(tsv_file, elements), locate_tokens)
-        else
-          call_method(:eval_model, model, tokenizer, [elements], locate_tokens)
-        end
-      end
-    end
-  end
-  def init_model
-    @model, @tokenizer = call_method(:load_model_and_tokenizer, @task, @checkpoint)
-  end
-  def reset_model
-    init_model
-  end
-  def initialize(task, initial_checkpoint = nil, *args)
-    super(*args)
-    @task = task
-    @checkpoint = model_file && File.exists?(model_file)? model_file : initial_checkpoint
-    init_model
-    @locate_tokens = @tokenizer.special_tokens_map["mask_token"]  if @task == "MaskedLM"
-    @training_args = {}
-    train_model do |file,elements,labels|
-      HuggingfaceModel.run_model(@model, @tokenizer, elements, labels, @training_args, @class_weights)
-      @model.save_pretrained(file) if file
-      @tokenizer.save_pretrained(file) if file
-    end
-    eval_model do |file,elements|
-      @model, @tokenizer = HuggingfaceModel.call_method(:load_model_and_tokenizer, @task, @checkpoint)
-      HuggingfaceModel.run_model(@model, @tokenizer, elements, nil, @locate_tokens)
-    end
-    post_process do |result|
-      if result.respond_to?(:predictions)
-        single = false
-        predictions = result.predictions
-      elsif result["token_positions"]
-        predictions = result["result"].predictions
-        token_positions = result["token_positions"]
-      else
-        single = true
-        predictions = result["logits"]
-      end
-      result = case @task
-               when "SequenceClassification"
-                 RbbtPython.collect(predictions) do |logits|
-                   logits = RbbtPython.numpy2ruby logits
-                   best_class = logits.index logits.max
-                   best_class = @class_labels[best_class] if @class_labels
-                   best_class
-                 end
-               when "MaskedLM"
-                 all_token_positions = token_positions.to_a
-                 i = 0
-                 RbbtPython.collect(predictions) do |item_logits|
-                   item_token_positions = all_token_positions[i]
-                   i += 1
-                   item_logits = RbbtPython.numpy2ruby(item_logits)
-                   item_masks = item_token_positions.collect do |token_positions|
-                     best = item_logits.values_at(*token_positions).collect do |logits|
-                       best_token, best_score = nil
-                       logits.each_with_index do |v,i|
-                         if best_score.nil? || v > best_score
-                           best_token, best_score = i, v
-                         end
-                       end
-                       best_token
-                     end
-                     best.collect{|b| @tokenizer.decode(b) } * "|"
-                   end
-                   Array === @locate_tokens ? item_masks : item_masks.first
-                 end
-               else
-                 logits
-               end
-      single ? result.first : result
-    end
-  end
-end