RubyGems - rbbt-dm - Versions diffs - 1.2.4 → 1.2.7 - Mend

rbbt-dm 1.2.4 → 1.2.7

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/lib/rbbt/vector/model/huggingface.old.rb +160 -0
data/lib/rbbt/vector/model/huggingface.rb +68 -45
data/lib/rbbt/vector/model/spaCy.rb +0 -8
data/lib/rbbt/vector/model/util.rb +18 -0
data/lib/rbbt/vector/model.rb +56 -40
data/python/rbbt_dm/huggingface.py +38 -27
data/test/rbbt/vector/model/test_huggingface.rb +31 -9
metadata +16 -15

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: abaea1fff82b5e14a84dc9afc966fc8dde6482d50769d196854c1d619adebaf3
-  data.tar.gz: 561b8864fc2c0ba271a2a658da0d3492c7481a2368b40c3b91fe6edb4ebca4cd
+  metadata.gz: 1c55843bf543c88167239f6e182495963e0683c5a7fdd7c3a7ab9bd501a78bc8
+  data.tar.gz: d01aaf45331766eac6d868749b8df72c49d1a6888f44f7a1d4f8cbfefe258c87
 SHA512:
-  metadata.gz: f26f6b27f1beb2554fa78369d1d618cc13175e0c9bb0e789b9490dcae0f7f6df4449a3c72d183ae22c96324d4e2f1ab0352bde8068c1c18871d52c5f5b53c235
-  data.tar.gz: bb33d93cbe24ea974beedb0530f9af317dec06c7e76f32c37d724322ba05f241c6b79a706a88f1bbe703ac4bc78c53c220f28c3f38cf7939477274b8747c436e
+  metadata.gz: 7b6a225ce0403759ab45f26d371d491c19fc76f6560771868a58b9de921fd3aa03750bd7aec95c34029f61f53e71e382958f2779ca790fde30958cfbd1169a0b
+  data.tar.gz: ae1b6d44072398fbde96a0cb31f9586076dee1a5c7e2ac32726c65ecaaa3d08b59ea627c7a0f9f4a8e87547d5a403452ea5bee1d0736d610bf73b6456cb99be9

data/lib/rbbt/vector/model/huggingface.old.rb ADDED Viewed

@@ -0,0 +1,160 @@
+require 'rbbt/vector/model'
+require 'rbbt/util/python'
+RbbtPython.add_path Rbbt.python.find(:lib)
+RbbtPython.init_rbbt
+class HuggingfaceModel < VectorModel
+  attr_accessor :checkpoint, :task, :locate_tokens, :class_labels, :class_weights, :training_args
+  def self.tsv_dataset(tsv_dataset_file, elements, labels = nil)
+    if labels
+      Open.write(tsv_dataset_file) do |ffile|
+        ffile.puts ["label", "text"].flatten * "\t"
+        elements.zip(labels).each do |element,label|
+          ffile.puts [label, element].flatten * "\t"
+        end
+      end
+    else
+      Open.write(tsv_dataset_file) do |ffile|
+        ffile.puts ["text"].flatten * "\t"
+        elements.each{|element| ffile.puts element }
+      end
+    end
+    tsv_dataset_file
+  end
+  def self.call_method(name, *args)
+    RbbtPython.import_method("rbbt_dm.huggingface", name).call(*args)
+  end
+  def call_method(name, *args)
+    HuggingfaceModel.call_method(name, *args)
+  end
+  #def input_tsv_file
+  #  File.join(@directory, 'dataset.tsv') if @directory
+  #end
+  #def checkpoint_dir
+  #  File.join(@directory, 'checkpoints') if @directory
+  #end
+  def self.run_model(model, tokenizer, elements, labels = nil, training_args = {}, class_weights = nil)
+    TmpFile.with_file do |tmpfile|
+      tsv_file = File.join(tmpfile, 'dataset.tsv')
+      if training_args
+        training_args = training_args.dup
+        checkpoint_dir = training_args.delete(:checkpoint_dir)
+      end
+      checkpoint_dir = File.join(tmpfile, 'checkpoints')
+      Open.mkdir File.dirname(tsv_file)
+      Open.mkdir File.dirname(checkpoint_dir)
+      if labels
+        training_args_obj = call_method(:training_args, checkpoint_dir, **training_args)
+        call_method(:train_model, model, tokenizer, training_args_obj, tsv_dataset(tsv_file, elements, labels), class_weights)
+      else
+        locate_tokens, training_args = training_args, {}
+        if Array === elements
+          training_args_obj = call_method(:training_args, checkpoint_dir)
+          call_method(:predict_model, model, tokenizer, training_args_obj, tsv_dataset(tsv_file, elements), locate_tokens)
+        else
+          call_method(:eval_model, model, tokenizer, [elements], locate_tokens)
+        end
+      end
+    end
+  end
+  def init_model
+    @model, @tokenizer = call_method(:load_model_and_tokenizer, @task, @checkpoint)
+  end
+  def reset_model
+    init_model
+  end
+  def initialize(task, initial_checkpoint = nil, *args)
+    super(*args)
+    @task = task
+    @checkpoint = model_file && File.exists?(model_file)? model_file : initial_checkpoint
+    init_model
+    @locate_tokens = @tokenizer.special_tokens_map["mask_token"]  if @task == "MaskedLM"
+    @training_args = {}
+    train_model do |file,elements,labels|
+      HuggingfaceModel.run_model(@model, @tokenizer, elements, labels, @training_args, @class_weights)
+      @model.save_pretrained(file) if file
+      @tokenizer.save_pretrained(file) if file
+    end
+    eval_model do |file,elements|
+      @model, @tokenizer = HuggingfaceModel.call_method(:load_model_and_tokenizer, @task, @checkpoint)
+      HuggingfaceModel.run_model(@model, @tokenizer, elements, nil, @locate_tokens)
+    end
+    post_process do |result|
+      if result.respond_to?(:predictions)
+        single = false
+        predictions = result.predictions
+      elsif result["token_positions"]
+        predictions = result["result"].predictions
+        token_positions = result["token_positions"]
+      else
+        single = true
+        predictions = result["logits"]
+      end
+      result = case @task
+               when "SequenceClassification"
+                 RbbtPython.collect(predictions) do |logits|
+                   logits = RbbtPython.numpy2ruby logits
+                   best_class = logits.index logits.max
+                   best_class = @class_labels[best_class] if @class_labels
+                   best_class
+                 end
+               when "MaskedLM"
+                 all_token_positions = token_positions.to_a
+                 i = 0
+                 RbbtPython.collect(predictions) do |item_logits|
+                   item_token_positions = all_token_positions[i]
+                   i += 1
+                   item_logits = RbbtPython.numpy2ruby(item_logits)
+                   item_masks = item_token_positions.collect do |token_positions|
+                     best = item_logits.values_at(*token_positions).collect do |logits|
+                       best_token, best_score = nil
+                       logits.each_with_index do |v,i|
+                         if best_score.nil? || v > best_score
+                           best_token, best_score = i, v
+                         end
+                       end
+                       best_token
+                     end
+                     best.collect{|b| @tokenizer.decode(b) } * "|"
+                   end
+                   Array === @locate_tokens ? item_masks : item_masks.first
+                 end
+               else
+                 logits
+               end
+      single ? result.first : result
+    end
+  end
+end

data/lib/rbbt/vector/model/huggingface.rb CHANGED Viewed

@@ -6,9 +6,7 @@ RbbtPython.init_rbbt
 class HuggingfaceModel < VectorModel
-  attr_accessor :checkpoint, :task, :locate_tokens, :class_labels, :class_weights
-  def tsv_dataset(tsv_dataset_file, elements, labels = nil)
+  def self.tsv_dataset(tsv_dataset_file, elements, labels = nil)
     if labels
       Open.write(tsv_dataset_file) do |ffile|
@@ -27,59 +25,74 @@ class HuggingfaceModel < VectorModel
     tsv_dataset_file
   end
-  def call_method(name, *args)
-    RbbtPython.import_method("rbbt_dm.huggingface", name).call(*args)
-  end
-  def input_tsv_file
-    File.join(@directory, 'dataset.tsv') if @directory
-  end
+  def initialize(task, checkpoint, *args)
+    options = args.pop if Hash === args.last
+    options = Misc.add_defaults options, :task => task, :checkpoint => checkpoint
+    super(*args)
+    @model_options ||= {}
+    @model_options.merge!(options)
-  def checkpoint_dir
-    File.join(@directory, 'checkpoints') if @directory
-  end
+    eval_model do |directory,texts|
+      checkpoint = directory && File.directory?(directory) ? directory : @model_options[:checkpoint]
-  def run_model(elements, labels = nil)
-    TmpFile.with_file do |tmpfile|
-      tsv_file = input_tsv_file || File.join(tmpfile, 'dataset.tsv')
-      output_dir = checkpoint_dir || File.join(tmpfile, 'checkpoints')
+      if @model.nil?
+        @model, @tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_model_and_tokenizer, @model_options[:task], checkpoint)
+      end
+      if Array === texts
-      Open.mkdir File.dirname(output_dir)
-      Open.mkdir File.dirname(tsv_file)
+        if @model_options.include?(:locate_tokens)
+          locate_tokens = @model_options[:locate_tokens]
+        elsif @model_options[:task] == "MaskedLM"
+          @model_options[:locate_tokens] = locate_tokens = @tokenizer.special_tokens_map["mask_token"]
+        end
-      if labels
-        training_args = call_method(:training_args, output_dir)
-        call_method(:train_model, @model, @tokenizer, training_args, tsv_dataset(tsv_file, elements, labels), @class_weights)
-      else
-        if Array === elements
-          training_args = call_method(:training_args, output_dir)
-          call_method(:predict_model, @model, @tokenizer, training_args, tsv_dataset(tsv_file, elements), @locate_tokens)
+        if @directory
+          tsv_file = File.join(@directory, 'dataset.tsv')
+          checkpoint_dir = File.join(@directory, 'checkpoints')
         else
-          call_method(:eval_model, @model, @tokenizer, [elements], @locate_tokens)
+          tmpdir = TmpFile.tmp_file
+          Open.mkdir tmpdir
+          tsv_file = File.join(tmpdir, 'dataset.tsv')
+          checkpoint_dir = File.join(tmpdir, 'checkpoints')
+        end
+        dataset_file = HuggingfaceModel.tsv_dataset(tsv_file, texts)
+        training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])
+        begin
+          RbbtPython.call_method("rbbt_dm.huggingface", :predict_model, @model, @tokenizer, training_args_obj, dataset_file, locate_tokens)
+        ensure
+          Open.rm_rf tmpdir if tmpdir
         end
+      else
+        RbbtPython.call_method("rbbt_dm.huggingface", :eval_model, @model, @tokenizer, [texts], locate_tokens)
       end
     end
-  end
-  def initialize(task, initial_checkpoint = nil, *args)
-    super(*args)
-    @task = task
-    @checkpoint = model_file && File.exists?(model_file)? model_file : initial_checkpoint
+    train_model do |directory,texts,labels|
+      checkpoint = directory && File.directory?(directory) ? directory : @model_options[:checkpoint]
+      @model, @tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_model_and_tokenizer, @model_options[:task], checkpoint)
-    @model, @tokenizer = call_method(:load_model_and_tokenizer, @task, @checkpoint)
+      if @directory
+        tsv_file = File.join(@directory, 'dataset.tsv')
+        checkpoint_dir = File.join(@directory, 'checkpoints')
+      else
+        tmpdir = TmpFile.tmp_file
+        Open.mkdir tmpdir
+        tsv_file = File.join(tmpdir, 'dataset.tsv')
+        checkpoint_dir = File.join(tmpdir, 'checkpoints')
+      end
-    @locate_tokens = @tokenizer.special_tokens_map["mask_token"]  if @task == "MaskedLM"
+      training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])
+      dataset_file = HuggingfaceModel.tsv_dataset(tsv_file, texts, labels)
-    train_model do |file,elements,labels|
-      run_model(elements, labels)
+      RbbtPython.call_method("rbbt_dm.huggingface", :train_model, @model, @tokenizer, training_args_obj, dataset_file, @model_options[:class_weights])
-      @model.save_pretrained(file) if file
-      @tokenizer.save_pretrained(file) if file
-    end
+      Open.rm_rf tmpdir if tmpdir
-    eval_model do |file,elements|
-      run_model(elements)
+      @model.save_pretrained(directory) if directory
+      @tokenizer.save_pretrained(directory) if directory
     end
     post_process do |result|
@@ -94,12 +107,13 @@ class HuggingfaceModel < VectorModel
         predictions = result["logits"]
       end
-      result = case @task
+      task, class_labels, locate_tokens = @model_options.values_at :task, :class_labels, :locate_tokens
+      result = case task
                when "SequenceClassification"
                  RbbtPython.collect(predictions) do |logits|
                    logits = RbbtPython.numpy2ruby logits
                    best_class = logits.index logits.max
-                   best_class = @class_labels[best_class] if @class_labels
+                   best_class = class_labels[best_class] if class_labels
                    best_class
                  end
                when "MaskedLM"
@@ -125,7 +139,7 @@ class HuggingfaceModel < VectorModel
                      best.collect{|b| @tokenizer.decode(b) } * "|"
                    end
-                   Array === @locate_tokens ? item_masks : item_masks.first
+                   Array === locate_tokens ? item_masks : item_masks.first
                  end
                else
                  logits
@@ -133,6 +147,15 @@ class HuggingfaceModel < VectorModel
       single ? result.first : result
     end
+    save_models if @directory
   end
+  def reset_model
+    @model, @tokenizer = nil
+    Open.rm @model_file
+  end
 end

data/lib/rbbt/vector/model/spaCy.rb CHANGED Viewed

@@ -75,14 +75,6 @@ class SpaCyModel < VectorModel
             d.cats.sort_by{|l,v| v.to_f || 0 }.last.first
           end
         end
-        #nlp.(docs).cats.collect{|cats| cats.sort_by{|l,v| v.to_f }.last.first }
-        #Log::ProgressBar.with_bar texts.length, :desc => "Evaluating documents" do |bar|
-        #  texts.collect do |text|
-        #    cats = nlp.(text).cats
-        #    bar.tick
-        #    cats.sort_by{|l,v| v.to_f }.last.first
-        #  end
-        #end
       end
     end
   end

data/lib/rbbt/vector/model/util.rb CHANGED Viewed

@@ -9,4 +9,22 @@ class VectorModel
     @bar.init
     @bar
   end
+  def balance_labels
+    counts = Misc.counts(@labels)
+    min = counts.values.min
+    used = {}
+    new_labels = []
+    new_features = []
+    @labels.zip(@features).shuffle.each do |label, features|
+      used[label] ||= 0
+      next if used[label] > min
+      used[label] += 1
+      new_labels << label
+      new_features << features
+    end
+    @labels = new_labels
+    @features = new_features
+  end
 end

data/lib/rbbt/vector/model.rb CHANGED Viewed

@@ -2,8 +2,9 @@ require 'rbbt/util/R'
 require 'rbbt/vector/model/util'
 class VectorModel
-  attr_accessor :directory, :model_file, :extract_features, :train_model, :eval_model, :post_process
+  attr_accessor :directory, :model_file, :extract_features, :train_model, :eval_model, :post_process, :balance
   attr_accessor :features, :names, :labels, :factor_levels
+  attr_accessor :model_options
   def extract_features(&block)
     @extract_features = block if block_given?
@@ -126,7 +127,7 @@ cat(paste(label, sep="\\n", collapse="\\n"));
     instance_eval code, file
   end
-  def initialize(directory = nil, extract_features = nil, train_model = nil, eval_model = nil, names = nil, factor_levels = nil)
+  def initialize(directory = nil, extract_features = nil, train_model = nil, eval_model = nil, post_process = nil, names = nil, factor_levels = nil)
     @directory = directory
     if @directory
       FileUtils.mkdir_p @directory unless File.exists?(@directory)
@@ -135,10 +136,18 @@ cat(paste(label, sep="\\n", collapse="\\n"));
       @extract_features_file = File.join(@directory, "features")
       @train_model_file = File.join(@directory, "train_model")
       @eval_model_file = File.join(@directory, "eval_model")
+      @post_process_file = File.join(@directory, "post_process")
       @train_model_file_R = File.join(@directory, "train_model.R")
       @eval_model_file_R = File.join(@directory, "eval_model.R")
+      @post_process_file_R = File.join(@directory, "post_process.R")
       @names_file = File.join(@directory, "feature_names")
       @levels_file = File.join(@directory, "levels")
+      @options_file = File.join(@directory, "options.json")
+      if File.exists?(@options_file)
+        @model_options = JSON.parse(Open.read(@options_file))
+        IndiferentHash.setup(@model_options)
+      end
     end
     if extract_features.nil?
@@ -169,6 +178,17 @@ cat(paste(label, sep="\\n", collapse="\\n"));
       @eval_model = eval_model
     end
+    if post_process.nil?
+      if @post_process_file && File.exists?(@post_process_file)
+        @post_process = __load_method @post_process_file
+      elsif @post_process_file_R && File.exists?(@post_process_file_R)
+        @post_process = Open.read(@post_process_file_R)
+      end
+    else
+      @post_process = post_process
+    end
     if names.nil?
       if @names_file && File.exists?(@names_file)
         @names = Open.read(@names_file).split("\n")
@@ -240,18 +260,43 @@ cat(paste(label, sep="\\n", collapse="\\n"));
       Open.write(@eval_model_file_R, eval_model)
     end
+    case
+    when Proc === post_process
+      begin
+        Open.write(@post_process_file, post_process.source)
+      rescue
+      end
+    when String === post_process
+      Open.write(@post_process_file_R, post_process)
+    end
     Open.write(@levels_file, @factor_levels.to_yaml) if @factor_levels
     Open.write(@names_file, @names * "\n" + "\n") if @names
+    Open.write(@options_file, @model_options.to_json) if @model_options
   end
   def train
-    case
-    when Proc === @train_model
-      self.instance_exec(@model_file, @features, @labels, @names, @factor_levels, &@train_model)
-    when String === @train_model
-      VectorModel.R_train(@model_file,  @features, @labels, train_model, @names, @factor_levels)
+    begin
+      if @balance
+        @original_features = @features
+        @original_labels = @labels
+        self.balance_labels
+      end
+      case
+      when Proc === @train_model
+        self.instance_exec(@model_file, @features, @labels, @names, @factor_levels, &@train_model)
+      when String === @train_model
+        VectorModel.R_train(@model_file,  @features, @labels, train_model, @names, @factor_levels)
+      end
+    ensure
+      if @balance
+        @features =  @original_features
+        @labels = @original_labels
+      end
     end
-    save_models
+    save_models if @directory
   end
   def run(code)
@@ -299,38 +344,6 @@ cat(paste(label, sep="\\n", collapse="\\n"));
     result
   end
-  #def cross_validation(folds = 10)
-  #  saved_features = @features
-  #  saved_labels = @labels
-  #  seq = (0..features.length - 1).to_a
-  #  chunk_size = features.length / folds
-  #  acc = []
-  #  folds.times do
-  #    seq = seq.shuffle
-  #    eval_chunk = seq[0..chunk_size]
-  #    train_chunk = seq[chunk_size.. -1]
-  #    eval_features = @features.values_at *eval_chunk
-  #    eval_labels = @labels.values_at *eval_chunk
-  #    @features = @features.values_at *train_chunk
-  #    @labels = @labels.values_at *train_chunk
-  #    train
-  #    predictions = eval_list eval_features, false
-  #    acc << predictions.zip(eval_labels).collect{|pred,lab| pred - lab < 0.5 ? 1 : 0}.inject(0){|acc,e| acc +=e} / chunk_size
-  #    @features = saved_features
-  #    @labels = saved_labels
-  #  end
-  #  acc
-  #end
-  #
   def self.f1_metrics(test, predicted, good_label = nil)
     tp, tn, fp, fn, pr, re, f1 = [0, 0, 0, 0, nil, nil, nil]
@@ -413,6 +426,7 @@ cat(paste(label, sep="\\n", collapse="\\n"));
         @features = train_set
         @labels = train_labels
+        self.reset_model if self.respond_to? :reset_model
         self.train
         predictions = self.eval_list test_set, false
@@ -437,6 +451,8 @@ cat(paste(label, sep="\\n", collapse="\\n"));
       @features = orig_features
       @labels = orig_labels
     end unless folds == -1
+    self.reset_model if self.respond_to? :reset_model
     self.train unless folds == 1
     res
   end

data/python/rbbt_dm/huggingface.py CHANGED Viewed

@@ -17,6 +17,17 @@ def load_model_and_tokenizer(task, checkpoint):
     tokenizer = load_tokenizer(task, checkpoint)
     return model, tokenizer
+def load_model_and_tokenizer_from_directory(directory):
+    import os
+    import json
+    options_file = os.path.join(directory, 'options.json')
+    f = open(options_file, "r")
+    options = json.load(f.read())
+    f.close()
+    task = options["task"]
+    checkpoint = options["checkpoint"]
+    return load_model_and_tokenizer(task, checkpoint)
 #{{{ SIMPLE EVALUATE
 def forward(model, features):
@@ -42,7 +53,7 @@ def load_tsv(tsv_file):
 def tsv_dataset(tokenizer, tsv_file):
     dataset = load_tsv(tsv_file)
-    tokenized_dataset = dataset.map(lambda example: tokenizer(example["text"], truncation=True) , batched=True)
+    tokenized_dataset = dataset.map(lambda example: tokenizer(example["text"], truncation=True, max_length=512) , batched=True)
     return tokenized_dataset
 def training_args(*args, **kwargs):
@@ -57,34 +68,34 @@ def train_model(model, tokenizer, training_args, tsv_file, class_weights=None):
     tokenized_dataset = tsv_dataset(tokenizer, tsv_file)
     if (not class_weights == None):
-      import torch
-      from torch import nn
-      class WeightTrainer(Trainer):
-          def compute_loss(self, model, inputs, return_outputs=False):
-              labels = inputs.get("labels")
-              # forward pass
-              outputs = model(**inputs)
-              logits = outputs.get('logits')
-              # compute custom loss
-              loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(model.device))
-              loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
-              return (loss, outputs) if return_outputs else loss
-      trainer = WeightTrainer(
-              model,
-              training_args,
-              train_dataset = tokenized_dataset["train"],
-              tokenizer = tokenizer
-              )
+        import torch
+        from torch import nn
+        class WeightTrainer(Trainer):
+            def compute_loss(self, model, inputs, return_outputs=False):
+                labels = inputs.get("labels")
+                # forward pass
+                outputs = model(**inputs)
+                logits = outputs.get('logits')
+                # compute custom loss
+                loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(model.device))
+                loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
+                return (loss, outputs) if return_outputs else loss
+        trainer = WeightTrainer(
+                model,
+                training_args,
+                train_dataset = tokenized_dataset["train"],
+                tokenizer = tokenizer
+                )
     else:
-      trainer = Trainer(
-              model,
-              training_args,
-              train_dataset = tokenized_dataset["train"],
-              tokenizer = tokenizer
-              )
+        trainer = Trainer(
+                model,
+                training_args,
+                train_dataset = tokenized_dataset["train"],
+                tokenizer = tokenizer
+                )
     trainer.train()

data/test/rbbt/vector/model/test_huggingface.rb CHANGED Viewed

@@ -3,6 +3,21 @@ require 'rbbt/vector/model/huggingface'
 class TestHuggingface < Test::Unit::TestCase
+  def test_options
+    TmpFile.with_file do |dir|
+      checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+      task = "SequenceClassification"
+      model = HuggingfaceModel.new task, checkpoint, dir, :class_labels => %w(bad good)
+      iii model.eval "This is dog"
+      iii model.eval "This is cat"
+      iii model.eval(["This is dog", "This is cat"])
+      model = VectorModel.new dir
+      iii model.eval(["This is dog", "This is cat"])
+    end
+  end
   def test_pipeline
     require 'rbbt/util/python'
     model = VectorModel.new
@@ -25,7 +40,7 @@ class TestHuggingface < Test::Unit::TestCase
       model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
-      model.class_labels = ["Bad", "Good"]
+      model.model_options[:class_labels] = ["Bad", "Good"]
       assert_equal ["Bad", "Good"], model.eval(["This is dog", "This is cat"])
     end
@@ -37,7 +52,8 @@ class TestHuggingface < Test::Unit::TestCase
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
       model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
-      model.class_labels = ["Bad", "Good"]
+      model.model_options[:class_labels] = %w(Bad Good)
       assert_equal ["Bad", "Good"], model.eval(["This is dog", "This is cat"])
@@ -48,6 +64,9 @@ class TestHuggingface < Test::Unit::TestCase
       model.train
       assert_equal ["Good", "Good"], model.eval(["This is dog", "This is cat"])
+      model = VectorModel.new dir
+      assert_equal ["Good", "Good"], model.eval(["This is dog", "This is cat"])
     end
   end
@@ -55,7 +74,7 @@ class TestHuggingface < Test::Unit::TestCase
     checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
     model = HuggingfaceModel.new "SequenceClassification", checkpoint
-    model.class_labels = ["Bad", "Good"]
+    model.model_options[:class_labels] = ["Bad", "Good"]
     assert_equal ["Bad", "Good"], model.eval(["This is dog", "This is cat"])
@@ -73,7 +92,7 @@ class TestHuggingface < Test::Unit::TestCase
       checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
       model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
-      model.class_labels = ["Bad", "Good"]
+      model.model_options[:class_labels] = ["Bad", "Good"]
       assert_equal ["Bad", "Good"], model.eval(["This is dog", "This is cat"])
@@ -84,15 +103,20 @@ class TestHuggingface < Test::Unit::TestCase
       model.train
       model = HuggingfaceModel.new "SequenceClassification", checkpoint, dir
-      model.class_labels = ["Bad", "Good"]
       assert_equal ["Good", "Good"], model.eval(["This is dog", "This is cat"])
-      model = HuggingfaceModel.new "SequenceClassification", model.model_file
-      model.class_labels = ["Bad", "Good"]
+      model_file = model.model_file
+      model = HuggingfaceModel.new "SequenceClassification", model_file
+      model.model_options[:class_labels] = ["Bad", "Good"]
       assert_equal ["Good", "Good"], model.eval(["This is dog", "This is cat"])
+      model = VectorModel.new dir
+      assert_equal "Good", model.eval("This is dog")
     end
   end
@@ -123,9 +147,7 @@ class TestHuggingface < Test::Unit::TestCase
     model = HuggingfaceModel.new "MaskedLM", checkpoint
     assert_equal 3, model.eval(["Paris is the [MASK] of the France.", "The [MASK] worked very hard all the time.", "The [MASK] arrested the dangerous [MASK]."]).
       reject{|v| v.empty?}.length
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-dm
 version: !ruby/object:Gem::Version
-  version: 1.2.4
+  version: 1.2.7
 platform: ruby
 authors:
 - Miguel Vazquez
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-02-07 00:00:00.000000000 Z
+date: 2023-02-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -107,6 +107,7 @@ files:
 - lib/rbbt/statistics/rank_product.rb
 - lib/rbbt/tensorflow.rb
 - lib/rbbt/vector/model.rb
+- lib/rbbt/vector/model/huggingface.old.rb
 - lib/rbbt/vector/model/huggingface.rb
 - lib/rbbt/vector/model/random_forest.rb
 - lib/rbbt/vector/model/spaCy.rb
@@ -143,7 +144,7 @@ files:
 homepage: http://github.com/mikisvaz/rbbt-phgx
 licenses: []
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -158,22 +159,22 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.1.6
-signing_key:
+rubygems_version: 3.1.2
+signing_key:
 specification_version: 4
 summary: Data-mining and statistics
 test_files:
-- test/rbbt/statistics/test_hypergeometric.rb
-- test/rbbt/statistics/test_fisher.rb
-- test/rbbt/statistics/test_fdr.rb
-- test/rbbt/statistics/test_random_walk.rb
-- test/rbbt/test_ml_task.rb
+- test/test_helper.rb
 - test/rbbt/vector/test_model.rb
+- test/rbbt/vector/model/test_huggingface.rb
 - test/rbbt/vector/model/test_tensorflow.rb
 - test/rbbt/vector/model/test_spaCy.rb
-- test/rbbt/vector/model/test_huggingface.rb
 - test/rbbt/vector/model/test_svm.rb
-- test/rbbt/network/test_paths.rb
-- test/rbbt/matrix/test_barcode.rb
+- test/rbbt/statistics/test_random_walk.rb
+- test/rbbt/statistics/test_fisher.rb
+- test/rbbt/statistics/test_fdr.rb
+- test/rbbt/statistics/test_hypergeometric.rb
 - test/rbbt/test_stan.rb
-- test/test_helper.rb
+- test/rbbt/matrix/test_barcode.rb
+- test/rbbt/test_ml_task.rb
+- test/rbbt/network/test_paths.rb