transformers-rb 0.1.0 → 0.1.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: c8f34c5454e2a1ac18bbb9a4b290a43e994cd3984fa2b4125ff4af969b9d17ed
- data.tar.gz: 57c876fd1a4e62089fdc7bcbfcb9c155050166458a679991036894a6721ac168
+ metadata.gz: 3f29055705824ba101cba238960d4f10825c75bc7867b9eb0b611cda6a547612
+ data.tar.gz: d0967f7742f7b2d6194376eb040a3be81e77a9ded94302aeb934de678959e434
  SHA512:
- metadata.gz: 7458b1ba0303e0741abf16a63efc350b6cad5e5dff48c46dcba6f62858f562bbf83478eb918995c45f5882159cf5c22d696cc4b0360f813312c2263da7c28205
- data.tar.gz: a4d98b210a22d23bc55f452a93dd0e9df20c81cbc0d537d29eaa1069b157eec1df6cdc76fc7f398947e03ae841166d609bff0d250506eb0dd0745ef0fdf86efd
+ metadata.gz: 38b9ed4fd654ca593e3d6e7c7f20eb3c6b68ecfa5f86099fbc8d160f9093617cc79a571c331a0ec0c70a6770c8d9460194ba75e61d534f9e15931f22e5ae60c3
+ data.tar.gz: 00ce437ce8fe419fafddd59b7f9f61050d2ddf5817816b53beb1c43badbced9fe77ea28b26f2d607e87a9be855a19c4b254bc50df2881b4126f2def5c6875c3d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
+ ## 0.1.2 (2024-09-10)
+
+ - Fixed default revision for pipelines
+
+ ## 0.1.1 (2024-08-29)
+
+ - Added `embedding` pipeline
+ - Added experimental `fast_init` option
+ - Improved performance of loading models
+ - Fixed error with `aggregation_strategy` option
+
  ## 0.1.0 (2024-08-19)

  - First release
data/README.md CHANGED
@@ -2,6 +2,8 @@

  :slightly_smiling_face: State-of-the-art [transformers](https://github.com/huggingface/transformers) for Ruby

+ For fast inference, check out [Informers](https://github.com/ankane/informers) :fire:
+
  [![Build Status](https://github.com/ankane/transformers-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/transformers-ruby/actions)

  ## Installation
@@ -21,6 +23,20 @@ gem "transformers-rb"

  ## Models

+ Embedding
+
+ - [sentence-transformers/all-MiniLM-L6-v2](#sentence-transformersall-MiniLM-L6-v2)
+ - [sentence-transformers/multi-qa-MiniLM-L6-cos-v1](#sentence-transformersmulti-qa-MiniLM-L6-cos-v1)
+ - [mixedbread-ai/mxbai-embed-large-v1](#mixedbread-aimxbai-embed-large-v1)
+ - [thenlper/gte-small](#thenlpergte-small)
+ - [intfloat/e5-base-v2](#intfloate5-base-v2)
+ - [BAAI/bge-base-en-v1.5](#baaibge-base-en-v15)
+ - [Snowflake/snowflake-arctic-embed-m-v1.5](#snowflakesnowflake-arctic-embed-m-v15)
+
+ Sparse embedding
+
+ - [opensearch-project/opensearch-neural-sparse-encoding-v1](#opensearch-projectopensearch-neural-sparse-encoding-v1)
+
  ### sentence-transformers/all-MiniLM-L6-v2

  [Docs](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
@@ -28,8 +44,8 @@ gem "transformers-rb"
  ```ruby
  sentences = ["This is an example sentence", "Each sentence is converted"]

- model = Transformers::SentenceTransformer.new("sentence-transformers/all-MiniLM-L6-v2")
- embeddings = model.encode(sentences)
+ model = Transformers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
+ embeddings = model.(sentences)
  ```

  ### sentence-transformers/multi-qa-MiniLM-L6-cos-v1
@@ -40,10 +56,10 @@ embeddings = model.encode(sentences)
  query = "How many people live in London?"
  docs = ["Around 9 Million people live in London", "London is known for its financial district"]

- model = Transformers::SentenceTransformer.new("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
- query_emb = model.encode(query)
- doc_emb = model.encode(docs)
- scores = Torch.mm(Torch.tensor([query_emb]), Torch.tensor(doc_emb).transpose(0, 1))[0].cpu.to_a
+ model = Transformers.pipeline("embedding", "sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
+ query_embedding = model.(query)
+ doc_embeddings = model.(docs)
+ scores = doc_embeddings.map { |e| e.zip(query_embedding).sum { |d, q| d * q } }
  doc_score_pairs = docs.zip(scores).sort_by { |d, s| -s }
  ```

@@ -52,18 +68,78 @@ doc_score_pairs = docs.zip(scores).sort_by { |d, s| -s }
  [Docs](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)

  ```ruby
- def transform_query(query)
- "Represent this sentence for searching relevant passages: #{query}"
- end
+ query_prefix = "Represent this sentence for searching relevant passages: "

- docs = [
- transform_query("puppy"),
+ input = [
  "The dog is barking",
- "The cat is purring"
+ "The cat is purring",
+ query_prefix + "puppy"
  ]

- model = Transformers::SentenceTransformer.new("mixedbread-ai/mxbai-embed-large-v1")
- embeddings = model.encode(docs)
+ model = Transformers.pipeline("embedding", "mixedbread-ai/mxbai-embed-large-v1")
+ embeddings = model.(input)
+ ```
+
+ ### thenlper/gte-small
+
+ [Docs](https://huggingface.co/thenlper/gte-small)
+
+ ```ruby
+ sentences = ["That is a happy person", "That is a very happy person"]
+
+ model = Transformers.pipeline("embedding", "thenlper/gte-small")
+ embeddings = model.(sentences)
+ ```
+
+ ### intfloat/e5-base-v2
+
+ [Docs](https://huggingface.co/intfloat/e5-base-v2)
+
+ ```ruby
+ doc_prefix = "passage: "
+ query_prefix = "query: "
+
+ input = [
+ doc_prefix + "Ruby is a programming language created by Matz",
+ query_prefix + "Ruby creator"
+ ]
+
+ model = Transformers.pipeline("embedding", "intfloat/e5-base-v2")
+ embeddings = model.(input)
+ ```
+
+ ### BAAI/bge-base-en-v1.5
+
+ [Docs](https://huggingface.co/BAAI/bge-base-en-v1.5)
+
+ ```ruby
+ query_prefix = "Represent this sentence for searching relevant passages: "
+
+ input = [
+ "The dog is barking",
+ "The cat is purring",
+ query_prefix + "puppy"
+ ]
+
+ model = Transformers.pipeline("embedding", "BAAI/bge-base-en-v1.5")
+ embeddings = model.(input)
+ ```
+
+ ### Snowflake/snowflake-arctic-embed-m-v1.5
+
+ [Docs](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5)
+
+ ```ruby
+ query_prefix = "Represent this sentence for searching relevant passages: "
+
+ input = [
+ "The dog is barking",
+ "The cat is purring",
+ query_prefix + "puppy"
+ ]
+
+ model = Transformers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5")
+ embeddings = model.(input, pooling: "cls")
  ```

  ### opensearch-project/opensearch-neural-sparse-encoding-v1
@@ -89,6 +165,13 @@ embeddings = values.to_a

  ## Pipelines

+ Embedding
+
+ ```ruby
+ embed = Transformers.pipeline("embedding")
+ embed.("We are very happy to show you the 🤗 Transformers library.")
+ ```
+
  Named-entity recognition

  ```ruby
@@ -207,10 +207,17 @@ module Transformers

  config = new(**config_dict)

+ to_remove = []
  kwargs.each do |key, value|
  if config.respond_to?("#{key}=")
  config.public_send("#{key}=", value)
  end
+ if key != :torch_dtype
+ to_remove << key
+ end
+ end
+ to_remove.each do |key|
+ kwargs.delete(key)
  end

  Transformers.logger.info("Model config #{config}")
@@ -22,6 +22,7 @@ module Transformers
  def to_h
  @data
  end
+ alias_method :to_hash, :to_h

  def [](item)
  @data[item]
@@ -14,6 +14,47 @@
  # limitations under the License.

  module Transformers
+ module ModelingUtils
+ TORCH_INIT_FUNCTIONS = {
+ "uniform!" => Torch::NN::Init.method(:uniform!),
+ "normal!" => Torch::NN::Init.method(:normal!),
+ # "trunc_normal!" => Torch::NN::Init.method(:trunc_normal!),
+ "constant!" => Torch::NN::Init.method(:constant!),
+ "xavier_uniform!" => Torch::NN::Init.method(:xavier_uniform!),
+ "xavier_normal!" => Torch::NN::Init.method(:xavier_normal!),
+ "kaiming_uniform!" => Torch::NN::Init.method(:kaiming_uniform!),
+ "kaiming_normal!" => Torch::NN::Init.method(:kaiming_normal!),
+ # "uniform" => Torch::NN::Init.method(:uniform),
+ # "normal" => Torch::NN::Init.method(:normal),
+ # "xavier_uniform" => Torch::NN::Init.method(:xavier_uniform),
+ # "xavier_normal" => Torch::NN::Init.method(:xavier_normal),
+ # "kaiming_uniform" => Torch::NN::Init.method(:kaiming_uniform),
+ # "kaiming_normal" => Torch::NN::Init.method(:kaiming_normal)
+ }
+
+ # private
+ # note: this improves loading time significantly, but is not thread-safe!
+ def self.no_init_weights
+ return yield unless Transformers.fast_init
+
+ _skip_init = lambda do |*args, **kwargs|
+ # pass
+ end
+ # Save the original initialization functions
+ TORCH_INIT_FUNCTIONS.each do |name, init_func|
+ Torch::NN::Init.singleton_class.undef_method(name)
+ Torch::NN::Init.define_singleton_method(name, &_skip_init)
+ end
+ yield
+ ensure
+ # Restore the original initialization functions
+ TORCH_INIT_FUNCTIONS.each do |name, init_func|
+ Torch::NN::Init.singleton_class.undef_method(name)
+ Torch::NN::Init.define_singleton_method(name, init_func)
+ end
+ end
+ end
+
  module ModuleUtilsMixin
  def get_extended_attention_mask(
  attention_mask,
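The `no_init_weights` helper above is what the experimental `fast_init` flag (added to `lib/transformers.rb` later in this diff, default `false`) switches on: while enabled, the listed `Torch::NN::Init` functions are temporarily swapped for no-ops so `from_pretrained` skips redundant weight initialization. A minimal opt-in sketch; as the comment above notes, the swap is global state and not thread-safe, so keep it out of multi-threaded code:

```ruby
# Experimental: skip Torch weight initialization while loading, then restore the default.
Transformers.fast_init = true
embed = Transformers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
Transformers.fast_init = false

embed.("This is an example sentence")
```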
@@ -138,7 +179,11 @@ module Transformers
  end

  def _initialize_weights(mod)
+ if mod.instance_variable_defined?(:@is_hf_initialized)
+ return
+ end
  _init_weights(mod)
+ mod.instance_variable_set(:@is_hf_initialized, true)
  end

  def tie_weights
@@ -166,7 +211,9 @@ module Transformers
  prune_heads(@config.pruned_heads)
  end

- if true
+ # TODO implement no_init_weights context manager
+ _init_weights = false
+ if _init_weights
  # Initialize weights
  apply(method(:_initialize_weights))

@@ -512,8 +559,8 @@ module Transformers

  config.name_or_path = pretrained_model_name_or_path

- model_kwargs = {}
- model = new(config, *model_args, **model_kwargs)
+ # Instantiate model.
+ model = ModelingUtils.no_init_weights { new(config, *model_args, **model_kwargs) }

  # make sure we use the model's config since the __init__ call might have copied it
  config = model.config
@@ -683,6 +730,10 @@ module Transformers
  end
  end

+ if _fast_init
+ # TODO
+ end
+
  # Make sure we are able to load base models as well as derived models (with heads)
  start_prefix = ""
  model_to_load = model
@@ -756,28 +807,29 @@ module Transformers
  raise Todo
  end

+ model_class_name = model.class.name.split("::").last
  if unexpected_keys.length > 0
  archs = model.config.architectures.nil? ? [] : model.config.architectures
- warner = archs.include?(model.class.name) ? Transformers.logger.method(:warn) : Transformers.logger.method(:info)
+ warner = archs.include?(model_class_name) ? Transformers.logger.method(:warn) : Transformers.logger.method(:info)
  warner.(
  "Some weights of the model checkpoint at #{pretrained_model_name_or_path} were not used when" +
- " initializing #{model.class.name}: #{unexpected_keys}\n- This IS expected if you are" +
- " initializing #{model.class.name} from the checkpoint of a model trained on another task or" +
+ " initializing #{model_class_name}: #{unexpected_keys}\n- This IS expected if you are" +
+ " initializing #{model_class_name} from the checkpoint of a model trained on another task or" +
  " with another architecture (e.g. initializing a BertForSequenceClassification model from a" +
  " BertForPreTraining model).\n- This IS NOT expected if you are initializing" +
- " #{model.class.name} from the checkpoint of a model that you expect to be exactly identical" +
+ " #{model_class_name} from the checkpoint of a model that you expect to be exactly identical" +
  " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)."
  )
  else
- Transformers.logger.info("All model checkpoint weights were used when initializing #{model.class.name}.\n")
+ Transformers.logger.info("All model checkpoint weights were used when initializing #{model_class_name}.\n")
  end
  if missing_keys.length > 0
- Transformers.logger.info("Some weights of #{model.class.name} were not initialized from the model checkpoint at #{pretrained_model_name_or_path} and are newly initialized: #{missing_keys}\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.")
+ Transformers.logger.info("Some weights of #{model_class_name} were not initialized from the model checkpoint at #{pretrained_model_name_or_path} and are newly initialized: #{missing_keys}\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.")
  elsif mismatched_keys.length == 0
  Transformers.logger.info(
- "All the weights of #{model.class.name} were initialized from the model checkpoint at" +
+ "All the weights of #{model_class_name} were initialized from the model checkpoint at" +
  " #{pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint" +
- " was trained on, you can already use #{model.class.name} for predictions without further" +
+ " was trained on, you can already use #{model_class_name} for predictions without further" +
  " training."
  )
  end
@@ -97,7 +97,7 @@ module Transformers
  @position_embedding_type = position_embedding_type || config.position_embedding_type || "absolute"
  if @position_embedding_type == "relative_key" || @position_embedding_type == "relative_key_query"
  @max_position_embeddings = config.max_position_embeddings
- @distance_embedding = Torch:NN::Embedding.new(2 * config.max_position_embeddings - 1, @attention_head_size)
+ @distance_embedding = Torch::NN::Embedding.new(2 * config.max_position_embeddings - 1, @attention_head_size)
  end

  @is_decoder = config.is_decoder
@@ -639,8 +639,8 @@ module Transformers
  extended_attention_mask = get_extended_attention_mask(attention_mask, input_shape)
  end

- # # If a 2D or 3D attention mask is provided for the cross-attention
- # # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ # If a 2D or 3D attention mask is provided for the cross-attention
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
  if @config.is_decoder && !encoder_hidden_states.nil?
  encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size
  encoder_hidden_shape = [encoder_batch_size, encoder_sequence_length]
@@ -24,7 +24,7 @@ module Transformers
  "pt" => [AutoModel],
  "default" => {
  "model" => {
- "pt" => ["distilbert/distilbert-base-cased", "935ac13"]
+ "pt" => ["distilbert/distilbert-base-cased", "6ea8117"]
  }
  },
  "type" => "multimodal"
@@ -34,7 +34,7 @@ module Transformers
  "pt" => [AutoModelForSequenceClassification],
  "default" => {
  "model" => {
- "pt" => ["distilbert/distilbert-base-uncased-finetuned-sst-2-english", "af0f99b"]
+ "pt" => ["distilbert/distilbert-base-uncased-finetuned-sst-2-english", "714eb0f"]
  }
  },
  "type" => "text"
@@ -44,7 +44,7 @@ module Transformers
  "pt" => [AutoModelForTokenClassification],
  "default" => {
  "model" => {
- "pt" => ["dbmdz/bert-large-cased-finetuned-conll03-english", "f2482bf"]
+ "pt" => ["dbmdz/bert-large-cased-finetuned-conll03-english", "4c53496"]
  }
  },
  "type" => "text"
@@ -54,7 +54,7 @@ module Transformers
  "pt" => [AutoModelForQuestionAnswering],
  "default" => {
  "model" => {
- "pt" => ["distilbert/distilbert-base-cased-distilled-squad", "626af31"]
+ "pt" => ["distilbert/distilbert-base-cased-distilled-squad", "564e9b5"]
  }
  },
  "type" => "text"
@@ -64,7 +64,7 @@ module Transformers
  "pt" => [AutoModelForImageClassification],
  "default" => {
  "model" => {
- "pt" => ["google/vit-base-patch16-224", "5dca96d"]
+ "pt" => ["google/vit-base-patch16-224", "3f49326"]
  }
  },
  "type" => "image"
@@ -78,7 +78,17 @@ module Transformers
  }
  },
  "type" => "image"
- }
+ },
+ "embedding" => {
+ "impl" => EmbeddingPipeline,
+ "pt" => [AutoModel],
+ "default" => {
+ "model" => {
+ "pt" => ["sentence-transformers/all-MiniLM-L6-v2", "8b3219a"]
+ }
+ },
+ "type" => "text"
+ },
  }

  PIPELINE_REGISTRY = PipelineRegistry.new(supported_tasks: SUPPORTED_TASKS, task_aliases: TASK_ALIASES)
@@ -86,6 +96,7 @@ module Transformers
  class << self
  def pipeline(
  task,
+ model_arg = nil,
  model: nil,
  config: nil,
  tokenizer: nil,
@@ -103,6 +114,13 @@ module Transformers
  pipeline_class: nil,
  **kwargs
  )
+ if !model_arg.nil?
+ if !model.nil?
+ raise ArgumentError, "Cannot pass multiple models"
+ end
+ model = model_arg
+ end
+
  model_kwargs ||= {}
  # Make sure we only pass use_auth_token once as a kwarg (it used to be possible to pass it in model_kwargs,
  # this is to keep BC).
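The new optional `model_arg` positional parameter above mirrors the existing `model:` keyword and is what lets the README examples pass the model id as a second positional argument; passing both raises. A short sketch of the two equivalent call styles, using a model id that appears in this diff:

```ruby
# Equivalent ways to choose the model; mixing them raises ArgumentError.
embed = Transformers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
embed = Transformers.pipeline("embedding", model: "sentence-transformers/all-MiniLM-L6-v2")
```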
@@ -209,6 +227,7 @@ module Transformers
  " #{revision} (#{Utils::Hub::HUGGINGFACE_CO_RESOLVE_ENDPOINT}/#{model}).\n" +
  "Using a pipeline without specifying a model name and revision in production is not recommended."
  )
+ hub_kwargs[:revision] = revision
  if config.nil? && model.is_a?(String)
  config = AutoConfig.from_pretrained(model, _from_pipeline: task, **hub_kwargs, **model_kwargs)
  hub_kwargs[:_commit_hash] = config._commit_hash
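Forwarding `revision` into `hub_kwargs` above is the "Fixed default revision for pipelines" entry from the changelog: when no model is given, the task's pinned default revision now actually reaches the hub download calls. A minimal sketch using the `embedding` default registered earlier in this diff:

```ruby
# No model specified: falls back to "sentence-transformers/all-MiniLM-L6-v2"
# at the pinned revision "8b3219a", which is now passed through to the hub.
embed = Transformers.pipeline("embedding")
embed.("We are very happy to show you the 🤗 Transformers library.")
```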
@@ -0,0 +1,46 @@
+ module Transformers
+ class EmbeddingPipeline < Pipeline
+ def _sanitize_parameters(**kwargs)
+ [{}, {}, kwargs]
+ end
+
+ def preprocess(inputs)
+ @tokenizer.(inputs, return_tensors: @framework)
+ end
+
+ def _forward(model_inputs)
+ {
+ last_hidden_state: @model.(**model_inputs)[0],
+ attention_mask: model_inputs[:attention_mask]
+ }
+ end
+
+ def postprocess(model_outputs, pooling: "mean", normalize: true)
+ output = model_outputs[:last_hidden_state]
+
+ case pooling
+ when "none"
+ # do nothing
+ when "mean"
+ output = mean_pooling(output, model_outputs[:attention_mask])
+ when "cls"
+ output = output[0.., 0]
+ else
+ raise Error, "Pooling method '#{pooling}' not supported."
+ end
+
+ if normalize
+ output = Torch::NN::Functional.normalize(output, p: 2, dim: 1)
+ end
+
+ output[0].to_a
+ end
+
+ private
+
+ def mean_pooling(output, attention_mask)
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(output.size).float
+ Torch.sum(output * input_mask_expanded, 1) / Torch.clamp(input_mask_expanded.sum(1), min: 1e-9)
+ end
+ end
+ end
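A usage sketch for the new `EmbeddingPipeline`: `pooling` (`"mean"`, `"cls"`, or `"none"`) and `normalize` are the `postprocess` options defined above and, as in the README examples, can be passed at call time:

```ruby
embed = Transformers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")

# Defaults: mean pooling over tokens, then L2 normalization
embed.("The dog is barking")

# CLS-token pooling without normalization, for models that expect it
embed.("The dog is barking", pooling: "cls", normalize: false)
```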
@@ -45,7 +45,7 @@ module Transformers
  end

  def _forward(model_inputs)
- model_outputs = @model.(**model_inputs.to_h)
+ model_outputs = @model.(**model_inputs)
  model_outputs
  end

@@ -29,7 +29,7 @@ module Transformers
  end

  def _forward(model_inputs)
- model_outputs = @model.(**model_inputs.to_h)
+ model_outputs = @model.(**model_inputs)
  model_outputs
  end

@@ -34,7 +34,7 @@ module Transformers
  end

  if function_to_apply.is_a?(String)
- function_to_apply = ClassificationFunction.new(function_to_apply.upcase).to_s
+ function_to_apply = ClassificationFunction.new(function_to_apply.downcase).to_s
  end

  if !function_to_apply.nil?
@@ -62,7 +62,7 @@ module Transformers
  end

  def _forward(model_inputs)
- @model.(**model_inputs.to_h)
+ @model.(**model_inputs)
  end

  def postprocess(model_outputs, function_to_apply: nil, top_k: 1, _legacy: true)
@@ -62,7 +62,7 @@ module Transformers

  if !aggregation_strategy.nil?
  if aggregation_strategy.is_a?(String)
- aggregation_strategy = AggregationStrategy.new(aggregation_strategy.upcase).to_s
+ aggregation_strategy = AggregationStrategy.new(aggregation_strategy.downcase).to_s
  end
  if (
  [AggregationStrategy::FIRST, AggregationStrategy::MAX, AggregationStrategy::AVERAGE].include?(aggregation_strategy) &&
@@ -278,5 +278,80 @@ module Transformers
  end
  group_entities(entities)
  end
+
+ def aggregate_word(entities, aggregation_strategy)
+ raise Todo
+ end
+
+ def aggregate_words(entities, aggregation_strategy)
+ raise Todo
+ end
+
+ def group_sub_entities(entities)
+ # Get the first entity in the entity group
+ entity = entities[0][:entity].split("-", 2)[-1]
+ scores = entities.map { |entity| entity[:score] }
+ tokens = entities.map { |entity| entity[:word] }
+
+ entity_group = {
+ entity_group: entity,
+ score: scores.sum / scores.count.to_f,
+ word: @tokenizer.convert_tokens_to_string(tokens),
+ start: entities[0][:start],
+ end: entities[-1][:end]
+ }
+ entity_group
+ end
+
+ def get_tag(entity_name)
+ if entity_name.start_with?("B-")
+ bi = "B"
+ tag = entity_name[2..]
+ elsif entity_name.start_with?("I-")
+ bi = "I"
+ tag = entity_name[2..]
+ else
+ # It's not in B-, I- format
+ # Default to I- for continuation.
+ bi = "I"
+ tag = entity_name
+ end
+ [bi, tag]
+ end
+
+ def group_entities(entities)
+ entity_groups = []
+ entity_group_disagg = []
+
+ entities.each do |entity|
+ if entity_group_disagg.empty?
+ entity_group_disagg << entity
+ next
+ end
+
+ # If the current entity is similar and adjacent to the previous entity,
+ # append it to the disaggregated entity group
+ # The split is meant to account for the "B" and "I" prefixes
+ # Shouldn't merge if both entities are B-type
+ bi, tag = get_tag(entity[:entity])
+ _last_bi, last_tag = get_tag(entity_group_disagg[-1][:entity])
+
+ if tag == last_tag && bi != "B"
+ # Modify subword type to be previous_type
+ entity_group_disagg << entity
+ else
+ # If the current entity is different from the previous entity
+ # aggregate the disaggregated entity group
+ entity_groups << group_sub_entities(entity_group_disagg)
+ entity_group_disagg = [entity]
+ end
+ end
+ if entity_group_disagg.any?
+ # it's the last entity, add it to the entity groups
+ entity_groups << group_sub_entities(entity_group_disagg)
+ end
+
+ entity_groups
+ end
  end
  end
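These grouping helpers back the `aggregation_strategy` option whose string handling is fixed above (`downcase` instead of `upcase`), which is the "Fixed error with `aggregation_strategy` option" changelog entry. A hedged sketch of how they are exercised, assuming the `"ner"` task name from the README's named-entity recognition example and call-time option passing as in the other pipelines:

```ruby
ner = Transformers.pipeline("ner")

# With an aggregation strategy, adjacent B-/I- tagged subword entities are
# merged by group_entities/group_sub_entities into one span per entity group.
ner.("Ruby is a programming language created by Matz", aggregation_strategy: "simple")
```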
@@ -2,36 +2,19 @@ module Transformers
  class SentenceTransformer
  def initialize(model_id)
  @model_id = model_id
- @tokenizer = Transformers::AutoTokenizer.from_pretrained(model_id)
- @model = Transformers::AutoModel.from_pretrained(model_id)
+ @model = Transformers.pipeline("embedding", model_id)
  end

  def encode(sentences)
- singular = sentences.is_a?(String)
- sentences = [sentences] if singular
-
- input = @tokenizer.(sentences, padding: true, truncation: true, return_tensors: "pt")
- output = Torch.no_grad { @model.(**input) }[0]
-
  # TODO check modules.json
  if [
  "sentence-transformers/all-MiniLM-L6-v2",
  "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
  ].include?(@model_id)
- output = mean_pooling(output, input[:attention_mask])
- output = Torch::NN::Functional.normalize(output, p: 2, dim: 1).to_a
+ @model.(sentences)
  else
- output = output[0.., 0].to_a
+ @model.(sentences, pooling: "cls", normalize: false)
  end
-
- singular ? output[0] : output
- end
-
- private
-
- def mean_pooling(output, attention_mask)
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(output.size).float
- Torch.sum(output * input_mask_expanded, 1) / Torch.clamp(input_mask_expanded.sum(1), min: 1e-9)
  end
  end
  end
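`SentenceTransformer#encode` is kept as a thin wrapper over the new embedding pipeline, so 0.1.0-style callers keep working; a minimal sketch matching the old README usage:

```ruby
# Same interface as 0.1.0; internally this now delegates to
# Transformers.pipeline("embedding", ...) with model-appropriate pooling.
model = Transformers::SentenceTransformer.new("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(["This is an example sentence", "Each sentence is converted"])
```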
@@ -91,6 +91,10 @@ module Transformers
  get_vocab
  end

+ def backend_tokenizer
+ @tokenizer
+ end
+
  def convert_tokens_to_ids(tokens)
  if tokens.nil?
  return nil
@@ -130,6 +134,10 @@ module Transformers
  tokens
  end

+ def convert_tokens_to_string(tokens)
+ backend_tokenizer.decoder.decode(tokens)
+ end
+
  private

  def set_truncation_and_padding(
@@ -1,3 +1,3 @@
  module Transformers
- VERSION = "0.1.0"
+ VERSION = "0.1.2"
  end
data/lib/transformers.rb CHANGED
@@ -75,6 +75,7 @@ require_relative "transformers/models/vit/modeling_vit"
  # pipelines
  require_relative "transformers/pipelines/base"
  require_relative "transformers/pipelines/feature_extraction"
+ require_relative "transformers/pipelines/embedding"
  require_relative "transformers/pipelines/image_classification"
  require_relative "transformers/pipelines/image_feature_extraction"
  require_relative "transformers/pipelines/pt_utils"
@@ -97,4 +98,10 @@ module Transformers
  "not implemented yet"
  end
  end
+
+ class << self
+ # experimental
+ attr_accessor :fast_init
+ end
+ self.fast_init = false
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: transformers-rb
  version: !ruby/object:Gem::Version
- version: 0.1.0
+ version: 0.1.2
  platform: ruby
  authors:
  - Andrew Kane
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2024-08-19 00:00:00.000000000 Z
+ date: 2024-09-10 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: numo-narray
@@ -44,14 +44,14 @@ dependencies:
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- version: '0.5'
+ version: 0.5.2
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- version: '0.5'
+ version: 0.5.2
  - !ruby/object:Gem::Dependency
  name: torch-rb
  requirement: !ruby/object:Gem::Requirement
@@ -113,6 +113,7 @@ files:
  - lib/transformers/models/vit/modeling_vit.rb
  - lib/transformers/pipelines/_init.rb
  - lib/transformers/pipelines/base.rb
+ - lib/transformers/pipelines/embedding.rb
  - lib/transformers/pipelines/feature_extraction.rb
  - lib/transformers/pipelines/image_classification.rb
  - lib/transformers/pipelines/image_feature_extraction.rb
@@ -154,7 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.5.11
+ rubygems_version: 3.5.16
  signing_key:
  specification_version: 4
  summary: State-of-the-art transformers for Ruby