transformers-rb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +203 -0
- data/README.md +163 -0
- data/lib/transformers/activations.rb +57 -0
- data/lib/transformers/configuration_utils.rb +285 -0
- data/lib/transformers/convert_slow_tokenizer.rb +90 -0
- data/lib/transformers/data/processors/squad.rb +115 -0
- data/lib/transformers/dynamic_module_utils.rb +25 -0
- data/lib/transformers/feature_extraction_utils.rb +110 -0
- data/lib/transformers/hf_hub/constants.rb +71 -0
- data/lib/transformers/hf_hub/errors.rb +11 -0
- data/lib/transformers/hf_hub/file_download.rb +764 -0
- data/lib/transformers/hf_hub/utils/_errors.rb +94 -0
- data/lib/transformers/hf_hub/utils/_headers.rb +109 -0
- data/lib/transformers/image_processing_base.rb +169 -0
- data/lib/transformers/image_processing_utils.rb +63 -0
- data/lib/transformers/image_transforms.rb +208 -0
- data/lib/transformers/image_utils.rb +165 -0
- data/lib/transformers/modeling_outputs.rb +81 -0
- data/lib/transformers/modeling_utils.rb +888 -0
- data/lib/transformers/models/auto/auto_factory.rb +138 -0
- data/lib/transformers/models/auto/configuration_auto.rb +61 -0
- data/lib/transformers/models/auto/feature_extraction_auto.rb +20 -0
- data/lib/transformers/models/auto/image_processing_auto.rb +104 -0
- data/lib/transformers/models/auto/modeling_auto.rb +80 -0
- data/lib/transformers/models/auto/tokenization_auto.rb +160 -0
- data/lib/transformers/models/bert/configuration_bert.rb +65 -0
- data/lib/transformers/models/bert/modeling_bert.rb +836 -0
- data/lib/transformers/models/bert/tokenization_bert.rb +115 -0
- data/lib/transformers/models/bert/tokenization_bert_fast.rb +52 -0
- data/lib/transformers/models/distilbert/configuration_distilbert.rb +63 -0
- data/lib/transformers/models/distilbert/modeling_distilbert.rb +616 -0
- data/lib/transformers/models/distilbert/tokenization_distilbert.rb +114 -0
- data/lib/transformers/models/distilbert/tokenization_distilbert_fast.rb +71 -0
- data/lib/transformers/models/vit/configuration_vit.rb +60 -0
- data/lib/transformers/models/vit/image_processing_vit.rb +170 -0
- data/lib/transformers/models/vit/modeling_vit.rb +506 -0
- data/lib/transformers/pipelines/_init.rb +348 -0
- data/lib/transformers/pipelines/base.rb +301 -0
- data/lib/transformers/pipelines/feature_extraction.rb +47 -0
- data/lib/transformers/pipelines/image_classification.rb +110 -0
- data/lib/transformers/pipelines/image_feature_extraction.rb +56 -0
- data/lib/transformers/pipelines/pt_utils.rb +53 -0
- data/lib/transformers/pipelines/question_answering.rb +508 -0
- data/lib/transformers/pipelines/text_classification.rb +123 -0
- data/lib/transformers/pipelines/token_classification.rb +282 -0
- data/lib/transformers/ruby_utils.rb +33 -0
- data/lib/transformers/sentence_transformer.rb +37 -0
- data/lib/transformers/tokenization_utils.rb +152 -0
- data/lib/transformers/tokenization_utils_base.rb +937 -0
- data/lib/transformers/tokenization_utils_fast.rb +386 -0
- data/lib/transformers/torch_utils.rb +25 -0
- data/lib/transformers/utils/_init.rb +31 -0
- data/lib/transformers/utils/generic.rb +107 -0
- data/lib/transformers/utils/hub.rb +209 -0
- data/lib/transformers/utils/import_utils.rb +45 -0
- data/lib/transformers/utils/logging.rb +52 -0
- data/lib/transformers/version.rb +3 -0
- data/lib/transformers-rb.rb +1 -0
- data/lib/transformers.rb +100 -0
- data/licenses/LICENSE-huggingface-hub.txt +201 -0
- data/licenses/LICENSE-sentence-transformers.txt +201 -0
- data/licenses/NOTICE-sentence-transformers.txt +5 -0
- metadata +161 -0
@@ -0,0 +1,386 @@
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module Transformers
+  class PreTrainedTokenizerFast < PreTrainedTokenizerBase
+    def initialize(*args, **kwargs)
+      tokenizer_object = kwargs.delete(:tokenizer_object)
+      slow_tokenizer = kwargs.delete(:__slow_tokenizer)
+      fast_tokenizer_file = kwargs.delete(:tokenizer_file)
+      from_slow = kwargs.delete(:from_slow) { false }
+      _added_tokens_decoder = kwargs.delete(:added_tokens_decoder)
+
+      if !tokenizer_object.nil?
+        fast_tokenizer = Copy.deepcopy(tokenizer_object)
+      elsif !fast_tokenizer_file.nil? && !from_slow
+        # We have a serialization from tokenizers which let us directly build the backend
+        fast_tokenizer = Tokenizers::Tokenizer.from_file(fast_tokenizer_file)
+      elsif !slow_tokenizer.nil?
+        # We need to convert a slow tokenizer to build the backend
+        fast_tokenizer = ConvertSlowTokenizer.convert_slow_tokenizer(slow_tokenizer)
+      elsif !@slow_tokenizer_class.nil?
+        # We need to create and convert a slow tokenizer to build the backend
+        slow_tokenizer = @slow_tokenizer_class.new(*args, **kwargs)
+        fast_tokenizer = ConvertSlowTokenizer.convert_slow_tokenizer(slow_tokenizer)
+      else
+        raise ArgumentError, <<~MSG
+          Couldn't instantiate the backend tokenizer from one of:
+          (1) a `tokenizers` library serialization file,
+          (2) a slow tokenizer instance to convert or
+          (3) an equivalent slow tokenizer class to instantiate and convert.
+          You need to have sentencepiece installed to convert a slow tokenizer to a fast one.
+        MSG
+      end
+
+      @tokenizer = fast_tokenizer
+
+      if !slow_tokenizer.nil?
+        kwargs.merge!(slow_tokenizer.init_kwargs)
+      end
+
+      @decode_use_source_tokenizer = false
+
+      _truncation = @tokenizer.truncation
+
+      if !_truncation.nil?
+        _truncation = _truncation.transform_keys(&:to_sym)
+        @tokenizer.enable_truncation(_truncation[:max_length], **_truncation.except(:max_length))
+        kwargs[:max_length] ||= _truncation[:max_length]
+        kwargs[:truncation_side] ||= _truncation[:direction]
+        kwargs[:stride] ||= _truncation[:stride]
+        kwargs[:truncation_strategy] ||= _truncation[:strategy]
+      else
+        @tokenizer.no_truncation
+      end
+
+      _padding = @tokenizer.padding
+      if !_padding.nil?
+        _padding = _padding.transform_keys(&:to_sym)
+        @tokenizer.enable_padding(**_padding)
+        kwargs[:pad_token] ||= _padding[:pad_token]
+        kwargs[:pad_token_type_id] ||= _padding[:pad_token_type_id]
+        kwargs[:padding_side] ||= _padding[:direction]
+        kwargs[:max_length] ||= _padding[:length]
+        kwargs[:pad_to_multiple_of] ||= _padding[:pad_to_multiple_of]
+      end
+
+      # We call this after having initialized the backend tokenizer because we update it.
+      super(**kwargs)
+    end
+
+    def is_fast
+      true
+    end
+
+    def get_vocab
+      @tokenizer.vocab(with_added_tokens: true)
+    end
+
+    def vocab
+      get_vocab
+    end
+
+    def convert_tokens_to_ids(tokens)
+      if tokens.nil?
+        return nil
+      end
+
+      if tokens.is_a?(String)
+        return _convert_token_to_id_with_added_voc(tokens)
+      end
+
+      ids = []
+      tokens.each do |token|
+        ids << _convert_token_to_id_with_added_voc(token)
+      end
+      ids
+    end
+
+    def _convert_token_to_id_with_added_voc(token)
+      index = @tokenizer.token_to_id(token)
+      if index.nil?
+        return unk_token_id
+      end
+      index
+    end
+
+    def convert_ids_to_tokens(ids, skip_special_tokens: false)
+      if ids.is_a?(Integer)
+        return @tokenizer.id_to_token(ids)
+      end
+      tokens = []
+      ids.each do |index|
+        index = index.to_i
+        if skip_special_tokens && @all_special_ids.include?(index)
+          next
+        end
+        tokens << @tokenizer.id_to_token(index)
+      end
+      tokens
+    end
+
+    private
+
+    def set_truncation_and_padding(
+      padding_strategy:,
+      truncation_strategy:,
+      max_length:,
+      stride:,
+      pad_to_multiple_of:
+    )
+      _truncation = @tokenizer.truncation
+      _padding = @tokenizer.padding
+      # Set truncation and padding on the backend tokenizer
+      if truncation_strategy == TruncationStrategy::DO_NOT_TRUNCATE
+        if !_truncation.nil?
+          @tokenizer.no_truncation
+        end
+      else
+        target = {
+          max_length: max_length,
+          stride: stride,
+          strategy: truncation_strategy,
+          direction: @truncation_side
+        }
+
+        # _truncation might contain more keys that the target `transformers`
+        # supports. Use only the target keys to trigger `enable_truncation`.
+        # This should enable this code to works on various `tokenizers`
+        # targets.
+        if _truncation.nil?
+          current = nil
+        else
+          current = target.to_h { |k, _| [k, _truncation[k]] }
+        end
+
+        if current != target
+          @tokenizer.enable_truncation(target.delete(:max_length), **target)
+        end
+      end
+
+      if padding_strategy == PaddingStrategy::DO_NOT_PAD
+        if !_padding.nil?
+          @tokenizer.no_padding
+        end
+      else
+        length = padding_strategy == PaddingStrategy::MAX_LENGTH ? max_length : nil
+        target = {
+          length: length,
+          direction: @padding_side,
+          pad_id: @pad_token_id,
+          pad_token: @pad_token,
+          pad_type_id: @pad_token_type_id,
+          pad_to_multiple_of: pad_to_multiple_of
+        }
+        if _padding != target
+          @tokenizer.enable_padding(**target)
+        end
+      end
+    end
+
+    def _batch_encode_plus(
+      batch_text_or_text_pairs,
+      add_special_tokens: true,
+      padding_strategy: PaddingStrategy::DO_NOT_PAD,
+      truncation_strategy: TruncationStrategy::DO_NOT_TRUNCATE,
+      max_length: nil,
+      stride: 0,
+      is_split_into_words: false,
+      pad_to_multiple_of: nil,
+      return_tensors: nil,
+      return_token_type_ids: nil,
+      return_attention_mask: nil,
+      return_overflowing_tokens: false,
+      return_special_tokens_mask: false,
+      return_offsets_mapping: false,
+      return_length: false,
+      verbose: true
+    )
+      if !batch_text_or_text_pairs.is_a?(Array)
+        raise TypeError, "batch_text_or_text_pairs has to be an array (got #{batch_text_or_text_pairs.class.name})"
+      end
+
+      # Set the truncation and padding strategy and restore the initial configuration
+      set_truncation_and_padding(
+        padding_strategy: padding_strategy,
+        truncation_strategy: truncation_strategy,
+        max_length: max_length,
+        stride: stride,
+        pad_to_multiple_of: pad_to_multiple_of
+      )
+
+      encodings =
+        @tokenizer.encode_batch(
+          batch_text_or_text_pairs,
+          add_special_tokens: add_special_tokens,
+          is_pretokenized: is_split_into_words,
+        )
+
+      # Convert encoding to dict
+      # `Tokens` has type: Tuple[
+      #                       List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]],
+      #                       List[EncodingFast]
+      #                    ]
+      # with nested dimensions corresponding to batch, overflows, sequence length
+      tokens_and_encodings =
+        encodings.map do |encoding|
+          _convert_encoding(
+            encoding: encoding,
+            return_token_type_ids: return_token_type_ids,
+            return_attention_mask: return_attention_mask,
+            return_overflowing_tokens: return_overflowing_tokens,
+            return_special_tokens_mask: return_special_tokens_mask,
+            return_offsets_mapping: return_offsets_mapping,
+            return_length: return_length,
+            verbose: verbose
+          )
+        end
+
+      # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
+      # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
+      # (we say ~ because the number of overflow varies with the example in the batch)
+      #
+      # To match each overflowing sample with the original sample in the batch
+      # we add an overflow_to_sample_mapping array (see below)
+      sanitized_tokens = {}
+      tokens_and_encodings[0][0].each_key do |key|
+        stack = tokens_and_encodings.map { |item, _| item[key][0] }
+        sanitized_tokens[key] = stack
+      end
+      sanitized_encodings = tokens_and_encodings.map { |_, item| item[0] }
+
+      # If returning overflowing tokens, we need to return a mapping
+      # from the batch idx to the original sample
+      if return_overflowing_tokens
+        overflow_to_sample_mapping = []
+        tokens_and_encodings.each_with_index do |(toks, _), i|
+          overflow_to_sample_mapping += [i] * toks["input_ids"].length
+        end
+        sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
+      end
+
+      sanitized_tokens["input_ids"].each do |input_ids|
+        _eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
+      end
+
+      BatchEncoding.new(data: sanitized_tokens, encoding: sanitized_encodings, tensor_type: return_tensors)
+    end
+
+    def _convert_encoding(
+      encoding:,
+      return_token_type_ids: nil,
+      return_attention_mask: nil,
+      return_overflowing_tokens: false,
+      return_special_tokens_mask: false,
+      return_offsets_mapping: false,
+      return_length: false,
+      verbose: true
+    )
+      if return_token_type_ids.nil?
+        return_token_type_ids = self.class.model_input_names.include?("token_type_ids")
+      end
+      if return_attention_mask.nil?
+        return_attention_mask = self.class.model_input_names.include?("attention_mask")
+      end
+
+      if return_overflowing_tokens && !encoding.overflowing.nil?
+        encodings = [encoding] + encoding.overflowing
+      else
+        encodings = [encoding]
+      end
+
+      encoding_dict = Hash.new { |h, k| h[k] = [] }
+      encodings.each do |e|
+        encoding_dict["input_ids"] << e.ids
+
+        if return_token_type_ids
+          encoding_dict["token_type_ids"] << e.type_ids
+        end
+        if return_attention_mask
+          encoding_dict["attention_mask"] << e.attention_mask
+        end
+        if return_special_tokens_mask
+          encoding_dict["special_tokens_mask"] << e.special_tokens_mask
+        end
+        if return_offsets_mapping
+          encoding_dict["offset_mapping"] << e.offsets
+        end
+        if return_length
+          encoding_dict["length"] << e.ids.length
+        end
+      end
+
+      [encoding_dict, encodings]
+    end
+
+    def _encode_plus(
+      text:,
+      text_pair: nil,
+      add_special_tokens: true,
+      padding_strategy: PaddingStrategy::DO_NOT_PAD,
+      truncation_strategy: TruncationStrategy::DO_NOT_TRUNCATE,
+      max_length: nil,
+      stride: 0,
+      is_split_into_words: false,
+      pad_to_multiple_of: nil,
+      return_tensors: nil,
+      return_token_type_ids: nil,
+      return_attention_mask: nil,
+      return_overflowing_tokens: false,
+      return_special_tokens_mask: false,
+      return_offsets_mapping: false,
+      return_length: false,
+      verbose: true,
+      **kwargs
+    )
+      batched_input = text_pair ? [[text, text_pair]] : [text]
+      batched_output =
+        _batch_encode_plus(
+          batched_input,
+          is_split_into_words: is_split_into_words,
+          add_special_tokens: add_special_tokens,
+          padding_strategy: padding_strategy,
+          truncation_strategy: truncation_strategy,
+          max_length: max_length,
+          stride: stride,
+          pad_to_multiple_of: pad_to_multiple_of,
+          return_tensors: return_tensors,
+          return_token_type_ids: return_token_type_ids,
+          return_attention_mask: return_attention_mask,
+          return_overflowing_tokens: return_overflowing_tokens,
+          return_special_tokens_mask: return_special_tokens_mask,
+          return_offsets_mapping: return_offsets_mapping,
+          return_length: return_length,
+          verbose: verbose,
+          **kwargs
+        )
+
+      # Return tensor is None, then we can remove the leading batch axis
+      # Overflowing tokens are returned as a batch of output so we keep them in this case
+      if return_tensors.nil? && !return_overflowing_tokens
+        batched_output =
+          BatchEncoding.new(
+            data: batched_output.items.to_h { |key, value|
+              [key, value.length > 0 && value[0].is_a?(Array) ? value[0] : value]
+            },
+            encoding: batched_output.encodings
+          )
+      end
+
+      _eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
+
+      batched_output
+    end
+  end
+end
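The hunk above adds `Transformers::PreTrainedTokenizerFast`, a wrapper around a `tokenizers` backend object. A minimal usage sketch (not taken from the gem; the `tokenizer.json` path is hypothetical, and in normal use this object is built for you by the auto/tokenizer loading code listed above rather than constructed by hand):

```ruby
# Minimal sketch, assuming a local tokenizer.json produced by the `tokenizers` library.
require "transformers-rb"

backend = Tokenizers::Tokenizer.from_file("tokenizer.json")
tokenizer = Transformers::PreTrainedTokenizerFast.new(tokenizer_object: backend)

ids = tokenizer.convert_tokens_to_ids(["hello", "world"])  # unknown tokens fall back to unk_token_id
tokens = tokenizer.convert_ids_to_tokens(ids)              # round-trips through the backend vocab
vocab_size = tokenizer.get_vocab.size                      # includes added tokens
```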
@@ -0,0 +1,25 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module Transformers
+  module TorchUtils
+    def self.apply_chunking_to_forward(forward_fn, chunk_size, chunk_dim, *input_tensors)
+      if chunk_size > 0
+        raise Todo
+      end
+
+      forward_fn.(*input_tensors)
+    end
+  end
+end
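`TorchUtils.apply_chunking_to_forward` only supports the unchunked path in this release (a positive `chunk_size` raises `Todo`); otherwise it simply calls the forward proc on the inputs. A small sketch of that path, assuming torch-rb is available:

```ruby
# Sketch only: with chunk_size 0 the helper just forwards the tensors.
require "torch"

forward = ->(x) { x * 2 }
x = Torch.tensor([1.0, 2.0, 3.0])
Transformers::TorchUtils.apply_chunking_to_forward(forward, 0, 0, x)
# => tensor([2.0, 4.0, 6.0])
```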
@@ -0,0 +1,31 @@
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module Transformers
+  WEIGHTS_NAME = "pytorch_model.bin"
+  WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json"
+  TF2_WEIGHTS_NAME = "tf_model.h5"
+  TF2_WEIGHTS_INDEX_NAME = "tf_model.h5.index.json"
+  TF_WEIGHTS_NAME = "model.ckpt"
+  FLAX_WEIGHTS_NAME = "flax_model.msgpack"
+  FLAX_WEIGHTS_INDEX_NAME = "flax_model.msgpack.index.json"
+  SAFE_WEIGHTS_NAME = "model.safetensors"
+  SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"
+  CONFIG_NAME = "config.json"
+  FEATURE_EXTRACTOR_NAME = "preprocessor_config.json"
+  IMAGE_PROCESSOR_NAME = FEATURE_EXTRACTOR_NAME
+  PROCESSOR_NAME = "processor_config.json"
+  GENERATION_CONFIG_NAME = "generation_config.json"
+  MODEL_CARD_NAME = "modelcard.json"
+end
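The constants above mirror the standard Hugging Face Hub file names. An illustrative (hypothetical) use, probing a local model directory for the preferred weights file:

```ruby
# Illustrative only; the directory path is hypothetical.
dir = "/path/to/model"
weights =
  [Transformers::SAFE_WEIGHTS_NAME, Transformers::WEIGHTS_NAME]
    .map { |name| File.join(dir, name) }
    .find { |path| File.exist?(path) }
config = File.join(dir, Transformers::CONFIG_NAME)
```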
@@ -0,0 +1,107 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module Transformers
+  class ModelOutput
+    def self.attributes
+      @attributes ||= []
+    end
+
+    def self.attribute(attribute)
+      attributes << attribute.to_sym
+
+      define_method(attribute) do
+        self[attribute]
+      end
+    end
+
+    def initialize(**kwargs)
+      @data = kwargs
+    end
+
+    def [](k)
+      if k.is_a?(String) || k.is_a?(Symbol)
+        @data[k.to_sym]
+      else
+        to_tuple[k]
+      end
+    end
+
+    def to_tuple
+      self.class.attributes.map { |k| @data[k] }.compact
+    end
+  end
+
+  class ExplicitEnum
+    def initialize(value)
+      expected = self.class.constants.map { |k| self.class.const_get(k) }
+      unless expected.include?(value)
+        raise ArgumentError, "#{value} is not a valid #{self.class.name}, please select one of #{expected.inspect}"
+      end
+      @value = value
+    end
+
+    def to_s
+      @value
+    end
+  end
+
+  class PaddingStrategy < ExplicitEnum
+    LONGEST = "longest"
+    MAX_LENGTH = "max_length"
+    DO_NOT_PAD = "do_not_pad"
+  end
+
+  class TensorType < ExplicitEnum
+    PYTORCH = "pt"
+    TENSORFLOW = "tf"
+    NUMPY = "np"
+    JAX = "jax"
+    MLX = "mlx"
+  end
+
+  module Utils
+    def self.infer_framework(model_class)
+      if model_class < Torch::NN::Module
+        "pt"
+      else
+        raise TypeError, "Could not infer framework from class #{model_class}."
+      end
+    end
+
+    def self._is_numo(x)
+      x.is_a?(Numo::NArray)
+    end
+
+    def self.is_numo_array(x)
+      _is_numo(x)
+    end
+
+    def self._is_torch(x)
+      x.is_a?(Torch::Tensor)
+    end
+
+    def self.is_torch_tensor(x)
+      _is_torch(x)
+    end
+
+    def self._is_torch_device(x)
+      x.is_a?(Torch::Device)
+    end
+
+    def self.is_torch_device(x)
+      _is_torch_device(x)
+    end
+  end
+end
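The `ModelOutput` base class above stores its keyword arguments in a hash and exposes them by name or by position, while `ExplicitEnum` subclasses such as `PaddingStrategy` and `TensorType` validate their values on construction. A hypothetical subclass, to illustrate the `ModelOutput` behavior:

```ruby
# Hypothetical subclass for illustration: `attribute` registers a reader backed by
# the kwargs hash, and `[]` accepts a name (String/Symbol) or an index into to_tuple.
class ExampleOutput < Transformers::ModelOutput
  attribute :logits
  attribute :hidden_states
end

out = ExampleOutput.new(logits: [0.1, 0.9])
out.logits                # => [0.1, 0.9]
out["logits"]             # => [0.1, 0.9] (string keys are symbolized)
out[0]                    # => [0.1, 0.9] (positional; nil attributes are compacted out)
ExampleOutput.attributes  # => [:logits, :hidden_states]
```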