informers 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +213 -19
- data/lib/informers/configs.rb +10 -8
- data/lib/informers/model.rb +2 -14
- data/lib/informers/models.rb +1027 -13
- data/lib/informers/pipelines.rb +781 -14
- data/lib/informers/processors.rb +796 -0
- data/lib/informers/tokenizers.rb +166 -4
- data/lib/informers/utils/core.rb +4 -0
- data/lib/informers/utils/generation.rb +294 -0
- data/lib/informers/utils/image.rb +116 -0
- data/lib/informers/utils/math.rb +73 -0
- data/lib/informers/utils/tensor.rb +46 -0
- data/lib/informers/version.rb +1 -1
- data/lib/informers.rb +3 -0
- metadata +8 -5
data/lib/informers/tokenizers.rb
CHANGED
@@ -1,16 +1,65 @@
 module Informers
   class PreTrainedTokenizer
-    attr_reader :sep_token_id
+    attr_reader :mask_token, :mask_token_id, :sep_token_id

     def initialize(tokenizer_json, tokenizer_config)
       super()

+      @tokenizer_config = tokenizer_config
+
       @tokenizer = Tokenizers::Tokenizer.from_file(tokenizer_json)

-
-      @
+      # Add added_tokens to model
+      @special_tokens = []
+      @all_special_ids = []
+
+      @added_tokens = []
+      @tokenizer.added_tokens_decoder.each do |id, token|
+        @added_tokens << token
+
+        if token.special
+          @special_tokens << token.content
+          @all_special_ids << id
+        end
+      end
+
+      # Update additional_special_tokens
+      @additional_special_tokens = tokenizer_config["additional_special_tokens"] || []
+      @special_tokens.concat(@additional_special_tokens)
+
+      @mask_token = get_token("mask_token")
+      @mask_token_id = @tokenizer.token_to_id(@mask_token) if @mask_token
+
+      @sep_token = get_token("sep_token")
+      @sep_token_id = @tokenizer.token_to_id(@sep_token) if @sep_token

       @model_max_length = tokenizer_config["model_max_length"]
+
+      # for donut-base-finetuned-docvqa
+      if @model_max_length && @model_max_length > (1 << 63)
+        @model_max_length = 1 << 63
+      end
+    end
+
+    def get_token(*keys)
+      keys.each do |key|
+        item = @tokenizer_config[key]
+        if !item
+          next
+        end
+
+        if item.is_a?(Hash)
+          if item["__type"] == "AddedToken"
+            return item["content"]
+          else
+            raise Error, "Unknown token: #{item}"
+          end
+        else
+          return item
+        end
+      end
+
+      nil
     end

     def call(
@@ -76,6 +125,22 @@ module Informers
     def convert_tokens_to_string(tokens)
       @tokenizer.decoder.decode(tokens)
     end
+
+    def convert_tokens_to_ids(tokens)
+      tokens.map { |t| @tokenizer.token_to_id(t) }
+    end
+
+    def id_to_token(id)
+      @tokenizer.id_to_token(id)
+    end
+
+    def batch_decode(batch, **decode_args)
+      @tokenizer.decode_batch(batch, **decode_args)
+    end
+
+    def padding_side=(side)
+      @tokenizer.enable_padding(direction: side)
+    end
   end

   class BertTokenizer < PreTrainedTokenizer
@@ -91,11 +156,108 @@ module Informers
   class DistilBertTokenizer < PreTrainedTokenizer
   end

+  class T5Tokenizer < PreTrainedTokenizer
+  end
+
+  class GPT2Tokenizer < PreTrainedTokenizer
+    # _default_chat_template = `{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}`
+  end
+
+  class BartTokenizer < PreTrainedTokenizer
+  end
+
+  class RobertaTokenizer < PreTrainedTokenizer
+  end
+
+  class XLMRobertaTokenizer < PreTrainedTokenizer
+  end
+
+  class MPNetTokenizer < PreTrainedTokenizer
+  end
+
+  class CLIPTokenizer < PreTrainedTokenizer
+  end
+
+  class NllbTokenizer < PreTrainedTokenizer
+    attr_reader :language_regex, :language_codes, :lang_to_token
+
+    def initialize(tokenizer_json, tokenizer_config)
+      super(tokenizer_json, tokenizer_config)
+
+      @language_regex = /^[a-z]{3}_[A-Z][a-z]{3}$/
+      @language_codes = @special_tokens.filter { |x| @language_regex.match?(x) }
+      @lang_to_token = ->(x) { x } # Identity function
+    end
+
+    def _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs)
+      Utils._build_translation_inputs(self, raw_inputs, tokenizer_options, generate_kwargs)
+    end
+  end
+
+  class M2M100Tokenizer < PreTrainedTokenizer
+    attr_reader :language_regex, :language_codes, :lang_to_token
+
+    def initialize(tokenizer_json, tokenizer_config)
+      super(tokenizer_json, tokenizer_config)
+
+      @language_regex = /^__[a-z]{2,3}__$/
+      @language_codes = @special_tokens
+        .filter { |x| @language_regex.match?(x) }
+        .map { |x| x.slice(2, -2) }
+      @lang_to_token = ->(x) { "__#{x}__" }
+    end
+
+    def _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs)
+      Utils._build_translation_inputs(self, raw_inputs, tokenizer_options, generate_kwargs)
+    end
+  end
+
+  module Utils
+    def self._build_translation_inputs(slf, raw_inputs, tokenizer_options, generate_kwargs)
+      if !slf.respond_to?(:language_codes) || !slf.language_codes.is_a?(Array)
+        raise Error, "Tokenizer must have `language_codes` attribute set and it should be an array of language ids."
+      end
+      if !slf.respond_to?(:language_regex) || !slf.language_regex.is_a?(Regexp)
+        raise Error, "Tokenizer must have `language_regex` attribute set and it should be a regular expression."
+      end
+      if !slf.respond_to?(:lang_to_token) || !slf.lang_to_token.respond_to?(:call)
+        raise Error, "Tokenizer must have `lang_to_token` attribute set and it should be a function."
+      end
+      src_lang_token = generate_kwargs[:src_lang]
+      tgt_lang_token = generate_kwargs[:tgt_lang]
+
+      if !slf.language_codes.include?(tgt_lang_token)
+        raise Error, "Target language code #{tgt_lang_token.inspect} is not valid. Must be one of: #{slf.language_codes.join(", ")}"
+      end
+
+      if !src_lang_token.nil?
+        # Check that the source language is valid:
+        if !slf.language_codes.include?(src_lang_token)
+          raise Error, "Source language code #{src_lang_token.inspect} is not valid. Must be one of: #{slf.language_codes.join(", ")}"
+        end
+      end
+
+      # Override the `forced_bos_token_id` to force the correct language
+      generate_kwargs["forced_bos_token_id"] = slf.convert_tokens_to_ids([slf.lang_to_token.(tgt_lang_token)])[0]
+
+      slf.(raw_inputs, **tokenizer_options)
+    end
+  end
+
   class AutoTokenizer
     TOKENIZER_CLASS_MAPPING = {
+      "T5Tokenizer" => T5Tokenizer,
       "BertTokenizer" => BertTokenizer,
       "DebertaV2Tokenizer" => DebertaV2Tokenizer,
-      "DistilBertTokenizer" => DistilBertTokenizer
+      "DistilBertTokenizer" => DistilBertTokenizer,
+      "BartTokenizer" => BartTokenizer,
+      "RobertaTokenizer" => RobertaTokenizer,
+      "XLMRobertaTokenizer" => XLMRobertaTokenizer,
+      "MPNetTokenizer" => MPNetTokenizer,
+      "CLIPTokenizer" => CLIPTokenizer,
+      "GPT2Tokenizer" => GPT2Tokenizer,
+      "NllbTokenizer" => NllbTokenizer,
+      "M2M100Tokenizer" => M2M100Tokenizer
     }

     def self.from_pretrained(
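For context, a minimal sketch of the new tokenizer helpers. It is illustrative only: the file paths are hypothetical, it assumes you have a model's tokenizer.json and tokenizer_config.json on disk, and these classes are mostly used internally by the gem's pipelines. The methods shown delegate to the `tokenizers` gem as in the diff above.

    require "informers"
    require "json"

    # Hypothetical local files from a downloaded BERT-style model
    tokenizer_config = JSON.parse(File.read("tokenizer_config.json"))
    tokenizer = Informers::BertTokenizer.new("tokenizer.json", tokenizer_config)

    tokenizer.mask_token                         # read via get_token, e.g. "[MASK]"
    ids = tokenizer.convert_tokens_to_ids(["hello", "world"])
    tokenizer.id_to_token(ids.first)             # => "hello"
    tokenizer.batch_decode([ids])                # delegates to Tokenizers decode_batch
    tokenizer.padding_side = "left"              # delegates to enable_padding

For the translation tokenizers, `Utils._build_translation_inputs` validates `generate_kwargs[:src_lang]` and `[:tgt_lang]` against `language_codes`, sets `forced_bos_token_id` to the id of the target-language token, and then tokenizes the raw inputs.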
data/lib/informers/utils/generation.rb
ADDED
@@ -0,0 +1,294 @@
+module Informers
+  module Utils
+    class GenerationConfig
+      def initialize(kwargs)
+        @config = {}
+
+        # Parameters that control the length of the output
+        @config["max_length"] = kwargs["max_length"] || 20
+        @config["max_new_tokens"] = kwargs["max_new_tokens"]
+        @config["min_length"] = kwargs["min_length"] || 0
+        @config["min_new_tokens"] = kwargs["min_new_tokens"]
+        @config["early_stopping"] = kwargs["early_stopping"] || false
+        @config["max_time"] = kwargs["max_time"]
+
+        # Parameters that control the generation strategy used
+        @config["do_sample"] = kwargs["do_sample"] || false
+        @config["num_beams"] = kwargs["num_beams"] || 1
+        @config["num_beam_groups"] = kwargs["num_beam_groups"] || 1
+        @config["penalty_alpha"] = kwargs["penalty_alpha"]
+        @config["use_cache"] = kwargs.fetch("use_cache", true)
+
+        # Parameters for manipulation of the model output logits
+        @config["temperature"] = kwargs["temperature"] || 1.0
+        @config["top_k"] = kwargs["top_k"] || 50
+        @config["top_p"] = kwargs["top_p"] || 1.0
+        @config["typical_p"] = kwargs["typical_p"] || 1.0
+        @config["epsilon_cutoff"] = kwargs["epsilon_cutoff"] || 0.0
+        @config["eta_cutoff"] = kwargs["eta_cutoff"] || 0.0
+        @config["diversity_penalty"] = kwargs["diversity_penalty"] || 0.0
+        @config["repetition_penalty"] = kwargs["repetition_penalty"] || 1.0
+        @config["encoder_repetition_penalty"] = kwargs["encoder_repetition_penalty"] || 1.0
+        @config["length_penalty"] = kwargs["length_penalty"] || 1.0
+        @config["no_repeat_ngram_size"] = kwargs["no_repeat_ngram_size"] || 0
+        @config["bad_words_ids"] = kwargs["bad_words_ids"]
+        @config["force_words_ids"] = kwargs["force_words_ids"]
+        @config["renormalize_logits"] = kwargs["renormalize_logits"] || false
+        @config["constraints"] = kwargs["constraints"]
+        @config["forced_bos_token_id"] = kwargs["forced_bos_token_id"]
+        @config["forced_eos_token_id"] = kwargs["forced_eos_token_id"]
+        @config["remove_invalid_values"] = kwargs["remove_invalid_values"] || false
+        @config["exponential_decay_length_penalty"] = kwargs["exponential_decay_length_penalty"]
+        @config["suppress_tokens"] = kwargs["suppress_tokens"]
+        @config["begin_suppress_tokens"] = kwargs["begin_suppress_tokens"]
+        @config["forced_decoder_ids"] = kwargs["forced_decoder_ids"]
+
+        # Parameters that define the output variables of `generate`
+        @config["num_return_sequences"] = kwargs["num_return_sequences"] || 1
+        @config["output_attentions"] = kwargs["output_attentions"] || false
+        @config["output_hidden_states"] = kwargs["output_hidden_states"] || false
+        @config["output_scores"] = kwargs["output_scores"] || false
+        @config["return_dict_in_generate"] = kwargs["return_dict_in_generate"] || false
+
+        # Special tokens that can be used at generation time
+        @config["pad_token_id"] = kwargs["pad_token_id"]
+        @config["bos_token_id"] = kwargs["bos_token_id"]
+        @config["eos_token_id"] = kwargs["eos_token_id"]
+
+        # Generation parameters exclusive to encoder-decoder models
+        @config["encoder_no_repeat_ngram_size"] = kwargs["encoder_no_repeat_ngram_size"] || 0
+        @config["decoder_start_token_id"] = kwargs["decoder_start_token_id"]
+
+        # Wild card
+        @generation_kwargs = kwargs["generation_kwargs"] || {}
+      end
+
+      def [](key)
+        @config[key.to_s]
+      end
+
+      def merge!(config)
+        @config.merge!(config)
+      end
+    end
+
+    class Sampler
+      def initialize(generation_config)
+        super()
+        @generation_config = generation_config
+      end
+
+      def call(logits, index = -1)
+        # Sample from logits, of dims [batch, sequence_length, vocab_size].
+        # If index is specified, sample from [batch, index, vocab_size].
+        sample(logits, index)
+      end
+
+      def get_logits(logits, index)
+        vocab_size = Utils.dims(logits)[-1]
+
+        logs = logits.flatten
+
+        if index == -1
+          logs = logs.last(vocab_size)
+        else
+          raise Todo
+        end
+
+        # add temperature
+        if @generation_config["temperature"] > 0
+          logs = logs.map { |x| x / @generation_config["temperature"] }
+        end
+        logs
+      end
+
+      def self.get_sampler(generation_config)
+        if generation_config[:do_sample]
+          MultinomialSampler.new(generation_config)
+        elsif generation_config[:num_beams] > 1
+          BeamSearchSampler.new(generation_config)
+        else
+          if generation_config[:num_return_sequences] > 1
+            raise Error, "num_return_sequences has to be 1 when doing greedy search, but is #{generation_config[:num_return_sequences]}."
+          end
+          GreedySampler.new(generation_config)
+        end
+      end
+    end
+
+    class GreedySampler < Sampler
+      def sample(logits, index = -1)
+        # NOTE: no need to do log_softmax here since we only take the maximum
+        logs = get_logits(logits, index)
+        argmax = Utils.max(logs)[1]
+
+        # Note: score is meaningless in this context, since we are performing
+        # greedy search (p = 1 => log(p) = 0)
+        [
+          [argmax, 0]
+        ]
+      end
+    end
+
+    class BeamSearchSampler < Sampler
+      def sample(logits, index = -1)
+        k = Utils.dims(logits)[-1] # defaults to vocab size
+        if @generation_config["top_k"] > 0
+          k = [@generation_config["top_k"], k].min
+        end
+
+        # Get logits of nth token
+        logs = get_logits(logits, index)
+
+        # Get top k tokens
+        top_logits = Utils.get_top_items(logs, k)
+
+        # Compute softmax over logits
+        probabilities = Utils.softmax(top_logits.map { |x| x[1] })
+
+        Array.new(@generation_config["num_beams"]) do |i|
+          [
+            top_logits[i][0],
+            Math.log(probabilities[i])
+          ]
+        end
+      end
+    end
+
+    class LogitsProcessorList
+      def initialize
+        super
+        @processors = []
+      end
+
+      def push(item)
+        @processors << item
+      end
+
+      def concat(items)
+        @processors.concat(items)
+      end
+
+      def call(input_ids, batched_logits)
+        # NOTE: This is different from the Python code, since vanilla Ruby does not support vectorized operations.
+        # As a result, we apply each processor to each item in the batch.
+        batched_logits.each do |logits|
+          # Modifies logits inplace
+          @processors.each do |func|
+            func.(input_ids, logits)
+          end
+        end
+      end
+
+      def to_ary
+        @processors
+      end
+    end
+
+    class LogitsProcessor
+    end
+
+    class NoRepeatNGramLogitsProcessor < LogitsProcessor
+      def initialize(no_repeat_ngram_size)
+        super()
+        @no_repeat_ngram_size = no_repeat_ngram_size
+      end
+
+      def get_ngrams(prev_input_ids)
+        cur_len = prev_input_ids.length
+
+        ngrams = []
+        j = 0
+        while j < cur_len + 1 - @no_repeat_ngram_size
+          ngram = []
+          @no_repeat_ngram_size.times do |k|
+            ngram << prev_input_ids[j + k]
+          end
+          ngrams << ngram
+          j += 1
+        end
+
+        generated_ngram = {}
+        ngrams.each do |ngram|
+          prev_ngram = ngram.slice(0, ngram.length - 1)
+          prev_ngram_key = JSON.generate(prev_ngram)
+          prev_ngram_value = generated_ngram[prev_ngram_key] || []
+          prev_ngram_value << ngram[ngram.length - 1]
+          generated_ngram[prev_ngram_key] = prev_ngram_value
+        end
+        generated_ngram
+      end
+
+      def get_generated_ngrams(banned_ngrams, prev_input_ids)
+        ngram_idx = prev_input_ids.slice(prev_input_ids.length + 1 - @no_repeat_ngram_size, prev_input_ids.length)
+        banned = banned_ngrams[JSON.generate(ngram_idx)] || []
+        banned
+      end
+
+      def calc_banned_ngram_tokens(prev_input_ids)
+        banned_tokens = []
+        if prev_input_ids.length + 1 < @no_repeat_ngram_size
+          # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
+          banned_tokens
+        else
+          generated_ngrams = get_ngrams(prev_input_ids)
+          banned_tokens = get_generated_ngrams(generated_ngrams, prev_input_ids)
+          banned_tokens
+        end
+      end
+
+      def call(input_ids, logits)
+        banned_tokens = calc_banned_ngram_tokens(input_ids)
+
+        banned_tokens.each do |token|
+          logits[token] = -Float::INFINITY
+        end
+        logits
+      end
+    end
+
+    class MinLengthLogitsProcessor < LogitsProcessor
+      def initialize(min_length, eos_token_id)
+        super()
+        @min_length = min_length
+        @eos_token_id = eos_token_id.is_a?(Array) ? eos_token_id : [eos_token_id]
+      end
+
+      def call(input_ids, logits)
+        if input_ids.length < @min_length
+          @eos_token_id.each do |eos_token|
+            logits[eos_token] = -Float::INFINITY
+          end
+        end
+
+        logits
+      end
+    end
+
+    class ForcedBOSTokenLogitsProcessor < LogitsProcessor
+      def initialize(bos_token_id)
+        super()
+        @bos_token_id = bos_token_id
+      end
+
+      def call(input_ids, logits)
+        if input_ids.length == 1
+          logits.map! { -Float::INFINITY }
+          logits[@bos_token_id] = 0
+        end
+        logits
+      end
+    end
+
+    class ForcedEOSTokenLogitsProcessor < LogitsProcessor
+      def initialize(max_length, forced_eos_token_id)
+        super()
+        @max_length = max_length
+        @forced_eos_token_id = forced_eos_token_id
+      end
+
+      def call(input_ids, logits)
+      end
+    end
+  end
+end
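A rough sketch of how these generation utilities compose (the vocabulary size, token ids, and config values are all illustrative; the gem's models wire these up internally during generation):

    config = Informers::Utils::GenerationConfig.new({"num_beams" => 2, "top_k" => 5})
    sampler = Informers::Utils::Sampler.get_sampler(config)  # => BeamSearchSampler

    processors = Informers::Utils::LogitsProcessorList.new
    processors.push(Informers::Utils::MinLengthLogitsProcessor.new(5, 2)) # eos id 2

    logits = Array.new(10, 0.0)   # toy vocab of 10 tokens
    processors.([0], [logits])    # one token generated so far; mutates in place
    logits[2]                     # => -Infinity (EOS banned until min_length)

    sampler.([[logits]])          # => [[token_id, Math.log(prob)], ...], one per beam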
data/lib/informers/utils/image.rb
ADDED
@@ -0,0 +1,116 @@
+module Informers
+  module Utils
+    class RawImage
+      RESAMPLING_MAPPING = {
+        0 => "nearest",
+        1 => "lanczos",
+        2 => "bilinear",
+        3 => "bicubic",
+        4 => "box",
+        5 => "hamming",
+      }
+
+      attr_reader :image, :width, :height, :channels
+
+      def initialize(image)
+        @image = image
+        @width = image.width
+        @height = image.height
+        @channels = image.bands
+      end
+
+      def data
+        @image.write_to_memory.unpack("C*")
+      end
+
+      def size
+        [@width, @height]
+      end
+
+      def resize(width, height, resample: 2)
+        resample_method = RESAMPLING_MAPPING[resample] || resample
+
+        case resample_method
+        when "bilinear", "bicubic"
+          img =
+            @image.affine(
+              [width / @width.to_f, 0, 0, height / @height.to_f],
+              interpolate: Vips::Interpolate.new(resample_method.to_sym)
+            )
+        else
+          raise Todo
+        end
+
+        RawImage.new(img)
+      end
+
+      def center_crop(crop_width, crop_height)
+        # If the image is already the desired size, return it
+        if @width == crop_width && @height == crop_height
+          return self
+        end
+
+        # Determine bounds of the image in the new canvas
+        width_offset = (@width - crop_width) / 2.0
+        height_offset = (@height - crop_height) / 2.0
+
+        if width_offset >= 0 && height_offset >= 0
+          # Cropped image lies entirely within the original image
+          img = @image.crop(
+            width_offset.floor,
+            height_offset.floor,
+            crop_width,
+            crop_height
+          )
+        elsif width_offset <= 0 && height_offset <= 0
+          raise Todo
+        else
+          raise Todo
+        end
+
+        RawImage.new(img)
+      end
+
+      def rgb
+        if @channels == 3
+          return self
+        end
+
+        raise Todo
+      end
+
+      def save(path)
+        @image.write_to_file(path)
+      end
+
+      def self.read(input)
+        if input.is_a?(RawImage)
+          input
+        elsif input.is_a?(URI)
+          require "open-uri"
+
+          RawImage.new(Vips::Image.new_from_buffer(input.read, ""))
+        elsif input.is_a?(String)
+          RawImage.new(Vips::Image.new_from_file(input))
+        else
+          raise ArgumentError, "Unsupported input type: #{input.class.name}"
+        end
+      end
+
+      def self.from_array(input)
+        c, h, w = Utils.dims(input)
+        pixel_data = Array.new(w * h * c)
+
+        input.each_with_index do |cv, ci|
+          cv.each_with_index do |hv, hi|
+            hv.each_with_index do |v, wi|
+              pixel_data[(hi * w * c) + (wi * c) + ci] = v
+            end
+          end
+        end
+
+        RawImage.new(Vips::Image.new_from_memory_copy(pixel_data.pack("C*"), w, h, c, :uchar))
+      end
+    end
+  end
+end
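A small usage sketch of the new image wrapper (the file name is hypothetical, and it assumes a 3-channel image; `RawImage` wraps a `Vips::Image` from the ruby-vips gem):

    image = Informers::Utils::RawImage.read("cat.jpg")
    image.size                        # => [width, height]

    resized = image.resize(224, 224)  # resample: 2 is bilinear (see RESAMPLING_MAPPING)
    square = resized.center_crop(224, 224)
    square.save("cat-224.jpg")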
data/lib/informers/utils/math.rb
CHANGED
@@ -1,5 +1,75 @@
 module Informers
   module Utils
+    def self.interpolate_data(input, in_shape, out_shape, mode = "bilinear", align_corners = false)
+      in_channels, in_height, in_width = in_shape
+      out_height, out_width = out_shape
+
+      # TODO use mode and align_corners
+
+      # Output image dimensions
+      x_scale = out_width / in_width.to_f
+      y_scale = out_height / in_height.to_f
+
+      # Output image
+      out_img = Array.new(out_height * out_width * in_channels)
+
+      # Pre-calculate strides
+      in_stride = in_height * in_width;
+      out_stride = out_height * out_width;
+
+      out_height.times do |i|
+        out_width.times do |j|
+          # Calculate output offset
+          out_offset = i * out_width + j
+
+          # Calculate input pixel coordinates
+          x = (j + 0.5) / x_scale - 0.5
+          y = (i + 0.5) / y_scale - 0.5
+
+          # Calculate the four nearest input pixels
+          # We also check if the input pixel coordinates are within the image bounds
+          x1 = x.floor
+          y1 = y.floor
+          x2 = [x1 + 1, in_width - 1].min
+          y2 = [y1 + 1, in_height - 1].min
+
+          x1 = [x1, 0].max
+          y1 = [y1, 0].max
+
+          # Calculate the fractional distances between the input pixel and the four nearest pixels
+          s = x - x1
+          t = y - y1
+
+          # Perform bilinear interpolation
+          w1 = (1 - s) * (1 - t)
+          w2 = s * (1 - t)
+          w3 = (1 - s) * t
+          w4 = s * t
+
+          # Calculate the four nearest input pixel indices
+          y_stride = y1 * in_width
+          x_stride = y2 * in_width
+          idx1 = y_stride + x1
+          idx2 = y_stride + x2
+          idx3 = x_stride + x1
+          idx4 = x_stride + x2
+
+          in_channels.times do |k|
+            # Calculate channel offset
+            c_offset = k * in_stride
+
+            out_img[k * out_stride + out_offset] =
+              w1 * input[c_offset + idx1] +
+              w2 * input[c_offset + idx2] +
+              w3 * input[c_offset + idx3] +
+              w4 * input[c_offset + idx4]
+          end
+        end
+      end
+
+      out_img
+    end
+
     def self.softmax(arr)
       # Compute the maximum value in the array
       max_val = arr.max
@@ -17,6 +87,9 @@ module Informers
     end

     def self.sigmoid(arr)
+      if arr[0].is_a?(Array)
+        return arr.map { |a| sigmoid(a) }
+      end
       arr.map { |v| 1 / (1 + Math.exp(-v)) }
     end

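Two quick sketches of the new math helpers (shapes and values are illustrative). `interpolate_data` takes flattened channel-major pixel data plus `[channels, height, width]` and `[out_height, out_width]` shapes:

    input = [
      0.0, 1.0,
      2.0, 3.0
    ] # one 2x2 channel, flattened
    out = Informers::Utils.interpolate_data(input, [1, 2, 2], [4, 4])
    out.length # => 16 (4 * 4 * 1)

    # sigmoid now recurses into nested arrays (e.g. batched logits)
    Informers::Utils.sigmoid([[0.0], [1.0]]) # => [[0.5], [0.7310585786300049]]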