informers 1.0.3 → 1.1.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +137 -7
- data/lib/informers/configs.rb +10 -8
- data/lib/informers/model.rb +2 -9
- data/lib/informers/models.rb +1160 -15
- data/lib/informers/pipelines.rb +943 -11
- data/lib/informers/processors.rb +856 -0
- data/lib/informers/tokenizers.rb +159 -5
- data/lib/informers/utils/audio.rb +18 -0
- data/lib/informers/utils/core.rb +4 -0
- data/lib/informers/utils/ffmpeg.rb +45 -0
- data/lib/informers/utils/generation.rb +294 -0
- data/lib/informers/utils/image.rb +116 -0
- data/lib/informers/utils/math.rb +73 -0
- data/lib/informers/utils/tensor.rb +46 -0
- data/lib/informers/version.rb +1 -1
- data/lib/informers.rb +6 -0
- metadata +10 -5
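
This release adds a large number of new pipeline tasks to pipelines.rb (fill-mask, summarization, translation, text2text/text generation, zero-shot classification, plus image and audio pipelines). A minimal usage sketch, assuming the gem's top-level Informers.pipeline helper and the default checkpoints registered in SUPPORTED_TASKS in the diff below; "image.jpg" is a placeholder path and models download on first use:

  require "informers"

  # sketch only: task names and default models are taken from the diff below
  unmasker = Informers.pipeline("fill-mask")                    # Xenova/bert-base-uncased
  unmasker.("Paris is the [MASK] of France.", top_k: 3)

  classifier = Informers.pipeline("zero-shot-classification")  # Xenova/distilbert-base-uncased-mnli
  classifier.("I love Ruby", ["programming", "cooking", "sports"])

  captioner = Informers.pipeline("image-to-text")               # Xenova/vit-gpt2-image-captioning
  captioner.("image.jpg")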
data/lib/informers/pipelines.rb
CHANGED
@@ -7,6 +7,40 @@ module Informers
       @tokenizer = tokenizer
       @processor = processor
     end
+
+    private
+
+    def prepare_images(images)
+      if !images.is_a?(Array)
+        images = [images]
+      end
+
+      # Possibly convert any non-images to images
+      images.map { |x| Utils::RawImage.read(x) }
+    end
+
+    def prepare_audios(audios, sampling_rate)
+      if !audios.is_a?(Array)
+        audios = [audios]
+      end
+
+      audios.map do |x|
+        if x.is_a?(String) || x.is_a?(URI)
+          Utils.read_audio(x, sampling_rate)
+        else
+          x
+        end
+      end
+    end
+
+    def get_bounding_box(box, as_integer)
+      if as_integer
+        box = box.map { |x| x.to_i }
+      end
+      xmin, ymin, xmax, ymax = box
+
+      {xmin:, ymin:, xmax:, ymax:}
+    end
   end
 
   class TextClassificationPipeline < Pipeline
@@ -21,13 +55,13 @@ module Informers
       outputs = @model.(model_inputs)
 
       function_to_apply =
-        if @model.config
+        if @model.config[:problem_type] == "multi_label_classification"
           ->(batch) { Utils.sigmoid(batch) }
         else
           ->(batch) { Utils.softmax(batch) } # single_label_classification (default)
         end
 
-      id2label = @model.config
+      id2label = @model.config[:id2label]
 
       to_return = []
       outputs.logits.each do |batch|
@@ -70,7 +104,7 @@ module Informers
       outputs = @model.(model_inputs)
 
       logits = outputs.logits
-      id2label = @model.config
+      id2label = @model.config[:id2label]
 
      to_return = []
       logits.length.times do |i|
@@ -243,6 +277,547 @@ module Informers
     end
   end
 
+  class FillMaskPipeline < Pipeline
+    def call(texts, top_k: 5)
+      model_inputs = @tokenizer.(texts, padding: true, truncation: true)
+      outputs = @model.(model_inputs)
+
+      to_return = []
+      model_inputs[:input_ids].each_with_index do |ids, i|
+        mask_token_index = ids.index(@tokenizer.mask_token_id)
+
+        if mask_token_index.nil?
+          raise ArgumentError, "Mask token (#{@tokenizer.mask_token}) not found in text."
+        end
+        logits = outputs.logits[i]
+        item_logits = logits[mask_token_index]
+
+        scores = Utils.get_top_items(Utils.softmax(item_logits), top_k)
+
+        to_return <<
+          scores.map do |x|
+            sequence = ids.dup
+            sequence[mask_token_index] = x[0]
+
+            {
+              score: x[1],
+              token: x[0],
+              token_str: @tokenizer.id_to_token(x[0]),
+              sequence: @tokenizer.decode(sequence, skip_special_tokens: true)
+            }
+          end
+      end
+      texts.is_a?(Array) ? to_return : to_return[0]
+    end
+  end
+
+  class Text2TextGenerationPipeline < Pipeline
+    KEY = :generated_text
+
+    def call(texts, **generate_kwargs)
+      if !texts.is_a?(Array)
+        texts = [texts]
+      end
+
+      # Add global prefix, if present
+      if @model.config[:prefix]
+        texts = texts.map { |x| @model.config[:prefix] + x }
+      end
+
+      # Handle task specific params:
+      task_specific_params = @model.config[:task_specific_params]
+      if task_specific_params && task_specific_params[@task]
+        # Add prefixes, if present
+        if task_specific_params[@task]["prefix"]
+          texts = texts.map { |x| task_specific_params[@task]["prefix"] + x }
+        end
+
+        # TODO update generation config
+      end
+
+      tokenizer = @tokenizer
+      tokenizer_options = {
+        padding: true,
+        truncation: true
+      }
+      if is_a?(TranslationPipeline) && tokenizer.respond_to?(:_build_translation_inputs)
+        input_ids = tokenizer._build_translation_inputs(texts, tokenizer_options, generate_kwargs)[:input_ids]
+      else
+        input_ids = tokenizer.(texts, **tokenizer_options)[:input_ids]
+      end
+
+      output_token_ids = @model.generate(input_ids, generate_kwargs)
+
+      tokenizer.batch_decode(output_token_ids, skip_special_tokens: true)
+        .map { |text| {self.class.const_get(:KEY) => text} }
+    end
+  end
+
+  class SummarizationPipeline < Text2TextGenerationPipeline
+    KEY = :summary_text
+  end
+
+  class TranslationPipeline < Text2TextGenerationPipeline
+    KEY = :translation_text
+  end
+
+  class TextGenerationPipeline < Pipeline
+    def call(texts, **generate_kwargs)
+      is_batched = false
+      is_chat_input = false
+
+      # Normalize inputs
+      if texts.is_a?(String)
+        texts = [texts]
+        inputs = texts
+      else
+        raise Todo
+      end
+
+      # By default, do not add special tokens
+      add_special_tokens = generate_kwargs[:add_special_tokens] || false
+
+      # /By default, return full text
+      return_full_text =
+        if is_chat_input
+          false
+        else
+          generate_kwargs[:return_full_text] || true
+        end
+
+      @tokenizer.padding_side = "left"
+      input_ids, attention_mask =
+        @tokenizer.(inputs, add_special_tokens:, padding: true, truncation: true)
+          .values_at(:input_ids, :attention_mask)
+
+      output_token_ids =
+        @model.generate(
+          input_ids, generate_kwargs, nil, inputs_attention_mask: attention_mask
+        )
+
+      decoded = @tokenizer.batch_decode(output_token_ids, skip_special_tokens: true)
+
+      if !return_full_text && Utils.dims(input_ids)[-1] > 0
+        prompt_lengths = @tokenizer.batch_decode(input_ids, skip_special_tokens: true).map { |x| x.length }
+      end
+
+      to_return = Array.new(texts.length) { [] }
+      decoded.length.times do |i|
+        text_index = (i / output_token_ids.length.to_i * texts.length).floor
+
+        if prompt_lengths
+          raise Todo
+        end
+        # TODO is_chat_input
+        to_return[text_index] << {
+          generated_text: decoded[i]
+        }
+      end
+      !is_batched && to_return.length == 1 ? to_return[0] : to_return
+    end
+  end
+
+  class ZeroShotClassificationPipeline < Pipeline
+    def initialize(**options)
+      super(**options)
+
+      @label2id = @model.config[:label2id].transform_keys(&:downcase)
+
+      @entailment_id = @label2id["entailment"]
+      if @entailment_id.nil?
+        warn "Could not find 'entailment' in label2id mapping. Using 2 as entailment_id."
+        @entailment_id = 2
+      end
+
+      @contradiction_id = @label2id["contradiction"] || @label2id["not_entailment"]
+      if @contradiction_id.nil?
+        warn "Could not find 'contradiction' in label2id mapping. Using 0 as contradiction_id."
+        @contradiction_id = 0
+      end
+    end
+
+    def call(texts, candidate_labels, hypothesis_template: "This example is {}.", multi_label: false)
+      is_batched = texts.is_a?(Array)
+      if !is_batched
+        texts = [texts]
+      end
+      if !candidate_labels.is_a?(Array)
+        candidate_labels = [candidate_labels]
+      end
+
+      # Insert labels into hypothesis template
+      hypotheses = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # How to perform the softmax over the logits:
+      # - true: softmax over the entailment vs. contradiction dim for each label independently
+      # - false: softmax the "entailment" logits over all candidate labels
+      softmax_each = multi_label || candidate_labels.length == 1
+
+      to_return = []
+      texts.each do |premise|
+        entails_logits = []
+
+        hypotheses.each do |hypothesis|
+          inputs = @tokenizer.(
+            premise,
+            text_pair: hypothesis,
+            padding: true,
+            truncation: true
+          )
+          outputs = @model.(inputs)
+
+          if softmax_each
+            entails_logits << [
+              outputs.logits[0][@contradiction_id],
+              outputs.logits[0][@entailment_id]
+            ]
+          else
+            entails_logits << outputs.logits[0][@entailment_id]
+          end
+        end
+
+        scores =
+          if softmax_each
+            entails_logits.map { |x| Utils.softmax(x)[1] }
+          else
+            Utils.softmax(entails_logits)
+          end
+
+        # Sort by scores (desc) and return scores with indices
+        scores_sorted = scores.map.with_index { |x, i| [x, i] }.sort_by { |v| -v[0] }
+
+        to_return << {
+          sequence: premise,
+          labels: scores_sorted.map { |x| candidate_labels[x[1]] },
+          scores: scores_sorted.map { |x| x[0] }
+        }
+      end
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class ImageToTextPipeline < Pipeline
+    def call(images, **generate_kwargs)
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+
+      to_return = []
+      pixel_values.each do |batch|
+        batch = [batch]
+        output = @model.generate(batch, **generate_kwargs)
+        decoded = @tokenizer
+          .batch_decode(output, skip_special_tokens: true)
+          .map { |x| {generated_text: x.strip} }
+        to_return << decoded
+      end
+
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class ImageClassificationPipeline < Pipeline
+    def call(images, top_k: 1)
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+      output = @model.({pixel_values: pixel_values})
+
+      id2label = @model.config[:id2label]
+      to_return = []
+      output.logits.each do |batch|
+        scores = Utils.get_top_items(Utils.softmax(batch), top_k)
+
+        vals =
+          scores.map do |x|
+            {
+              label: id2label[x[0].to_s],
+              score: x[1]
+            }
+          end
+        if top_k == 1
+          to_return.push(*vals)
+        else
+          to_return << vals
+        end
+      end
+
+      is_batched || top_k == 1 ? to_return : to_return[0]
+    end
+  end
+
+  class ImageSegmentationPipeline < Pipeline
+    def initialize(**options)
+      super(**options)
+
+      @subtasks_mapping = {
+        "panoptic" => "post_process_panoptic_segmentation",
+        "instance" => "post_process_instance_segmentation",
+        "semantic" => "post_process_semantic_segmentation"
+      }
+    end
+
+    def call(
+      images,
+      threshold: 0.5,
+      mask_threshold: 0.5,
+      overlap_mask_area_threshold: 0.8,
+      label_ids_to_fuse: nil,
+      target_sizes: nil,
+      subtask: nil
+    )
+      is_batched = images.is_a?(Array)
+
+      if is_batched && images.length != 1
+        raise Error, "Image segmentation pipeline currently only supports a batch size of 1."
+      end
+
+      prepared_images = prepare_images(images)
+      image_sizes = prepared_images.map { |x| [x.height, x.width] }
+
+      model_inputs = @processor.(prepared_images).slice(:pixel_values, :pixel_mask)
+      output = @model.(model_inputs)
+
+      if !subtask.nil?
+        fn = @subtasks_mapping[subtask]
+      else
+        @subtasks_mapping.each do |task, func|
+          if @processor.feature_extractor.respond_to?(func)
+            fn = @processor.feature_extractor.method(func)
+            subtask = task
+            break
+          end
+        end
+      end
+
+      id2label = @model.config[:id2label]
+
+      annotation = []
+      if subtask == "panoptic" || subtask == "instance"
+        processed = fn.(
+          output,
+          threshold:,
+          mask_threshold:,
+          overlap_mask_area_threshold:,
+          label_ids_to_fuse:,
+          target_sizes: target_sizes || image_sizes, # TODO FIX?
+        )[0]
+
+        _segmentation = processed[:segmentation]
+
+        processed[:segments_info].each do |segment|
+          annotation << {
+            label: id2label[segment[:label_id].to_s],
+            score: segment[:score]
+            # TODO mask
+          }
+        end
+      elsif subtask == "semantic"
+        raise Todo
+      else
+        raise Error, "Subtask #{subtask} not supported."
+      end
+
+      annotation
+    end
+  end
+
+  class ZeroShotImageClassificationPipeline < Pipeline
+    def call(images, candidate_labels, hypothesis_template: "This is a photo of {}")
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      # Insert label into hypothesis template
+      texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # Run tokenization
+      text_inputs = @tokenizer.(texts,
+        padding: @model.config[:model_type] == "siglip" ? "max_length" : true,
+        truncation: true
+      )
+
+      # Run processor
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+
+      # Run model with both text and pixel inputs
+      output = @model.(text_inputs.merge(pixel_values: pixel_values))
+
+      function_to_apply =
+        if @model.config[:model_type] == "siglip"
+          ->(batch) { Utils.sigmoid(batch) }
+        else
+          ->(batch) { Utils.softmax(batch) }
+        end
+
+      # Compare each image with each candidate label
+      to_return = []
+      output[0].each do |batch|
+        # Compute softmax per image
+        probs = function_to_apply.(batch)
+
+        result = probs
+          .map.with_index { |x, i| {label: candidate_labels[i], score: x} }
+          .sort_by { |v| -v[:score] }
+
+        to_return << result
+      end
+
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class ObjectDetectionPipeline < Pipeline
+    def call(images, threshold: 0.9, percentage: false)
+      is_batched = images.is_a?(Array)
+
+      if is_batched && images.length != 1
+        raise Error, "Object detection pipeline currently only supports a batch size of 1."
+      end
+      prepared_images = prepare_images(images)
+
+      image_sizes = percentage ? nil : prepared_images.map { |x| [x.height, x.width] }
+
+      model_inputs = @processor.(prepared_images).slice(:pixel_values, :pixel_mask)
+      output = @model.(model_inputs)
+
+      processed = @processor.feature_extractor.post_process_object_detection(output, threshold, image_sizes)
+
+      # Add labels
+      id2label = @model.config[:id2label]
+
+      # Format output
+      result =
+        processed.map do |batch|
+          batch[:boxes].map.with_index do |box, i|
+            {
+              label: id2label[batch[:classes][i].to_s],
+              score: batch[:scores][i],
+              box: get_bounding_box(box, !percentage)
+            }
+          end.sort_by { |v| -v[:score] }
+        end
+
+      is_batched ? result : result[0]
+    end
+  end
+
+  class ZeroShotObjectDetectionPipeline < Pipeline
+    def call(
+      images,
+      candidate_labels,
+      threshold: 0.1,
+      top_k: nil,
+      percentage: false
+    )
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      # Run tokenization
+      text_inputs = @tokenizer.(candidate_labels,
+        padding: true,
+        truncation: true
+      )
+
+      # Run processor
+      model_inputs = @processor.(prepared_images)
+
+      # Since non-maximum suppression is performed for exporting, we need to
+      # process each image separately. For more information, see:
+      # https://github.com/huggingface/optimum/blob/e3b7efb1257c011db907ef40ab340e795cc5684c/optimum/exporters/onnx/model_configs.py#L1028-L1032
+      to_return = []
+      prepared_images.length.times do |i|
+        image = prepared_images[i]
+        image_size = percentage ? nil : [[image.height, image.width]]
+        pixel_values = [model_inputs[:pixel_values][i]]
+
+        # Run model with both text and pixel inputs
+        output = @model.(text_inputs.merge(pixel_values: pixel_values))
+        # TODO remove
+        output = @model.instance_variable_get(:@session).outputs.map { |v| v[:name].to_sym }.zip(output).to_h
+
+        processed = @processor.feature_extractor.post_process_object_detection(output, threshold, image_size, true)[0]
+        result =
+          processed[:boxes].map.with_index do |box, i|
+            {
+              label: candidate_labels[processed[:classes][i]],
+              score: processed[:scores][i],
+              box: get_bounding_box(box, !percentage)
+            }
+          end
+        result.sort_by! { |v| -v[:score] }
+        if !top_k.nil?
+          result = result[0...topk]
+        end
+        to_return << result
+      end
+
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class DocumentQuestionAnsweringPipeline < Pipeline
+    def call(image, question, **generate_kwargs)
+      # NOTE: For now, we only support a batch size of 1
+
+      # Preprocess image
+      prepared_image = prepare_images(image)[0]
+      pixel_values = @processor.(prepared_image)[:pixel_values]
+
+      # Run tokenization
+      task_prompt = "<s_docvqa><s_question>#{question}</s_question><s_answer>"
+      decoder_input_ids =
+        @tokenizer.(
+          task_prompt,
+          add_special_tokens: false,
+          padding: true,
+          truncation: true
+        )[:input_ids]
+
+      # Run model
+      output =
+        @model.generate(
+          pixel_values,
+          generate_kwargs.merge(
+            decoder_input_ids: decoder_input_ids[0],
+            max_length: @model.config["decoder"]["max_position_embeddings"]
+          ).transform_keys(&:to_s)
+        )
+
+      # Decode output
+      decoded = @tokenizer.batch_decode(output, skip_special_tokens: false)[0]
+
+      # Parse answer
+      match = decoded.match(/<s_answer>(.*?)<\/s_answer>/)
+      answer = nil
+      if match && match.length >= 2
+        answer = match[1].strip
+      end
+      [{answer:}]
+    end
+  end
+
+  class TextToAudioPipeline < Pipeline
+    DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan"
+
+    def initialize(**options)
+      super(**options)
+
+      # TODO: Find a better way for `pipeline` to set the default vocoder
+      @vocoder = options[:vocoder]
+    end
+
+    def call(text_inputs, speaker_embeddings: nil)
+      # If this.processor is not set, we are using a `AutoModelForTextToWaveform` model
+      if @processor
+        call_text_to_spectrogram(text_inputs, speaker_embeddings:)
+      else
+        call_text_to_waveform(text_inputs)
+      end
+    end
+  end
+
   class FeatureExtractionPipeline < Pipeline
     def call(
       texts,
@@ -262,7 +837,7 @@ module Informers
       if !model_output.nil?
         model_options[:output_names] = Array(model_output)
       elsif @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
-        # optimization for sentence-transformers/all-MiniLM-L6-v2
+        # optimization for previous revision of sentence-transformers/all-MiniLM-L6-v2
         model_options[:output_names] = ["sentence_embedding"]
         pooling = "none"
         normalize = false
@@ -306,6 +881,164 @@ module Informers
     end
   end
 
+  class ImageFeatureExtractionPipeline < Pipeline
+    def call(images)
+      prepared_images = prepare_images(images)
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+      outputs = @model.({pixel_values: pixel_values})
+
+      result = outputs[0]
+      result
+    end
+  end
+
+  class AudioClassificationPipeline < Pipeline
+    def call(audio, top_k: nil)
+      single = !audio.is_a?(Array)
+
+      sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+      prepared_audios = prepare_audios(audio, sampling_rate)
+
+      id2label = @model.config[:id2label]
+
+      to_return = []
+      prepared_audios.each do |aud|
+        inputs = @processor.(aud)
+        output = @model.(inputs)
+        logits = output.logits[0]
+
+        scores = Utils.get_top_items(Utils.softmax(logits), top_k)
+
+        vals =
+          scores.map do |x|
+            {
+              label: id2label[x[0].to_s],
+              score: x[1]
+            }
+          end
+
+        if top_k == 1
+          to_return.concat(vals)
+        else
+          to_return << vals
+        end
+      end
+      !single || top_k == 1 ? to_return : to_return[0]
+    end
+  end
+
+  class ZeroShotAudioClassificationPipeline < Pipeline
+    def call(audio, candidate_labels, hypothesis_template: "This is a sound of {}.")
+      single = !audio.is_a?(Array)
+      if single
+        audio = [audio]
+      end
+
+      # Insert label into hypothesis template
+      texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # Run tokenization
+      text_inputs =
+        @tokenizer.(
+          texts,
+          padding: true,
+          truncation: true
+        )
+
+      sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+      prepared_audios = prepare_audios(audio, sampling_rate)
+
+      to_return = []
+      prepared_audios.each do |aud|
+        audio_inputs = @processor.(aud)
+
+        # Run model with both text and audio inputs
+        output = @model.(text_inputs.merge(audio_inputs))
+
+        # Compute softmax per audio
+        probs = Utils.softmax(output.logits_per_audio.data)
+
+        to_return <<
+          probs.map.with_index do |x, i|
+            {
+              label: candidate_labels[i],
+              score: x
+            }
+          end
+      end
+      single ? to_return[0] : to_return
+    end
+  end
+
+  class AutomaticSpeechRecognitionPipeline < Pipeline
+    def call(audio, **kwargs)
+      case @model.config["model_type"]
+      when "whisper"
+        call_whisper(audio, **kwargs)
+      else
+        raise Error, "AutomaticSpeechRecognitionPipeline does not support model type '#{@model.config["model_type"]}'."
+      end
+    end
+
+    private
+
+    def call_whisper(audio, **kwargs)
+      raise Todo
+    end
+  end
+
+  class ImageToImagePipeline < Pipeline
+    def call(images)
+      prepared_images = prepare_images(images)
+      inputs = @processor.(prepared_images)
+      outputs = @model.(inputs)
+
+      to_return = []
+      outputs[0].each do |batch|
+        # TODO flatten first
+        output =
+          batch.map do |v|
+            v.map do |v2|
+              v2.map do |v3|
+                (v3.clamp(0, 1) * 255).round
+              end
+            end
+          end
+        to_return << Utils::RawImage.from_array(output).image
+      end
+
+      to_return.length > 1 ? to_return : to_return[0]
+    end
+  end
+
+  class DepthEstimationPipeline < Pipeline
+    def call(images)
+      prepared_images = prepare_images(images)
+
+      inputs = @processor.(prepared_images)
+      predicted_depth = @model.(inputs)[0]
+
+      to_return = []
+      prepared_images.length.times do |i|
+        prediction = Utils.interpolate(predicted_depth[i], prepared_images[i].size.reverse, "bilinear", false)
+        max_prediction = Utils.max(prediction.flatten)[0]
+        formatted =
+          prediction.map do |v|
+            v.map do |v2|
+              v2.map do |v3|
+                (v3 * 255 / max_prediction).round
+              end
+            end
+          end
+        to_return << {
+          predicted_depth: predicted_depth[i],
+          depth: Utils::RawImage.from_array(formatted).image
+        }
+      end
+      to_return.length > 1 ? to_return : to_return[0]
+    end
+  end
+
   class EmbeddingPipeline < FeatureExtractionPipeline
     def call(
       texts,
@@ -375,6 +1108,186 @@ module Informers
       },
       type: "text"
     },
+    "fill-mask" => {
+      tokenizer: AutoTokenizer,
+      pipeline: FillMaskPipeline,
+      model: AutoModelForMaskedLM,
+      default: {
+        model: "Xenova/bert-base-uncased"
+      },
+      type: "text"
+    },
+    "summarization" => {
+      tokenizer: AutoTokenizer,
+      pipeline: SummarizationPipeline,
+      model: AutoModelForSeq2SeqLM,
+      default: {
+        model: "Xenova/distilbart-cnn-6-6"
+      },
+      type: "text"
+    },
+    "translation" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TranslationPipeline,
+      model: AutoModelForSeq2SeqLM,
+      default: {
+        model: "Xenova/t5-small"
+      },
+      type: "text"
+    },
+    "text2text-generation" => {
+      tokenizer: AutoTokenizer,
+      pipeline: Text2TextGenerationPipeline,
+      model: AutoModelForSeq2SeqLM,
+      default: {
+        model: "Xenova/flan-t5-small"
+      },
+      type: "text"
+    },
+    "text-generation" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TextGenerationPipeline,
+      model: AutoModelForCausalLM,
+      default: {
+        model: "Xenova/gpt2"
+      },
+      type: "text"
+    },
+    "zero-shot-classification" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ZeroShotClassificationPipeline,
+      model: AutoModelForSequenceClassification,
+      default: {
+        model: "Xenova/distilbert-base-uncased-mnli"
+      },
+      type: "text"
+    },
+    "audio-classification" => {
+      pipeline: AudioClassificationPipeline,
+      model: AutoModelForAudioClassification,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/wav2vec2-base-superb-ks"
+      },
+      type: "audio"
+    },
+    # TODO
+    # "zero-shot-audio-classification" => {
+    #   tokenizer: AutoTokenizer,
+    #   pipeline: ZeroShotAudioClassificationPipeline,
+    #   model: AutoModel,
+    #   processor: AutoProcessor,
+    #   default: {
+    #     model: "Xenova/clap-htsat-unfused"
+    #   },
+    #   type: "multimodal"
+    # },
+    # TODO
+    # "automatic-speech-recognition" => {
+    #   tokenizer: AutoTokenizer,
+    #   pipeline: AutomaticSpeechRecognitionPipeline,
+    #   model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
+    #   processor: AutoProcessor,
+    #   default: {
+    #     model: "Xenova/whisper-tiny.en"
+    #   },
+    #   type: "multimodal"
+    # },
+    "text-to-audio" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TextToAudioPipeline,
+      model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
+      processor: [AutoProcessor, nil],
+      default: {
+        model: "Xenova/speecht5_tts"
+      },
+      type: "text"
+    },
+    "image-to-text" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ImageToTextPipeline,
+      model: AutoModelForVision2Seq,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/vit-gpt2-image-captioning"
+      },
+      type: "multimodal"
+    },
+    "image-classification" => {
+      pipeline: ImageClassificationPipeline,
+      model: AutoModelForImageClassification,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/vit-base-patch16-224"
+      },
+      type: "multimodal"
+    },
+    "image-segmentation" => {
+      pipeline: ImageSegmentationPipeline,
+      model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/detr-resnet-50-panoptic"
+      },
+      type: "multimodal"
+    },
+    "zero-shot-image-classification" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ZeroShotImageClassificationPipeline,
+      model: AutoModel,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/clip-vit-base-patch32"
+      },
+      type: "multimodal"
+    },
+    "object-detection" => {
+      pipeline: ObjectDetectionPipeline,
+      model: AutoModelForObjectDetection,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/detr-resnet-50"
+      },
+      type: "multimodal"
+    },
+    "zero-shot-object-detection" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ZeroShotObjectDetectionPipeline,
+      model: AutoModelForZeroShotObjectDetection,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/owlvit-base-patch32"
+      },
+      type: "multimodal"
+    },
+    "document-question-answering" => {
+      tokenizer: AutoTokenizer,
+      pipeline: DocumentQuestionAnsweringPipeline,
+      model: AutoModelForDocumentQuestionAnswering,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/donut-base-finetuned-docvqa"
+      },
+      type: "multimodal"
+    },
+    "image-to-image" => {
+      pipeline: ImageToImagePipeline,
+      model: AutoModelForImageToImage,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/swin2SR-classical-sr-x2-64"
+      },
+      type: "image"
+    },
+    "depth-estimation" => {
+      pipeline: DepthEstimationPipeline,
+      model: AutoModelForDepthEstimation,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/dpt-large"
+      },
+      type: "image"
+    },
     "feature-extraction" => {
       tokenizer: AutoTokenizer,
       pipeline: FeatureExtractionPipeline,
@@ -384,6 +1297,15 @@ module Informers
       },
       type: "text"
     },
+    "image-feature-extraction" => {
+      processor: AutoProcessor,
+      pipeline: ImageFeatureExtractionPipeline,
+      model: [AutoModelForImageFeatureExtraction, AutoModel],
+      default: {
+        model: "Xenova/vit-base-patch16-224"
+      },
+      type: "image"
+    },
     "embedding" => {
       tokenizer: AutoTokenizer,
       pipeline: EmbeddingPipeline,
@@ -406,7 +1328,8 @@ module Informers
 
   TASK_ALIASES = {
     "sentiment-analysis" => "text-classification",
-    "ner" => "token-classification"
+    "ner" => "token-classification",
+    "text-to-speech" => "text-to-audio"
   }
 
   DEFAULT_PROGRESS_CALLBACK = lambda do |msg|
@@ -439,14 +1362,14 @@ module Informers
       revision: "main",
       model_file_name: nil
     )
+      # Apply aliases
+      task = TASK_ALIASES[task] || task
+
       if quantized == NO_DEFAULT
         # TODO move default to task class
-        quantized =
+        quantized = ["text-classification", "token-classification", "question-answering", "feature-extraction"].include?(task)
       end
 
-      # Apply aliases
-      task = TASK_ALIASES[task] || task
-
       # Get pipeline info
       pipeline_info = SUPPORTED_TASKS[task.split("_", 1)[0]]
       if !pipeline_info
@@ -479,7 +1402,8 @@ module Informers
       results = load_items(classes, model, pretrained_options)
       results[:task] = task
 
-
+      # for previous revision of sentence-transformers/all-MiniLM-L6-v2
+      if model == "sentence-transformers/all-MiniLM-L6-v2" && results[:model].instance_variable_get(:@session).outputs.any? { |v| v[:name] == "token_embeddings" }
         results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
       end
 
@@ -502,7 +1426,15 @@ module Informers
         next if !cls
 
         if cls.is_a?(Array)
-
+          e = nil
+          cls.each do |c|
+            begin
+              result[name] = c.from_pretrained(model, **pretrained_options)
+            rescue => err
+              e = err
+            end
+          end
+          raise e unless result[name]
         else
           result[name] = cls.from_pretrained(model, **pretrained_options)
         end
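
A few user-visible behavior changes follow from the final hunks, shown here as a hedged sketch (task names, option names, and defaults are taken from the diff above; the comments describe the assumed outcomes): the "text-to-speech" alias now resolves to the new "text-to-audio" task, quantization defaults to on only for the four listed text tasks, and when a task maps to an array of model classes, load_items now tries each class in turn and re-raises the last error only if none of them loads.

  # "text-to-speech" is an alias for "text-to-audio" (default model Xenova/speecht5_tts)
  tts = Informers.pipeline("text-to-speech")

  # quantized defaults to true only for text-classification, token-classification,
  # question-answering, and feature-extraction; pass quantized: true elsewhere if wanted
  detector = Informers.pipeline("object-detection", quantized: true)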