informers 1.1.0 → 1.1.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +14 -7
- data/lib/informers/models.rb +165 -5
- data/lib/informers/pipelines.rb +180 -8
- data/lib/informers/processors.rb +62 -2
- data/lib/informers/tokenizers.rb +6 -2
- data/lib/informers/utils/audio.rb +18 -0
- data/lib/informers/utils/ffmpeg.rb +45 -0
- data/lib/informers/utils/image.rb +1 -1
- data/lib/informers/utils/math.rb +2 -2
- data/lib/informers/utils/tensor.rb +1 -1
- data/lib/informers/version.rb +1 -1
- data/lib/informers.rb +3 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a61f01755798e81a975641d60e5bfe09484ced7ce6a3453020c9978dc35b1942
+  data.tar.gz: 811f9c1dc4499ae7de8ebf8e02c0c4e98a0c0bc0af6aaca51025e42ba8165540
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 97b27363fab1e43895e368dbddc819fd4db23d42ce517359e5971347cd902b654f0c66700f07b36cd5f476bd3ea205a91e4f7e7ee0e7d8d455f0dce377bedb2b
+  data.tar.gz: dd1a7f795609423419ce213b00a5aca409f6b4a5bffb111250b4deffcbc6a8113fadf8d603c59fa78fa0f310904a0a3299e3bcdc48101f574171a024d13567e6
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -229,19 +229,13 @@ result = model.(query, docs)
 
 ### Other
 
-You can use the feature extraction pipeline directly.
-
-```ruby
-model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quantized: false)
-embeddings = model.(sentences, pooling: "mean", normalize: true)
-```
-
 The model must include a `.onnx` file ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)). If the file is not at `onnx/model.onnx` or `onnx/model_quantized.onnx`, use the `model_file_name` option to specify the location.
 
 ## Pipelines
 
 - [Text](#text)
 - [Vision](#vision)
+- [Audio](#audio)
 - [Multimodal](#multimodal)
 
 ### Text

@@ -332,6 +326,8 @@ extractor.("We are very happy to show you the 🤗 Transformers library.")
 
 ### Vision
 
+Note: [ruby-vips](https://github.com/libvips/ruby-vips) is required to load images
+
 Image classification
 
 ```ruby

@@ -388,6 +384,17 @@ extractor = Informers.pipeline("image-feature-extraction")
 extractor.("image.jpg")
 ```
 
+### Audio
+
+Note: [ffmpeg](https://www.ffmpeg.org/) is required to load audio files
+
+Audio classification
+
+```ruby
+classifier = Informers.pipeline("audio-classification")
+classifier.("audio.wav")
+```
+
 ### Multimodal
 
 Image captioning
data/lib/informers/models.rb
CHANGED
@@ -84,6 +84,7 @@ module Informers
         @get_start_beams = method(:decoder_start_beams)
         @update_beam = method(:decoder_update_beam)
         @forward = method(:decoder_forward)
+
       when MODEL_TYPES[:Seq2Seq], MODEL_TYPES[:Vision2Seq]
         @can_generate = true
 

@@ -91,8 +92,10 @@ module Informers
         @get_start_beams = method(:seq2seq_start_beams)
         @update_beam = method(:seq2seq_update_beam)
         @forward = method(:seq2seq_forward)
+
       when MODEL_TYPES[:EncoderDecoder]
-
+        @forward = method(:encoder_forward)
+
       else
         @forward = method(:encoder_forward)
       end

@@ -137,10 +140,18 @@ module Informers
        ]
 
      elsif model_type == MODEL_TYPES[:MaskGeneration]
-
+        info = [
+          AutoConfig.from_pretrained(pretrained_model_name_or_path, **options),
+          construct_session(pretrained_model_name_or_path, "vision_encoder", **options),
+          construct_session(pretrained_model_name_or_path, "prompt_encoder_mask_decoder", **options)
+        ]
 
      elsif model_type == MODEL_TYPES[:EncoderDecoder]
-
+        info = [
+          AutoConfig.from_pretrained(pretrained_model_name_or_path, **options),
+          construct_session(pretrained_model_name_or_path, "encoder_model", **options),
+          construct_session(pretrained_model_name_or_path, "decoder_model_merged", **options)
+        ]
 
      else
        if model_type != MODEL_TYPES[:EncoderOnly]

@@ -293,13 +304,13 @@ module Informers
      grouped_beams = group_beams(beams)
 
      get_flattened = lambda do |key|
-        grouped_beams.
+        grouped_beams.flat_map do |batch|
          if generation_config["num_return_sequences"] > 1
            raise Todo
          else
            [batch[0][key]]
          end
-        end
+        end
      end
 
      sequences = get_flattened.(:output_token_ids) # [1, seqLength]

@@ -904,6 +915,18 @@ module Informers
     end
   end
 
+  class Wav2Vec2PreTrainedModel < PreTrainedModel
+  end
+
+  class Wav2Vec2Model < Wav2Vec2PreTrainedModel
+  end
+
+  class Wav2Vec2ForSequenceClassification < Wav2Vec2PreTrainedModel
+    def call(model_inputs)
+      SequenceClassifierOutput.new(*super(model_inputs))
+    end
+  end
+
   class RobertaPreTrainedModel < PreTrainedModel
   end
 

@@ -1066,6 +1089,62 @@ module Informers
   class DonutSwinModel < DonutSwinPreTrainedModel
   end
 
+  class WhisperPreTrainedModel < PreTrainedModel
+  end
+
+  class WhisperModel < WhisperPreTrainedModel
+  end
+
+  class WhisperForConditionalGeneration < WhisperPreTrainedModel
+    REQUIRES_ATTENTION_MASK = false
+    MAIN_INPUT_NAME = :input_features
+
+    def initialize(config, session, decoder_merged_session, generation_config)
+      super(config, session)
+      @decoder_merged_session = decoder_merged_session
+      @generation_config = generation_config
+
+      @num_decoder_layers = @config["decoder_layers"]
+      @num_decoder_heads = @config["decoder_attention_heads"]
+      @decoder_dim_kv = @config["d_model"] / @num_decoder_heads.to_f
+
+      @num_encoder_layers = @config["encoder_layers"]
+      @num_encoder_heads = @config["encoder_attention_heads"]
+      @encoder_dim_kv = @config["d_model"] / @num_encoder_heads.to_f
+    end
+
+    def generate(inputs, generation_config = nil, logits_processor = nil)
+      raise Todo
+    end
+  end
+
+  class VitsPreTrainedModel < PreTrainedModel
+  end
+
+  class VitsModel < VitsPreTrainedModel
+    def call(model_inputs)
+      VitsModelOutput.new(*super(model_inputs))
+    end
+  end
+
+  class SpeechT5PreTrainedModel < PreTrainedModel
+  end
+
+  class SpeechT5Model < SpeechT5PreTrainedModel
+  end
+
+  class SpeechT5ForSpeechToText < SpeechT5PreTrainedModel
+  end
+
+  class SpeechT5ForTextToSpeech < SpeechT5PreTrainedModel
+  end
+
+  class ClapPreTrainedModel < PreTrainedModel
+  end
+
+  class ClapModel < ClapPreTrainedModel
+  end
+
   MODEL_MAPPING_NAMES_ENCODER_ONLY = {
     "bert" => ["BertModel", BertModel],
     "nomic_bert" => ["NomicBertModel", NomicBertModel],

@@ -1074,6 +1153,7 @@ module Informers
     "distilbert" => ["DistilBertModel", DistilBertModel],
     "roberta" => ["RobertaModel", RobertaModel],
     "xlm-roberta" => ["XLMRobertaModel", XLMRobertaModel],
+    "clap" => ["ClapModel", ClapModel],
     "clip" => ["CLIPModel", CLIPModel],
     "detr" => ["DetrModel", DetrModel],
     "vit" => ["ViTModel", ViTModel],

@@ -1085,6 +1165,21 @@ module Informers
     "bart" => ["BartModel", BartModel]
   }
 
+  MODEL_MAPPING_NAMES_DECODER_ONLY = {
+  }
+
+  MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = {
+    "whisper" => ["WhisperForConditionalGeneration", WhisperForConditionalGeneration]
+  }
+
+  MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = {
+    "speecht5" => ["SpeechT5ForTextToSpeech", SpeechT5ForTextToSpeech]
+  }
+
+  MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = {
+    "vits" => ["VitsModel", VitsModel]
+  }
+
   MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = {
     "bert" => ["BertForSequenceClassification", BertForSequenceClassification],
     "distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification],

@@ -1143,6 +1238,25 @@ module Informers
   MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = {
   }
 
+  MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = {
+  }
+
+  MODEL_FOR_CTC_MAPPING_NAMES = {
+  }
+
+  MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = {
+    "wav2vec2" => ["Wav2Vec2ForSequenceClassification", Wav2Vec2ForSequenceClassification]
+  }
+
+  MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = {
+  }
+
+  MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = {
+  }
+
+  MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = {
+  }
+
   MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = {
     "swin2sr" => ["Swin2SRForImageSuperResolution", Swin2SRForImageSuperResolution]
   }

@@ -1157,9 +1271,11 @@ module Informers
   MODEL_CLASS_TYPE_MAPPING = [
     [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES[:EncoderOnly]],
     [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES[:EncoderDecoder]],
+    [MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES[:DecoderOnly]],
     [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
+    [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
     [MODEL_WITH_LM_HEAD_MAPPING_NAMES, MODEL_TYPES[:DecoderOnly]],
     [MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],

@@ -1167,10 +1283,18 @@ module Informers
     [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+    [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+    [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_TYPES[:MaskGeneration]],
+    [MODEL_FOR_CTC_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+    [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+    [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
+    [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+    [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+    [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]]
   ]
 

@@ -1199,6 +1323,18 @@ module Informers
     MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES]
   end
 
+  class AutoModelForSpeechSeq2Seq < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES]
+  end
+
+  class AutoModelForTextToSpectrogram < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES]
+  end
+
+  class AutoModelForTextToWaveform < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES]
+  end
+
   class AutoModelForCausalLM < PretrainedMixin
     MODEL_CLASS_MAPPINGS = [MODEL_WITH_LM_HEAD_MAPPING_NAMES]
   end

@@ -1235,10 +1371,34 @@ module Informers
     MODEL_CLASS_MAPPINGS = [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES]
   end
 
+  class AutoModelForMaskGeneration < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES]
+  end
+
+  class AutoModelForCTC < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_CTC_MAPPING_NAMES]
+  end
+
+  class AutoModelForAudioClassification < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES]
+  end
+
+  class AutoModelForXVector < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES]
+  end
+
+  class AutoModelForAudioFrameClassification < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES]
+  end
+
   class AutoModelForDocumentQuestionAnswering < PretrainedMixin
     MODEL_CLASS_MAPPINGS = [MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES]
   end
 
+  class AutoModelForImageMatting < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES]
+  end
+
   class AutoModelForImageToImage < PretrainedMixin
     MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES]
   end
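For orientation, the mapping constants added above drive the `AutoModel*` lookups: a config's `model_type` selects a concrete class. Below is a minimal, hypothetical sketch of that idea with simplified names; the real resolution in the gem goes through `PretrainedMixin` and `MODEL_CLASS_TYPE_MAPPING`, not this code.

```ruby
# Hypothetical, simplified illustration only (names are not the library's).
# A "model_type" string from config.json picks the model class name.
AUDIO_CLASSIFICATION_MAPPING = {
  "wav2vec2" => "Wav2Vec2ForSequenceClassification"
}

config = { "model_type" => "wav2vec2" } # e.g. parsed from config.json
model_class_name = AUDIO_CLASSIFICATION_MAPPING.fetch(config["model_type"]) do
  raise "Unsupported model type: #{config["model_type"]}"
end
puts model_class_name # => Wav2Vec2ForSequenceClassification
```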
data/lib/informers/pipelines.rb
CHANGED
@@ -19,6 +19,20 @@ module Informers
       images.map { |x| Utils::RawImage.read(x) }
     end
 
+    def prepare_audios(audios, sampling_rate)
+      if !audios.is_a?(Array)
+        audios = [audios]
+      end
+
+      audios.map do |x|
+        if x.is_a?(String) || x.is_a?(URI)
+          Utils.read_audio(x, sampling_rate)
+        else
+          x
+        end
+      end
+    end
+
     def get_bounding_box(box, as_integer)
       if as_integer
         box = box.map { |x| x.to_i }

@@ -729,7 +743,7 @@ module Informers
          {
            label: candidate_labels[processed[:classes][i]],
            score: processed[:scores][i],
-            box: get_bounding_box(box, !percentage)
+            box: get_bounding_box(box, !percentage)
          }
        end
      result.sort_by! { |v| -v[:score] }

@@ -784,6 +798,26 @@ module Informers
     end
   end
 
+  class TextToAudioPipeline < Pipeline
+    DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan"
+
+    def initialize(**options)
+      super(**options)
+
+      # TODO: Find a better way for `pipeline` to set the default vocoder
+      @vocoder = options[:vocoder]
+    end
+
+    def call(text_inputs, speaker_embeddings: nil)
+      # If this.processor is not set, we are using a `AutoModelForTextToWaveform` model
+      if @processor
+        call_text_to_spectrogram(text_inputs, speaker_embeddings:)
+      else
+        call_text_to_waveform(text_inputs)
+      end
+    end
+  end
+
   class FeatureExtractionPipeline < Pipeline
     def call(
       texts,

@@ -803,7 +837,7 @@ module Informers
       if !model_output.nil?
         model_options[:output_names] = Array(model_output)
       elsif @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
-        # optimization for sentence-transformers/all-MiniLM-L6-v2
+        # optimization for previous revision of sentence-transformers/all-MiniLM-L6-v2
         model_options[:output_names] = ["sentence_embedding"]
         pooling = "none"
         normalize = false

@@ -858,11 +892,106 @@ module Informers
     end
   end
 
+  class AudioClassificationPipeline < Pipeline
+    def call(audio, top_k: nil)
+      single = !audio.is_a?(Array)
+
+      sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+      prepared_audios = prepare_audios(audio, sampling_rate)
+
+      id2label = @model.config[:id2label]
+
+      to_return = []
+      prepared_audios.each do |aud|
+        inputs = @processor.(aud)
+        output = @model.(inputs)
+        logits = output.logits[0]
+
+        scores = Utils.get_top_items(Utils.softmax(logits), top_k)
+
+        vals =
+          scores.map do |x|
+            {
+              label: id2label[x[0].to_s],
+              score: x[1]
+            }
+          end
+
+        if top_k == 1
+          to_return.concat(vals)
+        else
+          to_return << vals
+        end
+      end
+      !single || top_k == 1 ? to_return : to_return[0]
+    end
+  end
+
+  class ZeroShotAudioClassificationPipeline < Pipeline
+    def call(audio, candidate_labels, hypothesis_template: "This is a sound of {}.")
+      single = !audio.is_a?(Array)
+      if single
+        audio = [audio]
+      end
+
+      # Insert label into hypothesis template
+      texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # Run tokenization
+      text_inputs =
+        @tokenizer.(
+          texts,
+          padding: true,
+          truncation: true
+        )
+
+      sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+      prepared_audios = prepare_audios(audio, sampling_rate)
+
+      to_return = []
+      prepared_audios.each do |aud|
+        audio_inputs = @processor.(aud)
+
+        # Run model with both text and audio inputs
+        output = @model.(text_inputs.merge(audio_inputs))
+
+        # Compute softmax per audio
+        probs = Utils.softmax(output.logits_per_audio.data)
+
+        to_return <<
+          probs.map.with_index do |x, i|
+            {
+              label: candidate_labels[i],
+              score: x
+            }
+          end
+      end
+      single ? to_return[0] : to_return
+    end
+  end
+
+  class AutomaticSpeechRecognitionPipeline < Pipeline
+    def call(audio, **kwargs)
+      case @model.config["model_type"]
+      when "whisper"
+        call_whisper(audio, **kwargs)
+      else
+        raise Error, "AutomaticSpeechRecognitionPipeline does not support model type '#{@model.config["model_type"]}'."
+      end
+    end
+
+    private
+
+    def call_whisper(audio, **kwargs)
+      raise Todo
+    end
+  end
+
   class ImageToImagePipeline < Pipeline
     def call(images)
       prepared_images = prepare_images(images)
       inputs = @processor.(prepared_images)
-      outputs = @model.(inputs)
+      outputs = @model.(inputs)
 
       to_return = []
       outputs[0].each do |batch|

@@ -1033,6 +1162,47 @@ module Informers
      },
      type: "text"
    },
+    "audio-classification" => {
+      pipeline: AudioClassificationPipeline,
+      model: AutoModelForAudioClassification,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/wav2vec2-base-superb-ks"
+      },
+      type: "audio"
+    },
+    # TODO
+    # "zero-shot-audio-classification" => {
+    #   tokenizer: AutoTokenizer,
+    #   pipeline: ZeroShotAudioClassificationPipeline,
+    #   model: AutoModel,
+    #   processor: AutoProcessor,
+    #   default: {
+    #     model: "Xenova/clap-htsat-unfused"
+    #   },
+    #   type: "multimodal"
+    # },
+    # TODO
+    # "automatic-speech-recognition" => {
+    #   tokenizer: AutoTokenizer,
+    #   pipeline: AutomaticSpeechRecognitionPipeline,
+    #   model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
+    #   processor: AutoProcessor,
+    #   default: {
+    #     model: "Xenova/whisper-tiny.en"
+    #   },
+    #   type: "multimodal"
+    # },
+    "text-to-audio" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TextToAudioPipeline,
+      model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
+      processor: [AutoProcessor, nil],
+      default: {
+        model: "Xenova/speecht5_tts"
+      },
+      type: "text"
+    },
    "image-to-text" => {
      tokenizer: AutoTokenizer,
      pipeline: ImageToTextPipeline,

@@ -1048,7 +1218,7 @@ module Informers
      model: AutoModelForImageClassification,
      processor: AutoProcessor,
      default: {
-        model: "Xenova/vit-base-patch16-224"
+        model: "Xenova/vit-base-patch16-224"
      },
      type: "multimodal"
    },

@@ -1057,7 +1227,7 @@ module Informers
      model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
      processor: AutoProcessor,
      default: {
-        model: "Xenova/detr-resnet-50-panoptic"
+        model: "Xenova/detr-resnet-50-panoptic"
      },
      type: "multimodal"
    },

@@ -1076,7 +1246,7 @@ module Informers
      model: AutoModelForObjectDetection,
      processor: AutoProcessor,
      default: {
-        model: "Xenova/detr-resnet-50"
+        model: "Xenova/detr-resnet-50"
      },
      type: "multimodal"
    },

@@ -1158,7 +1328,8 @@ module Informers
 
   TASK_ALIASES = {
     "sentiment-analysis" => "text-classification",
-    "ner" => "token-classification"
+    "ner" => "token-classification",
+    "text-to-speech" => "text-to-audio"
   }
 
   DEFAULT_PROGRESS_CALLBACK = lambda do |msg|

@@ -1231,7 +1402,8 @@ module Informers
     results = load_items(classes, model, pretrained_options)
     results[:task] = task
 
-
+    # for previous revision of sentence-transformers/all-MiniLM-L6-v2
+    if model == "sentence-transformers/all-MiniLM-L6-v2" && results[:model].instance_variable_get(:@session).outputs.any? { |v| v[:name] == "token_embeddings" }
       results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
     end
 
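For context, here is a hedged usage sketch of the new `audio-classification` task registered above. The default model (`Xenova/wav2vec2-base-superb-ks`) and the `{label:, score:}` result shape come from the diff; the actual labels depend on the model, and the audio file name is just a placeholder.

```ruby
require "informers"

# Uses the task default from the registry above ("Xenova/wav2vec2-base-superb-ks").
# ffmpeg must be installed so the wav file can be decoded.
classifier = Informers.pipeline("audio-classification")
results = classifier.("audio.wav", top_k: 2)

# Each entry is {label: ..., score: ...}, as built in AudioClassificationPipeline#call
results.each { |r| puts "#{r[:label]}: #{r[:score].round(3)}" }
```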
data/lib/informers/processors.rb
CHANGED
@@ -1,5 +1,7 @@
 module Informers
   class FeatureExtractor
+    attr_reader :config
+
     def initialize(config)
       super()
       @config = config

@@ -728,6 +730,61 @@ module Informers
     end
   end
 
+  class WhisperFeatureExtractor < FeatureExtractor
+    def initialize(config)
+      super(config)
+
+      raise Todo
+    end
+
+    def _extract_fbank_features(waveform)
+      raise Todo
+    end
+
+    def call(audio)
+      raise Todo
+    end
+  end
+
+  class Wav2Vec2FeatureExtractor < FeatureExtractor
+    def _zero_mean_unit_var_norm(input_values)
+      sum = input_values.sum
+      mean = sum / input_values.length.to_f
+      variance = input_values.sum { |b| (b - mean) ** 2 } / input_values.length.to_f
+      input_values.map { |x| (x - mean) / Math.sqrt(variance + 1e-7) }
+    end
+
+    def call(audio)
+      # TODO
+      # validate_audio_inputs(audio, 'Wav2Vec2FeatureExtractor')
+
+      input_values = audio
+
+      # zero-mean and unit-variance normalization
+      if @config["do_normalize"]
+        input_values = _zero_mean_unit_var_norm(input_values)
+      end
+
+      # TODO: allow user to pass in attention mask
+      {
+        input_values: [input_values],
+        attention_mask: [Array.new(input_values.length, 1)]
+      }
+    end
+  end
+
+  class ClapFeatureExtractor < FeatureExtractor
+    def initialize(config)
+      super(config)
+
+      # TODO
+    end
+
+    def call(audio, max_length: nil)
+      raise Todo
+    end
+  end
+
   class Processor
     attr_reader :feature_extractor
 

@@ -748,7 +805,10 @@ module Informers
     "DPTFeatureExtractor" => DPTFeatureExtractor,
     "DetrFeatureExtractor" => DetrFeatureExtractor,
     "Swin2SRImageProcessor" => Swin2SRImageProcessor,
-    "DonutFeatureExtractor" => DonutFeatureExtractor
+    "DonutFeatureExtractor" => DonutFeatureExtractor,
+    "WhisperFeatureExtractor" => WhisperFeatureExtractor,
+    "Wav2Vec2FeatureExtractor" => Wav2Vec2FeatureExtractor,
+    "ClapFeatureExtractor" => ClapFeatureExtractor
   }
 
   PROCESSOR_CLASS_MAPPING = {}

@@ -762,7 +822,7 @@ module Informers
      revision: "main",
      **kwargs
    )
-      preprocessor_config = config || Utils::Hub
+      preprocessor_config = config || Utils::Hub.get_model_json(pretrained_model_name_or_path, "preprocessor_config.json", true,
        progress_callback:,
        config:,
        cache_dir:,
data/lib/informers/tokenizers.rb
CHANGED
@@ -244,6 +244,9 @@ module Informers
     end
   end
 
+  class SpeechT5Tokenizer < PreTrainedTokenizer
+  end
+
   class AutoTokenizer
     TOKENIZER_CLASS_MAPPING = {
       "T5Tokenizer" => T5Tokenizer,

@@ -257,7 +260,8 @@ module Informers
       "CLIPTokenizer" => CLIPTokenizer,
       "GPT2Tokenizer" => GPT2Tokenizer,
       "NllbTokenizer" => NllbTokenizer,
-      "M2M100Tokenizer" => M2M100Tokenizer
+      "M2M100Tokenizer" => M2M100Tokenizer,
+      "SpeechT5Tokenizer" => SpeechT5Tokenizer
     }
 
     def self.from_pretrained(

@@ -296,7 +300,7 @@ module Informers
     def self.load_tokenizer(pretrained_model_name_or_path, **options)
       info = [
         Utils::Hub.get_model_file(pretrained_model_name_or_path, "tokenizer.json", true, **options),
-        Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options)
+        Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options)
       ]
 
       # Override legacy option if `options.legacy` is not null
data/lib/informers/utils/audio.rb
ADDED

@@ -0,0 +1,18 @@
+module Informers
+  module Utils
+    def self.read_audio(input, sampling_rate)
+      data =
+        if input.is_a?(URI)
+          require "open-uri"
+
+          input.read
+        elsif input.is_a?(String)
+          File.binread(input)
+        else
+          raise ArgumentError, "Unsupported input type: #{input.class.name}"
+        end
+
+      ffmpeg_read(data, sampling_rate)
+    end
+  end
+end
data/lib/informers/utils/ffmpeg.rb
ADDED

@@ -0,0 +1,45 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module Informers
+  module Utils
+    # from the Transformers Python library
+    def self.ffmpeg_read(data, sampling_rate)
+      ar = "#{sampling_rate}"
+      ac = "1"
+      format_for_conversion = "f32le"
+      ffmpeg_command = [
+        "ffmpeg",
+        "-i",
+        "pipe:0",
+        "-ac",
+        ac,
+        "-ar",
+        ar,
+        "-f",
+        format_for_conversion,
+        "-hide_banner",
+        "-loglevel",
+        "quiet",
+        "pipe:1"
+      ]
+
+      stdout, status = Open3.capture2(*ffmpeg_command, stdin_data: data)
+      if !status.success?
+        raise Error, "ffmpeg was not found but is required to load audio files from filename"
+      end
+      stdout.unpack("e*")
+    end
+  end
+end
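Together, `read_audio` and `ffmpeg_read` turn a file path or URI into mono float samples at the requested rate. A hedged usage sketch follows; it assumes `ffmpeg` is on the PATH and that a local `audio.wav` (a placeholder name) exists.

```ruby
require "informers"

# Utils.read_audio accepts a file path (String) or a URI, shells out to ffmpeg,
# and returns mono Float samples resampled to the requested rate.
samples = Informers::Utils.read_audio("audio.wav", 16_000)

puts samples.length            # duration in seconds * 16_000
puts samples.first(5).inspect  # raw sample values, roughly in [-1.0, 1.0]
```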
data/lib/informers/utils/math.rb
CHANGED
@@ -14,8 +14,8 @@ module Informers
       out_img = Array.new(out_height * out_width * in_channels)
 
       # Pre-calculate strides
-      in_stride = in_height * in_width
-      out_stride = out_height * out_width
+      in_stride = in_height * in_width
+      out_stride = out_height * out_width
 
       out_height.times do |i|
         out_width.times do |j|
data/lib/informers/version.rb
CHANGED
data/lib/informers.rb
CHANGED
@@ -6,12 +6,15 @@ require "tokenizers"
 require "io/console"
 require "json"
 require "open-uri"
+require "open3"
 require "stringio"
 require "uri"
 
 # modules
+require_relative "informers/utils/audio"
 require_relative "informers/utils/core"
 require_relative "informers/utils/generation"
+require_relative "informers/utils/ffmpeg"
 require_relative "informers/utils/hub"
 require_relative "informers/utils/image"
 require_relative "informers/utils/math"
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: informers
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.1.1
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-
+date: 2024-10-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: onnxruntime

@@ -55,7 +55,9 @@ files:
 - lib/informers/pipelines.rb
 - lib/informers/processors.rb
 - lib/informers/tokenizers.rb
+- lib/informers/utils/audio.rb
 - lib/informers/utils/core.rb
+- lib/informers/utils/ffmpeg.rb
 - lib/informers/utils/generation.rb
 - lib/informers/utils/hub.rb
 - lib/informers/utils/image.rb