informers 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +14 -7
- data/lib/informers/models.rb +165 -5
- data/lib/informers/pipelines.rb +180 -8
- data/lib/informers/processors.rb +62 -2
- data/lib/informers/tokenizers.rb +6 -2
- data/lib/informers/utils/audio.rb +18 -0
- data/lib/informers/utils/ffmpeg.rb +45 -0
- data/lib/informers/utils/image.rb +1 -1
- data/lib/informers/utils/math.rb +2 -2
- data/lib/informers/utils/tensor.rb +1 -1
- data/lib/informers/version.rb +1 -1
- data/lib/informers.rb +3 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a61f01755798e81a975641d60e5bfe09484ced7ce6a3453020c9978dc35b1942
+  data.tar.gz: 811f9c1dc4499ae7de8ebf8e02c0c4e98a0c0bc0af6aaca51025e42ba8165540
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 97b27363fab1e43895e368dbddc819fd4db23d42ce517359e5971347cd902b654f0c66700f07b36cd5f476bd3ea205a91e4f7e7ee0e7d8d455f0dce377bedb2b
+  data.tar.gz: dd1a7f795609423419ce213b00a5aca409f6b4a5bffb111250b4deffcbc6a8113fadf8d603c59fa78fa0f310904a0a3299e3bcdc48101f574171a024d13567e6
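The SHA256 values above can be checked against a locally fetched copy of the gem. A minimal sketch, assuming `informers-1.1.1.gem` has been downloaded first (for example with `gem fetch informers -v 1.1.1`); the filename and the use of `Gem::Package::TarReader` are illustrative and not part of the diff:

```ruby
require "digest"
require "rubygems/package"

# A .gem file is a tar archive containing metadata.gz and data.tar.gz,
# the two entries listed in checksums.yaml.
File.open("informers-1.1.1.gem", "rb") do |f|
  Gem::Package::TarReader.new(f) do |tar|
    tar.each do |entry|
      next unless ["metadata.gz", "data.tar.gz"].include?(entry.full_name)
      puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
    end
  end
end
```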
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -229,19 +229,13 @@ result = model.(query, docs)
 
 ### Other
 
-You can use the feature extraction pipeline directly.
-
-```ruby
-model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quantized: false)
-embeddings = model.(sentences, pooling: "mean", normalize: true)
-```
-
 The model must include a `.onnx` file ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)). If the file is not at `onnx/model.onnx` or `onnx/model_quantized.onnx`, use the `model_file_name` option to specify the location.
 
 ## Pipelines
 
 - [Text](#text)
 - [Vision](#vision)
+- [Audio](#audio)
 - [Multimodel](#multimodal)
 
 ### Text
@@ -332,6 +326,8 @@ extractor.("We are very happy to show you the 🤗 Transformers library.")
 
 ### Vision
 
+Note: [ruby-vips](https://github.com/libvips/ruby-vips) is required to load images
+
 Image classification
 
 ```ruby
@@ -388,6 +384,17 @@ extractor = Informers.pipeline("image-feature-extraction")
 extractor.("image.jpg")
 ```
 
+### Audio
+
+Note: [ffmpeg](https://www.ffmpeg.org/) is required to load audio files
+
+Audio classification
+
+```ruby
+classifier = Informers.pipeline("audio-classification")
+classifier.("audio.wav")
+```
+
 ### Multimodal
 
 Image captioning
data/lib/informers/models.rb
CHANGED
@@ -84,6 +84,7 @@ module Informers
 @get_start_beams = method(:decoder_start_beams)
 @update_beam = method(:decoder_update_beam)
 @forward = method(:decoder_forward)
+
 when MODEL_TYPES[:Seq2Seq], MODEL_TYPES[:Vision2Seq]
 @can_generate = true
 
@@ -91,8 +92,10 @@ module Informers
 @get_start_beams = method(:seq2seq_start_beams)
 @update_beam = method(:seq2seq_update_beam)
 @forward = method(:seq2seq_forward)
+
 when MODEL_TYPES[:EncoderDecoder]
-
+@forward = method(:encoder_forward)
+
 else
 @forward = method(:encoder_forward)
 end
@@ -137,10 +140,18 @@ module Informers
 ]
 
 elsif model_type == MODEL_TYPES[:MaskGeneration]
-
+info = [
+AutoConfig.from_pretrained(pretrained_model_name_or_path, **options),
+construct_session(pretrained_model_name_or_path, "vision_encoder", **options),
+construct_session(pretrained_model_name_or_path, "prompt_encoder_mask_decoder", **options)
+]
 
 elsif model_type == MODEL_TYPES[:EncoderDecoder]
-
+info = [
+AutoConfig.from_pretrained(pretrained_model_name_or_path, **options),
+construct_session(pretrained_model_name_or_path, "encoder_model", **options),
+construct_session(pretrained_model_name_or_path, "decoder_model_merged", **options)
+]
 
 else
 if model_type != MODEL_TYPES[:EncoderOnly]
@@ -293,13 +304,13 @@ module Informers
 grouped_beams = group_beams(beams)
 
 get_flattened = lambda do |key|
-grouped_beams.
+grouped_beams.flat_map do |batch|
 if generation_config["num_return_sequences"] > 1
 raise Todo
 else
 [batch[0][key]]
 end
-end
+end
 end
 
 sequences = get_flattened.(:output_token_ids) # [1, seqLength]
@@ -904,6 +915,18 @@ module Informers
 end
 end
 
+class Wav2Vec2PreTrainedModel < PreTrainedModel
+end
+
+class Wav2Vec2Model < Wav2Vec2PreTrainedModel
+end
+
+class Wav2Vec2ForSequenceClassification < Wav2Vec2PreTrainedModel
+def call(model_inputs)
+SequenceClassifierOutput.new(*super(model_inputs))
+end
+end
+
 class RobertaPreTrainedModel < PreTrainedModel
 end
 
@@ -1066,6 +1089,62 @@ module Informers
 class DonutSwinModel < DonutSwinPreTrainedModel
 end
 
+class WhisperPreTrainedModel < PreTrainedModel
+end
+
+class WhisperModel < WhisperPreTrainedModel
+end
+
+class WhisperForConditionalGeneration < WhisperPreTrainedModel
+REQUIRES_ATTENTION_MASK = false
+MAIN_INPUT_NAME = :input_features
+
+def initialize(config, session, decoder_merged_session, generation_config)
+super(config, session)
+@decoder_merged_session = decoder_merged_session
+@generation_config = generation_config
+
+@num_decoder_layers = @config["decoder_layers"]
+@num_decoder_heads = @config["decoder_attention_heads"]
+@decoder_dim_kv = @config["d_model"] / @num_decoder_heads.to_f
+
+@num_encoder_layers = @config["encoder_layers"]
+@num_encoder_heads = @config["encoder_attention_heads"]
+@encoder_dim_kv = @config["d_model"] / @num_encoder_heads.to_f
+end
+
+def generate(inputs, generation_config = nil, logits_processor = nil)
+raise Todo
+end
+end
+
+class VitsPreTrainedModel < PreTrainedModel
+end
+
+class VitsModel < VitsPreTrainedModel
+def call(model_inputs)
+VitsModelOutput.new(*super(model_inputs))
+end
+end
+
+class SpeechT5PreTrainedModel < PreTrainedModel
+end
+
+class SpeechT5Model < SpeechT5PreTrainedModel
+end
+
+class SpeechT5ForSpeechToText < SpeechT5PreTrainedModel
+end
+
+class SpeechT5ForTextToSpeech < SpeechT5PreTrainedModel
+end
+
+class ClapPreTrainedModel < PreTrainedModel
+end
+
+class ClapModel < ClapPreTrainedModel
+end
+
 MODEL_MAPPING_NAMES_ENCODER_ONLY = {
 "bert" => ["BertModel", BertModel],
 "nomic_bert" => ["NomicBertModel", NomicBertModel],
@@ -1074,6 +1153,7 @@ module Informers
 "distilbert" => ["DistilBertModel", DistilBertModel],
 "roberta" => ["RobertaModel", RobertaModel],
 "xlm-roberta" => ["XLMRobertaModel", XLMRobertaModel],
+"clap" => ["ClapModel", ClapModel],
 "clip" => ["CLIPModel", CLIPModel],
 "detr" => ["DetrModel", DetrModel],
 "vit" => ["ViTModel", ViTModel],
@@ -1085,6 +1165,21 @@ module Informers
 "bart" => ["BartModel", BartModel]
 }
 
+MODEL_MAPPING_NAMES_DECODER_ONLY = {
+}
+
+MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = {
+"whisper" => ["WhisperForConditionalGeneration", WhisperForConditionalGeneration]
+}
+
+MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = {
+"speecht5" => ["SpeechT5ForTextToSpeech", SpeechT5ForTextToSpeech]
+}
+
+MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = {
+"vits" => ["VitsModel", VitsModel]
+}
+
 MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = {
 "bert" => ["BertForSequenceClassification", BertForSequenceClassification],
 "distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification],
@@ -1143,6 +1238,25 @@ module Informers
 MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = {
 }
 
+MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = {
+}
+
+MODEL_FOR_CTC_MAPPING_NAMES = {
+}
+
+MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = {
+"wav2vec2" => ["Wav2Vec2ForSequenceClassification", Wav2Vec2ForSequenceClassification]
+}
+
+MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = {
+}
+
+MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = {
+}
+
+MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = {
+}
+
 MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = {
 "swin2sr" => ["Swin2SRForImageSuperResolution", Swin2SRForImageSuperResolution]
 }
@@ -1157,9 +1271,11 @@ module Informers
 MODEL_CLASS_TYPE_MAPPING = [
 [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES[:EncoderOnly]],
 [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES[:EncoderDecoder]],
+[MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES[:DecoderOnly]],
 [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
+[MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
 [MODEL_WITH_LM_HEAD_MAPPING_NAMES, MODEL_TYPES[:DecoderOnly]],
 [MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
@@ -1167,10 +1283,18 @@ module Informers
 [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+[MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+[MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_TYPES[:MaskGeneration]],
+[MODEL_FOR_CTC_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+[MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+[MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
+[MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+[MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+[MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]]
 ]
 
@@ -1199,6 +1323,18 @@ module Informers
 MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES]
 end
 
+class AutoModelForSpeechSeq2Seq < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES]
+end
+
+class AutoModelForTextToSpectrogram < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES]
+end
+
+class AutoModelForTextToWaveform < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES]
+end
+
 class AutoModelForCausalLM < PretrainedMixin
 MODEL_CLASS_MAPPINGS = [MODEL_WITH_LM_HEAD_MAPPING_NAMES]
 end
@@ -1235,10 +1371,34 @@ module Informers
 MODEL_CLASS_MAPPINGS = [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES]
 end
 
+class AutoModelForMaskGeneration < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES]
+end
+
+class AutoModelForCTC < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_CTC_MAPPING_NAMES]
+end
+
+class AutoModelForAudioClassification < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES]
+end
+
+class AutoModelForXVector < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES]
+end
+
+class AutoModelForAudioFrameClassification < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES]
+end
+
 class AutoModelForDocumentQuestionAnswering < PretrainedMixin
 MODEL_CLASS_MAPPINGS = [MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES]
 end
 
+class AutoModelForImageMatting < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES]
+end
+
 class AutoModelForImageToImage < PretrainedMixin
 MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES]
 end
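The new `AutoModelFor*` classes follow the existing `PretrainedMixin` pattern, so they should resolve a checkpoint's `model_type` through the mappings above. A sketch under that assumption, using the default model the audio-classification pipeline registers in pipelines.rb below; loading it directly rather than through `Informers.pipeline` is illustrative only:

```ruby
# Resolves "wav2vec2" via MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
# so the returned object should be a Wav2Vec2ForSequenceClassification.
model = Informers::AutoModelForAudioClassification.from_pretrained("Xenova/wav2vec2-base-superb-ks")
model.class # => Informers::Wav2Vec2ForSequenceClassification (assumption)
```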
data/lib/informers/pipelines.rb
CHANGED
@@ -19,6 +19,20 @@ module Informers
 images.map { |x| Utils::RawImage.read(x) }
 end
 
+def prepare_audios(audios, sampling_rate)
+if !audios.is_a?(Array)
+audios = [audios]
+end
+
+audios.map do |x|
+if x.is_a?(String) || x.is_a?(URI)
+Utils.read_audio(x, sampling_rate)
+else
+x
+end
+end
+end
+
 def get_bounding_box(box, as_integer)
 if as_integer
 box = box.map { |x| x.to_i }
@@ -729,7 +743,7 @@ module Informers
 {
 label: candidate_labels[processed[:classes][i]],
 score: processed[:scores][i],
-box: get_bounding_box(box, !percentage)
+box: get_bounding_box(box, !percentage)
 }
 end
 result.sort_by! { |v| -v[:score] }
@@ -784,6 +798,26 @@ module Informers
 end
 end
 
+class TextToAudioPipeline < Pipeline
+DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan"
+
+def initialize(**options)
+super(**options)
+
+# TODO: Find a better way for `pipeline` to set the default vocoder
+@vocoder = options[:vocoder]
+end
+
+def call(text_inputs, speaker_embeddings: nil)
+# If this.processor is not set, we are using a `AutoModelForTextToWaveform` model
+if @processor
+call_text_to_spectrogram(text_inputs, speaker_embeddings:)
+else
+call_text_to_waveform(text_inputs)
+end
+end
+end
+
 class FeatureExtractionPipeline < Pipeline
 def call(
 texts,
@@ -803,7 +837,7 @@ module Informers
 if !model_output.nil?
 model_options[:output_names] = Array(model_output)
 elsif @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
-# optimization for sentence-transformers/all-MiniLM-L6-v2
+# optimization for previous revision of sentence-transformers/all-MiniLM-L6-v2
 model_options[:output_names] = ["sentence_embedding"]
 pooling = "none"
 normalize = false
@@ -858,11 +892,106 @@ module Informers
 end
 end
 
+class AudioClassificationPipeline < Pipeline
+def call(audio, top_k: nil)
+single = !audio.is_a?(Array)
+
+sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+prepared_audios = prepare_audios(audio, sampling_rate)
+
+id2label = @model.config[:id2label]
+
+to_return = []
+prepared_audios.each do |aud|
+inputs = @processor.(aud)
+output = @model.(inputs)
+logits = output.logits[0]
+
+scores = Utils.get_top_items(Utils.softmax(logits), top_k)
+
+vals =
+scores.map do |x|
+{
+label: id2label[x[0].to_s],
+score: x[1]
+}
+end
+
+if top_k == 1
+to_return.concat(vals)
+else
+to_return << vals
+end
+end
+!single || top_k == 1 ? to_return : to_return[0]
+end
+end
+
+class ZeroShotAudioClassificationPipeline < Pipeline
+def call(audio, candidate_labels, hypothesis_template: "This is a sound of {}.")
+single = !audio.is_a?(Array)
+if single
+audio = [audio]
+end
+
+# Insert label into hypothesis template
+texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+# Run tokenization
+text_inputs =
+@tokenizer.(
+texts,
+padding: true,
+truncation: true
+)
+
+sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+prepared_audios = prepare_audios(audio, sampling_rate)
+
+to_return = []
+prepared_audios.each do |aud|
+audio_inputs = @processor.(aud)
+
+# Run model with both text and audio inputs
+output = @model.(text_inputs.merge(audio_inputs))
+
+# Compute softmax per audio
+probs = Utils.softmax(output.logits_per_audio.data)
+
+to_return <<
+probs.map.with_index do |x, i|
+{
+label: candidate_labels[i],
+score: x
+}
+end
+end
+single ? to_return[0] : to_return
+end
+end
+
+class AutomaticSpeechRecognitionPipeline < Pipeline
+def call(audio, **kwargs)
+case @model.config["model_type"]
+when "whisper"
+call_whisper(audio, **kwargs)
+else
+raise Error, "AutomaticSpeechRecognitionPipeline does not support model type '#{@model.config["model_type"]}'."
+end
+end
+
+private
+
+def call_whisper(audio, **kwargs)
+raise Todo
+end
+end
+
 class ImageToImagePipeline < Pipeline
 def call(images)
 prepared_images = prepare_images(images)
 inputs = @processor.(prepared_images)
-outputs = @model.(inputs)
+outputs = @model.(inputs)
 
 to_return = []
 outputs[0].each do |batch|
@@ -1033,6 +1162,47 @@ module Informers
 },
 type: "text"
 },
+"audio-classification" => {
+pipeline: AudioClassificationPipeline,
+model: AutoModelForAudioClassification,
+processor: AutoProcessor,
+default: {
+model: "Xenova/wav2vec2-base-superb-ks"
+},
+type: "audio"
+},
+# TODO
+# "zero-shot-audio-classification" => {
+# tokenizer: AutoTokenizer,
+# pipeline: ZeroShotAudioClassificationPipeline,
+# model: AutoModel,
+# processor: AutoProcessor,
+# default: {
+# model: "Xenova/clap-htsat-unfused"
+# },
+# type: "multimodal"
+# },
+# TODO
+# "automatic-speech-recognition" => {
+# tokenizer: AutoTokenizer,
+# pipeline: AutomaticSpeechRecognitionPipeline,
+# model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
+# processor: AutoProcessor,
+# default: {
+# model: "Xenova/whisper-tiny.en"
+# },
+# type: "multimodal"
+# },
+"text-to-audio" => {
+tokenizer: AutoTokenizer,
+pipeline: TextToAudioPipeline,
+model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
+processor: [AutoProcessor, nil],
+default: {
+model: "Xenova/speecht5_tts"
+},
+type: "text"
+},
 "image-to-text" => {
 tokenizer: AutoTokenizer,
 pipeline: ImageToTextPipeline,
@@ -1048,7 +1218,7 @@ module Informers
 model: AutoModelForImageClassification,
 processor: AutoProcessor,
 default: {
-model: "Xenova/vit-base-patch16-224"
+model: "Xenova/vit-base-patch16-224"
 },
 type: "multimodal"
 },
@@ -1057,7 +1227,7 @@ module Informers
 model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
 processor: AutoProcessor,
 default: {
-model: "Xenova/detr-resnet-50-panoptic"
+model: "Xenova/detr-resnet-50-panoptic"
 },
 type: "multimodal"
 },
@@ -1076,7 +1246,7 @@ module Informers
 model: AutoModelForObjectDetection,
 processor: AutoProcessor,
 default: {
-model: "Xenova/detr-resnet-50"
+model: "Xenova/detr-resnet-50"
 },
 type: "multimodal"
 },
@@ -1158,7 +1328,8 @@ module Informers
 
 TASK_ALIASES = {
 "sentiment-analysis" => "text-classification",
-"ner" => "token-classification"
+"ner" => "token-classification",
+"text-to-speech" => "text-to-audio"
 }
 
 DEFAULT_PROGRESS_CALLBACK = lambda do |msg|
@@ -1231,7 +1402,8 @@ module Informers
 results = load_items(classes, model, pretrained_options)
 results[:task] = task
 
-
+# for previous revision of sentence-transformers/all-MiniLM-L6-v2
+if model == "sentence-transformers/all-MiniLM-L6-v2" && results[:model].instance_variable_get(:@session).outputs.any? { |v| v[:name] == "token_embeddings" }
 results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
 end
 
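With the registry changes above, `"audio-classification"` becomes a constructible task and `"text-to-speech"` now aliases `"text-to-audio"`. A minimal sketch of what the registrations imply; file names are illustrative, and note that parts of the audio stack in this release still `raise Todo` (for example `WhisperFeatureExtractor` and `call_whisper`), so only the wav2vec2-based audio-classification path is fully wired up:

```ruby
# Builds an AudioClassificationPipeline with the default
# "Xenova/wav2vec2-base-superb-ks" model and an AutoProcessor.
classifier = Informers.pipeline("audio-classification")
classifier.(["first.wav", "second.wav"]) # arrays of inputs are supported

# Resolved through TASK_ALIASES to the new "text-to-audio" task; whether the
# SpeechT5 path is usable end-to-end in 1.1.1 is not shown by this diff.
tts = Informers.pipeline("text-to-speech")
```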
data/lib/informers/processors.rb
CHANGED
@@ -1,5 +1,7 @@
 module Informers
 class FeatureExtractor
+attr_reader :config
+
 def initialize(config)
 super()
 @config = config
@@ -728,6 +730,61 @@ module Informers
 end
 end
 
+class WhisperFeatureExtractor < FeatureExtractor
+def initialize(config)
+super(config)
+
+raise Todo
+end
+
+def _extract_fbank_features(waveform)
+raise Todo
+end
+
+def call(audio)
+raise Todo
+end
+end
+
+class Wav2Vec2FeatureExtractor < FeatureExtractor
+def _zero_mean_unit_var_norm(input_values)
+sum = input_values.sum
+mean = sum / input_values.length.to_f
+variance = input_values.sum { |b| (b - mean) ** 2 } / input_values.length.to_f
+input_values.map { |x| (x - mean) / Math.sqrt(variance + 1e-7) }
+end
+
+def call(audio)
+# TODO
+# validate_audio_inputs(audio, 'Wav2Vec2FeatureExtractor')
+
+input_values = audio
+
+# zero-mean and unit-variance normalization
+if @config["do_normalize"]
+input_values = _zero_mean_unit_var_norm(input_values)
+end
+
+# TODO: allow user to pass in attention mask
+{
+input_values: [input_values],
+attention_mask: [Array.new(input_values.length, 1)]
+}
+end
+end
+
+class ClapFeatureExtractor < FeatureExtractor
+def initialize(config)
+super(config)
+
+# TODO
+end
+
+def call(audio, max_length: nil)
+raise Todo
+end
+end
+
 class Processor
 attr_reader :feature_extractor
 
@@ -748,7 +805,10 @@ module Informers
 "DPTFeatureExtractor" => DPTFeatureExtractor,
 "DetrFeatureExtractor" => DetrFeatureExtractor,
 "Swin2SRImageProcessor" => Swin2SRImageProcessor,
-"DonutFeatureExtractor" => DonutFeatureExtractor
+"DonutFeatureExtractor" => DonutFeatureExtractor,
+"WhisperFeatureExtractor" => WhisperFeatureExtractor,
+"Wav2Vec2FeatureExtractor" => Wav2Vec2FeatureExtractor,
+"ClapFeatureExtractor" => ClapFeatureExtractor
 }
 
 PROCESSOR_CLASS_MAPPING = {}
@@ -762,7 +822,7 @@ module Informers
 revision: "main",
 **kwargs
 )
-preprocessor_config = config || Utils::Hub
+preprocessor_config = config || Utils::Hub.get_model_json(pretrained_model_name_or_path, "preprocessor_config.json", true,
 progress_callback:,
 config:,
 cache_dir:,
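`Wav2Vec2FeatureExtractor#_zero_mean_unit_var_norm` above rescales the samples to zero mean and unit variance, x' = (x - mean) / sqrt(variance + 1e-7). A worked example with made-up values:

```ruby
input_values = [1.0, 2.0, 3.0]
mean     = input_values.sum / input_values.length.to_f                       # => 2.0
variance = input_values.sum { |b| (b - mean)**2 } / input_values.length.to_f # => 0.666...
input_values.map { |x| (x - mean) / Math.sqrt(variance + 1e-7) }
# => [-1.2247, 0.0, 1.2247] (approximately)
```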
data/lib/informers/tokenizers.rb
CHANGED
@@ -244,6 +244,9 @@ module Informers
 end
 end
 
+class SpeechT5Tokenizer < PreTrainedTokenizer
+end
+
 class AutoTokenizer
 TOKENIZER_CLASS_MAPPING = {
 "T5Tokenizer" => T5Tokenizer,
@@ -257,7 +260,8 @@ module Informers
 "CLIPTokenizer" => CLIPTokenizer,
 "GPT2Tokenizer" => GPT2Tokenizer,
 "NllbTokenizer" => NllbTokenizer,
-"M2M100Tokenizer" => M2M100Tokenizer
+"M2M100Tokenizer" => M2M100Tokenizer,
+"SpeechT5Tokenizer" => SpeechT5Tokenizer
 }
 
 def self.from_pretrained(
@@ -296,7 +300,7 @@ module Informers
 def self.load_tokenizer(pretrained_model_name_or_path, **options)
 info = [
 Utils::Hub.get_model_file(pretrained_model_name_or_path, "tokenizer.json", true, **options),
-Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options)
+Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options)
 ]
 
 # Override legacy option if `options.legacy` is not null
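With `SpeechT5Tokenizer` registered above, `AutoTokenizer` can resolve checkpoints whose `tokenizer_config.json` declares that class. A sketch; the model id is the text-to-audio default from pipelines.rb, and loading the tokenizer standalone is an assumption for illustration:

```ruby
tokenizer = Informers::AutoTokenizer.from_pretrained("Xenova/speecht5_tts")
tokenizer.class # => Informers::SpeechT5Tokenizer (assuming the checkpoint declares it)
```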
data/lib/informers/utils/audio.rb
ADDED
@@ -0,0 +1,18 @@
+module Informers
+module Utils
+def self.read_audio(input, sampling_rate)
+data =
+if input.is_a?(URI)
+require "open-uri"
+
+input.read
+elsif input.is_a?(String)
+File.binread(input)
+else
+raise ArgumentError, "Unsupported input type: #{input.class.name}"
+end
+
+ffmpeg_read(data, sampling_rate)
+end
+end
+end
data/lib/informers/utils/ffmpeg.rb
ADDED
@@ -0,0 +1,45 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module Informers
+module Utils
+# from the Transformers Python library
+def self.ffmpeg_read(data, sampling_rate)
+ar = "#{sampling_rate}"
+ac = "1"
+format_for_conversion = "f32le"
+ffmpeg_command = [
+"ffmpeg",
+"-i",
+"pipe:0",
+"-ac",
+ac,
+"-ar",
+ar,
+"-f",
+format_for_conversion,
+"-hide_banner",
+"-loglevel",
+"quiet",
+"pipe:1"
+]
+
+stdout, status = Open3.capture2(*ffmpeg_command, stdin_data: data)
+if !status.success?
+raise Error, "ffmpeg was not found but is required to load audio files from filename"
+end
+stdout.unpack("e*")
+end
+end
+end
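`ffmpeg_read` pipes the input bytes through `ffmpeg -f f32le -ac 1 -ar <rate>` and unpacks the result with `"e*"` (32-bit little-endian floats). A small round-trip sketch of that packing, independent of ffmpeg:

```ruby
samples = [0.0, 0.5, -0.5]
bytes   = samples.pack("e*")   # what ffmpeg emits for -f f32le: 4 bytes per sample
bytes.bytesize                 # => 12
bytes.unpack("e*")             # => [0.0, 0.5, -0.5]
```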
data/lib/informers/utils/math.rb
CHANGED
@@ -14,8 +14,8 @@ module Informers
 out_img = Array.new(out_height * out_width * in_channels)
 
 # Pre-calculate strides
-in_stride = in_height * in_width
-out_stride = out_height * out_width
+in_stride = in_height * in_width
+out_stride = out_height * out_width
 
 out_height.times do |i|
 out_width.times do |j|
data/lib/informers/version.rb
CHANGED
data/lib/informers.rb
CHANGED
@@ -6,12 +6,15 @@ require "tokenizers"
 require "io/console"
 require "json"
 require "open-uri"
+require "open3"
 require "stringio"
 require "uri"
 
 # modules
+require_relative "informers/utils/audio"
 require_relative "informers/utils/core"
 require_relative "informers/utils/generation"
+require_relative "informers/utils/ffmpeg"
 require_relative "informers/utils/hub"
 require_relative "informers/utils/image"
 require_relative "informers/utils/math"
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: informers
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.1.1
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-
+date: 2024-10-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: onnxruntime
@@ -55,7 +55,9 @@ files:
 - lib/informers/pipelines.rb
 - lib/informers/processors.rb
 - lib/informers/tokenizers.rb
+- lib/informers/utils/audio.rb
 - lib/informers/utils/core.rb
+- lib/informers/utils/ffmpeg.rb
 - lib/informers/utils/generation.rb
 - lib/informers/utils/hub.rb
 - lib/informers/utils/image.rb