informers 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: ab4f19adb4d6ca0289784cee6c6cb5235b73a5184abffbeaf44391768be1f0ac
- data.tar.gz: '0880ce4dced5ce47ceaaa5fee8d10e6324b3fc0a23e05c3da3728414dcc273d9'
+ metadata.gz: a61f01755798e81a975641d60e5bfe09484ced7ce6a3453020c9978dc35b1942
+ data.tar.gz: 811f9c1dc4499ae7de8ebf8e02c0c4e98a0c0bc0af6aaca51025e42ba8165540
  SHA512:
- metadata.gz: eb3ee6d16e4e20eca6fae3fae8f97d78ba6bb655d48e2012640d64538785e2a9ff2afb10269cf01db928553438e8fbd08584774ba3f3d08bc25f36cbb971a99a
- data.tar.gz: '0008441293f2605ec8599135d715093053e21f67f56ba59b730a3bc1f46f04f4a7fabb7fef039f156cd4183011c93b7fc9cab6ba731bf78627244bc4dedcf18d'
+ metadata.gz: 97b27363fab1e43895e368dbddc819fd4db23d42ce517359e5971347cd902b654f0c66700f07b36cd5f476bd3ea205a91e4f7e7ee0e7d8d455f0dce377bedb2b
+ data.tar.gz: dd1a7f795609423419ce213b00a5aca409f6b4a5bffb111250b4deffcbc6a8113fadf8d603c59fa78fa0f310904a0a3299e3bcdc48101f574171a024d13567e6
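If you want to confirm a locally downloaded copy of 1.1.1 against the SHA256 values above, a rough sketch using only the Ruby standard library follows (the `gem fetch` step and the gem file name are assumptions, not part of this diff):

```ruby
# Assumes the package was fetched first, e.g. `gem fetch informers --version 1.1.1`
require "rubygems/package"
require "digest"

reader = Gem::Package::TarReader.new(File.open("informers-1.1.1.gem", "rb"))
reader.each do |entry|
  next unless ["metadata.gz", "data.tar.gz"].include?(entry.full_name)
  # Compare these digests with the SHA256 section of checksums.yaml above
  puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
end
```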
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
+ ## 1.1.1 (2024-10-14)
+
+ - Added `audio-classification` pipeline
+ - Fixed error with `sentence-transformers/all-MiniLM-L6-v2`
+
  ## 1.1.0 (2024-09-17)

  - Added more pipelines
data/README.md CHANGED
@@ -229,19 +229,13 @@ result = model.(query, docs)

  ### Other

- You can use the feature extraction pipeline directly.
-
- ```ruby
- model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quantized: false)
- embeddings = model.(sentences, pooling: "mean", normalize: true)
- ```
-
  The model must include a `.onnx` file ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)). If the file is not at `onnx/model.onnx` or `onnx/model_quantized.onnx`, use the `model_file_name` option to specify the location.

  ## Pipelines

  - [Text](#text)
  - [Vision](#vision)
+ - [Audio](#audio)
  - [Multimodel](#multimodal)

  ### Text
@@ -332,6 +326,8 @@ extractor.("We are very happy to show you the 🤗 Transformers library.")

  ### Vision

+ Note: [ruby-vips](https://github.com/libvips/ruby-vips) is required to load images
+
  Image classification

  ```ruby
@@ -388,6 +384,17 @@ extractor = Informers.pipeline("image-feature-extraction")
  extractor.("image.jpg")
  ```

+ ### Audio
+
+ Note: [ffmpeg](https://www.ffmpeg.org/) is required to load audio files
+
+ Audio classification
+
+ ```ruby
+ classifier = Informers.pipeline("audio-classification")
+ classifier.("audio.wav")
+ ```
+
  ### Multimodal

  Image captioning
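Tying the README addition to the implementation below: a slightly fuller usage sketch of the new task, based on the `AudioClassificationPipeline#call` signature and the default model registered for it (the file names are placeholders; result shapes follow that implementation):

```ruby
require "informers"

# Downloads the default model, "Xenova/wav2vec2-base-superb-ks"; ffmpeg must be installed
classifier = Informers.pipeline("audio-classification")

# Single input: returns an array of { label:, score: } hashes
classifier.("audio.wav")

# top_k: 1 keeps only the best label per input
classifier.(["yes.wav", "no.wav"], top_k: 1)
```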
@@ -84,6 +84,7 @@ module Informers
  @get_start_beams = method(:decoder_start_beams)
  @update_beam = method(:decoder_update_beam)
  @forward = method(:decoder_forward)
+
  when MODEL_TYPES[:Seq2Seq], MODEL_TYPES[:Vision2Seq]
  @can_generate = true

@@ -91,8 +92,10 @@ module Informers
  @get_start_beams = method(:seq2seq_start_beams)
  @update_beam = method(:seq2seq_update_beam)
  @forward = method(:seq2seq_forward)
+
  when MODEL_TYPES[:EncoderDecoder]
- raise Todo
+ @forward = method(:encoder_forward)
+
  else
  @forward = method(:encoder_forward)
  end
@@ -137,10 +140,18 @@ module Informers
  ]

  elsif model_type == MODEL_TYPES[:MaskGeneration]
- raise Todo
+ info = [
+ AutoConfig.from_pretrained(pretrained_model_name_or_path, **options),
+ construct_session(pretrained_model_name_or_path, "vision_encoder", **options),
+ construct_session(pretrained_model_name_or_path, "prompt_encoder_mask_decoder", **options)
+ ]

  elsif model_type == MODEL_TYPES[:EncoderDecoder]
- raise Todo
+ info = [
+ AutoConfig.from_pretrained(pretrained_model_name_or_path, **options),
+ construct_session(pretrained_model_name_or_path, "encoder_model", **options),
+ construct_session(pretrained_model_name_or_path, "decoder_model_merged", **options)
+ ]

  else
  if model_type != MODEL_TYPES[:EncoderOnly]
@@ -293,13 +304,13 @@ module Informers
  grouped_beams = group_beams(beams)

  get_flattened = lambda do |key|
- grouped_beams.map do |batch|
+ grouped_beams.flat_map do |batch|
  if generation_config["num_return_sequences"] > 1
  raise Todo
  else
  [batch[0][key]]
  end
- end.flatten(1)
+ end
  end

  sequences = get_flattened.(:output_token_ids) # [1, seqLength]
@@ -904,6 +915,18 @@ module Informers
  end
  end

+ class Wav2Vec2PreTrainedModel < PreTrainedModel
+ end
+
+ class Wav2Vec2Model < Wav2Vec2PreTrainedModel
+ end
+
+ class Wav2Vec2ForSequenceClassification < Wav2Vec2PreTrainedModel
+ def call(model_inputs)
+ SequenceClassifierOutput.new(*super(model_inputs))
+ end
+ end
+
  class RobertaPreTrainedModel < PreTrainedModel
  end

@@ -1066,6 +1089,62 @@ module Informers
  class DonutSwinModel < DonutSwinPreTrainedModel
  end

+ class WhisperPreTrainedModel < PreTrainedModel
+ end
+
+ class WhisperModel < WhisperPreTrainedModel
+ end
+
+ class WhisperForConditionalGeneration < WhisperPreTrainedModel
+ REQUIRES_ATTENTION_MASK = false
+ MAIN_INPUT_NAME = :input_features
+
+ def initialize(config, session, decoder_merged_session, generation_config)
+ super(config, session)
+ @decoder_merged_session = decoder_merged_session
+ @generation_config = generation_config
+
+ @num_decoder_layers = @config["decoder_layers"]
+ @num_decoder_heads = @config["decoder_attention_heads"]
+ @decoder_dim_kv = @config["d_model"] / @num_decoder_heads.to_f
+
+ @num_encoder_layers = @config["encoder_layers"]
+ @num_encoder_heads = @config["encoder_attention_heads"]
+ @encoder_dim_kv = @config["d_model"] / @num_encoder_heads.to_f
+ end
+
+ def generate(inputs, generation_config = nil, logits_processor = nil)
+ raise Todo
+ end
+ end
+
+ class VitsPreTrainedModel < PreTrainedModel
+ end
+
+ class VitsModel < VitsPreTrainedModel
+ def call(model_inputs)
+ VitsModelOutput.new(*super(model_inputs))
+ end
+ end
+
+ class SpeechT5PreTrainedModel < PreTrainedModel
+ end
+
+ class SpeechT5Model < SpeechT5PreTrainedModel
+ end
+
+ class SpeechT5ForSpeechToText < SpeechT5PreTrainedModel
+ end
+
+ class SpeechT5ForTextToSpeech < SpeechT5PreTrainedModel
+ end
+
+ class ClapPreTrainedModel < PreTrainedModel
+ end
+
+ class ClapModel < ClapPreTrainedModel
+ end
+
  MODEL_MAPPING_NAMES_ENCODER_ONLY = {
  "bert" => ["BertModel", BertModel],
  "nomic_bert" => ["NomicBertModel", NomicBertModel],
@@ -1074,6 +1153,7 @@ module Informers
  "distilbert" => ["DistilBertModel", DistilBertModel],
  "roberta" => ["RobertaModel", RobertaModel],
  "xlm-roberta" => ["XLMRobertaModel", XLMRobertaModel],
+ "clap" => ["ClapModel", ClapModel],
  "clip" => ["CLIPModel", CLIPModel],
  "detr" => ["DetrModel", DetrModel],
  "vit" => ["ViTModel", ViTModel],
@@ -1085,6 +1165,21 @@ module Informers
  "bart" => ["BartModel", BartModel]
  }

+ MODEL_MAPPING_NAMES_DECODER_ONLY = {
+ }
+
+ MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = {
+ "whisper" => ["WhisperForConditionalGeneration", WhisperForConditionalGeneration]
+ }
+
+ MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = {
+ "speecht5" => ["SpeechT5ForTextToSpeech", SpeechT5ForTextToSpeech]
+ }
+
+ MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = {
+ "vits" => ["VitsModel", VitsModel]
+ }
+
  MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = {
  "bert" => ["BertForSequenceClassification", BertForSequenceClassification],
  "distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification],
@@ -1143,6 +1238,25 @@ module Informers
  MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = {
  }

+ MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = {
+ }
+
+ MODEL_FOR_CTC_MAPPING_NAMES = {
+ }
+
+ MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = {
+ "wav2vec2" => ["Wav2Vec2ForSequenceClassification", Wav2Vec2ForSequenceClassification]
+ }
+
+ MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = {
+ }
+
+ MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = {
+ }
+
+ MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = {
+ }
+
  MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = {
  "swin2sr" => ["Swin2SRForImageSuperResolution", Swin2SRForImageSuperResolution]
  }
@@ -1157,9 +1271,11 @@ module Informers
  MODEL_CLASS_TYPE_MAPPING = [
  [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES[:EncoderOnly]],
  [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES[:EncoderDecoder]],
+ [MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES[:DecoderOnly]],
  [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
+ [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
  [MODEL_WITH_LM_HEAD_MAPPING_NAMES, MODEL_TYPES[:DecoderOnly]],
  [MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
@@ -1167,10 +1283,18 @@ module Informers
  [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+ [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+ [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_TYPES[:MaskGeneration]],
+ [MODEL_FOR_CTC_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+ [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+ [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
+ [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+ [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+ [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]]
  ]

@@ -1199,6 +1323,18 @@ module Informers
  MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES]
  end

+ class AutoModelForSpeechSeq2Seq < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES]
+ end
+
+ class AutoModelForTextToSpectrogram < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES]
+ end
+
+ class AutoModelForTextToWaveform < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES]
+ end
+
  class AutoModelForCausalLM < PretrainedMixin
  MODEL_CLASS_MAPPINGS = [MODEL_WITH_LM_HEAD_MAPPING_NAMES]
  end
@@ -1235,10 +1371,34 @@ module Informers
  MODEL_CLASS_MAPPINGS = [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES]
  end

+ class AutoModelForMaskGeneration < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES]
+ end
+
+ class AutoModelForCTC < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_CTC_MAPPING_NAMES]
+ end
+
+ class AutoModelForAudioClassification < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES]
+ end
+
+ class AutoModelForXVector < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES]
+ end
+
+ class AutoModelForAudioFrameClassification < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES]
+ end
+
  class AutoModelForDocumentQuestionAnswering < PretrainedMixin
  MODEL_CLASS_MAPPINGS = [MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES]
  end

+ class AutoModelForImageMatting < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES]
+ end
+
  class AutoModelForImageToImage < PretrainedMixin
  MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES]
  end
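The new auto classes mirror the existing ones, so direct (non-pipeline) loading should follow the same pattern; this is a hypothetical sketch, since `PretrainedMixin.from_pretrained` itself is not part of this diff:

```ruby
# Hypothetical: assumes the new auto classes expose from_pretrained like the existing ones
model = Informers::AutoModelForAudioClassification.from_pretrained("Xenova/wav2vec2-base-superb-ks")
```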
@@ -19,6 +19,20 @@ module Informers
  images.map { |x| Utils::RawImage.read(x) }
  end

+ def prepare_audios(audios, sampling_rate)
+ if !audios.is_a?(Array)
+ audios = [audios]
+ end
+
+ audios.map do |x|
+ if x.is_a?(String) || x.is_a?(URI)
+ Utils.read_audio(x, sampling_rate)
+ else
+ x
+ end
+ end
+ end
+
  def get_bounding_box(box, as_integer)
  if as_integer
  box = box.map { |x| x.to_i }
@@ -729,7 +743,7 @@ module Informers
  {
  label: candidate_labels[processed[:classes][i]],
  score: processed[:scores][i],
- box: get_bounding_box(box, !percentage),
+ box: get_bounding_box(box, !percentage)
  }
  end
  result.sort_by! { |v| -v[:score] }
@@ -784,6 +798,26 @@ module Informers
  end
  end

+ class TextToAudioPipeline < Pipeline
+ DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan"
+
+ def initialize(**options)
+ super(**options)
+
+ # TODO: Find a better way for `pipeline` to set the default vocoder
+ @vocoder = options[:vocoder]
+ end
+
+ def call(text_inputs, speaker_embeddings: nil)
+ # If this.processor is not set, we are using a `AutoModelForTextToWaveform` model
+ if @processor
+ call_text_to_spectrogram(text_inputs, speaker_embeddings:)
+ else
+ call_text_to_waveform(text_inputs)
+ end
+ end
+ end
+
  class FeatureExtractionPipeline < Pipeline
  def call(
  texts,
@@ -803,7 +837,7 @@ module Informers
  if !model_output.nil?
  model_options[:output_names] = Array(model_output)
  elsif @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
- # optimization for sentence-transformers/all-MiniLM-L6-v2
+ # optimization for previous revision of sentence-transformers/all-MiniLM-L6-v2
  model_options[:output_names] = ["sentence_embedding"]
  pooling = "none"
  normalize = false
@@ -858,11 +892,106 @@ module Informers
  end
  end

+ class AudioClassificationPipeline < Pipeline
+ def call(audio, top_k: nil)
+ single = !audio.is_a?(Array)
+
+ sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+ prepared_audios = prepare_audios(audio, sampling_rate)
+
+ id2label = @model.config[:id2label]
+
+ to_return = []
+ prepared_audios.each do |aud|
+ inputs = @processor.(aud)
+ output = @model.(inputs)
+ logits = output.logits[0]
+
+ scores = Utils.get_top_items(Utils.softmax(logits), top_k)
+
+ vals =
+ scores.map do |x|
+ {
+ label: id2label[x[0].to_s],
+ score: x[1]
+ }
+ end
+
+ if top_k == 1
+ to_return.concat(vals)
+ else
+ to_return << vals
+ end
+ end
+ !single || top_k == 1 ? to_return : to_return[0]
+ end
+ end
+
+ class ZeroShotAudioClassificationPipeline < Pipeline
+ def call(audio, candidate_labels, hypothesis_template: "This is a sound of {}.")
+ single = !audio.is_a?(Array)
+ if single
+ audio = [audio]
+ end
+
+ # Insert label into hypothesis template
+ texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+ # Run tokenization
+ text_inputs =
+ @tokenizer.(
+ texts,
+ padding: true,
+ truncation: true
+ )
+
+ sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+ prepared_audios = prepare_audios(audio, sampling_rate)
+
+ to_return = []
+ prepared_audios.each do |aud|
+ audio_inputs = @processor.(aud)
+
+ # Run model with both text and audio inputs
+ output = @model.(text_inputs.merge(audio_inputs))
+
+ # Compute softmax per audio
+ probs = Utils.softmax(output.logits_per_audio.data)
+
+ to_return <<
+ probs.map.with_index do |x, i|
+ {
+ label: candidate_labels[i],
+ score: x
+ }
+ end
+ end
+ single ? to_return[0] : to_return
+ end
+ end
+
+ class AutomaticSpeechRecognitionPipeline < Pipeline
+ def call(audio, **kwargs)
+ case @model.config["model_type"]
+ when "whisper"
+ call_whisper(audio, **kwargs)
+ else
+ raise Error, "AutomaticSpeechRecognitionPipeline does not support model type '#{@model.config["model_type"]}'."
+ end
+ end
+
+ private
+
+ def call_whisper(audio, **kwargs)
+ raise Todo
+ end
+ end
+
  class ImageToImagePipeline < Pipeline
  def call(images)
  prepared_images = prepare_images(images)
  inputs = @processor.(prepared_images)
- outputs = @model.(inputs);
+ outputs = @model.(inputs)

  to_return = []
  outputs[0].each do |batch|
@@ -1033,6 +1162,47 @@ module Informers
  },
  type: "text"
  },
+ "audio-classification" => {
+ pipeline: AudioClassificationPipeline,
+ model: AutoModelForAudioClassification,
+ processor: AutoProcessor,
+ default: {
+ model: "Xenova/wav2vec2-base-superb-ks"
+ },
+ type: "audio"
+ },
+ # TODO
+ # "zero-shot-audio-classification" => {
+ # tokenizer: AutoTokenizer,
+ # pipeline: ZeroShotAudioClassificationPipeline,
+ # model: AutoModel,
+ # processor: AutoProcessor,
+ # default: {
+ # model: "Xenova/clap-htsat-unfused"
+ # },
+ # type: "multimodal"
+ # },
+ # TODO
+ # "automatic-speech-recognition" => {
+ # tokenizer: AutoTokenizer,
+ # pipeline: AutomaticSpeechRecognitionPipeline,
+ # model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
+ # processor: AutoProcessor,
+ # default: {
+ # model: "Xenova/whisper-tiny.en"
+ # },
+ # type: "multimodal"
+ # },
+ "text-to-audio" => {
+ tokenizer: AutoTokenizer,
+ pipeline: TextToAudioPipeline,
+ model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
+ processor: [AutoProcessor, nil],
+ default: {
+ model: "Xenova/speecht5_tts"
+ },
+ type: "text"
+ },
  "image-to-text" => {
  tokenizer: AutoTokenizer,
  pipeline: ImageToTextPipeline,
@@ -1048,7 +1218,7 @@ module Informers
  model: AutoModelForImageClassification,
  processor: AutoProcessor,
  default: {
- model: "Xenova/vit-base-patch16-224",
+ model: "Xenova/vit-base-patch16-224"
  },
  type: "multimodal"
  },
@@ -1057,7 +1227,7 @@ module Informers
  model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
  processor: AutoProcessor,
  default: {
- model: "Xenova/detr-resnet-50-panoptic",
+ model: "Xenova/detr-resnet-50-panoptic"
  },
  type: "multimodal"
  },
@@ -1076,7 +1246,7 @@ module Informers
  model: AutoModelForObjectDetection,
  processor: AutoProcessor,
  default: {
- model: "Xenova/detr-resnet-50",
+ model: "Xenova/detr-resnet-50"
  },
  type: "multimodal"
  },
@@ -1158,7 +1328,8 @@ module Informers

  TASK_ALIASES = {
  "sentiment-analysis" => "text-classification",
- "ner" => "token-classification"
+ "ner" => "token-classification",
+ "text-to-speech" => "text-to-audio"
  }

  DEFAULT_PROGRESS_CALLBACK = lambda do |msg|
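With the new alias, `"text-to-speech"` resolves to the `"text-to-audio"` task and therefore to `TextToAudioPipeline`. This diff alone does not show whether the full SpeechT5 synthesis path is functional in this release (its feature extractor is not part of these hunks), so treat the following as a sketch of the intended API rather than a confirmed working call:

```ruby
# Sketch only: "Xenova/speecht5_tts" is the default model registered for text-to-audio.
# SpeechT5 models typically also need speaker embeddings, passed via the
# speaker_embeddings: keyword on TextToAudioPipeline#call; the expected format is
# not shown in this diff.
tts = Informers.pipeline("text-to-speech")
audio = tts.("Hello from informers")
```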
@@ -1231,7 +1402,8 @@ module Informers
  results = load_items(classes, model, pretrained_options)
  results[:task] = task

- if model == "sentence-transformers/all-MiniLM-L6-v2"
+ # for previous revision of sentence-transformers/all-MiniLM-L6-v2
+ if model == "sentence-transformers/all-MiniLM-L6-v2" && results[:model].instance_variable_get(:@session).outputs.any? { |v| v[:name] == "token_embeddings" }
  results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
  end

@@ -1,5 +1,7 @@
  module Informers
  class FeatureExtractor
+ attr_reader :config
+
  def initialize(config)
  super()
  @config = config
@@ -728,6 +730,61 @@ module Informers
  end
  end

+ class WhisperFeatureExtractor < FeatureExtractor
+ def initialize(config)
+ super(config)
+
+ raise Todo
+ end
+
+ def _extract_fbank_features(waveform)
+ raise Todo
+ end
+
+ def call(audio)
+ raise Todo
+ end
+ end
+
+ class Wav2Vec2FeatureExtractor < FeatureExtractor
+ def _zero_mean_unit_var_norm(input_values)
+ sum = input_values.sum
+ mean = sum / input_values.length.to_f
+ variance = input_values.sum { |b| (b - mean) ** 2 } / input_values.length.to_f
+ input_values.map { |x| (x - mean) / Math.sqrt(variance + 1e-7) }
+ end
+
+ def call(audio)
+ # TODO
+ # validate_audio_inputs(audio, 'Wav2Vec2FeatureExtractor')
+
+ input_values = audio
+
+ # zero-mean and unit-variance normalization
+ if @config["do_normalize"]
+ input_values = _zero_mean_unit_var_norm(input_values)
+ end
+
+ # TODO: allow user to pass in attention mask
+ {
+ input_values: [input_values],
+ attention_mask: [Array.new(input_values.length, 1)]
+ }
+ end
+ end
+
+ class ClapFeatureExtractor < FeatureExtractor
+ def initialize(config)
+ super(config)
+
+ # TODO
+ end
+
+ def call(audio, max_length: nil)
+ raise Todo
+ end
+ end
+
  class Processor
  attr_reader :feature_extractor

@@ -748,7 +805,10 @@ module Informers
  "DPTFeatureExtractor" => DPTFeatureExtractor,
  "DetrFeatureExtractor" => DetrFeatureExtractor,
  "Swin2SRImageProcessor" => Swin2SRImageProcessor,
- "DonutFeatureExtractor" => DonutFeatureExtractor
+ "DonutFeatureExtractor" => DonutFeatureExtractor,
+ "WhisperFeatureExtractor" => WhisperFeatureExtractor,
+ "Wav2Vec2FeatureExtractor" => Wav2Vec2FeatureExtractor,
+ "ClapFeatureExtractor" => ClapFeatureExtractor
  }

  PROCESSOR_CLASS_MAPPING = {}
@@ -762,7 +822,7 @@ module Informers
  revision: "main",
  **kwargs
  )
- preprocessor_config = config || Utils::Hub::get_model_json(pretrained_model_name_or_path, "preprocessor_config.json", true,
+ preprocessor_config = config || Utils::Hub.get_model_json(pretrained_model_name_or_path, "preprocessor_config.json", true,
  progress_callback:,
  config:,
  cache_dir:,
@@ -244,6 +244,9 @@ module Informers
  end
  end

+ class SpeechT5Tokenizer < PreTrainedTokenizer
+ end
+
  class AutoTokenizer
  TOKENIZER_CLASS_MAPPING = {
  "T5Tokenizer" => T5Tokenizer,
@@ -257,7 +260,8 @@ module Informers
  "CLIPTokenizer" => CLIPTokenizer,
  "GPT2Tokenizer" => GPT2Tokenizer,
  "NllbTokenizer" => NllbTokenizer,
- "M2M100Tokenizer" => M2M100Tokenizer
+ "M2M100Tokenizer" => M2M100Tokenizer,
+ "SpeechT5Tokenizer" => SpeechT5Tokenizer
  }

  def self.from_pretrained(
@@ -296,7 +300,7 @@ module Informers
  def self.load_tokenizer(pretrained_model_name_or_path, **options)
  info = [
  Utils::Hub.get_model_file(pretrained_model_name_or_path, "tokenizer.json", true, **options),
- Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options),
+ Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options)
  ]

  # Override legacy option if `options.legacy` is not null
@@ -0,0 +1,18 @@
+ module Informers
+ module Utils
+ def self.read_audio(input, sampling_rate)
+ data =
+ if input.is_a?(URI)
+ require "open-uri"
+
+ input.read
+ elsif input.is_a?(String)
+ File.binread(input)
+ else
+ raise ArgumentError, "Unsupported input type: #{input.class.name}"
+ end
+
+ ffmpeg_read(data, sampling_rate)
+ end
+ end
+ end
@@ -0,0 +1,45 @@
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ module Informers
+ module Utils
+ # from the Transformers Python library
+ def self.ffmpeg_read(data, sampling_rate)
+ ar = "#{sampling_rate}"
+ ac = "1"
+ format_for_conversion = "f32le"
+ ffmpeg_command = [
+ "ffmpeg",
+ "-i",
+ "pipe:0",
+ "-ac",
+ ac,
+ "-ar",
+ ar,
+ "-f",
+ format_for_conversion,
+ "-hide_banner",
+ "-loglevel",
+ "quiet",
+ "pipe:1"
+ ]
+
+ stdout, status = Open3.capture2(*ffmpeg_command, stdin_data: data)
+ if !status.success?
+ raise Error, "ffmpeg was not found but is required to load audio files from filename"
+ end
+ stdout.unpack("e*")
+ end
+ end
+ end
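The two new utility files work as a pair: `Utils.read_audio` turns a local path or URI into raw bytes, and `Utils.ffmpeg_read` pipes those bytes through ffmpeg to decode them into mono 32-bit float samples at the requested rate. A minimal standalone sketch (the file name is a placeholder; ffmpeg must be on the PATH):

```ruby
require "informers"

# Decode and resample to 16 kHz mono; returns a flat Array of Float samples
samples = Informers::Utils.read_audio("audio.wav", 16000)
puts samples.length
```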
@@ -7,7 +7,7 @@ module Informers
  2 => "bilinear",
  3 => "bicubic",
  4 => "box",
- 5 => "hamming",
+ 5 => "hamming"
  }

  attr_reader :image, :width, :height, :channels
@@ -14,8 +14,8 @@ module Informers
  out_img = Array.new(out_height * out_width * in_channels)

  # Pre-calculate strides
- in_stride = in_height * in_width;
- out_stride = out_height * out_width;
+ in_stride = in_height * in_width
+ out_stride = out_height * out_width

  out_height.times do |i|
  out_width.times do |j|
@@ -63,7 +63,7 @@ module Informers

  def self.reshape(arr, dims)
  arr = arr.flatten
- dims[1..-1].reverse.each do |dim|
+ dims[1..-1].reverse_each do |dim|
  arr = arr.each_slice(dim)
  end
  arr.to_a
@@ -1,3 +1,3 @@
  module Informers
- VERSION = "1.1.0"
+ VERSION = "1.1.1"
  end
data/lib/informers.rb CHANGED
@@ -6,12 +6,15 @@ require "tokenizers"
6
6
  require "io/console"
7
7
  require "json"
8
8
  require "open-uri"
9
+ require "open3"
9
10
  require "stringio"
10
11
  require "uri"
11
12
 
12
13
  # modules
14
+ require_relative "informers/utils/audio"
13
15
  require_relative "informers/utils/core"
14
16
  require_relative "informers/utils/generation"
17
+ require_relative "informers/utils/ffmpeg"
15
18
  require_relative "informers/utils/hub"
16
19
  require_relative "informers/utils/image"
17
20
  require_relative "informers/utils/math"
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: informers
  version: !ruby/object:Gem::Version
- version: 1.1.0
+ version: 1.1.1
  platform: ruby
  authors:
  - Andrew Kane
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2024-09-17 00:00:00.000000000 Z
+ date: 2024-10-15 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: onnxruntime
@@ -55,7 +55,9 @@ files:
  - lib/informers/pipelines.rb
  - lib/informers/processors.rb
  - lib/informers/tokenizers.rb
+ - lib/informers/utils/audio.rb
  - lib/informers/utils/core.rb
+ - lib/informers/utils/ffmpeg.rb
  - lib/informers/utils/generation.rb
  - lib/informers/utils/hub.rb
  - lib/informers/utils/image.rb