informers 1.1.0 → 1.1.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: ab4f19adb4d6ca0289784cee6c6cb5235b73a5184abffbeaf44391768be1f0ac
- data.tar.gz: '0880ce4dced5ce47ceaaa5fee8d10e6324b3fc0a23e05c3da3728414dcc273d9'
+ metadata.gz: a61f01755798e81a975641d60e5bfe09484ced7ce6a3453020c9978dc35b1942
+ data.tar.gz: 811f9c1dc4499ae7de8ebf8e02c0c4e98a0c0bc0af6aaca51025e42ba8165540
  SHA512:
- metadata.gz: eb3ee6d16e4e20eca6fae3fae8f97d78ba6bb655d48e2012640d64538785e2a9ff2afb10269cf01db928553438e8fbd08584774ba3f3d08bc25f36cbb971a99a
- data.tar.gz: '0008441293f2605ec8599135d715093053e21f67f56ba59b730a3bc1f46f04f4a7fabb7fef039f156cd4183011c93b7fc9cab6ba731bf78627244bc4dedcf18d'
+ metadata.gz: 97b27363fab1e43895e368dbddc819fd4db23d42ce517359e5971347cd902b654f0c66700f07b36cd5f476bd3ea205a91e4f7e7ee0e7d8d455f0dce377bedb2b
+ data.tar.gz: dd1a7f795609423419ce213b00a5aca409f6b4a5bffb111250b4deffcbc6a8113fadf8d603c59fa78fa0f310904a0a3299e3bcdc48101f574171a024d13567e6
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
+ ## 1.1.1 (2024-10-14)
+
+ - Added `audio-classification` pipeline
+ - Fixed error with `sentence-transformers/all-MiniLM-L6-v2`
+
  ## 1.1.0 (2024-09-17)

  - Added more pipelines
data/README.md CHANGED
@@ -229,19 +229,13 @@ result = model.(query, docs)

  ### Other

- You can use the feature extraction pipeline directly.
-
- ```ruby
- model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quantized: false)
- embeddings = model.(sentences, pooling: "mean", normalize: true)
- ```
-
  The model must include a `.onnx` file ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)). If the file is not at `onnx/model.onnx` or `onnx/model_quantized.onnx`, use the `model_file_name` option to specify the location.

  ## Pipelines

  - [Text](#text)
  - [Vision](#vision)
+ - [Audio](#audio)
  - [Multimodal](#multimodal)

  ### Text
@@ -332,6 +326,8 @@ extractor.("We are very happy to show you the 🤗 Transformers library.")

  ### Vision

+ Note: [ruby-vips](https://github.com/libvips/ruby-vips) is required to load images
+
  Image classification

  ```ruby
@@ -388,6 +384,17 @@ extractor = Informers.pipeline("image-feature-extraction")
  extractor.("image.jpg")
  ```

+ ### Audio
+
+ Note: [ffmpeg](https://www.ffmpeg.org/) is required to load audio files
+
+ Audio classification
+
+ ```ruby
+ classifier = Informers.pipeline("audio-classification")
+ classifier.("audio.wav")
+ ```
+
  ### Multimodal

  Image captioning
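The README's new snippet maps directly onto the `AudioClassificationPipeline` added below, which also accepts a `top_k` option. A minimal usage sketch, assuming a local `audio.wav` file and the default `Xenova/wav2vec2-base-superb-ks` checkpoint:

```ruby
require "informers"

# "audio-classification" resolves to Xenova/wav2vec2-base-superb-ks by default
classifier = Informers.pipeline("audio-classification")

# top_k limits how many { label:, score: } hashes come back per input
classifier.("audio.wav", top_k: 2)
```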
data/lib/informers/models.rb CHANGED
@@ -84,6 +84,7 @@ module Informers
  @get_start_beams = method(:decoder_start_beams)
  @update_beam = method(:decoder_update_beam)
  @forward = method(:decoder_forward)
+
  when MODEL_TYPES[:Seq2Seq], MODEL_TYPES[:Vision2Seq]
  @can_generate = true

@@ -91,8 +92,10 @@ module Informers
  @get_start_beams = method(:seq2seq_start_beams)
  @update_beam = method(:seq2seq_update_beam)
  @forward = method(:seq2seq_forward)
+
  when MODEL_TYPES[:EncoderDecoder]
- raise Todo
+ @forward = method(:encoder_forward)
+
  else
  @forward = method(:encoder_forward)
  end
@@ -137,10 +140,18 @@ module Informers
  ]

  elsif model_type == MODEL_TYPES[:MaskGeneration]
- raise Todo
+ info = [
+ AutoConfig.from_pretrained(pretrained_model_name_or_path, **options),
+ construct_session(pretrained_model_name_or_path, "vision_encoder", **options),
+ construct_session(pretrained_model_name_or_path, "prompt_encoder_mask_decoder", **options)
+ ]

  elsif model_type == MODEL_TYPES[:EncoderDecoder]
- raise Todo
+ info = [
+ AutoConfig.from_pretrained(pretrained_model_name_or_path, **options),
+ construct_session(pretrained_model_name_or_path, "encoder_model", **options),
+ construct_session(pretrained_model_name_or_path, "decoder_model_merged", **options)
+ ]

  else
  if model_type != MODEL_TYPES[:EncoderOnly]
@@ -293,13 +304,13 @@ module Informers
  grouped_beams = group_beams(beams)

  get_flattened = lambda do |key|
- grouped_beams.map do |batch|
+ grouped_beams.flat_map do |batch|
  if generation_config["num_return_sequences"] > 1
  raise Todo
  else
  [batch[0][key]]
  end
- end.flatten(1)
+ end
  end

  sequences = get_flattened.(:output_token_ids) # [1, seqLength]
@@ -904,6 +915,18 @@ module Informers
  end
  end

+ class Wav2Vec2PreTrainedModel < PreTrainedModel
+ end
+
+ class Wav2Vec2Model < Wav2Vec2PreTrainedModel
+ end
+
+ class Wav2Vec2ForSequenceClassification < Wav2Vec2PreTrainedModel
+ def call(model_inputs)
+ SequenceClassifierOutput.new(*super(model_inputs))
+ end
+ end
+
  class RobertaPreTrainedModel < PreTrainedModel
  end

@@ -1066,6 +1089,62 @@ module Informers
  class DonutSwinModel < DonutSwinPreTrainedModel
  end

+ class WhisperPreTrainedModel < PreTrainedModel
+ end
+
+ class WhisperModel < WhisperPreTrainedModel
+ end
+
+ class WhisperForConditionalGeneration < WhisperPreTrainedModel
+ REQUIRES_ATTENTION_MASK = false
+ MAIN_INPUT_NAME = :input_features
+
+ def initialize(config, session, decoder_merged_session, generation_config)
+ super(config, session)
+ @decoder_merged_session = decoder_merged_session
+ @generation_config = generation_config
+
+ @num_decoder_layers = @config["decoder_layers"]
+ @num_decoder_heads = @config["decoder_attention_heads"]
+ @decoder_dim_kv = @config["d_model"] / @num_decoder_heads.to_f
+
+ @num_encoder_layers = @config["encoder_layers"]
+ @num_encoder_heads = @config["encoder_attention_heads"]
+ @encoder_dim_kv = @config["d_model"] / @num_encoder_heads.to_f
+ end
+
+ def generate(inputs, generation_config = nil, logits_processor = nil)
+ raise Todo
+ end
+ end
+
+ class VitsPreTrainedModel < PreTrainedModel
+ end
+
+ class VitsModel < VitsPreTrainedModel
+ def call(model_inputs)
+ VitsModelOutput.new(*super(model_inputs))
+ end
+ end
+
+ class SpeechT5PreTrainedModel < PreTrainedModel
+ end
+
+ class SpeechT5Model < SpeechT5PreTrainedModel
+ end
+
+ class SpeechT5ForSpeechToText < SpeechT5PreTrainedModel
+ end
+
+ class SpeechT5ForTextToSpeech < SpeechT5PreTrainedModel
+ end
+
+ class ClapPreTrainedModel < PreTrainedModel
+ end
+
+ class ClapModel < ClapPreTrainedModel
+ end
+
  MODEL_MAPPING_NAMES_ENCODER_ONLY = {
  "bert" => ["BertModel", BertModel],
  "nomic_bert" => ["NomicBertModel", NomicBertModel],
@@ -1074,6 +1153,7 @@ module Informers
  "distilbert" => ["DistilBertModel", DistilBertModel],
  "roberta" => ["RobertaModel", RobertaModel],
  "xlm-roberta" => ["XLMRobertaModel", XLMRobertaModel],
+ "clap" => ["ClapModel", ClapModel],
  "clip" => ["CLIPModel", CLIPModel],
  "detr" => ["DetrModel", DetrModel],
  "vit" => ["ViTModel", ViTModel],
@@ -1085,6 +1165,21 @@ module Informers
  "bart" => ["BartModel", BartModel]
  }

+ MODEL_MAPPING_NAMES_DECODER_ONLY = {
+ }
+
+ MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = {
+ "whisper" => ["WhisperForConditionalGeneration", WhisperForConditionalGeneration]
+ }
+
+ MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = {
+ "speecht5" => ["SpeechT5ForTextToSpeech", SpeechT5ForTextToSpeech]
+ }
+
+ MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = {
+ "vits" => ["VitsModel", VitsModel]
+ }
+
  MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = {
  "bert" => ["BertForSequenceClassification", BertForSequenceClassification],
  "distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification],
@@ -1143,6 +1238,25 @@ module Informers
  MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = {
  }

+ MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = {
+ }
+
+ MODEL_FOR_CTC_MAPPING_NAMES = {
+ }
+
+ MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = {
+ "wav2vec2" => ["Wav2Vec2ForSequenceClassification", Wav2Vec2ForSequenceClassification]
+ }
+
+ MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = {
+ }
+
+ MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = {
+ }
+
+ MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = {
+ }
+
  MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = {
  "swin2sr" => ["Swin2SRForImageSuperResolution", Swin2SRForImageSuperResolution]
  }
@@ -1157,9 +1271,11 @@ module Informers
  MODEL_CLASS_TYPE_MAPPING = [
  [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES[:EncoderOnly]],
  [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES[:EncoderDecoder]],
+ [MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES[:DecoderOnly]],
  [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
+ [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
  [MODEL_WITH_LM_HEAD_MAPPING_NAMES, MODEL_TYPES[:DecoderOnly]],
  [MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
@@ -1167,10 +1283,18 @@ module Informers
  [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+ [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+ [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_TYPES[:MaskGeneration]],
+ [MODEL_FOR_CTC_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+ [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+ [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
+ [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+ [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+ [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]]
  ]

@@ -1199,6 +1323,18 @@ module Informers
  MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES]
  end

+ class AutoModelForSpeechSeq2Seq < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES]
+ end
+
+ class AutoModelForTextToSpectrogram < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES]
+ end
+
+ class AutoModelForTextToWaveform < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES]
+ end
+
  class AutoModelForCausalLM < PretrainedMixin
  MODEL_CLASS_MAPPINGS = [MODEL_WITH_LM_HEAD_MAPPING_NAMES]
  end
@@ -1235,10 +1371,34 @@ module Informers
  MODEL_CLASS_MAPPINGS = [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES]
  end

+ class AutoModelForMaskGeneration < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES]
+ end
+
+ class AutoModelForCTC < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_CTC_MAPPING_NAMES]
+ end
+
+ class AutoModelForAudioClassification < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES]
+ end
+
+ class AutoModelForXVector < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES]
+ end
+
+ class AutoModelForAudioFrameClassification < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES]
+ end
+
  class AutoModelForDocumentQuestionAnswering < PretrainedMixin
  MODEL_CLASS_MAPPINGS = [MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES]
  end

+ class AutoModelForImageMatting < PretrainedMixin
+ MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES]
+ end
+
  class AutoModelForImageToImage < PretrainedMixin
  MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES]
  end
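These new `AutoModel*` classes follow the gem's existing `PretrainedMixin` pattern. A hypothetical sketch of resolving a checkpoint through the audio-classification mapping (assumes the same `from_pretrained` interface the other auto classes use; not verified against this release):

```ruby
require "informers"

# The checkpoint's model_type ("wav2vec2") is looked up in
# MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, yielding a
# Wav2Vec2ForSequenceClassification instance
model = Informers::AutoModelForAudioClassification.from_pretrained(
  "Xenova/wav2vec2-base-superb-ks"
)
```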
data/lib/informers/pipelines.rb CHANGED
@@ -19,6 +19,20 @@ module Informers
  images.map { |x| Utils::RawImage.read(x) }
  end

+ def prepare_audios(audios, sampling_rate)
+ if !audios.is_a?(Array)
+ audios = [audios]
+ end
+
+ audios.map do |x|
+ if x.is_a?(String) || x.is_a?(URI)
+ Utils.read_audio(x, sampling_rate)
+ else
+ x
+ end
+ end
+ end
+
  def get_bounding_box(box, as_integer)
  if as_integer
  box = box.map { |x| x.to_i }
@@ -729,7 +743,7 @@ module Informers
  {
  label: candidate_labels[processed[:classes][i]],
  score: processed[:scores][i],
- box: get_bounding_box(box, !percentage),
+ box: get_bounding_box(box, !percentage)
  }
  end
  result.sort_by! { |v| -v[:score] }
@@ -784,6 +798,26 @@ module Informers
  end
  end

+ class TextToAudioPipeline < Pipeline
+ DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan"
+
+ def initialize(**options)
+ super(**options)
+
+ # TODO: Find a better way for `pipeline` to set the default vocoder
+ @vocoder = options[:vocoder]
+ end
+
+ def call(text_inputs, speaker_embeddings: nil)
+ # If @processor is not set, we are using an `AutoModelForTextToWaveform` model
+ if @processor
+ call_text_to_spectrogram(text_inputs, speaker_embeddings:)
+ else
+ call_text_to_waveform(text_inputs)
+ end
+ end
+ end
+
  class FeatureExtractionPipeline < Pipeline
  def call(
  texts,
@@ -803,7 +837,7 @@ module Informers
  if !model_output.nil?
  model_options[:output_names] = Array(model_output)
  elsif @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
- # optimization for sentence-transformers/all-MiniLM-L6-v2
+ # optimization for previous revision of sentence-transformers/all-MiniLM-L6-v2
  model_options[:output_names] = ["sentence_embedding"]
  pooling = "none"
  normalize = false
@@ -858,11 +892,106 @@ module Informers
  end
  end

+ class AudioClassificationPipeline < Pipeline
+ def call(audio, top_k: nil)
+ single = !audio.is_a?(Array)
+
+ sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+ prepared_audios = prepare_audios(audio, sampling_rate)
+
+ id2label = @model.config[:id2label]
+
+ to_return = []
+ prepared_audios.each do |aud|
+ inputs = @processor.(aud)
+ output = @model.(inputs)
+ logits = output.logits[0]
+
+ scores = Utils.get_top_items(Utils.softmax(logits), top_k)
+
+ vals =
+ scores.map do |x|
+ {
+ label: id2label[x[0].to_s],
+ score: x[1]
+ }
+ end
+
+ if top_k == 1
+ to_return.concat(vals)
+ else
+ to_return << vals
+ end
+ end
+ !single || top_k == 1 ? to_return : to_return[0]
+ end
+ end
+
+ class ZeroShotAudioClassificationPipeline < Pipeline
+ def call(audio, candidate_labels, hypothesis_template: "This is a sound of {}.")
+ single = !audio.is_a?(Array)
+ if single
+ audio = [audio]
+ end
+
+ # Insert label into hypothesis template
+ texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+ # Run tokenization
+ text_inputs =
+ @tokenizer.(
+ texts,
+ padding: true,
+ truncation: true
+ )
+
+ sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+ prepared_audios = prepare_audios(audio, sampling_rate)
+
+ to_return = []
+ prepared_audios.each do |aud|
+ audio_inputs = @processor.(aud)
+
+ # Run model with both text and audio inputs
+ output = @model.(text_inputs.merge(audio_inputs))
+
+ # Compute softmax per audio
+ probs = Utils.softmax(output.logits_per_audio.data)
+
+ to_return <<
+ probs.map.with_index do |x, i|
+ {
+ label: candidate_labels[i],
+ score: x
+ }
+ end
+ end
+ single ? to_return[0] : to_return
+ end
+ end
+
+ class AutomaticSpeechRecognitionPipeline < Pipeline
+ def call(audio, **kwargs)
+ case @model.config["model_type"]
+ when "whisper"
+ call_whisper(audio, **kwargs)
+ else
+ raise Error, "AutomaticSpeechRecognitionPipeline does not support model type '#{@model.config["model_type"]}'."
+ end
+ end
+
+ private
+
+ def call_whisper(audio, **kwargs)
+ raise Todo
+ end
+ end
+
  class ImageToImagePipeline < Pipeline
  def call(images)
  prepared_images = prepare_images(images)
  inputs = @processor.(prepared_images)
- outputs = @model.(inputs);
+ outputs = @model.(inputs)

  to_return = []
  outputs[0].each do |batch|
@@ -1033,6 +1162,47 @@ module Informers
  },
  type: "text"
  },
+ "audio-classification" => {
+ pipeline: AudioClassificationPipeline,
+ model: AutoModelForAudioClassification,
+ processor: AutoProcessor,
+ default: {
+ model: "Xenova/wav2vec2-base-superb-ks"
+ },
+ type: "audio"
+ },
+ # TODO
+ # "zero-shot-audio-classification" => {
+ # tokenizer: AutoTokenizer,
+ # pipeline: ZeroShotAudioClassificationPipeline,
+ # model: AutoModel,
+ # processor: AutoProcessor,
+ # default: {
+ # model: "Xenova/clap-htsat-unfused"
+ # },
+ # type: "multimodal"
+ # },
+ # TODO
+ # "automatic-speech-recognition" => {
+ # tokenizer: AutoTokenizer,
+ # pipeline: AutomaticSpeechRecognitionPipeline,
+ # model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
+ # processor: AutoProcessor,
+ # default: {
+ # model: "Xenova/whisper-tiny.en"
+ # },
+ # type: "multimodal"
+ # },
+ "text-to-audio" => {
+ tokenizer: AutoTokenizer,
+ pipeline: TextToAudioPipeline,
+ model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
+ processor: [AutoProcessor, nil],
+ default: {
+ model: "Xenova/speecht5_tts"
+ },
+ type: "text"
+ },
  "image-to-text" => {
  tokenizer: AutoTokenizer,
  pipeline: ImageToTextPipeline,
@@ -1048,7 +1218,7 @@ module Informers
  model: AutoModelForImageClassification,
  processor: AutoProcessor,
  default: {
- model: "Xenova/vit-base-patch16-224",
+ model: "Xenova/vit-base-patch16-224"
  },
  type: "multimodal"
  },
@@ -1057,7 +1227,7 @@ module Informers
  model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
  processor: AutoProcessor,
  default: {
- model: "Xenova/detr-resnet-50-panoptic",
+ model: "Xenova/detr-resnet-50-panoptic"
  },
  type: "multimodal"
  },
@@ -1076,7 +1246,7 @@ module Informers
  model: AutoModelForObjectDetection,
  processor: AutoProcessor,
  default: {
- model: "Xenova/detr-resnet-50",
+ model: "Xenova/detr-resnet-50"
  },
  type: "multimodal"
  },
@@ -1158,7 +1328,8 @@ module Informers

  TASK_ALIASES = {
  "sentiment-analysis" => "text-classification",
- "ner" => "token-classification"
+ "ner" => "token-classification",
+ "text-to-speech" => "text-to-audio"
  }

  DEFAULT_PROGRESS_CALLBACK = lambda do |msg|
@@ -1231,7 +1402,8 @@ module Informers
  results = load_items(classes, model, pretrained_options)
  results[:task] = task

- if model == "sentence-transformers/all-MiniLM-L6-v2"
+ # for previous revision of sentence-transformers/all-MiniLM-L6-v2
+ if model == "sentence-transformers/all-MiniLM-L6-v2" && results[:model].instance_variable_get(:@session).outputs.any? { |v| v[:name] == "token_embeddings" }
  results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
  end
data/lib/informers/processors.rb CHANGED
@@ -1,5 +1,7 @@
  module Informers
  class FeatureExtractor
+ attr_reader :config
+
  def initialize(config)
  super()
  @config = config
@@ -728,6 +730,61 @@ module Informers
  end
  end

+ class WhisperFeatureExtractor < FeatureExtractor
+ def initialize(config)
+ super(config)
+
+ raise Todo
+ end
+
+ def _extract_fbank_features(waveform)
+ raise Todo
+ end
+
+ def call(audio)
+ raise Todo
+ end
+ end
+
+ class Wav2Vec2FeatureExtractor < FeatureExtractor
+ def _zero_mean_unit_var_norm(input_values)
+ sum = input_values.sum
+ mean = sum / input_values.length.to_f
+ variance = input_values.sum { |b| (b - mean) ** 2 } / input_values.length.to_f
+ input_values.map { |x| (x - mean) / Math.sqrt(variance + 1e-7) }
+ end
+
+ def call(audio)
+ # TODO
+ # validate_audio_inputs(audio, 'Wav2Vec2FeatureExtractor')
+
+ input_values = audio
+
+ # zero-mean and unit-variance normalization
+ if @config["do_normalize"]
+ input_values = _zero_mean_unit_var_norm(input_values)
+ end
+
+ # TODO: allow user to pass in attention mask
+ {
+ input_values: [input_values],
+ attention_mask: [Array.new(input_values.length, 1)]
+ }
+ end
+ end
+
+ class ClapFeatureExtractor < FeatureExtractor
+ def initialize(config)
+ super(config)
+
+ # TODO
+ end
+
+ def call(audio, max_length: nil)
+ raise Todo
+ end
+ end
+
  class Processor
  attr_reader :feature_extractor

@@ -748,7 +805,10 @@ module Informers
  "DPTFeatureExtractor" => DPTFeatureExtractor,
  "DetrFeatureExtractor" => DetrFeatureExtractor,
  "Swin2SRImageProcessor" => Swin2SRImageProcessor,
- "DonutFeatureExtractor" => DonutFeatureExtractor
+ "DonutFeatureExtractor" => DonutFeatureExtractor,
+ "WhisperFeatureExtractor" => WhisperFeatureExtractor,
+ "Wav2Vec2FeatureExtractor" => Wav2Vec2FeatureExtractor,
+ "ClapFeatureExtractor" => ClapFeatureExtractor
  }

  PROCESSOR_CLASS_MAPPING = {}
@@ -762,7 +822,7 @@ module Informers
  revision: "main",
  **kwargs
  )
- preprocessor_config = config || Utils::Hub::get_model_json(pretrained_model_name_or_path, "preprocessor_config.json", true,
+ preprocessor_config = config || Utils::Hub.get_model_json(pretrained_model_name_or_path, "preprocessor_config.json", true,
  progress_callback:,
  config:,
  cache_dir:,
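For intuition, `_zero_mean_unit_var_norm` above is plain per-utterance standardization. A standalone sketch with made-up sample values:

```ruby
# Made-up waveform samples, for illustration only
samples = [0.5, -0.25, 0.0, 0.75]

mean = samples.sum / samples.length.to_f
variance = samples.sum { |s| (s - mean)**2 } / samples.length.to_f

# The 1e-7 keeps the division stable when the input is silence (zero variance)
normalized = samples.map { |s| (s - mean) / Math.sqrt(variance + 1e-7) }
# normalized now has mean ~0 and variance ~1
```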
data/lib/informers/tokenizers.rb CHANGED
@@ -244,6 +244,9 @@ module Informers
  end
  end

+ class SpeechT5Tokenizer < PreTrainedTokenizer
+ end
+
  class AutoTokenizer
  TOKENIZER_CLASS_MAPPING = {
  "T5Tokenizer" => T5Tokenizer,
@@ -257,7 +260,8 @@ module Informers
  "CLIPTokenizer" => CLIPTokenizer,
  "GPT2Tokenizer" => GPT2Tokenizer,
  "NllbTokenizer" => NllbTokenizer,
- "M2M100Tokenizer" => M2M100Tokenizer
+ "M2M100Tokenizer" => M2M100Tokenizer,
+ "SpeechT5Tokenizer" => SpeechT5Tokenizer
  }

  def self.from_pretrained(
@@ -296,7 +300,7 @@ module Informers
  def self.load_tokenizer(pretrained_model_name_or_path, **options)
  info = [
  Utils::Hub.get_model_file(pretrained_model_name_or_path, "tokenizer.json", true, **options),
- Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options),
+ Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options)
  ]

  # Override legacy option if `options.legacy` is not null
data/lib/informers/utils/audio.rb ADDED
@@ -0,0 +1,18 @@
+ module Informers
+ module Utils
+ def self.read_audio(input, sampling_rate)
+ data =
+ if input.is_a?(URI)
+ require "open-uri"
+
+ input.read
+ elsif input.is_a?(String)
+ File.binread(input)
+ else
+ raise ArgumentError, "Unsupported input type: #{input.class.name}"
+ end
+
+ ffmpeg_read(data, sampling_rate)
+ end
+ end
+ end
data/lib/informers/utils/ffmpeg.rb ADDED
@@ -0,0 +1,45 @@
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ module Informers
+ module Utils
+ # from the Transformers Python library
+ def self.ffmpeg_read(data, sampling_rate)
+ ar = "#{sampling_rate}"
+ ac = "1"
+ format_for_conversion = "f32le"
+ ffmpeg_command = [
+ "ffmpeg",
+ "-i",
+ "pipe:0",
+ "-ac",
+ ac,
+ "-ar",
+ ar,
+ "-f",
+ format_for_conversion,
+ "-hide_banner",
+ "-loglevel",
+ "quiet",
+ "pipe:1"
+ ]
+
+ stdout, status = Open3.capture2(*ffmpeg_command, stdin_data: data)
+ if !status.success?
+ raise Error, "ffmpeg was not found but is required to load audio files from filename"
+ end
+ stdout.unpack("e*")
+ end
+ end
+ end
data/lib/informers/utils/image.rb CHANGED
@@ -7,7 +7,7 @@ module Informers
  2 => "bilinear",
  3 => "bicubic",
  4 => "box",
- 5 => "hamming",
+ 5 => "hamming"
  }

  attr_reader :image, :width, :height, :channels
data/lib/informers/utils/math.rb CHANGED
@@ -14,8 +14,8 @@ module Informers
  out_img = Array.new(out_height * out_width * in_channels)

  # Pre-calculate strides
- in_stride = in_height * in_width;
- out_stride = out_height * out_width;
+ in_stride = in_height * in_width
+ out_stride = out_height * out_width

  out_height.times do |i|
  out_width.times do |j|
@@ -63,7 +63,7 @@ module Informers

  def self.reshape(arr, dims)
  arr = arr.flatten
- dims[1..-1].reverse.each do |dim|
+ dims[1..-1].reverse_each do |dim|
  arr = arr.each_slice(dim)
  end
  arr.to_a
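The `reverse_each` change is a small allocation tweak (no intermediate reversed array); behavior is unchanged. For reference, `reshape` nests a flat array by slicing from the innermost dimension outward:

```ruby
# Sketch of Informers::Utils.reshape semantics:
# dims [2, 3] slices the flat array into rows of 3
Informers::Utils.reshape([1, 2, 3, 4, 5, 6], [2, 3])
# => [[1, 2, 3], [4, 5, 6]]
```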
data/lib/informers/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Informers
- VERSION = "1.1.0"
+ VERSION = "1.1.1"
  end
data/lib/informers.rb CHANGED
@@ -6,12 +6,15 @@ require "tokenizers"
  require "io/console"
  require "json"
  require "open-uri"
+ require "open3"
  require "stringio"
  require "uri"

  # modules
+ require_relative "informers/utils/audio"
  require_relative "informers/utils/core"
  require_relative "informers/utils/generation"
+ require_relative "informers/utils/ffmpeg"
  require_relative "informers/utils/hub"
  require_relative "informers/utils/image"
  require_relative "informers/utils/math"
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: informers
  version: !ruby/object:Gem::Version
- version: 1.1.0
+ version: 1.1.1
  platform: ruby
  authors:
  - Andrew Kane
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2024-09-17 00:00:00.000000000 Z
+ date: 2024-10-15 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: onnxruntime
@@ -55,7 +55,9 @@ files:
  - lib/informers/pipelines.rb
  - lib/informers/processors.rb
  - lib/informers/tokenizers.rb
+ - lib/informers/utils/audio.rb
  - lib/informers/utils/core.rb
+ - lib/informers/utils/ffmpeg.rb
  - lib/informers/utils/generation.rb
  - lib/informers/utils/hub.rb
  - lib/informers/utils/image.rb