informers 1.1.0 → 1.1.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +14 -7
- data/lib/informers/models.rb +165 -5
- data/lib/informers/pipelines.rb +180 -8
- data/lib/informers/processors.rb +62 -2
- data/lib/informers/tokenizers.rb +6 -2
- data/lib/informers/utils/audio.rb +18 -0
- data/lib/informers/utils/ffmpeg.rb +45 -0
- data/lib/informers/utils/image.rb +1 -1
- data/lib/informers/utils/math.rb +2 -2
- data/lib/informers/utils/tensor.rb +1 -1
- data/lib/informers/version.rb +1 -1
- data/lib/informers.rb +3 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a61f01755798e81a975641d60e5bfe09484ced7ce6a3453020c9978dc35b1942
+  data.tar.gz: 811f9c1dc4499ae7de8ebf8e02c0c4e98a0c0bc0af6aaca51025e42ba8165540
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 97b27363fab1e43895e368dbddc819fd4db23d42ce517359e5971347cd902b654f0c66700f07b36cd5f476bd3ea205a91e4f7e7ee0e7d8d455f0dce377bedb2b
+  data.tar.gz: dd1a7f795609423419ce213b00a5aca409f6b4a5bffb111250b4deffcbc6a8113fadf8d603c59fa78fa0f310904a0a3299e3bcdc48101f574171a024d13567e6
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -229,19 +229,13 @@ result = model.(query, docs)
 
 ### Other
 
-You can use the feature extraction pipeline directly.
-
-```ruby
-model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quantized: false)
-embeddings = model.(sentences, pooling: "mean", normalize: true)
-```
-
 The model must include a `.onnx` file ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)). If the file is not at `onnx/model.onnx` or `onnx/model_quantized.onnx`, use the `model_file_name` option to specify the location.
 
 ## Pipelines
 
 - [Text](#text)
 - [Vision](#vision)
+- [Audio](#audio)
 - [Multimodal](#multimodal)
 
 ### Text

@@ -332,6 +326,8 @@ extractor.("We are very happy to show you the 🤗 Transformers library.")
 
 ### Vision
 
+Note: [ruby-vips](https://github.com/libvips/ruby-vips) is required to load images
+
 Image classification
 
 ```ruby

@@ -388,6 +384,17 @@ extractor = Informers.pipeline("image-feature-extraction")
 extractor.("image.jpg")
 ```
 
+### Audio
+
+Note: [ffmpeg](https://www.ffmpeg.org/) is required to load audio files
+
+Audio classification
+
+```ruby
+classifier = Informers.pipeline("audio-classification")
+classifier.("audio.wav")
+```
+
 ### Multimodal
 
 Image captioning
data/lib/informers/models.rb
CHANGED
@@ -84,6 +84,7 @@ module Informers
         @get_start_beams = method(:decoder_start_beams)
         @update_beam = method(:decoder_update_beam)
         @forward = method(:decoder_forward)
+
       when MODEL_TYPES[:Seq2Seq], MODEL_TYPES[:Vision2Seq]
         @can_generate = true
 

@@ -91,8 +92,10 @@ module Informers
         @get_start_beams = method(:seq2seq_start_beams)
         @update_beam = method(:seq2seq_update_beam)
         @forward = method(:seq2seq_forward)
+
       when MODEL_TYPES[:EncoderDecoder]
-
+        @forward = method(:encoder_forward)
+
       else
         @forward = method(:encoder_forward)
       end

@@ -137,10 +140,18 @@ module Informers
        ]
 
      elsif model_type == MODEL_TYPES[:MaskGeneration]
-
+        info = [
+          AutoConfig.from_pretrained(pretrained_model_name_or_path, **options),
+          construct_session(pretrained_model_name_or_path, "vision_encoder", **options),
+          construct_session(pretrained_model_name_or_path, "prompt_encoder_mask_decoder", **options)
+        ]
 
      elsif model_type == MODEL_TYPES[:EncoderDecoder]
-
+        info = [
+          AutoConfig.from_pretrained(pretrained_model_name_or_path, **options),
+          construct_session(pretrained_model_name_or_path, "encoder_model", **options),
+          construct_session(pretrained_model_name_or_path, "decoder_model_merged", **options)
+        ]
 
      else
        if model_type != MODEL_TYPES[:EncoderOnly]

@@ -293,13 +304,13 @@ module Informers
      grouped_beams = group_beams(beams)
 
      get_flattened = lambda do |key|
-        grouped_beams.
+        grouped_beams.flat_map do |batch|
          if generation_config["num_return_sequences"] > 1
            raise Todo
          else
            [batch[0][key]]
          end
-        end
+        end
      end
 
      sequences = get_flattened.(:output_token_ids) # [1, seqLength]

@@ -904,6 +915,18 @@ module Informers
     end
   end
 
+  class Wav2Vec2PreTrainedModel < PreTrainedModel
+  end
+
+  class Wav2Vec2Model < Wav2Vec2PreTrainedModel
+  end
+
+  class Wav2Vec2ForSequenceClassification < Wav2Vec2PreTrainedModel
+    def call(model_inputs)
+      SequenceClassifierOutput.new(*super(model_inputs))
+    end
+  end
+
   class RobertaPreTrainedModel < PreTrainedModel
   end
 

@@ -1066,6 +1089,62 @@ module Informers
   class DonutSwinModel < DonutSwinPreTrainedModel
   end
 
+  class WhisperPreTrainedModel < PreTrainedModel
+  end
+
+  class WhisperModel < WhisperPreTrainedModel
+  end
+
+  class WhisperForConditionalGeneration < WhisperPreTrainedModel
+    REQUIRES_ATTENTION_MASK = false
+    MAIN_INPUT_NAME = :input_features
+
+    def initialize(config, session, decoder_merged_session, generation_config)
+      super(config, session)
+      @decoder_merged_session = decoder_merged_session
+      @generation_config = generation_config
+
+      @num_decoder_layers = @config["decoder_layers"]
+      @num_decoder_heads = @config["decoder_attention_heads"]
+      @decoder_dim_kv = @config["d_model"] / @num_decoder_heads.to_f
+
+      @num_encoder_layers = @config["encoder_layers"]
+      @num_encoder_heads = @config["encoder_attention_heads"]
+      @encoder_dim_kv = @config["d_model"] / @num_encoder_heads.to_f
+    end
+
+    def generate(inputs, generation_config = nil, logits_processor = nil)
+      raise Todo
+    end
+  end
+
+  class VitsPreTrainedModel < PreTrainedModel
+  end
+
+  class VitsModel < VitsPreTrainedModel
+    def call(model_inputs)
+      VitsModelOutput.new(*super(model_inputs))
+    end
+  end
+
+  class SpeechT5PreTrainedModel < PreTrainedModel
+  end
+
+  class SpeechT5Model < SpeechT5PreTrainedModel
+  end
+
+  class SpeechT5ForSpeechToText < SpeechT5PreTrainedModel
+  end
+
+  class SpeechT5ForTextToSpeech < SpeechT5PreTrainedModel
+  end
+
+  class ClapPreTrainedModel < PreTrainedModel
+  end
+
+  class ClapModel < ClapPreTrainedModel
+  end
+
   MODEL_MAPPING_NAMES_ENCODER_ONLY = {
     "bert" => ["BertModel", BertModel],
     "nomic_bert" => ["NomicBertModel", NomicBertModel],

@@ -1074,6 +1153,7 @@ module Informers
     "distilbert" => ["DistilBertModel", DistilBertModel],
     "roberta" => ["RobertaModel", RobertaModel],
     "xlm-roberta" => ["XLMRobertaModel", XLMRobertaModel],
+    "clap" => ["ClapModel", ClapModel],
     "clip" => ["CLIPModel", CLIPModel],
     "detr" => ["DetrModel", DetrModel],
     "vit" => ["ViTModel", ViTModel],

@@ -1085,6 +1165,21 @@ module Informers
     "bart" => ["BartModel", BartModel]
   }
 
+  MODEL_MAPPING_NAMES_DECODER_ONLY = {
+  }
+
+  MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = {
+    "whisper" => ["WhisperForConditionalGeneration", WhisperForConditionalGeneration]
+  }
+
+  MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = {
+    "speecht5" => ["SpeechT5ForTextToSpeech", SpeechT5ForTextToSpeech]
+  }
+
+  MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = {
+    "vits" => ["VitsModel", VitsModel]
+  }
+
   MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = {
     "bert" => ["BertForSequenceClassification", BertForSequenceClassification],
     "distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification],

@@ -1143,6 +1238,25 @@ module Informers
   MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = {
   }
 
+  MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = {
+  }
+
+  MODEL_FOR_CTC_MAPPING_NAMES = {
+  }
+
+  MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = {
+    "wav2vec2" => ["Wav2Vec2ForSequenceClassification", Wav2Vec2ForSequenceClassification]
+  }
+
+  MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = {
+  }
+
+  MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = {
+  }
+
+  MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = {
+  }
+
   MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = {
     "swin2sr" => ["Swin2SRForImageSuperResolution", Swin2SRForImageSuperResolution]
   }

@@ -1157,9 +1271,11 @@ module Informers
   MODEL_CLASS_TYPE_MAPPING = [
     [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES[:EncoderOnly]],
     [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES[:EncoderDecoder]],
+    [MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES[:DecoderOnly]],
     [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
+    [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
     [MODEL_WITH_LM_HEAD_MAPPING_NAMES, MODEL_TYPES[:DecoderOnly]],
     [MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],

@@ -1167,10 +1283,18 @@ module Informers
     [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+    [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+    [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_TYPES[:MaskGeneration]],
+    [MODEL_FOR_CTC_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+    [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+    [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
+    [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+    [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+    [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]]
   ]
 

@@ -1199,6 +1323,18 @@ module Informers
     MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES]
   end
 
+  class AutoModelForSpeechSeq2Seq < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES]
+  end
+
+  class AutoModelForTextToSpectrogram < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES]
+  end
+
+  class AutoModelForTextToWaveform < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES]
+  end
+
   class AutoModelForCausalLM < PretrainedMixin
     MODEL_CLASS_MAPPINGS = [MODEL_WITH_LM_HEAD_MAPPING_NAMES]
   end

@@ -1235,10 +1371,34 @@ module Informers
     MODEL_CLASS_MAPPINGS = [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES]
   end
 
+  class AutoModelForMaskGeneration < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES]
+  end
+
+  class AutoModelForCTC < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_CTC_MAPPING_NAMES]
+  end
+
+  class AutoModelForAudioClassification < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES]
+  end
+
+  class AutoModelForXVector < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES]
+  end
+
+  class AutoModelForAudioFrameClassification < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES]
+  end
+
   class AutoModelForDocumentQuestionAnswering < PretrainedMixin
     MODEL_CLASS_MAPPINGS = [MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES]
   end
 
+  class AutoModelForImageMatting < PretrainedMixin
+    MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES]
+  end
+
   class AutoModelForImageToImage < PretrainedMixin
     MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES]
   end
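For orientation, the mapping constants added above drive the `AutoModel*` lookups: a config's `model_type` selects a concrete class. Below is a minimal, hypothetical sketch of that idea with simplified names; the real resolution in the gem goes through `PretrainedMixin` and `MODEL_CLASS_TYPE_MAPPING`, not this code.

```ruby
# Hypothetical, simplified illustration only (names are not the library's).
# A "model_type" string from config.json picks the model class name.
AUDIO_CLASSIFICATION_MAPPING = {
  "wav2vec2" => "Wav2Vec2ForSequenceClassification"
}

config = { "model_type" => "wav2vec2" } # e.g. parsed from config.json
model_class_name = AUDIO_CLASSIFICATION_MAPPING.fetch(config["model_type"]) do
  raise "Unsupported model type: #{config["model_type"]}"
end
puts model_class_name # => Wav2Vec2ForSequenceClassification
```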
data/lib/informers/pipelines.rb
CHANGED
@@ -19,6 +19,20 @@ module Informers
       images.map { |x| Utils::RawImage.read(x) }
     end
 
+    def prepare_audios(audios, sampling_rate)
+      if !audios.is_a?(Array)
+        audios = [audios]
+      end
+
+      audios.map do |x|
+        if x.is_a?(String) || x.is_a?(URI)
+          Utils.read_audio(x, sampling_rate)
+        else
+          x
+        end
+      end
+    end
+
     def get_bounding_box(box, as_integer)
       if as_integer
         box = box.map { |x| x.to_i }

@@ -729,7 +743,7 @@ module Informers
          {
            label: candidate_labels[processed[:classes][i]],
            score: processed[:scores][i],
-            box: get_bounding_box(box, !percentage)
+            box: get_bounding_box(box, !percentage)
          }
        end
      result.sort_by! { |v| -v[:score] }

@@ -784,6 +798,26 @@ module Informers
     end
   end
 
+  class TextToAudioPipeline < Pipeline
+    DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan"
+
+    def initialize(**options)
+      super(**options)
+
+      # TODO: Find a better way for `pipeline` to set the default vocoder
+      @vocoder = options[:vocoder]
+    end
+
+    def call(text_inputs, speaker_embeddings: nil)
+      # If this.processor is not set, we are using a `AutoModelForTextToWaveform` model
+      if @processor
+        call_text_to_spectrogram(text_inputs, speaker_embeddings:)
+      else
+        call_text_to_waveform(text_inputs)
+      end
+    end
+  end
+
   class FeatureExtractionPipeline < Pipeline
     def call(
       texts,

@@ -803,7 +837,7 @@ module Informers
       if !model_output.nil?
         model_options[:output_names] = Array(model_output)
       elsif @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
-        # optimization for sentence-transformers/all-MiniLM-L6-v2
+        # optimization for previous revision of sentence-transformers/all-MiniLM-L6-v2
         model_options[:output_names] = ["sentence_embedding"]
         pooling = "none"
         normalize = false

@@ -858,11 +892,106 @@ module Informers
     end
   end
 
+  class AudioClassificationPipeline < Pipeline
+    def call(audio, top_k: nil)
+      single = !audio.is_a?(Array)
+
+      sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+      prepared_audios = prepare_audios(audio, sampling_rate)
+
+      id2label = @model.config[:id2label]
+
+      to_return = []
+      prepared_audios.each do |aud|
+        inputs = @processor.(aud)
+        output = @model.(inputs)
+        logits = output.logits[0]
+
+        scores = Utils.get_top_items(Utils.softmax(logits), top_k)
+
+        vals =
+          scores.map do |x|
+            {
+              label: id2label[x[0].to_s],
+              score: x[1]
+            }
+          end
+
+        if top_k == 1
+          to_return.concat(vals)
+        else
+          to_return << vals
+        end
+      end
+      !single || top_k == 1 ? to_return : to_return[0]
+    end
+  end
+
+  class ZeroShotAudioClassificationPipeline < Pipeline
+    def call(audio, candidate_labels, hypothesis_template: "This is a sound of {}.")
+      single = !audio.is_a?(Array)
+      if single
+        audio = [audio]
+      end
+
+      # Insert label into hypothesis template
+      texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # Run tokenization
+      text_inputs =
+        @tokenizer.(
+          texts,
+          padding: true,
+          truncation: true
+        )
+
+      sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+      prepared_audios = prepare_audios(audio, sampling_rate)
+
+      to_return = []
+      prepared_audios.each do |aud|
+        audio_inputs = @processor.(aud)
+
+        # Run model with both text and audio inputs
+        output = @model.(text_inputs.merge(audio_inputs))
+
+        # Compute softmax per audio
+        probs = Utils.softmax(output.logits_per_audio.data)
+
+        to_return <<
+          probs.map.with_index do |x, i|
+            {
+              label: candidate_labels[i],
+              score: x
+            }
+          end
+      end
+      single ? to_return[0] : to_return
+    end
+  end
+
+  class AutomaticSpeechRecognitionPipeline < Pipeline
+    def call(audio, **kwargs)
+      case @model.config["model_type"]
+      when "whisper"
+        call_whisper(audio, **kwargs)
+      else
+        raise Error, "AutomaticSpeechRecognitionPipeline does not support model type '#{@model.config["model_type"]}'."
+      end
+    end
+
+    private
+
+    def call_whisper(audio, **kwargs)
+      raise Todo
+    end
+  end
+
   class ImageToImagePipeline < Pipeline
     def call(images)
       prepared_images = prepare_images(images)
       inputs = @processor.(prepared_images)
-      outputs = @model.(inputs)
+      outputs = @model.(inputs)
 
       to_return = []
       outputs[0].each do |batch|

@@ -1033,6 +1162,47 @@ module Informers
      },
      type: "text"
    },
+    "audio-classification" => {
+      pipeline: AudioClassificationPipeline,
+      model: AutoModelForAudioClassification,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/wav2vec2-base-superb-ks"
+      },
+      type: "audio"
+    },
+    # TODO
+    # "zero-shot-audio-classification" => {
+    #   tokenizer: AutoTokenizer,
+    #   pipeline: ZeroShotAudioClassificationPipeline,
+    #   model: AutoModel,
+    #   processor: AutoProcessor,
+    #   default: {
+    #     model: "Xenova/clap-htsat-unfused"
+    #   },
+    #   type: "multimodal"
+    # },
+    # TODO
+    # "automatic-speech-recognition" => {
+    #   tokenizer: AutoTokenizer,
+    #   pipeline: AutomaticSpeechRecognitionPipeline,
+    #   model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
+    #   processor: AutoProcessor,
+    #   default: {
+    #     model: "Xenova/whisper-tiny.en"
+    #   },
+    #   type: "multimodal"
+    # },
+    "text-to-audio" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TextToAudioPipeline,
+      model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
+      processor: [AutoProcessor, nil],
+      default: {
+        model: "Xenova/speecht5_tts"
+      },
+      type: "text"
+    },
    "image-to-text" => {
      tokenizer: AutoTokenizer,
      pipeline: ImageToTextPipeline,

@@ -1048,7 +1218,7 @@ module Informers
      model: AutoModelForImageClassification,
      processor: AutoProcessor,
      default: {
-        model: "Xenova/vit-base-patch16-224"
+        model: "Xenova/vit-base-patch16-224"
      },
      type: "multimodal"
    },

@@ -1057,7 +1227,7 @@ module Informers
      model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
      processor: AutoProcessor,
      default: {
-        model: "Xenova/detr-resnet-50-panoptic"
+        model: "Xenova/detr-resnet-50-panoptic"
      },
      type: "multimodal"
    },

@@ -1076,7 +1246,7 @@ module Informers
      model: AutoModelForObjectDetection,
      processor: AutoProcessor,
      default: {
-        model: "Xenova/detr-resnet-50"
+        model: "Xenova/detr-resnet-50"
      },
      type: "multimodal"
    },

@@ -1158,7 +1328,8 @@ module Informers
 
   TASK_ALIASES = {
     "sentiment-analysis" => "text-classification",
-    "ner" => "token-classification"
+    "ner" => "token-classification",
+    "text-to-speech" => "text-to-audio"
   }
 
   DEFAULT_PROGRESS_CALLBACK = lambda do |msg|

@@ -1231,7 +1402,8 @@ module Informers
     results = load_items(classes, model, pretrained_options)
     results[:task] = task
 
-
+    # for previous revision of sentence-transformers/all-MiniLM-L6-v2
+    if model == "sentence-transformers/all-MiniLM-L6-v2" && results[:model].instance_variable_get(:@session).outputs.any? { |v| v[:name] == "token_embeddings" }
       results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
     end
 
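For context, here is a hedged usage sketch of the new `audio-classification` task registered above. The default model (`Xenova/wav2vec2-base-superb-ks`) and the `{label:, score:}` result shape come from the diff; the actual labels depend on the model, and the audio file name is just a placeholder.

```ruby
require "informers"

# Uses the task default from the registry above ("Xenova/wav2vec2-base-superb-ks").
# ffmpeg must be installed so the wav file can be decoded.
classifier = Informers.pipeline("audio-classification")
results = classifier.("audio.wav", top_k: 2)

# Each entry is {label: ..., score: ...}, as built in AudioClassificationPipeline#call
results.each { |r| puts "#{r[:label]}: #{r[:score].round(3)}" }
```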
data/lib/informers/processors.rb
CHANGED
@@ -1,5 +1,7 @@
 module Informers
   class FeatureExtractor
+    attr_reader :config
+
     def initialize(config)
       super()
       @config = config

@@ -728,6 +730,61 @@ module Informers
     end
   end
 
+  class WhisperFeatureExtractor < FeatureExtractor
+    def initialize(config)
+      super(config)
+
+      raise Todo
+    end
+
+    def _extract_fbank_features(waveform)
+      raise Todo
+    end
+
+    def call(audio)
+      raise Todo
+    end
+  end
+
+  class Wav2Vec2FeatureExtractor < FeatureExtractor
+    def _zero_mean_unit_var_norm(input_values)
+      sum = input_values.sum
+      mean = sum / input_values.length.to_f
+      variance = input_values.sum { |b| (b - mean) ** 2 } / input_values.length.to_f
+      input_values.map { |x| (x - mean) / Math.sqrt(variance + 1e-7) }
+    end
+
+    def call(audio)
+      # TODO
+      # validate_audio_inputs(audio, 'Wav2Vec2FeatureExtractor')
+
+      input_values = audio
+
+      # zero-mean and unit-variance normalization
+      if @config["do_normalize"]
+        input_values = _zero_mean_unit_var_norm(input_values)
+      end
+
+      # TODO: allow user to pass in attention mask
+      {
+        input_values: [input_values],
+        attention_mask: [Array.new(input_values.length, 1)]
+      }
+    end
+  end
+
+  class ClapFeatureExtractor < FeatureExtractor
+    def initialize(config)
+      super(config)
+
+      # TODO
+    end
+
+    def call(audio, max_length: nil)
+      raise Todo
+    end
+  end
+
   class Processor
     attr_reader :feature_extractor
 

@@ -748,7 +805,10 @@ module Informers
     "DPTFeatureExtractor" => DPTFeatureExtractor,
     "DetrFeatureExtractor" => DetrFeatureExtractor,
     "Swin2SRImageProcessor" => Swin2SRImageProcessor,
-    "DonutFeatureExtractor" => DonutFeatureExtractor
+    "DonutFeatureExtractor" => DonutFeatureExtractor,
+    "WhisperFeatureExtractor" => WhisperFeatureExtractor,
+    "Wav2Vec2FeatureExtractor" => Wav2Vec2FeatureExtractor,
+    "ClapFeatureExtractor" => ClapFeatureExtractor
   }
 
   PROCESSOR_CLASS_MAPPING = {}

@@ -762,7 +822,7 @@ module Informers
      revision: "main",
      **kwargs
    )
-      preprocessor_config = config || Utils::Hub
+      preprocessor_config = config || Utils::Hub.get_model_json(pretrained_model_name_or_path, "preprocessor_config.json", true,
        progress_callback:,
        config:,
        cache_dir:,
data/lib/informers/tokenizers.rb
CHANGED
@@ -244,6 +244,9 @@ module Informers
     end
   end
 
+  class SpeechT5Tokenizer < PreTrainedTokenizer
+  end
+
   class AutoTokenizer
     TOKENIZER_CLASS_MAPPING = {
       "T5Tokenizer" => T5Tokenizer,

@@ -257,7 +260,8 @@ module Informers
       "CLIPTokenizer" => CLIPTokenizer,
       "GPT2Tokenizer" => GPT2Tokenizer,
       "NllbTokenizer" => NllbTokenizer,
-      "M2M100Tokenizer" => M2M100Tokenizer
+      "M2M100Tokenizer" => M2M100Tokenizer,
+      "SpeechT5Tokenizer" => SpeechT5Tokenizer
     }
 
     def self.from_pretrained(

@@ -296,7 +300,7 @@ module Informers
     def self.load_tokenizer(pretrained_model_name_or_path, **options)
       info = [
         Utils::Hub.get_model_file(pretrained_model_name_or_path, "tokenizer.json", true, **options),
-        Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options)
+        Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options)
       ]
 
       # Override legacy option if `options.legacy` is not null
data/lib/informers/utils/audio.rb
ADDED

@@ -0,0 +1,18 @@
+module Informers
+  module Utils
+    def self.read_audio(input, sampling_rate)
+      data =
+        if input.is_a?(URI)
+          require "open-uri"
+
+          input.read
+        elsif input.is_a?(String)
+          File.binread(input)
+        else
+          raise ArgumentError, "Unsupported input type: #{input.class.name}"
+        end
+
+      ffmpeg_read(data, sampling_rate)
+    end
+  end
+end
data/lib/informers/utils/ffmpeg.rb
ADDED

@@ -0,0 +1,45 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module Informers
+  module Utils
+    # from the Transformers Python library
+    def self.ffmpeg_read(data, sampling_rate)
+      ar = "#{sampling_rate}"
+      ac = "1"
+      format_for_conversion = "f32le"
+      ffmpeg_command = [
+        "ffmpeg",
+        "-i",
+        "pipe:0",
+        "-ac",
+        ac,
+        "-ar",
+        ar,
+        "-f",
+        format_for_conversion,
+        "-hide_banner",
+        "-loglevel",
+        "quiet",
+        "pipe:1"
+      ]
+
+      stdout, status = Open3.capture2(*ffmpeg_command, stdin_data: data)
+      if !status.success?
+        raise Error, "ffmpeg was not found but is required to load audio files from filename"
+      end
+      stdout.unpack("e*")
+    end
+  end
+end
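Together, `read_audio` and `ffmpeg_read` turn a file path or URI into mono float samples at the requested rate. A hedged usage sketch follows; it assumes `ffmpeg` is on the PATH and that a local `audio.wav` (a placeholder name) exists.

```ruby
require "informers"

# Utils.read_audio accepts a file path (String) or a URI, shells out to ffmpeg,
# and returns mono Float samples resampled to the requested rate.
samples = Informers::Utils.read_audio("audio.wav", 16_000)

puts samples.length            # duration in seconds * 16_000
puts samples.first(5).inspect  # raw sample values, roughly in [-1.0, 1.0]
```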
data/lib/informers/utils/math.rb
CHANGED
@@ -14,8 +14,8 @@ module Informers
       out_img = Array.new(out_height * out_width * in_channels)
 
       # Pre-calculate strides
-      in_stride = in_height * in_width
-      out_stride = out_height * out_width
+      in_stride = in_height * in_width
+      out_stride = out_height * out_width
 
       out_height.times do |i|
         out_width.times do |j|
data/lib/informers/version.rb
CHANGED
data/lib/informers.rb
CHANGED
@@ -6,12 +6,15 @@ require "tokenizers"
 require "io/console"
 require "json"
 require "open-uri"
+require "open3"
 require "stringio"
 require "uri"
 
 # modules
+require_relative "informers/utils/audio"
 require_relative "informers/utils/core"
 require_relative "informers/utils/generation"
+require_relative "informers/utils/ffmpeg"
 require_relative "informers/utils/hub"
 require_relative "informers/utils/image"
 require_relative "informers/utils/math"
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: informers
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.1.1
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-
+date: 2024-10-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: onnxruntime

@@ -55,7 +55,9 @@ files:
 - lib/informers/pipelines.rb
 - lib/informers/processors.rb
 - lib/informers/tokenizers.rb
+- lib/informers/utils/audio.rb
 - lib/informers/utils/core.rb
+- lib/informers/utils/ffmpeg.rb
 - lib/informers/utils/generation.rb
 - lib/informers/utils/hub.rb
 - lib/informers/utils/image.rb