informers 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +14 -7
- data/lib/informers/models.rb +165 -5
- data/lib/informers/pipelines.rb +180 -8
- data/lib/informers/processors.rb +62 -2
- data/lib/informers/tokenizers.rb +6 -2
- data/lib/informers/utils/audio.rb +18 -0
- data/lib/informers/utils/ffmpeg.rb +45 -0
- data/lib/informers/utils/image.rb +1 -1
- data/lib/informers/utils/math.rb +2 -2
- data/lib/informers/utils/tensor.rb +1 -1
- data/lib/informers/version.rb +1 -1
- data/lib/informers.rb +3 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a61f01755798e81a975641d60e5bfe09484ced7ce6a3453020c9978dc35b1942
+  data.tar.gz: 811f9c1dc4499ae7de8ebf8e02c0c4e98a0c0bc0af6aaca51025e42ba8165540
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 97b27363fab1e43895e368dbddc819fd4db23d42ce517359e5971347cd902b654f0c66700f07b36cd5f476bd3ea205a91e4f7e7ee0e7d8d455f0dce377bedb2b
+  data.tar.gz: dd1a7f795609423419ce213b00a5aca409f6b4a5bffb111250b4deffcbc6a8113fadf8d603c59fa78fa0f310904a0a3299e3bcdc48101f574171a024d13567e6
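The SHA256 values above can be checked against a locally fetched copy of the gem. A minimal sketch, assuming `informers-1.1.1.gem` has been downloaded first (for example with `gem fetch informers -v 1.1.1`); the filename and the use of `Gem::Package::TarReader` are illustrative and not part of the diff:

```ruby
require "digest"
require "rubygems/package"

# A .gem file is a tar archive containing metadata.gz and data.tar.gz,
# the two entries listed in checksums.yaml.
File.open("informers-1.1.1.gem", "rb") do |f|
  Gem::Package::TarReader.new(f) do |tar|
    tar.each do |entry|
      next unless ["metadata.gz", "data.tar.gz"].include?(entry.full_name)
      puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
    end
  end
end
```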
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -229,19 +229,13 @@ result = model.(query, docs)
 
 ### Other
 
-You can use the feature extraction pipeline directly.
-
-```ruby
-model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quantized: false)
-embeddings = model.(sentences, pooling: "mean", normalize: true)
-```
-
 The model must include a `.onnx` file ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)). If the file is not at `onnx/model.onnx` or `onnx/model_quantized.onnx`, use the `model_file_name` option to specify the location.
 
 ## Pipelines
 
 - [Text](#text)
 - [Vision](#vision)
+- [Audio](#audio)
 - [Multimodel](#multimodal)
 
 ### Text
@@ -332,6 +326,8 @@ extractor.("We are very happy to show you the 🤗 Transformers library.")
 
 ### Vision
 
+Note: [ruby-vips](https://github.com/libvips/ruby-vips) is required to load images
+
 Image classification
 
 ```ruby
@@ -388,6 +384,17 @@ extractor = Informers.pipeline("image-feature-extraction")
 extractor.("image.jpg")
 ```
 
+### Audio
+
+Note: [ffmpeg](https://www.ffmpeg.org/) is required to load audio files
+
+Audio classification
+
+```ruby
+classifier = Informers.pipeline("audio-classification")
+classifier.("audio.wav")
+```
+
 ### Multimodal
 
 Image captioning
data/lib/informers/models.rb
CHANGED
@@ -84,6 +84,7 @@ module Informers
 @get_start_beams = method(:decoder_start_beams)
 @update_beam = method(:decoder_update_beam)
 @forward = method(:decoder_forward)
+
 when MODEL_TYPES[:Seq2Seq], MODEL_TYPES[:Vision2Seq]
 @can_generate = true
 
@@ -91,8 +92,10 @@ module Informers
 @get_start_beams = method(:seq2seq_start_beams)
 @update_beam = method(:seq2seq_update_beam)
 @forward = method(:seq2seq_forward)
+
 when MODEL_TYPES[:EncoderDecoder]
-
+@forward = method(:encoder_forward)
+
 else
 @forward = method(:encoder_forward)
 end
@@ -137,10 +140,18 @@ module Informers
 ]
 
 elsif model_type == MODEL_TYPES[:MaskGeneration]
-
+info = [
+AutoConfig.from_pretrained(pretrained_model_name_or_path, **options),
+construct_session(pretrained_model_name_or_path, "vision_encoder", **options),
+construct_session(pretrained_model_name_or_path, "prompt_encoder_mask_decoder", **options)
+]
 
 elsif model_type == MODEL_TYPES[:EncoderDecoder]
-
+info = [
+AutoConfig.from_pretrained(pretrained_model_name_or_path, **options),
+construct_session(pretrained_model_name_or_path, "encoder_model", **options),
+construct_session(pretrained_model_name_or_path, "decoder_model_merged", **options)
+]
 
 else
 if model_type != MODEL_TYPES[:EncoderOnly]
@@ -293,13 +304,13 @@ module Informers
 grouped_beams = group_beams(beams)
 
 get_flattened = lambda do |key|
-grouped_beams.
+grouped_beams.flat_map do |batch|
 if generation_config["num_return_sequences"] > 1
 raise Todo
 else
 [batch[0][key]]
 end
-end
+end
 end
 
 sequences = get_flattened.(:output_token_ids) # [1, seqLength]
@@ -904,6 +915,18 @@ module Informers
 end
 end
 
+class Wav2Vec2PreTrainedModel < PreTrainedModel
+end
+
+class Wav2Vec2Model < Wav2Vec2PreTrainedModel
+end
+
+class Wav2Vec2ForSequenceClassification < Wav2Vec2PreTrainedModel
+def call(model_inputs)
+SequenceClassifierOutput.new(*super(model_inputs))
+end
+end
+
 class RobertaPreTrainedModel < PreTrainedModel
 end
 
@@ -1066,6 +1089,62 @@ module Informers
 class DonutSwinModel < DonutSwinPreTrainedModel
 end
 
+class WhisperPreTrainedModel < PreTrainedModel
+end
+
+class WhisperModel < WhisperPreTrainedModel
+end
+
+class WhisperForConditionalGeneration < WhisperPreTrainedModel
+REQUIRES_ATTENTION_MASK = false
+MAIN_INPUT_NAME = :input_features
+
+def initialize(config, session, decoder_merged_session, generation_config)
+super(config, session)
+@decoder_merged_session = decoder_merged_session
+@generation_config = generation_config
+
+@num_decoder_layers = @config["decoder_layers"]
+@num_decoder_heads = @config["decoder_attention_heads"]
+@decoder_dim_kv = @config["d_model"] / @num_decoder_heads.to_f
+
+@num_encoder_layers = @config["encoder_layers"]
+@num_encoder_heads = @config["encoder_attention_heads"]
+@encoder_dim_kv = @config["d_model"] / @num_encoder_heads.to_f
+end
+
+def generate(inputs, generation_config = nil, logits_processor = nil)
+raise Todo
+end
+end
+
+class VitsPreTrainedModel < PreTrainedModel
+end
+
+class VitsModel < VitsPreTrainedModel
+def call(model_inputs)
+VitsModelOutput.new(*super(model_inputs))
+end
+end
+
+class SpeechT5PreTrainedModel < PreTrainedModel
+end
+
+class SpeechT5Model < SpeechT5PreTrainedModel
+end
+
+class SpeechT5ForSpeechToText < SpeechT5PreTrainedModel
+end
+
+class SpeechT5ForTextToSpeech < SpeechT5PreTrainedModel
+end
+
+class ClapPreTrainedModel < PreTrainedModel
+end
+
+class ClapModel < ClapPreTrainedModel
+end
+
 MODEL_MAPPING_NAMES_ENCODER_ONLY = {
 "bert" => ["BertModel", BertModel],
 "nomic_bert" => ["NomicBertModel", NomicBertModel],
@@ -1074,6 +1153,7 @@ module Informers
 "distilbert" => ["DistilBertModel", DistilBertModel],
 "roberta" => ["RobertaModel", RobertaModel],
 "xlm-roberta" => ["XLMRobertaModel", XLMRobertaModel],
+"clap" => ["ClapModel", ClapModel],
 "clip" => ["CLIPModel", CLIPModel],
 "detr" => ["DetrModel", DetrModel],
 "vit" => ["ViTModel", ViTModel],
@@ -1085,6 +1165,21 @@ module Informers
 "bart" => ["BartModel", BartModel]
 }
 
+MODEL_MAPPING_NAMES_DECODER_ONLY = {
+}
+
+MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = {
+"whisper" => ["WhisperForConditionalGeneration", WhisperForConditionalGeneration]
+}
+
+MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = {
+"speecht5" => ["SpeechT5ForTextToSpeech", SpeechT5ForTextToSpeech]
+}
+
+MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = {
+"vits" => ["VitsModel", VitsModel]
+}
+
 MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = {
 "bert" => ["BertForSequenceClassification", BertForSequenceClassification],
 "distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification],
@@ -1143,6 +1238,25 @@ module Informers
 MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = {
 }
 
+MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = {
+}
+
+MODEL_FOR_CTC_MAPPING_NAMES = {
+}
+
+MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = {
+"wav2vec2" => ["Wav2Vec2ForSequenceClassification", Wav2Vec2ForSequenceClassification]
+}
+
+MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = {
+}
+
+MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = {
+}
+
+MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = {
+}
+
 MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = {
 "swin2sr" => ["Swin2SRForImageSuperResolution", Swin2SRForImageSuperResolution]
 }
@@ -1157,9 +1271,11 @@ module Informers
 MODEL_CLASS_TYPE_MAPPING = [
 [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES[:EncoderOnly]],
 [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES[:EncoderDecoder]],
+[MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES[:DecoderOnly]],
 [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
+[MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
 [MODEL_WITH_LM_HEAD_MAPPING_NAMES, MODEL_TYPES[:DecoderOnly]],
 [MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
@@ -1167,10 +1283,18 @@ module Informers
 [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+[MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+[MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_TYPES[:MaskGeneration]],
+[MODEL_FOR_CTC_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+[MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+[MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]],
+[MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+[MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
+[MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
 [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]]
 ]
 
@@ -1199,6 +1323,18 @@ module Informers
 MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES]
 end
 
+class AutoModelForSpeechSeq2Seq < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES]
+end
+
+class AutoModelForTextToSpectrogram < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES]
+end
+
+class AutoModelForTextToWaveform < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES]
+end
+
 class AutoModelForCausalLM < PretrainedMixin
 MODEL_CLASS_MAPPINGS = [MODEL_WITH_LM_HEAD_MAPPING_NAMES]
 end
@@ -1235,10 +1371,34 @@ module Informers
 MODEL_CLASS_MAPPINGS = [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES]
 end
 
+class AutoModelForMaskGeneration < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES]
+end
+
+class AutoModelForCTC < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_CTC_MAPPING_NAMES]
+end
+
+class AutoModelForAudioClassification < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES]
+end
+
+class AutoModelForXVector < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES]
+end
+
+class AutoModelForAudioFrameClassification < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES]
+end
+
 class AutoModelForDocumentQuestionAnswering < PretrainedMixin
 MODEL_CLASS_MAPPINGS = [MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES]
 end
 
+class AutoModelForImageMatting < PretrainedMixin
+MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES]
+end
+
 class AutoModelForImageToImage < PretrainedMixin
 MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES]
 end
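The new `AutoModelFor*` classes follow the existing `PretrainedMixin` pattern, so they should resolve a checkpoint's `model_type` through the mappings above. A sketch under that assumption, using the default model the audio-classification pipeline registers in pipelines.rb below; loading it directly rather than through `Informers.pipeline` is illustrative only:

```ruby
# Resolves "wav2vec2" via MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
# so the returned object should be a Wav2Vec2ForSequenceClassification.
model = Informers::AutoModelForAudioClassification.from_pretrained("Xenova/wav2vec2-base-superb-ks")
model.class # => Informers::Wav2Vec2ForSequenceClassification (assumption)
```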
data/lib/informers/pipelines.rb
CHANGED
@@ -19,6 +19,20 @@ module Informers
 images.map { |x| Utils::RawImage.read(x) }
 end
 
+def prepare_audios(audios, sampling_rate)
+if !audios.is_a?(Array)
+audios = [audios]
+end
+
+audios.map do |x|
+if x.is_a?(String) || x.is_a?(URI)
+Utils.read_audio(x, sampling_rate)
+else
+x
+end
+end
+end
+
 def get_bounding_box(box, as_integer)
 if as_integer
 box = box.map { |x| x.to_i }
@@ -729,7 +743,7 @@ module Informers
 {
 label: candidate_labels[processed[:classes][i]],
 score: processed[:scores][i],
-box: get_bounding_box(box, !percentage)
+box: get_bounding_box(box, !percentage)
 }
 end
 result.sort_by! { |v| -v[:score] }
@@ -784,6 +798,26 @@ module Informers
 end
 end
 
+class TextToAudioPipeline < Pipeline
+DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan"
+
+def initialize(**options)
+super(**options)
+
+# TODO: Find a better way for `pipeline` to set the default vocoder
+@vocoder = options[:vocoder]
+end
+
+def call(text_inputs, speaker_embeddings: nil)
+# If this.processor is not set, we are using a `AutoModelForTextToWaveform` model
+if @processor
+call_text_to_spectrogram(text_inputs, speaker_embeddings:)
+else
+call_text_to_waveform(text_inputs)
+end
+end
+end
+
 class FeatureExtractionPipeline < Pipeline
 def call(
 texts,
@@ -803,7 +837,7 @@ module Informers
 if !model_output.nil?
 model_options[:output_names] = Array(model_output)
 elsif @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
-# optimization for sentence-transformers/all-MiniLM-L6-v2
+# optimization for previous revision of sentence-transformers/all-MiniLM-L6-v2
 model_options[:output_names] = ["sentence_embedding"]
 pooling = "none"
 normalize = false
@@ -858,11 +892,106 @@ module Informers
 end
 end
 
+class AudioClassificationPipeline < Pipeline
+def call(audio, top_k: nil)
+single = !audio.is_a?(Array)
+
+sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+prepared_audios = prepare_audios(audio, sampling_rate)
+
+id2label = @model.config[:id2label]
+
+to_return = []
+prepared_audios.each do |aud|
+inputs = @processor.(aud)
+output = @model.(inputs)
+logits = output.logits[0]
+
+scores = Utils.get_top_items(Utils.softmax(logits), top_k)
+
+vals =
+scores.map do |x|
+{
+label: id2label[x[0].to_s],
+score: x[1]
+}
+end
+
+if top_k == 1
+to_return.concat(vals)
+else
+to_return << vals
+end
+end
+!single || top_k == 1 ? to_return : to_return[0]
+end
+end
+
+class ZeroShotAudioClassificationPipeline < Pipeline
+def call(audio, candidate_labels, hypothesis_template: "This is a sound of {}.")
+single = !audio.is_a?(Array)
+if single
+audio = [audio]
+end
+
+# Insert label into hypothesis template
+texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+# Run tokenization
+text_inputs =
+@tokenizer.(
+texts,
+padding: true,
+truncation: true
+)
+
+sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+prepared_audios = prepare_audios(audio, sampling_rate)
+
+to_return = []
+prepared_audios.each do |aud|
+audio_inputs = @processor.(aud)
+
+# Run model with both text and audio inputs
+output = @model.(text_inputs.merge(audio_inputs))
+
+# Compute softmax per audio
+probs = Utils.softmax(output.logits_per_audio.data)
+
+to_return <<
+probs.map.with_index do |x, i|
+{
+label: candidate_labels[i],
+score: x
+}
+end
+end
+single ? to_return[0] : to_return
+end
+end
+
+class AutomaticSpeechRecognitionPipeline < Pipeline
+def call(audio, **kwargs)
+case @model.config["model_type"]
+when "whisper"
+call_whisper(audio, **kwargs)
+else
+raise Error, "AutomaticSpeechRecognitionPipeline does not support model type '#{@model.config["model_type"]}'."
+end
+end
+
+private
+
+def call_whisper(audio, **kwargs)
+raise Todo
+end
+end
+
 class ImageToImagePipeline < Pipeline
 def call(images)
 prepared_images = prepare_images(images)
 inputs = @processor.(prepared_images)
-outputs = @model.(inputs)
+outputs = @model.(inputs)
 
 to_return = []
 outputs[0].each do |batch|
@@ -1033,6 +1162,47 @@ module Informers
 },
 type: "text"
 },
+"audio-classification" => {
+pipeline: AudioClassificationPipeline,
+model: AutoModelForAudioClassification,
+processor: AutoProcessor,
+default: {
+model: "Xenova/wav2vec2-base-superb-ks"
+},
+type: "audio"
+},
+# TODO
+# "zero-shot-audio-classification" => {
+# tokenizer: AutoTokenizer,
+# pipeline: ZeroShotAudioClassificationPipeline,
+# model: AutoModel,
+# processor: AutoProcessor,
+# default: {
+# model: "Xenova/clap-htsat-unfused"
+# },
+# type: "multimodal"
+# },
+# TODO
+# "automatic-speech-recognition" => {
+# tokenizer: AutoTokenizer,
+# pipeline: AutomaticSpeechRecognitionPipeline,
+# model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
+# processor: AutoProcessor,
+# default: {
+# model: "Xenova/whisper-tiny.en"
+# },
+# type: "multimodal"
+# },
+"text-to-audio" => {
+tokenizer: AutoTokenizer,
+pipeline: TextToAudioPipeline,
+model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
+processor: [AutoProcessor, nil],
+default: {
+model: "Xenova/speecht5_tts"
+},
+type: "text"
+},
 "image-to-text" => {
 tokenizer: AutoTokenizer,
 pipeline: ImageToTextPipeline,
@@ -1048,7 +1218,7 @@ module Informers
 model: AutoModelForImageClassification,
 processor: AutoProcessor,
 default: {
-model: "Xenova/vit-base-patch16-224"
+model: "Xenova/vit-base-patch16-224"
 },
 type: "multimodal"
 },
@@ -1057,7 +1227,7 @@ module Informers
 model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
 processor: AutoProcessor,
 default: {
-model: "Xenova/detr-resnet-50-panoptic"
+model: "Xenova/detr-resnet-50-panoptic"
 },
 type: "multimodal"
 },
@@ -1076,7 +1246,7 @@ module Informers
 model: AutoModelForObjectDetection,
 processor: AutoProcessor,
 default: {
-model: "Xenova/detr-resnet-50"
+model: "Xenova/detr-resnet-50"
 },
 type: "multimodal"
 },
@@ -1158,7 +1328,8 @@ module Informers
 
 TASK_ALIASES = {
 "sentiment-analysis" => "text-classification",
-"ner" => "token-classification"
+"ner" => "token-classification",
+"text-to-speech" => "text-to-audio"
 }
 
 DEFAULT_PROGRESS_CALLBACK = lambda do |msg|
@@ -1231,7 +1402,8 @@ module Informers
 results = load_items(classes, model, pretrained_options)
 results[:task] = task
 
-
+# for previous revision of sentence-transformers/all-MiniLM-L6-v2
+if model == "sentence-transformers/all-MiniLM-L6-v2" && results[:model].instance_variable_get(:@session).outputs.any? { |v| v[:name] == "token_embeddings" }
 results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
 end
 
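With the registry changes above, `"audio-classification"` becomes a constructible task and `"text-to-speech"` now aliases `"text-to-audio"`. A minimal sketch of what the registrations imply; file names are illustrative, and note that parts of the audio stack in this release still `raise Todo` (for example `WhisperFeatureExtractor` and `call_whisper`), so only the wav2vec2-based audio-classification path is fully wired up:

```ruby
# Builds an AudioClassificationPipeline with the default
# "Xenova/wav2vec2-base-superb-ks" model and an AutoProcessor.
classifier = Informers.pipeline("audio-classification")
classifier.(["first.wav", "second.wav"]) # arrays of inputs are supported

# Resolved through TASK_ALIASES to the new "text-to-audio" task; whether the
# SpeechT5 path is usable end-to-end in 1.1.1 is not shown by this diff.
tts = Informers.pipeline("text-to-speech")
```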
data/lib/informers/processors.rb
CHANGED
@@ -1,5 +1,7 @@
 module Informers
 class FeatureExtractor
+attr_reader :config
+
 def initialize(config)
 super()
 @config = config
@@ -728,6 +730,61 @@ module Informers
 end
 end
 
+class WhisperFeatureExtractor < FeatureExtractor
+def initialize(config)
+super(config)
+
+raise Todo
+end
+
+def _extract_fbank_features(waveform)
+raise Todo
+end
+
+def call(audio)
+raise Todo
+end
+end
+
+class Wav2Vec2FeatureExtractor < FeatureExtractor
+def _zero_mean_unit_var_norm(input_values)
+sum = input_values.sum
+mean = sum / input_values.length.to_f
+variance = input_values.sum { |b| (b - mean) ** 2 } / input_values.length.to_f
+input_values.map { |x| (x - mean) / Math.sqrt(variance + 1e-7) }
+end
+
+def call(audio)
+# TODO
+# validate_audio_inputs(audio, 'Wav2Vec2FeatureExtractor')
+
+input_values = audio
+
+# zero-mean and unit-variance normalization
+if @config["do_normalize"]
+input_values = _zero_mean_unit_var_norm(input_values)
+end
+
+# TODO: allow user to pass in attention mask
+{
+input_values: [input_values],
+attention_mask: [Array.new(input_values.length, 1)]
+}
+end
+end
+
+class ClapFeatureExtractor < FeatureExtractor
+def initialize(config)
+super(config)
+
+# TODO
+end
+
+def call(audio, max_length: nil)
+raise Todo
+end
+end
+
 class Processor
 attr_reader :feature_extractor
 
@@ -748,7 +805,10 @@ module Informers
 "DPTFeatureExtractor" => DPTFeatureExtractor,
 "DetrFeatureExtractor" => DetrFeatureExtractor,
 "Swin2SRImageProcessor" => Swin2SRImageProcessor,
-"DonutFeatureExtractor" => DonutFeatureExtractor
+"DonutFeatureExtractor" => DonutFeatureExtractor,
+"WhisperFeatureExtractor" => WhisperFeatureExtractor,
+"Wav2Vec2FeatureExtractor" => Wav2Vec2FeatureExtractor,
+"ClapFeatureExtractor" => ClapFeatureExtractor
 }
 
 PROCESSOR_CLASS_MAPPING = {}
@@ -762,7 +822,7 @@ module Informers
 revision: "main",
 **kwargs
 )
-preprocessor_config = config || Utils::Hub
+preprocessor_config = config || Utils::Hub.get_model_json(pretrained_model_name_or_path, "preprocessor_config.json", true,
 progress_callback:,
 config:,
 cache_dir:,
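`Wav2Vec2FeatureExtractor#_zero_mean_unit_var_norm` above rescales the samples to zero mean and unit variance, x' = (x - mean) / sqrt(variance + 1e-7). A worked example with made-up values:

```ruby
input_values = [1.0, 2.0, 3.0]
mean     = input_values.sum / input_values.length.to_f                       # => 2.0
variance = input_values.sum { |b| (b - mean)**2 } / input_values.length.to_f # => 0.666...
input_values.map { |x| (x - mean) / Math.sqrt(variance + 1e-7) }
# => [-1.2247, 0.0, 1.2247] (approximately)
```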
data/lib/informers/tokenizers.rb
CHANGED
@@ -244,6 +244,9 @@ module Informers
 end
 end
 
+class SpeechT5Tokenizer < PreTrainedTokenizer
+end
+
 class AutoTokenizer
 TOKENIZER_CLASS_MAPPING = {
 "T5Tokenizer" => T5Tokenizer,
@@ -257,7 +260,8 @@ module Informers
 "CLIPTokenizer" => CLIPTokenizer,
 "GPT2Tokenizer" => GPT2Tokenizer,
 "NllbTokenizer" => NllbTokenizer,
-"M2M100Tokenizer" => M2M100Tokenizer
+"M2M100Tokenizer" => M2M100Tokenizer,
+"SpeechT5Tokenizer" => SpeechT5Tokenizer
 }
 
 def self.from_pretrained(
@@ -296,7 +300,7 @@ module Informers
 def self.load_tokenizer(pretrained_model_name_or_path, **options)
 info = [
 Utils::Hub.get_model_file(pretrained_model_name_or_path, "tokenizer.json", true, **options),
-Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options)
+Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options)
 ]
 
 # Override legacy option if `options.legacy` is not null
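With `SpeechT5Tokenizer` registered above, `AutoTokenizer` can resolve checkpoints whose `tokenizer_config.json` declares that class. A sketch; the model id is the text-to-audio default from pipelines.rb, and loading the tokenizer standalone is an assumption for illustration:

```ruby
tokenizer = Informers::AutoTokenizer.from_pretrained("Xenova/speecht5_tts")
tokenizer.class # => Informers::SpeechT5Tokenizer (assuming the checkpoint declares it)
```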
data/lib/informers/utils/audio.rb
ADDED
@@ -0,0 +1,18 @@
+module Informers
+module Utils
+def self.read_audio(input, sampling_rate)
+data =
+if input.is_a?(URI)
+require "open-uri"
+
+input.read
+elsif input.is_a?(String)
+File.binread(input)
+else
+raise ArgumentError, "Unsupported input type: #{input.class.name}"
+end
+
+ffmpeg_read(data, sampling_rate)
+end
+end
+end
data/lib/informers/utils/ffmpeg.rb
ADDED
@@ -0,0 +1,45 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module Informers
+module Utils
+# from the Transformers Python library
+def self.ffmpeg_read(data, sampling_rate)
+ar = "#{sampling_rate}"
+ac = "1"
+format_for_conversion = "f32le"
+ffmpeg_command = [
+"ffmpeg",
+"-i",
+"pipe:0",
+"-ac",
+ac,
+"-ar",
+ar,
+"-f",
+format_for_conversion,
+"-hide_banner",
+"-loglevel",
+"quiet",
+"pipe:1"
+]
+
+stdout, status = Open3.capture2(*ffmpeg_command, stdin_data: data)
+if !status.success?
+raise Error, "ffmpeg was not found but is required to load audio files from filename"
+end
+stdout.unpack("e*")
+end
+end
+end
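`ffmpeg_read` pipes the input bytes through `ffmpeg -f f32le -ac 1 -ar <rate>` and unpacks the result with `"e*"` (32-bit little-endian floats). A small round-trip sketch of that packing, independent of ffmpeg:

```ruby
samples = [0.0, 0.5, -0.5]
bytes   = samples.pack("e*")   # what ffmpeg emits for -f f32le: 4 bytes per sample
bytes.bytesize                 # => 12
bytes.unpack("e*")             # => [0.0, 0.5, -0.5]
```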
data/lib/informers/utils/math.rb
CHANGED
@@ -14,8 +14,8 @@ module Informers
 out_img = Array.new(out_height * out_width * in_channels)
 
 # Pre-calculate strides
-in_stride = in_height * in_width
-out_stride = out_height * out_width
+in_stride = in_height * in_width
+out_stride = out_height * out_width
 
 out_height.times do |i|
 out_width.times do |j|
data/lib/informers/version.rb
CHANGED
data/lib/informers.rb
CHANGED
@@ -6,12 +6,15 @@ require "tokenizers"
 require "io/console"
 require "json"
 require "open-uri"
+require "open3"
 require "stringio"
 require "uri"
 
 # modules
+require_relative "informers/utils/audio"
 require_relative "informers/utils/core"
 require_relative "informers/utils/generation"
+require_relative "informers/utils/ffmpeg"
 require_relative "informers/utils/hub"
 require_relative "informers/utils/image"
 require_relative "informers/utils/math"
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: informers
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.1.1
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-
+date: 2024-10-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: onnxruntime
@@ -55,7 +55,9 @@ files:
 - lib/informers/pipelines.rb
 - lib/informers/processors.rb
 - lib/informers/tokenizers.rb
+- lib/informers/utils/audio.rb
 - lib/informers/utils/core.rb
+- lib/informers/utils/ffmpeg.rb
 - lib/informers/utils/generation.rb
 - lib/informers/utils/hub.rb
 - lib/informers/utils/image.rb