informers 1.0.3 → 1.1.1

@@ -7,6 +7,40 @@ module Informers
7
7
  @tokenizer = tokenizer
8
8
  @processor = processor
9
9
  end
10
+
11
+ private
12
+
13
+ def prepare_images(images)
14
+ if !images.is_a?(Array)
15
+ images = [images]
16
+ end
17
+
18
+ # Possibly convert any non-images to images
19
+ images.map { |x| Utils::RawImage.read(x) }
20
+ end
21
+
22
+ def prepare_audios(audios, sampling_rate)
23
+ if !audios.is_a?(Array)
24
+ audios = [audios]
25
+ end
26
+
27
+ audios.map do |x|
28
+ if x.is_a?(String) || x.is_a?(URI)
29
+ Utils.read_audio(x, sampling_rate)
30
+ else
31
+ x
32
+ end
33
+ end
34
+ end
35
+
36
+ def get_bounding_box(box, as_integer)
37
+ if as_integer
38
+ box = box.map { |x| x.to_i }
39
+ end
40
+ xmin, ymin, xmax, ymax = box
41
+
42
+ {xmin:, ymin:, xmax:, ymax:}
43
+ end
10
44
  end
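The new private helpers normalize pipeline inputs: prepare_images wraps a single image and reads paths/URLs via Utils::RawImage.read, prepare_audios does the same for audio via Utils.read_audio, and get_bounding_box formats a detection box (the `{xmin:, ...}` shorthand hash requires Ruby 3.1+). A rough illustration with hypothetical values:

    # get_bounding_box([10.4, 20.6, 30.2, 40.9], true)
    # # => {xmin: 10, ymin: 20, xmax: 30, ymax: 40}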
11
45
 
12
46
  class TextClassificationPipeline < Pipeline
@@ -21,13 +55,13 @@ module Informers
21
55
  outputs = @model.(model_inputs)
22
56
 
23
57
  function_to_apply =
24
- if @model.config.problem_type == "multi_label_classification"
58
+ if @model.config[:problem_type] == "multi_label_classification"
25
59
  ->(batch) { Utils.sigmoid(batch) }
26
60
  else
27
61
  ->(batch) { Utils.softmax(batch) } # single_label_classification (default)
28
62
  end
29
63
 
30
- id2label = @model.config.id2label
64
+ id2label = @model.config[:id2label]
31
65
 
32
66
  to_return = []
33
67
  outputs.logits.each do |batch|
@@ -70,7 +104,7 @@ module Informers
70
104
  outputs = @model.(model_inputs)
71
105
 
72
106
  logits = outputs.logits
73
- id2label = @model.config.id2label
107
+ id2label = @model.config[:id2label]
74
108
 
75
109
  to_return = []
76
110
  logits.length.times do |i|
@@ -243,6 +277,547 @@ module Informers
243
277
  end
244
278
  end
245
279
 
280
+ class FillMaskPipeline < Pipeline
281
+ def call(texts, top_k: 5)
282
+ model_inputs = @tokenizer.(texts, padding: true, truncation: true)
283
+ outputs = @model.(model_inputs)
284
+
285
+ to_return = []
286
+ model_inputs[:input_ids].each_with_index do |ids, i|
287
+ mask_token_index = ids.index(@tokenizer.mask_token_id)
288
+
289
+ if mask_token_index.nil?
290
+ raise ArgumentError, "Mask token (#{@tokenizer.mask_token}) not found in text."
291
+ end
292
+ logits = outputs.logits[i]
293
+ item_logits = logits[mask_token_index]
294
+
295
+ scores = Utils.get_top_items(Utils.softmax(item_logits), top_k)
296
+
297
+ to_return <<
298
+ scores.map do |x|
299
+ sequence = ids.dup
300
+ sequence[mask_token_index] = x[0]
301
+
302
+ {
303
+ score: x[1],
304
+ token: x[0],
305
+ token_str: @tokenizer.id_to_token(x[0]),
306
+ sequence: @tokenizer.decode(sequence, skip_special_tokens: true)
307
+ }
308
+ end
309
+ end
310
+ texts.is_a?(Array) ? to_return : to_return[0]
311
+ end
312
+ end
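A minimal usage sketch for the new fill-mask task (default model Xenova/bert-base-uncased per SUPPORTED_TASKS below, whose mask token is [MASK]; output values illustrative):

    unmasker = Informers.pipeline("fill-mask")
    unmasker.("Paris is the [MASK] of France.", top_k: 3)
    # => [{score: ..., token: ..., token_str: "capital", sequence: "paris is the capital of france."}, ...]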
313
+
314
+ class Text2TextGenerationPipeline < Pipeline
315
+ KEY = :generated_text
316
+
317
+ def call(texts, **generate_kwargs)
318
+ if !texts.is_a?(Array)
319
+ texts = [texts]
320
+ end
321
+
322
+ # Add global prefix, if present
323
+ if @model.config[:prefix]
324
+ texts = texts.map { |x| @model.config[:prefix] + x }
325
+ end
326
+
327
+ # Handle task specific params:
328
+ task_specific_params = @model.config[:task_specific_params]
329
+ if task_specific_params && task_specific_params[@task]
330
+ # Add prefixes, if present
331
+ if task_specific_params[@task]["prefix"]
332
+ texts = texts.map { |x| task_specific_params[@task]["prefix"] + x }
333
+ end
334
+
335
+ # TODO update generation config
336
+ end
337
+
338
+ tokenizer = @tokenizer
339
+ tokenizer_options = {
340
+ padding: true,
341
+ truncation: true
342
+ }
343
+ if is_a?(TranslationPipeline) && tokenizer.respond_to?(:_build_translation_inputs)
344
+ input_ids = tokenizer._build_translation_inputs(texts, tokenizer_options, generate_kwargs)[:input_ids]
345
+ else
346
+ input_ids = tokenizer.(texts, **tokenizer_options)[:input_ids]
347
+ end
348
+
349
+ output_token_ids = @model.generate(input_ids, generate_kwargs)
350
+
351
+ tokenizer.batch_decode(output_token_ids, skip_special_tokens: true)
352
+ .map { |text| {self.class.const_get(:KEY) => text} }
353
+ end
354
+ end
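A usage sketch for text2text-generation (default model Xenova/flan-t5-small per SUPPORTED_TASKS below; prompt and output illustrative):

    text2text = Informers.pipeline("text2text-generation")
    text2text.("translate English to German: I love Ruby.")
    # => [{generated_text: "Ich liebe Ruby."}]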
355
+
356
+ class SummarizationPipeline < Text2TextGenerationPipeline
357
+ KEY = :summary_text
358
+ end
359
+
360
+ class TranslationPipeline < Text2TextGenerationPipeline
361
+ KEY = :translation_text
362
+ end
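SummarizationPipeline and TranslationPipeline only override KEY, so they behave like Text2TextGenerationPipeline but label their output differently. Sketch with the default Xenova/distilbart-cnn-6-6 summarization model (input variable hypothetical):

    summarizer = Informers.pipeline("summarization")
    summarizer.(article_text)
    # => [{summary_text: "..."}]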
363
+
364
+ class TextGenerationPipeline < Pipeline
365
+ def call(texts, **generate_kwargs)
366
+ is_batched = false
367
+ is_chat_input = false
368
+
369
+ # Normalize inputs
370
+ if texts.is_a?(String)
371
+ texts = [texts]
372
+ inputs = texts
373
+ else
374
+ raise Todo
375
+ end
376
+
377
+ # By default, do not add special tokens
378
+ add_special_tokens = generate_kwargs[:add_special_tokens] || false
379
+
380
+ # By default, return full text
381
+ return_full_text =
382
+ if is_chat_input
383
+ false
384
+ else
385
+ generate_kwargs.fetch(:return_full_text, true)
386
+ end
387
+
388
+ @tokenizer.padding_side = "left"
389
+ input_ids, attention_mask =
390
+ @tokenizer.(inputs, add_special_tokens:, padding: true, truncation: true)
391
+ .values_at(:input_ids, :attention_mask)
392
+
393
+ output_token_ids =
394
+ @model.generate(
395
+ input_ids, generate_kwargs, nil, inputs_attention_mask: attention_mask
396
+ )
397
+
398
+ decoded = @tokenizer.batch_decode(output_token_ids, skip_special_tokens: true)
399
+
400
+ if !return_full_text && Utils.dims(input_ids)[-1] > 0
401
+ prompt_lengths = @tokenizer.batch_decode(input_ids, skip_special_tokens: true).map { |x| x.length }
402
+ end
403
+
404
+ to_return = Array.new(texts.length) { [] }
405
+ decoded.length.times do |i|
406
+ text_index = (i / output_token_ids.length.to_i * texts.length).floor
407
+
408
+ if prompt_lengths
409
+ raise Todo
410
+ end
411
+ # TODO is_chat_input
412
+ to_return[text_index] << {
413
+ generated_text: decoded[i]
414
+ }
415
+ end
416
+ !is_batched && to_return.length == 1 ? to_return[0] : to_return
417
+ end
418
+ end
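A usage sketch for text-generation (default model Xenova/gpt2 per SUPPORTED_TASKS below). Keyword arguments are forwarded to @model.generate, so options such as max_new_tokens are assumed to be supported by the generation config:

    generator = Informers.pipeline("text-generation")
    generator.("As far as I am concerned, I will", max_new_tokens: 30)
    # => [{generated_text: "As far as I am concerned, I will ..."}]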
419
+
420
+ class ZeroShotClassificationPipeline < Pipeline
421
+ def initialize(**options)
422
+ super(**options)
423
+
424
+ @label2id = @model.config[:label2id].transform_keys(&:downcase)
425
+
426
+ @entailment_id = @label2id["entailment"]
427
+ if @entailment_id.nil?
428
+ warn "Could not find 'entailment' in label2id mapping. Using 2 as entailment_id."
429
+ @entailment_id = 2
430
+ end
431
+
432
+ @contradiction_id = @label2id["contradiction"] || @label2id["not_entailment"]
433
+ if @contradiction_id.nil?
434
+ warn "Could not find 'contradiction' in label2id mapping. Using 0 as contradiction_id."
435
+ @contradiction_id = 0
436
+ end
437
+ end
438
+
439
+ def call(texts, candidate_labels, hypothesis_template: "This example is {}.", multi_label: false)
440
+ is_batched = texts.is_a?(Array)
441
+ if !is_batched
442
+ texts = [texts]
443
+ end
444
+ if !candidate_labels.is_a?(Array)
445
+ candidate_labels = [candidate_labels]
446
+ end
447
+
448
+ # Insert labels into hypothesis template
449
+ hypotheses = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
450
+
451
+ # How to perform the softmax over the logits:
452
+ # - true: softmax over the entailment vs. contradiction dim for each label independently
453
+ # - false: softmax the "entailment" logits over all candidate labels
454
+ softmax_each = multi_label || candidate_labels.length == 1
455
+
456
+ to_return = []
457
+ texts.each do |premise|
458
+ entails_logits = []
459
+
460
+ hypotheses.each do |hypothesis|
461
+ inputs = @tokenizer.(
462
+ premise,
463
+ text_pair: hypothesis,
464
+ padding: true,
465
+ truncation: true
466
+ )
467
+ outputs = @model.(inputs)
468
+
469
+ if softmax_each
470
+ entails_logits << [
471
+ outputs.logits[0][@contradiction_id],
472
+ outputs.logits[0][@entailment_id]
473
+ ]
474
+ else
475
+ entails_logits << outputs.logits[0][@entailment_id]
476
+ end
477
+ end
478
+
479
+ scores =
480
+ if softmax_each
481
+ entails_logits.map { |x| Utils.softmax(x)[1] }
482
+ else
483
+ Utils.softmax(entails_logits)
484
+ end
485
+
486
+ # Sort by scores (desc) and return scores with indices
487
+ scores_sorted = scores.map.with_index { |x, i| [x, i] }.sort_by { |v| -v[0] }
488
+
489
+ to_return << {
490
+ sequence: premise,
491
+ labels: scores_sorted.map { |x| candidate_labels[x[1]] },
492
+ scores: scores_sorted.map { |x| x[0] }
493
+ }
494
+ end
495
+ is_batched ? to_return : to_return[0]
496
+ end
497
+ end
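A usage sketch for zero-shot-classification (default model Xenova/distilbert-base-uncased-mnli per SUPPORTED_TASKS below); a single text returns one hash sorted by score, and multi_label: true scores each label independently:

    classifier = Informers.pipeline("zero-shot-classification")
    classifier.("My iPhone is broken and I need it fixed today!", ["urgent", "not urgent", "billing"])
    # => {sequence: "...", labels: ["urgent", ...], scores: [...]}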
498
+
499
+ class ImageToTextPipeline < Pipeline
500
+ def call(images, **generate_kwargs)
501
+ is_batched = images.is_a?(Array)
502
+ prepared_images = prepare_images(images)
503
+
504
+ pixel_values = @processor.(prepared_images)[:pixel_values]
505
+
506
+ to_return = []
507
+ pixel_values.each do |batch|
508
+ batch = [batch]
509
+ output = @model.generate(batch, **generate_kwargs)
510
+ decoded = @tokenizer
511
+ .batch_decode(output, skip_special_tokens: true)
512
+ .map { |x| {generated_text: x.strip} }
513
+ to_return << decoded
514
+ end
515
+
516
+ is_batched ? to_return : to_return[0]
517
+ end
518
+ end
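A usage sketch for image-to-text (default model Xenova/vit-gpt2-image-captioning per SUPPORTED_TASKS below); the image argument is anything Utils::RawImage.read accepts, such as a path or URL (file name and caption illustrative):

    captioner = Informers.pipeline("image-to-text")
    captioner.("cats.jpg")
    # => [{generated_text: "two cats sitting on a couch"}]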
519
+
520
+ class ImageClassificationPipeline < Pipeline
521
+ def call(images, top_k: 1)
522
+ is_batched = images.is_a?(Array)
523
+ prepared_images = prepare_images(images)
524
+
525
+ pixel_values = @processor.(prepared_images)[:pixel_values]
526
+ output = @model.({pixel_values: pixel_values})
527
+
528
+ id2label = @model.config[:id2label]
529
+ to_return = []
530
+ output.logits.each do |batch|
531
+ scores = Utils.get_top_items(Utils.softmax(batch), top_k)
532
+
533
+ vals =
534
+ scores.map do |x|
535
+ {
536
+ label: id2label[x[0].to_s],
537
+ score: x[1]
538
+ }
539
+ end
540
+ if top_k == 1
541
+ to_return.push(*vals)
542
+ else
543
+ to_return << vals
544
+ end
545
+ end
546
+
547
+ is_batched || top_k == 1 ? to_return : to_return[0]
548
+ end
549
+ end
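A usage sketch for image-classification (default model Xenova/vit-base-patch16-224 per SUPPORTED_TASKS below; file name and label illustrative). Note the return shape depends on top_k:

    classifier = Informers.pipeline("image-classification")
    classifier.("tiger.jpg", top_k: 3)
    # => [{label: "tiger, Panthera tigris", score: ...}, ...]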
550
+
551
+ class ImageSegmentationPipeline < Pipeline
552
+ def initialize(**options)
553
+ super(**options)
554
+
555
+ @subtasks_mapping = {
556
+ "panoptic" => "post_process_panoptic_segmentation",
557
+ "instance" => "post_process_instance_segmentation",
558
+ "semantic" => "post_process_semantic_segmentation"
559
+ }
560
+ end
561
+
562
+ def call(
563
+ images,
564
+ threshold: 0.5,
565
+ mask_threshold: 0.5,
566
+ overlap_mask_area_threshold: 0.8,
567
+ label_ids_to_fuse: nil,
568
+ target_sizes: nil,
569
+ subtask: nil
570
+ )
571
+ is_batched = images.is_a?(Array)
572
+
573
+ if is_batched && images.length != 1
574
+ raise Error, "Image segmentation pipeline currently only supports a batch size of 1."
575
+ end
576
+
577
+ prepared_images = prepare_images(images)
578
+ image_sizes = prepared_images.map { |x| [x.height, x.width] }
579
+
580
+ model_inputs = @processor.(prepared_images).slice(:pixel_values, :pixel_mask)
581
+ output = @model.(model_inputs)
582
+
583
+ if !subtask.nil?
584
+ fn = @processor.feature_extractor.method(@subtasks_mapping[subtask])
585
+ else
586
+ @subtasks_mapping.each do |task, func|
587
+ if @processor.feature_extractor.respond_to?(func)
588
+ fn = @processor.feature_extractor.method(func)
589
+ subtask = task
590
+ break
591
+ end
592
+ end
593
+ end
594
+
595
+ id2label = @model.config[:id2label]
596
+
597
+ annotation = []
598
+ if subtask == "panoptic" || subtask == "instance"
599
+ processed = fn.(
600
+ output,
601
+ threshold:,
602
+ mask_threshold:,
603
+ overlap_mask_area_threshold:,
604
+ label_ids_to_fuse:,
605
+ target_sizes: target_sizes || image_sizes, # TODO FIX?
606
+ )[0]
607
+
608
+ _segmentation = processed[:segmentation]
609
+
610
+ processed[:segments_info].each do |segment|
611
+ annotation << {
612
+ label: id2label[segment[:label_id].to_s],
613
+ score: segment[:score]
614
+ # TODO mask
615
+ }
616
+ end
617
+ elsif subtask == "semantic"
618
+ raise Todo
619
+ else
620
+ raise Error, "Subtask #{subtask} not supported."
621
+ end
622
+
623
+ annotation
624
+ end
625
+ end
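A usage sketch for image-segmentation (default model Xenova/detr-resnet-50-panoptic per SUPPORTED_TASKS below; file name illustrative). Only a batch size of 1 is supported, and masks are not yet returned (see the TODO above):

    segmenter = Informers.pipeline("image-segmentation")
    segmenter.("cats.jpg")
    # => [{label: "cat", score: ...}, {label: "couch", score: ...}, ...]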
626
+
627
+ class ZeroShotImageClassificationPipeline < Pipeline
628
+ def call(images, candidate_labels, hypothesis_template: "This is a photo of {}")
629
+ is_batched = images.is_a?(Array)
630
+ prepared_images = prepare_images(images)
631
+
632
+ # Insert label into hypothesis template
633
+ texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
634
+
635
+ # Run tokenization
636
+ text_inputs = @tokenizer.(texts,
637
+ padding: @model.config[:model_type] == "siglip" ? "max_length" : true,
638
+ truncation: true
639
+ )
640
+
641
+ # Run processor
642
+ pixel_values = @processor.(prepared_images)[:pixel_values]
643
+
644
+ # Run model with both text and pixel inputs
645
+ output = @model.(text_inputs.merge(pixel_values: pixel_values))
646
+
647
+ function_to_apply =
648
+ if @model.config[:model_type] == "siglip"
649
+ ->(batch) { Utils.sigmoid(batch) }
650
+ else
651
+ ->(batch) { Utils.softmax(batch) }
652
+ end
653
+
654
+ # Compare each image with each candidate label
655
+ to_return = []
656
+ output[0].each do |batch|
657
+ # Compute softmax per image
658
+ probs = function_to_apply.(batch)
659
+
660
+ result = probs
661
+ .map.with_index { |x, i| {label: candidate_labels[i], score: x} }
662
+ .sort_by { |v| -v[:score] }
663
+
664
+ to_return << result
665
+ end
666
+
667
+ is_batched ? to_return : to_return[0]
668
+ end
669
+ end
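A usage sketch for zero-shot-image-classification (default model Xenova/clip-vit-base-patch32 per SUPPORTED_TASKS below; file name illustrative):

    classifier = Informers.pipeline("zero-shot-image-classification")
    classifier.("tiger.jpg", ["tiger", "horse", "dog"])
    # => [{label: "tiger", score: ...}, {label: "dog", score: ...}, ...]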
670
+
671
+ class ObjectDetectionPipeline < Pipeline
672
+ def call(images, threshold: 0.9, percentage: false)
673
+ is_batched = images.is_a?(Array)
674
+
675
+ if is_batched && images.length != 1
676
+ raise Error, "Object detection pipeline currently only supports a batch size of 1."
677
+ end
678
+ prepared_images = prepare_images(images)
679
+
680
+ image_sizes = percentage ? nil : prepared_images.map { |x| [x.height, x.width] }
681
+
682
+ model_inputs = @processor.(prepared_images).slice(:pixel_values, :pixel_mask)
683
+ output = @model.(model_inputs)
684
+
685
+ processed = @processor.feature_extractor.post_process_object_detection(output, threshold, image_sizes)
686
+
687
+ # Add labels
688
+ id2label = @model.config[:id2label]
689
+
690
+ # Format output
691
+ result =
692
+ processed.map do |batch|
693
+ batch[:boxes].map.with_index do |box, i|
694
+ {
695
+ label: id2label[batch[:classes][i].to_s],
696
+ score: batch[:scores][i],
697
+ box: get_bounding_box(box, !percentage)
698
+ }
699
+ end.sort_by { |v| -v[:score] }
700
+ end
701
+
702
+ is_batched ? result : result[0]
703
+ end
704
+ end
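A usage sketch for object-detection (default model Xenova/detr-resnet-50 per SUPPORTED_TASKS below; file name illustrative). With percentage: true, boxes are returned as fractions of the image size instead of integer pixels:

    detector = Informers.pipeline("object-detection")
    detector.("cats.jpg", threshold: 0.9)
    # => [{label: "cat", score: ..., box: {xmin: ..., ymin: ..., xmax: ..., ymax: ...}}, ...]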
705
+
706
+ class ZeroShotObjectDetectionPipeline < Pipeline
707
+ def call(
708
+ images,
709
+ candidate_labels,
710
+ threshold: 0.1,
711
+ top_k: nil,
712
+ percentage: false
713
+ )
714
+ is_batched = images.is_a?(Array)
715
+ prepared_images = prepare_images(images)
716
+
717
+ # Run tokenization
718
+ text_inputs = @tokenizer.(candidate_labels,
719
+ padding: true,
720
+ truncation: true
721
+ )
722
+
723
+ # Run processor
724
+ model_inputs = @processor.(prepared_images)
725
+
726
+ # Since non-maximum suppression is performed for exporting, we need to
727
+ # process each image separately. For more information, see:
728
+ # https://github.com/huggingface/optimum/blob/e3b7efb1257c011db907ef40ab340e795cc5684c/optimum/exporters/onnx/model_configs.py#L1028-L1032
729
+ to_return = []
730
+ prepared_images.length.times do |i|
731
+ image = prepared_images[i]
732
+ image_size = percentage ? nil : [[image.height, image.width]]
733
+ pixel_values = [model_inputs[:pixel_values][i]]
734
+
735
+ # Run model with both text and pixel inputs
736
+ output = @model.(text_inputs.merge(pixel_values: pixel_values))
737
+ # TODO remove
738
+ output = @model.instance_variable_get(:@session).outputs.map { |v| v[:name].to_sym }.zip(output).to_h
739
+
740
+ processed = @processor.feature_extractor.post_process_object_detection(output, threshold, image_size, true)[0]
741
+ result =
742
+ processed[:boxes].map.with_index do |box, i|
743
+ {
744
+ label: candidate_labels[processed[:classes][i]],
745
+ score: processed[:scores][i],
746
+ box: get_bounding_box(box, !percentage)
747
+ }
748
+ end
749
+ result.sort_by! { |v| -v[:score] }
750
+ if !top_k.nil?
751
+ result = result[0...top_k]
752
+ end
753
+ to_return << result
754
+ end
755
+
756
+ is_batched ? to_return : to_return[0]
757
+ end
758
+ end
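A usage sketch for zero-shot-object-detection (default model Xenova/owlvit-base-patch32 per SUPPORTED_TASKS below; file name and labels illustrative):

    detector = Informers.pipeline("zero-shot-object-detection")
    detector.("astronaut.jpg", ["human face", "rocket", "helmet"], top_k: 4)
    # => [{label: "human face", score: ..., box: {...}}, ...]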
759
+
760
+ class DocumentQuestionAnsweringPipeline < Pipeline
761
+ def call(image, question, **generate_kwargs)
762
+ # NOTE: For now, we only support a batch size of 1
763
+
764
+ # Preprocess image
765
+ prepared_image = prepare_images(image)[0]
766
+ pixel_values = @processor.(prepared_image)[:pixel_values]
767
+
768
+ # Run tokenization
769
+ task_prompt = "<s_docvqa><s_question>#{question}</s_question><s_answer>"
770
+ decoder_input_ids =
771
+ @tokenizer.(
772
+ task_prompt,
773
+ add_special_tokens: false,
774
+ padding: true,
775
+ truncation: true
776
+ )[:input_ids]
777
+
778
+ # Run model
779
+ output =
780
+ @model.generate(
781
+ pixel_values,
782
+ generate_kwargs.merge(
783
+ decoder_input_ids: decoder_input_ids[0],
784
+ max_length: @model.config["decoder"]["max_position_embeddings"]
785
+ ).transform_keys(&:to_s)
786
+ )
787
+
788
+ # Decode output
789
+ decoded = @tokenizer.batch_decode(output, skip_special_tokens: false)[0]
790
+
791
+ # Parse answer
792
+ match = decoded.match(/<s_answer>(.*?)<\/s_answer>/)
793
+ answer = nil
794
+ if match && match.length >= 2
795
+ answer = match[1].strip
796
+ end
797
+ [{answer:}]
798
+ end
799
+ end
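A usage sketch for document-question-answering (default model Xenova/donut-base-finetuned-docvqa per SUPPORTED_TASKS below; file name and answer illustrative). Only a batch size of 1 is supported:

    qa = Informers.pipeline("document-question-answering")
    qa.("invoice.png", "What is the invoice number?")
    # => [{answer: "us-001"}]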
800
+
801
+ class TextToAudioPipeline < Pipeline
802
+ DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan"
803
+
804
+ def initialize(**options)
805
+ super(**options)
806
+
807
+ # TODO: Find a better way for `pipeline` to set the default vocoder
808
+ @vocoder = options[:vocoder]
809
+ end
810
+
811
+ def call(text_inputs, speaker_embeddings: nil)
812
+ # If @processor is not set, we are using an `AutoModelForTextToWaveform` model
813
+ if @processor
814
+ call_text_to_spectrogram(text_inputs, speaker_embeddings:)
815
+ else
816
+ call_text_to_waveform(text_inputs)
817
+ end
818
+ end
819
+ end
820
+
246
821
  class FeatureExtractionPipeline < Pipeline
247
822
  def call(
248
823
  texts,
@@ -262,7 +837,7 @@ module Informers
262
837
  if !model_output.nil?
263
838
  model_options[:output_names] = Array(model_output)
264
839
  elsif @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
265
- # optimization for sentence-transformers/all-MiniLM-L6-v2
840
+ # optimization for previous revision of sentence-transformers/all-MiniLM-L6-v2
266
841
  model_options[:output_names] = ["sentence_embedding"]
267
842
  pooling = "none"
268
843
  normalize = false
@@ -306,6 +881,164 @@ module Informers
306
881
  end
307
882
  end
308
883
 
884
+ class ImageFeatureExtractionPipeline < Pipeline
885
+ def call(images)
886
+ prepared_images = prepare_images(images)
887
+ pixel_values = @processor.(prepared_images)[:pixel_values]
888
+ outputs = @model.({pixel_values: pixel_values})
889
+
890
+ result = outputs[0]
891
+ result
892
+ end
893
+ end
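A usage sketch for image-feature-extraction (default model Xenova/vit-base-patch16-224 per SUPPORTED_TASKS below; file name illustrative). The raw first model output is returned, i.e. nested arrays of floats:

    extractor = Informers.pipeline("image-feature-extraction")
    features = extractor.("cats.jpg")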
894
+
895
+ class AudioClassificationPipeline < Pipeline
896
+ def call(audio, top_k: nil)
897
+ single = !audio.is_a?(Array)
898
+
899
+ sampling_rate = @processor.feature_extractor.config["sampling_rate"]
900
+ prepared_audios = prepare_audios(audio, sampling_rate)
901
+
902
+ id2label = @model.config[:id2label]
903
+
904
+ to_return = []
905
+ prepared_audios.each do |aud|
906
+ inputs = @processor.(aud)
907
+ output = @model.(inputs)
908
+ logits = output.logits[0]
909
+
910
+ scores = Utils.get_top_items(Utils.softmax(logits), top_k)
911
+
912
+ vals =
913
+ scores.map do |x|
914
+ {
915
+ label: id2label[x[0].to_s],
916
+ score: x[1]
917
+ }
918
+ end
919
+
920
+ if top_k == 1
921
+ to_return.concat(vals)
922
+ else
923
+ to_return << vals
924
+ end
925
+ end
926
+ !single || top_k == 1 ? to_return : to_return[0]
927
+ end
928
+ end
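A usage sketch for audio-classification (default model Xenova/wav2vec2-base-superb-ks per SUPPORTED_TASKS below). A String or URI input is assumed to be decodable by Utils.read_audio at the feature extractor's sampling rate; file name and labels illustrative:

    classifier = Informers.pipeline("audio-classification")
    classifier.("speech.wav", top_k: 2)
    # => [{label: "yes", score: ...}, {label: "no", score: ...}]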
929
+
930
+ class ZeroShotAudioClassificationPipeline < Pipeline
931
+ def call(audio, candidate_labels, hypothesis_template: "This is a sound of {}.")
932
+ single = !audio.is_a?(Array)
933
+ if single
934
+ audio = [audio]
935
+ end
936
+
937
+ # Insert label into hypothesis template
938
+ texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
939
+
940
+ # Run tokenization
941
+ text_inputs =
942
+ @tokenizer.(
943
+ texts,
944
+ padding: true,
945
+ truncation: true
946
+ )
947
+
948
+ sampling_rate = @processor.feature_extractor.config["sampling_rate"]
949
+ prepared_audios = prepare_audios(audio, sampling_rate)
950
+
951
+ to_return = []
952
+ prepared_audios.each do |aud|
953
+ audio_inputs = @processor.(aud)
954
+
955
+ # Run model with both text and audio inputs
956
+ output = @model.(text_inputs.merge(audio_inputs))
957
+
958
+ # Compute softmax per audio
959
+ probs = Utils.softmax(output.logits_per_audio.data)
960
+
961
+ to_return <<
962
+ probs.map.with_index do |x, i|
963
+ {
964
+ label: candidate_labels[i],
965
+ score: x
966
+ }
967
+ end
968
+ end
969
+ single ? to_return[0] : to_return
970
+ end
971
+ end
972
+
973
+ class AutomaticSpeechRecognitionPipeline < Pipeline
974
+ def call(audio, **kwargs)
975
+ case @model.config["model_type"]
976
+ when "whisper"
977
+ call_whisper(audio, **kwargs)
978
+ else
979
+ raise Error, "AutomaticSpeechRecognitionPipeline does not support model type '#{@model.config["model_type"]}'."
980
+ end
981
+ end
982
+
983
+ private
984
+
985
+ def call_whisper(audio, **kwargs)
986
+ raise Todo
987
+ end
988
+ end
989
+
990
+ class ImageToImagePipeline < Pipeline
991
+ def call(images)
992
+ prepared_images = prepare_images(images)
993
+ inputs = @processor.(prepared_images)
994
+ outputs = @model.(inputs)
995
+
996
+ to_return = []
997
+ outputs[0].each do |batch|
998
+ # TODO flatten first
999
+ output =
1000
+ batch.map do |v|
1001
+ v.map do |v2|
1002
+ v2.map do |v3|
1003
+ (v3.clamp(0, 1) * 255).round
1004
+ end
1005
+ end
1006
+ end
1007
+ to_return << Utils::RawImage.from_array(output).image
1008
+ end
1009
+
1010
+ to_return.length > 1 ? to_return : to_return[0]
1011
+ end
1012
+ end
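A usage sketch for image-to-image super-resolution (default model Xenova/swin2SR-classical-sr-x2-64 per SUPPORTED_TASKS below; file name illustrative). The return value is the image object built by Utils::RawImage.from_array:

    upscaler = Informers.pipeline("image-to-image")
    upscaled = upscaler.("low_resolution.jpg")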
1013
+
1014
+ class DepthEstimationPipeline < Pipeline
1015
+ def call(images)
1016
+ prepared_images = prepare_images(images)
1017
+
1018
+ inputs = @processor.(prepared_images)
1019
+ predicted_depth = @model.(inputs)[0]
1020
+
1021
+ to_return = []
1022
+ prepared_images.length.times do |i|
1023
+ prediction = Utils.interpolate(predicted_depth[i], prepared_images[i].size.reverse, "bilinear", false)
1024
+ max_prediction = Utils.max(prediction.flatten)[0]
1025
+ formatted =
1026
+ prediction.map do |v|
1027
+ v.map do |v2|
1028
+ v2.map do |v3|
1029
+ (v3 * 255 / max_prediction).round
1030
+ end
1031
+ end
1032
+ end
1033
+ to_return << {
1034
+ predicted_depth: predicted_depth[i],
1035
+ depth: Utils::RawImage.from_array(formatted).image
1036
+ }
1037
+ end
1038
+ to_return.length > 1 ? to_return : to_return[0]
1039
+ end
1040
+ end
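A usage sketch for depth-estimation (default model Xenova/dpt-large per SUPPORTED_TASKS below; file name illustrative):

    estimator = Informers.pipeline("depth-estimation")
    result = estimator.("room.jpg")
    result[:predicted_depth] # raw depth values
    result[:depth]           # depth map scaled to 0-255 and returned as an image object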
1041
+
309
1042
  class EmbeddingPipeline < FeatureExtractionPipeline
310
1043
  def call(
311
1044
  texts,
@@ -375,6 +1108,186 @@ module Informers
375
1108
  },
376
1109
  type: "text"
377
1110
  },
1111
+ "fill-mask" => {
1112
+ tokenizer: AutoTokenizer,
1113
+ pipeline: FillMaskPipeline,
1114
+ model: AutoModelForMaskedLM,
1115
+ default: {
1116
+ model: "Xenova/bert-base-uncased"
1117
+ },
1118
+ type: "text"
1119
+ },
1120
+ "summarization" => {
1121
+ tokenizer: AutoTokenizer,
1122
+ pipeline: SummarizationPipeline,
1123
+ model: AutoModelForSeq2SeqLM,
1124
+ default: {
1125
+ model: "Xenova/distilbart-cnn-6-6"
1126
+ },
1127
+ type: "text"
1128
+ },
1129
+ "translation" => {
1130
+ tokenizer: AutoTokenizer,
1131
+ pipeline: TranslationPipeline,
1132
+ model: AutoModelForSeq2SeqLM,
1133
+ default: {
1134
+ model: "Xenova/t5-small"
1135
+ },
1136
+ type: "text"
1137
+ },
1138
+ "text2text-generation" => {
1139
+ tokenizer: AutoTokenizer,
1140
+ pipeline: Text2TextGenerationPipeline,
1141
+ model: AutoModelForSeq2SeqLM,
1142
+ default: {
1143
+ model: "Xenova/flan-t5-small"
1144
+ },
1145
+ type: "text"
1146
+ },
1147
+ "text-generation" => {
1148
+ tokenizer: AutoTokenizer,
1149
+ pipeline: TextGenerationPipeline,
1150
+ model: AutoModelForCausalLM,
1151
+ default: {
1152
+ model: "Xenova/gpt2"
1153
+ },
1154
+ type: "text"
1155
+ },
1156
+ "zero-shot-classification" => {
1157
+ tokenizer: AutoTokenizer,
1158
+ pipeline: ZeroShotClassificationPipeline,
1159
+ model: AutoModelForSequenceClassification,
1160
+ default: {
1161
+ model: "Xenova/distilbert-base-uncased-mnli"
1162
+ },
1163
+ type: "text"
1164
+ },
1165
+ "audio-classification" => {
1166
+ pipeline: AudioClassificationPipeline,
1167
+ model: AutoModelForAudioClassification,
1168
+ processor: AutoProcessor,
1169
+ default: {
1170
+ model: "Xenova/wav2vec2-base-superb-ks"
1171
+ },
1172
+ type: "audio"
1173
+ },
1174
+ # TODO
1175
+ # "zero-shot-audio-classification" => {
1176
+ # tokenizer: AutoTokenizer,
1177
+ # pipeline: ZeroShotAudioClassificationPipeline,
1178
+ # model: AutoModel,
1179
+ # processor: AutoProcessor,
1180
+ # default: {
1181
+ # model: "Xenova/clap-htsat-unfused"
1182
+ # },
1183
+ # type: "multimodal"
1184
+ # },
1185
+ # TODO
1186
+ # "automatic-speech-recognition" => {
1187
+ # tokenizer: AutoTokenizer,
1188
+ # pipeline: AutomaticSpeechRecognitionPipeline,
1189
+ # model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
1190
+ # processor: AutoProcessor,
1191
+ # default: {
1192
+ # model: "Xenova/whisper-tiny.en"
1193
+ # },
1194
+ # type: "multimodal"
1195
+ # },
1196
+ "text-to-audio" => {
1197
+ tokenizer: AutoTokenizer,
1198
+ pipeline: TextToAudioPipeline,
1199
+ model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
1200
+ processor: [AutoProcessor, nil],
1201
+ default: {
1202
+ model: "Xenova/speecht5_tts"
1203
+ },
1204
+ type: "text"
1205
+ },
1206
+ "image-to-text" => {
1207
+ tokenizer: AutoTokenizer,
1208
+ pipeline: ImageToTextPipeline,
1209
+ model: AutoModelForVision2Seq,
1210
+ processor: AutoProcessor,
1211
+ default: {
1212
+ model: "Xenova/vit-gpt2-image-captioning"
1213
+ },
1214
+ type: "multimodal"
1215
+ },
1216
+ "image-classification" => {
1217
+ pipeline: ImageClassificationPipeline,
1218
+ model: AutoModelForImageClassification,
1219
+ processor: AutoProcessor,
1220
+ default: {
1221
+ model: "Xenova/vit-base-patch16-224"
1222
+ },
1223
+ type: "multimodal"
1224
+ },
1225
+ "image-segmentation" => {
1226
+ pipeline: ImageSegmentationPipeline,
1227
+ model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
1228
+ processor: AutoProcessor,
1229
+ default: {
1230
+ model: "Xenova/detr-resnet-50-panoptic"
1231
+ },
1232
+ type: "multimodal"
1233
+ },
1234
+ "zero-shot-image-classification" => {
1235
+ tokenizer: AutoTokenizer,
1236
+ pipeline: ZeroShotImageClassificationPipeline,
1237
+ model: AutoModel,
1238
+ processor: AutoProcessor,
1239
+ default: {
1240
+ model: "Xenova/clip-vit-base-patch32"
1241
+ },
1242
+ type: "multimodal"
1243
+ },
1244
+ "object-detection" => {
1245
+ pipeline: ObjectDetectionPipeline,
1246
+ model: AutoModelForObjectDetection,
1247
+ processor: AutoProcessor,
1248
+ default: {
1249
+ model: "Xenova/detr-resnet-50"
1250
+ },
1251
+ type: "multimodal"
1252
+ },
1253
+ "zero-shot-object-detection" => {
1254
+ tokenizer: AutoTokenizer,
1255
+ pipeline: ZeroShotObjectDetectionPipeline,
1256
+ model: AutoModelForZeroShotObjectDetection,
1257
+ processor: AutoProcessor,
1258
+ default: {
1259
+ model: "Xenova/owlvit-base-patch32"
1260
+ },
1261
+ type: "multimodal"
1262
+ },
1263
+ "document-question-answering" => {
1264
+ tokenizer: AutoTokenizer,
1265
+ pipeline: DocumentQuestionAnsweringPipeline,
1266
+ model: AutoModelForDocumentQuestionAnswering,
1267
+ processor: AutoProcessor,
1268
+ default: {
1269
+ model: "Xenova/donut-base-finetuned-docvqa"
1270
+ },
1271
+ type: "multimodal"
1272
+ },
1273
+ "image-to-image" => {
1274
+ pipeline: ImageToImagePipeline,
1275
+ model: AutoModelForImageToImage,
1276
+ processor: AutoProcessor,
1277
+ default: {
1278
+ model: "Xenova/swin2SR-classical-sr-x2-64"
1279
+ },
1280
+ type: "image"
1281
+ },
1282
+ "depth-estimation" => {
1283
+ pipeline: DepthEstimationPipeline,
1284
+ model: AutoModelForDepthEstimation,
1285
+ processor: AutoProcessor,
1286
+ default: {
1287
+ model: "Xenova/dpt-large"
1288
+ },
1289
+ type: "image"
1290
+ },
378
1291
  "feature-extraction" => {
379
1292
  tokenizer: AutoTokenizer,
380
1293
  pipeline: FeatureExtractionPipeline,
@@ -384,6 +1297,15 @@ module Informers
384
1297
  },
385
1298
  type: "text"
386
1299
  },
1300
+ "image-feature-extraction" => {
1301
+ processor: AutoProcessor,
1302
+ pipeline: ImageFeatureExtractionPipeline,
1303
+ model: [AutoModelForImageFeatureExtraction, AutoModel],
1304
+ default: {
1305
+ model: "Xenova/vit-base-patch16-224"
1306
+ },
1307
+ type: "image"
1308
+ },
387
1309
  "embedding" => {
388
1310
  tokenizer: AutoTokenizer,
389
1311
  pipeline: EmbeddingPipeline,
@@ -406,7 +1328,8 @@ module Informers
406
1328
 
407
1329
  TASK_ALIASES = {
408
1330
  "sentiment-analysis" => "text-classification",
409
- "ner" => "token-classification"
1331
+ "ner" => "token-classification",
1332
+ "text-to-speech" => "text-to-audio"
410
1333
  }
411
1334
 
412
1335
  DEFAULT_PROGRESS_CALLBACK = lambda do |msg|
@@ -439,14 +1362,14 @@ module Informers
439
1362
  revision: "main",
440
1363
  model_file_name: nil
441
1364
  )
1365
+ # Apply aliases
1366
+ task = TASK_ALIASES[task] || task
1367
+
442
1368
  if quantized == NO_DEFAULT
443
1369
  # TODO move default to task class
444
- quantized = !["embedding", "reranking"].include?(task)
1370
+ quantized = ["text-classification", "token-classification", "question-answering", "feature-extraction"].include?(task)
445
1371
  end
446
1372
 
447
- # Apply aliases
448
- task = TASK_ALIASES[task] || task
449
-
450
1373
  # Get pipeline info
451
1374
  pipeline_info = SUPPORTED_TASKS[task.split("_", 1)[0]]
452
1375
  if !pipeline_info
@@ -479,7 +1402,8 @@ module Informers
479
1402
  results = load_items(classes, model, pretrained_options)
480
1403
  results[:task] = task
481
1404
 
482
- if model == "sentence-transformers/all-MiniLM-L6-v2"
1405
+ # for previous revision of sentence-transformers/all-MiniLM-L6-v2
1406
+ if model == "sentence-transformers/all-MiniLM-L6-v2" && results[:model].instance_variable_get(:@session).outputs.any? { |v| v[:name] == "token_embeddings" }
483
1407
  results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
484
1408
  end
485
1409
 
@@ -502,7 +1426,15 @@ module Informers
502
1426
  next if !cls
503
1427
 
504
1428
  if cls.is_a?(Array)
505
- raise Todo
1429
+ e = nil
1430
+ cls.each do |c|
1431
+ begin
1432
+ result[name] = c.from_pretrained(model, **pretrained_options)
1433
+ rescue => err
1434
+ e = err
1435
+ end
1436
+ end
1437
+ raise e unless result[name]
506
1438
  else
507
1439
  result[name] = cls.from_pretrained(model, **pretrained_options)
508
1440
  end