informers 1.0.3 → 1.1.1

This diff shows the changes between publicly released versions of the package as they appear in its public registry, and is provided for informational purposes only.
@@ -7,6 +7,40 @@ module Informers
       @tokenizer = tokenizer
       @processor = processor
     end
+
+    private
+
+    def prepare_images(images)
+      if !images.is_a?(Array)
+        images = [images]
+      end
+
+      # Possibly convert any non-images to images
+      images.map { |x| Utils::RawImage.read(x) }
+    end
+
+    def prepare_audios(audios, sampling_rate)
+      if !audios.is_a?(Array)
+        audios = [audios]
+      end
+
+      audios.map do |x|
+        if x.is_a?(String) || x.is_a?(URI)
+          Utils.read_audio(x, sampling_rate)
+        else
+          x
+        end
+      end
+    end
+
+    def get_bounding_box(box, as_integer)
+      if as_integer
+        box = box.map { |x| x.to_i }
+      end
+      xmin, ymin, xmax, ymax = box
+
+      {xmin:, ymin:, xmax:, ymax:}
+    end
   end
 
   class TextClassificationPipeline < Pipeline
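
The new private helpers normalize pipeline inputs: prepare_images wraps a single image (path, URI, or RawImage) in an array and reads it with Utils::RawImage.read, prepare_audios does the same for audio (reading strings and URIs at the requested sampling rate), and get_bounding_box turns a [xmin, ymin, xmax, ymax] array into a hash. A standalone sketch of the box format the detection pipelines further down return (the box values here are made up):

    box = [10.4, 20.7, 110.2, 220.9]
    xmin, ymin, xmax, ymax = box.map(&:to_i)
    {xmin:, ymin:, xmax:, ymax:}
    #=> {xmin: 10, ymin: 20, xmax: 110, ymax: 220}
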
@@ -21,13 +55,13 @@ module Informers
       outputs = @model.(model_inputs)
 
       function_to_apply =
-        if @model.config.problem_type == "multi_label_classification"
+        if @model.config[:problem_type] == "multi_label_classification"
           ->(batch) { Utils.sigmoid(batch) }
         else
           ->(batch) { Utils.softmax(batch) } # single_label_classification (default)
         end
 
-      id2label = @model.config.id2label
+      id2label = @model.config[:id2label]
 
       to_return = []
       outputs.logits.each do |batch|
@@ -70,7 +104,7 @@ module Informers
       outputs = @model.(model_inputs)
 
       logits = outputs.logits
-      id2label = @model.config.id2label
+      id2label = @model.config[:id2label]
 
       to_return = []
       logits.length.times do |i|
@@ -243,6 +277,547 @@ module Informers
     end
   end
 
+  class FillMaskPipeline < Pipeline
+    def call(texts, top_k: 5)
+      model_inputs = @tokenizer.(texts, padding: true, truncation: true)
+      outputs = @model.(model_inputs)
+
+      to_return = []
+      model_inputs[:input_ids].each_with_index do |ids, i|
+        mask_token_index = ids.index(@tokenizer.mask_token_id)
+
+        if mask_token_index.nil?
+          raise ArgumentError, "Mask token (#{@tokenizer.mask_token}) not found in text."
+        end
+        logits = outputs.logits[i]
+        item_logits = logits[mask_token_index]
+
+        scores = Utils.get_top_items(Utils.softmax(item_logits), top_k)
+
+        to_return <<
+          scores.map do |x|
+            sequence = ids.dup
+            sequence[mask_token_index] = x[0]
+
+            {
+              score: x[1],
+              token: x[0],
+              token_str: @tokenizer.id_to_token(x[0]),
+              sequence: @tokenizer.decode(sequence, skip_special_tokens: true)
+            }
+          end
+      end
+      texts.is_a?(Array) ? to_return : to_return[0]
+    end
+  end
+
+  class Text2TextGenerationPipeline < Pipeline
+    KEY = :generated_text
+
+    def call(texts, **generate_kwargs)
+      if !texts.is_a?(Array)
+        texts = [texts]
+      end
+
+      # Add global prefix, if present
+      if @model.config[:prefix]
+        texts = texts.map { |x| @model.config[:prefix] + x }
+      end
+
+      # Handle task specific params:
+      task_specific_params = @model.config[:task_specific_params]
+      if task_specific_params && task_specific_params[@task]
+        # Add prefixes, if present
+        if task_specific_params[@task]["prefix"]
+          texts = texts.map { |x| task_specific_params[@task]["prefix"] + x }
+        end
+
+        # TODO update generation config
+      end
+
+      tokenizer = @tokenizer
+      tokenizer_options = {
+        padding: true,
+        truncation: true
+      }
+      if is_a?(TranslationPipeline) && tokenizer.respond_to?(:_build_translation_inputs)
+        input_ids = tokenizer._build_translation_inputs(texts, tokenizer_options, generate_kwargs)[:input_ids]
+      else
+        input_ids = tokenizer.(texts, **tokenizer_options)[:input_ids]
+      end
+
+      output_token_ids = @model.generate(input_ids, generate_kwargs)
+
+      tokenizer.batch_decode(output_token_ids, skip_special_tokens: true)
+        .map { |text| {self.class.const_get(:KEY) => text} }
+    end
+  end
+
+  class SummarizationPipeline < Text2TextGenerationPipeline
+    KEY = :summary_text
+  end
+
+  class TranslationPipeline < Text2TextGenerationPipeline
+    KEY = :translation_text
+  end
+
+  class TextGenerationPipeline < Pipeline
+    def call(texts, **generate_kwargs)
+      is_batched = false
+      is_chat_input = false
+
+      # Normalize inputs
+      if texts.is_a?(String)
+        texts = [texts]
+        inputs = texts
+      else
+        raise Todo
+      end
+
+      # By default, do not add special tokens
+      add_special_tokens = generate_kwargs[:add_special_tokens] || false
+
+      # By default, return full text
+      return_full_text =
+        if is_chat_input
+          false
+        else
+          generate_kwargs[:return_full_text] || true
+        end
+
+      @tokenizer.padding_side = "left"
+      input_ids, attention_mask =
+        @tokenizer.(inputs, add_special_tokens:, padding: true, truncation: true)
+          .values_at(:input_ids, :attention_mask)
+
+      output_token_ids =
+        @model.generate(
+          input_ids, generate_kwargs, nil, inputs_attention_mask: attention_mask
+        )
+
+      decoded = @tokenizer.batch_decode(output_token_ids, skip_special_tokens: true)
+
+      if !return_full_text && Utils.dims(input_ids)[-1] > 0
+        prompt_lengths = @tokenizer.batch_decode(input_ids, skip_special_tokens: true).map { |x| x.length }
+      end
+
+      to_return = Array.new(texts.length) { [] }
+      decoded.length.times do |i|
+        text_index = (i / output_token_ids.length.to_i * texts.length).floor
+
+        if prompt_lengths
+          raise Todo
+        end
+        # TODO is_chat_input
+        to_return[text_index] << {
+          generated_text: decoded[i]
+        }
+      end
+      !is_batched && to_return.length == 1 ? to_return[0] : to_return
+    end
+  end
+
+  class ZeroShotClassificationPipeline < Pipeline
+    def initialize(**options)
+      super(**options)
+
+      @label2id = @model.config[:label2id].transform_keys(&:downcase)
+
+      @entailment_id = @label2id["entailment"]
+      if @entailment_id.nil?
+        warn "Could not find 'entailment' in label2id mapping. Using 2 as entailment_id."
+        @entailment_id = 2
+      end
+
+      @contradiction_id = @label2id["contradiction"] || @label2id["not_entailment"]
+      if @contradiction_id.nil?
+        warn "Could not find 'contradiction' in label2id mapping. Using 0 as contradiction_id."
+        @contradiction_id = 0
+      end
+    end
+
+    def call(texts, candidate_labels, hypothesis_template: "This example is {}.", multi_label: false)
+      is_batched = texts.is_a?(Array)
+      if !is_batched
+        texts = [texts]
+      end
+      if !candidate_labels.is_a?(Array)
+        candidate_labels = [candidate_labels]
+      end
+
+      # Insert labels into hypothesis template
+      hypotheses = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # How to perform the softmax over the logits:
+      # - true: softmax over the entailment vs. contradiction dim for each label independently
+      # - false: softmax the "entailment" logits over all candidate labels
+      softmax_each = multi_label || candidate_labels.length == 1
+
+      to_return = []
+      texts.each do |premise|
+        entails_logits = []
+
+        hypotheses.each do |hypothesis|
+          inputs = @tokenizer.(
+            premise,
+            text_pair: hypothesis,
+            padding: true,
+            truncation: true
+          )
+          outputs = @model.(inputs)
+
+          if softmax_each
+            entails_logits << [
+              outputs.logits[0][@contradiction_id],
+              outputs.logits[0][@entailment_id]
+            ]
+          else
+            entails_logits << outputs.logits[0][@entailment_id]
+          end
+        end
+
+        scores =
+          if softmax_each
+            entails_logits.map { |x| Utils.softmax(x)[1] }
+          else
+            Utils.softmax(entails_logits)
+          end
+
+        # Sort by scores (desc) and return scores with indices
+        scores_sorted = scores.map.with_index { |x, i| [x, i] }.sort_by { |v| -v[0] }
+
+        to_return << {
+          sequence: premise,
+          labels: scores_sorted.map { |x| candidate_labels[x[1]] },
+          scores: scores_sorted.map { |x| x[0] }
+        }
+      end
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class ImageToTextPipeline < Pipeline
+    def call(images, **generate_kwargs)
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+
+      to_return = []
+      pixel_values.each do |batch|
+        batch = [batch]
+        output = @model.generate(batch, **generate_kwargs)
+        decoded = @tokenizer
+          .batch_decode(output, skip_special_tokens: true)
+          .map { |x| {generated_text: x.strip} }
+        to_return << decoded
+      end
+
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class ImageClassificationPipeline < Pipeline
+    def call(images, top_k: 1)
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+      output = @model.({pixel_values: pixel_values})
+
+      id2label = @model.config[:id2label]
+      to_return = []
+      output.logits.each do |batch|
+        scores = Utils.get_top_items(Utils.softmax(batch), top_k)
+
+        vals =
+          scores.map do |x|
+            {
+              label: id2label[x[0].to_s],
+              score: x[1]
+            }
+          end
+        if top_k == 1
+          to_return.push(*vals)
+        else
+          to_return << vals
+        end
+      end
+
+      is_batched || top_k == 1 ? to_return : to_return[0]
+    end
+  end
+
+  class ImageSegmentationPipeline < Pipeline
+    def initialize(**options)
+      super(**options)
+
+      @subtasks_mapping = {
+        "panoptic" => "post_process_panoptic_segmentation",
+        "instance" => "post_process_instance_segmentation",
+        "semantic" => "post_process_semantic_segmentation"
+      }
+    end
+
+    def call(
+      images,
+      threshold: 0.5,
+      mask_threshold: 0.5,
+      overlap_mask_area_threshold: 0.8,
+      label_ids_to_fuse: nil,
+      target_sizes: nil,
+      subtask: nil
+    )
+      is_batched = images.is_a?(Array)
+
+      if is_batched && images.length != 1
+        raise Error, "Image segmentation pipeline currently only supports a batch size of 1."
+      end
+
+      prepared_images = prepare_images(images)
+      image_sizes = prepared_images.map { |x| [x.height, x.width] }
+
+      model_inputs = @processor.(prepared_images).slice(:pixel_values, :pixel_mask)
+      output = @model.(model_inputs)
+
+      if !subtask.nil?
+        fn = @subtasks_mapping[subtask]
+      else
+        @subtasks_mapping.each do |task, func|
+          if @processor.feature_extractor.respond_to?(func)
+            fn = @processor.feature_extractor.method(func)
+            subtask = task
+            break
+          end
+        end
+      end
+
+      id2label = @model.config[:id2label]
+
+      annotation = []
+      if subtask == "panoptic" || subtask == "instance"
+        processed = fn.(
+          output,
+          threshold:,
+          mask_threshold:,
+          overlap_mask_area_threshold:,
+          label_ids_to_fuse:,
+          target_sizes: target_sizes || image_sizes, # TODO FIX?
+        )[0]
+
+        _segmentation = processed[:segmentation]
+
+        processed[:segments_info].each do |segment|
+          annotation << {
+            label: id2label[segment[:label_id].to_s],
+            score: segment[:score]
+            # TODO mask
+          }
+        end
+      elsif subtask == "semantic"
+        raise Todo
+      else
+        raise Error, "Subtask #{subtask} not supported."
+      end
+
+      annotation
+    end
+  end
+
+  class ZeroShotImageClassificationPipeline < Pipeline
+    def call(images, candidate_labels, hypothesis_template: "This is a photo of {}")
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      # Insert label into hypothesis template
+      texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # Run tokenization
+      text_inputs = @tokenizer.(texts,
+        padding: @model.config[:model_type] == "siglip" ? "max_length" : true,
+        truncation: true
+      )
+
+      # Run processor
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+
+      # Run model with both text and pixel inputs
+      output = @model.(text_inputs.merge(pixel_values: pixel_values))
+
+      function_to_apply =
+        if @model.config[:model_type] == "siglip"
+          ->(batch) { Utils.sigmoid(batch) }
+        else
+          ->(batch) { Utils.softmax(batch) }
+        end
+
+      # Compare each image with each candidate label
+      to_return = []
+      output[0].each do |batch|
+        # Compute softmax per image
+        probs = function_to_apply.(batch)
+
+        result = probs
+          .map.with_index { |x, i| {label: candidate_labels[i], score: x} }
+          .sort_by { |v| -v[:score] }
+
+        to_return << result
+      end
+
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class ObjectDetectionPipeline < Pipeline
+    def call(images, threshold: 0.9, percentage: false)
+      is_batched = images.is_a?(Array)
+
+      if is_batched && images.length != 1
+        raise Error, "Object detection pipeline currently only supports a batch size of 1."
+      end
+      prepared_images = prepare_images(images)
+
+      image_sizes = percentage ? nil : prepared_images.map { |x| [x.height, x.width] }
+
+      model_inputs = @processor.(prepared_images).slice(:pixel_values, :pixel_mask)
+      output = @model.(model_inputs)
+
+      processed = @processor.feature_extractor.post_process_object_detection(output, threshold, image_sizes)
+
+      # Add labels
+      id2label = @model.config[:id2label]
+
+      # Format output
+      result =
+        processed.map do |batch|
+          batch[:boxes].map.with_index do |box, i|
+            {
+              label: id2label[batch[:classes][i].to_s],
+              score: batch[:scores][i],
+              box: get_bounding_box(box, !percentage)
+            }
+          end.sort_by { |v| -v[:score] }
+        end
+
+      is_batched ? result : result[0]
+    end
+  end
+
+  class ZeroShotObjectDetectionPipeline < Pipeline
+    def call(
+      images,
+      candidate_labels,
+      threshold: 0.1,
+      top_k: nil,
+      percentage: false
+    )
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      # Run tokenization
+      text_inputs = @tokenizer.(candidate_labels,
+        padding: true,
+        truncation: true
+      )
+
+      # Run processor
+      model_inputs = @processor.(prepared_images)
+
+      # Since non-maximum suppression is performed for exporting, we need to
+      # process each image separately. For more information, see:
+      # https://github.com/huggingface/optimum/blob/e3b7efb1257c011db907ef40ab340e795cc5684c/optimum/exporters/onnx/model_configs.py#L1028-L1032
+      to_return = []
+      prepared_images.length.times do |i|
+        image = prepared_images[i]
+        image_size = percentage ? nil : [[image.height, image.width]]
+        pixel_values = [model_inputs[:pixel_values][i]]
+
+        # Run model with both text and pixel inputs
+        output = @model.(text_inputs.merge(pixel_values: pixel_values))
+        # TODO remove
+        output = @model.instance_variable_get(:@session).outputs.map { |v| v[:name].to_sym }.zip(output).to_h
+
+        processed = @processor.feature_extractor.post_process_object_detection(output, threshold, image_size, true)[0]
+        result =
+          processed[:boxes].map.with_index do |box, i|
+            {
+              label: candidate_labels[processed[:classes][i]],
+              score: processed[:scores][i],
+              box: get_bounding_box(box, !percentage)
+            }
+          end
+        result.sort_by! { |v| -v[:score] }
+        if !top_k.nil?
+          result = result[0...top_k]
+        end
+        to_return << result
+      end
+
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class DocumentQuestionAnsweringPipeline < Pipeline
+    def call(image, question, **generate_kwargs)
+      # NOTE: For now, we only support a batch size of 1
+
+      # Preprocess image
+      prepared_image = prepare_images(image)[0]
+      pixel_values = @processor.(prepared_image)[:pixel_values]
+
+      # Run tokenization
+      task_prompt = "<s_docvqa><s_question>#{question}</s_question><s_answer>"
+      decoder_input_ids =
+        @tokenizer.(
+          task_prompt,
+          add_special_tokens: false,
+          padding: true,
+          truncation: true
+        )[:input_ids]
+
+      # Run model
+      output =
+        @model.generate(
+          pixel_values,
+          generate_kwargs.merge(
+            decoder_input_ids: decoder_input_ids[0],
+            max_length: @model.config["decoder"]["max_position_embeddings"]
+          ).transform_keys(&:to_s)
+        )
+
+      # Decode output
+      decoded = @tokenizer.batch_decode(output, skip_special_tokens: false)[0]
+
+      # Parse answer
+      match = decoded.match(/<s_answer>(.*?)<\/s_answer>/)
+      answer = nil
+      if match && match.length >= 2
+        answer = match[1].strip
+      end
+      [{answer:}]
+    end
+  end
+
+  class TextToAudioPipeline < Pipeline
+    DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan"
+
+    def initialize(**options)
+      super(**options)
+
+      # TODO: Find a better way for `pipeline` to set the default vocoder
+      @vocoder = options[:vocoder]
+    end
+
+    def call(text_inputs, speaker_embeddings: nil)
+      # If @processor is not set, we are using an `AutoModelForTextToWaveform` model
+      if @processor
+        call_text_to_spectrogram(text_inputs, speaker_embeddings:)
+      else
+        call_text_to_waveform(text_inputs)
+      end
+    end
+  end
+
   class FeatureExtractionPipeline < Pipeline
     def call(
       texts,
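
For reference, the new text, vision, and multimodal pipelines above are invoked through the pipeline helper updated later in this diff. A minimal usage sketch (hedged: it assumes the module-level Informers.pipeline entry point, and the image file names are placeholders):

    fill = Informers.pipeline("fill-mask")                 # Xenova/bert-base-uncased
    fill.("Paris is the [MASK] of France.", top_k: 3)

    summarizer = Informers.pipeline("summarization")       # Xenova/distilbart-cnn-6-6
    summarizer.("Text of a long article to condense...")

    generator = Informers.pipeline("text-generation")      # Xenova/gpt2
    generator.("Ruby is a language that")

    classifier = Informers.pipeline("zero-shot-classification")
    classifier.("I love Ruby on Rails", ["computing", "cooking", "sports"])

    captioner = Informers.pipeline("image-to-text")        # Xenova/vit-gpt2-image-captioning
    captioner.("photo.jpg")

    detector = Informers.pipeline("object-detection")      # Xenova/detr-resnet-50
    detector.("photo.jpg", threshold: 0.9)

    doc_qa = Informers.pipeline("document-question-answering")
    doc_qa.("invoice.png", "What is the invoice number?")
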
@@ -262,7 +837,7 @@ module Informers
       if !model_output.nil?
         model_options[:output_names] = Array(model_output)
       elsif @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
-        # optimization for sentence-transformers/all-MiniLM-L6-v2
+        # optimization for previous revision of sentence-transformers/all-MiniLM-L6-v2
         model_options[:output_names] = ["sentence_embedding"]
         pooling = "none"
         normalize = false
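
The updated comment (together with the session-output check added near the end of this diff) scopes this shortcut to the previous ONNX revision of sentence-transformers/all-MiniLM-L6-v2, which exposed a precomputed sentence_embedding output; newer exports go through the regular mean-pooling and normalization path. Usage itself is unchanged, for example:

    embed = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
    embed.(["This is a sentence.", "This is another sentence."])
    #=> two mean-pooled, normalized embedding vectors
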
@@ -306,6 +881,164 @@ module Informers
     end
   end
 
+  class ImageFeatureExtractionPipeline < Pipeline
+    def call(images)
+      prepared_images = prepare_images(images)
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+      outputs = @model.({pixel_values: pixel_values})
+
+      result = outputs[0]
+      result
+    end
+  end
+
+  class AudioClassificationPipeline < Pipeline
+    def call(audio, top_k: nil)
+      single = !audio.is_a?(Array)
+
+      sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+      prepared_audios = prepare_audios(audio, sampling_rate)
+
+      id2label = @model.config[:id2label]
+
+      to_return = []
+      prepared_audios.each do |aud|
+        inputs = @processor.(aud)
+        output = @model.(inputs)
+        logits = output.logits[0]
+
+        scores = Utils.get_top_items(Utils.softmax(logits), top_k)
+
+        vals =
+          scores.map do |x|
+            {
+              label: id2label[x[0].to_s],
+              score: x[1]
+            }
+          end
+
+        if top_k == 1
+          to_return.concat(vals)
+        else
+          to_return << vals
+        end
+      end
+      !single || top_k == 1 ? to_return : to_return[0]
+    end
+  end
+
+  class ZeroShotAudioClassificationPipeline < Pipeline
+    def call(audio, candidate_labels, hypothesis_template: "This is a sound of {}.")
+      single = !audio.is_a?(Array)
+      if single
+        audio = [audio]
+      end
+
+      # Insert label into hypothesis template
+      texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # Run tokenization
+      text_inputs =
+        @tokenizer.(
+          texts,
+          padding: true,
+          truncation: true
+        )
+
+      sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+      prepared_audios = prepare_audios(audio, sampling_rate)
+
+      to_return = []
+      prepared_audios.each do |aud|
+        audio_inputs = @processor.(aud)
+
+        # Run model with both text and audio inputs
+        output = @model.(text_inputs.merge(audio_inputs))
+
+        # Compute softmax per audio
+        probs = Utils.softmax(output.logits_per_audio.data)
+
+        to_return <<
+          probs.map.with_index do |x, i|
+            {
+              label: candidate_labels[i],
+              score: x
+            }
+          end
+      end
+      single ? to_return[0] : to_return
+    end
+  end
+
+  class AutomaticSpeechRecognitionPipeline < Pipeline
+    def call(audio, **kwargs)
+      case @model.config["model_type"]
+      when "whisper"
+        call_whisper(audio, **kwargs)
+      else
+        raise Error, "AutomaticSpeechRecognitionPipeline does not support model type '#{@model.config["model_type"]}'."
+      end
+    end
+
+    private
+
+    def call_whisper(audio, **kwargs)
+      raise Todo
+    end
+  end
+
+  class ImageToImagePipeline < Pipeline
+    def call(images)
+      prepared_images = prepare_images(images)
+      inputs = @processor.(prepared_images)
+      outputs = @model.(inputs)
+
+      to_return = []
+      outputs[0].each do |batch|
+        # TODO flatten first
+        output =
+          batch.map do |v|
+            v.map do |v2|
+              v2.map do |v3|
+                (v3.clamp(0, 1) * 255).round
+              end
+            end
+          end
+        to_return << Utils::RawImage.from_array(output).image
+      end
+
+      to_return.length > 1 ? to_return : to_return[0]
+    end
+  end
+
+  class DepthEstimationPipeline < Pipeline
+    def call(images)
+      prepared_images = prepare_images(images)
+
+      inputs = @processor.(prepared_images)
+      predicted_depth = @model.(inputs)[0]
+
+      to_return = []
+      prepared_images.length.times do |i|
+        prediction = Utils.interpolate(predicted_depth[i], prepared_images[i].size.reverse, "bilinear", false)
+        max_prediction = Utils.max(prediction.flatten)[0]
+        formatted =
+          prediction.map do |v|
+            v.map do |v2|
+              v2.map do |v3|
+                (v3 * 255 / max_prediction).round
+              end
+            end
+          end
+        to_return << {
+          predicted_depth: predicted_depth[i],
+          depth: Utils::RawImage.from_array(formatted).image
+        }
+      end
+      to_return.length > 1 ? to_return : to_return[0]
+    end
+  end
+
   class EmbeddingPipeline < FeatureExtractionPipeline
     def call(
       texts,
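
The audio and remaining image pipelines above are called the same way (a sketch; it assumes Informers.pipeline, uses placeholder file paths, and relies on the default models registered below — audio inputs may be a path, a URI, or a pre-read sample array, per prepare_audios):

    audio_classifier = Informers.pipeline("audio-classification")   # Xenova/wav2vec2-base-superb-ks
    audio_classifier.("speech.wav", top_k: 2)

    depth = Informers.pipeline("depth-estimation")                  # Xenova/dpt-large
    depth.("room.jpg")   # => {predicted_depth: ..., depth: <image>}

    upscaler = Informers.pipeline("image-to-image")                 # Xenova/swin2SR-classical-sr-x2-64
    upscaler.("low_res.jpg")
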
@@ -375,6 +1108,186 @@ module Informers
       },
       type: "text"
     },
+    "fill-mask" => {
+      tokenizer: AutoTokenizer,
+      pipeline: FillMaskPipeline,
+      model: AutoModelForMaskedLM,
+      default: {
+        model: "Xenova/bert-base-uncased"
+      },
+      type: "text"
+    },
+    "summarization" => {
+      tokenizer: AutoTokenizer,
+      pipeline: SummarizationPipeline,
+      model: AutoModelForSeq2SeqLM,
+      default: {
+        model: "Xenova/distilbart-cnn-6-6"
+      },
+      type: "text"
+    },
+    "translation" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TranslationPipeline,
+      model: AutoModelForSeq2SeqLM,
+      default: {
+        model: "Xenova/t5-small"
+      },
+      type: "text"
+    },
+    "text2text-generation" => {
+      tokenizer: AutoTokenizer,
+      pipeline: Text2TextGenerationPipeline,
+      model: AutoModelForSeq2SeqLM,
+      default: {
+        model: "Xenova/flan-t5-small"
+      },
+      type: "text"
+    },
+    "text-generation" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TextGenerationPipeline,
+      model: AutoModelForCausalLM,
+      default: {
+        model: "Xenova/gpt2"
+      },
+      type: "text"
+    },
+    "zero-shot-classification" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ZeroShotClassificationPipeline,
+      model: AutoModelForSequenceClassification,
+      default: {
+        model: "Xenova/distilbert-base-uncased-mnli"
+      },
+      type: "text"
+    },
+    "audio-classification" => {
+      pipeline: AudioClassificationPipeline,
+      model: AutoModelForAudioClassification,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/wav2vec2-base-superb-ks"
+      },
+      type: "audio"
+    },
+    # TODO
+    # "zero-shot-audio-classification" => {
+    #   tokenizer: AutoTokenizer,
+    #   pipeline: ZeroShotAudioClassificationPipeline,
+    #   model: AutoModel,
+    #   processor: AutoProcessor,
+    #   default: {
+    #     model: "Xenova/clap-htsat-unfused"
+    #   },
+    #   type: "multimodal"
+    # },
+    # TODO
+    # "automatic-speech-recognition" => {
+    #   tokenizer: AutoTokenizer,
+    #   pipeline: AutomaticSpeechRecognitionPipeline,
+    #   model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
+    #   processor: AutoProcessor,
+    #   default: {
+    #     model: "Xenova/whisper-tiny.en"
+    #   },
+    #   type: "multimodal"
+    # },
+    "text-to-audio" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TextToAudioPipeline,
+      model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
+      processor: [AutoProcessor, nil],
+      default: {
+        model: "Xenova/speecht5_tts"
+      },
+      type: "text"
+    },
+    "image-to-text" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ImageToTextPipeline,
+      model: AutoModelForVision2Seq,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/vit-gpt2-image-captioning"
+      },
+      type: "multimodal"
+    },
+    "image-classification" => {
+      pipeline: ImageClassificationPipeline,
+      model: AutoModelForImageClassification,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/vit-base-patch16-224"
+      },
+      type: "multimodal"
+    },
+    "image-segmentation" => {
+      pipeline: ImageSegmentationPipeline,
+      model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/detr-resnet-50-panoptic"
+      },
+      type: "multimodal"
+    },
+    "zero-shot-image-classification" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ZeroShotImageClassificationPipeline,
+      model: AutoModel,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/clip-vit-base-patch32"
+      },
+      type: "multimodal"
+    },
+    "object-detection" => {
+      pipeline: ObjectDetectionPipeline,
+      model: AutoModelForObjectDetection,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/detr-resnet-50"
+      },
+      type: "multimodal"
+    },
+    "zero-shot-object-detection" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ZeroShotObjectDetectionPipeline,
+      model: AutoModelForZeroShotObjectDetection,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/owlvit-base-patch32"
+      },
+      type: "multimodal"
+    },
+    "document-question-answering" => {
+      tokenizer: AutoTokenizer,
+      pipeline: DocumentQuestionAnsweringPipeline,
+      model: AutoModelForDocumentQuestionAnswering,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/donut-base-finetuned-docvqa"
+      },
+      type: "multimodal"
+    },
+    "image-to-image" => {
+      pipeline: ImageToImagePipeline,
+      model: AutoModelForImageToImage,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/swin2SR-classical-sr-x2-64"
+      },
+      type: "image"
+    },
+    "depth-estimation" => {
+      pipeline: DepthEstimationPipeline,
+      model: AutoModelForDepthEstimation,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/dpt-large"
+      },
+      type: "image"
+    },
     "feature-extraction" => {
       tokenizer: AutoTokenizer,
       pipeline: FeatureExtractionPipeline,
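
Each new registry entry pairs a task name with its pipeline class, the auto-model class(es) to load, an optional processor, and a default ONNX model, so callers can rely on the default or name a model explicitly (sketch):

    translator = Informers.pipeline("translation")   # uses the default, Xenova/t5-small
    summarizer = Informers.pipeline("summarization", "Xenova/distilbart-cnn-6-6")
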
@@ -384,6 +1297,15 @@ module Informers
       },
       type: "text"
     },
+    "image-feature-extraction" => {
+      processor: AutoProcessor,
+      pipeline: ImageFeatureExtractionPipeline,
+      model: [AutoModelForImageFeatureExtraction, AutoModel],
+      default: {
+        model: "Xenova/vit-base-patch16-224"
+      },
+      type: "image"
+    },
     "embedding" => {
       tokenizer: AutoTokenizer,
       pipeline: EmbeddingPipeline,
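
Unlike image-classification, the image-feature-extraction task returns the model's first output tensor (image embeddings) rather than labels; a minimal sketch:

    extractor = Informers.pipeline("image-feature-extraction")   # Xenova/vit-base-patch16-224
    features = extractor.("photo.jpg")
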
@@ -406,7 +1328,8 @@ module Informers
 
   TASK_ALIASES = {
     "sentiment-analysis" => "text-classification",
-    "ner" => "token-classification"
+    "ner" => "token-classification",
+    "text-to-speech" => "text-to-audio"
   }
 
   DEFAULT_PROGRESS_CALLBACK = lambda do |msg|
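
With the new alias, "text-to-speech" resolves to the "text-to-audio" task and its TextToAudioPipeline. A sketch (speaker_embeddings stands in for a pre-loaded SpeechT5 speaker embedding, per the call signature above):

    tts = Informers.pipeline("text-to-speech")   # alias for "text-to-audio", Xenova/speecht5_tts
    tts.("Hello from Ruby", speaker_embeddings: speaker_embeddings)
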
@@ -439,14 +1362,14 @@ module Informers
     revision: "main",
     model_file_name: nil
   )
+    # Apply aliases
+    task = TASK_ALIASES[task] || task
+
     if quantized == NO_DEFAULT
       # TODO move default to task class
-      quantized = !["embedding", "reranking"].include?(task)
+      quantized = ["text-classification", "token-classification", "question-answering", "feature-extraction"].include?(task)
     end
 
-    # Apply aliases
-    task = TASK_ALIASES[task] || task
-
     # Get pipeline info
     pipeline_info = SUPPORTED_TASKS[task.split("_", 1)[0]]
     if !pipeline_info
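
In other words, quantized weights are now the default only for text-classification, token-classification, question-answering, and feature-extraction; every other task loads full-precision weights unless the caller opts in (sketch):

    classifier = Informers.pipeline("text-classification")               # quantized by default
    generator = Informers.pipeline("text-generation", quantized: true)   # explicit opt-in
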
@@ -479,7 +1402,8 @@ module Informers
     results = load_items(classes, model, pretrained_options)
     results[:task] = task
 
-    if model == "sentence-transformers/all-MiniLM-L6-v2"
+    # for previous revision of sentence-transformers/all-MiniLM-L6-v2
+    if model == "sentence-transformers/all-MiniLM-L6-v2" && results[:model].instance_variable_get(:@session).outputs.any? { |v| v[:name] == "token_embeddings" }
       results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
     end
 
@@ -502,7 +1426,15 @@ module Informers
       next if !cls
 
      if cls.is_a?(Array)
-        raise Todo
+        e = nil
+        cls.each do |c|
+          begin
+            result[name] = c.from_pretrained(model, **pretrained_options)
+          rescue => err
+            e = err
+          end
+        end
+        raise e unless result[name]
       else
         result[name] = cls.from_pretrained(model, **pretrained_options)
       end
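
When a task registers several candidate classes (for example, text-to-audio's [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram]), load_items now tries each in turn, remembers the last error, and re-raises it only if no candidate loads. The same pattern in isolation (a standalone sketch, not code from the gem):

    def load_first(candidates, model, options)
      error = nil
      result = nil
      candidates.each do |cls|
        begin
          # keep the first candidate that loads successfully
          result ||= cls.from_pretrained(model, **options)
        rescue => e
          error = e
        end
      end
      raise error unless result
      result
    end
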