informers 1.0.3 → 1.1.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +137 -7
- data/lib/informers/configs.rb +10 -8
- data/lib/informers/model.rb +2 -9
- data/lib/informers/models.rb +1160 -15
- data/lib/informers/pipelines.rb +943 -11
- data/lib/informers/processors.rb +856 -0
- data/lib/informers/tokenizers.rb +159 -5
- data/lib/informers/utils/audio.rb +18 -0
- data/lib/informers/utils/core.rb +4 -0
- data/lib/informers/utils/ffmpeg.rb +45 -0
- data/lib/informers/utils/generation.rb +294 -0
- data/lib/informers/utils/image.rb +116 -0
- data/lib/informers/utils/math.rb +73 -0
- data/lib/informers/utils/tensor.rb +46 -0
- data/lib/informers/version.rb +1 -1
- data/lib/informers.rb +6 -0
- metadata +10 -5
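
This release adds a large number of new pipeline tasks to pipelines.rb (fill-mask, summarization, translation, text2text/text generation, zero-shot classification, plus image and audio pipelines). A minimal usage sketch, assuming the gem's top-level Informers.pipeline helper and the default checkpoints registered in SUPPORTED_TASKS in the diff below; "image.jpg" is a placeholder path and models download on first use:

  require "informers"

  # sketch only: task names and default models are taken from the diff below
  unmasker = Informers.pipeline("fill-mask")                    # Xenova/bert-base-uncased
  unmasker.("Paris is the [MASK] of France.", top_k: 3)

  classifier = Informers.pipeline("zero-shot-classification")  # Xenova/distilbert-base-uncased-mnli
  classifier.("I love Ruby", ["programming", "cooking", "sports"])

  captioner = Informers.pipeline("image-to-text")               # Xenova/vit-gpt2-image-captioning
  captioner.("image.jpg")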
data/lib/informers/pipelines.rb
CHANGED
@@ -7,6 +7,40 @@ module Informers
       @tokenizer = tokenizer
       @processor = processor
     end
+
+    private
+
+    def prepare_images(images)
+      if !images.is_a?(Array)
+        images = [images]
+      end
+
+      # Possibly convert any non-images to images
+      images.map { |x| Utils::RawImage.read(x) }
+    end
+
+    def prepare_audios(audios, sampling_rate)
+      if !audios.is_a?(Array)
+        audios = [audios]
+      end
+
+      audios.map do |x|
+        if x.is_a?(String) || x.is_a?(URI)
+          Utils.read_audio(x, sampling_rate)
+        else
+          x
+        end
+      end
+    end
+
+    def get_bounding_box(box, as_integer)
+      if as_integer
+        box = box.map { |x| x.to_i }
+      end
+      xmin, ymin, xmax, ymax = box
+
+      {xmin:, ymin:, xmax:, ymax:}
+    end
   end
 
   class TextClassificationPipeline < Pipeline
@@ -21,13 +55,13 @@ module Informers
       outputs = @model.(model_inputs)
 
       function_to_apply =
-        if @model.config
+        if @model.config[:problem_type] == "multi_label_classification"
           ->(batch) { Utils.sigmoid(batch) }
         else
           ->(batch) { Utils.softmax(batch) } # single_label_classification (default)
         end
 
-      id2label = @model.config
+      id2label = @model.config[:id2label]
 
       to_return = []
       outputs.logits.each do |batch|
@@ -70,7 +104,7 @@ module Informers
       outputs = @model.(model_inputs)
 
       logits = outputs.logits
-      id2label = @model.config
+      id2label = @model.config[:id2label]
 
      to_return = []
       logits.length.times do |i|
@@ -243,6 +277,547 @@ module Informers
     end
   end
 
+  class FillMaskPipeline < Pipeline
+    def call(texts, top_k: 5)
+      model_inputs = @tokenizer.(texts, padding: true, truncation: true)
+      outputs = @model.(model_inputs)
+
+      to_return = []
+      model_inputs[:input_ids].each_with_index do |ids, i|
+        mask_token_index = ids.index(@tokenizer.mask_token_id)
+
+        if mask_token_index.nil?
+          raise ArgumentError, "Mask token (#{@tokenizer.mask_token}) not found in text."
+        end
+        logits = outputs.logits[i]
+        item_logits = logits[mask_token_index]
+
+        scores = Utils.get_top_items(Utils.softmax(item_logits), top_k)
+
+        to_return <<
+          scores.map do |x|
+            sequence = ids.dup
+            sequence[mask_token_index] = x[0]
+
+            {
+              score: x[1],
+              token: x[0],
+              token_str: @tokenizer.id_to_token(x[0]),
+              sequence: @tokenizer.decode(sequence, skip_special_tokens: true)
+            }
+          end
+      end
+      texts.is_a?(Array) ? to_return : to_return[0]
+    end
+  end
+
+  class Text2TextGenerationPipeline < Pipeline
+    KEY = :generated_text
+
+    def call(texts, **generate_kwargs)
+      if !texts.is_a?(Array)
+        texts = [texts]
+      end
+
+      # Add global prefix, if present
+      if @model.config[:prefix]
+        texts = texts.map { |x| @model.config[:prefix] + x }
+      end
+
+      # Handle task specific params:
+      task_specific_params = @model.config[:task_specific_params]
+      if task_specific_params && task_specific_params[@task]
+        # Add prefixes, if present
+        if task_specific_params[@task]["prefix"]
+          texts = texts.map { |x| task_specific_params[@task]["prefix"] + x }
+        end
+
+        # TODO update generation config
+      end
+
+      tokenizer = @tokenizer
+      tokenizer_options = {
+        padding: true,
+        truncation: true
+      }
+      if is_a?(TranslationPipeline) && tokenizer.respond_to?(:_build_translation_inputs)
+        input_ids = tokenizer._build_translation_inputs(texts, tokenizer_options, generate_kwargs)[:input_ids]
+      else
+        input_ids = tokenizer.(texts, **tokenizer_options)[:input_ids]
+      end
+
+      output_token_ids = @model.generate(input_ids, generate_kwargs)
+
+      tokenizer.batch_decode(output_token_ids, skip_special_tokens: true)
+        .map { |text| {self.class.const_get(:KEY) => text} }
+    end
+  end
+
+  class SummarizationPipeline < Text2TextGenerationPipeline
+    KEY = :summary_text
+  end
+
+  class TranslationPipeline < Text2TextGenerationPipeline
+    KEY = :translation_text
+  end
+
+  class TextGenerationPipeline < Pipeline
+    def call(texts, **generate_kwargs)
+      is_batched = false
+      is_chat_input = false
+
+      # Normalize inputs
+      if texts.is_a?(String)
+        texts = [texts]
+        inputs = texts
+      else
+        raise Todo
+      end
+
+      # By default, do not add special tokens
+      add_special_tokens = generate_kwargs[:add_special_tokens] || false
+
+      # /By default, return full text
+      return_full_text =
+        if is_chat_input
+          false
+        else
+          generate_kwargs[:return_full_text] || true
+        end
+
+      @tokenizer.padding_side = "left"
+      input_ids, attention_mask =
+        @tokenizer.(inputs, add_special_tokens:, padding: true, truncation: true)
+          .values_at(:input_ids, :attention_mask)
+
+      output_token_ids =
+        @model.generate(
+          input_ids, generate_kwargs, nil, inputs_attention_mask: attention_mask
+        )
+
+      decoded = @tokenizer.batch_decode(output_token_ids, skip_special_tokens: true)
+
+      if !return_full_text && Utils.dims(input_ids)[-1] > 0
+        prompt_lengths = @tokenizer.batch_decode(input_ids, skip_special_tokens: true).map { |x| x.length }
+      end
+
+      to_return = Array.new(texts.length) { [] }
+      decoded.length.times do |i|
+        text_index = (i / output_token_ids.length.to_i * texts.length).floor
+
+        if prompt_lengths
+          raise Todo
+        end
+        # TODO is_chat_input
+        to_return[text_index] << {
+          generated_text: decoded[i]
+        }
+      end
+      !is_batched && to_return.length == 1 ? to_return[0] : to_return
+    end
+  end
+
+  class ZeroShotClassificationPipeline < Pipeline
+    def initialize(**options)
+      super(**options)
+
+      @label2id = @model.config[:label2id].transform_keys(&:downcase)
+
+      @entailment_id = @label2id["entailment"]
+      if @entailment_id.nil?
+        warn "Could not find 'entailment' in label2id mapping. Using 2 as entailment_id."
+        @entailment_id = 2
+      end
+
+      @contradiction_id = @label2id["contradiction"] || @label2id["not_entailment"]
+      if @contradiction_id.nil?
+        warn "Could not find 'contradiction' in label2id mapping. Using 0 as contradiction_id."
+        @contradiction_id = 0
+      end
+    end
+
+    def call(texts, candidate_labels, hypothesis_template: "This example is {}.", multi_label: false)
+      is_batched = texts.is_a?(Array)
+      if !is_batched
+        texts = [texts]
+      end
+      if !candidate_labels.is_a?(Array)
+        candidate_labels = [candidate_labels]
+      end
+
+      # Insert labels into hypothesis template
+      hypotheses = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # How to perform the softmax over the logits:
+      # - true: softmax over the entailment vs. contradiction dim for each label independently
+      # - false: softmax the "entailment" logits over all candidate labels
+      softmax_each = multi_label || candidate_labels.length == 1
+
+      to_return = []
+      texts.each do |premise|
+        entails_logits = []
+
+        hypotheses.each do |hypothesis|
+          inputs = @tokenizer.(
+            premise,
+            text_pair: hypothesis,
+            padding: true,
+            truncation: true
+          )
+          outputs = @model.(inputs)
+
+          if softmax_each
+            entails_logits << [
+              outputs.logits[0][@contradiction_id],
+              outputs.logits[0][@entailment_id]
+            ]
+          else
+            entails_logits << outputs.logits[0][@entailment_id]
+          end
+        end
+
+        scores =
+          if softmax_each
+            entails_logits.map { |x| Utils.softmax(x)[1] }
+          else
+            Utils.softmax(entails_logits)
+          end
+
+        # Sort by scores (desc) and return scores with indices
+        scores_sorted = scores.map.with_index { |x, i| [x, i] }.sort_by { |v| -v[0] }
+
+        to_return << {
+          sequence: premise,
+          labels: scores_sorted.map { |x| candidate_labels[x[1]] },
+          scores: scores_sorted.map { |x| x[0] }
+        }
+      end
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class ImageToTextPipeline < Pipeline
+    def call(images, **generate_kwargs)
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+
+      to_return = []
+      pixel_values.each do |batch|
+        batch = [batch]
+        output = @model.generate(batch, **generate_kwargs)
+        decoded = @tokenizer
+          .batch_decode(output, skip_special_tokens: true)
+          .map { |x| {generated_text: x.strip} }
+        to_return << decoded
+      end
+
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class ImageClassificationPipeline < Pipeline
+    def call(images, top_k: 1)
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+      output = @model.({pixel_values: pixel_values})
+
+      id2label = @model.config[:id2label]
+      to_return = []
+      output.logits.each do |batch|
+        scores = Utils.get_top_items(Utils.softmax(batch), top_k)
+
+        vals =
+          scores.map do |x|
+            {
+              label: id2label[x[0].to_s],
+              score: x[1]
+            }
+          end
+        if top_k == 1
+          to_return.push(*vals)
+        else
+          to_return << vals
+        end
+      end
+
+      is_batched || top_k == 1 ? to_return : to_return[0]
+    end
+  end
+
+  class ImageSegmentationPipeline < Pipeline
+    def initialize(**options)
+      super(**options)
+
+      @subtasks_mapping = {
+        "panoptic" => "post_process_panoptic_segmentation",
+        "instance" => "post_process_instance_segmentation",
+        "semantic" => "post_process_semantic_segmentation"
+      }
+    end
+
+    def call(
+      images,
+      threshold: 0.5,
+      mask_threshold: 0.5,
+      overlap_mask_area_threshold: 0.8,
+      label_ids_to_fuse: nil,
+      target_sizes: nil,
+      subtask: nil
+    )
+      is_batched = images.is_a?(Array)
+
+      if is_batched && images.length != 1
+        raise Error, "Image segmentation pipeline currently only supports a batch size of 1."
+      end
+
+      prepared_images = prepare_images(images)
+      image_sizes = prepared_images.map { |x| [x.height, x.width] }
+
+      model_inputs = @processor.(prepared_images).slice(:pixel_values, :pixel_mask)
+      output = @model.(model_inputs)
+
+      if !subtask.nil?
+        fn = @subtasks_mapping[subtask]
+      else
+        @subtasks_mapping.each do |task, func|
+          if @processor.feature_extractor.respond_to?(func)
+            fn = @processor.feature_extractor.method(func)
+            subtask = task
+            break
+          end
+        end
+      end
+
+      id2label = @model.config[:id2label]
+
+      annotation = []
+      if subtask == "panoptic" || subtask == "instance"
+        processed = fn.(
+          output,
+          threshold:,
+          mask_threshold:,
+          overlap_mask_area_threshold:,
+          label_ids_to_fuse:,
+          target_sizes: target_sizes || image_sizes, # TODO FIX?
+        )[0]
+
+        _segmentation = processed[:segmentation]
+
+        processed[:segments_info].each do |segment|
+          annotation << {
+            label: id2label[segment[:label_id].to_s],
+            score: segment[:score]
+            # TODO mask
+          }
+        end
+      elsif subtask == "semantic"
+        raise Todo
+      else
+        raise Error, "Subtask #{subtask} not supported."
+      end
+
+      annotation
+    end
+  end
+
+  class ZeroShotImageClassificationPipeline < Pipeline
+    def call(images, candidate_labels, hypothesis_template: "This is a photo of {}")
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      # Insert label into hypothesis template
+      texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # Run tokenization
+      text_inputs = @tokenizer.(texts,
+        padding: @model.config[:model_type] == "siglip" ? "max_length" : true,
+        truncation: true
+      )
+
+      # Run processor
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+
+      # Run model with both text and pixel inputs
+      output = @model.(text_inputs.merge(pixel_values: pixel_values))
+
+      function_to_apply =
+        if @model.config[:model_type] == "siglip"
+          ->(batch) { Utils.sigmoid(batch) }
+        else
+          ->(batch) { Utils.softmax(batch) }
+        end
+
+      # Compare each image with each candidate label
+      to_return = []
+      output[0].each do |batch|
+        # Compute softmax per image
+        probs = function_to_apply.(batch)
+
+        result = probs
+          .map.with_index { |x, i| {label: candidate_labels[i], score: x} }
+          .sort_by { |v| -v[:score] }
+
+        to_return << result
+      end
+
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class ObjectDetectionPipeline < Pipeline
+    def call(images, threshold: 0.9, percentage: false)
+      is_batched = images.is_a?(Array)
+
+      if is_batched && images.length != 1
+        raise Error, "Object detection pipeline currently only supports a batch size of 1."
+      end
+      prepared_images = prepare_images(images)
+
+      image_sizes = percentage ? nil : prepared_images.map { |x| [x.height, x.width] }
+
+      model_inputs = @processor.(prepared_images).slice(:pixel_values, :pixel_mask)
+      output = @model.(model_inputs)
+
+      processed = @processor.feature_extractor.post_process_object_detection(output, threshold, image_sizes)
+
+      # Add labels
+      id2label = @model.config[:id2label]
+
+      # Format output
+      result =
+        processed.map do |batch|
+          batch[:boxes].map.with_index do |box, i|
+            {
+              label: id2label[batch[:classes][i].to_s],
+              score: batch[:scores][i],
+              box: get_bounding_box(box, !percentage)
+            }
+          end.sort_by { |v| -v[:score] }
+        end
+
+      is_batched ? result : result[0]
+    end
+  end
+
+  class ZeroShotObjectDetectionPipeline < Pipeline
+    def call(
+      images,
+      candidate_labels,
+      threshold: 0.1,
+      top_k: nil,
+      percentage: false
+    )
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      # Run tokenization
+      text_inputs = @tokenizer.(candidate_labels,
+        padding: true,
+        truncation: true
+      )
+
+      # Run processor
+      model_inputs = @processor.(prepared_images)
+
+      # Since non-maximum suppression is performed for exporting, we need to
+      # process each image separately. For more information, see:
+      # https://github.com/huggingface/optimum/blob/e3b7efb1257c011db907ef40ab340e795cc5684c/optimum/exporters/onnx/model_configs.py#L1028-L1032
+      to_return = []
+      prepared_images.length.times do |i|
+        image = prepared_images[i]
+        image_size = percentage ? nil : [[image.height, image.width]]
+        pixel_values = [model_inputs[:pixel_values][i]]
+
+        # Run model with both text and pixel inputs
+        output = @model.(text_inputs.merge(pixel_values: pixel_values))
+        # TODO remove
+        output = @model.instance_variable_get(:@session).outputs.map { |v| v[:name].to_sym }.zip(output).to_h
+
+        processed = @processor.feature_extractor.post_process_object_detection(output, threshold, image_size, true)[0]
+        result =
+          processed[:boxes].map.with_index do |box, i|
+            {
+              label: candidate_labels[processed[:classes][i]],
+              score: processed[:scores][i],
+              box: get_bounding_box(box, !percentage)
+            }
+          end
+        result.sort_by! { |v| -v[:score] }
+        if !top_k.nil?
+          result = result[0...topk]
+        end
+        to_return << result
+      end
+
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class DocumentQuestionAnsweringPipeline < Pipeline
+    def call(image, question, **generate_kwargs)
+      # NOTE: For now, we only support a batch size of 1
+
+      # Preprocess image
+      prepared_image = prepare_images(image)[0]
+      pixel_values = @processor.(prepared_image)[:pixel_values]
+
+      # Run tokenization
+      task_prompt = "<s_docvqa><s_question>#{question}</s_question><s_answer>"
+      decoder_input_ids =
+        @tokenizer.(
+          task_prompt,
+          add_special_tokens: false,
+          padding: true,
+          truncation: true
+        )[:input_ids]
+
+      # Run model
+      output =
+        @model.generate(
+          pixel_values,
+          generate_kwargs.merge(
+            decoder_input_ids: decoder_input_ids[0],
+            max_length: @model.config["decoder"]["max_position_embeddings"]
+          ).transform_keys(&:to_s)
+        )
+
+      # Decode output
+      decoded = @tokenizer.batch_decode(output, skip_special_tokens: false)[0]
+
+      # Parse answer
+      match = decoded.match(/<s_answer>(.*?)<\/s_answer>/)
+      answer = nil
+      if match && match.length >= 2
+        answer = match[1].strip
+      end
+      [{answer:}]
+    end
+  end
+
+  class TextToAudioPipeline < Pipeline
+    DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan"
+
+    def initialize(**options)
+      super(**options)
+
+      # TODO: Find a better way for `pipeline` to set the default vocoder
+      @vocoder = options[:vocoder]
+    end
+
+    def call(text_inputs, speaker_embeddings: nil)
+      # If this.processor is not set, we are using a `AutoModelForTextToWaveform` model
+      if @processor
+        call_text_to_spectrogram(text_inputs, speaker_embeddings:)
+      else
+        call_text_to_waveform(text_inputs)
+      end
+    end
+  end
+
   class FeatureExtractionPipeline < Pipeline
     def call(
       texts,
@@ -262,7 +837,7 @@ module Informers
       if !model_output.nil?
         model_options[:output_names] = Array(model_output)
       elsif @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
-        # optimization for sentence-transformers/all-MiniLM-L6-v2
+        # optimization for previous revision of sentence-transformers/all-MiniLM-L6-v2
         model_options[:output_names] = ["sentence_embedding"]
         pooling = "none"
         normalize = false
@@ -306,6 +881,164 @@ module Informers
     end
   end
 
+  class ImageFeatureExtractionPipeline < Pipeline
+    def call(images)
+      prepared_images = prepare_images(images)
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+      outputs = @model.({pixel_values: pixel_values})
+
+      result = outputs[0]
+      result
+    end
+  end
+
+  class AudioClassificationPipeline < Pipeline
+    def call(audio, top_k: nil)
+      single = !audio.is_a?(Array)
+
+      sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+      prepared_audios = prepare_audios(audio, sampling_rate)
+
+      id2label = @model.config[:id2label]
+
+      to_return = []
+      prepared_audios.each do |aud|
+        inputs = @processor.(aud)
+        output = @model.(inputs)
+        logits = output.logits[0]
+
+        scores = Utils.get_top_items(Utils.softmax(logits), top_k)
+
+        vals =
+          scores.map do |x|
+            {
+              label: id2label[x[0].to_s],
+              score: x[1]
+            }
+          end
+
+        if top_k == 1
+          to_return.concat(vals)
+        else
+          to_return << vals
+        end
+      end
+      !single || top_k == 1 ? to_return : to_return[0]
+    end
+  end
+
+  class ZeroShotAudioClassificationPipeline < Pipeline
+    def call(audio, candidate_labels, hypothesis_template: "This is a sound of {}.")
+      single = !audio.is_a?(Array)
+      if single
+        audio = [audio]
+      end
+
+      # Insert label into hypothesis template
+      texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # Run tokenization
+      text_inputs =
+        @tokenizer.(
+          texts,
+          padding: true,
+          truncation: true
+        )
+
+      sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+      prepared_audios = prepare_audios(audio, sampling_rate)
+
+      to_return = []
+      prepared_audios.each do |aud|
+        audio_inputs = @processor.(aud)
+
+        # Run model with both text and audio inputs
+        output = @model.(text_inputs.merge(audio_inputs))
+
+        # Compute softmax per audio
+        probs = Utils.softmax(output.logits_per_audio.data)
+
+        to_return <<
+          probs.map.with_index do |x, i|
+            {
+              label: candidate_labels[i],
+              score: x
+            }
+          end
+      end
+      single ? to_return[0] : to_return
+    end
+  end
+
+  class AutomaticSpeechRecognitionPipeline < Pipeline
+    def call(audio, **kwargs)
+      case @model.config["model_type"]
+      when "whisper"
+        call_whisper(audio, **kwargs)
+      else
+        raise Error, "AutomaticSpeechRecognitionPipeline does not support model type '#{@model.config["model_type"]}'."
+      end
+    end
+
+    private
+
+    def call_whisper(audio, **kwargs)
+      raise Todo
+    end
+  end
+
+  class ImageToImagePipeline < Pipeline
+    def call(images)
+      prepared_images = prepare_images(images)
+      inputs = @processor.(prepared_images)
+      outputs = @model.(inputs)
+
+      to_return = []
+      outputs[0].each do |batch|
+        # TODO flatten first
+        output =
+          batch.map do |v|
+            v.map do |v2|
+              v2.map do |v3|
+                (v3.clamp(0, 1) * 255).round
+              end
+            end
+          end
+        to_return << Utils::RawImage.from_array(output).image
+      end
+
+      to_return.length > 1 ? to_return : to_return[0]
+    end
+  end
+
+  class DepthEstimationPipeline < Pipeline
+    def call(images)
+      prepared_images = prepare_images(images)
+
+      inputs = @processor.(prepared_images)
+      predicted_depth = @model.(inputs)[0]
+
+      to_return = []
+      prepared_images.length.times do |i|
+        prediction = Utils.interpolate(predicted_depth[i], prepared_images[i].size.reverse, "bilinear", false)
+        max_prediction = Utils.max(prediction.flatten)[0]
+        formatted =
+          prediction.map do |v|
+            v.map do |v2|
+              v2.map do |v3|
+                (v3 * 255 / max_prediction).round
+              end
+            end
+          end
+        to_return << {
+          predicted_depth: predicted_depth[i],
+          depth: Utils::RawImage.from_array(formatted).image
+        }
+      end
+      to_return.length > 1 ? to_return : to_return[0]
+    end
+  end
+
   class EmbeddingPipeline < FeatureExtractionPipeline
     def call(
       texts,
@@ -375,6 +1108,186 @@ module Informers
       },
       type: "text"
     },
+    "fill-mask" => {
+      tokenizer: AutoTokenizer,
+      pipeline: FillMaskPipeline,
+      model: AutoModelForMaskedLM,
+      default: {
+        model: "Xenova/bert-base-uncased"
+      },
+      type: "text"
+    },
+    "summarization" => {
+      tokenizer: AutoTokenizer,
+      pipeline: SummarizationPipeline,
+      model: AutoModelForSeq2SeqLM,
+      default: {
+        model: "Xenova/distilbart-cnn-6-6"
+      },
+      type: "text"
+    },
+    "translation" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TranslationPipeline,
+      model: AutoModelForSeq2SeqLM,
+      default: {
+        model: "Xenova/t5-small"
+      },
+      type: "text"
+    },
+    "text2text-generation" => {
+      tokenizer: AutoTokenizer,
+      pipeline: Text2TextGenerationPipeline,
+      model: AutoModelForSeq2SeqLM,
+      default: {
+        model: "Xenova/flan-t5-small"
+      },
+      type: "text"
+    },
+    "text-generation" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TextGenerationPipeline,
+      model: AutoModelForCausalLM,
+      default: {
+        model: "Xenova/gpt2"
+      },
+      type: "text"
+    },
+    "zero-shot-classification" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ZeroShotClassificationPipeline,
+      model: AutoModelForSequenceClassification,
+      default: {
+        model: "Xenova/distilbert-base-uncased-mnli"
+      },
+      type: "text"
+    },
+    "audio-classification" => {
+      pipeline: AudioClassificationPipeline,
+      model: AutoModelForAudioClassification,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/wav2vec2-base-superb-ks"
+      },
+      type: "audio"
+    },
+    # TODO
+    # "zero-shot-audio-classification" => {
+    #   tokenizer: AutoTokenizer,
+    #   pipeline: ZeroShotAudioClassificationPipeline,
+    #   model: AutoModel,
+    #   processor: AutoProcessor,
+    #   default: {
+    #     model: "Xenova/clap-htsat-unfused"
+    #   },
+    #   type: "multimodal"
+    # },
+    # TODO
+    # "automatic-speech-recognition" => {
+    #   tokenizer: AutoTokenizer,
+    #   pipeline: AutomaticSpeechRecognitionPipeline,
+    #   model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
+    #   processor: AutoProcessor,
+    #   default: {
+    #     model: "Xenova/whisper-tiny.en"
+    #   },
+    #   type: "multimodal"
+    # },
+    "text-to-audio" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TextToAudioPipeline,
+      model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
+      processor: [AutoProcessor, nil],
+      default: {
+        model: "Xenova/speecht5_tts"
+      },
+      type: "text"
+    },
+    "image-to-text" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ImageToTextPipeline,
+      model: AutoModelForVision2Seq,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/vit-gpt2-image-captioning"
+      },
+      type: "multimodal"
+    },
+    "image-classification" => {
+      pipeline: ImageClassificationPipeline,
+      model: AutoModelForImageClassification,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/vit-base-patch16-224"
+      },
+      type: "multimodal"
+    },
+    "image-segmentation" => {
+      pipeline: ImageSegmentationPipeline,
+      model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/detr-resnet-50-panoptic"
+      },
+      type: "multimodal"
+    },
+    "zero-shot-image-classification" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ZeroShotImageClassificationPipeline,
+      model: AutoModel,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/clip-vit-base-patch32"
+      },
+      type: "multimodal"
+    },
+    "object-detection" => {
+      pipeline: ObjectDetectionPipeline,
+      model: AutoModelForObjectDetection,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/detr-resnet-50"
+      },
+      type: "multimodal"
+    },
+    "zero-shot-object-detection" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ZeroShotObjectDetectionPipeline,
+      model: AutoModelForZeroShotObjectDetection,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/owlvit-base-patch32"
+      },
+      type: "multimodal"
+    },
+    "document-question-answering" => {
+      tokenizer: AutoTokenizer,
+      pipeline: DocumentQuestionAnsweringPipeline,
+      model: AutoModelForDocumentQuestionAnswering,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/donut-base-finetuned-docvqa"
+      },
+      type: "multimodal"
+    },
+    "image-to-image" => {
+      pipeline: ImageToImagePipeline,
+      model: AutoModelForImageToImage,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/swin2SR-classical-sr-x2-64"
+      },
+      type: "image"
+    },
+    "depth-estimation" => {
+      pipeline: DepthEstimationPipeline,
+      model: AutoModelForDepthEstimation,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/dpt-large"
+      },
+      type: "image"
+    },
     "feature-extraction" => {
       tokenizer: AutoTokenizer,
       pipeline: FeatureExtractionPipeline,
@@ -384,6 +1297,15 @@ module Informers
       },
       type: "text"
     },
+    "image-feature-extraction" => {
+      processor: AutoProcessor,
+      pipeline: ImageFeatureExtractionPipeline,
+      model: [AutoModelForImageFeatureExtraction, AutoModel],
+      default: {
+        model: "Xenova/vit-base-patch16-224"
+      },
+      type: "image"
+    },
     "embedding" => {
       tokenizer: AutoTokenizer,
       pipeline: EmbeddingPipeline,
@@ -406,7 +1328,8 @@ module Informers
 
   TASK_ALIASES = {
     "sentiment-analysis" => "text-classification",
-    "ner" => "token-classification"
+    "ner" => "token-classification",
+    "text-to-speech" => "text-to-audio"
   }
 
   DEFAULT_PROGRESS_CALLBACK = lambda do |msg|
@@ -439,14 +1362,14 @@ module Informers
       revision: "main",
       model_file_name: nil
     )
+      # Apply aliases
+      task = TASK_ALIASES[task] || task
+
       if quantized == NO_DEFAULT
         # TODO move default to task class
-        quantized =
+        quantized = ["text-classification", "token-classification", "question-answering", "feature-extraction"].include?(task)
       end
 
-      # Apply aliases
-      task = TASK_ALIASES[task] || task
-
       # Get pipeline info
       pipeline_info = SUPPORTED_TASKS[task.split("_", 1)[0]]
       if !pipeline_info
@@ -479,7 +1402,8 @@ module Informers
       results = load_items(classes, model, pretrained_options)
       results[:task] = task
 
-
+      # for previous revision of sentence-transformers/all-MiniLM-L6-v2
+      if model == "sentence-transformers/all-MiniLM-L6-v2" && results[:model].instance_variable_get(:@session).outputs.any? { |v| v[:name] == "token_embeddings" }
         results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
       end
 
@@ -502,7 +1426,15 @@ module Informers
         next if !cls
 
         if cls.is_a?(Array)
-
+          e = nil
+          cls.each do |c|
+            begin
+              result[name] = c.from_pretrained(model, **pretrained_options)
+            rescue => err
+              e = err
+            end
+          end
+          raise e unless result[name]
         else
           result[name] = cls.from_pretrained(model, **pretrained_options)
         end
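
A few user-visible behavior changes follow from the final hunks, shown here as a hedged sketch (task names, option names, and defaults are taken from the diff above; the comments describe the assumed outcomes): the "text-to-speech" alias now resolves to the new "text-to-audio" task, quantization defaults to on only for the four listed text tasks, and when a task maps to an array of model classes, load_items now tries each class in turn and re-raises the last error only if none of them loads.

  # "text-to-speech" is an alias for "text-to-audio" (default model Xenova/speecht5_tts)
  tts = Informers.pipeline("text-to-speech")

  # quantized defaults to true only for text-classification, token-classification,
  # question-answering, and feature-extraction; pass quantized: true elsewhere if wanted
  detector = Informers.pipeline("object-detection", quantized: true)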