informers 1.0.3 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +137 -7
- data/lib/informers/configs.rb +10 -8
- data/lib/informers/model.rb +2 -9
- data/lib/informers/models.rb +1160 -15
- data/lib/informers/pipelines.rb +943 -11
- data/lib/informers/processors.rb +856 -0
- data/lib/informers/tokenizers.rb +159 -5
- data/lib/informers/utils/audio.rb +18 -0
- data/lib/informers/utils/core.rb +4 -0
- data/lib/informers/utils/ffmpeg.rb +45 -0
- data/lib/informers/utils/generation.rb +294 -0
- data/lib/informers/utils/image.rb +116 -0
- data/lib/informers/utils/math.rb +73 -0
- data/lib/informers/utils/tensor.rb +46 -0
- data/lib/informers/version.rb +1 -1
- data/lib/informers.rb +6 -0
- metadata +10 -5
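The largest change is data/lib/informers/pipelines.rb, which adds many new pipeline classes (fill-mask, text generation, summarization, translation, zero-shot classification, plus image and audio tasks) and registers them in the task table. As a rough usage sketch (not part of the diff; it assumes the gem's `Informers.pipeline` helper shown below and the default models registered in SUPPORTED_TASKS, which are downloaded on first use):

require "informers"

# fill-mask, new in 1.1 (default model Xenova/bert-base-uncased)
unmasker = Informers.pipeline("fill-mask")
unmasker.("Paris is the [MASK] of France.", top_k: 3)

# zero-shot classification, new in 1.1 (default model Xenova/distilbert-base-uncased-mnli)
classifier = Informers.pipeline("zero-shot-classification")
classifier.("Ruby is a programming language", ["technology", "sports", "cooking"])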
data/lib/informers/pipelines.rb
CHANGED
@@ -7,6 +7,40 @@ module Informers
       @tokenizer = tokenizer
       @processor = processor
     end
+
+    private
+
+    def prepare_images(images)
+      if !images.is_a?(Array)
+        images = [images]
+      end
+
+      # Possibly convert any non-images to images
+      images.map { |x| Utils::RawImage.read(x) }
+    end
+
+    def prepare_audios(audios, sampling_rate)
+      if !audios.is_a?(Array)
+        audios = [audios]
+      end
+
+      audios.map do |x|
+        if x.is_a?(String) || x.is_a?(URI)
+          Utils.read_audio(x, sampling_rate)
+        else
+          x
+        end
+      end
+    end
+
+    def get_bounding_box(box, as_integer)
+      if as_integer
+        box = box.map { |x| x.to_i }
+      end
+      xmin, ymin, xmax, ymax = box
+
+      {xmin:, ymin:, xmax:, ymax:}
+    end
   end
 
   class TextClassificationPipeline < Pipeline
@@ -21,13 +55,13 @@ module Informers
       outputs = @model.(model_inputs)
 
       function_to_apply =
-        if @model.config
+        if @model.config[:problem_type] == "multi_label_classification"
           ->(batch) { Utils.sigmoid(batch) }
         else
           ->(batch) { Utils.softmax(batch) } # single_label_classification (default)
         end
 
-      id2label = @model.config
+      id2label = @model.config[:id2label]
 
       to_return = []
       outputs.logits.each do |batch|
@@ -70,7 +104,7 @@ module Informers
       outputs = @model.(model_inputs)
 
       logits = outputs.logits
-      id2label = @model.config
+      id2label = @model.config[:id2label]
 
       to_return = []
       logits.length.times do |i|
@@ -243,6 +277,547 @@ module Informers
     end
   end
 
+  class FillMaskPipeline < Pipeline
+    def call(texts, top_k: 5)
+      model_inputs = @tokenizer.(texts, padding: true, truncation: true)
+      outputs = @model.(model_inputs)
+
+      to_return = []
+      model_inputs[:input_ids].each_with_index do |ids, i|
+        mask_token_index = ids.index(@tokenizer.mask_token_id)
+
+        if mask_token_index.nil?
+          raise ArgumentError, "Mask token (#{@tokenizer.mask_token}) not found in text."
+        end
+        logits = outputs.logits[i]
+        item_logits = logits[mask_token_index]
+
+        scores = Utils.get_top_items(Utils.softmax(item_logits), top_k)
+
+        to_return <<
+          scores.map do |x|
+            sequence = ids.dup
+            sequence[mask_token_index] = x[0]
+
+            {
+              score: x[1],
+              token: x[0],
+              token_str: @tokenizer.id_to_token(x[0]),
+              sequence: @tokenizer.decode(sequence, skip_special_tokens: true)
+            }
+          end
+      end
+      texts.is_a?(Array) ? to_return : to_return[0]
+    end
+  end
+
+  class Text2TextGenerationPipeline < Pipeline
+    KEY = :generated_text
+
+    def call(texts, **generate_kwargs)
+      if !texts.is_a?(Array)
+        texts = [texts]
+      end
+
+      # Add global prefix, if present
+      if @model.config[:prefix]
+        texts = texts.map { |x| @model.config[:prefix] + x }
+      end
+
+      # Handle task specific params:
+      task_specific_params = @model.config[:task_specific_params]
+      if task_specific_params && task_specific_params[@task]
+        # Add prefixes, if present
+        if task_specific_params[@task]["prefix"]
+          texts = texts.map { |x| task_specific_params[@task]["prefix"] + x }
+        end
+
+        # TODO update generation config
+      end
+
+      tokenizer = @tokenizer
+      tokenizer_options = {
+        padding: true,
+        truncation: true
+      }
+      if is_a?(TranslationPipeline) && tokenizer.respond_to?(:_build_translation_inputs)
+        input_ids = tokenizer._build_translation_inputs(texts, tokenizer_options, generate_kwargs)[:input_ids]
+      else
+        input_ids = tokenizer.(texts, **tokenizer_options)[:input_ids]
+      end
+
+      output_token_ids = @model.generate(input_ids, generate_kwargs)
+
+      tokenizer.batch_decode(output_token_ids, skip_special_tokens: true)
+        .map { |text| {self.class.const_get(:KEY) => text} }
+    end
+  end
+
+  class SummarizationPipeline < Text2TextGenerationPipeline
+    KEY = :summary_text
+  end
+
+  class TranslationPipeline < Text2TextGenerationPipeline
+    KEY = :translation_text
+  end
+
+  class TextGenerationPipeline < Pipeline
+    def call(texts, **generate_kwargs)
+      is_batched = false
+      is_chat_input = false
+
+      # Normalize inputs
+      if texts.is_a?(String)
+        texts = [texts]
+        inputs = texts
+      else
+        raise Todo
+      end
+
+      # By default, do not add special tokens
+      add_special_tokens = generate_kwargs[:add_special_tokens] || false
+
+      # /By default, return full text
+      return_full_text =
+        if is_chat_input
+          false
+        else
+          generate_kwargs[:return_full_text] || true
+        end
+
+      @tokenizer.padding_side = "left"
+      input_ids, attention_mask =
+        @tokenizer.(inputs, add_special_tokens:, padding: true, truncation: true)
+          .values_at(:input_ids, :attention_mask)
+
+      output_token_ids =
+        @model.generate(
+          input_ids, generate_kwargs, nil, inputs_attention_mask: attention_mask
+        )
+
+      decoded = @tokenizer.batch_decode(output_token_ids, skip_special_tokens: true)
+
+      if !return_full_text && Utils.dims(input_ids)[-1] > 0
+        prompt_lengths = @tokenizer.batch_decode(input_ids, skip_special_tokens: true).map { |x| x.length }
+      end
+
+      to_return = Array.new(texts.length) { [] }
+      decoded.length.times do |i|
+        text_index = (i / output_token_ids.length.to_i * texts.length).floor
+
+        if prompt_lengths
+          raise Todo
+        end
+        # TODO is_chat_input
+        to_return[text_index] << {
+          generated_text: decoded[i]
+        }
+      end
+      !is_batched && to_return.length == 1 ? to_return[0] : to_return
+    end
+  end
+
+  class ZeroShotClassificationPipeline < Pipeline
+    def initialize(**options)
+      super(**options)
+
+      @label2id = @model.config[:label2id].transform_keys(&:downcase)
+
+      @entailment_id = @label2id["entailment"]
+      if @entailment_id.nil?
+        warn "Could not find 'entailment' in label2id mapping. Using 2 as entailment_id."
+        @entailment_id = 2
+      end
+
+      @contradiction_id = @label2id["contradiction"] || @label2id["not_entailment"]
+      if @contradiction_id.nil?
+        warn "Could not find 'contradiction' in label2id mapping. Using 0 as contradiction_id."
+        @contradiction_id = 0
+      end
+    end
+
+    def call(texts, candidate_labels, hypothesis_template: "This example is {}.", multi_label: false)
+      is_batched = texts.is_a?(Array)
+      if !is_batched
+        texts = [texts]
+      end
+      if !candidate_labels.is_a?(Array)
+        candidate_labels = [candidate_labels]
+      end
+
+      # Insert labels into hypothesis template
+      hypotheses = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # How to perform the softmax over the logits:
+      # - true: softmax over the entailment vs. contradiction dim for each label independently
+      # - false: softmax the "entailment" logits over all candidate labels
+      softmax_each = multi_label || candidate_labels.length == 1
+
+      to_return = []
+      texts.each do |premise|
+        entails_logits = []
+
+        hypotheses.each do |hypothesis|
+          inputs = @tokenizer.(
+            premise,
+            text_pair: hypothesis,
+            padding: true,
+            truncation: true
+          )
+          outputs = @model.(inputs)
+
+          if softmax_each
+            entails_logits << [
+              outputs.logits[0][@contradiction_id],
+              outputs.logits[0][@entailment_id]
+            ]
+          else
+            entails_logits << outputs.logits[0][@entailment_id]
+          end
+        end
+
+        scores =
+          if softmax_each
+            entails_logits.map { |x| Utils.softmax(x)[1] }
+          else
+            Utils.softmax(entails_logits)
+          end
+
+        # Sort by scores (desc) and return scores with indices
+        scores_sorted = scores.map.with_index { |x, i| [x, i] }.sort_by { |v| -v[0] }
+
+        to_return << {
+          sequence: premise,
+          labels: scores_sorted.map { |x| candidate_labels[x[1]] },
+          scores: scores_sorted.map { |x| x[0] }
+        }
+      end
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class ImageToTextPipeline < Pipeline
+    def call(images, **generate_kwargs)
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+
+      to_return = []
+      pixel_values.each do |batch|
+        batch = [batch]
+        output = @model.generate(batch, **generate_kwargs)
+        decoded = @tokenizer
+          .batch_decode(output, skip_special_tokens: true)
+          .map { |x| {generated_text: x.strip} }
+        to_return << decoded
+      end
+
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class ImageClassificationPipeline < Pipeline
+    def call(images, top_k: 1)
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+      output = @model.({pixel_values: pixel_values})
+
+      id2label = @model.config[:id2label]
+      to_return = []
+      output.logits.each do |batch|
+        scores = Utils.get_top_items(Utils.softmax(batch), top_k)
+
+        vals =
+          scores.map do |x|
+            {
+              label: id2label[x[0].to_s],
+              score: x[1]
+            }
+          end
+        if top_k == 1
+          to_return.push(*vals)
+        else
+          to_return << vals
+        end
+      end
+
+      is_batched || top_k == 1 ? to_return : to_return[0]
+    end
+  end
+
+  class ImageSegmentationPipeline < Pipeline
+    def initialize(**options)
+      super(**options)
+
+      @subtasks_mapping = {
+        "panoptic" => "post_process_panoptic_segmentation",
+        "instance" => "post_process_instance_segmentation",
+        "semantic" => "post_process_semantic_segmentation"
+      }
+    end
+
+    def call(
+      images,
+      threshold: 0.5,
+      mask_threshold: 0.5,
+      overlap_mask_area_threshold: 0.8,
+      label_ids_to_fuse: nil,
+      target_sizes: nil,
+      subtask: nil
+    )
+      is_batched = images.is_a?(Array)
+
+      if is_batched && images.length != 1
+        raise Error, "Image segmentation pipeline currently only supports a batch size of 1."
+      end
+
+      prepared_images = prepare_images(images)
+      image_sizes = prepared_images.map { |x| [x.height, x.width] }
+
+      model_inputs = @processor.(prepared_images).slice(:pixel_values, :pixel_mask)
+      output = @model.(model_inputs)
+
+      if !subtask.nil?
+        fn = @subtasks_mapping[subtask]
+      else
+        @subtasks_mapping.each do |task, func|
+          if @processor.feature_extractor.respond_to?(func)
+            fn = @processor.feature_extractor.method(func)
+            subtask = task
+            break
+          end
+        end
+      end
+
+      id2label = @model.config[:id2label]
+
+      annotation = []
+      if subtask == "panoptic" || subtask == "instance"
+        processed = fn.(
+          output,
+          threshold:,
+          mask_threshold:,
+          overlap_mask_area_threshold:,
+          label_ids_to_fuse:,
+          target_sizes: target_sizes || image_sizes, # TODO FIX?
+        )[0]
+
+        _segmentation = processed[:segmentation]
+
+        processed[:segments_info].each do |segment|
+          annotation << {
+            label: id2label[segment[:label_id].to_s],
+            score: segment[:score]
+            # TODO mask
+          }
+        end
+      elsif subtask == "semantic"
+        raise Todo
+      else
+        raise Error, "Subtask #{subtask} not supported."
+      end
+
+      annotation
+    end
+  end
+
+  class ZeroShotImageClassificationPipeline < Pipeline
+    def call(images, candidate_labels, hypothesis_template: "This is a photo of {}")
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      # Insert label into hypothesis template
+      texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # Run tokenization
+      text_inputs = @tokenizer.(texts,
+        padding: @model.config[:model_type] == "siglip" ? "max_length" : true,
+        truncation: true
+      )
+
+      # Run processor
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+
+      # Run model with both text and pixel inputs
+      output = @model.(text_inputs.merge(pixel_values: pixel_values))
+
+      function_to_apply =
+        if @model.config[:model_type] == "siglip"
+          ->(batch) { Utils.sigmoid(batch) }
+        else
+          ->(batch) { Utils.softmax(batch) }
+        end
+
+      # Compare each image with each candidate label
+      to_return = []
+      output[0].each do |batch|
+        # Compute softmax per image
+        probs = function_to_apply.(batch)
+
+        result = probs
+          .map.with_index { |x, i| {label: candidate_labels[i], score: x} }
+          .sort_by { |v| -v[:score] }
+
+        to_return << result
+      end
+
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class ObjectDetectionPipeline < Pipeline
+    def call(images, threshold: 0.9, percentage: false)
+      is_batched = images.is_a?(Array)
+
+      if is_batched && images.length != 1
+        raise Error, "Object detection pipeline currently only supports a batch size of 1."
+      end
+      prepared_images = prepare_images(images)
+
+      image_sizes = percentage ? nil : prepared_images.map { |x| [x.height, x.width] }
+
+      model_inputs = @processor.(prepared_images).slice(:pixel_values, :pixel_mask)
+      output = @model.(model_inputs)
+
+      processed = @processor.feature_extractor.post_process_object_detection(output, threshold, image_sizes)
+
+      # Add labels
+      id2label = @model.config[:id2label]
+
+      # Format output
+      result =
+        processed.map do |batch|
+          batch[:boxes].map.with_index do |box, i|
+            {
+              label: id2label[batch[:classes][i].to_s],
+              score: batch[:scores][i],
+              box: get_bounding_box(box, !percentage)
+            }
+          end.sort_by { |v| -v[:score] }
+        end
+
+      is_batched ? result : result[0]
+    end
+  end
+
+  class ZeroShotObjectDetectionPipeline < Pipeline
+    def call(
+      images,
+      candidate_labels,
+      threshold: 0.1,
+      top_k: nil,
+      percentage: false
+    )
+      is_batched = images.is_a?(Array)
+      prepared_images = prepare_images(images)
+
+      # Run tokenization
+      text_inputs = @tokenizer.(candidate_labels,
+        padding: true,
+        truncation: true
+      )
+
+      # Run processor
+      model_inputs = @processor.(prepared_images)
+
+      # Since non-maximum suppression is performed for exporting, we need to
+      # process each image separately. For more information, see:
+      # https://github.com/huggingface/optimum/blob/e3b7efb1257c011db907ef40ab340e795cc5684c/optimum/exporters/onnx/model_configs.py#L1028-L1032
+      to_return = []
+      prepared_images.length.times do |i|
+        image = prepared_images[i]
+        image_size = percentage ? nil : [[image.height, image.width]]
+        pixel_values = [model_inputs[:pixel_values][i]]
+
+        # Run model with both text and pixel inputs
+        output = @model.(text_inputs.merge(pixel_values: pixel_values))
+        # TODO remove
+        output = @model.instance_variable_get(:@session).outputs.map { |v| v[:name].to_sym }.zip(output).to_h
+
+        processed = @processor.feature_extractor.post_process_object_detection(output, threshold, image_size, true)[0]
+        result =
+          processed[:boxes].map.with_index do |box, i|
+            {
+              label: candidate_labels[processed[:classes][i]],
+              score: processed[:scores][i],
+              box: get_bounding_box(box, !percentage)
+            }
+          end
+        result.sort_by! { |v| -v[:score] }
+        if !top_k.nil?
+          result = result[0...topk]
+        end
+        to_return << result
+      end
+
+      is_batched ? to_return : to_return[0]
+    end
+  end
+
+  class DocumentQuestionAnsweringPipeline < Pipeline
+    def call(image, question, **generate_kwargs)
+      # NOTE: For now, we only support a batch size of 1
+
+      # Preprocess image
+      prepared_image = prepare_images(image)[0]
+      pixel_values = @processor.(prepared_image)[:pixel_values]
+
+      # Run tokenization
+      task_prompt = "<s_docvqa><s_question>#{question}</s_question><s_answer>"
+      decoder_input_ids =
+        @tokenizer.(
+          task_prompt,
+          add_special_tokens: false,
+          padding: true,
+          truncation: true
+        )[:input_ids]
+
+      # Run model
+      output =
+        @model.generate(
+          pixel_values,
+          generate_kwargs.merge(
+            decoder_input_ids: decoder_input_ids[0],
+            max_length: @model.config["decoder"]["max_position_embeddings"]
+          ).transform_keys(&:to_s)
+        )
+
+      # Decode output
+      decoded = @tokenizer.batch_decode(output, skip_special_tokens: false)[0]
+
+      # Parse answer
+      match = decoded.match(/<s_answer>(.*?)<\/s_answer>/)
+      answer = nil
+      if match && match.length >= 2
+        answer = match[1].strip
+      end
+      [{answer:}]
+    end
+  end
+
+  class TextToAudioPipeline < Pipeline
+    DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan"
+
+    def initialize(**options)
+      super(**options)
+
+      # TODO: Find a better way for `pipeline` to set the default vocoder
+      @vocoder = options[:vocoder]
+    end
+
+    def call(text_inputs, speaker_embeddings: nil)
+      # If this.processor is not set, we are using a `AutoModelForTextToWaveform` model
+      if @processor
+        call_text_to_spectrogram(text_inputs, speaker_embeddings:)
+      else
+        call_text_to_waveform(text_inputs)
+      end
+    end
+  end
+
   class FeatureExtractionPipeline < Pipeline
     def call(
       texts,
@@ -262,7 +837,7 @@ module Informers
       if !model_output.nil?
         model_options[:output_names] = Array(model_output)
       elsif @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
-        # optimization for sentence-transformers/all-MiniLM-L6-v2
+        # optimization for previous revision of sentence-transformers/all-MiniLM-L6-v2
         model_options[:output_names] = ["sentence_embedding"]
         pooling = "none"
         normalize = false
@@ -306,6 +881,164 @@ module Informers
     end
   end
 
+  class ImageFeatureExtractionPipeline < Pipeline
+    def call(images)
+      prepared_images = prepare_images(images)
+      pixel_values = @processor.(prepared_images)[:pixel_values]
+      outputs = @model.({pixel_values: pixel_values})
+
+      result = outputs[0]
+      result
+    end
+  end
+
+  class AudioClassificationPipeline < Pipeline
+    def call(audio, top_k: nil)
+      single = !audio.is_a?(Array)
+
+      sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+      prepared_audios = prepare_audios(audio, sampling_rate)
+
+      id2label = @model.config[:id2label]
+
+      to_return = []
+      prepared_audios.each do |aud|
+        inputs = @processor.(aud)
+        output = @model.(inputs)
+        logits = output.logits[0]
+
+        scores = Utils.get_top_items(Utils.softmax(logits), top_k)
+
+        vals =
+          scores.map do |x|
+            {
+              label: id2label[x[0].to_s],
+              score: x[1]
+            }
+          end
+
+        if top_k == 1
+          to_return.concat(vals)
+        else
+          to_return << vals
+        end
+      end
+      !single || top_k == 1 ? to_return : to_return[0]
+    end
+  end
+
+  class ZeroShotAudioClassificationPipeline < Pipeline
+    def call(audio, candidate_labels, hypothesis_template: "This is a sound of {}.")
+      single = !audio.is_a?(Array)
+      if single
+        audio = [audio]
+      end
+
+      # Insert label into hypothesis template
+      texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) }
+
+      # Run tokenization
+      text_inputs =
+        @tokenizer.(
+          texts,
+          padding: true,
+          truncation: true
+        )
+
+      sampling_rate = @processor.feature_extractor.config["sampling_rate"]
+      prepared_audios = prepare_audios(audio, sampling_rate)
+
+      to_return = []
+      prepared_audios.each do |aud|
+        audio_inputs = @processor.(aud)
+
+        # Run model with both text and audio inputs
+        output = @model.(text_inputs.merge(audio_inputs))
+
+        # Compute softmax per audio
+        probs = Utils.softmax(output.logits_per_audio.data)
+
+        to_return <<
+          probs.map.with_index do |x, i|
+            {
+              label: candidate_labels[i],
+              score: x
+            }
+          end
+      end
+      single ? to_return[0] : to_return
+    end
+  end
+
+  class AutomaticSpeechRecognitionPipeline < Pipeline
+    def call(audio, **kwargs)
+      case @model.config["model_type"]
+      when "whisper"
+        call_whisper(audio, **kwargs)
+      else
+        raise Error, "AutomaticSpeechRecognitionPipeline does not support model type '#{@model.config["model_type"]}'."
+      end
+    end
+
+    private
+
+    def call_whisper(audio, **kwargs)
+      raise Todo
+    end
+  end
+
+  class ImageToImagePipeline < Pipeline
+    def call(images)
+      prepared_images = prepare_images(images)
+      inputs = @processor.(prepared_images)
+      outputs = @model.(inputs)
+
+      to_return = []
+      outputs[0].each do |batch|
+        # TODO flatten first
+        output =
+          batch.map do |v|
+            v.map do |v2|
+              v2.map do |v3|
+                (v3.clamp(0, 1) * 255).round
+              end
+            end
+          end
+        to_return << Utils::RawImage.from_array(output).image
+      end
+
+      to_return.length > 1 ? to_return : to_return[0]
+    end
+  end
+
+  class DepthEstimationPipeline < Pipeline
+    def call(images)
+      prepared_images = prepare_images(images)
+
+      inputs = @processor.(prepared_images)
+      predicted_depth = @model.(inputs)[0]
+
+      to_return = []
+      prepared_images.length.times do |i|
+        prediction = Utils.interpolate(predicted_depth[i], prepared_images[i].size.reverse, "bilinear", false)
+        max_prediction = Utils.max(prediction.flatten)[0]
+        formatted =
+          prediction.map do |v|
+            v.map do |v2|
+              v2.map do |v3|
+                (v3 * 255 / max_prediction).round
+              end
+            end
+          end
+        to_return << {
+          predicted_depth: predicted_depth[i],
+          depth: Utils::RawImage.from_array(formatted).image
+        }
+      end
+      to_return.length > 1 ? to_return : to_return[0]
+    end
+  end
+
   class EmbeddingPipeline < FeatureExtractionPipeline
     def call(
       texts,
@@ -375,6 +1108,186 @@ module Informers
       },
       type: "text"
     },
+    "fill-mask" => {
+      tokenizer: AutoTokenizer,
+      pipeline: FillMaskPipeline,
+      model: AutoModelForMaskedLM,
+      default: {
+        model: "Xenova/bert-base-uncased"
+      },
+      type: "text"
+    },
+    "summarization" => {
+      tokenizer: AutoTokenizer,
+      pipeline: SummarizationPipeline,
+      model: AutoModelForSeq2SeqLM,
+      default: {
+        model: "Xenova/distilbart-cnn-6-6"
+      },
+      type: "text"
+    },
+    "translation" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TranslationPipeline,
+      model: AutoModelForSeq2SeqLM,
+      default: {
+        model: "Xenova/t5-small"
+      },
+      type: "text"
+    },
+    "text2text-generation" => {
+      tokenizer: AutoTokenizer,
+      pipeline: Text2TextGenerationPipeline,
+      model: AutoModelForSeq2SeqLM,
+      default: {
+        model: "Xenova/flan-t5-small"
+      },
+      type: "text"
+    },
+    "text-generation" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TextGenerationPipeline,
+      model: AutoModelForCausalLM,
+      default: {
+        model: "Xenova/gpt2"
+      },
+      type: "text"
+    },
+    "zero-shot-classification" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ZeroShotClassificationPipeline,
+      model: AutoModelForSequenceClassification,
+      default: {
+        model: "Xenova/distilbert-base-uncased-mnli"
+      },
+      type: "text"
+    },
+    "audio-classification" => {
+      pipeline: AudioClassificationPipeline,
+      model: AutoModelForAudioClassification,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/wav2vec2-base-superb-ks"
+      },
+      type: "audio"
+    },
+    # TODO
+    # "zero-shot-audio-classification" => {
+    #   tokenizer: AutoTokenizer,
+    #   pipeline: ZeroShotAudioClassificationPipeline,
+    #   model: AutoModel,
+    #   processor: AutoProcessor,
+    #   default: {
+    #     model: "Xenova/clap-htsat-unfused"
+    #   },
+    #   type: "multimodal"
+    # },
+    # TODO
+    # "automatic-speech-recognition" => {
+    #   tokenizer: AutoTokenizer,
+    #   pipeline: AutomaticSpeechRecognitionPipeline,
+    #   model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
+    #   processor: AutoProcessor,
+    #   default: {
+    #     model: "Xenova/whisper-tiny.en"
+    #   },
+    #   type: "multimodal"
+    # },
+    "text-to-audio" => {
+      tokenizer: AutoTokenizer,
+      pipeline: TextToAudioPipeline,
+      model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
+      processor: [AutoProcessor, nil],
+      default: {
+        model: "Xenova/speecht5_tts"
+      },
+      type: "text"
+    },
+    "image-to-text" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ImageToTextPipeline,
+      model: AutoModelForVision2Seq,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/vit-gpt2-image-captioning"
+      },
+      type: "multimodal"
+    },
+    "image-classification" => {
+      pipeline: ImageClassificationPipeline,
+      model: AutoModelForImageClassification,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/vit-base-patch16-224"
+      },
+      type: "multimodal"
+    },
+    "image-segmentation" => {
+      pipeline: ImageSegmentationPipeline,
+      model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/detr-resnet-50-panoptic"
+      },
+      type: "multimodal"
+    },
+    "zero-shot-image-classification" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ZeroShotImageClassificationPipeline,
+      model: AutoModel,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/clip-vit-base-patch32"
+      },
+      type: "multimodal"
+    },
+    "object-detection" => {
+      pipeline: ObjectDetectionPipeline,
+      model: AutoModelForObjectDetection,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/detr-resnet-50"
+      },
+      type: "multimodal"
+    },
+    "zero-shot-object-detection" => {
+      tokenizer: AutoTokenizer,
+      pipeline: ZeroShotObjectDetectionPipeline,
+      model: AutoModelForZeroShotObjectDetection,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/owlvit-base-patch32"
+      },
+      type: "multimodal"
+    },
+    "document-question-answering" => {
+      tokenizer: AutoTokenizer,
+      pipeline: DocumentQuestionAnsweringPipeline,
+      model: AutoModelForDocumentQuestionAnswering,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/donut-base-finetuned-docvqa"
+      },
+      type: "multimodal"
+    },
+    "image-to-image" => {
+      pipeline: ImageToImagePipeline,
+      model: AutoModelForImageToImage,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/swin2SR-classical-sr-x2-64"
+      },
+      type: "image"
+    },
+    "depth-estimation" => {
+      pipeline: DepthEstimationPipeline,
+      model: AutoModelForDepthEstimation,
+      processor: AutoProcessor,
+      default: {
+        model: "Xenova/dpt-large"
+      },
+      type: "image"
+    },
     "feature-extraction" => {
       tokenizer: AutoTokenizer,
       pipeline: FeatureExtractionPipeline,
@@ -384,6 +1297,15 @@ module Informers
       },
       type: "text"
     },
+    "image-feature-extraction" => {
+      processor: AutoProcessor,
+      pipeline: ImageFeatureExtractionPipeline,
+      model: [AutoModelForImageFeatureExtraction, AutoModel],
+      default: {
+        model: "Xenova/vit-base-patch16-224"
+      },
+      type: "image"
+    },
     "embedding" => {
       tokenizer: AutoTokenizer,
       pipeline: EmbeddingPipeline,
@@ -406,7 +1328,8 @@ module Informers
 
   TASK_ALIASES = {
     "sentiment-analysis" => "text-classification",
-    "ner" => "token-classification"
+    "ner" => "token-classification",
+    "text-to-speech" => "text-to-audio"
   }
 
   DEFAULT_PROGRESS_CALLBACK = lambda do |msg|
@@ -439,14 +1362,14 @@ module Informers
     revision: "main",
     model_file_name: nil
   )
+    # Apply aliases
+    task = TASK_ALIASES[task] || task
+
     if quantized == NO_DEFAULT
       # TODO move default to task class
-      quantized =
+      quantized = ["text-classification", "token-classification", "question-answering", "feature-extraction"].include?(task)
     end
 
-    # Apply aliases
-    task = TASK_ALIASES[task] || task
-
     # Get pipeline info
     pipeline_info = SUPPORTED_TASKS[task.split("_", 1)[0]]
     if !pipeline_info
@@ -479,7 +1402,8 @@ module Informers
     results = load_items(classes, model, pretrained_options)
     results[:task] = task
 
-
+    # for previous revision of sentence-transformers/all-MiniLM-L6-v2
+    if model == "sentence-transformers/all-MiniLM-L6-v2" && results[:model].instance_variable_get(:@session).outputs.any? { |v| v[:name] == "token_embeddings" }
       results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
     end
 
@@ -502,7 +1426,15 @@ module Informers
       next if !cls
 
       if cls.is_a?(Array)
-
+        e = nil
+        cls.each do |c|
+          begin
+            result[name] = c.from_pretrained(model, **pretrained_options)
+          rescue => err
+            e = err
+          end
+        end
+        raise e unless result[name]
       else
         result[name] = cls.from_pretrained(model, **pretrained_options)
       end