gimlet-api 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gimlet_api-0.0.7.dist-info → gimlet_api-0.0.8.dist-info}/METADATA +1 -1
- {gimlet_api-0.0.7.dist-info → gimlet_api-0.0.8.dist-info}/RECORD +18 -17
- gml/client.py +7 -6
- gml/compile.py +13 -93
- gml/device.py +3 -7
- gml/hf.py +294 -32
- gml/model.py +2 -1
- gml/pipelines.py +125 -6
- gml/preprocessing.py +2 -1
- gml/proto/src/api/corepb/v1/controlplane_pb2.py +35 -20
- gml/proto/src/api/corepb/v1/cp_edge_pb2.py +43 -49
- gml/proto/src/api/corepb/v1/device_info_pb2.py +15 -7
- gml/proto/src/api/corepb/v1/mediastream_pb2.py +23 -19
- gml/proto/src/api/corepb/v1/model_exec_pb2.py +127 -112
- gml/proto/src/controlplane/compiler/cpb/v1/cpb_pb2.py +7 -11
- gml/register_submodules.py +134 -0
- gml/tensor.py +2 -1
- {gimlet_api-0.0.7.dist-info → gimlet_api-0.0.8.dist-info}/WHEEL +0 -0
gml/hf.py
CHANGED
@@ -15,14 +15,23 @@
 # SPDX-License-Identifier: Apache-2.0

 import glob
+import math
 import tempfile
+import warnings
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Tuple

-import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 import torch
 import transformers
+from transformers import (
+    BaseImageProcessor,
+    Pipeline,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+)
+
+import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 from gml.asset_manager import AssetManager
 from gml.model import GenerationConfig, Model, TorchModel
 from gml.preprocessing import (
@@ -34,10 +43,12 @@ from gml.preprocessing import (
 )
 from gml.tensor import (
     AttentionKeyValueCacheTensorSemantics,
+    AttentionMaskDimension,
     BatchDimension,
     BoundingBoxFormat,
     DetectionNumCandidatesDimension,
     DetectionOutputDimension,
+    DimensionSemantics,
     ImageChannelDimension,
     ImageHeightDimension,
     ImageWidthDimension,
@@ -46,12 +57,8 @@ from gml.tensor import (
     TokensDimension,
     VocabLogitsDimension,
 )
-
-
-    Pipeline,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-)
+
+FALLBACK_RESIZE_SIZE = 512


 class HuggingFaceTokenizer(Model):
@@ -77,7 +84,6 @@ class HuggingFaceTokenizer(Model):


 class HuggingFaceGenerationConfig(GenerationConfig):
-
     def __init__(self, model: PreTrainedModel):
         config = model.generation_config
         eos_tokens = config.eos_token_id
@@ -242,25 +248,34 @@ class HuggingFaceTextGenerationPipeline:


 class HuggingFaceImageProcessor:
-
     def __init__(
         self,
         model: PreTrainedModel,
         processor: BaseImageProcessor,
+        image_size_override: Optional[Tuple[int, int]] = None,
     ):
         self.model = model
         self.processor = processor
+        self.image_size_override = image_size_override

     def input_spec(self) -> Dict[str, Any]:
         target_size = None
         image_preprocessing_steps = []
-
-            hasattr(self.processor, "do_resize")
-
-
-
+        has_do_resize = (
+            hasattr(self.processor, "do_resize") and self.processor.do_resize
+        )
+        has_do_pad = hasattr(self.processor, "do_pad") and self.processor.do_pad
+        # NOTE: it is possible for both do_resize and do_pad to be set, in which case we only use do_resize.
+        if has_do_resize:
             target_size, preprocessing_step = self._convert_resize()
             image_preprocessing_steps.append(preprocessing_step)
+        elif has_do_pad:
+            target_size, preprocessing_step = self._convert_pad()
+            image_preprocessing_steps.append(preprocessing_step)
+        else:
+            raise ValueError(
+                "could not determine target size for resize from model config"
+            )

         if (
             hasattr(self.processor, "do_rescale")
@@ -291,7 +306,7 @@ class HuggingFaceImageProcessor:
         # TODO(james): figure out if this is specified anywhere in the huggingface pipeline.
         channel_format = "rgb"

-        dimensions = [
+        dimensions: list[DimensionSemantics] = [
             BatchDimension(),
         ]
         input_shape = [1]
@@ -342,21 +357,38 @@ class HuggingFaceImageProcessor:
             "class_labels": labels,
         }

-    def
+    def output_spec_depth(self) -> Dict[str, Any]:
+        dimensions = [
+            BatchDimension(),
+            ImageHeightDimension(),
+            ImageWidthDimension(),
+        ]
+        output_tensor_semantics = [
+            TensorSemantics(dimensions),
+        ]
+        return {
+            "output_tensor_semantics": output_tensor_semantics,
+        }
+
+    def output_spec_object_detection(self, zero_shot=False) -> Dict[str, Any]:
         if not hasattr(self.processor, "post_process_object_detection"):
             raise NotImplementedError(
                 "processor must have post_process_object_detection set"
             )

-
-
-
-
-
-
-
-
-
+        if zero_shot:
+            num_classes = -1
+            labels = []
+        else:
+            id_to_label = self.model.config.id2label
+            max_id = max(id_to_label)
+            labels = []
+            for i in range(max_id):
+                if i not in id_to_label:
+                    labels.append("")
+                    continue
+                labels.append(id_to_label[i])
+            num_classes = max_id + 1

         # TODO(james): verify assumptions made here apply broadly.
         output_tensor_semantics = []
@@ -366,7 +398,7 @@ class HuggingFaceImageProcessor:
             DetectionNumCandidatesDimension(is_nms=False),
             DetectionOutputDimension(
                 scores_range=(0, num_classes),
-                scores_are_logits=
+                scores_are_logits=not zero_shot,
             ),
         ]
         output_tensor_semantics.append(TensorSemantics(logits_dimensions))
@@ -385,12 +417,45 @@ class HuggingFaceImageProcessor:
             "class_labels": labels,
         }

+    def _get_size(self) -> Dict[str, int]:
+        size = None
+        if self.image_size_override:
+            size = {
+                "height": self.image_size_override[0],
+                "width": self.image_size_override[1],
+            }
+        elif hasattr(self.processor, "size") and self.processor.size is not None:
+            size = self.processor.size
+        elif (
+            hasattr(self.model.config, "image_size")
+            and self.model.config.image_size is not None
+        ):
+            size = {
+                "height": self.model.config.image_size,
+                "width": self.model.config.image_size,
+            }
+        else:
+            warnings.warn(
+                f"using fallback resize size of {FALLBACK_RESIZE_SIZE} for model",
+                stacklevel=1,
+            )
+            size = {
+                "width": FALLBACK_RESIZE_SIZE,
+                "height": FALLBACK_RESIZE_SIZE,
+            }
+        return size
+
     def _convert_resize(self) -> Tuple[Tuple[int, int], ImagePreprocessingStep]:
-        size = self.
+        size = self._get_size()
+        size_divisor: int | None = None
+        if hasattr(self.processor, "size_divisor"):
+            size_divisor = self.processor.size_divisor
+
         target_size = None
         preprocess_step = None
+
         if "height" in size and "width" in size:
-            target_size =
+            target_size = (size["height"], size["width"])
             preprocess_step = ResizeImage()
         elif (
             "shortest_edge" in size
@@ -410,12 +475,55 @@ class HuggingFaceImageProcessor:
             if not min_size or edge_size < min_size:
                 min_size = edge_size

-
+            if min_size is None:
+                raise ValueError(
+                    "could not determine target size for resize from model config"
+                )
+            target_size = (min_size, min_size)
+            preprocess_step = LetterboxImage()
+        else:
+            raise ValueError(
+                "could not determine target size for resize from model config"
+            )
+        if size_divisor:
+            target_size = (
+                math.ceil(target_size[0] / size_divisor) * size_divisor,
+                math.ceil(target_size[1] / size_divisor) * size_divisor,
+            )
+        return target_size, preprocess_step
+
+    def _convert_pad(self) -> Tuple[Tuple[int, int], ImagePreprocessingStep]:
+        # NOTE: There is a wide variety of ways that huggingface pads images.
+        # We found at least 3 different ways to pad images in the codebase:
+        # 1. Center pad (pad top, left, bottom, right) to match target size
+        # https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/models/dpt/image_processing_dpt.py#L231
+        # 2. Right/Top pad (pad top, and right) to match target size
+        # https://github.com/huggingface/transformers/blob/174890280b340b89c5bfa092f6b4fb0e2dc2d7fc/src/transformers/models/conditional_detr/image_processing_conditional_detr.py#L846
+        # 3. Pad to nearest multiple of size_divisor
+        # https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/models/llava_onevision/image_processing_llava_onevision.py#L177-179
+        #
+        # We decided to simply implement padding with LetterBoxImage(),
+        # because we assume the models won't be that sensitive to the type of padding,
+        # but this may need to be revisited in the future.
+        size = self._get_size()
+        size_divisor: int | None = None
+        if hasattr(self.processor, "size_divisor"):
+            size_divisor = self.processor.size_divisor
+
+        target_size = None
+        preprocess_step = None
+        if "height" in size and "width" in size:
+            target_size = (size["height"], size["width"])
             preprocess_step = LetterboxImage()
         else:
             raise ValueError(
                 "could not determine target size for resize from model config"
             )
+        if size_divisor:
+            target_size = (
+                math.ceil(target_size[0] / size_divisor) * size_divisor,
+                math.ceil(target_size[1] / size_divisor) * size_divisor,
+            )
         return target_size, preprocess_step


@@ -424,11 +532,13 @@ class HuggingFaceImageSegmentationPipeline:
         self,
         pipeline: Pipeline,
         name: Optional[str] = None,
+        image_size_override: Optional[Tuple[int, int]] = None,
     ):
         self.pipeline = pipeline
         if name is None:
             name = pipeline.model.name_or_path

+        self.image_size_override = image_size_override
         self.model = TorchModel(
             name,
             torch_module=self.pipeline.model,
@@ -446,7 +556,9 @@ class HuggingFaceImageSegmentationPipeline:
         )

         image_processor = HuggingFaceImageProcessor(
-            self.pipeline.model,
+            self.pipeline.model,
+            self.pipeline.image_processor,
+            image_size_override=self.image_size_override,
         )
         spec = image_processor.input_spec()
         spec.update(image_processor.output_spec_segmentation())
@@ -471,11 +583,13 @@ class HuggingFaceObjectDetectionPipeline:
         self,
         pipeline: Pipeline,
         name: Optional[str] = None,
+        image_size_override: Optional[Tuple[int, int]] = None,
     ):
         self.pipeline = pipeline
         if name is None:
             name = pipeline.model.name_or_path

+        self.image_size_override = image_size_override
         self.model = TorchModel(
             name,
             torch_module=ObjectDetectionWrapper(self.pipeline.model),
@@ -493,7 +607,9 @@ class HuggingFaceObjectDetectionPipeline:
         )

         image_processor = HuggingFaceImageProcessor(
-            self.pipeline.model,
+            self.pipeline.model,
+            self.pipeline.image_processor,
+            image_size_override=self.image_size_override,
         )
         spec = image_processor.input_spec()
         spec.update(image_processor.output_spec_object_detection())
@@ -503,6 +619,141 @@ class HuggingFaceObjectDetectionPipeline:
         return [self.model]


+class ZeroShotObjectDetectionWrapper(torch.nn.Module):
+    def __init__(self, model: PreTrainedModel):
+        super().__init__()
+        self.model = model
+
+    def forward(self, image, tokens, attention_mask):
+        outputs = self.model(
+            input_ids=tokens, pixel_values=image, attention_mask=attention_mask
+        )
+        return torch.sigmoid(outputs.logits), outputs.pred_boxes
+
+
+class HuggingFaceZeroShotObjectDetectionPipeline:
+    def __init__(
+        self,
+        pipeline: Pipeline,
+        name: Optional[str] = None,
+        tokenizer_name: Optional[str] = None,
+        image_size_override: Optional[Tuple[int, int]] = None,
+    ):
+        self.pipeline = pipeline
+        if name is None:
+            name = pipeline.model.name_or_path
+
+        self.tokenizer_model = HuggingFaceTokenizer(
+            self.pipeline.tokenizer, tokenizer_name
+        )
+
+        self.image_size_override = image_size_override
+        self.detection_model = TorchModel(
+            name,
+            torch_module=ZeroShotObjectDetectionWrapper(self.pipeline.model),
+            **self._guess_model_spec(),
+        )
+
+    def _add_zero_shot_inputs(self, spec: Dict):
+        example_inputs = spec["example_inputs"]
+        if "dynamic_shapes" not in spec:
+            spec["dynamic_shapes"] = [{} for _ in example_inputs]
+
+        max_length = self.pipeline.model.config.text_config.max_length
+        example_inputs.extend(
+            [
+                torch.randint(200, [2, max_length]).to(torch.int32),
+                torch.ones([2, max_length]).to(torch.int32),
+            ]
+        )
+
+        input_tensor_semantics = spec["input_tensor_semantics"]
+        input_tensor_semantics.extend(
+            [
+                TensorSemantics(
+                    [
+                        BatchDimension(),
+                        TokensDimension(),
+                    ]
+                ),
+                TensorSemantics(
+                    [
+                        BatchDimension(),
+                        AttentionMaskDimension(),
+                    ]
+                ),
+            ]
+        )
+
+        spec["dynamic_shapes"].extend(
+            [
+                {0: "num_labels"},
+                {0: "num_labels"},
+            ]
+        )
+
+    def _guess_model_spec(self) -> Dict:
+        if self.pipeline.image_processor is None:
+            raise ValueError(
+                "Could not determine image preprocessing for pipeline with image_processor=None"
+            )
+
+        image_processor = HuggingFaceImageProcessor(
+            self.pipeline.model,
+            self.pipeline.image_processor,
+            image_size_override=self.image_size_override,
+        )
+        spec = image_processor.input_spec()
+        self._add_zero_shot_inputs(spec)
+        spec.update(image_processor.output_spec_object_detection(zero_shot=True))
+        return spec
+
+    def models(self) -> List[Model]:
+        return [self.detection_model, self.tokenizer_model]
+
+
+class HuggingFaceDepthEstimationPipeline:
+    def __init__(
+        self,
+        pipeline: Pipeline,
+        name: Optional[str] = None,
+        image_size_override: Optional[Tuple[int, int]] = None,
+    ):
+        self.pipeline = pipeline
+        if name is None:
+            name = pipeline.model.name_or_path
+
+        self.image_size_override = image_size_override
+
+        self.model = TorchModel(
+            name,
+            torch_module=self.pipeline.model,
+            **self._guess_model_spec(),
+        )
+
+    def _guess_model_spec(self) -> Dict:
+        if self.pipeline.image_processor is None:
+            raise ValueError(
+                "Could not determine image preprocessing for pipeline with image_processor=None"
+            )
+        if self.pipeline.tokenizer is not None:
+            raise NotImplementedError(
+                "HuggingFaceDepthEstimationPipeline does not yet support token inputs"
+            )
+
+        image_processor = HuggingFaceImageProcessor(
+            self.pipeline.model,
+            self.pipeline.image_processor,
+            image_size_override=self.image_size_override,
+        )
+        spec = image_processor.input_spec()
+        spec.update(image_processor.output_spec_depth())
+        return spec
+
+    def models(self) -> List[Model]:
+        return [self.model]
+
+
 def import_huggingface_pipeline(pipeline: Pipeline, **kwargs) -> List[Model]:
     if pipeline.framework != "pt":
         raise ValueError(
@@ -517,8 +768,19 @@ def import_huggingface_pipeline(pipeline: Pipeline, **kwargs) -> List[Model]:
         return HuggingFaceImageSegmentationPipeline(pipeline, **kwargs).models()
     elif pipeline.task == "object-detection":
         return HuggingFaceObjectDetectionPipeline(pipeline, **kwargs).models()
+    elif pipeline.task == "zero-shot-object-detection":
+        return HuggingFaceZeroShotObjectDetectionPipeline(pipeline, **kwargs).models()
+    elif pipeline.task == "depth-estimation":
+        return HuggingFaceDepthEstimationPipeline(pipeline, **kwargs).models()
     raise ValueError(
         "unimplemented: hugging face pipeline task: {} (supported tasks: [{}])".format(
-            pipeline.task,
+            pipeline.task,
+            [
+                "text-generation",
+                "image-segmentation",
+                "object-detection",
+                "zero-shot-object-detection",
+                "depth-estimation",
+            ],
        )
    )
gml/model.py
CHANGED
@@ -21,8 +21,9 @@ import io
 from pathlib import Path
 from typing import BinaryIO, Dict, List, Literal, Optional, Sequence, TextIO, Tuple

-import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 import torch
+
+import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 from gml.asset_manager import AssetManager, TempFileAssetManager
 from gml.compile import to_torch_mlir
 from gml.preprocessing import ImagePreprocessingStep
gml/pipelines.py
CHANGED
@@ -15,7 +15,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import abc
-from typing import List
+from typing import List, Optional

 import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 from gml.model import Model
@@ -41,12 +41,44 @@ class SingleModelPipeline(Pipeline):


 class SimpleDetectionPipeline(SingleModelPipeline):
-    def __init__(
-        self
+    def __init__(
+        self,
+        track_objects: Optional[bool] = None,
+        add_tracking_id: Optional[bool] = None,
+    ):
+        self.track_objects = False
+        if add_tracking_id is not None:
+            import warnings
+
+            warnings.warn(
+                "The 'add_tracking_id' parameter is deprecated and will be removed in a future version.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            self.track_objects = add_tracking_id
+
+        if track_objects is not None:
+            self.track_objects = track_objects
+
+        if track_objects is not None and add_tracking_id is not None:
+            raise ValueError(
+                "'track_objects' and 'add_tracking_id' cannot be set simultaneously."
+            )

     def _to_yaml(self, model_name: str, org_name: str):
-        add_tracking_id = "true" if self.add_tracking_id else "false"
         # editorconfig-checker-disable
+        video_stream_detections = ".detect.detections"
+        track_node = ""
+        if self.track_objects:
+            track_node = """
+  - name: track
+    kind: Track
+    inputs:
+      detections: .detect.detections
+    outputs:
+      - tracked_detections
+"""
+            video_stream_detections = ".track.tracked_detections"
         return f"""---
 nodes:
   - name: camera_source
@@ -56,7 +88,6 @@ nodes:
   - name: detect
     kind: Detect
     attributes:
-      add_tracking_id: {add_tracking_id}
       model:
         model:
           name: {model_name}
@@ -66,13 +97,14 @@ nodes:
       frame: .camera_source.frame
     outputs:
       - detections
+{track_node}
   - name: video_stream_sink
     kind: VideoStreamSink
     attributes:
      frame_rate_limit: 30
    inputs:
      frame: .camera_source.frame
-      detections:
+      detections: {video_stream_detections}
 """


@@ -110,6 +142,36 @@ nodes:
 """


+class SimpleDepthEstimationPipeline(SingleModelPipeline):
+    def _to_yaml(self, model_name: str, org_name: str):
+        # editorconfig-checker-disable
+        return f"""---
+nodes:
+  - name: camera_source
+    kind: CameraSource
+    outputs:
+      - frame
+  - name: estimate_depth
+    kind: EstimateDepth
+    attributes:
+      model:
+        model:
+          name: {model_name}
+          org: {org_name}
+      frame_rate_limit: 30
+    inputs:
+      frame: .camera_source.frame
+    outputs:
+      - depth
+  - name: video_stream_sink
+    kind: VideoStreamSink
+    attributes:
+      frame_rate_limit: 30
+    inputs:
+      frame: .estimate_depth.depth
+"""
+
+
 class LiveChatPipeline(Pipeline):
     def to_yaml(self, models: List[Model], org_name: str) -> str:
         if len(models) != 2:
@@ -173,4 +235,61 @@ nodes:
 """


+class ZeroShotObjectDetectionPipeline(Pipeline):
+    def __init__(self, conf_threshold=0.1):
+        self.conf_threshold = conf_threshold
+
+    def to_yaml(self, models: List[Model], org_name: str) -> str:
+        if len(models) != 2:
+            raise ValueError(
+                "ZeroShotObjectDetectionPipeline expects two models (a detection model and a tokenizer)"
+            )
+        tokenizer = None
+        detect = None
+        for m in models:
+            if m.storage_format == modelexecpb.ModelInfo.MODEL_STORAGE_FORMAT_OPAQUE:
+                tokenizer = m
+            else:
+                detect = m
+        if tokenizer is None or detect is None:
+            raise ValueError(
+                "ZeroShotObjectDetectionPipeline expects both a tokenizer model and a detection model)"
+            )
+        return f"""---
+nodes:
+  - name: camera_source
+    kind: CameraSource
+    outputs:
+      - frame
+  - name: text_source
+    kind: TextStreamSource
+    outputs:
+      - prompt
+  - name: detect
+    kind: Detect
+    attributes:
+      model:
+        model:
+          name: {detect.name}
+          org: {org_name}
+      tokenizer:
+        model:
+          name: {tokenizer.name}
+          org: {org_name}
+      conf_threshold: {self.conf_threshold}
+    inputs:
+      frame: .camera_source.frame
+      prompt: .text_source.prompt
+    outputs:
+      - detections
+  - name: video_stream_sink
+    kind: VideoStreamSink
+    attributes:
+      frame_rate_limit: 30
+    inputs:
+      frame: .camera_source.frame
+      detections: .detect.detections
+"""
+
+
 # editorconfig-checker-enable
gml/preprocessing.py
CHANGED
@@ -17,9 +17,10 @@
 import abc
 from typing import List

-import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 import google.protobuf.wrappers_pb2 as wrapperspb

+import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
+

 class ImagePreprocessingStep(abc.ABC):
     @abc.abstractmethod