gimlet-api 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gml/hf.py CHANGED
@@ -15,14 +15,23 @@
  # SPDX-License-Identifier: Apache-2.0

  import glob
+ import math
  import tempfile
+ import warnings
  from collections.abc import Iterable
  from pathlib import Path
  from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Tuple

- import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
  import torch
  import transformers
+ from transformers import (
+     BaseImageProcessor,
+     Pipeline,
+     PreTrainedModel,
+     PreTrainedTokenizer,
+ )
+
+ import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
  from gml.asset_manager import AssetManager
  from gml.model import GenerationConfig, Model, TorchModel
  from gml.preprocessing import (
@@ -34,10 +43,12 @@ from gml.preprocessing import (
  )
  from gml.tensor import (
      AttentionKeyValueCacheTensorSemantics,
+     AttentionMaskDimension,
      BatchDimension,
      BoundingBoxFormat,
      DetectionNumCandidatesDimension,
      DetectionOutputDimension,
+     DimensionSemantics,
      ImageChannelDimension,
      ImageHeightDimension,
      ImageWidthDimension,
@@ -46,12 +57,8 @@ from gml.tensor import (
      TokensDimension,
      VocabLogitsDimension,
  )
- from transformers import (
-     BaseImageProcessor,
-     Pipeline,
-     PreTrainedModel,
-     PreTrainedTokenizer,
- )
+
+ FALLBACK_RESIZE_SIZE = 512


  class HuggingFaceTokenizer(Model):
@@ -77,7 +84,6 @@ class HuggingFaceTokenizer(Model):


  class HuggingFaceGenerationConfig(GenerationConfig):
-
      def __init__(self, model: PreTrainedModel):
          config = model.generation_config
          eos_tokens = config.eos_token_id
@@ -242,25 +248,34 @@ class HuggingFaceTextGenerationPipeline:


  class HuggingFaceImageProcessor:
-
      def __init__(
          self,
          model: PreTrainedModel,
          processor: BaseImageProcessor,
+         image_size_override: Optional[Tuple[int, int]] = None,
      ):
          self.model = model
          self.processor = processor
+         self.image_size_override = image_size_override

      def input_spec(self) -> Dict[str, Any]:
          target_size = None
          image_preprocessing_steps = []
-         if (
-             hasattr(self.processor, "do_resize")
-             and self.processor.do_resize
-             and hasattr(self.processor, "size")
-         ):
+         has_do_resize = (
+             hasattr(self.processor, "do_resize") and self.processor.do_resize
+         )
+         has_do_pad = hasattr(self.processor, "do_pad") and self.processor.do_pad
+         # NOTE: it is possible for both do_resize and do_pad to be set, in which case we only use do_resize.
+         if has_do_resize:
              target_size, preprocessing_step = self._convert_resize()
              image_preprocessing_steps.append(preprocessing_step)
+         elif has_do_pad:
+             target_size, preprocessing_step = self._convert_pad()
+             image_preprocessing_steps.append(preprocessing_step)
+         else:
+             raise ValueError(
+                 "could not determine target size for resize from model config"
+             )

          if (
              hasattr(self.processor, "do_rescale")
@@ -291,7 +306,7 @@ class HuggingFaceImageProcessor:
          # TODO(james): figure out if this is specified anywhere in the huggingface pipeline.
          channel_format = "rgb"

-         dimensions = [
+         dimensions: list[DimensionSemantics] = [
              BatchDimension(),
          ]
          input_shape = [1]
@@ -342,21 +357,38 @@ class HuggingFaceImageProcessor:
              "class_labels": labels,
          }

-     def output_spec_object_detection(self) -> Dict[str, Any]:
+     def output_spec_depth(self) -> Dict[str, Any]:
+         dimensions = [
+             BatchDimension(),
+             ImageHeightDimension(),
+             ImageWidthDimension(),
+         ]
+         output_tensor_semantics = [
+             TensorSemantics(dimensions),
+         ]
+         return {
+             "output_tensor_semantics": output_tensor_semantics,
+         }
+
+     def output_spec_object_detection(self, zero_shot=False) -> Dict[str, Any]:
          if not hasattr(self.processor, "post_process_object_detection"):
              raise NotImplementedError(
                  "processor must have post_process_object_detection set"
              )

-         id_to_label = self.model.config.id2label
-         max_id = max(id_to_label)
-         labels = []
-         for i in range(max_id):
-             if i not in id_to_label:
-                 labels.append("")
-                 continue
-             labels.append(id_to_label[i])
-         num_classes = max_id + 1
+         if zero_shot:
+             num_classes = -1
+             labels = []
+         else:
+             id_to_label = self.model.config.id2label
+             max_id = max(id_to_label)
+             labels = []
+             for i in range(max_id):
+                 if i not in id_to_label:
+                     labels.append("")
+                     continue
+                 labels.append(id_to_label[i])
+             num_classes = max_id + 1

          # TODO(james): verify assumptions made here apply broadly.
          output_tensor_semantics = []
@@ -366,7 +398,7 @@ class HuggingFaceImageProcessor:
              DetectionNumCandidatesDimension(is_nms=False),
              DetectionOutputDimension(
                  scores_range=(0, num_classes),
-                 scores_are_logits=True,
+                 scores_are_logits=not zero_shot,
              ),
          ]
          output_tensor_semantics.append(TensorSemantics(logits_dimensions))
@@ -385,12 +417,45 @@ class HuggingFaceImageProcessor:
              "class_labels": labels,
          }

+     def _get_size(self) -> Dict[str, int]:
+         size = None
+         if self.image_size_override:
+             size = {
+                 "height": self.image_size_override[0],
+                 "width": self.image_size_override[1],
+             }
+         elif hasattr(self.processor, "size") and self.processor.size is not None:
+             size = self.processor.size
+         elif (
+             hasattr(self.model.config, "image_size")
+             and self.model.config.image_size is not None
+         ):
+             size = {
+                 "height": self.model.config.image_size,
+                 "width": self.model.config.image_size,
+             }
+         else:
+             warnings.warn(
+                 f"using fallback resize size of {FALLBACK_RESIZE_SIZE} for model",
+                 stacklevel=1,
+             )
+             size = {
+                 "width": FALLBACK_RESIZE_SIZE,
+                 "height": FALLBACK_RESIZE_SIZE,
+             }
+         return size
+
      def _convert_resize(self) -> Tuple[Tuple[int, int], ImagePreprocessingStep]:
-         size = self.processor.size
+         size = self._get_size()
+         size_divisor: int | None = None
+         if hasattr(self.processor, "size_divisor"):
+             size_divisor = self.processor.size_divisor
+
          target_size = None
          preprocess_step = None
+
          if "height" in size and "width" in size:
-             target_size = [size["height"], size["width"]]
+             target_size = (size["height"], size["width"])
              preprocess_step = ResizeImage()
          elif (
              "shortest_edge" in size
@@ -410,12 +475,55 @@ class HuggingFaceImageProcessor:
                  if not min_size or edge_size < min_size:
                      min_size = edge_size

-             target_size = [min_size, min_size]
+             if min_size is None:
+                 raise ValueError(
+                     "could not determine target size for resize from model config"
+                 )
+             target_size = (min_size, min_size)
+             preprocess_step = LetterboxImage()
+         else:
+             raise ValueError(
+                 "could not determine target size for resize from model config"
+             )
+         if size_divisor:
+             target_size = (
+                 math.ceil(target_size[0] / size_divisor) * size_divisor,
+                 math.ceil(target_size[1] / size_divisor) * size_divisor,
+             )
+         return target_size, preprocess_step
+
+     def _convert_pad(self) -> Tuple[Tuple[int, int], ImagePreprocessingStep]:
+         # NOTE: There is a wide variety of ways that huggingface pads images.
+         # We found at least 3 different ways to pad images in the codebase:
+         # 1. Center pad (pad top,left, bottom, right) to match target size
+         # https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/models/dpt/image_processing_dpt.py#L231
+         # 2. Right/Top pad (pad top, and right) to match target size
+         # https://github.com/huggingface/transformers/blob/174890280b340b89c5bfa092f6b4fb0e2dc2d7fc/src/transformers/models/conditional_detr/image_processing_conditional_detr.py#L846
+         # 3. Pad to nearest multiple of size_divisor
+         # https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/models/llava_onevision/image_processing_llava_onevision.py#L177-179
+         #
+         # We decided to simply implement padding with LetterBoxImage(),
+         # because we assume the models won't be that sensitive to the type of padding,
+         # but this may need to be revisited in the future.
+         size = self._get_size()
+         size_divisor: int | None = None
+         if hasattr(self.processor, "size_divisor"):
+             size_divisor = self.processor.size_divisor
+
+         target_size = None
+         preprocess_step = None
+         if "height" in size and "width" in size:
+             target_size = (size["height"], size["width"])
              preprocess_step = LetterboxImage()
          else:
              raise ValueError(
                  "could not determine target size for resize from model config"
              )
+         if size_divisor:
+             target_size = (
+                 math.ceil(target_size[0] / size_divisor) * size_divisor,
+                 math.ceil(target_size[1] / size_divisor) * size_divisor,
+             )
          return target_size, preprocess_step


@@ -424,11 +532,13 @@ class HuggingFaceImageSegmentationPipeline:
          self,
          pipeline: Pipeline,
          name: Optional[str] = None,
+         image_size_override: Optional[Tuple[int, int]] = None,
      ):
          self.pipeline = pipeline
          if name is None:
              name = pipeline.model.name_or_path

+         self.image_size_override = image_size_override
          self.model = TorchModel(
              name,
              torch_module=self.pipeline.model,
@@ -446,7 +556,9 @@ class HuggingFaceImageSegmentationPipeline:
              )

          image_processor = HuggingFaceImageProcessor(
-             self.pipeline.model, self.pipeline.image_processor
+             self.pipeline.model,
+             self.pipeline.image_processor,
+             image_size_override=self.image_size_override,
          )
          spec = image_processor.input_spec()
          spec.update(image_processor.output_spec_segmentation())
@@ -471,11 +583,13 @@ class HuggingFaceObjectDetectionPipeline:
          self,
          pipeline: Pipeline,
          name: Optional[str] = None,
+         image_size_override: Optional[Tuple[int, int]] = None,
      ):
          self.pipeline = pipeline
          if name is None:
              name = pipeline.model.name_or_path

+         self.image_size_override = image_size_override
          self.model = TorchModel(
              name,
              torch_module=ObjectDetectionWrapper(self.pipeline.model),
@@ -493,7 +607,9 @@ class HuggingFaceObjectDetectionPipeline:
              )

          image_processor = HuggingFaceImageProcessor(
-             self.pipeline.model, self.pipeline.image_processor
+             self.pipeline.model,
+             self.pipeline.image_processor,
+             image_size_override=self.image_size_override,
          )
          spec = image_processor.input_spec()
          spec.update(image_processor.output_spec_object_detection())
@@ -503,6 +619,141 @@ class HuggingFaceObjectDetectionPipeline:
          return [self.model]


+ class ZeroShotObjectDetectionWrapper(torch.nn.Module):
+     def __init__(self, model: PreTrainedModel):
+         super().__init__()
+         self.model = model
+
+     def forward(self, image, tokens, attention_mask):
+         outputs = self.model(
+             input_ids=tokens, pixel_values=image, attention_mask=attention_mask
+         )
+         return torch.sigmoid(outputs.logits), outputs.pred_boxes
+
+
+ class HuggingFaceZeroShotObjectDetectionPipeline:
+     def __init__(
+         self,
+         pipeline: Pipeline,
+         name: Optional[str] = None,
+         tokenizer_name: Optional[str] = None,
+         image_size_override: Optional[Tuple[int, int]] = None,
+     ):
+         self.pipeline = pipeline
+         if name is None:
+             name = pipeline.model.name_or_path
+
+         self.tokenizer_model = HuggingFaceTokenizer(
+             self.pipeline.tokenizer, tokenizer_name
+         )
+
+         self.image_size_override = image_size_override
+         self.detection_model = TorchModel(
+             name,
+             torch_module=ZeroShotObjectDetectionWrapper(self.pipeline.model),
+             **self._guess_model_spec(),
+         )
+
+     def _add_zero_shot_inputs(self, spec: Dict):
+         example_inputs = spec["example_inputs"]
+         if "dynamic_shapes" not in spec:
+             spec["dynamic_shapes"] = [{} for _ in example_inputs]
+
+         max_length = self.pipeline.model.config.text_config.max_length
+         example_inputs.extend(
+             [
+                 torch.randint(200, [2, max_length]).to(torch.int32),
+                 torch.ones([2, max_length]).to(torch.int32),
+             ]
+         )
+
+         input_tensor_semantics = spec["input_tensor_semantics"]
+         input_tensor_semantics.extend(
+             [
+                 TensorSemantics(
+                     [
+                         BatchDimension(),
+                         TokensDimension(),
+                     ]
+                 ),
+                 TensorSemantics(
+                     [
+                         BatchDimension(),
+                         AttentionMaskDimension(),
+                     ]
+                 ),
+             ]
+         )
+
+         spec["dynamic_shapes"].extend(
+             [
+                 {0: "num_labels"},
+                 {0: "num_labels"},
+             ]
+         )
+
+     def _guess_model_spec(self) -> Dict:
+         if self.pipeline.image_processor is None:
+             raise ValueError(
+                 "Could not determine image preprocessing for pipeline with image_processor=None"
+             )
+
+         image_processor = HuggingFaceImageProcessor(
+             self.pipeline.model,
+             self.pipeline.image_processor,
+             image_size_override=self.image_size_override,
+         )
+         spec = image_processor.input_spec()
+         self._add_zero_shot_inputs(spec)
+         spec.update(image_processor.output_spec_object_detection(zero_shot=True))
+         return spec
+
+     def models(self) -> List[Model]:
+         return [self.detection_model, self.tokenizer_model]
+
+
+ class HuggingFaceDepthEstimationPipeline:
+     def __init__(
+         self,
+         pipeline: Pipeline,
+         name: Optional[str] = None,
+         image_size_override: Optional[Tuple[int, int]] = None,
+     ):
+         self.pipeline = pipeline
+         if name is None:
+             name = pipeline.model.name_or_path
+
+         self.image_size_override = image_size_override
+
+         self.model = TorchModel(
+             name,
+             torch_module=self.pipeline.model,
+             **self._guess_model_spec(),
+         )
+
+     def _guess_model_spec(self) -> Dict:
+         if self.pipeline.image_processor is None:
+             raise ValueError(
+                 "Could not determine image preprocessing for pipeline with image_processor=None"
+             )
+         if self.pipeline.tokenizer is not None:
+             raise NotImplementedError(
+                 "HuggingFaceDepthEstimationPipeline does not yet support token inputs"
+             )
+
+         image_processor = HuggingFaceImageProcessor(
+             self.pipeline.model,
+             self.pipeline.image_processor,
+             image_size_override=self.image_size_override,
+         )
+         spec = image_processor.input_spec()
+         spec.update(image_processor.output_spec_depth())
+         return spec
+
+     def models(self) -> List[Model]:
+         return [self.model]
+
+
  def import_huggingface_pipeline(pipeline: Pipeline, **kwargs) -> List[Model]:
      if pipeline.framework != "pt":
          raise ValueError(
@@ -517,8 +768,19 @@ def import_huggingface_pipeline(pipeline: Pipeline, **kwargs) -> List[Model]:
          return HuggingFaceImageSegmentationPipeline(pipeline, **kwargs).models()
      elif pipeline.task == "object-detection":
          return HuggingFaceObjectDetectionPipeline(pipeline, **kwargs).models()
+     elif pipeline.task == "zero-shot-object-detection":
+         return HuggingFaceZeroShotObjectDetectionPipeline(pipeline, **kwargs).models()
+     elif pipeline.task == "depth-estimation":
+         return HuggingFaceDepthEstimationPipeline(pipeline, **kwargs).models()
      raise ValueError(
          "unimplemented: hugging face pipeline task: {} (supported tasks: [{}])".format(
-             pipeline.task, ["text-generation", "image-segmentation", "object-detection"]
+             pipeline.task,
+             [
+                 "text-generation",
+                 "image-segmentation",
+                 "object-detection",
+                 "zero-shot-object-detection",
+                 "depth-estimation",
+             ],
          )
      )
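
Taken together, the hf.py changes above add "zero-shot-object-detection" and "depth-estimation" to the tasks accepted by import_huggingface_pipeline, and thread a new image_size_override keyword through the image pipelines down to HuggingFaceImageProcessor._get_size(). A rough usage sketch follows; it assumes a PyTorch transformers checkpoint, and the model name is a placeholder rather than anything named in this diff.

import transformers

from gml.hf import import_huggingface_pipeline

# Placeholder checkpoint; any "depth-estimation" pipeline model should take the same path.
pipe = transformers.pipeline("depth-estimation", model="LiheYoung/depth-anything-small-hf")

# image_size_override is forwarded via **kwargs to HuggingFaceDepthEstimationPipeline and
# takes precedence over the processor's size / the model config's image_size in _get_size().
# If the processor defines a size_divisor, the target is rounded up to a multiple of it,
# e.g. 518 -> 544 when size_divisor is 32.
models = import_huggingface_pipeline(pipe, image_size_override=(518, 518))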
gml/model.py CHANGED
@@ -21,8 +21,9 @@ import io
  from pathlib import Path
  from typing import BinaryIO, Dict, List, Literal, Optional, Sequence, TextIO, Tuple

- import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
  import torch
+
+ import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
  from gml.asset_manager import AssetManager, TempFileAssetManager
  from gml.compile import to_torch_mlir
  from gml.preprocessing import ImagePreprocessingStep
gml/pipelines.py CHANGED
@@ -15,7 +15,7 @@
  # SPDX-License-Identifier: Apache-2.0

  import abc
- from typing import List
+ from typing import List, Optional

  import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
  from gml.model import Model
@@ -41,12 +41,44 @@ class SingleModelPipeline(Pipeline):


  class SimpleDetectionPipeline(SingleModelPipeline):
-     def __init__(self, add_tracking_id: bool = False):
-         self.add_tracking_id = add_tracking_id
+     def __init__(
+         self,
+         track_objects: Optional[bool] = None,
+         add_tracking_id: Optional[bool] = None,
+     ):
+         self.track_objects = False
+         if add_tracking_id is not None:
+             import warnings
+
+             warnings.warn(
+                 "The 'add_tracking_id' parameter is deprecated and will be removed in a future version.",
+                 DeprecationWarning,
+                 stacklevel=2,
+             )
+             self.track_objects = add_tracking_id
+
+         if track_objects is not None:
+             self.track_objects = track_objects
+
+         if track_objects is not None and add_tracking_id is not None:
+             raise ValueError(
+                 "'track_objects' and 'add_tracking_id' cannot be set simultaneously."
+             )

      def _to_yaml(self, model_name: str, org_name: str):
-         add_tracking_id = "true" if self.add_tracking_id else "false"
          # editorconfig-checker-disable
+         video_stream_detections = ".detect.detections"
+         track_node = ""
+         if self.track_objects:
+             track_node = """
+ - name: track
+   kind: Track
+   inputs:
+     detections: .detect.detections
+   outputs:
+   - tracked_detections
+ """
+             video_stream_detections = ".track.tracked_detections"
          return f"""---
  nodes:
  - name: camera_source
@@ -56,7 +88,6 @@ nodes:
  - name: detect
    kind: Detect
    attributes:
-     add_tracking_id: {add_tracking_id}
      model:
        model:
          name: {model_name}
@@ -66,13 +97,14 @@ nodes:
      frame: .camera_source.frame
    outputs:
    - detections
+ {track_node}
  - name: video_stream_sink
    kind: VideoStreamSink
    attributes:
      frame_rate_limit: 30
    inputs:
      frame: .camera_source.frame
-     detections: .detect.detections
+     detections: {video_stream_detections}
  """


@@ -110,6 +142,36 @@ nodes:
  """


+ class SimpleDepthEstimationPipeline(SingleModelPipeline):
+     def _to_yaml(self, model_name: str, org_name: str):
+         # editorconfig-checker-disable
+         return f"""---
+ nodes:
+ - name: camera_source
+   kind: CameraSource
+   outputs:
+   - frame
+ - name: estimate_depth
+   kind: EstimateDepth
+   attributes:
+     model:
+       model:
+         name: {model_name}
+         org: {org_name}
+     frame_rate_limit: 30
+   inputs:
+     frame: .camera_source.frame
+   outputs:
+   - depth
+ - name: video_stream_sink
+   kind: VideoStreamSink
+   attributes:
+     frame_rate_limit: 30
+   inputs:
+     frame: .estimate_depth.depth
+ """
+
+
  class LiveChatPipeline(Pipeline):
      def to_yaml(self, models: List[Model], org_name: str) -> str:
          if len(models) != 2:
@@ -173,4 +235,61 @@ nodes:
  """


+ class ZeroShotObjectDetectionPipeline(Pipeline):
+     def __init__(self, conf_threshold=0.1):
+         self.conf_threshold = conf_threshold
+
+     def to_yaml(self, models: List[Model], org_name: str) -> str:
+         if len(models) != 2:
+             raise ValueError(
+                 "ZeroShotObjectDetectionPipeline expects two models (a detection model and a tokenizer)"
+             )
+         tokenizer = None
+         detect = None
+         for m in models:
+             if m.storage_format == modelexecpb.ModelInfo.MODEL_STORAGE_FORMAT_OPAQUE:
+                 tokenizer = m
+             else:
+                 detect = m
+         if tokenizer is None or detect is None:
+             raise ValueError(
+                 "ZeroShotObjectDetectionPipeline expects both a tokenizer model and a detection model)"
+             )
+         return f"""---
+ nodes:
+ - name: camera_source
+   kind: CameraSource
+   outputs:
+   - frame
+ - name: text_source
+   kind: TextStreamSource
+   outputs:
+   - prompt
+ - name: detect
+   kind: Detect
+   attributes:
+     model:
+       model:
+         name: {detect.name}
+         org: {org_name}
+     tokenizer:
+       model:
+         name: {tokenizer.name}
+         org: {org_name}
+     conf_threshold: {self.conf_threshold}
+   inputs:
+     frame: .camera_source.frame
+     prompt: .text_source.prompt
+   outputs:
+   - detections
+ - name: video_stream_sink
+   kind: VideoStreamSink
+   attributes:
+     frame_rate_limit: 30
+   inputs:
+     frame: .camera_source.frame
+     detections: .detect.detections
+ """
+
+
  # editorconfig-checker-enable
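
On the pipelines.py side, SimpleDetectionPipeline's add_tracking_id flag is superseded by track_objects (the old name still works but emits a DeprecationWarning, and setting both raises a ValueError), and the new ZeroShotObjectDetectionPipeline builds its YAML spec from exactly two models: a detector plus a tokenizer stored in the OPAQUE format. A brief sketch of the new call patterns; the models list and org name below are placeholders.

from gml.pipelines import SimpleDetectionPipeline, ZeroShotObjectDetectionPipeline

# New-style flag; SimpleDetectionPipeline(add_tracking_id=True) still works but warns.
detection = SimpleDetectionPipeline(track_objects=True)

# 'models' stands in for the List[Model] produced by importing a
# zero-shot-object-detection pipeline (detection model + tokenizer model).
zero_shot = ZeroShotObjectDetectionPipeline(conf_threshold=0.2)
yaml_spec = zero_shot.to_yaml(models, "my-org")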
gml/preprocessing.py CHANGED
@@ -17,9 +17,10 @@
  import abc
  from typing import List

- import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
  import google.protobuf.wrappers_pb2 as wrapperspb

+ import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
+

  class ImagePreprocessingStep(abc.ABC):
      @abc.abstractmethod