gimlet-api 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gml/hf.py CHANGED
@@ -15,14 +15,24 @@
  # SPDX-License-Identifier: Apache-2.0

  import glob
+ import math
  import tempfile
+ import warnings
  from collections.abc import Iterable
  from pathlib import Path
  from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Tuple

- import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
  import torch
  import transformers
+ from transformers import (
+     BaseImageProcessor,
+     Pipeline,
+     PreTrainedModel,
+     PreTrainedTokenizer,
+ )
+
+ import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
+ from gml.asset_manager import AssetManager
  from gml.model import GenerationConfig, Model, TorchModel
  from gml.preprocessing import (
      ImagePreprocessingStep,
@@ -33,10 +43,12 @@ from gml.preprocessing import (
  )
  from gml.tensor import (
      AttentionKeyValueCacheTensorSemantics,
+     AttentionMaskDimension,
      BatchDimension,
      BoundingBoxFormat,
      DetectionNumCandidatesDimension,
      DetectionOutputDimension,
+     DimensionSemantics,
      ImageChannelDimension,
      ImageHeightDimension,
      ImageWidthDimension,
@@ -45,12 +57,8 @@ from gml.tensor import (
      TokensDimension,
      VocabLogitsDimension,
  )
- from transformers import (
-     BaseImageProcessor,
-     Pipeline,
-     PreTrainedModel,
-     PreTrainedTokenizer,
- )
+
+ FALLBACK_RESIZE_SIZE = 512


  class HuggingFaceTokenizer(Model):
@@ -66,7 +74,9 @@ class HuggingFaceTokenizer(Model):
          )
          self.tokenizer = tokenizer

-     def _collect_assets(self) -> Dict[str, TextIO | BinaryIO | Path]:
+     def _collect_assets(
+         self, weight_manager: Optional[AssetManager] = None
+     ) -> Dict[str, TextIO | BinaryIO | Path]:
          with tempfile.TemporaryDirectory() as tmpdir:
              self.tokenizer.save_pretrained(tmpdir)
              paths = [Path(f) for f in glob.glob(tmpdir + "/*")]
@@ -74,7 +84,6 @@ class HuggingFaceTokenizer(Model):


  class HuggingFaceGenerationConfig(GenerationConfig):
-
      def __init__(self, model: PreTrainedModel):
          config = model.generation_config
          eos_tokens = config.eos_token_id
@@ -239,25 +248,34 @@ class HuggingFaceTextGenerationPipeline:


  class HuggingFaceImageProcessor:
-
      def __init__(
          self,
          model: PreTrainedModel,
          processor: BaseImageProcessor,
+         image_size_override: Optional[Tuple[int, int]] = None,
      ):
          self.model = model
          self.processor = processor
+         self.image_size_override = image_size_override

      def input_spec(self) -> Dict[str, Any]:
          target_size = None
          image_preprocessing_steps = []
-         if (
-             hasattr(self.processor, "do_resize")
-             and self.processor.do_resize
-             and hasattr(self.processor, "size")
-         ):
+         has_do_resize = (
+             hasattr(self.processor, "do_resize") and self.processor.do_resize
+         )
+         has_do_pad = hasattr(self.processor, "do_pad") and self.processor.do_pad
+         # NOTE: it is possible for both do_resize and do_pad to be set, in which case we only use do_resize.
+         if has_do_resize:
              target_size, preprocessing_step = self._convert_resize()
              image_preprocessing_steps.append(preprocessing_step)
+         elif has_do_pad:
+             target_size, preprocessing_step = self._convert_pad()
+             image_preprocessing_steps.append(preprocessing_step)
+         else:
+             raise ValueError(
+                 "could not determine target size for resize from model config"
+             )

          if (
              hasattr(self.processor, "do_rescale")
@@ -288,7 +306,7 @@ class HuggingFaceImageProcessor:
          # TODO(james): figure out if this is specified anywhere in the huggingface pipeline.
          channel_format = "rgb"

-         dimensions = [
+         dimensions: list[DimensionSemantics] = [
              BatchDimension(),
          ]
          input_shape = [1]
@@ -339,21 +357,38 @@ class HuggingFaceImageProcessor:
              "class_labels": labels,
          }

-     def output_spec_object_detection(self) -> Dict[str, Any]:
+     def output_spec_depth(self) -> Dict[str, Any]:
+         dimensions = [
+             BatchDimension(),
+             ImageHeightDimension(),
+             ImageWidthDimension(),
+         ]
+         output_tensor_semantics = [
+             TensorSemantics(dimensions),
+         ]
+         return {
+             "output_tensor_semantics": output_tensor_semantics,
+         }
+
+     def output_spec_object_detection(self, zero_shot=False) -> Dict[str, Any]:
          if not hasattr(self.processor, "post_process_object_detection"):
              raise NotImplementedError(
-                 "only semantic segmentation is currently supported"
+                 "processor must have post_process_object_detection set"
              )

-         id_to_label = self.model.config.id2label
-         max_id = max(id_to_label)
-         labels = []
-         for i in range(max_id):
-             if i not in id_to_label:
-                 labels.append("")
-                 continue
-             labels.append(id_to_label[i])
-         num_classes = max_id + 1
+         if zero_shot:
+             num_classes = -1
+             labels = []
+         else:
+             id_to_label = self.model.config.id2label
+             max_id = max(id_to_label)
+             labels = []
+             for i in range(max_id):
+                 if i not in id_to_label:
+                     labels.append("")
+                     continue
+                 labels.append(id_to_label[i])
+             num_classes = max_id + 1

          # TODO(james): verify assumptions made here apply broadly.
          output_tensor_semantics = []
@@ -363,7 +398,7 @@ class HuggingFaceImageProcessor:
              DetectionNumCandidatesDimension(is_nms=False),
              DetectionOutputDimension(
                  scores_range=(0, num_classes),
-                 scores_are_logits=True,
+                 scores_are_logits=not zero_shot,
              ),
          ]
          output_tensor_semantics.append(TensorSemantics(logits_dimensions))
@@ -382,12 +417,45 @@ class HuggingFaceImageProcessor:
              "class_labels": labels,
          }

+     def _get_size(self) -> Dict[str, int]:
+         size = None
+         if self.image_size_override:
+             size = {
+                 "height": self.image_size_override[0],
+                 "width": self.image_size_override[1],
+             }
+         elif hasattr(self.processor, "size") and self.processor.size is not None:
+             size = self.processor.size
+         elif (
+             hasattr(self.model.config, "image_size")
+             and self.model.config.image_size is not None
+         ):
+             size = {
+                 "height": self.model.config.image_size,
+                 "width": self.model.config.image_size,
+             }
+         else:
+             warnings.warn(
+                 f"using fallback resize size of {FALLBACK_RESIZE_SIZE} for model",
+                 stacklevel=1,
+             )
+             size = {
+                 "width": FALLBACK_RESIZE_SIZE,
+                 "height": FALLBACK_RESIZE_SIZE,
+             }
+         return size
+
      def _convert_resize(self) -> Tuple[Tuple[int, int], ImagePreprocessingStep]:
-         size = self.processor.size
+         size = self._get_size()
+         size_divisor: int | None = None
+         if hasattr(self.processor, "size_divisor"):
+             size_divisor = self.processor.size_divisor
+
          target_size = None
          preprocess_step = None
+
          if "height" in size and "width" in size:
-             target_size = [size["height"], size["width"]]
+             target_size = (size["height"], size["width"])
              preprocess_step = ResizeImage()
          elif (
              "shortest_edge" in size
@@ -407,12 +475,55 @@ class HuggingFaceImageProcessor:
                  if not min_size or edge_size < min_size:
                      min_size = edge_size

-             target_size = [min_size, min_size]
+             if min_size is None:
+                 raise ValueError(
+                     "could not determine target size for resize from model config"
+                 )
+             target_size = (min_size, min_size)
+             preprocess_step = LetterboxImage()
+         else:
+             raise ValueError(
+                 "could not determine target size for resize from model config"
+             )
+         if size_divisor:
+             target_size = (
+                 math.ceil(target_size[0] / size_divisor) * size_divisor,
+                 math.ceil(target_size[1] / size_divisor) * size_divisor,
+             )
+         return target_size, preprocess_step
+
+     def _convert_pad(self) -> Tuple[Tuple[int, int], ImagePreprocessingStep]:
+         # NOTE: There is a wide variety of ways that huggingface pads images.
+         # We found at least 3 different ways to pad images in the codebase:
+         # 1. Center pad (pad top,left, bottom, right) to match target size
+         # https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/models/dpt/image_processing_dpt.py#L231
+         # 2. Right/Top pad (pad top, and right) to match target size
+         # https://github.com/huggingface/transformers/blob/174890280b340b89c5bfa092f6b4fb0e2dc2d7fc/src/transformers/models/conditional_detr/image_processing_conditional_detr.py#L846
+         # 3. Pad to nearest multiple of size_divisor
+         # https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/models/llava_onevision/image_processing_llava_onevision.py#L177-179
+         #
+         # We decided to simply implement padding with LetterBoxImage(),
+         # because we assume the models won't be that sensitive to the type of padding,
+         # but this may need to be revisited in the future.
+         size = self._get_size()
+         size_divisor: int | None = None
+         if hasattr(self.processor, "size_divisor"):
+             size_divisor = self.processor.size_divisor
+
+         target_size = None
+         preprocess_step = None
+         if "height" in size and "width" in size:
+             target_size = (size["height"], size["width"])
              preprocess_step = LetterboxImage()
          else:
              raise ValueError(
                  "could not determine target size for resize from model config"
              )
+         if size_divisor:
+             target_size = (
+                 math.ceil(target_size[0] / size_divisor) * size_divisor,
+                 math.ceil(target_size[1] / size_divisor) * size_divisor,
+             )
          return target_size, preprocess_step


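For reference, the `size_divisor` handling added above rounds each target dimension up to the nearest multiple of the divisor. A minimal standalone sketch of that arithmetic (the example numbers are illustrative, not taken from any particular model config):

```python
import math

def round_up_to_multiple(value: int, divisor: int) -> int:
    # Same rounding the diff applies to each target dimension.
    return math.ceil(value / divisor) * divisor

# e.g. a 518x784 target with size_divisor=32 becomes 544x800.
assert round_up_to_multiple(518, 32) == 544
assert round_up_to_multiple(784, 32) == 800
```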
@@ -421,11 +532,13 @@ class HuggingFaceImageSegmentationPipeline:
          self,
          pipeline: Pipeline,
          name: Optional[str] = None,
+         image_size_override: Optional[Tuple[int, int]] = None,
      ):
          self.pipeline = pipeline
          if name is None:
              name = pipeline.model.name_or_path

+         self.image_size_override = image_size_override
          self.model = TorchModel(
              name,
              torch_module=self.pipeline.model,
@@ -443,7 +556,9 @@ class HuggingFaceImageSegmentationPipeline:
          )

          image_processor = HuggingFaceImageProcessor(
-             self.pipeline.model, self.pipeline.image_processor
+             self.pipeline.model,
+             self.pipeline.image_processor,
+             image_size_override=self.image_size_override,
          )
          spec = image_processor.input_spec()
          spec.update(image_processor.output_spec_segmentation())
@@ -468,11 +583,13 @@ class HuggingFaceObjectDetectionPipeline:
          self,
          pipeline: Pipeline,
          name: Optional[str] = None,
+         image_size_override: Optional[Tuple[int, int]] = None,
      ):
          self.pipeline = pipeline
          if name is None:
              name = pipeline.model.name_or_path

+         self.image_size_override = image_size_override
          self.model = TorchModel(
              name,
              torch_module=ObjectDetectionWrapper(self.pipeline.model),
@@ -490,7 +607,9 @@ class HuggingFaceObjectDetectionPipeline:
          )

          image_processor = HuggingFaceImageProcessor(
-             self.pipeline.model, self.pipeline.image_processor
+             self.pipeline.model,
+             self.pipeline.image_processor,
+             image_size_override=self.image_size_override,
          )
          spec = image_processor.input_spec()
          spec.update(image_processor.output_spec_object_detection())
@@ -500,6 +619,141 @@ class HuggingFaceObjectDetectionPipeline:
          return [self.model]


+ class ZeroShotObjectDetectionWrapper(torch.nn.Module):
+     def __init__(self, model: PreTrainedModel):
+         super().__init__()
+         self.model = model
+
+     def forward(self, image, tokens, attention_mask):
+         outputs = self.model(
+             input_ids=tokens, pixel_values=image, attention_mask=attention_mask
+         )
+         return torch.sigmoid(outputs.logits), outputs.pred_boxes
+
+
+ class HuggingFaceZeroShotObjectDetectionPipeline:
+     def __init__(
+         self,
+         pipeline: Pipeline,
+         name: Optional[str] = None,
+         tokenizer_name: Optional[str] = None,
+         image_size_override: Optional[Tuple[int, int]] = None,
+     ):
+         self.pipeline = pipeline
+         if name is None:
+             name = pipeline.model.name_or_path
+
+         self.tokenizer_model = HuggingFaceTokenizer(
+             self.pipeline.tokenizer, tokenizer_name
+         )
+
+         self.image_size_override = image_size_override
+         self.detection_model = TorchModel(
+             name,
+             torch_module=ZeroShotObjectDetectionWrapper(self.pipeline.model),
+             **self._guess_model_spec(),
+         )
+
+     def _add_zero_shot_inputs(self, spec: Dict):
+         example_inputs = spec["example_inputs"]
+         if "dynamic_shapes" not in spec:
+             spec["dynamic_shapes"] = [{} for _ in example_inputs]
+
+         max_length = self.pipeline.model.config.text_config.max_length
+         example_inputs.extend(
+             [
+                 torch.randint(200, [2, max_length]).to(torch.int32),
+                 torch.ones([2, max_length]).to(torch.int32),
+             ]
+         )
+
+         input_tensor_semantics = spec["input_tensor_semantics"]
+         input_tensor_semantics.extend(
+             [
+                 TensorSemantics(
+                     [
+                         BatchDimension(),
+                         TokensDimension(),
+                     ]
+                 ),
+                 TensorSemantics(
+                     [
+                         BatchDimension(),
+                         AttentionMaskDimension(),
+                     ]
+                 ),
+             ]
+         )
+
+         spec["dynamic_shapes"].extend(
+             [
+                 {0: "num_labels"},
+                 {0: "num_labels"},
+             ]
+         )
+
+     def _guess_model_spec(self) -> Dict:
+         if self.pipeline.image_processor is None:
+             raise ValueError(
+                 "Could not determine image preprocessing for pipeline with image_processor=None"
+             )
+
+         image_processor = HuggingFaceImageProcessor(
+             self.pipeline.model,
+             self.pipeline.image_processor,
+             image_size_override=self.image_size_override,
+         )
+         spec = image_processor.input_spec()
+         self._add_zero_shot_inputs(spec)
+         spec.update(image_processor.output_spec_object_detection(zero_shot=True))
+         return spec
+
+     def models(self) -> List[Model]:
+         return [self.detection_model, self.tokenizer_model]
+
+
+ class HuggingFaceDepthEstimationPipeline:
+     def __init__(
+         self,
+         pipeline: Pipeline,
+         name: Optional[str] = None,
+         image_size_override: Optional[Tuple[int, int]] = None,
+     ):
+         self.pipeline = pipeline
+         if name is None:
+             name = pipeline.model.name_or_path
+
+         self.image_size_override = image_size_override
+
+         self.model = TorchModel(
+             name,
+             torch_module=self.pipeline.model,
+             **self._guess_model_spec(),
+         )
+
+     def _guess_model_spec(self) -> Dict:
+         if self.pipeline.image_processor is None:
+             raise ValueError(
+                 "Could not determine image preprocessing for pipeline with image_processor=None"
+             )
+         if self.pipeline.tokenizer is not None:
+             raise NotImplementedError(
+                 "HuggingFaceDepthEstimationPipeline does not yet support token inputs"
+             )
+
+         image_processor = HuggingFaceImageProcessor(
+             self.pipeline.model,
+             self.pipeline.image_processor,
+             image_size_override=self.image_size_override,
+         )
+         spec = image_processor.input_spec()
+         spec.update(image_processor.output_spec_depth())
+         return spec
+
+     def models(self) -> List[Model]:
+         return [self.model]
+
+
  def import_huggingface_pipeline(pipeline: Pipeline, **kwargs) -> List[Model]:
      if pipeline.framework != "pt":
          raise ValueError(
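A hedged usage sketch for the new zero-shot class added above (the checkpoint name is an illustrative assumption, not something this diff pins down):

```python
import transformers

from gml.hf import HuggingFaceZeroShotObjectDetectionPipeline

# Assumed zero-shot detection checkpoint, for illustration only.
pipe = transformers.pipeline(
    "zero-shot-object-detection", model="google/owlvit-base-patch32"
)

# Wraps the detector in ZeroShotObjectDetectionWrapper (sigmoid scores + boxes)
# and exports both the detection model and its tokenizer.
wrapper = HuggingFaceZeroShotObjectDetectionPipeline(pipe)
detection_model, tokenizer_model = wrapper.models()
```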
@@ -514,8 +768,19 @@ def import_huggingface_pipeline(pipeline: Pipeline, **kwargs) -> List[Model]:
          return HuggingFaceImageSegmentationPipeline(pipeline, **kwargs).models()
      elif pipeline.task == "image-segmentation":
      elif pipeline.task == "object-detection":
          return HuggingFaceObjectDetectionPipeline(pipeline, **kwargs).models()
+     elif pipeline.task == "zero-shot-object-detection":
+         return HuggingFaceZeroShotObjectDetectionPipeline(pipeline, **kwargs).models()
+     elif pipeline.task == "depth-estimation":
+         return HuggingFaceDepthEstimationPipeline(pipeline, **kwargs).models()
      raise ValueError(
          "unimplemented: hugging face pipeline task: {} (supported tasks: [{}])".format(
-             pipeline.task, ["text-generation", "image-segmentation", "object-detection"]
+             pipeline.task,
+             [
+                 "text-generation",
+                 "image-segmentation",
+                 "object-detection",
+                 "zero-shot-object-detection",
+                 "depth-estimation",
+             ],
          )
      )
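Both new tasks are reachable directly through `import_huggingface_pipeline`, and extra kwargs are forwarded to the per-task wrappers. A hedged sketch (the checkpoint name and override values are illustrative assumptions):

```python
import transformers

from gml.hf import import_huggingface_pipeline

# Assumed depth-estimation checkpoint, for illustration only.
pipe = transformers.pipeline("depth-estimation", model="Intel/dpt-large")

# kwargs are forwarded to HuggingFaceDepthEstimationPipeline, so the new
# image_size_override knob can be set here as well.
models = import_huggingface_pipeline(pipe, image_size_override=(384, 384))
```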
gml/model.py CHANGED
@@ -21,9 +21,11 @@ import io
  from pathlib import Path
  from typing import BinaryIO, Dict, List, Literal, Optional, Sequence, TextIO, Tuple

- import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
  import torch
- from gml.compile import to_torch_mlir, torch_mlir_output_kind
+
+ import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
+ from gml.asset_manager import AssetManager, TempFileAssetManager
+ from gml.compile import to_torch_mlir
  from gml.preprocessing import ImagePreprocessingStep
  from gml.tensor import TensorSemantics

@@ -90,12 +92,14 @@ class Model(abc.ABC):
          )

      @abc.abstractmethod
-     def _collect_assets(self) -> Dict[str, TextIO | BinaryIO | Path]:
+     def _collect_assets(
+         self, weight_manager: Optional[AssetManager] = None
+     ) -> Dict[str, TextIO | BinaryIO | Path]:
          pass

      @contextlib.contextmanager
-     def collect_assets(self):
-         yield from self._collect_assets()
+     def collect_assets(self, weight_manager: Optional[AssetManager] = None):
+         yield from self._collect_assets(weight_manager)


  class TorchModel(Model):
@@ -111,7 +115,7 @@ class TorchModel(Model):
      ):
          super().__init__(
              name,
-             torch_mlir_output_kind(),
+             modelexecpb.ModelInfo.MODEL_KIND_TORCH,
              modelexecpb.ModelInfo.MODEL_STORAGE_FORMAT_MLIR_TEXT,
              **kwargs,
          )
@@ -130,17 +134,27 @@ class TorchModel(Model):
              for shape, dtype in zip(self.input_shapes, self.input_dtypes)
          ]

-     def _convert_to_torch_mlir(self):
+     def _convert_to_torch_mlir(self, weight_manager: Optional[AssetManager] = None):
          return to_torch_mlir(
              self.torch_module,
              self.example_inputs,
              self.dynamic_shapes,
+             weight_manager=weight_manager,
          )

-     def _collect_assets(self) -> Dict[str, TextIO | BinaryIO | Path]:
-         compiled = self._convert_to_torch_mlir()
-         file = io.BytesIO(str(compiled).encode("utf-8"))
-         yield {"": file}
+     def _collect_assets(
+         self, weight_manager: Optional[AssetManager] = None
+     ) -> Dict[str, TextIO | BinaryIO | Path]:
+         if weight_manager is None:
+             # If the user does not provide a weight manager, use temp files.
+             weight_manager = TempFileAssetManager()
+
+         with weight_manager as weight_mgr:
+             compiled = self._convert_to_torch_mlir(weight_mgr)
+             file = io.BytesIO(str(compiled).encode("utf-8"))
+             assets = {"": file}
+             assets.update(weight_mgr.assets())
+             yield assets


  def _kind_str_to_kind_format_protos(
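On the caller side, `collect_assets` now accepts an optional `AssetManager`; when none is given, `TorchModel` falls back to `TempFileAssetManager` as shown above. A minimal sketch, assuming `model` is an already-constructed `TorchModel`:

```python
from gml.asset_manager import TempFileAssetManager

# Default path: no manager supplied, so TorchModel creates a TempFileAssetManager
# internally; the MLIR text plus any exported weight assets come back together.
with model.collect_assets() as assets:
    for name, file_or_path in assets.items():
        print(name, type(file_or_path))

# Equivalent, but with the asset manager supplied explicitly by the caller.
with model.collect_assets(weight_manager=TempFileAssetManager()) as assets:
    ...
```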
@@ -178,5 +192,7 @@ class ModelFromFiles(Model):
          super().__init__(name=name, kind=kind, storage_format=storage_format, **kwargs)
          self.files = files

-     def _collect_assets(self) -> Dict[str, TextIO | BinaryIO | Path]:
+     def _collect_assets(
+         self, weight_manager: Optional[AssetManager] = None
+     ) -> Dict[str, TextIO | BinaryIO | Path]:
          yield self.files