gimlet-api 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gml/hf.py CHANGED
@@ -15,14 +15,23 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import glob
+import math
 import tempfile
+import warnings
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Tuple
 
-import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 import torch
 import transformers
+from transformers import (
+    BaseImageProcessor,
+    Pipeline,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+)
+
+import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 from gml.asset_manager import AssetManager
 from gml.model import GenerationConfig, Model, TorchModel
 from gml.preprocessing import (
@@ -34,10 +43,12 @@ from gml.preprocessing import (
 )
 from gml.tensor import (
     AttentionKeyValueCacheTensorSemantics,
+    AttentionMaskDimension,
     BatchDimension,
     BoundingBoxFormat,
     DetectionNumCandidatesDimension,
     DetectionOutputDimension,
+    DimensionSemantics,
     ImageChannelDimension,
     ImageHeightDimension,
     ImageWidthDimension,
@@ -46,12 +57,8 @@ from gml.tensor import (
     TokensDimension,
     VocabLogitsDimension,
 )
-from transformers import (
-    BaseImageProcessor,
-    Pipeline,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-)
+
+FALLBACK_RESIZE_SIZE = 512
 
 
 class HuggingFaceTokenizer(Model):
@@ -77,7 +84,6 @@ class HuggingFaceTokenizer(Model):
 
 
 class HuggingFaceGenerationConfig(GenerationConfig):
-
     def __init__(self, model: PreTrainedModel):
         config = model.generation_config
         eos_tokens = config.eos_token_id
@@ -242,25 +248,34 @@ class HuggingFaceTextGenerationPipeline:
 
 
 class HuggingFaceImageProcessor:
-
     def __init__(
         self,
         model: PreTrainedModel,
         processor: BaseImageProcessor,
+        image_size_override: Optional[Tuple[int, int]] = None,
     ):
         self.model = model
         self.processor = processor
+        self.image_size_override = image_size_override
 
     def input_spec(self) -> Dict[str, Any]:
         target_size = None
         image_preprocessing_steps = []
-        if (
-            hasattr(self.processor, "do_resize")
-            and self.processor.do_resize
-            and hasattr(self.processor, "size")
-        ):
+        has_do_resize = (
+            hasattr(self.processor, "do_resize") and self.processor.do_resize
+        )
+        has_do_pad = hasattr(self.processor, "do_pad") and self.processor.do_pad
+        # NOTE: it is possible for both do_resize and do_pad to be set, in which case we only use do_resize.
+        if has_do_resize:
             target_size, preprocessing_step = self._convert_resize()
             image_preprocessing_steps.append(preprocessing_step)
+        elif has_do_pad:
+            target_size, preprocessing_step = self._convert_pad()
+            image_preprocessing_steps.append(preprocessing_step)
+        else:
+            raise ValueError(
+                "could not determine target size for resize from model config"
+            )
 
         if (
             hasattr(self.processor, "do_rescale")
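
To make the new branching concrete: do_resize takes precedence whenever both flags are set, and the pad path is only taken when do_resize is absent or false. A minimal sketch of the decision, assuming a hypothetical stand-in processor (not package code):

class StubProcessor:
    do_resize = False
    do_pad = True

proc = StubProcessor()
has_do_resize = hasattr(proc, "do_resize") and proc.do_resize
has_do_pad = hasattr(proc, "do_pad") and proc.do_pad
if has_do_resize:  # resize wins when both flags are set
    chosen = "_convert_resize"
elif has_do_pad:
    chosen = "_convert_pad"
else:
    raise ValueError("could not determine target size for resize from model config")
assert chosen == "_convert_pad"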
@@ -291,7 +306,7 @@ class HuggingFaceImageProcessor:
         # TODO(james): figure out if this is specified anywhere in the huggingface pipeline.
         channel_format = "rgb"
 
-        dimensions = [
+        dimensions: list[DimensionSemantics] = [
             BatchDimension(),
         ]
         input_shape = [1]
@@ -319,6 +334,14 @@ class HuggingFaceImageProcessor:
             raise NotImplementedError(
                 "only semantic segmentation is currently supported"
             )
+        # TODO(philkuz): Support panoptic segmentation models. Panoptic segmentation models produce multiple
+        # outputs, so we need to decide whether to convert the panoptic output into semantic segmentation
+        # format or to support panoptic segmentation output directly.
+        if hasattr(self.processor, "post_process_panoptic_segmentation"):
+            raise NotImplementedError(
+                "panoptic segmentation models are not supported yet"
+            )
+
         dimensions = [
             BatchDimension(),
             # TODO(james): verify all semantic segmentation in hugging face output a logits mask.
@@ -342,21 +365,38 @@ class HuggingFaceImageProcessor:
             "class_labels": labels,
         }
 
-    def output_spec_object_detection(self) -> Dict[str, Any]:
+    def output_spec_depth(self) -> Dict[str, Any]:
+        dimensions = [
+            BatchDimension(),
+            ImageHeightDimension(),
+            ImageWidthDimension(),
+        ]
+        output_tensor_semantics = [
+            TensorSemantics(dimensions),
+        ]
+        return {
+            "output_tensor_semantics": output_tensor_semantics,
+        }
+
+    def output_spec_object_detection(self, zero_shot=False) -> Dict[str, Any]:
         if not hasattr(self.processor, "post_process_object_detection"):
             raise NotImplementedError(
                 "processor must have post_process_object_detection set"
             )
 
-        id_to_label = self.model.config.id2label
-        max_id = max(id_to_label)
-        labels = []
-        for i in range(max_id):
-            if i not in id_to_label:
-                labels.append("")
-                continue
-            labels.append(id_to_label[i])
-        num_classes = max_id + 1
+        if zero_shot:
+            num_classes = -1
+            labels = []
+        else:
+            id_to_label = self.model.config.id2label
+            max_id = max(id_to_label)
+            labels = []
+            for i in range(max_id):
+                if i not in id_to_label:
+                    labels.append("")
+                    continue
+                labels.append(id_to_label[i])
+            num_classes = max_id + 1
 
         # TODO(james): verify assumptions made here apply broadly.
         output_tensor_semantics = []
@@ -366,7 +406,7 @@ class HuggingFaceImageProcessor:
             DetectionNumCandidatesDimension(is_nms=False),
             DetectionOutputDimension(
                 scores_range=(0, num_classes),
-                scores_are_logits=True,
+                scores_are_logits=not zero_shot,
             ),
         ]
         output_tensor_semantics.append(TensorSemantics(logits_dimensions))
@@ -385,12 +425,45 @@ class HuggingFaceImageProcessor:
             "class_labels": labels,
         }
 
+    def _get_size(self) -> Dict[str, int]:
+        size = None
+        if self.image_size_override:
+            size = {
+                "height": self.image_size_override[0],
+                "width": self.image_size_override[1],
+            }
+        elif hasattr(self.processor, "size") and self.processor.size is not None:
+            size = self.processor.size
+        elif (
+            hasattr(self.model.config, "image_size")
+            and self.model.config.image_size is not None
+        ):
+            size = {
+                "height": self.model.config.image_size,
+                "width": self.model.config.image_size,
+            }
+        else:
+            warnings.warn(
+                f"using fallback resize size of {FALLBACK_RESIZE_SIZE} for model",
+                stacklevel=1,
+            )
+            size = {
+                "width": FALLBACK_RESIZE_SIZE,
+                "height": FALLBACK_RESIZE_SIZE,
+            }
+        return size
+
     def _convert_resize(self) -> Tuple[Tuple[int, int], ImagePreprocessingStep]:
-        size = self.processor.size
+        size = self._get_size()
+        size_divisor: int | None = None
+        if hasattr(self.processor, "size_divisor"):
+            size_divisor = self.processor.size_divisor
+
         target_size = None
         preprocess_step = None
+
         if "height" in size and "width" in size:
-            target_size = [size["height"], size["width"]]
+            target_size = (size["height"], size["width"])
             preprocess_step = ResizeImage()
         elif (
             "shortest_edge" in size
@@ -410,12 +483,55 @@ class HuggingFaceImageProcessor:
                 if not min_size or edge_size < min_size:
                     min_size = edge_size
 
-            target_size = [min_size, min_size]
+            if min_size is None:
+                raise ValueError(
+                    "could not determine target size for resize from model config"
+                )
+            target_size = (min_size, min_size)
             preprocess_step = LetterboxImage()
         else:
             raise ValueError(
                 "could not determine target size for resize from model config"
             )
+        if size_divisor:
+            target_size = (
+                math.ceil(target_size[0] / size_divisor) * size_divisor,
+                math.ceil(target_size[1] / size_divisor) * size_divisor,
+            )
+        return target_size, preprocess_step
+
+    def _convert_pad(self) -> Tuple[Tuple[int, int], ImagePreprocessingStep]:
+        # NOTE: There is a wide variety of ways that huggingface pads images.
+        # We found at least 3 different ways to pad images in the codebase:
+        # 1. Center pad (pad top, left, bottom, right) to match the target size:
+        #    https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/models/dpt/image_processing_dpt.py#L231
+        # 2. Right/top pad (pad top and right) to match the target size:
+        #    https://github.com/huggingface/transformers/blob/174890280b340b89c5bfa092f6b4fb0e2dc2d7fc/src/transformers/models/conditional_detr/image_processing_conditional_detr.py#L846
+        # 3. Pad to the nearest multiple of size_divisor:
+        #    https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/models/llava_onevision/image_processing_llava_onevision.py#L177-179
+        #
+        # We decided to simply implement padding with LetterboxImage(),
+        # because we assume the models won't be that sensitive to the type of padding,
+        # but this may need to be revisited in the future.
+        size = self._get_size()
+        size_divisor: int | None = None
+        if hasattr(self.processor, "size_divisor"):
+            size_divisor = self.processor.size_divisor
+
+        target_size = None
+        preprocess_step = None
+        if "height" in size and "width" in size:
+            target_size = (size["height"], size["width"])
+            preprocess_step = LetterboxImage()
+        else:
+            raise ValueError(
+                "could not determine target size for resize from model config"
+            )
+        if size_divisor:
+            target_size = (
+                math.ceil(target_size[0] / size_divisor) * size_divisor,
+                math.ceil(target_size[1] / size_divisor) * size_divisor,
+            )
         return target_size, preprocess_step
 
 
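A worked example of the size_divisor rounding shared by _convert_resize and _convert_pad, with illustrative values:

import math

size_divisor = 32
target_size = (518, 640)
# Each dimension rounds up to the nearest multiple of size_divisor:
# ceil(518 / 32) * 32 = 544 and ceil(640 / 32) * 32 = 640.
target_size = (
    math.ceil(target_size[0] / size_divisor) * size_divisor,
    math.ceil(target_size[1] / size_divisor) * size_divisor,
)
assert target_size == (544, 640)
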
@@ -424,11 +540,13 @@ class HuggingFaceImageSegmentationPipeline:
         self,
         pipeline: Pipeline,
         name: Optional[str] = None,
+        image_size_override: Optional[Tuple[int, int]] = None,
     ):
         self.pipeline = pipeline
         if name is None:
             name = pipeline.model.name_or_path
 
+        self.image_size_override = image_size_override
         self.model = TorchModel(
             name,
             torch_module=self.pipeline.model,
@@ -446,7 +564,9 @@ class HuggingFaceImageSegmentationPipeline:
         )
 
         image_processor = HuggingFaceImageProcessor(
-            self.pipeline.model, self.pipeline.image_processor
+            self.pipeline.model,
+            self.pipeline.image_processor,
+            image_size_override=self.image_size_override,
         )
         spec = image_processor.input_spec()
         spec.update(image_processor.output_spec_segmentation())
@@ -471,11 +591,13 @@ class HuggingFaceObjectDetectionPipeline:
         self,
         pipeline: Pipeline,
         name: Optional[str] = None,
+        image_size_override: Optional[Tuple[int, int]] = None,
     ):
         self.pipeline = pipeline
         if name is None:
             name = pipeline.model.name_or_path
 
+        self.image_size_override = image_size_override
         self.model = TorchModel(
             name,
             torch_module=ObjectDetectionWrapper(self.pipeline.model),
@@ -493,7 +615,9 @@ class HuggingFaceObjectDetectionPipeline:
         )
 
         image_processor = HuggingFaceImageProcessor(
-            self.pipeline.model, self.pipeline.image_processor
+            self.pipeline.model,
+            self.pipeline.image_processor,
+            image_size_override=self.image_size_override,
         )
         spec = image_processor.input_spec()
         spec.update(image_processor.output_spec_object_detection())
@@ -503,6 +627,141 @@ class HuggingFaceObjectDetectionPipeline:
         return [self.model]
 
 
+class ZeroShotObjectDetectionWrapper(torch.nn.Module):
+    def __init__(self, model: PreTrainedModel):
+        super().__init__()
+        self.model = model
+
+    def forward(self, image, tokens, attention_mask):
+        outputs = self.model(
+            input_ids=tokens, pixel_values=image, attention_mask=attention_mask
+        )
+        return torch.sigmoid(outputs.logits), outputs.pred_boxes
+
+
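The sigmoid in the wrapper pairs with output_spec_object_detection(zero_shot=True) below, which declares scores_are_logits=False: the exported graph emits probabilities rather than raw logits. A small illustration with fabricated values:

import torch

logits = torch.tensor([[2.0, -1.0, 0.0]])  # made-up per-query logits
scores = torch.sigmoid(logits)             # scores squashed into [0, 1]
assert float(scores.min()) >= 0.0 and float(scores.max()) <= 1.0
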
+class HuggingFaceZeroShotObjectDetectionPipeline:
+    def __init__(
+        self,
+        pipeline: Pipeline,
+        name: Optional[str] = None,
+        tokenizer_name: Optional[str] = None,
+        image_size_override: Optional[Tuple[int, int]] = None,
+    ):
+        self.pipeline = pipeline
+        if name is None:
+            name = pipeline.model.name_or_path
+
+        self.tokenizer_model = HuggingFaceTokenizer(
+            self.pipeline.tokenizer, tokenizer_name
+        )
+
+        self.image_size_override = image_size_override
+        self.detection_model = TorchModel(
+            name,
+            torch_module=ZeroShotObjectDetectionWrapper(self.pipeline.model),
+            **self._guess_model_spec(),
+        )
+
+    def _add_zero_shot_inputs(self, spec: Dict):
+        example_inputs = spec["example_inputs"]
+        if "dynamic_shapes" not in spec:
+            spec["dynamic_shapes"] = [{} for _ in example_inputs]
+
+        max_length = self.pipeline.model.config.text_config.max_length
+        example_inputs.extend(
+            [
+                torch.randint(200, [2, max_length]).to(torch.int32),
+                torch.ones([2, max_length]).to(torch.int32),
+            ]
+        )
+
+        input_tensor_semantics = spec["input_tensor_semantics"]
+        input_tensor_semantics.extend(
+            [
+                TensorSemantics(
+                    [
+                        BatchDimension(),
+                        TokensDimension(),
+                    ]
+                ),
+                TensorSemantics(
+                    [
+                        BatchDimension(),
+                        AttentionMaskDimension(),
+                    ]
+                ),
+            ]
+        )
+
+        spec["dynamic_shapes"].extend(
+            [
+                {0: "num_labels"},
+                {0: "num_labels"},
+            ]
+        )
+
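The {0: "num_labels"} entries mark dimension 0 of the two added text inputs as dynamic, so the exported model can take a varying number of candidate label prompts; the example inputs above use two prompts only as a tracing sample. A sketch of the shapes involved, with max_length as a stand-in value:

import torch

max_length = 16  # stand-in; the real value comes from text_config.max_length
tokens = torch.randint(200, [2, max_length]).to(torch.int32)
attention_mask = torch.ones([2, max_length]).to(torch.int32)
# Dimension 0 (here 2) counts label prompts and is the one left dynamic.
assert tokens.shape == attention_mask.shape == (2, max_length)
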
+    def _guess_model_spec(self) -> Dict:
+        if self.pipeline.image_processor is None:
+            raise ValueError(
+                "Could not determine image preprocessing for pipeline with image_processor=None"
+            )
+
+        image_processor = HuggingFaceImageProcessor(
+            self.pipeline.model,
+            self.pipeline.image_processor,
+            image_size_override=self.image_size_override,
+        )
+        spec = image_processor.input_spec()
+        self._add_zero_shot_inputs(spec)
+        spec.update(image_processor.output_spec_object_detection(zero_shot=True))
+        return spec
+
+    def models(self) -> List[Model]:
+        return [self.detection_model, self.tokenizer_model]
+
+
+class HuggingFaceDepthEstimationPipeline:
+    def __init__(
+        self,
+        pipeline: Pipeline,
+        name: Optional[str] = None,
+        image_size_override: Optional[Tuple[int, int]] = None,
+    ):
+        self.pipeline = pipeline
+        if name is None:
+            name = pipeline.model.name_or_path
+
+        self.image_size_override = image_size_override
+
+        self.model = TorchModel(
+            name,
+            torch_module=self.pipeline.model,
+            **self._guess_model_spec(),
+        )
+
+    def _guess_model_spec(self) -> Dict:
+        if self.pipeline.image_processor is None:
+            raise ValueError(
+                "Could not determine image preprocessing for pipeline with image_processor=None"
+            )
+        if self.pipeline.tokenizer is not None:
+            raise NotImplementedError(
+                "HuggingFaceDepthEstimationPipeline does not yet support token inputs"
+            )
+
+        image_processor = HuggingFaceImageProcessor(
+            self.pipeline.model,
+            self.pipeline.image_processor,
+            image_size_override=self.image_size_override,
+        )
+        spec = image_processor.input_spec()
+        spec.update(image_processor.output_spec_depth())
+        return spec
+
+    def models(self) -> List[Model]:
+        return [self.model]
+
+
 def import_huggingface_pipeline(pipeline: Pipeline, **kwargs) -> List[Model]:
     if pipeline.framework != "pt":
         raise ValueError(
@@ -517,8 +776,19 @@ def import_huggingface_pipeline(pipeline: Pipeline, **kwargs) -> List[Model]:
         return HuggingFaceImageSegmentationPipeline(pipeline, **kwargs).models()
     elif pipeline.task == "object-detection":
         return HuggingFaceObjectDetectionPipeline(pipeline, **kwargs).models()
+    elif pipeline.task == "zero-shot-object-detection":
+        return HuggingFaceZeroShotObjectDetectionPipeline(pipeline, **kwargs).models()
+    elif pipeline.task == "depth-estimation":
+        return HuggingFaceDepthEstimationPipeline(pipeline, **kwargs).models()
     raise ValueError(
         "unimplemented: hugging face pipeline task: {} (supported tasks: [{}])".format(
-            pipeline.task, ["text-generation", "image-segmentation", "object-detection"]
+            pipeline.task,
+            [
+                "text-generation",
+                "image-segmentation",
+                "object-detection",
+                "zero-shot-object-detection",
+                "depth-estimation",
+            ],
         )
     )
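
Taken together, the two new tasks import the same way as the existing ones. A minimal usage sketch; the checkpoint names are assumptions, and any pipeline-compatible models should behave the same:

import transformers

from gml.hf import import_huggingface_pipeline

# Depth estimation exports a single TorchModel.
depth = transformers.pipeline(
    "depth-estimation", model="LiheYoung/depth-anything-small-hf"
)
depth_models = import_huggingface_pipeline(depth)

# Zero-shot object detection exports the detection model plus a tokenizer model.
zsod = transformers.pipeline(
    "zero-shot-object-detection", model="google/owlvit-base-patch32"
)
zsod_models = import_huggingface_pipeline(zsod)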
gml/model.py CHANGED
@@ -21,8 +21,9 @@ import io
 from pathlib import Path
 from typing import BinaryIO, Dict, List, Literal, Optional, Sequence, TextIO, Tuple
 
-import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 import torch
+
+import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 from gml.asset_manager import AssetManager, TempFileAssetManager
 from gml.compile import to_torch_mlir
 from gml.preprocessing import ImagePreprocessingStep