gimlet-api 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gml/hf.py CHANGED
@@ -15,14 +15,23 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import glob
+import math
 import tempfile
+import warnings
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Tuple
 
-import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 import torch
 import transformers
+from transformers import (
+    BaseImageProcessor,
+    Pipeline,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+)
+
+import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 from gml.asset_manager import AssetManager
 from gml.model import GenerationConfig, Model, TorchModel
 from gml.preprocessing import (
@@ -34,10 +43,12 @@ from gml.preprocessing import (
 )
 from gml.tensor import (
     AttentionKeyValueCacheTensorSemantics,
+    AttentionMaskDimension,
     BatchDimension,
     BoundingBoxFormat,
     DetectionNumCandidatesDimension,
     DetectionOutputDimension,
+    DimensionSemantics,
     ImageChannelDimension,
     ImageHeightDimension,
     ImageWidthDimension,
@@ -46,12 +57,8 @@ from gml.tensor import (
     TokensDimension,
     VocabLogitsDimension,
 )
-from transformers import (
-    BaseImageProcessor,
-    Pipeline,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-)
+
+FALLBACK_RESIZE_SIZE = 512
 
 
 class HuggingFaceTokenizer(Model):
@@ -77,7 +84,6 @@ class HuggingFaceTokenizer(Model):
 
 
 class HuggingFaceGenerationConfig(GenerationConfig):
-
     def __init__(self, model: PreTrainedModel):
         config = model.generation_config
         eos_tokens = config.eos_token_id
@@ -242,25 +248,34 @@ class HuggingFaceTextGenerationPipeline:
 
 
 class HuggingFaceImageProcessor:
-
     def __init__(
         self,
         model: PreTrainedModel,
         processor: BaseImageProcessor,
+        image_size_override: Optional[Tuple[int, int]] = None,
     ):
         self.model = model
         self.processor = processor
+        self.image_size_override = image_size_override
 
     def input_spec(self) -> Dict[str, Any]:
         target_size = None
         image_preprocessing_steps = []
-        if (
-            hasattr(self.processor, "do_resize")
-            and self.processor.do_resize
-            and hasattr(self.processor, "size")
-        ):
+        has_do_resize = (
+            hasattr(self.processor, "do_resize") and self.processor.do_resize
+        )
+        has_do_pad = hasattr(self.processor, "do_pad") and self.processor.do_pad
+        # NOTE: it is possible for both do_resize and do_pad to be set, in which case we only use do_resize.
+        if has_do_resize:
             target_size, preprocessing_step = self._convert_resize()
             image_preprocessing_steps.append(preprocessing_step)
+        elif has_do_pad:
+            target_size, preprocessing_step = self._convert_pad()
+            image_preprocessing_steps.append(preprocessing_step)
+        else:
+            raise ValueError(
+                "could not determine target size for resize from model config"
+            )
 
         if (
             hasattr(self.processor, "do_rescale")
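
To make the new branching concrete: do_resize takes precedence whenever both flags are set, and the pad path is only taken when do_resize is absent or false. A minimal sketch of the decision, assuming a hypothetical stand-in processor (not package code):

class StubProcessor:
    do_resize = False
    do_pad = True

proc = StubProcessor()
has_do_resize = hasattr(proc, "do_resize") and proc.do_resize
has_do_pad = hasattr(proc, "do_pad") and proc.do_pad
if has_do_resize:  # resize wins when both flags are set
    chosen = "_convert_resize"
elif has_do_pad:
    chosen = "_convert_pad"
else:
    raise ValueError("could not determine target size for resize from model config")
assert chosen == "_convert_pad"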
@@ -291,7 +306,7 @@ class HuggingFaceImageProcessor:
         # TODO(james): figure out if this is specified anywhere in the huggingface pipeline.
         channel_format = "rgb"
 
-        dimensions = [
+        dimensions: list[DimensionSemantics] = [
             BatchDimension(),
         ]
         input_shape = [1]
@@ -319,6 +334,14 @@ class HuggingFaceImageProcessor:
             raise NotImplementedError(
                 "only semantic segmentation is currently supported"
             )
+        # TODO(philkuz): Support panoptic segmentation models. Panoptic segmentation models produce multiple
+        # outputs, so we need to decide whether to convert the panoptic output into semantic segmentation
+        # format or to support panoptic segmentation output directly.
+        if hasattr(self.processor, "post_process_panoptic_segmentation"):
+            raise NotImplementedError(
+                "panoptic segmentation models are not supported yet"
+            )
+
         dimensions = [
             BatchDimension(),
             # TODO(james): verify all semantic segmentation in hugging face output a logits mask.
@@ -342,21 +365,38 @@ class HuggingFaceImageProcessor:
             "class_labels": labels,
         }
 
-    def output_spec_object_detection(self) -> Dict[str, Any]:
+    def output_spec_depth(self) -> Dict[str, Any]:
+        dimensions = [
+            BatchDimension(),
+            ImageHeightDimension(),
+            ImageWidthDimension(),
+        ]
+        output_tensor_semantics = [
+            TensorSemantics(dimensions),
+        ]
+        return {
+            "output_tensor_semantics": output_tensor_semantics,
+        }
+
+    def output_spec_object_detection(self, zero_shot=False) -> Dict[str, Any]:
         if not hasattr(self.processor, "post_process_object_detection"):
             raise NotImplementedError(
                 "processor must have post_process_object_detection set"
             )
 
-        id_to_label = self.model.config.id2label
-        max_id = max(id_to_label)
-        labels = []
-        for i in range(max_id):
-            if i not in id_to_label:
-                labels.append("")
-                continue
-            labels.append(id_to_label[i])
-        num_classes = max_id + 1
+        if zero_shot:
+            num_classes = -1
+            labels = []
+        else:
+            id_to_label = self.model.config.id2label
+            max_id = max(id_to_label)
+            labels = []
+            for i in range(max_id):
+                if i not in id_to_label:
+                    labels.append("")
+                    continue
+                labels.append(id_to_label[i])
+            num_classes = max_id + 1
 
         # TODO(james): verify assumptions made here apply broadly.
         output_tensor_semantics = []
@@ -366,7 +406,7 @@ class HuggingFaceImageProcessor:
             DetectionNumCandidatesDimension(is_nms=False),
             DetectionOutputDimension(
                 scores_range=(0, num_classes),
-                scores_are_logits=True,
+                scores_are_logits=not zero_shot,
             ),
         ]
         output_tensor_semantics.append(TensorSemantics(logits_dimensions))
@@ -385,12 +425,45 @@ class HuggingFaceImageProcessor:
             "class_labels": labels,
         }
 
+    def _get_size(self) -> Dict[str, int]:
+        size = None
+        if self.image_size_override:
+            size = {
+                "height": self.image_size_override[0],
+                "width": self.image_size_override[1],
+            }
+        elif hasattr(self.processor, "size") and self.processor.size is not None:
+            size = self.processor.size
+        elif (
+            hasattr(self.model.config, "image_size")
+            and self.model.config.image_size is not None
+        ):
+            size = {
+                "height": self.model.config.image_size,
+                "width": self.model.config.image_size,
+            }
+        else:
+            warnings.warn(
+                f"using fallback resize size of {FALLBACK_RESIZE_SIZE} for model",
+                stacklevel=1,
+            )
+            size = {
+                "width": FALLBACK_RESIZE_SIZE,
+                "height": FALLBACK_RESIZE_SIZE,
+            }
+        return size
+
     def _convert_resize(self) -> Tuple[Tuple[int, int], ImagePreprocessingStep]:
-        size = self.processor.size
+        size = self._get_size()
+        size_divisor: int | None = None
+        if hasattr(self.processor, "size_divisor"):
+            size_divisor = self.processor.size_divisor
+
         target_size = None
         preprocess_step = None
+
         if "height" in size and "width" in size:
-            target_size = [size["height"], size["width"]]
+            target_size = (size["height"], size["width"])
             preprocess_step = ResizeImage()
         elif (
             "shortest_edge" in size
@@ -410,12 +483,55 @@ class HuggingFaceImageProcessor:
                 if not min_size or edge_size < min_size:
                     min_size = edge_size
 
-            target_size = [min_size, min_size]
+            if min_size is None:
+                raise ValueError(
+                    "could not determine target size for resize from model config"
+                )
+            target_size = (min_size, min_size)
             preprocess_step = LetterboxImage()
         else:
             raise ValueError(
                 "could not determine target size for resize from model config"
             )
+        if size_divisor:
+            target_size = (
+                math.ceil(target_size[0] / size_divisor) * size_divisor,
+                math.ceil(target_size[1] / size_divisor) * size_divisor,
+            )
+        return target_size, preprocess_step
+
+    def _convert_pad(self) -> Tuple[Tuple[int, int], ImagePreprocessingStep]:
+        # NOTE: There is a wide variety of ways that huggingface pads images.
+        # We found at least 3 different ways to pad images in the codebase:
+        # 1. Center pad (pad top, left, bottom, right) to match the target size:
+        #    https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/models/dpt/image_processing_dpt.py#L231
+        # 2. Right/top pad (pad top and right) to match the target size:
+        #    https://github.com/huggingface/transformers/blob/174890280b340b89c5bfa092f6b4fb0e2dc2d7fc/src/transformers/models/conditional_detr/image_processing_conditional_detr.py#L846
+        # 3. Pad to the nearest multiple of size_divisor:
+        #    https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/models/llava_onevision/image_processing_llava_onevision.py#L177-179
+        #
+        # We decided to simply implement padding with LetterboxImage(),
+        # because we assume the models won't be that sensitive to the type of padding,
+        # but this may need to be revisited in the future.
+        size = self._get_size()
+        size_divisor: int | None = None
+        if hasattr(self.processor, "size_divisor"):
+            size_divisor = self.processor.size_divisor
+
+        target_size = None
+        preprocess_step = None
+        if "height" in size and "width" in size:
+            target_size = (size["height"], size["width"])
+            preprocess_step = LetterboxImage()
+        else:
+            raise ValueError(
+                "could not determine target size for resize from model config"
+            )
+        if size_divisor:
+            target_size = (
+                math.ceil(target_size[0] / size_divisor) * size_divisor,
+                math.ceil(target_size[1] / size_divisor) * size_divisor,
+            )
         return target_size, preprocess_step
 
 
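A worked example of the size_divisor rounding shared by _convert_resize and _convert_pad, with illustrative values:

import math

size_divisor = 32
target_size = (518, 640)
# Each dimension rounds up to the nearest multiple of size_divisor:
# ceil(518 / 32) * 32 = 544 and ceil(640 / 32) * 32 = 640.
target_size = (
    math.ceil(target_size[0] / size_divisor) * size_divisor,
    math.ceil(target_size[1] / size_divisor) * size_divisor,
)
assert target_size == (544, 640)
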
@@ -424,11 +540,13 @@ class HuggingFaceImageSegmentationPipeline:
         self,
         pipeline: Pipeline,
         name: Optional[str] = None,
+        image_size_override: Optional[Tuple[int, int]] = None,
     ):
         self.pipeline = pipeline
         if name is None:
             name = pipeline.model.name_or_path
 
+        self.image_size_override = image_size_override
         self.model = TorchModel(
             name,
             torch_module=self.pipeline.model,
@@ -446,7 +564,9 @@ class HuggingFaceImageSegmentationPipeline:
         )
 
         image_processor = HuggingFaceImageProcessor(
-            self.pipeline.model, self.pipeline.image_processor
+            self.pipeline.model,
+            self.pipeline.image_processor,
+            image_size_override=self.image_size_override,
         )
         spec = image_processor.input_spec()
         spec.update(image_processor.output_spec_segmentation())
@@ -471,11 +591,13 @@ class HuggingFaceObjectDetectionPipeline:
         self,
         pipeline: Pipeline,
         name: Optional[str] = None,
+        image_size_override: Optional[Tuple[int, int]] = None,
     ):
         self.pipeline = pipeline
         if name is None:
             name = pipeline.model.name_or_path
 
+        self.image_size_override = image_size_override
         self.model = TorchModel(
             name,
             torch_module=ObjectDetectionWrapper(self.pipeline.model),
@@ -493,7 +615,9 @@ class HuggingFaceObjectDetectionPipeline:
         )
 
         image_processor = HuggingFaceImageProcessor(
-            self.pipeline.model, self.pipeline.image_processor
+            self.pipeline.model,
+            self.pipeline.image_processor,
+            image_size_override=self.image_size_override,
         )
         spec = image_processor.input_spec()
         spec.update(image_processor.output_spec_object_detection())
@@ -503,6 +627,141 @@ class HuggingFaceObjectDetectionPipeline:
         return [self.model]
 
 
+class ZeroShotObjectDetectionWrapper(torch.nn.Module):
+    def __init__(self, model: PreTrainedModel):
+        super().__init__()
+        self.model = model
+
+    def forward(self, image, tokens, attention_mask):
+        outputs = self.model(
+            input_ids=tokens, pixel_values=image, attention_mask=attention_mask
+        )
+        return torch.sigmoid(outputs.logits), outputs.pred_boxes
+
+
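The sigmoid in the wrapper pairs with output_spec_object_detection(zero_shot=True) below, which declares scores_are_logits=False: the exported graph emits probabilities rather than raw logits. A small illustration with fabricated values:

import torch

logits = torch.tensor([[2.0, -1.0, 0.0]])  # made-up per-query logits
scores = torch.sigmoid(logits)             # scores squashed into [0, 1]
assert float(scores.min()) >= 0.0 and float(scores.max()) <= 1.0
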
+class HuggingFaceZeroShotObjectDetectionPipeline:
+    def __init__(
+        self,
+        pipeline: Pipeline,
+        name: Optional[str] = None,
+        tokenizer_name: Optional[str] = None,
+        image_size_override: Optional[Tuple[int, int]] = None,
+    ):
+        self.pipeline = pipeline
+        if name is None:
+            name = pipeline.model.name_or_path
+
+        self.tokenizer_model = HuggingFaceTokenizer(
+            self.pipeline.tokenizer, tokenizer_name
+        )
+
+        self.image_size_override = image_size_override
+        self.detection_model = TorchModel(
+            name,
+            torch_module=ZeroShotObjectDetectionWrapper(self.pipeline.model),
+            **self._guess_model_spec(),
+        )
+
+    def _add_zero_shot_inputs(self, spec: Dict):
+        example_inputs = spec["example_inputs"]
+        if "dynamic_shapes" not in spec:
+            spec["dynamic_shapes"] = [{} for _ in example_inputs]
+
+        max_length = self.pipeline.model.config.text_config.max_length
+        example_inputs.extend(
+            [
+                torch.randint(200, [2, max_length]).to(torch.int32),
+                torch.ones([2, max_length]).to(torch.int32),
+            ]
+        )
+
+        input_tensor_semantics = spec["input_tensor_semantics"]
+        input_tensor_semantics.extend(
+            [
+                TensorSemantics(
+                    [
+                        BatchDimension(),
+                        TokensDimension(),
+                    ]
+                ),
+                TensorSemantics(
+                    [
+                        BatchDimension(),
+                        AttentionMaskDimension(),
+                    ]
+                ),
+            ]
+        )
+
+        spec["dynamic_shapes"].extend(
+            [
+                {0: "num_labels"},
+                {0: "num_labels"},
+            ]
+        )
+
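The {0: "num_labels"} entries mark dimension 0 of the two added text inputs as dynamic, so the exported model can take a varying number of candidate label prompts; the example inputs above use two prompts only as a tracing sample. A sketch of the shapes involved, with max_length as a stand-in value:

import torch

max_length = 16  # stand-in; the real value comes from text_config.max_length
tokens = torch.randint(200, [2, max_length]).to(torch.int32)
attention_mask = torch.ones([2, max_length]).to(torch.int32)
# Dimension 0 (here 2) counts label prompts and is the one left dynamic.
assert tokens.shape == attention_mask.shape == (2, max_length)
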
+    def _guess_model_spec(self) -> Dict:
+        if self.pipeline.image_processor is None:
+            raise ValueError(
+                "Could not determine image preprocessing for pipeline with image_processor=None"
+            )
+
+        image_processor = HuggingFaceImageProcessor(
+            self.pipeline.model,
+            self.pipeline.image_processor,
+            image_size_override=self.image_size_override,
+        )
+        spec = image_processor.input_spec()
+        self._add_zero_shot_inputs(spec)
+        spec.update(image_processor.output_spec_object_detection(zero_shot=True))
+        return spec
+
+    def models(self) -> List[Model]:
+        return [self.detection_model, self.tokenizer_model]
+
+
+class HuggingFaceDepthEstimationPipeline:
+    def __init__(
+        self,
+        pipeline: Pipeline,
+        name: Optional[str] = None,
+        image_size_override: Optional[Tuple[int, int]] = None,
+    ):
+        self.pipeline = pipeline
+        if name is None:
+            name = pipeline.model.name_or_path
+
+        self.image_size_override = image_size_override
+
+        self.model = TorchModel(
+            name,
+            torch_module=self.pipeline.model,
+            **self._guess_model_spec(),
+        )
+
+    def _guess_model_spec(self) -> Dict:
+        if self.pipeline.image_processor is None:
+            raise ValueError(
+                "Could not determine image preprocessing for pipeline with image_processor=None"
+            )
+        if self.pipeline.tokenizer is not None:
+            raise NotImplementedError(
+                "HuggingFaceDepthEstimationPipeline does not yet support token inputs"
+            )
+
+        image_processor = HuggingFaceImageProcessor(
+            self.pipeline.model,
+            self.pipeline.image_processor,
+            image_size_override=self.image_size_override,
+        )
+        spec = image_processor.input_spec()
+        spec.update(image_processor.output_spec_depth())
+        return spec
+
+    def models(self) -> List[Model]:
+        return [self.model]
+
+
 def import_huggingface_pipeline(pipeline: Pipeline, **kwargs) -> List[Model]:
     if pipeline.framework != "pt":
         raise ValueError(
@@ -517,8 +776,19 @@ def import_huggingface_pipeline(pipeline: Pipeline, **kwargs) -> List[Model]:
         return HuggingFaceImageSegmentationPipeline(pipeline, **kwargs).models()
     elif pipeline.task == "object-detection":
         return HuggingFaceObjectDetectionPipeline(pipeline, **kwargs).models()
+    elif pipeline.task == "zero-shot-object-detection":
+        return HuggingFaceZeroShotObjectDetectionPipeline(pipeline, **kwargs).models()
+    elif pipeline.task == "depth-estimation":
+        return HuggingFaceDepthEstimationPipeline(pipeline, **kwargs).models()
     raise ValueError(
         "unimplemented: hugging face pipeline task: {} (supported tasks: [{}])".format(
-            pipeline.task, ["text-generation", "image-segmentation", "object-detection"]
+            pipeline.task,
+            [
+                "text-generation",
+                "image-segmentation",
+                "object-detection",
+                "zero-shot-object-detection",
+                "depth-estimation",
+            ],
         )
     )
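
Taken together, the two new tasks import the same way as the existing ones. A minimal usage sketch; the checkpoint names are assumptions, and any pipeline-compatible models should behave the same:

import transformers

from gml.hf import import_huggingface_pipeline

# Depth estimation exports a single TorchModel.
depth = transformers.pipeline(
    "depth-estimation", model="LiheYoung/depth-anything-small-hf"
)
depth_models = import_huggingface_pipeline(depth)

# Zero-shot object detection exports the detection model plus a tokenizer model.
zsod = transformers.pipeline(
    "zero-shot-object-detection", model="google/owlvit-base-patch32"
)
zsod_models = import_huggingface_pipeline(zsod)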
gml/model.py CHANGED
@@ -21,8 +21,9 @@ import io
 from pathlib import Path
 from typing import BinaryIO, Dict, List, Literal, Optional, Sequence, TextIO, Tuple
 
-import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 import torch
+
+import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 from gml.asset_manager import AssetManager, TempFileAssetManager
 from gml.compile import to_torch_mlir
 from gml.preprocessing import ImagePreprocessingStep