inference_models-0.18.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inference_models/__init__.py +36 -0
- inference_models/configuration.py +72 -0
- inference_models/constants.py +2 -0
- inference_models/entities.py +5 -0
- inference_models/errors.py +137 -0
- inference_models/logger.py +52 -0
- inference_models/model_pipelines/__init__.py +0 -0
- inference_models/model_pipelines/auto_loaders/__init__.py +0 -0
- inference_models/model_pipelines/auto_loaders/core.py +120 -0
- inference_models/model_pipelines/auto_loaders/pipelines_registry.py +36 -0
- inference_models/model_pipelines/face_and_gaze_detection/__init__.py +0 -0
- inference_models/model_pipelines/face_and_gaze_detection/mediapipe_l2cs.py +200 -0
- inference_models/models/__init__.py +0 -0
- inference_models/models/auto_loaders/__init__.py +0 -0
- inference_models/models/auto_loaders/access_manager.py +168 -0
- inference_models/models/auto_loaders/auto_negotiation.py +1329 -0
- inference_models/models/auto_loaders/auto_resolution_cache.py +129 -0
- inference_models/models/auto_loaders/constants.py +7 -0
- inference_models/models/auto_loaders/core.py +1341 -0
- inference_models/models/auto_loaders/dependency_models.py +52 -0
- inference_models/models/auto_loaders/entities.py +57 -0
- inference_models/models/auto_loaders/models_registry.py +497 -0
- inference_models/models/auto_loaders/presentation_utils.py +333 -0
- inference_models/models/auto_loaders/ranking.py +413 -0
- inference_models/models/auto_loaders/utils.py +31 -0
- inference_models/models/base/__init__.py +0 -0
- inference_models/models/base/classification.py +123 -0
- inference_models/models/base/depth_estimation.py +62 -0
- inference_models/models/base/documents_parsing.py +111 -0
- inference_models/models/base/embeddings.py +66 -0
- inference_models/models/base/instance_segmentation.py +87 -0
- inference_models/models/base/keypoints_detection.py +93 -0
- inference_models/models/base/object_detection.py +143 -0
- inference_models/models/base/semantic_segmentation.py +74 -0
- inference_models/models/base/types.py +5 -0
- inference_models/models/clip/__init__.py +0 -0
- inference_models/models/clip/clip_onnx.py +148 -0
- inference_models/models/clip/clip_pytorch.py +104 -0
- inference_models/models/clip/preprocessing.py +162 -0
- inference_models/models/common/__init__.py +0 -0
- inference_models/models/common/cuda.py +30 -0
- inference_models/models/common/model_packages.py +25 -0
- inference_models/models/common/onnx.py +379 -0
- inference_models/models/common/roboflow/__init__.py +0 -0
- inference_models/models/common/roboflow/model_packages.py +361 -0
- inference_models/models/common/roboflow/post_processing.py +436 -0
- inference_models/models/common/roboflow/pre_processing.py +1332 -0
- inference_models/models/common/torch.py +20 -0
- inference_models/models/common/trt.py +266 -0
- inference_models/models/deep_lab_v3_plus/__init__.py +0 -0
- inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_onnx.py +282 -0
- inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_torch.py +264 -0
- inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py +313 -0
- inference_models/models/depth_anything_v2/__init__.py +0 -0
- inference_models/models/depth_anything_v2/depth_anything_v2_hf.py +77 -0
- inference_models/models/dinov3/__init__.py +0 -0
- inference_models/models/dinov3/dinov3_classification_onnx.py +348 -0
- inference_models/models/dinov3/dinov3_classification_torch.py +323 -0
- inference_models/models/doctr/__init__.py +0 -0
- inference_models/models/doctr/doctr_torch.py +304 -0
- inference_models/models/easy_ocr/__init__.py +0 -0
- inference_models/models/easy_ocr/easy_ocr_torch.py +222 -0
- inference_models/models/florence2/__init__.py +0 -0
- inference_models/models/florence2/florence2_hf.py +897 -0
- inference_models/models/grounding_dino/__init__.py +0 -0
- inference_models/models/grounding_dino/grounding_dino_torch.py +227 -0
- inference_models/models/l2cs/__init__.py +0 -0
- inference_models/models/l2cs/l2cs_onnx.py +216 -0
- inference_models/models/mediapipe_face_detection/__init__.py +0 -0
- inference_models/models/mediapipe_face_detection/face_detection.py +203 -0
- inference_models/models/moondream2/__init__.py +0 -0
- inference_models/models/moondream2/moondream2_hf.py +281 -0
- inference_models/models/owlv2/__init__.py +0 -0
- inference_models/models/owlv2/cache.py +182 -0
- inference_models/models/owlv2/entities.py +112 -0
- inference_models/models/owlv2/owlv2_hf.py +695 -0
- inference_models/models/owlv2/reference_dataset.py +291 -0
- inference_models/models/paligemma/__init__.py +0 -0
- inference_models/models/paligemma/paligemma_hf.py +209 -0
- inference_models/models/perception_encoder/__init__.py +0 -0
- inference_models/models/perception_encoder/perception_encoder_pytorch.py +197 -0
- inference_models/models/perception_encoder/vision_encoder/__init__.py +0 -0
- inference_models/models/perception_encoder/vision_encoder/config.py +160 -0
- inference_models/models/perception_encoder/vision_encoder/pe.py +742 -0
- inference_models/models/perception_encoder/vision_encoder/rope.py +344 -0
- inference_models/models/perception_encoder/vision_encoder/tokenizer.py +342 -0
- inference_models/models/perception_encoder/vision_encoder/transforms.py +33 -0
- inference_models/models/qwen25vl/__init__.py +1 -0
- inference_models/models/qwen25vl/qwen25vl_hf.py +285 -0
- inference_models/models/resnet/__init__.py +0 -0
- inference_models/models/resnet/resnet_classification_onnx.py +330 -0
- inference_models/models/resnet/resnet_classification_torch.py +305 -0
- inference_models/models/resnet/resnet_classification_trt.py +369 -0
- inference_models/models/rfdetr/__init__.py +0 -0
- inference_models/models/rfdetr/backbone_builder.py +101 -0
- inference_models/models/rfdetr/class_remapping.py +41 -0
- inference_models/models/rfdetr/common.py +115 -0
- inference_models/models/rfdetr/default_labels.py +108 -0
- inference_models/models/rfdetr/dinov2_with_windowed_attn.py +1330 -0
- inference_models/models/rfdetr/misc.py +26 -0
- inference_models/models/rfdetr/ms_deform_attn.py +180 -0
- inference_models/models/rfdetr/ms_deform_attn_func.py +60 -0
- inference_models/models/rfdetr/position_encoding.py +166 -0
- inference_models/models/rfdetr/post_processor.py +83 -0
- inference_models/models/rfdetr/projector.py +373 -0
- inference_models/models/rfdetr/rfdetr_backbone_pytorch.py +394 -0
- inference_models/models/rfdetr/rfdetr_base_pytorch.py +807 -0
- inference_models/models/rfdetr/rfdetr_instance_segmentation_onnx.py +206 -0
- inference_models/models/rfdetr/rfdetr_instance_segmentation_pytorch.py +373 -0
- inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +227 -0
- inference_models/models/rfdetr/rfdetr_object_detection_onnx.py +244 -0
- inference_models/models/rfdetr/rfdetr_object_detection_pytorch.py +470 -0
- inference_models/models/rfdetr/rfdetr_object_detection_trt.py +270 -0
- inference_models/models/rfdetr/segmentation_head.py +273 -0
- inference_models/models/rfdetr/transformer.py +767 -0
- inference_models/models/roboflow_instant/__init__.py +0 -0
- inference_models/models/roboflow_instant/roboflow_instant_hf.py +141 -0
- inference_models/models/sam/__init__.py +0 -0
- inference_models/models/sam/cache.py +147 -0
- inference_models/models/sam/entities.py +25 -0
- inference_models/models/sam/sam_torch.py +675 -0
- inference_models/models/sam2/__init__.py +0 -0
- inference_models/models/sam2/cache.py +162 -0
- inference_models/models/sam2/entities.py +43 -0
- inference_models/models/sam2/sam2_torch.py +905 -0
- inference_models/models/sam2_rt/__init__.py +0 -0
- inference_models/models/sam2_rt/sam2_pytorch.py +119 -0
- inference_models/models/smolvlm/__init__.py +0 -0
- inference_models/models/smolvlm/smolvlm_hf.py +245 -0
- inference_models/models/trocr/__init__.py +0 -0
- inference_models/models/trocr/trocr_hf.py +53 -0
- inference_models/models/vit/__init__.py +0 -0
- inference_models/models/vit/vit_classification_huggingface.py +319 -0
- inference_models/models/vit/vit_classification_onnx.py +326 -0
- inference_models/models/vit/vit_classification_trt.py +365 -0
- inference_models/models/yolact/__init__.py +1 -0
- inference_models/models/yolact/yolact_instance_segmentation_onnx.py +336 -0
- inference_models/models/yolact/yolact_instance_segmentation_trt.py +361 -0
- inference_models/models/yolo_world/__init__.py +1 -0
- inference_models/models/yolonas/__init__.py +0 -0
- inference_models/models/yolonas/nms.py +44 -0
- inference_models/models/yolonas/yolonas_object_detection_onnx.py +204 -0
- inference_models/models/yolonas/yolonas_object_detection_trt.py +230 -0
- inference_models/models/yolov10/__init__.py +0 -0
- inference_models/models/yolov10/yolov10_object_detection_onnx.py +187 -0
- inference_models/models/yolov10/yolov10_object_detection_trt.py +215 -0
- inference_models/models/yolov11/__init__.py +0 -0
- inference_models/models/yolov11/yolov11_onnx.py +28 -0
- inference_models/models/yolov11/yolov11_torch_script.py +25 -0
- inference_models/models/yolov11/yolov11_trt.py +21 -0
- inference_models/models/yolov12/__init__.py +0 -0
- inference_models/models/yolov12/yolov12_onnx.py +7 -0
- inference_models/models/yolov12/yolov12_torch_script.py +7 -0
- inference_models/models/yolov12/yolov12_trt.py +7 -0
- inference_models/models/yolov5/__init__.py +0 -0
- inference_models/models/yolov5/nms.py +99 -0
- inference_models/models/yolov5/yolov5_instance_segmentation_onnx.py +225 -0
- inference_models/models/yolov5/yolov5_instance_segmentation_trt.py +255 -0
- inference_models/models/yolov5/yolov5_object_detection_onnx.py +192 -0
- inference_models/models/yolov5/yolov5_object_detection_trt.py +218 -0
- inference_models/models/yolov7/__init__.py +0 -0
- inference_models/models/yolov7/yolov7_instance_segmentation_onnx.py +226 -0
- inference_models/models/yolov7/yolov7_instance_segmentation_trt.py +253 -0
- inference_models/models/yolov8/__init__.py +0 -0
- inference_models/models/yolov8/yolov8_classification_onnx.py +181 -0
- inference_models/models/yolov8/yolov8_instance_segmentation_onnx.py +239 -0
- inference_models/models/yolov8/yolov8_instance_segmentation_torch_script.py +201 -0
- inference_models/models/yolov8/yolov8_instance_segmentation_trt.py +268 -0
- inference_models/models/yolov8/yolov8_key_points_detection_onnx.py +263 -0
- inference_models/models/yolov8/yolov8_key_points_detection_torch_script.py +218 -0
- inference_models/models/yolov8/yolov8_key_points_detection_trt.py +287 -0
- inference_models/models/yolov8/yolov8_object_detection_onnx.py +213 -0
- inference_models/models/yolov8/yolov8_object_detection_torch_script.py +166 -0
- inference_models/models/yolov8/yolov8_object_detection_trt.py +231 -0
- inference_models/models/yolov9/__init__.py +0 -0
- inference_models/models/yolov9/yolov9_onnx.py +7 -0
- inference_models/models/yolov9/yolov9_torch_script.py +7 -0
- inference_models/models/yolov9/yolov9_trt.py +7 -0
- inference_models/runtime_introspection/__init__.py +0 -0
- inference_models/runtime_introspection/core.py +410 -0
- inference_models/utils/__init__.py +0 -0
- inference_models/utils/download.py +608 -0
- inference_models/utils/environment.py +28 -0
- inference_models/utils/file_system.py +51 -0
- inference_models/utils/hashing.py +7 -0
- inference_models/utils/imports.py +48 -0
- inference_models/utils/onnx_introspection.py +17 -0
- inference_models/weights_providers/__init__.py +0 -0
- inference_models/weights_providers/core.py +20 -0
- inference_models/weights_providers/entities.py +159 -0
- inference_models/weights_providers/roboflow.py +601 -0
- inference_models-0.18.3.dist-info/METADATA +466 -0
- inference_models-0.18.3.dist-info/RECORD +195 -0
- inference_models-0.18.3.dist-info/WHEEL +5 -0
- inference_models-0.18.3.dist-info/top_level.txt +1 -0

inference_models/models/owlv2/reference_dataset.py
@@ -0,0 +1,291 @@
import hashlib
import os.path
import re
import urllib.parse
from typing import List, Optional, Union

import backoff
import cv2
import numpy as np
import pybase64
import requests
import torch
from requests import Timeout
from tldextract import tldextract
from tldextract.tldextract import ExtractResult

from inference_models.configuration import (
    API_CALLS_MAX_TRIES,
    IDEMPOTENT_API_REQUEST_CODES_TO_RETRY,
)
from inference_models.errors import ModelInputError, ModelRuntimeError, RetryError

BASE64_DATA_TYPE_PATTERN = re.compile(r"^data:image\/[a-z]+;base64,")


class LazyImageWrapper:

    @classmethod
    def init(
        cls,
        image: Union[np.ndarray, torch.Tensor, str, bytes],
        allow_url_input: bool,
        allow_non_https_url: bool,
        allow_url_without_fqdn: bool,
        whitelisted_domains: Optional[List[str]],
        blacklisted_domains: Optional[List[str]],
        allow_local_storage_access: bool,
    ):
        image_in_memory, image_reference = None, None
        if isinstance(image, (torch.Tensor, np.ndarray)):
            image_in_memory = image
        else:
            image_reference = image
        return cls(
            allow_url_input=allow_url_input,
            allow_non_https_url=allow_non_https_url,
            allow_url_without_fqdn=allow_url_without_fqdn,
            whitelisted_domains=whitelisted_domains,
            blacklisted_domains=blacklisted_domains,
            allow_local_storage_access=allow_local_storage_access,
            image_in_memory=image_in_memory,
            image_reference=image_reference,
        )

    def __init__(
        self,
        allow_url_input: bool,
        allow_non_https_url: bool,
        allow_url_without_fqdn: bool,
        whitelisted_domains: Optional[List[str]],
        blacklisted_domains: Optional[List[str]],
        allow_local_storage_access: bool,
        image_in_memory: Optional[Union[np.ndarray, torch.Tensor]] = None,
        image_reference: Optional[Union[str, bytes]] = None,
        image_hash: Optional[str] = None,
    ):
        self._allow_url_input = allow_url_input
        self._allow_non_https_url = allow_non_https_url
        self._allow_url_without_fqdn = allow_url_without_fqdn
        self._whitelisted_domains = whitelisted_domains
        self._blacklisted_domains = blacklisted_domains
        self._allow_local_storage_access = allow_local_storage_access
        if image_in_memory is None and image_reference is None:
            raise ModelRuntimeError(
                message="Attempted to use OWLv2 image lazy loading providing neither image "
                "location nor image instance - this is invalid input. Contact Roboflow to get help.",
                help_url="https://todo",
            )
        self._image_in_memory = image_in_memory
        self._image_reference = image_reference
        self._image_hash = image_hash

    def as_numpy(self) -> np.ndarray:
        if self._image_in_memory is not None:
            if isinstance(self._image_in_memory, torch.Tensor):
                self._image_in_memory = self._image_in_memory.cpu().numpy()
            return self._image_in_memory
        image = load_image_reference(
            image_reference=self._image_reference,
            allow_url_input=self._allow_url_input,
            allow_non_https_url=self._allow_non_https_url,
            allow_url_without_fqdn=self._allow_url_without_fqdn,
            whitelisted_domains=self._whitelisted_domains,
            blacklisted_domains=self._blacklisted_domains,
            allow_local_storage_access=self._allow_local_storage_access,
        )
        self._image_in_memory = image
        return image

    def get_hash(self) -> str:
        if self._image_hash is not None:
            return self._image_hash
        if self._image_reference is not None:
            self._image_hash = hash_function(value=self._image_reference)
        else:
            self._image_hash = hash_function(value=self.as_numpy().tobytes())
        return self._image_hash

    def unload_image(self) -> None:
        if self._image_in_memory is not None and self._image_reference is not None:
            self._image_in_memory = None


def load_image_reference(
    image_reference: Union[str, bytes],
    allow_url_input: bool,
    allow_non_https_url: bool,
    allow_url_without_fqdn: bool,
    whitelisted_domains: Optional[List[str]],
    blacklisted_domains: Optional[List[str]],
    allow_local_storage_access: bool,
) -> np.ndarray:
    if isinstance(image_reference, bytes):
        return decode_image_from_bytes(image_bytes=image_reference)
    if is_url(reference=image_reference):
        return decode_image_from_url(
            url=image_reference,
            allow_url_input=allow_url_input,
            allow_non_https_url=allow_non_https_url,
            allow_url_without_fqdn=allow_url_without_fqdn,
            whitelisted_domains=whitelisted_domains,
            blacklisted_domains=blacklisted_domains,
        )
    if not allow_local_storage_access:
        return decode_image_from_base64(value=image_reference)
    elif os.path.isfile(image_reference):
        return cv2.imread(image_reference)
    else:
        return decode_image_from_base64(value=image_reference)


def decode_image_from_url(
    url: str,
    allow_url_input: bool,
    allow_non_https_url: bool,
    allow_url_without_fqdn: bool,
    whitelisted_domains: Optional[List[str]],
    blacklisted_domains: Optional[List[str]],
):
    if not allow_url_input:
        raise ModelInputError(
            message="Providing images via URL is not supported in this configuration of `inference-models`.",
            help_url="https://todo",
        )
    try:
        parsed_url = urllib.parse.urlparse(url)
    except ValueError as error:
        raise ModelInputError(
            message="Provided image URL is invalid.", help_url="https://todo"
        ) from error
    if parsed_url.scheme != "https" and not allow_non_https_url:
        raise ModelInputError(
            message="Providing images via non https:// URL is not supported in this configuration of `inference-models`.",
            help_url="https://todo",
        )
    domain_extraction_result = tldextract.TLDExtract(suffix_list_urls=())(
        parsed_url.netloc
    )  # we get rid of potential ports and parse FQDNs
    _ensure_resource_fqdn_allowed(
        fqdn=domain_extraction_result.fqdn,
        allow_url_without_fqdn=allow_url_without_fqdn,
    )
    address_parts_concatenated = _concatenate_chunks_of_network_location(
        extraction_result=domain_extraction_result
    )  # concatenation of chunks - even if there is no FQDN, but address
    # it allows white-/black-list verification
    _ensure_location_matches_destination_whitelist(
        destination=address_parts_concatenated,
        whitelisted_domains=whitelisted_domains,
    )
    _ensure_location_matches_destination_blacklist(
        destination=address_parts_concatenated,
        blacklisted_domains=blacklisted_domains,
    )
    image_content = _get_from_url(url=url)
    return decode_image_from_bytes(image_bytes=image_content)


def decode_image_from_base64(value: str) -> np.ndarray:
    try:
        value = BASE64_DATA_TYPE_PATTERN.sub("", value)
        decoded = pybase64.b64decode(value, validate=True)
        return decode_image_from_bytes(image_bytes=decoded)
    except Exception as error:
        value_prefix = value[:16]
        raise ModelInputError(
            message=f"Could not decode base64 image from reference {value_prefix}.",
            help_url="https://todo",
        ) from error


def decode_image_from_bytes(image_bytes: bytes) -> np.ndarray:
    byte_array = np.frombuffer(image_bytes, dtype=np.uint8)
    return cv2.imdecode(byte_array, cv2.IMREAD_COLOR)


def is_url(reference: str) -> bool:
    return reference.startswith("http://") or reference.startswith("https://")


def _ensure_resource_fqdn_allowed(fqdn: str, allow_url_without_fqdn: bool) -> None:
    if not fqdn and not allow_url_without_fqdn:
        raise ModelInputError(
            message="Providing images via URL without FQDN is not supported in this configuration of `inference-models`.",
            help_url="https://todo",
        )
    return None


def _concatenate_chunks_of_network_location(extraction_result: ExtractResult) -> str:
    chunks = [
        extraction_result.subdomain,
        extraction_result.domain,
        extraction_result.suffix,
    ]
    non_empty_chunks = [chunk for chunk in chunks if chunk]
    result = ".".join(non_empty_chunks)
    if result.startswith("[") and result.endswith("]"):
        # dropping brackets for IPv6
        return result[1:-1]
    return result


def _ensure_location_matches_destination_whitelist(
    destination: str, whitelisted_domains: Optional[List[str]]
) -> None:
    if whitelisted_domains is None:
        return None
    if destination not in whitelisted_domains:
        raise ModelInputError(
            message="It is not allowed to reach image URL - prohibited by whitelisted destinations.",
            help_url="https://todo",
        )
    return None


def _ensure_location_matches_destination_blacklist(
    destination: str,
    blacklisted_domains: Optional[List[str]],
) -> None:
    if blacklisted_domains is None:
        return None
    if destination in blacklisted_domains:
        raise ModelInputError(
            message="It is not allowed to reach image URL - prohibited by blacklisted destinations.",
            help_url="https://todo",
        )
    return None


@backoff.on_exception(
    backoff.constant,
    exception=RetryError,
    max_tries=API_CALLS_MAX_TRIES,
    interval=1,
)
def _get_from_url(url: str, timeout: int = 5) -> bytes:
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code in IDEMPOTENT_API_REQUEST_CODES_TO_RETRY:
                raise RetryError(
                    message=f"File hosting returned {response.status_code}",
                    help_url="https://todo",
                )
            response.raise_for_status()
            return response.content
    except (ConnectionError, Timeout, requests.exceptions.ConnectionError):
        raise RetryError(
            message="Connectivity error",
            help_url="https://todo",
        )


def compute_image_hash(image: Union[torch.Tensor, np.ndarray]) -> str:
    if isinstance(image, torch.Tensor):
        image = image.cpu().numpy()
    return hash_function(value=image.tobytes())


def hash_function(value: Union[str, bytes]) -> str:
    return hashlib.sha1(value).hexdigest()
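
A minimal, hypothetical usage sketch of the lazy-loading helpers above. The import path follows the file attribution inferred from the summary list, and the URL, allow-list, and image values are placeholders, not documented API usage:

import numpy as np

from inference_models.models.owlv2.reference_dataset import (
    LazyImageWrapper,
    compute_image_hash,
)

# Wrap a URL reference; bytes are only fetched and decoded when as_numpy() is called.
lazy_image = LazyImageWrapper.init(
    image="https://example.com/image.jpg",  # hypothetical URL
    allow_url_input=True,
    allow_non_https_url=False,
    allow_url_without_fqdn=False,
    whitelisted_domains=["example.com"],  # hypothetical allow-list
    blacklisted_domains=None,
    allow_local_storage_access=False,
)
image = lazy_image.as_numpy()  # validates the URL, downloads, decodes to a BGR array
lazy_image.unload_image()      # drops the decoded copy while keeping the reference

# In-memory arrays can be fingerprinted directly (SHA-1 of the raw bytes).
fingerprint = compute_image_hash(image=np.zeros((64, 64, 3), dtype=np.uint8))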

inference_models/models/paligemma/paligemma_hf.py
@@ -0,0 +1,209 @@
import os
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
from peft import PeftModel
from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    PaliGemmaForConditionalGeneration,
)

from inference_models.configuration import DEFAULT_DEVICE
from inference_models.entities import ColorFormat
from inference_models.models.common.roboflow.model_packages import (
    InferenceConfig,
    ResizeMode,
    parse_inference_config,
)
from inference_models.models.common.roboflow.pre_processing import (
    pre_process_network_input,
)


class PaliGemmaHF:

    @classmethod
    def from_pretrained(
        cls,
        model_name_or_path: str,
        device: torch.device = DEFAULT_DEVICE,
        trust_remote_code: bool = False,
        local_files_only: bool = True,
        quantization_config: Optional[BitsAndBytesConfig] = None,
        disable_quantization: bool = False,
        **kwargs,
    ) -> "PaliGemmaHF":
        torch_dtype = torch.float16 if device.type == "cuda" else torch.float32
        inference_config_path = os.path.join(
            model_name_or_path, "inference_config.json"
        )
        inference_config = None
        if os.path.exists(inference_config_path):
            inference_config = parse_inference_config(
                config_path=inference_config_path,
                allowed_resize_modes={
                    ResizeMode.STRETCH_TO,
                    ResizeMode.LETTERBOX,
                    ResizeMode.CENTER_CROP,
                    ResizeMode.LETTERBOX_REFLECT_EDGES,
                },
            )
        if (
            quantization_config is None
            and device.type == "cuda"
            and not disable_quantization
        ):
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        adapter_config_path = os.path.join(model_name_or_path, "adapter_config.json")
        if os.path.exists(adapter_config_path):
            base_model_path = os.path.join(model_name_or_path, "base")
            model = PaliGemmaForConditionalGeneration.from_pretrained(
                base_model_path,
                dtype=torch_dtype,
                trust_remote_code=trust_remote_code,
                local_files_only=local_files_only,
                quantization_config=quantization_config,
            )
            model = PeftModel.from_pretrained(model, model_name_or_path)
            if quantization_config is None:
                model.merge_and_unload()
                model.to(device)

            processor = AutoProcessor.from_pretrained(
                base_model_path,
                trust_remote_code=trust_remote_code,
                local_files_only=local_files_only,
                use_fast=True,
            )
        else:
            model = PaliGemmaForConditionalGeneration.from_pretrained(
                model_name_or_path,
                dtype=torch_dtype,
                device_map=device,
                trust_remote_code=trust_remote_code,
                local_files_only=local_files_only,
                quantization_config=quantization_config,
            ).eval()
            processor = AutoProcessor.from_pretrained(
                model_name_or_path,
                trust_remote_code=trust_remote_code,
                local_files_only=local_files_only,
                use_fast=True,
            )
        return cls(
            model=model,
            processor=processor,
            inference_config=inference_config,
            device=device,
            torch_dtype=torch_dtype,
        )

    def __init__(
        self,
        model: PaliGemmaForConditionalGeneration,
        processor: AutoProcessor,
        inference_config: Optional[InferenceConfig],
        device: torch.device,
        torch_dtype: torch.dtype,
    ):
        self._model = model
        self._processor = processor
        self._inference_config = inference_config
        self._device = device
        self._torch_dtype = torch_dtype

    def prompt(
        self,
        images: Union[torch.Tensor, List[torch.Tensor], np.ndarray, List[np.ndarray]],
        prompt: str,
        input_color_format: Optional[ColorFormat] = None,
        max_new_tokens: int = 400,
        do_sample: bool = False,
        skip_special_tokens: bool = True,
        **kwargs,
    ) -> List[str]:
        inputs = self.pre_process_generation(
            images=images, prompt=prompt, input_color_format=input_color_format
        )
        generated_ids = self.generate(
            inputs=inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
        )
        return self.post_process_generation(
            generated_ids=generated_ids,
            skip_special_tokens=skip_special_tokens,
        )

    def pre_process_generation(
        self,
        images: Union[torch.Tensor, List[torch.Tensor], np.ndarray, List[np.ndarray]],
        prompt: str,
        input_color_format: Optional[ColorFormat] = None,
        image_size: Optional[Tuple[int, int]] = None,
        **kwargs,
    ) -> dict:
        def _to_tensor(image: Union[np.ndarray, torch.Tensor]) -> torch.Tensor:
            is_numpy = isinstance(image, np.ndarray)
            if is_numpy:
                tensor_image = torch.from_numpy(image.copy()).permute(2, 0, 1)
            else:
                tensor_image = image
            if input_color_format == "bgr" or (is_numpy and input_color_format is None):
                tensor_image = tensor_image[[2, 1, 0], :, :]
            return tensor_image

        if self._inference_config is None:
            if isinstance(images, torch.Tensor) and images.ndim > 3:
                image_list = [_to_tensor(img) for img in images]
            elif not isinstance(images, list):
                image_list = [_to_tensor(images)]
            else:
                image_list = [_to_tensor(img) for img in images]
        else:
            images = pre_process_network_input(
                images=images,
                image_pre_processing=self._inference_config.image_pre_processing,
                network_input=self._inference_config.network_input,
                target_device=self._device,
                input_color_format=input_color_format,
                image_size_wh=image_size,
            )[0]
            image_list = [e[0] for e in torch.split(images, 1, dim=0)]
        num_images = len(image_list)

        if isinstance(prompt, str) and num_images > 1:
            prompt = [prompt] * num_images
        return self._processor(text=prompt, images=image_list, return_tensors="pt").to(
            self._device
        )

    def generate(
        self,
        inputs: dict,
        max_new_tokens: int = 400,
        do_sample: bool = False,
        **kwargs,
    ) -> torch.Tensor:
        with torch.inference_mode():
            generation = self._model.generate(
                **inputs, max_new_tokens=max_new_tokens, do_sample=do_sample
            )
        input_len = inputs["input_ids"].shape[-1]
        return generation[:, input_len:]

    def post_process_generation(
        self,
        generated_ids: torch.Tensor,
        skip_special_tokens: bool = False,
        **kwargs,
    ) -> List[str]:
        return self._processor.batch_decode(
            generated_ids, skip_special_tokens=skip_special_tokens
        )
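
A minimal, hypothetical usage sketch of PaliGemmaHF as defined above. The model package directory, image file, and prompt are placeholders; the import path follows the file attribution inferred from the summary list:

import cv2
import torch

from inference_models.models.paligemma.paligemma_hf import PaliGemmaHF

# Load a locally stored model package (hypothetical directory path).
model = PaliGemmaHF.from_pretrained(
    "/path/to/paligemma-model-package",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

# prompt() accepts numpy arrays (treated as BGR by default) or tensors
# and returns decoded strings, one per input image.
image = cv2.imread("example.jpg")  # hypothetical image file
answers = model.prompt(images=image, prompt="caption en", max_new_tokens=100)
print(answers[0])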

inference_models/models/perception_encoder/perception_encoder_pytorch.py
@@ -0,0 +1,197 @@
import json
from typing import Callable, List, Optional, Union

import numpy as np
import torch
import torchvision.transforms as T
from pydantic import BaseModel, ValidationError

import inference_models.models.perception_encoder.vision_encoder.pe as pe
import inference_models.models.perception_encoder.vision_encoder.transforms as transforms
from inference_models.configuration import DEFAULT_DEVICE
from inference_models.entities import ColorFormat
from inference_models.errors import CorruptedModelPackageError
from inference_models.models.base.embeddings import TextImageEmbeddingModel
from inference_models.models.common.model_packages import get_model_package_contents


class PerceptionEncoderConfig(BaseModel):
    vision_encoder_config: str


def load_config(config_path: str) -> PerceptionEncoderConfig:
    config_data = {}
    try:
        with open(config_path) as f:
            config_data = json.load(f)
    except (IOError, json.JSONDecodeError) as e:
        raise CorruptedModelPackageError(
            message=f"Could not load or parse perception encoder model package config file: {config_path}. Details: {e}",
            help_url="https://todo",
        ) from e
    try:
        config = PerceptionEncoderConfig.model_validate(config_data)
        return config
    except ValidationError as e:
        raise CorruptedModelPackageError(
            f"Failed to validate perception encoder model package config file: {config_path}. Details: {e}"
        ) from e


# based on original implementation using PIL images found in vision_encoder/transforms.py
# but adjusted to work directly on tensors
def create_image_resize_transform(
    image_size: int,
    center_crop: bool = False,
    interpolation: T.InterpolationMode = T.InterpolationMode.BILINEAR,
):
    if center_crop:
        crop = [
            T.Resize(image_size, interpolation=interpolation, antialias=True),
            T.CenterCrop(image_size),
        ]
    else:
        # "Squash": most versatile
        crop = [
            T.Resize(
                (image_size, image_size), interpolation=interpolation, antialias=True
            )
        ]
    return T.Compose(crop)


def create_image_normalize_transform():
    return T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5], inplace=True)


def create_preprocessor(image_size: int) -> Callable:
    resize_transform = create_image_resize_transform(image_size)
    normalize_transform = create_image_normalize_transform()

    def _preprocess(
        images: Union[torch.Tensor, List[torch.Tensor], np.ndarray, List[np.ndarray]],
        input_color_format: Optional[ColorFormat] = None,
    ) -> torch.Tensor:
        def _to_tensor(image: Union[np.ndarray, torch.Tensor]) -> torch.Tensor:
            is_numpy = isinstance(image, np.ndarray)
            if is_numpy:
                tensor_image = torch.from_numpy(image).permute(2, 0, 1)
            else:
                tensor_image = image

            # For numpy array inputs, we default to BGR -> RGB conversion for compatibility.
            # For tensor inputs, we only convert if BGR is explicitly specified, otherwise RGB is assumed.
            if input_color_format == "bgr" or (is_numpy and input_color_format is None):
                # BGR -> RGB
                tensor_image = tensor_image[[2, 1, 0], :, :]

            return tensor_image

        if isinstance(images, list):
            # Resize each image individually, then stack to a batch
            resized_images = [resize_transform(_to_tensor(img)) for img in images]
            tensor_batch = torch.stack(resized_images, dim=0)
        else:
            # Handle single image or pre-batched tensor
            tensor_batch = resize_transform(_to_tensor(images))

        # Ensure there is a batch dimension for single images
        if tensor_batch.ndim == 3:
            tensor_batch = tensor_batch.unsqueeze(0)

        # Perform dtype conversion and normalization on the whole batch for efficiency
        if tensor_batch.dtype == torch.uint8:
            tensor_batch = tensor_batch.to(torch.float32) / 255.0

        transformed_batch = normalize_transform(tensor_batch)
        return transformed_batch

    return _preprocess


class PerceptionEncoderTorch(TextImageEmbeddingModel):
    def __init__(
        self,
        model: pe.CLIP,
        device: torch.device,
    ):
        self.model = model
        self.device = device
        self.preprocessor = create_preprocessor(model.image_size)
        self.tokenizer = transforms.get_text_tokenizer(model.context_length)

    @classmethod
    def from_pretrained(
        cls, model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, **kwargs
    ) -> "PerceptionEncoderTorch":
        # here model name came from path before, which maybe doesn't match directly with how our registry works
        # instead should this be adapted to read config file that is served as part of model package?
        # model_config = model_name_or_path.split("/")[-1]
        # checkpoint_path = os.path.join(model_name_or_path, "model.pt")

        model_package_content = get_model_package_contents(
            model_package_dir=model_name_or_path,
            elements=["config.json", "model.pt"],
        )

        model_config_file = model_package_content["config.json"]
        model_weights_file = model_package_content["model.pt"]
        config = load_config(model_config_file)

        model = pe.CLIP.from_config(
            config.vision_encoder_config,
            pretrained=True,
            checkpoint_path=model_weights_file,
        )
        model = model.to(device)
        model.eval()

        return cls(
            model=model,
            device=device,
        )

    def embed_images(
        self,
        images: Union[torch.Tensor, List[torch.Tensor], np.ndarray, List[np.ndarray]],
        input_color_format: Optional[ColorFormat] = None,
        **kwargs,
    ) -> torch.Tensor:
        img_in = self.preprocessor(images, input_color_format=input_color_format).to(
            self.device
        )

        if self.device.type == "cpu" or self.device.type == "mps":
            with torch.inference_mode():
                image_features, _, _ = self.model(img_in, None)
                embeddings = image_features.float()
        else:
            with torch.inference_mode(), torch.autocast(self.device.type):
                image_features, _, _ = self.model(img_in, None)
                embeddings = image_features.float()

        return embeddings

    def embed_text(
        self,
        texts: Union[str, List[str]],
        **kwargs,
    ) -> torch.Tensor:
        if isinstance(texts, list):
            texts_to_embed = texts
        else:
            texts_to_embed = [texts]

        # results = []
        # The original implementation had batching here based on CLIP_MAX_BATCH_SIZE, but not entirely sure how to handle that with Tensor output
        # I will leave it out for now, see https://github.com/roboflow/inference/blob/main/inference/models/perception_encoder/perception_encoder.py#L227
        tokenized = self.tokenizer(texts_to_embed).to(self.device)
        if self.device.type == "cpu" or self.device.type == "mps":
            with torch.no_grad():
                _, text_features, _ = self.model(None, tokenized)
        else:
            with torch.inference_mode(), torch.autocast(self.device.type):
                _, text_features, _ = self.model(None, tokenized)

        embeddings = text_features.float()
        return embeddings
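
A minimal, hypothetical usage sketch of the embedding model above. The package directory, image file, and text prompts are placeholders; the import path follows the file attribution inferred from the summary list:

import cv2
import torch

from inference_models.models.perception_encoder.perception_encoder_pytorch import (
    PerceptionEncoderTorch,
)

# Load weights from a local model package directory (hypothetical path).
model = PerceptionEncoderTorch.from_pretrained("/path/to/perception-encoder-package")

image = cv2.imread("example.jpg")  # numpy arrays are treated as BGR by default
image_embeddings = model.embed_images(image)  # shape: (1, embedding_dim)
text_embeddings = model.embed_text(["a photo of a cat", "a photo of a dog"])

# Cosine similarity between the image embedding and each text embedding.
similarity = torch.nn.functional.cosine_similarity(image_embeddings, text_embeddings)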