inference_models-0.18.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. inference_models/__init__.py +36 -0
  2. inference_models/configuration.py +72 -0
  3. inference_models/constants.py +2 -0
  4. inference_models/entities.py +5 -0
  5. inference_models/errors.py +137 -0
  6. inference_models/logger.py +52 -0
  7. inference_models/model_pipelines/__init__.py +0 -0
  8. inference_models/model_pipelines/auto_loaders/__init__.py +0 -0
  9. inference_models/model_pipelines/auto_loaders/core.py +120 -0
  10. inference_models/model_pipelines/auto_loaders/pipelines_registry.py +36 -0
  11. inference_models/model_pipelines/face_and_gaze_detection/__init__.py +0 -0
  12. inference_models/model_pipelines/face_and_gaze_detection/mediapipe_l2cs.py +200 -0
  13. inference_models/models/__init__.py +0 -0
  14. inference_models/models/auto_loaders/__init__.py +0 -0
  15. inference_models/models/auto_loaders/access_manager.py +168 -0
  16. inference_models/models/auto_loaders/auto_negotiation.py +1329 -0
  17. inference_models/models/auto_loaders/auto_resolution_cache.py +129 -0
  18. inference_models/models/auto_loaders/constants.py +7 -0
  19. inference_models/models/auto_loaders/core.py +1341 -0
  20. inference_models/models/auto_loaders/dependency_models.py +52 -0
  21. inference_models/models/auto_loaders/entities.py +57 -0
  22. inference_models/models/auto_loaders/models_registry.py +497 -0
  23. inference_models/models/auto_loaders/presentation_utils.py +333 -0
  24. inference_models/models/auto_loaders/ranking.py +413 -0
  25. inference_models/models/auto_loaders/utils.py +31 -0
  26. inference_models/models/base/__init__.py +0 -0
  27. inference_models/models/base/classification.py +123 -0
  28. inference_models/models/base/depth_estimation.py +62 -0
  29. inference_models/models/base/documents_parsing.py +111 -0
  30. inference_models/models/base/embeddings.py +66 -0
  31. inference_models/models/base/instance_segmentation.py +87 -0
  32. inference_models/models/base/keypoints_detection.py +93 -0
  33. inference_models/models/base/object_detection.py +143 -0
  34. inference_models/models/base/semantic_segmentation.py +74 -0
  35. inference_models/models/base/types.py +5 -0
  36. inference_models/models/clip/__init__.py +0 -0
  37. inference_models/models/clip/clip_onnx.py +148 -0
  38. inference_models/models/clip/clip_pytorch.py +104 -0
  39. inference_models/models/clip/preprocessing.py +162 -0
  40. inference_models/models/common/__init__.py +0 -0
  41. inference_models/models/common/cuda.py +30 -0
  42. inference_models/models/common/model_packages.py +25 -0
  43. inference_models/models/common/onnx.py +379 -0
  44. inference_models/models/common/roboflow/__init__.py +0 -0
  45. inference_models/models/common/roboflow/model_packages.py +361 -0
  46. inference_models/models/common/roboflow/post_processing.py +436 -0
  47. inference_models/models/common/roboflow/pre_processing.py +1332 -0
  48. inference_models/models/common/torch.py +20 -0
  49. inference_models/models/common/trt.py +266 -0
  50. inference_models/models/deep_lab_v3_plus/__init__.py +0 -0
  51. inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_onnx.py +282 -0
  52. inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_torch.py +264 -0
  53. inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py +313 -0
  54. inference_models/models/depth_anything_v2/__init__.py +0 -0
  55. inference_models/models/depth_anything_v2/depth_anything_v2_hf.py +77 -0
  56. inference_models/models/dinov3/__init__.py +0 -0
  57. inference_models/models/dinov3/dinov3_classification_onnx.py +348 -0
  58. inference_models/models/dinov3/dinov3_classification_torch.py +323 -0
  59. inference_models/models/doctr/__init__.py +0 -0
  60. inference_models/models/doctr/doctr_torch.py +304 -0
  61. inference_models/models/easy_ocr/__init__.py +0 -0
  62. inference_models/models/easy_ocr/easy_ocr_torch.py +222 -0
  63. inference_models/models/florence2/__init__.py +0 -0
  64. inference_models/models/florence2/florence2_hf.py +897 -0
  65. inference_models/models/grounding_dino/__init__.py +0 -0
  66. inference_models/models/grounding_dino/grounding_dino_torch.py +227 -0
  67. inference_models/models/l2cs/__init__.py +0 -0
  68. inference_models/models/l2cs/l2cs_onnx.py +216 -0
  69. inference_models/models/mediapipe_face_detection/__init__.py +0 -0
  70. inference_models/models/mediapipe_face_detection/face_detection.py +203 -0
  71. inference_models/models/moondream2/__init__.py +0 -0
  72. inference_models/models/moondream2/moondream2_hf.py +281 -0
  73. inference_models/models/owlv2/__init__.py +0 -0
  74. inference_models/models/owlv2/cache.py +182 -0
  75. inference_models/models/owlv2/entities.py +112 -0
  76. inference_models/models/owlv2/owlv2_hf.py +695 -0
  77. inference_models/models/owlv2/reference_dataset.py +291 -0
  78. inference_models/models/paligemma/__init__.py +0 -0
  79. inference_models/models/paligemma/paligemma_hf.py +209 -0
  80. inference_models/models/perception_encoder/__init__.py +0 -0
  81. inference_models/models/perception_encoder/perception_encoder_pytorch.py +197 -0
  82. inference_models/models/perception_encoder/vision_encoder/__init__.py +0 -0
  83. inference_models/models/perception_encoder/vision_encoder/config.py +160 -0
  84. inference_models/models/perception_encoder/vision_encoder/pe.py +742 -0
  85. inference_models/models/perception_encoder/vision_encoder/rope.py +344 -0
  86. inference_models/models/perception_encoder/vision_encoder/tokenizer.py +342 -0
  87. inference_models/models/perception_encoder/vision_encoder/transforms.py +33 -0
  88. inference_models/models/qwen25vl/__init__.py +1 -0
  89. inference_models/models/qwen25vl/qwen25vl_hf.py +285 -0
  90. inference_models/models/resnet/__init__.py +0 -0
  91. inference_models/models/resnet/resnet_classification_onnx.py +330 -0
  92. inference_models/models/resnet/resnet_classification_torch.py +305 -0
  93. inference_models/models/resnet/resnet_classification_trt.py +369 -0
  94. inference_models/models/rfdetr/__init__.py +0 -0
  95. inference_models/models/rfdetr/backbone_builder.py +101 -0
  96. inference_models/models/rfdetr/class_remapping.py +41 -0
  97. inference_models/models/rfdetr/common.py +115 -0
  98. inference_models/models/rfdetr/default_labels.py +108 -0
  99. inference_models/models/rfdetr/dinov2_with_windowed_attn.py +1330 -0
  100. inference_models/models/rfdetr/misc.py +26 -0
  101. inference_models/models/rfdetr/ms_deform_attn.py +180 -0
  102. inference_models/models/rfdetr/ms_deform_attn_func.py +60 -0
  103. inference_models/models/rfdetr/position_encoding.py +166 -0
  104. inference_models/models/rfdetr/post_processor.py +83 -0
  105. inference_models/models/rfdetr/projector.py +373 -0
  106. inference_models/models/rfdetr/rfdetr_backbone_pytorch.py +394 -0
  107. inference_models/models/rfdetr/rfdetr_base_pytorch.py +807 -0
  108. inference_models/models/rfdetr/rfdetr_instance_segmentation_onnx.py +206 -0
  109. inference_models/models/rfdetr/rfdetr_instance_segmentation_pytorch.py +373 -0
  110. inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +227 -0
  111. inference_models/models/rfdetr/rfdetr_object_detection_onnx.py +244 -0
  112. inference_models/models/rfdetr/rfdetr_object_detection_pytorch.py +470 -0
  113. inference_models/models/rfdetr/rfdetr_object_detection_trt.py +270 -0
  114. inference_models/models/rfdetr/segmentation_head.py +273 -0
  115. inference_models/models/rfdetr/transformer.py +767 -0
  116. inference_models/models/roboflow_instant/__init__.py +0 -0
  117. inference_models/models/roboflow_instant/roboflow_instant_hf.py +141 -0
  118. inference_models/models/sam/__init__.py +0 -0
  119. inference_models/models/sam/cache.py +147 -0
  120. inference_models/models/sam/entities.py +25 -0
  121. inference_models/models/sam/sam_torch.py +675 -0
  122. inference_models/models/sam2/__init__.py +0 -0
  123. inference_models/models/sam2/cache.py +162 -0
  124. inference_models/models/sam2/entities.py +43 -0
  125. inference_models/models/sam2/sam2_torch.py +905 -0
  126. inference_models/models/sam2_rt/__init__.py +0 -0
  127. inference_models/models/sam2_rt/sam2_pytorch.py +119 -0
  128. inference_models/models/smolvlm/__init__.py +0 -0
  129. inference_models/models/smolvlm/smolvlm_hf.py +245 -0
  130. inference_models/models/trocr/__init__.py +0 -0
  131. inference_models/models/trocr/trocr_hf.py +53 -0
  132. inference_models/models/vit/__init__.py +0 -0
  133. inference_models/models/vit/vit_classification_huggingface.py +319 -0
  134. inference_models/models/vit/vit_classification_onnx.py +326 -0
  135. inference_models/models/vit/vit_classification_trt.py +365 -0
  136. inference_models/models/yolact/__init__.py +1 -0
  137. inference_models/models/yolact/yolact_instance_segmentation_onnx.py +336 -0
  138. inference_models/models/yolact/yolact_instance_segmentation_trt.py +361 -0
  139. inference_models/models/yolo_world/__init__.py +1 -0
  140. inference_models/models/yolonas/__init__.py +0 -0
  141. inference_models/models/yolonas/nms.py +44 -0
  142. inference_models/models/yolonas/yolonas_object_detection_onnx.py +204 -0
  143. inference_models/models/yolonas/yolonas_object_detection_trt.py +230 -0
  144. inference_models/models/yolov10/__init__.py +0 -0
  145. inference_models/models/yolov10/yolov10_object_detection_onnx.py +187 -0
  146. inference_models/models/yolov10/yolov10_object_detection_trt.py +215 -0
  147. inference_models/models/yolov11/__init__.py +0 -0
  148. inference_models/models/yolov11/yolov11_onnx.py +28 -0
  149. inference_models/models/yolov11/yolov11_torch_script.py +25 -0
  150. inference_models/models/yolov11/yolov11_trt.py +21 -0
  151. inference_models/models/yolov12/__init__.py +0 -0
  152. inference_models/models/yolov12/yolov12_onnx.py +7 -0
  153. inference_models/models/yolov12/yolov12_torch_script.py +7 -0
  154. inference_models/models/yolov12/yolov12_trt.py +7 -0
  155. inference_models/models/yolov5/__init__.py +0 -0
  156. inference_models/models/yolov5/nms.py +99 -0
  157. inference_models/models/yolov5/yolov5_instance_segmentation_onnx.py +225 -0
  158. inference_models/models/yolov5/yolov5_instance_segmentation_trt.py +255 -0
  159. inference_models/models/yolov5/yolov5_object_detection_onnx.py +192 -0
  160. inference_models/models/yolov5/yolov5_object_detection_trt.py +218 -0
  161. inference_models/models/yolov7/__init__.py +0 -0
  162. inference_models/models/yolov7/yolov7_instance_segmentation_onnx.py +226 -0
  163. inference_models/models/yolov7/yolov7_instance_segmentation_trt.py +253 -0
  164. inference_models/models/yolov8/__init__.py +0 -0
  165. inference_models/models/yolov8/yolov8_classification_onnx.py +181 -0
  166. inference_models/models/yolov8/yolov8_instance_segmentation_onnx.py +239 -0
  167. inference_models/models/yolov8/yolov8_instance_segmentation_torch_script.py +201 -0
  168. inference_models/models/yolov8/yolov8_instance_segmentation_trt.py +268 -0
  169. inference_models/models/yolov8/yolov8_key_points_detection_onnx.py +263 -0
  170. inference_models/models/yolov8/yolov8_key_points_detection_torch_script.py +218 -0
  171. inference_models/models/yolov8/yolov8_key_points_detection_trt.py +287 -0
  172. inference_models/models/yolov8/yolov8_object_detection_onnx.py +213 -0
  173. inference_models/models/yolov8/yolov8_object_detection_torch_script.py +166 -0
  174. inference_models/models/yolov8/yolov8_object_detection_trt.py +231 -0
  175. inference_models/models/yolov9/__init__.py +0 -0
  176. inference_models/models/yolov9/yolov9_onnx.py +7 -0
  177. inference_models/models/yolov9/yolov9_torch_script.py +7 -0
  178. inference_models/models/yolov9/yolov9_trt.py +7 -0
  179. inference_models/runtime_introspection/__init__.py +0 -0
  180. inference_models/runtime_introspection/core.py +410 -0
  181. inference_models/utils/__init__.py +0 -0
  182. inference_models/utils/download.py +608 -0
  183. inference_models/utils/environment.py +28 -0
  184. inference_models/utils/file_system.py +51 -0
  185. inference_models/utils/hashing.py +7 -0
  186. inference_models/utils/imports.py +48 -0
  187. inference_models/utils/onnx_introspection.py +17 -0
  188. inference_models/weights_providers/__init__.py +0 -0
  189. inference_models/weights_providers/core.py +20 -0
  190. inference_models/weights_providers/entities.py +159 -0
  191. inference_models/weights_providers/roboflow.py +601 -0
  192. inference_models-0.18.3.dist-info/METADATA +466 -0
  193. inference_models-0.18.3.dist-info/RECORD +195 -0
  194. inference_models-0.18.3.dist-info/WHEEL +5 -0
  195. inference_models-0.18.3.dist-info/top_level.txt +1 -0
inference_models/models/rfdetr/rfdetr_base_pytorch.py
@@ -0,0 +1,807 @@
+ import argparse
+ import copy
+ import math
+ from typing import Callable, List, Literal, Optional, Union
+
+ import torch
+ import torch.nn.functional as F
+ import torchvision
+ from pydantic import BaseModel, ConfigDict
+ from torch import Tensor, nn
+
+ from inference_models.models.rfdetr.backbone_builder import build_backbone
+ from inference_models.models.rfdetr.misc import NestedTensor
+ from inference_models.models.rfdetr.segmentation_head import SegmentationHead
+ from inference_models.models.rfdetr.transformer import build_transformer
+
+
+ class ModelConfig(BaseModel):
+     encoder: Literal["dinov2_windowed_small", "dinov2_windowed_base"]
+     out_feature_indexes: List[int]
+     dec_layers: int
+     two_stage: bool = True
+     projector_scale: List[Literal["P3", "P4", "P5"]]
+     hidden_dim: int
+     patch_size: int
+     num_windows: int
+     sa_nheads: int
+     ca_nheads: int
+     dec_n_points: int
+     bbox_reparam: bool = True
+     lite_refpoint_refine: bool = True
+     layer_norm: bool = True
+     amp: bool = True
+     num_classes: int = 90
+     pretrain_weights: Optional[str] = None
+     device: torch.device
+     resolution: int
+     group_detr: int = 13
+     gradient_checkpointing: bool = False
+     positional_encoding_size: int
+     ia_bce_loss: bool = True
+     cls_loss_coef: float = 1.0
+     segmentation_head: bool = False
+     mask_downsample_ratio: int = 4
+
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+ class RFDETRBaseConfig(ModelConfig):
+     encoder: Literal["dinov2_windowed_small", "dinov2_windowed_base"] = (
+         "dinov2_windowed_small"
+     )
+     hidden_dim: int = 256
+     patch_size: int = 14
+     num_windows: int = 4
+     dec_layers: int = 3
+     sa_nheads: int = 8
+     ca_nheads: int = 16
+     dec_n_points: int = 2
+     num_queries: int = 300
+     num_select: int = 300
+     projector_scale: List[Literal["P3", "P4", "P5"]] = ["P4"]
+     out_feature_indexes: List[int] = [2, 5, 8, 11]
+     pretrain_weights: Optional[str] = "rf-detr-base.pth"
+     resolution: int = 560
+     positional_encoding_size: int = 37
+
+
+ class RFDETRLargeConfig(RFDETRBaseConfig):
+     encoder: Literal["dinov2_windowed_small", "dinov2_windowed_base"] = (
+         "dinov2_windowed_base"
+     )
+     hidden_dim: int = 384
+     sa_nheads: int = 12
+     ca_nheads: int = 24
+     dec_n_points: int = 4
+     projector_scale: List[Literal["P3", "P4", "P5"]] = ["P3", "P5"]
+     pretrain_weights: Optional[str] = "rf-detr-large.pth"
+
+
+ class RFDETRNanoConfig(RFDETRBaseConfig):
+     out_feature_indexes: List[int] = [3, 6, 9, 12]
+     num_windows: int = 2
+     dec_layers: int = 2
+     patch_size: int = 16
+     resolution: int = 384
+     positional_encoding_size: int = 24
+     pretrain_weights: Optional[str] = "rf-detr-nano.pth"
+
+
+ class RFDETRSmallConfig(RFDETRBaseConfig):
+     out_feature_indexes: List[int] = [3, 6, 9, 12]
+     num_windows: int = 2
+     dec_layers: int = 3
+     patch_size: int = 16
+     resolution: int = 512
+     positional_encoding_size: int = 32
+     pretrain_weights: Optional[str] = "rf-detr-small.pth"
+
+
+ class RFDETRMediumConfig(RFDETRBaseConfig):
+     out_feature_indexes: List[int] = [3, 6, 9, 12]
+     num_windows: int = 2
+     dec_layers: int = 4
+     patch_size: int = 16
+     resolution: int = 576
+     positional_encoding_size: int = 36
+     pretrain_weights: Optional[str] = "rf-detr-medium.pth"
+
+
+ class RFDETRSegPreviewConfig(RFDETRBaseConfig):
+     segmentation_head: bool = True
+     out_feature_indexes: List[int] = [3, 6, 9, 12]
+     num_windows: int = 2
+     dec_layers: int = 4
+     patch_size: int = 12
+     resolution: int = 432
+     positional_encoding_size: int = 36
+     num_queries: int = 200
+     num_select: int = 200
+     pretrain_weights: Optional[str] = "rf-detr-seg-preview.pt"
+     num_classes: int = 90
+
+
+ class LWDETR(nn.Module):
+     """This is the Group DETR v3 module that performs object detection"""
+
+     def __init__(
+         self,
+         backbone,
+         transformer,
+         segmentation_head,
+         num_classes,
+         num_queries,
+         aux_loss=False,
+         group_detr=1,
+         two_stage=False,
+         lite_refpoint_refine=False,
+         bbox_reparam=False,
+     ):
+         """Initializes the model.
+         Parameters:
+             backbone: torch module of the backbone to be used. See backbone.py
+             transformer: torch module of the transformer architecture. See transformer.py
+             num_classes: number of object classes
+             num_queries: number of object queries, i.e. detection slots. This is the maximal number of objects
+                          Conditional DETR can detect in a single image. For COCO, we recommend 100 queries.
+             aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
+             group_detr: Number of groups to speed detr training. Default is 1.
+             lite_refpoint_refine: TODO
+         """
+         super().__init__()
+         self.num_queries = num_queries
+         self.transformer = transformer
+         hidden_dim = transformer.d_model
+         self.class_embed = nn.Linear(hidden_dim, num_classes)
+         self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
+         self.segmentation_head = segmentation_head
+         query_dim = 4
+         self.refpoint_embed = nn.Embedding(num_queries * group_detr, query_dim)
+         self.query_feat = nn.Embedding(num_queries * group_detr, hidden_dim)
+         nn.init.constant_(self.refpoint_embed.weight.data, 0)
+
+         self.backbone = backbone
+         self.aux_loss = aux_loss
+         self.group_detr = group_detr
+
+         # iter update
+         self.lite_refpoint_refine = lite_refpoint_refine
+         if not self.lite_refpoint_refine:
+             self.transformer.decoder.bbox_embed = self.bbox_embed
+         else:
+             self.transformer.decoder.bbox_embed = None
+
+         self.bbox_reparam = bbox_reparam
+
+         # init prior_prob setting for focal loss
+         prior_prob = 0.01
+         bias_value = -math.log((1 - prior_prob) / prior_prob)
+         self.class_embed.bias.data = torch.ones(num_classes) * bias_value
+
+         # init bbox_embed
+         nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
+         nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)
+
+         # two_stage
+         self.two_stage = two_stage
+         if self.two_stage:
+             self.transformer.enc_out_bbox_embed = nn.ModuleList(
+                 [copy.deepcopy(self.bbox_embed) for _ in range(group_detr)]
+             )
+             self.transformer.enc_out_class_embed = nn.ModuleList(
+                 [copy.deepcopy(self.class_embed) for _ in range(group_detr)]
+             )
+
+         self._export = False
+
+     def reinitialize_detection_head(self, num_classes):
+         base = self.class_embed.weight.shape[0]
+         num_repeats = int(math.ceil(num_classes / base))
+         self.class_embed.weight.data = self.class_embed.weight.data.repeat(
+             num_repeats, 1
+         )
+         self.class_embed.weight.data = self.class_embed.weight.data[:num_classes]
+         self.class_embed.bias.data = self.class_embed.bias.data.repeat(num_repeats)
+         self.class_embed.bias.data = self.class_embed.bias.data[:num_classes]
+
+         if self.two_stage:
+             for enc_out_class_embed in self.transformer.enc_out_class_embed:
+                 enc_out_class_embed.weight.data = (
+                     enc_out_class_embed.weight.data.repeat(num_repeats, 1)
+                 )
+                 enc_out_class_embed.weight.data = enc_out_class_embed.weight.data[
+                     :num_classes
+                 ]
+                 enc_out_class_embed.bias.data = enc_out_class_embed.bias.data.repeat(
+                     num_repeats
+                 )
+                 enc_out_class_embed.bias.data = enc_out_class_embed.bias.data[
+                     :num_classes
+                 ]
+
+     def export(self):
+         self._export = True
+         self._forward_origin = self.forward
+         self.forward = self.forward_export
+         for name, m in self.named_modules():
+             if (
+                 hasattr(m, "export")
+                 and isinstance(m.export, Callable)
+                 and hasattr(m, "_export")
+                 and not m._export
+             ):
+                 m.export()
+
+     def forward(self, samples: NestedTensor, targets=None):
+         """The forward expects a NestedTensor, which consists of:
+             - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
+             - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels
+
+         It returns a dict with the following elements:
+             - "pred_logits": the classification logits (including no-object) for all queries.
+                              Shape = [batch_size x num_queries x num_classes]
+             - "pred_boxes": The normalized box coordinates for all queries, represented as
+                             (center_x, center_y, width, height). These values are normalized in [0, 1],
+                             relative to the size of each individual image (disregarding possible padding).
+                             See PostProcess for information on how to retrieve the unnormalized bounding box.
+             - "aux_outputs": Optional, only returned when auxiliary losses are activated. It is a list of
+                              dictionaries containing the two above keys for each decoder layer.
+         """
+         if isinstance(samples, (list, torch.Tensor)):
+             samples = nested_tensor_from_tensor_list(samples)
+         features, poss = self.backbone(samples)
+
+         srcs = []
+         masks = []
+         for l, feat in enumerate(features):
+             src, mask = feat.decompose()
+             srcs.append(src)
+             masks.append(mask)
+             assert mask is not None
+
+         if self.training:
+             refpoint_embed_weight = self.refpoint_embed.weight
+             query_feat_weight = self.query_feat.weight
+         else:
+             # only use one group in inference
+             refpoint_embed_weight = self.refpoint_embed.weight[: self.num_queries]
+             query_feat_weight = self.query_feat.weight[: self.num_queries]
+
+         hs, ref_unsigmoid, hs_enc, ref_enc = self.transformer(
+             srcs, masks, poss, refpoint_embed_weight, query_feat_weight
+         )
+
+         if hs is not None:
+             if self.bbox_reparam:
+                 outputs_coord_delta = self.bbox_embed(hs)
+                 outputs_coord_cxcy = (
+                     outputs_coord_delta[..., :2] * ref_unsigmoid[..., 2:]
+                     + ref_unsigmoid[..., :2]
+                 )
+                 outputs_coord_wh = (
+                     outputs_coord_delta[..., 2:].exp() * ref_unsigmoid[..., 2:]
+                 )
+                 outputs_coord = torch.concat(
+                     [outputs_coord_cxcy, outputs_coord_wh], dim=-1
+                 )
+             else:
+                 outputs_coord = (self.bbox_embed(hs) + ref_unsigmoid).sigmoid()
+
+             outputs_class = self.class_embed(hs)
+
+             if self.segmentation_head is not None:
+                 outputs_masks = self.segmentation_head(
+                     features[0].tensors, hs, samples.tensors.shape[-2:]
+                 )
+
+             out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]}
+             if self.segmentation_head is not None:
+                 out["pred_masks"] = outputs_masks[-1]
+             if self.aux_loss:
+                 out["aux_outputs"] = self._set_aux_loss(
+                     outputs_class,
+                     outputs_coord,
+                     outputs_masks if self.segmentation_head is not None else None,
+                 )
+
+         if self.two_stage:
+             group_detr = self.group_detr if self.training else 1
+             hs_enc_list = hs_enc.chunk(group_detr, dim=1)
+             cls_enc = []
+             for g_idx in range(group_detr):
+                 cls_enc_gidx = self.transformer.enc_out_class_embed[g_idx](
+                     hs_enc_list[g_idx]
+                 )
+                 cls_enc.append(cls_enc_gidx)
+
+             cls_enc = torch.cat(cls_enc, dim=1)
+
+             if self.segmentation_head is not None:
+                 masks_enc = self.segmentation_head(
+                     features[0].tensors,
+                     [
+                         hs_enc,
+                     ],
+                     samples.tensors.shape[-2:],
+                     skip_blocks=True,
+                 )
+                 masks_enc = torch.cat(masks_enc, dim=1)
+
+             if hs is not None:
+                 out["enc_outputs"] = {"pred_logits": cls_enc, "pred_boxes": ref_enc}
+                 if self.segmentation_head is not None:
+                     out["enc_outputs"]["pred_masks"] = masks_enc
+             else:
+                 out = {"pred_logits": cls_enc, "pred_boxes": ref_enc}
+                 if self.segmentation_head is not None:
+                     out["pred_masks"] = masks_enc
+
+         return out
+
+     def forward_export(self, tensors):
+         srcs, _, poss = self.backbone(tensors)
+         # only use one group in inference
+         refpoint_embed_weight = self.refpoint_embed.weight[: self.num_queries]
+         query_feat_weight = self.query_feat.weight[: self.num_queries]
+
+         hs, ref_unsigmoid, hs_enc, ref_enc = self.transformer(
+             srcs, None, poss, refpoint_embed_weight, query_feat_weight
+         )
+
+         outputs_masks = None
+
+         if hs is not None:
+             if self.bbox_reparam:
+                 outputs_coord_delta = self.bbox_embed(hs)
+                 outputs_coord_cxcy = (
+                     outputs_coord_delta[..., :2] * ref_unsigmoid[..., 2:]
+                     + ref_unsigmoid[..., :2]
+                 )
+                 outputs_coord_wh = (
+                     outputs_coord_delta[..., 2:].exp() * ref_unsigmoid[..., 2:]
+                 )
+                 outputs_coord = torch.concat(
+                     [outputs_coord_cxcy, outputs_coord_wh], dim=-1
+                 )
+             else:
+                 outputs_coord = (self.bbox_embed(hs) + ref_unsigmoid).sigmoid()
+             outputs_class = self.class_embed(hs)
+             if self.segmentation_head is not None:
+                 outputs_masks = self.segmentation_head(
+                     srcs[0],
+                     [
+                         hs,
+                     ],
+                     tensors.shape[-2:],
+                 )[0]
+         else:
+             assert self.two_stage, "if not using decoder, two_stage must be True"
+             outputs_class = self.transformer.enc_out_class_embed[0](hs_enc)
+             outputs_coord = ref_enc
+             if self.segmentation_head is not None:
+                 outputs_masks = self.segmentation_head(
+                     srcs[0],
+                     [
+                         hs_enc,
+                     ],
+                     tensors.shape[-2:],
+                     skip_blocks=True,
+                 )[0]
+
+         if outputs_masks is not None:
+             return outputs_coord, outputs_class, outputs_masks
+         else:
+             return outputs_coord, outputs_class
+
+     @torch.jit.unused
+     def _set_aux_loss(self, outputs_class, outputs_coord, outputs_masks):
+         # this is a workaround to make torchscript happy, as torchscript
+         # doesn't support dictionary with non-homogeneous values, such
+         # as a dict having both a Tensor and a list.
+         if outputs_masks is not None:
+             return [
+                 {"pred_logits": a, "pred_boxes": b, "pred_masks": c}
+                 for a, b, c in zip(
+                     outputs_class[:-1], outputs_coord[:-1], outputs_masks[:-1]
+                 )
+             ]
+         else:
+             return [
+                 {"pred_logits": a, "pred_boxes": b}
+                 for a, b in zip(outputs_class[:-1], outputs_coord[:-1])
+             ]
+
+     def update_drop_path(self, drop_path_rate, vit_encoder_num_layers):
+         """ """
+         dp_rates = [
+             x.item() for x in torch.linspace(0, drop_path_rate, vit_encoder_num_layers)
+         ]
+         for i in range(vit_encoder_num_layers):
+             if hasattr(self.backbone[0].encoder, "blocks"):  # Not aimv2
+                 if hasattr(self.backbone[0].encoder.blocks[i].drop_path, "drop_prob"):
+                     self.backbone[0].encoder.blocks[i].drop_path.drop_prob = dp_rates[i]
+             else:  # aimv2
+                 if hasattr(
+                     self.backbone[0].encoder.trunk.blocks[i].drop_path, "drop_prob"
+                 ):
+                     self.backbone[0].encoder.trunk.blocks[i].drop_path.drop_prob = (
+                         dp_rates[i]
+                     )
+
+     def update_dropout(self, drop_rate):
+         for module in self.transformer.modules():
+             if isinstance(module, nn.Dropout):
+                 module.p = drop_rate
+
+
+ class MLP(nn.Module):
+     """Very simple multi-layer perceptron (also called FFN)"""
+
+     def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+         super().__init__()
+         self.num_layers = num_layers
+         h = [hidden_dim] * (num_layers - 1)
+         self.layers = nn.ModuleList(
+             nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
+         )
+
+     def forward(self, x):
+         for i, layer in enumerate(self.layers):
+             x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+         return x
+
+
+ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
+     # TODO make this more general
+     if tensor_list[0].ndim == 3:
+         if torchvision._is_tracing():
+             # nested_tensor_from_tensor_list() does not export well to ONNX
+             # call _onnx_nested_tensor_from_tensor_list() instead
+             return _onnx_nested_tensor_from_tensor_list(tensor_list)
+
+         # TODO make it support different-sized images
+         max_size = _max_by_axis([list(img.shape) for img in tensor_list])
+         # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
+         batch_shape = [len(tensor_list)] + max_size
+         b, c, h, w = batch_shape
+         dtype = tensor_list[0].dtype
+         device = tensor_list[0].device
+         tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
+         mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
+         for img, pad_img, m in zip(tensor_list, tensor, mask):
+             pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+             m[: img.shape[1], : img.shape[2]] = False
+     else:
+         raise ValueError("not supported")
+     return NestedTensor(tensor, mask)
+
+
+ # _onnx_nested_tensor_from_tensor_list() is an implementation of
+ # nested_tensor_from_tensor_list() that is supported by ONNX tracing.
+ @torch.jit.unused
+ def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
+     max_size = []
+     for i in range(tensor_list[0].dim()):
+         max_size_i = torch.max(
+             torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
+         ).to(torch.int64)
+         max_size.append(max_size_i)
+     max_size = tuple(max_size)
+
+     # work around for
+     # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+     # m[: img.shape[1], :img.shape[2]] = False
+     # which is not yet supported in onnx
+     padded_imgs = []
+     padded_masks = []
+     for img in tensor_list:
+         padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
+         padded_img = torch.nn.functional.pad(
+             img, (0, padding[2], 0, padding[1], 0, padding[0])
+         )
+         padded_imgs.append(padded_img)
+
+         m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
+         padded_mask = torch.nn.functional.pad(
+             m, (0, padding[2], 0, padding[1]), "constant", 1
+         )
+         padded_masks.append(padded_mask.to(torch.bool))
+
+     tensor = torch.stack(padded_imgs)
+     mask = torch.stack(padded_masks)
+
+     return NestedTensor(tensor, mask=mask)
+
+
+ def _max_by_axis(the_list):
+     # type: (List[List[int]]) -> List[int]
+     maxes = the_list[0]
+     for sublist in the_list[1:]:
+         for index, item in enumerate(sublist):
+             maxes[index] = max(maxes[index], item)
+     return maxes
+
+
+ def build_model(config: ModelConfig) -> LWDETR:
+     # the `num_classes` naming here is somewhat misleading.
+     # it indeed corresponds to `max_obj_id + 1`, where max_obj_id
+     # is the maximum id for a class in your dataset. For example,
+     # COCO has a max_obj_id of 90, so we pass `num_classes` to be 91.
+     # As another example, for a dataset that has a single class with id 1,
+     # you should pass `num_classes` to be 2 (max_obj_id + 1).
+     # For more details on this, check the following discussion
+     # https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223
+     args = populate_args(**config.dict())
+     num_classes = args.num_classes + 1
+     backbone = build_backbone(
+         encoder=args.encoder,
+         vit_encoder_num_layers=args.vit_encoder_num_layers,
+         pretrained_encoder=args.pretrained_encoder,
+         window_block_indexes=args.window_block_indexes,
+         drop_path=args.drop_path,
+         out_channels=args.hidden_dim,
+         out_feature_indexes=args.out_feature_indexes,
+         projector_scale=args.projector_scale,
+         use_cls_token=args.use_cls_token,
+         hidden_dim=args.hidden_dim,
+         position_embedding=args.position_embedding,
+         freeze_encoder=args.freeze_encoder,
+         layer_norm=args.layer_norm,
+         target_shape=(
+             args.shape
+             if hasattr(args, "shape")
+             else (
+                 (args.resolution, args.resolution)
+                 if hasattr(args, "resolution")
+                 else (640, 640)
+             )
+         ),
+         rms_norm=args.rms_norm,
+         backbone_lora=args.backbone_lora,
+         force_no_pretrain=args.force_no_pretrain,
+         gradient_checkpointing=args.gradient_checkpointing,
+         load_dinov2_weights=args.pretrain_weights is None,
+         patch_size=config.patch_size,
+         num_windows=config.num_windows,
+         positional_encoding_size=config.positional_encoding_size,
+     )
+     if args.encoder_only:
+         return backbone[0].encoder, None, None
+     if args.backbone_only:
+         return backbone, None, None
+     args.num_feature_levels = len(args.projector_scale)
+     transformer = build_transformer(args)
+     segmentation_head = (
+         SegmentationHead(
+             args.hidden_dim,
+             args.dec_layers,
+             downsample_ratio=args.mask_downsample_ratio,
+         )
+         if args.segmentation_head
+         else None
+     )
+     return LWDETR(
+         backbone,
+         transformer,
+         segmentation_head,
+         num_classes=num_classes,
+         num_queries=args.num_queries,
+         aux_loss=args.aux_loss,
+         group_detr=args.group_detr,
+         two_stage=args.two_stage,
+         lite_refpoint_refine=args.lite_refpoint_refine,
+         bbox_reparam=args.bbox_reparam,
+     )
+
+
+ def populate_args(
+     # Basic training parameters
+     num_classes=2,
+     grad_accum_steps=1,
+     amp=False,
+     lr=1e-4,
+     lr_encoder=1.5e-4,
+     batch_size=2,
+     weight_decay=1e-4,
+     epochs=12,
+     lr_drop=11,
+     clip_max_norm=0.1,
+     lr_vit_layer_decay=0.8,
+     lr_component_decay=1.0,
+     do_benchmark=False,
+     # Drop parameters
+     dropout=0,
+     drop_path=0,
+     drop_mode="standard",
+     drop_schedule="constant",
+     cutoff_epoch=0,
+     # Model parameters
+     pretrained_encoder=None,
+     pretrain_weights=None,
+     pretrain_exclude_keys=None,
+     pretrain_keys_modify_to_load=None,
+     pretrained_distiller=None,
+     # Backbone parameters
+     encoder="vit_tiny",
+     vit_encoder_num_layers=12,
+     window_block_indexes=None,
+     position_embedding="sine",
+     out_feature_indexes=[-1],
+     freeze_encoder=False,
+     layer_norm=False,
+     rms_norm=False,
+     backbone_lora=False,
+     force_no_pretrain=False,
+     # Transformer parameters
+     dec_layers=3,
+     dim_feedforward=2048,
+     hidden_dim=256,
+     sa_nheads=8,
+     ca_nheads=8,
+     num_queries=300,
+     group_detr=13,
+     two_stage=False,
+     projector_scale="P4",
+     lite_refpoint_refine=False,
+     num_select=100,
+     dec_n_points=4,
+     decoder_norm="LN",
+     bbox_reparam=False,
+     freeze_batch_norm=False,
+     # Matcher parameters
+     set_cost_class=2,
+     set_cost_bbox=5,
+     set_cost_giou=2,
+     # Loss coefficients
+     cls_loss_coef=2,
+     bbox_loss_coef=5,
+     giou_loss_coef=2,
+     focal_alpha=0.25,
+     aux_loss=True,
+     sum_group_losses=False,
+     use_varifocal_loss=False,
+     use_position_supervised_loss=False,
+     ia_bce_loss=False,
+     # Dataset parameters
+     dataset_file="coco",
+     coco_path=None,
+     dataset_dir=None,
+     square_resize_div_64=False,
+     # Output parameters
+     output_dir="output",
+     dont_save_weights=False,
+     checkpoint_interval=10,
+     seed=42,
+     resume="",
+     start_epoch=0,
+     eval=False,
+     use_ema=False,
+     ema_decay=0.9997,
+     ema_tau=0,
+     num_workers=2,
+     # Distributed training parameters
+     device="cuda",
+     world_size=1,
+     dist_url="env://",
+     sync_bn=True,
+     # FP16
+     fp16_eval=False,
+     # Custom args
+     encoder_only=False,
+     backbone_only=False,
+     resolution=640,
+     use_cls_token=False,
+     multi_scale=False,
+     expanded_scales=False,
+     warmup_epochs=1,
+     lr_scheduler="step",
+     lr_min_factor=0.0,
+     # Early stopping parameters
+     early_stopping=True,
+     early_stopping_patience=10,
+     early_stopping_min_delta=0.001,
+     early_stopping_use_ema=False,
+     gradient_checkpointing=False,
+     # Additional
+     subcommand=None,
+     **extra_kwargs,  # To handle any unexpected arguments
+ ):
+     args = argparse.Namespace(
+         num_classes=num_classes,
+         grad_accum_steps=grad_accum_steps,
+         amp=amp,
+         lr=lr,
+         lr_encoder=lr_encoder,
+         batch_size=batch_size,
+         weight_decay=weight_decay,
+         epochs=epochs,
+         lr_drop=lr_drop,
+         clip_max_norm=clip_max_norm,
+         lr_vit_layer_decay=lr_vit_layer_decay,
+         lr_component_decay=lr_component_decay,
+         do_benchmark=do_benchmark,
+         dropout=dropout,
+         drop_path=drop_path,
+         drop_mode=drop_mode,
+         drop_schedule=drop_schedule,
+         cutoff_epoch=cutoff_epoch,
+         pretrained_encoder=pretrained_encoder,
+         pretrain_weights=pretrain_weights,
+         pretrain_exclude_keys=pretrain_exclude_keys,
+         pretrain_keys_modify_to_load=pretrain_keys_modify_to_load,
+         pretrained_distiller=pretrained_distiller,
+         encoder=encoder,
+         vit_encoder_num_layers=vit_encoder_num_layers,
+         window_block_indexes=window_block_indexes,
+         position_embedding=position_embedding,
+         out_feature_indexes=out_feature_indexes,
+         freeze_encoder=freeze_encoder,
+         layer_norm=layer_norm,
+         rms_norm=rms_norm,
+         backbone_lora=backbone_lora,
+         force_no_pretrain=force_no_pretrain,
+         dec_layers=dec_layers,
+         dim_feedforward=dim_feedforward,
+         hidden_dim=hidden_dim,
+         sa_nheads=sa_nheads,
+         ca_nheads=ca_nheads,
+         num_queries=num_queries,
+         group_detr=group_detr,
+         two_stage=two_stage,
+         projector_scale=projector_scale,
+         lite_refpoint_refine=lite_refpoint_refine,
+         num_select=num_select,
+         dec_n_points=dec_n_points,
+         decoder_norm=decoder_norm,
+         bbox_reparam=bbox_reparam,
+         freeze_batch_norm=freeze_batch_norm,
+         set_cost_class=set_cost_class,
+         set_cost_bbox=set_cost_bbox,
+         set_cost_giou=set_cost_giou,
+         cls_loss_coef=cls_loss_coef,
+         bbox_loss_coef=bbox_loss_coef,
+         giou_loss_coef=giou_loss_coef,
+         focal_alpha=focal_alpha,
+         aux_loss=aux_loss,
+         sum_group_losses=sum_group_losses,
+         use_varifocal_loss=use_varifocal_loss,
+         use_position_supervised_loss=use_position_supervised_loss,
+         ia_bce_loss=ia_bce_loss,
+         dataset_file=dataset_file,
+         coco_path=coco_path,
+         dataset_dir=dataset_dir,
+         square_resize_div_64=square_resize_div_64,
+         output_dir=output_dir,
+         dont_save_weights=dont_save_weights,
+         checkpoint_interval=checkpoint_interval,
+         seed=seed,
+         resume=resume,
+         start_epoch=start_epoch,
+         eval=eval,
+         use_ema=use_ema,
+         ema_decay=ema_decay,
+         ema_tau=ema_tau,
+         num_workers=num_workers,
+         device=device,
+         world_size=world_size,
+         dist_url=dist_url,
+         sync_bn=sync_bn,
+         fp16_eval=fp16_eval,
+         encoder_only=encoder_only,
+         backbone_only=backbone_only,
+         resolution=resolution,
+         use_cls_token=use_cls_token,
+         multi_scale=multi_scale,
+         expanded_scales=expanded_scales,
+         warmup_epochs=warmup_epochs,
+         lr_scheduler=lr_scheduler,
+         lr_min_factor=lr_min_factor,
+         early_stopping=early_stopping,
+         early_stopping_patience=early_stopping_patience,
+         early_stopping_min_delta=early_stopping_min_delta,
+         early_stopping_use_ema=early_stopping_use_ema,
+         gradient_checkpointing=gradient_checkpointing,
+         **extra_kwargs,
+     )
+     return args
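
The bbox_reparam branch that appears twice above (in LWDETR.forward and forward_export) decodes boxes relative to the decoder's reference points rather than through a sigmoid. The snippet below is a minimal standalone sketch of that arithmetic with dummy tensors; it is not part of the package, and the names and shapes are chosen only for illustration.

import torch

# Stand-ins for the decoder outputs: reference boxes in (cx, cy, w, h) form
# and the per-query deltas produced by bbox_embed (shapes picked arbitrarily).
ref_boxes = torch.rand(1, 300, 4)
deltas = torch.randn(1, 300, 4) * 0.1

# Centers are shifted proportionally to the reference box size ...
pred_cxcy = deltas[..., :2] * ref_boxes[..., 2:] + ref_boxes[..., :2]
# ... and widths/heights are rescaled multiplicatively, kept positive by exp().
pred_wh = deltas[..., 2:].exp() * ref_boxes[..., 2:]
pred_boxes = torch.cat([pred_cxcy, pred_wh], dim=-1)  # (1, 300, 4), still (cx, cy, w, h)

Because the size update is multiplicative, predicted widths and heights stay positive without squashing, which is why the non-reparameterized branch in the same methods applies .sigmoid() to its raw predictions instead.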