rslearn 0.0.17__py3-none-any.whl → 0.0.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rslearn/arg_parser.py +2 -9
- rslearn/config/__init__.py +2 -0
- rslearn/config/dataset.py +64 -20
- rslearn/dataset/add_windows.py +1 -1
- rslearn/dataset/dataset.py +34 -84
- rslearn/dataset/materialize.py +5 -5
- rslearn/dataset/storage/__init__.py +1 -0
- rslearn/dataset/storage/file.py +202 -0
- rslearn/dataset/storage/storage.py +140 -0
- rslearn/dataset/window.py +26 -80
- rslearn/lightning_cli.py +22 -11
- rslearn/main.py +12 -37
- rslearn/models/anysat.py +11 -9
- rslearn/models/attention_pooling.py +177 -0
- rslearn/models/clay/clay.py +8 -9
- rslearn/models/clip.py +18 -15
- rslearn/models/component.py +111 -0
- rslearn/models/concatenate_features.py +21 -11
- rslearn/models/conv.py +15 -8
- rslearn/models/croma.py +13 -8
- rslearn/models/detr/detr.py +25 -14
- rslearn/models/dinov3.py +11 -6
- rslearn/models/faster_rcnn.py +19 -9
- rslearn/models/feature_center_crop.py +12 -9
- rslearn/models/fpn.py +19 -8
- rslearn/models/galileo/galileo.py +23 -18
- rslearn/models/module_wrapper.py +26 -57
- rslearn/models/molmo.py +16 -14
- rslearn/models/multitask.py +102 -73
- rslearn/models/olmoearth_pretrain/model.py +135 -38
- rslearn/models/panopticon.py +8 -7
- rslearn/models/pick_features.py +18 -24
- rslearn/models/pooling_decoder.py +22 -14
- rslearn/models/presto/presto.py +16 -10
- rslearn/models/presto/single_file_presto.py +4 -10
- rslearn/models/prithvi.py +12 -8
- rslearn/models/resize_features.py +21 -7
- rslearn/models/sam2_enc.py +11 -9
- rslearn/models/satlaspretrain.py +15 -9
- rslearn/models/simple_time_series.py +37 -17
- rslearn/models/singletask.py +24 -17
- rslearn/models/ssl4eo_s12.py +15 -10
- rslearn/models/swin.py +22 -13
- rslearn/models/terramind.py +24 -7
- rslearn/models/trunk.py +6 -3
- rslearn/models/unet.py +18 -9
- rslearn/models/upsample.py +22 -9
- rslearn/train/all_patches_dataset.py +89 -37
- rslearn/train/dataset.py +105 -97
- rslearn/train/lightning_module.py +51 -32
- rslearn/train/model_context.py +54 -0
- rslearn/train/prediction_writer.py +111 -41
- rslearn/train/scheduler.py +15 -0
- rslearn/train/tasks/classification.py +34 -15
- rslearn/train/tasks/detection.py +24 -31
- rslearn/train/tasks/embedding.py +33 -29
- rslearn/train/tasks/multi_task.py +7 -7
- rslearn/train/tasks/per_pixel_regression.py +41 -19
- rslearn/train/tasks/regression.py +38 -21
- rslearn/train/tasks/segmentation.py +33 -15
- rslearn/train/tasks/task.py +3 -2
- rslearn/train/transforms/resize.py +74 -0
- rslearn/utils/geometry.py +73 -0
- rslearn/utils/jsonargparse.py +66 -0
- {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/METADATA +1 -1
- {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/RECORD +71 -66
- rslearn/dataset/index.py +0 -173
- rslearn/models/registry.py +0 -22
- {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/WHEEL +0 -0
- {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/entry_points.txt +0 -0
- {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/licenses/LICENSE +0 -0
- {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/licenses/NOTICE +0 -0
- {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/top_level.txt +0 -0
rslearn/models/attention_pooling.py
ADDED
@@ -0,0 +1,177 @@
+"""An attention pooling layer."""
+
+import math
+from typing import Any
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from torch import nn
+
+from rslearn.models.component import (
+    FeatureMaps,
+    IntermediateComponent,
+    TokenFeatureMaps,
+)
+from rslearn.train.model_context import ModelContext
+
+
+class SimpleAttentionPool(IntermediateComponent):
+    """Simple Attention Pooling.
+
+    Given a token feature map of shape BCHWN,
+    learn an attention layer which aggregates over
+    the N dimension.
+
+    This is done simply by learning a mapping D->1 which is the weight
+    which should be assigned to each token during averaging:
+
+    output = sum [feat_token * W(feat_token) for feat_token in feat_tokens]
+    """
+
+    def __init__(self, in_dim: int, hidden_linear: bool = False) -> None:
+        """Initialize the simple attention pooling layer.
+
+        Args:
+            in_dim: the encoding dimension D
+            hidden_linear: whether to apply an additional linear transformation D -> D
+                to the feat tokens. If this is True, a ReLU activation is applied
+                after the first linear transformation.
+        """
+        super().__init__()
+        if hidden_linear:
+            self.hidden_linear = nn.Linear(in_features=in_dim, out_features=in_dim)
+        else:
+            self.hidden_linear = None
+        self.linear = nn.Linear(in_features=in_dim, out_features=1)
+
+    def forward_for_map(self, feat_tokens: torch.Tensor) -> torch.Tensor:
+        """Attention pooling for a single feature map (BCHWN tensor)."""
+        B, D, H, W, N = feat_tokens.shape
+        feat_tokens = rearrange(feat_tokens, "b d h w n -> (b h w) n d")
+        if self.hidden_linear is not None:
+            feat_tokens = torch.nn.functional.relu(self.hidden_linear(feat_tokens))
+        attention_scores = torch.nn.functional.softmax(self.linear(feat_tokens), dim=1)
+        feat_tokens = (attention_scores * feat_tokens).sum(dim=1)
+        return rearrange(feat_tokens, "(b h w) d -> b d h w", b=B, h=H, w=W)
+
+    def forward(self, intermediates: Any, context: ModelContext) -> FeatureMaps:
+        """Forward pass for attention pooling linear probe.
+
+        Args:
+            intermediates: the output from the previous component, which must be a TokenFeatureMaps.
+                We pool over the final dimension in the TokenFeatureMaps. If multiple maps
+                are passed, we apply the same linear layers to all of them.
+            context: the model context.
+            feat_tokens (torch.Tensor): Input feature tokens of shape (B, C, H, W, N).
+
+        Returns:
+            torch.Tensor:
+                - output, attentioned pool over the last dimension (B, C, H, W)
+        """
+        if not isinstance(intermediates, TokenFeatureMaps):
+            raise ValueError("input to Attention Pool must be a TokenFeatureMaps")
+
+        features = []
+        for feat in intermediates.feature_maps:
+            features.append(self.forward_for_map(feat))
+        return FeatureMaps(features)
+
+
+class AttentionPool(IntermediateComponent):
+    """Attention Pooling.
+
+    Given a feature map of shape BCHWN,
+    learn an attention layer which aggregates over
+    the N dimension.
+
+    We do this by learning a query token, and applying a standard
+    attention mechanism against this learned query token.
+    """
+
+    def __init__(self, in_dim: int, num_heads: int, linear_on_kv: bool = True) -> None:
+        """Initialize the attention pooling layer.
+
+        Args:
+            in_dim: the encoding dimension D
+            num_heads: the number of heads to use
+            linear_on_kv: Whether to apply a linear layer on the input tokens
+                to create the key and value tokens.
+        """
+        super().__init__()
+        self.query_token: nn.Parameter = nn.Parameter(torch.empty(in_dim))
+        if linear_on_kv:
+            self.k_linear = nn.Linear(in_dim, in_dim)
+            self.v_linear = nn.Linear(in_dim, in_dim)
+        else:
+            self.k_linear = None
+            self.v_linear = None
+        if in_dim % num_heads != 0:
+            raise ValueError(
+                f"in_dim must be divisible by num_heads. Got {in_dim} and {num_heads}."
+            )
+        self.num_heads = num_heads
+        self.init_weights()
+
+    def init_weights(self) -> None:
+        """Initialize weights for the probe."""
+        nn.init.trunc_normal_(self.query_token, std=0.02)
+
+    def forward_for_map(self, feat_tokens: torch.Tensor) -> torch.Tensor:
+        """Attention pooling for a single feature map (BCHWN tensor)."""
+        B, D, H, W, N = feat_tokens.shape
+        feat_tokens = rearrange(feat_tokens, "b d h w n -> (b h w) n d")
+        collapsed_dim = B * H * W
+        q = self.query_token.expand(collapsed_dim, 1, -1)
+        q = q.reshape(
+            collapsed_dim, 1, self.num_heads, D // self.num_heads
+        )  # [B, 1, head, D_head]
+        q = rearrange(q, "b h n d -> b n h d")
+        if self.k_linear is not None:
+            assert self.v_linear is not None
+            k = self.k_linear(feat_tokens).reshape(
+                collapsed_dim, N, self.num_heads, D // self.num_heads
+            )
+            v = self.v_linear(feat_tokens).reshape(
+                collapsed_dim, N, self.num_heads, D // self.num_heads
+            )
+        else:
+            k = feat_tokens.reshape(
+                collapsed_dim, N, self.num_heads, D // self.num_heads
+            )
+            v = feat_tokens.reshape(
+                collapsed_dim, N, self.num_heads, D // self.num_heads
+            )
+        k = rearrange(k, "b n h d -> b h n d")
+        v = rearrange(v, "b n h d -> b h n d")
+
+        # Compute attention scores
+        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(
+            D // self.num_heads
+        )
+        attn_weights = F.softmax(attn_scores, dim=-1)
+        x = torch.matmul(attn_weights, v)  # [B, head, 1, D_head]
+        return x.reshape(B, D, H, W)
+
+    def forward(self, intermediates: Any, context: ModelContext) -> FeatureMaps:
+        """Forward pass for attention pooling linear probe.
+
+        Args:
+            intermediates: the output from the previous component, which must be a TokenFeatureMaps.
+                We pool over the final dimension in the TokenFeatureMaps. If multiple feature
+                maps are passed, we apply the same attention weights (query token and linear k, v layers)
+                to all the maps.
+            context: the model context.
+            feat_tokens (torch.Tensor): Input feature tokens of shape (B, C, H, W, N).
+
+        Returns:
+            torch.Tensor:
+                - output, attentioned pool over the last dimension (B, C, H, W)
+        """
+        if not isinstance(intermediates, TokenFeatureMaps):
+            raise ValueError("input to Attention Pool must be a TokenFeatureMaps")
+
+        features = []
+        for feat in intermediates.feature_maps:
+            features.append(self.forward_for_map(feat))
+        return FeatureMaps(features)

rslearn/models/clay/clay.py
CHANGED
@@ -16,6 +16,8 @@ from huggingface_hub import hf_hub_download
 # from claymodel.module import ClayMAEModule
 from terratorch.models.backbones.clay_v15.module import ClayMAEModule
 
+from rslearn.models.component import FeatureExtractor, FeatureMaps
+from rslearn.train.model_context import ModelContext
 from rslearn.train.transforms.normalize import Normalize
 from rslearn.train.transforms.transform import Transform
 
@@ -42,7 +44,7 @@ def get_clay_checkpoint_path(
     return hf_hub_download(repo_id=repo_id, filename=filename)  # nosec B615
 
 
-class Clay(torch.nn.Module):
+class Clay(FeatureExtractor):
     """Clay backbones."""
 
     def __init__(
@@ -108,23 +110,20 @@ class Clay(torch.nn.Module):
             image, size=(new_hw, new_hw), mode="bilinear", align_corners=False
        )
 
-    def forward(self,
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Forward pass for the Clay model.
 
         Args:
-
+            context: the model context. Input dicts must include `self.modality` as a key
 
         Returns:
-
+            a FeatureMaps consisting of one feature map, computed by Clay.
         """
-        if self.modality not in inputs[0]:
-            raise ValueError(f"Missing modality {self.modality} in inputs.")
-
         param = next(self.model.parameters())
         device = param.device
 
         chips = torch.stack(
-            [inp[self.modality] for inp in inputs], dim=0
+            [inp[self.modality] for inp in context.inputs], dim=0
         )  # (B, C, H, W)
         if self.do_resizing:
             chips = self._resize_image(chips, chips.shape[2])
@@ -163,7 +162,7 @@ class Clay(torch.nn.Module):
         )
 
         features = rearrange(spatial, "b (h w) d -> b d h w", h=side, w=side)
-        return [features]
+        return FeatureMaps([features])
 
     def get_backbone_channels(self) -> list:
         """Return output channels of this model when used as a backbone."""

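The clay.py changes above follow the migration pattern that recurs throughout this release: forward() now takes a ModelContext instead of a list of input dicts, reads the per-sample dicts from context.inputs, and wraps the returned list of feature maps in the FeatureMaps dataclass. A rough sketch of that pattern for a hypothetical single-modality backbone (the class and its conv layer are illustrative, not part of rslearn):

import torch

from rslearn.models.component import FeatureExtractor, FeatureMaps
from rslearn.train.model_context import ModelContext


class ToyBackbone(FeatureExtractor):
    """Hypothetical backbone written against the 0.0.19 component API."""

    def __init__(self) -> None:
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 64, kernel_size=3, padding=1)

    def forward(self, context: ModelContext) -> FeatureMaps:
        # Stack per-sample "image" tensors from the input dicts into a BxCxHxW batch.
        images = torch.stack([inp["image"] for inp in context.inputs], dim=0)
        # Wrap the single feature map in the FeatureMaps dataclass.
        return FeatureMaps([self.conv(images)])
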
rslearn/models/clip.py
CHANGED
@@ -1,12 +1,13 @@
 """OpenAI CLIP models."""
 
-from typing import Any
-
-import torch
 from transformers import AutoModelForZeroShotImageClassification, AutoProcessor
 
+from rslearn.train.model_context import ModelContext
+
+from .component import FeatureExtractor, FeatureMaps
+
 
-class CLIP(torch.nn.Module):
+class CLIP(FeatureExtractor):
     """CLIP image encoder."""
 
     def __init__(
@@ -31,17 +32,17 @@ class CLIP(torch.nn.Module):
         self.height = crop_size["height"] // stride[0]
         self.width = crop_size["width"] // stride[1]
 
-    def forward(self,
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Compute outputs from the backbone.
 
-
-
-                process. The images should have values 0-255.
+        Args:
+            context: the model context. Input dicts must include "image" key containing
+                the image to process. The images should have values 0-255.
 
         Returns:
-
+            a FeatureMaps with one feature map from the ViT, which is always Bx24x24x1024.
         """
+        inputs = context.inputs
         device = inputs[0]["image"].device
         clip_inputs = self.processor(
             images=[inp["image"].cpu().numpy().transpose(1, 2, 0) for inp in inputs],
@@ -55,8 +56,10 @@ class CLIP(torch.nn.Module):
         batch_size = image_features.shape[0]
 
         # 576x1024 -> HxWxC
-        return
-
-
-
-
+        return FeatureMaps(
+            [
+                image_features.reshape(
+                    batch_size, self.height, self.width, self.num_features
+                ).permute(0, 3, 1, 2)
+            ]
+        )

@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Model component API."""
|
|
2
|
+
|
|
3
|
+
import abc
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import torch
|
|
8
|
+
|
|
9
|
+
from rslearn.train.model_context import ModelContext, ModelOutput
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class FeatureExtractor(torch.nn.Module, abc.ABC):
|
|
13
|
+
"""A feature extractor that performs initial processing of the inputs.
|
|
14
|
+
|
|
15
|
+
The FeatureExtractor is the first component in the encoders list for
|
|
16
|
+
SingleTaskModel and MultiTaskModel.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
@abc.abstractmethod
|
|
20
|
+
def forward(self, context: ModelContext) -> Any:
|
|
21
|
+
"""Extract an initial intermediate from the model context.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
context: the model context.
|
|
25
|
+
|
|
26
|
+
Returns:
|
|
27
|
+
any intermediate to pass to downstream components. Oftentimes this is a
|
|
28
|
+
FeatureMaps.
|
|
29
|
+
"""
|
|
30
|
+
raise NotImplementedError
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class IntermediateComponent(torch.nn.Module, abc.ABC):
|
|
34
|
+
"""An intermediate component in the model.
|
|
35
|
+
|
|
36
|
+
In SingleTaskModel and MultiTaskModel, modules after the first module
|
|
37
|
+
in the encoders list are IntermediateComponents, as are modules before the last
|
|
38
|
+
module in the decoders list(s).
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
@abc.abstractmethod
|
|
42
|
+
def forward(self, intermediates: Any, context: ModelContext) -> Any:
|
|
43
|
+
"""Process the given intermediate into another intermediate.
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
intermediates: the output from the previous component (either a
|
|
47
|
+
FeatureExtractor or another IntermediateComponent).
|
|
48
|
+
context: the model context.
|
|
49
|
+
|
|
50
|
+
Returns:
|
|
51
|
+
any intermediate to pass to downstream components.
|
|
52
|
+
"""
|
|
53
|
+
raise NotImplementedError
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class Predictor(torch.nn.Module, abc.ABC):
|
|
57
|
+
"""A predictor that computes task-specific outputs and a loss dict.
|
|
58
|
+
|
|
59
|
+
In SingleTaskModel and MultiTaskModel, the last module(s) in the decoders list(s)
|
|
60
|
+
are Predictors.
|
|
61
|
+
"""
|
|
62
|
+
|
|
63
|
+
@abc.abstractmethod
|
|
64
|
+
def forward(
|
|
65
|
+
self,
|
|
66
|
+
intermediates: Any,
|
|
67
|
+
context: ModelContext,
|
|
68
|
+
targets: list[dict[str, torch.Tensor]] | None = None,
|
|
69
|
+
) -> ModelOutput:
|
|
70
|
+
"""Compute task-specific outputs and loss dict.
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
intermediates: the output from the previous component.
|
|
74
|
+
context: the model context.
|
|
75
|
+
targets: the training targets, or None during prediction.
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
a tuple of the task-specific outputs (which should be compatible with the
|
|
79
|
+
configured Task) and loss dict. The loss dict maps from a name for each
|
|
80
|
+
loss to a scalar tensor.
|
|
81
|
+
"""
|
|
82
|
+
raise NotImplementedError
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass
|
|
86
|
+
class FeatureMaps:
|
|
87
|
+
"""An intermediate output type for multi-resolution feature maps."""
|
|
88
|
+
|
|
89
|
+
# List of BxCxHxW feature maps at different scales, ordered from highest resolution
|
|
90
|
+
# (most fine-grained) to lowest resolution (coarsest).
|
|
91
|
+
feature_maps: list[torch.Tensor]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
|
|
95
|
+
class TokenFeatureMaps:
|
|
96
|
+
"""An intermediate output type for multi-resolution BCHWN feature maps with a token dimension.
|
|
97
|
+
|
|
98
|
+
Unlike `FeatureMaps`, these include an additional dimension for unpooled tokens.
|
|
99
|
+
"""
|
|
100
|
+
|
|
101
|
+
# List of BxCxHxWxN feature maps at different scales, ordered from highest resolution
|
|
102
|
+
# (most fine-grained) to lowest resolution (coarsest).
|
|
103
|
+
feature_maps: list[torch.Tensor]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@dataclass
|
|
107
|
+
class FeatureVector:
|
|
108
|
+
"""An intermediate output type for a flat feature vector."""
|
|
109
|
+
|
|
110
|
+
# Flat BxC feature vector.
|
|
111
|
+
feature_vector: torch.Tensor
|
|
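component.py defines the three roles that the refactored modules in this diff implement: a FeatureExtractor consumes the ModelContext, IntermediateComponents transform intermediates such as FeatureMaps, and a Predictor produces the final ModelOutput. A minimal sketch of a custom IntermediateComponent under that contract (the normalization layer is illustrative, not part of rslearn):

from typing import Any

import torch

from rslearn.models.component import FeatureMaps, IntermediateComponent
from rslearn.train.model_context import ModelContext


class NormalizeFeatures(IntermediateComponent):
    """Hypothetical component that L2-normalizes each feature map over channels."""

    def forward(self, intermediates: Any, context: ModelContext) -> FeatureMaps:
        if not isinstance(intermediates, FeatureMaps):
            raise ValueError("input to NormalizeFeatures must be a FeatureMaps")
        # Apply the same normalization at every scale, preserving the BxCxHxW layout.
        normalized = [
            torch.nn.functional.normalize(feat, dim=1)
            for feat in intermediates.feature_maps
        ]
        return FeatureMaps(normalized)
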
rslearn/models/concatenate_features.py
CHANGED
@@ -4,8 +4,12 @@ from typing import Any
 
 import torch
 
+from rslearn.train.model_context import ModelContext
 
-class ConcatenateFeatures(torch.nn.Module):
+from .component import FeatureMaps, IntermediateComponent
+
+
+class ConcatenateFeatures(IntermediateComponent):
     """Concatenate feature map with additional raw data inputs."""
 
     def __init__(
@@ -55,26 +59,32 @@ class ConcatenateFeatures(torch.nn.Module):
 
         self.conv_layers = torch.nn.Sequential(*conv_layers)
 
-    def forward(
-        self, features: list[torch.Tensor], inputs: list[dict[str, Any]]
-    ) -> list[torch.Tensor]:
+    def forward(self, intermediates: Any, context: ModelContext) -> FeatureMaps:
         """Concatenate the feature map with the raw data inputs.
 
         Args:
-
-
+            intermediates: the previous output, which must be a FeatureMaps.
+            context: the model context. The input dicts must have a key matching the
+                configured key.
 
         Returns:
            concatenated feature maps.
         """
-        if
-
+        if (
+            not isinstance(intermediates, FeatureMaps)
+            or len(intermediates.feature_maps) == 0
+        ):
+            raise ValueError(
+                "Expected input to be FeatureMaps with at least one feature map"
+            )
 
-        add_data = torch.stack(
+        add_data = torch.stack(
+            [input_data[self.key] for input_data in context.inputs], dim=0
+        )
         add_features = self.conv_layers(add_data)
 
         new_features: list[torch.Tensor] = []
-        for feature_map in
+        for feature_map in intermediates.feature_maps:
            # Shape of feature map: BCHW
            feat_h, feat_w = feature_map.shape[2], feature_map.shape[3]
 
@@ -90,4 +100,4 @@ class ConcatenateFeatures(torch.nn.Module):
 
            new_features.append(torch.cat([feature_map, resized_add_features], dim=1))
 
-        return new_features
+        return FeatureMaps(new_features)

rslearn/models/conv.py
CHANGED
@@ -4,8 +4,12 @@ from typing import Any
 
 import torch
 
+from rslearn.train.model_context import ModelContext
 
-class Conv(torch.nn.Module):
+from .component import FeatureMaps, IntermediateComponent
+
+
+class Conv(IntermediateComponent):
     """A single convolutional layer.
 
     It inputs a set of feature maps; the conv layer is applied to each feature map
@@ -38,19 +42,22 @@ class Conv(torch.nn.Module):
        )
        self.activation = activation
 
-    def forward(self,
-        """
+    def forward(self, intermediates: Any, context: ModelContext) -> FeatureMaps:
+        """Apply conv layer on each feature map.
 
         Args:
-
-
+            intermediates: the previous output, which must be a FeatureMaps.
+            context: the model context.
 
         Returns:
-
+            the resulting feature maps after applying the same Conv2d on each one.
         """
+        if not isinstance(intermediates, FeatureMaps):
+            raise ValueError("input to Conv must be FeatureMaps")
+
         new_features = []
-        for feat_map in
+        for feat_map in intermediates.feature_maps:
            feat_map = self.layer(feat_map)
            feat_map = self.activation(feat_map)
            new_features.append(feat_map)
-        return new_features
+        return FeatureMaps(new_features)

rslearn/models/croma.py
CHANGED
@@ -12,9 +12,11 @@ from einops import rearrange
 from upath import UPath
 
 from rslearn.log_utils import get_logger
+from rslearn.train.model_context import ModelContext
 from rslearn.train.transforms.transform import Transform
 from rslearn.utils.fsspec import open_atomic
 
+from .component import FeatureExtractor, FeatureMaps
 from .use_croma import PretrainedCROMA
 
 logger = get_logger(__name__)
@@ -76,7 +78,7 @@ MODALITY_BANDS = {
 }
 
 
-class Croma(torch.nn.Module):
+class Croma(FeatureExtractor):
     """CROMA backbones.
 
     There are two model sizes, base and large.
@@ -160,20 +162,23 @@ class Croma(torch.nn.Module):
            align_corners=False,
        )
 
-    def forward(self,
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Compute feature maps from the Croma backbone.
 
-
-
-                "sentinel1" keys depending on the configured modality.
+        Args:
+            context: the model context. Input dicts must include either/both of
+                "sentinel2" or "sentinel1" keys depending on the configured modality.
+
+        Returns:
+            a FeatureMaps with one feature map at 1/8 the input resolution.
         """
         sentinel1: torch.Tensor | None = None
         sentinel2: torch.Tensor | None = None
         if self.modality in [CromaModality.BOTH, CromaModality.SENTINEL1]:
-            sentinel1 = torch.stack([inp["sentinel1"] for inp in inputs], dim=0)
+            sentinel1 = torch.stack([inp["sentinel1"] for inp in context.inputs], dim=0)
            sentinel1 = self._resize_image(sentinel1) if self.do_resizing else sentinel1
         if self.modality in [CromaModality.BOTH, CromaModality.SENTINEL2]:
-            sentinel2 = torch.stack([inp["sentinel2"] for inp in inputs], dim=0)
+            sentinel2 = torch.stack([inp["sentinel2"] for inp in context.inputs], dim=0)
            sentinel2 = self._resize_image(sentinel2) if self.do_resizing else sentinel2
 
         outputs = self.model(
@@ -200,7 +205,7 @@ class Croma(torch.nn.Module):
            w=num_patches_per_dim,
        )
 
-        return [features]
+        return FeatureMaps([features])
 
     def get_backbone_channels(self) -> list:
         """Returns the output channels of this model when used as a backbone.

rslearn/models/detr/detr.py
CHANGED
@@ -13,6 +13,8 @@ import torch.nn.functional as F
 from torch import nn
 
 import rslearn.models.detr.box_ops as box_ops
+from rslearn.models.component import FeatureMaps, Predictor
+from rslearn.train.model_context import ModelContext, ModelOutput
 
 from .matcher import HungarianMatcher
 from .position_encoding import PositionEmbeddingSine
@@ -405,7 +407,7 @@ class PostProcess(nn.Module):
         return results
 
 
-class Detr(nn.Module):
+class Detr(Predictor):
     """DETR prediction module.
 
     This combines PositionEmbeddingSine, DetrPredictor, SetCriterion, and PostProcess.
@@ -440,33 +442,39 @@ class Detr(nn.Module):
 
     def forward(
         self,
-
-
+        intermediates: Any,
+        context: ModelContext,
         targets: list[dict[str, Any]] | None = None,
-    ) ->
+    ) -> ModelOutput:
         """Compute the detection outputs and loss from features.
 
         DETR will use only the last feature map, which should correspond to the lowest
         resolution one.
 
         Args:
-
-
-
+            intermediates: the output from the previous component. It must be a FeatureMaps.
+            context: the model context. Input dicts must contain an "image" key which we will
+                be used to establish the original image size.
+            targets: must contain class key that stores the class label.
 
         Returns:
-
+            the model output.
         """
+        if not isinstance(intermediates, FeatureMaps):
+            raise ValueError("input to Detr must be a FeatureMaps")
+
+        # We only use the last feature map (most fine-grained).
+        features = intermediates.feature_maps[-1]
+
         # Get image sizes.
         image_sizes = torch.tensor(
-            [[inp["image"].shape[2], inp["image"].shape[1]] for inp in inputs],
+            [[inp["image"].shape[2], inp["image"].shape[1]] for inp in context.inputs],
            dtype=torch.int32,
-            device=features
+            device=features.device,
        )
 
-
-
-        outputs = self.predictor(feat_map, pos_embedding)
+        pos_embedding = self.pos_embedding(features)
+        outputs = self.predictor(features, pos_embedding)
 
         if targets is not None:
            # Convert boxes from [x0, y0, x1, y1] to [cx, cy, w, h].
@@ -490,4 +498,7 @@ class Detr(nn.Module):
 
         results = self.postprocess(outputs, image_sizes)
 
-        return
+        return ModelOutput(
+            outputs=results,
+            loss_dict=losses,
+        )
