rslearn 0.0.16__py3-none-any.whl → 0.0.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rslearn/config/__init__.py +2 -0
- rslearn/config/dataset.py +55 -4
- rslearn/dataset/add_windows.py +1 -1
- rslearn/dataset/dataset.py +9 -65
- rslearn/dataset/materialize.py +5 -5
- rslearn/dataset/storage/__init__.py +1 -0
- rslearn/dataset/storage/file.py +202 -0
- rslearn/dataset/storage/storage.py +140 -0
- rslearn/dataset/window.py +26 -80
- rslearn/lightning_cli.py +10 -3
- rslearn/main.py +11 -36
- rslearn/models/anysat.py +11 -9
- rslearn/models/clay/clay.py +8 -9
- rslearn/models/clip.py +18 -15
- rslearn/models/component.py +99 -0
- rslearn/models/concatenate_features.py +21 -11
- rslearn/models/conv.py +15 -8
- rslearn/models/croma.py +13 -8
- rslearn/models/detr/detr.py +25 -14
- rslearn/models/dinov3.py +11 -6
- rslearn/models/faster_rcnn.py +19 -9
- rslearn/models/feature_center_crop.py +12 -9
- rslearn/models/fpn.py +19 -8
- rslearn/models/galileo/galileo.py +23 -18
- rslearn/models/module_wrapper.py +26 -57
- rslearn/models/molmo.py +16 -14
- rslearn/models/multitask.py +102 -73
- rslearn/models/olmoearth_pretrain/model.py +20 -17
- rslearn/models/panopticon.py +8 -7
- rslearn/models/pick_features.py +18 -24
- rslearn/models/pooling_decoder.py +22 -14
- rslearn/models/presto/presto.py +16 -10
- rslearn/models/presto/single_file_presto.py +4 -10
- rslearn/models/prithvi.py +12 -8
- rslearn/models/resize_features.py +21 -7
- rslearn/models/sam2_enc.py +11 -9
- rslearn/models/satlaspretrain.py +15 -9
- rslearn/models/simple_time_series.py +31 -17
- rslearn/models/singletask.py +24 -17
- rslearn/models/ssl4eo_s12.py +15 -10
- rslearn/models/swin.py +22 -13
- rslearn/models/terramind.py +24 -7
- rslearn/models/trunk.py +6 -3
- rslearn/models/unet.py +18 -9
- rslearn/models/upsample.py +22 -9
- rslearn/train/all_patches_dataset.py +22 -18
- rslearn/train/dataset.py +69 -54
- rslearn/train/lightning_module.py +51 -32
- rslearn/train/model_context.py +54 -0
- rslearn/train/prediction_writer.py +111 -41
- rslearn/train/tasks/classification.py +34 -15
- rslearn/train/tasks/detection.py +24 -31
- rslearn/train/tasks/embedding.py +33 -29
- rslearn/train/tasks/multi_task.py +7 -7
- rslearn/train/tasks/per_pixel_regression.py +41 -19
- rslearn/train/tasks/regression.py +38 -21
- rslearn/train/tasks/segmentation.py +33 -15
- rslearn/train/tasks/task.py +3 -2
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/METADATA +58 -25
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/RECORD +65 -62
- rslearn/dataset/index.py +0 -173
- rslearn/models/registry.py +0 -22
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/WHEEL +0 -0
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/entry_points.txt +0 -0
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/licenses/LICENSE +0 -0
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/licenses/NOTICE +0 -0
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/top_level.txt +0 -0
rslearn/models/presto/presto.py
CHANGED

@@ -2,14 +2,13 @@
 
 import logging
 import tempfile
-from typing import Any
 
 import torch
 from einops import rearrange, repeat
 from huggingface_hub import hf_hub_download
-from torch import nn
 from upath import UPath
 
+from rslearn.models.component import FeatureExtractor, FeatureMaps
 from rslearn.models.presto.single_file_presto import (
     ERA5_BANDS,
     NUM_DYNAMIC_WORLD_CLASSES,
@@ -21,6 +20,7 @@ from rslearn.models.presto.single_file_presto import (
     SRTM_BANDS,
 )
 from rslearn.models.presto.single_file_presto import Presto as SFPresto
+from rslearn.train.model_context import ModelContext
 
 logger = logging.getLogger(__name__)
 
@@ -36,7 +36,7 @@ HF_HUB_ID = "nasaharvest/presto"
 MODEL_FILENAME = "default_model.pt"
 
 
-class Presto(nn.Module):
+class Presto(FeatureExtractor):
     """Presto."""
 
     input_keys = [
@@ -184,22 +184,26 @@ class Presto(nn.Module):
         x = (x + PRESTO_ADD_BY.to(device=device)) / PRESTO_DIV_BY.to(device=device)
         return x, mask, dynamic_world.long(), months.long()
 
-    def forward(self,
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Compute feature maps from the Presto backbone.
 
-
-
+        Args:
+            context: the model context. Input dicts should have some subset of Presto.input_keys.
+
+        Returns:
+            a FeatureMaps with one feature map that is at the same resolution as the
+            input (since Presto operates per-pixel).
         """
         stacked_inputs = {}
         latlons: torch.Tensor | None = None
-        for key in inputs[0].keys():
+        for key in context.inputs[0].keys():
             # assume all the keys in an input are consistent
             if key in self.input_keys:
                 if key == "latlon":
-                    latlons = torch.stack([inp[key] for inp in inputs], dim=0)
+                    latlons = torch.stack([inp[key] for inp in context.inputs], dim=0)
                 else:
                     stacked_inputs[key] = torch.stack(
-                        [inp[key] for inp in inputs], dim=0
+                        [inp[key] for inp in context.inputs], dim=0
                     )
 
         (
@@ -247,7 +251,9 @@
             )
             output_features[batch_idx : batch_idx + self.pixel_batch_size] = output_b
 
-        return
+        return FeatureMaps(
+            [rearrange(output_features, "(b h w) d -> b d h w", h=h, w=w, b=b)]
+        )
 
     def get_backbone_channels(self) -> list:
         """Returns the output channels of this model when used as a backbone.
@@ -281,10 +281,7 @@ def get_sinusoid_encoding_table(
     sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
     sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
 
-
-        return torch.FloatTensor(sinusoid_table).cuda()
-    else:
-        return torch.FloatTensor(sinusoid_table)
+    return torch.FloatTensor(sinusoid_table)
 
 
 def get_month_encoding_table(d_hid: int) -> torch.Tensor:
@@ -296,10 +293,7 @@ def get_month_encoding_table(d_hid: int) -> torch.Tensor:
    cos_table = np.cos(np.stack([angles for _ in range(d_hid // 2)], axis=-1))
    month_table = np.concatenate([sin_table[:-1], cos_table[:-1]], axis=-1)
 
-
-        return torch.FloatTensor(month_table).cuda()
-    else:
-        return torch.FloatTensor(month_table)
+    return torch.FloatTensor(month_table)
 
 
 def month_to_tensor(
@@ -405,7 +399,7 @@ class Encoder(nn.Module):
         """initialize_weights."""
         pos_embed = get_sinusoid_encoding_table(
             self.pos_embed.shape[1], self.pos_embed.shape[-1]
-        )
+        ).to(device=self.pos_embed.device)
         self.pos_embed.data.copy_(pos_embed)
 
         # initialize nn.Linear and nn.LayerNorm
@@ -640,7 +634,7 @@ class Decoder(nn.Module):
         """initialize_weights."""
         pos_embed = get_sinusoid_encoding_table(
             self.pos_embed.shape[1], self.pos_embed.shape[-1]
-        )
+        ).to(device=self.pos_embed.device)
         self.pos_embed.data.copy_(pos_embed)
 
         # initialize nn.Linear and nn.LayerNorm
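The pattern above repeats across the model files in this release: `forward` now takes a `ModelContext`, reads the per-sample input dicts from `context.inputs`, and wraps its output in a `FeatureMaps`. The sketch below illustrates that contract with simplified stand-in classes; these are not rslearn's actual `ModelContext`/`FeatureMaps` implementations (whose constructors are not shown in this diff), only minimal dataclasses mirroring the attributes the diff accesses.

```python
from dataclasses import dataclass, field

import torch


@dataclass
class ModelContext:
    """Stand-in for rslearn's ModelContext: only the inputs attribute used above."""

    inputs: list[dict[str, torch.Tensor]] = field(default_factory=list)


@dataclass
class FeatureMaps:
    """Stand-in for rslearn's FeatureMaps: a list of [B, C, H, W] tensors."""

    feature_maps: list[torch.Tensor]


class ToyExtractor(torch.nn.Module):
    """A FeatureExtractor-style component following the same forward contract."""

    def __init__(self, in_channels: int = 3, out_channels: int = 8) -> None:
        super().__init__()
        self.conv = torch.nn.Conv2d(in_channels, out_channels, 3, padding=1)

    def forward(self, context: ModelContext) -> FeatureMaps:
        # Stack the per-sample "image" tensors into one batch, as the models above do.
        images = torch.stack([inp["image"] for inp in context.inputs], dim=0)
        return FeatureMaps([self.conv(images)])


context = ModelContext(inputs=[{"image": torch.randn(3, 32, 32)} for _ in range(2)])
out = ToyExtractor()(context)
print([fm.shape for fm in out.feature_maps])  # [torch.Size([2, 8, 32, 32])]
```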
rslearn/models/prithvi.py
CHANGED

@@ -25,9 +25,12 @@ from timm.layers import to_2tuple
 from timm.models.vision_transformer import Block
 from torch.nn import functional as F
 
+from rslearn.train.model_context import ModelContext
 from rslearn.train.transforms.normalize import Normalize
 from rslearn.train.transforms.transform import Transform
 
+from .component import FeatureExtractor, FeatureMaps
+
 logger = logging.getLogger(__name__)
 
 
@@ -77,7 +80,7 @@ def get_config(cache_dir: Path, hf_hub_id: str, hf_hub_revision: str) -> dict[st
         return json.load(f)["pretrained_cfg"]
 
 
-class PrithviV2(nn.Module):
+class PrithviV2(FeatureExtractor):
     """An Rslearn wrapper for Prithvi 2.0."""
 
     INPUT_KEY = "image"
@@ -157,18 +160,18 @@ class PrithviV2(nn.Module):
         )
         return data
 
-    def forward(self,
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Compute feature maps from the Prithvi V2 backbone.
 
         Args:
-
-            (Harmonized Landsat-Sentinel) data.
+            context: the model context. Input dicts must include "image" key containing
+                HLS (Harmonized Landsat-Sentinel) data.
 
         Returns:
-
-
+            a FeatureMaps with one map of shape [B, H/p_s, W/p_s, 11*1024] that contains stacked
+            feature maps across the 11 transformer blocks.
         """
-        x = torch.stack([inp[self.INPUT_KEY] for inp in inputs], dim=0)
+        x = torch.stack([inp[self.INPUT_KEY] for inp in context.inputs], dim=0)
         x = self._resize_data(x)
         num_timesteps = x.shape[1] // len(self.bands)
         x = rearrange(x, "b (t c) h w -> b c t h w", t=num_timesteps)
@@ -177,9 +180,10 @@
         # know the number of timesteps and don't need to recompute it.
         # in addition we average along the time dimension (instead of concatenating)
         # to keep the embeddings reasonably sized.
-
+        result = self.model.encoder.prepare_features_for_image_model(
             features, num_timesteps
         )
+        return FeatureMaps([torch.cat(result, dim=1)])
 
     def get_backbone_channels(self) -> list:
         """Returns the output channels of this model when used as a backbone.
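PrithviV2 receives the time series with timesteps stacked on the channel axis and splits them back out with einops before calling the encoder. A small standalone illustration of that reshaping (the band count here is an arbitrary example; the real model reads it from `self.bands`):

```python
import torch
from einops import rearrange

bands = 6          # hypothetical band count for illustration
num_timesteps = 4
x = torch.randn(2, num_timesteps * bands, 224, 224)  # [B, (t c), H, W]

# Same rearrange as in PrithviV2.forward: recover an explicit time dimension.
x = rearrange(x, "b (t c) h w -> b c t h w", t=num_timesteps)
print(x.shape)  # torch.Size([2, 6, 4, 224, 224])
```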
rslearn/models/resize_features.py
CHANGED

@@ -1,9 +1,18 @@
 """The ResizeFeatures module."""
 
+from typing import Any
+
 import torch
 
+from rslearn.train.model_context import ModelContext
+
+from .component import (
+    FeatureMaps,
+    IntermediateComponent,
+)
+
 
-class ResizeFeatures(torch.nn.Module):
+class ResizeFeatures(IntermediateComponent):
     """Resize input features to new sizes."""
 
     def __init__(
@@ -30,16 +39,21 @@ class ResizeFeatures(torch.nn.Module):
         )
         self.layers = torch.nn.ModuleList(layers)
 
-    def forward(
-        self, features: list[torch.Tensor], inputs: list[torch.Tensor]
-    ) -> list[torch.Tensor]:
+    def forward(self, intermediates: Any, context: ModelContext) -> FeatureMaps:
         """Resize the input feature maps to new sizes.
 
         Args:
-
-
+            intermediates: the outputs from the previous component, which must be a FeatureMaps.
+            context: the model context.
 
         Returns:
             resized feature maps
         """
-
+        if not isinstance(intermediates, FeatureMaps):
+            raise ValueError("input to ResizeFeatures must be a FeatureMaps")
+
+        feat_maps = intermediates.feature_maps
+        resized_feat_maps = [
+            self.layers[idx](feat_map) for idx, feat_map in enumerate(feat_maps)
+        ]
+        return FeatureMaps(resized_feat_maps)
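ResizeFeatures shows the `IntermediateComponent` side of the new API: validate that the previous component produced a `FeatureMaps`, transform each map, and re-wrap the result. The standalone sketch below follows the same guard-and-map pattern, but uses `F.interpolate` as a stand-in for the per-map layers that ResizeFeatures builds in `__init__`, and a stand-in `FeatureMaps` dataclass rather than the real class.

```python
from dataclasses import dataclass

import torch
import torch.nn.functional as F


@dataclass
class FeatureMaps:
    """Stand-in for rslearn's FeatureMaps."""

    feature_maps: list[torch.Tensor]


def resize_feature_maps(intermediates: object, sizes: list[tuple[int, int]]) -> FeatureMaps:
    # Same guard as ResizeFeatures.forward: the previous output must be a FeatureMaps.
    if not isinstance(intermediates, FeatureMaps):
        raise ValueError("input must be a FeatureMaps")
    resized = [
        F.interpolate(fm, size=size, mode="bilinear", align_corners=False)
        for fm, size in zip(intermediates.feature_maps, sizes)
    ]
    return FeatureMaps(resized)


maps = FeatureMaps([torch.randn(2, 8, 32, 32), torch.randn(2, 16, 16, 16)])
out = resize_feature_maps(maps, [(64, 64), (32, 32)])
print([fm.shape for fm in out.feature_maps])
```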
rslearn/models/sam2_enc.py
CHANGED

@@ -1,14 +1,15 @@
 """SegmentAnything2 encoders."""
 
-from typing import Any
-
 import torch
-import torch.nn as nn
 from sam2.build_sam import build_sam2
 from upath import UPath
 
+from rslearn.train.model_context import ModelContext
+
+from .component import FeatureExtractor, FeatureMaps
+
 
-class SAM2Encoder(nn.Module):
+class SAM2Encoder(FeatureExtractor):
     """SAM2's image encoder."""
 
     def __init__(self, model_identifier: str) -> None:
@@ -84,18 +85,19 @@ class SAM2Encoder(nn.Module):
         del self.model.obj_ptr_proj
         del self.model.image_encoder.neck
 
-    def forward(self,
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Extract multi-scale features from a batch of images.
 
         Args:
-
+            context: the model context. Input dicts must have a key 'image' containing
+                the input for the SAM2 image encoder.
 
         Returns:
-
+            feature maps from the encoder.
         """
-        images = torch.stack([inp["image"] for inp in inputs], dim=0)
+        images = torch.stack([inp["image"] for inp in context.inputs], dim=0)
         features = self.encoder(images)
-        return features
+        return FeatureMaps(features)
 
     def get_backbone_channels(self) -> list[list[int]]:
         """Returns the output channels of the encoder at different scales.
rslearn/models/satlaspretrain.py
CHANGED

@@ -1,13 +1,15 @@
 """SatlasPretrain models."""
 
-from typing import Any
-
 import satlaspretrain_models
 import torch
 import torch.nn.functional as F
 
+from rslearn.train.model_context import ModelContext
+
+from .component import FeatureExtractor, FeatureMaps
+
 
-class SatlasPretrain(torch.nn.Module):
+class SatlasPretrain(FeatureExtractor):
     """SatlasPretrain backbones."""
 
     def __init__(
@@ -64,15 +66,19 @@ class SatlasPretrain(torch.nn.Module):
         else:
             return data
 
-    def forward(self,
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Compute feature maps from the SatlasPretrain backbone.
 
-
-
-
+        Args:
+            context: the model context. Input dicts must contain an "image" key
+                containing the image input to the model.
+
+        Returns:
+            multi-resolution feature maps computed by the model.
         """
-        images = torch.stack([inp["image"] for inp in inputs], dim=0)
-
+        images = torch.stack([inp["image"] for inp in context.inputs], dim=0)
+        feature_maps = self.model(self.maybe_resize(images))
+        return FeatureMaps(feature_maps)
 
     def get_backbone_channels(self) -> list:
         """Returns the output channels of this model when used as a backbone.
rslearn/models/simple_time_series.py
CHANGED

@@ -4,11 +4,15 @@ from typing import Any
 
 import torch
 
+from rslearn.train.model_context import ModelContext
 
-class SimpleTimeSeries(torch.nn.Module):
-    """SimpleTimeSeries wraps another encoder and applies it on an image time series.
+from .component import FeatureExtractor, FeatureMaps
 
-
+
+class SimpleTimeSeries(FeatureExtractor):
+    """SimpleTimeSeries wraps another FeatureExtractor and applies it on an image time series.
+
+    It independently applies the other FeatureExtractor on each image in the time series to
     extract feature maps. It then provides a few ways to combine the features into one
     final feature map:
     - Temporal max pooling.
@@ -19,7 +23,7 @@ class SimpleTimeSeries(torch.nn.Module):
 
     def __init__(
         self,
-        encoder:
+        encoder: FeatureExtractor,
        image_channels: int | None = None,
        op: str = "max",
        groups: list[list[int]] | None = None,
@@ -31,9 +35,9 @@
         """Create a new SimpleTimeSeries.
 
         Args:
-            encoder: the underlying
-                function that returns the output channels, or backbone_channels must be
-
+            encoder: the underlying FeatureExtractor. It must provide get_backbone_channels
+                function that returns the output channels, or backbone_channels must be set.
+                It must output a FeatureMaps.
             image_channels: the number of channels per image of the time series. The
                 input should have multiple images concatenated on the channel axis, so
                 this parameter is used to distinguish the different images.
@@ -179,24 +183,27 @@
 
     def forward(
         self,
-
-    ) ->
+        context: ModelContext,
+    ) -> FeatureMaps:
         """Compute outputs from the backbone.
 
-
-
+        Args:
+            context: the model context. Input dicts must include "image" key containing the image time
                 series to process (with images concatenated on the channel dimension).
+
+        Returns:
+            the FeatureMaps aggregated temporally.
         """
         # First get features of each image.
         # To do so, we need to split up each grouped image into its component images (which have had their channels stacked).
         batched_inputs: list[dict[str, Any]] | None = None
-        n_batch = len(inputs)
+        n_batch = len(context.inputs)
         n_images: int | None = None
 
         if self.image_keys is not None:
             for image_key, image_channels in self.image_keys.items():
                 batched_images = self._get_batched_images(
-                    inputs, image_key, image_channels
+                    context.inputs, image_key, image_channels
                 )
 
                 if batched_inputs is None:
@@ -213,13 +220,20 @@
         else:
             assert self.image_channels is not None
             batched_images = self._get_batched_images(
-                inputs, self.image_key, self.image_channels
+                context.inputs, self.image_key, self.image_channels
             )
             batched_inputs = [{self.image_key: image} for image in batched_images]
             n_images = batched_images.shape[0] // n_batch
 
         assert n_images is not None
 
+        # Now we can apply the underlying FeatureExtractor.
+        # Its output must be a FeatureMaps.
+        encoder_output = self.encoder(batched_inputs)
+        if not isinstance(encoder_output, FeatureMaps):
+            raise ValueError(
+                "output of underlying FeatureExtractor in SimpleTimeSeries must be a FeatureMaps"
+            )
         all_features = [
             feat_map.reshape(
                 n_batch,
@@ -228,7 +242,7 @@
                 feat_map.shape[2],
                 feat_map.shape[3],
             )
-            for feat_map in
+            for feat_map in encoder_output.feature_maps
         ]
 
         # Groups defaults to flattening all the feature maps.
@@ -284,7 +298,7 @@
                 .permute(0, 3, 1, 2)
                 )
             else:
-                raise
+                raise ValueError(f"unknown aggregation op {self.op}")
 
             aggregated_features.append(group_features)
 
@@ -293,4 +307,4 @@
 
             output_features.append(aggregated_features)
 
-        return output_features
+        return FeatureMaps(output_features)
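The core of SimpleTimeSeries is the split / per-image encode / temporal aggregation flow described in its docstring. A toy standalone version of that flow with plain tensors, using a single convolution as a stand-in for the wrapped FeatureExtractor:

```python
import torch

n_batch, n_images, image_channels, size = 2, 4, 3, 32
# Input layout from the docstring: images concatenated on the channel axis.
x = torch.randn(n_batch, n_images * image_channels, size, size)

# Split the stacked channels into individual images folded into the batch dimension.
per_image = x.reshape(n_batch * n_images, image_channels, size, size)

# Toy stand-in for the wrapped FeatureExtractor.
encoder = torch.nn.Conv2d(image_channels, 8, 3, padding=1)
feats = encoder(per_image)  # [n_batch * n_images, 8, size, size]

# Reshape so the time dimension is explicit, then aggregate with temporal max pooling.
feats = feats.reshape(n_batch, n_images, *feats.shape[1:])
pooled = feats.max(dim=1).values  # [n_batch, 8, size, size]
print(pooled.shape)
```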
rslearn/models/singletask.py
CHANGED

@@ -4,6 +4,10 @@ from typing import Any
 
 import torch
 
+from rslearn.train.model_context import ModelContext, ModelOutput
+
+from .component import FeatureExtractor, IntermediateComponent, Predictor
+
 
 class SingleTaskModel(torch.nn.Module):
     """Standard model wrapper.
@@ -14,38 +18,41 @@ class SingleTaskModel(torch.nn.Module):
     outputs and targets from the last module (which also receives the targets).
     """
 
-    def __init__(
+    def __init__(
+        self,
+        encoder: list[FeatureExtractor | IntermediateComponent],
+        decoder: list[IntermediateComponent | Predictor],
+    ):
         """Initialize a new SingleTaskModel.
 
         Args:
-            encoder: modules to compute intermediate feature representations.
-
+            encoder: modules to compute intermediate feature representations. The first
+                module must be a FeatureExtractor, and following modules must be
+                IntermediateComponents.
+            decoder: modules to compute outputs and loss. The last module must be a
+                Predictor, while the previous modules must be IntermediateComponents.
         """
         super().__init__()
-        self.encoder = torch.nn.
+        self.encoder = torch.nn.ModuleList(encoder)
         self.decoder = torch.nn.ModuleList(decoder)
 
     def forward(
         self,
-
+        context: ModelContext,
         targets: list[dict[str, Any]] | None = None,
-    ) ->
+    ) -> ModelOutput:
         """Apply the sequence of modules on the inputs.
 
         Args:
-
+            context: the model context.
             targets: optional list of target dicts
-            info: optional dictionary of info to pass to the last module
 
         Returns:
-
+            the model output.
         """
-
-
+        cur = self.encoder[0](context)
+        for module in self.encoder[1:]:
+            cur = module(cur, context)
         for module in self.decoder[:-1]:
-            cur = module(cur,
-
-        return {
-            "outputs": outputs,
-            "loss_dict": loss_dict,
-        }
+            cur = module(cur, context)
+        return self.decoder[-1](cur, context, targets)
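The rewritten `SingleTaskModel.forward` fixes the component calling convention: the first encoder module receives only the context, every later encoder module and every decoder module except the last receives the previous output plus the context, and the final Predictor also receives the targets. A toy illustration of that composition order using plain callables (not rslearn classes):

```python
from typing import Any


def compose(encoder: list, decoder: list, context: Any, targets: Any = None) -> Any:
    cur = encoder[0](context)  # FeatureExtractor: takes only the context
    for module in encoder[1:]:
        cur = module(cur, context)  # IntermediateComponent: (intermediates, context)
    for module in decoder[:-1]:
        cur = module(cur, context)
    return decoder[-1](cur, context, targets)  # Predictor: also receives targets


# Toy pipeline: extract a value, double it, then "predict".
encoder = [lambda ctx: ctx["x"], lambda cur, ctx: cur * 2]
decoder = [lambda cur, ctx, targets: {"outputs": cur, "loss": 0.0}]
print(compose(encoder, decoder, {"x": 3}))  # {'outputs': 6, 'loss': 0.0}
```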
rslearn/models/ssl4eo_s12.py
CHANGED

@@ -1,12 +1,14 @@
 """SSL4EO-S12 models."""
 
-from typing import Any
-
 import torch
 import torchvision
 
+from rslearn.train.model_context import ModelContext
+
+from .component import FeatureExtractor, FeatureMaps
+
 
-class Ssl4eoS12(torch.nn.Module):
+class Ssl4eoS12(FeatureExtractor):
     """The SSL4EO-S12 family of pretrained models."""
 
     def __init__(
@@ -74,19 +76,22 @@ class Ssl4eoS12(torch.nn.Module):
 
     def forward(
         self,
-
-    ) ->
+        context: ModelContext,
+    ) -> FeatureMaps:
         """Compute outputs from the backbone.
 
         If output_layers is set, then the outputs are multi-scale feature maps;
         otherwise, the model is being used for classification so the outputs are class
         probabilities and the loss.
 
-
-
-            process.
+        Args:
+            context: the model context. Input dicts must include "image" key containing
+                the images to process.
+
+        Returns:
+            feature maps computed by the pre-trained model.
         """
-        x = torch.stack([inp["image"] for inp in inputs], dim=0)
+        x = torch.stack([inp["image"] for inp in context.inputs], dim=0)
         x = self.model.conv1(x)
         x = self.model.bn1(x)
         x = self.model.relu(x)
@@ -97,4 +102,4 @@
         layer3 = self.model.layer3(layer2)
         layer4 = self.model.layer4(layer3)
         all_features = [layer1, layer2, layer3, layer4]
-        return [all_features[idx] for idx in self.output_layers]
+        return FeatureMaps([all_features[idx] for idx in self.output_layers])
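Ssl4eoS12 runs the torchvision ResNet stem and the four residual stages by hand so it can collect intermediate activations and return only the stages selected by `output_layers`. The same multi-scale extraction can be reproduced on a plain torchvision ResNet-50 (random weights here so nothing is downloaded; the maxpool step is the standard ResNet stem and is not shown in the hunk above):

```python
import torch
import torchvision

model = torchvision.models.resnet50(weights=None)
output_layers = [0, 1, 2, 3]  # indices into [layer1, layer2, layer3, layer4]

x = torch.randn(2, 3, 224, 224)
x = model.conv1(x)
x = model.bn1(x)
x = model.relu(x)
x = model.maxpool(x)
layer1 = model.layer1(x)
layer2 = model.layer2(layer1)
layer3 = model.layer3(layer2)
layer4 = model.layer4(layer3)
all_features = [layer1, layer2, layer3, layer4]
feature_maps = [all_features[idx] for idx in output_layers]
# Downsample factors 4, 8, 16, 32 relative to the 224x224 input.
print([fm.shape for fm in feature_maps])
```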
rslearn/models/swin.py
CHANGED

@@ -1,7 +1,5 @@
 """Swin Transformer."""
 
-from typing import Any
-
 import torch
 import torchvision
 from torchvision.models.swin_transformer import (
@@ -13,8 +11,12 @@ from torchvision.models.swin_transformer import (
     Swin_V2_T_Weights,
 )
 
+from rslearn.train.model_context import ModelContext
+
+from .component import FeatureExtractor, FeatureMaps, FeatureVector
+
 
-class Swin(torch.nn.Module):
+class Swin(FeatureExtractor):
     """A Swin Transformer model.
 
     It can either be used stand-alone for classification, or as a feature extractor in
@@ -34,9 +36,12 @@ class Swin(torch.nn.Module):
         Args:
             arch: the architecture, e.g. "swin_v2_b" (default) or "swin_t"
             pretrained: set True to use ImageNet pre-trained weights
-            input_channels: number of input channels (default 3)
+            input_channels: number of input channels (default 3). If not 3, the first
+                layer is updated and will be randomly initialized even if pretrained is
+                set.
             output_layers: list of layers to output, default use as classification
-                model. For feature extraction, [1, 3, 5, 7] is
+                model (output FeatureVector). For feature extraction, [1, 3, 5, 7] is
+                recommended.
             num_outputs: number of output logits, defaults to 1000 which matches the
                 pretrained models.
         """
@@ -130,19 +135,23 @@
 
     def forward(
         self,
-
-    ) ->
+        context: ModelContext,
+    ) -> FeatureVector | FeatureMaps:
         """Compute outputs from the backbone.
 
         If output_layers is set, then the outputs are multi-scale feature maps;
         otherwise, the model is being used for classification so the outputs are class
         probabilities and the loss.
 
-
-
-            process.
+        Args:
+            context: the model context. Input dicts must include "image" key containing
+                the image to process.
+
+        Returns:
+            a FeatureVector if the configured output_layers is None, or a FeatureMaps
+            otherwise containing one feature map per configured output layer.
         """
-        images = torch.stack([inp["image"] for inp in inputs], dim=0)
+        images = torch.stack([inp["image"] for inp in context.inputs], dim=0)
 
         if self.output_layers:
             layer_features = []
@@ -150,7 +159,7 @@
             for layer in self.model.features:
                 x = layer(x)
                 layer_features.append(x.permute(0, 3, 1, 2))
-            return [layer_features[idx] for idx in self.output_layers]
+            return FeatureMaps([layer_features[idx] for idx in self.output_layers])
 
         else:
-            return self.model(images)
+            return FeatureVector(self.model(images))
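Swin's feature-extraction branch iterates torchvision's `model.features` stages, which keep activations channels-last, hence the permute to NCHW before collecting each map. A standalone sketch of that branch on `swin_v2_t` with random weights, using the output layers recommended in the docstring:

```python
import torch
import torchvision

model = torchvision.models.swin_v2_t(weights=None)
output_layers = [1, 3, 5, 7]  # the stage outputs recommended in the docstring

images = torch.randn(2, 3, 256, 256)
layer_features = []
x = images
for layer in model.features:
    x = layer(x)
    # torchvision Swin keeps activations channels-last, so permute to NCHW.
    layer_features.append(x.permute(0, 3, 1, 2))
feature_maps = [layer_features[idx] for idx in output_layers]
print([fm.shape for fm in feature_maps])
```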