rslearn 0.0.18-py3-none-any.whl → 0.0.20-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rslearn/arg_parser.py +2 -9
- rslearn/config/dataset.py +15 -16
- rslearn/dataset/dataset.py +28 -22
- rslearn/lightning_cli.py +22 -11
- rslearn/main.py +1 -1
- rslearn/models/anysat.py +35 -33
- rslearn/models/attention_pooling.py +177 -0
- rslearn/models/clip.py +5 -2
- rslearn/models/component.py +12 -0
- rslearn/models/croma.py +11 -3
- rslearn/models/dinov3.py +2 -1
- rslearn/models/faster_rcnn.py +2 -1
- rslearn/models/galileo/galileo.py +58 -31
- rslearn/models/module_wrapper.py +6 -1
- rslearn/models/molmo.py +4 -2
- rslearn/models/olmoearth_pretrain/model.py +206 -51
- rslearn/models/olmoearth_pretrain/norm.py +5 -3
- rslearn/models/panopticon.py +3 -1
- rslearn/models/presto/presto.py +45 -15
- rslearn/models/prithvi.py +9 -7
- rslearn/models/sam2_enc.py +3 -1
- rslearn/models/satlaspretrain.py +4 -1
- rslearn/models/simple_time_series.py +43 -17
- rslearn/models/ssl4eo_s12.py +19 -14
- rslearn/models/swin.py +3 -1
- rslearn/models/terramind.py +5 -4
- rslearn/train/all_patches_dataset.py +96 -28
- rslearn/train/dataset.py +102 -53
- rslearn/train/model_context.py +35 -1
- rslearn/train/scheduler.py +15 -0
- rslearn/train/tasks/classification.py +8 -2
- rslearn/train/tasks/detection.py +3 -2
- rslearn/train/tasks/multi_task.py +2 -3
- rslearn/train/tasks/per_pixel_regression.py +14 -5
- rslearn/train/tasks/regression.py +8 -2
- rslearn/train/tasks/segmentation.py +13 -4
- rslearn/train/tasks/task.py +2 -2
- rslearn/train/transforms/concatenate.py +45 -5
- rslearn/train/transforms/crop.py +22 -8
- rslearn/train/transforms/flip.py +13 -5
- rslearn/train/transforms/mask.py +11 -2
- rslearn/train/transforms/normalize.py +46 -15
- rslearn/train/transforms/pad.py +15 -3
- rslearn/train/transforms/resize.py +83 -0
- rslearn/train/transforms/select_bands.py +11 -2
- rslearn/train/transforms/sentinel1.py +18 -3
- rslearn/utils/geometry.py +73 -0
- rslearn/utils/jsonargparse.py +66 -0
- {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/METADATA +1 -1
- {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/RECORD +55 -53
- {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/WHEEL +0 -0
- {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/entry_points.txt +0 -0
- {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/licenses/LICENSE +0 -0
- {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/licenses/NOTICE +0 -0
- {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/top_level.txt +0 -0
rslearn/models/presto/presto.py CHANGED

@@ -2,6 +2,7 @@

 import logging
 import tempfile
+from datetime import datetime

 import torch
 from einops import rearrange, repeat
@@ -118,21 +119,21 @@ class Presto(FeatureExtractor):
             of each timestep for that pixel
         """
         bs = [x.shape[0] for x in [s1, s2, era5, srtm] if x is not None]
-
-
+        ts = [x.shape[2] for x in [s1, s2, era5, srtm] if x is not None]
+        hs = [x.shape[3] for x in [s1, s2, era5, srtm] if x is not None]
+        ws = [x.shape[4] for x in [s1, s2, era5, srtm] if x is not None]
         devices = [x.device for x in [s1, s2, era5, srtm] if x is not None]

         assert len(set(bs)) == 1
         assert len(set(hs)) == 1
         assert len(set(ws)) == 1
         assert len(set(devices)) == 1
-
-
+        assert len(set(ts)) == 1
+        b, h, w, t, device = bs[0], hs[0], ws[0], ts[0], devices[0]
         # these values will be initialized as
         # we iterate through the data
         x: torch.Tensor | None = None
         mask: torch.Tensor | None = None
-        t: int | None = None

         for band_group in [
             (s1, s1_bands),
@@ -146,14 +147,7 @@ class Presto(FeatureExtractor):
             else:
                 continue

-
-            if t is None:
-                t = m_t
-            else:
-                if t != m_t:
-                    raise ValueError("inconsistent values for t")
-
-            data = rearrange(data, "b (t c) h w -> b t h w c", t=m_t)
+            data = rearrange(data, "b c t h w -> b t h w c")
             if x is None:
                 x = torch.zeros(b, t, h, w, len(INPUT_PRESTO_BANDS), device=device)
             if mask is None:
@@ -184,6 +178,23 @@ class Presto(FeatureExtractor):
         x = (x + PRESTO_ADD_BY.to(device=device)) / PRESTO_DIV_BY.to(device=device)
         return x, mask, dynamic_world.long(), months.long()

+    @staticmethod
+    def time_ranges_to_timestamps(
+        time_ranges: list[tuple[datetime, datetime]],
+        device: torch.device,
+    ) -> torch.Tensor:
+        """Turn the time ranges stored in a RasterImage to timestamps accepted by Presto.
+
+        Presto only uses the month associated with each timestamp, so we take the midpoint
+        the time range. For some inputs (e.g. Sentinel 2) we take an image from a specific
+        time so that start_time == end_time == mid_time.
+        """
+        mid_ranges = [t[0] + ((t[1] - t[0]) / 2) for t in time_ranges]
+        # months are indexed 0-11
+        return torch.tensor(
+            [d.month - 1 for d in mid_ranges], dtype=torch.int32, device=device
+        )
+
     def forward(self, context: ModelContext) -> FeatureMaps:
         """Compute feature maps from the Presto backbone.

@@ -194,17 +205,36 @@ class Presto(FeatureExtractor):
             a FeatureMaps with one feature map that is at the same resolution as the
             input (since Presto operates per-pixel).
         """
+        time_modalities = ["s1", "s2", "era5"]
         stacked_inputs = {}
         latlons: torch.Tensor | None = None
+        months: torch.Tensor | None = None
         for key in context.inputs[0].keys():
             # assume all the keys in an input are consistent
             if key in self.input_keys:
                 if key == "latlon":
-                    latlons = torch.stack(
+                    latlons = torch.stack(
+                        [inp[key].image for inp in context.inputs], dim=0
+                    )
                 else:
                     stacked_inputs[key] = torch.stack(
-                        [inp[key] for inp in context.inputs], dim=0
+                        [inp[key].image for inp in context.inputs], dim=0
                     )
+                if key in time_modalities:
+                    if months is None:
+                        if context.inputs[0][key].timestamps is not None:
+                            months = torch.stack(
+                                [
+                                    self.time_ranges_to_timestamps(
+                                        inp[key].timestamps,  # type: ignore
+                                        device=stacked_inputs[key].device,
+                                    )
+                                    for inp in context.inputs
+                                ],
+                                dim=0,
+                            )
+        if months is not None:
+            stacked_inputs["months"] = months

         (
             x,
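The new `time_ranges_to_timestamps` helper reduces each `(start, end)` time range to the month of its midpoint. The following is a minimal standalone sketch of that midpoint-to-month computation, for illustration only; the real helper is the `@staticmethod` shown in the diff above, and the example dates are made up:

```python
from datetime import datetime

import torch

# Illustrative time ranges; Presto only consumes the 0-indexed month of the midpoint.
time_ranges = [
    (datetime(2022, 1, 1), datetime(2022, 3, 1)),    # midpoint in January -> 0
    (datetime(2022, 7, 15), datetime(2022, 7, 15)),  # start == end == mid -> 6
]
mid_points = [start + (end - start) / 2 for start, end in time_ranges]
months = torch.tensor([d.month - 1 for d in mid_points], dtype=torch.int32)
print(months)  # tensor([0, 6], dtype=torch.int32)
```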
rslearn/models/prithvi.py CHANGED

@@ -144,13 +144,15 @@ class PrithviV2(FeatureExtractor):
         """Process individual modality data.

         Args:
-            data: Input tensor of shape [B, C, H, W]
+            data: Input tensor of shape [B, C, T, H, W]

         Returns:
-            list of tensors of shape [B, C, H, W]
+            list of tensors of shape [B, C, T, H, W]
         """
         # Get original dimensions
-
+        B, C, T, H, W = data.shape
+        data = rearrange(data, "b c t h w -> b (c t) h w")
+        original_height = H
         new_height = self.patch_size if original_height == 1 else self.image_resolution
         data = F.interpolate(
             data,
@@ -158,6 +160,7 @@ class PrithviV2(FeatureExtractor):
             mode="bilinear",
             align_corners=False,
         )
+        data = rearrange(data, "b (c t) h w -> b c t h w", c=C, t=T)
         return data

     def forward(self, context: ModelContext) -> FeatureMaps:
@@ -171,17 +174,16 @@ class PrithviV2(FeatureExtractor):
             a FeatureMaps with one map of shape [B, H/p_s, W/p_s, 11*1024] that contains stacked
             feature maps across the 11 transformer blocks.
         """
-        x
+        # x has shape BCTHW
+        x = torch.stack([inp[self.INPUT_KEY].image for inp in context.inputs], dim=0)
         x = self._resize_data(x)
-        num_timesteps = x.shape[1] // len(self.bands)
-        x = rearrange(x, "b (t c) h w -> b c t h w", t=num_timesteps)
         features = self.model.encoder.forward_features(x)
         # prepare_features_for_image_model was slightly modified since we already
         # know the number of timesteps and don't need to recompute it.
         # in addition we average along the time dimension (instead of concatenating)
         # to keep the embeddings reasonably sized.
         result = self.model.encoder.prepare_features_for_image_model(
-            features,
+            features, x.shape[2]
         )
         return FeatureMaps([torch.cat(result, dim=1)])

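The `_resize_data` change folds the time axis into the channel axis so that 2D bilinear interpolation can be applied, then unfolds it again. Below is a minimal sketch of that pattern with assumed shapes; it mirrors the rearrange/interpolate calls in the diff but is not the rslearn API itself:

```python
import torch
import torch.nn.functional as F
from einops import rearrange

B, C, T, H, W = 2, 6, 3, 4, 4
x = torch.randn(B, C, T, H, W)

# Fold T into channels, resize spatially, then restore the [B, C, T, H, W] layout.
flat = rearrange(x, "b c t h w -> b (c t) h w")
resized = F.interpolate(flat, size=(224, 224), mode="bilinear", align_corners=False)
out = rearrange(resized, "b (c t) h w -> b c t h w", c=C, t=T)
print(out.shape)  # torch.Size([2, 6, 3, 224, 224])
```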
rslearn/models/sam2_enc.py CHANGED

@@ -95,7 +95,9 @@ class SAM2Encoder(FeatureExtractor):
         Returns:
             feature maps from the encoder.
         """
-        images = torch.stack(
+        images = torch.stack(
+            [inp["image"].single_ts_to_chw_tensor() for inp in context.inputs], dim=0
+        )
         features = self.encoder(images)
         return FeatureMaps(features)

rslearn/models/satlaspretrain.py CHANGED

@@ -76,7 +76,10 @@ class SatlasPretrain(FeatureExtractor):
         Returns:
             multi-resolution feature maps computed by the model.
         """
-
+        # take the first (assumed to be only) timestep
+        images = torch.stack(
+            [inp["image"].single_ts_to_chw_tensor() for inp in context.inputs], dim=0
+        )
         feature_maps = self.model(self.maybe_resize(images))
         return FeatureMaps(feature_maps)

rslearn/models/simple_time_series.py CHANGED

@@ -3,8 +3,9 @@
 from typing import Any

 import torch
+from einops import rearrange

-from rslearn.train.model_context import ModelContext
+from rslearn.train.model_context import ModelContext, RasterImage

 from .component import FeatureExtractor, FeatureMaps

@@ -163,23 +164,44 @@ class SimpleTimeSeries(FeatureExtractor):

     def _get_batched_images(
         self, input_dicts: list[dict[str, Any]], image_key: str, image_channels: int
-    ) ->
+    ) -> list[RasterImage]:
         """Collect and reshape images across input dicts.

         The BTCHW image time series are reshaped to (B*T)CHW so they can be passed to
         the forward pass of a per-image (unitemporal) model.
         """
         images = torch.stack(
-            [input_dict[image_key] for input_dict in input_dicts], dim=0
+            [input_dict[image_key].image for input_dict in input_dicts], dim=0
+        )  # B, C, T, H, W
+        timestamps = [input_dict[image_key].timestamps for input_dict in input_dicts]
+        # if image channels is not equal to the actual number of channels, then
+        # then every N images should be batched together. For example, if the
+        # number of input channels c == 2, and image_channels == 4, then we
+        # want to pass 2 timesteps to the model.
+        # TODO is probably to make this behaviour clearer but lets leave it like
+        # this for now to not break things.
+        num_timesteps = images.shape[1] // image_channels
+        batched_timesteps = images.shape[2] // num_timesteps
+        images = rearrange(
+            images,
+            "b c (b_t k_t) h w -> (b b_t) c k_t h w",
+            b_t=batched_timesteps,
+            k_t=num_timesteps,
         )
-
-
-
-
-
-
-
-
+        if timestamps[0] is None:
+            new_timestamps = [None] * images.shape[0]
+        else:
+            # we also need to split the timestamps
+            new_timestamps = []
+            for t in timestamps:
+                for i in range(batched_timesteps):
+                    new_timestamps.append(
+                        t[i * num_timesteps : (i + 1) * num_timesteps]
+                    )
+        return [
+            RasterImage(image=image, timestamps=timestamps)
+            for image, timestamps in zip(images, new_timestamps)
+        ]  # C, T, H, W

     def forward(
         self,
@@ -208,8 +230,8 @@ class SimpleTimeSeries(FeatureExtractor):

         if batched_inputs is None:
             batched_inputs = [{} for _ in batched_images]
-            n_images = batched_images
-        elif n_images != batched_images
+            n_images = len(batched_images) // n_batch
+        elif n_images != len(batched_images) // n_batch:
             raise ValueError(
                 "expected all modalities to have the same number of timesteps"
             )
@@ -223,13 +245,18 @@ class SimpleTimeSeries(FeatureExtractor):
                 context.inputs, self.image_key, self.image_channels
             )
             batched_inputs = [{self.image_key: image} for image in batched_images]
-            n_images = batched_images
+            n_images = len(batched_images) // n_batch

         assert n_images is not None
-
         # Now we can apply the underlying FeatureExtractor.
         # Its output must be a FeatureMaps.
-
+        assert batched_inputs is not None
+        encoder_output = self.encoder(
+            ModelContext(
+                inputs=batched_inputs,
+                metadatas=context.metadatas,
+            )
+        )
         if not isinstance(encoder_output, FeatureMaps):
             raise ValueError(
                 "output of underlying FeatureExtractor in SimpleTimeSeries must be a FeatureMaps"
@@ -244,7 +271,6 @@ class SimpleTimeSeries(FeatureExtractor):
             )
             for feat_map in encoder_output.feature_maps
         ]
-
         # Groups defaults to flattening all the feature maps.
         groups = self.groups
         if not groups:
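The rearrange in `_get_batched_images` splits a BCTHW series along the time axis into groups, so each group can be handed to the wrapped per-image encoder as its own sample. The following is a minimal sketch with assumed group sizes (the actual sizes are derived from `image_channels` as in the diff above):

```python
import torch
from einops import rearrange

B, C, T, H, W = 1, 2, 6, 8, 8
timesteps_per_group = 2                # timesteps handed to the encoder at once
num_groups = T // timesteps_per_group  # groups produced per original sample

images = torch.randn(B, C, T, H, W)
# Split the time axis into num_groups chunks and fold the chunk index into the batch.
grouped = rearrange(
    images,
    "b c (b_t k_t) h w -> (b b_t) c k_t h w",
    b_t=num_groups,
    k_t=timesteps_per_group,
)
print(grouped.shape)  # torch.Size([3, 2, 2, 8, 8])
```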
rslearn/models/ssl4eo_s12.py CHANGED

@@ -13,7 +13,7 @@ class Ssl4eoS12(FeatureExtractor):

     def __init__(
         self,
-        backbone_ckpt_path: str,
+        backbone_ckpt_path: str | None,
         arch: str = "resnet50",
         output_layers: list[int] = [0, 1, 2, 3],
     ) -> None:
@@ -39,19 +39,22 @@ class Ssl4eoS12(FeatureExtractor):
         else:
             raise ValueError(f"unknown SSL4EO-S12 architecture {arch}")

-
-
-
-
-
-
-
-
-
-
-
-                f"warning: got missing_keys={missing_keys}, unexpected_keys={unexpected_keys} when loading SSL4EO-S12 state dict"
+        if backbone_ckpt_path is not None:
+            state_dict = torch.load(backbone_ckpt_path, weights_only=True)
+            state_dict = state_dict["teacher"]
+            prefix = "module.backbone."
+            state_dict = {
+                k[len(prefix) :]: v
+                for k, v in state_dict.items()
+                if k.startswith(prefix)
+            }
+            missing_keys, unexpected_keys = self.model.load_state_dict(
+                state_dict, strict=False
             )
+            if missing_keys or unexpected_keys:
+                print(
+                    f"warning: got missing_keys={missing_keys}, unexpected_keys={unexpected_keys} when loading SSL4EO-S12 state dict"
+                )

     def get_backbone_channels(self) -> list[tuple[int, int]]:
         """Returns the output channels of this model when used as a backbone.
@@ -91,7 +94,9 @@ class Ssl4eoS12(FeatureExtractor):
         Returns:
             feature maps computed by the pre-trained model.
         """
-        x = torch.stack(
+        x = torch.stack(
+            [inp["image"].single_ts_to_chw_tensor() for inp in context.inputs], dim=0
+        )
         x = self.model.conv1(x)
         x = self.model.bn1(x)
         x = self.model.relu(x)
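The new checkpoint-loading branch keeps only the `teacher` backbone weights by stripping the `module.backbone.` prefix before calling `load_state_dict`. A minimal sketch of that dictionary filtering follows; the keys and tensor shapes are made up for illustration:

```python
import torch

state_dict = {
    "module.backbone.conv1.weight": torch.zeros(64, 13, 7, 7),
    "module.head.fc.weight": torch.zeros(10, 2048),  # dropped: not under the prefix
}
prefix = "module.backbone."
# Keep only keys under the prefix and remove the prefix so names match the backbone.
backbone_sd = {
    k[len(prefix):]: v for k, v in state_dict.items() if k.startswith(prefix)
}
print(list(backbone_sd))  # ['conv1.weight']
```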
rslearn/models/swin.py CHANGED

@@ -151,7 +151,9 @@ class Swin(FeatureExtractor):
             a FeatureVector if the configured output_layers is None, or a FeatureMaps
             otherwise containing one feature map per configured output layer.
         """
-        images = torch.stack(
+        images = torch.stack(
+            [inp["image"].single_ts_to_chw_tensor() for inp in context.inputs], dim=0
+        )

         if self.output_layers:
             layer_features = []
rslearn/models/terramind.py CHANGED

@@ -143,7 +143,8 @@ class Terramind(FeatureExtractor):
             if modality not in context.inputs[0]:
                 continue
             cur = torch.stack(
-                [inp[modality] for inp in context.inputs],
+                [inp[modality].single_ts_to_chw_tensor() for inp in context.inputs],
+                dim=0,
             )  # (B, C, H, W)
             if self.do_resizing and (
                 cur.shape[2] != IMAGE_SIZE or cur.shape[3] != IMAGE_SIZE
@@ -219,7 +220,7 @@ class TerramindNormalize(Transform):
         Returns:
             The normalized image.
         """
-        images = image.float()  # (C, H, W)
+        images = image.float()  # (C, 1, H, W)
         if images.shape[0] % len(means) != 0:
             raise ValueError(
                 f"the number of image channels {images.shape[0]} is not multiple of expected number of bands {len(means)}"
@@ -247,8 +248,8 @@
             band_info = PRETRAINED_BANDS[modality]
             means = [band_info[band][0] for band in band_info]
             stds = [band_info[band][1] for band in band_info]
-            input_dict[modality] = self.apply_image(
-                input_dict[modality],
+            input_dict[modality].image = self.apply_image(
+                input_dict[modality].image,
                 means,
                 stds,
             )
rslearn/train/all_patches_dataset.py CHANGED

@@ -9,8 +9,8 @@ import shapely
 import torch

 from rslearn.dataset import Window
-from rslearn.train.dataset import ModelDataset
-from rslearn.train.model_context import SampleMetadata
+from rslearn.train.dataset import DataInput, ModelDataset
+from rslearn.train.model_context import RasterImage, SampleMetadata
 from rslearn.utils.geometry import PixelBounds, STGeometry


@@ -34,22 +34,28 @@ def get_window_patch_options(
     bottommost patches may extend beyond the provided bounds.
     """
     # We stride the patches by patch_size - overlap_size until the last patch.
+    # We handle the first patch with a special case to ensure it is always used.
     # We handle the last patch with a special case to ensure it does not exceed the
     # window bounds. Instead, it may overlap the previous patch.
-    cols = list(
+    cols = [bounds[0]] + list(
         range(
-            bounds[0],
+            bounds[0] + patch_size[0],
             bounds[2] - patch_size[0],
             patch_size[0] - overlap_size[0],
         )
-    )
-    rows = list(
+    )
+    rows = [bounds[1]] + list(
         range(
-            bounds[1],
+            bounds[1] + patch_size[1],
             bounds[3] - patch_size[1],
             patch_size[1] - overlap_size[1],
         )
-    )
+    )
+    # Add last patches only if the input is larger than one patch.
+    if bounds[2] - patch_size[0] > bounds[0]:
+        cols.append(bounds[2] - patch_size[0])
+    if bounds[3] - patch_size[1] > bounds[1]:
+        rows.append(bounds[3] - patch_size[1])

     patch_bounds: list[PixelBounds] = []
     for col in cols:
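The updated `get_window_patch_options` always includes the first patch, strides interior patches by `patch_size - overlap_size`, and appends a final patch clamped to the window bounds whenever the window is larger than one patch. A one-dimensional sketch of that start-coordinate logic (the helper name here is hypothetical):

```python
def patch_starts(lo: int, hi: int, patch: int, overlap: int) -> list[int]:
    # First patch, then strided interior patches, then a last patch clamped to hi.
    starts = [lo] + list(range(lo + patch, hi - patch, patch - overlap))
    if hi - patch > lo:
        starts.append(hi - patch)
    return starts

print(patch_starts(0, 100, 32, 0))  # [0, 32, 64, 68]
```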
@@ -62,13 +68,17 @@ def pad_slice_protect(
     raw_inputs: dict[str, Any],
     passthrough_inputs: dict[str, Any],
     patch_size: tuple[int, int],
+    inputs: dict[str, DataInput],
 ) -> tuple[dict[str, Any], dict[str, Any]]:
     """Pad tensors in-place by patch size to protect slicing near right/bottom edges.

+    The padding is scaled based on each input's resolution_factor.
+
     Args:
         raw_inputs: the raw inputs to pad.
         passthrough_inputs: the passthrough inputs to pad.
-        patch_size: the size of the patches to extract.
+        patch_size: the size of the patches to extract (at window resolution).
+        inputs: the DataInput definitions, used to get resolution_factor per input.

     Returns:
         a tuple of (raw_inputs, passthrough_inputs).
@@ -77,12 +87,42 @@
         for input_name, value in list(d.items()):
             if not isinstance(value, torch.Tensor):
                 continue
+            # Get resolution scale for this input
+            rf = inputs[input_name].resolution_factor
+            scale = rf.numerator / rf.denominator
+            # Scale the padding amount
+            scaled_pad_x = int(patch_size[0] * scale)
+            scaled_pad_y = int(patch_size[1] * scale)
             d[input_name] = torch.nn.functional.pad(
-                value, pad=(0,
+                value, pad=(0, scaled_pad_x, 0, scaled_pad_y)
             )
     return raw_inputs, passthrough_inputs


+def crop_tensor_or_rasterimage(
+    x: torch.Tensor | RasterImage, start: tuple[int, int], end: tuple[int, int]
+) -> torch.Tensor | RasterImage:
+    """Crop a tensor or a RasterImage."""
+    if isinstance(x, torch.Tensor):
+        # Crop the CHW tensor with scaled coordinates.
+        return x[
+            :,
+            start[1] : end[1],
+            start[0] : end[0],
+        ].clone()
+    else:
+        # Crop the CTHW tensor with scaled coordinates.
+        return RasterImage(
+            x.image[
+                :,
+                :,
+                start[1] : end[1],
+                start[0] : end[0],
+            ].clone(),
+            x.timestamps,
+        )
+
+
 class IterableAllPatchesDataset(torch.utils.data.IterableDataset):
     """This wraps a ModelDataset to iterate over all patches in that dataset.

@@ -123,6 +163,7 @@ class IterableAllPatchesDataset(torch.utils.data.IterableDataset):
         self.rank = rank
         self.world_size = world_size
         self.windows = self.dataset.get_dataset_examples()
+        self.inputs = dataset.inputs

     def set_name(self, name: str) -> None:
         """Sets dataset name.
@@ -235,8 +276,10 @@ class IterableAllPatchesDataset(torch.utils.data.IterableDataset):

         # For simplicity, pad tensors by patch size to ensure that any patch bounds
         # extending outside the window bounds will not have issues when we slice
-        # the tensors later.
-        pad_slice_protect(
+        # the tensors later. Padding is scaled per-input based on resolution_factor.
+        pad_slice_protect(
+            raw_inputs, passthrough_inputs, self.patch_size, self.inputs
+        )

         # Now iterate over the patches and extract/yield the crops.
         # Note that, in case user is leveraging RslearnWriter, it is important that
@@ -258,16 +301,26 @@ class IterableAllPatchesDataset(torch.utils.data.IterableDataset):
             )

             # Define a helper function to handle each input dict.
+            # Crop coordinates are scaled based on each input's resolution_factor.
             def crop_input_dict(d: dict[str, Any]) -> dict[str, Any]:
                 cropped = {}
                 for input_name, value in d.items():
-                    if isinstance(value, torch.Tensor):
-                        #
-
-
-
-
-
+                    if isinstance(value, torch.Tensor | RasterImage):
+                        # Get resolution scale for this input
+                        rf = self.inputs[input_name].resolution_factor
+                        scale = rf.numerator / rf.denominator
+                        # Scale the crop coordinates
+                        scaled_start = (
+                            int(start_offset[0] * scale),
+                            int(start_offset[1] * scale),
+                        )
+                        scaled_end = (
+                            int(end_offset[0] * scale),
+                            int(end_offset[1] * scale),
+                        )
+                        cropped[input_name] = crop_tensor_or_rasterimage(
+                            value, scaled_start, scaled_end
+                        )
                     elif isinstance(value, list):
                         cropped[input_name] = [
                             feat
@@ -348,6 +401,7 @@ class InMemoryAllPatchesDataset(torch.utils.data.Dataset):
             round(self.patch_size[1] * overlap_ratio),
         )
         self.windows = self.dataset.get_dataset_examples()
+        self.inputs = dataset.inputs
         self.window_cache: dict[
             int, tuple[dict[str, Any], dict[str, Any], SampleMetadata]
         ] = {}
@@ -378,27 +432,41 @@
             return self.window_cache[index]

         raw_inputs, passthrough_inputs, metadata = self.dataset.get_raw_inputs(index)
-        pad_slice_protect(raw_inputs, passthrough_inputs, self.patch_size)
+        pad_slice_protect(raw_inputs, passthrough_inputs, self.patch_size, self.inputs)

         self.window_cache[index] = (raw_inputs, passthrough_inputs, metadata)
         return self.window_cache[index]

-    @staticmethod
     def _crop_input_dict(
+        self,
         d: dict[str, Any],
         start_offset: tuple[int, int],
         end_offset: tuple[int, int],
         cur_geom: STGeometry,
     ) -> dict[str, Any]:
-        """Crop a dictionary of inputs to the given bounds.
+        """Crop a dictionary of inputs to the given bounds.
+
+        Crop coordinates are scaled based on each input's resolution_factor.
+        """
         cropped = {}
         for input_name, value in d.items():
-            if isinstance(value, torch.Tensor):
-
-
-
-
-
+            if isinstance(value, torch.Tensor | RasterImage):
+                # Get resolution scale for this input
+                rf = self.inputs[input_name].resolution_factor
+                scale = rf.numerator / rf.denominator
+                # Scale the crop coordinates
+                scaled_start = (
+                    int(start_offset[0] * scale),
+                    int(start_offset[1] * scale),
+                )
+                scaled_end = (
+                    int(end_offset[0] * scale),
+                    int(end_offset[1] * scale),
+                )
+                cropped[input_name] = crop_tensor_or_rasterimage(
+                    value, scaled_start, scaled_end
+                )
+
             elif isinstance(value, list):
                 cropped[input_name] = [
                     feat for feat in value if cur_geom.intersects(feat.geometry)