rslearn 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. rslearn/arg_parser.py +2 -9
  2. rslearn/config/dataset.py +15 -16
  3. rslearn/dataset/dataset.py +28 -22
  4. rslearn/lightning_cli.py +22 -11
  5. rslearn/main.py +1 -1
  6. rslearn/models/anysat.py +35 -33
  7. rslearn/models/attention_pooling.py +177 -0
  8. rslearn/models/clip.py +5 -2
  9. rslearn/models/component.py +12 -0
  10. rslearn/models/croma.py +11 -3
  11. rslearn/models/dinov3.py +2 -1
  12. rslearn/models/faster_rcnn.py +2 -1
  13. rslearn/models/galileo/galileo.py +58 -31
  14. rslearn/models/module_wrapper.py +6 -1
  15. rslearn/models/molmo.py +4 -2
  16. rslearn/models/olmoearth_pretrain/model.py +206 -51
  17. rslearn/models/olmoearth_pretrain/norm.py +5 -3
  18. rslearn/models/panopticon.py +3 -1
  19. rslearn/models/presto/presto.py +45 -15
  20. rslearn/models/prithvi.py +9 -7
  21. rslearn/models/sam2_enc.py +3 -1
  22. rslearn/models/satlaspretrain.py +4 -1
  23. rslearn/models/simple_time_series.py +43 -17
  24. rslearn/models/ssl4eo_s12.py +19 -14
  25. rslearn/models/swin.py +3 -1
  26. rslearn/models/terramind.py +5 -4
  27. rslearn/train/all_patches_dataset.py +96 -28
  28. rslearn/train/dataset.py +102 -53
  29. rslearn/train/model_context.py +35 -1
  30. rslearn/train/scheduler.py +15 -0
  31. rslearn/train/tasks/classification.py +8 -2
  32. rslearn/train/tasks/detection.py +3 -2
  33. rslearn/train/tasks/multi_task.py +2 -3
  34. rslearn/train/tasks/per_pixel_regression.py +14 -5
  35. rslearn/train/tasks/regression.py +8 -2
  36. rslearn/train/tasks/segmentation.py +13 -4
  37. rslearn/train/tasks/task.py +2 -2
  38. rslearn/train/transforms/concatenate.py +45 -5
  39. rslearn/train/transforms/crop.py +22 -8
  40. rslearn/train/transforms/flip.py +13 -5
  41. rslearn/train/transforms/mask.py +11 -2
  42. rslearn/train/transforms/normalize.py +46 -15
  43. rslearn/train/transforms/pad.py +15 -3
  44. rslearn/train/transforms/resize.py +83 -0
  45. rslearn/train/transforms/select_bands.py +11 -2
  46. rslearn/train/transforms/sentinel1.py +18 -3
  47. rslearn/utils/geometry.py +73 -0
  48. rslearn/utils/jsonargparse.py +66 -0
  49. {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/METADATA +1 -1
  50. {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/RECORD +55 -53
  51. {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/WHEEL +0 -0
  52. {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/entry_points.txt +0 -0
  53. {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/licenses/LICENSE +0 -0
  54. {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/licenses/NOTICE +0 -0
  55. {rslearn-0.0.18.dist-info → rslearn-0.0.20.dist-info}/top_level.txt +0 -0
rslearn/models/galileo/galileo.py CHANGED
@@ -3,6 +3,7 @@
 import math
 import tempfile
 from contextlib import nullcontext
+from datetime import datetime
 from enum import StrEnum
 from typing import cast
 
@@ -411,6 +412,23 @@ class GalileoModel(FeatureExtractor):
             months=months,
         )
 
+    @staticmethod
+    def time_ranges_to_timestamps(
+        time_ranges: list[tuple[datetime, datetime]],
+        device: torch.device,
+    ) -> torch.Tensor:
+        """Turn the time ranges stored in a RasterImage to timestamps accepted by Galileo.
+
+        Galileo only uses the month associated with each timestamp, so we take the midpoint
+        of the time range. For some inputs (e.g. Sentinel 2) we take an image from a specific
+        time so that start_time == end_time == mid_time.
+        """
+        mid_ranges = [t[0] + ((t[1] - t[0]) / 2) for t in time_ranges]
+        # months are indexed 0-11
+        return torch.tensor(
+            [d.month - 1 for d in mid_ranges], dtype=torch.int32, device=device
+        )
+
     def forward(self, context: ModelContext) -> FeatureMaps:
         """Compute feature maps from the Galileo backbone.
 
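
As a sanity check on the new helper, here is a minimal standalone sketch of what time_ranges_to_timestamps computes; the dates are hypothetical, not from rslearn:

from datetime import datetime

import torch

# Hypothetical ranges: an instantaneous Sentinel-2 capture and a two-month composite.
time_ranges = [
    (datetime(2024, 3, 15), datetime(2024, 3, 15)),  # midpoint: March -> index 2
    (datetime(2024, 6, 1), datetime(2024, 8, 1)),    # midpoint: July  -> index 6
]
mid_ranges = [start + (end - start) / 2 for start, end in time_ranges]
months = torch.tensor([d.month - 1 for d in mid_ranges], dtype=torch.int32)
print(months)  # tensor([2, 6], dtype=torch.int32)
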
@@ -418,16 +436,16 @@ class GalileoModel(FeatureExtractor):
             context: the model context. Input dicts should contain keys corresponding to Galileo.input_keys
                 (also documented below) and values are tensors of the following shapes,
                 per input key:
-                "s1": B (T * C) H W
-                "s2": B (T * C) H W
-                "era5": B (T * C) H W (we will average over the H, W dimensions)
-                "tc": B (T * C) H W (we will average over the H, W dimensions)
-                "viirs": B (T * C) H W (we will average over the H, W dimensions)
-                "srtm": B C H W (SRTM has no temporal dimension)
-                "dw": : B C H W (Dynamic World should be averaged over time)
-                "wc": B C H W (WorldCereal has no temporal dimension)
-                "landscan": B C H W (we will average over the H, W dimensions)
-                "latlon": B C H W (we will average over the H, W dimensions)
+                "s1": B C T H W
+                "s2": B C T H W
+                "era5": B C T H W (we will average over the H, W dimensions)
+                "tc": B C T H W (we will average over the H, W dimensions)
+                "viirs": B C T H W (we will average over the H, W dimensions)
+                "srtm": B C 1 H W (SRTM has no temporal dimension)
+                "dw": B C 1 H W (Dynamic World should be averaged over time)
+                "wc": B C 1 H W (WorldCereal has no temporal dimension)
+                "landscan": B C 1 H W (we will average over the H, W dimensions)
+                "latlon": B C 1 H W (we will average over the H, W dimensions)
 
         The output will be an embedding representing the pooled tokens. If there is
         only a single token per h/w dimension (i.e. patch_size == h,w), then we will take
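
The shape change above replaces the flattened (T * C) channel axis with explicit channel and time axes. A small illustration of the new layout and the rearrange the forward pass now applies (the tensor sizes are made up):

import torch
from einops import rearrange

x = torch.randn(2, 3, 4, 32, 32)  # B C T H W, the new input layout
tokens = rearrange(x, "b c t h w -> b h w t c")
print(tokens.shape)  # torch.Size([2, 32, 32, 4, 3])
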
@@ -436,15 +454,35 @@ class GalileoModel(FeatureExtractor):
         If there are many spatial tokens per h/w dimension (patch_size > h,w), then we will
         take a pool of the space_time unmasked tokens (i.e. of the s1 and s2 tokens).
         """
+        space_time_modalities = ["s1", "s2"]
+        time_modalities = ["era5", "tc", "viirs"]
         stacked_inputs = {}
+        months: torch.Tensor | None = None
         for key in context.inputs[0].keys():
            # assume all the keys in an input are consistent
            if key in self.input_keys:
                stacked_inputs[key] = torch.stack(
-                    [inp[key] for inp in context.inputs], dim=0
+                    [inp[key].image for inp in context.inputs], dim=0
                )
+                if key in space_time_modalities + time_modalities:
+                    if months is None:
+                        if context.inputs[0][key].timestamps is not None:
+                            months = torch.stack(
+                                [
+                                    self.time_ranges_to_timestamps(
+                                        inp[key].timestamps,  # type: ignore
+                                        device=stacked_inputs[key].device,
+                                    )
+                                    for inp in context.inputs
+                                ],
+                                dim=0,
+                            )
+
+        if months is not None:
+            stacked_inputs["months"] = months
+
         s_t_channels = []
-        for space_time_modality in ["s1", "s2"]:
+        for space_time_modality in space_time_modalities:
            if space_time_modality not in stacked_inputs:
                continue
            if space_time_modality == "s1":
@@ -452,36 +490,27 @@ class GalileoModel(FeatureExtractor):
            else:
                s_t_channels += self.s_t_channels_s2
            cur = stacked_inputs[space_time_modality]
-            # Check if it's single or multitemporal, and reshape accordingly
-            num_bands = len(S2_BANDS) if space_time_modality == "s2" else len(S1_BANDS)
-            num_timesteps = cur.shape[1] // num_bands
-            cur = rearrange(cur, "b (t c) h w -> b h w t c", t=num_timesteps)
+            cur = rearrange(cur, "b c t h w -> b h w t c")
            stacked_inputs[space_time_modality] = cur
 
        for space_modality in ["srtm", "dw", "wc"]:
            if space_modality not in stacked_inputs:
                continue
+            # take the first (and assumed only) timestep
+            stacked_inputs[space_modality] = stacked_inputs[space_modality][:, :, 0]
            stacked_inputs[space_modality] = rearrange(
                stacked_inputs[space_modality], "b c h w -> b h w c"
            )
 
-        for time_modality in ["era5", "tc", "viirs"]:
+        for time_modality in time_modalities:
            if time_modality not in stacked_inputs:
                continue
            cur = stacked_inputs[time_modality]
-            # Check if it's single or multitemporal, and reshape accordingly
-            num_bands = {
-                "era5": len(ERA5_BANDS),
-                "tc": len(TC_BANDS),
-                "viirs": len(VIIRS_BANDS),
-            }[time_modality]
-            num_timesteps = cur.shape[1] // num_bands
            # take the average over the h, w bands since Galileo
            # treats it as a pixel-timeseries
            cur = rearrange(
-                torch.nanmean(torch.nanmean(cur, dim=-1), dim=-1),
-                "b (t c) -> b t c",
-                t=num_timesteps,
+                torch.nanmean(cur, dim=(-1, -2)),
+                "b c t -> b t c",
            )
            stacked_inputs[time_modality] = cur
 
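
Note that torch.nanmean(cur, dim=(-1, -2)) is not merely a shorter spelling of the nested call it replaces: the nested form averages row means (weighting rows equally regardless of how many NaNs each contains), while the tuple-dim form averages all valid elements at once. A quick check with made-up values:

import torch

x = torch.tensor([[1.0, float("nan")], [3.0, 5.0]])
nested = torch.nanmean(torch.nanmean(x, dim=-1), dim=-1)  # mean of row means
joint = torch.nanmean(x, dim=(-1, -2))  # mean over all non-NaN elements
print(nested)  # tensor(2.5000) -> (1 + 4) / 2
print(joint)   # tensor(3.)     -> (1 + 3 + 5) / 3

For NaN-free inputs the two forms agree, so the swap only changes behavior where NaNs are unevenly distributed across the spatial dimensions.
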
@@ -489,9 +518,8 @@ class GalileoModel(FeatureExtractor):
            if static_modality not in stacked_inputs:
                continue
            cur = stacked_inputs[static_modality]
-            stacked_inputs[static_modality] = torch.nanmean(
-                torch.nanmean(cur, dim=-1), dim=-1
-            )
+            stacked_inputs[static_modality] = torch.nanmean(cur, dim=(2, 3, 4))
+
        galileo_input = self.construct_galileo_input(**stacked_inputs, normalize=True)
        h = galileo_input.s_t_x.shape[1]
        if h < self.patch_size:
@@ -511,7 +539,6 @@ class GalileoModel(FeatureExtractor):
        torch_context = torch.amp.autocast(
            device_type=device.type, dtype=self.autocast_dtype
        )
-
        with torch_context:
            outputs = self.model(
                s_t_x=galileo_input.s_t_x,
rslearn/models/module_wrapper.py CHANGED
@@ -53,7 +53,12 @@ class EncoderModuleWrapper(FeatureExtractor):
        Returns:
            the output from the last wrapped module.
        """
-        images = torch.stack([inp["image"] for inp in context.inputs], dim=0)
+        # take the first and only timestep. Currently no intermediate
+        # components support multitemporal inputs, so if the input is
+        # multitemporal it should be wrapped in a simple time series wrapper.
+        images = torch.stack(
+            [inp["image"].single_ts_to_chw_tensor() for inp in context.inputs], dim=0
+        )
        cur: Any = FeatureMaps([images])
        for m in self.encoder_modules:
            cur = m(cur, context)
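
single_ts_to_chw_tensor is a RasterImage method defined elsewhere in this release (rslearn/train/model_context.py, also changed above); judging by its call sites, it collapses a single-timestep C x T x H x W raster to C x H x W. A plausible sketch of that contract, not the actual rslearn implementation:

import torch

def single_ts_to_chw_tensor(image: torch.Tensor) -> torch.Tensor:
    """Sketch: squeeze the singleton time axis of a C x T x H x W tensor."""
    c, t, h, w = image.shape
    if t != 1:
        raise ValueError(f"expected a single timestep, got {t}")
    return image[:, 0]  # C H W
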
rslearn/models/molmo.py CHANGED
@@ -47,11 +47,13 @@ class Molmo(FeatureExtractor):
        a FeatureMaps. Molmo produces features at one scale, so it will contain one
        feature map that is a Bx24x24x2048 tensor.
        """
-        device = context.inputs[0]["image"].device
+        device = context.inputs[0]["image"].image.device
        molmo_inputs_list = []
        # Process each one so we can isolate just the full image without any crops.
        for inp in context.inputs:
-            image = inp["image"].cpu().numpy().transpose(1, 2, 0)
+            image = (
+                inp["image"].single_ts_to_chw_tensor().cpu().numpy().transpose(1, 2, 0)
+            )
            processed = self.processor.process(
                images=[image],
                text="",
rslearn/models/olmoearth_pretrain/model.py CHANGED
@@ -1,26 +1,27 @@
 """OlmoEarth model wrapper for fine-tuning in rslearn."""
 
 import json
+import warnings
 from contextlib import nullcontext
+from datetime import datetime
 from typing import Any
 
 import torch
 from einops import rearrange
-from olmo_core.config import Config
-from olmo_core.distributed.checkpoint import load_model_and_optim_state
+from olmoearth_pretrain.config import Config, require_olmo_core
 from olmoearth_pretrain.data.constants import Modality
+from olmoearth_pretrain.datatypes import MaskedOlmoEarthSample, MaskValue
 from olmoearth_pretrain.model_loader import (
     ModelID,
     load_model_from_id,
     load_model_from_path,
 )
 from olmoearth_pretrain.nn.flexihelios import Encoder, TokensAndMasks
-from olmoearth_pretrain.train.masking import MaskedOlmoEarthSample, MaskValue
 from upath import UPath
 
 from rslearn.log_utils import get_logger
-from rslearn.models.component import FeatureExtractor, FeatureMaps
-from rslearn.train.model_context import ModelContext
+from rslearn.models.component import FeatureExtractor, FeatureMaps, TokenFeatureMaps
+from rslearn.train.model_context import ModelContext, RasterImage
 
 logger = get_logger(__name__)
 
@@ -60,6 +61,8 @@ class OlmoEarth(FeatureExtractor):
        random_initialization: bool = False,
        embedding_size: int | None = None,
        autocast_dtype: str | None = "bfloat16",
+        token_pooling: bool = True,
+        use_legacy_timestamps: bool = True,
    ):
        """Create a new OlmoEarth model.
 
@@ -83,7 +86,18 @@ class OlmoEarth(FeatureExtractor):
            embedding_size: optional embedding size to report via
                get_backbone_channels (if model_id is not set).
            autocast_dtype: which dtype to use for autocasting, or set None to disable.
+            token_pooling: whether or not to pool the tokens. If True, the output will be BxCxHxW. If False,
+                there will be an extra dimension, N, (BxCxHxWxN) representing the temporal and channel
+                dimensions.
+            use_legacy_timestamps: In our original implementation of OlmoEarth, we applied timestamps starting
+                from 0 (instead of the actual timestamps of the input). The option to do this is preserved
+                for backwards compatibility with finetuned models which were trained against this implementation.
        """
+        if use_legacy_timestamps:
+            warnings.warn(
+                "For new projects, don't use legacy timesteps.", DeprecationWarning
+            )
+
        if (
            sum(
                [
@@ -133,6 +147,8 @@ class OlmoEarth(FeatureExtractor):
        else:
            model = model[part]
        self.model = model
+        self.token_pooling = token_pooling
+        self.use_legacy_timestamps = use_legacy_timestamps
 
    def _load_model_from_checkpoint(
        self, checkpoint_upath: UPath, random_initialization: bool
@@ -143,9 +159,12 @@ class OlmoEarth(FeatureExtractor):
        that contains the distributed checkpoint. This is the format produced by
        pre-training runs in olmoearth_pretrain.
        """
-        # Load the model config and initialize it.
        # We avoid loading the train module here because it depends on running within
        # olmo_core.
+        # Only pull in olmo_core when trying to load a distributed checkpoint to avoid dependency.
+        require_olmo_core("_load_model_from_checkpoint")
+        from olmo_core.distributed.checkpoint import load_model_and_optim_state
+
        with (checkpoint_upath / "config.json").open() as f:
            config_dict = json.load(f)
        model_config = Config.from_dict(config_dict["model"])
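
require_olmo_core comes from olmoearth_pretrain and its exact behavior is not shown in this diff; the surrounding code uses it as a guard before a deferred import of an optional dependency. A generic sketch of that pattern (the names here are illustrative, not the olmoearth_pretrain API):

import importlib.util

def require_dependency(caller: str, module: str = "olmo_core") -> None:
    """Raise a clear error if an optional dependency is not installed."""
    if importlib.util.find_spec(module) is None:
        raise ImportError(
            f"{caller} requires the optional '{module}' package; install it first"
        )
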
@@ -160,58 +179,161 @@ class OlmoEarth(FeatureExtractor):
 
        return model
 
-    def forward(self, context: ModelContext) -> FeatureMaps:
-        """Compute feature maps from the OlmoEarth backbone.
+    @staticmethod
+    def time_ranges_to_timestamps(
+        time_ranges: list[tuple[datetime, datetime]],
+        max_timestamps: int,
+        device: torch.device,
+    ) -> torch.Tensor:
+        """Turn the time ranges stored in a RasterImage to timestamps accepted by OlmoEarth.
+
+        OlmoEarth only uses the month associated with each timestamp, so we take the midpoint
+        of the time range. For some inputs (e.g. Sentinel 2) we take an image from a specific
+        time so that start_time == end_time == mid_time.
+        """
+        timestamps = torch.zeros((max_timestamps, 3), dtype=torch.int32, device=device)
+        mid_ranges = [t[0] + ((t[1] - t[0]) / 2) for t in time_ranges]
+        timestamps[: len(time_ranges), 0] = torch.tensor(
+            [d.day for d in mid_ranges], dtype=torch.int32
+        )
+        # months are indexed 0-11
+        timestamps[: len(time_ranges), 1] = torch.tensor(
+            [d.month - 1 for d in mid_ranges], dtype=torch.int32
+        )
+        timestamps[: len(time_ranges), 2] = torch.tensor(
+            [d.year for d in mid_ranges], dtype=torch.int32
+        )
+        return timestamps
+
+    def _prepare_modality_inputs(
+        self, context: ModelContext
+    ) -> tuple[MaskedOlmoEarthSample, list[str], torch.device]:
+        """Prepare modality tensors and masks for the OlmoEarth model.
+
+        Uses a two-pass approach to ensure all modalities have consistent timestep
+        dimensions for position encoding.
 
        Args:
-            context: the model context. Input dicts should include keys corresponding
-                to the modalities that should be passed to the OlmoEarth model.
+            context: the model context with input tensors.
 
        Returns:
-            a FeatureMaps consisting of one feature map, at 1/patch_size of the input
-                resolution. Embeddings will be pooled across modalities and timesteps.
+            tuple of (sample, present_modalities, device)
        """
        kwargs = {}
        present_modalities = []
        device = None
-        # Handle the case where some modalities are multitemporal and some are not.
-        # We assume all multitemporal modalities have the same number of timesteps.
+
+        # First pass: find global max_timesteps across all modalities and samples
+        # TODO: currently we assume all modalities have the same number of timesteps,
+        # which is not true for all cases, and time series time steps are assumed to
+        # be 1-month apart. It also assumes continuity between available timesteps.
+        # We'll have to fix all that.
        max_timesteps = 1
+        modality_data = {}
+        # we will just store the longest time range
+        # per instance in the batch. This means it may not be
+        # aligned per modality
+        timestamps_per_instance: list[list[tuple[datetime, datetime]]] = [[]] * len(
+            context.inputs
+        )
        for modality in MODALITY_NAMES:
            if modality not in context.inputs[0]:
                continue
            present_modalities.append(modality)
-            cur = torch.stack([inp[modality] for inp in context.inputs], dim=0)
-            device = cur.device
-            # Check if it's single or multitemporal, and reshape accordingly
-            num_bands = Modality.get(modality).num_bands
-            num_timesteps = cur.shape[1] // num_bands
-            max_timesteps = max(max_timesteps, num_timesteps)
-            cur = rearrange(cur, "b (t c) h w -> b h w t c", t=num_timesteps)
+            tensors = []
+            for idx, inp in enumerate(context.inputs):
+                assert isinstance(inp, RasterImage)
+                tensors.append(inp[modality].image)
+                cur_timestamps = inp[modality].timestamps
+                if cur_timestamps is not None and len(cur_timestamps) > len(
+                    timestamps_per_instance[idx]
+                ):
+                    timestamps_per_instance[idx] = cur_timestamps
+            tensors = [inp[modality].image for inp in context.inputs]
+            device = tensors[0].device
+            max_t = max(t.shape[1] for t in tensors)
+            max_timesteps = max(max_timesteps, max_t)
+            modality_data[modality] = (
+                tensors,
+                len(Modality.get(modality).band_sets),
+            )
+
+        # Second pass: pad and process each modality with global max_timesteps
+        for modality in present_modalities:
+            tensors, num_band_sets = modality_data[modality]
+
+            # Pad tensors to max_timesteps and track original timesteps for masking
+            padded = []
+            original_timesteps = []
+            for t in tensors:
+                orig_t = t.shape[1]
+                original_timesteps.append(orig_t)
+                if orig_t < max_timesteps:
+                    pad = torch.zeros(
+                        t.shape[:1] + (max_timesteps - orig_t,) + t.shape[2:],
+                        dtype=t.dtype,
+                        device=device,
+                    )
+                    t = torch.cat([t, pad], dim=1)
+                padded.append(t)
+
+            cur = torch.stack(padded, dim=0)
+            cur = rearrange(cur, "b c t h w -> b h w t c")
            kwargs[modality] = cur
-            # Create mask array which is BHWTS (without channels but with band sets).
-            num_band_sets = len(Modality.get(modality).band_sets)
-            mask_shape = cur.shape[0:4] + (num_band_sets,)
-            mask = (
-                torch.ones(mask_shape, dtype=torch.int32, device=device)
-                * MaskValue.ONLINE_ENCODER.value
+
+            # Create mask: ONLINE_ENCODER for valid, MISSING for padded timesteps
+            b, h, w = cur.shape[0], cur.shape[1], cur.shape[2]
+            mask = torch.full(
+                (b, h, w, max_timesteps, num_band_sets),
+                fill_value=MaskValue.ONLINE_ENCODER.value,
+                dtype=torch.int32,
+                device=device,
            )
+            for sample_idx, orig_t in enumerate(original_timesteps):
+                if orig_t < max_timesteps:
+                    mask[sample_idx, :, :, orig_t:, :] = MaskValue.MISSING.value
            kwargs[f"{modality}_mask"] = mask
 
-        # Timestamps is required.
-        # Note that only months (0 to 11) are used in OlmoEarth position encoding.
-        # For now, we assign same timestamps to all inputs, but later we should handle varying timestamps per input.
-        timestamps = torch.zeros(
-            (len(context.inputs), max_timesteps, 3), dtype=torch.int32, device=device
-        )
-        timestamps[:, :, 0] = 1  # day
-        timestamps[:, :, 1] = torch.arange(max_timesteps, device=device)[
-            None, :
-        ]  # month
-        timestamps[:, :, 2] = 2024  # year
-        kwargs["timestamps"] = timestamps
+        if self.use_legacy_timestamps:
+            # Note that only months (0 to 11) are used in OlmoEarth position encoding.
+            timestamps = torch.zeros(
+                (len(context.inputs), max_timesteps, 3),
+                dtype=torch.int32,
+                device=device,
+            )
+            timestamps[:, :, 0] = 1  # day
+            timestamps[:, :, 1] = torch.arange(max_timesteps, device=device)[
+                None, :
+            ]  # month
+            timestamps[:, :, 2] = 2024  # year
+            kwargs["timestamps"] = timestamps
+        else:
+            if max([len(t) for t in timestamps_per_instance]) == 0:
+                # Timestamps is required.
+                raise ValueError("No inputs had timestamps.")
+            # Note that only months (0 to 11) are used in OlmoEarth position encoding.
+            kwargs["timestamps"] = torch.stack(
+                [
+                    self.time_ranges_to_timestamps(time_range, max_timesteps, device)
+                    for time_range in timestamps_per_instance
+                ],
+                dim=0,
+            )
+
+        return MaskedOlmoEarthSample(**kwargs), present_modalities, device
+
+    def forward(self, context: ModelContext) -> FeatureMaps | TokenFeatureMaps:
+        """Compute feature maps from the OlmoEarth backbone.
+
+        Args:
+            context: the model context. Input dicts should include keys corresponding
+                to the modalities that should be passed to the OlmoEarth model.
 
-        sample = MaskedOlmoEarthSample(**kwargs)
+        Returns:
+            a FeatureMaps consisting of one feature map, at 1/patch_size of the input
+                resolution. Embeddings will be pooled across modalities and timesteps.
+        """
+        sample, present_modalities, device = self._prepare_modality_inputs(context)
 
        # Decide context based on self.autocast_dtype.
        if self.autocast_dtype is None:
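
A standalone sketch of the (day, month-1, year) tensor the new time_ranges_to_timestamps helper builds, zero-padded out to max_timestamps; the dates are hypothetical:

from datetime import datetime

import torch

time_ranges = [(datetime(2024, 5, 10), datetime(2024, 5, 20))]  # midpoint: May 15
max_timestamps = 3
ts = torch.zeros((max_timestamps, 3), dtype=torch.int32)
mid = [a + (b - a) / 2 for a, b in time_ranges]
ts[: len(time_ranges), 0] = torch.tensor([d.day for d in mid], dtype=torch.int32)
ts[: len(time_ranges), 1] = torch.tensor([d.month - 1 for d in mid], dtype=torch.int32)
ts[: len(time_ranges), 2] = torch.tensor([d.year for d in mid], dtype=torch.int32)
print(ts.tolist())  # [[15, 4, 2024], [0, 0, 0], [0, 0, 0]]
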
@@ -222,6 +344,14 @@ class OlmoEarth(FeatureExtractor):
            device_type=device.type, dtype=self.autocast_dtype
        )
 
+        # Check if we can bypass masks (fast_pass=True)
+        missing_tokens = False
+        for modality in present_modalities:
+            modality_mask = getattr(sample, f"{modality}_mask")
+            if torch.any(modality_mask == MaskValue.MISSING.value):
+                missing_tokens = True
+                break
+
        with torch_context:
            # Currently we assume the provided model always returns a TokensAndMasks object.
            tokens_and_masks: TokensAndMasks
@@ -229,7 +359,7 @@ class OlmoEarth(FeatureExtractor):
            # Encoder has a fast_pass argument to indicate mask is not needed.
            tokens_and_masks = self.model(
                sample,
-                fast_pass=True,
+                fast_pass=not missing_tokens,
                patch_size=self.patch_size,
                **self.forward_kwargs,
            )["tokens_and_masks"]
@@ -241,16 +371,41 @@ class OlmoEarth(FeatureExtractor):
 
        # Apply temporal/modality pooling so we just have one feature per patch.
        features = []
-        for modality in present_modalities:
-            modality_features = getattr(tokens_and_masks, modality)
-            # Pool over band sets and timesteps (BHWTSC -> BHWC).
-            pooled = modality_features.mean(dim=[3, 4])
-            # We want BHWC -> BCHW.
-            pooled = rearrange(pooled, "b h w c -> b c h w")
-            features.append(pooled)
-        # Pool over the modalities, so we get one BCHW feature map.
-        pooled = torch.stack(features, dim=0).mean(dim=0)
-        return FeatureMaps([pooled])
+        if self.token_pooling:
+            for modality in present_modalities:
+                modality_features = getattr(tokens_and_masks, modality)  # BHWTSC
+                # If fast_pass is False, we need to mask the missing tokens before pooling.
+                if missing_tokens:
+                    modality_masks = getattr(
+                        tokens_and_masks, f"{modality}_mask"
+                    )  # BHWTS
+                    modality_masks_bool = (
+                        modality_masks != MaskValue.MISSING.value
+                    ).unsqueeze(-1)
+                    count = modality_masks_bool.sum(dim=[3, 4])
+                    # Masked average over band sets and timesteps (BHWTSC -> BHWC).
+                    pooled = (modality_features * modality_masks_bool).sum(
+                        dim=[3, 4]
+                    ) / count.clamp(min=1)
+                else:
+                    # Pool over band sets and timesteps (BHWTSC -> BHWC).
+                    pooled = modality_features.mean(dim=[3, 4])
+                # We want BHWC -> BCHW.
+                pooled = rearrange(pooled, "b h w c -> b c h w")
+                features.append(pooled)
+            # Pool over the modalities, so we get one BCHW feature map.
+            pooled = torch.stack(features, dim=0).mean(dim=0)
+            return FeatureMaps([pooled])
+        else:
+            for modality in present_modalities:
+                modality_features = getattr(tokens_and_masks, modality)
+                # Combine band sets and timesteps into last dim (BHWTSC -> BHWCN).
+                modality_features = rearrange(
+                    modality_features, "b h w t s c -> b c h w (t s)"
+                )
+                features.append(modality_features)
+            pooled = torch.cat(features, dim=-1)
+            return TokenFeatureMaps([pooled])
 
    def get_backbone_channels(self) -> list:
        """Returns the output channels of this model when used as a backbone.
rslearn/train/transforms/normalize.py CHANGED
@@ -64,8 +64,8 @@ class OlmoEarthNormalize(Transform):
            band_norms = self.norm_config[modality_name]
            image = input_dict[modality_name]
            # Keep a set of indices to make sure that we normalize all of them.
-            needed_band_indices = set(range(image.shape[0]))
-            num_timesteps = image.shape[0] // len(cur_band_names)
+            needed_band_indices = set(range(image.image.shape[0]))
+            num_timesteps = image.image.shape[0] // len(cur_band_names)
 
            for band, norm_dict in band_norms.items():
                # If multitemporal, normalize each timestep separately.
@@ -73,7 +73,9 @@ class OlmoEarthNormalize(Transform):
                    band_idx = cur_band_names.index(band) + t * len(cur_band_names)
                    min_val = norm_dict["mean"] - self.std_multiplier * norm_dict["std"]
                    max_val = norm_dict["mean"] + self.std_multiplier * norm_dict["std"]
-                    image[band_idx] = (image[band_idx] - min_val) / (max_val - min_val)
+                    image.image[band_idx] = (image.image[band_idx] - min_val) / (
+                        max_val - min_val
+                    )
                    needed_band_indices.remove(band_idx)
 
            if len(needed_band_indices) > 0:
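
The per-timestep loop above indexes a flattened (T * C) channel axis; for a hypothetical three-band modality, band "b2" at timestep t lands at index t * 3:

cur_band_names = ["b2", "b3", "b4"]  # hypothetical band list
for t in range(2):
    band_idx = cur_band_names.index("b2") + t * len(cur_band_names)
    print(t, band_idx)  # prints 0 0, then 1 3
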
rslearn/models/panopticon.py CHANGED
@@ -142,7 +142,9 @@ class Panopticon(FeatureExtractor):
    def forward(self, context: ModelContext) -> FeatureMaps:
        """Forward pass through the panopticon model."""
        batch_inputs = {
-            key: torch.stack([inp[key] for inp in context.inputs], dim=0)
+            key: torch.stack(
+                [inp[key].single_ts_to_chw_tensor() for inp in context.inputs], dim=0
+            )
            for key in context.inputs[0].keys()
        }
        panopticon_inputs = self.prepare_input(batch_inputs)