rslearn 0.0.19__tar.gz → 0.0.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rslearn-0.0.19/rslearn.egg-info → rslearn-0.0.20}/PKG-INFO +1 -1
- {rslearn-0.0.19 → rslearn-0.0.20}/pyproject.toml +1 -1
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/anysat.py +35 -33
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/clip.py +5 -2
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/croma.py +11 -3
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/dinov3.py +2 -1
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/faster_rcnn.py +2 -1
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/galileo/galileo.py +58 -31
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/module_wrapper.py +6 -1
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/molmo.py +4 -2
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/olmoearth_pretrain/model.py +93 -29
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/olmoearth_pretrain/norm.py +5 -3
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/panopticon.py +3 -1
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/presto/presto.py +45 -15
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/prithvi.py +9 -7
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/sam2_enc.py +3 -1
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/satlaspretrain.py +4 -1
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/simple_time_series.py +36 -16
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/ssl4eo_s12.py +19 -14
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/swin.py +3 -1
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/terramind.py +5 -4
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/all_patches_dataset.py +34 -14
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/dataset.py +66 -10
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/model_context.py +35 -1
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/tasks/classification.py +8 -2
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/tasks/detection.py +3 -2
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/tasks/multi_task.py +2 -3
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/tasks/per_pixel_regression.py +14 -5
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/tasks/regression.py +8 -2
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/tasks/segmentation.py +13 -4
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/tasks/task.py +2 -2
- rslearn-0.0.20/rslearn/train/transforms/concatenate.py +89 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/transforms/crop.py +22 -8
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/transforms/flip.py +13 -5
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/transforms/mask.py +11 -2
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/transforms/normalize.py +46 -15
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/transforms/pad.py +15 -3
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/transforms/resize.py +18 -9
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/transforms/select_bands.py +11 -2
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/transforms/sentinel1.py +18 -3
- {rslearn-0.0.19 → rslearn-0.0.20/rslearn.egg-info}/PKG-INFO +1 -1
- rslearn-0.0.19/rslearn/train/transforms/concatenate.py +0 -49
- {rslearn-0.0.19 → rslearn-0.0.20}/LICENSE +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/NOTICE +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/README.md +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/arg_parser.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/config/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/config/dataset.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/const.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/aws_landsat.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/aws_open_data.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/aws_sentinel1.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/climate_data_store.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/copernicus.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/data_source.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/earthdaily.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/earthdata_srtm.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/eurocrops.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/gcp_public_data.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/google_earth_engine.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/local_files.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/openstreetmap.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/planet.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/planet_basemap.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/planetary_computer.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/usda_cdl.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/usgs_landsat.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/utils.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/vector_source.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/worldcereal.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/worldcover.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/worldpop.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/data_sources/xyz_tiles.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/dataset/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/dataset/add_windows.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/dataset/dataset.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/dataset/handler_summaries.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/dataset/manage.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/dataset/materialize.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/dataset/remap.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/dataset/storage/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/dataset/storage/file.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/dataset/storage/storage.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/dataset/window.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/lightning_cli.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/log_utils.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/main.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/attention_pooling.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/clay/clay.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/clay/configs/metadata.yaml +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/component.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/concatenate_features.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/conv.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/detr/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/detr/box_ops.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/detr/detr.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/detr/matcher.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/detr/position_encoding.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/detr/transformer.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/detr/util.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/feature_center_crop.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/fpn.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/galileo/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/galileo/single_file_galileo.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/multitask.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/olmoearth_pretrain/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/panopticon_data/sensors/drone.yaml +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/panopticon_data/sensors/enmap.yaml +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/panopticon_data/sensors/goes.yaml +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/panopticon_data/sensors/himawari.yaml +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/panopticon_data/sensors/intuition.yaml +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/panopticon_data/sensors/landsat8.yaml +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/panopticon_data/sensors/modis_terra.yaml +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/panopticon_data/sensors/qb2_ge1.yaml +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/panopticon_data/sensors/sentinel1.yaml +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/panopticon_data/sensors/sentinel2.yaml +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/panopticon_data/sensors/superdove.yaml +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/panopticon_data/sensors/wv23.yaml +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/pick_features.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/pooling_decoder.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/presto/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/presto/single_file_presto.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/resize_features.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/singletask.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/task_embedding.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/trunk.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/unet.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/upsample.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/models/use_croma.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/py.typed +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/template_params.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/tile_stores/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/tile_stores/default.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/tile_stores/tile_store.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/callbacks/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/callbacks/adapters.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/callbacks/freeze_unfreeze.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/callbacks/gradients.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/callbacks/peft.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/data_module.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/lightning_module.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/optimizer.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/prediction_writer.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/scheduler.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/tasks/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/tasks/embedding.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/transforms/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/train/transforms/transform.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/__init__.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/array.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/feature.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/fsspec.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/geometry.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/get_utm_ups_crs.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/grid_index.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/jsonargparse.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/mp.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/raster_format.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/rtree_index.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/spatial_index.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/sqlite_index.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/time.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn/utils/vector_format.py +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn.egg-info/SOURCES.txt +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn.egg-info/dependency_links.txt +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn.egg-info/entry_points.txt +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn.egg-info/requires.txt +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/rslearn.egg-info/top_level.txt +0 -0
- {rslearn-0.0.19 → rslearn-0.0.20}/setup.cfg +0 -0
**rslearn/models/anysat.py**

```diff
@@ -4,6 +4,8 @@ This code loads the AnySat model from torch hub. See
 https://github.com/gastruc/AnySat for applicable license and copyright information.
 """
 
+from datetime import datetime
+
 import torch
 from einops import rearrange
 
@@ -53,7 +55,6 @@ class AnySat(FeatureExtractor):
         self,
         modalities: list[str],
         patch_size_meters: int,
-        dates: dict[str, list[int]],
         output: str = "patch",
         output_modality: str | None = None,
         hub_repo: str = "gastruc/anysat",
@@ -85,14 +86,6 @@ class AnySat(FeatureExtractor):
             if m not in MODALITY_RESOLUTIONS:
                 raise ValueError(f"Invalid modality: {m}")
 
-        if not all(m in TIME_SERIES_MODALITIES for m in dates.keys()):
-            raise ValueError("`dates` keys must be time-series modalities only.")
-        for m in modalities:
-            if m in TIME_SERIES_MODALITIES and m not in dates:
-                raise ValueError(
-                    f"Missing required dates for time-series modality '{m}'."
-                )
-
         if patch_size_meters % 10 != 0:
             raise ValueError(
                 "In AnySat, `patch_size` is in meters and must be a multiple of 10."
@@ -106,7 +99,6 @@ class AnySat(FeatureExtractor):
 
         self.modalities = modalities
        self.patch_size_meters = int(patch_size_meters)
-        self.dates = dates
         self.output = output
         self.output_modality = output_modality
 
@@ -119,6 +111,20 @@ class AnySat(FeatureExtractor):
         )
         self._embed_dim = 768  # base width, 'dense' returns 2x
 
+    @staticmethod
+    def time_ranges_to_doy(
+        time_ranges: list[tuple[datetime, datetime]],
+        device: torch.device,
+    ) -> torch.Tensor:
+        """Turn the time ranges stored in a RasterImage to timestamps accepted by AnySat.
+
+        AnySat uses the day of year (doy) for each timestamp, so we take the midpoint of
+        the time range. For some inputs (e.g. Sentinel 2) we take an image from a specific
+        time so that start_time == end_time == mid_time.
+        """
+        doys = [(t[0] + ((t[1] - t[0]) / 2)).timetuple().tm_yday for t in time_ranges]
+        return torch.tensor(doys, dtype=torch.int32, device=device)
+
     def forward(self, context: ModelContext) -> FeatureMaps:
         """Forward pass for the AnySat model.
 
@@ -139,17 +145,29 @@ class AnySat(FeatureExtractor):
                 raise ValueError(f"Modality '{modality}' not present in inputs.")
 
             cur = torch.stack(
-                [inp[modality] for inp in inputs], dim=0
-            )  # (B, C,
+                [inp[modality].image for inp in inputs], dim=0
+            )  # (B, C, T, H, W)
 
             if modality in TIME_SERIES_MODALITIES:
-                num_dates = len(self.dates[modality])
-                num_bands = cur.shape[1] // num_dates
-                cur = rearrange(
-                    cur, "b (t c) h w -> b t c h w", t=num_dates, c=num_bands
-                )
+                num_bands = cur.shape[1]
+                cur = rearrange(cur, "b c t h w -> b t c h w")
                 H, W = cur.shape[-2], cur.shape[-1]
+
+                if inputs[0][modality].timestamps is None:
+                    raise ValueError(
+                        f"Require timestamps for time series modality {modality}"
+                    )
+                timestamps = torch.stack(
+                    [
+                        self.time_ranges_to_doy(inp[modality].timestamps, cur.device)  # type: ignore
+                        for inp in inputs
+                    ],
+                    dim=0,
+                )
+                batch[f"{modality}_dates"] = timestamps
             else:
+                # take the first (assumed only) timestep
+                cur = cur[:, :, 0]
                 num_bands = cur.shape[1]
                 H, W = cur.shape[-2], cur.shape[-1]
 
@@ -173,22 +191,6 @@ class AnySat(FeatureExtractor):
                 "All modalities must share the same spatial extent (H*res, W*res)."
             )
 
-        # Add *_dates
-        to_add = {}
-        for modality, x in list(batch.items()):
-            if modality in TIME_SERIES_MODALITIES:
-                B, T = x.shape[0], x.shape[1]
-                d = torch.as_tensor(
-                    self.dates[modality], dtype=torch.long, device=x.device
-                )
-                if d.ndim != 1 or d.numel() != T:
-                    raise ValueError(
-                        f"dates for '{modality}' must be 1D length {T}, got {tuple(d.shape)}"
-                    )
-                to_add[f"{modality}_dates"] = d.unsqueeze(0).repeat(B, 1)
-
-        batch.update(to_add)
-
         kwargs = {"patch_size": self.patch_size_meters, "output": self.output}
         if self.output == "dense":
             kwargs["output_modality"] = self.output_modality
```
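With the constructor's `dates` argument gone, AnySat day-of-year values are now derived from each input's time ranges instead of being hard-coded per modality. Below is a minimal standalone sketch of the midpoint computation that `time_ranges_to_doy` performs; the `midpoint_doys` helper name is ours, not rslearn's:

```python
from datetime import datetime

import torch


def midpoint_doys(time_ranges: list[tuple[datetime, datetime]]) -> torch.Tensor:
    """Day-of-year of the midpoint of each (start, end) range, as in the hunk above."""
    doys = [(t0 + (t1 - t0) / 2).timetuple().tm_yday for t0, t1 in time_ranges]
    return torch.tensor(doys, dtype=torch.int32)


# A point-in-time acquisition has start == end, so the midpoint is that instant.
ranges = [
    (datetime(2024, 1, 1), datetime(2024, 1, 31)),  # midpoint Jan 16 -> doy 16
    (datetime(2024, 7, 1), datetime(2024, 7, 1)),   # single instant -> doy 183
]
print(midpoint_doys(ranges))  # tensor([ 16, 183], dtype=torch.int32)
```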
**rslearn/models/clip.py**

```diff
@@ -43,9 +43,12 @@ class CLIP(FeatureExtractor):
             a FeatureMaps with one feature map from the ViT, which is always Bx24x24x1024.
         """
         inputs = context.inputs
-        device = inputs[0]["image"].device
+        device = inputs[0]["image"].image.device
         clip_inputs = self.processor(
-            images=[inp["image"].cpu().numpy().transpose(1, 2, 0) for inp in inputs],
+            images=[
+                inp["image"].single_ts_to_chw_tensor().cpu().numpy().transpose(1, 2, 0)
+                for inp in inputs
+            ],
             return_tensors="pt",
             padding=True,
         )
```
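A pattern repeated across the remaining model hunks: inputs arrive as `RasterImage` objects rather than bare tensors, with the pixel data in `.image` (laid out `(C, T, H, W)`), optional per-timestep `.timestamps`, and a `single_ts_to_chw_tensor()` accessor for single-timestep models. The real class lives in `rslearn/train/model_context.py` (changed +35 -1 above); the sketch below is a hypothetical stand-in inferred from how these hunks call it, not the actual implementation:

```python
from dataclasses import dataclass
from datetime import datetime

import torch


@dataclass
class RasterImageSketch:
    """Hypothetical stand-in for rslearn's RasterImage; fields inferred from the hunks."""

    image: torch.Tensor  # (C, T, H, W)
    timestamps: list[tuple[datetime, datetime]] | None = None  # one (start, end) per timestep

    def single_ts_to_chw_tensor(self) -> torch.Tensor:
        # Single-timestep backbones (CLIP, DINOv3, CROMA, ...) take the first and
        # only timestep, collapsing (C, T, H, W) to (C, H, W).
        if self.image.shape[1] != 1:
            raise ValueError(f"expected a single timestep, got {self.image.shape[1]}")
        return self.image[:, 0]


x = RasterImageSketch(image=torch.zeros(3, 1, 224, 224))
print(x.single_ts_to_chw_tensor().shape)  # torch.Size([3, 224, 224])
```

Multitemporal inputs destined for single-timestep backbones are instead expected to go through the simple time series wrapper, per the comment in the `module_wrapper.py` hunk below.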
**rslearn/models/croma.py**

```diff
@@ -175,10 +175,16 @@ class Croma(FeatureExtractor):
         sentinel1: torch.Tensor | None = None
         sentinel2: torch.Tensor | None = None
         if self.modality in [CromaModality.BOTH, CromaModality.SENTINEL1]:
-            sentinel1 = torch.stack(
+            sentinel1 = torch.stack(
+                [inp["sentinel1"].single_ts_to_chw_tensor() for inp in context.inputs],
+                dim=0,
+            )
             sentinel1 = self._resize_image(sentinel1) if self.do_resizing else sentinel1
         if self.modality in [CromaModality.BOTH, CromaModality.SENTINEL2]:
-            sentinel2 = torch.stack(
+            sentinel2 = torch.stack(
+                [inp["sentinel2"].single_ts_to_chw_tensor() for inp in context.inputs],
+                dim=0,
+            )
             sentinel2 = self._resize_image(sentinel2) if self.do_resizing else sentinel2
 
         outputs = self.model(
@@ -294,5 +300,7 @@ class CromaNormalize(Transform):
         for modality in MODALITY_BANDS.keys():
             if modality not in input_dict:
                 continue
-            input_dict[modality] = self.apply_image(
+            input_dict[modality].image = self.apply_image(
+                input_dict[modality].image, modality
+            )
         return input_dict, target_dict
```
**rslearn/models/dinov3.py**

```diff
@@ -104,7 +104,8 @@ class DinoV3(FeatureExtractor):
             a FeatureMaps with one feature map.
         """
         cur = torch.stack(
-            [inp["image"] for inp in context.inputs],
+            [inp["image"].single_ts_to_chw_tensor() for inp in context.inputs],
+            dim=0,
         )  # (B, C, H, W)
 
         if self.do_resizing and (
```
**rslearn/models/faster_rcnn.py**

```diff
@@ -210,7 +210,8 @@ class FasterRCNN(Predictor):
                 ),
             )
 
-        image_list = [inp["image"] for inp in context.inputs]
+        # take the first (and assumed to be only) timestep
+        image_list = [inp["image"].image[:, 0] for inp in context.inputs]
         images, targets = self.noop_transform(image_list, targets)
 
         feature_dict = collections.OrderedDict()
```
**rslearn/models/galileo/galileo.py**

```diff
@@ -3,6 +3,7 @@
 import math
 import tempfile
 from contextlib import nullcontext
+from datetime import datetime
 from enum import StrEnum
 from typing import cast
 
@@ -411,6 +412,23 @@ class GalileoModel(FeatureExtractor):
             months=months,
         )
 
+    @staticmethod
+    def time_ranges_to_timestamps(
+        time_ranges: list[tuple[datetime, datetime]],
+        device: torch.device,
+    ) -> torch.Tensor:
+        """Turn the time ranges stored in a RasterImage to timestamps accepted by Galileo.
+
+        Galileo only uses the month associated with each timestamp, so we take the midpoint of
+        the time range. For some inputs (e.g. Sentinel 2) we take an image from a specific
+        time so that start_time == end_time == mid_time.
+        """
+        mid_ranges = [t[0] + ((t[1] - t[0]) / 2) for t in time_ranges]
+        # months are indexed 0-11
+        return torch.tensor(
+            [d.month - 1 for d in mid_ranges], dtype=torch.int32, device=device
+        )
+
     def forward(self, context: ModelContext) -> FeatureMaps:
         """Compute feature maps from the Galileo backbone.
 
@@ -418,16 +436,16 @@ class GalileoModel(FeatureExtractor):
             context: the model context. Input dicts should contain keys corresponding to Galileo.input_keys
                 (also documented below) and values are tensors of the following shapes,
                 per input key:
-                "s1": B
-                "s2": B
-                "era5": B
-                "tc": B
-                "viirs": B
-                "srtm": B C H W (SRTM has no temporal dimension)
-                "dw": B C H W (Dynamic World should be averaged over time)
-                "wc": B C H W (WorldCereal has no temporal dimension)
-                "landscan": B C H W (we will average over the H, W dimensions)
-                "latlon": B C H W (we will average over the H, W dimensions)
+                "s1": B C T H W
+                "s2": B C T H W
+                "era5": B C T H W (we will average over the H, W dimensions)
+                "tc": B C T H W (we will average over the H, W dimensions)
+                "viirs": B C T H W (we will average over the H, W dimensions)
+                "srtm": B C 1 H W (SRTM has no temporal dimension)
+                "dw": B C 1 H W (Dynamic World should be averaged over time)
+                "wc": B C 1 H W (WorldCereal has no temporal dimension)
+                "landscan": B C 1 H W (we will average over the H, W dimensions)
+                "latlon": B C 1 H W (we will average over the H, W dimensions)
 
             The output will be an embedding representing the pooled tokens. If there is
             only a single token per h/w dimension (i.e. patch_size == h,w), then we will take
@@ -436,15 +454,35 @@ class GalileoModel(FeatureExtractor):
             If there are many spatial tokens per h/w dimension (patch_size > h,w), then we will
             take a pool of the space_time unmasked tokens (i.e. of the s1 and s2 tokens).
         """
+        space_time_modalities = ["s1", "s2"]
+        time_modalities = ["era5", "tc", "viirs"]
         stacked_inputs = {}
+        months: torch.Tensor | None = None
         for key in context.inputs[0].keys():
             # assume all the keys in an input are consistent
             if key in self.input_keys:
                 stacked_inputs[key] = torch.stack(
-                    [inp[key] for inp in context.inputs], dim=0
+                    [inp[key].image for inp in context.inputs], dim=0
                 )
+                if key in space_time_modalities + time_modalities:
+                    if months is None:
+                        if context.inputs[0][key].timestamps is not None:
+                            months = torch.stack(
+                                [
+                                    self.time_ranges_to_timestamps(
+                                        inp[key].timestamps,  # type: ignore
+                                        device=stacked_inputs[key].device,
+                                    )
+                                    for inp in context.inputs
+                                ],
+                                dim=0,
+                            )
+
+        if months is not None:
+            stacked_inputs["months"] = months
+
         s_t_channels = []
-        for space_time_modality in ["s1", "s2"]:
+        for space_time_modality in space_time_modalities:
             if space_time_modality not in stacked_inputs:
                 continue
             if space_time_modality == "s1":
@@ -452,36 +490,27 @@ class GalileoModel(FeatureExtractor):
             else:
                 s_t_channels += self.s_t_channels_s2
             cur = stacked_inputs[space_time_modality]
-
-            num_bands = len(S2_BANDS) if space_time_modality == "s2" else len(S1_BANDS)
-            num_timesteps = cur.shape[1] // num_bands
-            cur = rearrange(cur, "b (t c) h w -> b h w t c", t=num_timesteps)
+            cur = rearrange(cur, "b c t h w -> b h w t c")
             stacked_inputs[space_time_modality] = cur
 
         for space_modality in ["srtm", "dw", "wc"]:
             if space_modality not in stacked_inputs:
                 continue
+            # take the first (and assumed only) timestep
+            stacked_inputs[space_modality] = stacked_inputs[space_modality][:, :, 0]
             stacked_inputs[space_modality] = rearrange(
                 stacked_inputs[space_modality], "b c h w -> b h w c"
             )
 
-        for time_modality in ["era5", "tc", "viirs"]:
+        for time_modality in time_modalities:
             if time_modality not in stacked_inputs:
                 continue
             cur = stacked_inputs[time_modality]
-            # Check if it's single or multitemporal, and reshape accordingly
-            num_bands = {
-                "era5": len(ERA5_BANDS),
-                "tc": len(TC_BANDS),
-                "viirs": len(VIIRS_BANDS),
-            }[time_modality]
-            num_timesteps = cur.shape[1] // num_bands
             # take the average over the h, w bands since Galileo
             # treats it as a pixel-timeseries
             cur = rearrange(
-                torch.nanmean(cur, dim=(-1, -2)),
-                "b (t c) -> b t c",
-                t=num_timesteps,
+                torch.nanmean(cur, dim=(-1, -2)),
+                "b c t -> b t c",
             )
             stacked_inputs[time_modality] = cur
 
@@ -489,9 +518,8 @@ class GalileoModel(FeatureExtractor):
             if static_modality not in stacked_inputs:
                 continue
             cur = stacked_inputs[static_modality]
-            stacked_inputs[static_modality] = torch.nanmean(
-                cur, dim=(2, 3)
-            )
+            stacked_inputs[static_modality] = torch.nanmean(cur, dim=(2, 3, 4))
+
         galileo_input = self.construct_galileo_input(**stacked_inputs, normalize=True)
         h = galileo_input.s_t_x.shape[1]
         if h < self.patch_size:
@@ -511,7 +539,6 @@ class GalileoModel(FeatureExtractor):
             torch_context = torch.amp.autocast(
                 device_type=device.type, dtype=self.autocast_dtype
             )
-
         with torch_context:
             outputs = self.model(
                 s_t_x=galileo_input.s_t_x,
```
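The recurring shape change in this file, as in the backbones above, is that time is now a dedicated axis (`b c t h w`) instead of being packed into the channel axis (`b (t c) h w`), which is why the `num_bands`/`num_timesteps` bookkeeping disappears. A small sketch contrasting the two layouts (batch and sizes chosen arbitrarily):

```python
import torch
from einops import rearrange

B, C, T, H, W = 2, 3, 4, 8, 8
new_style = torch.randn(B, C, T, H, W)  # explicit time axis, as in 0.0.20

# Old 0.0.19 layout: timesteps packed into channels as (t c).
old_style = rearrange(new_style, "b c t h w -> b (t c) h w")

# Old code had to know the band count to recover the time axis...
via_old = rearrange(old_style, "b (t c) h w -> b h w t c", t=T, c=C)
# ...while the new code just permutes axes.
via_new = rearrange(new_style, "b c t h w -> b h w t c")

assert torch.equal(via_old, via_new)
print(via_new.shape)  # torch.Size([2, 8, 8, 4, 3])
```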
**rslearn/models/module_wrapper.py**

```diff
@@ -53,7 +53,12 @@ class EncoderModuleWrapper(FeatureExtractor):
         Returns:
             the output from the last wrapped module.
         """
-        images = torch.stack([inp["image"] for inp in context.inputs], dim=0)
+        # take the first and only timestep. Currently no intermediate
+        # components support multi temporal inputs, so if the input is
+        # multitemporal it should be wrapped in a simple time series wrapper.
+        images = torch.stack(
+            [inp["image"].single_ts_to_chw_tensor() for inp in context.inputs], dim=0
+        )
         cur: Any = FeatureMaps([images])
         for m in self.encoder_modules:
             cur = m(cur, context)
```
**rslearn/models/molmo.py**

```diff
@@ -47,11 +47,13 @@ class Molmo(FeatureExtractor):
             a FeatureMaps. Molmo produces features at one scale, so it will contain one
             feature map that is a Bx24x24x2048 tensor.
         """
-        device = context.inputs[0]["image"].device
+        device = context.inputs[0]["image"].image.device
         molmo_inputs_list = []
         # Process each one so we can isolate just the full image without any crops.
         for inp in context.inputs:
-            image = inp["image"].cpu().numpy().transpose(1, 2, 0)
+            image = (
+                inp["image"].single_ts_to_chw_tensor().cpu().numpy().transpose(1, 2, 0)
+            )
             processed = self.processor.process(
                 images=[image],
                 text="",
```
**rslearn/models/olmoearth_pretrain/model.py**

```diff
@@ -1,26 +1,27 @@
 """OlmoEarth model wrapper for fine-tuning in rslearn."""
 
 import json
+import warnings
 from contextlib import nullcontext
+from datetime import datetime
 from typing import Any
 
 import torch
 from einops import rearrange
-from olmoearth_pretrain.config import Config
-from olmo_core.distributed.checkpoint import load_model_and_optim_state
+from olmoearth_pretrain.config import Config, require_olmo_core
 from olmoearth_pretrain.data.constants import Modality
+from olmoearth_pretrain.datatypes import MaskedOlmoEarthSample, MaskValue
 from olmoearth_pretrain.model_loader import (
     ModelID,
     load_model_from_id,
     load_model_from_path,
 )
 from olmoearth_pretrain.nn.flexihelios import Encoder, TokensAndMasks
-from olmoearth_pretrain.train.masking import MaskedOlmoEarthSample, MaskValue
 from upath import UPath
 
 from rslearn.log_utils import get_logger
 from rslearn.models.component import FeatureExtractor, FeatureMaps, TokenFeatureMaps
-from rslearn.train.model_context import ModelContext
+from rslearn.train.model_context import ModelContext, RasterImage
 
 logger = get_logger(__name__)
 
@@ -61,6 +62,7 @@ class OlmoEarth(FeatureExtractor):
         embedding_size: int | None = None,
         autocast_dtype: str | None = "bfloat16",
         token_pooling: bool = True,
+        use_legacy_timestamps: bool = True,
     ):
         """Create a new OlmoEarth model.
 
@@ -87,7 +89,15 @@ class OlmoEarth(FeatureExtractor):
             token_pooling: whether or not to pool the tokens. If True, the output will be BxCxHxW. If False,
                 there will be an extra dimension, N, (BxCxHxWxN) representing the temporal and channel
                 dimensions.
+            use_legacy_timestamps: In our original implementation of OlmoEarth, we applied timestamps starting
+                from 0 (instead of the actual timestamps of the input). The option to do this is preserved
+                for backwards compatibility with finetuned models which were trained against this implementation.
         """
+        if use_legacy_timestamps:
+            warnings.warn(
+                "For new projects, don't use legacy timesteps.", DeprecationWarning
+            )
+
         if (
             sum(
                 [
@@ -138,6 +148,7 @@ class OlmoEarth(FeatureExtractor):
                 model = model[part]
         self.model = model
         self.token_pooling = token_pooling
+        self.use_legacy_timestamps = use_legacy_timestamps
 
     def _load_model_from_checkpoint(
         self, checkpoint_upath: UPath, random_initialization: bool
@@ -148,9 +159,12 @@ class OlmoEarth(FeatureExtractor):
         that contains the distributed checkpoint. This is the format produced by
         pre-training runs in olmoearth_pretrain.
         """
-        # Load the model config and initialize it.
         # We avoid loading the train module here because it depends on running within
         # olmo_core.
+        # Only pull in olmo_core when trying to load a distributed checkpoint to avoid dependency.
+        require_olmo_core("_load_model_from_checkpoint")
+        from olmo_core.distributed.checkpoint import load_model_and_optim_state
+
         with (checkpoint_upath / "config.json").open() as f:
             config_dict = json.load(f)
         model_config = Config.from_dict(config_dict["model"])
@@ -165,6 +179,32 @@ class OlmoEarth(FeatureExtractor):
 
         return model
 
+    @staticmethod
+    def time_ranges_to_timestamps(
+        time_ranges: list[tuple[datetime, datetime]],
+        max_timestamps: int,
+        device: torch.device,
+    ) -> torch.Tensor:
+        """Turn the time ranges stored in a RasterImage to timestamps accepted by OlmoEarth.
+
+        OlmoEarth only uses the month associated with each timestamp, so we take the midpoint of
+        the time range. For some inputs (e.g. Sentinel 2) we take an image from a specific
+        time so that start_time == end_time == mid_time.
+        """
+        timestamps = torch.zeros((max_timestamps, 3), dtype=torch.int32, device=device)
+        mid_ranges = [t[0] + ((t[1] - t[0]) / 2) for t in time_ranges]
+        timestamps[: len(time_ranges), 0] = torch.tensor(
+            [d.day for d in mid_ranges], dtype=torch.int32
+        )
+        # months are indexed 0-11
+        timestamps[: len(time_ranges), 1] = torch.tensor(
+            [d.month - 1 for d in mid_ranges], dtype=torch.int32
+        )
+        timestamps[: len(time_ranges), 2] = torch.tensor(
+            [d.year for d in mid_ranges], dtype=torch.int32
+        )
+        return timestamps
+
     def _prepare_modality_inputs(
         self, context: ModelContext
     ) -> tuple[MaskedOlmoEarthSample, list[str], torch.device]:
@@ -190,43 +230,55 @@ class OlmoEarth(FeatureExtractor):
         # We'll have to fix all that.
         max_timesteps = 1
         modality_data = {}
+        # we will just store the longest time range
+        # per instance in the batch. This means it may not be
+        # aligned per modality
+        timestamps_per_instance: list[list[tuple[datetime, datetime]]] = [[]] * len(
+            context.inputs
+        )
         for modality in MODALITY_NAMES:
             if modality not in context.inputs[0]:
                 continue
             present_modalities.append(modality)
-            tensors = [
+            tensors = []
+            for idx, inp in enumerate(context.inputs):
+                assert isinstance(inp, RasterImage)
+                tensors.append(inp[modality].image)
+                cur_timestamps = inp[modality].timestamps
+                if cur_timestamps is not None and len(cur_timestamps) > len(
+                    timestamps_per_instance[idx]
+                ):
+                    timestamps_per_instance[idx] = cur_timestamps
+            tensors = [inp[modality].image for inp in context.inputs]
             device = tensors[0].device
-
-            max_t = max(t.shape[0] for t in tensors) // num_bands
+            max_t = max(t.shape[1] for t in tensors)
             max_timesteps = max(max_timesteps, max_t)
             modality_data[modality] = (
                 tensors,
-                num_bands,
                 len(Modality.get(modality).band_sets),
             )
 
         # Second pass: pad and process each modality with global max_timesteps
         for modality in present_modalities:
-            tensors, num_bands, num_band_sets = modality_data[modality]
-            target_ch = max_timesteps * num_bands
+            tensors, num_band_sets = modality_data[modality]
 
             # Pad tensors to target_ch and track original timesteps for masking
             padded = []
             original_timesteps = []
             for t in tensors:
-                orig_t = t.shape[0] // num_bands
+                orig_t = t.shape[1]
                 original_timesteps.append(orig_t)
-                if t.shape[0] < target_ch:
+                if orig_t < max_timesteps:
                     pad = torch.zeros(
-                        (target_ch - t.shape[0],) + t.shape[1:],
+                        t.shape[:1] + (max_timesteps - orig_t,) + t.shape[2:],
                         dtype=t.dtype,
                         device=device,
                     )
-                    t = torch.cat([t, pad], dim=0)
+                    t = torch.cat([t, pad], dim=1)
                 padded.append(t)
 
             cur = torch.stack(padded, dim=0)
-            cur = rearrange(cur, "b (t c) h w -> b h w t c", t=max_timesteps)
+            cur = rearrange(cur, "b c t h w -> b h w t c")
             kwargs[modality] = cur
 
         # Create mask: ONLINE_ENCODER for valid, MISSING for padded timesteps
@@ -242,19 +294,31 @@ class OlmoEarth(FeatureExtractor):
                     mask[sample_idx, :, :, orig_t:, :] = MaskValue.MISSING.value
             kwargs[f"{modality}_mask"] = mask
 
-        # Note that only months (0 to 11) are used in OlmoEarth position encoding.
-        timestamps = torch.zeros(
-            (len(context.inputs), max_timesteps, 3),
-            dtype=torch.int32,
-            device=device,
-        )
-        timestamps[:, :, 0] = 1  # day
-        timestamps[:, :, 1] = torch.arange(max_timesteps, device=device)[
-            None, :
-        ]  # month
-        timestamps[:, :, 2] = 2024  # year
-        kwargs["timestamps"] = timestamps
-
+        if self.use_legacy_timestamps:
+            # Note that only months (0 to 11) are used in OlmoEarth position encoding.
+            timestamps = torch.zeros(
+                (len(context.inputs), max_timesteps, 3),
+                dtype=torch.int32,
+                device=device,
+            )
+            timestamps[:, :, 0] = 1  # day
+            timestamps[:, :, 1] = torch.arange(max_timesteps, device=device)[
+                None, :
+            ]  # month
+            timestamps[:, :, 2] = 2024  # year
+            kwargs["timestamps"] = timestamps
+        else:
+            if max([len(t) for t in timestamps_per_instance]) == 0:
+                # Timestamps is required.
+                raise ValueError("No inputs had timestamps.")
+            # Note that only months (0 to 11) are used in OlmoEarth position encoding.
+            kwargs["timestamps"] = torch.stack(
+                [
+                    self.time_ranges_to_timestamps(time_range, max_timesteps, device)
+                    for time_range in timestamps_per_instance
+                ],
+                dim=0,
+            )
 
         return MaskedOlmoEarthSample(**kwargs), present_modalities, device
 
```
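OlmoEarth encodes each timestep as a `(day, month_index, year)` triple, zero-padded out to the longest time series in the batch; per the comments above, only the 0-11 month index actually feeds the position encoding, and the legacy path (`use_legacy_timestamps=True`, still the default) synthesizes `day=1, month=0..T-1, year=2024` instead of reading real timestamps. A standalone sketch of what the new non-legacy path produces, using the same midpoint logic as the diff (the `to_timestamps` name is ours):

```python
from datetime import datetime

import torch


def to_timestamps(
    time_ranges: list[tuple[datetime, datetime]], max_timestamps: int
) -> torch.Tensor:
    """(max_timestamps, 3) tensor of (day, month-1, year); padded rows stay zero."""
    out = torch.zeros((max_timestamps, 3), dtype=torch.int32)
    mids = [t0 + (t1 - t0) / 2 for t0, t1 in time_ranges]
    out[: len(mids), 0] = torch.tensor([d.day for d in mids], dtype=torch.int32)
    out[: len(mids), 1] = torch.tensor([d.month - 1 for d in mids], dtype=torch.int32)
    out[: len(mids), 2] = torch.tensor([d.year for d in mids], dtype=torch.int32)
    return out


ranges = [(datetime(2024, 3, 1), datetime(2024, 3, 31))]
print(to_timestamps(ranges, max_timestamps=2))
# tensor([[  16,    2, 2024],
#         [   0,    0,    0]], dtype=torch.int32)
```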
**rslearn/models/olmoearth_pretrain/norm.py**

```diff
@@ -64,8 +64,8 @@ class OlmoEarthNormalize(Transform):
             band_norms = self.norm_config[modality_name]
             image = input_dict[modality_name]
             # Keep a set of indices to make sure that we normalize all of them.
-            needed_band_indices = set(range(image.shape[0]))
-            num_timesteps = image.shape[0] // len(cur_band_names)
+            needed_band_indices = set(range(image.image.shape[0]))
+            num_timesteps = image.image.shape[0] // len(cur_band_names)
 
             for band, norm_dict in band_norms.items():
                 # If multitemporal, normalize each timestep separately.
@@ -73,7 +73,9 @@ class OlmoEarthNormalize(Transform):
                     band_idx = cur_band_names.index(band) + t * len(cur_band_names)
                     min_val = norm_dict["mean"] - self.std_multiplier * norm_dict["std"]
                     max_val = norm_dict["mean"] + self.std_multiplier * norm_dict["std"]
-                    image[band_idx] = (image[band_idx] - min_val) / (max_val - min_val)
+                    image.image[band_idx] = (image.image[band_idx] - min_val) / (
+                        max_val - min_val
+                    )
                     needed_band_indices.remove(band_idx)
 
             if len(needed_band_indices) > 0:
```
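The normalization math itself is untouched by this hunk: each band is still min-max scaled into the window mean ± `std_multiplier`·std, i.e. x' = (x - (mean - k·std)) / (2·k·std); only the indexing now goes through the `RasterImage.image` tensor. A quick numeric check of that formula (example stats, not rslearn defaults):

```python
mean, std, k = 100.0, 10.0, 2.0  # k is the std_multiplier
min_val = mean - k * std  # 80.0
max_val = mean + k * std  # 120.0

x = 100.0  # a pixel exactly at the mean
print((x - min_val) / (max_val - min_val))  # 0.5 -- the mean maps to mid-range
```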
**rslearn/models/panopticon.py**

```diff
@@ -142,7 +142,9 @@ class Panopticon(FeatureExtractor):
     def forward(self, context: ModelContext) -> FeatureMaps:
         """Forward pass through the panopticon model."""
         batch_inputs = {
-            key: torch.stack([inp[key] for inp in context.inputs], dim=0)
+            key: torch.stack(
+                [inp[key].single_ts_to_chw_tensor() for inp in context.inputs], dim=0
+            )
             for key in context.inputs[0].keys()
         }
         panopticon_inputs = self.prepare_input(batch_inputs)
```