rslearn 0.0.17__py3-none-any.whl → 0.0.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. rslearn/arg_parser.py +2 -9
  2. rslearn/config/__init__.py +2 -0
  3. rslearn/config/dataset.py +64 -20
  4. rslearn/dataset/add_windows.py +1 -1
  5. rslearn/dataset/dataset.py +34 -84
  6. rslearn/dataset/materialize.py +5 -5
  7. rslearn/dataset/storage/__init__.py +1 -0
  8. rslearn/dataset/storage/file.py +202 -0
  9. rslearn/dataset/storage/storage.py +140 -0
  10. rslearn/dataset/window.py +26 -80
  11. rslearn/lightning_cli.py +22 -11
  12. rslearn/main.py +12 -37
  13. rslearn/models/anysat.py +11 -9
  14. rslearn/models/attention_pooling.py +177 -0
  15. rslearn/models/clay/clay.py +8 -9
  16. rslearn/models/clip.py +18 -15
  17. rslearn/models/component.py +111 -0
  18. rslearn/models/concatenate_features.py +21 -11
  19. rslearn/models/conv.py +15 -8
  20. rslearn/models/croma.py +13 -8
  21. rslearn/models/detr/detr.py +25 -14
  22. rslearn/models/dinov3.py +11 -6
  23. rslearn/models/faster_rcnn.py +19 -9
  24. rslearn/models/feature_center_crop.py +12 -9
  25. rslearn/models/fpn.py +19 -8
  26. rslearn/models/galileo/galileo.py +23 -18
  27. rslearn/models/module_wrapper.py +26 -57
  28. rslearn/models/molmo.py +16 -14
  29. rslearn/models/multitask.py +102 -73
  30. rslearn/models/olmoearth_pretrain/model.py +135 -38
  31. rslearn/models/panopticon.py +8 -7
  32. rslearn/models/pick_features.py +18 -24
  33. rslearn/models/pooling_decoder.py +22 -14
  34. rslearn/models/presto/presto.py +16 -10
  35. rslearn/models/presto/single_file_presto.py +4 -10
  36. rslearn/models/prithvi.py +12 -8
  37. rslearn/models/resize_features.py +21 -7
  38. rslearn/models/sam2_enc.py +11 -9
  39. rslearn/models/satlaspretrain.py +15 -9
  40. rslearn/models/simple_time_series.py +37 -17
  41. rslearn/models/singletask.py +24 -17
  42. rslearn/models/ssl4eo_s12.py +15 -10
  43. rslearn/models/swin.py +22 -13
  44. rslearn/models/terramind.py +24 -7
  45. rslearn/models/trunk.py +6 -3
  46. rslearn/models/unet.py +18 -9
  47. rslearn/models/upsample.py +22 -9
  48. rslearn/train/all_patches_dataset.py +89 -37
  49. rslearn/train/dataset.py +105 -97
  50. rslearn/train/lightning_module.py +51 -32
  51. rslearn/train/model_context.py +54 -0
  52. rslearn/train/prediction_writer.py +111 -41
  53. rslearn/train/scheduler.py +15 -0
  54. rslearn/train/tasks/classification.py +34 -15
  55. rslearn/train/tasks/detection.py +24 -31
  56. rslearn/train/tasks/embedding.py +33 -29
  57. rslearn/train/tasks/multi_task.py +7 -7
  58. rslearn/train/tasks/per_pixel_regression.py +41 -19
  59. rslearn/train/tasks/regression.py +38 -21
  60. rslearn/train/tasks/segmentation.py +33 -15
  61. rslearn/train/tasks/task.py +3 -2
  62. rslearn/train/transforms/resize.py +74 -0
  63. rslearn/utils/geometry.py +73 -0
  64. rslearn/utils/jsonargparse.py +66 -0
  65. {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/METADATA +1 -1
  66. {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/RECORD +71 -66
  67. rslearn/dataset/index.py +0 -173
  68. rslearn/models/registry.py +0 -22
  69. {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/WHEEL +0 -0
  70. {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/entry_points.txt +0 -0
  71. {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/licenses/LICENSE +0 -0
  72. {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/licenses/NOTICE +0 -0
  73. {rslearn-0.0.17.dist-info → rslearn-0.0.19.dist-info}/top_level.txt +0 -0
rslearn/models/multitask.py
@@ -1,5 +1,6 @@
 """MultiTaskModel for rslearn."""

+from collections.abc import Iterable
 from copy import deepcopy
 from typing import Any

@@ -7,6 +8,9 @@ import torch

 from rslearn.log_utils import get_logger
 from rslearn.models.trunk import DecoderTrunk
+from rslearn.train.model_context import ModelContext, ModelOutput
+
+from .component import FeatureExtractor, IntermediateComponent, Predictor

 logger = get_logger(__name__)

@@ -58,8 +62,8 @@ class MultiTaskModel(torch.nn.Module):

     def __init__(
         self,
-        encoder: list[torch.nn.Module],
-        decoders: dict[str, list[torch.nn.Module]],
+        encoder: list[FeatureExtractor | IntermediateComponent],
+        decoders: dict[str, list[IntermediateComponent | Predictor]],
         lazy_decode: bool = False,
         loss_weights: dict[str, float] | None = None,
         trunk: DecoderTrunk | None = None,
@@ -67,8 +71,12 @@ class MultiTaskModel(torch.nn.Module):
        """Initialize a new MultiTaskModel.

        Args:
-            encoder: modules to compute intermediate feature representations.
+            encoder: modules to compute intermediate feature representations. The first
+                module must be a FeatureExtractor, and following modules must be
+                IntermediateComponents.
            decoders: modules to compute outputs and loss, should match number of tasks.
+                The last module must be a Predictor, while the previous modules must be
+                IntermediateComponents.
            lazy_decode: if True, only decode the outputs specified in the batch.
            loss_weights: weights for each task's loss (default: None = equal weights).
            trunk: if provided, use this trunk module to postprocess the features
@@ -76,7 +84,7 @@ class MultiTaskModel(torch.nn.Module):
        """
        super().__init__()
        self.lazy_decode = lazy_decode
-        self.encoder = torch.nn.Sequential(*encoder)
+        self.encoder = torch.nn.ModuleList(encoder)
        self.decoders = torch.nn.ModuleDict(
            sort_keys(
                {
@@ -120,32 +128,28 @@ class MultiTaskModel(torch.nn.Module):

     def apply_decoder(
         self,
-        features: list[torch.Tensor],
-        inputs: list[dict[str, Any]],
+        intermediates: Any,
+        context: ModelContext,
         targets: list[dict[str, Any]] | None,
-        decoder: list[torch.nn.Module],
+        decoder: list[IntermediateComponent | Predictor],
         task_name: str,
-        outputs: list[dict[str, Any]],
-        losses: dict[str, torch.Tensor],
-    ) -> tuple[list[dict[str, Any]], dict[str, torch.Tensor]]:
+    ) -> ModelOutput:
        """Apply a decoder to a list of inputs and targets.

        Args:
-            features: list of features
-            inputs: list of input dicts
+            intermediates: the intermediate output from the encoder.
+            context: the model context.
            targets: list of target dicts
            decoder: list of decoder modules
            task_name: the name of the task
-            outputs: list of output dicts
-            losses: dictionary of loss values

        Returns:
-            tuple of (outputs, losses)
+            a ModelOutput containing outputs across all the decoders.
        """
        # First, apply all but the last module in the decoder to the features
-        cur = features
+        cur = intermediates
        for module in decoder[:-1]:
-            cur = module(cur, inputs)
+            cur = module(cur, context)

        if targets is None:
            cur_targets = None
@@ -153,14 +157,7 @@ class MultiTaskModel(torch.nn.Module):
            cur_targets = [target[task_name] for target in targets]

        # Then, apply the last module to the features and targets
-        cur_output, cur_loss_dict = decoder[-1](cur, inputs, cur_targets)
-        for idx, entry in enumerate(cur_output):
-            outputs[idx][task_name] = entry
-        for loss_name, loss_value in cur_loss_dict.items():
-            losses[f"{task_name}_{loss_name}"] = (
-                loss_value * self.loss_weights[task_name]
-            )
-        return outputs, losses
+        return decoder[-1](cur, context, cur_targets)

    def _get_tasks_from_decoder(self, decoder: str) -> list[str]:
        """Get the tasks corresponding to this decoder.
@@ -172,66 +169,84 @@ class MultiTaskModel(torch.nn.Module):

    def apply_decoders(
        self,
-        features: list[torch.Tensor],
-        inputs: list[dict[str, Any]],
+        intermediates: Any,
+        context: ModelContext,
        targets: list[dict[str, Any]] | None,
-    ) -> dict[str, Any]:
+    ) -> ModelOutput:
        """Apply all the decoders to the features and targets.

        Args:
-            features: list of features
-            inputs: list of input dicts
+            intermediates: the intermediates from the encoder.
+            context: the model context
            targets: list of target dicts

        Returns:
-            dict of outputs and losses
+            combined ModelOutput. The outputs is a list of output dicts, one per example,
+            where the dict maps from task name to the corresponding task output. The
+            losses is a flat dict but the task name is prepended to the loss names.
        """
-        outputs: list[dict[str, torch.Tensor | dict]] = [{} for _ in inputs]
+        outputs: list[dict[str, torch.Tensor | dict]] = [{} for _ in context.inputs]
        losses: dict[str, torch.Tensor] = {}

        if self.lazy_decode:
            # Assume that all inputs have the same dataset_source
-            dataset_source = inputs[0]["dataset_source"]
-            decoder = self.decoders[
-                self.target_to_decoder.get(dataset_source, dataset_source)
-            ]
-            self.apply_decoder(
-                features, inputs, targets, decoder, dataset_source, outputs, losses
+            task_name = context.metadatas[0].dataset_source
+
+            if task_name is None:
+                raise ValueError("dataset_source must be set for lazy decoding")
+
+            decoder = self.decoders[self.target_to_decoder.get(task_name, task_name)]
+            model_output = self.apply_decoder(
+                intermediates, context, targets, decoder, task_name
            )
+            for idx, entry in enumerate(model_output.outputs):
+                outputs[idx][task_name] = entry
+            for loss_name, loss_value in model_output.loss_dict.items():
+                losses[f"{task_name}_{loss_name}"] = (
+                    loss_value * self.loss_weights[task_name]
+                )
        else:
            for decoder_name, decoder in self.decoders.items():
                for task_name in self._get_tasks_from_decoder(decoder_name):
-                    self.apply_decoder(
-                        features, inputs, targets, decoder, task_name, outputs, losses
+                    model_output = self.apply_decoder(
+                        intermediates, context, targets, decoder, task_name
                    )
-
-        return {
-            "outputs": outputs,
-            "loss_dict": losses,
-        }
+                    for idx, entry in enumerate(model_output.outputs):
+                        outputs[idx][task_name] = entry
+                    for loss_name, loss_value in model_output.loss_dict.items():
+                        losses[f"{task_name}_{loss_name}"] = (
+                            loss_value * self.loss_weights[task_name]
+                        )
+
+        return ModelOutput(
+            outputs=outputs,
+            loss_dict=losses,
+        )

    def forward(
        self,
-        inputs: list[dict[str, Any]],
+        context: ModelContext,
        targets: list[dict[str, Any]] | None = None,
-    ) -> dict[str, Any]:
+    ) -> ModelOutput:
        """Apply the sequence of modules on the inputs, including shared trunk.

        Args:
-            inputs: list of input dicts
+            context: the model context.
            targets: optional list of target dicts

        Returns:
-            dict with keys "outputs" and "loss_dict".
+            the model output from apply_decoders.
        """
-        features = self.encoder(inputs)
+        cur = self.encoder[0](context)
+        for module in self.encoder[1:]:
+            cur = module(cur, context)
        if self.trunk is not None:
-            trunk_out = self.trunk(features, inputs)
-            outs = self.apply_decoders(trunk_out.pop("outputs"), inputs, targets)
+            trunk_out = self.trunk(cur, context)
+            outs = self.apply_decoders(trunk_out.pop("outputs"), context, targets)
            self.trunk.apply_auxiliary_losses(trunk_out, outs)
            return outs | trunk_out
        else:
-            return self.apply_decoders(features, inputs, targets)
+            return self.apply_decoders(cur, context, targets)


 class MultiTaskMergedModel(MultiTaskModel):
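With the encoder now stored as a torch.nn.ModuleList, forward calls self.encoder[0](context) and then chains every remaining encoder module as module(cur, context), mirroring the non-final decoder modules. A minimal sketch of an intermediate module that fits that chain, assuming IntermediateComponent can be subclassed with a forward of this shape (the class itself is made up; the intermediate type it passes along depends on the upstream FeatureExtractor, e.g. FeatureMaps elsewhere in this release):

from typing import Any

from rslearn.models.component import IntermediateComponent
from rslearn.train.model_context import ModelContext


class ScaleFeatures(IntermediateComponent):
    """Hypothetical component that rescales whatever feature maps it is handed."""

    def __init__(self, factor: float = 0.5):
        super().__init__()
        self.factor = factor

    def forward(self, intermediates: Any, context: ModelContext) -> Any:
        # Invoked as module(cur, context); must return something the next
        # encoder module or the task Predictor understands.
        return [feat * self.factor for feat in intermediates]

Only position 0 of the encoder list has to be a FeatureExtractor; a module like this could sit anywhere after it, or before the final Predictor of a decoder list.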
@@ -247,8 +262,8 @@ class MultiTaskMergedModel(MultiTaskModel):

    def __init__(
        self,
-        encoder: list[torch.nn.Module],
-        decoders: dict[str, list[torch.nn.Module]],
+        encoder: list[FeatureExtractor | IntermediateComponent],
+        decoders: dict[str, list[IntermediateComponent | Predictor]],
        decoder_to_target: dict[str, list[str]],
        task_label_offsets: dict[str, dict[str, Any]],
        lazy_decode: bool = False,
@@ -273,7 +288,7 @@ class MultiTaskMergedModel(MultiTaskModel):
        torch.nn.Module.__init__(self)

        self.lazy_decode = lazy_decode
-        self.encoder = torch.nn.Sequential(*encoder)
+        self.encoder = torch.nn.ModuleList(encoder)
        self.decoders = torch.nn.ModuleDict(
            sort_keys(
                {
@@ -329,9 +344,9 @@ class MultiTaskMergedModel(MultiTaskModel):
        return offset_targets

    def unmerge_output_labels(
-        self, outputs: list[dict[str, torch.Tensor | dict]], task_name: str
-    ) -> None:
-        """Unmerge the task labels in place.
+        self, outputs: Iterable[Any], task_name: str
+    ) -> list[dict[str, torch.Tensor | dict]]:
+        """Unmerge the task outputs.

        For most tasks, this means chopping off the corresponding label dimensions.
        For some, we might just need to subtract an offset from the target (ex: segmentation).
@@ -340,10 +355,15 @@ class MultiTaskMergedModel(MultiTaskModel):
        Args:
            outputs: the predictions
            task_name: the name of the task
+
+        Returns:
+            the unmerged outputs.
        """
        offset = self.task_label_offsets[task_name]["offset"]
        num_outputs = self.task_label_offsets[task_name]["num_outputs"]
        output_key = self.task_label_offsets[task_name]["outputs_key"]
+
+        unmerged_outputs: list[dict[str, torch.Tensor | dict]] = [{} for _ in outputs]
        with torch.no_grad():
            for i, output in enumerate(outputs):
                if not output:
@@ -353,35 +373,44 @@ class MultiTaskMergedModel(MultiTaskModel):
                if isinstance(output, dict):
                    # For some tasks (eg object detection), we have discrete label
                    # predictions instead of a distribution over labels
-                    output[output_key] -= offset
+                    unmerged_output = output.copy()
+                    unmerged_output[output_key] = unmerged_output[output_key] - offset
+                    unmerged_outputs[i][task_name] = unmerged_output
                elif isinstance(output, torch.Tensor):
                    # For classification/segmentation tasks, we have a distribution
                    # over labels, so we need to scale the predictions so that they
                    # sum to 1 since we chop off some of the probability densities
-                    tensor: torch.Tensor = output[offset : offset + num_outputs, ...]
-                    tensor /= tensor.sum(dim=0, keepdim=True).type(torch.float32)
-                    outputs[i][task_name] = tensor
+                    unmerged_output = output[offset : offset + num_outputs, ...]
+                    unmerged_output /= unmerged_output.sum(dim=0, keepdim=True).type(
+                        torch.float32
+                    )
+                    unmerged_outputs[i][task_name] = unmerged_output
+
+        return unmerged_outputs

    def forward(
        self,
-        inputs: list[dict[str, Any]],
+        context: ModelContext,
        targets: list[dict[str, Any]] | None = None,
-    ) -> dict[str, Any]:
+    ) -> ModelOutput:
        """Apply the sequence of modules on the inputs.

        Args:
-            inputs: list of input dicts
+            context: the model context.
            targets: optional list of target dicts

        Returns:
-            dict with keys "outputs" and "loss_dict", and possibly other keys.
+            the model output.
        """
-        dataset_source = inputs[0].get("dataset_source", None)
-        assert isinstance(dataset_source, str)
-        targets = self.merge_task_labels(targets, dataset_source)
-        outs = super().forward(inputs, targets)
-        self.unmerge_output_labels(outs["outputs"], dataset_source)
-        return outs
+        dataset_source = context.metadatas[0].dataset_source
+        assert dataset_source is not None
+        merged_targets = self.merge_task_labels(targets, dataset_source)
+        outs = super().forward(context, merged_targets)
+        unmerged_outputs = self.unmerge_output_labels(outs.outputs, dataset_source)
+        return ModelOutput(
+            outputs=unmerged_outputs,
+            loss_dict=outs.loss_dict,
+        )

    def _get_tasks_from_decoder(self, decoder: str) -> list[str]:
        """Get the tasks corresponding to this decoder.
rslearn/models/olmoearth_pretrain/model.py
@@ -19,6 +19,8 @@ from olmoearth_pretrain.train.masking import MaskedOlmoEarthSample, MaskValue
 from upath import UPath

 from rslearn.log_utils import get_logger
+from rslearn.models.component import FeatureExtractor, FeatureMaps, TokenFeatureMaps
+from rslearn.train.model_context import ModelContext

 logger = get_logger(__name__)

@@ -44,7 +46,7 @@ EMBEDDING_SIZES = {
 }


-class OlmoEarth(torch.nn.Module):
+class OlmoEarth(FeatureExtractor):
    """A wrapper to support the OlmoEarth model."""

    def __init__(
@@ -58,6 +60,7 @@ class OlmoEarth(torch.nn.Module):
        random_initialization: bool = False,
        embedding_size: int | None = None,
        autocast_dtype: str | None = "bfloat16",
+        token_pooling: bool = True,
    ):
        """Create a new OlmoEarth model.

@@ -81,6 +84,9 @@ class OlmoEarth(torch.nn.Module):
            embedding_size: optional embedding size to report via
                get_backbone_channels (if model_id is not set).
            autocast_dtype: which dtype to use for autocasting, or set None to disable.
+            token_pooling: whether or not to pool the tokens. If True, the output will be BxCxHxW. If False,
+                there will be an extra dimension, N, (BxCxHxWxN) representing the temporal and channel
+                dimensions.
        """
        if (
            sum(
@@ -131,6 +137,7 @@ class OlmoEarth(torch.nn.Module):
        else:
            model = model[part]
        self.model = model
+        self.token_pooling = token_pooling

    def _load_model_from_checkpoint(
        self, checkpoint_upath: UPath, random_initialization: bool
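The new token_pooling flag controls whether that extra token dimension survives. In the token_pooling=False branch further below, each modality's BHWTSC tokens are flattened with rearrange("b h w t s c -> b c h w (t s)") and modalities are concatenated on the last axis, so N is the number of (timestep, band set) tokens. A quick shape check of that rearrange with toy sizes (not real model dimensions):

import torch
from einops import rearrange

b, h, w, t, s, c = 2, 8, 8, 4, 2, 768  # toy sizes for illustration only
tokens = torch.randn(b, h, w, t, s, c)
flat = rearrange(tokens, "b h w t s c -> b c h w (t s)")
print(flat.shape)  # torch.Size([2, 768, 8, 8, 8]) -- BxCxHxWxN with N = t * s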
@@ -158,45 +165,89 @@ class OlmoEarth(torch.nn.Module):

        return model

-    def forward(self, inputs: list[dict[str, Any]]) -> list[torch.Tensor]:
-        """Compute feature maps from the OlmoEarth backbone.
+    def _prepare_modality_inputs(
+        self, context: ModelContext
+    ) -> tuple[MaskedOlmoEarthSample, list[str], torch.device]:
+        """Prepare modality tensors and masks for the OlmoEarth model.
+
+        Uses a two-pass approach to ensure all modalities have consistent timestep
+        dimensions for position encoding.
+
+        Args:
+            context: the model context with input tensors.

-        Inputs:
-            inputs: input dicts. It should include keys corresponding to the modalities
-                that should be passed to the OlmoEarth model.
+        Returns:
+            tuple of (sample, present_modalities, device)
        """
        kwargs = {}
        present_modalities = []
        device = None
-        # Handle the case where some modalities are multitemporal and some are not.
-        # We assume all multitemporal modalities have the same number of timesteps.
+
+        # First pass: find global max_timesteps across all modalities and samples
+        # TODO: currently we assume all modalities have the same number of timesteps,
+        # which is not true for all cases, and time series time steps are assumed to
+        # be 1-month apart. It also assumes continuity between available timesteps.
+        # We'll have to fix all that.
        max_timesteps = 1
+        modality_data = {}
        for modality in MODALITY_NAMES:
-            if modality not in inputs[0]:
+            if modality not in context.inputs[0]:
                continue
            present_modalities.append(modality)
-            cur = torch.stack([inp[modality] for inp in inputs], dim=0)
-            device = cur.device
-            # Check if it's single or multitemporal, and reshape accordingly
+            tensors = [inp[modality] for inp in context.inputs]
+            device = tensors[0].device
            num_bands = Modality.get(modality).num_bands
-            num_timesteps = cur.shape[1] // num_bands
-            max_timesteps = max(max_timesteps, num_timesteps)
-            cur = rearrange(cur, "b (t c) h w -> b h w t c", t=num_timesteps)
+            max_t = max(t.shape[0] for t in tensors) // num_bands
+            max_timesteps = max(max_timesteps, max_t)
+            modality_data[modality] = (
+                tensors,
+                num_bands,
+                len(Modality.get(modality).band_sets),
+            )
+
+        # Second pass: pad and process each modality with global max_timesteps
+        for modality in present_modalities:
+            tensors, num_bands, num_band_sets = modality_data[modality]
+            target_ch = max_timesteps * num_bands
+
+            # Pad tensors to target_ch and track original timesteps for masking
+            padded = []
+            original_timesteps = []
+            for t in tensors:
+                orig_t = t.shape[0] // num_bands
+                original_timesteps.append(orig_t)
+                if t.shape[0] < target_ch:
+                    pad = torch.zeros(
+                        (target_ch - t.shape[0],) + t.shape[1:],
+                        dtype=t.dtype,
+                        device=device,
+                    )
+                    t = torch.cat([t, pad], dim=0)
+                padded.append(t)
+
+            cur = torch.stack(padded, dim=0)
+            cur = rearrange(cur, "b (t c) h w -> b h w t c", t=max_timesteps)
            kwargs[modality] = cur
-            # Create mask array which is BHWTS (without channels but with band sets).
-            num_band_sets = len(Modality.get(modality).band_sets)
-            mask_shape = cur.shape[0:4] + (num_band_sets,)
-            mask = (
-                torch.ones(mask_shape, dtype=torch.int32, device=device)
-                * MaskValue.ONLINE_ENCODER.value
+
+            # Create mask: ONLINE_ENCODER for valid, MISSING for padded timesteps
+            b, h, w = cur.shape[0], cur.shape[1], cur.shape[2]
+            mask = torch.full(
+                (b, h, w, max_timesteps, num_band_sets),
+                fill_value=MaskValue.ONLINE_ENCODER.value,
+                dtype=torch.int32,
+                device=device,
            )
+            for sample_idx, orig_t in enumerate(original_timesteps):
+                if orig_t < max_timesteps:
+                    mask[sample_idx, :, :, orig_t:, :] = MaskValue.MISSING.value
            kwargs[f"{modality}_mask"] = mask

        # Timestamps is required.
        # Note that only months (0 to 11) are used in OlmoEarth position encoding.
-        # For now, we assign same timestamps to all inputs, but later we should handle varying timestamps per input.
+        # For now, we assign same timestamps to all inputs, but later we should
+        # handle varying timestamps per input.
        timestamps = torch.zeros(
-            (len(inputs), max_timesteps, 3), dtype=torch.int32, device=device
+            (len(context.inputs), max_timesteps, 3), dtype=torch.int32, device=device
        )
        timestamps[:, :, 0] = 1  # day
        timestamps[:, :, 1] = torch.arange(max_timesteps, device=device)[
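The second pass above pads every sample of a modality up to the batch-wide maximum number of timesteps (each sample stored as a (t*c, h, w) tensor) and records each sample's original length so the padded timesteps can later be marked MISSING in the mask. A self-contained sketch of just the padding step, with made-up shapes and without the MaskValue enum:

import torch

num_bands = 2
# Two samples of one modality with 3 and 1 timesteps, stored as (t * c, h, w).
tensors = [torch.randn(3 * num_bands, 4, 4), torch.randn(1 * num_bands, 4, 4)]

max_timesteps = max(t.shape[0] for t in tensors) // num_bands  # 3
target_ch = max_timesteps * num_bands

padded, original_timesteps = [], []
for t in tensors:
    original_timesteps.append(t.shape[0] // num_bands)
    if t.shape[0] < target_ch:
        pad = torch.zeros((target_ch - t.shape[0],) + t.shape[1:], dtype=t.dtype)
        t = torch.cat([t, pad], dim=0)
    padded.append(t)

batch = torch.stack(padded, dim=0)
print(batch.shape, original_timesteps)  # torch.Size([2, 6, 4, 4]) [3, 1]
# The real code then sets mask[i, :, :, original_timesteps[i]:, :] to MISSING.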
@@ -205,25 +256,46 @@ class OlmoEarth(torch.nn.Module):
        timestamps[:, :, 2] = 2024  # year
        kwargs["timestamps"] = timestamps

-        sample = MaskedOlmoEarthSample(**kwargs)
+        return MaskedOlmoEarthSample(**kwargs), present_modalities, device
+
+    def forward(self, context: ModelContext) -> FeatureMaps | TokenFeatureMaps:
+        """Compute feature maps from the OlmoEarth backbone.
+
+        Args:
+            context: the model context. Input dicts should include keys corresponding
+                to the modalities that should be passed to the OlmoEarth model.
+
+        Returns:
+            a FeatureMaps consisting of one feature map, at 1/patch_size of the input
+            resolution. Embeddings will be pooled across modalities and timesteps.
+        """
+        sample, present_modalities, device = self._prepare_modality_inputs(context)

        # Decide context based on self.autocast_dtype.
        if self.autocast_dtype is None:
-            context = nullcontext()
+            torch_context = nullcontext()
        else:
            assert device is not None
-            context = torch.amp.autocast(
+            torch_context = torch.amp.autocast(
                device_type=device.type, dtype=self.autocast_dtype
            )

-        with context:
+        # Check if we can bypass masks (fast_pass=True)
+        missing_tokens = False
+        for modality in present_modalities:
+            modality_mask = getattr(sample, f"{modality}_mask")
+            if torch.any(modality_mask == MaskValue.MISSING.value):
+                missing_tokens = True
+                break
+
+        with torch_context:
            # Currently we assume the provided model always returns a TokensAndMasks object.
            tokens_and_masks: TokensAndMasks
            if isinstance(self.model, Encoder):
                # Encoder has a fast_pass argument to indicate mask is not needed.
                tokens_and_masks = self.model(
                    sample,
-                    fast_pass=True,
+                    fast_pass=not missing_tokens,
                    patch_size=self.patch_size,
                    **self.forward_kwargs,
                )["tokens_and_masks"]
@@ -235,16 +307,41 @@ class OlmoEarth(torch.nn.Module):

        # Apply temporal/modality pooling so we just have one feature per patch.
        features = []
-        for modality in present_modalities:
-            modality_features = getattr(tokens_and_masks, modality)
-            # Pool over band sets and timesteps (BHWTSC -> BHWC).
-            pooled = modality_features.mean(dim=[3, 4])
-            # We want BHWC -> BCHW.
-            pooled = rearrange(pooled, "b h w c -> b c h w")
-            features.append(pooled)
-        # Pool over the modalities, so we get one BCHW feature map.
-        pooled = torch.stack(features, dim=0).mean(dim=0)
-        return [pooled]
+        if self.token_pooling:
+            for modality in present_modalities:
+                modality_features = getattr(tokens_and_masks, modality)  # BHWTSC
+                # If fast_pass is False, we need to mask the missing tokens before pooling.
+                if missing_tokens:
+                    modality_masks = getattr(
+                        tokens_and_masks, f"{modality}_mask"
+                    )  # BHWTS
+                    modality_masks_bool = (
+                        modality_masks != MaskValue.MISSING.value
+                    ).unsqueeze(-1)
+                    count = modality_masks_bool.sum(dim=[3, 4])
+                    # Masked average over band sets and timesteps (BHWTSC -> BHWC).
+                    pooled = (modality_features * modality_masks_bool).sum(
+                        dim=[3, 4]
+                    ) / count.clamp(min=1)
+                else:
+                    # Pool over band sets and timesteps (BHWTSC -> BHWC).
+                    pooled = modality_features.mean(dim=[3, 4])
+                # We want BHWC -> BCHW.
+                pooled = rearrange(pooled, "b h w c -> b c h w")
+                features.append(pooled)
+            # Pool over the modalities, so we get one BCHW feature map.
+            pooled = torch.stack(features, dim=0).mean(dim=0)
+            return FeatureMaps([pooled])
+        else:
+            for modality in present_modalities:
+                modality_features = getattr(tokens_and_masks, modality)
+                # Combine band sets and timesteps into last dim (BHWTSC -> BHWCN).
+                modality_features = rearrange(
+                    modality_features, "b h w t s c -> b c h w (t s)"
+                )
+                features.append(modality_features)
+            pooled = torch.cat(features, dim=-1)
+            return TokenFeatureMaps([pooled])

    def get_backbone_channels(self) -> list:
        """Returns the output channels of this model when used as a backbone.
rslearn/models/panopticon.py
@@ -3,15 +3,16 @@
 import math
 from enum import StrEnum
 from importlib import resources
-from typing import Any

 import torch
 import torch.nn.functional as F
 import yaml
 from einops import rearrange, repeat
-from torch import nn

 from rslearn.log_utils import get_logger
+from rslearn.train.model_context import ModelContext
+
+from .component import FeatureExtractor, FeatureMaps

 logger = get_logger(__name__)

@@ -28,7 +29,7 @@ class PanopticonModalities(StrEnum):
    # Add more modalities as needed


-class Panopticon(nn.Module):
+class Panopticon(FeatureExtractor):
    """Class containing the Panopticon model that can ingest MaskedHeliosSample objects."""

    patch_size: int = 14
@@ -138,11 +139,11 @@ class Panopticon(nn.Module):
            "chn_ids": chn_ids,
        }

-    def forward(self, inputs: list[dict[str, Any]]) -> list[torch.Tensor]:
+    def forward(self, context: ModelContext) -> FeatureMaps:
        """Forward pass through the panopticon model."""
        batch_inputs = {
-            key: torch.stack([inp[key] for inp in inputs], dim=0)
-            for key in inputs[0].keys()
+            key: torch.stack([inp[key] for inp in context.inputs], dim=0)
+            for key in context.inputs[0].keys()
        }
        panopticon_inputs = self.prepare_input(batch_inputs)
        output_features = self.model.forward_features(panopticon_inputs)[
@@ -154,7 +155,7 @@ class Panopticon(nn.Module):
        output_features = rearrange(
            output_features, "b (h w) d -> b d h w", h=height, w=height
        )
-        return [output_features]
+        return FeatureMaps([output_features])

    def get_backbone_channels(self) -> list:
        """Returns the output channels of this model when used as a backbone.