rslearn 0.0.16__py3-none-any.whl → 0.0.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rslearn/config/__init__.py +2 -0
- rslearn/config/dataset.py +55 -4
- rslearn/dataset/add_windows.py +1 -1
- rslearn/dataset/dataset.py +9 -65
- rslearn/dataset/materialize.py +5 -5
- rslearn/dataset/storage/__init__.py +1 -0
- rslearn/dataset/storage/file.py +202 -0
- rslearn/dataset/storage/storage.py +140 -0
- rslearn/dataset/window.py +26 -80
- rslearn/lightning_cli.py +10 -3
- rslearn/main.py +11 -36
- rslearn/models/anysat.py +11 -9
- rslearn/models/clay/clay.py +8 -9
- rslearn/models/clip.py +18 -15
- rslearn/models/component.py +99 -0
- rslearn/models/concatenate_features.py +21 -11
- rslearn/models/conv.py +15 -8
- rslearn/models/croma.py +13 -8
- rslearn/models/detr/detr.py +25 -14
- rslearn/models/dinov3.py +11 -6
- rslearn/models/faster_rcnn.py +19 -9
- rslearn/models/feature_center_crop.py +12 -9
- rslearn/models/fpn.py +19 -8
- rslearn/models/galileo/galileo.py +23 -18
- rslearn/models/module_wrapper.py +26 -57
- rslearn/models/molmo.py +16 -14
- rslearn/models/multitask.py +102 -73
- rslearn/models/olmoearth_pretrain/model.py +20 -17
- rslearn/models/panopticon.py +8 -7
- rslearn/models/pick_features.py +18 -24
- rslearn/models/pooling_decoder.py +22 -14
- rslearn/models/presto/presto.py +16 -10
- rslearn/models/presto/single_file_presto.py +4 -10
- rslearn/models/prithvi.py +12 -8
- rslearn/models/resize_features.py +21 -7
- rslearn/models/sam2_enc.py +11 -9
- rslearn/models/satlaspretrain.py +15 -9
- rslearn/models/simple_time_series.py +31 -17
- rslearn/models/singletask.py +24 -17
- rslearn/models/ssl4eo_s12.py +15 -10
- rslearn/models/swin.py +22 -13
- rslearn/models/terramind.py +24 -7
- rslearn/models/trunk.py +6 -3
- rslearn/models/unet.py +18 -9
- rslearn/models/upsample.py +22 -9
- rslearn/train/all_patches_dataset.py +22 -18
- rslearn/train/dataset.py +69 -54
- rslearn/train/lightning_module.py +51 -32
- rslearn/train/model_context.py +54 -0
- rslearn/train/prediction_writer.py +111 -41
- rslearn/train/tasks/classification.py +34 -15
- rslearn/train/tasks/detection.py +24 -31
- rslearn/train/tasks/embedding.py +33 -29
- rslearn/train/tasks/multi_task.py +7 -7
- rslearn/train/tasks/per_pixel_regression.py +41 -19
- rslearn/train/tasks/regression.py +38 -21
- rslearn/train/tasks/segmentation.py +33 -15
- rslearn/train/tasks/task.py +3 -2
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/METADATA +58 -25
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/RECORD +65 -62
- rslearn/dataset/index.py +0 -173
- rslearn/models/registry.py +0 -22
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/WHEEL +0 -0
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/entry_points.txt +0 -0
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/licenses/LICENSE +0 -0
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/licenses/NOTICE +0 -0
- {rslearn-0.0.16.dist-info → rslearn-0.0.18.dist-info}/top_level.txt +0 -0
rslearn/models/molmo.py
CHANGED
@@ -1,12 +1,14 @@
 """Molmo model."""

-from typing import Any
-
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor

+from rslearn.train.model_context import ModelContext
+
+from .component import FeatureExtractor, FeatureMaps

-class Molmo(torch.nn.Module):
+
+class Molmo(FeatureExtractor):
     """Molmo image encoder."""

     def __init__(
@@ -34,21 +36,21 @@ class Molmo(torch.nn.Module):
         ) # nosec
         self.encoder = model.model.vision_backbone

-    def forward(self,
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Compute outputs from the backbone.

-
-
-                process. The images should have values 0-255.
+        Args:
+            context: the model context. Input dicts must include "image" key containing
+                the image to process. The images should have values 0-255.

         Returns:
-
-
+            a FeatureMaps. Molmo produces features at one scale, so it will contain one
+            feature map that is a Bx24x24x2048 tensor.
         """
-        device = inputs[0]["image"].device
+        device = context.inputs[0]["image"].device
         molmo_inputs_list = []
         # Process each one so we can isolate just the full image without any crops.
-        for inp in inputs:
+        for inp in context.inputs:
             image = inp["image"].cpu().numpy().transpose(1, 2, 0)
             processed = self.processor.process(
                 images=[image],
@@ -60,6 +62,6 @@ class Molmo(torch.nn.Module):
         image_features, _ = self.encoder.encode_image(molmo_inputs.to(device))

         # 576x2048 -> 24x24x2048
-        return
-            image_features[:, 0, :, :].reshape(-1, 24, 24, 2048).permute(0, 3, 1, 2)
-
+        return FeatureMaps(
+            [image_features[:, 0, :, :].reshape(-1, 24, 24, 2048).permute(0, 3, 1, 2)]
+        )
rslearn/models/multitask.py
CHANGED
@@ -1,5 +1,6 @@
 """MultiTaskModel for rslearn."""

+from collections.abc import Iterable
 from copy import deepcopy
 from typing import Any

@@ -7,6 +8,9 @@ import torch

 from rslearn.log_utils import get_logger
 from rslearn.models.trunk import DecoderTrunk
+from rslearn.train.model_context import ModelContext, ModelOutput
+
+from .component import FeatureExtractor, IntermediateComponent, Predictor

 logger = get_logger(__name__)

@@ -58,8 +62,8 @@ class MultiTaskModel(torch.nn.Module):

     def __init__(
         self,
-        encoder: list[
-        decoders: dict[str, list[
+        encoder: list[FeatureExtractor | IntermediateComponent],
+        decoders: dict[str, list[IntermediateComponent | Predictor]],
         lazy_decode: bool = False,
         loss_weights: dict[str, float] | None = None,
         trunk: DecoderTrunk | None = None,
@@ -67,8 +71,12 @@ class MultiTaskModel(torch.nn.Module):
         """Initialize a new MultiTaskModel.

         Args:
-            encoder: modules to compute intermediate feature representations.
+            encoder: modules to compute intermediate feature representations. The first
+                module must be a FeatureExtractor, and following modules must be
+                IntermediateComponents.
             decoders: modules to compute outputs and loss, should match number of tasks.
+                The last module must be a Predictor, while the previous modules must be
+                IntermediateComponents.
             lazy_decode: if True, only decode the outputs specified in the batch.
             loss_weights: weights for each task's loss (default: None = equal weights).
             trunk: if provided, use this trunk module to postprocess the features
@@ -76,7 +84,7 @@ class MultiTaskModel(torch.nn.Module):
         """
         super().__init__()
         self.lazy_decode = lazy_decode
-        self.encoder = torch.nn.
+        self.encoder = torch.nn.ModuleList(encoder)
         self.decoders = torch.nn.ModuleDict(
             sort_keys(
                 {
@@ -120,32 +128,28 @@ class MultiTaskModel(torch.nn.Module):

     def apply_decoder(
         self,
-
-
+        intermediates: Any,
+        context: ModelContext,
         targets: list[dict[str, Any]] | None,
-        decoder: list[
+        decoder: list[IntermediateComponent | Predictor],
         task_name: str,
-
-        losses: dict[str, torch.Tensor],
-    ) -> tuple[list[dict[str, Any]], dict[str, torch.Tensor]]:
+    ) -> ModelOutput:
         """Apply a decoder to a list of inputs and targets.

         Args:
-
-
+            intermediates: the intermediate output from the encoder.
+            context: the model context.
             targets: list of target dicts
             decoder: list of decoder modules
             task_name: the name of the task
-            outputs: list of output dicts
-            losses: dictionary of loss values

         Returns:
-
+            a ModelOutput containing outputs across all the decoders.
         """
         # First, apply all but the last module in the decoder to the features
-        cur =
+        cur = intermediates
         for module in decoder[:-1]:
-            cur = module(cur,
+            cur = module(cur, context)

         if targets is None:
             cur_targets = None
@@ -153,14 +157,7 @@ class MultiTaskModel(torch.nn.Module):
             cur_targets = [target[task_name] for target in targets]

         # Then, apply the last module to the features and targets
-
-        for idx, entry in enumerate(cur_output):
-            outputs[idx][task_name] = entry
-        for loss_name, loss_value in cur_loss_dict.items():
-            losses[f"{task_name}_{loss_name}"] = (
-                loss_value * self.loss_weights[task_name]
-            )
-        return outputs, losses
+        return decoder[-1](cur, context, cur_targets)

     def _get_tasks_from_decoder(self, decoder: str) -> list[str]:
         """Get the tasks corresponding to this decoder.
@@ -172,66 +169,84 @@ class MultiTaskModel(torch.nn.Module):

     def apply_decoders(
         self,
-
-
+        intermediates: Any,
+        context: ModelContext,
         targets: list[dict[str, Any]] | None,
-    ) ->
+    ) -> ModelOutput:
         """Apply all the decoders to the features and targets.

         Args:
-
-
+            intermediates: the intermediates from the encoder.
+            context: the model context
             targets: list of target dicts

         Returns:
-
+            combined ModelOutput. The outputs is a list of output dicts, one per example,
+            where the dict maps from task name to the corresponding task output. The
+            losses is a flat dict but the task name is prepended to the loss names.
         """
-        outputs: list[dict[str, torch.Tensor | dict]] = [{} for _ in inputs]
+        outputs: list[dict[str, torch.Tensor | dict]] = [{} for _ in context.inputs]
         losses: dict[str, torch.Tensor] = {}

         if self.lazy_decode:
             # Assume that all inputs have the same dataset_source
-
-
-
-
-
-
+            task_name = context.metadatas[0].dataset_source
+
+            if task_name is None:
+                raise ValueError("dataset_source must be set for lazy decoding")
+
+            decoder = self.decoders[self.target_to_decoder.get(task_name, task_name)]
+            model_output = self.apply_decoder(
+                intermediates, context, targets, decoder, task_name
             )
+            for idx, entry in enumerate(model_output.outputs):
+                outputs[idx][task_name] = entry
+            for loss_name, loss_value in model_output.loss_dict.items():
+                losses[f"{task_name}_{loss_name}"] = (
+                    loss_value * self.loss_weights[task_name]
+                )
         else:
             for decoder_name, decoder in self.decoders.items():
                 for task_name in self._get_tasks_from_decoder(decoder_name):
-                    self.apply_decoder(
-
+                    model_output = self.apply_decoder(
+                        intermediates, context, targets, decoder, task_name
                     )
-
-
-
-
-
+                    for idx, entry in enumerate(model_output.outputs):
+                        outputs[idx][task_name] = entry
+                    for loss_name, loss_value in model_output.loss_dict.items():
+                        losses[f"{task_name}_{loss_name}"] = (
+                            loss_value * self.loss_weights[task_name]
+                        )
+
+        return ModelOutput(
+            outputs=outputs,
+            loss_dict=losses,
+        )

     def forward(
         self,
-
+        context: ModelContext,
         targets: list[dict[str, Any]] | None = None,
-    ) ->
+    ) -> ModelOutput:
         """Apply the sequence of modules on the inputs, including shared trunk.

         Args:
-
+            context: the model context.
             targets: optional list of target dicts

         Returns:
-
+            the model output from apply_decoders.
         """
-
+        cur = self.encoder[0](context)
+        for module in self.encoder[1:]:
+            cur = module(cur, context)
         if self.trunk is not None:
-            trunk_out = self.trunk(
-            outs = self.apply_decoders(trunk_out.pop("outputs"),
+            trunk_out = self.trunk(cur, context)
+            outs = self.apply_decoders(trunk_out.pop("outputs"), context, targets)
             self.trunk.apply_auxiliary_losses(trunk_out, outs)
             return outs | trunk_out
         else:
-            return self.apply_decoders(
+            return self.apply_decoders(cur, context, targets)


 class MultiTaskMergedModel(MultiTaskModel):
@@ -247,8 +262,8 @@ class MultiTaskMergedModel(MultiTaskModel):

     def __init__(
         self,
-        encoder: list[
-        decoders: dict[str, list[
+        encoder: list[FeatureExtractor | IntermediateComponent],
+        decoders: dict[str, list[IntermediateComponent | Predictor]],
         decoder_to_target: dict[str, list[str]],
         task_label_offsets: dict[str, dict[str, Any]],
         lazy_decode: bool = False,
@@ -273,7 +288,7 @@ class MultiTaskMergedModel(MultiTaskModel):
         torch.nn.Module.__init__(self)

         self.lazy_decode = lazy_decode
-        self.encoder = torch.nn.
+        self.encoder = torch.nn.ModuleList(encoder)
         self.decoders = torch.nn.ModuleDict(
             sort_keys(
                 {
@@ -329,9 +344,9 @@ class MultiTaskMergedModel(MultiTaskModel):
         return offset_targets

     def unmerge_output_labels(
-        self, outputs:
-    ) ->
-        """Unmerge the task
+        self, outputs: Iterable[Any], task_name: str
+    ) -> list[dict[str, torch.Tensor | dict]]:
+        """Unmerge the task outputs.

         For most tasks, this means chopping off the corresponding label dimensions.
         For some, we might just need to subtract an offset from the target (ex: segmentation).
@@ -340,10 +355,15 @@ class MultiTaskMergedModel(MultiTaskModel):
         Args:
             outputs: the predictions
             task_name: the name of the task
+
+        Returns:
+            the unmerged outputs.
         """
         offset = self.task_label_offsets[task_name]["offset"]
         num_outputs = self.task_label_offsets[task_name]["num_outputs"]
         output_key = self.task_label_offsets[task_name]["outputs_key"]
+
+        unmerged_outputs: list[dict[str, torch.Tensor | dict]] = [{} for _ in outputs]
         with torch.no_grad():
             for i, output in enumerate(outputs):
                 if not output:
@@ -353,35 +373,44 @@ class MultiTaskMergedModel(MultiTaskModel):
                 if isinstance(output, dict):
                     # For some tasks (eg object detection), we have discrete label
                     # predictions instead of a distribution over labels
-
+                    unmerged_output = output.copy()
+                    unmerged_output[output_key] = unmerged_output[output_key] - offset
+                    unmerged_outputs[i][task_name] = unmerged_output
                 elif isinstance(output, torch.Tensor):
                     # For classification/segmentation tasks, we have a distribution
                     # over labels, so we need to scale the predictions so that they
                     # sum to 1 since we chop off some of the probability densities
-
-
-
+                    unmerged_output = output[offset : offset + num_outputs, ...]
+                    unmerged_output /= unmerged_output.sum(dim=0, keepdim=True).type(
+                        torch.float32
+                    )
+                    unmerged_outputs[i][task_name] = unmerged_output
+
+        return unmerged_outputs

     def forward(
         self,
-
+        context: ModelContext,
         targets: list[dict[str, Any]] | None = None,
-    ) ->
+    ) -> ModelOutput:
         """Apply the sequence of modules on the inputs.

         Args:
-
+            context: the model context.
             targets: optional list of target dicts

         Returns:
-
+            the model output.
         """
-        dataset_source =
-        assert
-
-        outs = super().forward(
-        self.unmerge_output_labels(outs
-        return
+        dataset_source = context.metadatas[0].dataset_source
+        assert dataset_source is not None
+        merged_targets = self.merge_task_labels(targets, dataset_source)
+        outs = super().forward(context, merged_targets)
+        unmerged_outputs = self.unmerge_output_labels(outs.outputs, dataset_source)
+        return ModelOutput(
+            outputs=unmerged_outputs,
+            loss_dict=outs.loss_dict,
+        )

     def _get_tasks_from_decoder(self, decoder: str) -> list[str]:
         """Get the tasks corresponding to this decoder.
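The refactored apply_decoder now delegates entirely to the final decoder module, calling decoder[-1](cur, context, cur_targets) and expecting a ModelOutput back. A hedged sketch of what such a final module (a Predictor) could look like is below; ToyRegressionHead, the "value" target key, and the MSE loss are hypothetical, while the ModelOutput fields (outputs, loss_dict) and the call signature follow the diff above.

from typing import Any

import torch

from rslearn.models.component import Predictor
from rslearn.train.model_context import ModelContext, ModelOutput


class ToyRegressionHead(Predictor):
    """Hypothetical final decoder module mapping a feature vector to one value per example."""

    def __init__(self, in_channels: int) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(in_channels, 1)

    def forward(
        self,
        intermediates: Any,  # assumed to be a FeatureVector, e.g. from PoolingDecoder
        context: ModelContext,
        targets: list[dict[str, Any]] | None = None,
    ) -> ModelOutput:
        preds = self.linear(intermediates.feature_vector)[:, 0]
        losses: dict[str, torch.Tensor] = {}
        if targets is not None:
            # The "value" target key is made up for this example.
            gt = torch.stack([t["value"] for t in targets]).float()
            losses["mse"] = torch.nn.functional.mse_loss(preds, gt)
        # apply_decoders expects one output entry per input example.
        return ModelOutput(outputs=list(preds), loss_dict=losses)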
rslearn/models/olmoearth_pretrain/model.py
CHANGED
@@ -19,6 +19,8 @@ from olmoearth_pretrain.train.masking import MaskedOlmoEarthSample, MaskValue
 from upath import UPath

 from rslearn.log_utils import get_logger
+from rslearn.models.component import FeatureExtractor, FeatureMaps
+from rslearn.train.model_context import ModelContext

 logger = get_logger(__name__)

@@ -44,7 +46,7 @@ EMBEDDING_SIZES = {
 }


-class OlmoEarth(torch.nn.Module):
+class OlmoEarth(FeatureExtractor):
     """A wrapper to support the OlmoEarth model."""

     def __init__(
@@ -153,20 +155,21 @@ class OlmoEarth(torch.nn.Module):
         # Load the checkpoint.
         if not random_initialization:
             train_module_dir = checkpoint_upath / "model_and_optim"
-
-
-                logger.info(f"loaded OlmoEarth encoder from {train_module_dir}")
-            else:
-                logger.info(f"could not find OlmoEarth encoder at {train_module_dir}")
+            load_model_and_optim_state(str(train_module_dir), model)
+            logger.info(f"loaded OlmoEarth encoder from {train_module_dir}")

         return model

-    def forward(self,
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Compute feature maps from the OlmoEarth backbone.

-
-
-                that should be passed to the OlmoEarth model.
+        Args:
+            context: the model context. Input dicts should include keys corresponding
+                to the modalities that should be passed to the OlmoEarth model.
+
+        Returns:
+            a FeatureMaps consisting of one feature map, at 1/patch_size of the input
+            resolution. Embeddings will be pooled across modalities and timesteps.
         """
         kwargs = {}
         present_modalities = []
@@ -175,10 +178,10 @@ class OlmoEarth(torch.nn.Module):
         # We assume all multitemporal modalities have the same number of timesteps.
         max_timesteps = 1
         for modality in MODALITY_NAMES:
-            if modality not in inputs[0]:
+            if modality not in context.inputs[0]:
                 continue
             present_modalities.append(modality)
-            cur = torch.stack([inp[modality] for inp in inputs], dim=0)
+            cur = torch.stack([inp[modality] for inp in context.inputs], dim=0)
             device = cur.device
             # Check if it's single or multitemporal, and reshape accordingly
             num_bands = Modality.get(modality).num_bands
@@ -199,7 +202,7 @@ class OlmoEarth(torch.nn.Module):
         # Note that only months (0 to 11) are used in OlmoEarth position encoding.
         # For now, we assign same timestamps to all inputs, but later we should handle varying timestamps per input.
         timestamps = torch.zeros(
-            (len(inputs), max_timesteps, 3), dtype=torch.int32, device=device
+            (len(context.inputs), max_timesteps, 3), dtype=torch.int32, device=device
         )
         timestamps[:, :, 0] = 1  # day
         timestamps[:, :, 1] = torch.arange(max_timesteps, device=device)[
@@ -212,14 +215,14 @@ class OlmoEarth(torch.nn.Module):

         # Decide context based on self.autocast_dtype.
         if self.autocast_dtype is None:
-
+            torch_context = nullcontext()
         else:
             assert device is not None
-
+            torch_context = torch.amp.autocast(
                 device_type=device.type, dtype=self.autocast_dtype
             )

-        with
+        with torch_context:
             # Currently we assume the provided model always returns a TokensAndMasks object.
             tokens_and_masks: TokensAndMasks
             if isinstance(self.model, Encoder):
@@ -247,7 +250,7 @@ class OlmoEarth(torch.nn.Module):
             features.append(pooled)
         # Pool over the modalities, so we get one BCHW feature map.
         pooled = torch.stack(features, dim=0).mean(dim=0)
-        return [pooled]
+        return FeatureMaps([pooled])

     def get_backbone_channels(self) -> list:
         """Returns the output channels of this model when used as a backbone.
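One detail worth noting in the forward change is the renamed autocast helper: because the forward argument is now a ModelContext named context, the torch context manager is stored as torch_context, falling back to a no-op nullcontext when no autocast dtype is configured. A standalone sketch of that pattern follows (run_with_optional_autocast is hypothetical; the nullcontext/autocast selection mirrors the diff above).

from contextlib import nullcontext

import torch


def run_with_optional_autocast(
    fn, device: torch.device, autocast_dtype: torch.dtype | None
):
    """Run fn() under torch.amp.autocast only when a dtype is configured."""
    if autocast_dtype is None:
        torch_context = nullcontext()
    else:
        torch_context = torch.amp.autocast(
            device_type=device.type, dtype=autocast_dtype
        )
    with torch_context:
        return fn()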
rslearn/models/panopticon.py
CHANGED
@@ -3,15 +3,16 @@
 import math
 from enum import StrEnum
 from importlib import resources
-from typing import Any

 import torch
 import torch.nn.functional as F
 import yaml
 from einops import rearrange, repeat
-from torch import nn

 from rslearn.log_utils import get_logger
+from rslearn.train.model_context import ModelContext
+
+from .component import FeatureExtractor, FeatureMaps

 logger = get_logger(__name__)

@@ -28,7 +29,7 @@ class PanopticonModalities(StrEnum):
     # Add more modalities as needed


-class Panopticon(nn.Module):
+class Panopticon(FeatureExtractor):
     """Class containing the Panopticon model that can ingest MaskedHeliosSample objects."""

     patch_size: int = 14
@@ -138,11 +139,11 @@ class Panopticon(nn.Module):
             "chn_ids": chn_ids,
         }

-    def forward(self,
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Forward pass through the panopticon model."""
         batch_inputs = {
-            key: torch.stack([inp[key] for inp in inputs], dim=0)
-            for key in inputs[0].keys()
+            key: torch.stack([inp[key] for inp in context.inputs], dim=0)
+            for key in context.inputs[0].keys()
         }
         panopticon_inputs = self.prepare_input(batch_inputs)
         output_features = self.model.forward_features(panopticon_inputs)[
@@ -154,7 +155,7 @@ class Panopticon(nn.Module):
         output_features = rearrange(
             output_features, "b (h w) d -> b d h w", h=height, w=height
         )
-        return [output_features]
+        return FeatureMaps([output_features])

     def get_backbone_channels(self) -> list:
         """Returns the output channels of this model when used as a backbone.
rslearn/models/pick_features.py
CHANGED
@@ -2,45 +2,39 @@

 from typing import Any

-import
+from rslearn.train.model_context import ModelContext

+from .component import (
+    FeatureMaps,
+    IntermediateComponent,
+)

-
+
+class PickFeatures(IntermediateComponent):
     """Picks a subset of feature maps in a multi-scale feature map list."""

-    def __init__(self, indexes: list[int]
+    def __init__(self, indexes: list[int]):
         """Create a new PickFeatures.

         Args:
             indexes: the indexes of the input feature map list to select.
-            collapse: return one feature map instead of list. If enabled, indexes must
-                consist of one index. This is mainly useful for using PickFeatures as
-                the final module in the decoder, since the final prediction is expected
-                to be one feature map for most tasks like segmentation.
         """
         super().__init__()
         self.indexes = indexes
-        self.collapse = collapse
-
-        if self.collapse and len(self.indexes) != 1:
-            raise ValueError("if collapse is enabled, must get exactly one index")

     def forward(
         self,
-
-
-
-    ) -> list[torch.Tensor]:
+        intermediates: Any,
+        context: ModelContext,
+    ) -> FeatureMaps:
         """Pick a subset of the features.

         Args:
-
-
-            targets: targets, not used
+            intermediates: the output from the previous component, which must be a FeatureMaps.
+            context: the model context.
         """
-
-
-
-
-
-        return new_features
+        if not isinstance(intermediates, FeatureMaps):
+            raise ValueError("input to PickFeatures must be FeatureMaps")
+
+        new_features = [intermediates.feature_maps[idx] for idx in self.indexes]
+        return FeatureMaps(new_features)
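PickFeatures illustrates the general IntermediateComponent contract: forward(intermediates, context) receives the previous component's output (here required to be a FeatureMaps) and returns a new FeatureMaps. A hedged sketch of a custom component following the same pattern (ReverseFeatures is hypothetical):

from typing import Any

from rslearn.models.component import FeatureMaps, IntermediateComponent
from rslearn.train.model_context import ModelContext


class ReverseFeatures(IntermediateComponent):
    """Hypothetical component that reverses the order of the feature maps."""

    def forward(self, intermediates: Any, context: ModelContext) -> FeatureMaps:
        if not isinstance(intermediates, FeatureMaps):
            raise ValueError("input to ReverseFeatures must be FeatureMaps")
        # Return a new FeatureMaps, as PickFeatures does above.
        return FeatureMaps(list(reversed(intermediates.feature_maps)))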
rslearn/models/pooling_decoder.py
CHANGED
@@ -4,8 +4,16 @@ from typing import Any

 import torch

+from rslearn.train.model_context import ModelContext

-class PoolingDecoder(torch.nn.Module):
+from .component import (
+    FeatureMaps,
+    FeatureVector,
+    IntermediateComponent,
+)
+
+
+class PoolingDecoder(IntermediateComponent):
     """Decoder that computes flat vector from a 2D feature map.

     It inputs multi-scale features, but only uses the last feature map. Then applies a
@@ -57,25 +65,26 @@ class PoolingDecoder(torch.nn.Module):

         self.output_layer = torch.nn.Linear(prev_channels, out_channels)

-    def forward(
-        self, features: list[torch.Tensor], inputs: list[dict[str, Any]]
-    ) -> torch.Tensor:
+    def forward(self, intermediates: Any, context: ModelContext) -> Any:
         """Compute flat output vector from multi-scale feature map.

         Args:
-
-
+            intermediates: the output from the previous component, which must be a FeatureMaps.
+            context: the model context.

         Returns:
             flat feature vector
         """
+        if not isinstance(intermediates, FeatureMaps):
+            raise ValueError("input to PoolingDecoder must be a FeatureMaps")
+
         # Only use last feature map.
-        features =
+        features = intermediates.feature_maps[-1]

         features = self.conv_layers(features)
         features = torch.amax(features, dim=(2, 3))
         features = self.fc_layers(features)
-        return self.output_layer(features)
+        return FeatureVector(self.output_layer(features))


 class SegmentationPoolingDecoder(PoolingDecoder):
@@ -108,14 +117,13 @@ class SegmentationPoolingDecoder(PoolingDecoder):
         super().__init__(in_channels=in_channels, out_channels=out_channels, **kwargs)
         self.image_key = image_key

-    def forward(
-        self, features: list[torch.Tensor], inputs: list[dict[str, Any]]
-    ) -> torch.Tensor:
+    def forward(self, intermediates: Any, context: ModelContext) -> Any:
         """Extend PoolingDecoder forward to upsample the output to a segmentation mask.

         This only works when all of the pixels have the same segmentation target.
         """
-        output_probs = super().forward(
+        output_probs = super().forward(intermediates, context)
         # BC -> BCHW
-        h, w = inputs[0][self.image_key].shape[1:3]
-
+        h, w = context.inputs[0][self.image_key].shape[1:3]
+        feat_map = output_probs.feature_vector[:, :, None, None].repeat([1, 1, h, w])
+        return FeatureMaps([feat_map])
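The pooling step that PoolingDecoder applies after its conv layers is a spatial max over the feature map, collapsing BxCxHxW to BxC before the fully connected layers; the result is now wrapped in a FeatureVector. A small illustration of just that pooling step (the shapes are made up for the example):

import torch

features = torch.randn(2, 128, 16, 16)     # BxCxHxW feature map (example shapes)
pooled = torch.amax(features, dim=(2, 3))  # max over H and W -> shape (2, 128)
assert pooled.shape == (2, 128)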