rslearn 0.0.17__py3-none-any.whl → 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. rslearn/config/__init__.py +2 -0
  2. rslearn/config/dataset.py +49 -4
  3. rslearn/dataset/add_windows.py +1 -1
  4. rslearn/dataset/dataset.py +9 -65
  5. rslearn/dataset/materialize.py +5 -5
  6. rslearn/dataset/storage/__init__.py +1 -0
  7. rslearn/dataset/storage/file.py +202 -0
  8. rslearn/dataset/storage/storage.py +140 -0
  9. rslearn/dataset/window.py +26 -80
  10. rslearn/main.py +11 -36
  11. rslearn/models/anysat.py +11 -9
  12. rslearn/models/clay/clay.py +8 -9
  13. rslearn/models/clip.py +18 -15
  14. rslearn/models/component.py +99 -0
  15. rslearn/models/concatenate_features.py +21 -11
  16. rslearn/models/conv.py +15 -8
  17. rslearn/models/croma.py +13 -8
  18. rslearn/models/detr/detr.py +25 -14
  19. rslearn/models/dinov3.py +11 -6
  20. rslearn/models/faster_rcnn.py +19 -9
  21. rslearn/models/feature_center_crop.py +12 -9
  22. rslearn/models/fpn.py +19 -8
  23. rslearn/models/galileo/galileo.py +23 -18
  24. rslearn/models/module_wrapper.py +26 -57
  25. rslearn/models/molmo.py +16 -14
  26. rslearn/models/multitask.py +102 -73
  27. rslearn/models/olmoearth_pretrain/model.py +18 -12
  28. rslearn/models/panopticon.py +8 -7
  29. rslearn/models/pick_features.py +18 -24
  30. rslearn/models/pooling_decoder.py +22 -14
  31. rslearn/models/presto/presto.py +16 -10
  32. rslearn/models/presto/single_file_presto.py +4 -10
  33. rslearn/models/prithvi.py +12 -8
  34. rslearn/models/resize_features.py +21 -7
  35. rslearn/models/sam2_enc.py +11 -9
  36. rslearn/models/satlaspretrain.py +15 -9
  37. rslearn/models/simple_time_series.py +31 -17
  38. rslearn/models/singletask.py +24 -17
  39. rslearn/models/ssl4eo_s12.py +15 -10
  40. rslearn/models/swin.py +22 -13
  41. rslearn/models/terramind.py +24 -7
  42. rslearn/models/trunk.py +6 -3
  43. rslearn/models/unet.py +18 -9
  44. rslearn/models/upsample.py +22 -9
  45. rslearn/train/all_patches_dataset.py +22 -18
  46. rslearn/train/dataset.py +69 -54
  47. rslearn/train/lightning_module.py +51 -32
  48. rslearn/train/model_context.py +54 -0
  49. rslearn/train/prediction_writer.py +111 -41
  50. rslearn/train/tasks/classification.py +34 -15
  51. rslearn/train/tasks/detection.py +24 -31
  52. rslearn/train/tasks/embedding.py +33 -29
  53. rslearn/train/tasks/multi_task.py +7 -7
  54. rslearn/train/tasks/per_pixel_regression.py +41 -19
  55. rslearn/train/tasks/regression.py +38 -21
  56. rslearn/train/tasks/segmentation.py +33 -15
  57. rslearn/train/tasks/task.py +3 -2
  58. {rslearn-0.0.17.dist-info → rslearn-0.0.18.dist-info}/METADATA +1 -1
  59. {rslearn-0.0.17.dist-info → rslearn-0.0.18.dist-info}/RECORD +64 -61
  60. rslearn/dataset/index.py +0 -173
  61. rslearn/models/registry.py +0 -22
  62. {rslearn-0.0.17.dist-info → rslearn-0.0.18.dist-info}/WHEEL +0 -0
  63. {rslearn-0.0.17.dist-info → rslearn-0.0.18.dist-info}/entry_points.txt +0 -0
  64. {rslearn-0.0.17.dist-info → rslearn-0.0.18.dist-info}/licenses/LICENSE +0 -0
  65. {rslearn-0.0.17.dist-info → rslearn-0.0.18.dist-info}/licenses/NOTICE +0 -0
  66. {rslearn-0.0.17.dist-info → rslearn-0.0.18.dist-info}/top_level.txt +0 -0
rslearn/models/croma.py CHANGED
@@ -12,9 +12,11 @@ from einops import rearrange
 from upath import UPath
 
 from rslearn.log_utils import get_logger
+from rslearn.train.model_context import ModelContext
 from rslearn.train.transforms.transform import Transform
 from rslearn.utils.fsspec import open_atomic
 
+from .component import FeatureExtractor, FeatureMaps
 from .use_croma import PretrainedCROMA
 
 logger = get_logger(__name__)
@@ -76,7 +78,7 @@ MODALITY_BANDS = {
 }
 
 
-class Croma(torch.nn.Module):
+class Croma(FeatureExtractor):
     """CROMA backbones.
 
     There are two model sizes, base and large.
@@ -160,20 +162,23 @@ class Croma(torch.nn.Module):
             align_corners=False,
         )
 
-    def forward(self, inputs: list[dict[str, Any]]) -> list[torch.Tensor]:
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Compute feature maps from the Croma backbone.
 
-        Inputs:
-            inputs: input dicts that must include either/both of "sentinel2" or
-                "sentinel1" keys depending on the configured modality.
+        Args:
+            context: the model context. Input dicts must include either/both of
+                "sentinel2" or "sentinel1" keys depending on the configured modality.
+
+        Returns:
+            a FeatureMaps with one feature map at 1/8 the input resolution.
         """
         sentinel1: torch.Tensor | None = None
         sentinel2: torch.Tensor | None = None
         if self.modality in [CromaModality.BOTH, CromaModality.SENTINEL1]:
-            sentinel1 = torch.stack([inp["sentinel1"] for inp in inputs], dim=0)
+            sentinel1 = torch.stack([inp["sentinel1"] for inp in context.inputs], dim=0)
             sentinel1 = self._resize_image(sentinel1) if self.do_resizing else sentinel1
         if self.modality in [CromaModality.BOTH, CromaModality.SENTINEL2]:
-            sentinel2 = torch.stack([inp["sentinel2"] for inp in inputs], dim=0)
+            sentinel2 = torch.stack([inp["sentinel2"] for inp in context.inputs], dim=0)
             sentinel2 = self._resize_image(sentinel2) if self.do_resizing else sentinel2
 
         outputs = self.model(
@@ -200,7 +205,7 @@ class Croma(torch.nn.Module):
             w=num_patches_per_dim,
        )
 
-        return [features]
+        return FeatureMaps([features])
 
     def get_backbone_channels(self) -> list:
         """Returns the output channels of this model when used as a backbone.
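The same change repeats across the backbones in this release: forward() now takes a ModelContext and returns a FeatureMaps instead of a plain list of tensors. A minimal sketch of a backbone written against the new FeatureExtractor interface (the class name, constructor arguments, and conv layer below are illustrative assumptions, not part of rslearn):

    import torch

    from rslearn.models.component import FeatureExtractor, FeatureMaps
    from rslearn.train.model_context import ModelContext


    class ToyBackbone(FeatureExtractor):
        """Illustrative backbone following the new 0.0.18 component interface."""

        def __init__(self, in_channels: int = 3, out_channels: int = 64):
            super().__init__()
            # One strided conv standing in for a real encoder.
            self.conv = torch.nn.Conv2d(
                in_channels, out_channels, kernel_size=3, stride=8, padding=1
            )

        def forward(self, context: ModelContext) -> FeatureMaps:
            # Stack the per-example input dicts into a batch, as Croma does above.
            images = torch.stack([inp["image"] for inp in context.inputs], dim=0)
            # Wrap the (single-scale) feature map list in a FeatureMaps.
            return FeatureMaps([self.conv(images)])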
rslearn/models/detr/detr.py CHANGED
@@ -13,6 +13,8 @@ import torch.nn.functional as F
 from torch import nn
 
 import rslearn.models.detr.box_ops as box_ops
+from rslearn.models.component import FeatureMaps, Predictor
+from rslearn.train.model_context import ModelContext, ModelOutput
 
 from .matcher import HungarianMatcher
 from .position_encoding import PositionEmbeddingSine
@@ -405,7 +407,7 @@ class PostProcess(nn.Module):
         return results
 
 
-class Detr(nn.Module):
+class Detr(Predictor):
     """DETR prediction module.
 
     This combines PositionEmbeddingSine, DetrPredictor, SetCriterion, and PostProcess.
@@ -440,33 +442,39 @@
 
     def forward(
         self,
-        features: list[torch.Tensor],
-        inputs: list[dict[str, Any]],
+        intermediates: Any,
+        context: ModelContext,
         targets: list[dict[str, Any]] | None = None,
-    ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]:
+    ) -> ModelOutput:
         """Compute the detection outputs and loss from features.
 
         DETR will use only the last feature map, which should correspond to the lowest
         resolution one.
 
         Args:
-            features: multi-scale feature maps.
-            inputs: original inputs, should contain image key for original image size.
-            targets: should contain class key that stores the class label.
+            intermediates: the output from the previous component. It must be a FeatureMaps.
+            context: the model context. Input dicts must contain an "image" key which
+                will be used to establish the original image size.
+            targets: must contain class key that stores the class label.
 
         Returns:
-            tuple of outputs and loss dict.
+            the model output.
         """
+        if not isinstance(intermediates, FeatureMaps):
+            raise ValueError("input to Detr must be a FeatureMaps")
+
+        # We only use the last feature map (most fine-grained).
+        features = intermediates.feature_maps[-1]
+
         # Get image sizes.
         image_sizes = torch.tensor(
-            [[inp["image"].shape[2], inp["image"].shape[1]] for inp in inputs],
+            [[inp["image"].shape[2], inp["image"].shape[1]] for inp in context.inputs],
             dtype=torch.int32,
-            device=features[0].device,
+            device=features.device,
         )
 
-        feat_map = features[-1]
-        pos_embedding = self.pos_embedding(feat_map)
-        outputs = self.predictor(feat_map, pos_embedding)
+        pos_embedding = self.pos_embedding(features)
+        outputs = self.predictor(features, pos_embedding)
 
         if targets is not None:
             # Convert boxes from [x0, y0, x1, y1] to [cx, cy, w, h].
@@ -490,4 +498,7 @@
 
         results = self.postprocess(outputs, image_sizes)
 
-        return results, losses
+        return ModelOutput(
+            outputs=results,
+            loss_dict=losses,
+        )
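Prediction heads such as Detr and FasterRCNN now implement the Predictor interface: forward(intermediates, context, targets) returns a ModelOutput carrying both the outputs and the loss dict. A hedged sketch of a toy head under that contract (the pooling classifier, the "class" target key, and the loss name are illustrative assumptions, not rslearn's detection tasks):

    from typing import Any

    import torch
    import torch.nn.functional as F

    from rslearn.models.component import FeatureMaps, Predictor
    from rslearn.train.model_context import ModelContext, ModelOutput


    class ToyClassifierHead(Predictor):
        """Illustrative head: global-average-pool the last feature map and classify."""

        def __init__(self, in_channels: int, num_classes: int):
            super().__init__()
            self.linear = torch.nn.Linear(in_channels, num_classes)

        def forward(
            self,
            intermediates: Any,
            context: ModelContext,
            targets: list[dict[str, Any]] | None = None,
        ) -> ModelOutput:
            if not isinstance(intermediates, FeatureMaps):
                raise ValueError("input to ToyClassifierHead must be FeatureMaps")
            # Pool the last feature map over its spatial dimensions.
            pooled = intermediates.feature_maps[-1].mean(dim=(2, 3))
            logits = self.linear(pooled)
            losses: dict[str, torch.Tensor] = {}
            if targets is not None:
                # Assumes each target dict stores an integer class label tensor.
                labels = torch.stack([target["class"] for target in targets])
                losses["cls"] = F.cross_entropy(logits, labels)
            return ModelOutput(outputs=logits, loss_dict=losses)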
rslearn/models/dinov3.py CHANGED
@@ -13,9 +13,12 @@ import torch
 import torchvision
 from einops import rearrange
 
+from rslearn.train.model_context import ModelContext
 from rslearn.train.transforms.normalize import Normalize
 from rslearn.train.transforms.transform import Transform
 
+from .component import FeatureExtractor, FeatureMaps
+
 
 class DinoV3Models(StrEnum):
     """Names for different DinoV3 images on torch hub."""
@@ -40,7 +43,7 @@ DINOV3_PTHS: dict[str, str] = {
 }
 
 
-class DinoV3(torch.nn.Module):
+class DinoV3(FeatureExtractor):
     """DinoV3 Backbones.
 
     Must have the pretrained weights downloaded in checkpoint_dir for them to be loaded.
@@ -91,16 +94,18 @@ class DinoV3(torch.nn.Module):
         self.do_resizing = do_resizing
         self.model = self._load_model(size, checkpoint_dir)
 
-    def forward(self, inputs: list[dict[str, Any]]) -> list[torch.Tensor]:
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Forward pass for the dinov3 model.
 
         Args:
-            inputs: input dicts that must include "image" key.
+            context: the model context. Input dicts must include "image" key.
 
         Returns:
-            List[torch.Tensor]: Single-scale feature tensors from the encoder.
+            a FeatureMaps with one feature map.
         """
-        cur = torch.stack([inp["image"] for inp in inputs], dim=0)  # (B, C, H, W)
+        cur = torch.stack(
+            [inp["image"] for inp in context.inputs], dim=0
+        )  # (B, C, H, W)
 
         if self.do_resizing and (
             cur.shape[2] != self.image_size or cur.shape[3] != self.image_size
@@ -118,7 +123,7 @@ class DinoV3(torch.nn.Module):
         height, width = int(num_patches**0.5), int(num_patches**0.5)
         features = rearrange(features, "b (h w) d -> b d h w", h=height, w=width)
 
-        return [features]
+        return FeatureMaps([features])
 
     def get_backbone_channels(self) -> list:
         """Returns the output channels of this model when used as a backbone.
rslearn/models/faster_rcnn.py CHANGED
@@ -6,6 +6,10 @@ from typing import Any
 import torch
 import torchvision
 
+from rslearn.train.model_context import ModelContext, ModelOutput
+
+from .component import FeatureMaps, Predictor
+
 
 class NoopTransform(torch.nn.Module):
     """A placeholder transform used with torchvision detection model."""
@@ -55,7 +59,7 @@ class NoopTransform(torch.nn.Module):
         return image_list, targets
 
 
-class FasterRCNN(torch.nn.Module):
+class FasterRCNN(Predictor):
     """Faster R-CNN head for predicting bounding boxes.
 
     It inputs multi-scale features, using each feature map to predict ROIs and then
@@ -176,20 +180,23 @@ class FasterRCNN(torch.nn.Module):
 
     def forward(
         self,
-        features: list[torch.Tensor],
-        inputs: list[dict[str, Any]],
+        intermediates: Any,
+        context: ModelContext,
         targets: list[dict[str, Any]] | None = None,
-    ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]:
+    ) -> ModelOutput:
         """Compute the detection outputs and loss from features.
 
         Args:
-            features: multi-scale feature maps.
-            inputs: original inputs, should cotnain image key for original image size.
+            intermediates: the output from the previous component, which must be a FeatureMaps.
+            context: the model context. Input dicts must contain image key for original image size.
             targets: should contain class key that stores the class label.
 
         Returns:
             tuple of outputs and loss dict
         """
+        if not isinstance(intermediates, FeatureMaps):
+            raise ValueError("input to FasterRCNN must be FeatureMaps")
+
         # Fix target labels to be 1 size in case it's empty.
         # For some reason this is needed.
         if targets:
@@ -203,11 +210,11 @@ class FasterRCNN(torch.nn.Module):
                 ),
            )
 
-        image_list = [inp["image"] for inp in inputs]
+        image_list = [inp["image"] for inp in context.inputs]
         images, targets = self.noop_transform(image_list, targets)
 
         feature_dict = collections.OrderedDict()
-        for i, feat_map in enumerate(features):
+        for i, feat_map in enumerate(intermediates.feature_maps):
             feature_dict[f"feat{i}"] = feat_map
 
         proposals, proposal_losses = self.rpn(images, feature_dict, targets)
@@ -219,4 +226,7 @@ class FasterRCNN(torch.nn.Module):
         losses.update(proposal_losses)
         losses.update(detector_losses)
 
-        return detections, losses
+        return ModelOutput(
+            outputs=detections,
+            loss_dict=losses,
+        )
rslearn/models/feature_center_crop.py CHANGED
@@ -2,10 +2,12 @@
 
 from typing import Any
 
-import torch
+from rslearn.train.model_context import ModelContext
 
+from .component import FeatureMaps, IntermediateComponent
 
-class FeatureCenterCrop(torch.nn.Module):
+
+class FeatureCenterCrop(IntermediateComponent):
     """Apply center cropping on the input feature maps."""
 
     def __init__(
@@ -24,20 +26,21 @@ class FeatureCenterCrop(torch.nn.Module):
         super().__init__()
         self.sizes = sizes
 
-    def forward(
-        self, features: list[torch.Tensor], inputs: list[dict[str, Any]]
-    ) -> list[torch.Tensor]:
+    def forward(self, intermediates: Any, context: ModelContext) -> FeatureMaps:
         """Apply center cropping on the feature maps.
 
         Args:
-            features: list of feature maps at different resolutions.
-            inputs: original inputs (ignored).
+            intermediates: output from the previous model component, which must be a FeatureMaps.
+            context: the model context.
 
         Returns:
             center cropped feature maps.
         """
+        if not isinstance(intermediates, FeatureMaps):
+            raise ValueError("input to FeatureCenterCrop must be FeatureMaps")
+
         new_features = []
-        for i, feat in enumerate(features):
+        for i, feat in enumerate(intermediates.feature_maps):
             height, width = self.sizes[i]
             if feat.shape[2] < height or feat.shape[3] < width:
                 raise ValueError(
@@ -47,4 +50,4 @@ class FeatureCenterCrop(torch.nn.Module):
             start_w = feat.shape[3] // 2 - width // 2
             feat = feat[:, :, start_h : start_h + height, start_w : start_w + width]
             new_features.append(feat)
-        return new_features
+        return FeatureMaps(new_features)
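Feature-space transforms such as FeatureCenterCrop and Fpn now implement the IntermediateComponent interface: forward(intermediates, context) receives the previous component's output (normally a FeatureMaps) plus the ModelContext and returns new intermediates. A small illustrative component under that contract (the class itself is an assumption for demonstration, not part of rslearn):

    from typing import Any

    from rslearn.models.component import FeatureMaps, IntermediateComponent
    from rslearn.train.model_context import ModelContext


    class ScaleFeatures(IntermediateComponent):
        """Illustrative component that multiplies every feature map by a constant."""

        def __init__(self, factor: float = 2.0):
            super().__init__()
            self.factor = factor

        def forward(self, intermediates: Any, context: ModelContext) -> FeatureMaps:
            if not isinstance(intermediates, FeatureMaps):
                raise ValueError("input to ScaleFeatures must be FeatureMaps")
            # Scale each map and rewrap the list in a FeatureMaps.
            return FeatureMaps([f * self.factor for f in intermediates.feature_maps])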
rslearn/models/fpn.py CHANGED
@@ -1,12 +1,16 @@
 """Feature pyramid network."""
 
 import collections
+from typing import Any
 
-import torch
 import torchvision
 
+from rslearn.train.model_context import ModelContext
 
-class Fpn(torch.nn.Module):
+from .component import FeatureMaps, IntermediateComponent
+
+
+class Fpn(IntermediateComponent):
     """A feature pyramid network (FPN).
 
     The FPN inputs a multi-scale feature map. At each scale, it computes new features
@@ -32,20 +36,27 @@ class Fpn(torch.nn.Module):
             in_channels_list=in_channels, out_channels=out_channels
         )
 
-    def forward(self, x: list[torch.Tensor]) -> list[torch.Tensor]:
+    def forward(self, intermediates: Any, context: ModelContext) -> FeatureMaps:
         """Compute outputs of the FPN.
 
         Args:
-            x: the multi-scale feature maps
+            intermediates: the output from the previous component, which must be a FeatureMaps.
+            context: the model context.
 
         Returns:
-            new multi-scale feature maps from the FPN
+            new multi-scale feature maps from the FPN.
         """
-        inp = collections.OrderedDict([(f"feat{i}", el) for i, el in enumerate(x)])
+        if not isinstance(intermediates, FeatureMaps):
+            raise ValueError("input to Fpn must be FeatureMaps")
+
+        feature_maps = intermediates.feature_maps
+        inp = collections.OrderedDict(
+            [(f"feat{i}", el) for i, el in enumerate(feature_maps)]
+        )
         output = self.fpn(inp)
         output = list(output.values())
 
         if self.prepend:
-            return output + x
+            return FeatureMaps(output + feature_maps)
         else:
-            return output
+            return FeatureMaps(output)
rslearn/models/galileo/galileo.py CHANGED
@@ -4,16 +4,16 @@ import math
 import tempfile
 from contextlib import nullcontext
 from enum import StrEnum
-from typing import Any, cast
+from typing import cast
 
 import numpy as np
 import torch
-import torch.nn as nn
 from einops import rearrange, repeat
 from huggingface_hub import hf_hub_download
 from upath import UPath
 
 from rslearn.log_utils import get_logger
+from rslearn.models.component import FeatureExtractor, FeatureMaps
 from rslearn.models.galileo.single_file_galileo import (
     CONFIG_FILENAME,
     DW_BANDS,
@@ -39,6 +39,7 @@ from rslearn.models.galileo.single_file_galileo import (
     MaskedOutput,
     Normalizer,
 )
+from rslearn.train.model_context import ModelContext
 
 logger = get_logger(__name__)
 
@@ -70,7 +71,7 @@ AUTOCAST_DTYPE_MAP = {
 }
 
 
-class GalileoModel(nn.Module):
+class GalileoModel(FeatureExtractor):
     """Galileo backbones."""
 
     input_keys = [
@@ -410,11 +411,11 @@
             months=months,
         )
 
-    def forward(self, inputs: list[dict[str, Any]]) -> list[torch.Tensor]:
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Compute feature maps from the Galileo backbone.
 
-        Inputs:
-            inputs: a dictionary of tensors, where the keys are one of Galileo.input_keys
+        Args:
+            context: the model context. Input dicts should contain keys corresponding to Galileo.input_keys
                 (also documented below) and values are tensors of the following shapes,
                 per input key:
                 "s1": B (T * C) H W
@@ -436,10 +437,12 @@
         take a pool of the space_time unmasked tokens (i.e. of the s1 and s2 tokens).
         """
         stacked_inputs = {}
-        for key in inputs[0].keys():
+        for key in context.inputs[0].keys():
             # assume all the keys in an input are consistent
             if key in self.input_keys:
-                stacked_inputs[key] = torch.stack([inp[key] for inp in inputs], dim=0)
+                stacked_inputs[key] = torch.stack(
+                    [inp[key] for inp in context.inputs], dim=0
+                )
         s_t_channels = []
         for space_time_modality in ["s1", "s2"]:
             if space_time_modality not in stacked_inputs:
@@ -502,14 +505,14 @@
         # Decide context based on self.autocast_dtype.
         device = galileo_input.s_t_x.device
         if self.autocast_dtype is None:
-            context = nullcontext()
+            torch_context = nullcontext()
         else:
             assert device is not None
-            context = torch.amp.autocast(
+            torch_context = torch.amp.autocast(
                 device_type=device.type, dtype=self.autocast_dtype
            )
 
-        with context:
+        with torch_context:
             outputs = self.model(
                 s_t_x=galileo_input.s_t_x,
                 s_t_m=galileo_input.s_t_m,
@@ -530,18 +533,20 @@
             averaged = self.model.average_tokens(
                 s_t_x, sp_x, t_x, st_x, s_t_m, sp_m, t_m, st_m
            )
-            return [repeat(averaged, "b d -> b d 1 1")]
+            return FeatureMaps([repeat(averaged, "b d -> b d 1 1")])
         else:
             s_t_x = outputs[0]
             # we will be assuming we only want s_t_x, and (for now) that we want s1 or s2 bands
             # s_t_x has shape [b, h, w, t, c_g, d]
             # and we want [b, d, h, w]
-            return [
-                rearrange(
-                    s_t_x[:, :, :, :, s_t_channels, :].mean(dim=3),
-                    "b h w c_g d -> b c_g d h w",
-                ).mean(dim=1)
-            ]
+            return FeatureMaps(
+                [
+                    rearrange(
+                        s_t_x[:, :, :, :, s_t_channels, :].mean(dim=3),
+                        "b h w c_g d -> b c_g d h w",
+                    ).mean(dim=1)
+                ]
+            )
 
     def get_backbone_channels(self) -> list:
         """Returns the output channels of this model when used as a backbone.
rslearn/models/module_wrapper.py CHANGED
@@ -1,67 +1,35 @@
-"""Module wrappers."""
+"""Module wrapper provided for backwards compatibility."""
 
 from typing import Any
 
 import torch
 
+from rslearn.train.model_context import ModelContext
 
-class DecoderModuleWrapper(torch.nn.Module):
-    """Wrapper for a module that processes features to work in decoder.
+from .component import (
+    FeatureExtractor,
+    FeatureMaps,
+    IntermediateComponent,
+)
 
-    The module should input feature map and produce a new feature map.
 
-    We wrap it to process each feature map in multi-scale features which is what's used
-    for most decoders.
-    """
-
-    def __init__(
-        self,
-        module: torch.nn.Module,
-    ):
-        """Initialize a DecoderModuleWrapper.
+class EncoderModuleWrapper(FeatureExtractor):
+    """Wraps one or more IntermediateComponents to function as the feature extractor.
 
-        Args:
-            module: the module to wrap
-        """
-        super().__init__()
-        self.module = module
-
-    def forward(
-        self, features: list[torch.Tensor], inputs: list[torch.Tensor]
-    ) -> list[torch.Tensor]:
-        """Apply the wrapped module on each feature map.
-
-        Args:
-            features: list of feature maps at different resolutions.
-            inputs: original inputs (ignored).
-
-        Returns:
-            new features
-        """
-        new_features = []
-        for feat_map in features:
-            feat_map = self.module(feat_map)
-            new_features.append(feat_map)
-        return new_features
-
-
-class EncoderModuleWrapper(torch.nn.Module):
-    """Wraps a module that is intended to be used as the decoder to work in encoder.
-
-    The module should input a feature map that corresponds to the original image, i.e.
-    the depth of the feature map would be the number of bands in the input image.
+    The first component should input a FeatureMaps, which will be computed from the
+    overall inputs by stacking the "image" key from each input dict.
     """
 
     def __init__(
         self,
-        module: torch.nn.Module | None = None,
-        modules: list[torch.nn.Module] = [],
+        module: IntermediateComponent | None = None,
+        modules: list[IntermediateComponent] = [],
     ):
         """Initialize an EncoderModuleWrapper.
 
         Args:
-            module: the encoder module to wrap. Exactly one one of module or modules
-                must be set.
+            module: the IntermediateComponent to wrap for use as a FeatureExtractor.
+                Exactly one of module or modules must be set.
             modules: list of modules to wrap
         """
         super().__init__()
@@ -74,18 +42,19 @@ class EncoderModuleWrapper(torch.nn.Module):
         else:
             raise ValueError("one of module or modules must be set")
 
-    def forward(
-        self,
-        inputs: list[dict[str, Any]],
-    ) -> list[torch.Tensor]:
+    def forward(self, context: ModelContext) -> Any:
         """Compute outputs from the wrapped module.
 
-        Inputs:
-            inputs: input dicts that must include "image" key containing the image to
-                process.
+        Args:
+            context: the model context. Input dicts must include "image" key containing
+                the image to convert to a FeatureMaps, which will be passed to the
+                first wrapped module.
+
+        Returns:
+            the output from the last wrapped module.
         """
-        images = torch.stack([inp["image"] for inp in inputs], dim=0)
-        cur = [images]
+        images = torch.stack([inp["image"] for inp in context.inputs], dim=0)
+        cur: Any = FeatureMaps([images])
         for m in self.encoder_modules:
-            cur = m(cur, inputs)
+            cur = m(cur, context)
         return cur
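With DecoderModuleWrapper removed, EncoderModuleWrapper is the remaining adapter: it stacks the "image" inputs into a FeatureMaps itself and threads the ModelContext through each wrapped IntermediateComponent. A hedged usage sketch, reusing the illustrative ScaleFeatures component defined in the earlier sketch (any IntermediateComponent would do):

    from rslearn.models.module_wrapper import EncoderModuleWrapper

    # Wrap a single IntermediateComponent so it acts as the feature extractor;
    # it receives FeatureMaps([stacked "image" tensors]) plus the ModelContext.
    encoder = EncoderModuleWrapper(module=ScaleFeatures(factor=0.5))

    # Or chain several components; each receives the previous component's output.
    encoder = EncoderModuleWrapper(modules=[ScaleFeatures(2.0), ScaleFeatures(0.5)])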
rslearn/models/molmo.py CHANGED
@@ -1,12 +1,14 @@
 """Molmo model."""
 
-from typing import Any
-
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
 
+from rslearn.train.model_context import ModelContext
+
+from .component import FeatureExtractor, FeatureMaps
 
-class Molmo(torch.nn.Module):
+
+class Molmo(FeatureExtractor):
     """Molmo image encoder."""
 
     def __init__(
@@ -34,21 +36,21 @@ class Molmo(torch.nn.Module):
         ) # nosec
         self.encoder = model.model.vision_backbone
 
-    def forward(self, inputs: list[dict[str, Any]]) -> list[torch.Tensor]:
+    def forward(self, context: ModelContext) -> FeatureMaps:
         """Compute outputs from the backbone.
 
-        Inputs:
-            inputs: input dicts that must include "image" key containing the image to
-                process. The images should have values 0-255.
+        Args:
+            context: the model context. Input dicts must include "image" key containing
+                the image to process. The images should have values 0-255.
 
         Returns:
-            list of feature maps. Molmo produces features at one scale, so the list
-                contains a single Bx24x24x2048 tensor.
+            a FeatureMaps. Molmo produces features at one scale, so it will contain one
+                feature map that is a Bx24x24x2048 tensor.
         """
-        device = inputs[0]["image"].device
+        device = context.inputs[0]["image"].device
         molmo_inputs_list = []
         # Process each one so we can isolate just the full image without any crops.
-        for inp in inputs:
+        for inp in context.inputs:
             image = inp["image"].cpu().numpy().transpose(1, 2, 0)
             processed = self.processor.process(
                 images=[image],
@@ -60,6 +62,6 @@
         image_features, _ = self.encoder.encode_image(molmo_inputs.to(device))
 
         # 576x2048 -> 24x24x2048
-        return [
-            image_features[:, 0, :, :].reshape(-1, 24, 24, 2048).permute(0, 3, 1, 2)
-        ]
+        return FeatureMaps(
+            [image_features[:, 0, :, :].reshape(-1, 24, 24, 2048).permute(0, 3, 1, 2)]
+        )