rslearn 0.0.18__py3-none-any.whl → 0.0.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rslearn/arg_parser.py CHANGED
@@ -1,6 +1,5 @@
 """Custom Lightning ArgumentParser with environment variable substitution support."""
 
-import os
 from typing import Any
 
 from jsonargparse import Namespace
@@ -21,11 +20,7 @@ class RslearnArgumentParser(LightningArgumentParser):
     def parse_string(
         self,
         cfg_str: str,
-        cfg_path: str | os.PathLike = "",
-        ext_vars: dict | None = None,
-        env: bool | None = None,
-        defaults: bool = True,
-        with_meta: bool | None = None,
+        *args: Any,
         **kwargs: Any,
     ) -> Namespace:
         """Pre-processes string for environment variable substitution before parsing."""
@@ -33,6 +28,4 @@ class RslearnArgumentParser(LightningArgumentParser):
         substituted_cfg_str = substitute_env_vars_in_string(cfg_str)
 
         # Call the parent method with the substituted config
-        return super().parse_string(
-            substituted_cfg_str, cfg_path, ext_vars, env, defaults, with_meta, **kwargs
-        )
+        return super().parse_string(substituted_cfg_str, *args, **kwargs)
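
The refactored `parse_string` still substitutes environment variables into the config string before handing it to the parent parser, but now forwards any extra arguments untouched via `*args`/`**kwargs`. A rough usage sketch, assuming `substitute_env_vars_in_string` expands `${VAR}`-style placeholders (the exact placeholder syntax is not shown in this diff):

```python
import os

from rslearn.arg_parser import RslearnArgumentParser

os.environ["DATA_ROOT"] = "/data/my_dataset"

parser = RslearnArgumentParser()
parser.add_argument("--data.path", type=str)

# Hypothetical config string; "${DATA_ROOT}" is an assumed placeholder form.
cfg = parser.parse_string("data:\n  path: ${DATA_ROOT}\n")
print(cfg.data.path)  # "/data/my_dataset"
```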
rslearn/config/dataset.py CHANGED
@@ -25,7 +25,7 @@ from rasterio.enums import Resampling
 from upath import UPath
 
 from rslearn.log_utils import get_logger
-from rslearn.utils import PixelBounds, Projection
+from rslearn.utils.geometry import PixelBounds, Projection, ResolutionFactor
 from rslearn.utils.raster_format import RasterFormat
 from rslearn.utils.vector_format import VectorFormat
 
@@ -215,22 +215,12 @@ class BandSetConfig(BaseModel):
         Returns:
             tuple of updated projection and bounds with zoom offset applied
         """
-        if self.zoom_offset == 0:
-            return projection, bounds
-        projection = Projection(
-            projection.crs,
-            projection.x_resolution / (2**self.zoom_offset),
-            projection.y_resolution / (2**self.zoom_offset),
-        )
-        if self.zoom_offset > 0:
-            zoom_factor = 2**self.zoom_offset
-            bounds = tuple(x * zoom_factor for x in bounds)  # type: ignore
+        if self.zoom_offset >= 0:
+            factor = ResolutionFactor(numerator=2**self.zoom_offset)
         else:
-            bounds = tuple(
-                x // (2 ** (-self.zoom_offset))
-                for x in bounds  # type: ignore
-            )
-        return projection, bounds
+            factor = ResolutionFactor(denominator=2 ** (-self.zoom_offset))
+
+        return (factor.multiply_projection(projection), factor.multiply_bounds(bounds))
 
     @field_validator("format", mode="before")
     @classmethod
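
The refactor delegates the zoom arithmetic to `ResolutionFactor`, but the intended behavior matches the removed code: a positive `zoom_offset` of k divides the per-pixel resolution by 2^k and multiplies pixel bounds by 2^k, while a negative offset does the opposite (with integer division on bounds). A standalone sketch of that arithmetic, not using the rslearn classes:

```python
def apply_zoom_offset(resolution: float, bounds: tuple[int, int, int, int], zoom_offset: int):
    """Mirror the old BandSetConfig logic: same window, finer or coarser pixel grid."""
    if zoom_offset >= 0:
        factor = 2**zoom_offset
        return resolution / factor, tuple(x * factor for x in bounds)
    factor = 2 ** (-zoom_offset)
    return resolution * factor, tuple(x // factor for x in bounds)

# zoom_offset=1: 10 m/pixel becomes 5 m/pixel and pixel coordinates double.
print(apply_zoom_offset(10.0, (0, 0, 256, 256), 1))   # (5.0, (0, 0, 512, 512))
print(apply_zoom_offset(10.0, (0, 0, 256, 256), -1))  # (20.0, (0, 0, 128, 128))
```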
@@ -645,3 +635,12 @@ class DatasetConfig(BaseModel):
         default_factory=lambda: StorageConfig(),
         description="jsonargparse configuration for the WindowStorageFactory.",
     )
+
+    @field_validator("layers", mode="after")
+    @classmethod
+    def layer_names_validator(cls, v: dict[str, LayerConfig]) -> dict[str, LayerConfig]:
+        """Ensure layer names don't contain periods, since we use periods to distinguish different materialized groups within a layer."""
+        for layer_name in v.keys():
+            if "." in layer_name:
+                raise ValueError(f"layer names must not contain periods: {layer_name}")
+        return v
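
The period check that used to be an `assert` in `Dataset.__init__` now runs when the config is validated. A minimal standalone sketch of the same pydantic pattern (toy classes, not the actual rslearn models):

```python
from pydantic import BaseModel, ValidationError, field_validator


class ToyDatasetConfig(BaseModel):
    layers: dict[str, dict]

    @field_validator("layers", mode="after")
    @classmethod
    def layer_names_validator(cls, v: dict[str, dict]) -> dict[str, dict]:
        for layer_name in v:
            if "." in layer_name:
                raise ValueError(f"layer names must not contain periods: {layer_name}")
        return v


try:
    ToyDatasetConfig(layers={"sentinel2.bad": {}})
except ValidationError as e:
    print(e)  # reports the period in "sentinel2.bad"
```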
@@ -23,7 +23,7 @@ class Dataset:
     .. code-block:: none
 
         dataset/
-            config.json
+            config.json  # optional, if config provided as runtime object
            windows/
                group1/
                    epsg:3857_10_623565_1528020/
@@ -40,37 +40,43 @@ class Dataset:
     materialize.
     """
 
-    def __init__(self, path: UPath, disabled_layers: list[str] = []) -> None:
+    def __init__(
+        self,
+        path: UPath,
+        disabled_layers: list[str] = [],
+        dataset_config: DatasetConfig | None = None,
+    ) -> None:
         """Initializes a new Dataset.
 
         Args:
             path: the root directory of the dataset
             disabled_layers: list of layers to disable
+            dataset_config: optional dataset configuration to use instead of loading from the dataset directory
         """
         self.path = path
 
-        # Load dataset configuration.
-        with (self.path / "config.json").open("r") as f:
-            config_content = f.read()
-        config_content = substitute_env_vars_in_string(config_content)
-        config = DatasetConfig.model_validate(json.loads(config_content))
-
-        self.layers = {}
-        for layer_name, layer_config in config.layers.items():
-            # Layer names must not contain period, since we use period to
-            # distinguish different materialized groups within a layer.
-            assert "." not in layer_name, "layer names must not contain periods"
-            if layer_name in disabled_layers:
-                logger.warning(f"Layer {layer_name} is disabled")
-                continue
-            self.layers[layer_name] = layer_config
-
-        self.tile_store_config = config.tile_store
-        self.storage = (
-            config.storage.instantiate_window_storage_factory().get_storage(
-                self.path
+        if dataset_config is None:
+            # Load dataset configuration from the dataset directory.
+            with (self.path / "config.json").open("r") as f:
+                config_content = f.read()
+            config_content = substitute_env_vars_in_string(config_content)
+            dataset_config = DatasetConfig.model_validate(
+                json.loads(config_content)
             )
+
+        self.layers = {}
+        for layer_name, layer_config in dataset_config.layers.items():
+            if layer_name in disabled_layers:
+                logger.warning(f"Layer {layer_name} is disabled")
+                continue
+            self.layers[layer_name] = layer_config
+
+        self.tile_store_config = dataset_config.tile_store
+        self.storage = (
+            dataset_config.storage.instantiate_window_storage_factory().get_storage(
+                self.path
             )
+        )
 
     def load_windows(
         self,
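
With the new `dataset_config` parameter, a `Dataset` can be built from an in-memory configuration and `config.json` is never read. A rough sketch, with import paths assumed and a hypothetical minimal layer dict (the full set of required `DatasetConfig` fields is not shown in this diff):

```python
from upath import UPath

from rslearn.config.dataset import DatasetConfig  # import path assumed
from rslearn.dataset import Dataset  # import path assumed

# Hypothetical minimal config; real layer definitions likely need more fields.
config = DatasetConfig.model_validate({"layers": {"sentinel2": {"type": "raster"}}})

# config.json is not read in this case; the runtime object is used directly.
dataset = Dataset(UPath("/tmp/my_dataset"), dataset_config=config)
print(list(dataset.layers))  # ["sentinel2"]
```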
rslearn/lightning_cli.py CHANGED
@@ -21,6 +21,7 @@ from rslearn.log_utils import get_logger
 from rslearn.train.data_module import RslearnDataModule
 from rslearn.train.lightning_module import RslearnLightningModule
 from rslearn.utils.fsspec import open_atomic
+from rslearn.utils.jsonargparse import init_jsonargparse
 
 WANDB_ID_FNAME = "wandb_id"
 
@@ -390,8 +391,15 @@ class RslearnLightningCLI(LightningCLI):
 
         Sets the dataset path for any configured RslearnPredictionWriter callbacks.
         """
-        subcommand = self.config.subcommand
-        c = self.config[subcommand]
+        if not hasattr(self.config, "subcommand"):
+            logger.warning(
+                "Config does not have subcommand attribute, assuming we are in run=False mode"
+            )
+            subcommand = None
+            c = self.config
+        else:
+            subcommand = self.config.subcommand
+            c = self.config[subcommand]
 
         # If there is a RslearnPredictionWriter, set its path.
         prediction_writer_callback = None
@@ -415,16 +423,17 @@
         if subcommand == "predict":
             c.return_predictions = False
 
-        # For now we use DDP strategy with find_unused_parameters=True.
+        # Default to DDP with find_unused_parameters. Likely won't get called with unified config
         if subcommand == "fit":
-            c.trainer.strategy = jsonargparse.Namespace(
-                {
-                    "class_path": "lightning.pytorch.strategies.DDPStrategy",
-                    "init_args": jsonargparse.Namespace(
-                        {"find_unused_parameters": True}
-                    ),
-                }
-            )
+            if not c.trainer.strategy:
+                c.trainer.strategy = jsonargparse.Namespace(
+                    {
+                        "class_path": "lightning.pytorch.strategies.DDPStrategy",
+                        "init_args": jsonargparse.Namespace(
+                            {"find_unused_parameters": True}
+                        ),
+                    }
+                )
 
         if c.management_dir:
             self.enable_project_management(c.management_dir)
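
The new guard means a strategy already set in the user's config is no longer overwritten during `fit`; the DDP default is only installed when `trainer.strategy` is empty. A sketch of that logic isolated from the CLI class, reusing the `Namespace` construction pattern shown in the diff:

```python
import jsonargparse


def apply_fit_strategy_default(c: jsonargparse.Namespace) -> None:
    """Only install the DDP default when no strategy was configured."""
    if not c.trainer.strategy:
        c.trainer.strategy = jsonargparse.Namespace(
            {
                "class_path": "lightning.pytorch.strategies.DDPStrategy",
                "init_args": jsonargparse.Namespace({"find_unused_parameters": True}),
            }
        )
```

So a config that explicitly sets `trainer.strategy` (for example to an FSDP strategy) keeps that value instead of being forced back to DDP.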
@@ -432,6 +441,8 @@
 
 def model_handler() -> None:
     """Handler for any rslearn model X commands."""
+    init_jsonargparse()
+
     RslearnLightningCLI(
         model_class=RslearnLightningModule,
         datamodule_class=RslearnDataModule,
rslearn/main.py CHANGED
@@ -380,7 +380,7 @@ def apply_on_windows(
 
 def apply_on_windows_args(f: Callable[..., Any], args: argparse.Namespace) -> None:
     """Call apply_on_windows with arguments passed via command-line interface."""
-    dataset = Dataset(UPath(args.root), args.disabled_layers)
+    dataset = Dataset(UPath(args.root), disabled_layers=args.disabled_layers)
     apply_on_windows(
         f=f,
         dataset=dataset,
@@ -0,0 +1,177 @@
+"""An attention pooling layer."""
+
+import math
+from typing import Any
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from torch import nn
+
+from rslearn.models.component import (
+    FeatureMaps,
+    IntermediateComponent,
+    TokenFeatureMaps,
+)
+from rslearn.train.model_context import ModelContext
+
+
+class SimpleAttentionPool(IntermediateComponent):
+    """Simple Attention Pooling.
+
+    Given a token feature map of shape BCHWN,
+    learn an attention layer which aggregates over
+    the N dimension.
+
+    This is done simply by learning a mapping D->1 which is the weight
+    which should be assigned to each token during averaging:
+
+    output = sum [feat_token * W(feat_token) for feat_token in feat_tokens]
+    """
+
+    def __init__(self, in_dim: int, hidden_linear: bool = False) -> None:
+        """Initialize the simple attention pooling layer.
+
+        Args:
+            in_dim: the encoding dimension D
+            hidden_linear: whether to apply an additional linear transformation D -> D
+                to the feat tokens. If this is True, a ReLU activation is applied
+                after the first linear transformation.
+        """
+        super().__init__()
+        if hidden_linear:
+            self.hidden_linear = nn.Linear(in_features=in_dim, out_features=in_dim)
+        else:
+            self.hidden_linear = None
+        self.linear = nn.Linear(in_features=in_dim, out_features=1)
+
+    def forward_for_map(self, feat_tokens: torch.Tensor) -> torch.Tensor:
+        """Attention pooling for a single feature map (BCHWN tensor)."""
+        B, D, H, W, N = feat_tokens.shape
+        feat_tokens = rearrange(feat_tokens, "b d h w n -> (b h w) n d")
+        if self.hidden_linear is not None:
+            feat_tokens = torch.nn.functional.relu(self.hidden_linear(feat_tokens))
+        attention_scores = torch.nn.functional.softmax(self.linear(feat_tokens), dim=1)
+        feat_tokens = (attention_scores * feat_tokens).sum(dim=1)
+        return rearrange(feat_tokens, "(b h w) d -> b d h w", b=B, h=H, w=W)
+
+    def forward(self, intermediates: Any, context: ModelContext) -> FeatureMaps:
+        """Forward pass for attention pooling linear probe.
+
+        Args:
+            intermediates: the output from the previous component, which must be a TokenFeatureMaps.
+                We pool over the final dimension in the TokenFeatureMaps. If multiple maps
+                are passed, we apply the same linear layers to all of them.
+            context: the model context.
+            feat_tokens (torch.Tensor): Input feature tokens of shape (B, C, H, W, N).
+
+        Returns:
+            torch.Tensor:
+                - output, attentioned pool over the last dimension (B, C, H, W)
+        """
+        if not isinstance(intermediates, TokenFeatureMaps):
+            raise ValueError("input to Attention Pool must be a TokenFeatureMaps")
+
+        features = []
+        for feat in intermediates.feature_maps:
+            features.append(self.forward_for_map(feat))
+        return FeatureMaps(features)
+
+
+class AttentionPool(IntermediateComponent):
+    """Attention Pooling.
+
+    Given a feature map of shape BCHWN,
+    learn an attention layer which aggregates over
+    the N dimension.
+
+    We do this by learning a query token, and applying a standard
+    attention mechanism against this learned query token.
+    """
+
+    def __init__(self, in_dim: int, num_heads: int, linear_on_kv: bool = True) -> None:
+        """Initialize the attention pooling layer.
+
+        Args:
+            in_dim: the encoding dimension D
+            num_heads: the number of heads to use
+            linear_on_kv: Whether to apply a linear layer on the input tokens
+                to create the key and value tokens.
+        """
+        super().__init__()
+        self.query_token: nn.Parameter = nn.Parameter(torch.empty(in_dim))
+        if linear_on_kv:
+            self.k_linear = nn.Linear(in_dim, in_dim)
+            self.v_linear = nn.Linear(in_dim, in_dim)
+        else:
+            self.k_linear = None
+            self.v_linear = None
+        if in_dim % num_heads != 0:
+            raise ValueError(
+                f"in_dim must be divisible by num_heads. Got {in_dim} and {num_heads}."
+            )
+        self.num_heads = num_heads
+        self.init_weights()
+
+    def init_weights(self) -> None:
+        """Initialize weights for the probe."""
+        nn.init.trunc_normal_(self.query_token, std=0.02)
+
+    def forward_for_map(self, feat_tokens: torch.Tensor) -> torch.Tensor:
+        """Attention pooling for a single feature map (BCHWN tensor)."""
+        B, D, H, W, N = feat_tokens.shape
+        feat_tokens = rearrange(feat_tokens, "b d h w n -> (b h w) n d")
+        collapsed_dim = B * H * W
+        q = self.query_token.expand(collapsed_dim, 1, -1)
+        q = q.reshape(
+            collapsed_dim, 1, self.num_heads, D // self.num_heads
+        )  # [B, 1, head, D_head]
+        q = rearrange(q, "b h n d -> b n h d")
+        if self.k_linear is not None:
+            assert self.v_linear is not None
+            k = self.k_linear(feat_tokens).reshape(
+                collapsed_dim, N, self.num_heads, D // self.num_heads
+            )
+            v = self.v_linear(feat_tokens).reshape(
+                collapsed_dim, N, self.num_heads, D // self.num_heads
+            )
+        else:
+            k = feat_tokens.reshape(
+                collapsed_dim, N, self.num_heads, D // self.num_heads
+            )
+            v = feat_tokens.reshape(
+                collapsed_dim, N, self.num_heads, D // self.num_heads
+            )
+        k = rearrange(k, "b n h d -> b h n d")
+        v = rearrange(v, "b n h d -> b h n d")
+
+        # Compute attention scores
+        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(
+            D // self.num_heads
+        )
+        attn_weights = F.softmax(attn_scores, dim=-1)
+        x = torch.matmul(attn_weights, v)  # [B, head, 1, D_head]
+        return x.reshape(B, D, H, W)
+
+    def forward(self, intermediates: Any, context: ModelContext) -> FeatureMaps:
+        """Forward pass for attention pooling linear probe.
+
+        Args:
+            intermediates: the output from the previous component, which must be a TokenFeatureMaps.
+                We pool over the final dimension in the TokenFeatureMaps. If multiple feature
+                maps are passed, we apply the same attention weights (query token and linear k, v layers)
+                to all the maps.
+            context: the model context.
+            feat_tokens (torch.Tensor): Input feature tokens of shape (B, C, H, W, N).
+
+        Returns:
+            torch.Tensor:
+                - output, attentioned pool over the last dimension (B, C, H, W)
+        """
+        if not isinstance(intermediates, TokenFeatureMaps):
+            raise ValueError("input to Attention Pool must be a TokenFeatureMaps")
+
+        features = []
+        for feat in intermediates.feature_maps:
+            features.append(self.forward_for_map(feat))
+        return FeatureMaps(features)
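
Both pooling modules reduce the token dimension N while keeping the spatial grid, so a BCHWN input becomes BCHW. A small shape-check sketch calling `forward_for_map` directly (the module path is assumed, and this bypasses `TokenFeatureMaps`/`ModelContext`):

```python
import torch

from rslearn.models.attention_pool import AttentionPool, SimpleAttentionPool  # module path assumed

feat = torch.randn(2, 64, 8, 8, 12)  # B=2, D=64, H=W=8, N=12 tokens per patch

simple = SimpleAttentionPool(in_dim=64, hidden_linear=True)
print(simple.forward_for_map(feat).shape)  # torch.Size([2, 64, 8, 8])

attn = AttentionPool(in_dim=64, num_heads=8)
print(attn.forward_for_map(feat).shape)  # torch.Size([2, 64, 8, 8])
```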
@@ -91,6 +91,18 @@ class FeatureMaps:
     feature_maps: list[torch.Tensor]
 
 
+@dataclass
+class TokenFeatureMaps:
+    """An intermediate output type for multi-resolution BCHWN feature maps with a token dimension.
+
+    Unlike `FeatureMaps`, these include an additional dimension for unpooled tokens.
+    """
+
+    # List of BxCxHxWxN feature maps at different scales, ordered from highest resolution
+    # (most fine-grained) to lowest resolution (coarsest).
+    feature_maps: list[torch.Tensor]
+
+
 @dataclass
 class FeatureVector:
     """An intermediate output type for a flat feature vector."""
@@ -19,7 +19,7 @@ from olmoearth_pretrain.train.masking import MaskedOlmoEarthSample, MaskValue
 from upath import UPath
 
 from rslearn.log_utils import get_logger
-from rslearn.models.component import FeatureExtractor, FeatureMaps
+from rslearn.models.component import FeatureExtractor, FeatureMaps, TokenFeatureMaps
 from rslearn.train.model_context import ModelContext
 
 logger = get_logger(__name__)
@@ -60,6 +60,7 @@ class OlmoEarth(FeatureExtractor):
         random_initialization: bool = False,
         embedding_size: int | None = None,
         autocast_dtype: str | None = "bfloat16",
+        token_pooling: bool = True,
     ):
         """Create a new OlmoEarth model.
 
@@ -83,6 +84,9 @@
             embedding_size: optional embedding size to report via
                 get_backbone_channels (if model_id is not set).
             autocast_dtype: which dtype to use for autocasting, or set None to disable.
+            token_pooling: whether or not to pool the tokens. If True, the output will be BxCxHxW. If False,
+                there will be an extra dimension, N, (BxCxHxWxN) representing the temporal and channel
+                dimensions.
         """
         if (
             sum(
@@ -133,6 +137,7 @@
         else:
             model = model[part]
         self.model = model
+        self.token_pooling = token_pooling
 
     def _load_model_from_checkpoint(
         self, checkpoint_upath: UPath, random_initialization: bool
@@ -160,47 +165,87 @@
 
         return model
 
-    def forward(self, context: ModelContext) -> FeatureMaps:
-        """Compute feature maps from the OlmoEarth backbone.
+    def _prepare_modality_inputs(
+        self, context: ModelContext
+    ) -> tuple[MaskedOlmoEarthSample, list[str], torch.device]:
+        """Prepare modality tensors and masks for the OlmoEarth model.
+
+        Uses a two-pass approach to ensure all modalities have consistent timestep
+        dimensions for position encoding.
 
         Args:
-            context: the model context. Input dicts should include keys corresponding
-                to the modalities that should be passed to the OlmoEarth model.
+            context: the model context with input tensors.
 
         Returns:
-            a FeatureMaps consisting of one feature map, at 1/patch_size of the input
-                resolution. Embeddings will be pooled across modalities and timesteps.
+            tuple of (sample, present_modalities, device)
         """
         kwargs = {}
         present_modalities = []
         device = None
-        # Handle the case where some modalities are multitemporal and some are not.
-        # We assume all multitemporal modalities have the same number of timesteps.
+
+        # First pass: find global max_timesteps across all modalities and samples
+        # TODO: currently we assume all modalities have the same number of timesteps,
+        # which is not true for all cases, and time series time steps are assumed to
+        # be 1-month apart. It also assumes continuity between available timesteps.
+        # We'll have to fix all that.
         max_timesteps = 1
+        modality_data = {}
         for modality in MODALITY_NAMES:
             if modality not in context.inputs[0]:
                 continue
             present_modalities.append(modality)
-            cur = torch.stack([inp[modality] for inp in context.inputs], dim=0)
-            device = cur.device
-            # Check if it's single or multitemporal, and reshape accordingly
+            tensors = [inp[modality] for inp in context.inputs]
+            device = tensors[0].device
             num_bands = Modality.get(modality).num_bands
-            num_timesteps = cur.shape[1] // num_bands
-            max_timesteps = max(max_timesteps, num_timesteps)
-            cur = rearrange(cur, "b (t c) h w -> b h w t c", t=num_timesteps)
+            max_t = max(t.shape[0] for t in tensors) // num_bands
+            max_timesteps = max(max_timesteps, max_t)
+            modality_data[modality] = (
+                tensors,
+                num_bands,
+                len(Modality.get(modality).band_sets),
+            )
+
+        # Second pass: pad and process each modality with global max_timesteps
+        for modality in present_modalities:
+            tensors, num_bands, num_band_sets = modality_data[modality]
+            target_ch = max_timesteps * num_bands
+
+            # Pad tensors to target_ch and track original timesteps for masking
+            padded = []
+            original_timesteps = []
+            for t in tensors:
+                orig_t = t.shape[0] // num_bands
+                original_timesteps.append(orig_t)
+                if t.shape[0] < target_ch:
+                    pad = torch.zeros(
+                        (target_ch - t.shape[0],) + t.shape[1:],
+                        dtype=t.dtype,
+                        device=device,
+                    )
+                    t = torch.cat([t, pad], dim=0)
+                padded.append(t)
+
+            cur = torch.stack(padded, dim=0)
+            cur = rearrange(cur, "b (t c) h w -> b h w t c", t=max_timesteps)
             kwargs[modality] = cur
-            # Create mask array which is BHWTS (without channels but with band sets).
-            num_band_sets = len(Modality.get(modality).band_sets)
-            mask_shape = cur.shape[0:4] + (num_band_sets,)
-            mask = (
-                torch.ones(mask_shape, dtype=torch.int32, device=device)
-                * MaskValue.ONLINE_ENCODER.value
+
+            # Create mask: ONLINE_ENCODER for valid, MISSING for padded timesteps
+            b, h, w = cur.shape[0], cur.shape[1], cur.shape[2]
+            mask = torch.full(
+                (b, h, w, max_timesteps, num_band_sets),
+                fill_value=MaskValue.ONLINE_ENCODER.value,
+                dtype=torch.int32,
+                device=device,
             )
+            for sample_idx, orig_t in enumerate(original_timesteps):
+                if orig_t < max_timesteps:
+                    mask[sample_idx, :, :, orig_t:, :] = MaskValue.MISSING.value
             kwargs[f"{modality}_mask"] = mask
 
         # Timestamps is required.
         # Note that only months (0 to 11) are used in OlmoEarth position encoding.
-        # For now, we assign same timestamps to all inputs, but later we should handle varying timestamps per input.
+        # For now, we assign same timestamps to all inputs, but later we should
+        # handle varying timestamps per input.
         timestamps = torch.zeros(
             (len(context.inputs), max_timesteps, 3), dtype=torch.int32, device=device
         )
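
The second pass pads every sample up to the global `max_timesteps` so the per-modality batch can be stacked into one rectangular tensor, and remembers each sample's original length so the padded trailing timesteps can later be marked MISSING. The same idea on toy tensors, independent of the OlmoEarth classes:

```python
import torch

num_bands = 3
tensors = [torch.randn(2 * num_bands, 4, 4), torch.randn(1 * num_bands, 4, 4)]  # 2 vs 1 timesteps
max_timesteps = max(t.shape[0] for t in tensors) // num_bands
target_ch = max_timesteps * num_bands

padded, original_timesteps = [], []
for t in tensors:
    original_timesteps.append(t.shape[0] // num_bands)
    if t.shape[0] < target_ch:
        pad = torch.zeros((target_ch - t.shape[0],) + t.shape[1:], dtype=t.dtype)
        t = torch.cat([t, pad], dim=0)
    padded.append(t)

cur = torch.stack(padded, dim=0)  # (B, T*C, H, W), rectangular
valid = torch.ones(len(tensors), max_timesteps, dtype=torch.bool)
for i, orig_t in enumerate(original_timesteps):
    valid[i, orig_t:] = False  # trailing padded timesteps are "missing"
print(cur.shape, valid)
```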
@@ -211,7 +256,20 @@
         timestamps[:, :, 2] = 2024  # year
         kwargs["timestamps"] = timestamps
 
-        sample = MaskedOlmoEarthSample(**kwargs)
+        return MaskedOlmoEarthSample(**kwargs), present_modalities, device
+
+    def forward(self, context: ModelContext) -> FeatureMaps | TokenFeatureMaps:
+        """Compute feature maps from the OlmoEarth backbone.
+
+        Args:
+            context: the model context. Input dicts should include keys corresponding
+                to the modalities that should be passed to the OlmoEarth model.
+
+        Returns:
+            a FeatureMaps consisting of one feature map, at 1/patch_size of the input
+                resolution. Embeddings will be pooled across modalities and timesteps.
+        """
+        sample, present_modalities, device = self._prepare_modality_inputs(context)
 
         # Decide context based on self.autocast_dtype.
         if self.autocast_dtype is None:
@@ -222,6 +280,14 @@
                 device_type=device.type, dtype=self.autocast_dtype
             )
 
+        # Check if we can bypass masks (fast_pass=True)
+        missing_tokens = False
+        for modality in present_modalities:
+            modality_mask = getattr(sample, f"{modality}_mask")
+            if torch.any(modality_mask == MaskValue.MISSING.value):
+                missing_tokens = True
+                break
+
         with torch_context:
             # Currently we assume the provided model always returns a TokensAndMasks object.
             tokens_and_masks: TokensAndMasks
@@ -229,7 +295,7 @@
             # Encoder has a fast_pass argument to indicate mask is not needed.
             tokens_and_masks = self.model(
                 sample,
-                fast_pass=True,
+                fast_pass=not missing_tokens,
                 patch_size=self.patch_size,
                 **self.forward_kwargs,
             )["tokens_and_masks"]
@@ -241,16 +307,41 @@
 
         # Apply temporal/modality pooling so we just have one feature per patch.
         features = []
-        for modality in present_modalities:
-            modality_features = getattr(tokens_and_masks, modality)
-            # Pool over band sets and timesteps (BHWTSC -> BHWC).
-            pooled = modality_features.mean(dim=[3, 4])
-            # We want BHWC -> BCHW.
-            pooled = rearrange(pooled, "b h w c -> b c h w")
-            features.append(pooled)
-        # Pool over the modalities, so we get one BCHW feature map.
-        pooled = torch.stack(features, dim=0).mean(dim=0)
-        return FeatureMaps([pooled])
+        if self.token_pooling:
+            for modality in present_modalities:
+                modality_features = getattr(tokens_and_masks, modality)  # BHWTSC
+                # If fast_pass is False, we need to mask the missing tokens before pooling.
+                if missing_tokens:
+                    modality_masks = getattr(
+                        tokens_and_masks, f"{modality}_mask"
+                    )  # BHWTS
+                    modality_masks_bool = (
+                        modality_masks != MaskValue.MISSING.value
+                    ).unsqueeze(-1)
+                    count = modality_masks_bool.sum(dim=[3, 4])
+                    # Masked average over band sets and timesteps (BHWTSC -> BHWC).
+                    pooled = (modality_features * modality_masks_bool).sum(
+                        dim=[3, 4]
+                    ) / count.clamp(min=1)
+                else:
+                    # Pool over band sets and timesteps (BHWTSC -> BHWC).
+                    pooled = modality_features.mean(dim=[3, 4])
+                # We want BHWC -> BCHW.
+                pooled = rearrange(pooled, "b h w c -> b c h w")
+                features.append(pooled)
+            # Pool over the modalities, so we get one BCHW feature map.
+            pooled = torch.stack(features, dim=0).mean(dim=0)
+            return FeatureMaps([pooled])
+        else:
+            for modality in present_modalities:
+                modality_features = getattr(tokens_and_masks, modality)
+                # Combine band sets and timesteps into last dim (BHWTSC -> BHWCN).
+                modality_features = rearrange(
+                    modality_features, "b h w t s c -> b c h w (t s)"
+                )
+                features.append(modality_features)
+            pooled = torch.cat(features, dim=-1)
+            return TokenFeatureMaps([pooled])
 
     def get_backbone_channels(self) -> list:
         """Returns the output channels of this model when used as a backbone.