rslearn 0.0.22__tar.gz → 0.0.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. {rslearn-0.0.22/rslearn.egg-info → rslearn-0.0.24}/PKG-INFO +1 -1
  2. {rslearn-0.0.22 → rslearn-0.0.24}/pyproject.toml +1 -1
  3. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/planetary_computer.py +149 -1
  4. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/stac.py +24 -3
  5. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/main.py +4 -1
  6. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/simple_time_series.py +1 -1
  7. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/lightning_module.py +21 -8
  8. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/tasks/multi_task.py +8 -5
  9. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/tasks/per_pixel_regression.py +1 -1
  10. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/tasks/segmentation.py +163 -22
  11. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/raster_format.py +17 -0
  12. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/stac.py +4 -0
  13. {rslearn-0.0.22 → rslearn-0.0.24/rslearn.egg-info}/PKG-INFO +1 -1
  14. {rslearn-0.0.22 → rslearn-0.0.24}/LICENSE +0 -0
  15. {rslearn-0.0.22 → rslearn-0.0.24}/NOTICE +0 -0
  16. {rslearn-0.0.22 → rslearn-0.0.24}/README.md +0 -0
  17. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/__init__.py +0 -0
  18. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/arg_parser.py +0 -0
  19. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/config/__init__.py +0 -0
  20. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/config/dataset.py +0 -0
  21. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/const.py +0 -0
  22. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/__init__.py +0 -0
  23. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/aws_landsat.py +0 -0
  24. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/aws_open_data.py +0 -0
  25. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/aws_sentinel1.py +0 -0
  26. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/aws_sentinel2_element84.py +0 -0
  27. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/climate_data_store.py +0 -0
  28. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/copernicus.py +0 -0
  29. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/data_source.py +0 -0
  30. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/earthdaily.py +0 -0
  31. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/earthdata_srtm.py +0 -0
  32. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/eurocrops.py +0 -0
  33. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/gcp_public_data.py +0 -0
  34. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/google_earth_engine.py +0 -0
  35. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/local_files.py +0 -0
  36. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/openstreetmap.py +0 -0
  37. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/planet.py +0 -0
  38. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/planet_basemap.py +0 -0
  39. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/soilgrids.py +0 -0
  40. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/usda_cdl.py +0 -0
  41. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/usgs_landsat.py +0 -0
  42. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/utils.py +0 -0
  43. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/vector_source.py +0 -0
  44. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/worldcereal.py +0 -0
  45. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/worldcover.py +0 -0
  46. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/worldpop.py +0 -0
  47. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/data_sources/xyz_tiles.py +0 -0
  48. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/dataset/__init__.py +0 -0
  49. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/dataset/add_windows.py +0 -0
  50. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/dataset/dataset.py +0 -0
  51. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/dataset/handler_summaries.py +0 -0
  52. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/dataset/manage.py +0 -0
  53. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/dataset/materialize.py +0 -0
  54. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/dataset/remap.py +0 -0
  55. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/dataset/storage/__init__.py +0 -0
  56. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/dataset/storage/file.py +0 -0
  57. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/dataset/storage/storage.py +0 -0
  58. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/dataset/window.py +0 -0
  59. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/lightning_cli.py +0 -0
  60. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/log_utils.py +0 -0
  61. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/__init__.py +0 -0
  62. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/anysat.py +0 -0
  63. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/attention_pooling.py +0 -0
  64. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/clay/clay.py +0 -0
  65. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/clay/configs/metadata.yaml +0 -0
  66. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/clip.py +0 -0
  67. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/component.py +0 -0
  68. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/concatenate_features.py +0 -0
  69. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/conv.py +0 -0
  70. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/croma.py +0 -0
  71. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/detr/__init__.py +0 -0
  72. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/detr/box_ops.py +0 -0
  73. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/detr/detr.py +0 -0
  74. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/detr/matcher.py +0 -0
  75. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/detr/position_encoding.py +0 -0
  76. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/detr/transformer.py +0 -0
  77. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/detr/util.py +0 -0
  78. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/dinov3.py +0 -0
  79. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/faster_rcnn.py +0 -0
  80. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/feature_center_crop.py +0 -0
  81. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/fpn.py +0 -0
  82. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/galileo/__init__.py +0 -0
  83. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/galileo/galileo.py +0 -0
  84. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/galileo/single_file_galileo.py +0 -0
  85. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/module_wrapper.py +0 -0
  86. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/molmo.py +0 -0
  87. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/multitask.py +0 -0
  88. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/olmoearth_pretrain/__init__.py +0 -0
  89. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/olmoearth_pretrain/model.py +0 -0
  90. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/olmoearth_pretrain/norm.py +0 -0
  91. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/panopticon.py +0 -0
  92. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/panopticon_data/sensors/drone.yaml +0 -0
  93. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/panopticon_data/sensors/enmap.yaml +0 -0
  94. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/panopticon_data/sensors/goes.yaml +0 -0
  95. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/panopticon_data/sensors/himawari.yaml +0 -0
  96. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/panopticon_data/sensors/intuition.yaml +0 -0
  97. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/panopticon_data/sensors/landsat8.yaml +0 -0
  98. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/panopticon_data/sensors/modis_terra.yaml +0 -0
  99. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/panopticon_data/sensors/qb2_ge1.yaml +0 -0
  100. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/panopticon_data/sensors/sentinel1.yaml +0 -0
  101. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/panopticon_data/sensors/sentinel2.yaml +0 -0
  102. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/panopticon_data/sensors/superdove.yaml +0 -0
  103. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/panopticon_data/sensors/wv23.yaml +0 -0
  104. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/pick_features.py +0 -0
  105. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/pooling_decoder.py +0 -0
  106. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/presto/__init__.py +0 -0
  107. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/presto/presto.py +0 -0
  108. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/presto/single_file_presto.py +0 -0
  109. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/prithvi.py +0 -0
  110. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/resize_features.py +0 -0
  111. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/sam2_enc.py +0 -0
  112. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/satlaspretrain.py +0 -0
  113. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/singletask.py +0 -0
  114. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/ssl4eo_s12.py +0 -0
  115. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/swin.py +0 -0
  116. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/task_embedding.py +0 -0
  117. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/terramind.py +0 -0
  118. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/trunk.py +0 -0
  119. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/unet.py +0 -0
  120. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/upsample.py +0 -0
  121. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/models/use_croma.py +0 -0
  122. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/py.typed +0 -0
  123. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/template_params.py +0 -0
  124. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/tile_stores/__init__.py +0 -0
  125. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/tile_stores/default.py +0 -0
  126. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/tile_stores/tile_store.py +0 -0
  127. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/__init__.py +0 -0
  128. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/all_patches_dataset.py +0 -0
  129. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/callbacks/__init__.py +0 -0
  130. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/callbacks/adapters.py +0 -0
  131. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/callbacks/freeze_unfreeze.py +0 -0
  132. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/callbacks/gradients.py +0 -0
  133. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/callbacks/peft.py +0 -0
  134. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/data_module.py +0 -0
  135. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/dataset.py +0 -0
  136. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/model_context.py +0 -0
  137. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/optimizer.py +0 -0
  138. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/prediction_writer.py +0 -0
  139. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/scheduler.py +0 -0
  140. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/tasks/__init__.py +0 -0
  141. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/tasks/classification.py +0 -0
  142. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/tasks/detection.py +0 -0
  143. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/tasks/embedding.py +0 -0
  144. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/tasks/regression.py +0 -0
  145. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/tasks/task.py +0 -0
  146. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/transforms/__init__.py +0 -0
  147. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/transforms/concatenate.py +0 -0
  148. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/transforms/crop.py +0 -0
  149. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/transforms/flip.py +0 -0
  150. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/transforms/mask.py +0 -0
  151. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/transforms/normalize.py +0 -0
  152. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/transforms/pad.py +0 -0
  153. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/transforms/resize.py +0 -0
  154. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/transforms/select_bands.py +0 -0
  155. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/transforms/sentinel1.py +0 -0
  156. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/train/transforms/transform.py +0 -0
  157. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/__init__.py +0 -0
  158. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/array.py +0 -0
  159. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/feature.py +0 -0
  160. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/fsspec.py +0 -0
  161. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/geometry.py +0 -0
  162. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/get_utm_ups_crs.py +0 -0
  163. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/grid_index.py +0 -0
  164. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/jsonargparse.py +0 -0
  165. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/mp.py +0 -0
  166. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/rtree_index.py +0 -0
  167. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/spatial_index.py +0 -0
  168. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/sqlite_index.py +0 -0
  169. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/time.py +0 -0
  170. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn/utils/vector_format.py +0 -0
  171. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn.egg-info/SOURCES.txt +0 -0
  172. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn.egg-info/dependency_links.txt +0 -0
  173. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn.egg-info/entry_points.txt +0 -0
  174. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn.egg-info/requires.txt +0 -0
  175. {rslearn-0.0.22 → rslearn-0.0.24}/rslearn.egg-info/top_level.txt +0 -0
  176. {rslearn-0.0.22 → rslearn-0.0.24}/setup.cfg +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rslearn
-Version: 0.0.22
+Version: 0.0.24
 Summary: A library for developing remote sensing datasets and models
 Author: OlmoEarth Team
 License: Apache License

pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "rslearn"
-version = "0.0.22"
+version = "0.0.24"
 description = "A library for developing remote sensing datasets and models"
 authors = [
     { name = "OlmoEarth Team" },

rslearn/data_sources/planetary_computer.py
@@ -3,7 +3,7 @@
 import os
 import tempfile
 import xml.etree.ElementTree as ET
-from datetime import timedelta
+from datetime import datetime, timedelta
 from typing import Any

 import affine
@@ -12,6 +12,7 @@ import planetary_computer
 import rasterio
 import requests
 from rasterio.enums import Resampling
+from typing_extensions import override
 from upath import UPath

 from rslearn.config import LayerConfig
@@ -24,11 +25,104 @@ from rslearn.tile_stores import TileStore, TileStoreWithLayer
 from rslearn.utils.fsspec import join_upath
 from rslearn.utils.geometry import PixelBounds, Projection, STGeometry
 from rslearn.utils.raster_format import get_raster_projection_and_bounds
+from rslearn.utils.stac import StacClient, StacItem

 from .copernicus import get_harmonize_callback

 logger = get_logger(__name__)

+# Max limit accepted by Planetary Computer API.
+PLANETARY_COMPUTER_LIMIT = 1000
+
+
+class PlanetaryComputerStacClient(StacClient):
+    """A StacClient subclass that handles Planetary Computer's pagination limits.
+
+    The Planetary Computer STAC API does not support standard pagination and has a max
+    limit of 1000. If the initial query returns 1000 items, this client paginates
+    by sorting by ID and using gt (greater than) queries to fetch subsequent pages.
+    """
+
+    @override
+    def search(
+        self,
+        collections: list[str] | None = None,
+        bbox: tuple[float, float, float, float] | None = None,
+        intersects: dict[str, Any] | None = None,
+        date_time: datetime | tuple[datetime, datetime] | None = None,
+        ids: list[str] | None = None,
+        limit: int | None = None,
+        query: dict[str, Any] | None = None,
+        sortby: list[dict[str, str]] | None = None,
+    ) -> list[StacItem]:
+        # We will use sortby for pagination, so the caller must not set it.
+        if sortby is not None:
+            raise ValueError("sortby must not be set for PlanetaryComputerStacClient")
+
+        # First, try a simple query with the PC limit to detect if pagination is needed.
+        # We always use PLANETARY_COMPUTER_LIMIT for the request because PC doesn't
+        # support standard pagination, and we need to detect when we hit the limit
+        # to switch to ID-based pagination.
+        # We could just start sorting by ID here and do pagination, but we treat it as
+        # a special case to avoid sorting since that seems to speed up the query.
+        stac_items = super().search(
+            collections=collections,
+            bbox=bbox,
+            intersects=intersects,
+            date_time=date_time,
+            ids=ids,
+            limit=PLANETARY_COMPUTER_LIMIT,
+            query=query,
+        )
+
+        # If we got fewer than the PC limit, we have all the results.
+        if len(stac_items) < PLANETARY_COMPUTER_LIMIT:
+            return stac_items
+
+        # We hit the limit, so we need to paginate by ID.
+        # Re-fetch with sorting by ID to ensure consistent ordering for pagination.
+        logger.debug(
+            "Initial request returned %d items (at limit), switching to ID pagination",
+            len(stac_items),
+        )
+
+        all_items: list[StacItem] = []
+        last_id: str | None = None
+
+        while True:
+            # Build query with id > last_id if we're paginating.
+            combined_query: dict[str, Any] = dict(query) if query else {}
+            if last_id is not None:
+                combined_query["id"] = {"gt": last_id}
+
+            stac_items = super().search(
+                collections=collections,
+                bbox=bbox,
+                intersects=intersects,
+                date_time=date_time,
+                ids=ids,
+                limit=PLANETARY_COMPUTER_LIMIT,
+                query=combined_query if combined_query else None,
+                sortby=[{"field": "id", "direction": "asc"}],
+            )
+
+            all_items.extend(stac_items)
+
+            # If we got fewer than the limit, we've fetched everything.
+            if len(stac_items) < PLANETARY_COMPUTER_LIMIT:
+                break
+
+            # Otherwise, paginate using the last item's ID.
+            last_id = stac_items[-1].id
+            logger.debug(
+                "Got %d items, paginating with id > %s",
+                len(stac_items),
+                last_id,
+            )
+
+        logger.debug("Total items fetched: %d", len(all_items))
+        return all_items
+

 class PlanetaryComputer(StacDataSource, TileStore):
     """Modality-agnostic data source for data on Microsoft Planetary Computer.
@@ -100,6 +194,10 @@ class PlanetaryComputer(StacDataSource, TileStore):
             required_assets=required_assets,
             cache_dir=cache_upath,
         )
+
+        # Replace the client with PlanetaryComputerStacClient to handle PC's pagination limits.
+        self.client = PlanetaryComputerStacClient(self.STAC_ENDPOINT)
+
         self.asset_bands = asset_bands
         self.timeout = timeout
         self.skip_items_missing_assets = skip_items_missing_assets
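
Note: an illustrative sketch (not part of the diff) of the search parameters the new client builds once it falls back to ID-based pagination; the collection name and item ID below are hypothetical, while the query/sortby shapes mirror the code above.

second_page_kwargs = {
    "collections": ["sentinel-2-l2a"],  # hypothetical collection
    "limit": 1000,  # PLANETARY_COMPUTER_LIMIT
    # Only items whose ID sorts after the last item of the previous page.
    "query": {"id": {"gt": "LAST_ITEM_ID_FROM_PREVIOUS_PAGE"}},
    "sortby": [{"field": "id", "direction": "asc"}],
}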
@@ -567,3 +665,53 @@ class Naip(PlanetaryComputer):
             context=context,
             **kwargs,
         )
+
+
+class CopDemGlo30(PlanetaryComputer):
+    """A data source for Copernicus DEM GLO-30 (30m) on Microsoft Planetary Computer.
+
+    See https://planetarycomputer.microsoft.com/dataset/cop-dem-glo-30.
+    """
+
+    COLLECTION_NAME = "cop-dem-glo-30"
+    DATA_ASSET = "data"
+
+    def __init__(
+        self,
+        band_name: str = "DEM",
+        context: DataSourceContext = DataSourceContext(),
+        **kwargs: Any,
+    ):
+        """Initialize a new CopDemGlo30 instance.
+
+        Args:
+            band_name: band name to use if the layer config is missing from the
+                context.
+            context: the data source context.
+            kwargs: additional arguments to pass to PlanetaryComputer.
+        """
+        if context.layer_config is not None:
+            if len(context.layer_config.band_sets) != 1:
+                raise ValueError("expected a single band set")
+            if len(context.layer_config.band_sets[0].bands) != 1:
+                raise ValueError("expected band set to have a single band")
+            band_name = context.layer_config.band_sets[0].bands[0]
+
+        super().__init__(
+            collection_name=self.COLLECTION_NAME,
+            asset_bands={self.DATA_ASSET: [band_name]},
+            # Skip since all items should have the same asset(s).
+            skip_items_missing_assets=True,
+            context=context,
+            **kwargs,
+        )
+
+    def _stac_item_to_item(self, stac_item: Any) -> SourceItem:
+        # Copernicus DEM is static; ignore item timestamps so it matches any window.
+        item = super()._stac_item_to_item(stac_item)
+        item.geometry = STGeometry(item.geometry.projection, item.geometry.shp, None)
+        return item
+
+    def _get_search_time_range(self, geometry: STGeometry) -> None:
+        # Copernicus DEM is static; do not filter STAC searches by time.
+        return None
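
Note: a minimal usage sketch (not part of the diff), assuming the PlanetaryComputer defaults are sufficient; with no layer config in the context, the band name falls back to "DEM", and the two overrides above let the static DEM items match windows with any time range.

from rslearn.data_sources.planetary_computer import CopDemGlo30

# Hypothetical instantiation relying on defaults; extra kwargs are forwarded to PlanetaryComputer.
dem_source = CopDemGlo30(band_name="DEM")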

rslearn/data_sources/stac.py
@@ -1,6 +1,7 @@
 """A partial data source implementation providing get_items using a STAC API."""

 import json
+from datetime import datetime
 from typing import Any

 import shapely
@@ -11,6 +12,7 @@ from rslearn.const import WGS84_PROJECTION
 from rslearn.data_sources.data_source import Item, ItemLookupDataSource
 from rslearn.data_sources.utils import match_candidate_items_to_window
 from rslearn.log_utils import get_logger
+from rslearn.utils.fsspec import open_atomic
 from rslearn.utils.geometry import STGeometry
 from rslearn.utils.stac import StacClient, StacItem

@@ -132,6 +134,24 @@ class StacDataSource(ItemLookupDataSource[SourceItem]):

         return SourceItem(stac_item.id, geom, asset_urls, properties)

+    def _get_search_time_range(
+        self, geometry: STGeometry
+    ) -> datetime | tuple[datetime, datetime] | None:
+        """Get time range to include in STAC API search.
+
+        By default, we filter STAC searches to the window's time range. Subclasses can
+        override this to disable time filtering for "static" datasets.
+
+        Args:
+            geometry: the geometry we are searching for.
+
+        Returns:
+            the time range (or timestamp) to pass to the STAC search, or None to avoid
+            temporal filtering in the search request.
+        """
+        # Note: StacClient.search accepts either a datetime or a (start, end) tuple.
+        return geometry.time_range
+
     def get_item_by_name(self, name: str) -> SourceItem:
         """Gets an item by name.

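Note: a hypothetical subclass sketch (not part of the diff) showing the new hook overridden with a value other than None, e.g. a source whose items all come from a single fixed campaign year; the class name and dates are made up.

from datetime import datetime, timezone

class FixedCampaignSource(StacDataSource):
    # Hypothetical: always search a fixed year, regardless of the window's time range.
    def _get_search_time_range(
        self, geometry: STGeometry
    ) -> tuple[datetime, datetime]:
        return (
            datetime(2020, 1, 1, tzinfo=timezone.utc),
            datetime(2021, 1, 1, tzinfo=timezone.utc),
        )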
@@ -168,7 +188,7 @@ class StacDataSource(ItemLookupDataSource[SourceItem]):

         # Finally we cache it if cache_dir is set.
         if cache_fname is not None:
-            with cache_fname.open("w") as f:
+            with open_atomic(cache_fname, "w") as f:
                 json.dump(item.serialize(), f)

         return item
@@ -191,10 +211,11 @@ class StacDataSource(ItemLookupDataSource[SourceItem]):
            # for each requested geometry.
            wgs84_geometry = geometry.to_projection(WGS84_PROJECTION)
            logger.debug("performing STAC search for geometry %s", wgs84_geometry)
+           search_time_range = self._get_search_time_range(wgs84_geometry)
            stac_items = self.client.search(
                collections=[self.collection_name],
                intersects=json.loads(shapely.to_geojson(wgs84_geometry.shp)),
-               date_time=wgs84_geometry.time_range,
+               date_time=search_time_range,
                query=self.query,
                limit=self.limit,
            )
@@ -239,7 +260,7 @@ class StacDataSource(ItemLookupDataSource[SourceItem]):
            cache_fname = self.cache_dir / f"{item.name}.json"
            if cache_fname.exists():
                continue
-           with cache_fname.open("w") as f:
+           with open_atomic(cache_fname, "w") as f:
                json.dump(item.serialize(), f)

        cur_groups = match_candidate_items_to_window(

rslearn/main.py
@@ -2,6 +2,7 @@

 import argparse
 import multiprocessing
+import os
 import random
 import sys
 import time
@@ -45,6 +46,7 @@ handler_registry = {}
 ItemType = TypeVar("ItemType", bound="Item")

 MULTIPROCESSING_CONTEXT = "forkserver"
+MP_CONTEXT_ENV_VAR = "RSLEARN_MULTIPROCESSING_CONTEXT"


 def register_handler(category: Any, command: str) -> Callable:
@@ -837,7 +839,8 @@ def model_predict() -> None:
 def main() -> None:
     """CLI entrypoint."""
     try:
-        multiprocessing.set_start_method(MULTIPROCESSING_CONTEXT)
+        mp_context = os.environ.get(MP_CONTEXT_ENV_VAR, MULTIPROCESSING_CONTEXT)
+        multiprocessing.set_start_method(mp_context)
     except RuntimeError as e:
         logger.error(
             f"Multiprocessing context already set to {multiprocessing.get_context()}: "
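
Note: an illustrative sketch (not part of the diff) of overriding the start method through the new environment variable; the same effect can be had by exporting RSLEARN_MULTIPROCESSING_CONTEXT=spawn in the shell before invoking the CLI.

import os

# Hypothetical: request "spawn" instead of the default "forkserver"; main() reads this
# variable before calling multiprocessing.set_start_method().
os.environ["RSLEARN_MULTIPROCESSING_CONTEXT"] = "spawn"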

rslearn/models/simple_time_series.py
@@ -180,7 +180,7 @@ class SimpleTimeSeries(FeatureExtractor):
        # want to pass 2 timesteps to the model.
        # TODO is probably to make this behaviour clearer but lets leave it like
        # this for now to not break things.
-       num_timesteps = images.shape[1] // image_channels
+       num_timesteps = image_channels // images.shape[1]
        batched_timesteps = images.shape[2] // num_timesteps
        images = rearrange(
            images,
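
Note: a worked example (not part of the diff), under one plausible reading of the variables: images.shape[1] as the channel count per timestep of the stacked input, and image_channels as the channel count the wrapped model expects per forward pass.

images_shape_1 = 3   # stand-in for images.shape[1], e.g. 3-band imagery
image_channels = 6   # hypothetical: the wrapped model consumes 6 channels at a time
num_timesteps = image_channels // images_shape_1  # new expression gives 2 timesteps per pass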

rslearn/train/lightning_module.py
@@ -210,11 +210,30 @@ class RslearnLightningModule(L.LightningModule):
             # Fail silently for single-dataset case, which is okay
             pass

+    def on_validation_epoch_end(self) -> None:
+        """Compute and log validation metrics at epoch end.
+
+        We manually compute and log metrics here (instead of passing the MetricCollection
+        to log_dict) because MetricCollection.compute() properly flattens dict-returning
+        metrics, while log_dict expects each metric to return a scalar tensor.
+        """
+        metrics = self.val_metrics.compute()
+        self.log_dict(metrics)
+        self.val_metrics.reset()
+
     def on_test_epoch_end(self) -> None:
-        """Optionally save the test metrics to a file."""
+        """Compute and log test metrics at epoch end, optionally save to file.
+
+        We manually compute and log metrics here (instead of passing the MetricCollection
+        to log_dict) because MetricCollection.compute() properly flattens dict-returning
+        metrics, while log_dict expects each metric to return a scalar tensor.
+        """
+        metrics = self.test_metrics.compute()
+        self.log_dict(metrics)
+        self.test_metrics.reset()
+
         if self.metrics_file:
             with open(self.metrics_file, "w") as f:
-                metrics = self.test_metrics.compute()
                 metrics_dict = {k: v.item() for k, v in metrics.items()}
                 json.dump(metrics_dict, f, indent=4)
             logger.info(f"Saved metrics to {self.metrics_file}")
@@ -300,9 +319,6 @@ class RslearnLightningModule(L.LightningModule):
             sync_dist=True,
         )
         self.val_metrics.update(outputs, targets)
-        self.log_dict(
-            self.val_metrics, batch_size=batch_size, on_epoch=True, sync_dist=True
-        )

    def test_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> None:
        """Compute the test loss and additional metrics.
@@ -340,9 +356,6 @@ class RslearnLightningModule(L.LightningModule):
             sync_dist=True,
         )
         self.test_metrics.update(outputs, targets)
-        self.log_dict(
-            self.test_metrics, batch_size=batch_size, on_epoch=True, sync_dist=True
-        )

        if self.visualize_dir:
            for inp, target, output, metadata in zip(
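
Note: a toy sketch (not part of the diff) of why the epoch-end hooks call compute() themselves: a metric whose compute() returns a dict (like the wrapped per-task metrics produced by MultiTask below) cannot be logged by handing the MetricCollection to log_dict, while the dict returned by MetricCollection.compute() can; the metric here is made up.

import torch
from torchmetrics import Metric, MetricCollection

class DictMetric(Metric):
    # Hypothetical metric whose compute() returns a dict rather than a scalar tensor.
    def __init__(self) -> None:
        super().__init__()
        self.add_state("total", default=torch.tensor(0.0), dist_reduce_fx="sum")

    def update(self, value: torch.Tensor) -> None:
        self.total += value.sum()

    def compute(self) -> dict[str, torch.Tensor]:
        return {"sum": self.total, "double": 2 * self.total}

collection = MetricCollection({"toy": DictMetric()})
collection.update(torch.tensor([1.0, 2.0]))
print(collection.compute())  # a flat dict of scalar tensors, safe to pass to log_dict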

rslearn/train/tasks/multi_task.py
@@ -118,13 +118,16 @@ class MultiTask(Task):

     def get_metrics(self) -> MetricCollection:
         """Get metrics for this task."""
-        metrics = []
+        # Flatten metrics into a single dict with task_name/ prefix to avoid nested
+        # MetricCollections. Nested collections cause issues because MetricCollection
+        # has postfix=None which breaks MetricCollection.compute().
+        all_metrics = {}
         for task_name, task in self.tasks.items():
-            cur_metrics = {}
             for metric_name, metric in task.get_metrics().items():
-                cur_metrics[metric_name] = MetricWrapper(task_name, metric)
-            metrics.append(MetricCollection(cur_metrics, prefix=f"{task_name}/"))
-        return MetricCollection(metrics)
+                all_metrics[f"{task_name}/{metric_name}"] = MetricWrapper(
+                    task_name, metric
+                )
+        return MetricCollection(all_metrics)


 class MetricWrapper(Metric):
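
Note: an illustrative sketch (not part of the diff) of the flattened key layout get_metrics() now produces, with hypothetical task and metric names:

# Two hypothetical tasks, "segment" and "detect", each exposing an "accuracy" metric,
# end up in one flat MetricCollection:
#   "segment/accuracy" -> MetricWrapper("segment", <segment accuracy metric>)
#   "detect/accuracy"  -> MetricWrapper("detect", <detect accuracy metric>)
# Previously these lived in nested per-task MetricCollections with prefix=f"{task_name}/".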

rslearn/train/tasks/per_pixel_regression.py
@@ -100,7 +100,7 @@ class PerPixelRegressionTask(BasicTask):
            raise ValueError(
                f"PerPixelRegressionTask output must be an HW tensor, but got shape {raw_output.shape}"
            )
-       return (raw_output / self.scale_factor).cpu().numpy()
+       return (raw_output[None, :, :] / self.scale_factor).cpu().numpy()

    def visualize(
        self,