PyPI - rslearn - Versions diffs - 0.0.1__py3-none-any.whl → 0.0.21__py3-none-any.whl - Mend

rslearn-0.0.1-py3-none-any.whl → rslearn-0.0.21-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (166)

rslearn/arg_parser.py +31 -0
rslearn/config/__init__.py +6 -12
rslearn/config/dataset.py +520 -401
rslearn/const.py +9 -15
rslearn/data_sources/__init__.py +8 -23
rslearn/data_sources/aws_landsat.py +242 -98
rslearn/data_sources/aws_open_data.py +111 -151
rslearn/data_sources/aws_sentinel1.py +131 -0
rslearn/data_sources/climate_data_store.py +471 -0
rslearn/data_sources/copernicus.py +884 -12
rslearn/data_sources/data_source.py +43 -12
rslearn/data_sources/earthdaily.py +484 -0
rslearn/data_sources/earthdata_srtm.py +282 -0
rslearn/data_sources/eurocrops.py +242 -0
rslearn/data_sources/gcp_public_data.py +578 -222
rslearn/data_sources/google_earth_engine.py +461 -135
rslearn/data_sources/local_files.py +219 -150
rslearn/data_sources/openstreetmap.py +51 -89
rslearn/data_sources/planet.py +24 -60
rslearn/data_sources/planet_basemap.py +275 -0
rslearn/data_sources/planetary_computer.py +798 -0
rslearn/data_sources/usda_cdl.py +195 -0
rslearn/data_sources/usgs_landsat.py +115 -83
rslearn/data_sources/utils.py +249 -61
rslearn/data_sources/vector_source.py +1 -0
rslearn/data_sources/worldcereal.py +449 -0
rslearn/data_sources/worldcover.py +144 -0
rslearn/data_sources/worldpop.py +153 -0
rslearn/data_sources/xyz_tiles.py +150 -107
rslearn/dataset/__init__.py +8 -2
rslearn/dataset/add_windows.py +2 -2
rslearn/dataset/dataset.py +40 -51
rslearn/dataset/handler_summaries.py +131 -0
rslearn/dataset/manage.py +313 -74
rslearn/dataset/materialize.py +431 -107
rslearn/dataset/remap.py +29 -4
rslearn/dataset/storage/__init__.py +1 -0
rslearn/dataset/storage/file.py +202 -0
rslearn/dataset/storage/storage.py +140 -0
rslearn/dataset/window.py +181 -44
rslearn/lightning_cli.py +454 -0
rslearn/log_utils.py +24 -0
rslearn/main.py +384 -181
rslearn/models/anysat.py +215 -0
rslearn/models/attention_pooling.py +177 -0
rslearn/models/clay/clay.py +231 -0
rslearn/models/clay/configs/metadata.yaml +295 -0
rslearn/models/clip.py +68 -0
rslearn/models/component.py +111 -0
rslearn/models/concatenate_features.py +103 -0
rslearn/models/conv.py +63 -0
rslearn/models/croma.py +306 -0
rslearn/models/detr/__init__.py +5 -0
rslearn/models/detr/box_ops.py +103 -0
rslearn/models/detr/detr.py +504 -0
rslearn/models/detr/matcher.py +107 -0
rslearn/models/detr/position_encoding.py +114 -0
rslearn/models/detr/transformer.py +429 -0
rslearn/models/detr/util.py +24 -0
rslearn/models/dinov3.py +177 -0
rslearn/models/faster_rcnn.py +30 -28
rslearn/models/feature_center_crop.py +53 -0
rslearn/models/fpn.py +19 -8
rslearn/models/galileo/__init__.py +5 -0
rslearn/models/galileo/galileo.py +595 -0
rslearn/models/galileo/single_file_galileo.py +1678 -0
rslearn/models/module_wrapper.py +65 -0
rslearn/models/molmo.py +69 -0
rslearn/models/multitask.py +384 -28
rslearn/models/olmoearth_pretrain/__init__.py +1 -0
rslearn/models/olmoearth_pretrain/model.py +421 -0
rslearn/models/olmoearth_pretrain/norm.py +86 -0
rslearn/models/panopticon.py +170 -0
rslearn/models/panopticon_data/sensors/drone.yaml +32 -0
rslearn/models/panopticon_data/sensors/enmap.yaml +904 -0
rslearn/models/panopticon_data/sensors/goes.yaml +9 -0
rslearn/models/panopticon_data/sensors/himawari.yaml +9 -0
rslearn/models/panopticon_data/sensors/intuition.yaml +606 -0
rslearn/models/panopticon_data/sensors/landsat8.yaml +84 -0
rslearn/models/panopticon_data/sensors/modis_terra.yaml +99 -0
rslearn/models/panopticon_data/sensors/qb2_ge1.yaml +34 -0
rslearn/models/panopticon_data/sensors/sentinel1.yaml +85 -0
rslearn/models/panopticon_data/sensors/sentinel2.yaml +97 -0
rslearn/models/panopticon_data/sensors/superdove.yaml +60 -0
rslearn/models/panopticon_data/sensors/wv23.yaml +63 -0
rslearn/models/pick_features.py +17 -10
rslearn/models/pooling_decoder.py +60 -7
rslearn/models/presto/__init__.py +5 -0
rslearn/models/presto/presto.py +297 -0
rslearn/models/presto/single_file_presto.py +926 -0
rslearn/models/prithvi.py +1147 -0
rslearn/models/resize_features.py +59 -0
rslearn/models/sam2_enc.py +13 -9
rslearn/models/satlaspretrain.py +38 -18
rslearn/models/simple_time_series.py +188 -77
rslearn/models/singletask.py +24 -13
rslearn/models/ssl4eo_s12.py +40 -30
rslearn/models/swin.py +44 -32
rslearn/models/task_embedding.py +250 -0
rslearn/models/terramind.py +256 -0
rslearn/models/trunk.py +139 -0
rslearn/models/unet.py +68 -22
rslearn/models/upsample.py +48 -0
rslearn/models/use_croma.py +508 -0
rslearn/template_params.py +26 -0
rslearn/tile_stores/__init__.py +41 -18
rslearn/tile_stores/default.py +409 -0
rslearn/tile_stores/tile_store.py +236 -132
rslearn/train/all_patches_dataset.py +530 -0
rslearn/train/callbacks/adapters.py +53 -0
rslearn/train/callbacks/freeze_unfreeze.py +348 -17
rslearn/train/callbacks/gradients.py +129 -0
rslearn/train/callbacks/peft.py +116 -0
rslearn/train/data_module.py +444 -20
rslearn/train/dataset.py +588 -235
rslearn/train/lightning_module.py +192 -62
rslearn/train/model_context.py +88 -0
rslearn/train/optimizer.py +31 -0
rslearn/train/prediction_writer.py +319 -84
rslearn/train/scheduler.py +92 -0
rslearn/train/tasks/classification.py +55 -28
rslearn/train/tasks/detection.py +132 -76
rslearn/train/tasks/embedding.py +120 -0
rslearn/train/tasks/multi_task.py +28 -14
rslearn/train/tasks/per_pixel_regression.py +291 -0
rslearn/train/tasks/regression.py +161 -44
rslearn/train/tasks/segmentation.py +428 -53
rslearn/train/tasks/task.py +6 -5
rslearn/train/transforms/__init__.py +1 -1
rslearn/train/transforms/concatenate.py +54 -10
rslearn/train/transforms/crop.py +29 -11
rslearn/train/transforms/flip.py +18 -6
rslearn/train/transforms/mask.py +78 -0
rslearn/train/transforms/normalize.py +101 -17
rslearn/train/transforms/pad.py +19 -7
rslearn/train/transforms/resize.py +83 -0
rslearn/train/transforms/select_bands.py +76 -0
rslearn/train/transforms/sentinel1.py +75 -0
rslearn/train/transforms/transform.py +89 -70
rslearn/utils/__init__.py +2 -6
rslearn/utils/array.py +8 -6
rslearn/utils/feature.py +2 -2
rslearn/utils/fsspec.py +90 -1
rslearn/utils/geometry.py +347 -7
rslearn/utils/get_utm_ups_crs.py +2 -3
rslearn/utils/grid_index.py +5 -5
rslearn/utils/jsonargparse.py +178 -0
rslearn/utils/mp.py +4 -3
rslearn/utils/raster_format.py +268 -116
rslearn/utils/rtree_index.py +64 -17
rslearn/utils/sqlite_index.py +7 -1
rslearn/utils/vector_format.py +252 -97
{rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/METADATA +532 -283
rslearn-0.0.21.dist-info/RECORD +167 -0
{rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/WHEEL +1 -1
rslearn-0.0.21.dist-info/licenses/NOTICE +115 -0
rslearn/data_sources/raster_source.py +0 -309
rslearn/models/registry.py +0 -5
rslearn/tile_stores/file.py +0 -242
rslearn/utils/mgrs.py +0 -24
rslearn/utils/utils.py +0 -22
rslearn-0.0.1.dist-info/RECORD +0 -88
/rslearn/{data_sources/geotiff.py → py.typed} +0 -0
{rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/entry_points.txt +0 -0
{rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info/licenses}/LICENSE +0 -0
{rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/top_level.txt +0 -0

rslearn/config/dataset.py CHANGED Viewed

@@ -1,25 +1,84 @@
 """Classes for storing configuration of a dataset."""
+import copy
+import functools
+import json
+import warnings
 from datetime import timedelta
-from enum import Enum
-from typing import Any
+from enum import StrEnum
+from typing import TYPE_CHECKING, Annotated, Any
+import jsonargparse
 import numpy as np
 import numpy.typing as npt
 import pytimeparse
-import torch
+from pydantic import (
+    BaseModel,
+    BeforeValidator,
+    ConfigDict,
+    Field,
+    PlainSerializer,
+    field_validator,
+    model_validator,
+)
 from rasterio.enums import Resampling
+from upath import UPath
-from rslearn.utils import PixelBounds, Projection
+from rslearn.log_utils import get_logger
+from rslearn.utils.geometry import PixelBounds, Projection, ResolutionFactor
+from rslearn.utils.raster_format import RasterFormat
+from rslearn.utils.vector_format import VectorFormat
+if TYPE_CHECKING:
+    from rslearn.data_sources.data_source import DataSource
+    from rslearn.dataset.storage.storage import WindowStorageFactory
-class DType(Enum):
+logger = get_logger("__name__")
+def ensure_timedelta(v: Any) -> Any:
+    """Ensure the value is a timedelta.
+    If the value is a string, we try to parse it with pytimeparse.
+    This function is meant to be used like Annotated[timedelta, BeforeValidator(ensure_timedelta)].
+    """
+    if isinstance(v, timedelta):
+        return v
+    if isinstance(v, str):
+        return pytimeparse.parse(v)
+    raise TypeError(f"Invalid type for timedelta: {type(v).__name__}")
+def ensure_optional_timedelta(v: Any) -> Any:
+    """Like ensure_timedelta, but allows None as a value."""
+    if v is None:
+        return None
+    if isinstance(v, timedelta):
+        return v
+    if isinstance(v, str):
+        return pytimeparse.parse(v)
+    raise TypeError(f"Invalid type for timedelta: {type(v).__name__}")
+def serialize_optional_timedelta(v: timedelta | None) -> str | None:
+    """Serialize an optional timedelta for compatibility with pytimeparse."""
+    if v is None:
+        return None
+    return str(v.total_seconds()) + "s"
+class DType(StrEnum):
     """Data type of a raster."""
     UINT8 = "uint8"
     UINT16 = "uint16"
     UINT32 = "uint32"
+    UINT64 = "uint64"
+    INT8 = "int8"
+    INT16 = "int16"
     INT32 = "int32"
+    INT64 = "int64"
     FLOAT32 = "float32"
     def get_numpy_dtype(self) -> npt.DTypeLike:
@@ -30,77 +89,43 @@ class DType(Enum):
             return np.uint16
         elif self == DType.UINT32:
             return np.uint32
+        elif self == DType.UINT64:
+            return np.uint64
+        elif self == DType.INT8:
+            return np.int8
+        elif self == DType.INT16:
+            return np.int16
         elif self == DType.INT32:
             return np.int32
+        elif self == DType.INT64:
+            return np.int64
         elif self == DType.FLOAT32:
             return np.float32
         raise ValueError(f"unable to handle numpy dtype {self}")
-    def get_torch_dtype(self) -> torch.dtype:
-        """Returns pytorch dtype object corresponding to this DType."""
-        if self == DType.INT32:
-            return torch.int32
-        elif self == DType.FLOAT32:
-            return torch.float32
-        else:
-            raise ValueError(f"unable to handle torch dtype {self}")
+class ResamplingMethod(StrEnum):
+    """An enum representing the rasterio Resampling."""
-RESAMPLING_METHODS = {
-    "nearest": Resampling.nearest,
-    "bilinear": Resampling.bilinear,
-    "cubic": Resampling.cubic,
-    "cubic_spline": Resampling.cubic_spline,
-}
+    NEAREST = "nearest"
+    BILINEAR = "bilinear"
+    CUBIC = "cubic"
+    CUBIC_SPLINE = "cubic_spline"
+    def get_rasterio_resampling(self) -> Resampling:
+        """Get the rasterio Resampling corresponding to this ResamplingMethod."""
+        return RESAMPLING_METHODS[self]
-class RasterFormatConfig:
-    """A configuration specifying a RasterFormat."""
-    def __init__(self, name: str, config_dict: dict[str, Any]) -> None:
-        """Initialize a new RasterFormatConfig.
-        Args:
-            name: the name of the RasterFormat to use.
-            config_dict: configuration to pass to the RasterFormat.
-        """
-        self.name = name
-        self.config_dict = config_dict
-    @staticmethod
-    def from_config(config: dict[str, Any]) -> "RasterFormatConfig":
-        """Create a RasterFormatConfig from config dict.
-        Args:
-            config: the config dict for this RasterFormatConfig
-        """
-        return RasterFormatConfig(name=config["name"], config_dict=config)
-class VectorFormatConfig:
-    """A configuration specifying a VectorFormat."""
-    def __init__(self, name: str, config_dict: dict[str, Any] = {}) -> None:
-        """Initialize a new VectorFormatConfig.
-        Args:
-            name: the name of the VectorFormat to use.
-            config_dict: configuration to pass to the VectorFormat.
-        """
-        self.name = name
-        self.config_dict = config_dict
-    @staticmethod
-    def from_config(config: dict[str, Any]) -> "VectorFormatConfig":
-        """Create a VectorFormatConfig from config dict.
-        Args:
-            config: the config dict for this VectorFormatConfig
-        """
-        return VectorFormatConfig(name=config["name"], config_dict=config)
+RESAMPLING_METHODS = {
+    ResamplingMethod.NEAREST: Resampling.nearest,
+    ResamplingMethod.BILINEAR: Resampling.bilinear,
+    ResamplingMethod.CUBIC: Resampling.cubic,
+    ResamplingMethod.CUBIC_SPLINE: Resampling.cubic_spline,
+}
-class BandSetConfig:
+class BandSetConfig(BaseModel):
     """A configuration for a band set in a raster layer.
     Each band set specifies one or more bands that should be stored together.
@@ -108,65 +133,75 @@ class BandSetConfig:
     bands.
     """
-    def __init__(
-        self,
-        config_dict: dict[str, Any],
-        dtype: DType,
-        bands: list[str] | None = None,
-        format: dict[str, Any] | None = None,
-        zoom_offset: int = 0,
-        remap: dict[str, Any] | None = None,
-    ) -> None:
-        """Creates a new BandSetConfig instance.
-        Args:
-            config_dict: the config dict used to configure this BandSetConfig
-            dtype: the pixel value type to store tiles in
-            bands: list of band names in this BandSetConfig
-            format: the format to store tiles in, defaults to geotiff
-            zoom_offset: non-negative integer, store images at window resolution
-                divided by 2^(zoom_offset).
-            remap: config dict for Remapper to remap pixel values
-        """
-        self.config_dict = config_dict
-        self.bands = bands
-        self.format = format
-        self.dtype = dtype
-        self.zoom_offset = zoom_offset
-        self.remap = remap
-        if not self.format:
-            self.format = {"name": "geotiff"}
-    def serialize(self) -> dict[str, Any]:
-        """Serialize this BandSetConfig to a config dict, currently unused."""
-        return {
-            "bands": self.bands,
-            "format": self.format,
-            "dtype": self.dtype,
-            "zoom_offset": self.zoom_offset,
-            "remap": self.remap,
-        }
-    @staticmethod
-    def from_config(config: dict[str, Any]) -> "BandSetConfig":
-        """Create a BandSetConfig from config dict.
-        Args:
-            config: the config dict for this BandSetConfig
-        """
-        kwargs = dict(
-            config_dict=config,
-            dtype=DType(config["dtype"]),
-        )
-        for k in ["bands", "format", "zoom_offset", "remap"]:
-            if k in config:
-                kwargs[k] = config[k]
-        return BandSetConfig(**kwargs)
+    model_config = ConfigDict(extra="forbid")
+    dtype: DType = Field(
+        description="Pixel value type to store the data under. This is used during dataset materialize and model predict."
+    )
+    bands: list[str] = Field(
+        default_factory=lambda: [],
+        description="List of band names in this BandSetConfig. One of bands or num_bands must be set.",
+    )
+    num_bands: int | None = Field(
+        default=None,
+        description="The number of bands in this band set. The bands will be named B0, B1, B2, etc.",
+    )
+    format: dict[str, Any] = Field(
+        default_factory=lambda: {
+            "class_path": "rslearn.utils.raster_format.GeotiffRasterFormat"
+        },
+        description="jsonargparse configuration for the RasterFormat to store the tiles in.",
+    )
+    # Store images at a resolution higher or lower than the window resolution. This
+    # enables keeping source data at its native resolution, either to save storage
+    # space (for lower resolution data) or to retain details (for higher resolution
+    # data). If positive, store data at the window resolution divided by
+    # 2^(zoom_offset) (higher resolution). If negative, store data at the window
+    # resolution multiplied by 2^(-zoom_offset) (lower resolution).
+    zoom_offset: int = Field(
+        default=0,
+        description="Store data at the window resolution multiplied by 2^(-zoom_offset).",
+    )
+    remap: dict[str, Any] | None = Field(
+        default=None,
+        description="Optional jsonargparse configuration for a Remapper to remap pixel values.",
+    )
+    # Optional list of names for the different possible values of each band. The length
+    # of this list must equal the number of bands. For example, [["forest", "desert"]]
+    # means that it is a single-band raster where values can be 0 (forest) or 1
+    # (desert).
+    class_names: list[list[str]] | None = Field(
+        default=None,
+        description="Optional list of names for the different possible values of each band.",
+    )
+    # Optional list of nodata values for this band set. This is used during
+    # materialization when creating mosaics, to determine which parts of the source
+    # images should be copied.
+    nodata_vals: list[float] | None = Field(
+        default=None, description="Optional nodata value for each band."
+    )
+    @model_validator(mode="after")
+    def after_validator(self) -> "BandSetConfig":
+        """Ensure the BandSetConfig is valid, and handle the num_bands field."""
+        if (len(self.bands) == 0 and self.num_bands is None) or (
+            len(self.bands) != 0 and self.num_bands is not None
+        ):
+            raise ValueError("exactly one of bands and num_bands must be specified")
+        if self.num_bands is not None:
+            self.bands = [f"B{band_idx}" for band_idx in range(self.num_bands)]
+            self.num_bands = None
+        return self
     def get_final_projection_and_bounds(
-        self, projection: Projection, bounds: PixelBounds | None
-    ) -> tuple[Projection, PixelBounds | None]:
+        self, projection: Projection, bounds: PixelBounds
+    ) -> tuple[Projection, PixelBounds]:
         """Gets the final projection/bounds based on band set config.
         The band set config may apply a non-zero zoom offset that modifies the window's
@@ -180,348 +215,432 @@ class BandSetConfig:
         Returns:
             tuple of updated projection and bounds with zoom offset applied
         """
-        if self.zoom_offset == 0:
-            return projection, bounds
-        projection = Projection(
-            projection.crs,
-            projection.x_resolution / (2**self.zoom_offset),
-            projection.y_resolution / (2**self.zoom_offset),
+        if self.zoom_offset >= 0:
+            factor = ResolutionFactor(numerator=2**self.zoom_offset)
+        else:
+            factor = ResolutionFactor(denominator=2 ** (-self.zoom_offset))
+        return (factor.multiply_projection(projection), factor.multiply_bounds(bounds))
+    @field_validator("format", mode="before")
+    @classmethod
+    def convert_format_from_legacy(cls, v: dict[str, Any]) -> dict[str, Any]:
+        """Support legacy format of the RasterFormat.
+        The legacy format sets 'name' instead of 'class_path', and uses custom parsing
+        for the init_args.
+        """
+        if "name" not in v:
+            # New version, it is all good.
+            return v
+        warnings.warn(
+            "`format = {'name': ...}` is deprecated; "
+            "use `{'class_path': '...', 'init_args': {...}}` instead.",
+            DeprecationWarning,
+        )
+        logger.warning(
+            "BandSet.format uses legacy format; support will be removed after 2026-03-01."
+        )
+        legacy_name_to_class_path = {
+            "image_tile": "rslearn.utils.raster_format.ImageTileRasterFormat",
+            "geotiff": "rslearn.utils.raster_format.GeotiffRasterFormat",
+            "single_image": "rslearn.utils.raster_format.SingleImageRasterFormat",
+        }
+        if v["name"] not in legacy_name_to_class_path:
+            raise ValueError(
+                f"could not parse legacy format with unknown raster format {v['name']}"
+            )
+        init_args = dict(v)
+        class_path = legacy_name_to_class_path[init_args.pop("name")]
+        return dict(
+            class_path=class_path,
+            init_args=init_args,
         )
-        if bounds:
-            if self.zoom_offset > 0:
-                bounds = tuple(x * (2**self.zoom_offset) for x in bounds)
-            else:
-                bounds = tuple(x // (2 ** (-self.zoom_offset)) for x in bounds)
-        return projection, bounds
+    def instantiate_raster_format(self) -> RasterFormat:
+        """Instantiate the RasterFormat specified by this BandSetConfig."""
+        from rslearn.utils.jsonargparse import init_jsonargparse
-class SpaceMode(Enum):
+        init_jsonargparse()
+        parser = jsonargparse.ArgumentParser()
+        parser.add_argument("--raster_format", type=RasterFormat)
+        cfg = parser.parse_object({"raster_format": self.format})
+        raster_format = parser.instantiate_classes(cfg).raster_format
+        return raster_format
+class SpaceMode(StrEnum):
     """Spatial matching mode when looking up items corresponding to a window."""
-    CONTAINS = 1
+    CONTAINS = "CONTAINS"
     """Items must contain the entire window."""
-    INTERSECTS = 2
+    INTERSECTS = "INTERSECTS"
     """Items must overlap any portion of the window."""
-    MOSAIC = 3
+    MOSAIC = "MOSAIC"
     """Groups of items should be computed that cover the entire window.
     During materialization, items in each group are merged to form a mosaic in the
     dataset.
     """
+    PER_PERIOD_MOSAIC = "PER_PERIOD_MOSAIC"
+    """Create one mosaic per sub-period of the time range.
-class TimeMode(Enum):
-    """Temporal  matching mode when looking up items corresponding to a window."""
-    WITHIN = 1
-    """Items must be within the window time range."""
+    The duration of the sub-periods is controlled by another option in QueryConfig.
+    """
-    NEAREST = 2
-    """Select items closest to the window time range, up to max_matches."""
+    COMPOSITE = "COMPOSITE"
+    """Creates one composite covering the entire window.
-    BEFORE = 3
-    """Select items before the start of the window time range, up to max_matches."""
+    During querying all items intersecting the window are placed in one group.
+    The compositing_method in the rasterlayer config specifies how these items are reduced
+    to a single item (e.g MEAN/MEDIAN/FIRST_VALID) during materialization.
+    """
-    AFTER = 4
-    """Select items after the end of the window time range, up to max_matches."""
+    # TODO add PER_PERIOD_COMPOSITE
-class QueryConfig:
-    """A configuration for querying items in a data source."""
+class TimeMode(StrEnum):
+    """Temporal  matching mode when looking up items corresponding to a window."""
-    def __init__(
-        self,
-        space_mode: SpaceMode = SpaceMode.MOSAIC,
-        time_mode: TimeMode = TimeMode.WITHIN,
-        max_matches: int = 1,
-    ):
-        """Creates a new query configuration.
+    WITHIN = "WITHIN"
+    """Items must be within the window time range."""
-        The provided options determine how a DataSource should lookup items that match a
-        spatiotemporal window.
+    NEAREST = "NEAREST"
+    """Select items closest to the window time range, up to max_matches."""
-        Args:
-            space_mode: specifies how items should be matched with windows spatially
-            time_mode: specifies how items should be matched with windows temporally
-            max_matches: the maximum number of items (or groups of items, if space_mode
-                is MOSAIC) to match
-        """
-        self.space_mode = space_mode
-        self.time_mode = time_mode
-        self.max_matches = max_matches
-    def serialize(self) -> dict[str, Any]:
-        """Serialize this QueryConfig to a config dict, currently unused."""
-        return {
-            "space_mode": str(self.space_mode),
-            "time_mode": str(self.time_mode),
-            "max_matches": self.max_matches,
-        }
+    BEFORE = "BEFORE"
+    """Select items before the end of the window time range, up to max_matches."""
-    @staticmethod
-    def from_config(config: dict[str, Any]) -> "QueryConfig":
-        """Create a QueryConfig from config dict.
+    AFTER = "AFTER"
+    """Select items after the start of the window time range, up to max_matches."""
-        Args:
-            config: the config dict for this QueryConfig
-        """
-        return QueryConfig(
-            space_mode=SpaceMode[config.get("space_mode", "MOSAIC")],
-            time_mode=TimeMode[config.get("time_mode", "WITHIN")],
-            max_matches=config.get("max_matches", 1),
-        )
+class QueryConfig(BaseModel):
+    """A configuration for querying items in a data source."""
-class DataSourceConfig:
+    model_config = ConfigDict(frozen=True, extra="forbid")
+    space_mode: SpaceMode = Field(
+        default=SpaceMode.MOSAIC,
+        description="Specifies how items should be matched with windows spatially.",
+    )
+    time_mode: TimeMode = Field(
+        default=TimeMode.WITHIN,
+        description="Specifies how items should be matched with windows temporally.",
+    )
+    # Minimum number of item groups. If there are fewer than this many matches, then no
+    # matches will be returned. This can be used to prevent unnecessary data ingestion
+    # if the user plans to discard windows that do not have a sufficient amount of data.
+    min_matches: int = Field(
+        default=0, description="The minimum number of item groups."
+    )
+    max_matches: int = Field(
+        default=1, description="The maximum number of item groups."
+    )
+    period_duration: Annotated[
+        timedelta,
+        BeforeValidator(ensure_timedelta),
+        PlainSerializer(serialize_optional_timedelta),
+    ] = Field(
+        default=timedelta(days=30),
+        description="The duration of the periods, if the space mode is PER_PERIOD_MOSAIC.",
+    )
+class DataSourceConfig(BaseModel):
     """Configuration for a DataSource in a dataset layer."""
-    def __init__(
-        self,
-        name: str,
-        query_config: QueryConfig,
-        config_dict: dict[str, Any],
-        time_offset: timedelta | None = None,
-        duration: timedelta | None = None,
-        ingest: bool = True,
-    ) -> None:
-        """Initializes a new DataSourceConfig.
-        Args:
-            name: the data source class name
-            query_config: the QueryConfig specifying how to match items with windows
-            config_dict: additional config passed to initialize the DataSource
-            time_offset: optional, add this timedelta to the window's time range before
-                matching
-            duration: optional, if window's time range is (t0, t1), then update to
-                (t0, t0 + duration)
-            ingest: whether to ingest this layer or directly materialize it
-                (default true)
+    model_config = ConfigDict(frozen=True, extra="forbid")
+    class_path: str = Field(description="Class path for the data source.")
+    init_args: dict[str, Any] = Field(
+        default_factory=lambda: {},
+        description="jsonargparse init args for the data source.",
+    )
+    query_config: QueryConfig = Field(
+        default_factory=lambda: QueryConfig(),
+        description="QueryConfig specifying how to match items with windows.",
+    )
+    time_offset: Annotated[
+        timedelta | None,
+        BeforeValidator(ensure_optional_timedelta),
+        PlainSerializer(serialize_optional_timedelta),
+    ] = Field(
+        default=None,
+        description="Optional timedelta to add to the window's time range before matching.",
+    )
+    duration: Annotated[
+        timedelta | None,
+        BeforeValidator(ensure_optional_timedelta),
+        PlainSerializer(serialize_optional_timedelta),
+    ] = Field(
+        default=None,
+        description="Optional, if the window's time range is (t0, t1), then update to (t0, t0 + duration).",
+    )
+    ingest: bool = Field(
+        default=True,
+        description="Whether to ingest this layer (default True). If False, it will be directly materialized without ingestion.",
+    )
+    @model_validator(mode="before")
+    @classmethod
+    def convert_from_legacy(cls, d: dict[str, Any]) -> dict[str, Any]:
+        """Support legacy format of the DataSourceConfig.
+        The legacy format sets 'name' instead of 'class_path', and mixes the arguments
+        for the DataSource in with the DataSourceConfig keys.
         """
-        self.name = name
-        self.query_config = query_config
-        self.config_dict = config_dict
-        self.time_offset = time_offset
-        self.duration = duration
-        self.ingest = ingest
-    def serialize(self) -> dict[str, Any]:
-        """Serialize this DataSourceConfig to a config dict, currently unused."""
-        config_dict = self.config_dict.copy()
-        config_dict["name"] = self.name
-        config_dict["query_config"] = self.query_config.serialize()
-        config_dict["ingest"] = self.ingest
-        if self.time_offset:
-            config_dict["time_offset"] = str(self.time_offset)
-        if self.duration:
-            config_dict["duration"] = str(self.duration)
-        return config_dict
-    @staticmethod
-    def from_config(config: dict[str, Any]) -> "DataSourceConfig":
-        """Create a DataSourceConfig from config dict.
-        Args:
-            config: the config dict for this DataSourceConfig
-        """
-        kwargs = dict(
-            name=config["name"],
-            query_config=QueryConfig.from_config(config.get("query_config", {})),
-            config_dict=config,
+        if "name" not in d:
+            # New version, it is all good.
+            return d
+        warnings.warn(
+            "`Data source configuration {'name': ...}` is deprecated; "
+            "use `{'class_path': '...', 'init_args': {...}, ...}` instead.",
+            DeprecationWarning,
         )
-        if "time_offset" in config:
-            kwargs["time_offset"] = timedelta(
-                seconds=pytimeparse.parse(config["time_offset"])
-            )
-        if "duration" in config:
-            kwargs["duration"] = timedelta(
-                seconds=pytimeparse.parse(config["duration"])
+        logger.warning(
+            "Data source configuration uses legacy format; support will be removed after 2026-03-01."
+        )
+        # Split the dict into the base config that is in the pydantic model, and the
+        # source-specific options that should be moved to init_args dict.
+        class_path = d["name"]
+        base_config: dict[str, Any] = {}
+        ds_init_args: dict[str, Any] = {}
+        for k, v in d.items():
+            if k == "name":
+                continue
+            if k in cls.model_fields:
+                base_config[k] = v
+            else:
+                ds_init_args[k] = v
+        # Some legacy configs erroneously specify these keys, which are now caught by
+        # validation. But we still want those specific legacy configs to work.
+        if (
+            class_path == "rslearn.data_sources.planetary_computer.Sentinel2"
+            and "max_cloud_cover" in ds_init_args
+        ):
+            warnings.warn(
+                "Data source configuration specifies invalid 'max_cloud_cover' option.",
+                DeprecationWarning,
             )
-        if "ingest" in config:
-            kwargs["ingest"] = config["ingest"]
-        return DataSourceConfig(**kwargs)
+            del ds_init_args["max_cloud_cover"]
+        base_config["class_path"] = class_path
+        base_config["init_args"] = ds_init_args
+        return base_config
-class LayerType(Enum):
+class LayerType(StrEnum):
     """The layer type (raster or vector)."""
+    # As a StrEnum, each member is also a str and compares equal to its plain string
+    # value ("raster" / "vector").
     RASTER = "raster"
     VECTOR = "vector"
-class LayerConfig:
-    """Configuration of a layer in a dataset."""
-    def __init__(
-        self,
-        layer_type: LayerType,
-        data_source: DataSourceConfig | None = None,
-        alias: str | None = None,
-    ):
-        """Initialize a new LayerConfig.
+class CompositingMethod(StrEnum):
+    """How to select pixel values for the composite from the corresponding items of a window."""
-        Args:
-            layer_type: the LayerType (raster or vector)
-            data_source: optional DataSourceConfig if this layer is retrievable
-            alias: alias for this layer to use in the tile store
-        """
-        self.layer_type = layer_type
-        self.data_source = data_source
-        self.alias = alias
-    def serialize(self) -> dict[str, Any]:
-        """Serialize this LayerConfig to a config dict, currently unused."""
-        return {
-            "layer_type": str(self.layer_type),
-            "data_source": self.data_source,
-            "alias": self.alias,
-        }
+    FIRST_VALID = "FIRST_VALID"
+    """Select first valid pixel in order of corresponding items (might be sorted)"""
+    MEAN = "MEAN"
+    """Select per-pixel mean value of corresponding items of a window"""
-class RasterLayerConfig(LayerConfig):
-    """Configuration of a raster layer."""
+    MEDIAN = "MEDIAN"
+    """Select per-pixel median value of corresponding items of a window"""
-    def __init__(
-        self,
-        layer_type: LayerType,
-        band_sets: list[BandSetConfig],
-        data_source: DataSourceConfig | None = None,
-        resampling_method: Resampling = Resampling.bilinear,
-        alias: str | None = None,
-    ):
-        """Initialize a new RasterLayerConfig.
-        Args:
-            layer_type: the LayerType (must be raster)
-            band_sets: the bands to store in this layer
-            data_source: optional DataSourceConfig if this layer is retrievable
-            resampling_method: how to resample rasters (if needed), default bilinear resampling
-            alias: alias for this layer to use in the tile store
-        """
-        super().__init__(layer_type, data_source, alias)
-        self.band_sets = band_sets
-        self.resampling_method = resampling_method
+class LayerConfig(BaseModel):
+    """Configuration of a layer in a dataset.
+    Raster-only and vector-only options are both declared on this one model; the
+    `type` field says which kind of layer this is.
+    """
-    @staticmethod
-    def from_config(config: dict[str, Any]) -> "RasterLayerConfig":
-        """Create a RasterLayerConfig from config dict.
+    # frozen=True makes instances immutable; extra="forbid" rejects unknown keys.
+    model_config = ConfigDict(frozen=True, extra="forbid")
+    type: LayerType = Field(description="The LayerType (raster or vector).")
+    data_source: DataSourceConfig | None = Field(
+        default=None,
+        description="Optional DataSourceConfig if this layer is retrievable.",
+    )
+    alias: str | None = Field(
+        default=None, description="Alias for this layer to use in the tile store."
+    )
+    # Raster layer options.
+    band_sets: list[BandSetConfig] = Field(
+        default_factory=lambda: [],
+        description="For raster layers, the bands to store in this layer.",
+    )
+    resampling_method: ResamplingMethod = Field(
+        default=ResamplingMethod.BILINEAR,
+        description="For raster layers, how to resample rasters (if needed), default bilinear resampling.",
+    )
+    compositing_method: CompositingMethod = Field(
+        default=CompositingMethod.FIRST_VALID,
+        description="For raster layers, how to compute pixel values in the composite of each window's items.",
+    )
+    # Vector layer options.
+    vector_format: dict[str, Any] = Field(
+        default_factory=lambda: {
+            "class_path": "rslearn.utils.vector_format.GeojsonVectorFormat"
+        },
+        description="For vector layers, the jsonargparse configuration for the VectorFormat.",
+    )
+    class_property_name: str | None = Field(
+        default=None,
+        description="Optional metadata field indicating that the GeoJSON features contain a property that corresponds to a class label, and this is the name of that property.",
+    )
+    class_names: list[str] | None = Field(
+        default=None,
+        description="The list of classes that the class_property_name property could be set to.",
+    )
+    @model_validator(mode="after")
+    def after_validator(self) -> "LayerConfig":
+        """Ensure the LayerConfig is valid.
+        Returns:
+            this LayerConfig, unchanged.
+        Raises:
+            ValueError: if this is a raster layer with no band sets.
+        """
+        if self.type == LayerType.RASTER and len(self.band_sets) == 0:
+            raise ValueError(
+                "band sets must be specified and non-empty for raster layers"
+            )
-        Args:
-            config: the config dict for this RasterLayerConfig
-        """
-        kwargs = {
-            "layer_type": LayerType(config["type"]),
-            "band_sets": [BandSetConfig.from_config(el) for el in config["band_sets"]],
-        }
-        if "data_source" in config:
-            kwargs["data_source"] = DataSourceConfig.from_config(config["data_source"])
-        if "resampling_method" in config:
-            kwargs["resampling_method"] = RESAMPLING_METHODS[
-                config["resampling_method"]
-            ]
-        if "alias" in config:
-            kwargs["alias"] = config["alias"]
-        return RasterLayerConfig(**kwargs)
-class VectorLayerConfig(LayerConfig):
-    """Configuration of a vector layer."""
-    def __init__(
-        self,
-        layer_type: LayerType,
-        data_source: DataSourceConfig | None = None,
-        zoom_offset: int = 0,
-        format: VectorFormatConfig = VectorFormatConfig("geojson"),
-        alias: str | None = None,
-    ):
-        """Initialize a new VectorLayerConfig.
+        return self
-        Args:
-            layer_type: the LayerType (must be vector)
-            data_source: optional DataSourceConfig if this layer is retrievable
-            zoom_offset: zoom offset at which to store the vector data
-            format: the VectorFormatConfig, default storing as GeoJSON
-            alias: alias for this layer to use in the tile store
-        """
-        super().__init__(layer_type, data_source, alias)
-        self.zoom_offset = zoom_offset
-        self.format = format
+    def __hash__(self) -> int:
+        """Return a hash of this LayerConfig."""
+        # Hash the key-sorted JSON dump: the model holds lists and dicts, which are
+        # not hashable themselves, and sorting keys makes the string deterministic.
+        return hash(json.dumps(self.model_dump(mode="json"), sort_keys=True))
-    @staticmethod
-    def from_config(config: dict[str, Any]) -> "VectorLayerConfig":
-        """Create a VectorLayerConfig from config dict.
+    def __eq__(self, other: Any) -> bool:
+        """Returns whether other is the same as this LayerConfig.
         Args:
-            config: the config dict for this VectorLayerConfig
+            other: the other object to compare.
+        Returns:
+            True if other is a LayerConfig whose model_dump() equals this one's.
         """
-        kwargs = {"layer_type": LayerType(config["type"])}
-        if "data_source" in config:
-            kwargs["data_source"] = DataSourceConfig.from_config(config["data_source"])
-        if "zoom_offset" in config:
-            kwargs["zoom_offset"] = config["zoom_offset"]
-        if "format" in config:
-            kwargs["format"] = VectorFormatConfig.from_config(config["format"])
-        if "alias" in config:
-            kwargs["alias"] = config["alias"]
-        return VectorLayerConfig(**kwargs)
+        if not isinstance(other, LayerConfig):
+            return False
+        return self.model_dump() == other.model_dump()
-    def get_final_projection_and_bounds(
-        self, projection: Projection, bounds: PixelBounds | None
-    ) -> tuple[Projection, PixelBounds | None]:
-        """Gets the final projection/bounds based on zoom offset.
+    # NOTE(review): functools.cache on a method keys the cache on (self, ds_path) and
+    # keeps every LayerConfig it has seen alive for the life of the cache. Left as is
+    # because callers may rely on getting the same DataSource back -- confirm.
+    @functools.cache
+    def instantiate_data_source(self, ds_path: UPath | None = None) -> "DataSource":
+        """Instantiate the data source specified by this config.
         Args:
-            projection: the window's projection
-            bounds: the window's bounds (optional)
+            ds_path: optional dataset path to include in the DataSourceContext.
         Returns:
-            tuple of updated projection and bounds with zoom offset applied
+            the DataSource object.
+        Raises:
+            ValueError: if this layer does not specify a data source.
         """
-        if self.zoom_offset == 0:
-            return projection, bounds
-        projection = Projection(
-            projection.crs,
-            projection.x_resolution / (2**self.zoom_offset),
-            projection.y_resolution / (2**self.zoom_offset),
-        )
-        if bounds:
-            if self.zoom_offset > 0:
-                bounds = tuple(x * (2**self.zoom_offset) for x in bounds)
-            else:
-                bounds = tuple(x // (2 ** (-self.zoom_offset)) for x in bounds)
-        return projection, bounds
-def load_layer_config(config: dict[str, Any]) -> LayerConfig:
-    """Load a LayerConfig from a config dict."""
-    layer_type = LayerType(config.get("type"))
-    if layer_type == LayerType.RASTER:
-        return RasterLayerConfig.from_config(config)
-    elif layer_type == LayerType.VECTOR:
-        return VectorLayerConfig.from_config(config)
-    raise ValueError(f"Unknown layer type {layer_type}")
+        # Imported inside the method rather than at module level (presumably to avoid
+        # a circular import -- confirm).
+        from rslearn.data_sources.data_source import DataSource, DataSourceContext
+        from rslearn.utils.jsonargparse import (
+            data_source_context_serializer,
+            init_jsonargparse,
+        )
+        logger.debug("getting a data source for dataset at %s", ds_path)
+        if self.data_source is None:
+            raise ValueError("This layer does not specify a data source")
-class TileStoreConfig:
-    """A configuration specifying a TileStore."""
-    def __init__(self, name: str, config_dict: dict[str, Any]) -> None:
-        """Create a new TileStoreConfig.
+        # Inject the DataSourceContext into the args.
+        context = DataSourceContext(
+            ds_path=ds_path,
+            layer_config=self,
+        )
+        # Deep-copy init_args so that adding the context below does not modify the
+        # dict held by this config.
+        ds_config: dict[str, Any] = {
+            "class_path": self.data_source.class_path,
+            "init_args": copy.deepcopy(self.data_source.init_args),
+        }
+        ds_config["init_args"]["context"] = data_source_context_serializer(context)
-        Args:
-            name: the tile store implementation name to use
-            config_dict: configuration options
-        """
-        self.name = name
-        self.config_dict = config_dict
+        # Now we can parse with jsonargparse.
-    @staticmethod
-    def from_config(config: dict[str, Any]) -> "TileStoreConfig":
-        """Create a TileStoreConfig from config dict.
+        init_jsonargparse()
+        parser = jsonargparse.ArgumentParser()
+        parser.add_argument("--data_source", type=DataSource)
+        cfg = parser.parse_object({"data_source": ds_config})
+        data_source = parser.instantiate_classes(cfg).data_source
+        return data_source
+    def instantiate_vector_format(self) -> VectorFormat:
+        """Instantiate the vector format specified by this config.
+        Returns:
+            the VectorFormat object.
+        Raises:
+            ValueError: if this layer is not a vector layer.
+        """
+        if self.type != LayerType.VECTOR:
+            raise ValueError(
+                f"cannot instantiate vector format for layer with type {self.type}"
+            )
-        Args:
-            config: the config dict for this TileStoreConfig
-        """
-        return TileStoreConfig(name=config["name"], config_dict=config)
+        from rslearn.utils.jsonargparse import init_jsonargparse
+        init_jsonargparse()
+        parser = jsonargparse.ArgumentParser()
+        parser.add_argument("--vector_format", type=VectorFormat)
+        cfg = parser.parse_object({"vector_format": self.vector_format})
+        vector_format = parser.instantiate_classes(cfg).vector_format
+        return vector_format
+class StorageConfig(BaseModel):
+    """Settings that select and parameterize the WindowStorageFactory.
+    The factory is the storage backend for window metadata.
+    """
+    model_config = ConfigDict(frozen=True, extra="forbid")
+    class_path: str = Field(
+        default="rslearn.dataset.storage.file.FileWindowStorageFactory",
+        description="Class path for the WindowStorageFactory.",
+    )
+    init_args: dict[str, Any] = Field(
+        default_factory=dict,
+        description="jsonargparse init args for the WindowStorageFactory.",
+    )
+    def instantiate_window_storage_factory(self) -> "WindowStorageFactory":
+        """Build the WindowStorageFactory described by class_path and init_args.
+        Returns:
+            the instantiated WindowStorageFactory.
+        """
+        from rslearn.dataset.storage.storage import WindowStorageFactory
+        from rslearn.utils.jsonargparse import init_jsonargparse
+        init_jsonargparse()
+        # Hand jsonargparse the class_path/init_args pair under the "wsf" argument.
+        factory_spec = {"class_path": self.class_path, "init_args": self.init_args}
+        parser = jsonargparse.ArgumentParser()
+        parser.add_argument("--wsf", type=WindowStorageFactory)
+        namespace = parser.parse_object({"wsf": factory_spec})
+        return parser.instantiate_classes(namespace).wsf
+class DatasetConfig(BaseModel):
+    """Top-level configuration of a dataset."""
+    model_config = ConfigDict(extra="forbid")
+    layers: dict[str, LayerConfig] = Field(description="Layers in the dataset.")
+    tile_store: dict[str, Any] = Field(
+        default={"class_path": "rslearn.tile_stores.default.DefaultTileStore"},
+        description="jsonargparse configuration for the TileStore.",
+    )
+    storage: StorageConfig = Field(
+        default_factory=lambda: StorageConfig(),
+        description="jsonargparse configuration for the WindowStorageFactory.",
+    )
+    @field_validator("layers", mode="after")
+    @classmethod
+    def layer_names_validator(cls, v: dict[str, LayerConfig]) -> dict[str, LayerConfig]:
+        """Reject layer names that contain a period.
+        Periods are used to distinguish different materialized groups within a layer.
+        Args:
+            v: the mapping from layer name to LayerConfig.
+        Returns:
+            the same mapping, unchanged.
+        Raises:
+            ValueError: if any layer name contains a period.
+        """
+        # Find the first offending name, in the mapping's iteration order.
+        offending = next((name for name in v if "." in name), None)
+        if offending is not None:
+            raise ValueError(f"layer names must not contain periods: {offending}")
+        return v

rslearn 0.0.1__py3-none-any.whl → 0.0.21__py3-none-any.whl

rslearn 0.0.1py3-none-any.whl → 0.0.21py3-none-any.whl