rslearn 0.0.1__py3-none-any.whl → 0.0.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. rslearn/arg_parser.py +31 -0
  2. rslearn/config/__init__.py +6 -12
  3. rslearn/config/dataset.py +520 -401
  4. rslearn/const.py +9 -15
  5. rslearn/data_sources/__init__.py +8 -23
  6. rslearn/data_sources/aws_landsat.py +242 -98
  7. rslearn/data_sources/aws_open_data.py +111 -151
  8. rslearn/data_sources/aws_sentinel1.py +131 -0
  9. rslearn/data_sources/climate_data_store.py +471 -0
  10. rslearn/data_sources/copernicus.py +884 -12
  11. rslearn/data_sources/data_source.py +43 -12
  12. rslearn/data_sources/earthdaily.py +484 -0
  13. rslearn/data_sources/earthdata_srtm.py +282 -0
  14. rslearn/data_sources/eurocrops.py +242 -0
  15. rslearn/data_sources/gcp_public_data.py +578 -222
  16. rslearn/data_sources/google_earth_engine.py +461 -135
  17. rslearn/data_sources/local_files.py +219 -150
  18. rslearn/data_sources/openstreetmap.py +51 -89
  19. rslearn/data_sources/planet.py +24 -60
  20. rslearn/data_sources/planet_basemap.py +275 -0
  21. rslearn/data_sources/planetary_computer.py +798 -0
  22. rslearn/data_sources/usda_cdl.py +195 -0
  23. rslearn/data_sources/usgs_landsat.py +115 -83
  24. rslearn/data_sources/utils.py +249 -61
  25. rslearn/data_sources/vector_source.py +1 -0
  26. rslearn/data_sources/worldcereal.py +449 -0
  27. rslearn/data_sources/worldcover.py +144 -0
  28. rslearn/data_sources/worldpop.py +153 -0
  29. rslearn/data_sources/xyz_tiles.py +150 -107
  30. rslearn/dataset/__init__.py +8 -2
  31. rslearn/dataset/add_windows.py +2 -2
  32. rslearn/dataset/dataset.py +40 -51
  33. rslearn/dataset/handler_summaries.py +131 -0
  34. rslearn/dataset/manage.py +313 -74
  35. rslearn/dataset/materialize.py +431 -107
  36. rslearn/dataset/remap.py +29 -4
  37. rslearn/dataset/storage/__init__.py +1 -0
  38. rslearn/dataset/storage/file.py +202 -0
  39. rslearn/dataset/storage/storage.py +140 -0
  40. rslearn/dataset/window.py +181 -44
  41. rslearn/lightning_cli.py +454 -0
  42. rslearn/log_utils.py +24 -0
  43. rslearn/main.py +384 -181
  44. rslearn/models/anysat.py +215 -0
  45. rslearn/models/attention_pooling.py +177 -0
  46. rslearn/models/clay/clay.py +231 -0
  47. rslearn/models/clay/configs/metadata.yaml +295 -0
  48. rslearn/models/clip.py +68 -0
  49. rslearn/models/component.py +111 -0
  50. rslearn/models/concatenate_features.py +103 -0
  51. rslearn/models/conv.py +63 -0
  52. rslearn/models/croma.py +306 -0
  53. rslearn/models/detr/__init__.py +5 -0
  54. rslearn/models/detr/box_ops.py +103 -0
  55. rslearn/models/detr/detr.py +504 -0
  56. rslearn/models/detr/matcher.py +107 -0
  57. rslearn/models/detr/position_encoding.py +114 -0
  58. rslearn/models/detr/transformer.py +429 -0
  59. rslearn/models/detr/util.py +24 -0
  60. rslearn/models/dinov3.py +177 -0
  61. rslearn/models/faster_rcnn.py +30 -28
  62. rslearn/models/feature_center_crop.py +53 -0
  63. rslearn/models/fpn.py +19 -8
  64. rslearn/models/galileo/__init__.py +5 -0
  65. rslearn/models/galileo/galileo.py +595 -0
  66. rslearn/models/galileo/single_file_galileo.py +1678 -0
  67. rslearn/models/module_wrapper.py +65 -0
  68. rslearn/models/molmo.py +69 -0
  69. rslearn/models/multitask.py +384 -28
  70. rslearn/models/olmoearth_pretrain/__init__.py +1 -0
  71. rslearn/models/olmoearth_pretrain/model.py +421 -0
  72. rslearn/models/olmoearth_pretrain/norm.py +86 -0
  73. rslearn/models/panopticon.py +170 -0
  74. rslearn/models/panopticon_data/sensors/drone.yaml +32 -0
  75. rslearn/models/panopticon_data/sensors/enmap.yaml +904 -0
  76. rslearn/models/panopticon_data/sensors/goes.yaml +9 -0
  77. rslearn/models/panopticon_data/sensors/himawari.yaml +9 -0
  78. rslearn/models/panopticon_data/sensors/intuition.yaml +606 -0
  79. rslearn/models/panopticon_data/sensors/landsat8.yaml +84 -0
  80. rslearn/models/panopticon_data/sensors/modis_terra.yaml +99 -0
  81. rslearn/models/panopticon_data/sensors/qb2_ge1.yaml +34 -0
  82. rslearn/models/panopticon_data/sensors/sentinel1.yaml +85 -0
  83. rslearn/models/panopticon_data/sensors/sentinel2.yaml +97 -0
  84. rslearn/models/panopticon_data/sensors/superdove.yaml +60 -0
  85. rslearn/models/panopticon_data/sensors/wv23.yaml +63 -0
  86. rslearn/models/pick_features.py +17 -10
  87. rslearn/models/pooling_decoder.py +60 -7
  88. rslearn/models/presto/__init__.py +5 -0
  89. rslearn/models/presto/presto.py +297 -0
  90. rslearn/models/presto/single_file_presto.py +926 -0
  91. rslearn/models/prithvi.py +1147 -0
  92. rslearn/models/resize_features.py +59 -0
  93. rslearn/models/sam2_enc.py +13 -9
  94. rslearn/models/satlaspretrain.py +38 -18
  95. rslearn/models/simple_time_series.py +188 -77
  96. rslearn/models/singletask.py +24 -13
  97. rslearn/models/ssl4eo_s12.py +40 -30
  98. rslearn/models/swin.py +44 -32
  99. rslearn/models/task_embedding.py +250 -0
  100. rslearn/models/terramind.py +256 -0
  101. rslearn/models/trunk.py +139 -0
  102. rslearn/models/unet.py +68 -22
  103. rslearn/models/upsample.py +48 -0
  104. rslearn/models/use_croma.py +508 -0
  105. rslearn/template_params.py +26 -0
  106. rslearn/tile_stores/__init__.py +41 -18
  107. rslearn/tile_stores/default.py +409 -0
  108. rslearn/tile_stores/tile_store.py +236 -132
  109. rslearn/train/all_patches_dataset.py +530 -0
  110. rslearn/train/callbacks/adapters.py +53 -0
  111. rslearn/train/callbacks/freeze_unfreeze.py +348 -17
  112. rslearn/train/callbacks/gradients.py +129 -0
  113. rslearn/train/callbacks/peft.py +116 -0
  114. rslearn/train/data_module.py +444 -20
  115. rslearn/train/dataset.py +588 -235
  116. rslearn/train/lightning_module.py +192 -62
  117. rslearn/train/model_context.py +88 -0
  118. rslearn/train/optimizer.py +31 -0
  119. rslearn/train/prediction_writer.py +319 -84
  120. rslearn/train/scheduler.py +92 -0
  121. rslearn/train/tasks/classification.py +55 -28
  122. rslearn/train/tasks/detection.py +132 -76
  123. rslearn/train/tasks/embedding.py +120 -0
  124. rslearn/train/tasks/multi_task.py +28 -14
  125. rslearn/train/tasks/per_pixel_regression.py +291 -0
  126. rslearn/train/tasks/regression.py +161 -44
  127. rslearn/train/tasks/segmentation.py +428 -53
  128. rslearn/train/tasks/task.py +6 -5
  129. rslearn/train/transforms/__init__.py +1 -1
  130. rslearn/train/transforms/concatenate.py +54 -10
  131. rslearn/train/transforms/crop.py +29 -11
  132. rslearn/train/transforms/flip.py +18 -6
  133. rslearn/train/transforms/mask.py +78 -0
  134. rslearn/train/transforms/normalize.py +101 -17
  135. rslearn/train/transforms/pad.py +19 -7
  136. rslearn/train/transforms/resize.py +83 -0
  137. rslearn/train/transforms/select_bands.py +76 -0
  138. rslearn/train/transforms/sentinel1.py +75 -0
  139. rslearn/train/transforms/transform.py +89 -70
  140. rslearn/utils/__init__.py +2 -6
  141. rslearn/utils/array.py +8 -6
  142. rslearn/utils/feature.py +2 -2
  143. rslearn/utils/fsspec.py +90 -1
  144. rslearn/utils/geometry.py +347 -7
  145. rslearn/utils/get_utm_ups_crs.py +2 -3
  146. rslearn/utils/grid_index.py +5 -5
  147. rslearn/utils/jsonargparse.py +178 -0
  148. rslearn/utils/mp.py +4 -3
  149. rslearn/utils/raster_format.py +268 -116
  150. rslearn/utils/rtree_index.py +64 -17
  151. rslearn/utils/sqlite_index.py +7 -1
  152. rslearn/utils/vector_format.py +252 -97
  153. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/METADATA +532 -283
  154. rslearn-0.0.21.dist-info/RECORD +167 -0
  155. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/WHEEL +1 -1
  156. rslearn-0.0.21.dist-info/licenses/NOTICE +115 -0
  157. rslearn/data_sources/raster_source.py +0 -309
  158. rslearn/models/registry.py +0 -5
  159. rslearn/tile_stores/file.py +0 -242
  160. rslearn/utils/mgrs.py +0 -24
  161. rslearn/utils/utils.py +0 -22
  162. rslearn-0.0.1.dist-info/RECORD +0 -88
  163. /rslearn/{data_sources/geotiff.py → py.typed} +0 -0
  164. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/entry_points.txt +0 -0
  165. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info/licenses}/LICENSE +0 -0
  166. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/top_level.txt +0 -0
rslearn/train/dataset.py CHANGED
@@ -1,30 +1,55 @@
1
1
  """Default Dataset for rslearn."""
2
2
 
3
+ import hashlib
4
+ import json
3
5
  import multiprocessing
4
6
  import os
5
7
  import random
8
+ import tempfile
6
9
  import time
10
+ import uuid
11
+ from datetime import datetime
7
12
  from typing import Any
8
13
 
9
14
  import torch
10
15
  import tqdm
16
+ from rasterio.warp import Resampling
11
17
 
12
18
  import rslearn.train.transforms.transform
13
19
  from rslearn.config import (
14
20
  DType,
15
- RasterFormatConfig,
16
- RasterLayerConfig,
17
- VectorLayerConfig,
21
+ LayerConfig,
18
22
  )
19
- from rslearn.dataset import Dataset, Window
20
- from rslearn.train.tasks import Task
21
- from rslearn.utils import logger
23
+ from rslearn.data_sources.data_source import Item
24
+ from rslearn.dataset.dataset import Dataset
25
+ from rslearn.dataset.storage.file import FileWindowStorage
26
+ from rslearn.dataset.window import (
27
+ Window,
28
+ WindowLayerData,
29
+ get_layer_and_group_from_dir_name,
30
+ )
31
+ from rslearn.log_utils import get_logger
32
+ from rslearn.train.model_context import RasterImage
33
+ from rslearn.utils.feature import Feature
34
+ from rslearn.utils.geometry import PixelBounds, ResolutionFactor
22
35
  from rslearn.utils.mp import star_imap_unordered
23
- from rslearn.utils.raster_format import load_raster_format
24
- from rslearn.utils.vector_format import load_vector_format
25
36
 
37
+ from .model_context import SampleMetadata
38
+ from .tasks import Task
26
39
  from .transforms import Sequential
27
40
 
41
+ logger = get_logger(__name__)
42
+
43
+
44
+ def get_torch_dtype(dtype: DType) -> torch.dtype:
45
+ """Convert rslearn DType to torch dtype."""
46
+ if dtype == DType.INT32:
47
+ return torch.int32
48
+ elif dtype == DType.FLOAT32:
49
+ return torch.float32
50
+ else:
51
+ raise ValueError(f"unable to handle {dtype} as a torch dtype")
52
+
28
53
 
29
54
  class SamplerFactory:
30
55
  """Factory to produce a Sampler.
@@ -47,7 +72,9 @@ class SamplerFactory:
47
72
  class RandomSamplerFactory(SamplerFactory):
48
73
  """A sampler factory for RandomSampler."""
49
74
 
50
- def __init__(self, replacement: bool = False, num_samples: int | None = None):
75
+ def __init__(
76
+ self, replacement: bool = False, num_samples: int | None = None
77
+ ) -> None:
51
78
  """Initialize a RandomSamplerFactory.
52
79
 
53
80
  Args:
@@ -75,7 +102,9 @@ class RandomSamplerFactory(SamplerFactory):
75
102
  class WeightedRandomSamplerFactory(SamplerFactory):
76
103
  """A sampler factory for WeightedRandomSampler."""
77
104
 
78
- def __init__(self, option_key: str, num_samples: int, replacement: bool = True):
105
+ def __init__(
106
+ self, option_key: str, num_samples: int, replacement: bool = True
107
+ ) -> None:
79
108
  """Initialize a WeightedRandomSamplerFactory.
80
109
 
81
110
  Args:
@@ -97,7 +126,7 @@ class WeightedRandomSamplerFactory(SamplerFactory):
97
126
  a RandomSampler
98
127
  """
99
128
  weights = []
100
- for window in dataset.get_windows():
129
+ for window in dataset.get_dataset_examples():
101
130
  weights.append(window.options[self.option_key])
102
131
  return torch.utils.data.WeightedRandomSampler(
103
132
  weights, self.num_samples, replacement=self.replacement
@@ -108,6 +137,10 @@ class DataInput:
108
137
  """Specification of a piece of data from a window that is needed for training.
109
138
 
110
139
  The DataInput includes which layer(s) the data can be obtained from for each window.
140
+
141
+ Note that this class is not a dataclass because jsonargparse does not play well
142
+ with dataclasses without enabling specialized options which we have not validated
143
+ will work with the rest of our code.
111
144
  """
112
145
 
113
146
  def __init__(
@@ -119,6 +152,10 @@ class DataInput:
119
152
  passthrough: bool = False,
120
153
  is_target: bool = False,
121
154
  dtype: DType = DType.FLOAT32,
155
+ load_all_layers: bool = False,
156
+ load_all_item_groups: bool = False,
157
+ resolution_factor: ResolutionFactor = ResolutionFactor(),
158
+ resampling: Resampling = Resampling.nearest,
122
159
  ):
123
160
  """Initialize a new DataInput.
124
161
 
@@ -132,6 +169,21 @@ class DataInput:
132
169
  is_target: whether this DataInput represents a target for the task. Targets
133
170
  are not read during prediction phase.
134
171
  dtype: data type to load the raster as
172
+ load_all_layers: whether to load all of the layers specified in the list of
173
+ layer names. By default, we randomly pick one layer to read. When
174
+ reading multiple layers, the images are stacked on the channel
175
+ dimension. This option will also cause the dataset to only include
176
+ windows where all of the layers are materialized (by default, only
177
+ windows with none of the layers materialized would be excluded).
178
+ load_all_item_groups: whether to load all item groups in the layer(s) we
179
+ are reading from. By default, we assume the specified layer name is of
180
+ the form "{layer_name}.{group_idx}" and read that item group only. With
181
+ this option enabled, we ignore the group_idx and read all item groups.
182
+ resolution_factor: controls the resolution at which raster data is loaded for training.
183
+ By default (factor=1), data is loaded at the window resolution.
184
+ E.g. for a 64x64 window at 10 m/pixel with resolution_factor=1/2,
185
+ the resulting tensor is 32x32 (covering the same geographic area at 20 m/pixel).
186
+ resampling: resampling method (default nearest neighbor).
135
187
  """
136
188
  self.data_type = data_type
137
189
  self.layers = layers
@@ -140,6 +192,241 @@ class DataInput:
140
192
  self.passthrough = passthrough
141
193
  self.is_target = is_target
142
194
  self.dtype = dtype
195
+ self.load_all_layers = load_all_layers
196
+ self.load_all_item_groups = load_all_item_groups
197
+ self.resolution_factor = resolution_factor
198
+ self.resampling = resampling
199
+
200
+
201
+ def read_raster_layer_for_data_input(
202
+ window: Window,
203
+ bounds: PixelBounds,
204
+ layer_name: str,
205
+ group_idx: int,
206
+ layer_config: LayerConfig,
207
+ data_input: DataInput,
208
+ ) -> torch.Tensor:
209
+ """Read a raster layer for a DataInput.
210
+
211
+ This scans the available rasters for the layer at the window to determine which
212
+ ones are needed to get all of the configured bands.
213
+
214
+ Args:
215
+ window: the window to read from.
216
+ bounds: the bounds to read.
217
+ layer_name: the layer.
218
+ group_idx: the item group.
219
+ layer_config: the layer configuration.
220
+ data_input: the DataInput that specifies the bands and dtype.
221
+
222
+ Returns:
223
+ Raster data as a tensor.
224
+ """
225
+ # See what different sets of bands we need to read to get all the
226
+ # configured bands.
227
+ needed_bands = data_input.bands
228
+ if needed_bands is None:
229
+ raise ValueError(f"No bands specified for {layer_name}")
230
+ needed_band_indexes = {}
231
+ for i, band in enumerate(needed_bands):
232
+ needed_band_indexes[band] = i
233
+ needed_sets_and_indexes = []
234
+ for band_set in layer_config.band_sets:
235
+ needed_src_indexes = []
236
+ needed_dst_indexes = []
237
+ if band_set.bands is None:
238
+ continue
239
+ for i, band in enumerate(band_set.bands):
240
+ if band not in needed_band_indexes:
241
+ continue
242
+ needed_src_indexes.append(i)
243
+ needed_dst_indexes.append(needed_band_indexes[band])
244
+ del needed_band_indexes[band]
245
+ if len(needed_src_indexes) == 0:
246
+ continue
247
+ needed_sets_and_indexes.append(
248
+ (band_set, needed_src_indexes, needed_dst_indexes)
249
+ )
250
+ if len(needed_band_indexes) > 0:
251
+ raise ValueError(
252
+ "could not get all the needed bands from "
253
+ + f"window {window.name} layer {layer_name} group {group_idx}"
254
+ )
255
+
256
+ # Get the projection and bounds to read under (multiply window resolution by
257
+ # the specified resolution factor).
258
+ final_projection = data_input.resolution_factor.multiply_projection(
259
+ window.projection
260
+ )
261
+ final_bounds = data_input.resolution_factor.multiply_bounds(bounds)
262
+
263
+ image = torch.zeros(
264
+ (
265
+ len(needed_bands),
266
+ final_bounds[3] - final_bounds[1],
267
+ final_bounds[2] - final_bounds[0],
268
+ ),
269
+ dtype=get_torch_dtype(data_input.dtype),
270
+ )
271
+
272
+ for band_set, src_indexes, dst_indexes in needed_sets_and_indexes:
273
+ if band_set.format is None:
274
+ raise ValueError(f"No format specified for {layer_name}")
275
+ raster_format = band_set.instantiate_raster_format()
276
+ raster_dir = window.get_raster_dir(
277
+ layer_name, band_set.bands, group_idx=group_idx
278
+ )
279
+
280
+ # TODO: previously we try to read based on band_set.zoom_offset when possible,
281
+ # and handle zooming in with torch.repeat (if resampling method is nearest
282
+ # neighbor). However, we have not benchmarked whether this actually improves
283
+ # data loading speed, so for simplicity, for now we let rasterio handle the
284
+ # resampling. If it really is much faster to handle it via torch, then it may
285
+ # make sense to bring back that functionality.
286
+
287
+ src = raster_format.decode_raster(
288
+ raster_dir, final_projection, final_bounds, resampling=Resampling.nearest
289
+ )
290
+ image[dst_indexes, :, :] = torch.as_tensor(
291
+ src[src_indexes, :, :].astype(data_input.dtype.get_numpy_dtype())
292
+ )
293
+
294
+ return image
295
+
296
+
297
+ def read_layer_time_range(
298
+ layer_data: WindowLayerData | None, group_idx: int
299
+ ) -> tuple[datetime, datetime] | None:
300
+ """Extract the combined time range from all items in a layer data group.
301
+
302
+ Returns the min start time and max end time across all items, or None if
303
+ no items have time ranges.
304
+
305
+ Raises:
306
+ ValueError: If some items have time_range and others don't.
307
+ """
308
+ if layer_data is None:
309
+ return None
310
+
311
+ serialized_items = layer_data.serialized_item_groups[group_idx]
312
+ if not serialized_items:
313
+ return None
314
+
315
+ first_item = Item.deserialize(serialized_items[0])
316
+ if first_item.geometry.time_range is None:
317
+ return None
318
+
319
+ # If the first item has a time_range, all items must have one
320
+ time_ranges: list[tuple[datetime, datetime]] = []
321
+ for serialized_item in serialized_items:
322
+ item = Item.deserialize(serialized_item)
323
+ if item.geometry.time_range is None:
324
+ raise ValueError(
325
+ f"Item '{item.name}' has no time_range, but first item does. "
326
+ "All items in a group must consistently have or lack time_range."
327
+ )
328
+ time_ranges.append(item.geometry.time_range)
329
+
330
+ return (
331
+ min(tr[0] for tr in time_ranges),
332
+ max(tr[1] for tr in time_ranges),
333
+ )
334
+
335
+
336
+ def read_data_input(
337
+ dataset: Dataset,
338
+ window: Window,
339
+ bounds: PixelBounds,
340
+ data_input: DataInput,
341
+ rng: random.Random,
342
+ ) -> RasterImage | list[Feature]:
343
+ """Read the data specified by the DataInput from the window.
344
+
345
+ Args:
346
+ dataset: the dataset, to get layer configs.
347
+ window: the window to read from.
348
+ bounds: the bounds of the patch we are reading.
349
+ data_input: the DataInput that specifies what layers to read.
350
+ rng: random number generator
351
+
352
+ Returns:
353
+ the raster or vector data.
354
+ """
355
+ # We first enumerate which layers are available.
356
+ # If load_all_item_groups is set, we need to check each item group within the
357
+ # layer.
358
+ layer_options: list[tuple[str, int]] = []
359
+ if data_input.load_all_item_groups:
360
+ wanted_layers = set(data_input.layers)
361
+ for layer_name, group_idx in window.list_completed_layers():
362
+ if layer_name not in wanted_layers:
363
+ continue
364
+ layer_options.append((layer_name, group_idx))
365
+ else:
366
+ for option in data_input.layers:
367
+ layer_name, group_idx = get_layer_and_group_from_dir_name(option)
368
+ if not window.is_layer_completed(layer_name, group_idx):
369
+ continue
370
+ layer_options.append((layer_name, group_idx))
371
+
372
+ # Now determine the layers that we should actually read.
373
+ # We randomly pick one, unless load_all_layers is set, in which case we read all of
374
+ # them.
375
+ layers_to_read: list[tuple[str, int]]
376
+ if data_input.load_all_layers:
377
+ # We assume that the user has ensured the layers are compatible, e.g. raster
378
+ # layers will need to have the same number of bands.
379
+ layers_to_read = layer_options
380
+ else:
381
+ layers_to_read = [rng.choice(layer_options)]
382
+
383
+ if data_input.data_type == "raster":
384
+ # load it once here
385
+ layer_datas = window.load_layer_datas()
386
+ images: list[torch.Tensor] = []
387
+ time_ranges: list[tuple[datetime, datetime] | None] = []
388
+ for layer_name, group_idx in layers_to_read:
389
+ layer_config = dataset.layers[layer_name]
390
+ image = read_raster_layer_for_data_input(
391
+ window,
392
+ bounds,
393
+ layer_name,
394
+ group_idx,
395
+ layer_config,
396
+ data_input,
397
+ )
398
+ # some layers (e.g. "label_raster") won't have associated layer datas
399
+ layer_data = layer_datas.get(layer_name)
400
+ time_range = read_layer_time_range(layer_data, group_idx)
401
+ if len(time_ranges) > 0:
402
+ if type(time_ranges[-1]) is not type(time_range):
403
+ raise ValueError(
404
+ f"All time ranges should be datetime tuples or None. Got {type(time_range)} and {type(time_ranges[-1])}"
405
+ )
406
+ images.append(image)
407
+ time_ranges.append(time_range)
408
+ return RasterImage(
409
+ torch.stack(images, dim=1),
410
+ time_ranges if time_ranges[0] is not None else None, # type: ignore
411
+ )
412
+
413
+ elif data_input.data_type == "vector":
414
+ # We don't really support time series for vector data currently, we just
415
+ # concatenate the features together.
416
+ features: list[Feature] = []
417
+ for layer_name, group_idx in layers_to_read:
418
+ layer_config = dataset.layers[layer_name]
419
+ vector_format = layer_config.instantiate_vector_format()
420
+ layer_dir = window.get_layer_dir(layer_name, group_idx=group_idx)
421
+ cur_features = vector_format.decode_vector(
422
+ layer_dir, window.projection, window.bounds
423
+ )
424
+ features.extend(cur_features)
425
+
426
+ return features
427
+
428
+ else:
429
+ raise ValueError(f"unknown data type {data_input.data_type}")
143
430
 
144
431
 
145
432
  class SplitConfig:
@@ -149,15 +436,16 @@ class SplitConfig:
149
436
  self,
150
437
  groups: list[str] | None = None,
151
438
  names: list[str] | None = None,
152
- tags: dict[str, str] | None = None,
439
+ tags: dict[str, Any] | None = None,
153
440
  num_samples: int | None = None,
441
+ num_patches: int | None = None,
154
442
  transforms: list[torch.nn.Module] | None = None,
155
443
  sampler: SamplerFactory | None = None,
156
444
  patch_size: int | tuple[int, int] | None = None,
157
445
  overlap_ratio: float | None = None,
158
446
  load_all_patches: bool | None = None,
159
447
  skip_targets: bool | None = None,
160
- ):
448
+ ) -> None:
161
449
  """Initialize a new SplitConfig.
162
450
 
163
451
  Args:
@@ -168,6 +456,7 @@ class SplitConfig:
168
456
  value. If value is empty, then only the existence of the key in the
169
457
  window options is checked.
170
458
  num_samples: limit this split to this many examples
459
+ num_patches: limit this split to this many patches
171
460
  transforms: transforms to apply
172
461
  sampler: SamplerFactory for this split
173
462
  patch_size: an optional square size or (width, height) tuple. If set, read
@@ -183,15 +472,19 @@ class SplitConfig:
183
472
  self.names = names
184
473
  self.tags = tags
185
474
  self.num_samples = num_samples
475
+ self.num_patches = num_patches
186
476
  self.transforms = transforms
187
477
  self.sampler = sampler
188
478
  self.patch_size = patch_size
189
- self.load_all_patches = load_all_patches
190
479
  self.skip_targets = skip_targets
480
+
481
+ # Note that load_all_patches are handled by the RslearnDataModule rather than
482
+ # the ModelDataset.
483
+ self.load_all_patches = load_all_patches
191
484
  self.overlap_ratio = overlap_ratio
192
- if self.overlap_ratio is not None:
193
- if not (0 < self.overlap_ratio < 1):
194
- raise ValueError("overlap_ratio must be between 0 and 1 (exclusive)")
485
+
486
+ if self.overlap_ratio is not None and not (0 < self.overlap_ratio < 1):
487
+ raise ValueError("overlap_ratio must be between 0 and 1 (exclusive)")
195
488
 
196
489
  def update(self, other: "SplitConfig") -> "SplitConfig":
197
490
  """Override settings in this SplitConfig with those in another.
@@ -204,6 +497,7 @@ class SplitConfig:
204
497
  names=self.names,
205
498
  tags=self.tags,
206
499
  num_samples=self.num_samples,
500
+ num_patches=self.num_patches,
207
501
  transforms=self.transforms,
208
502
  sampler=self.sampler,
209
503
  patch_size=self.patch_size,
@@ -219,6 +513,8 @@ class SplitConfig:
219
513
  result.tags = other.tags
220
514
  if other.num_samples:
221
515
  result.num_samples = other.num_samples
516
+ if other.num_patches:
517
+ result.num_patches = other.num_patches
222
518
  if other.transforms:
223
519
  result.transforms = other.transforms
224
520
  if other.sampler:
@@ -233,6 +529,18 @@ class SplitConfig:
233
529
  result.skip_targets = other.skip_targets
234
530
  return result
235
531
 
532
+ def get_patch_size(self) -> tuple[int, int] | None:
533
+ """Get patch size normalized to int tuple."""
534
+ if self.patch_size is None:
535
+ return None
536
+ if isinstance(self.patch_size, int):
537
+ return (self.patch_size, self.patch_size)
538
+ return self.patch_size
539
+
540
+ def get_overlap_ratio(self) -> float:
541
+ """Get the overlap ratio (default 0)."""
542
+ return self.overlap_ratio if self.overlap_ratio is not None else 0.0
543
+
236
544
  def get_load_all_patches(self) -> bool:
237
545
  """Returns whether loading all patches is enabled (default False)."""
238
546
  return True if self.load_all_patches is True else False
@@ -242,7 +550,7 @@ class SplitConfig:
242
550
  return True if self.skip_targets is True else False
243
551
 
244
552
 
245
- def check_window(inputs: dict[str, DataInput], window: Window) -> bool:
553
+ def check_window(inputs: dict[str, DataInput], window: Window) -> Window | None:
246
554
  """Verify that the window has the required layers based on the specified inputs.
247
555
 
248
556
  Args:
@@ -254,17 +562,25 @@ def check_window(inputs: dict[str, DataInput], window: Window) -> bool:
254
562
  """
255
563
 
256
564
  # Make sure window has all the needed layers.
257
- def is_any_layer_available(data_input):
565
+ def is_available(data_input: DataInput) -> bool:
566
+ # If load_all_layers is enabled, we should check that all the layers are
567
+ # present. Otherwise, we just need one layer.
568
+ is_any_layer_available = False
569
+ are_all_layers_available = True
258
570
  for layer_name in data_input.layers:
259
- completed_fname = window.path / "layers" / layer_name / "completed"
260
- if completed_fname.exists():
261
- return True
262
- return False
571
+ if window.is_layer_completed(layer_name):
572
+ is_any_layer_available = True
573
+ else:
574
+ are_all_layers_available = False
575
+ if data_input.load_all_layers:
576
+ return are_all_layers_available
577
+ else:
578
+ return is_any_layer_available
263
579
 
264
580
  for data_input in inputs.values():
265
581
  if not data_input.required:
266
582
  continue
267
- if not is_any_layer_available(data_input):
583
+ if not is_available(data_input):
268
584
  logger.debug(
269
585
  "Skipping window %s since check for layers %s failed",
270
586
  window.name,
@@ -285,7 +601,9 @@ class ModelDataset(torch.utils.data.Dataset):
285
601
  inputs: dict[str, DataInput],
286
602
  task: Task,
287
603
  workers: int,
288
- ):
604
+ name: str | None = None,
605
+ fix_patch_pick: bool = False,
606
+ ) -> None:
289
607
  """Instantiate a new ModelDataset.
290
608
 
291
609
  Args:
@@ -294,50 +612,30 @@ class ModelDataset(torch.utils.data.Dataset):
294
612
  inputs: data to read from the dataset for training
295
613
  task: the task to train on
296
614
  workers: number of workers to use for initializing the dataset
615
+ name: name of the dataset (default: None)
616
+ fix_patch_pick: if True, fix the patch pick to be the same every time
617
+ for a given window. Useful for testing (default: False)
297
618
  """
298
619
  self.dataset = dataset
299
620
  self.split_config = split_config
300
621
  self.inputs = inputs
301
622
  self.task = task
302
-
623
+ self.name = name
624
+ self.fix_patch_pick = fix_patch_pick
303
625
  if split_config.transforms:
304
626
  self.transforms = Sequential(*split_config.transforms)
305
627
  else:
306
628
  self.transforms = rslearn.train.transforms.transform.Identity()
307
629
 
308
- # Convert patch size to (width, height) format if needed.
309
- if not split_config.patch_size:
630
+ # Get normalized patch size from the SplitConfig.
631
+ # But if load all patches is enabled, this is handled by AllPatchesDataset, so
632
+ # here we instead load the entire windows.
633
+ if split_config.get_load_all_patches():
310
634
  self.patch_size = None
311
- elif isinstance(split_config.patch_size, int):
312
- self.patch_size = (split_config.patch_size, split_config.patch_size)
313
635
  else:
314
- self.patch_size = split_config.patch_size
636
+ self.patch_size = split_config.get_patch_size()
315
637
 
316
- if split_config.names:
317
- windows = self.dataset.load_windows(
318
- groups=split_config.groups,
319
- names=split_config.names,
320
- show_progress=True,
321
- workers=workers,
322
- )
323
- elif split_config.groups:
324
- windows = self.dataset.load_windows(
325
- groups=split_config.groups, show_progress=True, workers=workers
326
- )
327
- else:
328
- windows = self.dataset.load_windows(show_progress=True, workers=workers)
329
-
330
- if split_config.tags:
331
- # Filter the window.options.
332
- new_windows = []
333
- for window in windows:
334
- for k, v in split_config.tags.items():
335
- if k not in window.options:
336
- continue
337
- if v and window.options[k] != v:
338
- continue
339
- new_windows.append(window)
340
- windows = new_windows
638
+ windows = self._get_initial_windows(split_config, workers)
341
639
 
342
640
  # If targets are not needed, remove them from the inputs.
343
641
  if split_config.get_skip_targets():
@@ -347,98 +645,178 @@ class ModelDataset(torch.utils.data.Dataset):
347
645
 
348
646
  # Eliminate windows that are missing either a requisite input layer, or missing
349
647
  # all target layers.
350
- p = multiprocessing.Pool(workers)
351
- outputs = star_imap_unordered(
352
- p,
353
- check_window,
354
- [
355
- dict(
356
- inputs=self.inputs,
357
- window=window,
358
- )
359
- for window in windows
360
- ],
361
- )
362
648
  new_windows = []
363
- for window in tqdm.tqdm(
364
- outputs, total=len(windows), desc="Checking available layers in windows"
365
- ):
366
- if window is None:
367
- continue
368
- new_windows.append(window)
369
- p.close()
649
+ if workers == 0:
650
+ for window in windows:
651
+ if check_window(self.inputs, window) is None:
652
+ continue
653
+ new_windows.append(window)
654
+ else:
655
+ p = multiprocessing.Pool(workers)
656
+ outputs = star_imap_unordered(
657
+ p,
658
+ check_window,
659
+ [
660
+ dict(
661
+ inputs=self.inputs,
662
+ window=window,
663
+ )
664
+ for window in windows
665
+ ],
666
+ )
667
+ for window in tqdm.tqdm(
668
+ outputs, total=len(windows), desc="Checking available layers in windows"
669
+ ):
670
+ if window is None:
671
+ continue
672
+ new_windows.append(window)
673
+ p.close()
370
674
  windows = new_windows
371
675
 
676
+ # Sort the windows to ensure that the dataset is consistent across GPUs.
677
+ # Inconsistent ordering can lead to a subset of windows being processed during
678
+ # "model test" / "model predict" when using multiple GPUs.
679
+ # We use a hash so that functionality like num_samples limit gets a random
680
+ # subset of windows (with respect to the hash function choice).
681
+ windows.sort(
682
+ key=lambda window: hashlib.sha256(window.name.encode()).hexdigest()
683
+ )
684
+
372
685
  # Limit windows to num_samples if requested.
373
686
  if split_config.num_samples:
374
- # TODO: use hash of window names so this is deterministic and not arbitrarily ordered according to load_windows
687
+ # The windows are sorted by hash of window name so this distribution should
688
+ # be representative of the population.
375
689
  windows = windows[0 : split_config.num_samples]
376
690
 
377
- self.windows = windows
691
+ # Write dataset_examples to a file so that we can load it lazily in the worker
692
+ # processes. Otherwise it takes a long time to transmit it when spawning each
693
+ # process.
694
+ self.dataset_examples_fname = os.path.join(
695
+ tempfile.gettempdir(),
696
+ "rslearn_dataset_examples",
697
+ f"{os.getpid()}_{uuid.uuid4()}.json",
698
+ )
699
+ self.num_dataset_examples = len(windows)
700
+ self.dataset_examples: list[Window] | None = None
701
+ logger.info(
702
+ f"Writing {len(windows)} dataset examples to {self.dataset_examples_fname}"
703
+ )
704
+ os.makedirs(os.path.dirname(self.dataset_examples_fname), exist_ok=True)
705
+ with open(self.dataset_examples_fname, "w") as f:
706
+ json.dump([self._serialize_item(example) for example in windows], f)
378
707
 
379
- # If we're loading all patches, we need to include the patch details.
380
- if split_config.get_load_all_patches():
381
- patches = []
382
- overlap_size = int(
383
- self.patch_size[0] * split_config.overlap_ratio
384
- if split_config.overlap_ratio
385
- else 0
708
+ def _get_initial_windows(
709
+ self, split_config: SplitConfig, workers: int
710
+ ) -> list[Window]:
711
+ """Get the initial windows before input layer filtering.
712
+
713
+ The windows are filtered based on configured window names, groups, and tags.
714
+
715
+ This is a helper for the init function.
716
+
717
+ Args:
718
+ split_config: the split configuration.
719
+ workers: number of worker processes.
720
+
721
+ Returns:
722
+ list of windows from the dataset after applying the aforementioned filters.
723
+ """
724
+ # Load windows from dataset.
725
+ # If the window storage is FileWindowStorage, we pass the workers/show_progress arguments.
726
+ kwargs: dict[str, Any] = {}
727
+ if isinstance(self.dataset.storage, FileWindowStorage):
728
+ kwargs["workers"] = workers
729
+ kwargs["show_progress"] = True
730
+ # We also add the name/group filters to the kwargs.
731
+ if split_config.names:
732
+ kwargs["names"] = split_config.names
733
+ if split_config.groups:
734
+ kwargs["groups"] = split_config.groups
735
+
736
+ windows = self.dataset.load_windows(**kwargs)
737
+
738
+ # Filter by tags (if provided) using the window.options.
739
+ if split_config.tags:
740
+ new_windows = []
741
+ num_removed: dict[str, int] = {}
742
+ for window in windows:
743
+ for k, v in split_config.tags.items():
744
+ if k not in window.options or (v and window.options[k] != v):
745
+ num_removed[k] = num_removed.get(k, 0) + 1
746
+ break
747
+ else:
748
+ new_windows.append(window)
749
+ logger.info(
750
+ f"Started with {len(windows)} windows, ended with {len(new_windows)} windows for {self.dataset.path}"
751
+ )
752
+ for k, v in num_removed.items():
753
+ logger.info(f"Removed {v} windows due to tag {k}")
754
+ windows = new_windows
755
+
756
+ return windows
757
+
758
+ def _serialize_item(self, example: Window) -> dict[str, Any]:
759
+ return example.get_metadata()
760
+
761
+ def _deserialize_item(self, d: dict[str, Any]) -> Window:
762
+ return Window.from_metadata(
763
+ self.dataset.storage,
764
+ d,
765
+ )
766
+
767
+ def get_dataset_examples(self) -> list[Window]:
768
+ """Get a list of examples in the dataset.
769
+
770
+ If load_all_patches is False, this is a list of Windows. Otherwise, this is a
771
+ list of (window, patch_bounds, (patch_idx, # patches)) tuples.
772
+ """
773
+ if self.dataset_examples is None:
774
+ logger.debug(
775
+ f"Loading dataset examples from {self.dataset_examples_fname} in process {os.getpid()}"
386
776
  )
387
- for window in self.windows:
388
- cur_patches = []
389
- for col in range(
390
- window.bounds[0],
391
- window.bounds[2],
392
- self.patch_size[0] - overlap_size,
393
- ):
394
- for row in range(
395
- window.bounds[1],
396
- window.bounds[3],
397
- self.patch_size[1] - overlap_size,
398
- ):
399
- cur_patches.append(
400
- (
401
- col,
402
- row,
403
- col + self.patch_size[0],
404
- row + self.patch_size[1],
405
- )
406
- )
407
- for i, patch_bounds in enumerate(cur_patches):
408
- patches.append((window, patch_bounds, (i, len(cur_patches))))
409
- self.windows = patches
777
+ with open(self.dataset_examples_fname) as f:
778
+ self.dataset_examples = [
779
+ self._deserialize_item(d) for d in json.load(f)
780
+ ]
781
+ logger.debug(f"Finished loading dataset examples in process {os.getpid()}")
782
+ return self.dataset_examples
410
783
 
411
784
  def __len__(self) -> int:
412
785
  """Returns the dataset length."""
413
- return len(self.windows)
786
+ return self.num_dataset_examples
414
787
 
415
- def __getitem__(self, idx) -> tuple[dict[str, Any], dict[str, Any]]:
416
- """Read one training example.
788
+ def get_raw_inputs(
789
+ self, idx: int
790
+ ) -> tuple[dict[str, Any], dict[str, Any], SampleMetadata]:
791
+ """Get the raw inputs and base metadata for this example.
792
+
793
+ This is the raster or vector data before being processed by the Task. So it
794
+ should be a Tensor for raster and list[Feature] for vector.
417
795
 
418
796
  Args:
419
797
  idx: the index in the dataset.
420
798
 
421
799
  Returns:
422
- a tuple (input_dict, target_dict)
800
+ a tuple (raw_inputs, passthrough_inputs, metadata).
423
801
  """
424
- logger.debug("__getitem__ start pid=%d item_idx=%d", os.getpid(), idx)
425
- window = self.windows[idx]
802
+ dataset_examples = self.get_dataset_examples()
803
+ example = dataset_examples[idx]
804
+ rng = random.Random(idx if self.fix_patch_pick else None)
426
805
 
427
806
  # Select bounds to read.
428
- if self.split_config.get_load_all_patches():
429
- window, bounds, (patch_idx, num_patches) = window
430
- elif self.patch_size:
807
+ if self.patch_size:
808
+ window = example
431
809
 
432
- def get_patch_range(n_patch, n_window):
810
+ def get_patch_range(n_patch: int, n_window: int) -> list[int]:
433
811
  if n_patch > n_window:
434
812
  # Select arbitrary range containing the entire window.
435
813
  # Basically arbitrarily padding the window to get to patch size.
436
- start = random.randint(n_window - n_patch, 0)
814
+ start = rng.randint(n_window - n_patch, 0)
437
815
  return [start, start + n_patch]
438
816
 
439
817
  else:
440
818
  # Select arbitrary patch within the window.
441
- start = random.randint(0, n_window - n_patch)
819
+ start = rng.randint(0, n_window - n_patch)
442
820
  return [start, start + n_patch]
443
821
 
444
822
  window_size = (
@@ -449,128 +827,56 @@ class ModelDataset(torch.utils.data.Dataset):
449
827
  get_patch_range(self.patch_size[0], window_size[0]),
450
828
  get_patch_range(self.patch_size[1], window_size[1]),
451
829
  ]
452
- bounds = [
830
+ bounds = (
453
831
  window.bounds[0] + patch_ranges[0][0],
454
832
  window.bounds[1] + patch_ranges[1][0],
455
833
  window.bounds[0] + patch_ranges[0][1],
456
834
  window.bounds[1] + patch_ranges[1][1],
457
- ]
835
+ )
836
+
458
837
  else:
838
+ window = example
459
839
  bounds = window.bounds
460
840
 
461
- # Read the inputs and targets.
462
- def read_input(data_input: DataInput):
463
- # First enumerate all options of individual layers to read.
464
- layer_options = []
465
- for layer_name in data_input.layers:
466
- completed_fname = window.path / "layers" / layer_name / "completed"
467
- if not completed_fname.exists():
468
- continue
469
- layer_options.append(layer_name)
470
-
471
- # For now we just randomly pick one option.
472
- # In the future we need to support different configuration for how to pick
473
- # the options, as well as picking multiple for series inputs.
474
- layer = random.choice(layer_options)
475
- layer_dir = window.path / "layers" / layer
476
- layer_config = self.dataset.layers[layer]
477
-
478
- if data_input.data_type == "raster":
479
- assert isinstance(layer_config, RasterLayerConfig)
480
-
481
- # See what different sets of bands we need to read to get all the
482
- # configured bands.
483
- needed_bands = data_input.bands
484
- needed_band_indexes = {}
485
- for i, band in enumerate(needed_bands):
486
- needed_band_indexes[band] = i
487
- needed_sets_and_indexes = []
488
- for band_set in layer_config.band_sets:
489
- needed_src_indexes = []
490
- needed_dst_indexes = []
491
- for i, band in enumerate(band_set.bands):
492
- if band not in needed_band_indexes:
493
- continue
494
- needed_src_indexes.append(i)
495
- needed_dst_indexes.append(needed_band_indexes[band])
496
- del needed_band_indexes[band]
497
- if len(needed_src_indexes) == 0:
498
- continue
499
- needed_sets_and_indexes.append(
500
- (band_set, needed_src_indexes, needed_dst_indexes)
501
- )
502
- if len(needed_band_indexes) > 0:
503
- raise Exception(
504
- "could not get all the needed bands from "
505
- + f"window {window.name} layer {layer}"
506
- )
507
-
508
- image = torch.zeros(
509
- (len(needed_bands), bounds[3] - bounds[1], bounds[2] - bounds[0]),
510
- dtype=data_input.dtype.get_torch_dtype(),
511
- )
512
-
513
- for band_set, src_indexes, dst_indexes in needed_sets_and_indexes:
514
- _, final_bounds = band_set.get_final_projection_and_bounds(
515
- window.projection, bounds
516
- )
517
- raster_format = load_raster_format(
518
- RasterFormatConfig(band_set.format["name"], band_set.format)
519
- )
520
- cur_path = layer_dir / "_".join(band_set.bands)
521
- src = raster_format.decode_raster(cur_path, final_bounds)
522
-
523
- # Resize to patch size if needed.
524
- # This is for band sets that are stored at a lower resolution.
525
- # Here we assume that it is a multiple.
526
- if src.shape[1:3] != image.shape[1:3]:
527
- if src.shape[1] < image.shape[1]:
528
- factor = image.shape[1] // src.shape[1]
529
- src = src.repeat(repeats=factor, axis=1).repeat(
530
- repeats=factor, axis=2
531
- )
532
- else:
533
- factor = src.shape[1] // image.shape[1]
534
- src = src[:, ::factor, ::factor]
535
-
536
- image[dst_indexes, :, :] = torch.as_tensor(
537
- src[src_indexes, :, :].astype(
538
- data_input.dtype.get_numpy_dtype()
539
- )
540
- )
541
-
542
- return image
543
-
544
- elif data_input.data_type == "vector":
545
- assert isinstance(layer_config, VectorLayerConfig)
546
- vector_format = load_vector_format(layer_config.format)
547
- features = vector_format.decode_vector(layer_dir, bounds)
548
- return features
549
-
550
- else:
551
- raise Exception(f"unknown data type {data_input.data_type}")
841
+ assert isinstance(window, Window)
552
842
 
553
843
  raw_inputs = {}
554
844
  passthrough_inputs = {}
555
845
  for name, data_input in self.inputs.items():
556
- raw_inputs[name] = read_input(data_input)
846
+ raw_inputs[name] = read_data_input(
847
+ self.dataset, window, bounds, data_input, rng
848
+ )
557
849
  if data_input.passthrough:
558
850
  passthrough_inputs[name] = raw_inputs[name]
559
851
 
560
- metadata = {
561
- "group": window.group,
562
- "window_name": window.name,
563
- "window_bounds": window.bounds,
564
- "bounds": bounds,
565
- "time_range": window.time_range,
566
- "projection": window.projection,
567
- }
568
- if self.split_config.get_load_all_patches():
569
- metadata["patch_idx"] = patch_idx
570
- metadata["num_patches"] = num_patches
571
- else:
572
- metadata["patch_idx"] = 0
573
- metadata["num_patches"] = 1
852
+ metadata = SampleMetadata(
853
+ window_group=window.group,
854
+ window_name=window.name,
855
+ window_bounds=window.bounds,
856
+ patch_bounds=bounds,
857
+ patch_idx=0,
858
+ num_patches_in_window=1,
859
+ time_range=window.time_range,
860
+ projection=window.projection,
861
+ dataset_source=self.name,
862
+ )
863
+
864
+ return raw_inputs, passthrough_inputs, metadata
865
+
866
+ def __getitem__(
867
+ self, idx: int
868
+ ) -> tuple[dict[str, Any], dict[str, Any], SampleMetadata]:
869
+ """Read one training example.
870
+
871
+ Args:
872
+ idx: the index in the dataset.
873
+
874
+ Returns:
875
+ a tuple (input_dict, target_dict, metadata)
876
+ """
877
+ logger.debug("__getitem__ start pid=%d item_idx=%d", os.getpid(), idx)
878
+
879
+ raw_inputs, passthrough_inputs, metadata = self.get_raw_inputs(idx)
574
880
 
575
881
  input_dict, target_dict = self.task.process_inputs(
576
882
  raw_inputs,
@@ -584,17 +890,21 @@ class ModelDataset(torch.utils.data.Dataset):
584
890
 
585
891
  return input_dict, target_dict, metadata
586
892
 
587
- def get_windows(self) -> list[Window]:
588
- """Returns a list of windows in this dataset."""
589
- return self.windows
893
+ def set_name(self, name: str) -> None:
894
+ """Set the name of the dataset.
895
+
896
+ Args:
897
+ name: the name to set.
898
+ """
899
+ self.name = name
590
900
 
591
901
 
592
902
  class RetryDataset(torch.utils.data.Dataset):
593
903
  """A dataset wrapper that retries getitem upon encountering error."""
594
904
 
595
905
  def __init__(
596
- self, dataset: torch.utils.data.Dataset, retries: int = 3, delay: float = 5
597
- ):
906
+ self, dataset: ModelDataset, retries: int = 3, delay: float = 5
907
+ ) -> None:
598
908
  """Create a new RetryDataset.
599
909
 
600
910
  Args:
@@ -606,7 +916,15 @@ class RetryDataset(torch.utils.data.Dataset):
606
916
  self.retries = retries
607
917
  self.delay = delay
608
918
 
609
- def __len__(self):
919
+ def set_name(self, name: str) -> None:
920
+ """Set the name of the dataset.
921
+
922
+ Args:
923
+ name: the name to set.
924
+ """
925
+ self.dataset.set_name(name)
926
+
927
+ def __len__(self) -> int:
610
928
  """Return length of the dataset."""
611
929
  return len(self.dataset)
612
930
 
@@ -632,6 +950,41 @@ class RetryDataset(torch.utils.data.Dataset):
632
950
  # One last try -- but don't catch any more errors.
633
951
  return self.dataset[idx]
634
952
 
635
- def get_windows(self) -> list[Window]:
953
+ def get_dataset_examples(self) -> list[Window]:
636
954
  """Returns a list of windows in this dataset."""
637
- return self.dataset.get_windows()
955
+ return self.dataset.get_dataset_examples()
956
+
957
+
958
class MultiDataset(torch.utils.data.Dataset):
    """A dataset that combines multiple datasets.

    Examples are indexed by concatenating the member datasets in insertion
    order: global indices [0, len(first)) map to the first dataset, and so on.
    Member dataset lengths are assumed fixed after construction (the index
    buckets are computed once here).
    """

    def __init__(self, datasets: dict[str, "RetryDataset"]) -> None:
        """Create a new MultiDataset.

        Args:
            datasets: map of dataset name to dataset.
        """
        self.datasets = datasets
        # Map each dataset name to the half-open range of global indices that
        # it owns.
        self.buckets: dict[str, range] = {}
        curr_offset = 0
        for name, ds in datasets.items():
            # Hoist len(ds) so it is computed once per dataset.
            size = len(ds)
            self.buckets[name] = range(curr_offset, curr_offset + size)
            curr_offset += size
        # The total size is fixed at construction (the buckets above already
        # assume it), so cache it rather than re-summing member lengths on
        # every __len__ call.
        self._total_len = curr_offset

    def __len__(self) -> int:
        """Return length of the dataset."""
        return self._total_len

    def __getitem__(self, idx: int) -> Any:
        """Get item from the dataset.

        Args:
            idx: the item index.

        Returns:
            the item data.

        Raises:
            IndexError: if idx is outside [0, len(self)).
        """
        # range membership tests are O(1), so this scan is linear only in the
        # number of member datasets.
        for name, bucket in self.buckets.items():
            if idx in bucket:
                return self.datasets[name][idx - bucket.start]
        raise IndexError(f"Index {idx} out of range (len={len(self)})")