rslearn 0.0.25__py3-none-any.whl → 0.0.27__py3-none-any.whl

This diff compares the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their public registry.
Files changed (72)
  1. rslearn/config/dataset.py +30 -23
  2. rslearn/data_sources/__init__.py +2 -0
  3. rslearn/data_sources/aws_landsat.py +44 -161
  4. rslearn/data_sources/aws_open_data.py +2 -4
  5. rslearn/data_sources/aws_sentinel1.py +1 -3
  6. rslearn/data_sources/aws_sentinel2_element84.py +54 -165
  7. rslearn/data_sources/climate_data_store.py +1 -3
  8. rslearn/data_sources/copernicus.py +1 -2
  9. rslearn/data_sources/data_source.py +1 -1
  10. rslearn/data_sources/direct_materialize_data_source.py +336 -0
  11. rslearn/data_sources/earthdaily.py +52 -155
  12. rslearn/data_sources/earthdatahub.py +425 -0
  13. rslearn/data_sources/eurocrops.py +1 -2
  14. rslearn/data_sources/gcp_public_data.py +1 -2
  15. rslearn/data_sources/google_earth_engine.py +1 -2
  16. rslearn/data_sources/hf_srtm.py +595 -0
  17. rslearn/data_sources/local_files.py +3 -3
  18. rslearn/data_sources/openstreetmap.py +1 -1
  19. rslearn/data_sources/planet.py +1 -2
  20. rslearn/data_sources/planet_basemap.py +1 -2
  21. rslearn/data_sources/planetary_computer.py +183 -186
  22. rslearn/data_sources/soilgrids.py +3 -3
  23. rslearn/data_sources/stac.py +1 -2
  24. rslearn/data_sources/usda_cdl.py +1 -3
  25. rslearn/data_sources/usgs_landsat.py +7 -254
  26. rslearn/data_sources/utils.py +204 -64
  27. rslearn/data_sources/worldcereal.py +1 -1
  28. rslearn/data_sources/worldcover.py +1 -1
  29. rslearn/data_sources/worldpop.py +1 -1
  30. rslearn/data_sources/xyz_tiles.py +5 -9
  31. rslearn/dataset/materialize.py +5 -1
  32. rslearn/models/clay/clay.py +3 -3
  33. rslearn/models/concatenate_features.py +6 -1
  34. rslearn/models/detr/detr.py +4 -1
  35. rslearn/models/dinov3.py +0 -1
  36. rslearn/models/olmoearth_pretrain/model.py +3 -1
  37. rslearn/models/pooling_decoder.py +1 -1
  38. rslearn/models/prithvi.py +0 -1
  39. rslearn/models/simple_time_series.py +97 -35
  40. rslearn/train/{all_patches_dataset.py → all_crops_dataset.py} +120 -117
  41. rslearn/train/data_module.py +32 -27
  42. rslearn/train/dataset.py +260 -117
  43. rslearn/train/dataset_index.py +156 -0
  44. rslearn/train/lightning_module.py +1 -1
  45. rslearn/train/model_context.py +19 -3
  46. rslearn/train/prediction_writer.py +69 -41
  47. rslearn/train/tasks/classification.py +1 -1
  48. rslearn/train/tasks/detection.py +5 -5
  49. rslearn/train/tasks/per_pixel_regression.py +13 -13
  50. rslearn/train/tasks/regression.py +1 -1
  51. rslearn/train/tasks/segmentation.py +26 -13
  52. rslearn/train/transforms/concatenate.py +17 -27
  53. rslearn/train/transforms/crop.py +8 -19
  54. rslearn/train/transforms/flip.py +4 -10
  55. rslearn/train/transforms/mask.py +9 -15
  56. rslearn/train/transforms/normalize.py +31 -82
  57. rslearn/train/transforms/pad.py +7 -13
  58. rslearn/train/transforms/resize.py +5 -22
  59. rslearn/train/transforms/select_bands.py +16 -36
  60. rslearn/train/transforms/sentinel1.py +4 -16
  61. rslearn/utils/__init__.py +2 -0
  62. rslearn/utils/geometry.py +21 -0
  63. rslearn/utils/m2m_api.py +251 -0
  64. rslearn/utils/retry_session.py +43 -0
  65. {rslearn-0.0.25.dist-info → rslearn-0.0.27.dist-info}/METADATA +6 -3
  66. {rslearn-0.0.25.dist-info → rslearn-0.0.27.dist-info}/RECORD +71 -66
  67. rslearn/data_sources/earthdata_srtm.py +0 -282
  68. {rslearn-0.0.25.dist-info → rslearn-0.0.27.dist-info}/WHEEL +0 -0
  69. {rslearn-0.0.25.dist-info → rslearn-0.0.27.dist-info}/entry_points.txt +0 -0
  70. {rslearn-0.0.25.dist-info → rslearn-0.0.27.dist-info}/licenses/LICENSE +0 -0
  71. {rslearn-0.0.25.dist-info → rslearn-0.0.27.dist-info}/licenses/NOTICE +0 -0
  72. {rslearn-0.0.25.dist-info → rslearn-0.0.27.dist-info}/top_level.txt +0 -0
rslearn/train/dataset.py CHANGED
@@ -8,7 +8,9 @@ import random
 import tempfile
 import time
 import uuid
+import warnings
 from datetime import datetime
+from enum import StrEnum
 from typing import Any

 import torch
@@ -29,6 +31,7 @@ from rslearn.dataset.window import (
     get_layer_and_group_from_dir_name,
 )
 from rslearn.log_utils import get_logger
+from rslearn.train.dataset_index import DatasetIndex
 from rslearn.train.model_context import RasterImage
 from rslearn.utils.feature import Feature
 from rslearn.utils.geometry import PixelBounds, ResolutionFactor
@@ -41,6 +44,19 @@ from .transforms import Sequential
 logger = get_logger(__name__)


+class IndexMode(StrEnum):
+    """Controls dataset index caching behavior."""
+
+    OFF = "off"
+    """No caching - always load windows from dataset."""
+
+    USE = "use"
+    """Use cached index if available, create if not."""
+
+    REFRESH = "refresh"
+    """Ignore existing cache and rebuild."""
+
+
 def get_torch_dtype(dtype: DType) -> torch.dtype:
     """Convert rslearn DType to torch dtype."""
     if dtype == DType.INT32:
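Since IndexMode subclasses StrEnum (Python 3.11+), each member is interchangeable with its string value, so YAML or CLI configs can pass plain "off"/"use"/"refresh" strings to the new index_mode parameter of ModelDataset (added further below). A minimal sketch of that round-trip, using nothing beyond the enum itself:

```python
from rslearn.train.dataset import IndexMode

# A config file can supply the raw string; it parses to the enum member...
mode = IndexMode("use")
assert mode is IndexMode.USE

# ...and, being a StrEnum, the member still behaves as a plain string.
assert mode == "use"
assert f"index_mode={mode}" == "index_mode=use"
```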
@@ -441,11 +457,15 @@ class SplitConfig:
         num_patches: int | None = None,
         transforms: list[torch.nn.Module] | None = None,
         sampler: SamplerFactory | None = None,
+        crop_size: int | tuple[int, int] | None = None,
+        overlap_pixels: int | None = None,
+        load_all_crops: bool | None = None,
+        skip_targets: bool | None = None,
+        output_layer_name_skip_inference_if_exists: str | None = None,
+        # Deprecated parameters (for backwards compatibility)
         patch_size: int | tuple[int, int] | None = None,
         overlap_ratio: float | None = None,
         load_all_patches: bool | None = None,
-        skip_targets: bool | None = None,
-        output_layer_name_skip_inference_if_exists: str | None = None,
     ) -> None:
         """Initialize a new SplitConfig.

@@ -460,19 +480,69 @@ class SplitConfig:
             num_patches: limit this split to this many patches
             transforms: transforms to apply
             sampler: SamplerFactory for this split
-            patch_size: an optional square size or (width, height) tuple. If set, read
+            crop_size: an optional square size or (width, height) tuple. If set, read
                 crops of this size rather than entire windows.
-            overlap_ratio: an optional float between 0 and 1. If set, read patches with
-                this ratio of overlap.
-            load_all_patches: with patch_size set, rather than sampling a random patch
-                for each window, read all patches as separate sequential items in the
+            overlap_pixels: the number of pixels shared between adjacent crops during
+                sliding window inference.
+            load_all_crops: with crop_size set, rather than sampling a random crop
+                for each window, read all crops as separate sequential items in the
                 dataset.
             skip_targets: whether to skip targets when loading inputs
             output_layer_name_skip_inference_if_exists: optional name of the output layer used during prediction.
                 If set, windows that already
                 have this layer completed will be skipped (useful for resuming
                 partial inference runs).
+            patch_size: deprecated, use crop_size instead
+            overlap_ratio: deprecated, use overlap_pixels instead
+            load_all_patches: deprecated, use load_all_crops instead
         """
+        # Handle deprecated load_all_patches parameter
+        if load_all_patches is not None:
+            warnings.warn(
+                "load_all_patches is deprecated, use load_all_crops instead",
+                FutureWarning,
+                stacklevel=2,
+            )
+            if load_all_crops is not None:
+                raise ValueError(
+                    "Cannot specify both load_all_patches and load_all_crops"
+                )
+            load_all_crops = load_all_patches
+        # Handle deprecated patch_size parameter
+        if patch_size is not None:
+            warnings.warn(
+                "patch_size is deprecated, use crop_size instead",
+                FutureWarning,
+                stacklevel=2,
+            )
+            if crop_size is not None:
+                raise ValueError("Cannot specify both patch_size and crop_size")
+            crop_size = patch_size
+
+        # Normalize crop_size to tuple[int, int] | None
+        self.crop_size: tuple[int, int] | None = None
+        if crop_size is not None:
+            if isinstance(crop_size, int):
+                self.crop_size = (crop_size, crop_size)
+            else:
+                self.crop_size = crop_size
+
+        # Handle deprecated overlap_ratio parameter
+        if overlap_ratio is not None:
+            warnings.warn(
+                "overlap_ratio is deprecated, use overlap_pixels instead",
+                FutureWarning,
+                stacklevel=2,
+            )
+            if overlap_pixels is not None:
+                raise ValueError("Cannot specify both overlap_ratio and overlap_pixels")
+            if self.crop_size is None:
+                raise ValueError("overlap_ratio requires crop_size to be set")
+            overlap_pixels = round(self.crop_size[0] * overlap_ratio)
+
+        if overlap_pixels is not None and overlap_pixels < 0:
+            raise ValueError("overlap_pixels must be non-negative")
+
         self.groups = groups
         self.names = names
         self.tags = tags
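The net effect of the shims above: the legacy spellings still work but are rewritten to the new ones with a FutureWarning, mixing old and new spellings of the same setting raises ValueError, and overlap_ratio is converted to pixels via round(crop_size[0] * overlap_ratio). A sketch of the expected caller-visible behavior (assuming the remaining constructor arguments keep their defaults):

```python
import warnings

from rslearn.train.dataset import SplitConfig

# New-style configuration: 256x256 crops with 32 overlapping pixels.
cfg = SplitConfig(crop_size=256, overlap_pixels=32)
assert cfg.crop_size == (256, 256)

# Legacy configuration still works but warns, and the ratio is converted:
# round(128 * 0.25) == 32 overlap pixels.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    legacy = SplitConfig(patch_size=128, overlap_ratio=0.25)
assert legacy.crop_size == (128, 128)
assert legacy.get_overlap_pixels() == 32
assert any(issubclass(w.category, FutureWarning) for w in caught)
```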
@@ -480,19 +550,15 @@ class SplitConfig:
         self.num_patches = num_patches
         self.transforms = transforms
         self.sampler = sampler
-        self.patch_size = patch_size
         self.skip_targets = skip_targets
         self.output_layer_name_skip_inference_if_exists = (
             output_layer_name_skip_inference_if_exists
         )

-        # Note that load_all_patches are handled by the RslearnDataModule rather than
-        # the ModelDataset.
-        self.load_all_patches = load_all_patches
-        self.overlap_ratio = overlap_ratio
-
-        if self.overlap_ratio is not None and not (0 < self.overlap_ratio < 1):
-            raise ValueError("overlap_ratio must be between 0 and 1 (exclusive)")
+        # Note that load_all_crops is handled by the RslearnDataModule rather than the
+        # ModelDataset.
+        self.load_all_crops = load_all_crops
+        self.overlap_pixels = overlap_pixels

     def update(self, other: "SplitConfig") -> "SplitConfig":
         """Override settings in this SplitConfig with those in another.
@@ -508,9 +574,9 @@ class SplitConfig:
             num_patches=self.num_patches,
             transforms=self.transforms,
             sampler=self.sampler,
-            patch_size=self.patch_size,
-            overlap_ratio=self.overlap_ratio,
-            load_all_patches=self.load_all_patches,
+            crop_size=self.crop_size,
+            overlap_pixels=self.overlap_pixels,
+            load_all_crops=self.load_all_crops,
             skip_targets=self.skip_targets,
             output_layer_name_skip_inference_if_exists=self.output_layer_name_skip_inference_if_exists,
         )
@@ -528,12 +594,12 @@ class SplitConfig:
             result.transforms = other.transforms
         if other.sampler:
             result.sampler = other.sampler
-        if other.patch_size:
-            result.patch_size = other.patch_size
-        if other.overlap_ratio is not None:
-            result.overlap_ratio = other.overlap_ratio
-        if other.load_all_patches is not None:
-            result.load_all_patches = other.load_all_patches
+        if other.crop_size:
+            result.crop_size = other.crop_size
+        if other.overlap_pixels is not None:
+            result.overlap_pixels = other.overlap_pixels
+        if other.load_all_crops is not None:
+            result.load_all_crops = other.load_all_crops
         if other.skip_targets is not None:
             result.skip_targets = other.skip_targets
         if other.output_layer_name_skip_inference_if_exists is not None:
@@ -542,21 +608,17 @@ class SplitConfig:
             )
         return result

-    def get_patch_size(self) -> tuple[int, int] | None:
-        """Get patch size normalized to int tuple."""
-        if self.patch_size is None:
-            return None
-        if isinstance(self.patch_size, int):
-            return (self.patch_size, self.patch_size)
-        return self.patch_size
+    def get_crop_size(self) -> tuple[int, int] | None:
+        """Get crop size as tuple."""
+        return self.crop_size

-    def get_overlap_ratio(self) -> float:
-        """Get the overlap ratio (default 0)."""
-        return self.overlap_ratio if self.overlap_ratio is not None else 0.0
+    def get_overlap_pixels(self) -> int:
+        """Get the overlap pixels (default 0)."""
+        return self.overlap_pixels if self.overlap_pixels is not None else 0

-    def get_load_all_patches(self) -> bool:
+    def get_load_all_crops(self) -> bool:
         """Returns whether loading all patches is enabled (default False)."""
-        return True if self.load_all_patches is True else False
+        return True if self.load_all_crops is True else False

     def get_skip_targets(self) -> bool:
         """Returns whether skip_targets is enabled (default False)."""
@@ -636,6 +698,7 @@ class ModelDataset(torch.utils.data.Dataset):
         workers: int,
         name: str | None = None,
         fix_patch_pick: bool = False,
+        index_mode: IndexMode = IndexMode.OFF,
     ) -> None:
         """Instantiate a new ModelDataset.

@@ -645,9 +708,10 @@ class ModelDataset(torch.utils.data.Dataset):
             inputs: data to read from the dataset for training
             task: the task to train on
             workers: number of workers to use for initializing the dataset
-            name: name of the dataset (default: None)
+            name: name of the dataset
             fix_patch_pick: if True, fix the patch pick to be the same every time
                 for a given window. Useful for testing (default: False)
+            index_mode: controls dataset index caching behavior (default: IndexMode.OFF)
         """
         self.dataset = dataset
         self.split_config = split_config
@@ -660,15 +724,13 @@ class ModelDataset(torch.utils.data.Dataset):
         else:
             self.transforms = rslearn.train.transforms.transform.Identity()

-        # Get normalized patch size from the SplitConfig.
-        # But if load all patches is enabled, this is handled by AllPatchesDataset, so
+        # Get normalized crop size from the SplitConfig.
+        # But if load all patches is enabled, this is handled by AllCropsDataset, so
         # here we instead load the entire windows.
-        if split_config.get_load_all_patches():
-            self.patch_size = None
+        if split_config.get_load_all_crops():
+            self.crop_size = None
         else:
-            self.patch_size = split_config.get_patch_size()
-
-        windows = self._get_initial_windows(split_config, workers)
+            self.crop_size = split_config.get_crop_size()

         # If targets are not needed, remove them from the inputs.
         if split_config.get_skip_targets():
@@ -676,58 +738,8 @@ class ModelDataset(torch.utils.data.Dataset):
                 if self.inputs[k].is_target:
                     del self.inputs[k]

-        # Eliminate windows that are missing either a requisite input layer, or missing
-        # all target layers.
-        new_windows = []
-        if workers == 0:
-            for window in windows:
-                if (
-                    check_window(
-                        self.inputs,
-                        window,
-                        output_layer_name_skip_inference_if_exists=self.split_config.get_output_layer_name_skip_inference_if_exists(),
-                    )
-                    is None
-                ):
-                    continue
-                new_windows.append(window)
-        else:
-            p = multiprocessing.Pool(workers)
-            outputs = star_imap_unordered(
-                p,
-                check_window,
-                [
-                    dict(
-                        inputs=self.inputs,
-                        window=window,
-                        output_layer_name_skip_inference_if_exists=self.split_config.get_output_layer_name_skip_inference_if_exists(),
-                    )
-                    for window in windows
-                ],
-            )
-            for window in tqdm.tqdm(
-                outputs, total=len(windows), desc="Checking available layers in windows"
-            ):
-                if window is None:
-                    continue
-                new_windows.append(window)
-            p.close()
-        windows = new_windows
-
-        # Sort the windows to ensure that the dataset is consistent across GPUs.
-        # Inconsistent ordering can lead to a subset of windows being processed during
-        # "model test" / "model predict" when using multiple GPUs.
-        # We use a hash so that functionality like num_samples limit gets a random
-        # subset of windows (with respect to the hash function choice).
-        windows.sort(
-            key=lambda window: hashlib.sha256(window.name.encode()).hexdigest()
-        )
-
-        # Limit windows to num_samples if requested.
-        if split_config.num_samples:
-            # The windows are sorted by hash of window name so this distribution should
-            # be representative of the population.
-            windows = windows[0 : split_config.num_samples]
+        # Load windows (from index if available, otherwise from dataset)
+        windows = self._load_windows(split_config, workers, index_mode)

         # Write dataset_examples to a file so that we can load it lazily in the worker
         # processes. Otherwise it takes a long time to transmit it when spawning each
@@ -796,6 +808,137 @@ class ModelDataset(torch.utils.data.Dataset):

         return windows

+    def _load_windows(
+        self,
+        split_config: SplitConfig,
+        workers: int,
+        index_mode: IndexMode,
+    ) -> list[Window]:
+        """Load windows, using index if available.
+
+        This method handles:
+        1. Loading from index if index_mode is USE and index exists
+        2. Otherwise, loading from dataset, filtering, sorting, limiting
+        3. Saving to index if index_mode is USE or REFRESH
+
+        Args:
+            split_config: the split configuration.
+            workers: number of worker processes.
+            index_mode: controls caching behavior.
+
+        Returns:
+            list of processed windows ready for training.
+        """
+        # Try to load from index
+        index: DatasetIndex | None = None
+
+        if index_mode != IndexMode.OFF:
+            logger.info(f"Checking index for dataset {self.dataset.path}")
+            index = DatasetIndex(
+                storage=self.dataset.storage,
+                dataset_path=self.dataset.path,
+                groups=split_config.groups,
+                names=split_config.names,
+                tags=split_config.tags,
+                num_samples=split_config.num_samples,
+                skip_targets=split_config.get_skip_targets(),
+                inputs=self.inputs,
+            )
+            refresh = index_mode == IndexMode.REFRESH
+            indexed_windows = index.load_windows(refresh)
+
+            if indexed_windows is not None:
+                logger.info(f"Loaded {len(indexed_windows)} windows from index")
+                return indexed_windows
+
+        # No index available, load and process windows from dataset
+        logger.debug("Loading windows from dataset...")
+        windows = self._get_initial_windows(split_config, workers)
+        windows = self._filter_windows_by_layers(windows, workers)
+        windows = self._sort_and_limit_windows(windows, split_config)
+
+        # Save to index if enabled
+        if index is not None:
+            index.save_windows(windows)
+
+        return windows
+
+    def _filter_windows_by_layers(
+        self, windows: list[Window], workers: int
+    ) -> list[Window]:
+        """Filter windows to only include those with required layers.
+
+        Args:
+            windows: list of windows to filter.
+            workers: number of worker processes for parallel filtering.
+
+        Returns:
+            list of windows that have all required input layers.
+        """
+        output_layer_skip = (
+            self.split_config.get_output_layer_name_skip_inference_if_exists()
+        )
+
+        if workers == 0:
+            return [
+                w
+                for w in windows
+                if check_window(
+                    self.inputs,
+                    w,
+                    output_layer_name_skip_inference_if_exists=output_layer_skip,
+                )
+                is not None
+            ]
+
+        p = multiprocessing.Pool(workers)
+        outputs = star_imap_unordered(
+            p,
+            check_window,
+            [
+                dict(
+                    inputs=self.inputs,
+                    window=window,
+                    output_layer_name_skip_inference_if_exists=output_layer_skip,
+                )
+                for window in windows
+            ],
+        )
+        filtered = []
+        for window in tqdm.tqdm(
+            outputs,
+            total=len(windows),
+            desc="Checking available layers in windows",
+        ):
+            if window is not None:
+                filtered.append(window)
+        p.close()
+        return filtered
+
+    def _sort_and_limit_windows(
+        self, windows: list[Window], split_config: SplitConfig
+    ) -> list[Window]:
+        """Sort windows by hash and apply num_samples limit.
+
+        Sorting ensures consistent ordering across GPUs. Using hash gives a
+        pseudo-random but deterministic order for sampling.
+
+        Args:
+            windows: list of windows to sort and limit.
+            split_config: the split configuration with num_samples.
+
+        Returns:
+            sorted and optionally limited list of windows.
+        """
+        windows.sort(
+            key=lambda window: hashlib.sha256(window.name.encode()).hexdigest()
+        )
+
+        if split_config.num_samples:
+            windows = windows[: split_config.num_samples]
+
+        return windows
+
     def _serialize_item(self, example: Window) -> dict[str, Any]:
         return example.get_metadata()

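The sort key in _sort_and_limit_windows hashes each window name, giving an order that is identical in every process (so multi-GPU test/predict runs see the same window list) yet decorrelated from the original naming, so the num_samples prefix behaves like a fixed random subset. A standalone illustration with hypothetical window names:

```python
import hashlib

names = ["window_003", "window_001", "window_002"]
ordered = sorted(names, key=lambda n: hashlib.sha256(n.encode()).hexdigest())

# Every process computes the identical order, so the first num_samples items
# form the same pseudo-random subset everywhere.
num_samples = 2
print(ordered[:num_samples])
```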
@@ -808,8 +951,8 @@ class ModelDataset(torch.utils.data.Dataset):
     def get_dataset_examples(self) -> list[Window]:
         """Get a list of examples in the dataset.

-        If load_all_patches is False, this is a list of Windows. Otherwise, this is a
-        list of (window, patch_bounds, (patch_idx, # patches)) tuples.
+        If load_all_crops is False, this is a list of Windows. Otherwise, this is a
+        list of (window, crop_bounds, (crop_idx, # patches)) tuples.
         """
         if self.dataset_examples is None:
             logger.debug(
@@ -845,34 +988,34 @@ class ModelDataset(torch.utils.data.Dataset):
         rng = random.Random(idx if self.fix_patch_pick else None)

         # Select bounds to read.
-        if self.patch_size:
+        if self.crop_size:
             window = example

-            def get_patch_range(n_patch: int, n_window: int) -> list[int]:
-                if n_patch > n_window:
+            def get_crop_range(n_crop: int, n_window: int) -> list[int]:
+                if n_crop > n_window:
                     # Select arbitrary range containing the entire window.
-                    # Basically arbitrarily padding the window to get to patch size.
-                    start = rng.randint(n_window - n_patch, 0)
-                    return [start, start + n_patch]
+                    # Basically arbitrarily padding the window to get to crop size.
+                    start = rng.randint(n_window - n_crop, 0)
+                    return [start, start + n_crop]

                 else:
-                    # Select arbitrary patch within the window.
-                    start = rng.randint(0, n_window - n_patch)
-                    return [start, start + n_patch]
+                    # Select arbitrary crop within the window.
+                    start = rng.randint(0, n_window - n_crop)
+                    return [start, start + n_crop]

             window_size = (
                 window.bounds[2] - window.bounds[0],
                 window.bounds[3] - window.bounds[1],
             )
-            patch_ranges = [
-                get_patch_range(self.patch_size[0], window_size[0]),
-                get_patch_range(self.patch_size[1], window_size[1]),
+            crop_ranges = [
+                get_crop_range(self.crop_size[0], window_size[0]),
+                get_crop_range(self.crop_size[1], window_size[1]),
             ]
             bounds = (
-                window.bounds[0] + patch_ranges[0][0],
-                window.bounds[1] + patch_ranges[1][0],
-                window.bounds[0] + patch_ranges[0][1],
-                window.bounds[1] + patch_ranges[1][1],
+                window.bounds[0] + crop_ranges[0][0],
+                window.bounds[1] + crop_ranges[1][0],
+                window.bounds[0] + crop_ranges[0][1],
+                window.bounds[1] + crop_ranges[1][1],
             )

         else:
@@ -894,9 +1037,9 @@ class ModelDataset(torch.utils.data.Dataset):
             window_group=window.group,
             window_name=window.name,
             window_bounds=window.bounds,
-            patch_bounds=bounds,
-            patch_idx=0,
-            num_patches_in_window=1,
+            crop_bounds=bounds,
+            crop_idx=0,
+            num_crops_in_window=1,
             time_range=window.time_range,
             projection=window.projection,
             dataset_source=self.name,
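The crop selection above handles both size mismatches: a window larger than the crop yields a random interior crop, while a window smaller than the crop gets a random (possibly negative) start so the crop pads around it. A standalone restatement of get_crop_range:

```python
import random

def get_crop_range(n_crop: int, n_window: int, rng: random.Random) -> list[int]:
    if n_crop > n_window:
        # Window smaller than crop: pad it at an arbitrary offset, so the
        # start may be negative (the crop extends past the window).
        start = rng.randint(n_window - n_crop, 0)
    else:
        # Window at least crop-sized: pick an arbitrary crop inside it.
        start = rng.randint(0, n_window - n_crop)
    return [start, start + n_crop]

rng = random.Random(0)
print(get_crop_range(64, 256, rng))  # random 64-pixel crop in a 256-pixel window
print(get_crop_range(64, 48, rng))   # 48-pixel window padded up to 64 pixels
```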
rslearn/train/dataset_index.py ADDED
@@ -0,0 +1,156 @@
+"""Dataset index for caching window lists to speed up ModelDataset initialization."""
+
+import hashlib
+import json
+from datetime import datetime
+from typing import TYPE_CHECKING, Any
+
+from upath import UPath
+
+from rslearn.dataset.window import Window
+from rslearn.log_utils import get_logger
+from rslearn.utils.fsspec import open_atomic
+
+if TYPE_CHECKING:
+    from rslearn.dataset.storage.storage import WindowStorage
+
+logger = get_logger(__name__)
+
+# Increment this when the index format changes to force rebuild
+INDEX_VERSION = 1
+
+# Directory name for storing index files
+INDEX_DIR_NAME = ".rslearn_dataset_index"
+
+
+class DatasetIndex:
+    """Manages indexed window lists for faster ModelDataset initialization.
+
+    Note: The index does NOT automatically detect when windows are added or removed
+    from the dataset. Use refresh=True after modifying dataset windows.
+    """
+
+    def __init__(
+        self,
+        storage: "WindowStorage",
+        dataset_path: UPath,
+        groups: list[str] | None,
+        names: list[str] | None,
+        tags: dict[str, Any] | None,
+        num_samples: int | None,
+        skip_targets: bool,
+        inputs: dict[str, Any],
+    ) -> None:
+        """Initialize DatasetIndex with specific configuration.
+
+        Args:
+            storage: WindowStorage for deserializing windows.
+            dataset_path: Path to the dataset directory.
+            groups: list of window groups to include.
+            names: list of window names to include.
+            tags: tags to filter windows by.
+            num_samples: limit on number of samples.
+            skip_targets: whether targets are skipped.
+            inputs: dict mapping input names to DataInput objects.
+        """
+        self.storage = storage
+        self.dataset_path = dataset_path
+        self.index_dir = dataset_path / INDEX_DIR_NAME
+
+        # Compute index key from configuration
+        inputs_data = {}
+        for name, inp in inputs.items():
+            inputs_data[name] = {
+                "layers": inp.layers,
+                "required": inp.required,
+                "load_all_layers": inp.load_all_layers,
+                "is_target": inp.is_target,
+            }
+
+        key_data = {
+            "groups": groups,
+            "names": names,
+            "tags": tags,
+            "num_samples": num_samples,
+            "skip_targets": skip_targets,
+            "inputs": inputs_data,
+        }
+        self.index_key = hashlib.sha256(
+            json.dumps(key_data, sort_keys=True).encode()
+        ).hexdigest()
+
+    def _get_config_hash(self) -> str:
+        """Get hash of config.json for quick validation.
+
+        Returns:
+            A 16-character hex string hash of the config, or empty string if no config.
+        """
+        config_path = self.dataset_path / "config.json"
+        if config_path.exists():
+            with config_path.open() as f:
+                return hashlib.sha256(f.read().encode()).hexdigest()[:16]
+        return ""
+
+    def load_windows(self, refresh: bool = False) -> list[Window] | None:
+        """Load indexed window list if valid, else return None.
+
+        Args:
+            refresh: If True, ignore existing index and return None.
+
+        Returns:
+            List of Window objects if index is valid, None otherwise.
+        """
+        if refresh:
+            logger.info("refresh=True, rebuilding index")
+            return None
+
+        index_file = self.index_dir / f"{self.index_key}.json"
+        if not index_file.exists():
+            logger.info(f"No index found at {index_file}, will build")
+            return None
+
+        try:
+            with index_file.open() as f:
+                index_data = json.load(f)
+        except (OSError, json.JSONDecodeError):
+            logger.warning(f"Corrupted index file at {index_file}, will rebuild")
+            return None
+
+        # Check index version
+        if index_data.get("version") != INDEX_VERSION:
+            logger.info(
+                f"Index version mismatch (got {index_data.get('version')}, "
+                f"expected {INDEX_VERSION}), will rebuild"
+            )
+            return None
+
+        # Quick validation: check config hash
+        if index_data.get("config_hash") != self._get_config_hash():
+            logger.info("Config hash mismatch, index invalidated")
+            return None
+
+        # Deserialize windows
+        return [Window.from_metadata(self.storage, w) for w in index_data["windows"]]
+
+    def save_windows(self, windows: list[Window]) -> None:
+        """Save processed windows to index with atomic write.
+
+        Args:
+            windows: List of Window objects to index.
+        """
+        self.index_dir.mkdir(parents=True, exist_ok=True)
+        index_file = self.index_dir / f"{self.index_key}.json"
+
+        # Serialize windows
+        serialized_windows = [w.get_metadata() for w in windows]
+
+        index_data = {
+            "version": INDEX_VERSION,
+            "config_hash": self._get_config_hash(),
+            "created_at": datetime.now().isoformat(),
+            "num_windows": len(windows),
+            "windows": serialized_windows,
+        }
+        with open_atomic(index_file, "w") as f:
+            json.dump(index_data, f)
+        logger.info(f"Saved {len(windows)} windows to index at {index_file}")