PyPI - rslearn - Versions diffs - 0.0.16__tar.gz → 0.0.18__tar.gz - Mend

rslearn 0.0.16__tar.gz → 0.0.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (176)

{rslearn-0.0.16/rslearn.egg-info → rslearn-0.0.18}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rslearn
-Version: 0.0.16
+Version: 0.0.18
 Summary: A library for developing remote sensing datasets and models
 Author: OlmoEarth Team
 License:                                  Apache License
@@ -343,10 +343,12 @@ directory `/path/to/dataset` and corresponding configuration file at
                 "bands": ["R", "G", "B"]
             }],
             "data_source": {
-                "name": "rslearn.data_sources.gcp_public_data.Sentinel2",
-                "index_cache_dir": "cache/sentinel2/",
-                "sort_by": "cloud_cover",
-                "use_rtree_index": false
+                "class_path": "rslearn.data_sources.gcp_public_data.Sentinel2",
+                "init_args": {
+                    "index_cache_dir": "cache/sentinel2/",
+                    "sort_by": "cloud_cover",
+                    "use_rtree_index": false
+                }
             }
         }
     }
@@ -453,8 +455,10 @@ automate this process. Update the dataset `config.json` with a new layer:
         }],
         "resampling_method": "nearest",
         "data_source": {
-            "name": "rslearn.data_sources.local_files.LocalFiles",
-            "src_dir": "file:///path/to/world_cover_tifs/"
+            "class_path": "rslearn.data_sources.local_files.LocalFiles",
+            "init_args": {
+                "src_dir": "file:///path/to/world_cover_tifs/"
+            }
         }
     }
 },
@@ -516,8 +520,7 @@ model:
 data:
   class_path: rslearn.train.data_module.RslearnDataModule
   init_args:
-    # Replace this with the dataset path.
-    path: /path/to/dataset/
+    path: ${DATASET_PATH}
     # This defines the layers that should be read for each window.
     # The key ("image" / "targets") is what the data will be called in the model,
     # while the layers option specifies which layers will be read.
@@ -615,7 +618,9 @@ trainer:
       ...
     - class_path: rslearn.train.prediction_writer.RslearnWriter
       init_args:
-        path: /path/to/dataset/
+        # We need to include this argument, but it will be overridden with the dataset
+        # path from data.init_args.path.
+        path: placeholder
         output_layer: output
 ```
@@ -768,24 +773,43 @@ This will produce PNGs in the vis directory. The visualizations are produced by
 SegmentationTask and overriding the visualize function.
-### Logging to Weights & Biases
+### Checkpoint and Logging Management
+Above, we needed to configure the checkpoint directory in the model config (the
+`dirpath` option under `lightning.pytorch.callbacks.ModelCheckpoint`), and explicitly
+specify the checkpoint path when applying the model. Additionally, metrics are logged
+to the local filesystem and not well organized.
-We can log to W&B by setting the logger under trainer in the model configuration file:
+We can instead let rslearn automatically manage checkpoints, along with logging to
+Weights & Biases. To do so, we add project_name, run_name, and management_dir options
+to the model config. The project_name corresponds to the W&B project, and the run name
+corresponds to the W&B name. The management_dir is a directory to store project data;
+rslearn determines a per-project directory at `{management_dir}/{project_name}/{run_name}/`
+and uses it to store checkpoints.
 ```yaml
+model:
+  # ...
+data:
+  # ...
 trainer:
   # ...
-  logger:
-    class_path: lightning.pytorch.loggers.WandbLogger
-    init_args:
-      project: land_cover_model
-      name: version_00
+project_name: land_cover_model
+run_name: version_00
+# This sets the option via the MANAGEMENT_DIR environment variable.
+management_dir: ${MANAGEMENT_DIR}
 ```
-Now, runs with this model configuration should show on W&B. For `model fit` runs,
-the training and validation loss and accuracy metric will be logged. The accuracy
-metric is provided by SegmentationTask, and additional metrics can be enabled by
-passing the relevant init_args to the task, e.g. mean IoU and F1:
+Now, set the `MANAGEMENT_DIR` environment variable and run `model fit`:
+```
+export MANAGEMENT_DIR=./project_data
+rslearn model fit --config land_cover_model.yaml
+```
+The training and validation loss and accuracy metric should now be logged to W&B. The
+accuracy metric is provided by SegmentationTask, and additional metrics can be enabled
+by passing the relevant init_args to the task, e.g. mean IoU and F1:
 ```yaml
       class_path: rslearn.train.tasks.segmentation.SegmentationTask
@@ -796,6 +820,13 @@ passing the relevant init_args to the task, e.g. mean IoU and F1:
         enable_f1_metric: true
 ```
+When calling `model test` and `model predict` with management_dir set, rslearn will
+automatically load the best checkpoint from the project directory, or raise an error if
+no existing checkpoint exists. This behavior can be overridden with the
+`--load_checkpoint_mode` and `--load_checkpoint_required` options (see `--help` for
+details). Logging will be enabled during fit but not test/predict, and this can also
+be overridden, using `--log_mode`.
 ### Inputting Multiple Sentinel-2 Images
@@ -818,10 +849,12 @@ query_config section. This can replace the sentinel2 layer:
             "bands": ["R", "G", "B"]
         }],
         "data_source": {
-            "name": "rslearn.data_sources.gcp_public_data.Sentinel2",
-            "index_cache_dir": "cache/sentinel2/",
-            "sort_by": "cloud_cover",
-            "use_rtree_index": false,
+            "class_path": "rslearn.data_sources.gcp_public_data.Sentinel2",
+            "init_args": {
+              "index_cache_dir": "cache/sentinel2/",
+              "sort_by": "cloud_cover",
+              "use_rtree_index": false
+            },
             "query_config": {
                 "max_matches": 3
             }

{rslearn-0.0.16 → rslearn-0.0.18}/README.md RENAMED Viewed

@@ -79,10 +79,12 @@ directory `/path/to/dataset` and corresponding configuration file at
                 "bands": ["R", "G", "B"]
             }],
             "data_source": {
-                "name": "rslearn.data_sources.gcp_public_data.Sentinel2",
-                "index_cache_dir": "cache/sentinel2/",
-                "sort_by": "cloud_cover",
-                "use_rtree_index": false
+                "class_path": "rslearn.data_sources.gcp_public_data.Sentinel2",
+                "init_args": {
+                    "index_cache_dir": "cache/sentinel2/",
+                    "sort_by": "cloud_cover",
+                    "use_rtree_index": false
+                }
             }
         }
     }
@@ -189,8 +191,10 @@ automate this process. Update the dataset `config.json` with a new layer:
         }],
         "resampling_method": "nearest",
         "data_source": {
-            "name": "rslearn.data_sources.local_files.LocalFiles",
-            "src_dir": "file:///path/to/world_cover_tifs/"
+            "class_path": "rslearn.data_sources.local_files.LocalFiles",
+            "init_args": {
+                "src_dir": "file:///path/to/world_cover_tifs/"
+            }
         }
     }
 },
@@ -252,8 +256,7 @@ model:
 data:
   class_path: rslearn.train.data_module.RslearnDataModule
   init_args:
-    # Replace this with the dataset path.
-    path: /path/to/dataset/
+    path: ${DATASET_PATH}
     # This defines the layers that should be read for each window.
     # The key ("image" / "targets") is what the data will be called in the model,
     # while the layers option specifies which layers will be read.
@@ -351,7 +354,9 @@ trainer:
       ...
     - class_path: rslearn.train.prediction_writer.RslearnWriter
       init_args:
-        path: /path/to/dataset/
+        # We need to include this argument, but it will be overridden with the dataset
+        # path from data.init_args.path.
+        path: placeholder
         output_layer: output
 ```
@@ -504,24 +509,43 @@ This will produce PNGs in the vis directory. The visualizations are produced by
 SegmentationTask and overriding the visualize function.
-### Logging to Weights & Biases
+### Checkpoint and Logging Management
+Above, we needed to configure the checkpoint directory in the model config (the
+`dirpath` option under `lightning.pytorch.callbacks.ModelCheckpoint`), and explicitly
+specify the checkpoint path when applying the model. Additionally, metrics are logged
+to the local filesystem and not well organized.
-We can log to W&B by setting the logger under trainer in the model configuration file:
+We can instead let rslearn automatically manage checkpoints, along with logging to
+Weights & Biases. To do so, we add project_name, run_name, and management_dir options
+to the model config. The project_name corresponds to the W&B project, and the run name
+corresponds to the W&B name. The management_dir is a directory to store project data;
+rslearn determines a per-project directory at `{management_dir}/{project_name}/{run_name}/`
+and uses it to store checkpoints.
 ```yaml
+model:
+  # ...
+data:
+  # ...
 trainer:
   # ...
-  logger:
-    class_path: lightning.pytorch.loggers.WandbLogger
-    init_args:
-      project: land_cover_model
-      name: version_00
+project_name: land_cover_model
+run_name: version_00
+# This sets the option via the MANAGEMENT_DIR environment variable.
+management_dir: ${MANAGEMENT_DIR}
 ```
-Now, runs with this model configuration should show on W&B. For `model fit` runs,
-the training and validation loss and accuracy metric will be logged. The accuracy
-metric is provided by SegmentationTask, and additional metrics can be enabled by
-passing the relevant init_args to the task, e.g. mean IoU and F1:
+Now, set the `MANAGEMENT_DIR` environment variable and run `model fit`:
+```
+export MANAGEMENT_DIR=./project_data
+rslearn model fit --config land_cover_model.yaml
+```
+The training and validation loss and accuracy metric should now be logged to W&B. The
+accuracy metric is provided by SegmentationTask, and additional metrics can be enabled
+by passing the relevant init_args to the task, e.g. mean IoU and F1:
 ```yaml
       class_path: rslearn.train.tasks.segmentation.SegmentationTask
@@ -532,6 +556,13 @@ passing the relevant init_args to the task, e.g. mean IoU and F1:
         enable_f1_metric: true
 ```
+When calling `model test` and `model predict` with management_dir set, rslearn will
+automatically load the best checkpoint from the project directory, or raise an error if
+no existing checkpoint exists. This behavior can be overridden with the
+`--load_checkpoint_mode` and `--load_checkpoint_required` options (see `--help` for
+details). Logging will be enabled during fit but not test/predict, and this can also
+be overridden, using `--log_mode`.
 ### Inputting Multiple Sentinel-2 Images
@@ -554,10 +585,12 @@ query_config section. This can replace the sentinel2 layer:
             "bands": ["R", "G", "B"]
         }],
         "data_source": {
-            "name": "rslearn.data_sources.gcp_public_data.Sentinel2",
-            "index_cache_dir": "cache/sentinel2/",
-            "sort_by": "cloud_cover",
-            "use_rtree_index": false,
+            "class_path": "rslearn.data_sources.gcp_public_data.Sentinel2",
+            "init_args": {
+              "index_cache_dir": "cache/sentinel2/",
+              "sort_by": "cloud_cover",
+              "use_rtree_index": false
+            },
             "query_config": {
                 "max_matches": 3
             }

{rslearn-0.0.16 → rslearn-0.0.18}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "rslearn"
-version = "0.0.16"
+version = "0.0.18"
 description = "A library for developing remote sensing datasets and models"
 authors = [
     { name = "OlmoEarth Team" },

{rslearn-0.0.16 → rslearn-0.0.18}/rslearn/config/__init__.py RENAMED Viewed

@@ -10,6 +10,7 @@ from .dataset import (
     LayerType,
     QueryConfig,
     SpaceMode,
+    StorageConfig,
     TimeMode,
 )
@@ -23,5 +24,6 @@ __all__ = [
     "LayerType",
     "QueryConfig",
     "SpaceMode",
+    "StorageConfig",
     "TimeMode",
 ]

{rslearn-0.0.16 → rslearn-0.0.18}/rslearn/config/dataset.py RENAMED Viewed

@@ -31,6 +31,7 @@ from rslearn.utils.vector_format import VectorFormat
 if TYPE_CHECKING:
     from rslearn.data_sources.data_source import DataSource
+    from rslearn.dataset.storage.storage import WindowStorageFactory
 logger = get_logger("__name__")
@@ -132,7 +133,11 @@ class BandSetConfig(BaseModel):
     bands.
     """
-    dtype: DType = Field(description="Pixel value type to store the data under")
+    model_config = ConfigDict(extra="forbid")
+    dtype: DType = Field(
+        description="Pixel value type to store the data under. This is used during dataset materialize and model predict."
+    )
     bands: list[str] = Field(
         default_factory=lambda: [],
         description="List of band names in this BandSetConfig. One of bands or num_bands must be set.",
@@ -244,6 +249,9 @@ class BandSetConfig(BaseModel):
             "use `{'class_path': '...', 'init_args': {...}}` instead.",
             DeprecationWarning,
         )
+        logger.warning(
+            "BandSet.format uses legacy format; support will be removed after 2026-03-01."
+        )
         legacy_name_to_class_path = {
             "image_tile": "rslearn.utils.raster_format.ImageTileRasterFormat",
@@ -326,7 +334,7 @@ class TimeMode(StrEnum):
 class QueryConfig(BaseModel):
     """A configuration for querying items in a data source."""
-    model_config = ConfigDict(frozen=True)
+    model_config = ConfigDict(frozen=True, extra="forbid")
     space_mode: SpaceMode = Field(
         default=SpaceMode.MOSAIC,
@@ -360,7 +368,7 @@ class QueryConfig(BaseModel):
 class DataSourceConfig(BaseModel):
     """Configuration for a DataSource in a dataset layer."""
-    model_config = ConfigDict(frozen=True)
+    model_config = ConfigDict(frozen=True, extra="forbid")
     class_path: str = Field(description="Class path for the data source.")
     init_args: dict[str, Any] = Field(
@@ -409,6 +417,9 @@ class DataSourceConfig(BaseModel):
             "use `{'class_path': '...', 'init_args': {...}, ...}` instead.",
             DeprecationWarning,
         )
+        logger.warning(
+            "Data source configuration uses legacy format; support will be removed after 2026-03-01."
+        )
         # Split the dict into the base config that is in the pydantic model, and the
         # source-specific options that should be moved to init_args dict.
@@ -463,7 +474,7 @@ class CompositingMethod(StrEnum):
 class LayerConfig(BaseModel):
     """Configuration of a layer in a dataset."""
-    model_config = ConfigDict(frozen=True)
+    model_config = ConfigDict(frozen=True, extra="forbid")
     type: LayerType = Field(description="The LayerType (raster or vector).")
     data_source: DataSourceConfig | None = Field(
@@ -586,11 +597,51 @@ class LayerConfig(BaseModel):
         return vector_format
+class StorageConfig(BaseModel):
+    """Configuration for the WindowStorageFactory (window metadata storage backend)."""
+    model_config = ConfigDict(frozen=True, extra="forbid")
+    class_path: str = Field(
+        default="rslearn.dataset.storage.file.FileWindowStorageFactory",
+        description="Class path for the WindowStorageFactory.",
+    )
+    init_args: dict[str, Any] = Field(
+        default_factory=lambda: {},
+        description="jsonargparse init args for the WindowStorageFactory.",
+    )
+    def instantiate_window_storage_factory(self) -> "WindowStorageFactory":
+        """Instantiate the WindowStorageFactory specified by this config."""
+        from rslearn.dataset.storage.storage import WindowStorageFactory
+        from rslearn.utils.jsonargparse import init_jsonargparse
+        init_jsonargparse()
+        parser = jsonargparse.ArgumentParser()
+        parser.add_argument("--wsf", type=WindowStorageFactory)
+        cfg = parser.parse_object(
+            {
+                "wsf": dict(
+                    class_path=self.class_path,
+                    init_args=self.init_args,
+                )
+            }
+        )
+        wsf = parser.instantiate_classes(cfg).wsf
+        return wsf
 class DatasetConfig(BaseModel):
     """Overall dataset configuration."""
+    model_config = ConfigDict(extra="forbid")
     layers: dict[str, LayerConfig] = Field(description="Layers in the dataset.")
     tile_store: dict[str, Any] = Field(
         default={"class_path": "rslearn.tile_stores.default.DefaultTileStore"},
         description="jsonargparse configuration for the TileStore.",
     )
+    storage: StorageConfig = Field(
+        default_factory=lambda: StorageConfig(),
+        description="jsonargparse configuration for the WindowStorageFactory.",
+    )

{rslearn-0.0.16 → rslearn-0.0.18}/rslearn/dataset/add_windows.py RENAMED Viewed

@@ -131,7 +131,7 @@ def add_windows_from_geometries(
                     f"_{time_range[0].isoformat()}_{time_range[1].isoformat()}"
                 )
         window = Window(
-            path=dataset.path / "windows" / group / cur_window_name,
+            storage=dataset.storage,
             group=group,
             name=cur_window_name,
             projection=cur_projection,

{rslearn-0.0.16 → rslearn-0.0.18}/rslearn/dataset/dataset.py RENAMED Viewed

@@ -1,9 +1,8 @@
 """rslearn dataset class."""
 import json
-import multiprocessing
+from typing import Any
-import tqdm
 from upath import UPath
 from rslearn.config import DatasetConfig
@@ -11,7 +10,6 @@ from rslearn.log_utils import get_logger
 from rslearn.template_params import substitute_env_vars_in_string
 from rslearn.tile_stores import TileStore, load_tile_store
-from .index import DatasetIndex
 from .window import Window
 logger = get_logger(__name__)
@@ -68,80 +66,26 @@ class Dataset:
                 self.layers[layer_name] = layer_config
             self.tile_store_config = config.tile_store
-    def _get_index(self) -> DatasetIndex | None:
-        index_fname = self.path / DatasetIndex.FNAME
-        if not index_fname.exists():
-            return None
-        return DatasetIndex.load_index(self.path)
+            self.storage = (
+                config.storage.instantiate_window_storage_factory().get_storage(
+                    self.path
+                )
+            )
     def load_windows(
         self,
         groups: list[str] | None = None,
         names: list[str] | None = None,
-        show_progress: bool = False,
-        workers: int = 0,
-        no_index: bool = False,
+        **kwargs: Any,
     ) -> list[Window]:
         """Load the windows in the dataset.
         Args:
             groups: an optional list of groups to filter loading
             names: an optional list of window names to filter loading
-            show_progress: whether to show tqdm progress bar
-            workers: number of parallel workers, default 0 (use main thread only to load windows)
-            no_index: don't use the dataset index even if it exists.
+            kwargs: optional keyword arguments to pass to WindowStorage.get_windows.
         """
-        # Load from index if it exists.
-        # We never use the index if names is set since loading the index will likely be
-        # slower than loading a few windows.
-        if not no_index and names is None:
-            dataset_index = self._get_index()
-            if dataset_index is not None:
-                return dataset_index.get_windows(groups=groups, names=names)
-        # Avoid directory does not exist errors later.
-        if not (self.path / "windows").exists():
-            return []
-        window_dirs = []
-        if not groups:
-            groups = []
-            for p in (self.path / "windows").iterdir():
-                groups.append(p.name)
-        for group in groups:
-            group_dir = self.path / "windows" / group
-            if not group_dir.exists():
-                logger.warning(
-                    f"Skipping group directory {group_dir} since it does not exist"
-                )
-                continue
-            if names:
-                cur_names = names
-            else:
-                cur_names = []
-                for p in group_dir.iterdir():
-                    cur_names.append(p.name)
-            for window_name in cur_names:
-                window_dir = group_dir / window_name
-                window_dirs.append(window_dir)
-        if workers == 0:
-            windows = [Window.load(window_dir) for window_dir in window_dirs]
-        else:
-            p = multiprocessing.Pool(workers)
-            outputs = p.imap_unordered(Window.load, window_dirs)
-            if show_progress:
-                outputs = tqdm.tqdm(
-                    outputs, total=len(window_dirs), desc="Loading windows"
-                )
-            windows = []
-            for window in outputs:
-                windows.append(window)
-            p.close()
-        return windows
+        return self.storage.get_windows(groups=groups, names=names, **kwargs)
     def get_tile_store(self) -> TileStore:
         """Get the tile store associated with this dataset.

{rslearn-0.0.16 → rslearn-0.0.18}/rslearn/dataset/materialize.py RENAMED Viewed

@@ -161,7 +161,7 @@ def build_first_valid_composite(
     nodata_vals: list[Any],
     bands: list[str],
     bounds: PixelBounds,
-    band_dtype: Any,
+    band_dtype: npt.DTypeLike,
     tile_store: TileStoreWithLayer,
     projection: Projection,
     remapper: Remapper | None,
@@ -233,7 +233,7 @@ def read_and_stack_raster_windows(
     projection: Projection,
     nodata_vals: list[Any],
     remapper: Remapper | None,
-    band_dtype: Any,
+    band_dtype: npt.DTypeLike,
     resampling_method: Resampling = Resampling.bilinear,
 ) -> npt.NDArray[np.generic]:
     """Create a stack of extent aligned raster windows.
@@ -326,7 +326,7 @@ def build_mean_composite(
     nodata_vals: list[Any],
     bands: list[str],
     bounds: PixelBounds,
-    band_dtype: Any,
+    band_dtype: npt.DTypeLike,
     tile_store: TileStoreWithLayer,
     projection: Projection,
     remapper: Remapper | None,
@@ -383,7 +383,7 @@ def build_median_composite(
     nodata_vals: list[Any],
     bands: list[str],
     bounds: PixelBounds,
-    band_dtype: Any,
+    band_dtype: npt.DTypeLike,
     tile_store: TileStoreWithLayer,
     projection: Projection,
     remapper: Remapper | None,
@@ -471,7 +471,7 @@ def build_composite(
         nodata_vals=nodata_vals,
         bands=band_cfg.bands,
         bounds=bounds,
-        band_dtype=band_cfg.dtype.value,
+        band_dtype=band_cfg.dtype.get_numpy_dtype(),
         tile_store=tile_store,
         projection=projection,
         resampling_method=layer_cfg.resampling_method.get_rasterio_resampling(),

rslearn-0.0.18/rslearn/dataset/storage/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Storage backends for rslearn window metadata."""

rslearn 0.0.16__tar.gz → 0.0.18__tar.gz

rslearn 0.0.16__tar.gz → 0.0.18__tar.gz