rslearn 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.
- rslearn/config/dataset.py +22 -13
- rslearn/data_sources/__init__.py +8 -0
- rslearn/data_sources/aws_landsat.py +27 -18
- rslearn/data_sources/aws_open_data.py +41 -42
- rslearn/data_sources/copernicus.py +148 -2
- rslearn/data_sources/data_source.py +17 -10
- rslearn/data_sources/gcp_public_data.py +177 -100
- rslearn/data_sources/geotiff.py +1 -0
- rslearn/data_sources/google_earth_engine.py +17 -15
- rslearn/data_sources/local_files.py +59 -32
- rslearn/data_sources/openstreetmap.py +27 -23
- rslearn/data_sources/planet.py +10 -9
- rslearn/data_sources/planet_basemap.py +303 -0
- rslearn/data_sources/raster_source.py +23 -13
- rslearn/data_sources/usgs_landsat.py +56 -27
- rslearn/data_sources/utils.py +13 -6
- rslearn/data_sources/vector_source.py +1 -0
- rslearn/data_sources/xyz_tiles.py +8 -9
- rslearn/dataset/add_windows.py +1 -1
- rslearn/dataset/dataset.py +16 -5
- rslearn/dataset/manage.py +9 -4
- rslearn/dataset/materialize.py +26 -5
- rslearn/dataset/window.py +5 -0
- rslearn/log_utils.py +24 -0
- rslearn/main.py +123 -59
- rslearn/models/clip.py +62 -0
- rslearn/models/conv.py +56 -0
- rslearn/models/faster_rcnn.py +2 -19
- rslearn/models/fpn.py +1 -1
- rslearn/models/module_wrapper.py +43 -0
- rslearn/models/molmo.py +65 -0
- rslearn/models/multitask.py +1 -1
- rslearn/models/pooling_decoder.py +4 -2
- rslearn/models/satlaspretrain.py +4 -7
- rslearn/models/simple_time_series.py +61 -55
- rslearn/models/ssl4eo_s12.py +9 -9
- rslearn/models/swin.py +22 -21
- rslearn/models/unet.py +4 -2
- rslearn/models/upsample.py +35 -0
- rslearn/tile_stores/file.py +6 -3
- rslearn/tile_stores/tile_store.py +19 -7
- rslearn/train/callbacks/freeze_unfreeze.py +3 -3
- rslearn/train/data_module.py +5 -4
- rslearn/train/dataset.py +79 -36
- rslearn/train/lightning_module.py +15 -11
- rslearn/train/prediction_writer.py +22 -11
- rslearn/train/tasks/classification.py +9 -8
- rslearn/train/tasks/detection.py +94 -37
- rslearn/train/tasks/multi_task.py +1 -1
- rslearn/train/tasks/regression.py +8 -4
- rslearn/train/tasks/segmentation.py +23 -19
- rslearn/train/transforms/__init__.py +1 -1
- rslearn/train/transforms/concatenate.py +6 -2
- rslearn/train/transforms/crop.py +6 -2
- rslearn/train/transforms/flip.py +5 -1
- rslearn/train/transforms/normalize.py +9 -5
- rslearn/train/transforms/pad.py +1 -1
- rslearn/train/transforms/transform.py +3 -3
- rslearn/utils/__init__.py +4 -5
- rslearn/utils/array.py +2 -2
- rslearn/utils/feature.py +1 -1
- rslearn/utils/fsspec.py +70 -1
- rslearn/utils/geometry.py +155 -3
- rslearn/utils/grid_index.py +5 -5
- rslearn/utils/mp.py +4 -3
- rslearn/utils/raster_format.py +81 -73
- rslearn/utils/rtree_index.py +64 -17
- rslearn/utils/sqlite_index.py +7 -1
- rslearn/utils/utils.py +11 -3
- rslearn/utils/vector_format.py +113 -17
- {rslearn-0.0.1.dist-info → rslearn-0.0.2.dist-info}/METADATA +32 -27
- rslearn-0.0.2.dist-info/RECORD +94 -0
- {rslearn-0.0.1.dist-info → rslearn-0.0.2.dist-info}/WHEEL +1 -1
- rslearn/utils/mgrs.py +0 -24
- rslearn-0.0.1.dist-info/RECORD +0 -88
- {rslearn-0.0.1.dist-info → rslearn-0.0.2.dist-info}/LICENSE +0 -0
- {rslearn-0.0.1.dist-info → rslearn-0.0.2.dist-info}/entry_points.txt +0 -0
- {rslearn-0.0.1.dist-info → rslearn-0.0.2.dist-info}/top_level.txt +0 -0
rslearn/train/data_module.py
CHANGED
@@ -15,7 +15,7 @@ from .dataset import DataInput, ModelDataset, RetryDataset, SplitConfig
 
 def collate_fn(
     batch: list[tuple[dict[str, Any], dict[str, Any]]],
-) -> tuple
+) -> tuple:
     """Collate batch of training examples.
 
     We just make list of the inputs and another of the targets.
@@ -48,7 +48,7 @@ class RslearnDataModule(L.LightningDataModule):
         val_config: SplitConfig = SplitConfig(),
         test_config: SplitConfig = SplitConfig(),
         predict_config: SplitConfig = SplitConfig(),
-    ):
+    ) -> None:
        """Initialize a new RslearnDataModule.
 
        Args:
@@ -79,7 +79,7 @@ class RslearnDataModule(L.LightningDataModule):
            "predict": default_config.update(predict_config),
        }
 
-    def setup(self, stage: str):
+    def setup(self, stage: str) -> None:
        """Set up datasets and samplers.
 
        Args:
@@ -106,12 +106,13 @@ class RslearnDataModule(L.LightningDataModule):
 
    def _get_dataloader(self, split: str) -> DataLoader[dict[str, torch.Tensor]]:
        dataset = self.datasets[split]
+       persistent_workers = self.num_workers > 0
        kwargs = dict(
            dataset=dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            collate_fn=collate_fn,
-            persistent_workers=
+           persistent_workers=persistent_workers,
        )
        sampler_factory = self.split_configs[split].sampler
        if sampler_factory:
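The `_get_dataloader` change above ties `persistent_workers` to `num_workers`, because PyTorch's `DataLoader` rejects `persistent_workers=True` when `num_workers == 0`. A minimal standalone sketch of the same guard (the toy dataset is just a stand-in):

```python
# Sketch only: mirror the persistent_workers guard from the diff above.
import torch
from torch.utils.data import DataLoader, TensorDataset

toy_dataset = TensorDataset(torch.arange(16, dtype=torch.float32))


def make_loader(num_workers: int) -> DataLoader:
    # DataLoader raises a ValueError for persistent_workers=True with num_workers == 0,
    # so only keep workers alive between epochs when there are workers at all.
    return DataLoader(
        toy_dataset,
        batch_size=4,
        num_workers=num_workers,
        persistent_workers=num_workers > 0,
    )


loader = make_loader(num_workers=0)  # fine: persistent_workers resolves to False
```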
rslearn/train/dataset.py
CHANGED
@@ -1,5 +1,6 @@
 """Default Dataset for rslearn."""
 
+import hashlib
 import multiprocessing
 import os
 import random
@@ -47,7 +48,9 @@ class SamplerFactory:
 class RandomSamplerFactory(SamplerFactory):
     """A sampler factory for RandomSampler."""
 
-    def __init__(
+    def __init__(
+        self, replacement: bool = False, num_samples: int | None = None
+    ) -> None:
         """Initialize a RandomSamplerFactory.
 
         Args:
@@ -75,7 +78,9 @@ class RandomSamplerFactory(SamplerFactory):
 class WeightedRandomSamplerFactory(SamplerFactory):
     """A sampler factory for WeightedRandomSampler."""
 
-    def __init__(
+    def __init__(
+        self, option_key: str, num_samples: int, replacement: bool = True
+    ) -> None:
         """Initialize a WeightedRandomSamplerFactory.
 
         Args:
@@ -119,7 +124,7 @@ class DataInput:
         passthrough: bool = False,
         is_target: bool = False,
         dtype: DType = DType.FLOAT32,
-    ):
+    ) -> None:
         """Initialize a new DataInput.
 
         Args:
@@ -157,7 +162,7 @@ class SplitConfig:
         overlap_ratio: float | None = None,
         load_all_patches: bool | None = None,
         skip_targets: bool | None = None,
-    ):
+    ) -> None:
         """Initialize a new SplitConfig.
 
         Args:
@@ -242,7 +247,7 @@ class SplitConfig:
         return True if self.skip_targets is True else False
 
 
-def check_window(inputs: dict[str, DataInput], window: Window) ->
+def check_window(inputs: dict[str, DataInput], window: Window) -> Window | None:
     """Verify that the window has the required layers based on the specified inputs.
 
     Args:
@@ -254,7 +259,7 @@ def check_window(inputs: dict[str, DataInput], window: Window) -> bool:
     """
 
     # Make sure window has all the needed layers.
-    def is_any_layer_available(data_input):
+    def is_any_layer_available(data_input: DataInput) -> bool:
        for layer_name in data_input.layers:
            completed_fname = window.path / "layers" / layer_name / "completed"
            if completed_fname.exists():
@@ -347,37 +352,53 @@ class ModelDataset(torch.utils.data.Dataset):
 
        # Eliminate windows that are missing either a requisite input layer, or missing
        # all target layers.
-        p = multiprocessing.Pool(workers)
-        outputs = star_imap_unordered(
-            p,
-            check_window,
-            [
-                dict(
-                    inputs=self.inputs,
-                    window=window,
-                )
-                for window in windows
-            ],
-        )
        new_windows = []
-
-
-
-
-
-
-
+       if workers == 0:
+           for window in windows:
+               if check_window(self.inputs, window) is None:
+                   continue
+               new_windows.append(window)
+       else:
+           p = multiprocessing.Pool(workers)
+           outputs = star_imap_unordered(
+               p,
+               check_window,
+               [
+                   dict(
+                       inputs=self.inputs,
+                       window=window,
+                   )
+                   for window in windows
+               ],
+           )
+           for window in tqdm.tqdm(
+               outputs, total=len(windows), desc="Checking available layers in windows"
+           ):
+               if window is None:
+                   continue
+               new_windows.append(window)
+           p.close()
        windows = new_windows
 
+       # Sort the windows to ensure that the dataset is consistent across GPUs.
+       # Inconsistent ordering can lead to a subset of windows being processed during
+       # "model test" / "model predict" when using multiple GPUs.
+       # We use a hash so that functionality like num_samples limit gets a random
+       # subset of windows (with respect to the hash function choice).
+       windows.sort(
+           key=lambda window: hashlib.sha256(window.name.encode()).hexdigest()
+       )
+
        # Limit windows to num_samples if requested.
        if split_config.num_samples:
-            #
+           # The windows are sorted by hash of window name so this distribution should
+           # be representative of the population.
            windows = windows[0 : split_config.num_samples]
 
-        self.windows = windows
+       self.windows: list = windows
 
        # If we're loading all patches, we need to include the patch details.
-        if split_config.get_load_all_patches():
+       if split_config.get_load_all_patches() and self.patch_size is not None:
            patches = []
            overlap_size = int(
                self.patch_size[0] * split_config.overlap_ratio
@@ -386,6 +407,8 @@ class ModelDataset(torch.utils.data.Dataset):
            )
            for window in self.windows:
                cur_patches = []
+               if window is None:
+                   raise ValueError("Window is None in load_all_patches")
                for col in range(
                    window.bounds[0],
                    window.bounds[2],
@@ -412,7 +435,9 @@ class ModelDataset(torch.utils.data.Dataset):
        """Returns the dataset length."""
        return len(self.windows)
 
-    def __getitem__(
+    def __getitem__(
+        self, idx: int
+    ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
        """Read one training example.
 
        Args:
@@ -429,7 +454,7 @@ class ModelDataset(torch.utils.data.Dataset):
            window, bounds, (patch_idx, num_patches) = window
        elif self.patch_size:
 
-            def get_patch_range(n_patch, n_window):
+           def get_patch_range(n_patch: int, n_window: int) -> list[int]:
                if n_patch > n_window:
                    # Select arbitrary range containing the entire window.
                    # Basically arbitrarily padding the window to get to patch size.
@@ -459,7 +484,7 @@ class ModelDataset(torch.utils.data.Dataset):
            bounds = window.bounds
 
        # Read the inputs and targets.
-        def read_input(data_input: DataInput):
+       def read_input(data_input: DataInput) -> torch.Tensor:
            # First enumerate all options of individual layers to read.
            layer_options = []
            for layer_name in data_input.layers:
@@ -473,7 +498,13 @@ class ModelDataset(torch.utils.data.Dataset):
            # the options, as well as picking multiple for series inputs.
            layer = random.choice(layer_options)
            layer_dir = window.path / "layers" / layer
-
+
+           # The model config may reference a specific group within a layer, like
+           # "image.2" in a dataset that has a layer "image" with max_matches > 1.
+           # So we need to split off the period. Layer names should not contain
+           # period.
+           layer_ds_key = layer.split(".")[0]
+           layer_config = self.dataset.layers[layer_ds_key]
 
            if data_input.data_type == "raster":
                assert isinstance(layer_config, RasterLayerConfig)
@@ -481,6 +512,8 @@ class ModelDataset(torch.utils.data.Dataset):
                # See what different sets of bands we need to read to get all the
                # configured bands.
                needed_bands = data_input.bands
+               if needed_bands is None:
+                   raise ValueError(f"No bands specified for {layer}")
                needed_band_indexes = {}
                for i, band in enumerate(needed_bands):
                    needed_band_indexes[band] = i
@@ -488,6 +521,8 @@ class ModelDataset(torch.utils.data.Dataset):
                for band_set in layer_config.band_sets:
                    needed_src_indexes = []
                    needed_dst_indexes = []
+                   if band_set.bands is None:
+                       continue
                    for i, band in enumerate(band_set.bands):
                        if band not in needed_band_indexes:
                            continue
@@ -514,12 +549,20 @@ class ModelDataset(torch.utils.data.Dataset):
                    _, final_bounds = band_set.get_final_projection_and_bounds(
                        window.projection, bounds
                    )
+                   if band_set.format is None:
+                       raise ValueError(f"No format specified for {layer}")
                    raster_format = load_raster_format(
                        RasterFormatConfig(band_set.format["name"], band_set.format)
                    )
+                   if band_set.bands is None:
+                       # Raising Error as It is unclear the intended behavior here.
+                       raise ValueError("No bands specified for band set")
                    cur_path = layer_dir / "_".join(band_set.bands)
+                   if final_bounds is None:
+                       raise ValueError("Final bounds are None")
                    src = raster_format.decode_raster(cur_path, final_bounds)
-
+                   if src is None:
+                       raise ValueError(f"Source is None for {data_input}")
                    # Resize to patch size if needed.
                    # This is for band sets that are stored at a lower resolution.
                    # Here we assume that it is a multiple.
@@ -594,7 +637,7 @@ class RetryDataset(torch.utils.data.Dataset):
 
    def __init__(
        self, dataset: torch.utils.data.Dataset, retries: int = 3, delay: float = 5
-    ):
+    ) -> None:
        """Create a new RetryDataset.
 
        Args:
@@ -606,7 +649,7 @@ class RetryDataset(torch.utils.data.Dataset):
        self.retries = retries
        self.delay = delay
 
-    def __len__(self):
+    def __len__(self) -> int:
        """Return length of the dataset."""
        return len(self.dataset)
 
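The `ModelDataset` change above sorts windows by the SHA-256 hash of their names, so the ordering is identical across GPUs and the `num_samples` limit takes a stable, pseudo-random prefix. A small standalone sketch of that ordering trick (the window names are made up):

```python
# Sketch only: deterministic but effectively shuffled ordering via name hashing.
import hashlib

window_names = ["seattle_2020", "nairobi_2021", "oslo_2019", "lima_2022"]

# Sorting by the hex digest is reproducible across processes and runs,
# yet unrelated to the original name order, so a prefix behaves like a
# fixed random sample of the windows.
ordered = sorted(
    window_names, key=lambda name: hashlib.sha256(name.encode()).hexdigest()
)

num_samples = 2
print(ordered)
print(ordered[:num_samples])
```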
rslearn/train/lightning_module.py
CHANGED
@@ -49,7 +49,7 @@ class RestoreConfig:
        """Returns the state dict configured in this RestoreConfig."""
        print(f"loading state dict from {self.restore_path}")
        with self.restore_path.open("rb") as f:
-            state_dict = torch.load(f)
+           state_dict = torch.load(f, map_location="cpu")
        for k in self.selector:
            state_dict = state_dict[k]
 
@@ -124,6 +124,7 @@ class RslearnLightningModule(L.LightningModule):
        self.plateau_min_lr = plateau_min_lr
        self.plateau_cooldown = plateau_cooldown
        self.visualize_dir = visualize_dir
+       self.restore_config = restore_config
 
        if print_parameters:
            for name, param in self.named_parameters():
@@ -132,8 +133,19 @@ class RslearnLightningModule(L.LightningModule):
        if print_model:
            print(self.model)
 
-
-
+       self.epochs = 0
+
+       metrics = self.task.get_metrics()
+       self.val_metrics = metrics.clone(prefix="val_")
+       self.test_metrics = metrics.clone(prefix="test_")
+
+       self.schedulers: dict = {}
+
+   def on_fit_start(self) -> None:
+       """Called when the fit begins."""
+       # Only restore if doing a fresh fit.
+       if self.trainer.ckpt_path is None and self.restore_config:
+           state_dict = self.restore_config.get_state_dict()
            missing_keys, unexpected_keys = self.model.load_state_dict(
                state_dict, strict=False
            )
@@ -142,14 +154,6 @@ class RslearnLightningModule(L.LightningModule):
                f"warning: restore yielded missing_keys={missing_keys} and unexpected_keys={unexpected_keys}"
            )
 
-        self.epochs = 0
-
-        metrics = self.task.get_metrics()
-        self.val_metrics = metrics.clone(prefix="val_")
-        self.test_metrics = metrics.clone(prefix="test_")
-
-        self.schedulers = {}
-
    def configure_optimizers(self) -> OptimizerLRSchedulerConfig:
        """Initialize the optimizer and learning rate scheduler.
 
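`RestoreConfig.get_state_dict` now loads checkpoints with `map_location="cpu"`, and the restore itself moved into `on_fit_start`, gated on no Lightning checkpoint being resumed. A rough standalone sketch of the load-and-select pattern (the file path and selector keys here are invented for illustration):

```python
# Sketch only: load a checkpoint onto CPU and drill into it with selector keys.
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
torch.save({"state_dict": {"model": model.state_dict()}}, "/tmp/example_ckpt.pt")

selector = ["state_dict", "model"]  # analogous to RestoreConfig.selector
with open("/tmp/example_ckpt.pt", "rb") as f:
    # map_location="cpu" avoids device errors when the checkpoint was written
    # on a GPU that is not present at load time.
    state_dict = torch.load(f, map_location="cpu")
for k in selector:
    state_dict = state_dict[k]

missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
print(missing_keys, unexpected_keys)
```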
rslearn/train/prediction_writer.py
CHANGED
@@ -8,7 +8,12 @@ from lightning.pytorch import LightningModule, Trainer
 from lightning.pytorch.callbacks import BasePredictionWriter
 from upath import UPath
 
-from rslearn.config import
+from rslearn.config import (
+    LayerType,
+    RasterFormatConfig,
+    RasterLayerConfig,
+    VectorLayerConfig,
+)
 from rslearn.dataset import Dataset
 from rslearn.utils.array import copy_spatial_array
 from rslearn.utils.raster_format import load_raster_format
@@ -20,17 +25,14 @@ from .lightning_module import RslearnLightningModule
 class PatchPredictionMerger:
     """Base class for merging predictions from multiple patches."""
 
-    def merge(
-
-    ) -> tuple[Sequence[Any], Sequence[Any]]:
-        """Merge the outputs and metadatas.
+    def merge(self, outputs: Sequence[Any]) -> tuple[Sequence[Any]]:
+        """Merge the outputs.
 
        Args:
            outputs: the outputs to process.
-            metadatas: the metadatas to process.
 
        Returns:
-            the merged outputs
+           the merged outputs.
        """
        raise NotImplementedError
 
@@ -57,6 +59,7 @@ class RslearnWriter(BasePredictionWriter):
            output_layer: which layer to write the outputs under.
            path_options: additional options for path to pass to fsspec
            selector: keys to access the desired output in the output dict if needed.
+               e.g ["key1", "key2"] gets output["key1"]["key2"]
            merger: merger to use to merge outputs from overlapped patches.
        """
        super().__init__(write_interval="batch")
@@ -65,13 +68,16 @@ class RslearnWriter(BasePredictionWriter):
        self.path = UPath(path, **path_options)
        self.dataset = Dataset(self.path)
        self.layer_config = self.dataset.layers[self.output_layer]
-
+       # TODO: This is a bit of a hack to get the type checker to be happy.
+       self.format: Any
        if self.layer_config.layer_type == LayerType.RASTER:
+           assert isinstance(self.layer_config, RasterLayerConfig)
            band_cfg = self.layer_config.band_sets[0]
            self.format = load_raster_format(
                RasterFormatConfig(band_cfg.format["name"], band_cfg.format)
            )
        elif self.layer_config.layer_type == LayerType.VECTOR:
+           assert isinstance(self.layer_config, VectorLayerConfig)
            self.format = load_vector_format(self.layer_config.format)
        else:
            raise ValueError(f"invalid layer type {self.layer_config.layer_type}")
@@ -81,7 +87,7 @@ class RslearnWriter(BasePredictionWriter):
        # Map from window name to pending data to write.
        # This is used when windows are split up into patches, so the data from all the
        # patches of each window need to be reconstituted.
-        self.pending_outputs = {}
+       self.pending_outputs: dict[str, Any] = {}
 
    def write_on_batch_end(
        self,
@@ -92,7 +98,7 @@ class RslearnWriter(BasePredictionWriter):
        batch: Any,
        batch_idx: int,
        dataloader_idx: int,
-    ):
+    ) -> None:
        """Write a batch of predictions into the rslearn dataset.
 
        Args:
@@ -112,6 +118,8 @@ class RslearnWriter(BasePredictionWriter):
        ]
 
        for output, metadata in zip(outputs, metadatas):
+           if not isinstance(output, dict):
+               raise ValueError(f"Unsupported output type {type(output)}")
            for k in self.selector:
                output = output[k]
 
@@ -120,7 +128,9 @@ class RslearnWriter(BasePredictionWriter):
            window_bounds = metadata["window_bounds"]
 
            if self.layer_config.layer_type == LayerType.RASTER:
-                if window_name not in self.pending_outputs
+               if window_name not in self.pending_outputs and isinstance(
+                   output, np.ndarray
+               ):
                    self.pending_outputs[window_name] = np.zeros(
                        (
                            output.shape[0],
@@ -167,6 +177,7 @@ class RslearnWriter(BasePredictionWriter):
            )
 
            if self.layer_config.layer_type == LayerType.RASTER:
+               assert isinstance(self.layer_config, RasterLayerConfig)
                band_dir = layer_dir / "_".join(self.layer_config.band_sets[0].bands)
                self.format.encode_raster(
                    band_dir, metadata["projection"], window_bounds, pending_output
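The expanded `selector` docstring describes drilling into nested prediction dicts key by key. A tiny sketch of that access pattern (the keys and output structure are hypothetical):

```python
# Sketch only: each selector key indexes one level deeper into the output dict.
from typing import Any


def apply_selector(output: dict[str, Any], selector: list[str]) -> Any:
    for k in selector:
        output = output[k]
    return output


prediction = {"key1": {"key2": [0.1, 0.9]}}
print(apply_selector(prediction, ["key1", "key2"]))  # [0.1, 0.9]
```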
rslearn/train/tasks/classification.py
CHANGED
@@ -26,8 +26,8 @@ class ClassificationTask(BasicTask):
    def __init__(
        self,
        property_name: str,
-        classes: list
-        filters: list[tuple[str, str]]
+       classes: list,  # TODO: Should this be a list of str or int or can it be both?
+       filters: list[tuple[str, str]] = [],
        read_class_id: bool = False,
        allow_invalid: bool = False,
        skip_unknown_categories: bool = False,
@@ -37,7 +37,7 @@ class ClassificationTask(BasicTask):
        f1_metric_kwargs: dict[str, Any] = {},
        positive_class: str | None = None,
        positive_class_threshold: float = 0.5,
-        **kwargs,
+       **kwargs: Any,
    ):
        """Initialize a new ClassificationTask.
 
@@ -95,9 +95,6 @@ class ClassificationTask(BasicTask):
        else:
            self.positive_class_id = self.classes.index(self.positive_class)
 
-        if not self.filters:
-            self.filters = []
-
    def process_inputs(
        self,
        raw_inputs: dict[str, torch.Tensor | list[Feature]],
@@ -120,6 +117,8 @@ class ClassificationTask(BasicTask):
 
        data = raw_inputs["targets"]
        for feat in data:
+           if feat.properties is None:
+               continue
            for property_name, property_value in self.filters:
                if feat.properties.get(property_name) != property_value:
                    continue
@@ -178,7 +177,7 @@ class ClassificationTask(BasicTask):
        class_idx = probs.argmax()
 
        if not self.read_class_id:
-            value = self.classes[class_idx]
+           value = self.classes[class_idx]  # type: ignore
        else:
            value = class_idx
 
@@ -192,7 +191,7 @@ class ClassificationTask(BasicTask):
                self.property_name: value,
            },
        )
-        if self.prob_property:
+       if self.prob_property is not None and feature.properties is not None:
            feature.properties[self.prob_property] = probs.tolist()
        return [feature]
 
@@ -215,6 +214,8 @@ class ClassificationTask(BasicTask):
        image = super().visualize(input_dict, target_dict, output)["image"]
        image = Image.fromarray(image)
        draw = ImageDraw.Draw(image)
+       if target_dict is None:
+           raise ValueError("target_dict is required for visualization")
        target_class = self.classes[target_dict["class"]]
        output_class = self.classes[output.argmax()]
        text = f"Label: {target_class}\nOutput: {output_class}"