rslearn 0.0.1__py3-none-any.whl → 0.0.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166):
  1. rslearn/arg_parser.py +31 -0
  2. rslearn/config/__init__.py +6 -12
  3. rslearn/config/dataset.py +520 -401
  4. rslearn/const.py +9 -15
  5. rslearn/data_sources/__init__.py +8 -23
  6. rslearn/data_sources/aws_landsat.py +242 -98
  7. rslearn/data_sources/aws_open_data.py +111 -151
  8. rslearn/data_sources/aws_sentinel1.py +131 -0
  9. rslearn/data_sources/climate_data_store.py +471 -0
  10. rslearn/data_sources/copernicus.py +884 -12
  11. rslearn/data_sources/data_source.py +43 -12
  12. rslearn/data_sources/earthdaily.py +484 -0
  13. rslearn/data_sources/earthdata_srtm.py +282 -0
  14. rslearn/data_sources/eurocrops.py +242 -0
  15. rslearn/data_sources/gcp_public_data.py +578 -222
  16. rslearn/data_sources/google_earth_engine.py +461 -135
  17. rslearn/data_sources/local_files.py +219 -150
  18. rslearn/data_sources/openstreetmap.py +51 -89
  19. rslearn/data_sources/planet.py +24 -60
  20. rslearn/data_sources/planet_basemap.py +275 -0
  21. rslearn/data_sources/planetary_computer.py +798 -0
  22. rslearn/data_sources/usda_cdl.py +195 -0
  23. rslearn/data_sources/usgs_landsat.py +115 -83
  24. rslearn/data_sources/utils.py +249 -61
  25. rslearn/data_sources/vector_source.py +1 -0
  26. rslearn/data_sources/worldcereal.py +449 -0
  27. rslearn/data_sources/worldcover.py +144 -0
  28. rslearn/data_sources/worldpop.py +153 -0
  29. rslearn/data_sources/xyz_tiles.py +150 -107
  30. rslearn/dataset/__init__.py +8 -2
  31. rslearn/dataset/add_windows.py +2 -2
  32. rslearn/dataset/dataset.py +40 -51
  33. rslearn/dataset/handler_summaries.py +131 -0
  34. rslearn/dataset/manage.py +313 -74
  35. rslearn/dataset/materialize.py +431 -107
  36. rslearn/dataset/remap.py +29 -4
  37. rslearn/dataset/storage/__init__.py +1 -0
  38. rslearn/dataset/storage/file.py +202 -0
  39. rslearn/dataset/storage/storage.py +140 -0
  40. rslearn/dataset/window.py +181 -44
  41. rslearn/lightning_cli.py +454 -0
  42. rslearn/log_utils.py +24 -0
  43. rslearn/main.py +384 -181
  44. rslearn/models/anysat.py +215 -0
  45. rslearn/models/attention_pooling.py +177 -0
  46. rslearn/models/clay/clay.py +231 -0
  47. rslearn/models/clay/configs/metadata.yaml +295 -0
  48. rslearn/models/clip.py +68 -0
  49. rslearn/models/component.py +111 -0
  50. rslearn/models/concatenate_features.py +103 -0
  51. rslearn/models/conv.py +63 -0
  52. rslearn/models/croma.py +306 -0
  53. rslearn/models/detr/__init__.py +5 -0
  54. rslearn/models/detr/box_ops.py +103 -0
  55. rslearn/models/detr/detr.py +504 -0
  56. rslearn/models/detr/matcher.py +107 -0
  57. rslearn/models/detr/position_encoding.py +114 -0
  58. rslearn/models/detr/transformer.py +429 -0
  59. rslearn/models/detr/util.py +24 -0
  60. rslearn/models/dinov3.py +177 -0
  61. rslearn/models/faster_rcnn.py +30 -28
  62. rslearn/models/feature_center_crop.py +53 -0
  63. rslearn/models/fpn.py +19 -8
  64. rslearn/models/galileo/__init__.py +5 -0
  65. rslearn/models/galileo/galileo.py +595 -0
  66. rslearn/models/galileo/single_file_galileo.py +1678 -0
  67. rslearn/models/module_wrapper.py +65 -0
  68. rslearn/models/molmo.py +69 -0
  69. rslearn/models/multitask.py +384 -28
  70. rslearn/models/olmoearth_pretrain/__init__.py +1 -0
  71. rslearn/models/olmoearth_pretrain/model.py +421 -0
  72. rslearn/models/olmoearth_pretrain/norm.py +86 -0
  73. rslearn/models/panopticon.py +170 -0
  74. rslearn/models/panopticon_data/sensors/drone.yaml +32 -0
  75. rslearn/models/panopticon_data/sensors/enmap.yaml +904 -0
  76. rslearn/models/panopticon_data/sensors/goes.yaml +9 -0
  77. rslearn/models/panopticon_data/sensors/himawari.yaml +9 -0
  78. rslearn/models/panopticon_data/sensors/intuition.yaml +606 -0
  79. rslearn/models/panopticon_data/sensors/landsat8.yaml +84 -0
  80. rslearn/models/panopticon_data/sensors/modis_terra.yaml +99 -0
  81. rslearn/models/panopticon_data/sensors/qb2_ge1.yaml +34 -0
  82. rslearn/models/panopticon_data/sensors/sentinel1.yaml +85 -0
  83. rslearn/models/panopticon_data/sensors/sentinel2.yaml +97 -0
  84. rslearn/models/panopticon_data/sensors/superdove.yaml +60 -0
  85. rslearn/models/panopticon_data/sensors/wv23.yaml +63 -0
  86. rslearn/models/pick_features.py +17 -10
  87. rslearn/models/pooling_decoder.py +60 -7
  88. rslearn/models/presto/__init__.py +5 -0
  89. rslearn/models/presto/presto.py +297 -0
  90. rslearn/models/presto/single_file_presto.py +926 -0
  91. rslearn/models/prithvi.py +1147 -0
  92. rslearn/models/resize_features.py +59 -0
  93. rslearn/models/sam2_enc.py +13 -9
  94. rslearn/models/satlaspretrain.py +38 -18
  95. rslearn/models/simple_time_series.py +188 -77
  96. rslearn/models/singletask.py +24 -13
  97. rslearn/models/ssl4eo_s12.py +40 -30
  98. rslearn/models/swin.py +44 -32
  99. rslearn/models/task_embedding.py +250 -0
  100. rslearn/models/terramind.py +256 -0
  101. rslearn/models/trunk.py +139 -0
  102. rslearn/models/unet.py +68 -22
  103. rslearn/models/upsample.py +48 -0
  104. rslearn/models/use_croma.py +508 -0
  105. rslearn/template_params.py +26 -0
  106. rslearn/tile_stores/__init__.py +41 -18
  107. rslearn/tile_stores/default.py +409 -0
  108. rslearn/tile_stores/tile_store.py +236 -132
  109. rslearn/train/all_patches_dataset.py +530 -0
  110. rslearn/train/callbacks/adapters.py +53 -0
  111. rslearn/train/callbacks/freeze_unfreeze.py +348 -17
  112. rslearn/train/callbacks/gradients.py +129 -0
  113. rslearn/train/callbacks/peft.py +116 -0
  114. rslearn/train/data_module.py +444 -20
  115. rslearn/train/dataset.py +588 -235
  116. rslearn/train/lightning_module.py +192 -62
  117. rslearn/train/model_context.py +88 -0
  118. rslearn/train/optimizer.py +31 -0
  119. rslearn/train/prediction_writer.py +319 -84
  120. rslearn/train/scheduler.py +92 -0
  121. rslearn/train/tasks/classification.py +55 -28
  122. rslearn/train/tasks/detection.py +132 -76
  123. rslearn/train/tasks/embedding.py +120 -0
  124. rslearn/train/tasks/multi_task.py +28 -14
  125. rslearn/train/tasks/per_pixel_regression.py +291 -0
  126. rslearn/train/tasks/regression.py +161 -44
  127. rslearn/train/tasks/segmentation.py +428 -53
  128. rslearn/train/tasks/task.py +6 -5
  129. rslearn/train/transforms/__init__.py +1 -1
  130. rslearn/train/transforms/concatenate.py +54 -10
  131. rslearn/train/transforms/crop.py +29 -11
  132. rslearn/train/transforms/flip.py +18 -6
  133. rslearn/train/transforms/mask.py +78 -0
  134. rslearn/train/transforms/normalize.py +101 -17
  135. rslearn/train/transforms/pad.py +19 -7
  136. rslearn/train/transforms/resize.py +83 -0
  137. rslearn/train/transforms/select_bands.py +76 -0
  138. rslearn/train/transforms/sentinel1.py +75 -0
  139. rslearn/train/transforms/transform.py +89 -70
  140. rslearn/utils/__init__.py +2 -6
  141. rslearn/utils/array.py +8 -6
  142. rslearn/utils/feature.py +2 -2
  143. rslearn/utils/fsspec.py +90 -1
  144. rslearn/utils/geometry.py +347 -7
  145. rslearn/utils/get_utm_ups_crs.py +2 -3
  146. rslearn/utils/grid_index.py +5 -5
  147. rslearn/utils/jsonargparse.py +178 -0
  148. rslearn/utils/mp.py +4 -3
  149. rslearn/utils/raster_format.py +268 -116
  150. rslearn/utils/rtree_index.py +64 -17
  151. rslearn/utils/sqlite_index.py +7 -1
  152. rslearn/utils/vector_format.py +252 -97
  153. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/METADATA +532 -283
  154. rslearn-0.0.21.dist-info/RECORD +167 -0
  155. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/WHEEL +1 -1
  156. rslearn-0.0.21.dist-info/licenses/NOTICE +115 -0
  157. rslearn/data_sources/raster_source.py +0 -309
  158. rslearn/models/registry.py +0 -5
  159. rslearn/tile_stores/file.py +0 -242
  160. rslearn/utils/mgrs.py +0 -24
  161. rslearn/utils/utils.py +0 -22
  162. rslearn-0.0.1.dist-info/RECORD +0 -88
  163. /rslearn/{data_sources/geotiff.py → py.typed} +0 -0
  164. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/entry_points.txt +0 -0
  165. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info/licenses}/LICENSE +0 -0
  166. {rslearn-0.0.1.dist-info → rslearn-0.0.21.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,131 @@
1
+ """This module contains dataclasses for summarizing the results of dataset operations.
2
+
3
+ They can be used by callers to emit telemetry / logs, or discarded.
4
+ """
5
+
6
+ from dataclasses import dataclass
7
+
8
+
9
@dataclass
class LayerPrepareSummary:
    """Outcome of the prepare step for one layer across the requested windows."""

    # Name of the layer within the dataset.
    layer_name: str
    # Identifier of the layer's data source ("N/A" when the layer has none).
    data_source_name: str

    # Elapsed time spent on this layer, in seconds.
    duration_seconds: float

    # Windows that were newly prepared with at least an acceptable result.
    windows_prepared: int
    # Windows that were not processed because they were already prepared
    # (or because the layer has no data source).
    windows_skipped: int
    # Windows with no item groups while min_matches > 0.
    windows_rejected: int
    # Number of get_items calls made, counting the first try and retries.
    get_items_attempts: int
25
+
26
+
27
@dataclass
class PrepareDatasetWindowsSummary:
    """Telemetry-oriented result of a prepare_dataset_windows call."""

    # Elapsed time of the whole operation, in seconds.
    duration_seconds: float

    # Number of windows passed in by the caller.
    total_windows_requested: int

    # One entry per dataset layer that was visited.
    layer_summaries: list[LayerPrepareSummary]
39
+
40
+
41
@dataclass
class IngestCounts:
    """Ingestion counts for the case where they are known."""

    # Number of items ingested.
    items_ingested: int
    # Number of geometries ingested.
    geometries_ingested: int
47
+
48
+
49
@dataclass
class UnknownIngestCounts:
    """Marker for ingestion whose true counts are unknown after a partial failure.

    Only the attempted quantities are recorded.
    """

    # Number of items the ingest attempted to handle.
    items_attempted: int
    # Number of geometries the ingest attempted to handle.
    geometries_attempted: int
55
+
56
+
57
@dataclass
class LayerIngestSummary:
    """Outcome of the ingest step for one layer."""

    # Name of the layer within the dataset.
    layer_name: str
    # Identifier of the layer's data source.
    data_source_name: str

    # Elapsed time spent on this layer, in seconds.
    duration_seconds: float

    # Known counts, or the attempted counts when the real ones are unknown.
    ingest_counts: IngestCounts | UnknownIngestCounts
    # Number of ingest attempts made for this layer.
    ingest_attempts: int
71
+
72
+
73
@dataclass
class IngestDatasetJobsSummary:
    """Telemetry-oriented result of ingesting a set of jobs."""

    # Elapsed time of the whole operation, in seconds.
    duration_seconds: float

    # Number of jobs in the set.
    num_jobs: int

    # One entry per layer.
    layer_summaries: list[LayerIngestSummary]
85
+
86
+
87
@dataclass
class MaterializeWindowLayerSummary:
    """Outcome of materializing one layer of one window."""

    # True when nothing was materialized for this window/layer pair.
    skipped: bool
    # Number of materialize calls made, counting the first try and retries.
    materialize_attempts: int
93
+
94
+
95
@dataclass
class MaterializeWindowLayersSummary:
    """Outcome of materializing one layer for every window in a materialize call."""

    # Name of the layer within the dataset.
    layer_name: str
    # Identifier of the layer's data source ("N/A" when the layer has none).
    data_source_name: str

    # Elapsed time spent on this layer, in seconds.
    duration_seconds: float

    # Number of windows passed in by the caller.
    total_windows_requested: int
    # Requested windows minus those that were skipped.
    num_windows_materialized: int
    # Sum of materialize attempts over all windows for this layer.
    materialize_attempts: int
110
+
111
+
112
@dataclass
class MaterializeDatasetWindowsSummary:
    """Telemetry-oriented result of a materialize_dataset_windows call."""

    # Elapsed time of the whole operation, in seconds.
    duration_seconds: float

    # Number of windows passed in by the caller.
    total_windows_requested: int

    # One entry per dataset layer that was visited.
    layer_summaries: list[MaterializeWindowLayersSummary]
124
+
125
+
126
@dataclass
class ErrorOutcome:
    """Placeholder result for an errored operation; its contents are still TBD."""

    # Elapsed time, in seconds.
    duration_seconds: float
rslearn/dataset/manage.py CHANGED
@@ -1,19 +1,89 @@
1
1
  """Functions to manage datasets."""
2
2
 
3
- import rslearn.data_sources
4
- from rslearn.config import LayerConfig, LayerType
3
+ import random
4
+ import time
5
+ from collections.abc import Callable
6
+ from datetime import timedelta
7
+ from typing import Any
8
+
9
+ from rslearn.config import (
10
+ LayerConfig,
11
+ LayerType,
12
+ )
5
13
  from rslearn.data_sources import DataSource, Item
6
- from rslearn.tile_stores import TileStore, get_tile_store_for_layer
7
- from rslearn.utils import logger
14
+ from rslearn.dataset.handler_summaries import (
15
+ LayerPrepareSummary,
16
+ MaterializeDatasetWindowsSummary,
17
+ MaterializeWindowLayersSummary,
18
+ MaterializeWindowLayerSummary,
19
+ PrepareDatasetWindowsSummary,
20
+ )
21
+ from rslearn.log_utils import get_logger
22
+ from rslearn.tile_stores import TileStore, get_tile_store_with_layer
8
23
 
9
24
  from .dataset import Dataset
10
- from .materialize import Materializers
25
+ from .materialize import Materializer, RasterMaterializer, VectorMaterializer
11
26
  from .window import Window, WindowLayerData
12
27
 
28
+ logger = get_logger(__name__)
29
+
30
+
31
class AttemptsCounter:
    """Mutable tally of attempts made at an operation.

    The first try and each subsequent retry all count as one attempt.
    """

    def __init__(self) -> None:
        """Start with no attempts recorded."""
        self.value = 0

    def increment(self) -> None:
        """Record one more attempt."""
        self.value = self.value + 1
41
+
42
+
43
def retry(
    fn: Callable,
    retry_max_attempts: int,
    retry_backoff: timedelta,
    attempts_counter: AttemptsCounter | None = None,
) -> Any:
    """Call fn, retrying with a randomized, growing backoff when it raises.

    fn is invoked until it returns without raising an Exception or until the
    allowed retries are used up. The final attempt is not guarded, so its
    exception propagates to the caller.

    Args:
        fn: zero-argument callable to invoke.
        retry_max_attempts: number of retries permitted in addition to the initial
            attempt; with 0 (or less) fn is called exactly once.
        retry_backoff: base wait between attempts. The sleep after the k-th failed
            attempt is (retry_backoff * k) * r, with r drawn uniformly from [1, 2).
        attempts_counter: optional counter incremented once per call to fn.

    Returns:
        whatever fn returns on its first successful call.

    Raises:
        Exception: whatever fn raises on the final attempt.
    """

    def record_attempt() -> None:
        if attempts_counter:
            attempts_counter.increment()

    failures_so_far = 0
    while failures_so_far < retry_max_attempts:
        record_attempt()
        try:
            return fn()
        except Exception as e:
            failures_so_far += 1
            logger.warning(f"Retrying after catching error in retry loop: {e}")
            # Backoff grows linearly with the number of failures; the random
            # factor spreads out retries.
            scaled_backoff = retry_backoff.total_seconds() * failures_so_far
            time.sleep(scaled_backoff * (1 + random.random()))

    # Retries exhausted (or none allowed): final call, errors propagate.
    record_attempt()
    return fn()
78
+
13
79
 
14
80
  def prepare_dataset_windows(
15
- dataset: Dataset, windows: list[Window], force: bool = False
16
- ) -> None:
81
+ dataset: Dataset,
82
+ windows: list[Window],
83
+ force: bool = False,
84
+ retry_max_attempts: int = 0,
85
+ retry_backoff: timedelta = timedelta(minutes=1),
86
+ ) -> PrepareDatasetWindowsSummary:
17
87
  """Prepare windows in a dataset.
18
88
 
19
89
  Preparing a window involves looking up items corresponding to the window in each of
@@ -24,28 +94,73 @@ def prepare_dataset_windows(
24
94
  windows: the windows to prepare
25
95
  force: whether to prepare windows even if they were previously prepared
26
96
  (default false)
97
+ retry_max_attempts: set greater than zero to retry for this many attempts in
98
+ case of error.
99
+ retry_backoff: how long to wait before retrying (see retry).
100
+
101
+ Returns:
102
+ a summary of the prepare operation, fit for telemetry purposes
27
103
  """
104
+ start_time = time.monotonic()
105
+ layer_summaries: list[LayerPrepareSummary] = []
106
+
28
107
  # Iterate over retrieved layers, and prepare each one.
29
108
  for layer_name, layer_cfg in dataset.layers.items():
109
+ layer_start_time = time.monotonic()
110
+
30
111
  if not layer_cfg.data_source:
112
+ layer_summaries.append(
113
+ LayerPrepareSummary(
114
+ layer_name=layer_name,
115
+ data_source_name="N/A",
116
+ duration_seconds=time.monotonic() - layer_start_time,
117
+ windows_prepared=0,
118
+ windows_skipped=len(windows),
119
+ windows_rejected=0,
120
+ get_items_attempts=0,
121
+ )
122
+ )
31
123
  continue
124
+ data_source_cfg = layer_cfg.data_source
125
+ min_matches = data_source_cfg.query_config.min_matches
32
126
 
33
127
  # Get windows that need to be prepared for this layer.
128
+ # Also track which windows are skipped vs previously rejected.
34
129
  needed_windows = []
130
+ windows_skipped = 0
131
+ windows_rejected = 0
35
132
  for window in windows:
36
133
  layer_datas = window.load_layer_datas()
37
134
  if layer_name in layer_datas and not force:
135
+ # Window already has layer data - check if it was previously rejected
136
+ layer_data = layer_datas[layer_name]
137
+ if len(layer_data.serialized_item_groups) == 0 and min_matches > 0:
138
+ # Previously rejected due to min_matches
139
+ windows_rejected += 1
140
+ else:
141
+ # Successfully prepared previously
142
+ windows_skipped += 1
38
143
  continue
39
144
  needed_windows.append(window)
40
- print(f"Preparing {len(needed_windows)} windows for layer {layer_name}")
145
+ logger.info(f"Preparing {len(needed_windows)} windows for layer {layer_name}")
146
+
41
147
  if len(needed_windows) == 0:
148
+ layer_summaries.append(
149
+ LayerPrepareSummary(
150
+ layer_name=layer_name,
151
+ data_source_name=data_source_cfg.class_path,
152
+ duration_seconds=time.monotonic() - layer_start_time,
153
+ windows_prepared=0,
154
+ windows_skipped=windows_skipped,
155
+ windows_rejected=windows_rejected,
156
+ get_items_attempts=0,
157
+ )
158
+ )
42
159
  continue
43
160
 
44
161
  # Create data source after checking for at least one window so it can be fast
45
162
  # if there are no windows to prepare.
46
- data_source = rslearn.data_sources.data_source_from_config(
47
- layer_cfg, dataset.path
48
- )
163
+ data_source = layer_cfg.instantiate_data_source(dataset.path)
49
164
 
50
165
  # Get STGeometry for each window.
51
166
  geometries = []
@@ -53,13 +168,13 @@ def prepare_dataset_windows(
53
168
  geometry = window.get_geometry()
54
169
 
55
170
  # Apply temporal modifiers.
56
- time_offset = layer_cfg.data_source.time_offset
171
+ time_offset = data_source_cfg.time_offset
57
172
  if geometry.time_range and time_offset:
58
173
  geometry.time_range = (
59
174
  geometry.time_range[0] + time_offset,
60
175
  geometry.time_range[1] + time_offset,
61
176
  )
62
- duration = layer_cfg.data_source.duration
177
+ duration = data_source_cfg.duration
63
178
  if geometry.time_range and duration:
64
179
  geometry.time_range = (
65
180
  geometry.time_range[0],
@@ -68,7 +183,15 @@ def prepare_dataset_windows(
68
183
 
69
184
  geometries.append(geometry)
70
185
 
71
- results = data_source.get_items(geometries, layer_cfg.data_source.query_config)
186
+ attempts_counter = AttemptsCounter()
187
+ results = retry(
188
+ fn=lambda: data_source.get_items(geometries, data_source_cfg.query_config),
189
+ retry_max_attempts=retry_max_attempts,
190
+ retry_backoff=retry_backoff,
191
+ attempts_counter=attempts_counter,
192
+ )
193
+
194
+ windows_prepared = 0
72
195
  for window, result in zip(needed_windows, results):
73
196
  layer_datas = window.load_layer_datas()
74
197
  layer_datas[layer_name] = WindowLayerData(
@@ -79,8 +202,39 @@ def prepare_dataset_windows(
79
202
  )
80
203
  window.save_layer_datas(layer_datas)
81
204
 
205
+ # If result is empty and min_matches > 0, window was rejected due to min_matches
206
+ if len(result) == 0 and min_matches > 0:
207
+ windows_rejected += 1
208
+ else:
209
+ windows_prepared += 1
82
210
 
83
- def ingest_dataset_windows(dataset: Dataset, windows: list[Window]) -> None:
211
+ layer_summaries.append(
212
+ LayerPrepareSummary(
213
+ layer_name=layer_name,
214
+ data_source_name=data_source_cfg.class_path,
215
+ duration_seconds=time.monotonic() - layer_start_time,
216
+ windows_prepared=windows_prepared,
217
+ windows_skipped=windows_skipped,
218
+ windows_rejected=windows_rejected,
219
+ get_items_attempts=attempts_counter.value,
220
+ )
221
+ )
222
+
223
+ summary = PrepareDatasetWindowsSummary(
224
+ duration_seconds=time.monotonic() - start_time,
225
+ total_windows_requested=len(windows),
226
+ layer_summaries=layer_summaries,
227
+ )
228
+
229
+ return summary
230
+
231
+
232
+ def ingest_dataset_windows(
233
+ dataset: Dataset,
234
+ windows: list[Window],
235
+ retry_max_attempts: int = 0,
236
+ retry_backoff: timedelta = timedelta(minutes=1),
237
+ ) -> None:
84
238
  """Ingest items for retrieved layers in a dataset.
85
239
 
86
240
  The items associated with the specified windows are downloaded and divided into
@@ -89,6 +243,9 @@ def ingest_dataset_windows(dataset: Dataset, windows: list[Window]) -> None:
89
243
  Args:
90
244
  dataset: the dataset
91
245
  windows: the windows to ingest
246
+ retry_max_attempts: set greater than zero to retry for this many attempts in
247
+ case of error.
248
+ retry_backoff: how long to wait before retrying (see retry).
92
249
  """
93
250
  tile_store = dataset.get_tile_store()
94
251
  for layer_name, layer_cfg in dataset.layers.items():
@@ -97,11 +254,9 @@ def ingest_dataset_windows(dataset: Dataset, windows: list[Window]) -> None:
97
254
  if not layer_cfg.data_source.ingest:
98
255
  continue
99
256
 
100
- data_source = rslearn.data_sources.data_source_from_config(
101
- layer_cfg, dataset.path
102
- )
257
+ data_source = layer_cfg.instantiate_data_source(dataset.path)
103
258
 
104
- geometries_by_item = {}
259
+ geometries_by_item: dict = {}
105
260
  for window in windows:
106
261
  layer_datas = window.load_layer_datas()
107
262
  if layer_name not in layer_datas:
@@ -116,12 +271,20 @@ def ingest_dataset_windows(dataset: Dataset, windows: list[Window]) -> None:
116
271
  geometries_by_item[item].append(geometry)
117
272
 
118
273
  print(f"Ingesting {len(geometries_by_item)} items in layer {layer_name}")
119
- cur_tile_store = get_tile_store_for_layer(tile_store, layer_name, layer_cfg)
120
274
  geometries_and_items = list(geometries_by_item.items())
121
- data_source.ingest(
122
- tile_store=cur_tile_store,
123
- items=[item for item, _ in geometries_and_items],
124
- geometries=[geometries for _, geometries in geometries_and_items],
275
+
276
+ # Use retry loop for the actual data source ingest call.
277
+ def ingest() -> None:
278
+ data_source.ingest(
279
+ tile_store=get_tile_store_with_layer(tile_store, layer_name, layer_cfg),
280
+ items=[item for item, _ in geometries_and_items],
281
+ geometries=[geometries for _, geometries in geometries_and_items],
282
+ )
283
+
284
+ retry(
285
+ fn=ingest,
286
+ retry_max_attempts=retry_max_attempts,
287
+ retry_backoff=retry_backoff,
125
288
  )
126
289
 
127
290
 
@@ -145,47 +308,31 @@ def is_window_ingested(
145
308
  continue
146
309
  if layer_name not in layer_datas:
147
310
  return False
311
+
312
+ layer_tile_store = get_tile_store_with_layer(tile_store, layer_name, layer_cfg)
313
+
148
314
  layer_data = layer_datas[layer_name]
149
315
  for group in layer_data.serialized_item_groups:
150
316
  for serialized_item in group:
151
317
  item = Item.deserialize(serialized_item)
152
318
 
153
- if layer_cfg.layer_type == LayerType.RASTER:
319
+ if layer_cfg.type == LayerType.RASTER:
154
320
  for band_set in layer_cfg.band_sets:
155
- projection, _ = band_set.get_final_projection_and_bounds(
156
- window.projection, window.bounds
157
- )
158
- cur_tile_store = get_tile_store_for_layer(
159
- tile_store, layer_name, layer_cfg
160
- )
161
- layer_prefix = (item.name,)
162
321
  # Make sure that layers exist containing each configured band.
163
322
  # And that those layers are marked completed.
164
- suffixes = cur_tile_store.list_layers(layer_prefix)
165
- needed_suffixes = []
166
- needed_bands = {band for band in band_set.bands}
167
- for suffix in suffixes:
168
- cur_bands = suffix.split("_")
323
+ available_bands = layer_tile_store.get_raster_bands(item.name)
324
+ wanted_bands = {band for band in band_set.bands}
325
+ for cur_bands in available_bands:
169
326
  is_needed = False
170
327
  for band in cur_bands:
171
- if band in needed_bands:
328
+ if band in wanted_bands:
172
329
  is_needed = True
173
- needed_bands.remove(band)
330
+ wanted_bands.remove(band)
174
331
  if not is_needed:
175
332
  continue
176
- needed_suffixes.append(suffix)
177
- if len(needed_bands) > 0:
333
+ if len(wanted_bands) > 0:
178
334
  return False
179
335
 
180
- for suffix in needed_suffixes:
181
- layer_id = (item.name, suffix, str(projection))
182
- ts_layer = get_tile_store_for_layer(
183
- tile_store, layer_name, layer_cfg
184
- ).get_layer(layer_id)
185
- if not ts_layer:
186
- return False
187
- if not ts_layer.get_metadata().properties.get("completed"):
188
- return False
189
336
  return True
190
337
 
191
338
 
@@ -196,7 +343,9 @@ def materialize_window(
196
343
  tile_store: TileStore,
197
344
  layer_name: str,
198
345
  layer_cfg: LayerConfig,
199
- ) -> None:
346
+ retry_max_attempts: int = 0,
347
+ retry_backoff: timedelta = timedelta(minutes=1),
348
+ ) -> MaterializeWindowLayerSummary:
200
349
  """Materialize a window.
201
350
 
202
351
  Args:
@@ -206,11 +355,19 @@ def materialize_window(
206
355
  tile_store: tile store of the dataset to materialize from
207
356
  layer_name: the layer name
208
357
  layer_cfg: the layer config
358
+ retry_max_attempts: set greater than zero to retry for this many attempts in
359
+ case of error.
360
+ retry_backoff: how long to wait before retrying (see retry).
361
+
362
+ Returns:
363
+ a summary of the materialize operation, fit for telemetry purposes
209
364
  """
210
365
  # Check if layer is materialized already.
211
- completed_fname = window.path / "layers" / layer_name / "completed"
212
- if completed_fname.exists():
213
- return
366
+ if window.is_layer_completed(layer_name):
367
+ return MaterializeWindowLayerSummary(
368
+ skipped=True,
369
+ materialize_attempts=0,
370
+ )
214
371
 
215
372
  layer_datas = window.load_layer_datas()
216
373
  if layer_name not in layer_datas:
@@ -219,7 +376,11 @@ def materialize_window(
219
376
  layer_name,
220
377
  window.name,
221
378
  )
222
- return
379
+ return MaterializeWindowLayerSummary(
380
+ skipped=True,
381
+ materialize_attempts=0,
382
+ )
383
+
223
384
  layer_data = layer_datas[layer_name]
224
385
  item_groups = []
225
386
  for serialized_group in layer_data.serialized_item_groups:
@@ -229,6 +390,10 @@ def materialize_window(
229
390
  item_group.append(item)
230
391
  item_groups.append(item_group)
231
392
 
393
+ if layer_cfg.data_source is None:
394
+ raise ValueError("data_source is required")
395
+
396
+ attempts_counter = AttemptsCounter()
232
397
  if layer_cfg.data_source.ingest:
233
398
  if not is_window_ingested(dataset, window, check_layer_name=layer_name):
234
399
  logger.info(
@@ -236,30 +401,62 @@ def materialize_window(
236
401
  layer_name,
237
402
  window.name,
238
403
  )
239
- return
404
+ return MaterializeWindowLayerSummary(
405
+ skipped=True,
406
+ materialize_attempts=0,
407
+ )
240
408
 
241
- print(
409
+ logger.info(
242
410
  f"Materializing {len(item_groups)} item groups in layer {layer_name} from tile store"
243
411
  )
244
412
 
245
- if dataset.materializer_name:
246
- materializer = Materializers[dataset.materializer_name]
413
+ materializer: Materializer
414
+ if layer_cfg.type == LayerType.RASTER:
415
+ materializer = RasterMaterializer()
416
+ elif layer_cfg.type == LayerType.VECTOR:
417
+ materializer = VectorMaterializer()
247
418
  else:
248
- materializer = Materializers[layer_cfg.layer_type.value]
249
- materializer.materialize(tile_store, window, layer_name, layer_cfg, item_groups)
419
+ raise ValueError(f"unknown layer type {layer_cfg.type}")
420
+
421
+ retry(
422
+ fn=lambda: materializer.materialize(
423
+ get_tile_store_with_layer(tile_store, layer_name, layer_cfg),
424
+ window,
425
+ layer_name,
426
+ layer_cfg,
427
+ item_groups,
428
+ ),
429
+ retry_max_attempts=retry_max_attempts,
430
+ retry_backoff=retry_backoff,
431
+ attempts_counter=attempts_counter,
432
+ )
250
433
 
251
434
  else:
252
435
  # This window is meant to be materialized directly from the data source.
253
- print(
436
+ logger.info(
254
437
  f"Materializing {len(item_groups)} item groups in layer {layer_name} via data source"
255
438
  )
256
- try:
257
- data_source.materialize(window, item_groups, layer_name, layer_cfg)
258
- except Exception as e:
259
- print(f"error materializing window {window.name}: {e}")
439
+ retry(
440
+ fn=lambda: data_source.materialize(
441
+ window, item_groups, layer_name, layer_cfg
442
+ ),
443
+ retry_max_attempts=retry_max_attempts,
444
+ retry_backoff=retry_backoff,
445
+ attempts_counter=attempts_counter,
446
+ )
260
447
 
448
+ return MaterializeWindowLayerSummary(
449
+ skipped=False,
450
+ materialize_attempts=attempts_counter.value,
451
+ )
261
452
 
262
- def materialize_dataset_windows(dataset: Dataset, windows: list[Window]) -> None:
453
+
454
+ def materialize_dataset_windows(
455
+ dataset: Dataset,
456
+ windows: list[Window],
457
+ retry_max_attempts: int = 0,
458
+ retry_backoff: timedelta = timedelta(minutes=1),
459
+ ) -> MaterializeDatasetWindowsSummary:
263
460
  """Materialize items for retrieved layers in a dataset.
264
461
 
265
462
  The portions of items corresponding to dataset windows are extracted from the tile
@@ -268,17 +465,59 @@ def materialize_dataset_windows(dataset: Dataset, windows: list[Window]) -> None
268
465
  Args:
269
466
  dataset: the dataset
270
467
  windows: the windows to materialize
468
+ retry_max_attempts: set greater than zero to retry for this many attempts in
469
+ case of error.
470
+ retry_backoff: how long to wait before retrying (see retry).
471
+
472
+ Returns:
473
+ a summary of the materialize operation, fit for telemetry purposes
271
474
  """
475
+ start_time = time.monotonic()
476
+
477
+ layer_summaries: list[MaterializeWindowLayersSummary] = []
478
+
272
479
  tile_store = dataset.get_tile_store()
273
480
  for layer_name, layer_cfg in dataset.layers.items():
481
+ layer_start_time = time.monotonic()
482
+
483
+ total_materialize_attempts = 0
484
+ total_skipped = 0
485
+ data_source_name = "N/A"
486
+
274
487
  if not layer_cfg.data_source:
275
- continue
488
+ total_skipped = len(windows)
489
+ else:
490
+ data_source_name = layer_cfg.data_source.class_path
491
+ data_source = layer_cfg.instantiate_data_source(dataset.path)
492
+
493
+ for window in windows:
494
+ window_summary = materialize_window(
495
+ window=window,
496
+ dataset=dataset,
497
+ data_source=data_source,
498
+ tile_store=tile_store,
499
+ layer_name=layer_name,
500
+ layer_cfg=layer_cfg,
501
+ retry_max_attempts=retry_max_attempts,
502
+ retry_backoff=retry_backoff,
503
+ )
504
+ total_materialize_attempts += window_summary.materialize_attempts
505
+ if window_summary.skipped:
506
+ total_skipped += 1
276
507
 
277
- data_source = rslearn.data_sources.data_source_from_config(
278
- layer_cfg, dataset.path
508
+ layer_summaries.append(
509
+ MaterializeWindowLayersSummary(
510
+ layer_name=layer_name,
511
+ data_source_name=data_source_name,
512
+ duration_seconds=time.monotonic() - layer_start_time,
513
+ total_windows_requested=len(windows),
514
+ num_windows_materialized=len(windows) - total_skipped,
515
+ materialize_attempts=total_materialize_attempts,
516
+ )
279
517
  )
280
518
 
281
- for window in windows:
282
- materialize_window(
283
- window, dataset, data_source, tile_store, layer_name, layer_cfg
284
- )
519
+ return MaterializeDatasetWindowsSummary(
520
+ duration_seconds=time.monotonic() - start_time,
521
+ total_windows_requested=len(windows),
522
+ layer_summaries=layer_summaries,
523
+ )