rslearn 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rslearn/data_sources/local_files.py CHANGED
@@ -2,12 +2,12 @@
  
  import functools
  import json
+ from collections.abc import Callable
  from typing import Any, Generic, TypeVar
  
  import fiona
  import shapely
  import shapely.geometry
- from class_registry import ClassRegistry
  from rasterio.crs import CRS
  from upath import UPath
  
@@ -23,7 +23,24 @@ from rslearn.utils.geometry import Projection, STGeometry, get_global_geometry
  from .data_source import DataSource, Item, QueryConfig
  
  logger = get_logger("__name__")
- Importers = ClassRegistry()
+ _ImporterT = TypeVar("_ImporterT", bound="Importer")
+
+
+ class _ImporterRegistry(dict[str, type["Importer"]]):
+     """Registry for Importer classes."""
+
+     def register(self, name: str) -> Callable[[type[_ImporterT]], type[_ImporterT]]:
+         """Decorator to register an importer class."""
+
+         def decorator(cls: type[_ImporterT]) -> type[_ImporterT]:
+             self[name] = cls
+             return cls
+
+         return decorator
+
+
+ Importers = _ImporterRegistry()
+
  
  ItemType = TypeVar("ItemType", bound=Item)
  LayerConfigType = TypeVar("LayerConfigType", bound=LayerConfig)
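For context, a minimal sketch of how the rewritten registry is used. The decorator pattern comes from the diff itself; `RasterImporter` and the "raster" key are hypothetical stand-ins for the real registrants (LocalFiles looks importers up by config.layer_type.value):

    @Importers.register("raster")
    class RasterImporter(Importer):
        ...

    # Lookup returns the registered class rather than an instance, which is why
    # LocalFiles.__init__ below now adds a trailing () to instantiate it.
    importer = Importers["raster"]()

Swapping class_registry for a plain dict subclass drops a third-party dependency and makes lookups type-checkable via type["Importer"].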
@@ -425,7 +442,7 @@ class LocalFiles(DataSource):
          """
          self.config = config
  
-         self.importer = Importers[config.layer_type.value]
+         self.importer = Importers[config.layer_type.value]()
          self.src_dir = src_dir
  
      @staticmethod
rslearn/data_sources/planetary_computer.py CHANGED
@@ -83,6 +83,10 @@ class PlanetaryComputer(DataSource, TileStore):
  
      STAC_ENDPOINT = "https://planetarycomputer.microsoft.com/api/stac/v1"
  
+     # Default threshold for recreating the STAC client to prevent memory leaks
+     # from the pystac Catalog's resolved objects cache growing unbounded
+     DEFAULT_MAX_ITEMS_PER_CLIENT = 1000
+
      def __init__(
          self,
          collection_name: str,
@@ -93,6 +97,7 @@ class PlanetaryComputer(DataSource, TileStore):
          timeout: timedelta = timedelta(seconds=10),
          skip_items_missing_assets: bool = False,
          cache_dir: UPath | None = None,
+         max_items_per_client: int | None = None,
      ):
          """Initialize a new PlanetaryComputer instance.
  
@@ -109,6 +114,9 @@ class PlanetaryComputer(DataSource, TileStore):
              cache_dir: optional directory to cache items by name, including asset URLs.
                  If not set, there will be no cache and instead STAC requests will be
                  needed each time.
+             max_items_per_client: number of STAC items to process before recreating
+                 the client to prevent memory leaks from the resolved objects cache.
+                 Defaults to DEFAULT_MAX_ITEMS_PER_CLIENT.
          """
          self.collection_name = collection_name
          self.asset_bands = asset_bands
@@ -118,12 +126,15 @@ class PlanetaryComputer(DataSource, TileStore):
          self.timeout = timeout
          self.skip_items_missing_assets = skip_items_missing_assets
          self.cache_dir = cache_dir
+         self.max_items_per_client = (
+             max_items_per_client or self.DEFAULT_MAX_ITEMS_PER_CLIENT
+         )
  
          if self.cache_dir is not None:
              self.cache_dir.mkdir(parents=True, exist_ok=True)
  
          self.client: pystac_client.Client | None = None
-         self.collection: pystac_client.CollectionClient | None = None
+         self._client_item_count = 0
  
      @staticmethod
      def from_config(config: RasterLayerConfig, ds_path: UPath) -> "PlanetaryComputer":
@@ -142,7 +153,12 @@ class PlanetaryComputer(DataSource, TileStore):
          if "cache_dir" in d:
              kwargs["cache_dir"] = join_upath(ds_path, d["cache_dir"])
  
-         simple_optionals = ["query", "sort_by", "sort_ascending"]
+         simple_optionals = [
+             "query",
+             "sort_by",
+             "sort_ascending",
+             "max_items_per_client",
+         ]
          for k in simple_optionals:
              if k in d:
                  kwargs[k] = d[k]
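Since "max_items_per_client" is now one of the simple optional keys, it can be set in a layer's data source configuration alongside "query" and the sort options, or passed directly. A minimal sketch of direct construction, assuming the rslearn.data_sources.planetary_computer module path; the collection name and asset_bands mapping are illustrative, not prescribed by the diff:

    from rslearn.data_sources.planetary_computer import PlanetaryComputer

    ds = PlanetaryComputer(
        collection_name="sentinel-2-l2a",
        asset_bands={"B04": ["B04"]},  # illustrative asset-to-bands mapping
        max_items_per_client=500,  # recreate the STAC client after 500 items
    )

A lower threshold bounds the pystac resolved-objects cache more tightly, at the cost of re-opening the client more often.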
@@ -151,20 +167,40 @@ class PlanetaryComputer(DataSource, TileStore):
  
      def _load_client(
          self,
-     ) -> tuple[pystac_client.Client, pystac_client.CollectionClient]:
+     ) -> pystac_client.Client:
          """Lazily load pystac client.
  
          We don't load it when creating the data source because it takes time and caller
          may not be calling get_items. Additionally, loading it during the get_items
          call enables leveraging the retry loop functionality in
          prepare_dataset_windows.
-         """
-         if self.client is not None:
-             return self.client, self.collection
  
+         Note: We periodically recreate the client to prevent memory leaks from the
+         pystac Catalog's resolved objects cache, which grows unbounded as STAC items
+         are deserialized and cached. The cache cannot be cleared or disabled.
+         """
+         if self.client is None:
+             logger.info("Creating initial STAC client")
+             self.client = pystac_client.Client.open(self.STAC_ENDPOINT)
+             return self.client
+
+         if self._client_item_count < self.max_items_per_client:
+             return self.client
+
+         # Recreate client to clear the resolved objects cache
+         current_client = self.client
+         logger.debug(
+             "Recreating STAC client after processing %d items (threshold: %d)",
+             self._client_item_count,
+             self.max_items_per_client,
+         )
+         client_root = current_client.get_root()
+         client_root.clear_links()
+         client_root.clear_items()
+         client_root.clear_children()
+         self._client_item_count = 0
          self.client = pystac_client.Client.open(self.STAC_ENDPOINT)
-         self.collection = self.client.get_collection(self.collection_name)
-         return self.client, self.collection
+         return self.client
  
      def _stac_item_to_item(self, stac_item: pystac.Item) -> PlanetaryComputerItem:
          shp = shapely.geometry.shape(stac_item.geometry)
@@ -210,10 +246,26 @@ class PlanetaryComputer(DataSource, TileStore):
  
          # No cache or not in cache, so we need to make the STAC request.
          logger.debug("Getting STAC item {name}")
-         _, collection = self._load_client()
-         stac_item = collection.get_item(name)
+         client = self._load_client()
+
+         search_result = client.search(ids=[name], collections=[self.collection_name])
+         stac_items = list(search_result.items())
+
+         if not stac_items:
+             raise ValueError(
+                 f"Item {name} not found in collection {self.collection_name}"
+             )
+         if len(stac_items) > 1:
+             raise ValueError(
+                 f"Multiple items found for ID {name} in collection {self.collection_name}"
+             )
+
+         stac_item = stac_items[0]
          item = self._stac_item_to_item(stac_item)
  
+         # Track items processed for client recreation threshold (after deserialization)
+         self._client_item_count += 1
+
          # Finally we cache it if cache_dir is set.
          if cache_fname is not None:
              with cache_fname.open("w") as f:
@@ -233,7 +285,7 @@ class PlanetaryComputer(DataSource, TileStore):
          Returns:
              List of groups of items that should be retrieved for each geometry.
          """
-         client, _ = self._load_client()
+         client = self._load_client()
  
          groups = []
          for geometry in geometries:
@@ -247,7 +299,9 @@ class PlanetaryComputer(DataSource, TileStore):
                  datetime=wgs84_geometry.time_range,
                  query=self.query,
              )
-             stac_items = [item for item in result.item_collection()]
+             stac_items = [item for item in result.items()]
+             # Track items processed for client recreation threshold (after deserialization)
+             self._client_item_count += len(stac_items)
              logger.debug("STAC search yielded %d items", len(stac_items))
  
              if self.skip_items_missing_assets:
@@ -580,7 +634,13 @@ class Sentinel2(PlanetaryComputer):
          if "cache_dir" in d:
              kwargs["cache_dir"] = join_upath(ds_path, d["cache_dir"])
  
-         simple_optionals = ["harmonize", "query", "sort_by", "sort_ascending"]
+         simple_optionals = [
+             "harmonize",
+             "query",
+             "sort_by",
+             "sort_ascending",
+             "max_items_per_client",
+         ]
          for k in simple_optionals:
              if k in d:
                  kwargs[k] = d[k]
@@ -756,7 +816,12 @@ class Sentinel1(PlanetaryComputer):
          if "cache_dir" in d:
              kwargs["cache_dir"] = join_upath(ds_path, d["cache_dir"])
  
-         simple_optionals = ["query", "sort_by", "sort_ascending"]
+         simple_optionals = [
+             "query",
+             "sort_by",
+             "sort_ascending",
+             "max_items_per_client",
+         ]
          for k in simple_optionals:
              if k in d:
                  kwargs[k] = d[k]
rslearn/dataset/handler_summaries.py ADDED
@@ -0,0 +1,130 @@
+ """This module contains dataclasses for summarizing the results of dataset operations.
+
+ They can be used by callers to emit telemetry / logs, or discarded.
+ """
+
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class LayerPrepareSummary:
+     """Results for preparing a single layer."""
+
+     # Identity
+     layer_name: str
+     data_source_name: str
+
+     # Timing
+     duration_seconds: float
+
+     # Counts
+     windows_prepared: int
+     windows_skipped: int
+     get_items_attempts: int
+
+
+ @dataclass
+ class PrepareDatasetWindowsSummary:
+     """Results from prepare_dataset_windows operation for telemetry purposes."""
+
+     # Timing
+     duration_seconds: float
+
+     # Counts
+     total_windows_requested: int
+
+     # Per-layer summaries
+     layer_summaries: list[LayerPrepareSummary]
+
+
+ @dataclass
+ class IngestCounts:
+     """Known ingestion counts."""
+
+     items_ingested: int
+     geometries_ingested: int
+
+
+ @dataclass
+ class UnknownIngestCounts:
+     """Indicates ingestion counts are unknown due to partial failure."""
+
+     items_attempted: int
+     geometries_attempted: int
+
+
+ @dataclass
+ class LayerIngestSummary:
+     """Results for ingesting a single layer."""
+
+     # Identity
+     layer_name: str
+     data_source_name: str
+
+     # Timing
+     duration_seconds: float
+
+     # Counts - either known or unknown
+     ingest_counts: IngestCounts | UnknownIngestCounts
+     ingest_attempts: int
+
+
+ @dataclass
+ class IngestDatasetJobsSummary:
+     """Results from ingesting a set of jobs; for telemetry purposes."""
+
+     # Timing
+     duration_seconds: float
+
+     # Counts
+     num_jobs: int
+
+     # Per-layer summaries
+     layer_summaries: list[LayerIngestSummary]
+
+
+ @dataclass
+ class MaterializeWindowLayerSummary:
+     """Results for materializing a single window layer."""
+
+     skipped: bool
+     materialize_attempts: int
+
+
+ @dataclass
+ class MaterializeWindowLayersSummary:
+     """Results for materializing a given layer across all windows in a materialize call."""
+
+     # Identity
+     layer_name: str
+     data_source_name: str
+
+     # Timing
+     duration_seconds: float
+
+     # Counts
+     total_windows_requested: int
+     num_windows_materialized: int
+     materialize_attempts: int
+
+
+ @dataclass
+ class MaterializeDatasetWindowsSummary:
+     """Results from materialize_dataset_windows operation for telemetry purposes."""
+
+     # Timing
+     duration_seconds: float
+
+     # Counts
+     total_windows_requested: int
+
+     # Per-layer summaries
+     layer_summaries: list[MaterializeWindowLayersSummary]
+
+
+ @dataclass
+ class ErrorOutcome:
+     """TBD what goes in here, if anything."""
+
+     # Timing
+     duration_seconds: float
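As the module docstring says, these summaries let callers emit telemetry or logs, or ignore the return value entirely. A minimal consumption sketch; the log format is illustrative, not part of the API:

    summary = prepare_dataset_windows(dataset, windows)
    logger.info(
        "prepare took %.1fs for %d windows",
        summary.duration_seconds,
        summary.total_windows_requested,
    )
    for layer in summary.layer_summaries:
        logger.info(
            "layer %s (%s): prepared=%d skipped=%d get_items_attempts=%d",
            layer.layer_name,
            layer.data_source_name,
            layer.windows_prepared,
            layer.windows_skipped,
            layer.get_items_attempts,
        )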
rslearn/dataset/manage.py CHANGED
@@ -13,6 +13,13 @@ from rslearn.config import (
      RasterLayerConfig,
  )
  from rslearn.data_sources import DataSource, Item
+ from rslearn.dataset.handler_summaries import (
+     LayerPrepareSummary,
+     MaterializeDatasetWindowsSummary,
+     MaterializeWindowLayersSummary,
+     MaterializeWindowLayerSummary,
+     PrepareDatasetWindowsSummary,
+ )
  from rslearn.log_utils import get_logger
  from rslearn.tile_stores import TileStore, get_tile_store_with_layer
  
@@ -23,7 +30,24 @@ from .window import Window, WindowLayerData
  logger = get_logger(__name__)
  
  
- def retry(fn: Callable, retry_max_attempts: int, retry_backoff: timedelta) -> Any:
+ class AttemptsCounter:
+     """A simple counter for tracking attempts (including initial attempt and retries)."""
+
+     def __init__(self) -> None:
+         """Initialize counter with value 0."""
+         self.value = 0
+
+     def increment(self) -> None:
+         """Increment the counter by 1."""
+         self.value += 1
+
+
+ def retry(
+     fn: Callable,
+     retry_max_attempts: int,
+     retry_backoff: timedelta,
+     attempts_counter: AttemptsCounter | None = None,
+ ) -> Any:
      """Retry the function multiple times in case of error.
  
      The function is retried until either the attempts are exhausted, or the function
@@ -37,8 +61,11 @@ def retry(fn: Callable, retry_max_attempts: int, retry_backoff: timedelta) -> Any:
              retries. The actual time is (retry_backoff * attempts) * r, where r is a
              random number between 1 and 2, and attempts is the number of attempts tried
              so far.
+         attempts_counter: an optional counter to increment for each attempt
      """
      for attempt_idx in range(retry_max_attempts):
+         if attempts_counter:
+             attempts_counter.increment()
          try:
              return fn()
          except Exception as e:
@@ -47,6 +74,8 @@ def retry(fn: Callable, retry_max_attempts: int, retry_backoff: timedelta) -> Any:
              time.sleep(sleep_base_seconds * (1 + random.random()))
  
      # Last attempt. This time we don't catch the exception.
+     if attempts_counter:
+         attempts_counter.increment()
      return fn()
  
  
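The counter parameter is additive, so existing retry callers are unaffected. A minimal usage sketch; fetch_items is a hypothetical callable standing in for something like data_source.get_items:

    from datetime import timedelta

    def fetch_items() -> list:
        # Hypothetical operation that may raise transiently in real use.
        return []

    counter = AttemptsCounter()
    result = retry(
        fn=fetch_items,
        retry_max_attempts=3,
        retry_backoff=timedelta(seconds=5),
        attempts_counter=counter,
    )
    # counter.value records the initial attempt plus any retries, ready for a
    # summary field such as LayerPrepareSummary.get_items_attempts.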
@@ -56,7 +85,7 @@ def prepare_dataset_windows(
      force: bool = False,
      retry_max_attempts: int = 0,
      retry_backoff: timedelta = timedelta(minutes=1),
- ) -> None:
+ ) -> PrepareDatasetWindowsSummary:
      """Prepare windows in a dataset.
  
      Preparing a window involves looking up items corresponding to the window in each of
@@ -70,10 +99,28 @@
          retry_max_attempts: set greater than zero to retry for this many attempts in
              case of error.
          retry_backoff: how long to wait before retrying (see retry).
+
+     Returns:
+         a summary of the prepare operation, fit for telemetry purposes
      """
+     start_time = time.monotonic()
+     layer_summaries: list[LayerPrepareSummary] = []
+
      # Iterate over retrieved layers, and prepare each one.
      for layer_name, layer_cfg in dataset.layers.items():
+         layer_start_time = time.monotonic()
+
          if not layer_cfg.data_source:
+             layer_summaries.append(
+                 LayerPrepareSummary(
+                     layer_name=layer_name,
+                     data_source_name="N/A",
+                     duration_seconds=time.monotonic() - layer_start_time,
+                     windows_prepared=0,
+                     windows_skipped=len(windows),
+                     get_items_attempts=0,
+                 )
+             )
              continue
          data_source_cfg = layer_cfg.data_source
  
@@ -85,7 +132,18 @@
                  continue
              needed_windows.append(window)
          logger.info(f"Preparing {len(needed_windows)} windows for layer {layer_name}")
+
          if len(needed_windows) == 0:
+             layer_summaries.append(
+                 LayerPrepareSummary(
+                     layer_name=layer_name,
+                     data_source_name=data_source_cfg.name,
+                     duration_seconds=time.monotonic() - layer_start_time,
+                     windows_prepared=0,
+                     windows_skipped=len(windows),
+                     get_items_attempts=0,
+                 )
+             )
              continue
  
          # Create data source after checking for at least one window so it can be fast
@@ -115,10 +173,12 @@
  
              geometries.append(geometry)
  
+         attempts_counter = AttemptsCounter()
          results = retry(
              fn=lambda: data_source.get_items(geometries, data_source_cfg.query_config),
              retry_max_attempts=retry_max_attempts,
              retry_backoff=retry_backoff,
+             attempts_counter=attempts_counter,
          )
  
          for window, result in zip(needed_windows, results):
@@ -131,6 +191,25 @@
              )
              window.save_layer_datas(layer_datas)
  
+         layer_summaries.append(
+             LayerPrepareSummary(
+                 layer_name=layer_name,
+                 data_source_name=data_source_cfg.name,
+                 duration_seconds=time.monotonic() - layer_start_time,
+                 windows_prepared=len(needed_windows),  # we assume all have succeeded
+                 windows_skipped=len(windows) - len(needed_windows),
+                 get_items_attempts=attempts_counter.value,
+             )
+         )
+
+     summary = PrepareDatasetWindowsSummary(
+         duration_seconds=time.monotonic() - start_time,
+         total_windows_requested=len(windows),
+         layer_summaries=layer_summaries,
+     )
+
+     return summary
+
  
  def ingest_dataset_windows(
      dataset: Dataset,
@@ -251,7 +330,7 @@
      layer_cfg: LayerConfig,
      retry_max_attempts: int = 0,
      retry_backoff: timedelta = timedelta(minutes=1),
- ) -> None:
+ ) -> MaterializeWindowLayerSummary:
      """Materialize a window.
  
      Args:
@@ -264,10 +343,16 @@
          retry_max_attempts: set greater than zero to retry for this many attempts in
              case of error.
          retry_backoff: how long to wait before retrying (see retry).
+
+     Returns:
+         a summary of the materialize operation, fit for telemetry purposes
      """
      # Check if layer is materialized already.
      if window.is_layer_completed(layer_name):
-         return
+         return MaterializeWindowLayerSummary(
+             skipped=True,
+             materialize_attempts=0,
+         )
  
      layer_datas = window.load_layer_datas()
      if layer_name not in layer_datas:
@@ -276,7 +361,11 @@
              layer_name,
              window.name,
          )
-         return
+         return MaterializeWindowLayerSummary(
+             skipped=True,
+             materialize_attempts=0,
+         )
+
      layer_data = layer_datas[layer_name]
      item_groups = []
      for serialized_group in layer_data.serialized_item_groups:
@@ -288,6 +377,8 @@
  
      if layer_cfg.data_source is None:
          raise ValueError("data_source is required")
+
+     attempts_counter = AttemptsCounter()
      if layer_cfg.data_source.ingest:
          if not is_window_ingested(dataset, window, check_layer_name=layer_name):
              logger.info(
@@ -295,16 +386,19 @@
                  layer_name,
                  window.name,
              )
-             return
+             return MaterializeWindowLayerSummary(
+                 skipped=True,
+                 materialize_attempts=0,
+             )
  
-         print(
+         logger.info(
              f"Materializing {len(item_groups)} item groups in layer {layer_name} from tile store"
          )
  
          if dataset.materializer_name:
-             materializer = Materializers[dataset.materializer_name]
+             materializer = Materializers[dataset.materializer_name]()
          else:
-             materializer = Materializers[layer_cfg.layer_type.value]
+             materializer = Materializers[layer_cfg.layer_type.value]()
  
          retry(
              fn=lambda: materializer.materialize(
@@ -316,11 +410,12 @@
              ),
              retry_max_attempts=retry_max_attempts,
              retry_backoff=retry_backoff,
+             attempts_counter=attempts_counter,
          )
  
      else:
          # This window is meant to be materialized directly from the data source.
-         print(
+         logger.info(
              f"Materializing {len(item_groups)} item groups in layer {layer_name} via data source"
          )
          retry(
@@ -329,15 +424,21 @@
              ),
              retry_max_attempts=retry_max_attempts,
              retry_backoff=retry_backoff,
+             attempts_counter=attempts_counter,
          )
  
+     return MaterializeWindowLayerSummary(
+         skipped=False,
+         materialize_attempts=attempts_counter.value,
+     )
+
  
  def materialize_dataset_windows(
      dataset: Dataset,
      windows: list[Window],
      retry_max_attempts: int = 0,
      retry_backoff: timedelta = timedelta(minutes=1),
- ) -> None:
+ ) -> MaterializeDatasetWindowsSummary:
      """Materialize items for retrieved layers in a dataset.
  
      The portions of items corresponding to dataset windows are extracted from the tile
@@ -349,24 +450,58 @@
          retry_max_attempts: set greater than zero to retry for this many attempts in
              case of error.
          retry_backoff: how long to wait before retrying (see retry).
+
+     Returns:
+         a summary of the materialize operation, fit for telemetry purposes
      """
+     start_time = time.monotonic()
+
+     layer_summaries: list[MaterializeWindowLayersSummary] = []
+
      tile_store = dataset.get_tile_store()
      for layer_name, layer_cfg in dataset.layers.items():
+         layer_start_time = time.monotonic()
+
+         total_materialize_attempts = 0
+         total_skipped = 0
+         data_source_name = "N/A"
+
          if not layer_cfg.data_source:
-             continue
+             total_skipped = len(windows)
+         else:
+             data_source_name = layer_cfg.data_source.name
+             data_source = rslearn.data_sources.data_source_from_config(
+                 layer_cfg, dataset.path
+             )
  
-         data_source = rslearn.data_sources.data_source_from_config(
-             layer_cfg, dataset.path
-         )
+             for window in windows:
+                 window_summary = materialize_window(
+                     window=window,
+                     dataset=dataset,
+                     data_source=data_source,
+                     tile_store=tile_store,
+                     layer_name=layer_name,
+                     layer_cfg=layer_cfg,
+                     retry_max_attempts=retry_max_attempts,
+                     retry_backoff=retry_backoff,
+                 )
+                 total_materialize_attempts += window_summary.materialize_attempts
+                 if window_summary.skipped:
+                     total_skipped += 1
  
-         for window in windows:
-             materialize_window(
-                 window=window,
-                 dataset=dataset,
-                 data_source=data_source,
-                 tile_store=tile_store,
+         layer_summaries.append(
+             MaterializeWindowLayersSummary(
                  layer_name=layer_name,
-                 layer_cfg=layer_cfg,
-                 retry_max_attempts=retry_max_attempts,
-                 retry_backoff=retry_backoff,
+                 data_source_name=data_source_name,
+                 duration_seconds=time.monotonic() - layer_start_time,
+                 total_windows_requested=len(windows),
+                 num_windows_materialized=len(windows) - total_skipped,
+                 materialize_attempts=total_materialize_attempts,
              )
+         )
+
+     return MaterializeDatasetWindowsSummary(
+         duration_seconds=time.monotonic() - start_time,
+         total_windows_requested=len(windows),
+         layer_summaries=layer_summaries,
+     )
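materialize_dataset_windows now rolls the per-window summaries up into per-layer totals, so a caller can report the whole run. A short sketch of the new return value in use; again the log shape is illustrative:

    summary = materialize_dataset_windows(dataset, windows)
    for layer in summary.layer_summaries:
        logger.info(
            "materialized layer %s: %d/%d windows, %d attempts in %.1fs",
            layer.layer_name,
            layer.num_windows_materialized,
            layer.total_windows_requested,
            layer.materialize_attempts,
            layer.duration_seconds,
        )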