openeo-gfmap 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- openeo_gfmap/features/feature_extractor.py +9 -0
- openeo_gfmap/fetching/__init__.py +16 -4
- openeo_gfmap/fetching/commons.py +1 -0
- openeo_gfmap/fetching/generic.py +81 -73
- openeo_gfmap/fetching/s1.py +1 -3
- openeo_gfmap/fetching/s2.py +1 -0
- openeo_gfmap/inference/model_inference.py +5 -2
- openeo_gfmap/manager/job_manager.py +271 -84
- openeo_gfmap/manager/job_splitters.py +169 -21
- openeo_gfmap/preprocessing/sar.py +12 -33
- openeo_gfmap/stac/constants.py +1 -1
- openeo_gfmap/utils/__init__.py +16 -0
- openeo_gfmap/utils/catalogue.py +172 -35
- openeo_gfmap/utils/split_stac.py +125 -0
- {openeo_gfmap-0.1.0.dist-info → openeo_gfmap-0.3.0.dist-info}/METADATA +5 -4
- {openeo_gfmap-0.1.0.dist-info → openeo_gfmap-0.3.0.dist-info}/RECORD +18 -18
- {openeo_gfmap-0.1.0.dist-info → openeo_gfmap-0.3.0.dist-info}/WHEEL +1 -1
- openeo_gfmap/fetching/meteo.py +0 -126
- {openeo_gfmap-0.1.0.dist-info → openeo_gfmap-0.3.0.dist-info}/licenses/LICENSE +0 -0
The hunks below are from `openeo_gfmap/manager/job_manager.py`:

```diff
@@ -1,9 +1,11 @@
 import json
+import pickle
 import threading
+import time
 from concurrent.futures import ThreadPoolExecutor
-from enum import Enum
 from functools import partial
 from pathlib import Path
+from threading import Lock
 from typing import Callable, Optional, Union
 
 import pandas as pd
```
```diff
@@ -15,29 +17,60 @@ from pystac import CatalogType
 from openeo_gfmap.manager import _log
 from openeo_gfmap.stac import constants
 
-
-
+
+def retry_on_exception(max_retries: int, delay_s: int = 180):
+    """Decorator to retry a function if an exception occurs.
+    Used for post-job actions that can crash due to internal backend issues. Restarting the action
+    usually helps to solve the issue.
+
+    Parameters
+    ----------
+    max_retries: int
+        The maximum number of retries to attempt before finally raising the exception.
+    delay: int (default=180 seconds)
+        The delay in seconds to wait before retrying the decorated function.
+    """
+
+    def decorator(func):
+        def wrapper(*args, **kwargs):
+            latest_exception = None
+            for _ in range(max_retries):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    time.sleep(
+                        delay_s
+                    )  # Waits before retrying, while allowing other futures to run.
+                    latest_exception = e
+            raise latest_exception
+
+        return wrapper
+
+    return decorator
 
 
 def done_callback(future, df, idx):
-    """
+    """Changes the status of the job when the post-job action future is done."""
     current_status = df.loc[idx, "status"]
-
+    exception = future.exception()
+    if exception is None:
         if current_status == "postprocessing":
             df.loc[idx, "status"] = "finished"
         elif current_status == "postprocessing-error":
             df.loc[idx, "status"] = "error"
+        elif current_status == "running":
+            df.loc[idx, "status"] = "running"
         else:
             raise ValueError(
                 f"Invalid status {current_status} for job {df.loc[idx, 'id']} for done_callback!"
             )
-
-
-
-
-
-
-
+    else:
+        _log.exception(
+            "Exception occurred in post-job future for job %s:\n%s",
+            df.loc[idx, "id"],
+            exception,
+        )
+        df.loc[idx, "status"] = "error"
 
 
 class GFMAPJobManager(MultiBackendJobManager):
```
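The new module-level `retry_on_exception` decorator simply re-invokes the wrapped callable after a fixed delay and re-raises the last exception once the budget is exhausted. A minimal sketch of its behaviour, assuming openeo-gfmap 0.3.0 is installed (`unstable_action` is a hypothetical stand-in for a post-job action):

```python
from openeo_gfmap.manager.job_manager import retry_on_exception

attempts = {"count": 0}


@retry_on_exception(max_retries=3, delay_s=1)
def unstable_action():
    # Fails twice, then succeeds, mimicking a transient backend error.
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise RuntimeError("transient backend error")
    return "ok"


print(unstable_action())  # "ok", after two retries of 1 second each
```

Two quirks are visible in the implementation: `max_retries` is the total number of attempts rather than the number of extra retries, and the delay is also slept after the final failed attempt before the exception is re-raised.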
```diff
@@ -53,13 +86,51 @@ class GFMAPJobManager(MultiBackendJobManager):
         post_job_action: Optional[Callable] = None,
         poll_sleep: int = 5,
         n_threads: int = 1,
-        post_job_params: dict = {},
         resume_postproc: bool = True,  # If we need to check for post-job actions that crashed
         restart_failed: bool = False,  # If we need to restart failed jobs
+        stac_enabled: bool = True,
     ):
+        """
+        Initializes the GFMAP job manager.
+
+        Parameters
+        ----------
+        output_dir: Path
+            The base output directory where the results/stac/logs of the jobs will be stored.
+        output_path_generator: Callable
+            User defined function that generates the output path for the job results. Expects as
+            inputs the output directory, the index of the job in the job dataframe
+            and the row of the job, and returns the final path where to save a job result asset.
+        collection_id: Optional[str]
+            The ID of the STAC collection that is being generated. Can be left empty if the STAC
+            catalogue is not being generated or if it is being resumed from an existing catalogue.
+        collection_description: Optional[str]
+            The description of the STAC collection that is being generated.
+        stac: Optional[Union[str, Path]]
+            The path to the STAC collection to be saved or resumed.
+            If None, the default path will be used.
+        post_job_action: Optional[Callable]
+            A user defined function that will be called after a job is finished. It will receive
+            the list of items generated by the job and the row of the job, and should return the
+            updated list of items.
+        poll_sleep: int
+            The time in seconds to wait between polling the backend for job status.
+        n_threads: int
+            The number of threads to execute `on_job_done` and `on_job_error` functions.
+        resume_postproc: bool
+            If set to true, all `on_job_done` and `on_job_error` functions that failed are resumed.
+        restart_failed: bool
+            If set to true, all jobs that failed within the OpenEO backend are restarted.
+        stac_enabled: bool (default=True)
+            If the STAC generation is enabled or not. Disabling it will prevent the creation,
+            update and loading of the STAC collection.
+        """
         self._output_dir = output_dir
+        self._catalogue_cache = output_dir / "catalogue_cache.bin"
 
         self.stac = stac
+        self.lock = Lock()
+        self.stac_enabled = stac_enabled
         self.collection_id = collection_id
         self.collection_description = collection_description
 
```
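Besides adding the docstring, this hunk drops the `post_job_params: dict = {}` parameter and with it a mutable default argument. The docstring pins down the two user-supplied callbacks; a sketch of a conforming `output_path_generator` follows (the folder layout is hypothetical; note the docstring still lists three inputs, while the `on_job_done` hunk further below calls the generator with a fourth `asset_id` argument and an `idx` that is the asset index within the job):

```python
from pathlib import Path

import pandas as pd


def output_path_generator(
    output_dir: Path, idx: int, row: pd.Series, asset_id: str
) -> Path:
    # Hypothetical layout: one directory per job row, one file per asset.
    return output_dir / str(row["id"]) / f"{idx}_{asset_id}"
```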
```diff
@@ -74,41 +145,73 @@ class GFMAPJobManager(MultiBackendJobManager):
 
         self._output_path_gen = output_path_generator
         self._post_job_action = post_job_action
-        self._post_job_params = post_job_params
 
         # Monkey patching the _normalize_df method to ensure we have no modification on the
         # geometry column
         MultiBackendJobManager._normalize_df = self._normalize_df
         super().__init__(poll_sleep)
 
-
+        if self.stac_enabled:
+            self._root_collection = self._initialize_stac()
 
-    def
+    def _load_stac(self) -> Optional[pystac.Collection]:
+        """
+        Loads the STAC collection from the cache, the specified `stac` path or the default path.
+        If no STAC collection is found, returns None.
+        """
         default_collection_path = self._output_dir / "stac/collection.json"
-        if self.
+        if self._catalogue_cache.exists():
+            _log.info(
+                "Loading the STAC collection from the persisted binary file: %s.",
+                self._catalogue_cache,
+            )
+            with open(self._catalogue_cache, "rb") as file:
+                return pickle.load(file)
+        elif self.stac is not None:
             _log.info(
-
+                "Reloading the STAC collection from the provided path: %s.", self.stac
             )
-
+            return pystac.read_file(str(self.stac))
         elif default_collection_path.exists():
             _log.info(
-
+                "Reload the STAC collection from the default path: %s.",
+                default_collection_path,
             )
             self.stac = default_collection_path
-
-
-
-
-
-
-
-
-
-
+            return pystac.read_file(str(self.stac))
+
+        _log.info(
+            "No STAC collection found as cache, in the default path or in the provided path."
+        )
+        return None
+
+    def _create_stac(self) -> pystac.Collection:
+        """
+        Creates and returns new STAC collection. The created stac collection will use the
+        `collection_id` and `collection_description` parameters set in the constructor.
+        """
+        if self.collection_id is None:
+            raise ValueError(
+                "A collection ID is required to generate a STAC collection."
             )
-
-
-
+        collection = pystac.Collection(
+            id=self.collection_id,
+            description=self.collection_description,
+            extent=None,
+        )
+        collection.license = constants.LICENSE
+        collection.add_link(constants.LICENSE_LINK)
+        collection.stac_extensions = constants.STAC_EXTENSIONS
+        return collection
+
+    def _initialize_stac(self) -> pystac.Collection:
+        """
+        Loads and returns if possible an existing stac collection, otherwise creates a new one.
+        """
+        root_collection = self._load_stac()
+        if not root_collection:
+            _log.info("Starting a fresh STAC collection.")
+            root_collection = self._create_stac()
 
         return root_collection
 
```
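Loading now tries the pickled cache first, then the user-provided `stac` path, then the default `stac/collection.json`, and finally falls back to `None` so that `_initialize_stac` creates a fresh collection. The core load-or-create pattern, reduced to a stdlib-only sketch with illustrative names:

```python
import pickle
from pathlib import Path


def load_or_create(cache_path: Path, create):
    """Return the object pickled at cache_path if present, else create and cache it."""
    if cache_path.exists():
        # Only unpickle files this process wrote itself: pickle is not safe
        # to load from untrusted sources.
        with open(cache_path, "rb") as file:
            return pickle.load(file)
    obj = create()
    with open(cache_path, "wb") as file:
        pickle.dump(obj, file)
    return obj


collection = load_or_create(Path("catalogue_cache.bin"), lambda: {"items": []})
```

Pickling keeps the in-memory collection, including all items added so far, cheap to restore between runs, while the JSON form is only written once at the end of `run_jobs` (see the later hunks).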
```diff
@@ -150,24 +253,40 @@ class GFMAPJobManager(MultiBackendJobManager):
             job = connection.job(row.id)
             if row.status == "postprocessing":
                 _log.info(
-
+                    "Resuming postprocessing of job %s, queueing on_job_finished...",
+                    row.id,
                 )
                 future = self._executor.submit(self.on_job_done, job, row)
-                future.add_done_callback(
+                future.add_done_callback(
+                    partial(
+                        done_callback,
+                        df=df,
+                        idx=idx,
+                    )
+                )
             else:
                 _log.info(
-
+                    "Resuming postprocessing of job %s, queueing on_job_error...",
+                    row.id,
                 )
                 future = self._executor.submit(self.on_job_error, job, row)
-                future.add_done_callback(
+                future.add_done_callback(
+                    partial(
+                        done_callback,
+                        df=df,
+                        idx=idx,
+                    )
+                )
             self._futures.append(future)
 
     def _restart_failed_jobs(self, df: pd.DataFrame):
         """Sets-up failed jobs as "not_started" as they will be restarted by the manager."""
-        failed_tasks = df[df.status
+        failed_tasks = df[df.status.isin(["error", "start_failed"])]
         not_started_tasks = df[df.status == "not_started"]
         _log.info(
-
+            "Resetting %s failed jobs to 'not_started'. %s jobs are already 'not_started'.",
+            len(failed_tasks),
+            len(not_started_tasks),
         )
         for idx, _ in failed_tasks.iterrows():
             df.loc[idx, "status"] = "not_started"
```
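`Future.add_done_callback` calls its callback with the future as the only argument, so the extra `df` and `idx` parameters of `done_callback` are pre-bound with `functools.partial`. The pattern in isolation, with a plain dict standing in for the job dataframe:

```python
from concurrent.futures import ThreadPoolExecutor
from functools import partial


def done_callback(future, df, idx):
    # Only the future is passed in; df and idx were bound via partial.
    df[idx] = "error" if future.exception() else "finished"


status = {}
with ThreadPoolExecutor(max_workers=1) as executor:
    future = executor.submit(lambda: 42)
    future.add_done_callback(partial(done_callback, df=status, idx=0))

print(status)  # {0: 'finished'}
```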
```diff
@@ -203,27 +322,53 @@ class GFMAPJobManager(MultiBackendJobManager):
                 job_metadata["status"] == "finished"
             ):
                 _log.info(
-
+                    "Job %s finished successfully, queueing on_job_done...", job.job_id
                 )
                 job_status = "postprocessing"
                 future = self._executor.submit(self.on_job_done, job, row)
                 # Future will setup the status to finished when the job is done
-                future.add_done_callback(
+                future.add_done_callback(
+                    partial(
+                        done_callback,
+                        df=df,
+                        idx=idx,
+                    )
+                )
                 self._futures.append(future)
-
+                if "costs" in job_metadata:
+                    df.loc[idx, "costs"] = job_metadata["costs"]
+                    df.loc[idx, "memory"] = (
+                        job_metadata["usage"]
+                        .get("max_executor_memory", {})
+                        .get("value", None)
+                    )
+
+                else:
+                    _log.warning(
+                        "Costs not found in job %s metadata. Costs will be set to 'None'.",
+                        job.job_id,
+                    )
 
             # Case in which it failed
             if (df.loc[idx, "status"] != "error") and (
                 job_metadata["status"] == "error"
             ):
                 _log.info(
-
+                    "Job %s finished with error, queueing on_job_error...",
+                    job.job_id,
                 )
                 job_status = "postprocessing-error"
                 future = self._executor.submit(self.on_job_error, job, row)
                 # Future will setup the status to error when the job is done
-                future.add_done_callback(
+                future.add_done_callback(
+                    partial(
+                        done_callback,
+                        df=df,
+                        idx=idx,
+                    )
+                )
                 self._futures.append(future)
+                if "costs" in job_metadata:
                     df.loc[idx, "costs"] = job_metadata["costs"]
 
             df.loc[idx, "status"] = job_status
```
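The memory figure is extracted with chained `dict.get` calls so that a missing `max_executor_memory` entry degrades to `None` instead of raising. The same pattern in isolation (the metadata shape shown is an assumption based on the keys the diff reads):

```python
job_metadata = {"costs": 1.2, "usage": {"max_executor_memory": {"value": 4.2}}}

memory = (
    job_metadata["usage"]
    .get("max_executor_memory", {})
    .get("value", None)
)
print(memory)  # 4.2; None if either nested key is missing
```

Note that `job_metadata["usage"]` itself is still a hard lookup, so a response without a `usage` field would raise `KeyError` inside the `if "costs" in job_metadata:` branch.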
```diff
@@ -231,6 +376,7 @@ class GFMAPJobManager(MultiBackendJobManager):
         # Clear the futures that are done and raise their potential exceptions if they occurred.
         self._clear_queued_actions()
 
+    @retry_on_exception(max_retries=2, delay_s=180)
     def on_job_error(self, job: BatchJob, row: pd.Series):
         """Method called when a job finishes with an error.
 
```
```diff
@@ -241,7 +387,14 @@ class GFMAPJobManager(MultiBackendJobManager):
         row: pd.Series
             The row in the dataframe that contains the job relative information.
         """
-
+        try:
+            logs = job.logs()
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            _log.exception(
+                "Error getting logs in `on_job_error` for job %s:\n%s", job.job_id, e
+            )
+            logs = []
+
         error_logs = [log for log in logs if log.level.lower() == "error"]
 
         job_metadata = job.describe_job()
```
```diff
@@ -260,28 +413,43 @@ class GFMAPJobManager(MultiBackendJobManager):
                 f"Couldn't find any error logs. Please check the error manually on job ID: {job.job_id}."
             )
 
+    @retry_on_exception(max_retries=2, delay_s=30)
     def on_job_done(self, job: BatchJob, row: pd.Series):
         """Method called when a job finishes successfully. It will first download the results of
         the job and then call the `post_job_action` method.
         """
+
         job_products = {}
-
+        job_results = job.get_results()
+        asset_ids = [a.name for a in job_results.get_assets()]
+        for idx, asset_id in enumerate(asset_ids):
             try:
+                asset = job_results.get_asset(asset_id)
                 _log.debug(
-
+                    "Generating output path for asset %s from job %s...",
+                    asset_id,
+                    job.job_id,
+                )
+                output_path = self._output_path_gen(
+                    self._output_dir, idx, row, asset_id
                 )
-                output_path = self._output_path_gen(self._output_dir, idx, row)
                 # Make the output path
                 output_path.parent.mkdir(parents=True, exist_ok=True)
                 asset.download(output_path)
                 # Add to the list of downloaded products
-                job_products[f"{job.job_id}_{
+                job_products[f"{job.job_id}_{asset_id}"] = [output_path]
                 _log.debug(
-
+                    "Downloaded %s from job %s -> %s",
+                    asset_id,
+                    job.job_id,
+                    output_path,
                 )
             except Exception as e:
                 _log.exception(
-
+                    "Error downloading asset %s from job %s:\n%s",
+                    asset_id,
+                    job.job_id,
+                    e,
                 )
                 raise e
 
```
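Downloads are now keyed on asset IDs, so each asset gets a stable, per-asset output path. The loop, condensed into a standalone sketch (assumes an openeo `BatchJob` object named `job`; the target directory is illustrative and error handling is omitted):

```python
from pathlib import Path

output_dir = Path("results")  # hypothetical target directory

job_results = job.get_results()
asset_ids = [asset.name for asset in job_results.get_assets()]
for idx, asset_id in enumerate(asset_ids):
    asset = job_results.get_asset(asset_id)
    output_path = output_dir / f"{idx}_{asset_id}"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    asset.download(output_path)
```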
```diff
@@ -302,53 +470,35 @@ class GFMAPJobManager(MultiBackendJobManager):
                     asset.href = str(
                         asset_path
                     )  # Update the asset href to the output location set by the output_path_generator
-
+
                     # Add the item to the the current job items.
                     job_items.append(item)
-                    _log.info(
+                    _log.info("Parsed item %s from job %s", item.id, job.job_id)
             except Exception as e:
                 _log.exception(
-
+                    "Error failed to add item %s from job %s to STAC collection:\n%s",
+                    item.id,
+                    job.job_id,
                     e,
                 )
-                raise e
 
         # _post_job_action returns an updated list of stac items. Post job action can therefore
         # update the stac items and access their products through the HREF. It is also the
         # reponsible of adding the appropriate metadata/assets to the items.
         if self._post_job_action is not None:
-            _log.debug(
-            job_items = self._post_job_action(job_items, row
+            _log.debug("Calling post job action for job %s...", job.job_id)
+            job_items = self._post_job_action(job_items, row)
 
-            _log.info(
+        _log.info("Adding %s items to the STAC collection...", len(job_items))
 
-
-
-
-        job_items = [item for item in job_items if item.id not in existing_ids]
+        if self.stac_enabled:
+            with self.lock:
+                self._update_stac(job.job_id, job_items)
 
-
-        _log.info(f"Added {len(job_items)} items to the STAC collection.")
-
-        _log.info(f"Writing STAC collection for {job.job_id} to file...")
-        try:
-            self._write_stac()
-        except Exception as e:
-            _log.exception(
-                f"Error writing STAC collection for job {job.job_id} to file.", e
-            )
-            raise e
-        _log.info(f"Wrote STAC collection for {job.job_id} to file.")
-
-        _log.info(f"Job {job.job_id} and post job action finished successfully.")
+        _log.info("Job %s and post job action finished successfully.", job.job_id)
 
     def _normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Ensure we have the required columns and the expected type for the geometry column.
-
-        :param df: The dataframe to normalize.
-        :return: a new dataframe that is normalized.
-        """
-
+        """Ensure we have the required columns and the expected type for the geometry column."""
         # check for some required columns.
         required_with_default = [
             ("status", "not_started"),
```
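As the surviving comment explains, `post_job_action` receives the parsed STAC items plus the job row and must hand back the (possibly updated) item list. A minimal conforming callback, with a hypothetical property name:

```python
import pandas as pd
import pystac


def post_job_action(
    job_items: list[pystac.Item], row: pd.Series
) -> list[pystac.Item]:
    for item in job_items:
        # Hypothetical enrichment: record which job row produced the item.
        item.properties["source_job_id"] = str(row["id"])
    return job_items
```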
```diff
@@ -366,7 +516,7 @@ class GFMAPJobManager(MultiBackendJobManager):
         }
         df = df.assign(**new_columns)
 
-        _log.debug(
+        _log.debug("Normalizing dataframe. Columns: %s", df.columns)
 
         return df
 
```
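This hunk and several others swap f-string log calls for %-style arguments. With the standard `logging` module that defers interpolation until a handler actually emits the record, so debug messages that are filtered out cost almost nothing:

```python
import logging

_log = logging.getLogger("openeo_gfmap")  # illustrative logger name

columns = ["status", "id", "costs"]
# Formatted lazily, only if a DEBUG handler emits the record:
_log.debug("Normalizing dataframe. Columns: %s", columns)
# An f-string is formatted eagerly, even when DEBUG is disabled:
_log.debug(f"Normalizing dataframe. Columns: {columns}")
```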
```diff
@@ -401,7 +551,7 @@ class GFMAPJobManager(MultiBackendJobManager):
             The file to track the results of the jobs.
         """
         # Starts the thread pool to work on the on_job_done and on_job_error methods
-        _log.info(
+        _log.info("Starting ThreadPoolExecutor with %s workers.", self._n_threads)
         with ThreadPoolExecutor(max_workers=self._n_threads) as executor:
             _log.info("Creating and running jobs.")
             self._executor = executor
```
```diff
@@ -412,6 +562,13 @@ class GFMAPJobManager(MultiBackendJobManager):
             self._wait_queued_actions()
             _log.info("Exiting ThreadPoolExecutor.")
             self._executor = None
+        _log.info("All jobs finished running.")
+        if self.stac_enabled:
+            _log.info("Saving persisted STAC collection to final .json collection.")
+            self._write_stac()
+            _log.info("Saved STAC catalogue to JSON format, all tasks finished!")
+        else:
+            _log.info("STAC was disabled, skipping generation of the catalogue.")
 
     def _write_stac(self):
         """Writes the STAC collection to the output directory."""
```
```diff
@@ -428,6 +585,36 @@ class GFMAPJobManager(MultiBackendJobManager):
         self._root_collection.normalize_hrefs(str(root_path))
         self._root_collection.save(catalog_type=CatalogType.SELF_CONTAINED)
 
+    def _persist_stac(self):
+        """Persists the STAC collection by saving it into a binary file."""
+        _log.debug("Validating the STAC collection before persisting.")
+        self._root_collection.validate_all()
+        _log.info("Persisting STAC collection to temp file %s.", self._catalogue_cache)
+        with open(self._catalogue_cache, "wb") as file:
+            pickle.dump(self._root_collection, file)
+
+    def _update_stac(self, job_id: str, job_items: list[pystac.Item]):
+        """Updates the STAC collection by adding the items generated by the job.
+        Does not add duplicates or override with the same item ID.
+        """
+        try:
+            _log.info("Thread %s entered the STAC lock.", threading.get_ident())
+            # Filters the job items to only keep the ones that are not already in the collection
+            existing_ids = [item.id for item in self._root_collection.get_all_items()]
+            job_items = [item for item in job_items if item.id not in existing_ids]
+
+            self._root_collection.add_items(job_items)
+            _log.info("Added %s items to the STAC collection.", len(job_items))
+
+            self._persist_stac()
+        except Exception as e:
+            _log.exception(
+                "Error adding items to the STAC collection for job %s:\n%s ",
+                job_id,
+                str(e),
+            )
+            raise e
+
     def setup_stac(
         self,
         constellation: Optional[str] = None,
```