openeo-gfmap 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openeo_gfmap/features/feature_extractor.py +9 -0
- openeo_gfmap/fetching/__init__.py +16 -4
- openeo_gfmap/fetching/commons.py +1 -0
- openeo_gfmap/fetching/generic.py +81 -73
- openeo_gfmap/fetching/s1.py +1 -3
- openeo_gfmap/fetching/s2.py +1 -0
- openeo_gfmap/inference/model_inference.py +5 -2
- openeo_gfmap/manager/job_manager.py +269 -83
- openeo_gfmap/manager/job_splitters.py +41 -18
- openeo_gfmap/stac/constants.py +1 -1
- openeo_gfmap/utils/__init__.py +16 -0
- openeo_gfmap/utils/catalogue.py +165 -34
- openeo_gfmap/utils/split_stac.py +125 -0
- {openeo_gfmap-0.1.0.dist-info → openeo_gfmap-0.2.0.dist-info}/METADATA +1 -1
- {openeo_gfmap-0.1.0.dist-info → openeo_gfmap-0.2.0.dist-info}/RECORD +17 -17
- {openeo_gfmap-0.1.0.dist-info → openeo_gfmap-0.2.0.dist-info}/WHEEL +1 -1
- openeo_gfmap/fetching/meteo.py +0 -126
- {openeo_gfmap-0.1.0.dist-info → openeo_gfmap-0.2.0.dist-info}/licenses/LICENSE +0 -0
openeo_gfmap/manager/job_manager.py CHANGED
@@ -1,9 +1,11 @@
 import json
+import pickle
 import threading
+import time
 from concurrent.futures import ThreadPoolExecutor
-from enum import Enum
 from functools import partial
 from pathlib import Path
+from threading import Lock
 from typing import Callable, Optional, Union

 import pandas as pd
@@ -16,28 +18,62 @@ from openeo_gfmap.manager import _log
 from openeo_gfmap.stac import constants

 # Lock to use when writing to the STAC collection
-_stac_lock =
+_stac_lock = Lock()
+
+
+def retry_on_exception(max_retries: int, delay_s: int = 180):
+    """Decorator to retry a function if an exception occurs.
+    Used for post-job actions that can crash due to internal backend issues. Restarting the action
+    usually helps to solve the issue.
+
+    Parameters
+    ----------
+    max_retries: int
+        The maximum number of retries to attempt before finally raising the exception.
+    delay: int (default=180 seconds)
+        The delay in seconds to wait before retrying the decorated function.
+    """
+
+    def decorator(func):
+        def wrapper(*args, **kwargs):
+            latest_exception = None
+            for _ in range(max_retries):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    time.sleep(
+                        delay_s
+                    )  # Waits before retrying, while allowing other futures to run.
+                    latest_exception = e
+            raise latest_exception
+
+        return wrapper
+
+    return decorator


 def done_callback(future, df, idx):
-    """
+    """Changes the status of the job when the post-job action future is done."""
     current_status = df.loc[idx, "status"]
-
+    exception = future.exception()
+    if exception is None:
         if current_status == "postprocessing":
             df.loc[idx, "status"] = "finished"
         elif current_status == "postprocessing-error":
             df.loc[idx, "status"] = "error"
+        elif current_status == "running":
+            df.loc[idx, "status"] = "running"
         else:
             raise ValueError(
                 f"Invalid status {current_status} for job {df.loc[idx, 'id']} for done_callback!"
             )
-
-
-
-
-
-
-
+    else:
+        _log.exception(
+            "Exception occurred in post-job future for job %s:\n%s",
+            df.loc[idx, "id"],
+            exception,
+        )
+        df.loc[idx, "status"] = "error"


 class GFMAPJobManager(MultiBackendJobManager):
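The hunk above adds a generic retry decorator and reworks done_callback to inspect the future's exception instead of crashing the polling loop. Below is a minimal, self-contained sketch of how that retry-plus-done-callback pattern behaves; the flaky action, the job ID and the short delays are illustrative and not part of openeo-gfmap:

import random
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial


def retry_on_exception(max_retries: int, delay_s: int = 1):
    # Same shape as the decorator introduced in the diff: retry, then re-raise the last error.
    def decorator(func):
        def wrapper(*args, **kwargs):
            latest_exception = None
            for _ in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    time.sleep(delay_s)  # back off before the next attempt
                    latest_exception = e
            raise latest_exception

        return wrapper

    return decorator


@retry_on_exception(max_retries=3, delay_s=1)
def flaky_post_job_action(job_id: str) -> str:
    # Fails randomly to show that the decorator retries transient errors.
    if random.random() < 0.5:
        raise RuntimeError(f"transient backend error for {job_id}")
    return f"{job_id}: post-processing done"


def report(future, job_id):
    # Mirrors done_callback(future, df, idx): runs once the future settles and
    # checks future.exception() instead of letting it propagate silently.
    exc = future.exception()
    print(job_id, "failed:" if exc else "ok:", exc or future.result())


with ThreadPoolExecutor(max_workers=2) as executor:
    future = executor.submit(flaky_post_job_action, "job-001")
    future.add_done_callback(partial(report, job_id="job-001"))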
@@ -53,13 +89,50 @@ class GFMAPJobManager(MultiBackendJobManager):
         post_job_action: Optional[Callable] = None,
         poll_sleep: int = 5,
         n_threads: int = 1,
-        post_job_params: dict = {},
         resume_postproc: bool = True,  # If we need to check for post-job actions that crashed
         restart_failed: bool = False,  # If we need to restart failed jobs
+        stac_enabled: bool = True,
     ):
+        """
+        Initializes the GFMAP job manager.
+
+        Parameters
+        ----------
+        output_dir: Path
+            The base output directory where the results/stac/logs of the jobs will be stored.
+        output_path_generator: Callable
+            User defined function that generates the output path for the job results. Expects as
+            inputs the output directory, the index of the job in the job dataframe
+            and the row of the job, and returns the final path where to save a job result asset.
+        collection_id: Optional[str]
+            The ID of the STAC collection that is being generated. Can be left empty if the STAC
+            catalogue is not being generated or if it is being resumed from an existing catalogue.
+        collection_description: Optional[str]
+            The description of the STAC collection that is being generated.
+        stac: Optional[Union[str, Path]]
+            The path to the STAC collection to be saved or resumed.
+            If None, the default path will be used.
+        post_job_action: Optional[Callable]
+            A user defined function that will be called after a job is finished. It will receive
+            the list of items generated by the job and the row of the job, and should return the
+            updated list of items.
+        poll_sleep: int
+            The time in seconds to wait between polling the backend for job status.
+        n_threads: int
+            The number of threads to execute `on_job_done` and `on_job_error` functions.
+        resume_postproc: bool
+            If set to true, all `on_job_done` and `on_job_error` functions that failed are resumed.
+        restart_failed: bool
+            If set to true, all jobs that failed within the OpenEO backend are restarted.
+        stac_enabled: bool (default=True)
+            If the STAC generation is enabled or not. Disabling it will prevent the creation,
+            update and loading of the STAC collection.
+        """
         self._output_dir = output_dir
+        self._catalogue_cache = output_dir / "catalogue_cache.bin"

         self.stac = stac
+        self.stac_enabled = stac_enabled
         self.collection_id = collection_id
         self.collection_description = collection_description

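The new constructor docstring describes two user-supplied callables. The sketch below shows what such callables could look like, following the calling convention stated in the docstring (output_path_generator receives the output directory, an index and the dataframe row; post_job_action receives the STAC items and the row). The column name "sample_id" and the file-naming scheme are assumptions for illustration only:

from pathlib import Path

import pandas as pd


def my_output_path_generator(output_dir: Path, idx: int, row: pd.Series) -> Path:
    # Called by the manager as output_path_generator(output_dir, idx, row);
    # returns the path where a downloaded job asset should be written.
    return output_dir / str(row.get("sample_id", "unknown")) / f"asset_{idx}.tif"


def my_post_job_action(job_items: list, row: pd.Series) -> list:
    # Receives the STAC items produced by a finished job plus its dataframe row,
    # and must return the (possibly modified) list of items.
    for item in job_items:
        item.properties["source_row"] = str(row.name)
    return job_items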
@@ -74,41 +147,73 @@ class GFMAPJobManager(MultiBackendJobManager):

         self._output_path_gen = output_path_generator
         self._post_job_action = post_job_action
-        self._post_job_params = post_job_params

         # Monkey patching the _normalize_df method to ensure we have no modification on the
         # geometry column
         MultiBackendJobManager._normalize_df = self._normalize_df
         super().__init__(poll_sleep)

-
+        if self.stac_enabled:
+            self._root_collection = self._initialize_stac()

-    def
+    def _load_stac(self) -> Optional[pystac.Collection]:
+        """
+        Loads the STAC collection from the cache, the specified `stac` path or the default path.
+        If no STAC collection is found, returns None.
+        """
         default_collection_path = self._output_dir / "stac/collection.json"
-        if self.
+        if self._catalogue_cache.exists():
             _log.info(
-
+                "Loading the STAC collection from the persisted binary file: %s.",
+                self._catalogue_cache,
             )
-
+            with open(self._catalogue_cache, "rb") as file:
+                return pickle.load(file)
+        elif self.stac is not None:
+            _log.info(
+                "Reloading the STAC collection from the provided path: %s.", self.stac
+            )
+            return pystac.read_file(str(self.stac))
         elif default_collection_path.exists():
             _log.info(
-
+                "Reload the STAC collection from the default path: %s.",
+                default_collection_path,
             )
             self.stac = default_collection_path
-
-
-
-
-
-
-
-
-
-
+            return pystac.read_file(str(self.stac))
+
+        _log.info(
+            "No STAC collection found as cache, in the default path or in the provided path."
+        )
+        return None
+
+    def _create_stac(self) -> pystac.Collection:
+        """
+        Creates and returns new STAC collection. The created stac collection will use the
+        `collection_id` and `collection_description` parameters set in the constructor.
+        """
+        if self.collection_id is None:
+            raise ValueError(
+                "A collection ID is required to generate a STAC collection."
             )
-
-
-
+        collection = pystac.Collection(
+            id=self.collection_id,
+            description=self.collection_description,
+            extent=None,
+        )
+        collection.license = constants.LICENSE
+        collection.add_link(constants.LICENSE_LINK)
+        collection.stac_extensions = constants.STAC_EXTENSIONS
+        return collection
+
+    def _initialize_stac(self) -> pystac.Collection:
+        """
+        Loads and returns if possible an existing stac collection, otherwise creates a new one.
+        """
+        root_collection = self._load_stac()
+        if not root_collection:
+            _log.info("Starting a fresh STAC collection.")
+            root_collection = self._create_stac()

         return root_collection

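The _initialize_stac hunk above follows a cache-first load-or-create order: binary pickle cache, then an explicit collection path, then the default path, and finally a fresh collection. A reduced, pystac-free sketch of that order, with made-up path names and a caller-provided build step, is shown below:

import pickle
from pathlib import Path


def load_or_create(cache_file: Path, json_file: Path, build):
    if cache_file.exists():
        # Fast path: reuse the binary snapshot persisted by a previous run.
        with open(cache_file, "rb") as fh:
            return pickle.load(fh)
    if json_file.exists():
        # Fallback: resume from the last serialized collection on disk.
        return build(source=json_file)
    # Nothing on disk yet: start a fresh collection.
    return build(source=None)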
@@ -150,24 +255,40 @@ class GFMAPJobManager(MultiBackendJobManager):
             job = connection.job(row.id)
             if row.status == "postprocessing":
                 _log.info(
-
+                    "Resuming postprocessing of job %s, queueing on_job_finished...",
+                    row.id,
+                )
+                future = self._executor.submit(self.on_job_done, job, row, _stac_lock)
+                future.add_done_callback(
+                    partial(
+                        done_callback,
+                        df=df,
+                        idx=idx,
+                    )
                 )
-                future = self._executor.submit(self.on_job_done, job, row)
-                future.add_done_callback(partial(done_callback, df=df, idx=idx))
             else:
                 _log.info(
-
+                    "Resuming postprocessing of job %s, queueing on_job_error...",
+                    row.id,
                 )
                 future = self._executor.submit(self.on_job_error, job, row)
-                future.add_done_callback(
+                future.add_done_callback(
+                    partial(
+                        done_callback,
+                        df=df,
+                        idx=idx,
+                    )
+                )
             self._futures.append(future)

     def _restart_failed_jobs(self, df: pd.DataFrame):
         """Sets-up failed jobs as "not_started" as they will be restarted by the manager."""
-        failed_tasks = df[df.status
+        failed_tasks = df[df.status.isin(["error", "start_failed"])]
         not_started_tasks = df[df.status == "not_started"]
         _log.info(
-
+            "Resetting %s failed jobs to 'not_started'. %s jobs are already 'not_started'.",
+            len(failed_tasks),
+            len(not_started_tasks),
         )
         for idx, _ in failed_tasks.iterrows():
            df.loc[idx, "status"] = "not_started"
@@ -203,27 +324,53 @@ class GFMAPJobManager(MultiBackendJobManager):
                 job_metadata["status"] == "finished"
             ):
                 _log.info(
-
+                    "Job %s finished successfully, queueing on_job_done...", job.job_id
                 )
                 job_status = "postprocessing"
-                future = self._executor.submit(self.on_job_done, job, row)
+                future = self._executor.submit(self.on_job_done, job, row, _stac_lock)
                 # Future will setup the status to finished when the job is done
-                future.add_done_callback(
+                future.add_done_callback(
+                    partial(
+                        done_callback,
+                        df=df,
+                        idx=idx,
+                    )
+                )
                 self._futures.append(future)
-
+                if "costs" in job_metadata:
+                    df.loc[idx, "costs"] = job_metadata["costs"]
+                    df.loc[idx, "memory"] = (
+                        job_metadata["usage"]
+                        .get("max_executor_memory", {})
+                        .get("value", None)
+                    )
+
+                else:
+                    _log.warning(
+                        "Costs not found in job %s metadata. Costs will be set to 'None'.",
+                        job.job_id,
+                    )

             # Case in which it failed
             if (df.loc[idx, "status"] != "error") and (
                 job_metadata["status"] == "error"
             ):
                 _log.info(
-
+                    "Job %s finished with error, queueing on_job_error...",
+                    job.job_id,
                 )
                 job_status = "postprocessing-error"
                 future = self._executor.submit(self.on_job_error, job, row)
                 # Future will setup the status to error when the job is done
-                future.add_done_callback(
+                future.add_done_callback(
+                    partial(
+                        done_callback,
+                        df=df,
+                        idx=idx,
+                    )
+                )
                 self._futures.append(future)
+                if "costs" in job_metadata:
                     df.loc[idx, "costs"] = job_metadata["costs"]

             df.loc[idx, "status"] = job_status
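The hunk above also starts recording costs and peak executor memory from the backend's job metadata, using chained dict lookups so that missing "usage" fields degrade to None rather than raising. A small standalone sketch of that defensive lookup, with a made-up metadata dictionary:

# job_metadata is a plain dict returned by the backend; the values here are invented.
job_metadata = {
    "status": "finished",
    "costs": 12.5,
    "usage": {"max_executor_memory": {"unit": "gb-seconds", "value": 4321.0}},
}

costs = job_metadata.get("costs")  # None when the backend reports no costs
memory = (
    job_metadata.get("usage", {})
    .get("max_executor_memory", {})
    .get("value", None)
)
print(costs, memory)  # 12.5 4321.0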
@@ -231,6 +378,7 @@ class GFMAPJobManager(MultiBackendJobManager):
        # Clear the futures that are done and raise their potential exceptions if they occurred.
        self._clear_queued_actions()

+    @retry_on_exception(max_retries=2, delay_s=180)
     def on_job_error(self, job: BatchJob, row: pd.Series):
         """Method called when a job finishes with an error.

@@ -241,7 +389,14 @@ class GFMAPJobManager(MultiBackendJobManager):
        row: pd.Series
            The row in the dataframe that contains the job relative information.
        """
-
+        try:
+            logs = job.logs()
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            _log.exception(
+                "Error getting logs in `on_job_error` for job %s:\n%s", job.job_id, e
+            )
+            logs = []
+
        error_logs = [log for log in logs if log.level.lower() == "error"]

        job_metadata = job.describe_job()
@@ -260,15 +415,21 @@ class GFMAPJobManager(MultiBackendJobManager):
                f"Couldn't find any error logs. Please check the error manually on job ID: {job.job_id}."
            )

-
+    @retry_on_exception(max_retries=2, delay_s=30)
+    def on_job_done(
+        self, job: BatchJob, row: pd.Series, lock: Lock
+    ):  # pylint: disable=arguments-differ
        """Method called when a job finishes successfully. It will first download the results of
        the job and then call the `post_job_action` method.
        """
+
        job_products = {}
        for idx, asset in enumerate(job.get_results().get_assets()):
            try:
                _log.debug(
-
+                    "Generating output path for asset %s from job %s...",
+                    asset.name,
+                    job.job_id,
                )
                output_path = self._output_path_gen(self._output_dir, idx, row)
                # Make the output path
@@ -277,11 +438,17 @@ class GFMAPJobManager(MultiBackendJobManager):
                # Add to the list of downloaded products
                job_products[f"{job.job_id}_{asset.name}"] = [output_path]
                _log.debug(
-
+                    "Downloaded %s from job %s -> %s",
+                    asset.name,
+                    job.job_id,
+                    output_path,
                )
            except Exception as e:
                _log.exception(
-
+                    "Error downloading asset %s from job %s:\n%s",
+                    asset.name,
+                    job.job_id,
+                    e,
                )
                raise e

@@ -302,53 +469,35 @@ class GFMAPJobManager(MultiBackendJobManager):
                    asset.href = str(
                        asset_path
                    )  # Update the asset href to the output location set by the output_path_generator
-
+
                # Add the item to the the current job items.
                job_items.append(item)
-                _log.info(
+                _log.info("Parsed item %s from job %s", item.id, job.job_id)
            except Exception as e:
                _log.exception(
-
+                    "Error failed to add item %s from job %s to STAC collection:\n%s",
+                    item.id,
+                    job.job_id,
                    e,
                )
-                raise e

        # _post_job_action returns an updated list of stac items. Post job action can therefore
        # update the stac items and access their products through the HREF. It is also the
        # reponsible of adding the appropriate metadata/assets to the items.
        if self._post_job_action is not None:
-            _log.debug(
-            job_items = self._post_job_action(job_items, row
+            _log.debug("Calling post job action for job %s...", job.job_id)
+            job_items = self._post_job_action(job_items, row)

-            _log.info(
+        _log.info("Adding %s items to the STAC collection...", len(job_items))

-
-
-
-            job_items = [item for item in job_items if item.id not in existing_ids]
+        if self.stac_enabled:
+            with lock:
+                self._update_stac(job.job_id, job_items)

-
-            _log.info(f"Added {len(job_items)} items to the STAC collection.")
-
-            _log.info(f"Writing STAC collection for {job.job_id} to file...")
-            try:
-                self._write_stac()
-            except Exception as e:
-                _log.exception(
-                    f"Error writing STAC collection for job {job.job_id} to file.", e
-                )
-                raise e
-            _log.info(f"Wrote STAC collection for {job.job_id} to file.")
-
-            _log.info(f"Job {job.job_id} and post job action finished successfully.")
+        _log.info("Job %s and post job action finished successfully.", job.job_id)

    def _normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Ensure we have the required columns and the expected type for the geometry column.
-
-        :param df: The dataframe to normalize.
-        :return: a new dataframe that is normalized.
-        """
-
+        """Ensure we have the required columns and the expected type for the geometry column."""
        # check for some required columns.
        required_with_default = [
            ("status", "not_started"),
@@ -366,7 +515,7 @@ class GFMAPJobManager(MultiBackendJobManager):
        }
        df = df.assign(**new_columns)

-        _log.debug(
+        _log.debug("Normalizing dataframe. Columns: %s", df.columns)

        return df

@@ -401,7 +550,7 @@ class GFMAPJobManager(MultiBackendJobManager):
            The file to track the results of the jobs.
        """
        # Starts the thread pool to work on the on_job_done and on_job_error methods
-        _log.info(
+        _log.info("Starting ThreadPoolExecutor with %s workers.", self._n_threads)
        with ThreadPoolExecutor(max_workers=self._n_threads) as executor:
            _log.info("Creating and running jobs.")
            self._executor = executor
@@ -412,6 +561,13 @@ class GFMAPJobManager(MultiBackendJobManager):
            self._wait_queued_actions()
            _log.info("Exiting ThreadPoolExecutor.")
            self._executor = None
+        _log.info("All jobs finished running.")
+        if self.stac_enabled:
+            _log.info("Saving persisted STAC collection to final .json collection.")
+            self._write_stac()
+            _log.info("Saved STAC catalogue to JSON format, all tasks finished!")
+        else:
+            _log.info("STAC was disabled, skipping generation of the catalogue.")

    def _write_stac(self):
        """Writes the STAC collection to the output directory."""
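The hunk above changes the run loop so that the final JSON catalogue is only written once every queued post-job action has completed (and only when STAC generation is enabled). A condensed sketch of that run-then-teardown flow, with placeholder jobs and a print standing in for _write_stac():

from concurrent.futures import ThreadPoolExecutor, wait


def run_all(jobs, post_job_action, stac_enabled=True):
    futures = []
    with ThreadPoolExecutor(max_workers=2) as executor:
        for job in jobs:
            futures.append(executor.submit(post_job_action, job))
        wait(futures)  # block until every post-job action has finished
    if stac_enabled:
        print("writing STAC collection to disk")  # stands in for _write_stac()
    else:
        print("STAC disabled, skipping catalogue generation")


run_all(["job-1", "job-2"], post_job_action=lambda j: j.upper())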
@@ -428,6 +584,36 @@ class GFMAPJobManager(MultiBackendJobManager):
        self._root_collection.normalize_hrefs(str(root_path))
        self._root_collection.save(catalog_type=CatalogType.SELF_CONTAINED)

+    def _persist_stac(self):
+        """Persists the STAC collection by saving it into a binary file."""
+        _log.debug("Validating the STAC collection before persisting.")
+        self._root_collection.validate_all()
+        _log.info("Persisting STAC collection to temp file %s.", self._catalogue_cache)
+        with open(self._catalogue_cache, "wb") as file:
+            pickle.dump(self._root_collection, file)
+
+    def _update_stac(self, job_id: str, job_items: list[pystac.Item]):
+        """Updates the STAC collection by adding the items generated by the job.
+        Does not add duplicates or override with the same item ID.
+        """
+        try:
+            _log.info("Thread %s entered the STAC lock.", threading.get_ident())
+            # Filters the job items to only keep the ones that are not already in the collection
+            existing_ids = [item.id for item in self._root_collection.get_all_items()]
+            job_items = [item for item in job_items if item.id not in existing_ids]
+
+            self._root_collection.add_items(job_items)
+            _log.info("Added %s items to the STAC collection.", len(job_items))
+
+            self._persist_stac()
+        except Exception as e:
+            _log.exception(
+                "Error adding items to the STAC collection for job %s:\n%s ",
+                job_id,
+                str(e),
+            )
+            raise e
+
    def setup_stac(
        self,
        constellation: Optional[str] = None,
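_update_stac above is called under the shared _stac_lock and skips items whose IDs are already registered, so that retried or resumed jobs do not duplicate entries. A minimal sketch of that locked, de-duplicating update using plain dictionaries instead of a pystac.Collection (the item IDs below are illustrative):

import threading

_collection_lock = threading.Lock()
collection_items = {"item-001": {"id": "item-001"}}  # already-registered items, keyed by ID


def update_collection(new_items: list[dict]) -> int:
    with _collection_lock:  # only one post-job thread may touch the collection at a time
        added = 0
        for item in new_items:
            if item["id"] not in collection_items:  # skip duplicates from reruns
                collection_items[item["id"]] = item
                added += 1
        # A real implementation would persist the collection here (cf. _persist_stac).
        return added


print(update_collection([{"id": "item-001"}, {"id": "item-002"}]))  # -> 1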
openeo_gfmap/manager/job_splitters.py CHANGED
@@ -12,21 +12,32 @@ import requests
 from openeo_gfmap.manager import _log


-def load_s2_grid() -> gpd.GeoDataFrame:
+def load_s2_grid(web_mercator: bool = False) -> gpd.GeoDataFrame:
    """Returns a geo data frame from the S2 grid."""
    # Builds the path where the geodataframe should be
-
+    if not web_mercator:
+        gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds_4326_v2.geoparquet"
+        url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_4326_v2.geoparquet"
+    else:
+        gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds_3857_v2.geoparquet"
+        url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_3857_v2.geoparquet"
+
    if not gdf_path.exists():
        _log.info("S2 grid not found, downloading it from artifactory.")
        # Downloads the file from the artifactory URL
        gdf_path.parent.mkdir(exist_ok=True)
        response = requests.get(
-
+            url,
            timeout=180,  # 3mins
        )
+        if response.status_code != 200:
+            raise ValueError(
+                "Failed to download the S2 grid from the artifactory. "
+                f"Status code: {response.status_code}"
+            )
        with open(gdf_path, "wb") as f:
            f.write(response.content)
-    return gpd.
+    return gpd.read_parquet(gdf_path)


def _resplit_group(
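load_s2_grid above now fails loudly on a non-200 download instead of silently caching an error page. A hedged sketch of the same download-and-cache behaviour as a standalone helper; the URL is the one visible in the diff, while the local cache directory name is only an assumption:

from pathlib import Path

import requests

GRID_URL = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_4326_v2.geoparquet"


def fetch_cached(url: str = GRID_URL, cache_dir: Path = Path.home() / ".openeo-gfmap") -> Path:
    cache_dir.mkdir(parents=True, exist_ok=True)
    target = cache_dir / url.rsplit("/", 1)[-1]
    if not target.exists():
        response = requests.get(url, timeout=180)
        if response.status_code != 200:  # fail loudly instead of caching an error page
            raise ValueError(f"Download failed with status {response.status_code}")
        target.write_bytes(response.content)
    return target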
@@ -38,7 +49,7 @@ def _resplit_group(


def split_job_s2grid(
-    polygons: gpd.GeoDataFrame, max_points: int = 500
+    polygons: gpd.GeoDataFrame, max_points: int = 500, web_mercator: bool = False
) -> List[gpd.GeoDataFrame]:
    """Split a job into multiple jobs from the position of the polygons/points. The centroid of
    the geometries to extract are used to select tile in the Sentinel-2 tile grid.
@@ -60,17 +71,25 @@ def split_job_s2grid(
    if polygons.crs is None:
        raise ValueError("The GeoDataFrame must contain a CRS")

-
-
-
+    epsg = 3857 if web_mercator else 4326
+
+    original_crs = polygons.crs
+
+    polygons = polygons.to_crs(epsg=epsg)
+
+    polygons["centroid"] = polygons.geometry.centroid

    # Dataset containing all the S2 tiles, find the nearest S2 tile for each point
-    s2_grid = load_s2_grid()
+    s2_grid = load_s2_grid(web_mercator)
    s2_grid["geometry"] = s2_grid.geometry.centroid

-
-
-
+    s2_grid = s2_grid[s2_grid.cdse_valid]
+
+    polygons = gpd.sjoin_nearest(
+        polygons.set_geometry("centroid"), s2_grid[["tile", "geometry"]]
+    ).drop(columns=["index_right", "centroid"])
+
+    polygons = polygons.set_geometry("geometry").to_crs(original_crs)

    split_datasets = []
    for _, sub_gdf in polygons.groupby("tile"):
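The hunk above assigns each geometry to its nearest Sentinel-2 tile centroid with gpd.sjoin_nearest and then groups by tile to form one job per tile. A rough, self-contained sketch of that assignment; the two sample points and the tiny two-tile "grid" are made up:

import geopandas as gpd
from shapely.geometry import Point

points = gpd.GeoDataFrame(
    {"sample_id": ["a", "b"]},
    geometry=[Point(4.35, 50.85), Point(5.57, 50.63)],
    crs="EPSG:4326",
).to_crs(epsg=3857)  # project before centroid/distance computations

tiles = gpd.GeoDataFrame(
    {"tile": ["31UFS", "31UGS"]},
    geometry=[Point(4.5, 50.9), Point(5.5, 50.6)],
    crs="EPSG:4326",
).to_crs(epsg=3857)

# Each point inherits the "tile" of its nearest grid centroid, then one job per tile.
joined = gpd.sjoin_nearest(points, tiles).drop(columns=["index_right"])
jobs = [sub for _, sub in joined.groupby("tile")]
print(len(jobs))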
@@ -86,10 +105,13 @@ def append_h3_index(
    polygons: gpd.GeoDataFrame, grid_resolution: int = 3
) -> gpd.GeoDataFrame:
    """Append the H3 index to the polygons."""
-
-
-
-
+
+    # Project to Web mercator to calculate centroids
+    polygons = polygons.to_crs(epsg=3857)
+    geom_col = polygons.geometry.centroid
+    # Project to lat lon to calculate the h3 index
+    geom_col = geom_col.to_crs(epsg=4326)
+
    polygons["h3index"] = geom_col.apply(
        lambda pt: h3.geo_to_h3(pt.y, pt.x, grid_resolution)
    )
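append_h3_index now computes centroids in Web Mercator and converts them back to lat/lon before deriving the H3 cell. The sketch below reproduces that centroid-to-H3 step on a one-row GeoDataFrame; it relies on the h3 v3 API (h3.geo_to_h3) that the diff itself calls, and the sample point is illustrative:

import geopandas as gpd
import h3
from shapely.geometry import Point

gdf = gpd.GeoDataFrame(
    {"sample_id": ["a"]}, geometry=[Point(4.35, 50.85)], crs="EPSG:4326"
)

# Centroid in a metric CRS, then back to lat/lon for the H3 lookup (resolution 3 is the default).
centroids = gdf.to_crs(epsg=3857).geometry.centroid.to_crs(epsg=4326)
gdf["h3index"] = centroids.apply(lambda pt: h3.geo_to_h3(pt.y, pt.x, 3))
print(gdf["h3index"].iloc[0])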
@@ -127,12 +149,13 @@ def split_job_hex(
    if polygons.crs is None:
        raise ValueError("The GeoDataFrame must contain a CRS")

-
-    polygons = polygons.to_crs(epsg=4326)
+    original_crs = polygons.crs

    # Split the polygons into multiple jobs
    polygons = append_h3_index(polygons, grid_resolution)

+    polygons = polygons.to_crs(original_crs)
+
    split_datasets = []
    for _, sub_gdf in polygons.groupby("h3index"):
        if len(sub_gdf) > max_points:
openeo_gfmap/stac/constants.py CHANGED