openeo-gfmap 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openeo_gfmap/__init__.py +23 -0
- openeo_gfmap/backend.py +122 -0
- openeo_gfmap/features/__init__.py +17 -0
- openeo_gfmap/features/feature_extractor.py +389 -0
- openeo_gfmap/fetching/__init__.py +21 -0
- openeo_gfmap/fetching/commons.py +213 -0
- openeo_gfmap/fetching/fetching.py +98 -0
- openeo_gfmap/fetching/generic.py +165 -0
- openeo_gfmap/fetching/meteo.py +126 -0
- openeo_gfmap/fetching/s1.py +195 -0
- openeo_gfmap/fetching/s2.py +236 -0
- openeo_gfmap/inference/__init__.py +3 -0
- openeo_gfmap/inference/model_inference.py +347 -0
- openeo_gfmap/manager/__init__.py +31 -0
- openeo_gfmap/manager/job_manager.py +469 -0
- openeo_gfmap/manager/job_splitters.py +144 -0
- openeo_gfmap/metadata.py +24 -0
- openeo_gfmap/preprocessing/__init__.py +22 -0
- openeo_gfmap/preprocessing/cloudmasking.py +268 -0
- openeo_gfmap/preprocessing/compositing.py +74 -0
- openeo_gfmap/preprocessing/interpolation.py +12 -0
- openeo_gfmap/preprocessing/sar.py +64 -0
- openeo_gfmap/preprocessing/scaling.py +65 -0
- openeo_gfmap/preprocessing/udf_cldmask.py +36 -0
- openeo_gfmap/preprocessing/udf_rank.py +37 -0
- openeo_gfmap/preprocessing/udf_score.py +103 -0
- openeo_gfmap/spatial.py +53 -0
- openeo_gfmap/stac/__init__.py +2 -0
- openeo_gfmap/stac/constants.py +51 -0
- openeo_gfmap/temporal.py +22 -0
- openeo_gfmap/utils/__init__.py +23 -0
- openeo_gfmap/utils/build_df.py +48 -0
- openeo_gfmap/utils/catalogue.py +248 -0
- openeo_gfmap/utils/intervals.py +64 -0
- openeo_gfmap/utils/netcdf.py +25 -0
- openeo_gfmap/utils/tile_processing.py +64 -0
- openeo_gfmap-0.1.0.dist-info/METADATA +57 -0
- openeo_gfmap-0.1.0.dist-info/RECORD +40 -0
- openeo_gfmap-0.1.0.dist-info/WHEEL +4 -0
- openeo_gfmap-0.1.0.dist-info/licenses/LICENSE +201 -0
openeo_gfmap/manager/job_manager.py
ADDED
@@ -0,0 +1,469 @@
import json
import threading
from concurrent.futures import ThreadPoolExecutor
from enum import Enum
from functools import partial
from pathlib import Path
from typing import Callable, Optional, Union

import pandas as pd
import pystac
from openeo.extra.job_management import MultiBackendJobManager
from openeo.rest.job import BatchJob
from pystac import CatalogType

from openeo_gfmap.manager import _log
from openeo_gfmap.stac import constants

# Lock to use when writing to the STAC collection
_stac_lock = threading.Lock()


def done_callback(future, df, idx):
    """Sets the status of the job to the given status when the future is done."""
    current_status = df.loc[idx, "status"]
    if not future.exception():
        if current_status == "postprocessing":
            df.loc[idx, "status"] = "finished"
        elif current_status == "postprocessing-error":
            df.loc[idx, "status"] = "error"
        else:
            raise ValueError(
                f"Invalid status {current_status} for job {df.loc[idx, 'id']} for done_callback!"
            )


class PostJobStatus(Enum):
    """Indicates to the workers whether the job finished successfully or with an error."""

    FINISHED = "finished"
    ERROR = "error"


class GFMAPJobManager(MultiBackendJobManager):
    """A job manager for the GFMAP backend."""

    def __init__(
        self,
        output_dir: Path,
        output_path_generator: Callable,
        collection_id: Optional[str] = None,
        collection_description: Optional[str] = "",
        stac: Optional[Union[str, Path]] = None,
        post_job_action: Optional[Callable] = None,
        poll_sleep: int = 5,
        n_threads: int = 1,
        post_job_params: dict = {},
        resume_postproc: bool = True,  # Whether to check for post-job actions that crashed
        restart_failed: bool = False,  # Whether to restart failed jobs
    ):
        self._output_dir = output_dir

        self.stac = stac
        self.collection_id = collection_id
        self.collection_description = collection_description

        # Set up the threads that work on the on_job_done and on_job_error methods
        self._n_threads = n_threads
        self._executor = None  # Will be set in run_jobs, is a thread pool executor
        self._futures = []
        self._to_resume_postjob = (
            resume_postproc  # Whether to check for post-job actions that crashed
        )
        self._to_restart_failed = restart_failed  # Whether to restart failed jobs

        self._output_path_gen = output_path_generator
        self._post_job_action = post_job_action
        self._post_job_params = post_job_params

        # Monkey-patch the _normalize_df method to ensure no modification is made to the
        # geometry column
        MultiBackendJobManager._normalize_df = self._normalize_df
        super().__init__(poll_sleep)

        self._root_collection = self._normalize_stac()

    def _normalize_stac(self):
        default_collection_path = self._output_dir / "stac/collection.json"
        if self.stac is not None:
            _log.info(
                f"Reloading the STAC collection from the provided path: {self.stac}."
            )
            root_collection = pystac.read_file(str(self.stac))
        elif default_collection_path.exists():
            _log.info(
                f"Reload the STAC collection from the default path: {default_collection_path}."
            )
            self.stac = default_collection_path
            root_collection = pystac.read_file(str(self.stac))
        else:
            _log.info("Starting a fresh STAC collection.")
            assert (
                self.collection_id is not None
            ), "A collection ID is required to generate a STAC collection."
            root_collection = pystac.Collection(
                id=self.collection_id,
                description=self.collection_description,
                extent=None,
            )
            root_collection.license = constants.LICENSE
            root_collection.add_link(constants.LICENSE_LINK)
            root_collection.stac_extensions = constants.STAC_EXTENSIONS

        return root_collection

    def _clear_queued_actions(self):
        """Checks if the post-job actions are finished and clears them from the list of futures.
        If an exception occurred, it is raised to the GFMAPJobManager main thread.
        """
        # Check if any post-job action has finished or not
        futures_to_clear = []
        for future in self._futures:
            if future.done():
                exception = future.exception(timeout=1.0)
                if exception:
                    raise exception
                futures_to_clear.append(future)
        for future in futures_to_clear:
            self._futures.remove(future)

    def _wait_queued_actions(self):
        """Waits for all the queued actions to finish."""
        for future in self._futures:
            # Wait for the future to finish and get the potential exception
            exception = future.exception(timeout=None)
            if exception:
                raise exception

    def _resume_postjob_actions(self, df: pd.DataFrame):
        """Resumes the jobs that were in the `postprocessing` or `postprocessing-error` state, as
        they most likely crashed before finishing their post-job action.

        df: pd.DataFrame
            The job-tracking dataframe initialized or loaded by the multi-backend job manager.
        """
        postprocessing_tasks = df[
            df.status.isin(["postprocessing", "postprocessing-error"])
        ]
        for idx, row in postprocessing_tasks.iterrows():
            connection = self._get_connection(row.backend_name)
            job = connection.job(row.id)
            if row.status == "postprocessing":
                _log.info(
                    f"Resuming postprocessing of job {row.id}, queueing on_job_finished..."
                )
                future = self._executor.submit(self.on_job_done, job, row)
                future.add_done_callback(partial(done_callback, df=df, idx=idx))
            else:
                _log.info(
                    f"Resuming postprocessing of job {row.id}, queueing on_job_error..."
                )
                future = self._executor.submit(self.on_job_error, job, row)
                future.add_done_callback(partial(done_callback, df=df, idx=idx))
            self._futures.append(future)

    def _restart_failed_jobs(self, df: pd.DataFrame):
        """Resets failed jobs to "not_started" so they will be restarted by the manager."""
        failed_tasks = df[df.status == "error"]
        not_started_tasks = df[df.status == "not_started"]
        _log.info(
            f"Resetting {len(failed_tasks)} failed jobs to 'not_started'. {len(not_started_tasks)} jobs are already 'not_started'."
        )
        for idx, _ in failed_tasks.iterrows():
            df.loc[idx, "status"] = "not_started"

    def _update_statuses(self, df: pd.DataFrame):
        """Updates the statuses of the jobs in the dataframe from the backend. If a job is finished
        or failed, it will be queued to the `on_job_done` or `on_job_error` methods.

        The method is executed every `poll_sleep` seconds.
        """
        if self._to_restart_failed:  # Make sure it runs only the first time
            self._restart_failed_jobs(df)
            self._to_restart_failed = False

        if self._to_resume_postjob:  # Make sure it runs only the first time
            self._resume_postjob_actions(df)
            self._to_resume_postjob = False

        active = df[df.status.isin(["created", "queued", "running"])]
        for idx, row in active.iterrows():
            # Parse the backend from the csv
            connection = self._get_connection(row.backend_name)
            job = connection.job(row.id)
            job_metadata = job.describe_job()
            job_status = job_metadata["status"]
            _log.debug(
                msg=f"Status of job {job.job_id} is {job_status} (on backend {row.backend_name}).",
            )

            # Update the status if the job finished since the last check
            # Case in which it finished successfully
            if (df.loc[idx, "status"] in ["created", "queued", "running"]) and (
                job_metadata["status"] == "finished"
            ):
                _log.info(
                    f"Job {job.job_id} finished successfully, queueing on_job_done..."
                )
                job_status = "postprocessing"
                future = self._executor.submit(self.on_job_done, job, row)
                # The future will set the status to finished when the job is done
                future.add_done_callback(partial(done_callback, df=df, idx=idx))
                self._futures.append(future)
                df.loc[idx, "costs"] = job_metadata["costs"]

            # Case in which it failed
            if (df.loc[idx, "status"] != "error") and (
                job_metadata["status"] == "error"
            ):
                _log.info(
                    f"Job {job.job_id} finished with error, queueing on_job_error..."
                )
                job_status = "postprocessing-error"
                future = self._executor.submit(self.on_job_error, job, row)
                # The future will set the status to error when the job is done
                future.add_done_callback(partial(done_callback, df=df, idx=idx))
                self._futures.append(future)
                df.loc[idx, "costs"] = job_metadata["costs"]

            df.loc[idx, "status"] = job_status

        # Clear the futures that are done and raise their potential exceptions if they occurred.
        self._clear_queued_actions()

    def on_job_error(self, job: BatchJob, row: pd.Series):
        """Method called when a job finishes with an error.

        Parameters
        ----------
        job: BatchJob
            The job that finished with an error.
        row: pd.Series
            The row in the dataframe that contains the job-related information.
        """
        logs = job.logs()
        error_logs = [log for log in logs if log.level.lower() == "error"]

        job_metadata = job.describe_job()
        title = job_metadata["title"]
        job_id = job_metadata["id"]

        output_log_path = (
            Path(self._output_dir) / "failed_jobs" / f"{title}_{job_id}.log"
        )
        output_log_path.parent.mkdir(parents=True, exist_ok=True)

        if len(error_logs) > 0:
            output_log_path.write_text(json.dumps(error_logs, indent=2))
        else:
            output_log_path.write_text(
                f"Couldn't find any error logs. Please check the error manually on job ID: {job.job_id}."
            )

    def on_job_done(self, job: BatchJob, row: pd.Series):
        """Method called when a job finishes successfully. It will first download the results of
        the job and then call the `post_job_action` method.
        """
        job_products = {}
        for idx, asset in enumerate(job.get_results().get_assets()):
            try:
                _log.debug(
                    f"Generating output path for asset {asset.name} from job {job.job_id}..."
                )
                output_path = self._output_path_gen(self._output_dir, idx, row)
                # Make the output path
                output_path.parent.mkdir(parents=True, exist_ok=True)
                asset.download(output_path)
                # Add to the list of downloaded products
                job_products[f"{job.job_id}_{asset.name}"] = [output_path]
                _log.debug(
                    f"Downloaded {asset.name} from job {job.job_id} -> {output_path}"
                )
            except Exception as e:
                _log.exception(
                    f"Error downloading asset {asset.name} from job {job.job_id}", e
                )
                raise e

        # First update the STAC collection with the assets directly resulting from the OpenEO batch job
        job_metadata = pystac.Collection.from_dict(job.get_results().get_metadata())
        job_items = []

        for item_metadata in job_metadata.get_all_items():
            try:
                item = pystac.read_file(item_metadata.get_self_href())
                asset_name = list(item.assets.values())[0].title
                asset_path = job_products[f"{job.job_id}_{asset_name}"][0]

                assert (
                    len(item.assets.values()) == 1
                ), "Each item should only contain one asset"
                for asset in item.assets.values():
                    asset.href = str(
                        asset_path
                    )  # Update the asset href to the output location set by the output_path_generator
                # item.id = f"{job.job_id}_{item.id}"
                # Add the item to the current job items.
                job_items.append(item)
                _log.info(f"Parsed item {item.id} from job {job.job_id}")
            except Exception as e:
                _log.exception(
                    f"Error failed to add item {item.id} from job {job.job_id} to STAC collection",
                    e,
                )
                raise e

        # _post_job_action returns an updated list of STAC items. The post-job action can therefore
        # update the STAC items and access their products through the HREF. It is also
        # responsible for adding the appropriate metadata/assets to the items.
        if self._post_job_action is not None:
            _log.debug(f"Calling post job action for job {job.job_id}...")
            job_items = self._post_job_action(job_items, row, self._post_job_params)

        _log.info(f"Adding {len(job_items)} items to the STAC collection...")

        with _stac_lock:  # Take the STAC lock to avoid concurrency issues
            # Filter the job items to only keep the ones that are not already in the collection
            existing_ids = [item.id for item in self._root_collection.get_all_items()]
            job_items = [item for item in job_items if item.id not in existing_ids]

            self._root_collection.add_items(job_items)
            _log.info(f"Added {len(job_items)} items to the STAC collection.")

            _log.info(f"Writing STAC collection for {job.job_id} to file...")
            try:
                self._write_stac()
            except Exception as e:
                _log.exception(
                    f"Error writing STAC collection for job {job.job_id} to file.", e
                )
                raise e
            _log.info(f"Wrote STAC collection for {job.job_id} to file.")

        _log.info(f"Job {job.job_id} and post job action finished successfully.")

    def _normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """Ensure we have the required columns and the expected type for the geometry column.

        :param df: The dataframe to normalize.
        :return: a new dataframe that is normalized.
        """

        # Check for some required columns.
        required_with_default = [
            ("status", "not_started"),
            ("id", None),
            ("start_time", None),
            ("cpu", None),
            ("memory", None),
            ("duration", None),
            ("backend_name", None),
            ("description", None),
            ("costs", None),
        ]
        new_columns = {
            col: val for (col, val) in required_with_default if col not in df.columns
        }
        df = df.assign(**new_columns)

        _log.debug(f"Normalizing dataframe. Columns: {df.columns}")

        return df

    def run_jobs(
        self, df: pd.DataFrame, start_job: Callable, output_file: Union[str, Path]
    ):
        """Starts the jobs defined in the dataframe and runs the `start_job` function on each job.

        Parameters
        ----------
        df: pd.DataFrame
            The dataframe containing the jobs to be started. The dataframe expects the following columns:

            * `backend_name`: Name of the backend to use.
            * Additional fields that will be used in your custom job creation function `start_job`
              as well as in post-job actions and the path generator.

            The following column names are RESERVED for the management of the jobs, please do not
            provide them in the input df:

            * `status`: Current status of the job.
            * `id`: Job ID, used to access job information from the backend.
            * `start_time`: The time at which the job was started.
            * `cpu`: The amount of CPU used by the job.
            * `memory`: The amount of memory used by the job.
            * `duration`: The duration of the job.

        start_job: Callable
            Callable function that takes the row of each job as an argument and creates the
            datacube.
        output_file: Union[str, Path]
            The file to track the results of the jobs.
        """
        # Start the thread pool to work on the on_job_done and on_job_error methods
        _log.info(f"Starting ThreadPoolExecutor with {self._n_threads} workers.")
        with ThreadPoolExecutor(max_workers=self._n_threads) as executor:
            _log.info("Creating and running jobs.")
            self._executor = executor
            super().run_jobs(df, start_job, output_file)
            _log.info(
                "Quitting job tracking & waiting for last post-job actions to finish."
            )
            self._wait_queued_actions()
            _log.info("Exiting ThreadPoolExecutor.")
            self._executor = None

    def _write_stac(self):
        """Writes the STAC collection to the output directory."""
        if not self._root_collection.get_self_href():
            self._root_collection.set_self_href(str(self._output_dir / "stac"))

        self._root_collection.update_extent_from_items()

        # Set up the root path for the normalization
        root_path = Path(self._root_collection.self_href)
        if root_path.is_file():
            root_path = root_path.parent

        self._root_collection.normalize_hrefs(str(root_path))
        self._root_collection.save(catalog_type=CatalogType.SELF_CONTAINED)

    def setup_stac(
        self,
        constellation: Optional[str] = None,
        output_path: Optional[Union[str, Path]] = None,
        item_assets: Optional[dict] = None,
    ):
        """Method to be called after run_jobs to set up details of the STAC collection
        such as the constellation, root directory and item-assets extension.

        Parameters
        ----------
        constellation: Optional[str]
            The constellation for which to create the STAC metadata. If None, no STAC metadata
            will be added. The following constellations are supported:

            * 'sentinel1'
            * 'sentinel2'

        output_path: Optional[Union[str, Path]]
            The path to write the STAC collection to. If None, the STAC collection will be
            written to self.output_dir / 'stac'.
        item_assets: Optional[dict]
            A dictionary containing pystac.extensions.item_assets.AssetDefinition objects to be
            added to the STAC collection (https://github.com/stac-extensions/item-assets).
        """
        if output_path:
            self._root_collection.set_self_href(str(output_path))

        if constellation and "summaries" not in self._root_collection.extra_fields:
            self._root_collection.extra_fields["summaries"] = constants.SUMMARIES.get(
                constellation, pystac.summaries.Summaries({})
            ).to_dict()

        if item_assets and "item_assets" not in self._root_collection.extra_fields:
            item_asset_extension = (
                pystac.extensions.item_assets.ItemAssetsExtension.ext(
                    self._root_collection, add_if_missing=True
                )
            )
            item_asset_extension.item_assets = item_assets
openeo_gfmap/manager/job_splitters.py
ADDED
@@ -0,0 +1,144 @@
"""Job splitter functionalities, expecting input points/polygons to extract in the
form of a GeoDataFrame.
"""

from pathlib import Path
from typing import List

import geopandas as gpd
import h3
import requests

from openeo_gfmap.manager import _log


def load_s2_grid() -> gpd.GeoDataFrame:
    """Returns a GeoDataFrame of the S2 grid."""
    # Build the path where the geodataframe should be
    gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds.geojson"
    if not gdf_path.exists():
        _log.info("S2 grid not found, downloading it from artifactory.")
        # Download the file from the artifactory URL
        gdf_path.parent.mkdir(exist_ok=True)
        response = requests.get(
            "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds.geojson",
            timeout=180,  # 3mins
        )
        with open(gdf_path, "wb") as f:
            f.write(response.content)
    return gpd.read_file(gdf_path)


def _resplit_group(
    polygons: gpd.GeoDataFrame, max_points: int
) -> List[gpd.GeoDataFrame]:
    """Performs re-splitting of a dataset of polygons into a list of datasets."""
    for i in range(0, len(polygons), max_points):
        yield polygons.iloc[i : i + max_points].reset_index(drop=True)


def split_job_s2grid(
    polygons: gpd.GeoDataFrame, max_points: int = 500
) -> List[gpd.GeoDataFrame]:
    """Split a job into multiple jobs from the position of the polygons/points. The centroids of
    the geometries to extract are used to select a tile in the Sentinel-2 tile grid.

    Parameters
    ----------
    polygons: gpd.GeoDataFrame
        Dataset containing the polygons to split the job by, with a `geometry` column.
    max_points: int
        The maximum number of points to be included in each job.

    Returns
    -------
    split_polygons: list
        List of jobs, split by the GeoDataFrame.
    """
    if "geometry" not in polygons.columns:
        raise ValueError("The GeoDataFrame must contain a 'geometry' column.")

    if polygons.crs is None:
        raise ValueError("The GeoDataFrame must contain a CRS")

    polygons = polygons.to_crs(epsg=4326)
    if polygons.geometry.geom_type[0] != "Point":
        polygons["geometry"] = polygons.geometry.centroid

    # Dataset containing all the S2 tiles; find the nearest S2 tile for each point
    s2_grid = load_s2_grid()
    s2_grid["geometry"] = s2_grid.geometry.centroid

    polygons = gpd.sjoin_nearest(polygons, s2_grid[["tile", "geometry"]]).drop(
        columns=["index_right"]
    )

    split_datasets = []
    for _, sub_gdf in polygons.groupby("tile"):
        if len(sub_gdf) > max_points:
            # Perform another split
            split_datasets.extend(_resplit_group(sub_gdf, max_points))
        else:
            split_datasets.append(sub_gdf.reset_index(drop=True))
    return split_datasets


def append_h3_index(
    polygons: gpd.GeoDataFrame, grid_resolution: int = 3
) -> gpd.GeoDataFrame:
    """Append the H3 index to the polygons."""
    if polygons.geometry.geom_type[0] != "Point":
        geom_col = polygons.geometry.centroid
    else:
        geom_col = polygons.geometry
    polygons["h3index"] = geom_col.apply(
        lambda pt: h3.geo_to_h3(pt.y, pt.x, grid_resolution)
    )
    return polygons


def split_job_hex(
    polygons: gpd.GeoDataFrame, max_points: int = 500, grid_resolution: int = 3
) -> List[gpd.GeoDataFrame]:
    """Split a job into multiple jobs from the position of the polygons/points. The centroids of
    the geometries to extract are used to select a hexagon in the H3 grid. Using the H3 grid
    allows splitting jobs into equal areas, which is useful for parallel processing while taking
    into account OpenEO's limitations.

    Parameters
    ----------
    polygons: gpd.GeoDataFrame
        Dataset containing the polygons to split the job by, with a `geometry` column.
    max_points: int
        The maximum number of points to be included in each job.
    grid_resolution: int
        The resolution to use in the H3 hexagonal grid to split jobs to, default is 3. Changing
        the grid resolution will drastically increase/decrease the area on which jobs will work.
        More information on the H3 grid can be found at
        https://h3geo.org/docs/core-library/restable

    Returns
    -------
    split_polygons: list
        List of jobs, split by the GeoDataFrame.
    """

    if "geometry" not in polygons.columns:
        raise ValueError("The GeoDataFrame must contain a 'geometry' column.")

    if polygons.crs is None:
        raise ValueError("The GeoDataFrame must contain a CRS")

    # Project to lat/lon positions
    polygons = polygons.to_crs(epsg=4326)

    # Split the polygons into multiple jobs
    polygons = append_h3_index(polygons, grid_resolution)

    split_datasets = []
    for _, sub_gdf in polygons.groupby("h3index"):
        if len(sub_gdf) > max_points:
            # Perform another split
            split_datasets.extend(_resplit_group(sub_gdf, max_points))
        else:
            split_datasets.append(sub_gdf.reset_index(drop=True))

    return split_datasets
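
A short example of how the splitters above can be combined. The coordinates and `sample_id` values are arbitrary; `load_s2_grid()` downloads the tile grid to ~/.openeo-gfmap on first use, so network access is required the first time.

import geopandas as gpd
from shapely.geometry import Point

from openeo_gfmap.manager.job_splitters import split_job_hex, split_job_s2grid

points = gpd.GeoDataFrame(
    {"sample_id": ["a", "b", "c"]},
    geometry=[Point(4.35, 50.85), Point(4.40, 50.90), Point(13.40, 52.52)],
    crs="EPSG:4326",  # a CRS is mandatory for both splitters
)

# One GeoDataFrame per Sentinel-2 tile, each holding at most `max_points` rows.
jobs_by_tile = split_job_s2grid(points, max_points=2)
for job_gdf in jobs_by_tile:
    print(job_gdf["tile"].iloc[0], len(job_gdf))

# Alternative: split by H3 hexagons of roughly equal area.
jobs_by_hex = split_job_hex(points, max_points=2, grid_resolution=3)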
openeo_gfmap/metadata.py
ADDED
@@ -0,0 +1,24 @@
"""Metadata utilities related to the usage of a DataCube. Used to interact
with the OpenEO backends and cover some shortcomings.
"""

from dataclasses import dataclass


@dataclass
class FakeMetadata:
    """Fake metadata object used for datacubes fetched from STAC catalogues.
    This is used as a temporary fix for OpenEO backend shortcomings, but
    will become unused over time.
    """

    band_names: list

    def rename_labels(self, _, target, source):
        """Rename the labels of the band dimension."""
        mapping = dict(zip(target, source))
        band_names = self.band_names.copy()
        for idx, name in enumerate(band_names):
            if name in target:
                self.band_names[idx] = mapping[name]
        return self
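
As a quick illustration of the renaming behaviour implemented above: band names listed in `target` are replaced by the value paired with them in `source` (the band names used here are arbitrary).

from openeo_gfmap.metadata import FakeMetadata

meta = FakeMetadata(band_names=["B02", "B03", "B04"])
# Labels present in `target` are mapped to the paired entry of `source`.
meta.rename_labels("bands", target=["B02", "B04"], source=["blue", "red"])
print(meta.band_names)  # ['blue', 'B03', 'red']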
openeo_gfmap/preprocessing/__init__.py
ADDED
@@ -0,0 +1,22 @@
"""Preprocessing functions for OpenEO DataCubes. The preprocessing occurs
right after the extraction and before the execution of the features UDF.
"""

from openeo_gfmap.preprocessing.cloudmasking import (
    bap_masking,
    get_bap_mask,
    get_bap_score,
    mask_scl_dilation,
)
from openeo_gfmap.preprocessing.compositing import mean_compositing, median_compositing
from openeo_gfmap.preprocessing.interpolation import linear_interpolation

__all__ = [
    "mask_scl_dilation",
    "linear_interpolation",
    "median_compositing",
    "mean_compositing",
    "get_bap_score",
    "get_bap_mask",
    "bap_masking",
]
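
The helpers re-exported above are typically chained on an openEO DataCube between fetching and feature computation. The sketch below assumes that `mask_scl_dilation` takes the Sentinel-2 cube, `median_compositing` takes the cube plus a compositing period, and `linear_interpolation` takes only the cube; the exact signatures live in cloudmasking.py, compositing.py and interpolation.py, which are not shown in this diff, so treat these calls as assumptions.

# Sketch only: the parameters of these helpers are assumptions; check the modules
# re-exported above for their exact signatures.
import openeo

from openeo_gfmap.preprocessing import (
    linear_interpolation,
    mask_scl_dilation,
    median_compositing,
)

connection = openeo.connect("openeo.dataspace.copernicus.eu").authenticate_oidc()
cube = connection.load_collection(
    "SENTINEL2_L2A",
    spatial_extent={"west": 4.3, "south": 50.8, "east": 4.5, "north": 50.9},
    temporal_extent=["2022-01-01", "2022-12-31"],
    bands=["B04", "B08", "SCL"],
)

cube = mask_scl_dilation(cube)  # assumed: SCL-based cloud masking
cube = median_compositing(cube, period="month")  # assumed: monthly median composites
cube = linear_interpolation(cube)  # assumed: temporal gap-filling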