openeo-gfmap 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. openeo_gfmap/__init__.py +23 -0
  2. openeo_gfmap/backend.py +122 -0
  3. openeo_gfmap/features/__init__.py +17 -0
  4. openeo_gfmap/features/feature_extractor.py +389 -0
  5. openeo_gfmap/fetching/__init__.py +21 -0
  6. openeo_gfmap/fetching/commons.py +213 -0
  7. openeo_gfmap/fetching/fetching.py +98 -0
  8. openeo_gfmap/fetching/generic.py +165 -0
  9. openeo_gfmap/fetching/meteo.py +126 -0
  10. openeo_gfmap/fetching/s1.py +195 -0
  11. openeo_gfmap/fetching/s2.py +236 -0
  12. openeo_gfmap/inference/__init__.py +3 -0
  13. openeo_gfmap/inference/model_inference.py +347 -0
  14. openeo_gfmap/manager/__init__.py +31 -0
  15. openeo_gfmap/manager/job_manager.py +469 -0
  16. openeo_gfmap/manager/job_splitters.py +144 -0
  17. openeo_gfmap/metadata.py +24 -0
  18. openeo_gfmap/preprocessing/__init__.py +22 -0
  19. openeo_gfmap/preprocessing/cloudmasking.py +268 -0
  20. openeo_gfmap/preprocessing/compositing.py +74 -0
  21. openeo_gfmap/preprocessing/interpolation.py +12 -0
  22. openeo_gfmap/preprocessing/sar.py +64 -0
  23. openeo_gfmap/preprocessing/scaling.py +65 -0
  24. openeo_gfmap/preprocessing/udf_cldmask.py +36 -0
  25. openeo_gfmap/preprocessing/udf_rank.py +37 -0
  26. openeo_gfmap/preprocessing/udf_score.py +103 -0
  27. openeo_gfmap/spatial.py +53 -0
  28. openeo_gfmap/stac/__init__.py +2 -0
  29. openeo_gfmap/stac/constants.py +51 -0
  30. openeo_gfmap/temporal.py +22 -0
  31. openeo_gfmap/utils/__init__.py +23 -0
  32. openeo_gfmap/utils/build_df.py +48 -0
  33. openeo_gfmap/utils/catalogue.py +248 -0
  34. openeo_gfmap/utils/intervals.py +64 -0
  35. openeo_gfmap/utils/netcdf.py +25 -0
  36. openeo_gfmap/utils/tile_processing.py +64 -0
  37. openeo_gfmap-0.1.0.dist-info/METADATA +57 -0
  38. openeo_gfmap-0.1.0.dist-info/RECORD +40 -0
  39. openeo_gfmap-0.1.0.dist-info/WHEEL +4 -0
  40. openeo_gfmap-0.1.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,469 @@
+ import json
+ import threading
+ from concurrent.futures import ThreadPoolExecutor
+ from enum import Enum
+ from functools import partial
+ from pathlib import Path
+ from typing import Callable, Optional, Union
+
+ import pandas as pd
+ import pystac
+ from openeo.extra.job_management import MultiBackendJobManager
+ from openeo.rest.job import BatchJob
+ from pystac import CatalogType
+
+ from openeo_gfmap.manager import _log
+ from openeo_gfmap.stac import constants
+
+ # Lock to use when writing to the STAC collection
+ _stac_lock = threading.Lock()
+
+
+ def done_callback(future, df, idx):
+     """Sets the final status of the job when the future is done."""
+     current_status = df.loc[idx, "status"]
+     if not future.exception():
+         if current_status == "postprocessing":
+             df.loc[idx, "status"] = "finished"
+         elif current_status == "postprocessing-error":
+             df.loc[idx, "status"] = "error"
+         else:
+             raise ValueError(
+                 f"Invalid status {current_status} for job {df.loc[idx, 'id']} for done_callback!"
+             )
+
+
+ class PostJobStatus(Enum):
+     """Indicates to the workers whether the job finished successfully or with an error."""
+
+     FINISHED = "finished"
+     ERROR = "error"
+
+
+ class GFMAPJobManager(MultiBackendJobManager):
+     """A job manager for the GFMAP backend."""
+
+     def __init__(
+         self,
+         output_dir: Path,
+         output_path_generator: Callable,
+         collection_id: Optional[str] = None,
+         collection_description: Optional[str] = "",
+         stac: Optional[Union[str, Path]] = None,
+         post_job_action: Optional[Callable] = None,
+         poll_sleep: int = 5,
+         n_threads: int = 1,
+         post_job_params: dict = {},
+         resume_postproc: bool = True,  # If we need to check for post-job actions that crashed
+         restart_failed: bool = False,  # If we need to restart failed jobs
+     ):
+         self._output_dir = output_dir
+
+         self.stac = stac
+         self.collection_id = collection_id
+         self.collection_description = collection_description
+
+         # Set up the threads to work on the on_job_done and on_job_error methods
+         self._n_threads = n_threads
+         self._executor = None  # Will be set in run_jobs, is a ThreadPoolExecutor
+         self._futures = []
+         self._to_resume_postjob = (
+             resume_postproc  # If we need to check for post-job actions that crashed
+         )
+         self._to_restart_failed = restart_failed  # If we need to restart failed jobs
+
+         self._output_path_gen = output_path_generator
+         self._post_job_action = post_job_action
+         self._post_job_params = post_job_params
+
+         # Monkey-patch the _normalize_df method to ensure the geometry column
+         # is not modified
+         MultiBackendJobManager._normalize_df = self._normalize_df
+         super().__init__(poll_sleep)
+
+         self._root_collection = self._normalize_stac()
+
+     def _normalize_stac(self):
+         default_collection_path = self._output_dir / "stac/collection.json"
+         if self.stac is not None:
+             _log.info(
+                 f"Reloading the STAC collection from the provided path: {self.stac}."
+             )
+             root_collection = pystac.read_file(str(self.stac))
+         elif default_collection_path.exists():
+             _log.info(
+                 f"Reload the STAC collection from the default path: {default_collection_path}."
+             )
+             self.stac = default_collection_path
+             root_collection = pystac.read_file(str(self.stac))
+         else:
+             _log.info("Starting a fresh STAC collection.")
+             assert (
+                 self.collection_id is not None
+             ), "A collection ID is required to generate a STAC collection."
+             root_collection = pystac.Collection(
+                 id=self.collection_id,
+                 description=self.collection_description,
+                 extent=None,
+             )
+             root_collection.license = constants.LICENSE
+             root_collection.add_link(constants.LICENSE_LINK)
+             root_collection.stac_extensions = constants.STAC_EXTENSIONS
+
+         return root_collection
+
+     def _clear_queued_actions(self):
+         """Checks if the post-job actions are finished and clears them from the list of futures.
+         If an exception occurred, it is raised to the GFMAPJobManager main thread.
+         """
+         # Checking if any post-job action has finished or not
+         futures_to_clear = []
+         for future in self._futures:
+             if future.done():
+                 exception = future.exception(timeout=1.0)
+                 if exception:
+                     raise exception
+                 futures_to_clear.append(future)
+         for future in futures_to_clear:
+             self._futures.remove(future)
+
+     def _wait_queued_actions(self):
+         """Waits for all the queued actions to finish."""
+         for future in self._futures:
+             # Wait for the future to finish and get the potential exception
+             exception = future.exception(timeout=None)
+             if exception:
+                 raise exception
+
+     def _resume_postjob_actions(self, df: pd.DataFrame):
+         """Resumes the jobs that were in the `postprocessing` or `postprocessing-error` state, as
+         they most likely crashed before finishing their post-job action.
+
+         df: pd.DataFrame
+             The job-tracking dataframe initialized or loaded by the multi-backend job manager.
+         """
+         postprocessing_tasks = df[
+             df.status.isin(["postprocessing", "postprocessing-error"])
+         ]
+         for idx, row in postprocessing_tasks.iterrows():
+             connection = self._get_connection(row.backend_name)
+             job = connection.job(row.id)
+             if row.status == "postprocessing":
+                 _log.info(
+                     f"Resuming postprocessing of job {row.id}, queueing on_job_finished..."
+                 )
+                 future = self._executor.submit(self.on_job_done, job, row)
+                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+             else:
+                 _log.info(
+                     f"Resuming postprocessing of job {row.id}, queueing on_job_error..."
+                 )
+                 future = self._executor.submit(self.on_job_error, job, row)
+                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+             self._futures.append(future)
+
+     def _restart_failed_jobs(self, df: pd.DataFrame):
+         """Sets failed jobs back to "not_started" so they will be restarted by the manager."""
+         failed_tasks = df[df.status == "error"]
+         not_started_tasks = df[df.status == "not_started"]
+         _log.info(
+             f"Resetting {len(failed_tasks)} failed jobs to 'not_started'. {len(not_started_tasks)} jobs are already 'not_started'."
+         )
+         for idx, _ in failed_tasks.iterrows():
+             df.loc[idx, "status"] = "not_started"
+
+     def _update_statuses(self, df: pd.DataFrame):
+         """Updates the statuses of the jobs in the dataframe from the backend. If a job is finished
+         or failed, it will be queued to the `on_job_done` or `on_job_error` methods.
+
+         The method is executed every `poll_sleep` seconds.
+         """
+         if self._to_restart_failed:  # Make sure it runs only the first time
+             self._restart_failed_jobs(df)
+             self._to_restart_failed = False
+
+         if self._to_resume_postjob:  # Make sure it runs only the first time
+             self._resume_postjob_actions(df)
+             self._to_resume_postjob = False
+
+         active = df[df.status.isin(["created", "queued", "running"])]
+         for idx, row in active.iterrows():
+             # Parses the backend from the csv
+             connection = self._get_connection(row.backend_name)
+             job = connection.job(row.id)
+             job_metadata = job.describe_job()
+             job_status = job_metadata["status"]
+             _log.debug(
+                 msg=f"Status of job {job.job_id} is {job_status} (on backend {row.backend_name}).",
+             )
+
+             # Update the status if the job finished since the last check
+             # Case in which it finished successfully
+             if (df.loc[idx, "status"] in ["created", "queued", "running"]) and (
+                 job_metadata["status"] == "finished"
+             ):
+                 _log.info(
+                     f"Job {job.job_id} finished successfully, queueing on_job_done..."
+                 )
+                 job_status = "postprocessing"
+                 future = self._executor.submit(self.on_job_done, job, row)
+                 # The future will set the status to finished when the job is done
+                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 self._futures.append(future)
+                 df.loc[idx, "costs"] = job_metadata["costs"]
+
+             # Case in which it failed
+             if (df.loc[idx, "status"] != "error") and (
+                 job_metadata["status"] == "error"
+             ):
+                 _log.info(
+                     f"Job {job.job_id} finished with error, queueing on_job_error..."
+                 )
+                 job_status = "postprocessing-error"
+                 future = self._executor.submit(self.on_job_error, job, row)
+                 # The future will set the status to error when the job is done
+                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 self._futures.append(future)
+                 df.loc[idx, "costs"] = job_metadata["costs"]
+
+             df.loc[idx, "status"] = job_status
+
+         # Clear the futures that are done and raise their potential exceptions if they occurred.
+         self._clear_queued_actions()
+
+     def on_job_error(self, job: BatchJob, row: pd.Series):
+         """Method called when a job finishes with an error.
+
+         Parameters
+         ----------
+         job: BatchJob
+             The job that finished with an error.
+         row: pd.Series
+             The row in the dataframe that contains the job-related information.
+         """
+         logs = job.logs()
+         error_logs = [log for log in logs if log.level.lower() == "error"]
+
+         job_metadata = job.describe_job()
+         title = job_metadata["title"]
+         job_id = job_metadata["id"]
+
+         output_log_path = (
+             Path(self._output_dir) / "failed_jobs" / f"{title}_{job_id}.log"
+         )
+         output_log_path.parent.mkdir(parents=True, exist_ok=True)
+
+         if len(error_logs) > 0:
+             output_log_path.write_text(json.dumps(error_logs, indent=2))
+         else:
+             output_log_path.write_text(
+                 f"Couldn't find any error logs. Please check the error manually on job ID: {job.job_id}."
+             )
+
+     def on_job_done(self, job: BatchJob, row: pd.Series):
+         """Method called when a job finishes successfully. It will first download the results of
+         the job and then call the `post_job_action` method.
+         """
+         job_products = {}
+         for idx, asset in enumerate(job.get_results().get_assets()):
+             try:
+                 _log.debug(
+                     f"Generating output path for asset {asset.name} from job {job.job_id}..."
+                 )
+                 output_path = self._output_path_gen(self._output_dir, idx, row)
+                 # Make the output path
+                 output_path.parent.mkdir(parents=True, exist_ok=True)
+                 asset.download(output_path)
+                 # Add to the list of downloaded products
+                 job_products[f"{job.job_id}_{asset.name}"] = [output_path]
+                 _log.debug(
+                     f"Downloaded {asset.name} from job {job.job_id} -> {output_path}"
+                 )
+             except Exception as e:
+                 _log.exception(
+                     f"Error downloading asset {asset.name} from job {job.job_id}", e
+                 )
+                 raise e
+
+         # First update the STAC collection with the assets directly resulting from the OpenEO batch job
+         job_metadata = pystac.Collection.from_dict(job.get_results().get_metadata())
+         job_items = []
+
+         for item_metadata in job_metadata.get_all_items():
+             try:
+                 item = pystac.read_file(item_metadata.get_self_href())
+                 asset_name = list(item.assets.values())[0].title
+                 asset_path = job_products[f"{job.job_id}_{asset_name}"][0]
+
+                 assert (
+                     len(item.assets.values()) == 1
+                 ), "Each item should only contain one asset"
+                 for asset in item.assets.values():
+                     asset.href = str(
+                         asset_path
+                     )  # Update the asset href to the output location set by the output_path_generator
+                 # item.id = f"{job.job_id}_{item.id}"
+                 # Add the item to the current job items.
+                 job_items.append(item)
+                 _log.info(f"Parsed item {item.id} from job {job.job_id}")
+             except Exception as e:
+                 _log.exception(
+                     f"Error failed to add item {item.id} from job {job.job_id} to STAC collection",
+                     e,
+                 )
+                 raise e
+
+         # _post_job_action returns an updated list of STAC items. The post-job action can therefore
+         # update the STAC items and access their products through the HREF. It is also
+         # responsible for adding the appropriate metadata/assets to the items.
+         if self._post_job_action is not None:
+             _log.debug(f"Calling post job action for job {job.job_id}...")
+             job_items = self._post_job_action(job_items, row, self._post_job_params)
+
+         _log.info(f"Adding {len(job_items)} items to the STAC collection...")
+
+         with _stac_lock:  # Take the STAC lock to avoid concurrency issues
+             # Filter the job items to only keep the ones that are not already in the collection
+             existing_ids = [item.id for item in self._root_collection.get_all_items()]
+             job_items = [item for item in job_items if item.id not in existing_ids]
+
+             self._root_collection.add_items(job_items)
+             _log.info(f"Added {len(job_items)} items to the STAC collection.")
+
+             _log.info(f"Writing STAC collection for {job.job_id} to file...")
+             try:
+                 self._write_stac()
+             except Exception as e:
+                 _log.exception(
+                     f"Error writing STAC collection for job {job.job_id} to file.", e
+                 )
+                 raise e
+             _log.info(f"Wrote STAC collection for {job.job_id} to file.")
+
+         _log.info(f"Job {job.job_id} and post job action finished successfully.")
+
+     def _normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
+         """Ensure we have the required columns and the expected type for the geometry column.
+
+         :param df: The dataframe to normalize.
+         :return: a new dataframe that is normalized.
+         """
+
+         # Check for some required columns.
+         required_with_default = [
+             ("status", "not_started"),
+             ("id", None),
+             ("start_time", None),
+             ("cpu", None),
+             ("memory", None),
+             ("duration", None),
+             ("backend_name", None),
+             ("description", None),
+             ("costs", None),
+         ]
+         new_columns = {
+             col: val for (col, val) in required_with_default if col not in df.columns
+         }
+         df = df.assign(**new_columns)
+
+         _log.debug(f"Normalizing dataframe. Columns: {df.columns}")
+
+         return df
+
+     def run_jobs(
+         self, df: pd.DataFrame, start_job: Callable, output_file: Union[str, Path]
+     ):
+         """Starts the jobs defined in the dataframe and runs the `start_job` function on each job.
+
+         Parameters
+         ----------
+         df: pd.DataFrame
+             The dataframe containing the jobs to be started. The dataframe expects the following columns:
+
+             * `backend_name`: Name of the backend to use.
+             * Additional fields that will be used in your custom job creation function `start_job`
+               as well as in post-job actions and the path generator.
+
+             The following column names are RESERVED for the management of the jobs, please do not
+             provide them in the input df:
+
+             * `status`: Current status of the job.
+             * `id`: Job ID, used to access job information from the backend.
+             * `start_time`: The time at which the job was started.
+             * `cpu`: The amount of CPU used by the job.
+             * `memory`: The amount of memory used by the job.
+             * `duration`: The duration of the job.
+
+         start_job: Callable
+             Callable function that will take the row of each job as an argument and that will
+             create a datacube.
+         output_file: Union[str, Path]
+             The file used to track the results of the jobs.
+         """
+         # Starts the thread pool to work on the on_job_done and on_job_error methods
+         _log.info(f"Starting ThreadPoolExecutor with {self._n_threads} workers.")
+         with ThreadPoolExecutor(max_workers=self._n_threads) as executor:
+             _log.info("Creating and running jobs.")
+             self._executor = executor
+             super().run_jobs(df, start_job, output_file)
+             _log.info(
+                 "Quitting job tracking & waiting for last post-job actions to finish."
+             )
+             self._wait_queued_actions()
+             _log.info("Exiting ThreadPoolExecutor.")
+             self._executor = None
+
+     def _write_stac(self):
+         """Writes the STAC collection to the output directory."""
+         if not self._root_collection.get_self_href():
+             self._root_collection.set_self_href(str(self._output_dir / "stac"))
+
+         self._root_collection.update_extent_from_items()
+
+         # Set up the root path for the normalization
+         root_path = Path(self._root_collection.self_href)
+         if root_path.is_file():
+             root_path = root_path.parent
+
+         self._root_collection.normalize_hrefs(str(root_path))
+         self._root_collection.save(catalog_type=CatalogType.SELF_CONTAINED)
+
+     def setup_stac(
+         self,
+         constellation: Optional[str] = None,
+         output_path: Optional[Union[str, Path]] = None,
+         item_assets: Optional[dict] = None,
+     ):
+         """Method to be called after run_jobs to set up details of the STAC collection
+         such as the constellation, root directory and item assets extensions.
+
+         Parameters
+         ----------
+         constellation: Optional[str]
+             The constellation for which to create the STAC metadata. If None, no STAC metadata will be added.
+             The following constellations are supported:
+
+             * 'sentinel1'
+             * 'sentinel2'
+
+         output_path: Optional[Union[str, Path]]
+             The path to write the STAC collection to. If None, the STAC collection will be written to self.output_dir / 'stac'.
+         item_assets: Optional[dict]
+             A dictionary containing pystac.extensions.item_assets.AssetDefinition objects to be added to the STAC collection.
+             https://github.com/stac-extensions/item-assets
+         """
+         if output_path:
+             self._root_collection.set_self_href(str(output_path))
+
+         if constellation and "summaries" not in self._root_collection.extra_fields:
+             self._root_collection.extra_fields["summaries"] = constants.SUMMARIES.get(
+                 constellation, pystac.summaries.Summaries({})
+             ).to_dict()
+
+         if item_assets and "item_assets" not in self._root_collection.extra_fields:
+             item_asset_extension = (
+                 pystac.extensions.item_assets.ItemAssetsExtension.ext(
+                     self._root_collection, add_if_missing=True
+                 )
+             )
+             item_asset_extension.item_assets = item_assets
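For orientation, here is a minimal usage sketch of the job manager above. It is not part of the package: the backend name, collection id, dataframe columns, output layout and the keyword-argument convention of `start_job` (inherited from openeo's MultiBackendJobManager) are assumptions derived from the docstrings in this file.

# Illustrative sketch (not part of the wheel): wiring GFMAPJobManager to a backend
# and a small job dataframe. Paths, collection ids and columns are assumptions.
from pathlib import Path

import openeo
import pandas as pd

from openeo_gfmap.manager.job_manager import GFMAPJobManager


def output_path_generator(output_dir: Path, asset_idx: int, row: pd.Series) -> Path:
    # Hypothetical layout: one folder per job row, one file per downloaded asset.
    return Path(output_dir) / f"job_{row.name}" / f"asset_{asset_idx}.nc"


def start_job(row: pd.Series = None, connection: openeo.Connection = None, **kwargs):
    # Build a (deliberately tiny) datacube for this row and return it as a batch job.
    # A real workflow would also set spatial/temporal extents from the row.
    cube = connection.load_collection("SENTINEL2_L2A", bands=["B04", "B08"])
    return cube.create_job(title=f"gfmap-job-{row.name}")


# One row per job; `backend_name` must match the backend registered below.
df = pd.DataFrame({"backend_name": ["cdse"], "some_custom_field": ["value"]})

manager = GFMAPJobManager(
    output_dir=Path("/tmp/gfmap-run"),
    output_path_generator=output_path_generator,
    collection_id="gfmap-extraction-demo",
    n_threads=2,
)
manager.add_backend("cdse", connection=openeo.connect("openeo.dataspace.copernicus.eu"))
manager.run_jobs(df, start_job, output_file=Path("/tmp/gfmap-run/job_tracker.csv"))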
@@ -0,0 +1,144 @@
+ """Job splitter functionalities, expecting input points/polygons to extract in the
+ form of a GeoDataFrame.
+ """
+
+ from pathlib import Path
+ from typing import List
+
+ import geopandas as gpd
+ import h3
+ import requests
+
+ from openeo_gfmap.manager import _log
+
+
+ def load_s2_grid() -> gpd.GeoDataFrame:
+     """Returns a GeoDataFrame of the S2 grid."""
+     # Build the path where the geodataframe should be stored
+     gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds.geojson"
+     if not gdf_path.exists():
+         _log.info("S2 grid not found, downloading it from artifactory.")
+         # Downloads the file from the artifactory URL
+         gdf_path.parent.mkdir(exist_ok=True)
+         response = requests.get(
+             "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds.geojson",
+             timeout=180,  # 3 minutes
+         )
+         with open(gdf_path, "wb") as f:
+             f.write(response.content)
+     return gpd.read_file(gdf_path)
+
+
+ def _resplit_group(
+     polygons: gpd.GeoDataFrame, max_points: int
+ ) -> List[gpd.GeoDataFrame]:
+     """Re-splits a dataset of polygons into a list of datasets."""
+     for i in range(0, len(polygons), max_points):
+         yield polygons.iloc[i : i + max_points].reset_index(drop=True)
+
+
+ def split_job_s2grid(
+     polygons: gpd.GeoDataFrame, max_points: int = 500
+ ) -> List[gpd.GeoDataFrame]:
+     """Split a job into multiple jobs from the position of the polygons/points. The centroids of
+     the geometries to extract are used to select a tile in the Sentinel-2 tile grid.
+
+     Parameters
+     ----------
+     polygons: gpd.GeoDataFrame
+         Dataset containing the polygons to split the job by, with a `geometry` column.
+     max_points: int
+         The maximum number of points to be included in each job.
+     Returns
+     -------
+     split_polygons: list
+         List of jobs, split by the GeoDataFrame.
+     """
+     if "geometry" not in polygons.columns:
+         raise ValueError("The GeoDataFrame must contain a 'geometry' column.")
+
+     if polygons.crs is None:
+         raise ValueError("The GeoDataFrame must contain a CRS")
+
+     polygons = polygons.to_crs(epsg=4326)
+     if polygons.geometry.geom_type[0] != "Point":
+         polygons["geometry"] = polygons.geometry.centroid
+
+     # Dataset containing all the S2 tiles, find the nearest S2 tile for each point
+     s2_grid = load_s2_grid()
+     s2_grid["geometry"] = s2_grid.geometry.centroid
+
+     polygons = gpd.sjoin_nearest(polygons, s2_grid[["tile", "geometry"]]).drop(
+         columns=["index_right"]
+     )
+
+     split_datasets = []
+     for _, sub_gdf in polygons.groupby("tile"):
+         if len(sub_gdf) > max_points:
+             # Performs another split
+             split_datasets.extend(_resplit_group(sub_gdf, max_points))
+         else:
+             split_datasets.append(sub_gdf.reset_index(drop=True))
+     return split_datasets
+
+
+ def append_h3_index(
+     polygons: gpd.GeoDataFrame, grid_resolution: int = 3
+ ) -> gpd.GeoDataFrame:
+     """Append the H3 index to the polygons."""
+     if polygons.geometry.geom_type[0] != "Point":
+         geom_col = polygons.geometry.centroid
+     else:
+         geom_col = polygons.geometry
+     polygons["h3index"] = geom_col.apply(
+         lambda pt: h3.geo_to_h3(pt.y, pt.x, grid_resolution)
+     )
+     return polygons
+
+
+ def split_job_hex(
+     polygons: gpd.GeoDataFrame, max_points: int = 500, grid_resolution: int = 3
+ ) -> List[gpd.GeoDataFrame]:
+     """Split a job into multiple jobs from the position of the polygons/points. The centroids of
+     the geometries to extract are used to select a hexagon in the H3 grid. Using the H3 grid
+     allows splitting jobs into equal areas, which is useful for parallel processing while taking
+     into account OpenEO's limitations.
+
+     Parameters
+     ----------
+     polygons: gpd.GeoDataFrame
+         Dataset containing the polygons to split the job by, with a `geometry` column.
+     max_points: int
+         The maximum number of points to be included in each job.
+     grid_resolution: int
+         The resolution to use in the H3 hexagonal grid to split jobs to, default is 3. Changing the
+         grid resolution will drastically increase/decrease the area on which jobs will work.
+         More information on the H3 grid can be found at
+         https://h3geo.org/docs/core-library/restable
+     Returns
+     -------
+     split_polygons: list
+         List of jobs, split by the GeoDataFrame.
+     """
+
+     if "geometry" not in polygons.columns:
+         raise ValueError("The GeoDataFrame must contain a 'geometry' column.")
+
+     if polygons.crs is None:
+         raise ValueError("The GeoDataFrame must contain a CRS")
+
+     # Project to lat/lon positions
+     polygons = polygons.to_crs(epsg=4326)
+
+     # Split the polygons into multiple jobs
+     polygons = append_h3_index(polygons, grid_resolution)
+
+     split_datasets = []
+     for _, sub_gdf in polygons.groupby("h3index"):
+         if len(sub_gdf) > max_points:
+             # Performs another split
+             split_datasets.extend(_resplit_group(sub_gdf, max_points))
+         else:
+             split_datasets.append(sub_gdf.reset_index(drop=True))
+
+     return split_datasets
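As a quick illustration of the splitters above (a sketch, not from the package; the input file name and the chosen parameters are assumptions):

# Illustrative sketch: splitting an extraction dataset into per-tile jobs.
# "fields.gpkg" is a hypothetical input; it must have a 'geometry' column and a CRS.
# Note that split_job_s2grid downloads the S2 grid on first use.
import geopandas as gpd

from openeo_gfmap.manager.job_splitters import split_job_hex, split_job_s2grid

polygons = gpd.read_file("fields.gpkg")

# Group the geometries by their nearest Sentinel-2 tile, at most 500 per job.
s2_jobs = split_job_s2grid(polygons, max_points=500)

# Alternatively, group them by H3 hexagon at resolution 3.
hex_jobs = split_job_hex(polygons, max_points=500, grid_resolution=3)

print(f"{len(s2_jobs)} jobs by S2 tile, {len(hex_jobs)} jobs by H3 hexagon")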
@@ -0,0 +1,24 @@
+ """Metadata utilities related to the usage of a DataCube. Used to interact
+ with the OpenEO backends and cover some shortcomings.
+ """
+
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class FakeMetadata:
+     """Fake metadata object used for datacubes fetched from STAC catalogues.
+     This is used as a temporary fix for OpenEO backend shortcomings, but
+     will become unused over time.
+     """
+
+     band_names: list
+
+     def rename_labels(self, _, target, source):
+         """Rename the labels of the band dimension."""
+         mapping = dict(zip(target, source))
+         band_names = self.band_names.copy()
+         for idx, name in enumerate(band_names):
+             if name in target:
+                 self.band_names[idx] = mapping[name]
+         return self
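A small sketch of how this stand-in behaves, following the code above (the band names are illustrative): bands currently named as in `target` are renamed to the corresponding entry of `source`.

# Illustrative sketch of FakeMetadata.rename_labels (band names are made up).
from openeo_gfmap.metadata import FakeMetadata

meta = FakeMetadata(band_names=["B04", "B08"])
meta = meta.rename_labels("bands", target=["B04", "B08"], source=["red", "nir"])
print(meta.band_names)  # ['red', 'nir']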
@@ -0,0 +1,22 @@
+ """Preprocessing functions for OpenEO DataCubes. The preprocessing occurs
+ right after the extraction and before the execution of the features UDF.
+ """
+
+ from openeo_gfmap.preprocessing.cloudmasking import (
+     bap_masking,
+     get_bap_mask,
+     get_bap_score,
+     mask_scl_dilation,
+ )
+ from openeo_gfmap.preprocessing.compositing import mean_compositing, median_compositing
+ from openeo_gfmap.preprocessing.interpolation import linear_interpolation
+
+ __all__ = [
+     "mask_scl_dilation",
+     "linear_interpolation",
+     "median_compositing",
+     "mean_compositing",
+     "get_bap_score",
+     "get_bap_mask",
+     "bap_masking",
+ ]
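For context, a hedged sketch of how these exports could be chained on a Sentinel-2 cube. The backend URL, collection id, extents and the exact call signatures of the preprocessing functions are assumptions based on their names, not taken from this file.

# Illustrative sketch: chaining the exported preprocessing steps on a Sentinel-2 cube.
# Signatures such as median_compositing(cube, period=...) are assumed, not verified here.
import openeo

from openeo_gfmap.preprocessing import (
    linear_interpolation,
    mask_scl_dilation,
    median_compositing,
)

connection = openeo.connect("openeo.dataspace.copernicus.eu")
cube = connection.load_collection(
    "SENTINEL2_L2A",
    temporal_extent=["2022-01-01", "2022-12-31"],
    bands=["B04", "B08", "SCL"],
)

cube = mask_scl_dilation(cube)                   # mask clouds using the dilated SCL layer
cube = median_compositing(cube, period="month")  # monthly median composites
cube = linear_interpolation(cube)                # fill remaining gaps along the time axis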