openeo-gfmap 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,11 @@
  import json
+ import pickle
  import threading
+ import time
  from concurrent.futures import ThreadPoolExecutor
- from enum import Enum
  from functools import partial
  from pathlib import Path
+ from threading import Lock
  from typing import Callable, Optional, Union

  import pandas as pd
@@ -15,29 +17,60 @@ from pystac import CatalogType
  from openeo_gfmap.manager import _log
  from openeo_gfmap.stac import constants

- # Lock to use when writing to the STAC collection
- _stac_lock = threading.Lock()
+
+ def retry_on_exception(max_retries: int, delay_s: int = 180):
+     """Decorator to retry a function if an exception occurs.
+     Used for post-job actions that can crash due to internal backend issues. Restarting the action
+     usually helps to solve the issue.
+
+     Parameters
+     ----------
+     max_retries: int
+         The maximum number of retries to attempt before finally raising the exception.
+     delay: int (default=180 seconds)
+         The delay in seconds to wait before retrying the decorated function.
+     """
+
+     def decorator(func):
+         def wrapper(*args, **kwargs):
+             latest_exception = None
+             for _ in range(max_retries):
+                 try:
+                     return func(*args, **kwargs)
+                 except Exception as e:
+                     time.sleep(
+                         delay_s
+                     )  # Waits before retrying, while allowing other futures to run.
+                     latest_exception = e
+             raise latest_exception
+
+         return wrapper
+
+     return decorator


  def done_callback(future, df, idx):
-     """Sets the status of the job to the given status when the future is done."""
+     """Changes the status of the job when the post-job action future is done."""
      current_status = df.loc[idx, "status"]
-     if not future.exception():
+     exception = future.exception()
+     if exception is None:
          if current_status == "postprocessing":
              df.loc[idx, "status"] = "finished"
          elif current_status == "postprocessing-error":
              df.loc[idx, "status"] = "error"
+         elif current_status == "running":
+             df.loc[idx, "status"] = "running"
          else:
              raise ValueError(
                  f"Invalid status {current_status} for job {df.loc[idx, 'id']} for done_callback!"
              )
-
-
- class PostJobStatus(Enum):
-     """Indicates the workers if the job finished as sucessful or with an error."""
-
-     FINISHED = "finished"
-     ERROR = "error"
+     else:
+         _log.exception(
+             "Exception occurred in post-job future for job %s:\n%s",
+             df.loc[idx, "id"],
+             exception,
+         )
+         df.loc[idx, "status"] = "error"


  class GFMAPJobManager(MultiBackendJobManager):
@@ -53,13 +86,51 @@ class GFMAPJobManager(MultiBackendJobManager):
          post_job_action: Optional[Callable] = None,
          poll_sleep: int = 5,
          n_threads: int = 1,
-         post_job_params: dict = {},
          resume_postproc: bool = True,  # If we need to check for post-job actions that crashed
          restart_failed: bool = False,  # If we need to restart failed jobs
+         stac_enabled: bool = True,
      ):
+         """
+         Initializes the GFMAP job manager.
+
+         Parameters
+         ----------
+         output_dir: Path
+             The base output directory where the results/stac/logs of the jobs will be stored.
+         output_path_generator: Callable
+             User defined function that generates the output path for the job results. Expects as
+             inputs the output directory, the index of the job in the job dataframe
+             and the row of the job, and returns the final path where to save a job result asset.
+         collection_id: Optional[str]
+             The ID of the STAC collection that is being generated. Can be left empty if the STAC
+             catalogue is not being generated or if it is being resumed from an existing catalogue.
+         collection_description: Optional[str]
+             The description of the STAC collection that is being generated.
+         stac: Optional[Union[str, Path]]
+             The path to the STAC collection to be saved or resumed.
+             If None, the default path will be used.
+         post_job_action: Optional[Callable]
+             A user defined function that will be called after a job is finished. It will receive
+             the list of items generated by the job and the row of the job, and should return the
+             updated list of items.
+         poll_sleep: int
+             The time in seconds to wait between polling the backend for job status.
+         n_threads: int
+             The number of threads to execute `on_job_done` and `on_job_error` functions.
+         resume_postproc: bool
+             If set to true, all `on_job_done` and `on_job_error` functions that failed are resumed.
+         restart_failed: bool
+             If set to true, all jobs that failed within the OpenEO backend are restarted.
+         stac_enabled: bool (default=True)
+             If the STAC generation is enabled or not. Disabling it will prevent the creation,
+             update and loading of the STAC collection.
+         """
          self._output_dir = output_dir
+         self._catalogue_cache = output_dir / "catalogue_cache.bin"

          self.stac = stac
+         self.lock = Lock()
+         self.stac_enabled = stac_enabled
          self.collection_id = collection_id
          self.collection_description = collection_description

@@ -74,41 +145,73 @@ class GFMAPJobManager(MultiBackendJobManager):

          self._output_path_gen = output_path_generator
          self._post_job_action = post_job_action
-         self._post_job_params = post_job_params

          # Monkey patching the _normalize_df method to ensure we have no modification on the
          # geometry column
          MultiBackendJobManager._normalize_df = self._normalize_df
          super().__init__(poll_sleep)

-         self._root_collection = self._normalize_stac()
+         if self.stac_enabled:
+             self._root_collection = self._initialize_stac()

-     def _normalize_stac(self):
+     def _load_stac(self) -> Optional[pystac.Collection]:
+         """
+         Loads the STAC collection from the cache, the specified `stac` path or the default path.
+         If no STAC collection is found, returns None.
+         """
          default_collection_path = self._output_dir / "stac/collection.json"
-         if self.stac is not None:
+         if self._catalogue_cache.exists():
+             _log.info(
+                 "Loading the STAC collection from the persisted binary file: %s.",
+                 self._catalogue_cache,
+             )
+             with open(self._catalogue_cache, "rb") as file:
+                 return pickle.load(file)
+         elif self.stac is not None:
              _log.info(
-                 f"Reloading the STAC collection from the provided path: {self.stac}."
+                 "Reloading the STAC collection from the provided path: %s.", self.stac
              )
-             root_collection = pystac.read_file(str(self.stac))
+             return pystac.read_file(str(self.stac))
          elif default_collection_path.exists():
              _log.info(
-                 f"Reload the STAC collection from the default path: {default_collection_path}."
+                 "Reload the STAC collection from the default path: %s.",
+                 default_collection_path,
              )
              self.stac = default_collection_path
-             root_collection = pystac.read_file(str(self.stac))
-         else:
-             _log.info("Starting a fresh STAC collection.")
-             assert (
-                 self.collection_id is not None
-             ), "A collection ID is required to generate a STAC collection."
-             root_collection = pystac.Collection(
-                 id=self.collection_id,
-                 description=self.collection_description,
-                 extent=None,
+             return pystac.read_file(str(self.stac))
+
+         _log.info(
+             "No STAC collection found as cache, in the default path or in the provided path."
+         )
+         return None
+
+     def _create_stac(self) -> pystac.Collection:
+         """
+         Creates and returns new STAC collection. The created stac collection will use the
+         `collection_id` and `collection_description` parameters set in the constructor.
+         """
+         if self.collection_id is None:
+             raise ValueError(
+                 "A collection ID is required to generate a STAC collection."
              )
-         root_collection.license = constants.LICENSE
-         root_collection.add_link(constants.LICENSE_LINK)
-         root_collection.stac_extensions = constants.STAC_EXTENSIONS
+         collection = pystac.Collection(
+             id=self.collection_id,
+             description=self.collection_description,
+             extent=None,
+         )
+         collection.license = constants.LICENSE
+         collection.add_link(constants.LICENSE_LINK)
+         collection.stac_extensions = constants.STAC_EXTENSIONS
+         return collection
+
+     def _initialize_stac(self) -> pystac.Collection:
+         """
+         Loads and returns if possible an existing stac collection, otherwise creates a new one.
+         """
+         root_collection = self._load_stac()
+         if not root_collection:
+             _log.info("Starting a fresh STAC collection.")
+             root_collection = self._create_stac()

          return root_collection

@@ -150,24 +253,40 @@ class GFMAPJobManager(MultiBackendJobManager):
              job = connection.job(row.id)
              if row.status == "postprocessing":
                  _log.info(
-                     f"Resuming postprocessing of job {row.id}, queueing on_job_finished..."
+                     "Resuming postprocessing of job %s, queueing on_job_finished...",
+                     row.id,
                  )
                  future = self._executor.submit(self.on_job_done, job, row)
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
              else:
                  _log.info(
-                     f"Resuming postprocessing of job {row.id}, queueing on_job_error..."
+                     "Resuming postprocessing of job %s, queueing on_job_error...",
+                     row.id,
                  )
                  future = self._executor.submit(self.on_job_error, job, row)
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
              self._futures.append(future)

      def _restart_failed_jobs(self, df: pd.DataFrame):
          """Sets-up failed jobs as "not_started" as they will be restarted by the manager."""
-         failed_tasks = df[df.status == "error"]
+         failed_tasks = df[df.status.isin(["error", "start_failed"])]
          not_started_tasks = df[df.status == "not_started"]
          _log.info(
-             f"Resetting {len(failed_tasks)} failed jobs to 'not_started'. {len(not_started_tasks)} jobs are already 'not_started'."
+             "Resetting %s failed jobs to 'not_started'. %s jobs are already 'not_started'.",
+             len(failed_tasks),
+             len(not_started_tasks),
          )
          for idx, _ in failed_tasks.iterrows():
              df.loc[idx, "status"] = "not_started"
@@ -203,27 +322,53 @@ class GFMAPJobManager(MultiBackendJobManager):
                  job_metadata["status"] == "finished"
              ):
                  _log.info(
-                     f"Job {job.job_id} finished successfully, queueing on_job_done..."
+                     "Job %s finished successfully, queueing on_job_done...", job.job_id
                  )
                  job_status = "postprocessing"
                  future = self._executor.submit(self.on_job_done, job, row)
                  # Future will setup the status to finished when the job is done
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
                  self._futures.append(future)
-                 df.loc[idx, "costs"] = job_metadata["costs"]
+                 if "costs" in job_metadata:
+                     df.loc[idx, "costs"] = job_metadata["costs"]
+                     df.loc[idx, "memory"] = (
+                         job_metadata["usage"]
+                         .get("max_executor_memory", {})
+                         .get("value", None)
+                     )
+
+                 else:
+                     _log.warning(
+                         "Costs not found in job %s metadata. Costs will be set to 'None'.",
+                         job.job_id,
+                     )

              # Case in which it failed
              if (df.loc[idx, "status"] != "error") and (
                  job_metadata["status"] == "error"
              ):
                  _log.info(
-                     f"Job {job.job_id} finished with error, queueing on_job_error..."
+                     "Job %s finished with error, queueing on_job_error...",
+                     job.job_id,
                  )
                  job_status = "postprocessing-error"
                  future = self._executor.submit(self.on_job_error, job, row)
                  # Future will setup the status to error when the job is done
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
                  self._futures.append(future)
+                 if "costs" in job_metadata:
                      df.loc[idx, "costs"] = job_metadata["costs"]

              df.loc[idx, "status"] = job_status
@@ -231,6 +376,7 @@ class GFMAPJobManager(MultiBackendJobManager):
          # Clear the futures that are done and raise their potential exceptions if they occurred.
          self._clear_queued_actions()

+     @retry_on_exception(max_retries=2, delay_s=180)
      def on_job_error(self, job: BatchJob, row: pd.Series):
          """Method called when a job finishes with an error.

@@ -241,7 +387,14 @@ class GFMAPJobManager(MultiBackendJobManager):
          row: pd.Series
              The row in the dataframe that contains the job relative information.
          """
-         logs = job.logs()
+         try:
+             logs = job.logs()
+         except Exception as e:  # pylint: disable=broad-exception-caught
+             _log.exception(
+                 "Error getting logs in `on_job_error` for job %s:\n%s", job.job_id, e
+             )
+             logs = []
+
          error_logs = [log for log in logs if log.level.lower() == "error"]

          job_metadata = job.describe_job()
@@ -260,28 +413,43 @@ class GFMAPJobManager(MultiBackendJobManager):
                  f"Couldn't find any error logs. Please check the error manually on job ID: {job.job_id}."
              )

+     @retry_on_exception(max_retries=2, delay_s=30)
      def on_job_done(self, job: BatchJob, row: pd.Series):
          """Method called when a job finishes successfully. It will first download the results of
          the job and then call the `post_job_action` method.
          """
+
          job_products = {}
-         for idx, asset in enumerate(job.get_results().get_assets()):
+         job_results = job.get_results()
+         asset_ids = [a.name for a in job_results.get_assets()]
+         for idx, asset_id in enumerate(asset_ids):
              try:
+                 asset = job_results.get_asset(asset_id)
                  _log.debug(
-                     f"Generating output path for asset {asset.name} from job {job.job_id}..."
+                     "Generating output path for asset %s from job %s...",
+                     asset_id,
+                     job.job_id,
+                 )
+                 output_path = self._output_path_gen(
+                     self._output_dir, idx, row, asset_id
                  )
-                 output_path = self._output_path_gen(self._output_dir, idx, row)
                  # Make the output path
                  output_path.parent.mkdir(parents=True, exist_ok=True)
                  asset.download(output_path)
                  # Add to the list of downloaded products
-                 job_products[f"{job.job_id}_{asset.name}"] = [output_path]
+                 job_products[f"{job.job_id}_{asset_id}"] = [output_path]
                  _log.debug(
-                     f"Downloaded {asset.name} from job {job.job_id} -> {output_path}"
+                     "Downloaded %s from job %s -> %s",
+                     asset_id,
+                     job.job_id,
+                     output_path,
                  )
              except Exception as e:
                  _log.exception(
-                     f"Error downloading asset {asset.name} from job {job.job_id}", e
+                     "Error downloading asset %s from job %s:\n%s",
+                     asset_id,
+                     job.job_id,
+                     e,
                  )
                  raise e

@@ -302,53 +470,35 @@ class GFMAPJobManager(MultiBackendJobManager):
                      asset.href = str(
                          asset_path
                      )  # Update the asset href to the output location set by the output_path_generator
-                     # item.id = f"{job.job_id}_{item.id}"
+
                  # Add the item to the the current job items.
                  job_items.append(item)
-                 _log.info(f"Parsed item {item.id} from job {job.job_id}")
+                 _log.info("Parsed item %s from job %s", item.id, job.job_id)
              except Exception as e:
                  _log.exception(
-                     f"Error failed to add item {item.id} from job {job.job_id} to STAC collection",
+                     "Error failed to add item %s from job %s to STAC collection:\n%s",
+                     item.id,
+                     job.job_id,
                      e,
                  )
-                 raise e

          # _post_job_action returns an updated list of stac items. Post job action can therefore
          # update the stac items and access their products through the HREF. It is also the
          # reponsible of adding the appropriate metadata/assets to the items.
          if self._post_job_action is not None:
-             _log.debug(f"Calling post job action for job {job.job_id}...")
-             job_items = self._post_job_action(job_items, row, self._post_job_params)
+             _log.debug("Calling post job action for job %s...", job.job_id)
+             job_items = self._post_job_action(job_items, row)

-         _log.info(f"Adding {len(job_items)} items to the STAC collection...")
+         _log.info("Adding %s items to the STAC collection...", len(job_items))

-         with _stac_lock:  # Take the STAC lock to avoid concurrence issues
-             # Filters the job items to only keep the ones that are not already in the collection
-             existing_ids = [item.id for item in self._root_collection.get_all_items()]
-             job_items = [item for item in job_items if item.id not in existing_ids]
+         if self.stac_enabled:
+             with self.lock:
+                 self._update_stac(job.job_id, job_items)

-             self._root_collection.add_items(job_items)
-             _log.info(f"Added {len(job_items)} items to the STAC collection.")
-
-         _log.info(f"Writing STAC collection for {job.job_id} to file...")
-         try:
-             self._write_stac()
-         except Exception as e:
-             _log.exception(
-                 f"Error writing STAC collection for job {job.job_id} to file.", e
-             )
-             raise e
-         _log.info(f"Wrote STAC collection for {job.job_id} to file.")
-
-         _log.info(f"Job {job.job_id} and post job action finished successfully.")
+         _log.info("Job %s and post job action finished successfully.", job.job_id)

      def _normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
-         """Ensure we have the required columns and the expected type for the geometry column.
-
-         :param df: The dataframe to normalize.
-         :return: a new dataframe that is normalized.
-         """
-
+         """Ensure we have the required columns and the expected type for the geometry column."""
          # check for some required columns.
          required_with_default = [
              ("status", "not_started"),
@@ -366,7 +516,7 @@ class GFMAPJobManager(MultiBackendJobManager):
          }
          df = df.assign(**new_columns)

-         _log.debug(f"Normalizing dataframe. Columns: {df.columns}")
+         _log.debug("Normalizing dataframe. Columns: %s", df.columns)

          return df

@@ -401,7 +551,7 @@ class GFMAPJobManager(MultiBackendJobManager):
              The file to track the results of the jobs.
          """
          # Starts the thread pool to work on the on_job_done and on_job_error methods
-         _log.info(f"Starting ThreadPoolExecutor with {self._n_threads} workers.")
+         _log.info("Starting ThreadPoolExecutor with %s workers.", self._n_threads)
          with ThreadPoolExecutor(max_workers=self._n_threads) as executor:
              _log.info("Creating and running jobs.")
              self._executor = executor
@@ -412,6 +562,13 @@ class GFMAPJobManager(MultiBackendJobManager):
              self._wait_queued_actions()
              _log.info("Exiting ThreadPoolExecutor.")
              self._executor = None
+         _log.info("All jobs finished running.")
+         if self.stac_enabled:
+             _log.info("Saving persisted STAC collection to final .json collection.")
+             self._write_stac()
+             _log.info("Saved STAC catalogue to JSON format, all tasks finished!")
+         else:
+             _log.info("STAC was disabled, skipping generation of the catalogue.")

      def _write_stac(self):
          """Writes the STAC collection to the output directory."""
@@ -428,6 +585,36 @@ class GFMAPJobManager(MultiBackendJobManager):
          self._root_collection.normalize_hrefs(str(root_path))
          self._root_collection.save(catalog_type=CatalogType.SELF_CONTAINED)

+     def _persist_stac(self):
+         """Persists the STAC collection by saving it into a binary file."""
+         _log.debug("Validating the STAC collection before persisting.")
+         self._root_collection.validate_all()
+         _log.info("Persisting STAC collection to temp file %s.", self._catalogue_cache)
+         with open(self._catalogue_cache, "wb") as file:
+             pickle.dump(self._root_collection, file)
+
+     def _update_stac(self, job_id: str, job_items: list[pystac.Item]):
+         """Updates the STAC collection by adding the items generated by the job.
+         Does not add duplicates or override with the same item ID.
+         """
+         try:
+             _log.info("Thread %s entered the STAC lock.", threading.get_ident())
+             # Filters the job items to only keep the ones that are not already in the collection
+             existing_ids = [item.id for item in self._root_collection.get_all_items()]
+             job_items = [item for item in job_items if item.id not in existing_ids]
+
+             self._root_collection.add_items(job_items)
+             _log.info("Added %s items to the STAC collection.", len(job_items))
+
+             self._persist_stac()
+         except Exception as e:
+             _log.exception(
+                 "Error adding items to the STAC collection for job %s:\n%s ",
+                 job_id,
+                 str(e),
+             )
+             raise e
+
      def setup_stac(
          self,
          constellation: Optional[str] = None,
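
For reference, below is a minimal usage sketch against the 0.3.0 call sites visible in this diff: the user-supplied output_path_generator now receives the asset ID as an extra argument, post_job_action is called with only the STAC items and the job row (no post_job_params dict anymore), and STAC generation can be toggled with stac_enabled. The import path, collection name and callback bodies are illustrative assumptions, not part of the package.

# Hypothetical callbacks matching the updated call sites shown above.
from pathlib import Path

from openeo_gfmap.manager.job_manager import GFMAPJobManager  # assumed import path


def output_path_generator(output_dir: Path, idx: int, row, asset_id: str) -> Path:
    # 0.3.0 passes the asset ID as a fourth argument; derive the file name from it.
    return output_dir / f"job_{idx}" / asset_id


def post_job_action(job_items, row):
    # Receives the parsed STAC items and the job row, and must return the
    # (possibly updated) list of items.
    return job_items


manager = GFMAPJobManager(
    output_dir=Path("/tmp/gfmap"),
    output_path_generator=output_path_generator,
    post_job_action=post_job_action,
    collection_id="example-collection",  # required only when STAC generation is enabled
    collection_description="Example STAC collection",
    n_threads=2,
    stac_enabled=True,  # new in 0.3.0; set to False to skip the STAC catalogue entirely
)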