openeo-gfmap 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,11 @@
  import json
+ import pickle
  import threading
+ import time
  from concurrent.futures import ThreadPoolExecutor
- from enum import Enum
  from functools import partial
  from pathlib import Path
+ from threading import Lock
  from typing import Callable, Optional, Union

  import pandas as pd
@@ -15,29 +17,60 @@ from pystac import CatalogType
  from openeo_gfmap.manager import _log
  from openeo_gfmap.stac import constants

- # Lock to use when writing to the STAC collection
- _stac_lock = threading.Lock()
+
+ def retry_on_exception(max_retries: int, delay_s: int = 180):
+     """Decorator to retry a function if an exception occurs.
+     Used for post-job actions that can crash due to internal backend issues. Restarting the action
+     usually helps to solve the issue.
+
+     Parameters
+     ----------
+     max_retries: int
+         The maximum number of retries to attempt before finally raising the exception.
+     delay: int (default=180 seconds)
+         The delay in seconds to wait before retrying the decorated function.
+     """
+
+     def decorator(func):
+         def wrapper(*args, **kwargs):
+             latest_exception = None
+             for _ in range(max_retries):
+                 try:
+                     return func(*args, **kwargs)
+                 except Exception as e:
+                     time.sleep(
+                         delay_s
+                     )  # Waits before retrying, while allowing other futures to run.
+                     latest_exception = e
+             raise latest_exception
+
+         return wrapper
+
+     return decorator


  def done_callback(future, df, idx):
-     """Sets the status of the job to the given status when the future is done."""
+     """Changes the status of the job when the post-job action future is done."""
      current_status = df.loc[idx, "status"]
-     if not future.exception():
+     exception = future.exception()
+     if exception is None:
          if current_status == "postprocessing":
              df.loc[idx, "status"] = "finished"
          elif current_status == "postprocessing-error":
              df.loc[idx, "status"] = "error"
+         elif current_status == "running":
+             df.loc[idx, "status"] = "running"
          else:
              raise ValueError(
                  f"Invalid status {current_status} for job {df.loc[idx, 'id']} for done_callback!"
              )
-
-
- class PostJobStatus(Enum):
-     """Indicates the workers if the job finished as sucessful or with an error."""
-
-     FINISHED = "finished"
-     ERROR = "error"
+     else:
+         _log.exception(
+             "Exception occurred in post-job future for job %s:\n%s",
+             df.loc[idx, "id"],
+             exception,
+         )
+         df.loc[idx, "status"] = "error"


  class GFMAPJobManager(MultiBackendJobManager):
@@ -53,13 +86,51 @@ class GFMAPJobManager(MultiBackendJobManager):
          post_job_action: Optional[Callable] = None,
          poll_sleep: int = 5,
          n_threads: int = 1,
-         post_job_params: dict = {},
          resume_postproc: bool = True,  # If we need to check for post-job actions that crashed
          restart_failed: bool = False,  # If we need to restart failed jobs
+         stac_enabled: bool = True,
      ):
+         """
+         Initializes the GFMAP job manager.
+
+         Parameters
+         ----------
+         output_dir: Path
+             The base output directory where the results/stac/logs of the jobs will be stored.
+         output_path_generator: Callable
+             User defined function that generates the output path for the job results. Expects as
+             inputs the output directory, the index of the job in the job dataframe
+             and the row of the job, and returns the final path where to save a job result asset.
+         collection_id: Optional[str]
+             The ID of the STAC collection that is being generated. Can be left empty if the STAC
+             catalogue is not being generated or if it is being resumed from an existing catalogue.
+         collection_description: Optional[str]
+             The description of the STAC collection that is being generated.
+         stac: Optional[Union[str, Path]]
+             The path to the STAC collection to be saved or resumed.
+             If None, the default path will be used.
+         post_job_action: Optional[Callable]
+             A user defined function that will be called after a job is finished. It will receive
+             the list of items generated by the job and the row of the job, and should return the
+             updated list of items.
+         poll_sleep: int
+             The time in seconds to wait between polling the backend for job status.
+         n_threads: int
+             The number of threads to execute `on_job_done` and `on_job_error` functions.
+         resume_postproc: bool
+             If set to true, all `on_job_done` and `on_job_error` functions that failed are resumed.
+         restart_failed: bool
+             If set to true, all jobs that failed within the OpenEO backend are restarted.
+         stac_enabled: bool (default=True)
+             If the STAC generation is enabled or not. Disabling it will prevent the creation,
+             update and loading of the STAC collection.
+         """
          self._output_dir = output_dir
+         self._catalogue_cache = output_dir / "catalogue_cache.bin"

          self.stac = stac
+         self.lock = Lock()
+         self.stac_enabled = stac_enabled
          self.collection_id = collection_id
          self.collection_description = collection_description

@@ -74,41 +145,73 @@ class GFMAPJobManager(MultiBackendJobManager):

          self._output_path_gen = output_path_generator
          self._post_job_action = post_job_action
-         self._post_job_params = post_job_params

          # Monkey patching the _normalize_df method to ensure we have no modification on the
          # geometry column
          MultiBackendJobManager._normalize_df = self._normalize_df
          super().__init__(poll_sleep)

-         self._root_collection = self._normalize_stac()
+         if self.stac_enabled:
+             self._root_collection = self._initialize_stac()

-     def _normalize_stac(self):
+     def _load_stac(self) -> Optional[pystac.Collection]:
+         """
+         Loads the STAC collection from the cache, the specified `stac` path or the default path.
+         If no STAC collection is found, returns None.
+         """
          default_collection_path = self._output_dir / "stac/collection.json"
-         if self.stac is not None:
+         if self._catalogue_cache.exists():
+             _log.info(
+                 "Loading the STAC collection from the persisted binary file: %s.",
+                 self._catalogue_cache,
+             )
+             with open(self._catalogue_cache, "rb") as file:
+                 return pickle.load(file)
+         elif self.stac is not None:
              _log.info(
-                 f"Reloading the STAC collection from the provided path: {self.stac}."
+                 "Reloading the STAC collection from the provided path: %s.", self.stac
              )
-             root_collection = pystac.read_file(str(self.stac))
+             return pystac.read_file(str(self.stac))
          elif default_collection_path.exists():
              _log.info(
-                 f"Reload the STAC collection from the default path: {default_collection_path}."
+                 "Reload the STAC collection from the default path: %s.",
+                 default_collection_path,
              )
              self.stac = default_collection_path
-             root_collection = pystac.read_file(str(self.stac))
-         else:
-             _log.info("Starting a fresh STAC collection.")
-             assert (
-                 self.collection_id is not None
-             ), "A collection ID is required to generate a STAC collection."
-             root_collection = pystac.Collection(
-                 id=self.collection_id,
-                 description=self.collection_description,
-                 extent=None,
+             return pystac.read_file(str(self.stac))
+
+         _log.info(
+             "No STAC collection found as cache, in the default path or in the provided path."
+         )
+         return None
+
+     def _create_stac(self) -> pystac.Collection:
+         """
+         Creates and returns new STAC collection. The created stac collection will use the
+         `collection_id` and `collection_description` parameters set in the constructor.
+         """
+         if self.collection_id is None:
+             raise ValueError(
+                 "A collection ID is required to generate a STAC collection."
              )
-         root_collection.license = constants.LICENSE
-         root_collection.add_link(constants.LICENSE_LINK)
-         root_collection.stac_extensions = constants.STAC_EXTENSIONS
+         collection = pystac.Collection(
+             id=self.collection_id,
+             description=self.collection_description,
+             extent=None,
+         )
+         collection.license = constants.LICENSE
+         collection.add_link(constants.LICENSE_LINK)
+         collection.stac_extensions = constants.STAC_EXTENSIONS
+         return collection
+
+     def _initialize_stac(self) -> pystac.Collection:
+         """
+         Loads and returns if possible an existing stac collection, otherwise creates a new one.
+         """
+         root_collection = self._load_stac()
+         if not root_collection:
+             _log.info("Starting a fresh STAC collection.")
+             root_collection = self._create_stac()

          return root_collection

@@ -150,24 +253,40 @@ class GFMAPJobManager(MultiBackendJobManager):
              job = connection.job(row.id)
              if row.status == "postprocessing":
                  _log.info(
-                     f"Resuming postprocessing of job {row.id}, queueing on_job_finished..."
+                     "Resuming postprocessing of job %s, queueing on_job_finished...",
+                     row.id,
                  )
                  future = self._executor.submit(self.on_job_done, job, row)
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
              else:
                  _log.info(
-                     f"Resuming postprocessing of job {row.id}, queueing on_job_error..."
+                     "Resuming postprocessing of job %s, queueing on_job_error...",
+                     row.id,
                  )
                  future = self._executor.submit(self.on_job_error, job, row)
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
              self._futures.append(future)

      def _restart_failed_jobs(self, df: pd.DataFrame):
          """Sets-up failed jobs as "not_started" as they will be restarted by the manager."""
-         failed_tasks = df[df.status == "error"]
+         failed_tasks = df[df.status.isin(["error", "start_failed"])]
          not_started_tasks = df[df.status == "not_started"]
          _log.info(
-             f"Resetting {len(failed_tasks)} failed jobs to 'not_started'. {len(not_started_tasks)} jobs are already 'not_started'."
+             "Resetting %s failed jobs to 'not_started'. %s jobs are already 'not_started'.",
+             len(failed_tasks),
+             len(not_started_tasks),
          )
          for idx, _ in failed_tasks.iterrows():
              df.loc[idx, "status"] = "not_started"
@@ -203,27 +322,53 @@ class GFMAPJobManager(MultiBackendJobManager):
                  job_metadata["status"] == "finished"
              ):
                  _log.info(
-                     f"Job {job.job_id} finished successfully, queueing on_job_done..."
+                     "Job %s finished successfully, queueing on_job_done...", job.job_id
                  )
                  job_status = "postprocessing"
                  future = self._executor.submit(self.on_job_done, job, row)
                  # Future will setup the status to finished when the job is done
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
                  self._futures.append(future)
-                 df.loc[idx, "costs"] = job_metadata["costs"]
+                 if "costs" in job_metadata:
+                     df.loc[idx, "costs"] = job_metadata["costs"]
+                     df.loc[idx, "memory"] = (
+                         job_metadata["usage"]
+                         .get("max_executor_memory", {})
+                         .get("value", None)
+                     )
+
+                 else:
+                     _log.warning(
+                         "Costs not found in job %s metadata. Costs will be set to 'None'.",
+                         job.job_id,
+                     )

              # Case in which it failed
              if (df.loc[idx, "status"] != "error") and (
                  job_metadata["status"] == "error"
              ):
                  _log.info(
-                     f"Job {job.job_id} finished with error, queueing on_job_error..."
+                     "Job %s finished with error, queueing on_job_error...",
+                     job.job_id,
                  )
                  job_status = "postprocessing-error"
                  future = self._executor.submit(self.on_job_error, job, row)
                  # Future will setup the status to error when the job is done
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
                  self._futures.append(future)
+                 if "costs" in job_metadata:
                      df.loc[idx, "costs"] = job_metadata["costs"]

              df.loc[idx, "status"] = job_status
@@ -231,6 +376,7 @@ class GFMAPJobManager(MultiBackendJobManager):
          # Clear the futures that are done and raise their potential exceptions if they occurred.
          self._clear_queued_actions()

+     @retry_on_exception(max_retries=2, delay_s=180)
      def on_job_error(self, job: BatchJob, row: pd.Series):
          """Method called when a job finishes with an error.

@@ -241,7 +387,14 @@ class GFMAPJobManager(MultiBackendJobManager):
          row: pd.Series
              The row in the dataframe that contains the job relative information.
          """
-         logs = job.logs()
+         try:
+             logs = job.logs()
+         except Exception as e:  # pylint: disable=broad-exception-caught
+             _log.exception(
+                 "Error getting logs in `on_job_error` for job %s:\n%s", job.job_id, e
+             )
+             logs = []
+
          error_logs = [log for log in logs if log.level.lower() == "error"]

          job_metadata = job.describe_job()
@@ -260,28 +413,43 @@ class GFMAPJobManager(MultiBackendJobManager):
                  f"Couldn't find any error logs. Please check the error manually on job ID: {job.job_id}."
              )

+     @retry_on_exception(max_retries=2, delay_s=30)
      def on_job_done(self, job: BatchJob, row: pd.Series):
          """Method called when a job finishes successfully. It will first download the results of
          the job and then call the `post_job_action` method.
          """
+
          job_products = {}
-         for idx, asset in enumerate(job.get_results().get_assets()):
+         job_results = job.get_results()
+         asset_ids = [a.name for a in job_results.get_assets()]
+         for idx, asset_id in enumerate(asset_ids):
              try:
+                 asset = job_results.get_asset(asset_id)
                  _log.debug(
-                     f"Generating output path for asset {asset.name} from job {job.job_id}..."
+                     "Generating output path for asset %s from job %s...",
+                     asset_id,
+                     job.job_id,
+                 )
+                 output_path = self._output_path_gen(
+                     self._output_dir, idx, row, asset_id
                  )
-                 output_path = self._output_path_gen(self._output_dir, idx, row)
                  # Make the output path
                  output_path.parent.mkdir(parents=True, exist_ok=True)
                  asset.download(output_path)
                  # Add to the list of downloaded products
-                 job_products[f"{job.job_id}_{asset.name}"] = [output_path]
+                 job_products[f"{job.job_id}_{asset_id}"] = [output_path]
                  _log.debug(
-                     f"Downloaded {asset.name} from job {job.job_id} -> {output_path}"
+                     "Downloaded %s from job %s -> %s",
+                     asset_id,
+                     job.job_id,
+                     output_path,
                  )
              except Exception as e:
                  _log.exception(
-                     f"Error downloading asset {asset.name} from job {job.job_id}", e
+                     "Error downloading asset %s from job %s:\n%s",
+                     asset_id,
+                     job.job_id,
+                     e,
                  )
                  raise e

@@ -302,53 +470,35 @@ class GFMAPJobManager(MultiBackendJobManager):
                      asset.href = str(
                          asset_path
                      )  # Update the asset href to the output location set by the output_path_generator
-                     # item.id = f"{job.job_id}_{item.id}"
+
                  # Add the item to the the current job items.
                  job_items.append(item)
-                 _log.info(f"Parsed item {item.id} from job {job.job_id}")
+                 _log.info("Parsed item %s from job %s", item.id, job.job_id)
              except Exception as e:
                  _log.exception(
-                     f"Error failed to add item {item.id} from job {job.job_id} to STAC collection",
+                     "Error failed to add item %s from job %s to STAC collection:\n%s",
+                     item.id,
+                     job.job_id,
                      e,
                  )
-                 raise e

          # _post_job_action returns an updated list of stac items. Post job action can therefore
          # update the stac items and access their products through the HREF. It is also the
          # reponsible of adding the appropriate metadata/assets to the items.
          if self._post_job_action is not None:
-             _log.debug(f"Calling post job action for job {job.job_id}...")
-             job_items = self._post_job_action(job_items, row, self._post_job_params)
+             _log.debug("Calling post job action for job %s...", job.job_id)
+             job_items = self._post_job_action(job_items, row)

-         _log.info(f"Adding {len(job_items)} items to the STAC collection...")
+         _log.info("Adding %s items to the STAC collection...", len(job_items))

-         with _stac_lock:  # Take the STAC lock to avoid concurrence issues
-             # Filters the job items to only keep the ones that are not already in the collection
-             existing_ids = [item.id for item in self._root_collection.get_all_items()]
-             job_items = [item for item in job_items if item.id not in existing_ids]
+         if self.stac_enabled:
+             with self.lock:
+                 self._update_stac(job.job_id, job_items)

-             self._root_collection.add_items(job_items)
-             _log.info(f"Added {len(job_items)} items to the STAC collection.")
-
-         _log.info(f"Writing STAC collection for {job.job_id} to file...")
-         try:
-             self._write_stac()
-         except Exception as e:
-             _log.exception(
-                 f"Error writing STAC collection for job {job.job_id} to file.", e
-             )
-             raise e
-         _log.info(f"Wrote STAC collection for {job.job_id} to file.")
-
-         _log.info(f"Job {job.job_id} and post job action finished successfully.")
+         _log.info("Job %s and post job action finished successfully.", job.job_id)

      def _normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
-         """Ensure we have the required columns and the expected type for the geometry column.
-
-         :param df: The dataframe to normalize.
-         :return: a new dataframe that is normalized.
-         """
-
+         """Ensure we have the required columns and the expected type for the geometry column."""
          # check for some required columns.
          required_with_default = [
              ("status", "not_started"),
@@ -366,7 +516,7 @@ class GFMAPJobManager(MultiBackendJobManager):
          }
          df = df.assign(**new_columns)

-         _log.debug(f"Normalizing dataframe. Columns: {df.columns}")
+         _log.debug("Normalizing dataframe. Columns: %s", df.columns)

          return df

@@ -401,7 +551,7 @@ class GFMAPJobManager(MultiBackendJobManager):
              The file to track the results of the jobs.
          """
          # Starts the thread pool to work on the on_job_done and on_job_error methods
-         _log.info(f"Starting ThreadPoolExecutor with {self._n_threads} workers.")
+         _log.info("Starting ThreadPoolExecutor with %s workers.", self._n_threads)
          with ThreadPoolExecutor(max_workers=self._n_threads) as executor:
              _log.info("Creating and running jobs.")
              self._executor = executor
@@ -412,6 +562,13 @@ class GFMAPJobManager(MultiBackendJobManager):
              self._wait_queued_actions()
              _log.info("Exiting ThreadPoolExecutor.")
              self._executor = None
+         _log.info("All jobs finished running.")
+         if self.stac_enabled:
+             _log.info("Saving persisted STAC collection to final .json collection.")
+             self._write_stac()
+             _log.info("Saved STAC catalogue to JSON format, all tasks finished!")
+         else:
+             _log.info("STAC was disabled, skipping generation of the catalogue.")

      def _write_stac(self):
          """Writes the STAC collection to the output directory."""
@@ -428,6 +585,36 @@ class GFMAPJobManager(MultiBackendJobManager):
          self._root_collection.normalize_hrefs(str(root_path))
          self._root_collection.save(catalog_type=CatalogType.SELF_CONTAINED)

+     def _persist_stac(self):
+         """Persists the STAC collection by saving it into a binary file."""
+         _log.debug("Validating the STAC collection before persisting.")
+         self._root_collection.validate_all()
+         _log.info("Persisting STAC collection to temp file %s.", self._catalogue_cache)
+         with open(self._catalogue_cache, "wb") as file:
+             pickle.dump(self._root_collection, file)
+
+     def _update_stac(self, job_id: str, job_items: list[pystac.Item]):
+         """Updates the STAC collection by adding the items generated by the job.
+         Does not add duplicates or override with the same item ID.
+         """
+         try:
+             _log.info("Thread %s entered the STAC lock.", threading.get_ident())
+             # Filters the job items to only keep the ones that are not already in the collection
+             existing_ids = [item.id for item in self._root_collection.get_all_items()]
+             job_items = [item for item in job_items if item.id not in existing_ids]
+
+             self._root_collection.add_items(job_items)
+             _log.info("Added %s items to the STAC collection.", len(job_items))
+
+             self._persist_stac()
+         except Exception as e:
+             _log.exception(
+                 "Error adding items to the STAC collection for job %s:\n%s ",
+                 job_id,
+                 str(e),
+             )
+             raise e
+
      def setup_stac(
          self,
          constellation: Optional[str] = None,
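
For reference, below is a minimal usage sketch against the 0.3.0 call sites visible in this diff: the user-supplied output_path_generator now receives the asset ID as an extra argument, post_job_action is called with only the STAC items and the job row (no post_job_params dict anymore), and STAC generation can be toggled with stac_enabled. The import path, collection name and callback bodies are illustrative assumptions, not part of the package.

# Hypothetical callbacks matching the updated call sites shown above.
from pathlib import Path

from openeo_gfmap.manager.job_manager import GFMAPJobManager  # assumed import path


def output_path_generator(output_dir: Path, idx: int, row, asset_id: str) -> Path:
    # 0.3.0 passes the asset ID as a fourth argument; derive the file name from it.
    return output_dir / f"job_{idx}" / asset_id


def post_job_action(job_items, row):
    # Receives the parsed STAC items and the job row, and must return the
    # (possibly updated) list of items.
    return job_items


manager = GFMAPJobManager(
    output_dir=Path("/tmp/gfmap"),
    output_path_generator=output_path_generator,
    post_job_action=post_job_action,
    collection_id="example-collection",  # required only when STAC generation is enabled
    collection_description="Example STAC collection",
    n_threads=2,
    stac_enabled=True,  # new in 0.3.0; set to False to skip the STAC catalogue entirely
)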