openeo-gfmap 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,11 @@
  import json
+ import pickle
  import threading
+ import time
  from concurrent.futures import ThreadPoolExecutor
- from enum import Enum
  from functools import partial
  from pathlib import Path
+ from threading import Lock
  from typing import Callable, Optional, Union

  import pandas as pd
@@ -16,28 +18,62 @@ from openeo_gfmap.manager import _log
  from openeo_gfmap.stac import constants

  # Lock to use when writing to the STAC collection
- _stac_lock = threading.Lock()
+ _stac_lock = Lock()
+
+
+ def retry_on_exception(max_retries: int, delay_s: int = 180):
+     """Decorator to retry a function if an exception occurs.
+     Used for post-job actions that can crash due to internal backend issues. Restarting the action
+     usually helps to solve the issue.
+
+     Parameters
+     ----------
+     max_retries: int
+         The maximum number of retries to attempt before finally raising the exception.
+     delay_s: int (default=180 seconds)
+         The delay in seconds to wait before retrying the decorated function.
+     """
+
+     def decorator(func):
+         def wrapper(*args, **kwargs):
+             latest_exception = None
+             for _ in range(max_retries):
+                 try:
+                     return func(*args, **kwargs)
+                 except Exception as e:
+                     time.sleep(
+                         delay_s
+                     )  # Waits before retrying, while allowing other futures to run.
+                     latest_exception = e
+             raise latest_exception
+
+         return wrapper
+
+     return decorator


  def done_callback(future, df, idx):
-     """Sets the status of the job to the given status when the future is done."""
+     """Changes the status of the job when the post-job action future is done."""
      current_status = df.loc[idx, "status"]
-     if not future.exception():
+     exception = future.exception()
+     if exception is None:
          if current_status == "postprocessing":
              df.loc[idx, "status"] = "finished"
          elif current_status == "postprocessing-error":
              df.loc[idx, "status"] = "error"
+         elif current_status == "running":
+             df.loc[idx, "status"] = "running"
          else:
              raise ValueError(
                  f"Invalid status {current_status} for job {df.loc[idx, 'id']} for done_callback!"
              )
-
-
- class PostJobStatus(Enum):
-     """Indicates the workers if the job finished as sucessful or with an error."""
-
-     FINISHED = "finished"
-     ERROR = "error"
+     else:
+         _log.exception(
+             "Exception occurred in post-job future for job %s:\n%s",
+             df.loc[idx, "id"],
+             exception,
+         )
+         df.loc[idx, "status"] = "error"


  class GFMAPJobManager(MultiBackendJobManager):
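Note: the new retry_on_exception decorator simply re-invokes the wrapped callable up to max_retries times, sleeping delay_s seconds after each failure and re-raising the last exception once the attempts are exhausted. A minimal sketch of that behaviour, assuming the decorator is importable from openeo_gfmap.manager.job_manager (the flaky_post_job_action function below is purely illustrative):

from openeo_gfmap.manager.job_manager import retry_on_exception  # assumed import path

attempts = {"count": 0}


@retry_on_exception(max_retries=3, delay_s=1)
def flaky_post_job_action():
    # Illustrative action that fails twice before succeeding on the third attempt.
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise RuntimeError("transient backend error")
    return "done"


print(flaky_post_job_action())  # sleeps 1 s after each failure, prints "done" on the third try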
@@ -53,13 +89,50 @@ class GFMAPJobManager(MultiBackendJobManager):
          post_job_action: Optional[Callable] = None,
          poll_sleep: int = 5,
          n_threads: int = 1,
-         post_job_params: dict = {},
          resume_postproc: bool = True,  # If we need to check for post-job actions that crashed
          restart_failed: bool = False,  # If we need to restart failed jobs
+         stac_enabled: bool = True,
      ):
+         """
+         Initializes the GFMAP job manager.
+
+         Parameters
+         ----------
+         output_dir: Path
+             The base output directory where the results/stac/logs of the jobs will be stored.
+         output_path_generator: Callable
+             User defined function that generates the output path for the job results. Expects as
+             inputs the output directory, the index of the job in the job dataframe
+             and the row of the job, and returns the final path where to save a job result asset.
+         collection_id: Optional[str]
+             The ID of the STAC collection that is being generated. Can be left empty if the STAC
+             catalogue is not being generated or if it is being resumed from an existing catalogue.
+         collection_description: Optional[str]
+             The description of the STAC collection that is being generated.
+         stac: Optional[Union[str, Path]]
+             The path to the STAC collection to be saved or resumed.
+             If None, the default path will be used.
+         post_job_action: Optional[Callable]
+             A user defined function that will be called after a job is finished. It will receive
+             the list of items generated by the job and the row of the job, and should return the
+             updated list of items.
+         poll_sleep: int
+             The time in seconds to wait between polling the backend for job status.
+         n_threads: int
+             The number of threads to execute `on_job_done` and `on_job_error` functions.
+         resume_postproc: bool
+             If set to true, all `on_job_done` and `on_job_error` functions that failed are resumed.
+         restart_failed: bool
+             If set to true, all jobs that failed within the OpenEO backend are restarted.
+         stac_enabled: bool (default=True)
+             If the STAC generation is enabled or not. Disabling it will prevent the creation,
+             update and loading of the STAC collection.
+         """
          self._output_dir = output_dir
+         self._catalogue_cache = output_dir / "catalogue_cache.bin"

          self.stac = stac
+         self.stac_enabled = stac_enabled
          self.collection_id = collection_id
          self.collection_description = collection_description

@@ -74,41 +147,73 @@ class GFMAPJobManager(MultiBackendJobManager):

          self._output_path_gen = output_path_generator
          self._post_job_action = post_job_action
-         self._post_job_params = post_job_params

          # Monkey patching the _normalize_df method to ensure we have no modification on the
          # geometry column
          MultiBackendJobManager._normalize_df = self._normalize_df
          super().__init__(poll_sleep)

-         self._root_collection = self._normalize_stac()
+         if self.stac_enabled:
+             self._root_collection = self._initialize_stac()

-     def _normalize_stac(self):
+     def _load_stac(self) -> Optional[pystac.Collection]:
+         """
+         Loads the STAC collection from the cache, the specified `stac` path or the default path.
+         If no STAC collection is found, returns None.
+         """
          default_collection_path = self._output_dir / "stac/collection.json"
-         if self.stac is not None:
+         if self._catalogue_cache.exists():
              _log.info(
-                 f"Reloading the STAC collection from the provided path: {self.stac}."
+                 "Loading the STAC collection from the persisted binary file: %s.",
+                 self._catalogue_cache,
              )
-             root_collection = pystac.read_file(str(self.stac))
+             with open(self._catalogue_cache, "rb") as file:
+                 return pickle.load(file)
+         elif self.stac is not None:
+             _log.info(
+                 "Reloading the STAC collection from the provided path: %s.", self.stac
+             )
+             return pystac.read_file(str(self.stac))
          elif default_collection_path.exists():
              _log.info(
-                 f"Reload the STAC collection from the default path: {default_collection_path}."
+                 "Reload the STAC collection from the default path: %s.",
+                 default_collection_path,
              )
              self.stac = default_collection_path
-             root_collection = pystac.read_file(str(self.stac))
-         else:
-             _log.info("Starting a fresh STAC collection.")
-             assert (
-                 self.collection_id is not None
-             ), "A collection ID is required to generate a STAC collection."
-             root_collection = pystac.Collection(
-                 id=self.collection_id,
-                 description=self.collection_description,
-                 extent=None,
+             return pystac.read_file(str(self.stac))
+
+         _log.info(
+             "No STAC collection found as cache, in the default path or in the provided path."
+         )
+         return None
+
+     def _create_stac(self) -> pystac.Collection:
+         """
+         Creates and returns new STAC collection. The created stac collection will use the
+         `collection_id` and `collection_description` parameters set in the constructor.
+         """
+         if self.collection_id is None:
+             raise ValueError(
+                 "A collection ID is required to generate a STAC collection."
              )
-         root_collection.license = constants.LICENSE
-         root_collection.add_link(constants.LICENSE_LINK)
-         root_collection.stac_extensions = constants.STAC_EXTENSIONS
+         collection = pystac.Collection(
+             id=self.collection_id,
+             description=self.collection_description,
+             extent=None,
+         )
+         collection.license = constants.LICENSE
+         collection.add_link(constants.LICENSE_LINK)
+         collection.stac_extensions = constants.STAC_EXTENSIONS
+         return collection
+
+     def _initialize_stac(self) -> pystac.Collection:
+         """
+         Loads and returns if possible an existing stac collection, otherwise creates a new one.
+         """
+         root_collection = self._load_stac()
+         if not root_collection:
+             _log.info("Starting a fresh STAC collection.")
+             root_collection = self._create_stac()

          return root_collection

@@ -150,24 +255,40 @@ class GFMAPJobManager(MultiBackendJobManager):
              job = connection.job(row.id)
              if row.status == "postprocessing":
                  _log.info(
-                     f"Resuming postprocessing of job {row.id}, queueing on_job_finished..."
+                     "Resuming postprocessing of job %s, queueing on_job_finished...",
+                     row.id,
+                 )
+                 future = self._executor.submit(self.on_job_done, job, row, _stac_lock)
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
                  )
-                 future = self._executor.submit(self.on_job_done, job, row)
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
              else:
                  _log.info(
-                     f"Resuming postprocessing of job {row.id}, queueing on_job_error..."
+                     "Resuming postprocessing of job %s, queueing on_job_error...",
+                     row.id,
                  )
                  future = self._executor.submit(self.on_job_error, job, row)
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
              self._futures.append(future)

      def _restart_failed_jobs(self, df: pd.DataFrame):
          """Sets-up failed jobs as "not_started" as they will be restarted by the manager."""
-         failed_tasks = df[df.status == "error"]
+         failed_tasks = df[df.status.isin(["error", "start_failed"])]
          not_started_tasks = df[df.status == "not_started"]
          _log.info(
-             f"Resetting {len(failed_tasks)} failed jobs to 'not_started'. {len(not_started_tasks)} jobs are already 'not_started'."
+             "Resetting %s failed jobs to 'not_started'. %s jobs are already 'not_started'.",
+             len(failed_tasks),
+             len(not_started_tasks),
          )
          for idx, _ in failed_tasks.iterrows():
              df.loc[idx, "status"] = "not_started"
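Note: the constructor parameters documented in the hunks above translate into a call along these lines. This is a hedged sketch: the import path, directory and collection values are illustrative, and generate_output_path only mirrors the documented (output directory, index, row) signature of output_path_generator.

from pathlib import Path

from openeo_gfmap.manager.job_manager import GFMAPJobManager  # assumed import path


def generate_output_path(root_dir: Path, index: int, row) -> Path:
    # Illustrative: one output file per job row and asset index.
    return root_dir / "results" / f"{row['id']}_{index}.nc"


manager = GFMAPJobManager(
    output_dir=Path("/tmp/gfmap_run"),
    output_path_generator=generate_output_path,
    collection_id="example-extractions",  # required whenever STAC generation is enabled
    collection_description="Example extraction collection",
    poll_sleep=60,
    n_threads=2,
    restart_failed=True,  # jobs in "error" or "start_failed" are reset to "not_started"
    stac_enabled=True,  # set to False to skip creating/updating the STAC catalogue
)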
@@ -203,27 +324,53 @@ class GFMAPJobManager(MultiBackendJobManager):
                  job_metadata["status"] == "finished"
              ):
                  _log.info(
-                     f"Job {job.job_id} finished successfully, queueing on_job_done..."
+                     "Job %s finished successfully, queueing on_job_done...", job.job_id
                  )
                  job_status = "postprocessing"
-                 future = self._executor.submit(self.on_job_done, job, row)
+                 future = self._executor.submit(self.on_job_done, job, row, _stac_lock)
                  # Future will setup the status to finished when the job is done
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
                  self._futures.append(future)
-                 df.loc[idx, "costs"] = job_metadata["costs"]
+                 if "costs" in job_metadata:
+                     df.loc[idx, "costs"] = job_metadata["costs"]
+                     df.loc[idx, "memory"] = (
+                         job_metadata["usage"]
+                         .get("max_executor_memory", {})
+                         .get("value", None)
+                     )
+
+                 else:
+                     _log.warning(
+                         "Costs not found in job %s metadata. Costs will be set to 'None'.",
+                         job.job_id,
+                     )

              # Case in which it failed
              if (df.loc[idx, "status"] != "error") and (
                  job_metadata["status"] == "error"
              ):
                  _log.info(
-                     f"Job {job.job_id} finished with error, queueing on_job_error..."
+                     "Job %s finished with error, queueing on_job_error...",
+                     job.job_id,
                  )
                  job_status = "postprocessing-error"
                  future = self._executor.submit(self.on_job_error, job, row)
                  # Future will setup the status to error when the job is done
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
                  self._futures.append(future)
+                 if "costs" in job_metadata:
                      df.loc[idx, "costs"] = job_metadata["costs"]

              df.loc[idx, "status"] = job_status
@@ -231,6 +378,7 @@ class GFMAPJobManager(MultiBackendJobManager):
          # Clear the futures that are done and raise their potential exceptions if they occurred.
          self._clear_queued_actions()

+     @retry_on_exception(max_retries=2, delay_s=180)
      def on_job_error(self, job: BatchJob, row: pd.Series):
          """Method called when a job finishes with an error.

@@ -241,7 +389,14 @@ class GFMAPJobManager(MultiBackendJobManager):
          row: pd.Series
              The row in the dataframe that contains the job relative information.
          """
-         logs = job.logs()
+         try:
+             logs = job.logs()
+         except Exception as e:  # pylint: disable=broad-exception-caught
+             _log.exception(
+                 "Error getting logs in `on_job_error` for job %s:\n%s", job.job_id, e
+             )
+             logs = []
+
          error_logs = [log for log in logs if log.level.lower() == "error"]

          job_metadata = job.describe_job()
@@ -260,15 +415,21 @@ class GFMAPJobManager(MultiBackendJobManager):
                  f"Couldn't find any error logs. Please check the error manually on job ID: {job.job_id}."
              )

-     def on_job_done(self, job: BatchJob, row: pd.Series):
+     @retry_on_exception(max_retries=2, delay_s=30)
+     def on_job_done(
+         self, job: BatchJob, row: pd.Series, lock: Lock
+     ):  # pylint: disable=arguments-differ
          """Method called when a job finishes successfully. It will first download the results of
          the job and then call the `post_job_action` method.
          """
+
          job_products = {}
          for idx, asset in enumerate(job.get_results().get_assets()):
              try:
                  _log.debug(
-                     f"Generating output path for asset {asset.name} from job {job.job_id}..."
+                     "Generating output path for asset %s from job %s...",
+                     asset.name,
+                     job.job_id,
                  )
                  output_path = self._output_path_gen(self._output_dir, idx, row)
                  # Make the output path
@@ -277,11 +438,17 @@ class GFMAPJobManager(MultiBackendJobManager):
                  # Add to the list of downloaded products
                  job_products[f"{job.job_id}_{asset.name}"] = [output_path]
                  _log.debug(
-                     f"Downloaded {asset.name} from job {job.job_id} -> {output_path}"
+                     "Downloaded %s from job %s -> %s",
+                     asset.name,
+                     job.job_id,
+                     output_path,
                  )
              except Exception as e:
                  _log.exception(
-                     f"Error downloading asset {asset.name} from job {job.job_id}", e
+                     "Error downloading asset %s from job %s:\n%s",
+                     asset.name,
+                     job.job_id,
+                     e,
                  )
                  raise e

@@ -302,53 +469,35 @@ class GFMAPJobManager(MultiBackendJobManager):
                      asset.href = str(
                          asset_path
                      )  # Update the asset href to the output location set by the output_path_generator
-                 # item.id = f"{job.job_id}_{item.id}"
+
                  # Add the item to the the current job items.
                  job_items.append(item)
-                 _log.info(f"Parsed item {item.id} from job {job.job_id}")
+                 _log.info("Parsed item %s from job %s", item.id, job.job_id)
              except Exception as e:
                  _log.exception(
-                     f"Error failed to add item {item.id} from job {job.job_id} to STAC collection",
+                     "Error failed to add item %s from job %s to STAC collection:\n%s",
+                     item.id,
+                     job.job_id,
                      e,
                  )
-                 raise e

          # _post_job_action returns an updated list of stac items. Post job action can therefore
          # update the stac items and access their products through the HREF. It is also the
          # reponsible of adding the appropriate metadata/assets to the items.
          if self._post_job_action is not None:
-             _log.debug(f"Calling post job action for job {job.job_id}...")
-             job_items = self._post_job_action(job_items, row, self._post_job_params)
+             _log.debug("Calling post job action for job %s...", job.job_id)
+             job_items = self._post_job_action(job_items, row)

-         _log.info(f"Adding {len(job_items)} items to the STAC collection...")
+         _log.info("Adding %s items to the STAC collection...", len(job_items))

-         with _stac_lock:  # Take the STAC lock to avoid concurrence issues
-             # Filters the job items to only keep the ones that are not already in the collection
-             existing_ids = [item.id for item in self._root_collection.get_all_items()]
-             job_items = [item for item in job_items if item.id not in existing_ids]
+         if self.stac_enabled:
+             with lock:
+                 self._update_stac(job.job_id, job_items)

-             self._root_collection.add_items(job_items)
-             _log.info(f"Added {len(job_items)} items to the STAC collection.")
-
-             _log.info(f"Writing STAC collection for {job.job_id} to file...")
-             try:
-                 self._write_stac()
-             except Exception as e:
-                 _log.exception(
-                     f"Error writing STAC collection for job {job.job_id} to file.", e
-                 )
-                 raise e
-             _log.info(f"Wrote STAC collection for {job.job_id} to file.")
-
-         _log.info(f"Job {job.job_id} and post job action finished successfully.")
+         _log.info("Job %s and post job action finished successfully.", job.job_id)

      def _normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
-         """Ensure we have the required columns and the expected type for the geometry column.
-
-         :param df: The dataframe to normalize.
-         :return: a new dataframe that is normalized.
-         """
-
+         """Ensure we have the required columns and the expected type for the geometry column."""
          # check for some required columns.
          required_with_default = [
              ("status", "not_started"),
@@ -366,7 +515,7 @@ class GFMAPJobManager(MultiBackendJobManager):
          }
          df = df.assign(**new_columns)

-         _log.debug(f"Normalizing dataframe. Columns: {df.columns}")
+         _log.debug("Normalizing dataframe. Columns: %s", df.columns)

          return df

@@ -401,7 +550,7 @@ class GFMAPJobManager(MultiBackendJobManager):
              The file to track the results of the jobs.
          """
          # Starts the thread pool to work on the on_job_done and on_job_error methods
-         _log.info(f"Starting ThreadPoolExecutor with {self._n_threads} workers.")
+         _log.info("Starting ThreadPoolExecutor with %s workers.", self._n_threads)
          with ThreadPoolExecutor(max_workers=self._n_threads) as executor:
              _log.info("Creating and running jobs.")
              self._executor = executor
@@ -412,6 +561,13 @@ class GFMAPJobManager(MultiBackendJobManager):
              self._wait_queued_actions()
              _log.info("Exiting ThreadPoolExecutor.")
              self._executor = None
+         _log.info("All jobs finished running.")
+         if self.stac_enabled:
+             _log.info("Saving persisted STAC collection to final .json collection.")
+             self._write_stac()
+             _log.info("Saved STAC catalogue to JSON format, all tasks finished!")
+         else:
+             _log.info("STAC was disabled, skipping generation of the catalogue.")

      def _write_stac(self):
          """Writes the STAC collection to the output directory."""
@@ -428,6 +584,36 @@ class GFMAPJobManager(MultiBackendJobManager):
          self._root_collection.normalize_hrefs(str(root_path))
          self._root_collection.save(catalog_type=CatalogType.SELF_CONTAINED)

+     def _persist_stac(self):
+         """Persists the STAC collection by saving it into a binary file."""
+         _log.debug("Validating the STAC collection before persisting.")
+         self._root_collection.validate_all()
+         _log.info("Persisting STAC collection to temp file %s.", self._catalogue_cache)
+         with open(self._catalogue_cache, "wb") as file:
+             pickle.dump(self._root_collection, file)
+
+     def _update_stac(self, job_id: str, job_items: list[pystac.Item]):
+         """Updates the STAC collection by adding the items generated by the job.
+         Does not add duplicates or override with the same item ID.
+         """
+         try:
+             _log.info("Thread %s entered the STAC lock.", threading.get_ident())
+             # Filters the job items to only keep the ones that are not already in the collection
+             existing_ids = [item.id for item in self._root_collection.get_all_items()]
+             job_items = [item for item in job_items if item.id not in existing_ids]
+
+             self._root_collection.add_items(job_items)
+             _log.info("Added %s items to the STAC collection.", len(job_items))
+
+             self._persist_stac()
+         except Exception as e:
+             _log.exception(
+                 "Error adding items to the STAC collection for job %s:\n%s ",
+                 job_id,
+                 str(e),
+             )
+             raise e
+
      def setup_stac(
          self,
          constellation: Optional[str] = None,
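Note: _persist_stac pickles the in-memory collection after every _update_stac call, and _load_stac prefers that cache over stac/collection.json when the manager restarts. A minimal sketch of inspecting such a cache, assuming a catalogue_cache.bin left behind by a previous run (the path below is illustrative):

import pickle
from pathlib import Path

import pystac

# Illustrative location: <output_dir>/catalogue_cache.bin as written by _persist_stac.
cache_path = Path("/tmp/gfmap_run/catalogue_cache.bin")

if cache_path.exists():
    with open(cache_path, "rb") as file:
        collection: pystac.Collection = pickle.load(file)
    print(collection.id, len(list(collection.get_all_items())))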
@@ -12,21 +12,32 @@ import requests
  from openeo_gfmap.manager import _log


- def load_s2_grid() -> gpd.GeoDataFrame:
+ def load_s2_grid(web_mercator: bool = False) -> gpd.GeoDataFrame:
      """Returns a geo data frame from the S2 grid."""
      # Builds the path where the geodataframe should be
-     gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds.geojson"
+     if not web_mercator:
+         gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds_4326_v2.geoparquet"
+         url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_4326_v2.geoparquet"
+     else:
+         gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds_3857_v2.geoparquet"
+         url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_3857_v2.geoparquet"
+
      if not gdf_path.exists():
          _log.info("S2 grid not found, downloading it from artifactory.")
          # Downloads the file from the artifactory URL
          gdf_path.parent.mkdir(exist_ok=True)
          response = requests.get(
-             "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds.geojson",
+             url,
              timeout=180,  # 3mins
          )
+         if response.status_code != 200:
+             raise ValueError(
+                 "Failed to download the S2 grid from the artifactory. "
+                 f"Status code: {response.status_code}"
+             )
          with open(gdf_path, "wb") as f:
              f.write(response.content)
-     return gpd.read_file(gdf_path)
+     return gpd.read_parquet(gdf_path)


  def _resplit_group(
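Note: load_s2_grid now fetches a GeoParquet version of the grid (v2) and can return it in either EPSG:4326 (the default) or EPSG:3857, based on the file names above. A short usage sketch, assuming the function is importable from openeo_gfmap.manager.job_splitters; the tile and cdse_valid columns are the ones the splitter relies on:

from openeo_gfmap.manager.job_splitters import load_s2_grid  # assumed import path

grid_latlon = load_s2_grid()  # EPSG:4326 grid, cached under ~/.openeo-gfmap
grid_mercator = load_s2_grid(web_mercator=True)  # EPSG:3857 variant

print(grid_latlon[["tile", "cdse_valid"]].head())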
@@ -38,7 +49,7 @@


  def split_job_s2grid(
-     polygons: gpd.GeoDataFrame, max_points: int = 500
+     polygons: gpd.GeoDataFrame, max_points: int = 500, web_mercator: bool = False
  ) -> List[gpd.GeoDataFrame]:
      """Split a job into multiple jobs from the position of the polygons/points. The centroid of
      the geometries to extract are used to select tile in the Sentinel-2 tile grid.
@@ -60,17 +71,25 @@
      if polygons.crs is None:
          raise ValueError("The GeoDataFrame must contain a CRS")

-     polygons = polygons.to_crs(epsg=4326)
-     if polygons.geometry.geom_type[0] != "Point":
-         polygons["geometry"] = polygons.geometry.centroid
+     epsg = 3857 if web_mercator else 4326
+
+     original_crs = polygons.crs
+
+     polygons = polygons.to_crs(epsg=epsg)
+
+     polygons["centroid"] = polygons.geometry.centroid

      # Dataset containing all the S2 tiles, find the nearest S2 tile for each point
-     s2_grid = load_s2_grid()
+     s2_grid = load_s2_grid(web_mercator)
      s2_grid["geometry"] = s2_grid.geometry.centroid

-     polygons = gpd.sjoin_nearest(polygons, s2_grid[["tile", "geometry"]]).drop(
-         columns=["index_right"]
-     )
+     s2_grid = s2_grid[s2_grid.cdse_valid]
+
+     polygons = gpd.sjoin_nearest(
+         polygons.set_geometry("centroid"), s2_grid[["tile", "geometry"]]
+     ).drop(columns=["index_right", "centroid"])
+
+     polygons = polygons.set_geometry("geometry").to_crs(original_crs)

      split_datasets = []
      for _, sub_gdf in polygons.groupby("tile"):
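Note: split_job_s2grid now preserves the caller's geometries and CRS: centroids are computed in a temporary centroid column, joined to the nearest CDSE-valid Sentinel-2 tile, and the result is reprojected back to the original CRS before grouping by tile. A hedged usage sketch with illustrative points (import path assumed as above):

import geopandas as gpd
from shapely.geometry import Point

from openeo_gfmap.manager.job_splitters import split_job_s2grid  # assumed import path

points = gpd.GeoDataFrame(
    {"sample_id": ["a", "b", "c"]},
    geometry=[Point(4.35, 50.85), Point(4.40, 50.90), Point(5.57, 50.63)],
    crs="EPSG:4326",
)

# One GeoDataFrame per Sentinel-2 tile, split further when a tile exceeds max_points.
jobs = split_job_s2grid(points, max_points=2, web_mercator=False)
for job_gdf in jobs:
    print(job_gdf["tile"].iloc[0], len(job_gdf))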
@@ -86,10 +105,13 @@ def append_h3_index(
      polygons: gpd.GeoDataFrame, grid_resolution: int = 3
  ) -> gpd.GeoDataFrame:
      """Append the H3 index to the polygons."""
-     if polygons.geometry.geom_type[0] != "Point":
-         geom_col = polygons.geometry.centroid
-     else:
-         geom_col = polygons.geometry
+
+     # Project to Web mercator to calculate centroids
+     polygons = polygons.to_crs(epsg=3857)
+     geom_col = polygons.geometry.centroid
+     # Project to lat lon to calculate the h3 index
+     geom_col = geom_col.to_crs(epsg=4326)
+
      polygons["h3index"] = geom_col.apply(
          lambda pt: h3.geo_to_h3(pt.y, pt.x, grid_resolution)
      )
@@ -127,12 +149,13 @@ def split_job_hex(
      if polygons.crs is None:
          raise ValueError("The GeoDataFrame must contain a CRS")

-     # Project to lat/lon positions
-     polygons = polygons.to_crs(epsg=4326)
+     original_crs = polygons.crs

      # Split the polygons into multiple jobs
      polygons = append_h3_index(polygons, grid_resolution)

+     polygons = polygons.to_crs(original_crs)
+
      split_datasets = []
      for _, sub_gdf in polygons.groupby("h3index"):
          if len(sub_gdf) > max_points:
@@ -29,7 +29,7 @@ PLATFORM = {
  INSTRUMENTS = {"sentinel2": ["msi"], "sentinel1": ["c-sar"]}


- GSD = {"sentinel2": [10, 20, 60], "sentinel1": [10]}
+ GSD = {"sentinel2": [10, 20, 60], "sentinel1": [20]}

  SUMMARIES = {
      "sentinel2": pystac.summaries.Summaries(