openeo-gfmap 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
@@ -8,25 +8,69 @@ from typing import List
 import geopandas as gpd
 import h3
 import requests
+import s2sphere
 
 from openeo_gfmap.manager import _log
 
 
-def load_s2_grid() -> gpd.GeoDataFrame:
+def load_s2_grid(web_mercator: bool = False) -> gpd.GeoDataFrame:
     """Returns a geo data frame from the S2 grid."""
     # Builds the path where the geodataframe should be
-    gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds.geojson"
+    if not web_mercator:
+        gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds_4326_v2.geoparquet"
+        url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_4326_v2.geoparquet"
+    else:
+        gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds_3857_v2.geoparquet"
+        url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_3857_v2.geoparquet"
+
     if not gdf_path.exists():
         _log.info("S2 grid not found, downloading it from artifactory.")
         # Downloads the file from the artifactory URL
         gdf_path.parent.mkdir(exist_ok=True)
         response = requests.get(
-            "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds.geojson",
+            url,
             timeout=180,  # 3mins
         )
+        if response.status_code != 200:
+            raise ValueError(
+                "Failed to download the S2 grid from the artifactory. "
+                f"Status code: {response.status_code}"
+            )
         with open(gdf_path, "wb") as f:
             f.write(response.content)
-    return gpd.read_file(gdf_path)
+    return gpd.read_parquet(gdf_path)
+
+
+def load_s2_grid_centroids(web_mercator: bool = False) -> gpd.GeoDataFrame:
+    """Returns a geo data frame from the S2 grid centroids."""
+    # Builds the path where the geodataframe should be
+    if not web_mercator:
+        gdf_path = (
+            Path.home() / ".openeo-gfmap" / "s2grid_bounds_4326_centroids.geoparquet"
+        )
+        url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_4326_centroids.geoparquet"
+    else:
+        gdf_path = (
+            Path.home() / ".openeo-gfmap" / "s2grid_bounds_3857_centroids.geoparquet"
+        )
+        url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_3857_centroids.geoparquet"
+
+    if not gdf_path.exists():
+        _log.info("S2 grid centroids not found, downloading it from artifactory.")
+        # Downloads the file from the artifactory URL
+        gdf_path.parent.mkdir(exist_ok=True)
+        response = requests.get(
+            url,
+            timeout=180,  # 3mins
+        )
+        if response.status_code != 200:
+            raise ValueError(
+                "Failed to download the S2 grid centroids from the artifactory. "
+                f"Status code: {response.status_code}"
+            )
+        with open(gdf_path, "wb") as f:
+            f.write(response.content)
+    return gpd.read_parquet(gdf_path)
 
 
 def _resplit_group(
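Both loaders cache the grid as a GeoParquet file under ~/.openeo-gfmap/ and download it from Artifactory only on a cache miss; the web_mercator flag selects between the EPSG:4326 and EPSG:3857 variants. A minimal usage sketch, assuming the functions live in openeo_gfmap.manager.job_splitters (the module path is inferred from the surrounding diff context):

    from openeo_gfmap.manager.job_splitters import load_s2_grid, load_s2_grid_centroids

    # First call downloads the GeoParquet file into ~/.openeo-gfmap/; later calls read the cache.
    grid = load_s2_grid()                                  # tile bounds in EPSG:4326
    centroids = load_s2_grid_centroids(web_mercator=True)  # tile centroids in EPSG:3857
    print(grid.crs, len(grid))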
@@ -38,7 +82,7 @@ def _resplit_group(
 
 
 def split_job_s2grid(
-    polygons: gpd.GeoDataFrame, max_points: int = 500
+    polygons: gpd.GeoDataFrame, max_points: int = 500, web_mercator: bool = False
 ) -> List[gpd.GeoDataFrame]:
     """Split a job into multiple jobs from the position of the polygons/points. The centroid of
     the geometries to extract are used to select tile in the Sentinel-2 tile grid.
@@ -60,17 +104,24 @@ def split_job_s2grid(
     if polygons.crs is None:
         raise ValueError("The GeoDataFrame must contain a CRS")
 
-    polygons = polygons.to_crs(epsg=4326)
-    if polygons.geometry.geom_type[0] != "Point":
-        polygons["geometry"] = polygons.geometry.centroid
+    epsg = 3857 if web_mercator else 4326
 
-    # Dataset containing all the S2 tiles, find the nearest S2 tile for each point
-    s2_grid = load_s2_grid()
-    s2_grid["geometry"] = s2_grid.geometry.centroid
+    original_crs = polygons.crs
 
-    polygons = gpd.sjoin_nearest(polygons, s2_grid[["tile", "geometry"]]).drop(
-        columns=["index_right"]
-    )
+    polygons = polygons.to_crs(epsg=epsg)
+
+    polygons["centroid"] = polygons.geometry.centroid
+
+    # Dataset containing all the S2 tile centroids, find the nearest S2 tile for each point
+    s2_grid = load_s2_grid_centroids(web_mercator)
+
+    s2_grid = s2_grid[s2_grid.cdse_valid]
+
+    polygons = gpd.sjoin_nearest(
+        polygons.set_geometry("centroid"), s2_grid[["tile", "geometry"]]
+    ).drop(columns=["index_right", "centroid"])
+
+    polygons = polygons.set_geometry("geometry").to_crs(original_crs)
 
     split_datasets = []
     for _, sub_gdf in polygons.groupby("tile"):
@@ -86,12 +137,15 @@ def append_h3_index(
     polygons: gpd.GeoDataFrame, grid_resolution: int = 3
 ) -> gpd.GeoDataFrame:
     """Append the H3 index to the polygons."""
-    if polygons.geometry.geom_type[0] != "Point":
-        geom_col = polygons.geometry.centroid
-    else:
-        geom_col = polygons.geometry
+
+    # Project to Web mercator to calculate centroids
+    polygons = polygons.to_crs(epsg=3857)
+    geom_col = polygons.geometry.centroid
+    # Project to lat lon to calculate the h3 index
+    geom_col = geom_col.to_crs(epsg=4326)
+
     polygons["h3index"] = geom_col.apply(
-        lambda pt: h3.geo_to_h3(pt.y, pt.x, grid_resolution)
+        lambda pt: h3.latlng_to_cell(pt.y, pt.x, grid_resolution)
     )
     return polygons
 
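The lambda change tracks the h3-py v4 API, where geo_to_h3 was renamed to latlng_to_cell; the (lat, lng, resolution) argument order is unchanged. A standalone check:

    import h3  # requires h3-py >= 4

    # v3: h3.geo_to_h3(lat, lng, res)  ->  v4: h3.latlng_to_cell(lat, lng, res)
    cell = h3.latlng_to_cell(50.85, 4.35, 3)
    print(cell)  # a 15-character hex cell index at resolution 3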
@@ -127,12 +181,13 @@ def split_job_hex(
     if polygons.crs is None:
         raise ValueError("The GeoDataFrame must contain a CRS")
 
-    # Project to lat/lon positions
-    polygons = polygons.to_crs(epsg=4326)
+    original_crs = polygons.crs
 
     # Split the polygons into multiple jobs
     polygons = append_h3_index(polygons, grid_resolution)
 
+    polygons = polygons.to_crs(original_crs)
+
     split_datasets = []
     for _, sub_gdf in polygons.groupby("h3index"):
         if len(sub_gdf) > max_points:
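split_job_hex now keeps the caller's CRS instead of forcing EPSG:4326 onto the output. A short sketch, assuming the signature split_job_hex(polygons, max_points=500, grid_resolution=3) implied by the call to append_h3_index:

    import geopandas as gpd
    from shapely.geometry import Point

    from openeo_gfmap.manager.job_splitters import split_job_hex

    points = gpd.GeoDataFrame(
        {"id": [1, 2]},
        geometry=[Point(4.35, 50.85), Point(5.57, 50.63)],
        crs="EPSG:4326",
    )
    jobs = split_job_hex(points, max_points=500, grid_resolution=3)
    for job in jobs:
        print(job["h3index"].iloc[0], len(job), job.crs)  # CRS matches the input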
@@ -142,3 +197,96 @@ def split_job_hex(
     split_datasets.append(sub_gdf.reset_index(drop=True))
 
     return split_datasets
+
+
+def split_job_s2sphere(
+    gdf: gpd.GeoDataFrame, max_points=500, start_level=8
+) -> List[gpd.GeoDataFrame]:
+    """
+    EXPERIMENTAL
+    Split a GeoDataFrame into multiple groups based on the S2geometry cell ID of each geometry.
+
+    S2geometry is a library that provides a way to index and query spatial data. This function splits
+    the GeoDataFrame into groups based on the S2 cell ID of each geometry, based on it's centroid.
+
+    If a cell contains more points than max_points, it will be recursively split into
+    smaller cells until each cell contains at most max_points points.
+
+    More information on S2geometry can be found at https://s2geometry.io/
+    An overview of the S2 cell hierarchy can be found at https://s2geometry.io/resources/s2cell_statistics.html
+
+    :param gdf: GeoDataFrame containing points to split
+    :param max_points: Maximum number of points per group
+    :param start_level: Starting S2 cell level
+    :return: List of GeoDataFrames containing the split groups
+    """
+
+    if "geometry" not in gdf.columns:
+        raise ValueError("The GeoDataFrame must contain a 'geometry' column.")
+
+    if gdf.crs is None:
+        raise ValueError("The GeoDataFrame must contain a CRS")
+
+    # Store the original CRS of the GeoDataFrame and reproject to EPSG:3857
+    original_crs = gdf.crs
+    gdf = gdf.to_crs(epsg=3857)
+
+    # Add a centroid column to the GeoDataFrame and convert it to EPSG:4326
+    gdf["centroid"] = gdf.geometry.centroid
+
+    # Reproject the GeoDataFrame to its orginial CRS
+    gdf = gdf.to_crs(original_crs)
+
+    # Set the GeoDataFrame's geometry to the centroid column and reproject to EPSG:4326
+    gdf = gdf.set_geometry("centroid")
+    gdf = gdf.to_crs(epsg=4326)
+
+    # Create a dictionary to store points by their S2 cell ID
+    cell_dict = {}
+
+    # Iterate over each point in the GeoDataFrame
+    for idx, row in gdf.iterrows():
+        # Get the S2 cell ID for the point at a given level
+        cell_id = _get_s2cell_id(row.centroid, start_level)
+
+        if cell_id not in cell_dict:
+            cell_dict[cell_id] = []
+
+        cell_dict[cell_id].append(row)
+
+    result_groups = []
+
+    # Function to recursively split cells if they contain more points than max_points
+    def _split_s2cell(cell_id, points, current_level=start_level):
+        if len(points) <= max_points:
+            if len(points) > 0:
+                points = gpd.GeoDataFrame(
+                    points, crs=original_crs, geometry="geometry"
+                ).drop(columns=["centroid"])
+                points["s2sphere_cell_id"] = cell_id
+                points["s2sphere_cell_level"] = current_level
+                result_groups.append(gpd.GeoDataFrame(points))
+        else:
+            children = s2sphere.CellId(cell_id).children()
+            child_cells = {child.id(): [] for child in children}
+
+            for point in points:
+                child_cell_id = _get_s2cell_id(point.centroid, current_level + 1)
+                child_cells[child_cell_id].append(point)
+
+            for child_cell_id, child_points in child_cells.items():
+                _split_s2cell(child_cell_id, child_points, current_level + 1)
+
+    # Split cells that contain more points than max_points
+    for cell_id, points in cell_dict.items():
+        _split_s2cell(cell_id, points)
+
+    return result_groups
+
+
+def _get_s2cell_id(point, level):
+    lat, lon = point.y, point.x
+    cell_id = s2sphere.CellId.from_lat_lng(
+        s2sphere.LatLng.from_degrees(lat, lon)
+    ).parent(level)
+    return cell_id.id()
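The recursion leans on the S2 cell hierarchy: every cell at level L has exactly four children at level L+1, so a point's cell ID at level L+1 is always one of the CellId.children() of its level-L cell, and the child_cells lookup cannot miss. A small s2sphere sketch of that invariant:

    import s2sphere

    pt = s2sphere.LatLng.from_degrees(50.85, 4.35)
    cell = s2sphere.CellId.from_lat_lng(pt).parent(8)

    # The point's level-9 cell is one of the four level-9 children of its level-8 cell.
    children = list(cell.children())
    child = s2sphere.CellId.from_lat_lng(pt).parent(9)
    assert len(children) == 4
    assert child.id() in [c.id() for c in children]
    print(cell.id(), child.level())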
@@ -1,9 +1,9 @@
 """Routines to pre-process sar signals."""
 
 import openeo
-from openeo.processes import array_create, if_, is_nodata, power
+from openeo.processes import array_create, power
 
-from openeo_gfmap import Backend, BackendContext
+from openeo_gfmap import BackendContext
 
 
 def compress_backscatter_uint16(
@@ -27,38 +27,17 @@ def compress_backscatter_uint16(
     openeo.DataCube
         The datacube with the backscatter values compressed to uint16.
     """
-    backend = backend_context.backend
 
-    # Additional check related to problematic values present in creodias collections.
-    # https://github.com/Open-EO/openeo-geopyspark-driver/issues/293
-    if backend in [Backend.CDSE, Backend.CDSE_STAGING, Backend.FED]:
-        cube = cube.apply_dimension(
-            dimension="bands",
-            process=lambda x: array_create(
-                [
-                    if_(
-                        is_nodata(x[0]),
-                        1,
-                        power(base=10, p=(10.0 * x[0].log(base=10) + 83.0) / 20.0),
-                    ),
-                    if_(
-                        is_nodata(x[1]),
-                        1,
-                        power(base=10, p=(10.0 * x[1].log(base=10) + 83.0) / 20.0),
-                    ),
-                ]
-            ),
-        )
-    else:
-        cube = cube.apply_dimension(
-            dimension="bands",
-            process=lambda x: array_create(
-                [
-                    power(base=10, p=(10.0 * x[0].log(base=10) + 83.0) / 20.0),
-                    power(base=10, p=(10.0 * x[1].log(base=10) + 83.0) / 20.0),
-                ]
-            ),
-        )
+    # Apply rescaling of power values in a logarithmic way
+    cube = cube.apply_dimension(
+        dimension="bands",
+        process=lambda x: array_create(
+            [
+                power(base=10, p=(10.0 * x[0].log(base=10) + 83.0) / 20.0),
+                power(base=10, p=(10.0 * x[1].log(base=10) + 83.0) / 20.0),
+            ]
+        ),
+    )
 
     # Change the data type to uint16 for optimization purposes
     return cube.linear_scale_range(1, 65534, 1, 65534)
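With the backend-specific branch removed, both code paths collapse to one expression that maps linear backscatter power x to 10 ** ((10 * log10(x) + 83) / 20) before the uint16 clip in linear_scale_range. A plain-Python check of the arithmetic (no openEO connection needed):

    import math

    def compress(sigma0: float) -> float:
        # Same formula as the openEO expression above.
        return 10 ** ((10.0 * math.log10(sigma0) + 83.0) / 20.0)

    print(round(compress(0.01)))  # -20 dB backscatter -> ~1413
    print(round(compress(1.0)))   #   0 dB backscatter -> ~14125
    print(round(compress(20.0)))  # +13 dB backscatter -> ~63171, below the 65534 clip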
@@ -29,7 +29,7 @@ PLATFORM = {
 
 INSTRUMENTS = {"sentinel2": ["msi"], "sentinel1": ["c-sar"]}
 
-GSD = {"sentinel2": [10, 20, 60], "sentinel1": [10]}
+GSD = {"sentinel2": [10, 20, 60], "sentinel1": [20]}
 
 SUMMARIES = {
     "sentinel2": pystac.summaries.Summaries(
@@ -1,8 +1,11 @@
 """This sub-module contains utilitary function and tools for OpenEO-GFMap"""
 
+import logging
+
 from openeo_gfmap.utils.build_df import load_json
 from openeo_gfmap.utils.intervals import quintad_intervals
 from openeo_gfmap.utils.netcdf import update_nc_attributes
+from openeo_gfmap.utils.split_stac import split_collection_by_epsg
 from openeo_gfmap.utils.tile_processing import (
     array_bounds,
     arrays_cosine_similarity,
@@ -11,6 +14,18 @@ from openeo_gfmap.utils.tile_processing import (
     select_sar_bands,
 )
 
+_log = logging.getLogger(__name__)
+_log.setLevel(logging.INFO)
+
+ch = logging.StreamHandler()
+ch.setLevel(logging.INFO)
+
+formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ch.setFormatter(formatter)
+
+_log.addHandler(ch)
+
+
 __all__ = [
     "load_json",
     "normalize_array",
@@ -19,5 +34,6 @@ __all__ = [
     "select_sar_bands",
     "arrays_cosine_similarity",
     "quintad_intervals",
+    "split_collection_by_epsg",
     "update_nc_attributes",
 ]
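The new handler makes openeo_gfmap.utils emit timestamped INFO records to the console as soon as the sub-module is imported. A tiny illustration (_log is a private name, imported here only to show the record format):

    from openeo_gfmap.utils import _log

    # Prints something like:
    # 2024-01-01 12:00:00,000 - openeo_gfmap.utils - INFO - hello from gfmap
    _log.info("hello from gfmap")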