openeo-gfmap 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
@@ -8,25 +8,69 @@ from typing import List
 import geopandas as gpd
 import h3
 import requests
+import s2sphere
 
 from openeo_gfmap.manager import _log
 
 
-def load_s2_grid() -> gpd.GeoDataFrame:
+def load_s2_grid(web_mercator: bool = False) -> gpd.GeoDataFrame:
     """Returns a geo data frame from the S2 grid."""
     # Builds the path where the geodataframe should be
-    gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds.geojson"
+    if not web_mercator:
+        gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds_4326_v2.geoparquet"
+        url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_4326_v2.geoparquet"
+    else:
+        gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds_3857_v2.geoparquet"
+        url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_3857_v2.geoparquet"
+
     if not gdf_path.exists():
         _log.info("S2 grid not found, downloading it from artifactory.")
         # Downloads the file from the artifactory URL
         gdf_path.parent.mkdir(exist_ok=True)
         response = requests.get(
-            "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds.geojson",
+            url,
             timeout=180,  # 3mins
         )
+        if response.status_code != 200:
+            raise ValueError(
+                "Failed to download the S2 grid from the artifactory. "
+                f"Status code: {response.status_code}"
+            )
         with open(gdf_path, "wb") as f:
             f.write(response.content)
-    return gpd.read_file(gdf_path)
+    return gpd.read_parquet(gdf_path)
+
+
+def load_s2_grid_centroids(web_mercator: bool = False) -> gpd.GeoDataFrame:
+    """Returns a geo data frame from the S2 grid centroids."""
+    # Builds the path where the geodataframe should be
+    if not web_mercator:
+        gdf_path = (
+            Path.home() / ".openeo-gfmap" / "s2grid_bounds_4326_centroids.geoparquet"
+        )
+        url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_4326_centroids.geoparquet"
+    else:
+        gdf_path = (
+            Path.home() / ".openeo-gfmap" / "s2grid_bounds_3857_centroids.geoparquet"
+        )
+        url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_3857_centroids.geoparquet"
+
+    if not gdf_path.exists():
+        _log.info("S2 grid centroids not found, downloading it from artifactory.")
+        # Downloads the file from the artifactory URL
+        gdf_path.parent.mkdir(exist_ok=True)
+        response = requests.get(
+            url,
+            timeout=180,  # 3mins
+        )
+        if response.status_code != 200:
+            raise ValueError(
+                "Failed to download the S2 grid centroids from the artifactory. "
+                f"Status code: {response.status_code}"
+            )
+        with open(gdf_path, "wb") as f:
+            f.write(response.content)
+    return gpd.read_parquet(gdf_path)
 
 
 def _resplit_group(
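Both loaders cache the grid as a GeoParquet file under ~/.openeo-gfmap/ and download it from Artifactory only on a cache miss; the web_mercator flag selects between the EPSG:4326 and EPSG:3857 variants. A minimal usage sketch, assuming the functions live in openeo_gfmap.manager.job_splitters (the module path is inferred from the surrounding diff context):

    from openeo_gfmap.manager.job_splitters import load_s2_grid, load_s2_grid_centroids

    # First call downloads the GeoParquet file into ~/.openeo-gfmap/; later calls read the cache.
    grid = load_s2_grid()                                  # tile bounds in EPSG:4326
    centroids = load_s2_grid_centroids(web_mercator=True)  # tile centroids in EPSG:3857
    print(grid.crs, len(grid))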
@@ -38,7 +82,7 @@ def _resplit_group(
 
 
 def split_job_s2grid(
-    polygons: gpd.GeoDataFrame, max_points: int = 500
+    polygons: gpd.GeoDataFrame, max_points: int = 500, web_mercator: bool = False
 ) -> List[gpd.GeoDataFrame]:
     """Split a job into multiple jobs from the position of the polygons/points. The centroid of
     the geometries to extract are used to select tile in the Sentinel-2 tile grid.
@@ -60,17 +104,24 @@ def split_job_s2grid(
     if polygons.crs is None:
         raise ValueError("The GeoDataFrame must contain a CRS")
 
-    polygons = polygons.to_crs(epsg=4326)
-    if polygons.geometry.geom_type[0] != "Point":
-        polygons["geometry"] = polygons.geometry.centroid
+    epsg = 3857 if web_mercator else 4326
 
-    # Dataset containing all the S2 tiles, find the nearest S2 tile for each point
-    s2_grid = load_s2_grid()
-    s2_grid["geometry"] = s2_grid.geometry.centroid
+    original_crs = polygons.crs
 
-    polygons = gpd.sjoin_nearest(polygons, s2_grid[["tile", "geometry"]]).drop(
-        columns=["index_right"]
-    )
+    polygons = polygons.to_crs(epsg=epsg)
+
+    polygons["centroid"] = polygons.geometry.centroid
+
+    # Dataset containing all the S2 tile centroids, find the nearest S2 tile for each point
+    s2_grid = load_s2_grid_centroids(web_mercator)
+
+    s2_grid = s2_grid[s2_grid.cdse_valid]
+
+    polygons = gpd.sjoin_nearest(
+        polygons.set_geometry("centroid"), s2_grid[["tile", "geometry"]]
+    ).drop(columns=["index_right", "centroid"])
+
+    polygons = polygons.set_geometry("geometry").to_crs(original_crs)
 
     split_datasets = []
     for _, sub_gdf in polygons.groupby("tile"):
@@ -86,12 +137,15 @@ def append_h3_index(
     polygons: gpd.GeoDataFrame, grid_resolution: int = 3
 ) -> gpd.GeoDataFrame:
     """Append the H3 index to the polygons."""
-    if polygons.geometry.geom_type[0] != "Point":
-        geom_col = polygons.geometry.centroid
-    else:
-        geom_col = polygons.geometry
+
+    # Project to Web mercator to calculate centroids
+    polygons = polygons.to_crs(epsg=3857)
+    geom_col = polygons.geometry.centroid
+    # Project to lat lon to calculate the h3 index
+    geom_col = geom_col.to_crs(epsg=4326)
+
     polygons["h3index"] = geom_col.apply(
-        lambda pt: h3.geo_to_h3(pt.y, pt.x, grid_resolution)
+        lambda pt: h3.latlng_to_cell(pt.y, pt.x, grid_resolution)
     )
     return polygons
 
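The lambda change tracks the h3-py v4 API, where geo_to_h3 was renamed to latlng_to_cell; the (lat, lng, resolution) argument order is unchanged. A standalone check:

    import h3  # requires h3-py >= 4

    # v3: h3.geo_to_h3(lat, lng, res)  ->  v4: h3.latlng_to_cell(lat, lng, res)
    cell = h3.latlng_to_cell(50.85, 4.35, 3)
    print(cell)  # a 15-character hex cell index at resolution 3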
@@ -127,12 +181,13 @@ def split_job_hex(
     if polygons.crs is None:
         raise ValueError("The GeoDataFrame must contain a CRS")
 
-    # Project to lat/lon positions
-    polygons = polygons.to_crs(epsg=4326)
+    original_crs = polygons.crs
 
     # Split the polygons into multiple jobs
     polygons = append_h3_index(polygons, grid_resolution)
 
+    polygons = polygons.to_crs(original_crs)
+
     split_datasets = []
     for _, sub_gdf in polygons.groupby("h3index"):
         if len(sub_gdf) > max_points:
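split_job_hex now keeps the caller's CRS instead of forcing EPSG:4326 onto the output. A short sketch, assuming the signature split_job_hex(polygons, max_points=500, grid_resolution=3) implied by the call to append_h3_index:

    import geopandas as gpd
    from shapely.geometry import Point

    from openeo_gfmap.manager.job_splitters import split_job_hex

    points = gpd.GeoDataFrame(
        {"id": [1, 2]},
        geometry=[Point(4.35, 50.85), Point(5.57, 50.63)],
        crs="EPSG:4326",
    )
    jobs = split_job_hex(points, max_points=500, grid_resolution=3)
    for job in jobs:
        print(job["h3index"].iloc[0], len(job), job.crs)  # CRS matches the input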
@@ -142,3 +197,96 @@ def split_job_hex(
     split_datasets.append(sub_gdf.reset_index(drop=True))
 
     return split_datasets
+
+
+def split_job_s2sphere(
+    gdf: gpd.GeoDataFrame, max_points=500, start_level=8
+) -> List[gpd.GeoDataFrame]:
+    """
+    EXPERIMENTAL
+    Split a GeoDataFrame into multiple groups based on the S2geometry cell ID of each geometry.
+
+    S2geometry is a library that provides a way to index and query spatial data. This function splits
+    the GeoDataFrame into groups based on the S2 cell ID of each geometry, based on it's centroid.
+
+    If a cell contains more points than max_points, it will be recursively split into
+    smaller cells until each cell contains at most max_points points.
+
+    More information on S2geometry can be found at https://s2geometry.io/
+    An overview of the S2 cell hierarchy can be found at https://s2geometry.io/resources/s2cell_statistics.html
+
+    :param gdf: GeoDataFrame containing points to split
+    :param max_points: Maximum number of points per group
+    :param start_level: Starting S2 cell level
+    :return: List of GeoDataFrames containing the split groups
+    """
+
+    if "geometry" not in gdf.columns:
+        raise ValueError("The GeoDataFrame must contain a 'geometry' column.")
+
+    if gdf.crs is None:
+        raise ValueError("The GeoDataFrame must contain a CRS")
+
+    # Store the original CRS of the GeoDataFrame and reproject to EPSG:3857
+    original_crs = gdf.crs
+    gdf = gdf.to_crs(epsg=3857)
+
+    # Add a centroid column to the GeoDataFrame and convert it to EPSG:4326
+    gdf["centroid"] = gdf.geometry.centroid
+
+    # Reproject the GeoDataFrame to its orginial CRS
+    gdf = gdf.to_crs(original_crs)
+
+    # Set the GeoDataFrame's geometry to the centroid column and reproject to EPSG:4326
+    gdf = gdf.set_geometry("centroid")
+    gdf = gdf.to_crs(epsg=4326)
+
+    # Create a dictionary to store points by their S2 cell ID
+    cell_dict = {}
+
+    # Iterate over each point in the GeoDataFrame
+    for idx, row in gdf.iterrows():
+        # Get the S2 cell ID for the point at a given level
+        cell_id = _get_s2cell_id(row.centroid, start_level)
+
+        if cell_id not in cell_dict:
+            cell_dict[cell_id] = []
+
+        cell_dict[cell_id].append(row)
+
+    result_groups = []
+
+    # Function to recursively split cells if they contain more points than max_points
+    def _split_s2cell(cell_id, points, current_level=start_level):
+        if len(points) <= max_points:
+            if len(points) > 0:
+                points = gpd.GeoDataFrame(
+                    points, crs=original_crs, geometry="geometry"
+                ).drop(columns=["centroid"])
+                points["s2sphere_cell_id"] = cell_id
+                points["s2sphere_cell_level"] = current_level
+                result_groups.append(gpd.GeoDataFrame(points))
+        else:
+            children = s2sphere.CellId(cell_id).children()
+            child_cells = {child.id(): [] for child in children}
+
+            for point in points:
+                child_cell_id = _get_s2cell_id(point.centroid, current_level + 1)
+                child_cells[child_cell_id].append(point)
+
+            for child_cell_id, child_points in child_cells.items():
+                _split_s2cell(child_cell_id, child_points, current_level + 1)
+
+    # Split cells that contain more points than max_points
+    for cell_id, points in cell_dict.items():
+        _split_s2cell(cell_id, points)
+
+    return result_groups
+
+
+def _get_s2cell_id(point, level):
+    lat, lon = point.y, point.x
+    cell_id = s2sphere.CellId.from_lat_lng(
+        s2sphere.LatLng.from_degrees(lat, lon)
+    ).parent(level)
+    return cell_id.id()
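The recursion leans on the S2 cell hierarchy: every cell at level L has exactly four children at level L+1, so a point's cell ID at level L+1 is always one of the CellId.children() of its level-L cell, and the child_cells lookup cannot miss. A small s2sphere sketch of that invariant:

    import s2sphere

    pt = s2sphere.LatLng.from_degrees(50.85, 4.35)
    cell = s2sphere.CellId.from_lat_lng(pt).parent(8)

    # The point's level-9 cell is one of the four level-9 children of its level-8 cell.
    children = list(cell.children())
    child = s2sphere.CellId.from_lat_lng(pt).parent(9)
    assert len(children) == 4
    assert child.id() in [c.id() for c in children]
    print(cell.id(), child.level())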
@@ -1,9 +1,9 @@
 """Routines to pre-process sar signals."""
 
 import openeo
-from openeo.processes import array_create, if_, is_nodata, power
+from openeo.processes import array_create, power
 
-from openeo_gfmap import Backend, BackendContext
+from openeo_gfmap import BackendContext
 
 
 def compress_backscatter_uint16(
@@ -27,38 +27,17 @@ def compress_backscatter_uint16(
     openeo.DataCube
         The datacube with the backscatter values compressed to uint16.
     """
-    backend = backend_context.backend
 
-    # Additional check related to problematic values present in creodias collections.
-    # https://github.com/Open-EO/openeo-geopyspark-driver/issues/293
-    if backend in [Backend.CDSE, Backend.CDSE_STAGING, Backend.FED]:
-        cube = cube.apply_dimension(
-            dimension="bands",
-            process=lambda x: array_create(
-                [
-                    if_(
-                        is_nodata(x[0]),
-                        1,
-                        power(base=10, p=(10.0 * x[0].log(base=10) + 83.0) / 20.0),
-                    ),
-                    if_(
-                        is_nodata(x[1]),
-                        1,
-                        power(base=10, p=(10.0 * x[1].log(base=10) + 83.0) / 20.0),
-                    ),
-                ]
-            ),
-        )
-    else:
-        cube = cube.apply_dimension(
-            dimension="bands",
-            process=lambda x: array_create(
-                [
-                    power(base=10, p=(10.0 * x[0].log(base=10) + 83.0) / 20.0),
-                    power(base=10, p=(10.0 * x[1].log(base=10) + 83.0) / 20.0),
-                ]
-            ),
-        )
+    # Apply rescaling of power values in a logarithmic way
+    cube = cube.apply_dimension(
+        dimension="bands",
+        process=lambda x: array_create(
+            [
+                power(base=10, p=(10.0 * x[0].log(base=10) + 83.0) / 20.0),
+                power(base=10, p=(10.0 * x[1].log(base=10) + 83.0) / 20.0),
+            ]
+        ),
+    )
 
     # Change the data type to uint16 for optimization purposes
     return cube.linear_scale_range(1, 65534, 1, 65534)
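With the backend-specific branch removed, both code paths collapse to one expression that maps linear backscatter power x to 10 ** ((10 * log10(x) + 83) / 20) before the uint16 clip in linear_scale_range. A plain-Python check of the arithmetic (no openEO connection needed):

    import math

    def compress(sigma0: float) -> float:
        # Same formula as the openEO expression above.
        return 10 ** ((10.0 * math.log10(sigma0) + 83.0) / 20.0)

    print(round(compress(0.01)))  # -20 dB backscatter -> ~1413
    print(round(compress(1.0)))   #   0 dB backscatter -> ~14125
    print(round(compress(20.0)))  # +13 dB backscatter -> ~63171, below the 65534 clip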
@@ -29,7 +29,7 @@ PLATFORM = {
 
 INSTRUMENTS = {"sentinel2": ["msi"], "sentinel1": ["c-sar"]}
 
-GSD = {"sentinel2": [10, 20, 60], "sentinel1": [10]}
+GSD = {"sentinel2": [10, 20, 60], "sentinel1": [20]}
 
 SUMMARIES = {
     "sentinel2": pystac.summaries.Summaries(
@@ -1,8 +1,11 @@
 """This sub-module contains utilitary function and tools for OpenEO-GFMap"""
 
+import logging
+
 from openeo_gfmap.utils.build_df import load_json
 from openeo_gfmap.utils.intervals import quintad_intervals
 from openeo_gfmap.utils.netcdf import update_nc_attributes
+from openeo_gfmap.utils.split_stac import split_collection_by_epsg
 from openeo_gfmap.utils.tile_processing import (
     array_bounds,
     arrays_cosine_similarity,
@@ -11,6 +14,18 @@ from openeo_gfmap.utils.tile_processing import (
     select_sar_bands,
 )
 
+_log = logging.getLogger(__name__)
+_log.setLevel(logging.INFO)
+
+ch = logging.StreamHandler()
+ch.setLevel(logging.INFO)
+
+formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ch.setFormatter(formatter)
+
+_log.addHandler(ch)
+
+
 __all__ = [
     "load_json",
     "normalize_array",
@@ -19,5 +34,6 @@ __all__ = [
     "select_sar_bands",
     "arrays_cosine_similarity",
     "quintad_intervals",
+    "split_collection_by_epsg",
     "update_nc_attributes",
 ]
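The new handler makes openeo_gfmap.utils emit timestamped INFO records to the console as soon as the sub-module is imported. A tiny illustration (_log is a private name, imported here only to show the record format):

    from openeo_gfmap.utils import _log

    # Prints something like:
    # 2024-01-01 12:00:00,000 - openeo_gfmap.utils - INFO - hello from gfmap
    _log.info("hello from gfmap")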