openeo-gfmap 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,11 @@
  import json
+ import pickle
  import threading
+ import time
  from concurrent.futures import ThreadPoolExecutor
- from enum import Enum
  from functools import partial
  from pathlib import Path
+ from threading import Lock
  from typing import Callable, Optional, Union

  import pandas as pd
@@ -16,28 +18,62 @@ from openeo_gfmap.manager import _log
  from openeo_gfmap.stac import constants

  # Lock to use when writing to the STAC collection
- _stac_lock = threading.Lock()
+ _stac_lock = Lock()
+
+
+ def retry_on_exception(max_retries: int, delay_s: int = 180):
+     """Decorator to retry a function if an exception occurs.
+     Used for post-job actions that can crash due to internal backend issues. Restarting the action
+     usually helps to solve the issue.
+
+     Parameters
+     ----------
+     max_retries: int
+         The maximum number of retries to attempt before finally raising the exception.
+     delay_s: int (default=180 seconds)
+         The delay in seconds to wait before retrying the decorated function.
+     """
+
+     def decorator(func):
+         def wrapper(*args, **kwargs):
+             latest_exception = None
+             for _ in range(max_retries):
+                 try:
+                     return func(*args, **kwargs)
+                 except Exception as e:
+                     time.sleep(
+                         delay_s
+                     )  # Waits before retrying, while allowing other futures to run.
+                     latest_exception = e
+             raise latest_exception
+
+         return wrapper
+
+     return decorator


  def done_callback(future, df, idx):
-     """Sets the status of the job to the given status when the future is done."""
+     """Changes the status of the job when the post-job action future is done."""
      current_status = df.loc[idx, "status"]
-     if not future.exception():
+     exception = future.exception()
+     if exception is None:
          if current_status == "postprocessing":
              df.loc[idx, "status"] = "finished"
          elif current_status == "postprocessing-error":
              df.loc[idx, "status"] = "error"
+         elif current_status == "running":
+             df.loc[idx, "status"] = "running"
          else:
              raise ValueError(
                  f"Invalid status {current_status} for job {df.loc[idx, 'id']} for done_callback!"
              )
-
-
- class PostJobStatus(Enum):
-     """Indicates the workers if the job finished as sucessful or with an error."""
-
-     FINISHED = "finished"
-     ERROR = "error"
+     else:
+         _log.exception(
+             "Exception occurred in post-job future for job %s:\n%s",
+             df.loc[idx, "id"],
+             exception,
+         )
+         df.loc[idx, "status"] = "error"


  class GFMAPJobManager(MultiBackendJobManager):
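Note: the new retry_on_exception decorator simply re-invokes the wrapped callable up to max_retries times, sleeping delay_s seconds after each failure and re-raising the last exception once the attempts are exhausted. A minimal sketch of that behaviour, assuming the decorator is importable from openeo_gfmap.manager.job_manager (the flaky_post_job_action function below is purely illustrative):

from openeo_gfmap.manager.job_manager import retry_on_exception  # assumed import path

attempts = {"count": 0}


@retry_on_exception(max_retries=3, delay_s=1)
def flaky_post_job_action():
    # Illustrative action that fails twice before succeeding on the third attempt.
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise RuntimeError("transient backend error")
    return "done"


print(flaky_post_job_action())  # sleeps 1 s after each failure, prints "done" on the third try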
@@ -53,13 +89,50 @@ class GFMAPJobManager(MultiBackendJobManager):
          post_job_action: Optional[Callable] = None,
          poll_sleep: int = 5,
          n_threads: int = 1,
-         post_job_params: dict = {},
          resume_postproc: bool = True,  # If we need to check for post-job actions that crashed
          restart_failed: bool = False,  # If we need to restart failed jobs
+         stac_enabled: bool = True,
      ):
+         """
+         Initializes the GFMAP job manager.
+
+         Parameters
+         ----------
+         output_dir: Path
+             The base output directory where the results/stac/logs of the jobs will be stored.
+         output_path_generator: Callable
+             User defined function that generates the output path for the job results. Expects as
+             inputs the output directory, the index of the job in the job dataframe
+             and the row of the job, and returns the final path where to save a job result asset.
+         collection_id: Optional[str]
+             The ID of the STAC collection that is being generated. Can be left empty if the STAC
+             catalogue is not being generated or if it is being resumed from an existing catalogue.
+         collection_description: Optional[str]
+             The description of the STAC collection that is being generated.
+         stac: Optional[Union[str, Path]]
+             The path to the STAC collection to be saved or resumed.
+             If None, the default path will be used.
+         post_job_action: Optional[Callable]
+             A user defined function that will be called after a job is finished. It will receive
+             the list of items generated by the job and the row of the job, and should return the
+             updated list of items.
+         poll_sleep: int
+             The time in seconds to wait between polling the backend for job status.
+         n_threads: int
+             The number of threads to execute `on_job_done` and `on_job_error` functions.
+         resume_postproc: bool
+             If set to true, all `on_job_done` and `on_job_error` functions that failed are resumed.
+         restart_failed: bool
+             If set to true, all jobs that failed within the OpenEO backend are restarted.
+         stac_enabled: bool (default=True)
+             If the STAC generation is enabled or not. Disabling it will prevent the creation,
+             update and loading of the STAC collection.
+         """
          self._output_dir = output_dir
+         self._catalogue_cache = output_dir / "catalogue_cache.bin"

          self.stac = stac
+         self.stac_enabled = stac_enabled
          self.collection_id = collection_id
          self.collection_description = collection_description

@@ -74,41 +147,73 @@ class GFMAPJobManager(MultiBackendJobManager):

          self._output_path_gen = output_path_generator
          self._post_job_action = post_job_action
-         self._post_job_params = post_job_params

          # Monkey patching the _normalize_df method to ensure we have no modification on the
          # geometry column
          MultiBackendJobManager._normalize_df = self._normalize_df
          super().__init__(poll_sleep)

-         self._root_collection = self._normalize_stac()
+         if self.stac_enabled:
+             self._root_collection = self._initialize_stac()

-     def _normalize_stac(self):
+     def _load_stac(self) -> Optional[pystac.Collection]:
+         """
+         Loads the STAC collection from the cache, the specified `stac` path or the default path.
+         If no STAC collection is found, returns None.
+         """
          default_collection_path = self._output_dir / "stac/collection.json"
-         if self.stac is not None:
+         if self._catalogue_cache.exists():
              _log.info(
-                 f"Reloading the STAC collection from the provided path: {self.stac}."
+                 "Loading the STAC collection from the persisted binary file: %s.",
+                 self._catalogue_cache,
              )
-             root_collection = pystac.read_file(str(self.stac))
+             with open(self._catalogue_cache, "rb") as file:
+                 return pickle.load(file)
+         elif self.stac is not None:
+             _log.info(
+                 "Reloading the STAC collection from the provided path: %s.", self.stac
+             )
+             return pystac.read_file(str(self.stac))
          elif default_collection_path.exists():
              _log.info(
-                 f"Reload the STAC collection from the default path: {default_collection_path}."
+                 "Reload the STAC collection from the default path: %s.",
+                 default_collection_path,
              )
              self.stac = default_collection_path
-             root_collection = pystac.read_file(str(self.stac))
-         else:
-             _log.info("Starting a fresh STAC collection.")
-             assert (
-                 self.collection_id is not None
-             ), "A collection ID is required to generate a STAC collection."
-             root_collection = pystac.Collection(
-                 id=self.collection_id,
-                 description=self.collection_description,
-                 extent=None,
+             return pystac.read_file(str(self.stac))
+
+         _log.info(
+             "No STAC collection found as cache, in the default path or in the provided path."
+         )
+         return None
+
+     def _create_stac(self) -> pystac.Collection:
+         """
+         Creates and returns new STAC collection. The created stac collection will use the
+         `collection_id` and `collection_description` parameters set in the constructor.
+         """
+         if self.collection_id is None:
+             raise ValueError(
+                 "A collection ID is required to generate a STAC collection."
              )
-         root_collection.license = constants.LICENSE
-         root_collection.add_link(constants.LICENSE_LINK)
-         root_collection.stac_extensions = constants.STAC_EXTENSIONS
+         collection = pystac.Collection(
+             id=self.collection_id,
+             description=self.collection_description,
+             extent=None,
+         )
+         collection.license = constants.LICENSE
+         collection.add_link(constants.LICENSE_LINK)
+         collection.stac_extensions = constants.STAC_EXTENSIONS
+         return collection
+
+     def _initialize_stac(self) -> pystac.Collection:
+         """
+         Loads and returns if possible an existing stac collection, otherwise creates a new one.
+         """
+         root_collection = self._load_stac()
+         if not root_collection:
+             _log.info("Starting a fresh STAC collection.")
+             root_collection = self._create_stac()

          return root_collection

@@ -150,24 +255,40 @@ class GFMAPJobManager(MultiBackendJobManager):
              job = connection.job(row.id)
              if row.status == "postprocessing":
                  _log.info(
-                     f"Resuming postprocessing of job {row.id}, queueing on_job_finished..."
+                     "Resuming postprocessing of job %s, queueing on_job_finished...",
+                     row.id,
+                 )
+                 future = self._executor.submit(self.on_job_done, job, row, _stac_lock)
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
                  )
-                 future = self._executor.submit(self.on_job_done, job, row)
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
              else:
                  _log.info(
-                     f"Resuming postprocessing of job {row.id}, queueing on_job_error..."
+                     "Resuming postprocessing of job %s, queueing on_job_error...",
+                     row.id,
                  )
                  future = self._executor.submit(self.on_job_error, job, row)
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
              self._futures.append(future)

      def _restart_failed_jobs(self, df: pd.DataFrame):
          """Sets-up failed jobs as "not_started" as they will be restarted by the manager."""
-         failed_tasks = df[df.status == "error"]
+         failed_tasks = df[df.status.isin(["error", "start_failed"])]
          not_started_tasks = df[df.status == "not_started"]
          _log.info(
-             f"Resetting {len(failed_tasks)} failed jobs to 'not_started'. {len(not_started_tasks)} jobs are already 'not_started'."
+             "Resetting %s failed jobs to 'not_started'. %s jobs are already 'not_started'.",
+             len(failed_tasks),
+             len(not_started_tasks),
          )
          for idx, _ in failed_tasks.iterrows():
              df.loc[idx, "status"] = "not_started"
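Note: the constructor parameters documented in the hunks above translate into a call along these lines. This is a hedged sketch: the import path, directory and collection values are illustrative, and generate_output_path only mirrors the documented (output directory, index, row) signature of output_path_generator.

from pathlib import Path

from openeo_gfmap.manager.job_manager import GFMAPJobManager  # assumed import path


def generate_output_path(root_dir: Path, index: int, row) -> Path:
    # Illustrative: one output file per job row and asset index.
    return root_dir / "results" / f"{row['id']}_{index}.nc"


manager = GFMAPJobManager(
    output_dir=Path("/tmp/gfmap_run"),
    output_path_generator=generate_output_path,
    collection_id="example-extractions",  # required whenever STAC generation is enabled
    collection_description="Example extraction collection",
    poll_sleep=60,
    n_threads=2,
    restart_failed=True,  # jobs in "error" or "start_failed" are reset to "not_started"
    stac_enabled=True,  # set to False to skip creating/updating the STAC catalogue
)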
@@ -203,27 +324,53 @@ class GFMAPJobManager(MultiBackendJobManager):
                  job_metadata["status"] == "finished"
              ):
                  _log.info(
-                     f"Job {job.job_id} finished successfully, queueing on_job_done..."
+                     "Job %s finished successfully, queueing on_job_done...", job.job_id
                  )
                  job_status = "postprocessing"
-                 future = self._executor.submit(self.on_job_done, job, row)
+                 future = self._executor.submit(self.on_job_done, job, row, _stac_lock)
                  # Future will setup the status to finished when the job is done
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
                  self._futures.append(future)
-                 df.loc[idx, "costs"] = job_metadata["costs"]
+                 if "costs" in job_metadata:
+                     df.loc[idx, "costs"] = job_metadata["costs"]
+                     df.loc[idx, "memory"] = (
+                         job_metadata["usage"]
+                         .get("max_executor_memory", {})
+                         .get("value", None)
+                     )
+
+                 else:
+                     _log.warning(
+                         "Costs not found in job %s metadata. Costs will be set to 'None'.",
+                         job.job_id,
+                     )

              # Case in which it failed
              if (df.loc[idx, "status"] != "error") and (
                  job_metadata["status"] == "error"
              ):
                  _log.info(
-                     f"Job {job.job_id} finished with error, queueing on_job_error..."
+                     "Job %s finished with error, queueing on_job_error...",
+                     job.job_id,
                  )
                  job_status = "postprocessing-error"
                  future = self._executor.submit(self.on_job_error, job, row)
                  # Future will setup the status to error when the job is done
-                 future.add_done_callback(partial(done_callback, df=df, idx=idx))
+                 future.add_done_callback(
+                     partial(
+                         done_callback,
+                         df=df,
+                         idx=idx,
+                     )
+                 )
                  self._futures.append(future)
+                 if "costs" in job_metadata:
                      df.loc[idx, "costs"] = job_metadata["costs"]

              df.loc[idx, "status"] = job_status
@@ -231,6 +378,7 @@ class GFMAPJobManager(MultiBackendJobManager):
          # Clear the futures that are done and raise their potential exceptions if they occurred.
          self._clear_queued_actions()

+     @retry_on_exception(max_retries=2, delay_s=180)
      def on_job_error(self, job: BatchJob, row: pd.Series):
          """Method called when a job finishes with an error.

@@ -241,7 +389,14 @@ class GFMAPJobManager(MultiBackendJobManager):
          row: pd.Series
              The row in the dataframe that contains the job relative information.
          """
-         logs = job.logs()
+         try:
+             logs = job.logs()
+         except Exception as e:  # pylint: disable=broad-exception-caught
+             _log.exception(
+                 "Error getting logs in `on_job_error` for job %s:\n%s", job.job_id, e
+             )
+             logs = []
+
          error_logs = [log for log in logs if log.level.lower() == "error"]

          job_metadata = job.describe_job()
@@ -260,15 +415,21 @@ class GFMAPJobManager(MultiBackendJobManager):
                  f"Couldn't find any error logs. Please check the error manually on job ID: {job.job_id}."
              )

-     def on_job_done(self, job: BatchJob, row: pd.Series):
+     @retry_on_exception(max_retries=2, delay_s=30)
+     def on_job_done(
+         self, job: BatchJob, row: pd.Series, lock: Lock
+     ):  # pylint: disable=arguments-differ
          """Method called when a job finishes successfully. It will first download the results of
          the job and then call the `post_job_action` method.
          """
+
          job_products = {}
          for idx, asset in enumerate(job.get_results().get_assets()):
              try:
                  _log.debug(
-                     f"Generating output path for asset {asset.name} from job {job.job_id}..."
+                     "Generating output path for asset %s from job %s...",
+                     asset.name,
+                     job.job_id,
                  )
                  output_path = self._output_path_gen(self._output_dir, idx, row)
                  # Make the output path
@@ -277,11 +438,17 @@ class GFMAPJobManager(MultiBackendJobManager):
                  # Add to the list of downloaded products
                  job_products[f"{job.job_id}_{asset.name}"] = [output_path]
                  _log.debug(
-                     f"Downloaded {asset.name} from job {job.job_id} -> {output_path}"
+                     "Downloaded %s from job %s -> %s",
+                     asset.name,
+                     job.job_id,
+                     output_path,
                  )
              except Exception as e:
                  _log.exception(
-                     f"Error downloading asset {asset.name} from job {job.job_id}", e
+                     "Error downloading asset %s from job %s:\n%s",
+                     asset.name,
+                     job.job_id,
+                     e,
                  )
                  raise e

@@ -302,53 +469,35 @@ class GFMAPJobManager(MultiBackendJobManager):
                      asset.href = str(
                          asset_path
                      )  # Update the asset href to the output location set by the output_path_generator
-                 # item.id = f"{job.job_id}_{item.id}"
+
                  # Add the item to the the current job items.
                  job_items.append(item)
-                 _log.info(f"Parsed item {item.id} from job {job.job_id}")
+                 _log.info("Parsed item %s from job %s", item.id, job.job_id)
              except Exception as e:
                  _log.exception(
-                     f"Error failed to add item {item.id} from job {job.job_id} to STAC collection",
+                     "Error failed to add item %s from job %s to STAC collection:\n%s",
+                     item.id,
+                     job.job_id,
                      e,
                  )
-                 raise e

          # _post_job_action returns an updated list of stac items. Post job action can therefore
          # update the stac items and access their products through the HREF. It is also the
          # reponsible of adding the appropriate metadata/assets to the items.
          if self._post_job_action is not None:
-             _log.debug(f"Calling post job action for job {job.job_id}...")
-             job_items = self._post_job_action(job_items, row, self._post_job_params)
+             _log.debug("Calling post job action for job %s...", job.job_id)
+             job_items = self._post_job_action(job_items, row)

-         _log.info(f"Adding {len(job_items)} items to the STAC collection...")
+         _log.info("Adding %s items to the STAC collection...", len(job_items))

-         with _stac_lock:  # Take the STAC lock to avoid concurrence issues
-             # Filters the job items to only keep the ones that are not already in the collection
-             existing_ids = [item.id for item in self._root_collection.get_all_items()]
-             job_items = [item for item in job_items if item.id not in existing_ids]
+         if self.stac_enabled:
+             with lock:
+                 self._update_stac(job.job_id, job_items)

-             self._root_collection.add_items(job_items)
-             _log.info(f"Added {len(job_items)} items to the STAC collection.")
-
-             _log.info(f"Writing STAC collection for {job.job_id} to file...")
-             try:
-                 self._write_stac()
-             except Exception as e:
-                 _log.exception(
-                     f"Error writing STAC collection for job {job.job_id} to file.", e
-                 )
-                 raise e
-             _log.info(f"Wrote STAC collection for {job.job_id} to file.")
-
-         _log.info(f"Job {job.job_id} and post job action finished successfully.")
+         _log.info("Job %s and post job action finished successfully.", job.job_id)

      def _normalize_df(self, df: pd.DataFrame) -> pd.DataFrame:
-         """Ensure we have the required columns and the expected type for the geometry column.
-
-         :param df: The dataframe to normalize.
-         :return: a new dataframe that is normalized.
-         """
-
+         """Ensure we have the required columns and the expected type for the geometry column."""
          # check for some required columns.
          required_with_default = [
              ("status", "not_started"),
@@ -366,7 +515,7 @@ class GFMAPJobManager(MultiBackendJobManager):
          }
          df = df.assign(**new_columns)

-         _log.debug(f"Normalizing dataframe. Columns: {df.columns}")
+         _log.debug("Normalizing dataframe. Columns: %s", df.columns)

          return df

@@ -401,7 +550,7 @@ class GFMAPJobManager(MultiBackendJobManager):
              The file to track the results of the jobs.
          """
          # Starts the thread pool to work on the on_job_done and on_job_error methods
-         _log.info(f"Starting ThreadPoolExecutor with {self._n_threads} workers.")
+         _log.info("Starting ThreadPoolExecutor with %s workers.", self._n_threads)
          with ThreadPoolExecutor(max_workers=self._n_threads) as executor:
              _log.info("Creating and running jobs.")
              self._executor = executor
@@ -412,6 +561,13 @@ class GFMAPJobManager(MultiBackendJobManager):
              self._wait_queued_actions()
              _log.info("Exiting ThreadPoolExecutor.")
              self._executor = None
+         _log.info("All jobs finished running.")
+         if self.stac_enabled:
+             _log.info("Saving persisted STAC collection to final .json collection.")
+             self._write_stac()
+             _log.info("Saved STAC catalogue to JSON format, all tasks finished!")
+         else:
+             _log.info("STAC was disabled, skipping generation of the catalogue.")

      def _write_stac(self):
          """Writes the STAC collection to the output directory."""
@@ -428,6 +584,36 @@ class GFMAPJobManager(MultiBackendJobManager):
          self._root_collection.normalize_hrefs(str(root_path))
          self._root_collection.save(catalog_type=CatalogType.SELF_CONTAINED)

+     def _persist_stac(self):
+         """Persists the STAC collection by saving it into a binary file."""
+         _log.debug("Validating the STAC collection before persisting.")
+         self._root_collection.validate_all()
+         _log.info("Persisting STAC collection to temp file %s.", self._catalogue_cache)
+         with open(self._catalogue_cache, "wb") as file:
+             pickle.dump(self._root_collection, file)
+
+     def _update_stac(self, job_id: str, job_items: list[pystac.Item]):
+         """Updates the STAC collection by adding the items generated by the job.
+         Does not add duplicates or override with the same item ID.
+         """
+         try:
+             _log.info("Thread %s entered the STAC lock.", threading.get_ident())
+             # Filters the job items to only keep the ones that are not already in the collection
+             existing_ids = [item.id for item in self._root_collection.get_all_items()]
+             job_items = [item for item in job_items if item.id not in existing_ids]
+
+             self._root_collection.add_items(job_items)
+             _log.info("Added %s items to the STAC collection.", len(job_items))
+
+             self._persist_stac()
+         except Exception as e:
+             _log.exception(
+                 "Error adding items to the STAC collection for job %s:\n%s ",
+                 job_id,
+                 str(e),
+             )
+             raise e
+
      def setup_stac(
          self,
          constellation: Optional[str] = None,
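Note: _persist_stac pickles the in-memory collection after every _update_stac call, and _load_stac prefers that cache over stac/collection.json when the manager restarts. A minimal sketch of inspecting such a cache, assuming a catalogue_cache.bin left behind by a previous run (the path below is illustrative):

import pickle
from pathlib import Path

import pystac

# Illustrative location: <output_dir>/catalogue_cache.bin as written by _persist_stac.
cache_path = Path("/tmp/gfmap_run/catalogue_cache.bin")

if cache_path.exists():
    with open(cache_path, "rb") as file:
        collection: pystac.Collection = pickle.load(file)
    print(collection.id, len(list(collection.get_all_items())))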
@@ -12,21 +12,32 @@ import requests
  from openeo_gfmap.manager import _log


- def load_s2_grid() -> gpd.GeoDataFrame:
+ def load_s2_grid(web_mercator: bool = False) -> gpd.GeoDataFrame:
      """Returns a geo data frame from the S2 grid."""
      # Builds the path where the geodataframe should be
-     gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds.geojson"
+     if not web_mercator:
+         gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds_4326_v2.geoparquet"
+         url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_4326_v2.geoparquet"
+     else:
+         gdf_path = Path.home() / ".openeo-gfmap" / "s2grid_bounds_3857_v2.geoparquet"
+         url = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds_3857_v2.geoparquet"
+
      if not gdf_path.exists():
          _log.info("S2 grid not found, downloading it from artifactory.")
          # Downloads the file from the artifactory URL
          gdf_path.parent.mkdir(exist_ok=True)
          response = requests.get(
-             "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/s2grid_bounds.geojson",
+             url,
              timeout=180,  # 3mins
          )
+         if response.status_code != 200:
+             raise ValueError(
+                 "Failed to download the S2 grid from the artifactory. "
+                 f"Status code: {response.status_code}"
+             )
          with open(gdf_path, "wb") as f:
              f.write(response.content)
-     return gpd.read_file(gdf_path)
+     return gpd.read_parquet(gdf_path)


  def _resplit_group(
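Note: load_s2_grid now fetches a GeoParquet version of the grid (v2) and can return it in either EPSG:4326 (the default) or EPSG:3857, based on the file names above. A short usage sketch, assuming the function is importable from openeo_gfmap.manager.job_splitters; the tile and cdse_valid columns are the ones the splitter relies on:

from openeo_gfmap.manager.job_splitters import load_s2_grid  # assumed import path

grid_latlon = load_s2_grid()  # EPSG:4326 grid, cached under ~/.openeo-gfmap
grid_mercator = load_s2_grid(web_mercator=True)  # EPSG:3857 variant

print(grid_latlon[["tile", "cdse_valid"]].head())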
@@ -38,7 +49,7 @@


  def split_job_s2grid(
-     polygons: gpd.GeoDataFrame, max_points: int = 500
+     polygons: gpd.GeoDataFrame, max_points: int = 500, web_mercator: bool = False
  ) -> List[gpd.GeoDataFrame]:
      """Split a job into multiple jobs from the position of the polygons/points. The centroid of
      the geometries to extract are used to select tile in the Sentinel-2 tile grid.
@@ -60,17 +71,25 @@
      if polygons.crs is None:
          raise ValueError("The GeoDataFrame must contain a CRS")

-     polygons = polygons.to_crs(epsg=4326)
-     if polygons.geometry.geom_type[0] != "Point":
-         polygons["geometry"] = polygons.geometry.centroid
+     epsg = 3857 if web_mercator else 4326
+
+     original_crs = polygons.crs
+
+     polygons = polygons.to_crs(epsg=epsg)
+
+     polygons["centroid"] = polygons.geometry.centroid

      # Dataset containing all the S2 tiles, find the nearest S2 tile for each point
-     s2_grid = load_s2_grid()
+     s2_grid = load_s2_grid(web_mercator)
      s2_grid["geometry"] = s2_grid.geometry.centroid

-     polygons = gpd.sjoin_nearest(polygons, s2_grid[["tile", "geometry"]]).drop(
-         columns=["index_right"]
-     )
+     s2_grid = s2_grid[s2_grid.cdse_valid]
+
+     polygons = gpd.sjoin_nearest(
+         polygons.set_geometry("centroid"), s2_grid[["tile", "geometry"]]
+     ).drop(columns=["index_right", "centroid"])
+
+     polygons = polygons.set_geometry("geometry").to_crs(original_crs)

      split_datasets = []
      for _, sub_gdf in polygons.groupby("tile"):
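Note: split_job_s2grid now preserves the caller's geometries and CRS: centroids are computed in a temporary centroid column, joined to the nearest CDSE-valid Sentinel-2 tile, and the result is reprojected back to the original CRS before grouping by tile. A hedged usage sketch with illustrative points (import path assumed as above):

import geopandas as gpd
from shapely.geometry import Point

from openeo_gfmap.manager.job_splitters import split_job_s2grid  # assumed import path

points = gpd.GeoDataFrame(
    {"sample_id": ["a", "b", "c"]},
    geometry=[Point(4.35, 50.85), Point(4.40, 50.90), Point(5.57, 50.63)],
    crs="EPSG:4326",
)

# One GeoDataFrame per Sentinel-2 tile, split further when a tile exceeds max_points.
jobs = split_job_s2grid(points, max_points=2, web_mercator=False)
for job_gdf in jobs:
    print(job_gdf["tile"].iloc[0], len(job_gdf))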
@@ -86,10 +105,13 @@ def append_h3_index(
      polygons: gpd.GeoDataFrame, grid_resolution: int = 3
  ) -> gpd.GeoDataFrame:
      """Append the H3 index to the polygons."""
-     if polygons.geometry.geom_type[0] != "Point":
-         geom_col = polygons.geometry.centroid
-     else:
-         geom_col = polygons.geometry
+
+     # Project to Web mercator to calculate centroids
+     polygons = polygons.to_crs(epsg=3857)
+     geom_col = polygons.geometry.centroid
+     # Project to lat lon to calculate the h3 index
+     geom_col = geom_col.to_crs(epsg=4326)
+
      polygons["h3index"] = geom_col.apply(
          lambda pt: h3.geo_to_h3(pt.y, pt.x, grid_resolution)
      )
@@ -127,12 +149,13 @@ def split_job_hex(
      if polygons.crs is None:
          raise ValueError("The GeoDataFrame must contain a CRS")

-     # Project to lat/lon positions
-     polygons = polygons.to_crs(epsg=4326)
+     original_crs = polygons.crs

      # Split the polygons into multiple jobs
      polygons = append_h3_index(polygons, grid_resolution)

+     polygons = polygons.to_crs(original_crs)
+
      split_datasets = []
      for _, sub_gdf in polygons.groupby("h3index"):
          if len(sub_gdf) > max_points:
@@ -29,7 +29,7 @@ PLATFORM = {
  INSTRUMENTS = {"sentinel2": ["msi"], "sentinel1": ["c-sar"]}


- GSD = {"sentinel2": [10, 20, 60], "sentinel1": [10]}
+ GSD = {"sentinel2": [10, 20, 60], "sentinel1": [20]}

  SUMMARIES = {
      "sentinel2": pystac.summaries.Summaries(