openforis-whisp 2.0.0b3__py3-none-any.whl → 3.0.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2070 @@
1
+ """
2
+ Advanced statistics processing for WHISP - concurrent and sequential endpoints.
3
+
4
+ This module provides optimized functions for processing GeoJSON FeatureCollections
5
+ with Whisp datasets using concurrent batching (for high-volume processing)
6
+ and standard sequential processing.
7
+
8
+ NOTE: This module is in a transitional state. The plan is to eventually merge these
9
+ functions into stats.py and make them the primary implementation there,
10
+ deprecating the legacy versions.
11
+
12
+ Key features:
13
+ - whisp_stats_geojson_to_df_concurrent (high-volume endpoint, concurrent batching)
14
+ - whisp_stats_geojson_to_df_sequential (standard endpoint, sequential)
15
+ - Proper logging at different levels (WARNING, INFO, DEBUG)
16
+ - Progress tracking without external dependencies
17
+ - Client-side and server-side metadata extraction options
18
+ - Endpoint validation and warnings
19
+ """
20
+
21
+ import ee
22
+ import pandas as pd
23
+ import geopandas as gpd
24
+ import logging
25
+ import sys
26
+ import threading
27
+ import time
28
+ import warnings
29
+ import json
30
+ import io
31
+ import os
32
+ import subprocess
33
+ from contextlib import redirect_stdout, contextmanager
34
+ from pathlib import Path
35
+ from typing import Optional, List, Dict, Any, Tuple
36
+ from concurrent.futures import ThreadPoolExecutor, as_completed
37
+ import tempfile
38
+
39
+ # ============================================================================
40
+ # STDOUT/STDERR SUPPRESSION CONTEXT MANAGER (for C-level output)
41
+ # ============================================================================
42
+
43
+
44
+ @contextmanager
45
+ def suppress_c_level_output():
46
+ """Suppress C-level stdout/stderr writes from libraries like Fiona."""
47
+ if sys.platform == "win32":
48
+ # Windows doesn't support dup2() reliably for STDOUT/STDERR
49
+ # Fall back to Python-level suppression
50
+ with redirect_stdout(io.StringIO()):
51
+ yield
52
+ else:
53
+ # Unix-like systems: use file descriptor redirection
54
+ saved_stdout = os.dup(1)
55
+ saved_stderr = os.dup(2)
56
+ try:
57
+ devnull = os.open(os.devnull, os.O_WRONLY)
58
+ os.dup2(devnull, 1)
59
+ os.dup2(devnull, 2)
60
+ yield
61
+ finally:
62
+ os.dup2(saved_stdout, 1)
63
+ os.dup2(saved_stderr, 2)
64
+ os.close(devnull)
65
+ os.close(saved_stdout)
66
+ os.close(saved_stderr)
67
+
68
+
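As a minimal usage sketch (the input path is a placeholder, not part of the packaged module), suppress_c_level_output() is meant to wrap calls whose GDAL/Fiona chatter is written directly to the C-level file descriptors:

    import geopandas as gpd

    # C-level stdout/stderr from GDAL/Fiona is discarded while the block runs
    with suppress_c_level_output():
        gdf = gpd.read_file("plots.geojson")  # hypothetical input file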
69
+ # Suppress verbose warnings globally for this module
70
+ # Note: FutureWarnings are kept (they signal important API changes)
71
+ warnings.filterwarnings("ignore", category=UserWarning, message=".*geographic CRS.*")
72
+ warnings.simplefilter("ignore", UserWarning)
73
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
74
+
75
+ # Suppress verbose logging from GeoPandas/Fiona/pyogrio
76
+ logging.getLogger("fiona").setLevel(logging.WARNING)
77
+ logging.getLogger("fiona.ogrext").setLevel(logging.WARNING)
78
+ logging.getLogger("pyogrio").setLevel(logging.WARNING)
79
+ logging.getLogger("pyogrio._io").setLevel(logging.WARNING)
80
+
81
+ from openforis_whisp.parameters.config_runtime import (
82
+ plot_id_column,
83
+ external_id_column,
84
+ geometry_type_column,
85
+ geometry_area_column,
86
+ centroid_x_coord_column,
87
+ centroid_y_coord_column,
88
+ iso3_country_column,
89
+ iso2_country_column,
90
+ admin_1_column,
91
+ water_flag,
92
+ geometry_area_column_formatting,
93
+ stats_area_columns_formatting,
94
+ stats_percent_columns_formatting,
95
+ )
96
+ from openforis_whisp.data_conversion import (
97
+ convert_geojson_to_ee,
98
+ convert_ee_to_df,
99
+ convert_ee_to_geojson,
100
+ )
101
+ from openforis_whisp.datasets import combine_datasets
102
+ from openforis_whisp.reformat import validate_dataframe_using_lookups_flexible
103
+ from openforis_whisp.stats import (
104
+ reformat_geometry_type,
105
+ set_point_geometry_area_to_zero,
106
+ )
107
+
108
+
109
+ # ============================================================================
110
+ # LOGGING & PROGRESS UTILITIES
111
+ # ============================================================================
112
+
113
+
114
+ def _suppress_verbose_output(max_concurrent: int = None):
115
+ """
116
+ Suppress verbose warnings and logging from dependencies.
117
+
118
+ Accepts max_concurrent for tuning the urllib3 logger level to prevent
119
+ "Connection pool is full" warnings during high-concurrency scenarios.
120
+
121
+ Parameters
122
+ ----------
123
+ max_concurrent : int, optional
124
+ Maximum concurrent workers. Intended urllib3 logging levels:
125
+ - max_concurrent <= 20: WARNING (pool rarely full)
126
+ - max_concurrent >= 21: CRITICAL (suppress pool warnings)
127
+ (Note: the current implementation sets all urllib3 loggers to CRITICAL regardless of this value.)
128
+ """
129
+ warnings.filterwarnings("ignore", category=UserWarning)
130
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
131
+
132
+ # Suppress urllib3 connection pool warnings via filters
133
+ warnings.filterwarnings("ignore", message=".*Connection pool is full.*")
134
+ warnings.filterwarnings("ignore", message=".*discarding connection.*")
135
+
136
+ # Set logger levels to WARNING to suppress INFO messages
137
+ for mod_name in [
138
+ "openforis_whisp.reformat",
139
+ "openforis_whisp.data_conversion",
140
+ "geopandas",
141
+ "fiona",
142
+ "pyogrio._io",
143
+ "urllib3",
144
+ ]:
145
+ logging.getLogger(mod_name).setLevel(logging.WARNING)
146
+
147
+ # ALL urllib3 loggers: use CRITICAL to suppress ALL connection pool warnings
148
+ # (these appear at WARNING level during high concurrency)
149
+ urllib3_loggers = [
150
+ "urllib3.connectionpool",
151
+ "urllib3.poolmanager",
152
+ "urllib3",
153
+ "requests.packages.urllib3.connectionpool",
154
+ "requests.packages.urllib3.poolmanager",
155
+ "requests.packages.urllib3",
156
+ ]
157
+
158
+ for logger_name in urllib3_loggers:
159
+ logging.getLogger(logger_name).setLevel(logging.CRITICAL)
160
+
161
+ # Suppress warning logs specifically from reformat module during validation
162
+ reformat_logger = logging.getLogger("openforis_whisp.reformat")
163
+ reformat_logger.setLevel(logging.ERROR)
164
+
165
+
166
+ def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
167
+ """Load GeoJSON file with all output suppressed."""
168
+ fiona_logger = logging.getLogger("fiona")
169
+ pyogrio_logger = logging.getLogger("pyogrio._io")
170
+ old_fiona_level = fiona_logger.level
171
+ old_pyogrio_level = pyogrio_logger.level
172
+ fiona_logger.setLevel(logging.CRITICAL)
173
+ pyogrio_logger.setLevel(logging.CRITICAL)
174
+
175
+ try:
176
+ with redirect_stdout(io.StringIO()):
177
+ gdf = gpd.read_file(filepath)
178
+ return gdf
179
+ finally:
180
+ fiona_logger.setLevel(old_fiona_level)
181
+ pyogrio_logger.setLevel(old_pyogrio_level)
182
+
183
+
184
+ def _extract_decimal_places(format_string: str) -> int:
185
+ """
186
+ Extract decimal places from a format string like '%.3f'.
187
+
188
+ Parameters
189
+ ----------
190
+ format_string : str
191
+ Format string (e.g., '%.3f' → 3)
192
+
193
+ Returns
194
+ -------
195
+ int
196
+ Number of decimal places
197
+ """
198
+ import re
199
+
200
+ match = re.search(r"\.(\d+)f", format_string)
201
+ if match:
202
+ return int(match.group(1))
203
+ return 2 # Default to 2 decimal places
204
+
205
+
206
+ def _add_admin_context(
207
+ df: pd.DataFrame, admin_code_col: str = "admin_code_median", debug: bool = False
208
+ ) -> pd.DataFrame:
209
+ """
210
+ Join admin codes to get Country, ProducerCountry, and Admin_Level_1 information.
211
+
212
+ Uses GAUL 2024 Level 1 administrative lookup to map admin codes to country and
213
+ administrative region names.
214
+
215
+ Parameters
216
+ ----------
217
+ df : pd.DataFrame
218
+ DataFrame with admin_code_median column from reduceRegions
219
+ admin_code_col : str
220
+ Name of the admin code column (default: "admin_code_median")
221
+ debug : bool
222
+ If True, print detailed debugging information (default: False)
223
+
224
+ Returns
225
+ -------
226
+ pd.DataFrame
227
+ DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
228
+ """
229
+ logger = logging.getLogger("whisp-concurrent")
230
+
231
+ # Return early if admin code column doesn't exist
232
+ if admin_code_col not in df.columns:
233
+ logger.debug(f"Admin code column '{admin_code_col}' not found in dataframe")
234
+ if debug:
235
+ print(f"DEBUG: Admin code column '{admin_code_col}' not found")
236
+ print(f"DEBUG: Available columns: {df.columns.tolist()}")
237
+ return df
238
+
239
+ try:
240
+ from openforis_whisp.parameters.lookup_gaul1_admin import lookup_dict
241
+
242
+ if debug:
243
+ print(f"DEBUG: Found admin_code_col '{admin_code_col}'")
244
+ print(f"DEBUG: Sample values: {df[admin_code_col].head()}")
245
+ print(f"DEBUG: Value types: {df[admin_code_col].dtype}")
246
+ print(f"DEBUG: Null count: {df[admin_code_col].isna().sum()}")
247
+
248
+ # Create lookup dataframe
249
+ lookup_data = []
250
+ for gaul_code, info in lookup_dict.items():
251
+ lookup_data.append(
252
+ {
253
+ "gaul1_code": gaul_code,
254
+ "gaul1_name": info.get("gaul1_name"),
255
+ "iso3_code": info.get("iso3_code"),
256
+ "iso2_code": info.get("iso2_code"),
257
+ }
258
+ )
259
+
260
+ lookup_df = pd.DataFrame(lookup_data)
261
+
262
+ if debug:
263
+ print(f"DEBUG: Lookup dictionary has {len(lookup_df)} entries")
264
+ print(f"DEBUG: Sample lookup codes: {lookup_df['gaul1_code'].head()}")
265
+
266
+ # Prepare data for join
267
+ df = df.copy()
268
+ df["admin_code_for_join"] = df[admin_code_col].fillna(-9999).astype("int32")
269
+ lookup_df["gaul1_code"] = lookup_df["gaul1_code"].astype("int32")
270
+
271
+ if debug:
272
+ print(
273
+ f"DEBUG: Codes to join (first 10): {df['admin_code_for_join'].unique()[:10]}"
274
+ )
275
+
276
+ # Perform join
277
+ df_joined = df.merge(
278
+ lookup_df, left_on="admin_code_for_join", right_on="gaul1_code", how="left"
279
+ )
280
+
281
+ if debug:
282
+ matched = df_joined["iso3_code"].notna().sum()
283
+ print(f"DEBUG: Merge result - {matched}/{len(df_joined)} rows matched")
284
+ print(f"DEBUG: Sample matched rows:")
285
+ print(
286
+ df_joined[
287
+ ["admin_code_for_join", "iso3_code", "iso2_code", "gaul1_name"]
288
+ ].head()
289
+ )
290
+
291
+ # Rename columns to match output schema
292
+ df_joined = df_joined.rename(
293
+ columns={
294
+ "iso3_code": iso3_country_column, # 'Country'
295
+ "iso2_code": iso2_country_column, # 'ProducerCountry'
296
+ "gaul1_name": admin_1_column, # 'Admin_Level_1'
297
+ }
298
+ )
299
+
300
+ # Drop temporary columns
301
+ df_joined = df_joined.drop(
302
+ columns=["admin_code_for_join", "gaul1_code"], errors="ignore"
303
+ )
304
+
305
+ logger.debug(
306
+ f"Admin context added: {iso3_country_column}, {iso2_country_column}, {admin_1_column}"
307
+ )
308
+ return df_joined
309
+
310
+ except ImportError:
311
+ logger.warning(
312
+ "Could not import GAUL lookup dictionary - admin context not added"
313
+ )
314
+ if debug:
315
+ print("DEBUG: ImportError - could not load lookup dictionary")
316
+ return df
317
+ except Exception as e:
318
+ logger.warning(f"Error adding admin context: {e}")
319
+ if debug:
320
+ print(f"DEBUG: Exception in _add_admin_context: {e}")
321
+ import traceback
322
+
323
+ traceback.print_exc()
324
+ return df
325
+
326
+
327
+ def join_admin_codes(
328
+ df: pd.DataFrame, lookup_dict: Dict, id_col: str = "admin_code_median"
329
+ ) -> pd.DataFrame:
330
+ """
331
+ Join admin codes to DataFrame using a lookup dictionary.
332
+
333
+ Converts the admin code column to integer and performs a left join with
334
+ the lookup dictionary to add Country, ProducerCountry, and Admin_Level_1.
335
+
336
+ Parameters
337
+ ----------
338
+ df : pd.DataFrame
339
+ DataFrame with admin code column
340
+ lookup_dict : dict
341
+ Dictionary mapping GAUL codes to admin info (iso3_code, iso2_code, gaul1_name)
342
+ id_col : str
343
+ Name of the admin code column (default: "admin_code_median")
344
+
345
+ Returns
346
+ -------
347
+ pd.DataFrame
348
+ DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
349
+ """
350
+ logger = logging.getLogger("whisp-concurrent")
351
+
352
+ # Return early if admin code column doesn't exist
353
+ if id_col not in df.columns:
354
+ logger.debug(f"Admin code column '{id_col}' not found in dataframe")
355
+ return df
356
+
357
+ try:
358
+ # Create lookup dataframe
359
+ lookup_data = []
360
+ for gaul_code, info in lookup_dict.items():
361
+ lookup_data.append(
362
+ {
363
+ "gaul1_code": gaul_code,
364
+ "gaul1_name": info.get("gaul1_name"),
365
+ "iso3_code": info.get("iso3_code"),
366
+ "iso2_code": info.get("iso2_code"),
367
+ }
368
+ )
369
+
370
+ lookup_df = pd.DataFrame(lookup_data)
371
+
372
+ # Prepare data for join
373
+ df = df.copy()
374
+ # Round to nearest integer (handles float values from EE reducers)
375
+ df["admin_code_for_join"] = df[id_col].fillna(-9999).astype("int32")
376
+ lookup_df["gaul1_code"] = lookup_df["gaul1_code"].astype("int32")
377
+
378
+ # Perform join
379
+ df_joined = df.merge(
380
+ lookup_df, left_on="admin_code_for_join", right_on="gaul1_code", how="left"
381
+ )
382
+
383
+ # Rename columns to match output schema
384
+ df_joined = df_joined.rename(
385
+ columns={
386
+ "iso3_code": iso3_country_column, # 'Country'
387
+ "iso2_code": iso2_country_column, # 'ProducerCountry'
388
+ "gaul1_name": admin_1_column, # 'Admin_Level_1'
389
+ }
390
+ )
391
+
392
+ # Drop temporary columns
393
+ df_joined = df_joined.drop(
394
+ columns=["admin_code_for_join", "gaul1_code"], errors="ignore"
395
+ )
396
+
397
+ logger.debug(
398
+ f"Admin codes joined: {iso3_country_column}, {iso2_country_column}, {admin_1_column}"
399
+ )
400
+ return df_joined
401
+
402
+ except Exception as e:
403
+ logger.warning(f"Error joining admin codes: {e}")
404
+ return df
405
+
406
+
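A small worked sketch of the join performed by join_admin_codes, using a hypothetical two-entry lookup_dict shaped like the GAUL lookup it expects (the real dictionary comes from lookup_gaul1_admin; the output column names Country, ProducerCountry and Admin_Level_1 are taken from config_runtime):

    import pandas as pd

    lookup_dict = {  # hypothetical entries for illustration only
        1001: {"gaul1_name": "Province A", "iso3_code": "CIV", "iso2_code": "CI"},
        2002: {"gaul1_name": "Province B", "iso3_code": "GHA", "iso2_code": "GH"},
    }
    df = pd.DataFrame({"plotId": [1, 2], "admin_code_median": [1001.0, 2002.0]})
    out = join_admin_codes(df, lookup_dict)
    # out now carries Country, ProducerCountry and Admin_Level_1 for each plot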
407
+ class ProgressTracker:
408
+ """
409
+ Track batch processing progress with time estimation.
410
+
411
+ Shows progress at key milestones (25%, 50%, 75%, 100%) with estimated
412
+ time remaining based on processing speed.
413
+ """
414
+
415
+ def __init__(self, total: int, logger: logging.Logger = None):
416
+ """
417
+ Initialize progress tracker.
418
+
419
+ Parameters
420
+ ----------
421
+ total : int
422
+ Total number of items to process
423
+ logger : logging.Logger, optional
424
+ Logger for output
425
+ """
426
+ self.total = total
427
+ self.completed = 0
428
+ self.lock = threading.Lock()
429
+ self.logger = logger or logging.getLogger("whisp-concurrent")
430
+ self.milestones = {25, 50, 75, 100}
431
+ self.shown_milestones = set()
432
+ self.start_time = time.time()
433
+ self.last_update_time = self.start_time
434
+
435
+ def update(self, n: int = 1) -> None:
436
+ """
437
+ Update progress count.
438
+
439
+ Parameters
440
+ ----------
441
+ n : int
442
+ Number of items completed
443
+ """
444
+ with self.lock:
445
+ self.completed += n
446
+ percent = int((self.completed / self.total) * 100)
447
+
448
+ # Show milestone messages (25%, 50%, 75%, 100%)
449
+ for milestone in sorted(self.milestones):
450
+ if percent >= milestone and milestone not in self.shown_milestones:
451
+ self.shown_milestones.add(milestone)
452
+
453
+ # Calculate time metrics
454
+ elapsed = time.time() - self.start_time
455
+ rate = self.completed / elapsed if elapsed > 0 else 0
456
+ remaining_items = self.total - self.completed
457
+ eta_seconds = remaining_items / rate if rate > 0 else 0
458
+
459
+ # Format time strings
460
+ eta_str = self._format_time(eta_seconds)
461
+ elapsed_str = self._format_time(elapsed)
462
+
463
+ # Build progress message
464
+ msg = f"Progress: {self.completed}/{self.total} ({percent}%)"
465
+ if percent < 100:
466
+ msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
467
+ else:
468
+ msg += f" | Total time: {elapsed_str}"
469
+
470
+ self.logger.info(msg)
471
+
472
+ @staticmethod
473
+ def _format_time(seconds: float) -> str:
474
+ """Format seconds as human-readable string."""
475
+ if seconds < 60:
476
+ return f"{seconds:.0f}s"
477
+ elif seconds < 3600:
478
+ mins = seconds / 60
479
+ return f"{mins:.1f}m"
480
+ else:
481
+ hours = seconds / 3600
482
+ return f"{hours:.1f}h"
483
+
484
+ def finish(self) -> None:
485
+ """Log completion."""
486
+ with self.lock:
487
+ total_time = time.time() - self.start_time
488
+ time_str = self._format_time(total_time)
489
+ self.logger.info(
490
+ f"Processing complete: {self.completed}/{self.total} batches in {time_str}"
491
+ )
492
+
493
+
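To show how ProgressTracker is driven by the batch loops below, here is a minimal sketch: one update() per completed batch, with milestone and ETA logging handled internally (the batch count and sleep are placeholders for real EE work):

    import logging
    import time

    logging.basicConfig(level=logging.INFO)
    tracker = ProgressTracker(total=8, logger=logging.getLogger("whisp-concurrent"))
    for _ in range(8):
        time.sleep(0.1)   # stand-in for one batch of reduceRegions work
        tracker.update()  # logs at the 25/50/75/100% milestones with elapsed time and ETA
    tracker.finish()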
494
+ # ============================================================================
495
+ # ENDPOINT VALIDATION
496
+ # ============================================================================
497
+
498
+
499
+ def check_ee_endpoint(endpoint_type: str = "high-volume") -> bool:
500
+ """
501
+ Check if Earth Engine is using the correct endpoint.
502
+
503
+ Parameters
504
+ ----------
505
+ endpoint_type : str
506
+ Expected endpoint type: "high-volume" or "standard"
507
+
508
+ Returns
509
+ -------
510
+ bool
511
+ True if using expected endpoint, False otherwise
512
+ """
513
+ api_url = str(ee.data._cloud_api_base_url)
514
+
515
+ if endpoint_type == "high-volume":
516
+ return "highvolume" in api_url.lower()
517
+ elif endpoint_type == "standard":
518
+ return "highvolume" not in api_url.lower()
519
+ else:
520
+ return False
521
+
522
+
523
+ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool = True):
524
+ """
525
+ Validate Earth Engine endpoint and warn/error if incorrect.
526
+
527
+ Parameters
528
+ ----------
529
+ endpoint_type : str
530
+ Expected endpoint type
531
+ raise_error : bool
532
+ If True, raise error if incorrect endpoint; if False, warn
533
+
534
+ Raises
535
+ ------
536
+ RuntimeError
537
+ If incorrect endpoint and raise_error=True
538
+ """
539
+ if not check_ee_endpoint(endpoint_type):
540
+ msg = (
541
+ f"Not using {endpoint_type.upper()} endpoint.\n"
542
+ f"Current URL: {ee.data._cloud_api_base_url}\n"
543
+ f"\nTo use {endpoint_type} endpoint, run:\n"
544
+ )
545
+ msg += "ee.Reset()\n"
546
+ if endpoint_type == "high-volume":
547
+ msg += " ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')"
548
+ else:
549
+ msg += " ee.Initialize() # Uses standard endpoint by default"
550
+
551
+ if raise_error:
552
+ raise RuntimeError(msg)
553
+ else:
554
+ logging.warning(msg)
555
+
556
+
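For reference, the endpoint switches that check_ee_endpoint() and validate_ee_endpoint() expect, mirroring the instructions embedded in the error message above:

    import ee

    # Concurrent path: high-volume endpoint
    ee.Reset()
    ee.Initialize(opt_url="https://earthengine-highvolume.googleapis.com")
    validate_ee_endpoint("high-volume")  # passes silently

    # Sequential path: standard endpoint (the default)
    ee.Reset()
    ee.Initialize()
    validate_ee_endpoint("standard")     # passes silently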
557
+ # ============================================================================
558
+ # METADATA EXTRACTION (CLIENT & SERVER SIDE)
559
+ # ============================================================================
560
+
561
+
562
+ def extract_centroid_and_geomtype_client(
563
+ gdf: gpd.GeoDataFrame,
564
+ x_col: str = None,
565
+ y_col: str = None,
566
+ type_col: str = None,
567
+ external_id_column: str = None,
568
+ return_attributes_only: bool = True,
569
+ ) -> pd.DataFrame:
570
+ """
571
+ Extract centroid coordinates and geometry type using GeoPandas (client-side).
572
+
573
+ Parameters
574
+ ----------
575
+ gdf : gpd.GeoDataFrame
576
+ Input GeoDataFrame
577
+ x_col : str, optional
578
+ Column name for centroid x. Defaults to config value
579
+ y_col : str, optional
580
+ Column name for centroid y. Defaults to config value
581
+ type_col : str, optional
582
+ Column name for geometry type. Defaults to config value
583
+ external_id_column : str, optional
584
+ Name of external ID column to preserve
585
+ return_attributes_only : bool
586
+ If True, return only attribute columns (no geometry)
587
+
588
+ Returns
589
+ -------
590
+ pd.DataFrame or gpd.GeoDataFrame
591
+ DataFrame/GeoDataFrame with centroid and geometry type columns
592
+ """
593
+ x_col = x_col or centroid_x_coord_column
594
+ y_col = y_col or centroid_y_coord_column
595
+ type_col = type_col or geometry_type_column
596
+
597
+ gdf = gdf.copy()
598
+
599
+ # Extract centroid coordinates (suppressing geographic CRS warning from Shapely)
600
+ with warnings.catch_warnings():
601
+ warnings.filterwarnings("ignore", category=UserWarning)
602
+ warnings.simplefilter("ignore", UserWarning) # Additional suppression
603
+ centroid_points = gdf.geometry.centroid
604
+
605
+ gdf[x_col] = centroid_points.x.round(6)
606
+ gdf[y_col] = centroid_points.y.round(6)
607
+ gdf[type_col] = gdf.geometry.geom_type
608
+
609
+ if return_attributes_only:
610
+ # Build column list starting with merge keys
611
+ cols = []
612
+
613
+ # Always include __row_id__ first if present (needed for row-level merging)
614
+ if "__row_id__" in gdf.columns:
615
+ cols.append("__row_id__")
616
+
617
+ # Always include plot_id_column if present (needed for merging batches)
618
+ if plot_id_column in gdf.columns:
619
+ cols.append(plot_id_column)
620
+
621
+ # Include external_id_column if provided and exists
622
+ if (
623
+ external_id_column
624
+ and external_id_column in gdf.columns
625
+ and external_id_column not in cols
626
+ ):
627
+ cols.append(external_id_column)
628
+
629
+ # Always include metadata columns (centroid, geometry type)
630
+ cols.extend([x_col, y_col, type_col])
631
+
632
+ # Remove any duplicates while preserving order
633
+ cols = list(dict.fromkeys(cols))
634
+
635
+ return gdf[cols].reset_index(drop=True)
636
+
637
+ return gdf
638
+
639
+
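A short sketch of the client-side metadata extraction, assuming plot_id_column resolves to 'plotId' in config_runtime:

    import geopandas as gpd
    from shapely.geometry import Point, Polygon

    gdf = gpd.GeoDataFrame(
        {"plotId": [1, 2]},
        geometry=[Point(0.5, 0.5), Polygon([(0, 0), (1, 0), (1, 1)])],
        crs="EPSG:4326",
    )
    meta = extract_centroid_and_geomtype_client(gdf, return_attributes_only=True)
    # meta holds plotId plus the centroid x/y and geometry-type columns named in config_runtime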
640
+ def extract_centroid_and_geomtype_server(
641
+ fc: ee.FeatureCollection,
642
+ x_col: str = None,
643
+ y_col: str = None,
644
+ type_col: str = None,
645
+ max_error: float = 1.0,
646
+ ) -> ee.FeatureCollection:
647
+ """
648
+ Extract centroid coordinates and geometry type using Earth Engine (server-side).
649
+
650
+ Parameters
651
+ ----------
652
+ fc : ee.FeatureCollection
653
+ Input FeatureCollection
654
+ x_col : str, optional
655
+ Column name for centroid x
656
+ y_col : str, optional
657
+ Column name for centroid y
658
+ type_col : str, optional
659
+ Column name for geometry type
660
+ max_error : float
661
+ Maximum error for centroid calculation (meters)
662
+
663
+ Returns
664
+ -------
665
+ ee.FeatureCollection
666
+ FeatureCollection with centroid and geometry type properties
667
+ """
668
+ x_col = x_col or centroid_x_coord_column
669
+ y_col = y_col or centroid_y_coord_column
670
+ type_col = type_col or geometry_type_column
671
+
672
+ def add_metadata(feature):
673
+ centroid = feature.geometry().centroid(max_error)
674
+ coords = centroid.coordinates()
675
+ x = ee.Number(coords.get(0)).multiply(1e6).round().divide(1e6)
676
+ y = ee.Number(coords.get(1)).multiply(1e6).round().divide(1e6)
677
+ return feature.set({x_col: x, y_col: y, type_col: feature.geometry().type()})
678
+
679
+ return fc.map(add_metadata)
680
+
681
+
682
+ # ============================================================================
683
+ # BATCH PROCESSING UTILITIES
684
+ # ============================================================================
685
+
686
+
687
+ def batch_geodataframe(
688
+ gdf: gpd.GeoDataFrame,
689
+ batch_size: int,
690
+ ) -> List[gpd.GeoDataFrame]:
691
+ """
692
+ Split a GeoDataFrame into batches.
693
+
694
+ Parameters
695
+ ----------
696
+ gdf : gpd.GeoDataFrame
697
+ Input GeoDataFrame
698
+ batch_size : int
699
+ Size of each batch
700
+
701
+ Returns
702
+ -------
703
+ List[gpd.GeoDataFrame]
704
+ List of batch GeoDataFrames
705
+ """
706
+ batches = []
707
+ for i in range(0, len(gdf), batch_size):
708
+ batches.append(gdf.iloc[i : i + batch_size].copy())
709
+ return batches
710
+
711
+
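The batching arithmetic, as a small sketch: 95 features with batch_size=10 yield ten batches, the last holding the remaining five rows (the GeoDataFrame below is a placeholder):

    import geopandas as gpd
    from shapely.geometry import Point

    gdf = gpd.GeoDataFrame(
        {"plotId": range(1, 96)},
        geometry=[Point(i, i) for i in range(95)],
        crs="EPSG:4326",
    )
    batches = batch_geodataframe(gdf, batch_size=10)
    print(len(batches), len(batches[-1]))  # 10 5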
712
+ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
713
+ """
714
+ Convert a batch GeoDataFrame to EE FeatureCollection efficiently.
715
+
716
+ OPTIMIZATION: Uses GeoJSON dict input directly to avoid temp file I/O.
717
+ This provides ~67% performance improvement over writing to disk.
718
+
719
+ Preserves the __row_id__ column if present so it can be retrieved after processing.
720
+
721
+ Parameters
722
+ ----------
723
+ batch_gdf : gpd.GeoDataFrame
724
+ Input batch (should have __row_id__ column)
725
+
726
+ Returns
727
+ -------
728
+ ee.FeatureCollection
729
+ EE FeatureCollection with __row_id__ as a feature property
730
+ """
731
+ # OPTIMIZATION: Convert to GeoJSON dict and pass directly
732
+ # This eliminates the need to write to/read from temp files (~67% faster)
733
+ geojson_dict = json.loads(batch_gdf.to_json())
734
+ fc = convert_geojson_to_ee(geojson_dict)
735
+
736
+ # If __row_id__ is in the original GeoDataFrame, it will be preserved
737
+ # as a feature property in the GeoJSON and thus in the EE FeatureCollection
738
+ return fc
739
+
740
+
741
+ def clean_geodataframe(
742
+ gdf: gpd.GeoDataFrame,
743
+ remove_nulls: bool = True,
744
+ fix_invalid: bool = True,
745
+ logger: logging.Logger = None,
746
+ ) -> gpd.GeoDataFrame:
747
+ """
748
+ Validate and clean GeoDataFrame geometries.
749
+
750
+ Parameters
751
+ ----------
752
+ gdf : gpd.GeoDataFrame
753
+ Input GeoDataFrame
754
+ remove_nulls : bool
755
+ Remove null geometries
756
+ fix_invalid : bool
757
+ Fix invalid geometries
758
+ logger : logging.Logger, optional
759
+ Logger for output
760
+
761
+ Returns
762
+ -------
763
+ gpd.GeoDataFrame
764
+ Cleaned GeoDataFrame
765
+ """
766
+ logger = logger or logging.getLogger("whisp-concurrent")
767
+
768
+ if remove_nulls:
769
+ null_count = gdf.geometry.isna().sum()
770
+ if null_count > 0:
771
+ logger.warning(f"Removing {null_count} null geometries")
772
+ gdf = gdf[~gdf.geometry.isna()].copy()
773
+
774
+ if fix_invalid:
775
+ valid_count = gdf.geometry.is_valid.sum()
776
+ invalid_count = len(gdf) - valid_count
777
+ if invalid_count > 0:
778
+ logger.warning(f"Fixing {invalid_count} invalid geometries")
779
+ from shapely.validation import make_valid
780
+
781
+ gdf = gdf.copy()
782
+ gdf["geometry"] = gdf["geometry"].apply(
783
+ lambda g: make_valid(g) if g and not g.is_valid else g
784
+ )
785
+
786
+ logger.debug(f"Validation complete: {len(gdf):,} geometries ready")
787
+ return gdf
788
+
789
+
790
+ # ============================================================================
791
+ # EE PROCESSING WITH RETRY LOGIC
792
+ # ============================================================================
793
+
794
+
795
+ def process_ee_batch(
796
+ fc: ee.FeatureCollection,
797
+ whisp_image: ee.Image,
798
+ reducer: ee.Reducer,
799
+ batch_idx: int,
800
+ max_retries: int = 3,
801
+ logger: logging.Logger = None,
802
+ ) -> pd.DataFrame:
803
+ """
804
+ Process an EE FeatureCollection with automatic retry logic.
805
+
806
+ Parameters
807
+ ----------
808
+ fc : ee.FeatureCollection
809
+ Input FeatureCollection
810
+ whisp_image : ee.Image
811
+ Image containing bands to reduce
812
+ reducer : ee.Reducer
813
+ Reducer to apply
814
+ batch_idx : int
815
+ Batch index (for logging)
816
+ max_retries : int
817
+ Maximum retry attempts
818
+ logger : logging.Logger, optional
819
+ Logger for output
820
+
821
+ Returns
822
+ -------
823
+ pd.DataFrame
824
+ Results as DataFrame
825
+
826
+ Raises
827
+ ------
828
+ RuntimeError
829
+ If processing fails after all retries
830
+ """
831
+ logger = logger or logging.getLogger("whisp-concurrent")
832
+
833
+ for attempt in range(max_retries):
834
+ try:
835
+ results = whisp_image.reduceRegions(
836
+ collection=fc,
837
+ reducer=reducer,
838
+ scale=10,
839
+ )
840
+ df = convert_ee_to_df(results)
841
+
842
+ # Ensure plot_id_column is present for merging
843
+ # It should come from the feature properties (added before EE processing)
844
+ if plot_id_column not in df.columns:
845
+ df[plot_id_column] = range(len(df))
846
+
847
+ # Ensure all column names are strings (fixes pandas .str accessor issues)
848
+ df.columns = df.columns.astype(str)
849
+
850
+ return df
851
+
852
+ except ee.EEException as e:
853
+ error_msg = str(e)
854
+
855
+ if "Quota" in error_msg or "limit" in error_msg.lower():
856
+ if attempt < max_retries - 1:
857
+ wait_time = min(30, 2**attempt)
858
+ logger.warning(
859
+ f"Batch {batch_idx + 1}: Rate limited, waiting {wait_time}s..."
860
+ )
861
+ time.sleep(wait_time)
862
+ else:
863
+ raise RuntimeError(f"Batch {batch_idx + 1}: Quota exhausted")
864
+
865
+ elif "timeout" in error_msg.lower():
866
+ if attempt < max_retries - 1:
867
+ wait_time = min(15, 2**attempt)
868
+ logger.warning(
869
+ f"Batch {batch_idx + 1}: Timeout, retrying in {wait_time}s..."
870
+ )
871
+ time.sleep(wait_time)
872
+ else:
873
+ raise
874
+
875
+ else:
876
+ if attempt < max_retries - 1:
877
+ wait_time = min(5, 2**attempt)
878
+ time.sleep(wait_time)
879
+ else:
880
+ raise
881
+
882
+ except Exception as e:
883
+ if attempt < max_retries - 1:
884
+ time.sleep(min(5, 2**attempt))
885
+ else:
886
+ raise RuntimeError(f"Batch {batch_idx + 1}: {str(e)}")
887
+
888
+ raise RuntimeError(f"Batch {batch_idx + 1}: Failed after {max_retries} attempts")
889
+
890
+
891
+ # ============================================================================
892
+ # CONCURRENT PROCESSING FUNCTIONS
893
+ # ============================================================================
894
+
895
+
896
+ def whisp_stats_geojson_to_df_concurrent(
897
+ input_geojson_filepath: str,
898
+ external_id_column: str = None,
899
+ remove_geom: bool = False,
900
+ national_codes: List[str] = None,
901
+ unit_type: str = "ha",
902
+ whisp_image: ee.Image = None,
903
+ custom_bands: Dict[str, Any] = None,
904
+ batch_size: int = 10,
905
+ max_concurrent: int = 20,
906
+ validate_geometries: bool = True,
907
+ max_retries: int = 3,
908
+ add_metadata_server: bool = False,
909
+ logger: logging.Logger = None,
910
+ # Format parameters (auto-detect from config if not provided)
911
+ decimal_places: int = None,
912
+ ) -> pd.DataFrame:
913
+ """
914
+ Process GeoJSON concurrently to compute Whisp statistics with automatic formatting.
915
+
916
+ Uses high-volume endpoint and concurrent batching. Client-side metadata
917
+ extraction is always applied; optionally add server-side metadata too.
918
+ Automatically formats output (converts units, removes noise columns, etc.).
919
+
920
+ Parameters
921
+ ----------
922
+ input_geojson_filepath : str
923
+ Path to input GeoJSON file
924
+ external_id_column : str, optional
925
+ Column name for external IDs
926
+ remove_geom : bool
927
+ Remove geometry column from output
928
+ national_codes : List[str], optional
929
+ ISO2 codes for national datasets
930
+ unit_type : str
931
+ "ha" or "percent"
932
+ whisp_image : ee.Image, optional
933
+ Pre-combined image (created with combine_datasets if None)
934
+ custom_bands : Dict[str, Any], optional
935
+ Custom band information
936
+ batch_size : int
937
+ Features per batch
938
+ max_concurrent : int
939
+ Maximum concurrent EE calls
940
+ validate_geometries : bool
941
+ Validate and clean geometries
942
+ max_retries : int
943
+ Retry attempts per batch
944
+ add_metadata_server : bool
945
+ Add metadata server-side (in addition to client-side)
946
+ logger : logging.Logger, optional
947
+ Logger for output
948
+ decimal_places : int, optional
949
+ Decimal places for formatting. If None, auto-detects from config.
950
+
951
+ Returns
952
+ -------
953
+ pd.DataFrame
954
+ Formatted results DataFrame with Whisp statistics
955
+ """
956
+ from openforis_whisp.reformat import format_stats_dataframe
957
+
958
+ logger = logger or logging.getLogger("whisp-concurrent")
959
+
960
+ # Suppress verbose output from dependencies (dynamically adjust based on max_concurrent)
961
+ _suppress_verbose_output(max_concurrent=max_concurrent)
962
+
963
+ # Auto-detect decimal places from config if not provided
964
+ if decimal_places is None:
965
+ decimal_places = _extract_decimal_places(stats_area_columns_formatting)
966
+ logger.debug(f"Using decimal_places={decimal_places} from config")
967
+
968
+ # Validate endpoint
969
+ validate_ee_endpoint("high-volume", raise_error=True)
970
+
971
+ # Load GeoJSON with output suppressed
972
+ gdf = _load_geojson_silently(input_geojson_filepath)
973
+ logger.info(f"Loaded {len(gdf):,} features")
974
+
975
+ if validate_geometries:
976
+ gdf = clean_geodataframe(gdf, logger=logger)
977
+
978
+ # Add stable plotIds for merging (starting from 1, not 0)
979
+ gdf[plot_id_column] = range(1, len(gdf) + 1)
980
+
981
+ # Create image if not provided
982
+ if whisp_image is None:
983
+ logger.debug("Creating Whisp image...")
984
+ # Suppress print statements from combine_datasets
985
+ with redirect_stdout(io.StringIO()):
986
+ try:
987
+ # First try without validation
988
+ whisp_image = combine_datasets(
989
+ national_codes=national_codes, validate_bands=False
990
+ )
991
+ except Exception as e:
992
+ logger.warning(
993
+ f"First attempt failed: {str(e)[:100]}. Retrying with validate_bands=True..."
994
+ )
995
+ # Retry with validation to catch and fix bad bands
996
+ whisp_image = combine_datasets(
997
+ national_codes=national_codes, validate_bands=True
998
+ )
999
+
1000
+ # Create reducer
1001
+ reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
1002
+
1003
+ # Batch the data
1004
+ batches = batch_geodataframe(gdf, batch_size)
1005
+ logger.info(f"Processing {len(gdf):,} features in {len(batches)} batches")
1006
+
1007
+ # Setup semaphore for EE concurrency control
1008
+ ee_semaphore = threading.BoundedSemaphore(max_concurrent)
1009
+
1010
+ # Progress tracker
1011
+ progress = ProgressTracker(len(batches), logger=logger)
1012
+
1013
+ results = []
1014
+
1015
+ def process_batch(
1016
+ batch_idx: int, batch: gpd.GeoDataFrame
1017
+ ) -> Tuple[int, pd.DataFrame, pd.DataFrame]:
1018
+ """Process one batch: server EE work + client metadata."""
1019
+ with ee_semaphore:
1020
+ # Server-side: convert to EE, optionally add metadata, reduce
1021
+ fc = convert_batch_to_ee(batch)
1022
+ if add_metadata_server:
1023
+ fc = extract_centroid_and_geomtype_server(fc)
1024
+ df_server = process_ee_batch(
1025
+ fc, whisp_image, reducer, batch_idx, max_retries, logger
1026
+ )
1027
+
1028
+ # Client-side: extract metadata using GeoPandas
1029
+ df_client = extract_centroid_and_geomtype_client(
1030
+ batch,
1031
+ external_id_column=external_id_column,
1032
+ return_attributes_only=True,
1033
+ )
1034
+
1035
+ return batch_idx, df_server, df_client
1036
+
1037
+ # Process batches with thread pool
1038
+ pool_workers = max(2 * max_concurrent, max_concurrent + 2)
1039
+
1040
+ # Track if we had errors that suggest bad bands
1041
+ batch_errors = []
1042
+
1043
+ # Suppress fiona logging during batch processing (threads create new loggers)
1044
+ fiona_logger = logging.getLogger("fiona")
1045
+ pyogrio_logger = logging.getLogger("pyogrio._io")
1046
+ old_fiona_level = fiona_logger.level
1047
+ old_pyogrio_level = pyogrio_logger.level
1048
+ fiona_logger.setLevel(logging.CRITICAL)
1049
+ pyogrio_logger.setLevel(logging.CRITICAL)
1050
+
1051
+ try:
1052
+ with redirect_stdout(io.StringIO()):
1053
+ with ThreadPoolExecutor(max_workers=pool_workers) as executor:
1054
+ futures = {
1055
+ executor.submit(process_batch, i, batch): i
1056
+ for i, batch in enumerate(batches)
1057
+ }
1058
+
1059
+ for future in as_completed(futures):
1060
+ try:
1061
+ batch_idx, df_server, df_client = future.result()
1062
+
1063
+ # Merge server and client results
1064
+ if plot_id_column not in df_server.columns:
1065
+ df_server[plot_id_column] = range(len(df_server))
1066
+
1067
+ merged = df_server.merge(
1068
+ df_client,
1069
+ on=plot_id_column,
1070
+ how="left",
1071
+ suffixes=("_ee", "_client"),
1072
+ )
1073
+ results.append(merged)
1074
+ progress.update()
1075
+
1076
+ except Exception as e:
1077
+ error_msg = str(e)
1078
+ logger.error(f"Batch processing error: {error_msg[:100]}")
1079
+ import traceback
1080
+
1081
+ logger.debug(traceback.format_exc())
1082
+ batch_errors.append(error_msg)
1083
+ finally:
1084
+ # Restore logger levels
1085
+ fiona_logger.setLevel(old_fiona_level)
1086
+ pyogrio_logger.setLevel(old_pyogrio_level)
1087
+
1088
+ progress.finish()
1089
+
1090
+ # Check if we should retry with validation due to band errors
1091
+ if batch_errors and not results:
1092
+ # All batches failed - likely a bad band issue
1093
+ is_band_error = any(
1094
+ keyword in str(batch_errors)
1095
+ for keyword in ["Image.load", "asset", "not found", "does not exist"]
1096
+ )
1097
+
1098
+ if is_band_error:
1099
+ logger.warning(
1100
+ "Detected potential bad band error. Retrying with validate_bands=True..."
1101
+ )
1102
+ try:
1103
+ with redirect_stdout(io.StringIO()):
1104
+ whisp_image = combine_datasets(
1105
+ national_codes=national_codes, validate_bands=True
1106
+ )
1107
+ logger.info(
1108
+ "Image recreated with validation. Retrying batch processing..."
1109
+ )
1110
+
1111
+ # Retry batch processing with validated image
1112
+ results = []
1113
+ progress = ProgressTracker(len(batches), logger=logger)
1114
+
1115
+ # Suppress fiona logging during batch processing (threads create new loggers)
1116
+ fiona_logger = logging.getLogger("fiona")
1117
+ pyogrio_logger = logging.getLogger("pyogrio._io")
1118
+ old_fiona_level = fiona_logger.level
1119
+ old_pyogrio_level = pyogrio_logger.level
1120
+ fiona_logger.setLevel(logging.CRITICAL)
1121
+ pyogrio_logger.setLevel(logging.CRITICAL)
1122
+
1123
+ try:
1124
+ with ThreadPoolExecutor(max_workers=pool_workers) as executor:
1125
+ futures = {
1126
+ executor.submit(process_batch, i, batch): i
1127
+ for i, batch in enumerate(batches)
1128
+ }
1129
+
1130
+ for future in as_completed(futures):
1131
+ try:
1132
+ batch_idx, df_server, df_client = future.result()
1133
+ if plot_id_column not in df_server.columns:
1134
+ df_server[plot_id_column] = range(len(df_server))
1135
+ merged = df_server.merge(
1136
+ df_client,
1137
+ on=plot_id_column,
1138
+ how="left",
1139
+ suffixes=("", "_client"),
1140
+ )
1141
+ results.append(merged)
1142
+ progress.update()
1143
+ except Exception as e:
1144
+ logger.error(
1145
+ f"Batch processing error (retry): {str(e)[:100]}"
1146
+ )
1147
+
1148
+ progress.finish()
1149
+ finally:
1150
+ # Restore logger levels
1151
+ fiona_logger.setLevel(old_fiona_level)
1152
+ pyogrio_logger.setLevel(old_pyogrio_level)
1153
+ except Exception as validation_e:
1154
+ logger.error(
1155
+ f"Failed to recover with validation: {str(validation_e)[:100]}"
1156
+ )
1157
+ return pd.DataFrame()
1158
+
1159
+ if results:
1160
+ # Filter out empty DataFrames and all-NA columns to avoid FutureWarning in pd.concat
1161
+ results_filtered = []
1162
+ for df in results:
1163
+ if not df.empty:
1164
+ # Drop columns that are entirely NA
1165
+ df_clean = df.dropna(axis=1, how="all")
1166
+ if not df_clean.empty:
1167
+ results_filtered.append(df_clean)
1168
+ results = results_filtered
1169
+
1170
+ if results:
1171
+ # Concatenate with explicit dtype handling to suppress FutureWarning
1172
+ combined = pd.concat(results, ignore_index=True, sort=False)
1173
+ # Ensure all column names are strings (fixes pandas .str accessor issues later)
1174
+ combined.columns = combined.columns.astype(str)
1175
+ else:
1176
+ return pd.DataFrame()
1177
+
1178
+ # Clean up duplicate external_id columns created by merges
1179
+ # Rename external_id_column to standardized 'external_id' for schema validation
1180
+ if external_id_column:
1181
+ # Find all columns related to external_id
1182
+ external_id_variants = [
1183
+ col
1184
+ for col in combined.columns
1185
+ if external_id_column.lower() in col.lower()
1186
+ ]
1187
+
1188
+ if external_id_variants:
1189
+ # Use the base column name if it exists, otherwise use first variant
1190
+ base_col = (
1191
+ external_id_column
1192
+ if external_id_column in combined.columns
1193
+ else external_id_variants[0]
1194
+ )
1195
+
1196
+ # Rename to standardized 'external_id'
1197
+ if base_col != "external_id":
1198
+ combined = combined.rename(columns={base_col: "external_id"})
1199
+
1200
+ # Drop all other variants
1201
+ cols_to_drop = [c for c in external_id_variants if c != base_col]
1202
+ combined = combined.drop(columns=cols_to_drop, errors="ignore")
1203
+
1204
+ # plotId column is already present from batch processing
1205
+ # Just ensure it's at position 0
1206
+ if plot_id_column in combined.columns:
1207
+ combined = combined[
1208
+ [plot_id_column]
1209
+ + [col for col in combined.columns if col != plot_id_column]
1210
+ ]
1211
+
1212
+ # Add admin context (Country, ProducerCountry, Admin_Level_1) from admin_code
1213
+ # MUST be done BEFORE formatting (which removes _median columns)
1214
+ logger.debug("Adding administrative context...")
1215
+ try:
1216
+ from openforis_whisp.parameters.lookup_gaul1_admin import lookup_dict
1217
+
1218
+ combined = join_admin_codes(
1219
+ df=combined, lookup_dict=lookup_dict, id_col="admin_code_median"
1220
+ )
1221
+ except ImportError:
1222
+ logger.warning(
1223
+ "Could not import lookup dictionary - admin context not added"
1224
+ )
1225
+
1226
+ # Format the output with error handling for bad bands
1227
+ logger.debug("Formatting output...")
1228
+ try:
1229
+ formatted = format_stats_dataframe(
1230
+ df=combined,
1231
+ area_col=f"{geometry_area_column}_sum",
1232
+ decimal_places=decimal_places,
1233
+ unit_type=unit_type,
1234
+ remove_columns=True,
1235
+ convert_water_flag=True,
1236
+ )
1237
+ except Exception as e:
1238
+ # If formatting fails, try recreating the image with validation
1239
+ logger.warning(
1240
+ f"Formatting failed: {str(e)[:100]}. Attempting to recreate image with band validation..."
1241
+ )
1242
+ try:
1243
+ with redirect_stdout(io.StringIO()):
1244
+ whisp_image_validated = combine_datasets(
1245
+ national_codes=national_codes, validate_bands=True
1246
+ )
1247
+
1248
+ # Reprocess batches with validated image - create a local process function
1249
+ logger.info("Reprocessing batches with validated image...")
1250
+ results_validated = []
1251
+
1252
+ def process_batch_validated(
1253
+ batch_idx: int, batch: gpd.GeoDataFrame
1254
+ ) -> Tuple[int, pd.DataFrame, pd.DataFrame]:
1255
+ """Process one batch with validated image."""
1256
+ with ee_semaphore:
1257
+ fc = convert_batch_to_ee(batch)
1258
+ if add_metadata_server:
1259
+ fc = extract_centroid_and_geomtype_server(fc)
1260
+ df_server = process_ee_batch(
1261
+ fc,
1262
+ whisp_image_validated,
1263
+ reducer,
1264
+ batch_idx,
1265
+ max_retries,
1266
+ logger,
1267
+ )
1268
+ df_client = extract_centroid_and_geomtype_client(
1269
+ batch,
1270
+ external_id_column=external_id_column,
1271
+ return_attributes_only=True,
1272
+ )
1273
+ return batch_idx, df_server, df_client
1274
+
1275
+ with ThreadPoolExecutor(max_workers=pool_workers) as executor:
1276
+ futures = {
1277
+ executor.submit(process_batch_validated, i, batch): i
1278
+ for i, batch in enumerate(batches)
1279
+ }
1280
+
1281
+ for future in as_completed(futures):
1282
+ try:
1283
+ batch_idx, df_server, df_client = future.result()
1284
+ if plot_id_column not in df_server.columns:
1285
+ df_server[plot_id_column] = range(len(df_server))
1286
+
1287
+ # Drop external_id_column from df_client if it exists (already in df_server)
1288
+ if (
1289
+ external_id_column
1290
+ and external_id_column in df_client.columns
1291
+ ):
1292
+ df_client = df_client.drop(columns=[external_id_column])
1293
+
1294
+ merged = df_server.merge(
1295
+ df_client,
1296
+ on=plot_id_column,
1297
+ how="left",
1298
+ suffixes=("", "_client"),
1299
+ )
1300
+ results_validated.append(merged)
1301
+ except Exception as batch_e:
1302
+ logger.error(
1303
+ f"Batch reprocessing error: {str(batch_e)[:100]}"
1304
+ )
1305
+
1306
+ if results_validated:
1307
+ # Concatenate with explicit dtype handling to suppress FutureWarning
1308
+ combined = pd.concat(
1309
+ results_validated, ignore_index=True, sort=False
1310
+ )
1311
+ # Ensure all column names are strings (fixes pandas .str accessor issues later)
1312
+ combined.columns = combined.columns.astype(str)
1313
+
1314
+ # Clean up duplicate external_id columns created by merges
1315
+ if external_id_column:
1316
+ external_id_variants = [
1317
+ col
1318
+ for col in combined.columns
1319
+ if external_id_column.lower() in col.lower()
1320
+ ]
1321
+
1322
+ if external_id_variants:
1323
+ base_col = external_id_column
1324
+ if (
1325
+ base_col not in combined.columns
1326
+ and external_id_variants
1327
+ ):
1328
+ base_col = external_id_variants[0]
1329
+ combined = combined.rename(
1330
+ columns={base_col: "external_id"}
1331
+ )
1332
+
1333
+ cols_to_drop = [
1334
+ c for c in external_id_variants if c != base_col
1335
+ ]
1336
+ combined = combined.drop(
1337
+ columns=cols_to_drop, errors="ignore"
1338
+ )
1339
+
1340
+ # plotId column is already present, just ensure it's at position 0
1341
+ if plot_id_column in combined.columns:
1342
+ combined = combined[
1343
+ [plot_id_column]
1344
+ + [col for col in combined.columns if col != plot_id_column]
1345
+ ]
1346
+
1347
+ # Add admin context again
1348
+ try:
1349
+ from openforis_whisp.parameters.lookup_gaul1_admin import (
1350
+ lookup_dict,
1351
+ )
1352
+
1353
+ combined = join_admin_codes(
1354
+ df=combined,
1355
+ lookup_dict=lookup_dict,
1356
+ id_col="admin_code_median",
1357
+ )
1358
+ except ImportError:
1359
+ logger.warning(
1360
+ "Could not import lookup dictionary - admin context not added"
1361
+ )
1362
+
1363
+ # Try formatting again with validated data
1364
+ formatted = format_stats_dataframe(
1365
+ df=combined,
1366
+ area_col=f"{geometry_area_column}_sum",
1367
+ decimal_places=decimal_places,
1368
+ unit_type=unit_type,
1369
+ remove_columns=True,
1370
+ convert_water_flag=True,
1371
+ )
1372
+ else:
1373
+ logger.error(" Reprocessing with validation produced no results")
1374
+ return pd.DataFrame()
1375
+ except Exception as retry_e:
1376
+ logger.error(
1377
+ f"Failed to recover from formatting error: {str(retry_e)[:100]}"
1378
+ )
1379
+ raise retry_e
1380
+
1381
+ logger.info(f"Processed {len(formatted):,} features successfully")
1382
+ return formatted
1383
+ else:
1384
+ logger.error(" No results produced")
1385
+ return pd.DataFrame()
1386
+
1387
+
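A minimal end-to-end sketch of the concurrent path, assuming Earth Engine is already authenticated; the file path and national code are placeholders:

    import ee

    ee.Initialize(opt_url="https://earthengine-highvolume.googleapis.com")

    df = whisp_stats_geojson_to_df_concurrent(
        input_geojson_filepath="plots.geojson",  # hypothetical input
        national_codes=["CI"],                   # optional national datasets
        unit_type="ha",
        batch_size=10,
        max_concurrent=20,
    )
    df.to_csv("whisp_stats.csv", index=False)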
1388
+ # ============================================================================
1389
+ # SEQUENTIAL PROCESSING (STANDARD ENDPOINT)
1390
+ # ============================================================================
1391
+
1392
+
1393
+ def whisp_stats_geojson_to_df_sequential(
1394
+ input_geojson_filepath: str,
1395
+ external_id_column: str = None,
1396
+ remove_geom: bool = False,
1397
+ national_codes: List[str] = None,
1398
+ unit_type: str = "ha",
1399
+ whisp_image: ee.Image = None,
1400
+ custom_bands: Dict[str, Any] = None,
1401
+ add_metadata_client_side: bool = True,
1402
+ logger: logging.Logger = None,
1403
+ # Format parameters (auto-detect from config if not provided)
1404
+ decimal_places: int = None,
1405
+ ) -> pd.DataFrame:
1406
+ """
1407
+ Process GeoJSON sequentially using standard EE endpoint with automatic formatting.
1408
+
1409
+ Uses reduceRegions for server-side processing and client-side metadata
1410
+ extraction via GeoPandas. Suitable for smaller datasets or when high-volume
1411
+ endpoint is not available. Automatically formats output.
1412
+
1413
+ Requires: standard EE endpoint (default)
1414
+
1415
+ Parameters
1416
+ ----------
1417
+ input_geojson_filepath : str
1418
+ Path to input GeoJSON
1419
+ external_id_column : str, optional
1420
+ Column name for external IDs
1421
+ remove_geom : bool
1422
+ Remove geometry from output
1423
+ national_codes : List[str], optional
1424
+ ISO2 codes for national datasets
1425
+ unit_type : str
1426
+ "ha" or "percent"
1427
+ whisp_image : ee.Image, optional
1428
+ Pre-combined image
1429
+ custom_bands : Dict[str, Any], optional
1430
+ Custom band information
1431
+ add_metadata_client_side : bool
1432
+ Add client-side metadata (recommended)
1433
+ logger : logging.Logger, optional
1434
+ Logger for output
1435
+ decimal_places : int, optional
1436
+ Decimal places for formatting. If None, auto-detects from config.
1437
+
1438
+ Returns
1439
+ -------
1440
+ pd.DataFrame
1441
+ Formatted results DataFrame
1442
+ """
1443
+ from openforis_whisp.reformat import format_stats_dataframe
1444
+
1445
+ logger = logger or logging.getLogger("whisp-concurrent")
1446
+
1447
+ # Suppress verbose output from dependencies (sequential path runs a single worker)
1448
+ _suppress_verbose_output(max_concurrent=1)
1449
+
1450
+ # Auto-detect decimal places from config if not provided
1451
+ if decimal_places is None:
1452
+ decimal_places = _extract_decimal_places(stats_area_columns_formatting)
1453
+ logger.debug(f"Using decimal_places={decimal_places} from config")
1454
+
1455
+ # Validate endpoint
1456
+ validate_ee_endpoint("standard", raise_error=True)
1457
+
1458
+ # Load GeoJSON with output suppressed
1459
+ gdf = _load_geojson_silently(input_geojson_filepath)
1460
+ logger.info(f"Loaded {len(gdf):,} features")
1461
+
1462
+ # Clean geometries
1463
+ gdf = clean_geodataframe(gdf, logger=logger)
1464
+
1465
+ # Add stable plotIds for merging (starting from 1, not 0)
1466
+ gdf[plot_id_column] = range(1, len(gdf) + 1)
1467
+
1468
+ # Add stable row IDs
1469
+ row_id_col = "__row_id__"
1470
+ gdf[row_id_col] = range(len(gdf))
1471
+
1472
+ # Create image if not provided
1473
+ if whisp_image is None:
1474
+ logger.debug("Creating Whisp image...")
1475
+ # Suppress print statements from combine_datasets
1476
+ with redirect_stdout(io.StringIO()):
1477
+ try:
1478
+ # First try without validation
1479
+ whisp_image = combine_datasets(
1480
+ national_codes=national_codes, validate_bands=False
1481
+ )
1482
+ except Exception as e:
1483
+ logger.warning(
1484
+ f"First attempt failed: {str(e)[:100]}. Retrying with validate_bands=True..."
1485
+ )
1486
+ # Retry with validation to catch and fix bad bands
1487
+ whisp_image = combine_datasets(
1488
+ national_codes=national_codes, validate_bands=True
1489
+ )
1490
+
1491
+ # Convert to EE (suppress print statements from convert_geojson_to_ee)
1492
+ logger.debug("Converting to EE FeatureCollection...")
1493
+ with redirect_stdout(io.StringIO()):
1494
+ fc = convert_geojson_to_ee(json.loads(gdf.to_json()))  # use the cleaned gdf so plotId and __row_id__ reach EE
1495
+
1496
+ # Create reducer
1497
+ reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
1498
+
1499
+ # Process server-side with error handling for bad bands
1500
+ logger.info("Processing with Earth Engine...")
1501
+ try:
1502
+ results_fc = whisp_image.reduceRegions(collection=fc, reducer=reducer, scale=10)
1503
+ df_server = convert_ee_to_df(results_fc)
1504
+ except Exception as e:
1505
+ # Check if this is a band error
1506
+ error_msg = str(e)
1507
+ is_band_error = any(
1508
+ keyword in error_msg
1509
+ for keyword in ["Image.load", "asset", "not found", "does not exist"]
1510
+ )
1511
+
1512
+ if is_band_error and whisp_image is not None:
1513
+ logger.warning(
1514
+ f"Detected bad band error: {error_msg[:100]}. Retrying with validate_bands=True..."
1515
+ )
1516
+ try:
1517
+ with redirect_stdout(io.StringIO()):
1518
+ whisp_image = combine_datasets(
1519
+ national_codes=national_codes, validate_bands=True
1520
+ )
1521
+ logger.info("Image recreated with validation. Retrying processing...")
1522
+ results_fc = whisp_image.reduceRegions(
1523
+ collection=fc, reducer=reducer, scale=10
1524
+ )
1525
+ df_server = convert_ee_to_df(results_fc)
1526
+ except Exception as retry_e:
1527
+ logger.error(f"Retry failed: {str(retry_e)[:100]}")
1528
+ raise
1529
+ else:
1530
+ raise
1531
+
1532
+ logger.debug("Server-side processing complete")
1533
+
1534
+ # Add row_id if missing
1535
+ if row_id_col not in df_server.columns:
1536
+ df_server[row_id_col] = range(len(df_server))
1537
+
1538
+ # Add client-side metadata if requested
1539
+ if add_metadata_client_side:
1540
+ logger.debug("Extracting client-side metadata...")
1541
+ df_client = extract_centroid_and_geomtype_client(
1542
+ gdf,
1543
+ external_id_column=external_id_column,
1544
+ return_attributes_only=True,
1545
+ )
1546
+
1547
+ # Drop external_id_column from df_client if it exists (already in df_server)
1548
+ if external_id_column and external_id_column in df_client.columns:
1549
+ df_client = df_client.drop(columns=[external_id_column])
1550
+
1551
+ # Merge
1552
+ result = df_server.merge(
1553
+ df_client, on=row_id_col, how="left", suffixes=("", "_client")
1554
+ )
1555
+ else:
1556
+ result = df_server
1557
+
1558
+ # Remove internal __row_id__ column if present
1559
+ if row_id_col in result.columns:
1560
+ result = result.drop(columns=[row_id_col])
1561
+
1562
+ # Format the output
1563
+ # Add admin context (Country, ProducerCountry, Admin_Level_1) from admin_code
1564
+ # MUST be done BEFORE formatting (which removes _median columns)
1565
+ logger.debug("Adding administrative context...")
1566
+ try:
1567
+ from openforis_whisp.parameters.lookup_gaul1_admin import lookup_dict
1568
+
1569
+ result = join_admin_codes(
1570
+ df=result, lookup_dict=lookup_dict, id_col="admin_code_median"
1571
+ )
1572
+ except ImportError:
1573
+ logger.warning("Could not import lookup dictionary - admin context not added")
1574
+
1575
+ # Format the output
1576
+ logger.debug("Formatting output...")
1577
+ formatted = format_stats_dataframe(
1578
+ df=result,
1579
+ area_col=f"{geometry_area_column}_sum",
1580
+ decimal_places=decimal_places,
1581
+ unit_type=unit_type,
1582
+ remove_columns=True,
1583
+ convert_water_flag=True,
1584
+ )
1585
+
1586
+ logger.info(f"Processed {len(formatted):,} features")
1587
+
1588
+ # Consolidate external_id_column to standardized 'external_id'
1589
+ if external_id_column:
1590
+ variants = [
1591
+ col
1592
+ for col in formatted.columns
1593
+ if external_id_column.lower() in col.lower()
1594
+ ]
1595
+ if variants:
1596
+ base_col = (
1597
+ external_id_column
1598
+ if external_id_column in formatted.columns
1599
+ else variants[0]
1600
+ )
1601
+ if base_col != "external_id":
1602
+ formatted = formatted.rename(columns={base_col: "external_id"})
1603
+ # Drop other variants
1604
+ formatted = formatted.drop(
1605
+ columns=[c for c in variants if c != base_col], errors="ignore"
1606
+ )
1607
+
1608
+ return formatted
1609
+
1610
+
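The equivalent sketch for the sequential path on the standard endpoint (file path is a placeholder):

    import ee

    ee.Initialize()  # standard endpoint, as required by the sequential path

    df = whisp_stats_geojson_to_df_sequential(
        input_geojson_filepath="plots.geojson",
        unit_type="percent",
        add_metadata_client_side=True,
    )
    print(df.head())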
1611
+ # ============================================================================
1612
+ # FORMATTED WRAPPER FUNCTIONS (STATS + FORMAT)
1613
+ # ============================================================================
1614
+
1615
+
1616
+ def whisp_formatted_stats_geojson_to_df_concurrent(
1617
+ input_geojson_filepath: str,
1618
+ external_id_column: str = None,
1619
+ remove_geom: bool = False,
1620
+ national_codes: List[str] = None,
1621
+ unit_type: str = "ha",
1622
+ whisp_image: ee.Image = None,
1623
+ custom_bands: Dict[str, Any] = None,
1624
+ batch_size: int = 10,
1625
+ max_concurrent: int = 20,
1626
+ validate_geometries: bool = True,
1627
+ max_retries: int = 3,
1628
+ add_metadata_server: bool = False,
1629
+ logger: logging.Logger = None,
1630
+ # Format parameters (auto-detect from config if not provided)
1631
+ decimal_places: int = None,
1632
+ remove_median_columns: bool = True,
1633
+ convert_water_flag: bool = True,
1634
+ water_flag_threshold: float = 0.5,
1635
+ sort_column: str = "plotId",
1636
+ ) -> pd.DataFrame:
1637
+ """
1638
+ Process GeoJSON concurrently with automatic formatting and validation.
1639
+
1640
+ Combines whisp_stats_geojson_to_df_concurrent + format_stats_dataframe + validation
1641
+ for a complete pipeline: extract stats → convert units → format output → validate schema.
1642
+
1643
+ Uses high-volume endpoint and concurrent batching.
1644
+
1645
+ Parameters
1646
+ ----------
1647
+ input_geojson_filepath : str
1648
+ Path to input GeoJSON file
1649
+ external_id_column : str, optional
1650
+ Column name for external IDs
1651
+ remove_geom : bool
1652
+ Remove geometry column from output
1653
+ national_codes : List[str], optional
1654
+ ISO2 codes for national datasets
1655
+ unit_type : str
1656
+ "ha" or "percent"
1657
+ whisp_image : ee.Image, optional
1658
+ Pre-combined image
1659
+ custom_bands : Dict[str, Any], optional
1660
+ Custom band information
1661
+ batch_size : int
1662
+ Features per batch (default 10)
1663
+ max_concurrent : int
1664
+ Maximum concurrent EE calls (default 20)
1665
+ validate_geometries : bool
1666
+ Validate and clean geometries (default True)
1667
+ max_retries : int
1668
+ Retry attempts per batch (default 3)
1669
+ add_metadata_server : bool
1670
+ Add metadata server-side (default False)
1671
+ logger : logging.Logger, optional
1672
+ Logger for output
1673
+ decimal_places : int, optional
1674
+ Decimal places for rounding. If None, auto-detects from config:
1675
+ - Area columns: geometry_area_column_formatting
1676
+ - Percent columns: stats_percent_columns_formatting
1677
+ - Other columns: stats_area_columns_formatting
1678
+ remove_median_columns : bool
1679
+ Remove '_median' columns (default True)
1680
+ convert_water_flag : bool
1681
+ Convert water flag to boolean (default True)
1682
+ water_flag_threshold : float
1683
+ Water flag ratio threshold (default 0.5)
1684
+ sort_column : str
1685
+ Column to sort by (default "plotId", None to skip)
1686
+
1687
+ Returns
1688
+ -------
1689
+ pd.DataFrame
1690
+ Validated, formatted results DataFrame
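+
+ Examples
+ --------
+ Illustrative sketch only; assumes a local "plots.geojson" file and an
+ initialized Earth Engine session.
+
+ >>> df = whisp_formatted_stats_geojson_to_df_concurrent(
+ ... "plots.geojson",
+ ... national_codes=["CI"],
+ ... batch_size=10,
+ ... max_concurrent=20,
+ ... )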
1691
+ """
1692
+ from openforis_whisp.reformat import format_stats_dataframe
1693
+
1694
+ logger = logger or logging.getLogger("whisp-concurrent")
1695
+
1696
+ # Auto-detect decimal places from config if not provided
1697
+ if decimal_places is None:
1698
+ # Use stats_area_columns_formatting as default for most columns
1699
+ decimal_places = _extract_decimal_places(stats_area_columns_formatting)
1700
+ logger.debug(f"Using decimal_places={decimal_places} from config")
1701
+
1702
+ # Step 1: Get raw stats
1703
+ logger.debug("Step 1/2: Extracting statistics (concurrent)...")
1704
+ df_raw = whisp_stats_geojson_to_df_concurrent(
1705
+ input_geojson_filepath=input_geojson_filepath,
1706
+ external_id_column=external_id_column,
1707
+ remove_geom=remove_geom,
1708
+ national_codes=national_codes,
1709
+ unit_type=unit_type,
1710
+ whisp_image=whisp_image,
1711
+ custom_bands=custom_bands,
1712
+ batch_size=batch_size,
1713
+ max_concurrent=max_concurrent,
1714
+ validate_geometries=validate_geometries,
1715
+ max_retries=max_retries,
1716
+ add_metadata_server=add_metadata_server,
1717
+ logger=logger,
1718
+ )
1719
+
1720
+ # Step 2: Format the output
1721
+ logger.debug("Step 2/2: Formatting output...")
1722
+ median_cols_before = [c for c in df_raw.columns if c.endswith("_median")]
1723
+ logger.debug(
1724
+ f"Columns ending with '_median' BEFORE formatting: {median_cols_before}"
1725
+ )
1726
+
1727
+ df_formatted = format_stats_dataframe(
1728
+ df=df_raw,
1729
+ area_col=f"{geometry_area_column}_sum",
1730
+ decimal_places=decimal_places,
1731
+ unit_type=unit_type,
1732
+ remove_columns=remove_median_columns,
1733
+ convert_water_flag=convert_water_flag,
1734
+ water_flag_threshold=water_flag_threshold,
1735
+ sort_column=sort_column,
1736
+ )
1737
+
1738
+ median_cols_after = [c for c in df_formatted.columns if c.endswith("_median")]
1739
+ logger.debug(f"Columns ending with '_median' AFTER formatting: {median_cols_after}")
1740
+
1741
+ # Step 2b: Reformat geometry and handle point areas
1742
+ try:
1743
+ df_formatted = reformat_geometry_type(df_formatted)
1744
+ except Exception as e:
1745
+ logger.warning(f"Error reformatting geometry type: {e}")
1746
+
1747
+ try:
1748
+ df_formatted = set_point_geometry_area_to_zero(df_formatted)
1749
+ except Exception as e:
1750
+ logger.warning(f"Error setting point geometry area to zero: {e}")
1751
+
1752
+ # Step 3: Validate against schema
1753
+ logger.debug("Step 3/3: Validating against schema...")
1754
+ from openforis_whisp.reformat import validate_dataframe_using_lookups_flexible
1755
+
1756
+ df_validated = validate_dataframe_using_lookups_flexible(
1757
+ df_stats=df_formatted,
1758
+ national_codes=national_codes,
1759
+ custom_bands=custom_bands,
1760
+ )
1761
+
1762
+ logger.info("Concurrent processing + formatting + validation complete")
1763
+ return df_validated
1764
+
1765
+
1766
+ def whisp_formatted_stats_geojson_to_df_sequential(
1767
+ input_geojson_filepath: str,
1768
+ external_id_column: str = None,
1769
+ remove_geom: bool = False,
1770
+ national_codes: List[str] = None,
1771
+ unit_type: str = "ha",
1772
+ whisp_image: ee.Image = None,
1773
+ custom_bands: Dict[str, Any] = None,
1774
+ add_metadata_client_side: bool = True,
1775
+ logger: logging.Logger = None,
1776
+ # Format parameters (auto-detect from config if not provided)
1777
+ decimal_places: int = None,
1778
+ remove_median_columns: bool = True,
1779
+ convert_water_flag: bool = True,
1780
+ water_flag_threshold: float = 0.5,
1781
+ sort_column: str = "plotId",
1782
+ ) -> pd.DataFrame:
1783
+ """
1784
+ Process GeoJSON sequentially with automatic formatting and validation.
1785
+
1786
+ Combines whisp_stats_geojson_to_df_sequential + format_stats_dataframe + validation
1787
+ for a complete pipeline: extract stats → convert units → format output → validate schema.
1788
+
1789
+ Uses the standard endpoint for sequential processing.
1790
+
1791
+ Parameters
1792
+ ----------
1793
+ input_geojson_filepath : str
1794
+ Path to input GeoJSON file
1795
+ external_id_column : str, optional
1796
+ Column name for external IDs
1797
+ remove_geom : bool
1798
+ Remove geometry from output
1799
+ national_codes : List[str], optional
1800
+ ISO2 codes for national datasets
1801
+ unit_type : str
1802
+ "ha" or "percent"
1803
+ whisp_image : ee.Image, optional
1804
+ Pre-combined image
1805
+ custom_bands : Dict[str, Any], optional
1806
+ Custom band information
1807
+ add_metadata_client_side : bool
1808
+ Add client-side metadata (default True)
1809
+ logger : logging.Logger, optional
1810
+ Logger for output
1811
+ decimal_places : int, optional
1812
+ Decimal places for rounding. If None, auto-detects from config:
1813
+ - Area columns: geometry_area_column_formatting
1814
+ - Percent columns: stats_percent_columns_formatting
1815
+ - Other columns: stats_area_columns_formatting
1816
+ remove_median_columns : bool
1817
+ Remove '_median' columns (default True)
1818
+ convert_water_flag : bool
1819
+ Convert water flag to boolean (default True)
1820
+ water_flag_threshold : float
1821
+ Water flag ratio threshold (default 0.5)
1822
+ sort_column : str
1823
+ Column to sort by (default "plotId", None to skip)
1824
+
1825
+ Returns
1826
+ -------
1827
+ pd.DataFrame
1828
+ Validated, formatted results DataFrame
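+
+ Examples
+ --------
+ Illustrative sketch only; assumes a local "plots.geojson" file with a
+ hypothetical "farm_id" property and an initialized Earth Engine session.
+
+ >>> df = whisp_formatted_stats_geojson_to_df_sequential(
+ ... "plots.geojson",
+ ... external_id_column="farm_id",
+ ... unit_type="percent",
+ ... )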
1829
+ """
1830
+ from openforis_whisp.reformat import format_stats_dataframe
1831
+
1832
+ logger = logger or logging.getLogger("whisp-concurrent")
1833
+
1834
+ # Auto-detect decimal places from config if not provided
1835
+ if decimal_places is None:
1836
+ # Use stats_area_columns_formatting as default for most columns
1837
+ decimal_places = _extract_decimal_places(stats_area_columns_formatting)
1838
+ logger.debug(f"Using decimal_places={decimal_places} from config")
1839
+
1840
+ # Step 1: Get raw stats
1841
+ logger.debug("Step 1/2: Extracting statistics (sequential)...")
1842
+ df_raw = whisp_stats_geojson_to_df_sequential(
1843
+ input_geojson_filepath=input_geojson_filepath,
1844
+ external_id_column=external_id_column,
1845
+ remove_geom=remove_geom,
1846
+ national_codes=national_codes,
1847
+ unit_type=unit_type,
1848
+ whisp_image=whisp_image,
1849
+ custom_bands=custom_bands,
1850
+ add_metadata_client_side=add_metadata_client_side,
1851
+ logger=logger,
1852
+ )
1853
+
1854
+ # Step 2: Format the output
1855
+ logger.debug("Step 2/2: Formatting output...")
1856
+ median_cols_before = [c for c in df_raw.columns if c.endswith("_median")]
1857
+ logger.debug(
1858
+ f"Columns ending with '_median' BEFORE formatting: {median_cols_before}"
1859
+ )
1860
+
1861
+ df_formatted = format_stats_dataframe(
1862
+ df=df_raw,
1863
+ area_col=f"{geometry_area_column}_sum",
1864
+ decimal_places=decimal_places,
1865
+ unit_type=unit_type,
1866
+ remove_columns=remove_median_columns,
1867
+ convert_water_flag=convert_water_flag,
1868
+ water_flag_threshold=water_flag_threshold,
1869
+ sort_column=sort_column,
1870
+ )
1871
+
1872
+ median_cols_after = [c for c in df_formatted.columns if c.endswith("_median")]
1873
+ logger.debug(f"Columns ending with '_median' AFTER formatting: {median_cols_after}")
1874
+
1875
+ # Step 2b: Reformat geometry and handle point areas
1876
+ try:
1877
+ df_formatted = reformat_geometry_type(df_formatted)
1878
+ except Exception as e:
1879
+ logger.warning(f"Error reformatting geometry type: {e}")
1880
+
1881
+ try:
1882
+ df_formatted = set_point_geometry_area_to_zero(df_formatted)
1883
+ except Exception as e:
1884
+ logger.warning(f"Error setting point geometry area to zero: {e}")
1885
+
1886
+ # Step 3: Validate against schema
1887
+ logger.debug("Step 3/3: Validating against schema...")
1888
+ from openforis_whisp.reformat import validate_dataframe_using_lookups_flexible
1889
+
1890
+ df_validated = validate_dataframe_using_lookups_flexible(
1891
+ df_stats=df_formatted,
1892
+ national_codes=national_codes,
1893
+ custom_bands=custom_bands,
1894
+ )
1895
+
1896
+ logger.info("Sequential processing + formatting + validation complete")
1897
+ return df_validated
1898
+
1899
+
1900
+ # ============================================================================
1901
+ # FAST PROCESSING WITH AUTO-ROUTING
1902
+ # ============================================================================
1903
+
1904
+
1905
+ def whisp_formatted_stats_geojson_to_df_fast(
1906
+ input_geojson_filepath: str,
1907
+ external_id_column: str = None,
1908
+ remove_geom: bool = False,
1909
+ national_codes: List[str] = None,
1910
+ unit_type: str = "ha",
1911
+ whisp_image: ee.Image = None,
1912
+ custom_bands: Dict[str, Any] = None,
1913
+ mode: str = "auto",
1914
+ # Concurrent-specific parameters
1915
+ batch_size: int = 10,
1916
+ max_concurrent: int = 20,
1917
+ validate_geometries: bool = True,
1918
+ max_retries: int = 3,
1919
+ add_metadata_server: bool = False,
1920
+ # Format parameters (auto-detect from config if not provided)
1921
+ decimal_places: int = None,
1922
+ remove_median_columns: bool = True,
1923
+ convert_water_flag: bool = True,
1924
+ water_flag_threshold: float = 0.5,
1925
+ sort_column: str = "plotId",
1926
+ ) -> pd.DataFrame:
1927
+ """
1928
+ Process GeoJSON to Whisp statistics, automatically choosing the fastest processing path.
1929
+
1930
+ Automatically selects between concurrent (high-volume endpoint) and sequential
1931
+ (standard endpoint) based on file size, or allows explicit mode selection.
1932
+
1933
+ This is the recommended entry point for most users who want automatic optimization.
1934
+
1935
+ Parameters
1936
+ ----------
1937
+ input_geojson_filepath : str
1938
+ Path to input GeoJSON file
1939
+ external_id_column : str, optional
1940
+ Column name for external IDs
1941
+ remove_geom : bool
1942
+ Remove geometry column from output
1943
+ national_codes : List[str], optional
1944
+ ISO2 codes for national datasets
1945
+ unit_type : str
1946
+ "ha" or "percent"
1947
+ whisp_image : ee.Image, optional
1948
+ Pre-combined image
1949
+ custom_bands : Dict[str, Any], optional
1950
+ Custom band information
1951
+ mode : str
1952
+ Processing mode:
1953
+ - "auto": Choose based on file size (default)
1954
+ * <1MB: sequential
1955
+ * 1-5MB: sequential
1956
+ * >5MB: concurrent
1957
+ - "concurrent": Force high-volume endpoint (batch processing)
1958
+ - "sequential": Force standard endpoint (single-threaded)
1959
+ batch_size : int
1960
+ Features per batch (only for concurrent mode)
1961
+ max_concurrent : int
1962
+ Maximum concurrent EE calls (only for concurrent mode)
1963
+ validate_geometries : bool
1964
+ Validate and clean geometries (only for concurrent mode)
1965
+ max_retries : int
1966
+ Retry attempts per batch (only for concurrent mode)
1967
+ add_metadata_server : bool
1968
+ Add metadata server-side (only for concurrent mode)
1969
+ decimal_places : int, optional
1970
+ Decimal places for rounding. If None, auto-detects from config.
1971
+ remove_median_columns : bool
1972
+ Remove '_median' columns
1973
+ convert_water_flag : bool
1974
+ Convert water flag to boolean
1975
+ water_flag_threshold : float
1976
+ Water flag ratio threshold
1977
+ sort_column : str
1978
+ Column to sort by
1979
+
1980
+ Returns
1981
+ -------
1982
+ pd.DataFrame
1983
+ Validated, formatted results DataFrame
1984
+
1985
+ Examples
1986
+ --------
1987
+ >>> # Auto-detect best method based on file size
1988
+ >>> df = whisp_formatted_stats_geojson_to_df_fast("data.geojson")
1989
+
1990
+ >>> # Force concurrent processing for large datasets
1991
+ >>> df = whisp_formatted_stats_geojson_to_df_fast(
1992
+ ... "large_data.geojson",
1993
+ ... mode="concurrent"
1994
+ ... )
1995
+
1996
+ >>> # Use sequential for guaranteed completion
1997
+ >>> df = whisp_formatted_stats_geojson_to_df_fast(
1998
+ ... "data.geojson",
1999
+ ... mode="sequential"
2000
+ ... )
2001
+ """
2002
+ logger = logging.getLogger("whisp-concurrent")
2003
+
2004
+ # Determine processing mode
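+ # e.g. a 12 MB GeoJSON routes to the concurrent path, a 2 MB file stays
+ # sequential (threshold: 5 MB on-disk file size)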
2005
+ if mode == "auto":
2006
+ try:
2007
+ file_size = Path(input_geojson_filepath).stat().st_size
2008
+ if file_size > 5_000_000: # >5MB
2009
+ chosen_mode = "concurrent"
2010
+ logger.info(
2011
+ f"File size {file_size/1e6:.1f}MB → Using concurrent (high-volume endpoint)"
2012
+ )
2013
+ else: # <=5MB
2014
+ chosen_mode = "sequential"
2015
+ logger.info(
2016
+ f"File size {file_size/1e6:.1f}MB → Using sequential (standard endpoint)"
2017
+ )
2018
+ except Exception as e:
2019
+ logger.warning(
2020
+ f"Could not determine file size: {e}. Defaulting to sequential."
2021
+ )
2022
+ chosen_mode = "sequential"
2023
+ elif mode in ("concurrent", "sequential"):
2024
+ chosen_mode = mode
2025
+ logger.info(f"Mode explicitly set to: {mode}")
2026
+ else:
2027
+ raise ValueError(
2028
+ f"Invalid mode '{mode}'. Must be 'auto', 'concurrent', or 'sequential'."
2029
+ )
2030
+
2031
+ # Route to appropriate function
2032
+ if chosen_mode == "concurrent":
2033
+ logger.debug("Routing to concurrent processing...")
2034
+ return whisp_formatted_stats_geojson_to_df_concurrent(
2035
+ input_geojson_filepath=input_geojson_filepath,
2036
+ external_id_column=external_id_column,
2037
+ remove_geom=remove_geom,
2038
+ national_codes=national_codes,
2039
+ unit_type=unit_type,
2040
+ whisp_image=whisp_image,
2041
+ custom_bands=custom_bands,
2042
+ batch_size=batch_size,
2043
+ max_concurrent=max_concurrent,
2044
+ validate_geometries=validate_geometries,
2045
+ max_retries=max_retries,
2046
+ add_metadata_server=add_metadata_server,
2047
+ logger=logger,
2048
+ decimal_places=decimal_places,
2049
+ remove_median_columns=remove_median_columns,
2050
+ convert_water_flag=convert_water_flag,
2051
+ water_flag_threshold=water_flag_threshold,
2052
+ sort_column=sort_column,
2053
+ )
2054
+ else: # sequential
2055
+ logger.debug("Routing to sequential processing...")
2056
+ return whisp_formatted_stats_geojson_to_df_sequential(
2057
+ input_geojson_filepath=input_geojson_filepath,
2058
+ external_id_column=external_id_column,
2059
+ remove_geom=remove_geom,
2060
+ national_codes=national_codes,
2061
+ unit_type=unit_type,
2062
+ whisp_image=whisp_image,
2063
+ custom_bands=custom_bands,
2064
+ logger=logger,
2065
+ decimal_places=decimal_places,
2066
+ remove_median_columns=remove_median_columns,
2067
+ convert_water_flag=convert_water_flag,
2068
+ water_flag_threshold=water_flag_threshold,
2069
+ sort_column=sort_column,
2070
+ )