openforis-whisp 2.0.0b3__py3-none-any.whl → 3.0.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2428 @@
1
+ """
2
+ Advanced statistics processing for WHISP - concurrent and sequential endpoints.
3
+
4
+ This module provides optimized functions for processing GeoJSON FeatureCollections
5
+ with Whisp datasets using concurrent batching (for high-volume processing)
6
+ and standard sequential processing.
7
+
8
+ NOTE: This module is in a transitional state. The plan is to eventually merge these
9
+ functions into stats.py and replace the standard functions there as the primary
10
+ implementation, deprecating the legacy versions.
11
+
12
+ Key features:
13
+ - whisp_stats_geojson_to_df_concurrent (high-volume endpoint, concurrent batching)
14
+ - whisp_stats_geojson_to_df_sequential (standard endpoint, sequential)
15
+ - Proper logging at different levels (WARNING, INFO, DEBUG)
16
+ - Progress tracking without external dependencies
17
+ - Client-side and server-side metadata extraction options
18
+ - Endpoint validation and warnings
19
+ """
20
+
21
+ import ee
22
+ import pandas as pd
23
+ import geopandas as gpd
24
+ import logging
25
+ import sys
26
+ import threading
27
+ import time
28
+ import warnings
29
+ import json
30
+ import io
31
+ import os
32
+ import subprocess
33
+ from contextlib import redirect_stdout, contextmanager
34
+ from pathlib import Path
35
+ from typing import Optional, List, Dict, Any, Tuple, Union
36
+ from concurrent.futures import ThreadPoolExecutor, as_completed
37
+ import tempfile
38
+
39
+ # ============================================================================
40
+ # STDOUT/STDERR SUPPRESSION CONTEXT MANAGER (for C-level output)
41
+ # ============================================================================
42
+
43
+
44
+ @contextmanager
45
+ def suppress_c_level_output():
46
+ """Suppress C-level stdout/stderr writes from libraries like Fiona."""
47
+ if sys.platform == "win32":
48
+ # Windows doesn't support dup2() reliably for STDOUT/STDERR
49
+ # Fall back to Python-level suppression
50
+ with redirect_stdout(io.StringIO()):
51
+ yield
52
+ else:
53
+ # Unix-like systems: use file descriptor redirection
54
+ saved_stdout = os.dup(1)
55
+ saved_stderr = os.dup(2)
56
+ try:
57
+ devnull = os.open(os.devnull, os.O_WRONLY)
58
+ os.dup2(devnull, 1)
59
+ os.dup2(devnull, 2)
60
+ yield
61
+ finally:
62
+ os.dup2(saved_stdout, 1)
63
+ os.dup2(saved_stderr, 2)
64
+ os.close(devnull)
65
+ os.close(saved_stdout)
66
+ os.close(saved_stderr)
67
+
68
+
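+ # A minimal usage sketch for the context manager above: any C-level writes
+ # emitted by GDAL/Fiona/pyogrio while reading a file are discarded. The file
+ # path is a hypothetical placeholder.
+ #
+ #     with suppress_c_level_output():
+ #         gdf = gpd.read_file("plots.geojson")  # noisy driver output is silenced
+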
69
+ # Suppress verbose warnings globally for this module
70
+ # Note: FutureWarnings are kept (they signal important API changes)
71
+ warnings.filterwarnings("ignore", category=UserWarning, message=".*geographic CRS.*")
72
+ warnings.simplefilter("ignore", UserWarning)
73
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
74
+
75
+ # Suppress verbose logging from GeoPandas/Fiona/pyogrio
76
+ logging.getLogger("fiona").setLevel(logging.WARNING)
77
+ logging.getLogger("fiona.ogrext").setLevel(logging.WARNING)
78
+ logging.getLogger("pyogrio").setLevel(logging.WARNING)
79
+ logging.getLogger("pyogrio._io").setLevel(logging.WARNING)
80
+
81
+ from openforis_whisp.parameters.config_runtime import (
82
+ plot_id_column,
83
+ external_id_column,
84
+ geometry_type_column,
85
+ geometry_area_column,
86
+ centroid_x_coord_column,
87
+ centroid_y_coord_column,
88
+ iso3_country_column,
89
+ iso2_country_column,
90
+ admin_1_column,
91
+ water_flag,
92
+ geometry_area_column_formatting,
93
+ stats_area_columns_formatting,
94
+ stats_percent_columns_formatting,
95
+ )
96
+ from openforis_whisp.data_conversion import (
97
+ convert_geojson_to_ee,
98
+ convert_ee_to_df,
99
+ convert_ee_to_geojson,
100
+ )
101
+ from openforis_whisp.datasets import combine_datasets
102
+ from openforis_whisp.reformat import validate_dataframe_using_lookups_flexible
103
+ from openforis_whisp.stats import (
104
+ reformat_geometry_type,
105
+ set_point_geometry_area_to_zero,
106
+ )
107
+
108
+
109
+ # ============================================================================
110
+ # LOGGING & PROGRESS UTILITIES
111
+ # ============================================================================
112
+
113
+
114
+ def _suppress_verbose_output(max_concurrent: int = None):
115
+ """
116
+ Suppress verbose warnings and logging from dependencies.
117
+
118
+     Sets urllib3 logger levels to CRITICAL to prevent
119
+     "Connection pool is full" warnings during high-concurrency scenarios.
120
+
121
+ Parameters
122
+ ----------
123
+ max_concurrent : int, optional
124
+         Maximum concurrent workers. Currently informational only: the urllib3
126
+         loggers listed below are always set to CRITICAL, which suppresses
127
+         "Connection pool is full" warnings regardless of the requested
128
+         concurrency level.
128
+ """
129
+ warnings.filterwarnings("ignore", category=UserWarning)
130
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
131
+
132
+ # Suppress urllib3 connection pool warnings via filters
133
+ warnings.filterwarnings("ignore", message=".*Connection pool is full.*")
134
+ warnings.filterwarnings("ignore", message=".*discarding connection.*")
135
+
136
+ # Set logger levels to WARNING to suppress INFO messages
137
+ for mod_name in [
138
+ "openforis_whisp.reformat",
139
+ "openforis_whisp.data_conversion",
140
+ "geopandas",
141
+ "fiona",
142
+ "pyogrio._io",
143
+ "urllib3",
144
+ ]:
145
+ logging.getLogger(mod_name).setLevel(logging.WARNING)
146
+
147
+ # ALL urllib3 loggers: use CRITICAL to suppress ALL connection pool warnings
148
+ # (these appear at WARNING level during high concurrency)
149
+ urllib3_loggers = [
150
+ "urllib3.connectionpool",
151
+ "urllib3.poolmanager",
152
+ "urllib3",
153
+ "requests.packages.urllib3.connectionpool",
154
+ "requests.packages.urllib3.poolmanager",
155
+ "requests.packages.urllib3",
156
+ ]
157
+
158
+ for logger_name in urllib3_loggers:
159
+ logging.getLogger(logger_name).setLevel(logging.CRITICAL)
160
+
161
+ # Suppress warning logs specifically from reformat module during validation
162
+ reformat_logger = logging.getLogger("openforis_whisp.reformat")
163
+ reformat_logger.setLevel(logging.ERROR)
164
+
165
+
166
+ def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
167
+ """Load GeoJSON file with all output suppressed."""
168
+ fiona_logger = logging.getLogger("fiona")
169
+ pyogrio_logger = logging.getLogger("pyogrio._io")
170
+ old_fiona_level = fiona_logger.level
171
+ old_pyogrio_level = pyogrio_logger.level
172
+ fiona_logger.setLevel(logging.CRITICAL)
173
+ pyogrio_logger.setLevel(logging.CRITICAL)
174
+
175
+ try:
176
+ with redirect_stdout(io.StringIO()):
177
+ gdf = gpd.read_file(filepath)
178
+ return gdf
179
+ finally:
180
+ fiona_logger.setLevel(old_fiona_level)
181
+ pyogrio_logger.setLevel(old_pyogrio_level)
182
+
183
+
184
+ def _extract_decimal_places(format_string: str) -> int:
185
+ """
186
+ Extract decimal places from a format string like '%.3f'.
187
+
188
+ Parameters
189
+ ----------
190
+ format_string : str
191
+ Format string (e.g., '%.3f' → 3)
192
+
193
+ Returns
194
+ -------
195
+ int
196
+ Number of decimal places
197
+ """
198
+ import re
199
+
200
+ match = re.search(r"\.(\d+)f", format_string)
201
+ if match:
202
+ return int(match.group(1))
203
+ return 2 # Default to 2 decimal places
204
+
205
+
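+ # Quick illustration of the helper above (format strings are examples only):
+ #     _extract_decimal_places("%.3f")  -> 3
+ #     _extract_decimal_places("%d")    -> 2 (no decimal spec, falls back to the default)
+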
206
+ def _normalize_keep_external_columns(
207
+ keep_external_columns: Union[bool, List[str]],
208
+ all_columns: List[str],
209
+ plot_id_column: str = "plotId",
210
+ ) -> List[str]:
211
+ """
212
+ Normalize keep_external_columns parameter to a list of column names.
213
+
214
+ Converts flexible user input (bool or list) to a concrete list of columns to keep.
215
+
216
+ Parameters
217
+ ----------
218
+ keep_external_columns : bool or List[str]
219
+ - False: keep nothing (return empty list)
220
+ - True: keep all columns except geometry and plot_id
221
+ - List[str]: keep specific columns (return as-is)
222
+ all_columns : List[str]
223
+ All available columns to choose from
224
+ plot_id_column : str
225
+ Name of plot ID column to exclude
226
+
227
+ Returns
228
+ -------
229
+ List[str]
230
+ Columns to keep from external (GeoJSON) data
231
+
232
+ Examples
233
+ --------
234
+ >>> cols = _normalize_keep_external_columns(False, ["id", "Country", "geom"], "id")
235
+ >>> cols
236
+ []
237
+
238
+ >>> cols = _normalize_keep_external_columns(True, ["id", "Country", "geom"], "id")
239
+ >>> cols
240
+ ['Country']
241
+
242
+ >>> cols = _normalize_keep_external_columns(["Country"], ["id", "Country", "geom"], "id")
243
+ >>> cols
244
+ ['Country']
245
+ """
246
+ if keep_external_columns is True:
247
+ # Keep all columns except geometry and plot_id
248
+ return [c for c in all_columns if c not in [plot_id_column, "geometry"]]
249
+ elif keep_external_columns is False:
250
+ # Keep nothing
251
+ return []
252
+ else:
253
+ # Use provided list (handle None case)
254
+ return keep_external_columns or []
255
+
256
+
257
+ def _add_admin_context(
258
+ df: pd.DataFrame, admin_code_col: str = "admin_code_median", debug: bool = False
259
+ ) -> pd.DataFrame:
260
+ """
261
+ Join admin codes to get Country, ProducerCountry, and Admin_Level_1 information.
262
+
263
+ Uses GAUL 2024 Level 1 administrative lookup to map admin codes to country and
264
+ administrative region names.
265
+
266
+ Parameters
267
+ ----------
268
+ df : pd.DataFrame
269
+ DataFrame with admin_code_median column from reduceRegions
270
+ admin_code_col : str
271
+ Name of the admin code column (default: "admin_code_median")
272
+ debug : bool
273
+ If True, print detailed debugging information (default: False)
274
+
275
+ Returns
276
+ -------
277
+ pd.DataFrame
278
+ DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
279
+ """
280
+ logger = logging.getLogger("whisp")
281
+
282
+ # Return early if admin code column doesn't exist
283
+ if admin_code_col not in df.columns:
284
+ logger.debug(f"Admin code column '{admin_code_col}' not found in dataframe")
285
+ if debug:
286
+ print(f"DEBUG: Admin code column '{admin_code_col}' not found")
287
+ print(f"DEBUG: Available columns: {df.columns.tolist()}")
288
+ return df
289
+
290
+ try:
291
+ from openforis_whisp.parameters.lookup_gaul1_admin import lookup_dict
292
+
293
+ if debug:
294
+ print(f"DEBUG: Found admin_code_col '{admin_code_col}'")
295
+ print(f"DEBUG: Sample values: {df[admin_code_col].head()}")
296
+ print(f"DEBUG: Value types: {df[admin_code_col].dtype}")
297
+ print(f"DEBUG: Null count: {df[admin_code_col].isna().sum()}")
298
+
299
+ # Create lookup dataframe
300
+ lookup_data = []
301
+ for gaul_code, info in lookup_dict.items():
302
+ lookup_data.append(
303
+ {
304
+ "gaul1_code": gaul_code,
305
+ "gaul1_name": info.get("gaul1_name"),
306
+ "iso3_code": info.get("iso3_code"),
307
+ "iso2_code": info.get("iso2_code"),
308
+ }
309
+ )
310
+
311
+ lookup_df = pd.DataFrame(lookup_data)
312
+
313
+ if debug:
314
+ print(f"DEBUG: Lookup dictionary has {len(lookup_df)} entries")
315
+ print(f"DEBUG: Sample lookup codes: {lookup_df['gaul1_code'].head()}")
316
+
317
+ # Prepare data for join
318
+ df = df.copy()
319
+ df["admin_code_for_join"] = df[admin_code_col].fillna(-9999).astype("int32")
320
+ lookup_df["gaul1_code"] = lookup_df["gaul1_code"].astype("int32")
321
+
322
+ if debug:
323
+ print(
324
+ f"DEBUG: Codes to join (first 10): {df['admin_code_for_join'].unique()[:10]}"
325
+ )
326
+
327
+ # Perform join
328
+ df_joined = df.merge(
329
+ lookup_df, left_on="admin_code_for_join", right_on="gaul1_code", how="left"
330
+ )
331
+
332
+ if debug:
333
+ matched = df_joined["iso3_code"].notna().sum()
334
+ print(f"DEBUG: Merge result - {matched}/{len(df_joined)} rows matched")
335
+ print(f"DEBUG: Sample matched rows:")
336
+ print(
337
+ df_joined[
338
+ ["admin_code_for_join", "iso3_code", "iso2_code", "gaul1_name"]
339
+ ].head()
340
+ )
341
+
342
+ # Rename columns to match output schema
343
+ df_joined = df_joined.rename(
344
+ columns={
345
+ "iso3_code": iso3_country_column, # 'Country'
346
+ "iso2_code": iso2_country_column, # 'ProducerCountry'
347
+ "gaul1_name": admin_1_column, # 'Admin_Level_1'
348
+ }
349
+ )
350
+
351
+ # Drop temporary columns
352
+ df_joined = df_joined.drop(
353
+ columns=["admin_code_for_join", "gaul1_code"], errors="ignore"
354
+ )
355
+
356
+ logger.debug(
357
+ f"Admin context added: {iso3_country_column}, {iso2_country_column}, {admin_1_column}"
358
+ )
359
+ return df_joined
360
+
361
+ except ImportError:
362
+ logger.warning(
363
+ "Could not import GAUL lookup dictionary - admin context not added"
364
+ )
365
+ if debug:
366
+ print("DEBUG: ImportError - could not load lookup dictionary")
367
+ return df
368
+ except Exception as e:
369
+ logger.warning(f"Error adding admin context: {e}")
370
+ if debug:
371
+ print(f"DEBUG: Exception in _add_admin_context: {e}")
372
+ import traceback
373
+
374
+ traceback.print_exc()
375
+ return df
376
+
377
+
378
+ def join_admin_codes(
379
+ df: pd.DataFrame, lookup_dict: Dict, id_col: str = "admin_code_median"
380
+ ) -> pd.DataFrame:
381
+ """
382
+ Join admin codes to DataFrame using a lookup dictionary.
383
+
384
+ Converts the admin code column to integer and performs a left join with
385
+ the lookup dictionary to add Country, ProducerCountry, and Admin_Level_1.
386
+
387
+ Parameters
388
+ ----------
389
+ df : pd.DataFrame
390
+ DataFrame with admin code column
391
+ lookup_dict : dict
392
+ Dictionary mapping GAUL codes to admin info (iso3_code, iso2_code, gaul1_name)
393
+ id_col : str
394
+ Name of the admin code column (default: "admin_code_median")
395
+
396
+ Returns
397
+ -------
398
+ pd.DataFrame
399
+ DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
400
+ """
401
+ logger = logging.getLogger("whisp")
402
+
403
+ # Return early if admin code column doesn't exist
404
+ if id_col not in df.columns:
405
+ logger.debug(f"Admin code column '{id_col}' not found in dataframe")
406
+ return df
407
+
408
+ try:
409
+ # Create lookup dataframe
410
+ lookup_data = []
411
+ for gaul_code, info in lookup_dict.items():
412
+ lookup_data.append(
413
+ {
414
+ "gaul1_code": gaul_code,
415
+ "gaul1_name": info.get("gaul1_name"),
416
+ "iso3_code": info.get("iso3_code"),
417
+ "iso2_code": info.get("iso2_code"),
418
+ }
419
+ )
420
+
421
+ lookup_df = pd.DataFrame(lookup_data)
422
+
423
+ # Prepare data for join
424
+ df = df.copy()
425
+ # Round to nearest integer (handles float values from EE reducers)
426
+ df["admin_code_for_join"] = df[id_col].fillna(-9999).astype("int32")
427
+ lookup_df["gaul1_code"] = lookup_df["gaul1_code"].astype("int32")
428
+
429
+ # Perform join
430
+ df_joined = df.merge(
431
+ lookup_df, left_on="admin_code_for_join", right_on="gaul1_code", how="left"
432
+ )
433
+
434
+ # Rename columns to match output schema
435
+ df_joined = df_joined.rename(
436
+ columns={
437
+ "iso3_code": iso3_country_column, # 'Country'
438
+ "iso2_code": iso2_country_column, # 'ProducerCountry'
439
+ "gaul1_name": admin_1_column, # 'Admin_Level_1'
440
+ }
441
+ )
442
+
443
+ # Drop temporary columns
444
+ df_joined = df_joined.drop(
445
+ columns=["admin_code_for_join", "gaul1_code"], errors="ignore"
446
+ )
447
+
448
+ logger.debug(
449
+ f"Admin codes joined: {iso3_country_column}, {iso2_country_column}, {admin_1_column}"
450
+ )
451
+ return df_joined
452
+
453
+ except Exception as e:
454
+ logger.warning(f"Error joining admin codes: {e}")
455
+ return df
456
+
457
+
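+ # A minimal sketch of join_admin_codes on in-memory data. The GAUL code and
+ # names below are illustrative placeholders, not real lookup entries:
+ #
+ #     lookup = {1001: {"gaul1_name": "Region A", "iso3_code": "CIV", "iso2_code": "CI"}}
+ #     df = pd.DataFrame({"admin_code_median": [1001.0, None]})
+ #     joined = join_admin_codes(df, lookup)
+ #     # 'joined' gains the Country, ProducerCountry and Admin_Level_1 columns
+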
458
+ class ProgressTracker:
459
+ """
460
+ Track batch processing progress with time estimation.
461
+
462
+ Shows progress at adaptive milestones (more frequent for small datasets,
463
+ less frequent for large datasets) with estimated time remaining based on
464
+ processing speed.
465
+ """
466
+
467
+ def __init__(self, total: int, logger: logging.Logger = None):
468
+ """
469
+ Initialize progress tracker.
470
+
471
+ Parameters
472
+ ----------
473
+ total : int
474
+ Total number of items to process
475
+ logger : logging.Logger, optional
476
+ Logger for output
477
+ """
478
+ self.total = total
479
+ self.completed = 0
480
+ self.lock = threading.Lock()
481
+ self.logger = logger or logging.getLogger("whisp")
482
+
483
+ # Adaptive milestones based on dataset size
484
+ # Small datasets (< 50): show every 25% (not too spammy)
485
+ # Medium (50-500): show every 20%
486
+ # Large (500+): show every 10% (more frequent feedback on long runs)
487
+ if total < 50:
488
+ self.milestones = {25, 50, 75, 100}
489
+ elif total < 500:
490
+ self.milestones = {20, 40, 60, 80, 100}
491
+ else:
492
+ self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
493
+
494
+ self.shown_milestones = set()
495
+ self.start_time = time.time()
496
+ self.last_update_time = self.start_time
497
+
498
+ def update(self, n: int = 1) -> None:
499
+ """
500
+ Update progress count.
501
+
502
+ Parameters
503
+ ----------
504
+ n : int
505
+ Number of items completed
506
+ """
507
+ with self.lock:
508
+ self.completed += n
509
+ percent = int((self.completed / self.total) * 100)
510
+
511
+             # Show milestone messages at the adaptive thresholds chosen in __init__
512
+ for milestone in sorted(self.milestones):
513
+ if percent >= milestone and milestone not in self.shown_milestones:
514
+ self.shown_milestones.add(milestone)
515
+
516
+ # Calculate time metrics
517
+ elapsed = time.time() - self.start_time
518
+ rate = self.completed / elapsed if elapsed > 0 else 0
519
+ remaining_items = self.total - self.completed
520
+ eta_seconds = remaining_items / rate if rate > 0 else 0
521
+
522
+ # Format time strings
523
+ eta_str = self._format_time(eta_seconds)
524
+ elapsed_str = self._format_time(elapsed)
525
+
526
+ # Build progress message
527
+ msg = f"Progress: {self.completed}/{self.total} ({percent}%)"
528
+ if percent < 100:
529
+ msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
530
+ else:
531
+ msg += f" | Total time: {elapsed_str}"
532
+
533
+ self.logger.info(msg)
534
+
535
+ @staticmethod
536
+ def _format_time(seconds: float) -> str:
537
+ """Format seconds as human-readable string."""
538
+ if seconds < 60:
539
+ return f"{seconds:.0f}s"
540
+ elif seconds < 3600:
541
+ mins = seconds / 60
542
+ return f"{mins:.1f}m"
543
+ else:
544
+ hours = seconds / 3600
545
+ return f"{hours:.1f}h"
546
+
547
+ def finish(self) -> None:
548
+ """Log completion."""
549
+ with self.lock:
550
+ total_time = time.time() - self.start_time
551
+ time_str = self._format_time(total_time)
552
+ self.logger.info(
553
+ f"Processing complete: {self.completed}/{self.total} batches in {time_str}"
554
+ )
555
+
556
+
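+ # Typical ProgressTracker usage inside a batch loop (sketch only; the loop
+ # body is a placeholder):
+ #
+ #     tracker = ProgressTracker(total=len(batches))
+ #     for batch in batches:
+ #         ...  # process one batch
+ #         tracker.update()
+ #     tracker.finish()
+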
557
+ # ============================================================================
558
+ # ENDPOINT VALIDATION
559
+ # ============================================================================
560
+
561
+
562
+ def check_ee_endpoint(endpoint_type: str = "high-volume") -> bool:
563
+ """
564
+ Check if Earth Engine is using the correct endpoint.
565
+
566
+ Parameters
567
+ ----------
568
+ endpoint_type : str
569
+ Expected endpoint type: "high-volume" or "standard"
570
+
571
+ Returns
572
+ -------
573
+ bool
574
+ True if using expected endpoint, False otherwise
575
+ """
576
+ api_url = str(ee.data._cloud_api_base_url)
577
+
578
+ if endpoint_type == "high-volume":
579
+ return "highvolume" in api_url.lower()
580
+ elif endpoint_type == "standard":
581
+ return "highvolume" not in api_url.lower()
582
+ else:
583
+ return False
584
+
585
+
586
+ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool = True):
587
+ """
588
+ Validate Earth Engine endpoint and warn/error if incorrect.
589
+
590
+ Parameters
591
+ ----------
592
+ endpoint_type : str
593
+ Expected endpoint type
594
+ raise_error : bool
595
+ If True, raise error if incorrect endpoint; if False, warn
596
+
597
+ Raises
598
+ ------
599
+ RuntimeError
600
+ If incorrect endpoint and raise_error=True
601
+ """
602
+ if not check_ee_endpoint(endpoint_type):
603
+ msg = (
604
+ f"Not using {endpoint_type.upper()} endpoint.\n"
605
+ f"Current URL: {ee.data._cloud_api_base_url}\n"
606
+ f"\nTo use {endpoint_type} endpoint, run:\n"
607
+ )
608
+ msg += "ee.Reset()\n"
609
+ if endpoint_type == "high-volume":
610
+ msg += (
611
+ "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')"
612
+ )
613
+ else:
614
+ msg += "ee.Initialize() # Uses standard endpoint by default"
615
+
616
+ if raise_error:
617
+ raise RuntimeError(msg)
618
+ else:
619
+ logging.warning(msg)
620
+
621
+
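+ # Example of switching endpoints before running the concurrent functions
+ # below (this mirrors the instructions embedded in the error message above):
+ #
+ #     ee.Reset()
+ #     ee.Initialize(opt_url="https://earthengine-highvolume.googleapis.com")
+ #     validate_ee_endpoint("high-volume")  # raises RuntimeError if the switch failed
+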
622
+ # ============================================================================
623
+ # METADATA EXTRACTION (CLIENT & SERVER SIDE)
624
+ # ============================================================================
625
+
626
+
627
+ def extract_centroid_and_geomtype_client(
628
+ gdf: gpd.GeoDataFrame,
629
+ x_col: str = None,
630
+ y_col: str = None,
631
+ type_col: str = None,
632
+ external_id_column: str = None,
633
+ return_attributes_only: bool = True,
634
+ ) -> pd.DataFrame:
635
+ """
636
+ Extract centroid coordinates and geometry type using GeoPandas (client-side).
637
+
638
+ Parameters
639
+ ----------
640
+ gdf : gpd.GeoDataFrame
641
+ Input GeoDataFrame
642
+ x_col : str, optional
643
+ Column name for centroid x. Defaults to config value
644
+ y_col : str, optional
645
+ Column name for centroid y. Defaults to config value
646
+ type_col : str, optional
647
+ Column name for geometry type. Defaults to config value
648
+     external_id_column : str, optional
649
+ Name of external ID column to preserve
650
+ return_attributes_only : bool
651
+ If True, return only attribute columns (no geometry)
652
+
653
+ Returns
654
+ -------
655
+ pd.DataFrame or gpd.GeoDataFrame
656
+ DataFrame/GeoDataFrame with centroid and geometry type columns
657
+ """
658
+ x_col = x_col or centroid_x_coord_column
659
+ y_col = y_col or centroid_y_coord_column
660
+ type_col = type_col or geometry_type_column
661
+
662
+ gdf = gdf.copy()
663
+
664
+ # Extract centroid coordinates (suppressing geographic CRS warning from Shapely)
665
+ with warnings.catch_warnings():
666
+ warnings.filterwarnings("ignore", category=UserWarning)
667
+ warnings.simplefilter("ignore", UserWarning) # Additional suppression
668
+ centroid_points = gdf.geometry.centroid
669
+
670
+ gdf[x_col] = centroid_points.x.round(6)
671
+ gdf[y_col] = centroid_points.y.round(6)
672
+ gdf[type_col] = gdf.geometry.geom_type
673
+
674
+ if return_attributes_only:
675
+ # Build column list starting with merge keys
676
+ cols = []
677
+
678
+ # Always include __row_id__ first if present (needed for row-level merging)
679
+ if "__row_id__" in gdf.columns:
680
+ cols.append("__row_id__")
681
+
682
+ # Always include plot_id_column if present (needed for merging batches)
683
+ if plot_id_column in gdf.columns:
684
+ cols.append(plot_id_column)
685
+
686
+ # Include external_id_column if provided and exists
687
+ if (
688
+ external_id_column
689
+ and external_id_column in gdf.columns
690
+ and external_id_column not in cols
691
+ ):
692
+ cols.append(external_id_column)
693
+
694
+ # Always include metadata columns (centroid, geometry type)
695
+ cols.extend([x_col, y_col, type_col])
696
+
697
+ # Remove any duplicates while preserving order
698
+ cols = list(dict.fromkeys(cols))
699
+
700
+ return gdf[cols].reset_index(drop=True)
701
+
702
+ return gdf
703
+
704
+
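+ # Sketch of the client-side extractor on a tiny in-memory GeoDataFrame
+ # (coordinates are arbitrary; column names come from the config defaults):
+ #
+ #     from shapely.geometry import Point
+ #     gdf = gpd.GeoDataFrame({"plotId": [1]}, geometry=[Point(30.0, -1.5)], crs="EPSG:4326")
+ #     meta = extract_centroid_and_geomtype_client(gdf)
+ #     # 'meta' holds plotId plus centroid x/y and geometry-type columns
+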
705
+ def extract_centroid_and_geomtype_server(
706
+ fc: ee.FeatureCollection,
707
+ x_col: str = None,
708
+ y_col: str = None,
709
+ type_col: str = None,
710
+ max_error: float = 1.0,
711
+ ) -> ee.FeatureCollection:
712
+ """
713
+ Extract centroid coordinates and geometry type using Earth Engine (server-side).
714
+
715
+ Parameters
716
+ ----------
717
+ fc : ee.FeatureCollection
718
+ Input FeatureCollection
719
+ x_col : str, optional
720
+ Column name for centroid x
721
+ y_col : str, optional
722
+ Column name for centroid y
723
+ type_col : str, optional
724
+ Column name for geometry type
725
+ max_error : float
726
+ Maximum error for centroid calculation (meters)
727
+
728
+ Returns
729
+ -------
730
+ ee.FeatureCollection
731
+ FeatureCollection with centroid and geometry type properties
732
+ """
733
+ x_col = x_col or centroid_x_coord_column
734
+ y_col = y_col or centroid_y_coord_column
735
+ type_col = type_col or geometry_type_column
736
+
737
+ def add_metadata(feature):
738
+ centroid = feature.geometry().centroid(max_error)
739
+ coords = centroid.coordinates()
740
+ x = ee.Number(coords.get(0)).multiply(1e6).round().divide(1e6)
741
+ y = ee.Number(coords.get(1)).multiply(1e6).round().divide(1e6)
742
+ return feature.set({x_col: x, y_col: y, type_col: feature.geometry().type()})
743
+
744
+ return fc.map(add_metadata)
745
+
746
+
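+ # Server-side equivalent (sketch): wrap an existing FeatureCollection so each
+ # feature carries centroid coordinates and geometry type as properties:
+ #
+ #     fc = convert_geojson_to_ee(gdf, enforce_wgs84=True, strip_z_coords=True)
+ #     fc_with_meta = extract_centroid_and_geomtype_server(fc, max_error=1.0)
+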
747
+ # ============================================================================
748
+ # BATCH PROCESSING UTILITIES
749
+ # ============================================================================
750
+
751
+
752
+ def batch_geodataframe(
753
+ gdf: gpd.GeoDataFrame,
754
+ batch_size: int,
755
+ ) -> List[gpd.GeoDataFrame]:
756
+ """
757
+ Split a GeoDataFrame into batches.
758
+
759
+ Parameters
760
+ ----------
761
+ gdf : gpd.GeoDataFrame
762
+ Input GeoDataFrame
763
+ batch_size : int
764
+ Size of each batch
765
+
766
+ Returns
767
+ -------
768
+ List[gpd.GeoDataFrame]
769
+ List of batch GeoDataFrames
770
+ """
771
+ batches = []
772
+ for i in range(0, len(gdf), batch_size):
773
+ batches.append(gdf.iloc[i : i + batch_size].copy())
774
+ return batches
775
+
776
+
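+ # Example: splitting 95 features with batch_size=10 yields 10 batches, the
+ # last holding 5 rows:
+ #
+ #     batches = batch_geodataframe(gdf, batch_size=10)
+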
777
+ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
778
+ """
779
+ Convert a batch GeoDataFrame to EE FeatureCollection efficiently.
780
+
781
+ OPTIMIZATION: Passes GeoDataFrame directly to convert_geojson_to_ee to preserve CRS.
782
+ This ensures proper coordinate system handling and reprojection to WGS84 if needed.
783
+
784
+ Preserves the __row_id__ column if present so it can be retrieved after processing.
785
+
786
+ Parameters
787
+ ----------
788
+ batch_gdf : gpd.GeoDataFrame
789
+ Input batch (should have __row_id__ column)
790
+
791
+ Returns
792
+ -------
793
+ ee.FeatureCollection
794
+ EE FeatureCollection with __row_id__ as a feature property
795
+ """
796
+ # Pass GeoDataFrame directly to preserve CRS metadata
797
+ # convert_geojson_to_ee will handle:
798
+ # - CRS detection and conversion to WGS84 if needed
799
+ # - Data type sanitization (datetime, object columns)
800
+ # - Geometry validation and Z-coordinate stripping
801
+
802
+ fc = convert_geojson_to_ee(batch_gdf, enforce_wgs84=True, strip_z_coords=True)
803
+
804
+ # If __row_id__ is in the original GeoDataFrame, it will be preserved
805
+ # as a feature property in the GeoJSON and thus in the EE FeatureCollection
806
+ return fc
807
+
808
+
809
+ def clean_geodataframe(
810
+ gdf: gpd.GeoDataFrame,
811
+ remove_nulls: bool = True,
812
+ fix_invalid: bool = True,
813
+ logger: logging.Logger = None,
814
+ ) -> gpd.GeoDataFrame:
815
+ """
816
+ Validate and clean GeoDataFrame geometries.
817
+
818
+ Parameters
819
+ ----------
820
+ gdf : gpd.GeoDataFrame
821
+ Input GeoDataFrame
822
+ remove_nulls : bool
823
+ Remove null geometries
824
+ fix_invalid : bool
825
+ Fix invalid geometries
826
+ logger : logging.Logger, optional
827
+ Logger for output
828
+
829
+ Returns
830
+ -------
831
+ gpd.GeoDataFrame
832
+ Cleaned GeoDataFrame
833
+ """
834
+ logger = logger or logging.getLogger("whisp")
835
+
836
+ if remove_nulls:
837
+ null_count = gdf.geometry.isna().sum()
838
+ if null_count > 0:
839
+ logger.warning(f"Removing {null_count} null geometries")
840
+ gdf = gdf[~gdf.geometry.isna()].copy()
841
+
842
+ if fix_invalid:
843
+ valid_count = gdf.geometry.is_valid.sum()
844
+ invalid_count = len(gdf) - valid_count
845
+ if invalid_count > 0:
846
+ logger.warning(f"Fixing {invalid_count} invalid geometries")
847
+ from shapely.validation import make_valid
848
+
849
+ gdf = gdf.copy()
850
+ gdf["geometry"] = gdf["geometry"].apply(
851
+ lambda g: make_valid(g) if g and not g.is_valid else g
852
+ )
853
+
854
+ logger.debug(f"Validation complete: {len(gdf):,} geometries ready")
855
+ return gdf
856
+
857
+
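+ # Sketch: drop null geometries and repair invalid ones before batching:
+ #
+ #     gdf = clean_geodataframe(gdf, remove_nulls=True, fix_invalid=True)
+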
858
+ # ============================================================================
859
+ # EE PROCESSING WITH RETRY LOGIC
860
+ # ============================================================================
861
+
862
+
863
+ def process_ee_batch(
864
+ fc: ee.FeatureCollection,
865
+ whisp_image: ee.Image,
866
+ reducer: ee.Reducer,
867
+ batch_idx: int,
868
+ max_retries: int = 3,
869
+ logger: logging.Logger = None,
870
+ ) -> pd.DataFrame:
871
+ """
872
+ Process an EE FeatureCollection with automatic retry logic.
873
+
874
+ Parameters
875
+ ----------
876
+ fc : ee.FeatureCollection
877
+ Input FeatureCollection
878
+ whisp_image : ee.Image
879
+ Image containing bands to reduce
880
+ reducer : ee.Reducer
881
+ Reducer to apply
882
+ batch_idx : int
883
+ Batch index (for logging)
884
+ max_retries : int
885
+ Maximum retry attempts
886
+ logger : logging.Logger, optional
887
+ Logger for output
888
+
889
+ Returns
890
+ -------
891
+ pd.DataFrame
892
+ Results as DataFrame
893
+
894
+ Raises
895
+ ------
896
+ RuntimeError
897
+ If processing fails after all retries
898
+ """
899
+ logger = logger or logging.getLogger("whisp")
900
+
901
+ for attempt in range(max_retries):
902
+ try:
903
+ results = whisp_image.reduceRegions(
904
+ collection=fc,
905
+ reducer=reducer,
906
+ scale=10,
907
+ )
908
+ df = convert_ee_to_df(results)
909
+
910
+ # Ensure plot_id_column is present for merging
911
+ # It should come from the feature properties (added before EE processing)
912
+ if plot_id_column not in df.columns:
913
+ df[plot_id_column] = range(len(df))
914
+
915
+ # Ensure all column names are strings (fixes pandas .str accessor issues)
916
+ df.columns = df.columns.astype(str)
917
+
918
+ return df
919
+
920
+ except ee.EEException as e:
921
+ error_msg = str(e)
922
+
923
+ if "Quota" in error_msg or "limit" in error_msg.lower():
924
+ if attempt < max_retries - 1:
925
+ wait_time = min(30, 2**attempt)
926
+ logger.warning(
927
+ f"Batch {batch_idx + 1}: Rate limited, waiting {wait_time}s..."
928
+ )
929
+ time.sleep(wait_time)
930
+ else:
931
+ raise RuntimeError(f"Batch {batch_idx + 1}: Quota exhausted")
932
+
933
+ elif "timeout" in error_msg.lower():
934
+ if attempt < max_retries - 1:
935
+ wait_time = min(15, 2**attempt)
936
+ logger.warning(
937
+ f"Batch {batch_idx + 1}: Timeout, retrying in {wait_time}s..."
938
+ )
939
+ time.sleep(wait_time)
940
+ else:
941
+ raise
942
+
943
+ else:
944
+ if attempt < max_retries - 1:
945
+ wait_time = min(5, 2**attempt)
946
+ time.sleep(wait_time)
947
+ else:
948
+ raise
949
+
950
+ except Exception as e:
951
+ if attempt < max_retries - 1:
952
+ time.sleep(min(5, 2**attempt))
953
+ else:
954
+ raise RuntimeError(f"Batch {batch_idx + 1}: {str(e)}")
955
+
956
+ raise RuntimeError(f"Batch {batch_idx + 1}: Failed after {max_retries} attempts")
957
+
958
+
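+ # Single-batch sketch using the same combined reducer as the pipelines below
+ # (the FeatureCollection and image are assumed to exist already):
+ #
+ #     reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
+ #     df = process_ee_batch(fc, whisp_image, reducer, batch_idx=0)
+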
959
+ # ============================================================================
960
+ # CONCURRENT PROCESSING FUNCTIONS
961
+ # ============================================================================
962
+
963
+
964
+ def whisp_stats_geojson_to_df_concurrent(
965
+ input_geojson_filepath: str,
966
+ external_id_column: str = None,
967
+ remove_geom: bool = False,
968
+ national_codes: List[str] = None,
969
+ unit_type: str = "ha",
970
+ whisp_image: ee.Image = None,
971
+ custom_bands: Dict[str, Any] = None,
972
+ batch_size: int = 10,
973
+ max_concurrent: int = 20,
974
+ validate_geometries: bool = True,
975
+ max_retries: int = 3,
976
+ add_metadata_server: bool = False,
977
+ logger: logging.Logger = None,
978
+ # Format parameters (auto-detect from config if not provided)
979
+ decimal_places: int = None,
980
+ ) -> pd.DataFrame:
981
+ """
982
+ Process GeoJSON concurrently to compute Whisp statistics with automatic formatting.
983
+
984
+ Uses high-volume endpoint and concurrent batching. Client-side metadata
985
+ extraction is always applied; optionally add server-side metadata too.
986
+ Automatically formats output (converts units, removes noise columns, etc.).
987
+
988
+ Parameters
989
+ ----------
990
+ input_geojson_filepath : str
991
+ Path to input GeoJSON file
992
+ external_id_column : str, optional
993
+ Column name for external IDs
994
+ remove_geom : bool
995
+ Remove geometry column from output
996
+ national_codes : List[str], optional
997
+ ISO2 codes for national datasets
998
+ unit_type : str
999
+ "ha" or "percent"
1000
+ whisp_image : ee.Image, optional
1001
+ Pre-combined image (created with combine_datasets if None)
1002
+ custom_bands : Dict[str, Any], optional
1003
+ Custom band information
1004
+ batch_size : int
1005
+ Features per batch
1006
+ max_concurrent : int
1007
+ Maximum concurrent EE calls
1008
+ validate_geometries : bool
1009
+ Validate and clean geometries
1010
+ max_retries : int
1011
+ Retry attempts per batch
1012
+ add_metadata_server : bool
1013
+ Add metadata server-side (in addition to client-side)
1014
+ logger : logging.Logger, optional
1015
+ Logger for output
1016
+ decimal_places : int, optional
1017
+ Decimal places for formatting. If None, auto-detects from config.
1018
+
1019
+ Returns
1020
+ -------
1021
+ pd.DataFrame
1022
+ Formatted results DataFrame with Whisp statistics
1023
+ """
1024
+ from openforis_whisp.reformat import format_stats_dataframe
1025
+
1026
+ logger = logger or logging.getLogger("whisp")
1027
+
1028
+ # Suppress verbose output from dependencies (dynamically adjust based on max_concurrent)
1029
+ _suppress_verbose_output(max_concurrent=max_concurrent)
1030
+
1031
+ # Auto-detect decimal places from config if not provided
1032
+ if decimal_places is None:
1033
+ decimal_places = _extract_decimal_places(stats_area_columns_formatting)
1034
+ logger.debug(f"Using decimal_places={decimal_places} from config")
1035
+
1036
+ # Validate endpoint
1037
+ validate_ee_endpoint("high-volume", raise_error=True)
1038
+
1039
+ # Load GeoJSON with output suppressed
1040
+ gdf = _load_geojson_silently(input_geojson_filepath)
1041
+ logger.info(f"Loaded {len(gdf):,} features")
1042
+
1043
+ if validate_geometries:
1044
+ gdf = clean_geodataframe(gdf, logger=logger)
1045
+
1046
+ # Add stable plotIds for merging (starting from 1, not 0)
1047
+ gdf[plot_id_column] = range(1, len(gdf) + 1)
1048
+
1049
+ # Strip unnecessary properties before sending to EE
1050
+ # Keep only: geometry, plot_id_column, and external_id_column
1051
+ # This prevents duplication of GeoJSON properties in EE results
1052
+ keep_cols = ["geometry", plot_id_column]
1053
+ if external_id_column and external_id_column in gdf.columns:
1054
+ keep_cols.append(external_id_column)
1055
+
1056
+ gdf_for_ee = gdf[keep_cols].copy()
1057
+ logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
1058
+
1059
+ # Create image if not provided
1060
+ if whisp_image is None:
1061
+ logger.debug("Creating Whisp image...")
1062
+ # Suppress print statements from combine_datasets
1063
+ with redirect_stdout(io.StringIO()):
1064
+ try:
1065
+ # First try without validation
1066
+ whisp_image = combine_datasets(
1067
+ national_codes=national_codes, validate_bands=False
1068
+ )
1069
+ except Exception as e:
1070
+ logger.warning(
1071
+ f"First attempt failed: {str(e)[:100]}. Retrying with validate_bands=True..."
1072
+ )
1073
+ # Retry with validation to catch and fix bad bands
1074
+ whisp_image = combine_datasets(
1075
+ national_codes=national_codes, validate_bands=True
1076
+ )
1077
+
1078
+ # Create reducer
1079
+ reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
1080
+
1081
+ # Batch the data
1082
+ batches = batch_geodataframe(gdf_for_ee, batch_size)
1083
+ logger.info(f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches")
1084
+
1085
+ # Setup semaphore for EE concurrency control
1086
+ ee_semaphore = threading.BoundedSemaphore(max_concurrent)
1087
+
1088
+ # Progress tracker
1089
+ progress = ProgressTracker(len(batches), logger=logger)
1090
+
1091
+ results = []
1092
+
1093
+ def process_batch(
1094
+ batch_idx: int, batch: gpd.GeoDataFrame
1095
+ ) -> Tuple[int, pd.DataFrame, pd.DataFrame]:
1096
+ """Process one batch: server EE work + client metadata."""
1097
+ with ee_semaphore:
1098
+ # Server-side: convert to EE, optionally add metadata, reduce
1099
+ fc = convert_batch_to_ee(batch)
1100
+ if add_metadata_server:
1101
+ fc = extract_centroid_and_geomtype_server(fc)
1102
+ df_server = process_ee_batch(
1103
+ fc, whisp_image, reducer, batch_idx, max_retries, logger
1104
+ )
1105
+
1106
+ # Client-side: extract metadata using GeoPandas
1107
+ df_client = extract_centroid_and_geomtype_client(
1108
+ batch,
1109
+ external_id_column=external_id_column,
1110
+ return_attributes_only=True,
1111
+ )
1112
+
1113
+ return batch_idx, df_server, df_client
1114
+
1115
+ # Process batches with thread pool
1116
+ pool_workers = max(2 * max_concurrent, max_concurrent + 2)
1117
+
1118
+ # Track if we had errors that suggest bad bands
1119
+ batch_errors = []
1120
+
1121
+ # Suppress fiona logging during batch processing (threads create new loggers)
1122
+ fiona_logger = logging.getLogger("fiona")
1123
+ pyogrio_logger = logging.getLogger("pyogrio._io")
1124
+ old_fiona_level = fiona_logger.level
1125
+ old_pyogrio_level = pyogrio_logger.level
1126
+ fiona_logger.setLevel(logging.CRITICAL)
1127
+ pyogrio_logger.setLevel(logging.CRITICAL)
1128
+
1129
+ try:
1130
+ with redirect_stdout(io.StringIO()):
1131
+ with ThreadPoolExecutor(max_workers=pool_workers) as executor:
1132
+ futures = {
1133
+ executor.submit(process_batch, i, batch): i
1134
+ for i, batch in enumerate(batches)
1135
+ }
1136
+
1137
+ for future in as_completed(futures):
1138
+ try:
1139
+ batch_idx, df_server, df_client = future.result()
1140
+
1141
+ # Merge server and client results
1142
+ if plot_id_column not in df_server.columns:
1143
+ df_server[plot_id_column] = range(len(df_server))
1144
+
1145
+ # Keep all EE statistics from server (all columns with _sum and _median suffixes)
1146
+ # These are the actual EE processing results
1147
+ df_server_clean = df_server.copy()
1148
+
1149
+ # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
1150
+ # (formatted wrapper handles keep_external_columns parameter)
1151
+ keep_external_columns = [plot_id_column]
1152
+ if (
1153
+ external_id_column
1154
+ and external_id_column in df_client.columns
1155
+ ):
1156
+ keep_external_columns.append(external_id_column)
1157
+ if "geometry" in df_client.columns:
1158
+ keep_external_columns.append("geometry")
1159
+ # Keep geometry type column (Geometry_type)
1160
+ if geometry_type_column in df_client.columns:
1161
+ keep_external_columns.append(geometry_type_column)
1162
+ # Also keep centroid columns (Centroid_lon, Centroid_lat)
1163
+ centroid_cols = [
1164
+ c for c in df_client.columns if c.startswith("Centroid_")
1165
+ ]
1166
+ keep_external_columns.extend(centroid_cols)
1167
+
1168
+ df_client_clean = df_client[
1169
+ [c for c in keep_external_columns if c in df_client.columns]
1170
+ ].drop_duplicates()
1171
+
1172
+ merged = df_server_clean.merge(
1173
+ df_client_clean,
1174
+ on=plot_id_column,
1175
+ how="left",
1176
+ suffixes=("_ee", "_client"),
1177
+ )
1178
+ results.append(merged)
1179
+ progress.update()
1180
+
1181
+ except Exception as e:
1182
+ error_msg = str(e)
1183
+ logger.error(f"Batch processing error: {error_msg[:100]}")
1184
+ import traceback
1185
+
1186
+ logger.debug(traceback.format_exc())
1187
+ batch_errors.append(error_msg)
1188
+ finally:
1189
+ # Restore logger levels
1190
+ fiona_logger.setLevel(old_fiona_level)
1191
+ pyogrio_logger.setLevel(old_pyogrio_level)
1192
+
1193
+ progress.finish()
1194
+
1195
+ # Check if we should retry with validation due to band errors
1196
+ if batch_errors and not results:
1197
+ # All batches failed - likely a bad band issue
1198
+ is_band_error = any(
1199
+ keyword in str(batch_errors)
1200
+ for keyword in ["Image.load", "asset", "not found", "does not exist"]
1201
+ )
1202
+
1203
+ if is_band_error:
1204
+ logger.warning(
1205
+ "Detected potential bad band error. Retrying with validate_bands=True..."
1206
+ )
1207
+ try:
1208
+ with redirect_stdout(io.StringIO()):
1209
+ whisp_image = combine_datasets(
1210
+ national_codes=national_codes, validate_bands=True
1211
+ )
1212
+ logger.info(
1213
+ "Image recreated with validation. Retrying batch processing..."
1214
+ )
1215
+
1216
+ # Retry batch processing with validated image
1217
+ results = []
1218
+ progress = ProgressTracker(len(batches), logger=logger)
1219
+
1220
+ # Suppress fiona logging during batch processing (threads create new loggers)
1221
+ fiona_logger = logging.getLogger("fiona")
1222
+ pyogrio_logger = logging.getLogger("pyogrio._io")
1223
+ old_fiona_level = fiona_logger.level
1224
+ old_pyogrio_level = pyogrio_logger.level
1225
+ fiona_logger.setLevel(logging.CRITICAL)
1226
+ pyogrio_logger.setLevel(logging.CRITICAL)
1227
+
1228
+ try:
1229
+ with ThreadPoolExecutor(max_workers=pool_workers) as executor:
1230
+ futures = {
1231
+ executor.submit(process_batch, i, batch): i
1232
+ for i, batch in enumerate(batches)
1233
+ }
1234
+
1235
+ for future in as_completed(futures):
1236
+ try:
1237
+ batch_idx, df_server, df_client = future.result()
1238
+ if plot_id_column not in df_server.columns:
1239
+ df_server[plot_id_column] = range(len(df_server))
1240
+ merged = df_server.merge(
1241
+ df_client,
1242
+ on=plot_id_column,
1243
+ how="left",
1244
+ suffixes=("", "_client"),
1245
+ )
1246
+ results.append(merged)
1247
+ progress.update()
1248
+ except Exception as e:
1249
+ logger.error(
1250
+ f"Batch processing error (retry): {str(e)[:100]}"
1251
+ )
1252
+
1253
+ progress.finish()
1254
+ finally:
1255
+ # Restore logger levels
1256
+ fiona_logger.setLevel(old_fiona_level)
1257
+ pyogrio_logger.setLevel(old_pyogrio_level)
1258
+ except Exception as validation_e:
1259
+ logger.error(
1260
+ f"Failed to recover with validation: {str(validation_e)[:100]}"
1261
+ )
1262
+ return pd.DataFrame()
1263
+
1264
+ if results:
1265
+ # Filter out empty DataFrames and all-NA columns to avoid FutureWarning in pd.concat
1266
+ results_filtered = []
1267
+ for df in results:
1268
+ if not df.empty:
1269
+ # Drop columns that are entirely NA
1270
+ df_clean = df.dropna(axis=1, how="all")
1271
+ if not df_clean.empty:
1272
+ results_filtered.append(df_clean)
1273
+ results = results_filtered
1274
+
1275
+ if results:
1276
+ # Concatenate with explicit dtype handling to suppress FutureWarning
1277
+ combined = pd.concat(results, ignore_index=True, sort=False)
1278
+ # Ensure all column names are strings (fixes pandas .str accessor issues later)
1279
+ combined.columns = combined.columns.astype(str)
1280
+ else:
1281
+ return pd.DataFrame()
1282
+
1283
+ # Clean up duplicate external_id columns created by merges
1284
+ # Rename external_id_column to standardized 'external_id' for schema validation
1285
+ if external_id_column:
1286
+ # Find all columns related to external_id
1287
+ external_id_variants = [
1288
+ col
1289
+ for col in combined.columns
1290
+ if external_id_column.lower() in col.lower()
1291
+ ]
1292
+
1293
+ if external_id_variants:
1294
+ # Use the base column name if it exists, otherwise use first variant
1295
+ base_col = (
1296
+ external_id_column
1297
+ if external_id_column in combined.columns
1298
+ else external_id_variants[0]
1299
+ )
1300
+
1301
+ # Rename to standardized 'external_id'
1302
+ if base_col != "external_id":
1303
+ combined = combined.rename(columns={base_col: "external_id"})
1304
+
1305
+ # Drop all other variants
1306
+ cols_to_drop = [c for c in external_id_variants if c != base_col]
1307
+ combined = combined.drop(columns=cols_to_drop, errors="ignore")
1308
+
1309
+ # plotId column is already present from batch processing
1310
+ # Just ensure it's at position 0
1311
+ if plot_id_column in combined.columns:
1312
+ combined = combined[
1313
+ [plot_id_column]
1314
+ + [col for col in combined.columns if col != plot_id_column]
1315
+ ]
1316
+
1317
+ # Add admin context (Country, ProducerCountry, Admin_Level_1) from admin_code
1318
+ # MUST be done BEFORE formatting (which removes _median columns)
1319
+ logger.debug("Adding administrative context...")
1320
+ try:
1321
+ from openforis_whisp.parameters.lookup_gaul1_admin import lookup_dict
1322
+
1323
+ combined = join_admin_codes(
1324
+ df=combined, lookup_dict=lookup_dict, id_col="admin_code_median"
1325
+ )
1326
+ except ImportError:
1327
+ logger.warning(
1328
+ "Could not import lookup dictionary - admin context not added"
1329
+ )
1330
+
1331
+ # Format the output with error handling for bad bands
1332
+ logger.debug("Formatting output...")
1333
+ try:
1334
+ formatted = format_stats_dataframe(
1335
+ df=combined,
1336
+ area_col=f"{geometry_area_column}_sum",
1337
+ decimal_places=decimal_places,
1338
+ unit_type=unit_type,
1339
+ remove_columns=True,
1340
+ convert_water_flag=True,
1341
+ )
1342
+ except Exception as e:
1343
+ # If formatting fails, try recreating the image with validation
1344
+ logger.warning(
1345
+ f"Formatting failed: {str(e)[:100]}. Attempting to recreate image with band validation..."
1346
+ )
1347
+ try:
1348
+ with redirect_stdout(io.StringIO()):
1349
+ whisp_image_validated = combine_datasets(
1350
+ national_codes=national_codes, validate_bands=True
1351
+ )
1352
+
1353
+ # Reprocess batches with validated image - create a local process function
1354
+ logger.info("Reprocessing batches with validated image...")
1355
+ results_validated = []
1356
+
1357
+ def process_batch_validated(
1358
+ batch_idx: int, batch: gpd.GeoDataFrame
1359
+ ) -> Tuple[int, pd.DataFrame, pd.DataFrame]:
1360
+ """Process one batch with validated image."""
1361
+ with ee_semaphore:
1362
+ fc = convert_batch_to_ee(batch)
1363
+ if add_metadata_server:
1364
+ fc = extract_centroid_and_geomtype_server(fc)
1365
+ df_server = process_ee_batch(
1366
+ fc,
1367
+ whisp_image_validated,
1368
+ reducer,
1369
+ batch_idx,
1370
+ max_retries,
1371
+ logger,
1372
+ )
1373
+ df_client = extract_centroid_and_geomtype_client(
1374
+ batch,
1375
+ external_id_column=external_id_column,
1376
+ return_attributes_only=True,
1377
+ )
1378
+ return batch_idx, df_server, df_client
1379
+
1380
+ with ThreadPoolExecutor(max_workers=pool_workers) as executor:
1381
+ futures = {
1382
+ executor.submit(process_batch_validated, i, batch): i
1383
+ for i, batch in enumerate(batches)
1384
+ }
1385
+
1386
+ for future in as_completed(futures):
1387
+ try:
1388
+ batch_idx, df_server, df_client = future.result()
1389
+ if plot_id_column not in df_server.columns:
1390
+ df_server[plot_id_column] = range(len(df_server))
1391
+
1392
+ # Drop external_id_column from df_client if it exists (already in df_server)
1393
+ if (
1394
+ external_id_column
1395
+ and external_id_column in df_client.columns
1396
+ ):
1397
+ df_client = df_client.drop(columns=[external_id_column])
1398
+
1399
+ merged = df_server.merge(
1400
+ df_client,
1401
+ on=plot_id_column,
1402
+ how="left",
1403
+ suffixes=("", "_client"),
1404
+ )
1405
+ results_validated.append(merged)
1406
+ except Exception as batch_e:
1407
+ logger.error(
1408
+ f"Batch reprocessing error: {str(batch_e)[:100]}"
1409
+ )
1410
+
1411
+ if results_validated:
1412
+ # Concatenate with explicit dtype handling to suppress FutureWarning
1413
+ combined = pd.concat(
1414
+ results_validated, ignore_index=True, sort=False
1415
+ )
1416
+ # Ensure all column names are strings (fixes pandas .str accessor issues later)
1417
+ combined.columns = combined.columns.astype(str)
1418
+
1419
+ # Clean up duplicate external_id columns created by merges
1420
+ if external_id_column:
1421
+ external_id_variants = [
1422
+ col
1423
+ for col in combined.columns
1424
+ if external_id_column.lower() in col.lower()
1425
+ ]
1426
+
1427
+ if external_id_variants:
1428
+ base_col = external_id_column
1429
+ if (
1430
+ base_col not in combined.columns
1431
+ and external_id_variants
1432
+ ):
1433
+ base_col = external_id_variants[0]
1434
+ combined = combined.rename(
1435
+ columns={base_col: "external_id"}
1436
+ )
1437
+
1438
+ cols_to_drop = [
1439
+ c for c in external_id_variants if c != base_col
1440
+ ]
1441
+ combined = combined.drop(
1442
+ columns=cols_to_drop, errors="ignore"
1443
+ )
1444
+
1445
+ # plotId column is already present, just ensure it's at position 0
1446
+ if plot_id_column in combined.columns:
1447
+ combined = combined[
1448
+ [plot_id_column]
1449
+ + [col for col in combined.columns if col != plot_id_column]
1450
+ ]
1451
+
1452
+ # Add admin context again
1453
+ try:
1454
+ from openforis_whisp.parameters.lookup_gaul1_admin import (
1455
+ lookup_dict,
1456
+ )
1457
+
1458
+ combined = join_admin_codes(
1459
+ df=combined,
1460
+ lookup_dict=lookup_dict,
1461
+ id_col="admin_code_median",
1462
+ )
1463
+ except ImportError:
1464
+ logger.warning(
1465
+ "Could not import lookup dictionary - admin context not added"
1466
+ )
1467
+
1468
+ # Try formatting again with validated data
1469
+ formatted = format_stats_dataframe(
1470
+ df=combined,
1471
+ area_col=f"{geometry_area_column}_sum",
1472
+ decimal_places=decimal_places,
1473
+ unit_type=unit_type,
1474
+ remove_columns=True,
1475
+ convert_water_flag=True,
1476
+ )
1477
+ else:
1478
+                 logger.error("Reprocessing with validation produced no results")
1479
+ return pd.DataFrame()
1480
+ except Exception as retry_e:
1481
+ logger.error(
1482
+ f"Failed to recover from formatting error: {str(retry_e)[:100]}"
1483
+ )
1484
+ raise retry_e
1485
+
1486
+ logger.info(f"Processed {len(formatted):,} features successfully")
1487
+ return formatted
1488
+ else:
1489
+         logger.error("No results produced")
1490
+ return pd.DataFrame()
1491
+
1492
+
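+ # Minimal usage sketch for the concurrent endpoint. The file path and country
+ # codes are placeholders; the high-volume endpoint must be active (see
+ # validate_ee_endpoint above):
+ #
+ #     df = whisp_stats_geojson_to_df_concurrent(
+ #         "plots.geojson",
+ #         national_codes=["CI", "BR"],
+ #         batch_size=10,
+ #         max_concurrent=20,
+ #     )
+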
1493
+ # ============================================================================
1494
+ # SEQUENTIAL PROCESSING (STANDARD ENDPOINT)
1495
+ # ============================================================================
1496
+
1497
+
1498
+ def whisp_stats_geojson_to_df_sequential(
1499
+ input_geojson_filepath: str,
1500
+ external_id_column: str = None,
1501
+ remove_geom: bool = False,
1502
+ national_codes: List[str] = None,
1503
+ unit_type: str = "ha",
1504
+ whisp_image: ee.Image = None,
1505
+ custom_bands: Dict[str, Any] = None,
1506
+ add_metadata_client_side: bool = True,
1507
+ logger: logging.Logger = None,
1508
+ # Format parameters (auto-detect from config if not provided)
1509
+ decimal_places: int = None,
1510
+ ) -> pd.DataFrame:
1511
+ """
1512
+ Process GeoJSON sequentially using standard EE endpoint with automatic formatting.
1513
+
1514
+ Uses reduceRegions for server-side processing and client-side metadata
1515
+ extraction via GeoPandas. Suitable for smaller datasets or when high-volume
1516
+ endpoint is not available. Automatically formats output.
1517
+
1518
+ Requires: standard EE endpoint (default)
1519
+
1520
+ Parameters
1521
+ ----------
1522
+ input_geojson_filepath : str
1523
+ Path to input GeoJSON
1524
+ external_id_column : str, optional
1525
+ Column name for external IDs
1526
+ remove_geom : bool
1527
+ Remove geometry from output
1528
+ national_codes : List[str], optional
1529
+ ISO2 codes for national datasets
1530
+ unit_type : str
1531
+ "ha" or "percent"
1532
+ whisp_image : ee.Image, optional
1533
+ Pre-combined image
1534
+ custom_bands : Dict[str, Any], optional
1535
+ Custom band information
1536
+ add_metadata_client_side : bool
1537
+ Add client-side metadata (recommended)
1538
+ logger : logging.Logger, optional
1539
+ Logger for output
1540
+ decimal_places : int, optional
1541
+ Decimal places for formatting. If None, auto-detects from config.
1542
+
1543
+ Returns
1544
+ -------
1545
+ pd.DataFrame
1546
+ Formatted results DataFrame
1547
+ """
1548
+ from openforis_whisp.reformat import format_stats_dataframe
1549
+
1550
+ logger = logger or logging.getLogger("whisp")
1551
+
1552
+ # Suppress verbose output from dependencies (sequential has lower concurrency, use default)
1553
+ _suppress_verbose_output(max_concurrent=1)
1554
+
1555
+ # Auto-detect decimal places from config if not provided
1556
+ if decimal_places is None:
1557
+ decimal_places = _extract_decimal_places(stats_area_columns_formatting)
1558
+ logger.debug(f"Using decimal_places={decimal_places} from config")
1559
+
1560
+ # Validate endpoint
1561
+ validate_ee_endpoint("standard", raise_error=True)
1562
+
1563
+ # Load GeoJSON with output suppressed
1564
+ gdf = _load_geojson_silently(input_geojson_filepath)
1565
+ logger.info(f"Loaded {len(gdf):,} features")
1566
+
1567
+ # Clean geometries
1568
+ gdf = clean_geodataframe(gdf, logger=logger)
1569
+
1570
+ # Add stable plotIds for merging (starting from 1, not 0)
1571
+ gdf[plot_id_column] = range(1, len(gdf) + 1)
1572
+
1573
+ # Add stable row IDs
1574
+ row_id_col = "__row_id__"
1575
+ gdf[row_id_col] = range(len(gdf))
1576
+
1577
+ # Strip unnecessary properties before sending to EE
1578
+ # Keep only: geometry, plot_id_column, and external_id_column
1579
+ # This prevents duplication of GeoJSON properties in EE results
1580
+ keep_cols = ["geometry", plot_id_column, row_id_col]
1581
+ if external_id_column and external_id_column in gdf.columns:
1582
+ keep_cols.append(external_id_column)
1583
+
1584
+ gdf_for_ee = gdf[keep_cols].copy()
1585
+ logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
1586
+
1587
+ # Create image if not provided
1588
+ if whisp_image is None:
1589
+ logger.debug("Creating Whisp image...")
1590
+ # Suppress print statements from combine_datasets
1591
+ with redirect_stdout(io.StringIO()):
1592
+ try:
1593
+ # First try without validation
1594
+ whisp_image = combine_datasets(
1595
+ national_codes=national_codes, validate_bands=False
1596
+ )
1597
+ except Exception as e:
1598
+ logger.warning(
1599
+ f"First attempt failed: {str(e)[:100]}. Retrying with validate_bands=True..."
1600
+ )
1601
+ # Retry with validation to catch and fix bad bands
1602
+ whisp_image = combine_datasets(
1603
+ national_codes=national_codes, validate_bands=True
1604
+ )
1605
+
1606
+ # Convert to EE (suppress print statements from convert_geojson_to_ee)
1607
+ logger.debug("Converting to EE FeatureCollection...")
1608
+ with redirect_stdout(io.StringIO()):
1609
+ fc = convert_geojson_to_ee(gdf_for_ee, enforce_wgs84=True, strip_z_coords=True)
1610
+
1611
+ # Create reducer
1612
+ reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
1613
+
1614
+ # Process server-side with error handling for bad bands
1615
+ logger.info("Processing with Earth Engine...")
1616
+ try:
1617
+ results_fc = whisp_image.reduceRegions(collection=fc, reducer=reducer, scale=10)
1618
+ df_server = convert_ee_to_df(results_fc)
1619
+ except Exception as e:
1620
+ # Check if this is a band error
1621
+ error_msg = str(e)
1622
+ is_band_error = any(
1623
+ keyword in error_msg
1624
+ for keyword in ["Image.load", "asset", "not found", "does not exist"]
1625
+ )
1626
+
1627
+ if is_band_error and whisp_image is not None:
1628
+ logger.warning(
1629
+ f"Detected bad band error: {error_msg[:100]}. Retrying with validate_bands=True..."
1630
+ )
1631
+ try:
1632
+ with redirect_stdout(io.StringIO()):
1633
+ whisp_image = combine_datasets(
1634
+ national_codes=national_codes, validate_bands=True
1635
+ )
1636
+ logger.info("Image recreated with validation. Retrying processing...")
1637
+ results_fc = whisp_image.reduceRegions(
1638
+ collection=fc, reducer=reducer, scale=10
1639
+ )
1640
+ df_server = convert_ee_to_df(results_fc)
1641
+ except Exception as retry_e:
1642
+ logger.error(f"Retry failed: {str(retry_e)[:100]}")
1643
+ raise
1644
+ else:
1645
+ raise
1646
+
1647
+ logger.debug("Server-side processing complete")
1648
+
1649
+ # Add row_id if missing
1650
+ if row_id_col not in df_server.columns:
1651
+ df_server[row_id_col] = range(len(df_server))
1652
+
1653
+ # Add client-side metadata if requested
1654
+ if add_metadata_client_side:
1655
+ logger.debug("Extracting client-side metadata...")
1656
+ df_client = extract_centroid_and_geomtype_client(
1657
+ gdf,
1658
+ external_id_column=external_id_column,
1659
+ return_attributes_only=True,
1660
+ )
1661
+
1662
+ # Drop external_id_column from df_client if it exists (already in df_server)
1663
+ if external_id_column and external_id_column in df_client.columns:
1664
+ df_client = df_client.drop(columns=[external_id_column])
1665
+
1666
+ # Merge
1667
+ result = df_server.merge(
1668
+ df_client, on=row_id_col, how="left", suffixes=("", "_client")
1669
+ )
1670
+ else:
1671
+ result = df_server
1672
+
1673
+ # Remove internal __row_id__ column if present
1674
+ if row_id_col in result.columns:
1675
+ result = result.drop(columns=[row_id_col])
1676
+
1677
+ # Prepare and format the output
1678
+ # Add admin context (Country, ProducerCountry, Admin_Level_1) from admin_code
1679
+ # MUST be done BEFORE formatting (which removes _median columns)
1680
+ logger.debug("Adding administrative context...")
1681
+ try:
1682
+ from openforis_whisp.parameters.lookup_gaul1_admin import lookup_dict
1683
+
1684
+ result = join_admin_codes(
1685
+ df=result, lookup_dict=lookup_dict, id_col="admin_code_median"
1686
+ )
1687
+ except ImportError:
1688
+ logger.warning("Could not import lookup dictionary - admin context not added")
1689
+
1690
+ # Format the output
1691
+ logger.debug("Formatting output...")
1692
+ formatted = format_stats_dataframe(
1693
+ df=result,
1694
+ area_col=f"{geometry_area_column}_sum",
1695
+ decimal_places=decimal_places,
1696
+ unit_type=unit_type,
1697
+ remove_columns=True,
1698
+ convert_water_flag=True,
1699
+ )
1700
+
1701
+ logger.info(f"Processed {len(formatted):,} features")
1702
+
1703
+ # Consolidate external_id_column to standardized 'external_id'
1704
+ if external_id_column:
1705
+ variants = [
1706
+ col
1707
+ for col in formatted.columns
1708
+ if external_id_column.lower() in col.lower()
1709
+ ]
1710
+ if variants:
1711
+ base_col = (
1712
+ external_id_column
1713
+ if external_id_column in formatted.columns
1714
+ else variants[0]
1715
+ )
1716
+ if base_col != "external_id":
1717
+ formatted = formatted.rename(columns={base_col: "external_id"})
1718
+ # Drop other variants
1719
+ formatted = formatted.drop(
1720
+ columns=[c for c in variants if c != base_col], errors="ignore"
1721
+ )
1722
+
1723
+ return formatted
1724
+
1725
+
1726
+ # ============================================================================
1727
+ # FORMATTED WRAPPER FUNCTIONS (STATS + FORMAT)
1728
+ # ============================================================================
1729
+
1730
+
1731
+ def whisp_formatted_stats_geojson_to_df_concurrent(
1732
+ input_geojson_filepath: str,
1733
+ external_id_column: str = None,
1734
+ remove_geom: bool = False,
1735
+ national_codes: List[str] = None,
1736
+ unit_type: str = "ha",
1737
+ whisp_image: ee.Image = None,
1738
+ custom_bands: Dict[str, Any] = None,
1739
+ batch_size: int = 10,
1740
+ max_concurrent: int = 20,
1741
+ validate_geometries: bool = True,
1742
+ max_retries: int = 3,
1743
+ add_metadata_server: bool = False,
1744
+ logger: logging.Logger = None,
1745
+ # Format parameters (auto-detect from config if not provided)
1746
+ decimal_places: int = None,
1747
+ remove_median_columns: bool = True,
1748
+ convert_water_flag: bool = True,
1749
+ water_flag_threshold: float = 0.5,
1750
+ sort_column: str = "plotId",
1751
+ include_geometry_audit_trail: bool = False,
1752
+ ) -> pd.DataFrame:
1753
+ """
1754
+ Process GeoJSON concurrently with automatic formatting and validation.
1755
+
1756
+ Combines whisp_stats_geojson_to_df_concurrent + format_stats_dataframe + validation
1757
+ for a complete pipeline: extract stats → convert units → format output → validate schema.
1758
+
1759
+ Uses high-volume endpoint and concurrent batching.
1760
+
1761
+ Parameters
1762
+ ----------
1763
+ input_geojson_filepath : str
1764
+ Path to input GeoJSON file
1765
+ external_id_column : str, optional
1766
+ Column name for external IDs
1767
+ remove_geom : bool
1768
+ Remove geometry column from output
1769
+ national_codes : List[str], optional
1770
+ ISO2 codes for national datasets
1771
+ unit_type : str
1772
+ "ha" or "percent"
1773
+ whisp_image : ee.Image, optional
1774
+ Pre-combined image
1775
+ custom_bands : Dict[str, Any], optional
1776
+ Custom band information
1777
+ batch_size : int
1778
+ Features per batch (default 10)
1779
+ max_concurrent : int
1780
+ Maximum concurrent EE calls (default 20)
1781
+ validate_geometries : bool
1782
+ Validate and clean geometries (default True)
1783
+ max_retries : int
1784
+ Retry attempts per batch (default 3)
1785
+ add_metadata_server : bool
1786
+ Add metadata server-side (default False)
1787
+ logger : logging.Logger, optional
1788
+ Logger for output
1789
+ decimal_places : int, optional
1790
+ Decimal places for rounding. If None, auto-detects from config:
1791
+ - Area columns: geometry_area_column_formatting
1792
+ - Percent columns: stats_percent_columns_formatting
1793
+ - Other columns: stats_area_columns_formatting
1794
+ remove_median_columns : bool
1795
+ Remove '_median' columns (default True)
1796
+ convert_water_flag : bool
1797
+ Convert water flag to boolean (default True)
1798
+ water_flag_threshold : float
1799
+ Water flag ratio threshold (default 0.5)
1800
+ sort_column : str
1801
+ Column to sort by (default "plotId", None to skip)
1802
+ include_geometry_audit_trail : bool, default False
1803
+ If True, includes audit trail columns:
1804
+ - geo_original: Original input geometry (before EE processing)
1805
+ - geometry_type_original: Original geometry type
1806
+ - geometry_type: Processed geometry type (from EE)
1807
+ - geometry_type_changed: Boolean flag if geometry changed
1808
+ - geometry_type_transition: Description of how it changed
1809
+ These columns enable full transparency and auditability for compliance tracking.
1810
+
1811
+ Returns
1812
+ -------
1813
+ pd.DataFrame
1814
+ Validated, formatted results DataFrame with optional audit trail
1815
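+
+ Examples
+ --------
+ >>> # A minimal usage sketch, not a definitive recipe: assumes Earth Engine is
+ >>> # already initialized and that "plots.geojson" is a hypothetical input file;
+ >>> # the national code below is illustrative only.
+ >>> df = whisp_formatted_stats_geojson_to_df_concurrent(
+ ... "plots.geojson",
+ ... national_codes=["CI"],
+ ... batch_size=10,
+ ... max_concurrent=20,
+ ... include_geometry_audit_trail=True,
+ ... )
+ >>> df[["plotId", "geometry_type_changed"]].head()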
+ """
1816
+ from openforis_whisp.reformat import format_stats_dataframe
1817
+ from datetime import datetime, timezone
1818
+ import json
1819
+ from shapely.geometry import mapping
1820
+
1821
+ logger = logger or logging.getLogger("whisp")
1822
+
1823
+ # Auto-detect decimal places from config if not provided
1824
+ if decimal_places is None:
1825
+ # Use stats_area_columns_formatting as default for most columns
1826
+ decimal_places = _extract_decimal_places(stats_area_columns_formatting)
1827
+ logger.debug(f"Using decimal_places={decimal_places} from config")
1828
+
1829
+
1832
+ # Step 1: Get raw stats
1833
+ logger.debug("Step 1/2: Extracting statistics (concurrent)...")
1834
+ df_raw = whisp_stats_geojson_to_df_concurrent(
1835
+ input_geojson_filepath=input_geojson_filepath,
1836
+ external_id_column=external_id_column,
1837
+ remove_geom=remove_geom,
1838
+ national_codes=national_codes,
1839
+ unit_type=unit_type,
1840
+ whisp_image=whisp_image,
1841
+ custom_bands=custom_bands,
1842
+ batch_size=batch_size,
1843
+ max_concurrent=max_concurrent,
1844
+ validate_geometries=validate_geometries,
1845
+ max_retries=max_retries,
1846
+ add_metadata_server=add_metadata_server,
1847
+ logger=logger,
1848
+ )
1849
+
1850
+ # Step 2: Format the output
1851
+ logger.debug("Step 2/2: Formatting output...")
1852
+ median_cols_before = [c for c in df_raw.columns if c.endswith("_median")]
1853
+ logger.debug(
1854
+ f"Columns ending with '_median' BEFORE formatting: {median_cols_before}"
1855
+ )
1856
+
1857
+ df_formatted = format_stats_dataframe(
1858
+ df=df_raw,
1859
+ area_col=f"{geometry_area_column}_sum",
1860
+ decimal_places=decimal_places,
1861
+ unit_type=unit_type,
1862
+ remove_columns=remove_median_columns,
1863
+ convert_water_flag=convert_water_flag,
1864
+ water_flag_threshold=water_flag_threshold,
1865
+ sort_column=sort_column,
1866
+ )
1867
+
1868
+ median_cols_after = [c for c in df_formatted.columns if c.endswith("_median")]
1869
+ logger.debug(f"Columns ending with '_median' AFTER formatting: {median_cols_after}")
1870
+
1871
+ # Step 2b: Reformat geometry and handle point areas
1872
+ try:
1873
+ df_formatted = reformat_geometry_type(df_formatted)
1874
+ except Exception as e:
1875
+ logger.warning(f"Error reformatting geometry type: {e}")
1876
+
1877
+ try:
1878
+ df_formatted = set_point_geometry_area_to_zero(df_formatted)
1879
+ except Exception as e:
1880
+ logger.warning(f"Error setting point geometry area to zero: {e}")
1881
+
1882
+ # Step 3: Validate against schema
1883
+ logger.debug("Step 3/3: Validating against schema...")
1884
+ from openforis_whisp.reformat import validate_dataframe_using_lookups_flexible
1885
+
1886
+ df_validated = validate_dataframe_using_lookups_flexible(
1887
+ df_stats=df_formatted,
1888
+ national_codes=national_codes,
1889
+ custom_bands=custom_bands,
1890
+ )
1891
+
1892
+ # Step 4: Add audit trail columns (after validation, so the added columns are preserved)
1893
+ if include_geometry_audit_trail:
1894
+ logger.debug("Adding audit trail columns...")
1895
+ try:
1896
+ # Capture original geometries AFTER we have the raw stats
1897
+ logger.debug("Capturing original geometries for audit trail...")
1898
+ gdf_original = _load_geojson_silently(input_geojson_filepath)
1899
+
1900
+ # Use plotId from df_validated to maintain mapping
1901
+ df_original_geom = pd.DataFrame(
1902
+ {
1903
+ "plotId": df_validated["plotId"].values[: len(gdf_original)],
1904
+ "geo_original": gdf_original["geometry"].apply(
1905
+ lambda g: json.dumps(mapping(g)) if g is not None else None
1906
+ ),
1907
+ "geometry_type_original": gdf_original["geometry"].geom_type.values,
1908
+ }
1909
+ )
1910
+
1911
+ # Merge original geometries back
1912
+ df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
1913
+
1914
+ # Extract geometry type from processed 'geo' column if it exists
1915
+ # Note: 'geo' column may not exist after validation removes extra columns
1916
+ if "geo" in df_validated.columns:
1917
+ # Use geo column from validated dataframe
1918
+ def extract_geom_type(x):
1919
+ try:
1920
+ if isinstance(x, dict):
1921
+ return x.get("type")
1922
+ elif isinstance(x, str):
1923
+ # Handle both JSON strings and Python dict string representations
1924
+ try:
1925
+ parsed = json.loads(x)
1926
+ except Exception:
1927
+ # Try ast.literal_eval for Python dict representations
1928
+ import ast
1929
+
1930
+ parsed = ast.literal_eval(x)
1931
+ return (
1932
+ parsed.get("type") if isinstance(parsed, dict) else None
1933
+ )
1934
+ except Exception:
1935
+ pass
1936
+ return None
1937
+
1938
+ df_validated["geometry_type"] = df_validated["geo"].apply(
1939
+ extract_geom_type
1940
+ )
1941
+ else:
1942
+ # If geo doesn't exist, just use the original type
1943
+ df_validated["geometry_type"] = df_validated["geometry_type_original"]
1944
+
1945
+ # Flag if geometry changed
1946
+ df_validated["geometry_type_changed"] = (
1947
+ df_validated["geometry_type_original"] != df_validated["geometry_type"]
1948
+ )
1949
+
1950
+ # Classify the geometry type transition
1951
+ def classify_transition(orig, proc):
1952
+ if orig == proc:
1953
+ return "no_change"
1954
+ elif proc == "LineString":
1955
+ return f"{orig}_simplified_to_linestring"
1956
+ elif proc == "Point":
1957
+ return f"{orig}_simplified_to_point"
1958
+ else:
1959
+ return f"{orig}_to_{proc}"
1960
+
1961
+ df_validated["geometry_type_transition"] = df_validated.apply(
1962
+ lambda row: classify_transition(
1963
+ row["geometry_type_original"], row["geometry_type"]
1964
+ ),
1965
+ axis=1,
1966
+ )
1967
+
1968
+ # Store processing metadata
1969
+ df_validated.attrs["processing_metadata"] = {
1970
+ "whisp_version": "2.0",
1971
+ "processing_date": datetime.now().isoformat(),
1972
+ "processing_mode": "concurrent",
1973
+ "ee_endpoint": "high_volume",
1974
+ "validate_geometries": validate_geometries,
1975
+ "datasets_used": national_codes or [],
1976
+ "include_geometry_audit_trail": True,
1977
+ }
1978
+
1979
+ logger.info(
1980
+ f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
1981
+ )
1982
+
1983
+ except Exception as e:
1984
+ logger.warning(f"Error adding audit trail: {e}")
1985
+ # Continue without audit trail if something fails
1986
+
1987
+ # Add processing metadata column using pd.concat to avoid fragmentation warning
1988
+ metadata_dict = {
1989
+ "whisp_version": "3.0.0a1",
1990
+ "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
1991
+ "%Y-%m-%d %H:%M:%S UTC"
1992
+ ),
1993
+ }
1994
+ metadata_series = pd.Series(
1995
+ [metadata_dict] * len(df_validated), name="whisp_processing_metadata"
1996
+ )
1997
+ df_validated = pd.concat([df_validated, metadata_series], axis=1)
1998
+
1999
+ logger.info("Concurrent processing + formatting + validation complete")
2000
+ return df_validated
2001
+
2002
+
2003
+ def whisp_formatted_stats_geojson_to_df_sequential(
2004
+ input_geojson_filepath: str,
2005
+ external_id_column: str = None,
2006
+ remove_geom: bool = False,
2007
+ national_codes: List[str] = None,
2008
+ unit_type: str = "ha",
2009
+ whisp_image: ee.Image = None,
2010
+ custom_bands: Dict[str, Any] = None,
2011
+ add_metadata_client_side: bool = True,
2012
+ logger: logging.Logger = None,
2013
+ # Format parameters (auto-detect from config if not provided)
2014
+ decimal_places: int = None,
2015
+ remove_median_columns: bool = True,
2016
+ convert_water_flag: bool = True,
2017
+ water_flag_threshold: float = 0.5,
2018
+ sort_column: str = "plotId",
2019
+ include_geometry_audit_trail: bool = False,
2020
+ ) -> pd.DataFrame:
2021
+ """
2022
+ Process GeoJSON sequentially with automatic formatting and validation.
2023
+
2024
+ Combines whisp_stats_geojson_to_df_sequential + format_stats_dataframe + validation
2025
+ for a complete pipeline: extract stats → convert units → format output → validate schema.
2026
+
2027
+ Uses standard endpoint for sequential processing.
2028
+
2029
+ Parameters
2030
+ ----------
2031
+ input_geojson_filepath : str
2032
+ Path to input GeoJSON file
2033
+ external_id_column : str, optional
2034
+ Column name for external IDs
2035
+ remove_geom : bool
2036
+ Remove geometry from output
2037
+ national_codes : List[str], optional
2038
+ ISO2 codes for national datasets
2039
+ unit_type : str
2040
+ "ha" or "percent"
2041
+ whisp_image : ee.Image, optional
2042
+ Pre-combined image
2043
+ custom_bands : Dict[str, Any], optional
2044
+ Custom band information
2045
+ add_metadata_client_side : bool
2046
+ Add client-side metadata (default True)
2047
+ logger : logging.Logger, optional
2048
+ Logger for output
2049
+ decimal_places : int, optional
2050
+ Decimal places for rounding. If None, auto-detects from config:
2051
+ - Area columns: geometry_area_column_formatting
2052
+ - Percent columns: stats_percent_columns_formatting
2053
+ - Other columns: stats_area_columns_formatting
2054
+ remove_median_columns : bool
2055
+ Remove '_median' columns (default True)
2056
+ convert_water_flag : bool
2057
+ Convert water flag to boolean (default True)
2058
+ water_flag_threshold : float
2059
+ Water flag ratio threshold (default 0.5)
2060
+ sort_column : str
2061
+ Column to sort by (default "plotId", None to skip)
2062
+ include_geometry_audit_trail : bool, default False
2063
+ If True, includes audit trail columns:
2064
+ - geo_original: Original input geometry (before EE processing)
2065
+ - geometry_type_original: Original geometry type
2066
+ - geometry_type: Processed geometry type (from EE)
2067
+ - geometry_type_changed: Boolean flag if geometry changed
2068
+ - geometry_type_transition: Description of how it changed
2069
+ These columns enable full transparency and auditability for EUDR compliance.
2070
+
2071
+ Returns
2072
+ -------
2073
+ pd.DataFrame
2074
+ Validated, formatted results DataFrame with optional audit trail
2075
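+
+ Examples
+ --------
+ >>> # A minimal usage sketch: assumes Earth Engine is already initialized and
+ >>> # that "plots.geojson" is a hypothetical input file.
+ >>> df = whisp_formatted_stats_geojson_to_df_sequential(
+ ... "plots.geojson",
+ ... unit_type="percent",
+ ... add_metadata_client_side=True,
+ ... )
+ >>> df.head()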
+ """
2076
+ from openforis_whisp.reformat import format_stats_dataframe
2077
+ from datetime import datetime, timezone
2078
+ import json
2079
+ from shapely.geometry import mapping
2080
+
2081
+ logger = logger or logging.getLogger("whisp")
2082
+
2083
+ # Auto-detect decimal places from config if not provided
2084
+ if decimal_places is None:
2085
+ # Use stats_area_columns_formatting as default for most columns
2086
+ decimal_places = _extract_decimal_places(stats_area_columns_formatting)
2087
+ logger.debug(f"Using decimal_places={decimal_places} from config")
2088
+
2089
+ # Step 1: Get raw stats
2090
+ logger.debug("Step 1/2: Extracting statistics (sequential)...")
2091
+ df_raw = whisp_stats_geojson_to_df_sequential(
2092
+ input_geojson_filepath=input_geojson_filepath,
2093
+ external_id_column=external_id_column,
2094
+ remove_geom=remove_geom,
2095
+ national_codes=national_codes,
2096
+ unit_type=unit_type,
2097
+ whisp_image=whisp_image,
2098
+ custom_bands=custom_bands,
2099
+ add_metadata_client_side=add_metadata_client_side,
2100
+ logger=logger,
2101
+ )
2102
+
2103
+ # Step 2: Format the output
2104
+ logger.debug("Step 2/2: Formatting output...")
2105
+ median_cols_before = [c for c in df_raw.columns if c.endswith("_median")]
2106
+ logger.debug(
2107
+ f"Columns ending with '_median' BEFORE formatting: {median_cols_before}"
2108
+ )
2109
+
2110
+ df_formatted = format_stats_dataframe(
2111
+ df=df_raw,
2112
+ area_col=f"{geometry_area_column}_sum",
2113
+ decimal_places=decimal_places,
2114
+ unit_type=unit_type,
2115
+ remove_columns=remove_median_columns,
2116
+ convert_water_flag=convert_water_flag,
2117
+ water_flag_threshold=water_flag_threshold,
2118
+ sort_column=sort_column,
2119
+ )
2120
+
2121
+ median_cols_after = [c for c in df_formatted.columns if c.endswith("_median")]
2122
+ logger.debug(f"Columns ending with '_median' AFTER formatting: {median_cols_after}")
2123
+
2124
+ # Step 2b: Reformat geometry and handle point areas
2125
+ try:
2126
+ df_formatted = reformat_geometry_type(df_formatted)
2127
+ except Exception as e:
2128
+ logger.warning(f"Error reformatting geometry type: {e}")
2129
+
2130
+ try:
2131
+ df_formatted = set_point_geometry_area_to_zero(df_formatted)
2132
+ except Exception as e:
2133
+ logger.warning(f"Error setting point geometry area to zero: {e}")
2134
+
2135
+ # Step 3: Validate against schema
2136
+ logger.debug("Step 3/3: Validating against schema...")
2137
+ from openforis_whisp.reformat import validate_dataframe_using_lookups_flexible
2138
+
2139
+ df_validated = validate_dataframe_using_lookups_flexible(
2140
+ df_stats=df_formatted,
2141
+ national_codes=national_codes,
2142
+ custom_bands=custom_bands,
2143
+ )
2144
+
2145
+ # Step 4: Add audit trail columns (after validation, so the added columns are preserved)
2146
+ if include_geometry_audit_trail:
2147
+ logger.debug("Adding audit trail columns...")
2148
+ try:
2149
+ # Capture original geometries AFTER we have the raw stats
2150
+ logger.debug("Capturing original geometries for audit trail...")
2151
+ gdf_original = _load_geojson_silently(input_geojson_filepath)
2152
+
2153
+ # Use plotId from df_validated to maintain mapping
2154
+ df_original_geom = pd.DataFrame(
2155
+ {
2156
+ "plotId": df_validated["plotId"].values[: len(gdf_original)],
2157
+ "geo_original": gdf_original["geometry"].apply(
2158
+ lambda g: json.dumps(mapping(g)) if g is not None else None
2159
+ ),
2160
+ "geometry_type_original": gdf_original["geometry"].geom_type.values,
2161
+ }
2162
+ )
2163
+
2164
+ # Merge original geometries back
2165
+ df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
2166
+
2167
+ # Extract geometry type from processed 'geo' column if it exists
2168
+ # Note: 'geo' column may not exist after validation removes extra columns
2169
+ if "geo" in df_validated.columns:
2170
+ # Use geo column from validated dataframe
2171
+ def extract_geom_type(x):
2172
+ try:
2173
+ if isinstance(x, dict):
2174
+ return x.get("type")
2175
+ elif isinstance(x, str):
2176
+ # Handle both JSON strings and Python dict string representations
2177
+ try:
2178
+ parsed = json.loads(x)
2179
+ except Exception:
2180
+ # Try ast.literal_eval for Python dict representations
2181
+ import ast
2182
+
2183
+ parsed = ast.literal_eval(x)
2184
+ return (
2185
+ parsed.get("type") if isinstance(parsed, dict) else None
2186
+ )
2187
+ except Exception:
2188
+ pass
2189
+ return None
2190
+
2191
+ df_validated["geometry_type"] = df_validated["geo"].apply(
2192
+ extract_geom_type
2193
+ )
2194
+ else:
2195
+ # If geo doesn't exist, just use the original type
2196
+ df_validated["geometry_type"] = df_validated["geometry_type_original"]
2197
+
2198
+ # Flag if geometry changed
2199
+ df_validated["geometry_type_changed"] = (
2200
+ df_validated["geometry_type_original"] != df_validated["geometry_type"]
2201
+ )
2202
+
2203
+ # Classify the geometry type transition
2204
+ def classify_transition(orig, proc):
2205
+ if orig == proc:
2206
+ return "no_change"
2207
+ elif proc == "LineString":
2208
+ return f"{orig}_simplified_to_linestring"
2209
+ elif proc == "Point":
2210
+ return f"{orig}_simplified_to_point"
2211
+ else:
2212
+ return f"{orig}_to_{proc}"
2213
+
2214
+ df_validated["geometry_type_transition"] = df_validated.apply(
2215
+ lambda row: classify_transition(
2216
+ row["geometry_type_original"], row["geometry_type"]
2217
+ ),
2218
+ axis=1,
2219
+ )
2220
+
2221
+ # Store processing metadata
2222
+ df_validated.attrs["processing_metadata"] = {
2223
+ "whisp_version": "2.0",
2224
+ "processing_date": datetime.now().isoformat(),
2225
+ "processing_mode": "sequential",
2226
+ "ee_endpoint": "standard",
2227
+ "datasets_used": national_codes or [],
2228
+ "include_geometry_audit_trail": True,
2229
+ }
2230
+
2231
+ logger.info(
2232
+ f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
2233
+ )
2234
+
2235
+ except Exception as e:
2236
+ logger.warning(f"Error adding audit trail: {e}")
2237
+ # Continue without audit trail if something fails
2238
+
2239
+ # Add processing metadata column using pd.concat to avoid fragmentation warning
2240
+ metadata_dict = {
2241
+ "whisp_version": "3.0.0a1",
2242
+ "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
2243
+ "%Y-%m-%d %H:%M:%S UTC"
2244
+ ),
2245
+ }
2246
+ metadata_series = pd.Series(
2247
+ [metadata_dict] * len(df_validated), name="whisp_processing_metadata"
2248
+ )
2249
+ df_validated = pd.concat([df_validated, metadata_series], axis=1)
2250
+
2251
+ logger.info("Sequential processing + formatting + validation complete")
2252
+ return df_validated
2253
+
2254
+
2255
+ # ============================================================================
2256
+ # FAST PROCESSING WITH AUTO-ROUTING
2257
+ # ============================================================================
2258
+
2259
+
2260
+ def whisp_formatted_stats_geojson_to_df_fast(
2261
+ input_geojson_filepath: str,
2262
+ external_id_column: str = None,
2263
+ remove_geom: bool = False,
2264
+ national_codes: List[str] = None,
2265
+ unit_type: str = "ha",
2266
+ whisp_image: ee.Image = None,
2267
+ custom_bands: Dict[str, Any] = None,
2268
+ mode: str = "auto",
2269
+ # Concurrent-specific parameters
2270
+ batch_size: int = 10,
2271
+ max_concurrent: int = 20,
2272
+ validate_geometries: bool = True,
2273
+ max_retries: int = 3,
2274
+ add_metadata_server: bool = False,
2275
+ # Format parameters (auto-detect from config if not provided)
2276
+ decimal_places: int = None,
2277
+ remove_median_columns: bool = True,
2278
+ convert_water_flag: bool = True,
2279
+ water_flag_threshold: float = 0.5,
2280
+ sort_column: str = "plotId",
2281
+ include_geometry_audit_trail: bool = False,
2282
+ ) -> pd.DataFrame:
2283
+ """
2284
+ Process GeoJSON to Whisp statistics with optimized fast processing.
2285
+
2286
+ Automatically selects between concurrent (high-volume endpoint) and sequential
2287
+ (standard endpoint) based on file size, or allows explicit mode selection.
2288
+
2289
+ This is the recommended entry point for most users who want automatic optimization.
2290
+
2291
+ Parameters
2292
+ ----------
2293
+ input_geojson_filepath : str
2294
+ Path to input GeoJSON file
2295
+ external_id_column : str, optional
2296
+ Column name for external IDs
2297
+ remove_geom : bool
2298
+ Remove geometry column from output
2299
+ national_codes : List[str], optional
2300
+ ISO2 codes for national datasets
2301
+ unit_type : str
2302
+ "ha" or "percent"
2303
+ whisp_image : ee.Image, optional
2304
+ Pre-combined image
2305
+ custom_bands : Dict[str, Any], optional
2306
+ Custom band information
2307
+ mode : str
2308
+ Processing mode:
2309
+ - "auto": Choose based on file size (default)
2310
+ * <1MB: sequential
2311
+ * 1-5MB: sequential
2312
+ * >5MB: concurrent
2313
+ - "concurrent": Force high-volume endpoint (batch processing)
2314
+ - "sequential": Force standard endpoint (single-threaded)
2315
+ batch_size : int
2316
+ Features per batch (only for concurrent mode)
2317
+ max_concurrent : int
2318
+ Maximum concurrent EE calls (only for concurrent mode)
2319
+ validate_geometries : bool
2320
+ Validate and clean geometries
2321
+ max_retries : int
2322
+ Retry attempts per batch (only for concurrent mode)
2323
+ add_metadata_server : bool
2324
+ Add metadata server-side (only for concurrent mode)
2325
+ decimal_places : int, optional
2326
+ Decimal places for rounding. If None, auto-detects from config.
2327
+ remove_median_columns : bool
2328
+ Remove '_median' columns
2329
+ convert_water_flag : bool
2330
+ Convert water flag to boolean
2331
+ water_flag_threshold : float
2332
+ Water flag ratio threshold
2333
+ sort_column : str
2334
+ Column to sort by
+ include_geometry_audit_trail : bool
+ If True, add geometry audit trail columns (see the concurrent/sequential wrappers for details)
2335
+
2336
+ Returns
2337
+ -------
2338
+ pd.DataFrame
2339
+ Validated, formatted results DataFrame
2340
+
2341
+ Examples
2342
+ --------
2343
+ >>> # Auto-detect best method based on file size
2344
+ >>> df = whisp_formatted_stats_geojson_to_df_fast("data.geojson")
2345
+
2346
+ >>> # Force concurrent processing for large datasets
2347
+ >>> df = whisp_formatted_stats_geojson_to_df_fast(
2348
+ ... "large_data.geojson",
2349
+ ... mode="concurrent"
2350
+ ... )
2351
+
2352
+ >>> # Use sequential for guaranteed completion
2353
+ >>> df = whisp_formatted_stats_geojson_to_df_fast(
2354
+ ... "data.geojson",
2355
+ ... mode="sequential"
2356
+ ... )
2357
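+
+ >>> # Request the optional geometry audit trail (sketch; file path as above)
+ >>> df = whisp_formatted_stats_geojson_to_df_fast(
+ ... "data.geojson",
+ ... include_geometry_audit_trail=True,
+ ... )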
+ """
2358
+ logger = logging.getLogger("whisp")
2359
+
2360
+ # Determine processing mode
2361
+ if mode == "auto":
2362
+ try:
2363
+ file_size = Path(input_geojson_filepath).stat().st_size
2364
+ if file_size > 5_000_000: # >5MB
2365
+ chosen_mode = "concurrent"
2366
+ logger.info(
2367
+ f"File size {file_size/1e6:.1f}MB → Using concurrent (high-volume endpoint)"
2368
+ )
2369
+ else: # <=5MB
2370
+ chosen_mode = "sequential"
2371
+ logger.info(
2372
+ f"File size {file_size/1e6:.1f}MB → Using sequential (standard endpoint)"
2373
+ )
2374
+ except Exception as e:
2375
+ logger.warning(
2376
+ f"Could not determine file size: {e}. Defaulting to sequential."
2377
+ )
2378
+ chosen_mode = "sequential"
2379
+ elif mode in ("concurrent", "sequential"):
2380
+ chosen_mode = mode
2381
+ logger.info(f"Mode explicitly set to: {mode}")
2382
+ else:
2383
+ raise ValueError(
2384
+ f"Invalid mode '{mode}'. Must be 'auto', 'concurrent', or 'sequential'."
2385
+ )
2386
+
2387
+ # Route to appropriate function
2388
+ if chosen_mode == "concurrent":
2389
+ logger.debug("Routing to concurrent processing...")
2390
+ return whisp_formatted_stats_geojson_to_df_concurrent(
2391
+ input_geojson_filepath=input_geojson_filepath,
2392
+ external_id_column=external_id_column,
2393
+ remove_geom=remove_geom,
2394
+ national_codes=national_codes,
2395
+ unit_type=unit_type,
2396
+ whisp_image=whisp_image,
2397
+ custom_bands=custom_bands,
2398
+ batch_size=batch_size,
2399
+ max_concurrent=max_concurrent,
2400
+ validate_geometries=validate_geometries,
2401
+ max_retries=max_retries,
2402
+ add_metadata_server=add_metadata_server,
2403
+ logger=logger,
2404
+ decimal_places=decimal_places,
2405
+ remove_median_columns=remove_median_columns,
2406
+ convert_water_flag=convert_water_flag,
2407
+ water_flag_threshold=water_flag_threshold,
2408
+ sort_column=sort_column,
2409
+ include_geometry_audit_trail=include_geometry_audit_trail,
2410
+ )
2411
+ else: # sequential
2412
+ logger.debug("Routing to sequential processing...")
2413
+ return whisp_formatted_stats_geojson_to_df_sequential(
2414
+ input_geojson_filepath=input_geojson_filepath,
2415
+ external_id_column=external_id_column,
2416
+ remove_geom=remove_geom,
2417
+ national_codes=national_codes,
2418
+ unit_type=unit_type,
2419
+ whisp_image=whisp_image,
2420
+ custom_bands=custom_bands,
2421
+ logger=logger,
2422
+ decimal_places=decimal_places,
2423
+ remove_median_columns=remove_median_columns,
2424
+ convert_water_flag=convert_water_flag,
2425
+ water_flag_threshold=water_flag_threshold,
2426
+ sort_column=sort_column,
2427
+ include_geometry_audit_trail=include_geometry_audit_trail,
2428
+ )