openforis-whisp 3.0.0a2__py3-none-any.whl → 3.0.0a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,6 +36,24 @@ from typing import Optional, List, Dict, Any, Tuple, Union
36
36
  from concurrent.futures import ThreadPoolExecutor, as_completed
37
37
  import tempfile
38
38
 
39
+ # Configure the "whisp" logger with auto-flush handler for Colab visibility
40
+ _whisp_logger = logging.getLogger("whisp")
41
+ if not _whisp_logger.handlers:
42
+ _handler = logging.StreamHandler(sys.stdout)
43
+ _handler.setLevel(logging.DEBUG)
44
+ _handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
45
+ # Override emit to force flush after each message for Colab
46
+ _original_emit = _handler.emit
47
+
48
+ def _emit_with_flush(record):
49
+ _original_emit(record)
50
+ sys.stdout.flush()
51
+
52
+ _handler.emit = _emit_with_flush
53
+ _whisp_logger.addHandler(_handler)
54
+ _whisp_logger.setLevel(logging.INFO)
55
+ _whisp_logger.propagate = False # Don't propagate to root to avoid duplicates
56
+
39
57
  # ============================================================================
40
58
  # STDOUT/STDERR SUPPRESSION CONTEXT MANAGER (for C-level output)
41
59
  # ============================================================================
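The added block above attaches a flushing stdout handler to the "whisp" logger so progress messages surface promptly in Colab. A minimal sketch of tuning that logger from calling code (only the logger name "whisp" and the standard logging levels are assumed):

import logging

# Tune the package logger configured above; it already has a flushing stdout handler.
whisp_logger = logging.getLogger("whisp")
whisp_logger.setLevel(logging.DEBUG)      # surface debug detail (the handler already accepts DEBUG)
# whisp_logger.setLevel(logging.WARNING)  # or quieten progress messages instead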
@@ -445,6 +463,16 @@ def join_admin_codes(
445
463
  columns=["admin_code_for_join", "gaul1_code"], errors="ignore"
446
464
  )
447
465
 
466
+ # Fill NaN values with "Unknown" and "not found" for features outside admin boundaries
467
+ # (e.g., points in the ocean or international waters)
468
+ df_joined[iso3_country_column] = df_joined[iso3_country_column].fillna(
469
+ "Unknown"
470
+ )
471
+ df_joined[iso2_country_column] = df_joined[iso2_country_column].fillna(
472
+ "not found"
473
+ )
474
+ df_joined[admin_1_column] = df_joined[admin_1_column].fillna("Unknown")
475
+
448
476
  logger.debug(
449
477
  f"Admin codes joined: {iso3_country_column}, {iso2_country_column}, {admin_1_column}"
450
478
  )
@@ -461,10 +489,16 @@ class ProgressTracker:
461
489
 
462
490
  Shows progress at adaptive milestones (more frequent for small datasets,
463
491
  less frequent for large datasets) with estimated time remaining based on
464
- processing speed.
492
+ processing speed. Includes time-based heartbeat to prevent long silences.
465
493
  """
466
494
 
467
- def __init__(self, total: int, logger: logging.Logger = None):
495
+ def __init__(
496
+ self,
497
+ total: int,
498
+ logger: logging.Logger = None,
499
+ heartbeat_interval: int = 180,
500
+ status_file: str = None,
501
+ ):
468
502
  """
469
503
  Initialize progress tracker.
470
504
 
@@ -474,26 +508,147 @@ class ProgressTracker:
474
508
  Total number of items to process
475
509
  logger : logging.Logger, optional
476
510
  Logger for output
511
+ heartbeat_interval : int, optional
512
+ Seconds between heartbeat messages (default: 180 = 3 minutes)
513
+ status_file : str, optional
514
+ Path to JSON status file for API/web app consumption.
515
+ Checkpoints auto-save to same directory as status_file.
477
516
  """
478
517
  self.total = total
479
518
  self.completed = 0
480
519
  self.lock = threading.Lock()
481
520
  self.logger = logger or logging.getLogger("whisp")
521
+ self.heartbeat_interval = heartbeat_interval
522
+
523
+ # Handle status_file: if directory passed, auto-generate filename
524
+ if status_file:
525
+ import os
526
+
527
+ if os.path.isdir(status_file):
528
+ self.status_file = os.path.join(
529
+ status_file, "whisp_processing_status.json"
530
+ )
531
+ else:
532
+ # Validate that parent directory exists
533
+ parent_dir = os.path.dirname(status_file)
534
+ if parent_dir and not os.path.isdir(parent_dir):
535
+ self.logger.warning(
536
+ f"Status file directory does not exist: {parent_dir}"
537
+ )
538
+ self.status_file = None
539
+ else:
540
+ self.status_file = status_file
541
+ else:
542
+ self.status_file = None
482
543
 
483
544
  # Adaptive milestones based on dataset size
484
545
  # Small datasets (< 50): show every 25% (not too spammy)
485
546
  # Medium (50-500): show every 20%
486
- # Large (500+): show every 10% (more frequent feedback on long runs)
547
+ # Large (500-1000): show every 10%
548
+ # Very large (1000+): show every 5% (cleaner for long jobs)
487
549
  if total < 50:
488
550
  self.milestones = {25, 50, 75, 100}
489
551
  elif total < 500:
490
552
  self.milestones = {20, 40, 60, 80, 100}
491
- else:
553
+ elif total < 1000:
492
554
  self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
555
+ else:
556
+ self.milestones = {
557
+ 5,
558
+ 10,
559
+ 15,
560
+ 20,
561
+ 25,
562
+ 30,
563
+ 35,
564
+ 40,
565
+ 45,
566
+ 50,
567
+ 55,
568
+ 60,
569
+ 65,
570
+ 70,
571
+ 75,
572
+ 80,
573
+ 85,
574
+ 90,
575
+ 95,
576
+ 100,
577
+ }
493
578
 
494
579
  self.shown_milestones = set()
495
580
  self.start_time = time.time()
496
581
  self.last_update_time = self.start_time
582
+ self.heartbeat_stop = threading.Event()
583
+ self.heartbeat_thread = None
584
+
585
+ def _write_status_file(self, status: str = "processing") -> None:
586
+ """Write current progress to JSON status file using atomic write."""
587
+ if not self.status_file:
588
+ return
589
+
590
+ try:
591
+ import json
592
+ import os
593
+
594
+ elapsed = time.time() - self.start_time
595
+ percent = (self.completed / self.total * 100) if self.total > 0 else 0
596
+ rate = self.completed / elapsed if elapsed > 0 else 0
597
+ eta = (
598
+ (self.total - self.completed) / rate * 1.15
599
+ if rate > 0 and percent >= 5
600
+ else None
601
+ )
602
+
603
+ # Write to temp file then atomic rename to prevent partial reads
604
+ from datetime import datetime
605
+
606
+ temp_file = self.status_file + ".tmp"
607
+ with open(temp_file, "w") as f:
608
+ json.dump(
609
+ {
610
+ "status": status,
611
+ "progress": f"{self.completed}/{self.total}",
612
+ "percent": round(percent, 1),
613
+ "elapsed_sec": round(elapsed),
614
+ "eta_sec": round(eta) if eta else None,
615
+ "updated_at": datetime.now().isoformat(),
616
+ },
617
+ f,
618
+ )
619
+ os.replace(temp_file, self.status_file)
620
+ except Exception:
621
+ pass
622
+
623
+ def start_heartbeat(self) -> None:
624
+ """Start background heartbeat thread for time-based progress updates."""
625
+ if self.heartbeat_thread is None or not self.heartbeat_thread.is_alive():
626
+ self.heartbeat_stop.clear()
627
+ self.heartbeat_thread = threading.Thread(
628
+ target=self._heartbeat_loop, daemon=True
629
+ )
630
+ self.heartbeat_thread.start()
631
+ # Write initial status
632
+ self._write_status_file(status="processing")
633
+
634
+ def _heartbeat_loop(self) -> None:
635
+ """Background loop that logs progress at time intervals."""
636
+ while not self.heartbeat_stop.wait(self.heartbeat_interval):
637
+ with self.lock:
638
+ # Only log if we haven't shown a milestone recently
639
+ time_since_update = time.time() - self.last_update_time
640
+ if (
641
+ time_since_update >= self.heartbeat_interval
642
+ and self.completed < self.total
643
+ ):
644
+ elapsed = time.time() - self.start_time
645
+ percent = int((self.completed / self.total) * 100)
646
+ elapsed_str = self._format_time(elapsed)
647
+ self.logger.info(
648
+ f"[Processing] {self.completed:,}/{self.total:,} batches ({percent}%) | "
649
+ f"Elapsed: {elapsed_str}"
650
+ )
651
+ self.last_update_time = time.time()
497
652
 
498
653
  def update(self, n: int = 1) -> None:
499
654
  """
@@ -508,7 +663,7 @@ class ProgressTracker:
508
663
  self.completed += n
509
664
  percent = int((self.completed / self.total) * 100)
510
665
 
511
- # Show milestone messages (25%, 50%, 75%, 100%)
666
+ # Show milestone messages (5%, 10%, 15%... for large datasets)
512
667
  for milestone in sorted(self.milestones):
513
668
  if percent >= milestone and milestone not in self.shown_milestones:
514
669
  self.shown_milestones.add(milestone)
@@ -517,20 +672,36 @@ class ProgressTracker:
517
672
  elapsed = time.time() - self.start_time
518
673
  rate = self.completed / elapsed if elapsed > 0 else 0
519
674
  remaining_items = self.total - self.completed
520
- eta_seconds = remaining_items / rate if rate > 0 else 0
675
+
676
+ # Calculate ETA with padding for overhead (loading, joins, etc.)
677
+ # Don't show ETA until we have some samples (at least 5% complete)
678
+ if rate > 0 and self.completed >= max(5, self.total * 0.05):
679
+ eta_seconds = (
680
+ remaining_items / rate
681
+ ) * 1.15 # Add 15% padding for overhead
682
+ else:
683
+ eta_seconds = 0
521
684
 
522
685
  # Format time strings
523
- eta_str = self._format_time(eta_seconds)
686
+ eta_str = (
687
+ self._format_time(eta_seconds)
688
+ if eta_seconds > 0
689
+ else "calculating..."
690
+ )
524
691
  elapsed_str = self._format_time(elapsed)
525
692
 
526
693
  # Build progress message
527
- msg = f"Progress: {self.completed}/{self.total} ({percent}%)"
694
+ msg = f"Progress: {self.completed:,}/{self.total:,} batches ({percent}%)"
528
695
  if percent < 100:
529
696
  msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
530
697
  else:
531
698
  msg += f" | Total time: {elapsed_str}"
532
699
 
533
700
  self.logger.info(msg)
701
+ self.last_update_time = time.time()
702
+
703
+ # Update status file for API consumption
704
+ self._write_status_file()
534
705
 
535
706
  @staticmethod
536
707
  def _format_time(seconds: float) -> str:
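The update logic above pads the raw rate-based ETA by 15% and withholds it until at least 5% of batches have finished. A small worked example of that arithmetic, with illustrative numbers:

# Worked example of the padded ETA (illustrative numbers):
completed, total, elapsed = 100, 1000, 240.0     # 100 batches finished in 4 minutes
rate = completed / elapsed                       # ~0.417 batches per second
eta_seconds = (total - completed) / rate * 1.15  # 2160 s raw + 15% padding = 2484 s (~41 min)
print(round(eta_seconds))                        # 2484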
@@ -544,14 +715,21 @@ class ProgressTracker:
544
715
  hours = seconds / 3600
545
716
  return f"{hours:.1f}h"
546
717
 
547
- def finish(self) -> None:
548
- """Log completion."""
718
+ def finish(self, output_file: str = None) -> None:
719
+ """Stop heartbeat and log completion."""
720
+ # Stop heartbeat thread
721
+ self.heartbeat_stop.set()
722
+ if self.heartbeat_thread and self.heartbeat_thread.is_alive():
723
+ self.heartbeat_thread.join(timeout=1)
724
+
549
725
  with self.lock:
550
726
  total_time = time.time() - self.start_time
551
727
  time_str = self._format_time(total_time)
552
- self.logger.info(
553
- f"Processing complete: {self.completed}/{self.total} batches in {time_str}"
554
- )
728
+ msg = f"Processing complete: {self.completed:,}/{self.total:,} batches in {time_str}"
729
+ self.logger.info(msg)
730
+
731
+ # Write final status
732
+ self._write_status_file(status="completed")
555
733
 
556
734
 
557
735
  # ============================================================================
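Taken together, the new constructor arguments and lifecycle methods work roughly as follows. A hedged usage sketch of the class as defined above (ProgressTracker is normally driven internally by the concurrent runner; the status_file path is illustrative and the import of the class is omitted):

import logging
import time

tracker = ProgressTracker(
    total=40,
    logger=logging.getLogger("whisp"),
    heartbeat_interval=180,
    status_file="/tmp/whisp_status.json",  # a directory also works; the filename is auto-generated
)
tracker.start_heartbeat()                  # background heartbeat thread + initial status JSON
for _ in range(40):
    time.sleep(0.01)                       # stand-in for per-batch work
    tracker.update()                       # logs at adaptive milestones, refreshes the status file
tracker.finish()                           # stops the heartbeat, writes status="completed"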
@@ -600,18 +778,22 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
600
778
  If incorrect endpoint and raise_error=True
601
779
  """
602
780
  if not check_ee_endpoint(endpoint_type):
603
- msg = (
604
- f"Not using {endpoint_type.upper()} endpoint.\n"
605
- f"Current URL: {ee.data._cloud_api_base_url}\n"
606
- f"\nTo use {endpoint_type} endpoint, run:\n"
607
- )
608
- msg += "ee.Reset()\n"
609
781
  if endpoint_type == "high-volume":
610
- msg += (
611
- "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')"
782
+ msg = (
783
+ "Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
784
+ "ee.Reset()\n"
785
+ "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
786
+ "Or with project specified (e.g. when in Colab):\n"
787
+ "ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
788
+ )
789
+ else: # standard endpoint
790
+ msg = (
791
+ "Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
792
+ "ee.Reset()\n"
793
+ "ee.Initialize()\n"
794
+ "Or with project specified (e.g. when in Colab):\n"
795
+ "ee.Initialize(project='your_cloud_project_name')"
612
796
  )
613
- else:
614
- msg += "ee.Initialize() # Uses standard endpoint by default"
615
797
 
616
798
  if raise_error:
617
799
  raise RuntimeError(msg)
@@ -808,8 +990,8 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
808
990
 
809
991
  def clean_geodataframe(
810
992
  gdf: gpd.GeoDataFrame,
811
- remove_nulls: bool = True,
812
- fix_invalid: bool = True,
993
+ remove_nulls: bool = False,
994
+ repair_geometries: bool = False,
813
995
  logger: logging.Logger = None,
814
996
  ) -> gpd.GeoDataFrame:
815
997
  """
@@ -820,9 +1002,11 @@ def clean_geodataframe(
820
1002
  gdf : gpd.GeoDataFrame
821
1003
  Input GeoDataFrame
822
1004
  remove_nulls : bool
823
- Remove null geometries
824
- fix_invalid : bool
825
- Fix invalid geometries
1005
+ Remove null geometries. Defaults to False to preserve data integrity.
1006
+ Set to True only if you explicitly want to drop rows with null geometries.
1007
+ repair_geometries : bool
1008
+ Repair invalid geometries using Shapely's make_valid(). Defaults to False to preserve
1009
+ original geometries. Set to True only if you want to automatically repair invalid geometries.
826
1010
  logger : logging.Logger, optional
827
1011
  Logger for output
828
1012
 
@@ -839,11 +1023,11 @@ def clean_geodataframe(
839
1023
  logger.warning(f"Removing {null_count} null geometries")
840
1024
  gdf = gdf[~gdf.geometry.isna()].copy()
841
1025
 
842
- if fix_invalid:
1026
+ if repair_geometries:
843
1027
  valid_count = gdf.geometry.is_valid.sum()
844
1028
  invalid_count = len(gdf) - valid_count
845
1029
  if invalid_count > 0:
846
- logger.warning(f"Fixing {invalid_count} invalid geometries")
1030
+ logger.warning(f"Repairing {invalid_count} invalid geometries")
847
1031
  from shapely.validation import make_valid
848
1032
 
849
1033
  gdf = gdf.copy()
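Note the behaviour change above: clean_geodataframe now defaults to leaving null and invalid geometries untouched. A short sketch of opting back in to cleaning, using the function and parameter names shown in this diff (the input path is illustrative):

import geopandas as gpd

gdf = gpd.read_file("plots.geojson")  # illustrative path

# The new defaults leave data untouched; opt in explicitly to drop nulls / repair geometries.
gdf_clean = clean_geodataframe(gdf, remove_nulls=True, repair_geometries=True)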
@@ -855,6 +1039,19 @@ def clean_geodataframe(
855
1039
  return gdf
856
1040
 
857
1041
 
1042
+ # ============================================================================
1043
+ # BATCH RETRY HELPER
1044
+ # ============================================================================
1045
+
1046
+
1047
+ # ============================================================================
1048
+ # BATCH RETRY HELPER - DEPRECATED (removed due to semaphore deadlock issues)
1049
+ # ============================================================================
1050
+ # Note: Retry logic via sub-batching has been removed. Instead, use fail-fast
1051
+ # approach: when a batch fails, reduce batch_size parameter and retry manually.
1052
+ # This avoids semaphore deadlocks and provides clearer error messages.
1053
+
1054
+
858
1055
  # ============================================================================
859
1056
  # EE PROCESSING WITH RETRY LOGIC
860
1057
  # ============================================================================
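The fail-fast contract described above means a failed batch surfaces as a RuntimeError and the caller retries with a smaller batch_size. A hedged sketch of that manual retry loop, using the whisp_formatted_stats_geojson_to_df_fast entry point and its batch_size parameter from later in this diff (the input path is illustrative):

# Fail-fast pattern from the note above: halve batch_size and retry on failure.
batch_size = 10
while True:
    try:
        df = whisp_formatted_stats_geojson_to_df_fast(
            "plots.geojson", mode="concurrent", batch_size=batch_size
        )
        break
    except RuntimeError:          # raised with per-batch details and debugging hints
        if batch_size <= 1:
            raise
        batch_size = max(1, batch_size // 2)
        print(f"Batch failure, retrying with batch_size={batch_size}")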
@@ -964,7 +1161,6 @@ def process_ee_batch(
964
1161
  def whisp_stats_geojson_to_df_concurrent(
965
1162
  input_geojson_filepath: str,
966
1163
  external_id_column: str = None,
967
- remove_geom: bool = False,
968
1164
  national_codes: List[str] = None,
969
1165
  unit_type: str = "ha",
970
1166
  whisp_image: ee.Image = None,
@@ -977,6 +1173,7 @@ def whisp_stats_geojson_to_df_concurrent(
977
1173
  logger: logging.Logger = None,
978
1174
  # Format parameters (auto-detect from config if not provided)
979
1175
  decimal_places: int = None,
1176
+ status_file: str = None,
980
1177
  ) -> pd.DataFrame:
981
1178
  """
982
1179
  Process GeoJSON concurrently to compute Whisp statistics with automatic formatting.
@@ -991,8 +1188,6 @@ def whisp_stats_geojson_to_df_concurrent(
991
1188
  Path to input GeoJSON file
992
1189
  external_id_column : str, optional
993
1190
  Column name for external IDs
994
- remove_geom : bool
995
- Remove geometry column from output
996
1191
  national_codes : List[str], optional
997
1192
  ISO2 codes for national datasets
998
1193
  unit_type : str
@@ -1040,8 +1235,29 @@ def whisp_stats_geojson_to_df_concurrent(
1040
1235
  gdf = _load_geojson_silently(input_geojson_filepath)
1041
1236
  logger.info(f"Loaded {len(gdf):,} features")
1042
1237
 
1238
+ # Validate external_id_column if provided (lightweight client-side check)
1239
+ if external_id_column and external_id_column not in gdf.columns:
1240
+ # Exclude geometry column from available columns list
1241
+ available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
1242
+ raise ValueError(
1243
+ f"Column '{external_id_column}' not found in GeoJSON properties. "
1244
+ f"Available columns: {available_cols}"
1245
+ )
1246
+
1247
+ # Check completeness of external_id_column (warn if nulls exist)
1248
+ if external_id_column and external_id_column in gdf.columns:
1249
+ null_count = gdf[external_id_column].isna().sum()
1250
+ if null_count > 0:
1251
+ null_pct = (null_count / len(gdf)) * 100
1252
+ logger.warning(
1253
+ f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
1254
+ f"These features may have missing external IDs in output."
1255
+ )
1256
+
1043
1257
  if validate_geometries:
1044
- gdf = clean_geodataframe(gdf, logger=logger)
1258
+ gdf = clean_geodataframe(
1259
+ gdf, remove_nulls=False, repair_geometries=False, logger=logger
1260
+ )
1045
1261
 
1046
1262
  # Add stable plotIds for merging (starting from 1, not 0)
1047
1263
  gdf[plot_id_column] = range(1, len(gdf) + 1)
@@ -1080,13 +1296,18 @@ def whisp_stats_geojson_to_df_concurrent(
1080
1296
 
1081
1297
  # Batch the data
1082
1298
  batches = batch_geodataframe(gdf_for_ee, batch_size)
1083
- logger.info(f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches")
1299
+ logger.info(
1300
+ f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches (concurrent mode)..."
1301
+ )
1084
1302
 
1085
1303
  # Setup semaphore for EE concurrency control
1086
1304
  ee_semaphore = threading.BoundedSemaphore(max_concurrent)
1087
1305
 
1088
- # Progress tracker
1089
- progress = ProgressTracker(len(batches), logger=logger)
1306
+ # Progress tracker with heartbeat for long-running jobs
1307
+ progress = ProgressTracker(
1308
+ len(batches), logger=logger, heartbeat_interval=180, status_file=status_file
1309
+ )
1310
+ progress.start_heartbeat()
1090
1311
 
1091
1312
  results = []
1092
1313
 
@@ -1127,64 +1348,77 @@ def whisp_stats_geojson_to_df_concurrent(
1127
1348
  pyogrio_logger.setLevel(logging.CRITICAL)
1128
1349
 
1129
1350
  try:
1130
- with redirect_stdout(io.StringIO()):
1131
- with ThreadPoolExecutor(max_workers=pool_workers) as executor:
1132
- futures = {
1133
- executor.submit(process_batch, i, batch): i
1134
- for i, batch in enumerate(batches)
1135
- }
1136
-
1137
- for future in as_completed(futures):
1138
- try:
1139
- batch_idx, df_server, df_client = future.result()
1140
-
1141
- # Merge server and client results
1142
- if plot_id_column not in df_server.columns:
1143
- df_server[plot_id_column] = range(len(df_server))
1144
-
1145
- # Keep all EE statistics from server (all columns with _sum and _median suffixes)
1146
- # These are the actual EE processing results
1147
- df_server_clean = df_server.copy()
1148
-
1149
- # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
1150
- # (formatted wrapper handles keep_external_columns parameter)
1151
- keep_external_columns = [plot_id_column]
1152
- if (
1153
- external_id_column
1154
- and external_id_column in df_client.columns
1155
- ):
1156
- keep_external_columns.append(external_id_column)
1157
- if "geometry" in df_client.columns:
1158
- keep_external_columns.append("geometry")
1159
- # Keep geometry type column (Geometry_type)
1160
- if geometry_type_column in df_client.columns:
1161
- keep_external_columns.append(geometry_type_column)
1162
- # Also keep centroid columns (Centroid_lon, Centroid_lat)
1163
- centroid_cols = [
1164
- c for c in df_client.columns if c.startswith("Centroid_")
1165
- ]
1166
- keep_external_columns.extend(centroid_cols)
1167
-
1168
- df_client_clean = df_client[
1169
- [c for c in keep_external_columns if c in df_client.columns]
1170
- ].drop_duplicates()
1171
-
1172
- merged = df_server_clean.merge(
1173
- df_client_clean,
1174
- on=plot_id_column,
1175
- how="left",
1176
- suffixes=("_ee", "_client"),
1177
- )
1178
- results.append(merged)
1179
- progress.update()
1351
+ # Don't suppress stdout here - we want progress messages to show in Colab
1352
+ with ThreadPoolExecutor(max_workers=pool_workers) as executor:
1353
+ futures = {
1354
+ executor.submit(process_batch, i, batch): i
1355
+ for i, batch in enumerate(batches)
1356
+ }
1180
1357
 
1181
- except Exception as e:
1182
- error_msg = str(e)
1183
- logger.error(f"Batch processing error: {error_msg[:100]}")
1184
- import traceback
1358
+ # Track which batches failed for retry
1359
+ batch_map = {i: batch for i, batch in enumerate(batches)}
1360
+ batch_futures = {future: i for future, i in futures.items()}
1185
1361
 
1186
- logger.debug(traceback.format_exc())
1187
- batch_errors.append(error_msg)
1362
+ for future in as_completed(futures):
1363
+ batch_idx = batch_futures[future]
1364
+ try:
1365
+ batch_idx, df_server, df_client = future.result()
1366
+
1367
+ # Merge server and client results
1368
+ if plot_id_column not in df_server.columns:
1369
+ df_server[plot_id_column] = range(len(df_server))
1370
+
1371
+ # Keep all EE statistics from server (all columns with _sum and _median suffixes)
1372
+ # These are the actual EE processing results
1373
+ df_server_clean = df_server.copy()
1374
+
1375
+ # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
1376
+ # (formatted wrapper handles keep_external_columns parameter)
1377
+ keep_external_columns = [plot_id_column]
1378
+ if external_id_column and external_id_column in df_client.columns:
1379
+ keep_external_columns.append(external_id_column)
1380
+ if "geometry" in df_client.columns:
1381
+ keep_external_columns.append("geometry")
1382
+ # Keep geometry type column (Geometry_type)
1383
+ if geometry_type_column in df_client.columns:
1384
+ keep_external_columns.append(geometry_type_column)
1385
+ # Also keep centroid columns (Centroid_lon, Centroid_lat)
1386
+ centroid_cols = [
1387
+ c for c in df_client.columns if c.startswith("Centroid_")
1388
+ ]
1389
+ keep_external_columns.extend(centroid_cols)
1390
+
1391
+ df_client_clean = df_client[
1392
+ [c for c in keep_external_columns if c in df_client.columns]
1393
+ ]
1394
+ # Don't drop duplicates - we need one row per feature (one per plot_id)
1395
+ # Each plot_id should have exactly one row with its metadata
1396
+
1397
+ merged = df_server_clean.merge(
1398
+ df_client_clean,
1399
+ on=plot_id_column,
1400
+ how="left",
1401
+ suffixes=("_ee", "_client"),
1402
+ )
1403
+ results.append(merged)
1404
+ progress.update()
1405
+
1406
+ except Exception as e:
1407
+ # Batch failed - fail fast with clear guidance
1408
+ error_msg = str(e)
1409
+ logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
1410
+ logger.debug(f"Full error: {error_msg}")
1411
+
1412
+ # Get original batch for error reporting
1413
+ original_batch = batch_map[batch_idx]
1414
+
1415
+ # Add to batch errors for final reporting
1416
+ batch_errors.append((batch_idx, original_batch, error_msg))
1417
+ except (KeyboardInterrupt, SystemExit) as interrupt:
1418
+ logger.warning("Processing interrupted by user")
1419
+ # Update status file with interrupted state
1420
+ progress._write_status_file(status="interrupted")
1421
+ raise interrupt
1188
1422
  finally:
1189
1423
  # Restore logger levels
1190
1424
  fiona_logger.setLevel(old_fiona_level)
@@ -1192,8 +1426,60 @@ def whisp_stats_geojson_to_df_concurrent(
1192
1426
 
1193
1427
  progress.finish()
1194
1428
 
1195
- # Check if we should retry with validation due to band errors
1196
- if batch_errors and not results:
1429
+ # If any batches failed, fail the entire process (fail-fast, no automatic retry)
1430
+ if batch_errors:
1431
+ total_failed_rows = sum(len(batch) for _, batch, _ in batch_errors)
1432
+ failed_batch_indices = [str(idx) for idx, _, _ in batch_errors]
1433
+
1434
+ # Format detailed error information for debugging
1435
+ error_details_list = []
1436
+ for idx, batch, msg in batch_errors:
1437
+ error_details_list.append(f" Batch {idx} ({len(batch)} features): {msg}")
1438
+ error_details = "\n".join(error_details_list)
1439
+
1440
+ # Analyze error patterns for debugging hints
1441
+ error_patterns = {
1442
+ "memory": any("memory" in msg.lower() for _, _, msg in batch_errors),
1443
+ "request_size": any(
1444
+ keyword in msg.lower()
1445
+ for _, _, msg in batch_errors
1446
+ for keyword in ["too large", "10mb", "payload", "size limit"]
1447
+ ),
1448
+ "quota": any("quota" in msg.lower() for _, _, msg in batch_errors),
1449
+ "timeout": any("timeout" in msg.lower() for _, _, msg in batch_errors),
1450
+ }
1451
+
1452
+ # Build helpful suggestions based on error patterns
1453
+ suggestions = []
1454
+ if error_patterns["memory"]:
1455
+ suggestions.append(
1456
+ f" • Reduce batch_size parameter (currently: {batch_size}). Try: batch_size=5 or lower"
1457
+ )
1458
+ if error_patterns["request_size"]:
1459
+ suggestions.append(
1460
+ " • Request payload too large: reduce batch_size or simplify input geometries"
1461
+ )
1462
+ if error_patterns["quota"]:
1463
+ suggestions.append(" • Earth Engine quota exceeded: wait and retry later")
1464
+ if error_patterns["timeout"]:
1465
+ suggestions.append(
1466
+ " • Processing timeout: reduce batch_size or simplify input geometries"
1467
+ )
1468
+
1469
+ suggestions_text = (
1470
+ "\nDebugging hints:\n" + "\n".join(suggestions) if suggestions else ""
1471
+ )
1472
+
1473
+ raise RuntimeError(
1474
+ f"Failed to process {len(batch_errors)} batch(es):\n"
1475
+ f"\n{error_details}\n"
1476
+ f"\nTotal rows affected: {total_failed_rows}\n"
1477
+ f"{suggestions_text}\n"
1478
+ f"Please reduce batch_size and try again."
1479
+ )
1480
+
1481
+ # Check if we should retry with validation due to band errors (legacy band error handling)
1482
+ if not results:
1197
1483
  # All batches failed - likely a bad band issue
1198
1484
  is_band_error = any(
1199
1485
  keyword in str(batch_errors)
@@ -1483,7 +1769,7 @@ def whisp_stats_geojson_to_df_concurrent(
1483
1769
  )
1484
1770
  raise retry_e
1485
1771
 
1486
- logger.info(f"Processed {len(formatted):,} features successfully")
1772
+ logger.info(f"Processing complete: {len(formatted):,} features")
1487
1773
  return formatted
1488
1774
  else:
1489
1775
  logger.error(" No results produced")
@@ -1498,7 +1784,6 @@ def whisp_stats_geojson_to_df_concurrent(
1498
1784
  def whisp_stats_geojson_to_df_sequential(
1499
1785
  input_geojson_filepath: str,
1500
1786
  external_id_column: str = None,
1501
- remove_geom: bool = False,
1502
1787
  national_codes: List[str] = None,
1503
1788
  unit_type: str = "ha",
1504
1789
  whisp_image: ee.Image = None,
@@ -1523,8 +1808,6 @@ def whisp_stats_geojson_to_df_sequential(
1523
1808
  Path to input GeoJSON
1524
1809
  external_id_column : str, optional
1525
1810
  Column name for external IDs
1526
- remove_geom : bool
1527
- Remove geometry from output
1528
1811
  national_codes : List[str], optional
1529
1812
  ISO2 codes for national datasets
1530
1813
  unit_type : str
@@ -1564,8 +1847,29 @@ def whisp_stats_geojson_to_df_sequential(
1564
1847
  gdf = _load_geojson_silently(input_geojson_filepath)
1565
1848
  logger.info(f"Loaded {len(gdf):,} features")
1566
1849
 
1567
- # Clean geometries
1568
- gdf = clean_geodataframe(gdf, logger=logger)
1850
+ # Validate external_id_column if provided (lightweight client-side check)
1851
+ if external_id_column and external_id_column not in gdf.columns:
1852
+ # Exclude geometry column from available columns list
1853
+ available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
1854
+ raise ValueError(
1855
+ f"Column '{external_id_column}' not found in GeoJSON properties. "
1856
+ f"Available columns: {available_cols}"
1857
+ )
1858
+
1859
+ # Check completeness of external_id_column (warn if nulls exist)
1860
+ if external_id_column and external_id_column in gdf.columns:
1861
+ null_count = gdf[external_id_column].isna().sum()
1862
+ if null_count > 0:
1863
+ null_pct = (null_count / len(gdf)) * 100
1864
+ logger.warning(
1865
+ f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
1866
+ f"These features may have missing external IDs in output."
1867
+ )
1868
+
1869
+ # Clean geometries (preserve both null and invalid geometries by default)
1870
+ gdf = clean_geodataframe(
1871
+ gdf, remove_nulls=False, repair_geometries=False, logger=logger
1872
+ )
1569
1873
 
1570
1874
  # Add stable plotIds for merging (starting from 1, not 0)
1571
1875
  gdf[plot_id_column] = range(1, len(gdf) + 1)
@@ -1612,7 +1916,9 @@ def whisp_stats_geojson_to_df_sequential(
1612
1916
  reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
1613
1917
 
1614
1918
  # Process server-side with error handling for bad bands
1615
- logger.info("Processing with Earth Engine...")
1919
+ logger.info(
1920
+ f"Processing {len(gdf):,} features with Earth Engine (sequential mode)..."
1921
+ )
1616
1922
  try:
1617
1923
  results_fc = whisp_image.reduceRegions(collection=fc, reducer=reducer, scale=10)
1618
1924
  df_server = convert_ee_to_df(results_fc)
@@ -1698,7 +2004,7 @@ def whisp_stats_geojson_to_df_sequential(
1698
2004
  convert_water_flag=True,
1699
2005
  )
1700
2006
 
1701
- logger.info(f"Processed {len(formatted):,} features")
2007
+ logger.info(f"Processing complete: {len(formatted):,} features")
1702
2008
 
1703
2009
  # Consolidate external_id_column to standardized 'external_id'
1704
2010
  if external_id_column:
@@ -1731,7 +2037,6 @@ def whisp_stats_geojson_to_df_sequential(
1731
2037
  def whisp_formatted_stats_geojson_to_df_concurrent(
1732
2038
  input_geojson_filepath: str,
1733
2039
  external_id_column: str = None,
1734
- remove_geom: bool = False,
1735
2040
  national_codes: List[str] = None,
1736
2041
  unit_type: str = "ha",
1737
2042
  whisp_image: ee.Image = None,
@@ -1748,7 +2053,8 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
1748
2053
  convert_water_flag: bool = True,
1749
2054
  water_flag_threshold: float = 0.5,
1750
2055
  sort_column: str = "plotId",
1751
- include_geometry_audit_trail: bool = False,
2056
+ geometry_audit_trail: bool = False,
2057
+ status_file: str = None,
1752
2058
  ) -> pd.DataFrame:
1753
2059
  """
1754
2060
  Process GeoJSON concurrently with automatic formatting and validation.
@@ -1764,8 +2070,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
1764
2070
  Path to input GeoJSON file
1765
2071
  external_id_column : str, optional
1766
2072
  Column name for external IDs
1767
- remove_geom : bool
1768
- Remove geometry column from output
1769
2073
  national_codes : List[str], optional
1770
2074
  ISO2 codes for national datasets
1771
2075
  unit_type : str
@@ -1799,14 +2103,10 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
1799
2103
  Water flag ratio threshold (default 0.5)
1800
2104
  sort_column : str
1801
2105
  Column to sort by (default "plotId", None to skip)
1802
- include_geometry_audit_trail : bool, default False
1803
- If True, includes audit trail columns:
1804
- - geo_original: Original input geometry (before EE processing)
1805
- - geometry_type_original: Original geometry type
1806
- - geometry_type: Processed geometry type (from EE)
1807
- - geometry_type_changed: Boolean flag if geometry changed
1808
- - geometry_type_transition: Description of how it changed
1809
- These columns enable full transparency and auditability for compliance tracking.
2106
+ geometry_audit_trail : bool, default False
2107
+ If True, includes original input geometry column:
2108
+ - geo_original: Original input geometry (before EE processing), stored as GeoJSON
2109
+ Enables geometry traceability for compliance and audit purposes.
1810
2110
 
1811
2111
  Returns
1812
2112
  -------
@@ -1826,15 +2126,17 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
1826
2126
  decimal_places = _extract_decimal_places(stats_area_columns_formatting)
1827
2127
  logger.debug(f"Using decimal_places={decimal_places} from config")
1828
2128
 
1829
- # Normalize keep_external_columns parameter early (will be used in merge logic later)
1830
- # Load GeoJSON temporarily to get column names for normalization
2129
+ # Load original geometries once here if needed for audit trail (avoid reloading later)
2130
+ gdf_original_geoms = None
2131
+ if geometry_audit_trail:
2132
+ logger.debug("Pre-loading GeoJSON for geometry audit trail...")
2133
+ gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
1831
2134
 
1832
2135
  # Step 1: Get raw stats
1833
2136
  logger.debug("Step 1/2: Extracting statistics (concurrent)...")
1834
2137
  df_raw = whisp_stats_geojson_to_df_concurrent(
1835
2138
  input_geojson_filepath=input_geojson_filepath,
1836
2139
  external_id_column=external_id_column,
1837
- remove_geom=remove_geom,
1838
2140
  national_codes=national_codes,
1839
2141
  unit_type=unit_type,
1840
2142
  whisp_image=whisp_image,
@@ -1845,6 +2147,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
1845
2147
  max_retries=max_retries,
1846
2148
  add_metadata_server=add_metadata_server,
1847
2149
  logger=logger,
2150
+ status_file=status_file,
1848
2151
  )
1849
2152
 
1850
2153
  # Step 2: Format the output
@@ -1890,95 +2193,39 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
1890
2193
  )
1891
2194
 
1892
2195
  # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
1893
- if include_geometry_audit_trail:
2196
+ if geometry_audit_trail:
1894
2197
  logger.debug("Adding audit trail columns...")
1895
2198
  try:
1896
- # Capture original geometries AFTER we have the raw stats
1897
- logger.debug("Capturing original geometries for audit trail...")
1898
- gdf_original = _load_geojson_silently(input_geojson_filepath)
2199
+ # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
2200
+ if gdf_original_geoms is None:
2201
+ logger.warning("Original geometries not pre-loaded, loading now...")
2202
+ gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
1899
2203
 
1900
2204
  # Use plotId from df_validated to maintain mapping
1901
2205
  df_original_geom = pd.DataFrame(
1902
2206
  {
1903
- "plotId": df_validated["plotId"].values[: len(gdf_original)],
1904
- "geo_original": gdf_original["geometry"].apply(
2207
+ "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
2208
+ "geo_original": gdf_original_geoms["geometry"].apply(
1905
2209
  lambda g: json.dumps(mapping(g)) if g is not None else None
1906
2210
  ),
1907
- "geometry_type_original": gdf_original["geometry"].geom_type.values,
1908
2211
  }
1909
2212
  )
1910
2213
 
1911
2214
  # Merge original geometries back
1912
2215
  df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
1913
2216
 
1914
- # Extract geometry type from processed 'geo' column if it exists
1915
- # Note: 'geo' column may not exist after validation removes extra columns
1916
- if "geo" in df_validated.columns:
1917
- # Use geo column from validated dataframe
1918
- def extract_geom_type(x):
1919
- try:
1920
- if isinstance(x, dict):
1921
- return x.get("type")
1922
- elif isinstance(x, str):
1923
- # Handle both JSON strings and Python dict string representations
1924
- try:
1925
- parsed = json.loads(x)
1926
- except:
1927
- # Try ast.literal_eval for Python dict representations
1928
- import ast
1929
-
1930
- parsed = ast.literal_eval(x)
1931
- return (
1932
- parsed.get("type") if isinstance(parsed, dict) else None
1933
- )
1934
- except:
1935
- pass
1936
- return None
1937
-
1938
- df_validated["geometry_type"] = df_validated["geo"].apply(
1939
- extract_geom_type
1940
- )
1941
- else:
1942
- # If geo doesn't exist, just use the original type
1943
- df_validated["geometry_type"] = df_validated["geometry_type_original"]
1944
-
1945
- # Flag if geometry changed
1946
- df_validated["geometry_type_changed"] = (
1947
- df_validated["geometry_type_original"] != df_validated["geometry_type"]
1948
- )
1949
-
1950
- # Classify the geometry type transition
1951
- def classify_transition(orig, proc):
1952
- if orig == proc:
1953
- return "no_change"
1954
- elif proc == "LineString":
1955
- return f"{orig}_simplified_to_linestring"
1956
- elif proc == "Point":
1957
- return f"{orig}_simplified_to_point"
1958
- else:
1959
- return f"{orig}_to_{proc}"
1960
-
1961
- df_validated["geometry_type_transition"] = df_validated.apply(
1962
- lambda row: classify_transition(
1963
- row["geometry_type_original"], row["geometry_type"]
1964
- ),
1965
- axis=1,
1966
- )
1967
-
1968
2217
  # Store processing metadata
1969
2218
  df_validated.attrs["processing_metadata"] = {
1970
- "whisp_version": "2.0",
2219
+ "whisp_version": "3.0.0a1",
1971
2220
  "processing_date": datetime.now().isoformat(),
1972
2221
  "processing_mode": "concurrent",
1973
2222
  "ee_endpoint": "high_volume",
1974
2223
  "validate_geometries": validate_geometries,
1975
2224
  "datasets_used": national_codes or [],
1976
- "include_geometry_audit_trail": True,
2225
+ "geometry_audit_trail": True,
1977
2226
  }
1978
2227
 
1979
- logger.info(
1980
- f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
1981
- )
2228
+ logger.info(f"Audit trail added: geo_original column")
1982
2229
 
1983
2230
  except Exception as e:
1984
2231
  logger.warning(f"Error adding audit trail: {e}")
@@ -2003,7 +2250,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
2003
2250
  def whisp_formatted_stats_geojson_to_df_sequential(
2004
2251
  input_geojson_filepath: str,
2005
2252
  external_id_column: str = None,
2006
- remove_geom: bool = False,
2007
2253
  national_codes: List[str] = None,
2008
2254
  unit_type: str = "ha",
2009
2255
  whisp_image: ee.Image = None,
@@ -2016,7 +2262,8 @@ def whisp_formatted_stats_geojson_to_df_sequential(
2016
2262
  convert_water_flag: bool = True,
2017
2263
  water_flag_threshold: float = 0.5,
2018
2264
  sort_column: str = "plotId",
2019
- include_geometry_audit_trail: bool = False,
2265
+ geometry_audit_trail: bool = False,
2266
+ status_file: str = None,
2020
2267
  ) -> pd.DataFrame:
2021
2268
  """
2022
2269
  Process GeoJSON sequentially with automatic formatting and validation.
@@ -2032,8 +2279,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
2032
2279
  Path to input GeoJSON file
2033
2280
  external_id_column : str, optional
2034
2281
  Column name for external IDs
2035
- remove_geom : bool
2036
- Remove geometry from output
2037
2282
  national_codes : List[str], optional
2038
2283
  ISO2 codes for national datasets
2039
2284
  unit_type : str
@@ -2059,14 +2304,10 @@ def whisp_formatted_stats_geojson_to_df_sequential(
2059
2304
  Water flag ratio threshold (default 0.5)
2060
2305
  sort_column : str
2061
2306
  Column to sort by (default "plotId", None to skip)
2062
- include_geometry_audit_trail : bool, default True
2063
- If True, includes audit trail columns:
2064
- - geo_original: Original input geometry (before EE processing)
2065
- - geometry_type_original: Original geometry type
2066
- - geometry_type: Processed geometry type (from EE)
2067
- - geometry_type_changed: Boolean flag if geometry changed
2068
- - geometry_type_transition: Description of how it changed
2069
- These columns enable full transparency and auditability for EUDR compliance.
2307
+ geometry_audit_trail : bool, default False
2308
+ If True, includes original input geometry column:
2309
+ - geo_original: Original input geometry (before EE processing), stored as GeoJSON
2310
+ Enables geometry traceability for compliance and audit purposes.
2070
2311
 
2071
2312
  Returns
2072
2313
  -------
@@ -2086,12 +2327,17 @@ def whisp_formatted_stats_geojson_to_df_sequential(
2086
2327
  decimal_places = _extract_decimal_places(stats_area_columns_formatting)
2087
2328
  logger.debug(f"Using decimal_places={decimal_places} from config")
2088
2329
 
2330
+ # Load original geometries once here if needed for audit trail (avoid reloading later)
2331
+ gdf_original_geoms = None
2332
+ if geometry_audit_trail:
2333
+ logger.debug("Pre-loading GeoJSON for geometry audit trail...")
2334
+ gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
2335
+
2089
2336
  # Step 1: Get raw stats
2090
2337
  logger.debug("Step 1/2: Extracting statistics (sequential)...")
2091
2338
  df_raw = whisp_stats_geojson_to_df_sequential(
2092
2339
  input_geojson_filepath=input_geojson_filepath,
2093
2340
  external_id_column=external_id_column,
2094
- remove_geom=remove_geom,
2095
2341
  national_codes=national_codes,
2096
2342
  unit_type=unit_type,
2097
2343
  whisp_image=whisp_image,
@@ -2143,94 +2389,38 @@ def whisp_formatted_stats_geojson_to_df_sequential(
2143
2389
  )
2144
2390
 
2145
2391
  # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
2146
- if include_geometry_audit_trail:
2392
+ if geometry_audit_trail:
2147
2393
  logger.debug("Adding audit trail columns...")
2148
2394
  try:
2149
- # Capture original geometries AFTER we have the raw stats
2150
- logger.debug("Capturing original geometries for audit trail...")
2151
- gdf_original = _load_geojson_silently(input_geojson_filepath)
2395
+ # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
2396
+ if gdf_original_geoms is None:
2397
+ logger.warning("Original geometries not pre-loaded, loading now...")
2398
+ gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
2152
2399
 
2153
2400
  # Use plotId from df_validated to maintain mapping
2154
2401
  df_original_geom = pd.DataFrame(
2155
2402
  {
2156
- "plotId": df_validated["plotId"].values[: len(gdf_original)],
2157
- "geo_original": gdf_original["geometry"].apply(
2403
+ "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
2404
+ "geo_original": gdf_original_geoms["geometry"].apply(
2158
2405
  lambda g: json.dumps(mapping(g)) if g is not None else None
2159
2406
  ),
2160
- "geometry_type_original": gdf_original["geometry"].geom_type.values,
2161
2407
  }
2162
2408
  )
2163
2409
 
2164
2410
  # Merge original geometries back
2165
2411
  df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
2166
2412
 
2167
- # Extract geometry type from processed 'geo' column if it exists
2168
- # Note: 'geo' column may not exist after validation removes extra columns
2169
- if "geo" in df_validated.columns:
2170
- # Use geo column from validated dataframe
2171
- def extract_geom_type(x):
2172
- try:
2173
- if isinstance(x, dict):
2174
- return x.get("type")
2175
- elif isinstance(x, str):
2176
- # Handle both JSON strings and Python dict string representations
2177
- try:
2178
- parsed = json.loads(x)
2179
- except:
2180
- # Try ast.literal_eval for Python dict representations
2181
- import ast
2182
-
2183
- parsed = ast.literal_eval(x)
2184
- return (
2185
- parsed.get("type") if isinstance(parsed, dict) else None
2186
- )
2187
- except:
2188
- pass
2189
- return None
2190
-
2191
- df_validated["geometry_type"] = df_validated["geo"].apply(
2192
- extract_geom_type
2193
- )
2194
- else:
2195
- # If geo doesn't exist, just use the original type
2196
- df_validated["geometry_type"] = df_validated["geometry_type_original"]
2197
-
2198
- # Flag if geometry changed
2199
- df_validated["geometry_type_changed"] = (
2200
- df_validated["geometry_type_original"] != df_validated["geometry_type"]
2201
- )
2202
-
2203
- # Classify the geometry type transition
2204
- def classify_transition(orig, proc):
2205
- if orig == proc:
2206
- return "no_change"
2207
- elif proc == "LineString":
2208
- return f"{orig}_simplified_to_linestring"
2209
- elif proc == "Point":
2210
- return f"{orig}_simplified_to_point"
2211
- else:
2212
- return f"{orig}_to_{proc}"
2213
-
2214
- df_validated["geometry_type_transition"] = df_validated.apply(
2215
- lambda row: classify_transition(
2216
- row["geometry_type_original"], row["geometry_type"]
2217
- ),
2218
- axis=1,
2219
- )
2220
-
2221
2413
  # Store processing metadata
2222
2414
  df_validated.attrs["processing_metadata"] = {
2223
- "whisp_version": "2.0",
2415
+ "whisp_version": "3.0.0a1",
2224
2416
  "processing_date": datetime.now().isoformat(),
2225
2417
  "processing_mode": "sequential",
2226
2418
  "ee_endpoint": "standard",
2227
2419
  "datasets_used": national_codes or [],
2228
- "include_geometry_audit_trail": True,
2420
+ "geometry_audit_trail": True,
2229
2421
  }
2230
2422
 
2231
- logger.info(
2232
- f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
2233
- )
2423
+ logger.info(f"Audit trail added: geo_original column")
2234
2424
 
2235
2425
  except Exception as e:
2236
2426
  logger.warning(f"Error adding audit trail: {e}")
@@ -2260,12 +2450,11 @@ def whisp_formatted_stats_geojson_to_df_sequential(
2260
2450
  def whisp_formatted_stats_geojson_to_df_fast(
2261
2451
  input_geojson_filepath: str,
2262
2452
  external_id_column: str = None,
2263
- remove_geom: bool = False,
2264
2453
  national_codes: List[str] = None,
2265
2454
  unit_type: str = "ha",
2266
2455
  whisp_image: ee.Image = None,
2267
2456
  custom_bands: Dict[str, Any] = None,
2268
- mode: str = "auto",
2457
+ mode: str = "sequential",
2269
2458
  # Concurrent-specific parameters
2270
2459
  batch_size: int = 10,
2271
2460
  max_concurrent: int = 20,
@@ -2278,15 +2467,16 @@ def whisp_formatted_stats_geojson_to_df_fast(
2278
2467
  convert_water_flag: bool = True,
2279
2468
  water_flag_threshold: float = 0.5,
2280
2469
  sort_column: str = "plotId",
2281
- include_geometry_audit_trail: bool = False,
2470
+ geometry_audit_trail: bool = False,
2471
+ status_file: str = None,
2282
2472
  ) -> pd.DataFrame:
2283
2473
  """
2284
2474
  Process GeoJSON to Whisp statistics with optimized fast processing.
2285
2475
 
2286
- Automatically selects between concurrent (high-volume endpoint) and sequential
2287
- (standard endpoint) based on file size, or allows explicit mode selection.
2476
+ Routes to concurrent (high-volume endpoint) or sequential (standard endpoint)
2477
+ based on explicit mode selection.
2288
2478
 
2289
- This is the recommended entry point for most users who want automatic optimization.
2479
+ This is the recommended entry point for most users.
2290
2480
 
2291
2481
  Parameters
2292
2482
  ----------
@@ -2294,8 +2484,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
2294
2484
  Path to input GeoJSON file
2295
2485
  external_id_column : str, optional
2296
2486
  Column name for external IDs
2297
- remove_geom : bool
2298
- Remove geometry column from output
2299
2487
  national_codes : List[str], optional
2300
2488
  ISO2 codes for national datasets
2301
2489
  unit_type : str
@@ -2306,12 +2494,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
2306
2494
  Custom band information
2307
2495
  mode : str
2308
2496
  Processing mode:
2309
- - "auto": Choose based on file size (default)
2310
- * <1MB: sequential
2311
- * 1-5MB: sequential
2312
- * >5MB: concurrent
2313
- - "concurrent": Force high-volume endpoint (batch processing)
2314
- - "sequential": Force standard endpoint (single-threaded)
2497
+ - "concurrent": Uses high-volume endpoint with batch processing
2498
+ - "sequential": Uses standard endpoint for sequential processing
2315
2499
  batch_size : int
2316
2500
  Features per batch (only for concurrent mode)
2317
2501
  max_concurrent : int
@@ -2332,6 +2516,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
2332
2516
  Water flag ratio threshold
2333
2517
  sort_column : str
2334
2518
  Column to sort by
2519
+ geometry_audit_trail : bool
2520
+ Include geometry modification audit trail columns
2335
2521
 
2336
2522
  Returns
2337
2523
  -------
@@ -2340,16 +2526,13 @@ def whisp_formatted_stats_geojson_to_df_fast(
2340
2526
 
2341
2527
  Examples
2342
2528
  --------
2343
- >>> # Auto-detect best method based on file size
2344
- >>> df = whisp_formatted_stats_geojson_to_df_fast("data.geojson")
2345
-
2346
- >>> # Force concurrent processing for large datasets
2529
+ >>> # Use concurrent processing (recommended for most datasets)
2347
2530
  >>> df = whisp_formatted_stats_geojson_to_df_fast(
2348
- ... "large_data.geojson",
2531
+ ... "data.geojson",
2349
2532
  ... mode="concurrent"
2350
2533
  ... )
2351
2534
 
2352
- >>> # Use sequential for guaranteed completion
2535
+ >>> # Use sequential processing for more stable results
2353
2536
  >>> df = whisp_formatted_stats_geojson_to_df_fast(
2354
2537
  ... "data.geojson",
2355
2538
  ... mode="sequential"
@@ -2357,40 +2540,20 @@ def whisp_formatted_stats_geojson_to_df_fast(
2357
2540
  """
2358
2541
  logger = logging.getLogger("whisp")
2359
2542
 
2360
- # Determine processing mode
2361
- if mode == "auto":
2362
- try:
2363
- file_size = Path(input_geojson_filepath).stat().st_size
2364
- if file_size > 5_000_000: # >5MB
2365
- chosen_mode = "concurrent"
2366
- logger.info(
2367
- f"File size {file_size/1e6:.1f}MB → Using concurrent (high-volume endpoint)"
2368
- )
2369
- else: # <=5MB
2370
- chosen_mode = "sequential"
2371
- logger.info(
2372
- f"File size {file_size/1e6:.1f}MB → Using sequential (standard endpoint)"
2373
- )
2374
- except Exception as e:
2375
- logger.warning(
2376
- f"Could not determine file size: {e}. Defaulting to sequential."
2377
- )
2378
- chosen_mode = "sequential"
2379
- elif mode in ("concurrent", "sequential"):
2380
- chosen_mode = mode
2381
- logger.info(f"Mode explicitly set to: {mode}")
2382
- else:
2543
+ # Validate mode parameter
2544
+ if mode not in ("concurrent", "sequential"):
2383
2545
  raise ValueError(
2384
- f"Invalid mode '{mode}'. Must be 'auto', 'concurrent', or 'sequential'."
2546
+ f"Invalid mode '{mode}'. Must be 'concurrent' or 'sequential'."
2385
2547
  )
2386
2548
 
2549
+ logger.info(f"Mode: {mode}")
2550
+
2387
2551
  # Route to appropriate function
2388
- if chosen_mode == "concurrent":
2552
+ if mode == "concurrent":
2389
2553
  logger.debug("Routing to concurrent processing...")
2390
2554
  return whisp_formatted_stats_geojson_to_df_concurrent(
2391
2555
  input_geojson_filepath=input_geojson_filepath,
2392
2556
  external_id_column=external_id_column,
2393
- remove_geom=remove_geom,
2394
2557
  national_codes=national_codes,
2395
2558
  unit_type=unit_type,
2396
2559
  whisp_image=whisp_image,
@@ -2406,14 +2569,14 @@ def whisp_formatted_stats_geojson_to_df_fast(
2406
2569
  convert_water_flag=convert_water_flag,
2407
2570
  water_flag_threshold=water_flag_threshold,
2408
2571
  sort_column=sort_column,
2409
- include_geometry_audit_trail=include_geometry_audit_trail,
2572
+ geometry_audit_trail=geometry_audit_trail,
2573
+ status_file=status_file,
2410
2574
  )
2411
2575
  else: # sequential
2412
2576
  logger.debug("Routing to sequential processing...")
2413
2577
  return whisp_formatted_stats_geojson_to_df_sequential(
2414
2578
  input_geojson_filepath=input_geojson_filepath,
2415
2579
  external_id_column=external_id_column,
2416
- remove_geom=remove_geom,
2417
2580
  national_codes=national_codes,
2418
2581
  unit_type=unit_type,
2419
2582
  whisp_image=whisp_image,
@@ -2424,5 +2587,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
2424
2587
  convert_water_flag=convert_water_flag,
2425
2588
  water_flag_threshold=water_flag_threshold,
2426
2589
  sort_column=sort_column,
2427
- include_geometry_audit_trail=include_geometry_audit_trail,
2590
+ geometry_audit_trail=geometry_audit_trail,
2591
+ status_file=status_file,
2428
2592
  )
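Putting the new parameters together, a hedged end-to-end sketch of the fast entry point; the file path and external_id_column value are illustrative, while the keyword names are as defined in this diff:

df = whisp_formatted_stats_geojson_to_df_fast(
    "plots.geojson",                       # illustrative path
    external_id_column="farm_id",          # illustrative column; validated client-side
    mode="concurrent",                     # "auto" is no longer accepted
    batch_size=10,
    status_file="/tmp/whisp_status.json",  # JSON progress for API/web-app polling
    geometry_audit_trail=True,             # adds the geo_original column
)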