openforis-whisp 3.0.0a3__py3-none-any.whl → 3.0.0a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,6 +36,24 @@ from typing import Optional, List, Dict, Any, Tuple, Union
36
36
  from concurrent.futures import ThreadPoolExecutor, as_completed
37
37
  import tempfile
38
38
 
39
# Configure the "whisp" logger once so that every record is pushed to the
# console immediately (the original comment cites Colab visibility).
# NOTE(review): statement nesting was reconstructed from a whitespace-stripped
# diff; confirm that the level/propagate assignments belong inside the guard.
class _FlushingStdoutHandler(logging.StreamHandler):
    """Stream handler that also flushes the current ``sys.stdout`` per record."""

    def emit(self, record):
        super().emit(record)
        # sys.stdout is resolved at call time, not when the handler was built.
        sys.stdout.flush()


_whisp_logger = logging.getLogger("whisp")
if not _whisp_logger.handlers:
    _handler = _FlushingStdoutHandler(sys.stdout)
    _handler.setLevel(logging.DEBUG)
    _handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    _whisp_logger.addHandler(_handler)
    _whisp_logger.setLevel(logging.INFO)
    # Keep records away from the root logger so they are not printed twice.
    _whisp_logger.propagate = False
56
+
39
57
  # ============================================================================
40
58
  # STDOUT/STDERR SUPPRESSION CONTEXT MANAGER (for C-level output)
41
59
  # ============================================================================
@@ -163,8 +181,25 @@ def _suppress_verbose_output(max_concurrent: int = None):
163
181
  reformat_logger.setLevel(logging.ERROR)
164
182
 
165
183
 
166
- def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
167
- """Load GeoJSON file with all output suppressed."""
184
+ def _load_and_prepare_geojson(
185
+ filepath: str, external_id_column: Optional[str] = None
186
+ ) -> gpd.GeoDataFrame:
187
+ """Load GeoJSON file and prepare for processing.
188
+
189
+ Suppresses logging output and optionally renames external_id column.
190
+
191
+ Parameters
192
+ ----------
193
+ filepath : str
194
+ Path to GeoJSON file
195
+ external_id_column : str, optional
196
+ If provided, rename this column to 'external_id' immediately after loading
197
+
198
+ Returns
199
+ -------
200
+ gpd.GeoDataFrame
201
+ Loaded GeoDataFrame with external_id renamed if specified
202
+ """
168
203
  fiona_logger = logging.getLogger("fiona")
169
204
  pyogrio_logger = logging.getLogger("pyogrio._io")
170
205
  old_fiona_level = fiona_logger.level
@@ -175,6 +210,16 @@ def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
175
210
  try:
176
211
  with redirect_stdout(io.StringIO()):
177
212
  gdf = gpd.read_file(filepath)
213
+
214
+ # Rename external_id column early and convert to string
215
+ if external_id_column and external_id_column in gdf.columns:
216
+ if external_id_column != "external_id":
217
+ gdf = gdf.rename(
218
+ columns={external_id_column: "external_id"}
219
+ ) # hard coding here to avoid confusion later
220
+ # Convert to string to ensure consistent type throughout pipeline
221
+ gdf["external_id"] = gdf["external_id"].astype(str)
222
+
178
223
  return gdf
179
224
  finally:
180
225
  fiona_logger.setLevel(old_fiona_level)
@@ -445,6 +490,16 @@ def join_admin_codes(
445
490
  columns=["admin_code_for_join", "gaul1_code"], errors="ignore"
446
491
  )
447
492
 
493
+ # Fill NaN values with "Unknown" and "not found" for features outside admin boundaries
494
+ # (e.g., points in the ocean or international waters)
495
+ df_joined[iso3_country_column] = df_joined[iso3_country_column].fillna(
496
+ "Unknown"
497
+ )
498
+ df_joined[iso2_country_column] = df_joined[iso2_country_column].fillna(
499
+ "not found"
500
+ )
501
+ df_joined[admin_1_column] = df_joined[admin_1_column].fillna("Unknown")
502
+
448
503
  logger.debug(
449
504
  f"Admin codes joined: {iso3_country_column}, {iso2_country_column}, {admin_1_column}"
450
505
  )
@@ -461,10 +516,16 @@ class ProgressTracker:
461
516
 
462
517
  Shows progress at adaptive milestones (more frequent for small datasets,
463
518
  less frequent for large datasets) with estimated time remaining based on
464
- processing speed.
519
+ processing speed. Includes time-based heartbeat to prevent long silences.
465
520
  """
466
521
 
467
- def __init__(self, total: int, logger: logging.Logger = None):
522
+ def __init__(
523
+ self,
524
+ total: int,
525
+ logger: logging.Logger = None,
526
+ heartbeat_interval: int = 180,
527
+ status_file: str = None,
528
+ ):
468
529
  """
469
530
  Initialize progress tracker.
470
531
 
@@ -474,26 +535,147 @@ class ProgressTracker:
474
535
  Total number of items to process
475
536
  logger : logging.Logger, optional
476
537
  Logger for output
538
+ heartbeat_interval : int, optional
539
+ Seconds between heartbeat messages (default: 180 = 3 minutes)
540
+ status_file : str, optional
541
+ Path to JSON status file for API/web app consumption.
542
+ Checkpoints auto-save to same directory as status_file.
477
543
  """
478
544
  self.total = total
479
545
  self.completed = 0
480
546
  self.lock = threading.Lock()
481
547
  self.logger = logger or logging.getLogger("whisp")
548
+ self.heartbeat_interval = heartbeat_interval
549
+
550
+ # Handle status_file: if directory passed, auto-generate filename
551
+ if status_file:
552
+ import os
553
+
554
+ if os.path.isdir(status_file):
555
+ self.status_file = os.path.join(
556
+ status_file, "whisp_processing_status.json"
557
+ )
558
+ else:
559
+ # Validate that parent directory exists
560
+ parent_dir = os.path.dirname(status_file)
561
+ if parent_dir and not os.path.isdir(parent_dir):
562
+ self.logger.warning(
563
+ f"Status file directory does not exist: {parent_dir}"
564
+ )
565
+ self.status_file = None
566
+ else:
567
+ self.status_file = status_file
568
+ else:
569
+ self.status_file = None
482
570
 
483
571
  # Adaptive milestones based on dataset size
484
572
  # Small datasets (< 50): show every 25% (not too spammy)
485
573
  # Medium (50-500): show every 20%
486
- # Large (500+): show every 10% (more frequent feedback on long runs)
574
+ # Large (500-1000): show every 10%
575
+ # Very large (1000+): show every 5% (cleaner for long jobs)
487
576
  if total < 50:
488
577
  self.milestones = {25, 50, 75, 100}
489
578
  elif total < 500:
490
579
  self.milestones = {20, 40, 60, 80, 100}
491
- else:
580
+ elif total < 1000:
492
581
  self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
582
+ else:
583
+ self.milestones = {
584
+ 5,
585
+ 10,
586
+ 15,
587
+ 20,
588
+ 25,
589
+ 30,
590
+ 35,
591
+ 40,
592
+ 45,
593
+ 50,
594
+ 55,
595
+ 60,
596
+ 65,
597
+ 70,
598
+ 75,
599
+ 80,
600
+ 85,
601
+ 90,
602
+ 95,
603
+ 100,
604
+ }
493
605
 
494
606
  self.shown_milestones = set()
495
607
  self.start_time = time.time()
496
608
  self.last_update_time = self.start_time
609
+ self.heartbeat_stop = threading.Event()
610
+ self.heartbeat_thread = None
611
+
612
def _write_status_file(self, status: str = "processing") -> None:
    """Write the current progress snapshot to the JSON status file.

    The write is best effort: any failure is logged at DEBUG level and
    otherwise ignored so that progress reporting can never abort processing.
    The payload is written to ``<status_file>.tmp`` and then moved into place
    with ``os.replace`` so readers never observe a partially written file.

    Parameters
    ----------
    status : str, optional
        Value stored under the ``"status"`` key (default ``"processing"``).

    Returns
    -------
    None
        Nothing is written when ``self.status_file`` is not set.
    """
    if not self.status_file:
        return

    import json
    import os
    from datetime import datetime

    temp_file = self.status_file + ".tmp"
    try:
        elapsed = time.time() - self.start_time
        percent = (self.completed / self.total * 100) if self.total > 0 else 0
        rate = self.completed / elapsed if elapsed > 0 else 0
        # No ETA until at least 5% is done; 15% padding covers overhead.
        eta = (
            (self.total - self.completed) / rate * 1.15
            if rate > 0 and percent >= 5
            else None
        )

        payload = {
            "status": status,
            "progress": f"{self.completed}/{self.total}",
            "percent": round(percent, 1),
            "elapsed_sec": round(elapsed),
            # Compare against None: an ETA of 0 s (all work done) is a real
            # value and must not be reported as "unknown".
            "eta_sec": round(eta) if eta is not None else None,
            "updated_at": datetime.now().isoformat(),
        }

        # Write to temp file then atomic rename to prevent partial reads
        with open(temp_file, "w", encoding="utf-8") as f:
            json.dump(payload, f)
        os.replace(temp_file, self.status_file)
    except Exception as exc:
        # Best effort only: remove a half-written temp file and carry on.
        try:
            os.remove(temp_file)
        except OSError:
            pass
        self.logger.debug(
            "Could not write status file %s: %s", self.status_file, exc
        )
649
+
650
def start_heartbeat(self) -> None:
    """Launch the daemon heartbeat thread unless one is already running.

    A fresh thread targeting ``self._heartbeat_loop`` is created after the
    stop event has been cleared, and an initial ``"processing"`` status
    snapshot is written.
    """
    # NOTE(review): nesting reconstructed from a whitespace-stripped diff; this
    # assumes the initial status write happens only when a new thread starts.
    current = self.heartbeat_thread
    if current is not None and current.is_alive():
        return

    self.heartbeat_stop.clear()
    worker = threading.Thread(target=self._heartbeat_loop, daemon=True)
    self.heartbeat_thread = worker
    worker.start()
    # Write initial status
    self._write_status_file(status="processing")
660
+
661
def _heartbeat_loop(self) -> None:
    """Log progress at fixed time intervals until the stop event is set.

    The loop wakes every ``self.heartbeat_interval`` seconds. A heartbeat is
    emitted only when work is still outstanding and no progress message has
    been logged within the last interval. Each heartbeat also refreshes the
    JSON status file so pollers see a current ``updated_at``.

    A missing or non-positive interval disables the heartbeat: with such a
    value ``Event.wait`` would return immediately and the loop would spin.
    """
    interval = self.heartbeat_interval
    if not interval or interval <= 0:
        return

    while not self.heartbeat_stop.wait(interval):
        with self.lock:
            # Only log if we haven't shown a milestone recently
            time_since_update = time.time() - self.last_update_time
            if time_since_update < interval or self.completed >= self.total:
                continue

            elapsed = time.time() - self.start_time
            percent = int((self.completed / self.total) * 100)
            elapsed_str = self._format_time(elapsed)
            self.logger.info(
                f"[Processing] {self.completed:,}/{self.total:,} batches ({percent}%) | "
                f"Elapsed: {elapsed_str}"
            )
            self.last_update_time = time.time()
            # Written while holding the lock so this cannot interleave with
            # another writer using the same temp file.
            self._write_status_file()
497
679
 
498
680
  def update(self, n: int = 1) -> None:
499
681
  """
@@ -508,7 +690,7 @@ class ProgressTracker:
508
690
  self.completed += n
509
691
  percent = int((self.completed / self.total) * 100)
510
692
 
511
- # Show milestone messages (25%, 50%, 75%, 100%)
693
+ # Show milestone messages (5%, 10%, 15%... for large datasets)
512
694
  for milestone in sorted(self.milestones):
513
695
  if percent >= milestone and milestone not in self.shown_milestones:
514
696
  self.shown_milestones.add(milestone)
@@ -517,20 +699,36 @@ class ProgressTracker:
517
699
  elapsed = time.time() - self.start_time
518
700
  rate = self.completed / elapsed if elapsed > 0 else 0
519
701
  remaining_items = self.total - self.completed
520
- eta_seconds = remaining_items / rate if rate > 0 else 0
702
+
703
+ # Calculate ETA with padding for overhead (loading, joins, etc.)
704
+ # Don't show ETA until we have some samples (at least 5% complete)
705
+ if rate > 0 and self.completed >= max(5, self.total * 0.05):
706
+ eta_seconds = (
707
+ remaining_items / rate
708
+ ) * 1.15 # Add 15% padding for overhead
709
+ else:
710
+ eta_seconds = 0
521
711
 
522
712
  # Format time strings
523
- eta_str = self._format_time(eta_seconds)
713
+ eta_str = (
714
+ self._format_time(eta_seconds)
715
+ if eta_seconds > 0
716
+ else "calculating..."
717
+ )
524
718
  elapsed_str = self._format_time(elapsed)
525
719
 
526
720
  # Build progress message
527
- msg = f"Progress: {self.completed}/{self.total} ({percent}%)"
721
+ msg = f"Progress: {self.completed:,}/{self.total:,} batches ({percent}%)"
528
722
  if percent < 100:
529
723
  msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
530
724
  else:
531
725
  msg += f" | Total time: {elapsed_str}"
532
726
 
533
727
  self.logger.info(msg)
728
+ self.last_update_time = time.time()
729
+
730
+ # Update status file for API consumption
731
+ self._write_status_file()
534
732
 
535
733
  @staticmethod
536
734
  def _format_time(seconds: float) -> str:
@@ -544,14 +742,21 @@ class ProgressTracker:
544
742
  hours = seconds / 3600
545
743
  return f"{hours:.1f}h"
546
744
 
547
- def finish(self) -> None:
548
- """Log completion."""
745
def finish(self, output_file: str = None) -> None:
    """Stop the heartbeat thread, log the completion summary, write final status.

    Parameters
    ----------
    output_file : str, optional
        Accepted for callers' convenience; not used by this method.
    """
    # Signal the heartbeat loop to exit and give it up to one second to do so.
    self.heartbeat_stop.set()
    worker = self.heartbeat_thread
    if worker is not None and worker.is_alive():
        worker.join(timeout=1)

    with self.lock:
        duration = self._format_time(time.time() - self.start_time)
        self.logger.info(
            f"Processing complete: {self.completed:,}/{self.total:,} batches in {duration}"
        )

        # Write final status
        self._write_status_file(status="completed")
555
760
 
556
761
 
557
762
  # ============================================================================
@@ -602,19 +807,17 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
602
807
  if not check_ee_endpoint(endpoint_type):
603
808
  if endpoint_type == "high-volume":
604
809
  msg = (
605
- "Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
810
+ "# Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
606
811
  "ee.Reset()\n"
607
- "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
608
- "Or with project specified (e.g. when in Colab):\n"
609
- "ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
812
+ "ee.Initialize(project=gee_project_name, opt_url='https://earthengine-highvolume.googleapis.com')\n"
813
+ "# where gee_project_name is your GEE project (necessary in Colab)"
610
814
  )
611
815
  else: # standard endpoint
612
816
  msg = (
613
817
  "Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
614
818
  "ee.Reset()\n"
615
- "ee.Initialize()\n"
616
- "Or with project specified (e.g. when in Colab):\n"
617
- "ee.Initialize(project='your_cloud_project_name')"
819
+ "ee.Initialize(project=gee_project_name)\n"
820
+ "# where gee_project_name is your GEE project (necessary in Colab)"
618
821
  )
619
822
 
620
823
  if raise_error:
@@ -687,13 +890,13 @@ def extract_centroid_and_geomtype_client(
687
890
  if plot_id_column in gdf.columns:
688
891
  cols.append(plot_id_column)
689
892
 
690
- # Include external_id_column if provided and exists
893
+ # Include external_id if it exists (already renamed during load)
691
894
  if (
692
895
  external_id_column
693
- and external_id_column in gdf.columns
694
- and external_id_column not in cols
896
+ and "external_id" in gdf.columns
897
+ and "external_id" not in cols
695
898
  ):
696
- cols.append(external_id_column)
899
+ cols.append("external_id")
697
900
 
698
901
  # Always include metadata columns (centroid, geometry type)
699
902
  cols.extend([x_col, y_col, type_col])
@@ -787,6 +990,10 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
787
990
 
788
991
  Preserves the __row_id__ column if present so it can be retrieved after processing.
789
992
 
993
+ IMPORTANT: Drops external_id column before sending to EE to enable query caching.
994
+ external_id is user metadata that's not needed for EE computation. Including it
995
+ breaks EE's caching mechanism since each unique external_id creates a different query.
996
+
790
997
  Parameters
791
998
  ----------
792
999
  batch_gdf : gpd.GeoDataFrame
@@ -795,15 +1002,21 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
795
1002
  Returns
796
1003
  -------
797
1004
  ee.FeatureCollection
798
- EE FeatureCollection with __row_id__ as a feature property
1005
+ EE FeatureCollection with __row_id__ as a feature property (no external_id)
799
1006
  """
1007
+ # Drop external_id before sending to EE to enable caching
1008
+ # (external_id is preserved separately on client side for merging)
1009
+ batch_for_ee = batch_gdf.copy()
1010
+ if "external_id" in batch_for_ee.columns:
1011
+ batch_for_ee = batch_for_ee.drop(columns=["external_id"])
1012
+
800
1013
  # Pass GeoDataFrame directly to preserve CRS metadata
801
1014
  # convert_geojson_to_ee will handle:
802
1015
  # - CRS detection and conversion to WGS84 if needed
803
1016
  # - Data type sanitization (datetime, object columns)
804
1017
  # - Geometry validation and Z-coordinate stripping
805
1018
 
806
- fc = convert_geojson_to_ee(batch_gdf, enforce_wgs84=True, strip_z_coords=True)
1019
+ fc = convert_geojson_to_ee(batch_for_ee, enforce_wgs84=True, strip_z_coords=True)
807
1020
 
808
1021
  # If __row_id__ is in the original GeoDataFrame, it will be preserved
809
1022
  # as a feature property in the GeoJSON and thus in the EE FeatureCollection
@@ -929,7 +1142,19 @@ def process_ee_batch(
929
1142
  # Ensure plot_id_column is present for merging
930
1143
  # It should come from the feature properties (added before EE processing)
931
1144
  if plot_id_column not in df.columns:
932
- df[plot_id_column] = range(len(df))
1145
+ logger.warning(
1146
+ f"Batch {batch_idx + 1}: plotId column DROPPED by EE. "
1147
+ f"Regenerating with 1-indexed range. "
1148
+ f"Columns from EE: {list(df.columns)}"
1149
+ )
1150
+ # Use 1-indexed range to match client-side assignment
1151
+ df[plot_id_column] = range(1, len(df) + 1)
1152
+
1153
+ # Ensure plotId is integer type (EE may return as string)
1154
+ if plot_id_column in df.columns:
1155
+ df[plot_id_column] = pd.to_numeric(
1156
+ df[plot_id_column], errors="coerce"
1157
+ ).astype("Int64")
933
1158
 
934
1159
  # Ensure all column names are strings (fixes pandas .str accessor issues)
935
1160
  df.columns = df.columns.astype(str)
@@ -983,7 +1208,6 @@ def process_ee_batch(
983
1208
  def whisp_stats_geojson_to_df_concurrent(
984
1209
  input_geojson_filepath: str,
985
1210
  external_id_column: str = None,
986
- remove_geom: bool = False,
987
1211
  national_codes: List[str] = None,
988
1212
  unit_type: str = "ha",
989
1213
  whisp_image: ee.Image = None,
@@ -996,6 +1220,7 @@ def whisp_stats_geojson_to_df_concurrent(
996
1220
  logger: logging.Logger = None,
997
1221
  # Format parameters (auto-detect from config if not provided)
998
1222
  decimal_places: int = None,
1223
+ status_file: str = None,
999
1224
  ) -> pd.DataFrame:
1000
1225
  """
1001
1226
  Process GeoJSON concurrently to compute Whisp statistics with automatic formatting.
@@ -1010,8 +1235,6 @@ def whisp_stats_geojson_to_df_concurrent(
1010
1235
  Path to input GeoJSON file
1011
1236
  external_id_column : str, optional
1012
1237
  Column name for external IDs
1013
- remove_geom : bool
1014
- Remove geometry column from output
1015
1238
  national_codes : List[str], optional
1016
1239
  ISO2 codes for national datasets
1017
1240
  unit_type : str
@@ -1055,10 +1278,32 @@ def whisp_stats_geojson_to_df_concurrent(
1055
1278
  # Validate endpoint
1056
1279
  validate_ee_endpoint("high-volume", raise_error=True)
1057
1280
 
1058
- # Load GeoJSON with output suppressed
1059
- gdf = _load_geojson_silently(input_geojson_filepath)
1281
+ # Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
1282
+ gdf = _load_and_prepare_geojson(
1283
+ input_geojson_filepath, external_id_column=external_id_column
1284
+ )
1060
1285
  logger.info(f"Loaded {len(gdf):,} features")
1061
1286
 
1287
+ # Validate external_id if provided (lightweight client-side check)
1288
+ # Note: external_id_column already renamed to 'external_id' during load
1289
+ if external_id_column and "external_id" not in gdf.columns:
1290
+ # Exclude geometry column from available columns list
1291
+ available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
1292
+ raise ValueError(
1293
+ f"Column '{external_id_column}' not found in GeoJSON properties. "
1294
+ f"Available columns: {available_cols}"
1295
+ )
1296
+
1297
+ # Check completeness of external_id (warn if nulls exist)
1298
+ if external_id_column and "external_id" in gdf.columns:
1299
+ null_count = gdf["external_id"].isna().sum()
1300
+ if null_count > 0:
1301
+ null_pct = (null_count / len(gdf)) * 100
1302
+ logger.warning(
1303
+ f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
1304
+ f"These features may have missing external IDs in output."
1305
+ )
1306
+
1062
1307
  if validate_geometries:
1063
1308
  gdf = clean_geodataframe(
1064
1309
  gdf, remove_nulls=False, repair_geometries=False, logger=logger
@@ -1068,13 +1313,21 @@ def whisp_stats_geojson_to_df_concurrent(
1068
1313
  gdf[plot_id_column] = range(1, len(gdf) + 1)
1069
1314
 
1070
1315
  # Strip unnecessary properties before sending to EE
1071
- # Keep only: geometry, plot_id_column, and external_id_column
1316
+ # Keep only: geometry, plot_id_column, and external_id
1072
1317
  # This prevents duplication of GeoJSON properties in EE results
1073
1318
  keep_cols = ["geometry", plot_id_column]
1074
- if external_id_column and external_id_column in gdf.columns:
1075
- keep_cols.append(external_id_column)
1319
+ if (
1320
+ external_id_column and "external_id" in gdf.columns
1321
+ ): # Already renamed during load
1322
+ keep_cols.append("external_id")
1076
1323
 
1077
1324
  gdf_for_ee = gdf[keep_cols].copy()
1325
+
1326
+ # CRITICAL: Convert external_id to string to prevent EE from confusing it with integer plotId
1327
+ if external_id_column and "external_id" in gdf_for_ee.columns:
1328
+ gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
1329
+ logger.debug(f"Converted external_id column to string type")
1330
+
1078
1331
  logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
1079
1332
 
1080
1333
  # Create image if not provided
@@ -1101,13 +1354,18 @@ def whisp_stats_geojson_to_df_concurrent(
1101
1354
 
1102
1355
  # Batch the data
1103
1356
  batches = batch_geodataframe(gdf_for_ee, batch_size)
1104
- logger.info(f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches")
1357
+ logger.info(
1358
+ f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches (concurrent mode)..."
1359
+ )
1105
1360
 
1106
1361
  # Setup semaphore for EE concurrency control
1107
1362
  ee_semaphore = threading.BoundedSemaphore(max_concurrent)
1108
1363
 
1109
- # Progress tracker
1110
- progress = ProgressTracker(len(batches), logger=logger)
1364
+ # Progress tracker with heartbeat for long-running jobs
1365
+ progress = ProgressTracker(
1366
+ len(batches), logger=logger, heartbeat_interval=180, status_file=status_file
1367
+ )
1368
+ progress.start_heartbeat()
1111
1369
 
1112
1370
  results = []
1113
1371
 
@@ -1148,73 +1406,97 @@ def whisp_stats_geojson_to_df_concurrent(
1148
1406
  pyogrio_logger.setLevel(logging.CRITICAL)
1149
1407
 
1150
1408
  try:
1151
- with redirect_stdout(io.StringIO()):
1152
- with ThreadPoolExecutor(max_workers=pool_workers) as executor:
1153
- futures = {
1154
- executor.submit(process_batch, i, batch): i
1155
- for i, batch in enumerate(batches)
1156
- }
1157
-
1158
- # Track which batches failed for retry
1159
- batch_map = {i: batch for i, batch in enumerate(batches)}
1160
- batch_futures = {future: i for future, i in futures.items()}
1409
+ # Don't suppress stdout here - we want progress messages to show in Colab
1410
+ with ThreadPoolExecutor(max_workers=pool_workers) as executor:
1411
+ futures = {
1412
+ executor.submit(process_batch, i, batch): i
1413
+ for i, batch in enumerate(batches)
1414
+ }
1161
1415
 
1162
- for future in as_completed(futures):
1163
- batch_idx = batch_futures[future]
1164
- try:
1165
- batch_idx, df_server, df_client = future.result()
1166
-
1167
- # Merge server and client results
1168
- if plot_id_column not in df_server.columns:
1169
- df_server[plot_id_column] = range(len(df_server))
1170
-
1171
- # Keep all EE statistics from server (all columns with _sum and _median suffixes)
1172
- # These are the actual EE processing results
1173
- df_server_clean = df_server.copy()
1174
-
1175
- # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
1176
- # (formatted wrapper handles keep_external_columns parameter)
1177
- keep_external_columns = [plot_id_column]
1178
- if (
1179
- external_id_column
1180
- and external_id_column in df_client.columns
1181
- ):
1182
- keep_external_columns.append(external_id_column)
1183
- if "geometry" in df_client.columns:
1184
- keep_external_columns.append("geometry")
1185
- # Keep geometry type column (Geometry_type)
1186
- if geometry_type_column in df_client.columns:
1187
- keep_external_columns.append(geometry_type_column)
1188
- # Also keep centroid columns (Centroid_lon, Centroid_lat)
1189
- centroid_cols = [
1190
- c for c in df_client.columns if c.startswith("Centroid_")
1191
- ]
1192
- keep_external_columns.extend(centroid_cols)
1416
+ # Track which batches failed for retry
1417
+ batch_map = {i: batch for i, batch in enumerate(batches)}
1418
+ batch_futures = {future: i for future, i in futures.items()}
1193
1419
 
1194
- df_client_clean = df_client[
1195
- [c for c in keep_external_columns if c in df_client.columns]
1196
- ].drop_duplicates()
1420
+ for future in as_completed(futures):
1421
+ batch_idx = batch_futures[future]
1422
+ try:
1423
+ batch_idx, df_server, df_client = future.result()
1197
1424
 
1198
- merged = df_server_clean.merge(
1199
- df_client_clean,
1200
- on=plot_id_column,
1201
- how="left",
1202
- suffixes=("_ee", "_client"),
1425
+ # Merge server and client results
1426
+ if plot_id_column not in df_server.columns:
1427
+ logger.warning(
1428
+ f"Batch {batch_idx + 1} (concurrent merge): plotId DROPPED by EE. "
1429
+ f"Regenerating. Columns from EE: {list(df_server.columns)}"
1203
1430
  )
1204
- results.append(merged)
1205
- progress.update()
1206
-
1207
- except Exception as e:
1208
- # Batch failed - fail fast with clear guidance
1209
- error_msg = str(e)
1210
- logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
1211
- logger.debug(f"Full error: {error_msg}")
1212
-
1213
- # Get original batch for error reporting
1214
- original_batch = batch_map[batch_idx]
1215
-
1216
- # Add to batch errors for final reporting
1217
- batch_errors.append((batch_idx, original_batch, error_msg))
1431
+ df_server[plot_id_column] = pd.array(
1432
+ range(1, len(df_server) + 1), dtype="Int64"
1433
+ )
1434
+ else:
1435
+ df_server[plot_id_column] = pd.to_numeric(
1436
+ df_server[plot_id_column], errors="coerce"
1437
+ ).astype("Int64")
1438
+
1439
+ # Ensure plotId is Int64 in client data too
1440
+ if plot_id_column in df_client.columns:
1441
+ df_client[plot_id_column] = pd.to_numeric(
1442
+ df_client[plot_id_column], errors="coerce"
1443
+ ).astype("Int64")
1444
+
1445
+ # Keep all EE statistics from server (all columns with _sum and _median suffixes)
1446
+ # These are the actual EE processing results
1447
+ df_server_clean = df_server.copy()
1448
+
1449
+ # Drop external_id from df_server if it exists (already in df_client)
1450
+ if "external_id" in df_server_clean.columns:
1451
+ df_server_clean = df_server_clean.drop(columns=["external_id"])
1452
+
1453
+ # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
1454
+ # (formatted wrapper handles keep_external_columns parameter)
1455
+ keep_external_columns = [plot_id_column]
1456
+ if external_id_column and "external_id" in df_client.columns:
1457
+ keep_external_columns.append("external_id")
1458
+ if "geometry" in df_client.columns:
1459
+ keep_external_columns.append("geometry")
1460
+ # Keep geometry type column (Geometry_type)
1461
+ if geometry_type_column in df_client.columns:
1462
+ keep_external_columns.append(geometry_type_column)
1463
+ # Also keep centroid columns (Centroid_lon, Centroid_lat)
1464
+ centroid_cols = [
1465
+ c for c in df_client.columns if c.startswith("Centroid_")
1466
+ ]
1467
+ keep_external_columns.extend(centroid_cols)
1468
+
1469
+ df_client_clean = df_client[
1470
+ [c for c in keep_external_columns if c in df_client.columns]
1471
+ ]
1472
+ # Don't drop duplicates - we need one row per feature (one per plot_id)
1473
+ # Each plot_id should have exactly one row with its metadata
1474
+
1475
+ merged = df_server_clean.merge(
1476
+ df_client_clean,
1477
+ on=plot_id_column,
1478
+ how="left",
1479
+ suffixes=("_ee", "_client"),
1480
+ )
1481
+ results.append(merged)
1482
+ progress.update()
1483
+
1484
+ except Exception as e:
1485
+ # Batch failed - fail fast with clear guidance
1486
+ error_msg = str(e)
1487
+ logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
1488
+ logger.debug(f"Full error: {error_msg}")
1489
+
1490
+ # Get original batch for error reporting
1491
+ original_batch = batch_map[batch_idx]
1492
+
1493
+ # Add to batch errors for final reporting
1494
+ batch_errors.append((batch_idx, original_batch, error_msg))
1495
+ except (KeyboardInterrupt, SystemExit) as interrupt:
1496
+ logger.warning("Processing interrupted by user")
1497
+ # Update status file with interrupted state
1498
+ progress._write_status_file(status="interrupted")
1499
+ raise interrupt
1218
1500
  finally:
1219
1501
  # Restore logger levels
1220
1502
  fiona_logger.setLevel(old_fiona_level)
@@ -1318,7 +1600,10 @@ def whisp_stats_geojson_to_df_concurrent(
1318
1600
  try:
1319
1601
  batch_idx, df_server, df_client = future.result()
1320
1602
  if plot_id_column not in df_server.columns:
1321
- df_server[plot_id_column] = range(len(df_server))
1603
+ # Use 1-indexed range to match client-side assignment
1604
+ df_server[plot_id_column] = range(
1605
+ 1, len(df_server) + 1
1606
+ )
1322
1607
  merged = df_server.merge(
1323
1608
  df_client,
1324
1609
  on=plot_id_column,
@@ -1362,31 +1647,21 @@ def whisp_stats_geojson_to_df_concurrent(
1362
1647
  else:
1363
1648
  return pd.DataFrame()
1364
1649
 
1365
- # Clean up duplicate external_id columns created by merges
1366
- # Rename external_id_column to standardized 'external_id' for schema validation
1367
- if external_id_column:
1368
- # Find all columns related to external_id
1369
- external_id_variants = [
1650
+ # Clean up duplicate external_id columns created by merges (if any exist)
1651
+ # external_id was already renamed during load, so we just need to handle duplicates
1652
+ if external_id_column and "external_id" in combined.columns:
1653
+ # Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
1654
+ duplicate_variants = [
1370
1655
  col
1371
1656
  for col in combined.columns
1372
- if external_id_column.lower() in col.lower()
1657
+ if col != "external_id" and col.startswith("external_id_")
1373
1658
  ]
1374
1659
 
1375
- if external_id_variants:
1376
- # Use the base column name if it exists, otherwise use first variant
1377
- base_col = (
1378
- external_id_column
1379
- if external_id_column in combined.columns
1380
- else external_id_variants[0]
1660
+ if duplicate_variants:
1661
+ logger.debug(
1662
+ f"Dropping duplicate external_id columns: {duplicate_variants}"
1381
1663
  )
1382
-
1383
- # Rename to standardized 'external_id'
1384
- if base_col != "external_id":
1385
- combined = combined.rename(columns={base_col: "external_id"})
1386
-
1387
- # Drop all other variants
1388
- cols_to_drop = [c for c in external_id_variants if c != base_col]
1389
- combined = combined.drop(columns=cols_to_drop, errors="ignore")
1664
+ combined = combined.drop(columns=duplicate_variants, errors="ignore")
1390
1665
 
1391
1666
  # plotId column is already present from batch processing
1392
1667
  # Just ensure it's at position 0
@@ -1469,14 +1744,26 @@ def whisp_stats_geojson_to_df_concurrent(
1469
1744
  try:
1470
1745
  batch_idx, df_server, df_client = future.result()
1471
1746
  if plot_id_column not in df_server.columns:
1472
- df_server[plot_id_column] = range(len(df_server))
1473
-
1474
- # Drop external_id_column from df_client if it exists (already in df_server)
1475
- if (
1476
- external_id_column
1477
- and external_id_column in df_client.columns
1478
- ):
1479
- df_client = df_client.drop(columns=[external_id_column])
1747
+ logger.warning(
1748
+ f"Batch {batch_idx + 1} (retry): plotId DROPPED by EE. "
1749
+ f"Regenerating. Columns from EE: {list(df_server.columns)}"
1750
+ )
1751
+ # Use 1-indexed range to match client-side assignment
1752
+ df_server[plot_id_column] = range(1, len(df_server) + 1)
1753
+
1754
+ # Ensure plotId is integer type (EE may return as string)
1755
+ if plot_id_column in df_server.columns:
1756
+ df_server[plot_id_column] = pd.to_numeric(
1757
+ df_server[plot_id_column], errors="coerce"
1758
+ ).astype("Int64")
1759
+ if plot_id_column in df_client.columns:
1760
+ df_client[plot_id_column] = pd.to_numeric(
1761
+ df_client[plot_id_column], errors="coerce"
1762
+ ).astype("Int64")
1763
+
1764
+ # Drop external_id from df_server if it exists (already in df_client)
1765
+ if "external_id" in df_server.columns:
1766
+ df_server = df_server.drop(columns=["external_id"])
1480
1767
 
1481
1768
  merged = df_server.merge(
1482
1769
  df_client,
@@ -1498,30 +1785,22 @@ def whisp_stats_geojson_to_df_concurrent(
1498
1785
  # Ensure all column names are strings (fixes pandas .str accessor issues later)
1499
1786
  combined.columns = combined.columns.astype(str)
1500
1787
 
1501
- # Clean up duplicate external_id columns created by merges
1502
- if external_id_column:
1503
- external_id_variants = [
1788
+ # Clean up duplicate external_id columns created by merges (if any exist)
1789
+ # external_id was already renamed during load, so we just need to handle duplicates
1790
+ if external_id_column and "external_id" in combined.columns:
1791
+ # Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
1792
+ duplicate_variants = [
1504
1793
  col
1505
1794
  for col in combined.columns
1506
- if external_id_column.lower() in col.lower()
1795
+ if col != "external_id" and col.startswith("external_id_")
1507
1796
  ]
1508
1797
 
1509
- if external_id_variants:
1510
- base_col = external_id_column
1511
- if (
1512
- base_col not in combined.columns
1513
- and external_id_variants
1514
- ):
1515
- base_col = external_id_variants[0]
1516
- combined = combined.rename(
1517
- columns={base_col: "external_id"}
1518
- )
1519
-
1520
- cols_to_drop = [
1521
- c for c in external_id_variants if c != base_col
1522
- ]
1798
+ if duplicate_variants:
1799
+ logger.debug(
1800
+ f"Dropping duplicate external_id columns: {duplicate_variants}"
1801
+ )
1523
1802
  combined = combined.drop(
1524
- columns=cols_to_drop, errors="ignore"
1803
+ columns=duplicate_variants, errors="ignore"
1525
1804
  )
1526
1805
 
1527
1806
  # plotId column is already present, just ensure it's at position 0
@@ -1565,7 +1844,15 @@ def whisp_stats_geojson_to_df_concurrent(
1565
1844
  )
1566
1845
  raise retry_e
1567
1846
 
1568
- logger.info(f"Processed {len(formatted):,} features successfully")
1847
+ # Ensure plot_id is present (should already be there from batch processing)
1848
+ if plot_id_column not in formatted.columns:
1849
+ logger.warning(f"{plot_id_column} column missing, regenerating...")
1850
+ formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
1851
+
1852
+ # Sort by plot_id to ensure consistent output order
1853
+ formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
1854
+
1855
+ logger.info(f"Processing complete: {len(formatted):,} features")
1569
1856
  return formatted
1570
1857
  else:
1571
1858
  logger.error(" No results produced")
@@ -1580,7 +1867,6 @@ def whisp_stats_geojson_to_df_concurrent(
1580
1867
  def whisp_stats_geojson_to_df_sequential(
1581
1868
  input_geojson_filepath: str,
1582
1869
  external_id_column: str = None,
1583
- remove_geom: bool = False,
1584
1870
  national_codes: List[str] = None,
1585
1871
  unit_type: str = "ha",
1586
1872
  whisp_image: ee.Image = None,
@@ -1605,8 +1891,6 @@ def whisp_stats_geojson_to_df_sequential(
1605
1891
  Path to input GeoJSON
1606
1892
  external_id_column : str, optional
1607
1893
  Column name for external IDs
1608
- remove_geom : bool
1609
- Remove geometry from output
1610
1894
  national_codes : List[str], optional
1611
1895
  ISO2 codes for national datasets
1612
1896
  unit_type : str
@@ -1642,10 +1926,32 @@ def whisp_stats_geojson_to_df_sequential(
1642
1926
  # Validate endpoint
1643
1927
  validate_ee_endpoint("standard", raise_error=True)
1644
1928
 
1645
- # Load GeoJSON with output suppressed
1646
- gdf = _load_geojson_silently(input_geojson_filepath)
1929
+ # Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
1930
+ gdf = _load_and_prepare_geojson(
1931
+ input_geojson_filepath, external_id_column=external_id_column
1932
+ )
1647
1933
  logger.info(f"Loaded {len(gdf):,} features")
1648
1934
 
1935
+ # Validate external_id if provided (lightweight client-side check)
1936
+ # Note: external_id_column already renamed to 'external_id' during load
1937
+ if external_id_column and "external_id" not in gdf.columns:
1938
+ # Exclude geometry column from available columns list
1939
+ available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
1940
+ raise ValueError(
1941
+ f"Column '{external_id_column}' not found in GeoJSON properties. "
1942
+ f"Available columns: {available_cols}"
1943
+ )
1944
+
1945
+ # Check completeness of external_id (warn if nulls exist)
1946
+ if external_id_column and "external_id" in gdf.columns:
1947
+ null_count = gdf["external_id"].isna().sum()
1948
+ if null_count > 0:
1949
+ null_pct = (null_count / len(gdf)) * 100
1950
+ logger.warning(
1951
+ f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
1952
+ f"These features may have missing external IDs in output."
1953
+ )
1954
+
1649
1955
  # Clean geometries (preserve both null and invalid geometries by default)
1650
1956
  gdf = clean_geodataframe(
1651
1957
  gdf, remove_nulls=False, repair_geometries=False, logger=logger
@@ -1654,18 +1960,22 @@ def whisp_stats_geojson_to_df_sequential(
1654
1960
  # Add stable plotIds for merging (starting from 1, not 0)
1655
1961
  gdf[plot_id_column] = range(1, len(gdf) + 1)
1656
1962
 
1657
- # Add stable row IDs
1658
- row_id_col = "__row_id__"
1659
- gdf[row_id_col] = range(len(gdf))
1660
-
1661
1963
  # Strip unnecessary properties before sending to EE
1662
- # Keep only: geometry, plot_id_column, and external_id_column
1964
+ # Keep only: geometry, plot_id_column, and external_id
1663
1965
  # This prevents duplication of GeoJSON properties in EE results
1664
- keep_cols = ["geometry", plot_id_column, row_id_col]
1665
- if external_id_column and external_id_column in gdf.columns:
1666
- keep_cols.append(external_id_column)
1966
+ keep_cols = ["geometry", plot_id_column]
1967
+ if (
1968
+ external_id_column and "external_id" in gdf.columns
1969
+ ): # Already renamed during load
1970
+ keep_cols.append("external_id")
1667
1971
 
1668
1972
  gdf_for_ee = gdf[keep_cols].copy()
1973
+
1974
+ # CRITICAL: Convert external_id to string to prevent EE from confusing it with integer plotId
1975
+ if external_id_column and "external_id" in gdf_for_ee.columns:
1976
+ gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
1977
+ logger.debug(f"Converted external_id column to string type")
1978
+
1669
1979
  logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
1670
1980
 
1671
1981
  # Create image if not provided
@@ -1687,16 +1997,27 @@ def whisp_stats_geojson_to_df_sequential(
1687
1997
  national_codes=national_codes, validate_bands=True
1688
1998
  )
1689
1999
 
2000
+ # Drop external_id before sending to EE to enable caching
2001
+ # (external_id is preserved separately in gdf for client-side merging)
2002
+ gdf_for_ee_clean = gdf_for_ee.copy()
2003
+ if "external_id" in gdf_for_ee_clean.columns:
2004
+ gdf_for_ee_clean = gdf_for_ee_clean.drop(columns=["external_id"])
2005
+ logger.debug("Dropped external_id from data sent to EE (enables caching)")
2006
+
1690
2007
  # Convert to EE (suppress print statements from convert_geojson_to_ee)
1691
2008
  logger.debug("Converting to EE FeatureCollection...")
1692
2009
  with redirect_stdout(io.StringIO()):
1693
- fc = convert_geojson_to_ee(gdf_for_ee, enforce_wgs84=True, strip_z_coords=True)
2010
+ fc = convert_geojson_to_ee(
2011
+ gdf_for_ee_clean, enforce_wgs84=True, strip_z_coords=True
2012
+ )
1694
2013
 
1695
2014
  # Create reducer
1696
2015
  reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
1697
2016
 
1698
2017
  # Process server-side with error handling for bad bands
1699
- logger.info("Processing with Earth Engine...")
2018
+ logger.info(
2019
+ f"Processing {len(gdf):,} features with Earth Engine (sequential mode)..."
2020
+ )
1700
2021
  try:
1701
2022
  results_fc = whisp_image.reduceRegions(collection=fc, reducer=reducer, scale=10)
1702
2023
  df_server = convert_ee_to_df(results_fc)
@@ -1728,11 +2049,13 @@ def whisp_stats_geojson_to_df_sequential(
1728
2049
  else:
1729
2050
  raise
1730
2051
 
1731
- logger.debug("Server-side processing complete")
2052
+ logger.info("Server-side processing complete")
1732
2053
 
1733
- # Add row_id if missing
1734
- if row_id_col not in df_server.columns:
1735
- df_server[row_id_col] = range(len(df_server))
2054
+ # Ensure plotId is Int64 type for fast merges
2055
+ if plot_id_column in df_server.columns:
2056
+ df_server[plot_id_column] = pd.to_numeric(
2057
+ df_server[plot_id_column], errors="coerce"
2058
+ ).astype("Int64")
1736
2059
 
1737
2060
  # Add client-side metadata if requested
1738
2061
  if add_metadata_client_side:
@@ -1743,21 +2066,23 @@ def whisp_stats_geojson_to_df_sequential(
1743
2066
  return_attributes_only=True,
1744
2067
  )
1745
2068
 
1746
- # Drop external_id_column from df_client if it exists (already in df_server)
1747
- if external_id_column and external_id_column in df_client.columns:
1748
- df_client = df_client.drop(columns=[external_id_column])
2069
+ # Ensure plotId is Int64 type for fast merges
2070
+ if plot_id_column in df_client.columns:
2071
+ df_client[plot_id_column] = pd.to_numeric(
2072
+ df_client[plot_id_column], errors="coerce"
2073
+ ).astype("Int64")
1749
2074
 
1750
- # Merge
2075
+ # Drop external_id from df_server if it exists (keep from df_client - more reliable)
2076
+ if "external_id" in df_server.columns:
2077
+ df_server = df_server.drop(columns=["external_id"])
2078
+
2079
+ # Merge on plotId (same strategy as concurrent mode)
1751
2080
  result = df_server.merge(
1752
- df_client, on=row_id_col, how="left", suffixes=("", "_client")
2081
+ df_client, on=plot_id_column, how="left", suffixes=("", "_client")
1753
2082
  )
1754
2083
  else:
1755
2084
  result = df_server
1756
2085
 
1757
- # Remove internal __row_id__ column if present
1758
- if row_id_col in result.columns:
1759
- result = result.drop(columns=[row_id_col])
1760
-
1761
2086
  # Format the output
1762
2087
  # Add admin context (Country, ProducerCountry, Admin_Level_1) from admin_code
1763
2088
  # MUST be done BEFORE formatting (which removes _median columns)
@@ -1782,27 +2107,14 @@ def whisp_stats_geojson_to_df_sequential(
1782
2107
  convert_water_flag=True,
1783
2108
  )
1784
2109
 
1785
- logger.info(f"Processed {len(formatted):,} features")
1786
-
1787
- # Consolidate external_id_column to standardized 'external_id'
1788
- if external_id_column:
1789
- variants = [
1790
- col
1791
- for col in formatted.columns
1792
- if external_id_column.lower() in col.lower()
1793
- ]
1794
- if variants:
1795
- base_col = (
1796
- external_id_column
1797
- if external_id_column in formatted.columns
1798
- else variants[0]
1799
- )
1800
- if base_col != "external_id":
1801
- formatted = formatted.rename(columns={base_col: "external_id"})
1802
- # Drop other variants
1803
- formatted = formatted.drop(
1804
- columns=[c for c in variants if c != base_col], errors="ignore"
1805
- )
2110
+ # Ensure plot_id exists and sort by it
2111
+ if plot_id_column not in formatted.columns:
2112
+ formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
2113
+ formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
2114
+
2115
+ logger.info(f"Processing complete: {len(formatted):,} features")
2116
+
2117
+ # external_id_column already renamed to 'external_id' during load - no action needed here
1806
2118
 
1807
2119
  return formatted
1808
2120
 
@@ -1815,7 +2127,6 @@ def whisp_stats_geojson_to_df_sequential(
1815
2127
  def whisp_formatted_stats_geojson_to_df_concurrent(
1816
2128
  input_geojson_filepath: str,
1817
2129
  external_id_column: str = None,
1818
- remove_geom: bool = False,
1819
2130
  national_codes: List[str] = None,
1820
2131
  unit_type: str = "ha",
1821
2132
  whisp_image: ee.Image = None,
@@ -1833,6 +2144,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
1833
2144
  water_flag_threshold: float = 0.5,
1834
2145
  sort_column: str = "plotId",
1835
2146
  geometry_audit_trail: bool = False,
2147
+ status_file: str = None,
1836
2148
  ) -> pd.DataFrame:
1837
2149
  """
1838
2150
  Process GeoJSON concurrently with automatic formatting and validation.
@@ -1848,8 +2160,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
1848
2160
  Path to input GeoJSON file
1849
2161
  external_id_column : str, optional
1850
2162
  Column name for external IDs
1851
- remove_geom : bool
1852
- Remove geometry column from output
1853
2163
  national_codes : List[str], optional
1854
2164
  ISO2 codes for national datasets
1855
2165
  unit_type : str
@@ -1910,14 +2220,13 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
1910
2220
  gdf_original_geoms = None
1911
2221
  if geometry_audit_trail:
1912
2222
  logger.debug("Pre-loading GeoJSON for geometry audit trail...")
1913
- gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
2223
+ gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
1914
2224
 
1915
2225
  # Step 1: Get raw stats
1916
2226
  logger.debug("Step 1/2: Extracting statistics (concurrent)...")
1917
2227
  df_raw = whisp_stats_geojson_to_df_concurrent(
1918
2228
  input_geojson_filepath=input_geojson_filepath,
1919
2229
  external_id_column=external_id_column,
1920
- remove_geom=remove_geom,
1921
2230
  national_codes=national_codes,
1922
2231
  unit_type=unit_type,
1923
2232
  whisp_image=whisp_image,
@@ -1928,6 +2237,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
1928
2237
  max_retries=max_retries,
1929
2238
  add_metadata_server=add_metadata_server,
1930
2239
  logger=logger,
2240
+ status_file=status_file,
1931
2241
  )
1932
2242
 
1933
2243
  # Step 2: Format the output
@@ -1979,7 +2289,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
1979
2289
  # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
1980
2290
  if gdf_original_geoms is None:
1981
2291
  logger.warning("Original geometries not pre-loaded, loading now...")
1982
- gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
2292
+ gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
1983
2293
 
1984
2294
  # Use plotId from df_validated to maintain mapping
1985
2295
  df_original_geom = pd.DataFrame(
@@ -2030,7 +2340,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
2030
2340
  def whisp_formatted_stats_geojson_to_df_sequential(
2031
2341
  input_geojson_filepath: str,
2032
2342
  external_id_column: str = None,
2033
- remove_geom: bool = False,
2034
2343
  national_codes: List[str] = None,
2035
2344
  unit_type: str = "ha",
2036
2345
  whisp_image: ee.Image = None,
@@ -2044,6 +2353,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
2044
2353
  water_flag_threshold: float = 0.5,
2045
2354
  sort_column: str = "plotId",
2046
2355
  geometry_audit_trail: bool = False,
2356
+ status_file: str = None,
2047
2357
  ) -> pd.DataFrame:
2048
2358
  """
2049
2359
  Process GeoJSON sequentially with automatic formatting and validation.
@@ -2059,8 +2369,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
2059
2369
  Path to input GeoJSON file
2060
2370
  external_id_column : str, optional
2061
2371
  Column name for external IDs
2062
- remove_geom : bool
2063
- Remove geometry from output
2064
2372
  national_codes : List[str], optional
2065
2373
  ISO2 codes for national datasets
2066
2374
  unit_type : str
@@ -2113,14 +2421,13 @@ def whisp_formatted_stats_geojson_to_df_sequential(
2113
2421
  gdf_original_geoms = None
2114
2422
  if geometry_audit_trail:
2115
2423
  logger.debug("Pre-loading GeoJSON for geometry audit trail...")
2116
- gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
2424
+ gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
2117
2425
 
2118
2426
  # Step 1: Get raw stats
2119
2427
  logger.debug("Step 1/2: Extracting statistics (sequential)...")
2120
2428
  df_raw = whisp_stats_geojson_to_df_sequential(
2121
2429
  input_geojson_filepath=input_geojson_filepath,
2122
2430
  external_id_column=external_id_column,
2123
- remove_geom=remove_geom,
2124
2431
  national_codes=national_codes,
2125
2432
  unit_type=unit_type,
2126
2433
  whisp_image=whisp_image,
@@ -2178,7 +2485,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
2178
2485
  # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
2179
2486
  if gdf_original_geoms is None:
2180
2487
  logger.warning("Original geometries not pre-loaded, loading now...")
2181
- gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
2488
+ gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
2182
2489
 
2183
2490
  # Use plotId from df_validated to maintain mapping
2184
2491
  df_original_geom = pd.DataFrame(
@@ -2233,7 +2540,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
2233
2540
  def whisp_formatted_stats_geojson_to_df_fast(
2234
2541
  input_geojson_filepath: str,
2235
2542
  external_id_column: str = None,
2236
- remove_geom: bool = False,
2237
2543
  national_codes: List[str] = None,
2238
2544
  unit_type: str = "ha",
2239
2545
  whisp_image: ee.Image = None,
@@ -2252,6 +2558,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
2252
2558
  water_flag_threshold: float = 0.5,
2253
2559
  sort_column: str = "plotId",
2254
2560
  geometry_audit_trail: bool = False,
2561
+ status_file: str = None,
2255
2562
  ) -> pd.DataFrame:
2256
2563
  """
2257
2564
  Process GeoJSON to Whisp statistics with optimized fast processing.
@@ -2267,8 +2574,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
2267
2574
  Path to input GeoJSON file
2268
2575
  external_id_column : str, optional
2269
2576
  Column name for external IDs
2270
- remove_geom : bool
2271
- Remove geometry column from output
2272
2577
  national_codes : List[str], optional
2273
2578
  ISO2 codes for national datasets
2274
2579
  unit_type : str
@@ -2339,7 +2644,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
2339
2644
  return whisp_formatted_stats_geojson_to_df_concurrent(
2340
2645
  input_geojson_filepath=input_geojson_filepath,
2341
2646
  external_id_column=external_id_column,
2342
- remove_geom=remove_geom,
2343
2647
  national_codes=national_codes,
2344
2648
  unit_type=unit_type,
2345
2649
  whisp_image=whisp_image,
@@ -2356,13 +2660,13 @@ def whisp_formatted_stats_geojson_to_df_fast(
2356
2660
  water_flag_threshold=water_flag_threshold,
2357
2661
  sort_column=sort_column,
2358
2662
  geometry_audit_trail=geometry_audit_trail,
2663
+ status_file=status_file,
2359
2664
  )
2360
2665
  else: # sequential
2361
2666
  logger.debug("Routing to sequential processing...")
2362
2667
  return whisp_formatted_stats_geojson_to_df_sequential(
2363
2668
  input_geojson_filepath=input_geojson_filepath,
2364
2669
  external_id_column=external_id_column,
2365
- remove_geom=remove_geom,
2366
2670
  national_codes=national_codes,
2367
2671
  unit_type=unit_type,
2368
2672
  whisp_image=whisp_image,
@@ -2374,4 +2678,5 @@ def whisp_formatted_stats_geojson_to_df_fast(
2374
2678
  water_flag_threshold=water_flag_threshold,
2375
2679
  sort_column=sort_column,
2376
2680
  geometry_audit_trail=geometry_audit_trail,
2681
+ status_file=status_file,
2377
2682
  )