openforis-whisp 3.0.0a6__py3-none-any.whl → 3.0.0a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -33,6 +33,7 @@ import subprocess
  from contextlib import redirect_stdout, contextmanager
  from pathlib import Path
  from typing import Optional, List, Dict, Any, Tuple, Union
+ from importlib.metadata import version as get_version
  from concurrent.futures import ThreadPoolExecutor, as_completed
  import tempfile

@@ -510,253 +511,102 @@ def join_admin_codes(
  return df


- class ProgressTracker:
- """
- Track batch processing progress with time estimation.
+ def _format_time(seconds: float) -> str:
+ """Format seconds as human-readable string."""
+ if seconds < 60:
+ return f"{seconds:.0f}s"
+ elif seconds < 3600:
+ mins = seconds / 60
+ return f"{mins:.1f}m"
+ else:
+ hours = seconds / 3600
+ return f"{hours:.1f}h"

- Shows progress at adaptive milestones (more frequent for small datasets,
- less frequent for large datasets) with estimated time remaining based on
- processing speed. Includes time-based heartbeat to prevent long silences.
- """

- def __init__(
- self,
- total: int,
- logger: logging.Logger = None,
- heartbeat_interval: int = 180,
- status_file: str = None,
- ):
- """
- Initialize progress tracker.
-
- Parameters
- ----------
- total : int
- Total number of items to process
- logger : logging.Logger, optional
- Logger for output
- heartbeat_interval : int, optional
- Seconds between heartbeat messages (default: 180 = 3 minutes)
- status_file : str, optional
- Path to JSON status file for API/web app consumption.
- Checkpoints auto-save to same directory as status_file.
- """
- self.total = total
- self.completed = 0
- self.lock = threading.Lock()
- self.logger = logger or logging.getLogger("whisp")
- self.heartbeat_interval = heartbeat_interval
-
- # Handle status_file: if directory passed, auto-generate filename
- if status_file:
- import os
-
- if os.path.isdir(status_file):
- self.status_file = os.path.join(
- status_file, "whisp_processing_status.json"
- )
- else:
- # Validate that parent directory exists
- parent_dir = os.path.dirname(status_file)
- if parent_dir and not os.path.isdir(parent_dir):
- self.logger.warning(
- f"Status file directory does not exist: {parent_dir}"
- )
- self.status_file = None
- else:
- self.status_file = status_file
- else:
- self.status_file = None
-
- # Adaptive milestones based on dataset size
- # Small datasets (< 50): show every 25% (not too spammy)
- # Medium (50-500): show every 20%
- # Large (500-1000): show every 10%
- # Very large (1000+): show every 5% (cleaner for long jobs)
- if total < 50:
- self.milestones = {25, 50, 75, 100}
- elif total < 500:
- self.milestones = {20, 40, 60, 80, 100}
- elif total < 1000:
- self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
- else:
- self.milestones = {
- 5,
- 10,
- 15,
- 20,
- 25,
- 30,
- 35,
- 40,
- 45,
- 50,
- 55,
- 60,
- 65,
- 70,
- 75,
- 80,
- 85,
- 90,
- 95,
- 100,
- }
-
- self.shown_milestones = set()
- self.start_time = time.time()
- self.last_update_time = self.start_time
- self.heartbeat_stop = threading.Event()
- self.heartbeat_thread = None
+ def _get_progress_milestones(total_features: int) -> set:
+ """
+ Get progress milestones based on dataset size.

- def _write_status_file(self, status: str = "processing") -> None:
- """Write current progress to JSON status file using atomic write."""
- if not self.status_file:
- return
+ Parameters
+ ----------
+ total_features : int
+ Total number of features being processed

- try:
- import json
- import os
-
- elapsed = time.time() - self.start_time
- percent = (self.completed / self.total * 100) if self.total > 0 else 0
- rate = self.completed / elapsed if elapsed > 0 else 0
- eta = (
- (self.total - self.completed) / rate * 1.15
- if rate > 0 and percent >= 5
- else None
- )
+ Returns
+ -------
+ set
+ Set of percentage milestones to show
+ """
+ # Set milestones based on feature count
+ if total_features < 250:
+ return set(range(20, 101, 20))  # Every 20%: {20, 40, 60, 80, 100}
+ elif total_features < 1000:
+ return set(range(10, 101, 10))  # Every 10%
+ elif total_features < 10000:
+ return set(range(5, 101, 5))  # Every 5%
+ elif total_features < 50000:
+ return set(range(2, 101, 2))  # Every 2%
+ else:
+ return set(range(1, 101))  # Every 1%

- # Write to temp file then atomic rename to prevent partial reads
- from datetime import datetime
-
- temp_file = self.status_file + ".tmp"
- with open(temp_file, "w") as f:
- json.dump(
- {
- "status": status,
- "progress": f"{self.completed}/{self.total}",
- "percent": round(percent, 1),
- "elapsed_sec": round(elapsed),
- "eta_sec": round(eta) if eta else None,
- "updated_at": datetime.now().isoformat(),
- },
- f,
- )
- os.replace(temp_file, self.status_file)
- except Exception:
- pass
-
- def start_heartbeat(self) -> None:
- """Start background heartbeat thread for time-based progress updates."""
- if self.heartbeat_thread is None or not self.heartbeat_thread.is_alive():
- self.heartbeat_stop.clear()
- self.heartbeat_thread = threading.Thread(
- target=self._heartbeat_loop, daemon=True
- )
- self.heartbeat_thread.start()
- # Write initial status
- self._write_status_file(status="processing")
-
- def _heartbeat_loop(self) -> None:
- """Background loop that logs progress at time intervals."""
- while not self.heartbeat_stop.wait(self.heartbeat_interval):
- with self.lock:
- # Only log if we haven't shown a milestone recently
- time_since_update = time.time() - self.last_update_time
- if (
- time_since_update >= self.heartbeat_interval
- and self.completed < self.total
- ):
- elapsed = time.time() - self.start_time
- percent = int((self.completed / self.total) * 100)
- elapsed_str = self._format_time(elapsed)
- self.logger.info(
- f"[Processing] {self.completed:,}/{self.total:,} batches ({percent}%) | "
- f"Elapsed: {elapsed_str}"
- )
- self.last_update_time = time.time()
-
- def update(self, n: int = 1) -> None:
- """
- Update progress count.
-
- Parameters
- ----------
- n : int
- Number of items completed
- """
- with self.lock:
- self.completed += n
- percent = int((self.completed / self.total) * 100)
-
- # Show milestone messages (5%, 10%, 15%... for large datasets)
- for milestone in sorted(self.milestones):
- if percent >= milestone and milestone not in self.shown_milestones:
- self.shown_milestones.add(milestone)
-
- # Calculate time metrics
- elapsed = time.time() - self.start_time
- rate = self.completed / elapsed if elapsed > 0 else 0
- remaining_items = self.total - self.completed
-
- # Calculate ETA with padding for overhead (loading, joins, etc.)
- # Don't show ETA until we have some samples (at least 5% complete)
- if rate > 0 and self.completed >= max(5, self.total * 0.05):
- eta_seconds = (
- remaining_items / rate
- ) * 1.15  # Add 15% padding for overhead
- else:
- eta_seconds = 0

- # Format time strings
- eta_str = (
- self._format_time(eta_seconds)
- if eta_seconds > 0
- else "calculating..."
- )
- elapsed_str = self._format_time(elapsed)
+ def _log_progress(
+ completed: int,
+ total: int,
+ milestones: set,
+ shown_milestones: set,
+ start_time: float,
+ logger: logging.Logger,
+ ) -> None:
+ """
+ Log progress at milestone percentages.

- # Build progress message
- msg = f"Progress: {self.completed:,}/{self.total:,} batches ({percent}%)"
- if percent < 100:
- msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
- else:
- msg += f" | Total time: {elapsed_str}"
-
- self.logger.info(msg)
- self.last_update_time = time.time()
-
- # Update status file for API consumption
- self._write_status_file()
-
- @staticmethod
- def _format_time(seconds: float) -> str:
- """Format seconds as human-readable string."""
- if seconds < 60:
- return f"{seconds:.0f}s"
- elif seconds < 3600:
- mins = seconds / 60
- return f"{mins:.1f}m"
- else:
- hours = seconds / 3600
- return f"{hours:.1f}h"
+ Parameters
+ ----------
+ completed : int
+ Number of batches completed
+ total : int
+ Total number of batches
+ milestones : set
+ Set of percentage milestones to show
+ shown_milestones : set
+ Set of milestones already shown (modified in place)
+ start_time : float
+ Start time from time.time()
+ logger : logging.Logger
+ Logger for output
+ """
+ percent = int((completed / total) * 100)
+
+ # Check for new milestones reached
+ for milestone in sorted(milestones):
+ if percent >= milestone and milestone not in shown_milestones:
+ shown_milestones.add(milestone)
+
+ # Calculate time metrics
+ elapsed = time.time() - start_time
+ rate = completed / elapsed if elapsed > 0 else 0
+ remaining_items = total - completed
+
+ # Calculate ETA with padding for overhead (loading, joins, etc.)
+ # Don't show ETA until we have some samples (at least 5% complete)
+ if rate > 0 and completed >= max(5, total * 0.05):
+ eta_seconds = (remaining_items / rate) * 1.15  # Add 15% padding
+ else:
+ eta_seconds = 0

- def finish(self, output_file: str = None) -> None:
- """Stop heartbeat and log completion."""
- # Stop heartbeat thread
- self.heartbeat_stop.set()
- if self.heartbeat_thread and self.heartbeat_thread.is_alive():
- self.heartbeat_thread.join(timeout=1)
+ # Format time strings
+ eta_str = _format_time(eta_seconds) if eta_seconds > 0 else "calculating..."
+ elapsed_str = _format_time(elapsed)

- with self.lock:
- total_time = time.time() - self.start_time
- time_str = self._format_time(total_time)
- msg = f"Processing complete: {self.completed:,}/{self.total:,} batches in {time_str}"
- self.logger.info(msg)
+ # Build progress message
+ msg = f"Progress: {completed:,}/{total:,} batches ({percent}%)"
+ if percent < 100:
+ msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
+ else:
+ msg += f" | Total time: {elapsed_str}"

- # Write final status
- self._write_status_file(status="completed")
+ logger.info(msg)


  # ============================================================================
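Note: the removed ProgressTracker class is replaced by the three stateless helpers above. As a rough sketch of the intended call pattern (mirroring the concurrent path further down; it assumes the helpers are in scope, and the batch list and per-batch work are hypothetical placeholders):

    import logging
    import threading
    import time

    logger = logging.getLogger("whisp")
    batches = [object()] * 40        # hypothetical batch list
    total_features = 4_000           # hypothetical feature count

    progress_lock = threading.Lock()  # guards the shared counter across worker threads
    completed_batches = 0
    milestones = _get_progress_milestones(total_features)
    shown_milestones = set()
    start_time = time.time()

    for batch in batches:
        # ... per-batch work would run here ...
        with progress_lock:
            completed_batches += 1
            _log_progress(
                completed_batches, len(batches), milestones, shown_milestones, start_time, logger
            )

    logger.info(f"Finished in {_format_time(time.time() - start_time)}")

Because the helpers keep no state of their own, the caller owns the lock, the counter and the milestone sets; the heartbeat thread and JSON status file from the old class are dropped rather than reimplemented.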
@@ -1075,10 +925,67 @@ def clean_geodataframe(


  # ============================================================================
- # BATCH RETRY HELPER
+ # AUDIT TRAIL HELPER
  # ============================================================================


+ def _add_geometry_audit_trail(
+ df_validated: pd.DataFrame,
+ input_geojson_filepath: str,
+ gdf_original_geoms: gpd.GeoDataFrame = None,
+ logger: logging.Logger = None,
+ ) -> pd.DataFrame:
+ """
+ Add original input geometries as geo_original column for audit trail.
+
+ Parameters
+ ----------
+ df_validated : pd.DataFrame
+ Validated DataFrame to add audit trail to
+ input_geojson_filepath : str
+ Path to original GeoJSON file
+ gdf_original_geoms : gpd.GeoDataFrame, optional
+ Pre-loaded original geometries (to avoid reloading)
+ logger : logging.Logger, optional
+ Logger for output
+
+ Returns
+ -------
+ pd.DataFrame
+ DataFrame with geo_original column added
+ """
+ import json
+ from shapely.geometry import mapping
+
+ logger = logger or logging.getLogger("whisp")
+
+ try:
+ # Load original geometries if not provided
+ if gdf_original_geoms is None:
+ logger.warning("Original geometries not pre-loaded, loading now...")
+ gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
+
+ # Create DataFrame with plotId and geo_original
+ df_original_geom = pd.DataFrame(
+ {
+ "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+ "geo_original": gdf_original_geoms["geometry"].apply(
+ lambda g: json.dumps(mapping(g)) if g is not None else None
+ ),
+ }
+ )
+
+ # Merge original geometries back
+ df_result = df_validated.merge(df_original_geom, on="plotId", how="left")
+ logger.info("Audit trail added: geo_original column")
+ return df_result
+
+ except Exception as e:
+ logger.warning(f"Error adding audit trail: {e}")
+ # Return original DataFrame if audit trail fails
+ return df_validated
+
+
  # ============================================================================
  # BATCH RETRY HELPER - DEPRECATED (removed due to semaphore deadlock issues)
  # ============================================================================
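Note: this helper consolidates the audit-trail logic that was previously duplicated inline in the concurrent and sequential wrappers (see the later hunks). A minimal usage sketch with hypothetical inputs, assuming the helper is in scope:

    import logging
    import geopandas as gpd
    import pandas as pd
    from shapely.geometry import Point

    # Hypothetical stand-ins for the validated stats table and the original input geometries
    df_validated = pd.DataFrame({"plotId": [1, 2], "Area_ha": [0.5, 1.2]})
    gdf_original_geoms = gpd.GeoDataFrame(geometry=[Point(0, 0), Point(1, 1)], crs="EPSG:4326")

    df_with_audit = _add_geometry_audit_trail(
        df_validated=df_validated,
        input_geojson_filepath="plots.geojson",  # only read when gdf_original_geoms is None
        gdf_original_geoms=gdf_original_geoms,
        logger=logging.getLogger("whisp"),
    )

    # geo_original holds each input geometry serialized as GeoJSON text;
    # on any failure the helper logs a warning and returns df_validated unchanged.
    print(df_with_audit["geo_original"].iloc[0])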
@@ -1218,7 +1125,6 @@ def whisp_stats_geojson_to_df_concurrent(
  logger: logging.Logger = None,
  # Format parameters (auto-detect from config if not provided)
  decimal_places: int = None,
- status_file: str = None,
  ) -> pd.DataFrame:
  """
  Process GeoJSON concurrently to compute Whisp statistics with automatic formatting.
@@ -1359,11 +1265,12 @@ def whisp_stats_geojson_to_df_concurrent(
  # Setup semaphore for EE concurrency control
  ee_semaphore = threading.BoundedSemaphore(max_concurrent)

- # Progress tracker with heartbeat for long-running jobs
- progress = ProgressTracker(
- len(batches), logger=logger, heartbeat_interval=180, status_file=status_file
- )
- progress.start_heartbeat()
+ # Progress tracking setup
+ progress_lock = threading.Lock()
+ completed_batches = 0
+ milestones = _get_progress_milestones(len(gdf_for_ee))
+ shown_milestones = set()
+ start_time = time.time()

  results = []
@@ -1477,7 +1384,18 @@ def whisp_stats_geojson_to_df_concurrent(
  suffixes=("_ee", "_client"),
  )
  results.append(merged)
- progress.update()
+
+ # Update progress
+ with progress_lock:
+ completed_batches += 1
+ _log_progress(
+ completed_batches,
+ len(batches),
+ milestones,
+ shown_milestones,
+ start_time,
+ logger,
+ )

  except Exception as e:
  # Batch failed - fail fast with clear guidance
@@ -1492,15 +1410,18 @@ def whisp_stats_geojson_to_df_concurrent(
  batch_errors.append((batch_idx, original_batch, error_msg))
  except (KeyboardInterrupt, SystemExit) as interrupt:
  logger.warning("Processing interrupted by user")
- # Update status file with interrupted state
- progress._write_status_file(status="interrupted")
  raise interrupt
  finally:
  # Restore logger levels
  fiona_logger.setLevel(old_fiona_level)
  pyogrio_logger.setLevel(old_pyogrio_level)

- progress.finish()
+ # Log completion
+ total_time = time.time() - start_time
+ time_str = _format_time(total_time)
+ logger.info(
+ f"Processing complete: {completed_batches:,}/{len(batches):,} batches in {time_str}"
+ )

  # If we have batch errors after retry attempts, fail the entire process
  if batch_errors:
@@ -1577,7 +1498,9 @@ def whisp_stats_geojson_to_df_concurrent(

  # Retry batch processing with validated image
  results = []
- progress = ProgressTracker(len(batches), logger=logger)
+ retry_completed = 0
+ retry_shown = set()
+ retry_start = time.time()

  # Suppress fiona logging during batch processing (threads create new loggers)
  fiona_logger = logging.getLogger("fiona")
@@ -1609,13 +1532,28 @@ def whisp_stats_geojson_to_df_concurrent(
  suffixes=("", "_client"),
  )
  results.append(merged)
- progress.update()
+
+ # Update retry progress
+ with progress_lock:
+ retry_completed += 1
+ _log_progress(
+ retry_completed,
+ len(batches),
+ milestones,
+ retry_shown,
+ retry_start,
+ logger,
+ )
  except Exception as e:
  logger.error(
  f"Batch processing error (retry): {str(e)[:100]}"
  )

- progress.finish()
+ # Log retry completion
+ retry_time = time.time() - retry_start
+ logger.info(
+ f"Retry complete: {retry_completed:,}/{len(batches):,} batches in {_format_time(retry_time)}"
+ )
  finally:
  # Restore logger levels
  fiona_logger.setLevel(old_fiona_level)
@@ -1847,8 +1785,7 @@ def whisp_stats_geojson_to_df_concurrent(
  logger.warning(f"{plot_id_column} column missing, regenerating...")
  formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))

- # Sort by plot_id to ensure consistent output order
- formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+ # Note: Sorting is handled by format_stats_dataframe in the formatted wrapper functions

  logger.info(f"Processing complete: {len(formatted):,} features")
  return formatted
@@ -2101,10 +2038,11 @@ def whisp_stats_geojson_to_df_sequential(
  convert_water_flag=True,
  )

- # Ensure plot_id exists and sort by it
+ # Ensure plot_id exists
  if plot_id_column not in formatted.columns:
  formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
- formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+
+ # Note: Sorting is handled by format_stats_dataframe in the formatted wrapper functions

  logger.info(f"Processing complete: {len(formatted):,} features")

@@ -2138,7 +2076,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
  water_flag_threshold: float = 0.5,
  sort_column: str = "plotId",
  geometry_audit_trail: bool = False,
- status_file: str = None,
  ) -> pd.DataFrame:
  """
  Process GeoJSON concurrently with automatic formatting and validation.
@@ -2231,7 +2168,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
  max_retries=max_retries,
  add_metadata_server=add_metadata_server,
  logger=logger,
- status_file=status_file,
  )

  # Step 2: Format the output
@@ -2276,50 +2212,21 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
  custom_bands=custom_bands,
  )

- # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
+ # Step 2c: Add audit trail column (AFTER validation to preserve columns)
  if geometry_audit_trail:
- logger.debug("Adding audit trail columns...")
- try:
- # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
- if gdf_original_geoms is None:
- logger.warning("Original geometries not pre-loaded, loading now...")
- gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
-
- # Use plotId from df_validated to maintain mapping
- df_original_geom = pd.DataFrame(
- {
- "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
- "geo_original": gdf_original_geoms["geometry"].apply(
- lambda g: json.dumps(mapping(g)) if g is not None else None
- ),
- }
- )
-
- # Merge original geometries back
- df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
-
- # Store processing metadata
- df_validated.attrs["processing_metadata"] = {
- "whisp_version": "3.0.0a1",
- "processing_date": datetime.now().isoformat(),
- "processing_mode": "concurrent",
- "ee_endpoint": "high_volume",
- "validate_geometries": validate_geometries,
- "datasets_used": national_codes or [],
- "geometry_audit_trail": True,
- }
-
- logger.info(f"Audit trail added: geo_original column")
-
- except Exception as e:
- logger.warning(f"Error adding audit trail: {e}")
- # Continue without audit trail if something fails
+ logger.debug("Adding geo_original column for audit trail...")
+ df_validated = _add_geometry_audit_trail(
+ df_validated=df_validated,
+ input_geojson_filepath=input_geojson_filepath,
+ gdf_original_geoms=gdf_original_geoms,
+ logger=logger,
+ )

  # Add processing metadata column using pd.concat to avoid fragmentation warning
  metadata_dict = {
- "whisp_version": "3.0.0a1",
+ "whisp_version": get_version("openforis-whisp"),
  "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
- "%Y-%m-%d %H:%M:%S UTC"
+ "%Y-%m-%d %H:%M:%S%z"
  ),
  }
  metadata_series = pd.Series(
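Note: the metadata block no longer hard-codes the release string ("3.0.0a1" was already stale for this package) and instead reads the installed version at runtime, while the timestamp switches from a literal "UTC" suffix to a numeric offset. Roughly, the new values look like this (outputs indicative only):

    from datetime import datetime, timezone
    from importlib.metadata import version as get_version

    print(get_version("openforis-whisp"))  # e.g. "3.0.0a8"
    print(datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S%z"))  # e.g. "2025-06-01 09:30:00+0000"

The same substitution appears again in the sequential wrapper below.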
@@ -2347,7 +2254,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
  water_flag_threshold: float = 0.5,
  sort_column: str = "plotId",
  geometry_audit_trail: bool = False,
- status_file: str = None,
  ) -> pd.DataFrame:
  """
  Process GeoJSON sequentially with automatic formatting and validation.
@@ -2472,49 +2378,21 @@ def whisp_formatted_stats_geojson_to_df_sequential(
  custom_bands=custom_bands,
  )

- # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
+ # Step 2c: Add audit trail column (AFTER validation to preserve columns)
  if geometry_audit_trail:
- logger.debug("Adding audit trail columns...")
- try:
- # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
- if gdf_original_geoms is None:
- logger.warning("Original geometries not pre-loaded, loading now...")
- gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
-
- # Use plotId from df_validated to maintain mapping
- df_original_geom = pd.DataFrame(
- {
- "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
- "geo_original": gdf_original_geoms["geometry"].apply(
- lambda g: json.dumps(mapping(g)) if g is not None else None
- ),
- }
- )
-
- # Merge original geometries back
- df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
-
- # Store processing metadata
- df_validated.attrs["processing_metadata"] = {
- "whisp_version": "3.0.0a1",
- "processing_date": datetime.now().isoformat(),
- "processing_mode": "sequential",
- "ee_endpoint": "standard",
- "datasets_used": national_codes or [],
- "geometry_audit_trail": True,
- }
-
- logger.info(f"Audit trail added: geo_original column")
-
- except Exception as e:
- logger.warning(f"Error adding audit trail: {e}")
- # Continue without audit trail if something fails
+ logger.debug("Adding geo_original column for audit trail...")
+ df_validated = _add_geometry_audit_trail(
+ df_validated=df_validated,
+ input_geojson_filepath=input_geojson_filepath,
+ gdf_original_geoms=gdf_original_geoms,
+ logger=logger,
+ )

  # Add processing metadata column using pd.concat to avoid fragmentation warning
  metadata_dict = {
- "whisp_version": "3.0.0a1",
+ "whisp_version": get_version("openforis-whisp"),
  "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
- "%Y-%m-%d %H:%M:%S UTC"
+ "%Y-%m-%d %H:%M:%S%z"
  ),
  }
  metadata_series = pd.Series(
@@ -2552,7 +2430,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
  water_flag_threshold: float = 0.5,
  sort_column: str = "plotId",
  geometry_audit_trail: bool = False,
- status_file: str = None,
  ) -> pd.DataFrame:
  """
  Process GeoJSON to Whisp statistics with optimized fast processing.
@@ -2654,7 +2531,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
  water_flag_threshold=water_flag_threshold,
  sort_column=sort_column,
  geometry_audit_trail=geometry_audit_trail,
- status_file=status_file,
  )
  else:  # sequential
  logger.debug("Routing to sequential processing...")
@@ -2672,5 +2548,4 @@ def whisp_formatted_stats_geojson_to_df_fast(
  water_flag_threshold=water_flag_threshold,
  sort_column=sort_column,
  geometry_audit_trail=geometry_audit_trail,
- status_file=status_file,
  )