openforis-whisp 3.0.0a2__py3-none-any.whl → 3.0.0a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +8 -8
- openforis_whisp/advanced_stats.py +476 -312
- openforis_whisp/data_checks.py +80 -28
- openforis_whisp/datasets.py +14 -0
- openforis_whisp/logger.py +15 -3
- openforis_whisp/parameters/lookup_gee_datasets.csv +3 -2
- openforis_whisp/pd_schemas.py +7 -2
- openforis_whisp/reformat.py +8 -30
- openforis_whisp/stats.py +16 -62
- openforis_whisp/utils.py +468 -80
- {openforis_whisp-3.0.0a2.dist-info → openforis_whisp-3.0.0a4.dist-info}/METADATA +1 -1
- openforis_whisp-3.0.0a4.dist-info/RECORD +20 -0
- openforis_whisp-3.0.0a2.dist-info/RECORD +0 -20
- {openforis_whisp-3.0.0a2.dist-info → openforis_whisp-3.0.0a4.dist-info}/LICENSE +0 -0
- {openforis_whisp-3.0.0a2.dist-info → openforis_whisp-3.0.0a4.dist-info}/WHEEL +0 -0
@@ -36,6 +36,24 @@ from typing import Optional, List, Dict, Any, Tuple, Union
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import tempfile

+# Configure the "whisp" logger with auto-flush handler for Colab visibility
+_whisp_logger = logging.getLogger("whisp")
+if not _whisp_logger.handlers:
+    _handler = logging.StreamHandler(sys.stdout)
+    _handler.setLevel(logging.DEBUG)
+    _handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
+    # Override emit to force flush after each message for Colab
+    _original_emit = _handler.emit
+
+    def _emit_with_flush(record):
+        _original_emit(record)
+        sys.stdout.flush()
+
+    _handler.emit = _emit_with_flush
+    _whisp_logger.addHandler(_handler)
+    _whisp_logger.setLevel(logging.INFO)
+    _whisp_logger.propagate = False  # Don't propagate to root to avoid duplicates
+
 # ============================================================================
 # STDOUT/STDERR SUPPRESSION CONTEXT MANAGER (for C-level output)
 # ============================================================================
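The handler wiring above means the "whisp" logger is ready as soon as the package is imported, so downstream code only needs the standard logging API to tune verbosity. A minimal sketch (standard library only; nothing here beyond the logger name "whisp" is part of the package API):

    import logging

    # Quieten Whisp to warnings and errors; the auto-flush stdout handler added
    # at import time stays attached, so no handler reconfiguration is needed.
    logging.getLogger("whisp").setLevel(logging.WARNING)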
@@ -445,6 +463,16 @@ def join_admin_codes(
         columns=["admin_code_for_join", "gaul1_code"], errors="ignore"
     )

+    # Fill NaN values with "Unknown" and "not found" for features outside admin boundaries
+    # (e.g., points in the ocean or international waters)
+    df_joined[iso3_country_column] = df_joined[iso3_country_column].fillna(
+        "Unknown"
+    )
+    df_joined[iso2_country_column] = df_joined[iso2_country_column].fillna(
+        "not found"
+    )
+    df_joined[admin_1_column] = df_joined[admin_1_column].fillna("Unknown")
+
     logger.debug(
         f"Admin codes joined: {iso3_country_column}, {iso2_country_column}, {admin_1_column}"
     )
@@ -461,10 +489,16 @@ class ProgressTracker:

     Shows progress at adaptive milestones (more frequent for small datasets,
     less frequent for large datasets) with estimated time remaining based on
-    processing speed.
+    processing speed. Includes time-based heartbeat to prevent long silences.
     """

-    def __init__(
+    def __init__(
+        self,
+        total: int,
+        logger: logging.Logger = None,
+        heartbeat_interval: int = 180,
+        status_file: str = None,
+    ):
         """
         Initialize progress tracker.

@@ -474,26 +508,147 @@ class ProgressTracker:
             Total number of items to process
         logger : logging.Logger, optional
             Logger for output
+        heartbeat_interval : int, optional
+            Seconds between heartbeat messages (default: 180 = 3 minutes)
+        status_file : str, optional
+            Path to JSON status file for API/web app consumption.
+            Checkpoints auto-save to same directory as status_file.
         """
         self.total = total
         self.completed = 0
         self.lock = threading.Lock()
         self.logger = logger or logging.getLogger("whisp")
+        self.heartbeat_interval = heartbeat_interval
+
+        # Handle status_file: if directory passed, auto-generate filename
+        if status_file:
+            import os
+
+            if os.path.isdir(status_file):
+                self.status_file = os.path.join(
+                    status_file, "whisp_processing_status.json"
+                )
+            else:
+                # Validate that parent directory exists
+                parent_dir = os.path.dirname(status_file)
+                if parent_dir and not os.path.isdir(parent_dir):
+                    self.logger.warning(
+                        f"Status file directory does not exist: {parent_dir}"
+                    )
+                    self.status_file = None
+                else:
+                    self.status_file = status_file
+        else:
+            self.status_file = None

         # Adaptive milestones based on dataset size
         # Small datasets (< 50): show every 25% (not too spammy)
         # Medium (50-500): show every 20%
-        # Large (500
+        # Large (500-1000): show every 10%
+        # Very large (1000+): show every 5% (cleaner for long jobs)
         if total < 50:
             self.milestones = {25, 50, 75, 100}
         elif total < 500:
             self.milestones = {20, 40, 60, 80, 100}
-
+        elif total < 1000:
             self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
+        else:
+            self.milestones = {
+                5,
+                10,
+                15,
+                20,
+                25,
+                30,
+                35,
+                40,
+                45,
+                50,
+                55,
+                60,
+                65,
+                70,
+                75,
+                80,
+                85,
+                90,
+                95,
+                100,
+            }

         self.shown_milestones = set()
         self.start_time = time.time()
         self.last_update_time = self.start_time
+        self.heartbeat_stop = threading.Event()
+        self.heartbeat_thread = None
+
+    def _write_status_file(self, status: str = "processing") -> None:
+        """Write current progress to JSON status file using atomic write."""
+        if not self.status_file:
+            return
+
+        try:
+            import json
+            import os
+
+            elapsed = time.time() - self.start_time
+            percent = (self.completed / self.total * 100) if self.total > 0 else 0
+            rate = self.completed / elapsed if elapsed > 0 else 0
+            eta = (
+                (self.total - self.completed) / rate * 1.15
+                if rate > 0 and percent >= 5
+                else None
+            )
+
+            # Write to temp file then atomic rename to prevent partial reads
+            from datetime import datetime
+
+            temp_file = self.status_file + ".tmp"
+            with open(temp_file, "w") as f:
+                json.dump(
+                    {
+                        "status": status,
+                        "progress": f"{self.completed}/{self.total}",
+                        "percent": round(percent, 1),
+                        "elapsed_sec": round(elapsed),
+                        "eta_sec": round(eta) if eta else None,
+                        "updated_at": datetime.now().isoformat(),
+                    },
+                    f,
+                )
+            os.replace(temp_file, self.status_file)
+        except Exception:
+            pass
+
+    def start_heartbeat(self) -> None:
+        """Start background heartbeat thread for time-based progress updates."""
+        if self.heartbeat_thread is None or not self.heartbeat_thread.is_alive():
+            self.heartbeat_stop.clear()
+            self.heartbeat_thread = threading.Thread(
+                target=self._heartbeat_loop, daemon=True
+            )
+            self.heartbeat_thread.start()
+            # Write initial status
+            self._write_status_file(status="processing")
+
+    def _heartbeat_loop(self) -> None:
+        """Background loop that logs progress at time intervals."""
+        while not self.heartbeat_stop.wait(self.heartbeat_interval):
+            with self.lock:
+                # Only log if we haven't shown a milestone recently
+                time_since_update = time.time() - self.last_update_time
+                if (
+                    time_since_update >= self.heartbeat_interval
+                    and self.completed < self.total
+                ):
+                    elapsed = time.time() - self.start_time
+                    percent = int((self.completed / self.total) * 100)
+                    elapsed_str = self._format_time(elapsed)
+                    self.logger.info(
+                        f"[Processing] {self.completed:,}/{self.total:,} batches ({percent}%) | "
+                        f"Elapsed: {elapsed_str}"
+                    )
+                    self.last_update_time = time.time()

     def update(self, n: int = 1) -> None:
         """
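The status file written by _write_status_file is refreshed atomically (temp file plus os.replace), so an API or notebook can poll it safely while batches are still running. A minimal polling sketch, assuming only the JSON keys shown above (status, progress, percent, elapsed_sec, eta_sec, updated_at) and a hypothetical default path:

    import json
    import time

    def poll_whisp_status(path="whisp_processing_status.json", interval=30):
        # Poll the JSON status file until the run reports a terminal state.
        while True:
            try:
                with open(path) as f:
                    status = json.load(f)
            except FileNotFoundError:
                time.sleep(interval)  # not written yet
                continue
            print(f"{status['status']}: {status['progress']} ({status['percent']}%)")
            if status["status"] in ("completed", "interrupted"):
                return status
            time.sleep(interval)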
@@ -508,7 +663,7 @@ class ProgressTracker:
             self.completed += n
             percent = int((self.completed / self.total) * 100)

-            # Show milestone messages (
+            # Show milestone messages (5%, 10%, 15%... for large datasets)
             for milestone in sorted(self.milestones):
                 if percent >= milestone and milestone not in self.shown_milestones:
                     self.shown_milestones.add(milestone)
@@ -517,20 +672,36 @@ class ProgressTracker:
                     elapsed = time.time() - self.start_time
                     rate = self.completed / elapsed if elapsed > 0 else 0
                     remaining_items = self.total - self.completed
-
+
+                    # Calculate ETA with padding for overhead (loading, joins, etc.)
+                    # Don't show ETA until we have some samples (at least 5% complete)
+                    if rate > 0 and self.completed >= max(5, self.total * 0.05):
+                        eta_seconds = (
+                            remaining_items / rate
+                        ) * 1.15  # Add 15% padding for overhead
+                    else:
+                        eta_seconds = 0

                     # Format time strings
-                    eta_str =
+                    eta_str = (
+                        self._format_time(eta_seconds)
+                        if eta_seconds > 0
+                        else "calculating..."
+                    )
                     elapsed_str = self._format_time(elapsed)

                     # Build progress message
-                    msg = f"Progress: {self.completed}/{self.total} ({percent}%)"
+                    msg = f"Progress: {self.completed:,}/{self.total:,} batches ({percent}%)"
                     if percent < 100:
                         msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
                     else:
                         msg += f" | Total time: {elapsed_str}"

                     self.logger.info(msg)
+                    self.last_update_time = time.time()
+
+                    # Update status file for API consumption
+                    self._write_status_file()

     @staticmethod
     def _format_time(seconds: float) -> str:
@@ -544,14 +715,21 @@ class ProgressTracker:
             hours = seconds / 3600
             return f"{hours:.1f}h"

-    def finish(self) -> None:
-        """
+    def finish(self, output_file: str = None) -> None:
+        """Stop heartbeat and log completion."""
+        # Stop heartbeat thread
+        self.heartbeat_stop.set()
+        if self.heartbeat_thread and self.heartbeat_thread.is_alive():
+            self.heartbeat_thread.join(timeout=1)
+
         with self.lock:
             total_time = time.time() - self.start_time
             time_str = self._format_time(total_time)
-            self.
-
-
+            msg = f"Processing complete: {self.completed:,}/{self.total:,} batches in {time_str}"
+            self.logger.info(msg)
+
+            # Write final status
+            self._write_status_file(status="completed")


 # ============================================================================
@@ -600,18 +778,22 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
         If incorrect endpoint and raise_error=True
     """
     if not check_ee_endpoint(endpoint_type):
-        msg = (
-            f"Not using {endpoint_type.upper()} endpoint.\n"
-            f"Current URL: {ee.data._cloud_api_base_url}\n"
-            f"\nTo use {endpoint_type} endpoint, run:\n"
-        )
-        msg += "ee.Reset()\n"
         if endpoint_type == "high-volume":
-            msg
-            "
+            msg = (
+                "Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
+                "ee.Reset()\n"
+                "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
+                "Or with project specified (e.g. when in Colab):\n"
+                "ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
+            )
+        else:  # standard endpoint
+            msg = (
+                "Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
+                "ee.Reset()\n"
+                "ee.Initialize()\n"
+                "Or with project specified (e.g. when in Colab):\n"
+                "ee.Initialize(project='your_cloud_project_name')"
             )
-        else:
-            msg += "ee.Initialize() # Uses standard endpoint by default"

     if raise_error:
         raise RuntimeError(msg)
@@ -808,8 +990,8 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:

 def clean_geodataframe(
     gdf: gpd.GeoDataFrame,
-    remove_nulls: bool =
-
+    remove_nulls: bool = False,
+    repair_geometries: bool = False,
     logger: logging.Logger = None,
 ) -> gpd.GeoDataFrame:
     """
@@ -820,9 +1002,11 @@ def clean_geodataframe(
     gdf : gpd.GeoDataFrame
         Input GeoDataFrame
     remove_nulls : bool
-        Remove null geometries
-
-
+        Remove null geometries. Defaults to False to preserve data integrity.
+        Set to True only if you explicitly want to drop rows with null geometries.
+    repair_geometries : bool
+        Repair invalid geometries using Shapely's make_valid(). Defaults to False to preserve
+        original geometries. Set to True only if you want to automatically repair invalid geometries.
     logger : logging.Logger, optional
         Logger for output

@@ -839,11 +1023,11 @@ def clean_geodataframe(
             logger.warning(f"Removing {null_count} null geometries")
             gdf = gdf[~gdf.geometry.isna()].copy()

-    if
+    if repair_geometries:
         valid_count = gdf.geometry.is_valid.sum()
         invalid_count = len(gdf) - valid_count
         if invalid_count > 0:
-            logger.warning(f"
+            logger.warning(f"Repairing {invalid_count} invalid geometries")
             from shapely.validation import make_valid

             gdf = gdf.copy()
@@ -855,6 +1039,19 @@ def clean_geodataframe(
     return gdf


+# ============================================================================
+# BATCH RETRY HELPER
+# ============================================================================
+
+
+# ============================================================================
+# BATCH RETRY HELPER - DEPRECATED (removed due to semaphore deadlock issues)
+# ============================================================================
+# Note: Retry logic via sub-batching has been removed. Instead, use fail-fast
+# approach: when a batch fails, reduce batch_size parameter and retry manually.
+# This avoids semaphore deadlocks and provides clearer error messages.
+
+
 # ============================================================================
 # EE PROCESSING WITH RETRY LOGIC
 # ============================================================================
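With the sub-batching retry helper gone, recovery is the caller's job: a failed batch surfaces as a RuntimeError and the suggested fix is to rerun with a smaller batch_size. A hedged sketch of that caller-side loop (the function name and batch_size parameter come from this diff; the halving strategy is illustrative, not part of the package):

    def run_with_smaller_batches(path, batch_size=10, min_batch_size=2):
        # Fail-fast retry: halve batch_size after each failed attempt, as the
        # error message above recommends, until a floor is reached.
        while batch_size >= min_batch_size:
            try:
                return whisp_stats_geojson_to_df_concurrent(
                    input_geojson_filepath=path, batch_size=batch_size
                )
            except RuntimeError as exc:
                print(f"Run failed at batch_size={batch_size}: {exc}")
                batch_size //= 2
        raise RuntimeError("Processing failed even at the minimum batch size")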
@@ -964,7 +1161,6 @@ def process_ee_batch(
 def whisp_stats_geojson_to_df_concurrent(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -977,6 +1173,7 @@ def whisp_stats_geojson_to_df_concurrent(
     logger: logging.Logger = None,
     # Format parameters (auto-detect from config if not provided)
     decimal_places: int = None,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently to compute Whisp statistics with automatic formatting.
@@ -991,8 +1188,6 @@ def whisp_stats_geojson_to_df_concurrent(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1040,8 +1235,29 @@ def whisp_stats_geojson_to_df_concurrent(
     gdf = _load_geojson_silently(input_geojson_filepath)
     logger.info(f"Loaded {len(gdf):,} features")

+    # Validate external_id_column if provided (lightweight client-side check)
+    if external_id_column and external_id_column not in gdf.columns:
+        # Exclude geometry column from available columns list
+        available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
+        raise ValueError(
+            f"Column '{external_id_column}' not found in GeoJSON properties. "
+            f"Available columns: {available_cols}"
+        )
+
+    # Check completeness of external_id_column (warn if nulls exist)
+    if external_id_column and external_id_column in gdf.columns:
+        null_count = gdf[external_id_column].isna().sum()
+        if null_count > 0:
+            null_pct = (null_count / len(gdf)) * 100
+            logger.warning(
+                f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"These features may have missing external IDs in output."
+            )
+
     if validate_geometries:
-        gdf = clean_geodataframe(
+        gdf = clean_geodataframe(
+            gdf, remove_nulls=False, repair_geometries=False, logger=logger
+        )

     # Add stable plotIds for merging (starting from 1, not 0)
     gdf[plot_id_column] = range(1, len(gdf) + 1)
@@ -1080,13 +1296,18 @@ def whisp_stats_geojson_to_df_concurrent(

     # Batch the data
     batches = batch_geodataframe(gdf_for_ee, batch_size)
-    logger.info(
+    logger.info(
+        f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches (concurrent mode)..."
+    )

     # Setup semaphore for EE concurrency control
     ee_semaphore = threading.BoundedSemaphore(max_concurrent)

-    # Progress tracker
-    progress = ProgressTracker(
+    # Progress tracker with heartbeat for long-running jobs
+    progress = ProgressTracker(
+        len(batches), logger=logger, heartbeat_interval=180, status_file=status_file
+    )
+    progress.start_heartbeat()

     results = []

@@ -1127,64 +1348,77 @@ def whisp_stats_geojson_to_df_concurrent(
     pyogrio_logger.setLevel(logging.CRITICAL)

     try:
-
-
-
-
-
-
-
-        for future in as_completed(futures):
-            try:
-                batch_idx, df_server, df_client = future.result()
-
-                # Merge server and client results
-                if plot_id_column not in df_server.columns:
-                    df_server[plot_id_column] = range(len(df_server))
-
-                # Keep all EE statistics from server (all columns with _sum and _median suffixes)
-                # These are the actual EE processing results
-                df_server_clean = df_server.copy()
-
-                # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
-                # (formatted wrapper handles keep_external_columns parameter)
-                keep_external_columns = [plot_id_column]
-                if (
-                    external_id_column
-                    and external_id_column in df_client.columns
-                ):
-                    keep_external_columns.append(external_id_column)
-                if "geometry" in df_client.columns:
-                    keep_external_columns.append("geometry")
-                # Keep geometry type column (Geometry_type)
-                if geometry_type_column in df_client.columns:
-                    keep_external_columns.append(geometry_type_column)
-                # Also keep centroid columns (Centroid_lon, Centroid_lat)
-                centroid_cols = [
-                    c for c in df_client.columns if c.startswith("Centroid_")
-                ]
-                keep_external_columns.extend(centroid_cols)
-
-                df_client_clean = df_client[
-                    [c for c in keep_external_columns if c in df_client.columns]
-                ].drop_duplicates()
-
-                merged = df_server_clean.merge(
-                    df_client_clean,
-                    on=plot_id_column,
-                    how="left",
-                    suffixes=("_ee", "_client"),
-                )
-                results.append(merged)
-                progress.update()
+        # Don't suppress stdout here - we want progress messages to show in Colab
+        with ThreadPoolExecutor(max_workers=pool_workers) as executor:
+            futures = {
+                executor.submit(process_batch, i, batch): i
+                for i, batch in enumerate(batches)
+            }

-
-
-                import traceback
+            # Track which batches failed for retry
+            batch_map = {i: batch for i, batch in enumerate(batches)}
+            batch_futures = {future: i for future, i in futures.items()}

-
-
+            for future in as_completed(futures):
+                batch_idx = batch_futures[future]
+                try:
+                    batch_idx, df_server, df_client = future.result()
+
+                    # Merge server and client results
+                    if plot_id_column not in df_server.columns:
+                        df_server[plot_id_column] = range(len(df_server))
+
+                    # Keep all EE statistics from server (all columns with _sum and _median suffixes)
+                    # These are the actual EE processing results
+                    df_server_clean = df_server.copy()
+
+                    # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
+                    # (formatted wrapper handles keep_external_columns parameter)
+                    keep_external_columns = [plot_id_column]
+                    if external_id_column and external_id_column in df_client.columns:
+                        keep_external_columns.append(external_id_column)
+                    if "geometry" in df_client.columns:
+                        keep_external_columns.append("geometry")
+                    # Keep geometry type column (Geometry_type)
+                    if geometry_type_column in df_client.columns:
+                        keep_external_columns.append(geometry_type_column)
+                    # Also keep centroid columns (Centroid_lon, Centroid_lat)
+                    centroid_cols = [
+                        c for c in df_client.columns if c.startswith("Centroid_")
+                    ]
+                    keep_external_columns.extend(centroid_cols)
+
+                    df_client_clean = df_client[
+                        [c for c in keep_external_columns if c in df_client.columns]
+                    ]
+                    # Don't drop duplicates - we need one row per feature (one per plot_id)
+                    # Each plot_id should have exactly one row with its metadata
+
+                    merged = df_server_clean.merge(
+                        df_client_clean,
+                        on=plot_id_column,
+                        how="left",
+                        suffixes=("_ee", "_client"),
+                    )
+                    results.append(merged)
+                    progress.update()
+
+                except Exception as e:
+                    # Batch failed - fail fast with clear guidance
+                    error_msg = str(e)
+                    logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
+                    logger.debug(f"Full error: {error_msg}")
+
+                    # Get original batch for error reporting
+                    original_batch = batch_map[batch_idx]
+
+                    # Add to batch errors for final reporting
+                    batch_errors.append((batch_idx, original_batch, error_msg))
+    except (KeyboardInterrupt, SystemExit) as interrupt:
+        logger.warning("Processing interrupted by user")
+        # Update status file with interrupted state
+        progress._write_status_file(status="interrupted")
+        raise interrupt
     finally:
         # Restore logger levels
         fiona_logger.setLevel(old_fiona_level)
@@ -1192,8 +1426,60 @@ def whisp_stats_geojson_to_df_concurrent(

     progress.finish()

-    #
-    if batch_errors
+    # If we have batch errors after retry attempts, fail the entire process
+    if batch_errors:
+        total_failed_rows = sum(len(batch) for _, batch, _ in batch_errors)
+        failed_batch_indices = [str(idx) for idx, _, _ in batch_errors]
+
+        # Format detailed error information for debugging
+        error_details_list = []
+        for idx, batch, msg in batch_errors:
+            error_details_list.append(f" Batch {idx} ({len(batch)} features): {msg}")
+        error_details = "\n".join(error_details_list)
+
+        # Analyze error patterns for debugging hints
+        error_patterns = {
+            "memory": any("memory" in msg.lower() for _, _, msg in batch_errors),
+            "request_size": any(
+                keyword in msg.lower()
+                for _, _, msg in batch_errors
+                for keyword in ["too large", "10mb", "payload", "size limit"]
+            ),
+            "quota": any("quota" in msg.lower() for _, _, msg in batch_errors),
+            "timeout": any("timeout" in msg.lower() for _, _, msg in batch_errors),
+        }
+
+        # Build helpful suggestions based on error patterns
+        suggestions = []
+        if error_patterns["memory"]:
+            suggestions.append(
+                f" • Reduce batch_size parameter (currently: {batch_size}). Try: batch_size=5 or lower"
+            )
+        if error_patterns["request_size"]:
+            suggestions.append(
+                " • Request payload too large: reduce batch_size or simplify input geometries"
+            )
+        if error_patterns["quota"]:
+            suggestions.append(" • Earth Engine quota exceeded: wait and retry later")
+        if error_patterns["timeout"]:
+            suggestions.append(
+                " • Processing timeout: reduce batch_size or simplify input geometries"
+            )
+
+        suggestions_text = (
+            "\nDebugging hints:\n" + "\n".join(suggestions) if suggestions else ""
+        )
+
+        raise RuntimeError(
+            f"Failed to process {len(batch_errors)} batch(es):\n"
+            f"\n{error_details}\n"
+            f"\nTotal rows affected: {total_failed_rows}\n"
+            f"{suggestions_text}\n"
+            f"Please reduce batch_size and try again."
+        )
+
+    # Check if we should retry with validation due to band errors (legacy band error handling)
+    if not results:
         # All batches failed - likely a bad band issue
         is_band_error = any(
             keyword in str(batch_errors)
@@ -1483,7 +1769,7 @@ def whisp_stats_geojson_to_df_concurrent(
             )
             raise retry_e

-        logger.info(f"
+        logger.info(f"Processing complete: {len(formatted):,} features")
         return formatted
     else:
         logger.error(" No results produced")
@@ -1498,7 +1784,6 @@ def whisp_stats_geojson_to_df_concurrent(
 def whisp_stats_geojson_to_df_sequential(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -1523,8 +1808,6 @@ def whisp_stats_geojson_to_df_sequential(
         Path to input GeoJSON
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1564,8 +1847,29 @@ def whisp_stats_geojson_to_df_sequential(
     gdf = _load_geojson_silently(input_geojson_filepath)
     logger.info(f"Loaded {len(gdf):,} features")

-    #
-
+    # Validate external_id_column if provided (lightweight client-side check)
+    if external_id_column and external_id_column not in gdf.columns:
+        # Exclude geometry column from available columns list
+        available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
+        raise ValueError(
+            f"Column '{external_id_column}' not found in GeoJSON properties. "
+            f"Available columns: {available_cols}"
+        )
+
+    # Check completeness of external_id_column (warn if nulls exist)
+    if external_id_column and external_id_column in gdf.columns:
+        null_count = gdf[external_id_column].isna().sum()
+        if null_count > 0:
+            null_pct = (null_count / len(gdf)) * 100
+            logger.warning(
+                f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"These features may have missing external IDs in output."
+            )
+
+    # Clean geometries (preserve both null and invalid geometries by default)
+    gdf = clean_geodataframe(
+        gdf, remove_nulls=False, repair_geometries=False, logger=logger
+    )

     # Add stable plotIds for merging (starting from 1, not 0)
     gdf[plot_id_column] = range(1, len(gdf) + 1)
@@ -1612,7 +1916,9 @@ def whisp_stats_geojson_to_df_sequential(
     reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)

     # Process server-side with error handling for bad bands
-    logger.info(
+    logger.info(
+        f"Processing {len(gdf):,} features with Earth Engine (sequential mode)..."
+    )
     try:
         results_fc = whisp_image.reduceRegions(collection=fc, reducer=reducer, scale=10)
         df_server = convert_ee_to_df(results_fc)
@@ -1698,7 +2004,7 @@ def whisp_stats_geojson_to_df_sequential(
         convert_water_flag=True,
     )

-    logger.info(f"
+    logger.info(f"Processing complete: {len(formatted):,} features")

     # Consolidate external_id_column to standardized 'external_id'
     if external_id_column:
@@ -1731,7 +2037,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
 def whisp_formatted_stats_geojson_to_df_concurrent(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -1748,7 +2053,8 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     convert_water_flag: bool = True,
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
-
+    geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently with automatic formatting and validation.
@@ -1764,8 +2070,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1799,14 +2103,10 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         Water flag ratio threshold (default 0.5)
     sort_column : str
         Column to sort by (default "plotId", None to skip)
-
-        If True, includes
-        - geo_original: Original input geometry (before EE processing)
-
-        - geometry_type: Processed geometry type (from EE)
-        - geometry_type_changed: Boolean flag if geometry changed
-        - geometry_type_transition: Description of how it changed
-        These columns enable full transparency and auditability for compliance tracking.
+    geometry_audit_trail : bool, default False
+        If True, includes original input geometry column:
+        - geo_original: Original input geometry (before EE processing), stored as GeoJSON
+        Enables geometry traceability for compliance and audit purposes.

     Returns
     -------
@@ -1826,15 +2126,17 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     decimal_places = _extract_decimal_places(stats_area_columns_formatting)
     logger.debug(f"Using decimal_places={decimal_places} from config")

-    #
-
+    # Load original geometries once here if needed for audit trail (avoid reloading later)
+    gdf_original_geoms = None
+    if geometry_audit_trail:
+        logger.debug("Pre-loading GeoJSON for geometry audit trail...")
+        gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)

     # Step 1: Get raw stats
     logger.debug("Step 1/2: Extracting statistics (concurrent)...")
     df_raw = whisp_stats_geojson_to_df_concurrent(
         input_geojson_filepath=input_geojson_filepath,
         external_id_column=external_id_column,
-        remove_geom=remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,
@@ -1845,6 +2147,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         max_retries=max_retries,
         add_metadata_server=add_metadata_server,
         logger=logger,
+        status_file=status_file,
     )

     # Step 2: Format the output
@@ -1890,95 +2193,39 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     )

     # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
-    if
+    if geometry_audit_trail:
         logger.debug("Adding audit trail columns...")
         try:
-            #
-
-
+            # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
+            if gdf_original_geoms is None:
+                logger.warning("Original geometries not pre-loaded, loading now...")
+                gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)

             # Use plotId from df_validated to maintain mapping
             df_original_geom = pd.DataFrame(
                 {
-                    "plotId": df_validated["plotId"].values[: len(
-                    "geo_original":
+                    "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                    "geo_original": gdf_original_geoms["geometry"].apply(
                         lambda g: json.dumps(mapping(g)) if g is not None else None
                     ),
-                    "geometry_type_original": gdf_original["geometry"].geom_type.values,
                 }
             )

             # Merge original geometries back
             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")

-            # Extract geometry type from processed 'geo' column if it exists
-            # Note: 'geo' column may not exist after validation removes extra columns
-            if "geo" in df_validated.columns:
-                # Use geo column from validated dataframe
-                def extract_geom_type(x):
-                    try:
-                        if isinstance(x, dict):
-                            return x.get("type")
-                        elif isinstance(x, str):
-                            # Handle both JSON strings and Python dict string representations
-                            try:
-                                parsed = json.loads(x)
-                            except:
-                                # Try ast.literal_eval for Python dict representations
-                                import ast
-
-                                parsed = ast.literal_eval(x)
-                            return (
-                                parsed.get("type") if isinstance(parsed, dict) else None
-                            )
-                    except:
-                        pass
-                    return None
-
-                df_validated["geometry_type"] = df_validated["geo"].apply(
-                    extract_geom_type
-                )
-            else:
-                # If geo doesn't exist, just use the original type
-                df_validated["geometry_type"] = df_validated["geometry_type_original"]
-
-            # Flag if geometry changed
-            df_validated["geometry_type_changed"] = (
-                df_validated["geometry_type_original"] != df_validated["geometry_type"]
-            )
-
-            # Classify the geometry type transition
-            def classify_transition(orig, proc):
-                if orig == proc:
-                    return "no_change"
-                elif proc == "LineString":
-                    return f"{orig}_simplified_to_linestring"
-                elif proc == "Point":
-                    return f"{orig}_simplified_to_point"
-                else:
-                    return f"{orig}_to_{proc}"
-
-            df_validated["geometry_type_transition"] = df_validated.apply(
-                lambda row: classify_transition(
-                    row["geometry_type_original"], row["geometry_type"]
-                ),
-                axis=1,
-            )
-
             # Store processing metadata
             df_validated.attrs["processing_metadata"] = {
-                "whisp_version": "
+                "whisp_version": "3.0.0a1",
                 "processing_date": datetime.now().isoformat(),
                 "processing_mode": "concurrent",
                 "ee_endpoint": "high_volume",
                 "validate_geometries": validate_geometries,
                 "datasets_used": national_codes or [],
-                "
+                "geometry_audit_trail": True,
             }

-            logger.info(
-                f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
-            )
+            logger.info(f"Audit trail added: geo_original column")

         except Exception as e:
             logger.warning(f"Error adding audit trail: {e}")
@@ -2003,7 +2250,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
 def whisp_formatted_stats_geojson_to_df_sequential(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -2016,7 +2262,8 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     convert_water_flag: bool = True,
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
-
+    geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON sequentially with automatic formatting and validation.
@@ -2032,8 +2279,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -2059,14 +2304,10 @@ def whisp_formatted_stats_geojson_to_df_sequential(
         Water flag ratio threshold (default 0.5)
     sort_column : str
         Column to sort by (default "plotId", None to skip)
-
-        If True, includes
-        - geo_original: Original input geometry (before EE processing)
-
-        - geometry_type: Processed geometry type (from EE)
-        - geometry_type_changed: Boolean flag if geometry changed
-        - geometry_type_transition: Description of how it changed
-        These columns enable full transparency and auditability for EUDR compliance.
+    geometry_audit_trail : bool, default True
+        If True, includes original input geometry column:
+        - geo_original: Original input geometry (before EE processing), stored as GeoJSON
+        Enables geometry traceability for compliance and audit purposes.

     Returns
     -------
@@ -2086,12 +2327,17 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     decimal_places = _extract_decimal_places(stats_area_columns_formatting)
     logger.debug(f"Using decimal_places={decimal_places} from config")

+    # Load original geometries once here if needed for audit trail (avoid reloading later)
+    gdf_original_geoms = None
+    if geometry_audit_trail:
+        logger.debug("Pre-loading GeoJSON for geometry audit trail...")
+        gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+
     # Step 1: Get raw stats
     logger.debug("Step 1/2: Extracting statistics (sequential)...")
     df_raw = whisp_stats_geojson_to_df_sequential(
         input_geojson_filepath=input_geojson_filepath,
         external_id_column=external_id_column,
-        remove_geom=remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,
@@ -2143,94 +2389,38 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     )

     # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
-    if
+    if geometry_audit_trail:
         logger.debug("Adding audit trail columns...")
         try:
-            #
-
-
+            # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
+            if gdf_original_geoms is None:
+                logger.warning("Original geometries not pre-loaded, loading now...")
+                gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)

             # Use plotId from df_validated to maintain mapping
             df_original_geom = pd.DataFrame(
                 {
-                    "plotId": df_validated["plotId"].values[: len(
-                    "geo_original":
+                    "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                    "geo_original": gdf_original_geoms["geometry"].apply(
                         lambda g: json.dumps(mapping(g)) if g is not None else None
                     ),
-                    "geometry_type_original": gdf_original["geometry"].geom_type.values,
                 }
             )

             # Merge original geometries back
             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")

-            # Extract geometry type from processed 'geo' column if it exists
-            # Note: 'geo' column may not exist after validation removes extra columns
-            if "geo" in df_validated.columns:
-                # Use geo column from validated dataframe
-                def extract_geom_type(x):
-                    try:
-                        if isinstance(x, dict):
-                            return x.get("type")
-                        elif isinstance(x, str):
-                            # Handle both JSON strings and Python dict string representations
-                            try:
-                                parsed = json.loads(x)
-                            except:
-                                # Try ast.literal_eval for Python dict representations
-                                import ast
-
-                                parsed = ast.literal_eval(x)
-                            return (
-                                parsed.get("type") if isinstance(parsed, dict) else None
-                            )
-                    except:
-                        pass
-                    return None
-
-                df_validated["geometry_type"] = df_validated["geo"].apply(
-                    extract_geom_type
-                )
-            else:
-                # If geo doesn't exist, just use the original type
-                df_validated["geometry_type"] = df_validated["geometry_type_original"]
-
-            # Flag if geometry changed
-            df_validated["geometry_type_changed"] = (
-                df_validated["geometry_type_original"] != df_validated["geometry_type"]
-            )
-
-            # Classify the geometry type transition
-            def classify_transition(orig, proc):
-                if orig == proc:
-                    return "no_change"
-                elif proc == "LineString":
-                    return f"{orig}_simplified_to_linestring"
-                elif proc == "Point":
-                    return f"{orig}_simplified_to_point"
-                else:
-                    return f"{orig}_to_{proc}"
-
-            df_validated["geometry_type_transition"] = df_validated.apply(
-                lambda row: classify_transition(
-                    row["geometry_type_original"], row["geometry_type"]
-                ),
-                axis=1,
-            )
-
             # Store processing metadata
             df_validated.attrs["processing_metadata"] = {
-                "whisp_version": "
+                "whisp_version": "3.0.0a1",
                 "processing_date": datetime.now().isoformat(),
                 "processing_mode": "sequential",
                 "ee_endpoint": "standard",
                 "datasets_used": national_codes or [],
-                "
+                "geometry_audit_trail": True,
             }

-            logger.info(
-                f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
-            )
+            logger.info(f"Audit trail added: geo_original column")

         except Exception as e:
             logger.warning(f"Error adding audit trail: {e}")
@@ -2260,12 +2450,11 @@ def whisp_formatted_stats_geojson_to_df_sequential(
 def whisp_formatted_stats_geojson_to_df_fast(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
     custom_bands: Dict[str, Any] = None,
-    mode: str = "
+    mode: str = "sequential",
     # Concurrent-specific parameters
     batch_size: int = 10,
     max_concurrent: int = 20,
@@ -2278,15 +2467,16 @@ def whisp_formatted_stats_geojson_to_df_fast(
     convert_water_flag: bool = True,
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
-
+    geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON to Whisp statistics with optimized fast processing.

-
-
+    Routes to concurrent (high-volume endpoint) or sequential (standard endpoint)
+    based on explicit mode selection.

-    This is the recommended entry point for most users
+    This is the recommended entry point for most users.

     Parameters
     ----------
@@ -2294,8 +2484,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -2306,12 +2494,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
         Custom band information
     mode : str
         Processing mode:
-        - "
-
-          * 1-5MB: sequential
-          * >5MB: concurrent
-        - "concurrent": Force high-volume endpoint (batch processing)
-        - "sequential": Force standard endpoint (single-threaded)
+        - "concurrent": Uses high-volume endpoint with batch processing
+        - "sequential": Uses standard endpoint for sequential processing
     batch_size : int
         Features per batch (only for concurrent mode)
     max_concurrent : int
@@ -2332,6 +2516,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
         Water flag ratio threshold
     sort_column : str
         Column to sort by
+    geometry_audit_trail : bool
+        Include geometry modification audit trail columns

     Returns
     -------
@@ -2340,16 +2526,13 @@ def whisp_formatted_stats_geojson_to_df_fast(

     Examples
     --------
-    >>> #
-    >>> df = whisp_formatted_stats_geojson_to_df_fast("data.geojson")
-
-    >>> # Force concurrent processing for large datasets
+    >>> # Use concurrent processing (recommended for most datasets)
     >>> df = whisp_formatted_stats_geojson_to_df_fast(
-    ...     "
+    ...     "data.geojson",
     ...     mode="concurrent"
     ... )

-    >>> # Use sequential for
+    >>> # Use sequential processing for more stable results
     >>> df = whisp_formatted_stats_geojson_to_df_fast(
     ...     "data.geojson",
     ...     mode="sequential"
@@ -2357,40 +2540,20 @@ def whisp_formatted_stats_geojson_to_df_fast(
     """
     logger = logging.getLogger("whisp")

-    #
-    if mode
-        try:
-            file_size = Path(input_geojson_filepath).stat().st_size
-            if file_size > 5_000_000:  # >5MB
-                chosen_mode = "concurrent"
-                logger.info(
-                    f"File size {file_size/1e6:.1f}MB → Using concurrent (high-volume endpoint)"
-                )
-            else:  # <=5MB
-                chosen_mode = "sequential"
-                logger.info(
-                    f"File size {file_size/1e6:.1f}MB → Using sequential (standard endpoint)"
-                )
-        except Exception as e:
-            logger.warning(
-                f"Could not determine file size: {e}. Defaulting to sequential."
-            )
-            chosen_mode = "sequential"
-    elif mode in ("concurrent", "sequential"):
-        chosen_mode = mode
-        logger.info(f"Mode explicitly set to: {mode}")
-    else:
+    # Validate mode parameter
+    if mode not in ("concurrent", "sequential"):
         raise ValueError(
-            f"Invalid mode '{mode}'. Must be '
+            f"Invalid mode '{mode}'. Must be 'concurrent' or 'sequential'."
         )

+    logger.info(f"Mode: {mode}")
+
     # Route to appropriate function
-    if
+    if mode == "concurrent":
         logger.debug("Routing to concurrent processing...")
         return whisp_formatted_stats_geojson_to_df_concurrent(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -2406,14 +2569,14 @@ def whisp_formatted_stats_geojson_to_df_fast(
             convert_water_flag=convert_water_flag,
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
-
+            geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )
     else:  # sequential
         logger.debug("Routing to sequential processing...")
         return whisp_formatted_stats_geojson_to_df_sequential(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -2424,5 +2587,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
             convert_water_flag=convert_water_flag,
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
-
+            geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )