openforis-whisp 3.0.0a6__py3-none-any.whl → 3.0.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +3 -1
- openforis_whisp/advanced_stats.py +213 -338
- openforis_whisp/data_checks.py +191 -144
- openforis_whisp/datasets.py +4 -5
- openforis_whisp/reformat.py +8 -6
- openforis_whisp/risk.py +113 -29
- openforis_whisp/stats.py +0 -9
- {openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a8.dist-info}/METADATA +37 -120
- {openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a8.dist-info}/RECORD +11 -11
- {openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a8.dist-info}/LICENSE +0 -0
- {openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a8.dist-info}/WHEEL +0 -0
openforis_whisp/advanced_stats.py
@@ -33,6 +33,7 @@ import subprocess
 from contextlib import redirect_stdout, contextmanager
 from pathlib import Path
 from typing import Optional, List, Dict, Any, Tuple, Union
+from importlib.metadata import version as get_version
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import tempfile
 
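A quick note on the new import: importlib.metadata reads the version recorded in the installed distribution, which later hunks use to replace a hardcoded version string. A minimal sketch of that pattern (the fallback branch is an assumption, not taken from the package):

from importlib.metadata import PackageNotFoundError, version as get_version

try:
    # Matches the released wheel, e.g. "3.0.0a8", without hardcoding it.
    whisp_version = get_version("openforis-whisp")
except PackageNotFoundError:
    # Assumed fallback for running from an uninstalled source checkout.
    whisp_version = "unknown"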
@@ -510,253 +511,102 @@ def join_admin_codes(
     return df
 
 
-
-    """
-
+def _format_time(seconds: float) -> str:
+    """Format seconds as human-readable string."""
+    if seconds < 60:
+        return f"{seconds:.0f}s"
+    elif seconds < 3600:
+        mins = seconds / 60
+        return f"{mins:.1f}m"
+    else:
+        hours = seconds / 3600
+        return f"{hours:.1f}h"
 
-    Shows progress at adaptive milestones (more frequent for small datasets,
-    less frequent for large datasets) with estimated time remaining based on
-    processing speed. Includes time-based heartbeat to prevent long silences.
-    """
 
-
-
-
-        logger: logging.Logger = None,
-        heartbeat_interval: int = 180,
-        status_file: str = None,
-    ):
-        """
-        Initialize progress tracker.
-
-        Parameters
-        ----------
-        total : int
-            Total number of items to process
-        logger : logging.Logger, optional
-            Logger for output
-        heartbeat_interval : int, optional
-            Seconds between heartbeat messages (default: 180 = 3 minutes)
-        status_file : str, optional
-            Path to JSON status file for API/web app consumption.
-            Checkpoints auto-save to same directory as status_file.
-        """
-        self.total = total
-        self.completed = 0
-        self.lock = threading.Lock()
-        self.logger = logger or logging.getLogger("whisp")
-        self.heartbeat_interval = heartbeat_interval
-
-        # Handle status_file: if directory passed, auto-generate filename
-        if status_file:
-            import os
-
-            if os.path.isdir(status_file):
-                self.status_file = os.path.join(
-                    status_file, "whisp_processing_status.json"
-                )
-            else:
-                # Validate that parent directory exists
-                parent_dir = os.path.dirname(status_file)
-                if parent_dir and not os.path.isdir(parent_dir):
-                    self.logger.warning(
-                        f"Status file directory does not exist: {parent_dir}"
-                    )
-                    self.status_file = None
-                else:
-                    self.status_file = status_file
-        else:
-            self.status_file = None
-
-        # Adaptive milestones based on dataset size
-        # Small datasets (< 50): show every 25% (not too spammy)
-        # Medium (50-500): show every 20%
-        # Large (500-1000): show every 10%
-        # Very large (1000+): show every 5% (cleaner for long jobs)
-        if total < 50:
-            self.milestones = {25, 50, 75, 100}
-        elif total < 500:
-            self.milestones = {20, 40, 60, 80, 100}
-        elif total < 1000:
-            self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
-        else:
-            self.milestones = {
-                5,
-                10,
-                15,
-                20,
-                25,
-                30,
-                35,
-                40,
-                45,
-                50,
-                55,
-                60,
-                65,
-                70,
-                75,
-                80,
-                85,
-                90,
-                95,
-                100,
-            }
-
-        self.shown_milestones = set()
-        self.start_time = time.time()
-        self.last_update_time = self.start_time
-        self.heartbeat_stop = threading.Event()
-        self.heartbeat_thread = None
+def _get_progress_milestones(total_features: int) -> set:
+    """
+    Get progress milestones based on dataset size.
 
-
-
-
-
+    Parameters
+    ----------
+    total_features : int
+        Total number of features being processed
 
-
-
-
-
-
-
-
-
-
-
-
-
+    Returns
+    -------
+    set
+        Set of percentage milestones to show
+    """
+    # Set milestones based on feature count
+    if total_features < 250:
+        return set(range(20, 101, 20))  # Every 20%: {20, 40, 60, 80, 100}
+    elif total_features < 1000:
+        return set(range(10, 101, 10))  # Every 10%
+    elif total_features < 10000:
+        return set(range(5, 101, 5))  # Every 5%
+    elif total_features < 50000:
+        return set(range(2, 101, 2))  # Every 2%
+    else:
+        return set(range(1, 101))  # Every 1%
 
-            # Write to temp file then atomic rename to prevent partial reads
-            from datetime import datetime
-
-            temp_file = self.status_file + ".tmp"
-            with open(temp_file, "w") as f:
-                json.dump(
-                    {
-                        "status": status,
-                        "progress": f"{self.completed}/{self.total}",
-                        "percent": round(percent, 1),
-                        "elapsed_sec": round(elapsed),
-                        "eta_sec": round(eta) if eta else None,
-                        "updated_at": datetime.now().isoformat(),
-                    },
-                    f,
-                )
-            os.replace(temp_file, self.status_file)
-        except Exception:
-            pass
-
-    def start_heartbeat(self) -> None:
-        """Start background heartbeat thread for time-based progress updates."""
-        if self.heartbeat_thread is None or not self.heartbeat_thread.is_alive():
-            self.heartbeat_stop.clear()
-            self.heartbeat_thread = threading.Thread(
-                target=self._heartbeat_loop, daemon=True
-            )
-            self.heartbeat_thread.start()
-            # Write initial status
-            self._write_status_file(status="processing")
-
-    def _heartbeat_loop(self) -> None:
-        """Background loop that logs progress at time intervals."""
-        while not self.heartbeat_stop.wait(self.heartbeat_interval):
-            with self.lock:
-                # Only log if we haven't shown a milestone recently
-                time_since_update = time.time() - self.last_update_time
-                if (
-                    time_since_update >= self.heartbeat_interval
-                    and self.completed < self.total
-                ):
-                    elapsed = time.time() - self.start_time
-                    percent = int((self.completed / self.total) * 100)
-                    elapsed_str = self._format_time(elapsed)
-                    self.logger.info(
-                        f"[Processing] {self.completed:,}/{self.total:,} batches ({percent}%) | "
-                        f"Elapsed: {elapsed_str}"
-                    )
-                    self.last_update_time = time.time()
-
-    def update(self, n: int = 1) -> None:
-        """
-        Update progress count.
-
-        Parameters
-        ----------
-        n : int
-            Number of items completed
-        """
-        with self.lock:
-            self.completed += n
-            percent = int((self.completed / self.total) * 100)
-
-            # Show milestone messages (5%, 10%, 15%... for large datasets)
-            for milestone in sorted(self.milestones):
-                if percent >= milestone and milestone not in self.shown_milestones:
-                    self.shown_milestones.add(milestone)
-
-                    # Calculate time metrics
-                    elapsed = time.time() - self.start_time
-                    rate = self.completed / elapsed if elapsed > 0 else 0
-                    remaining_items = self.total - self.completed
-
-                    # Calculate ETA with padding for overhead (loading, joins, etc.)
-                    # Don't show ETA until we have some samples (at least 5% complete)
-                    if rate > 0 and self.completed >= max(5, self.total * 0.05):
-                        eta_seconds = (
-                            remaining_items / rate
-                        ) * 1.15  # Add 15% padding for overhead
-                    else:
-                        eta_seconds = 0
 
-
-
-
-
-
-
+def _log_progress(
+    completed: int,
+    total: int,
+    milestones: set,
+    shown_milestones: set,
+    start_time: float,
+    logger: logging.Logger,
+) -> None:
+    """
+    Log progress at milestone percentages.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    Parameters
+    ----------
+    completed : int
+        Number of batches completed
+    total : int
+        Total number of batches
+    milestones : set
+        Set of percentage milestones to show
+    shown_milestones : set
+        Set of milestones already shown (modified in place)
+    start_time : float
+        Start time from time.time()
+    logger : logging.Logger
+        Logger for output
+    """
+    percent = int((completed / total) * 100)
+
+    # Check for new milestones reached
+    for milestone in sorted(milestones):
+        if percent >= milestone and milestone not in shown_milestones:
+            shown_milestones.add(milestone)
+
+            # Calculate time metrics
+            elapsed = time.time() - start_time
+            rate = completed / elapsed if elapsed > 0 else 0
+            remaining_items = total - completed
+
+            # Calculate ETA with padding for overhead (loading, joins, etc.)
+            # Don't show ETA until we have some samples (at least 5% complete)
+            if rate > 0 and completed >= max(5, total * 0.05):
+                eta_seconds = (remaining_items / rate) * 1.15  # Add 15% padding
+            else:
+                eta_seconds = 0
 
-
-
-
-        self.heartbeat_stop.set()
-        if self.heartbeat_thread and self.heartbeat_thread.is_alive():
-            self.heartbeat_thread.join(timeout=1)
+            # Format time strings
+            eta_str = _format_time(eta_seconds) if eta_seconds > 0 else "calculating..."
+            elapsed_str = _format_time(elapsed)
 
-
-
-
-
-
+            # Build progress message
+            msg = f"Progress: {completed:,}/{total:,} batches ({percent}%)"
+            if percent < 100:
+                msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
+            else:
+                msg += f" | Total time: {elapsed_str}"
 
-
-        self._write_status_file(status="completed")
+            logger.info(msg)
 
 
 # ============================================================================
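The hunk above replaces a stateful progress tracker (heartbeat thread plus status-file writes) with three stateless helpers: _format_time, _get_progress_milestones, and _log_progress. A rough sketch of how they compose in a plain loop, assuming they are imported from this module; the work-item list and logger name are illustrative:

import logging
import time

logger = logging.getLogger("whisp")
work_items = list(range(1200))  # stand-in for the real batches

milestones = _get_progress_milestones(len(work_items))
shown_milestones = set()
start_time = time.time()

for completed, item in enumerate(work_items, start=1):
    # ... process item ...
    _log_progress(completed, len(work_items), milestones, shown_milestones, start_time, logger)

logger.info(f"Done in {_format_time(time.time() - start_time)}")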
@@ -1075,10 +925,67 @@ def clean_geodataframe(
 
 
 # ============================================================================
-#
+# AUDIT TRAIL HELPER
 # ============================================================================
 
 
+def _add_geometry_audit_trail(
+    df_validated: pd.DataFrame,
+    input_geojson_filepath: str,
+    gdf_original_geoms: gpd.GeoDataFrame = None,
+    logger: logging.Logger = None,
+) -> pd.DataFrame:
+    """
+    Add original input geometries as geo_original column for audit trail.
+
+    Parameters
+    ----------
+    df_validated : pd.DataFrame
+        Validated DataFrame to add audit trail to
+    input_geojson_filepath : str
+        Path to original GeoJSON file
+    gdf_original_geoms : gpd.GeoDataFrame, optional
+        Pre-loaded original geometries (to avoid reloading)
+    logger : logging.Logger, optional
+        Logger for output
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with geo_original column added
+    """
+    import json
+    from shapely.geometry import mapping
+
+    logger = logger or logging.getLogger("whisp")
+
+    try:
+        # Load original geometries if not provided
+        if gdf_original_geoms is None:
+            logger.warning("Original geometries not pre-loaded, loading now...")
+            gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
+
+        # Create DataFrame with plotId and geo_original
+        df_original_geom = pd.DataFrame(
+            {
+                "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                "geo_original": gdf_original_geoms["geometry"].apply(
+                    lambda g: json.dumps(mapping(g)) if g is not None else None
+                ),
+            }
+        )
+
+        # Merge original geometries back
+        df_result = df_validated.merge(df_original_geom, on="plotId", how="left")
+        logger.info("Audit trail added: geo_original column")
+        return df_result
+
+    except Exception as e:
+        logger.warning(f"Error adding audit trail: {e}")
+        # Return original DataFrame if audit trail fails
+        return df_validated
+
+
 # ============================================================================
 # BATCH RETRY HELPER - DEPRECATED (removed due to semaphore deadlock issues)
 # ============================================================================
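The helper above serialises each original geometry to a GeoJSON string and merges it onto the results by plotId. A self-contained sketch of the same pattern outside the package (frame names and values are illustrative):

import json

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, mapping

gdf_original = gpd.GeoDataFrame(
    {"plotId": [1, 2]}, geometry=[Point(0, 0), Point(1, 1)], crs="EPSG:4326"
)
df_results = pd.DataFrame({"plotId": [1, 2], "Area": [0.5, 1.2]})

audit = pd.DataFrame(
    {
        "plotId": gdf_original["plotId"],
        # Geometry -> GeoJSON string, as in _add_geometry_audit_trail
        "geo_original": gdf_original.geometry.apply(
            lambda g: json.dumps(mapping(g)) if g is not None else None
        ),
    }
)
df_with_audit = df_results.merge(audit, on="plotId", how="left")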
@@ -1218,7 +1125,6 @@ def whisp_stats_geojson_to_df_concurrent(
     logger: logging.Logger = None,
     # Format parameters (auto-detect from config if not provided)
     decimal_places: int = None,
-    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently to compute Whisp statistics with automatic formatting.
@@ -1359,11 +1265,12 @@ def whisp_stats_geojson_to_df_concurrent(
     # Setup semaphore for EE concurrency control
     ee_semaphore = threading.BoundedSemaphore(max_concurrent)
 
-    # Progress
-
-
-    )
-
+    # Progress tracking setup
+    progress_lock = threading.Lock()
+    completed_batches = 0
+    milestones = _get_progress_milestones(len(gdf_for_ee))
+    shown_milestones = set()
+    start_time = time.time()
 
     results = []
 
@@ -1477,7 +1384,18 @@ def whisp_stats_geojson_to_df_concurrent(
                         suffixes=("_ee", "_client"),
                     )
                     results.append(merged)
-
+
+                    # Update progress
+                    with progress_lock:
+                        completed_batches += 1
+                        _log_progress(
+                            completed_batches,
+                            len(batches),
+                            milestones,
+                            shown_milestones,
+                            start_time,
+                            logger,
+                        )
 
                 except Exception as e:
                     # Batch failed - fail fast with clear guidance
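Because batches complete on worker threads, the counter update and milestone logging happen under a lock. A stripped-down sketch of that pattern with a thread pool, reusing the helpers introduced earlier; the worker function and batch list are placeholders:

import logging
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

logger = logging.getLogger("whisp")
batches = list(range(40))  # placeholder work items

progress_lock = threading.Lock()
completed_batches = 0
milestones = _get_progress_milestones(len(batches))
shown_milestones = set()
start_time = time.time()

def process(batch):
    time.sleep(0.01)  # stand-in for the real per-batch Earth Engine call
    return batch

with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [executor.submit(process, b) for b in batches]
    for future in as_completed(futures):
        future.result()
        with progress_lock:  # serialise shared-counter updates across threads
            completed_batches += 1
            _log_progress(
                completed_batches, len(batches), milestones,
                shown_milestones, start_time, logger,
            )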
@@ -1492,15 +1410,18 @@ def whisp_stats_geojson_to_df_concurrent(
                     batch_errors.append((batch_idx, original_batch, error_msg))
     except (KeyboardInterrupt, SystemExit) as interrupt:
         logger.warning("Processing interrupted by user")
-        # Update status file with interrupted state
-        progress._write_status_file(status="interrupted")
         raise interrupt
     finally:
         # Restore logger levels
         fiona_logger.setLevel(old_fiona_level)
         pyogrio_logger.setLevel(old_pyogrio_level)
 
-
+        # Log completion
+        total_time = time.time() - start_time
+        time_str = _format_time(total_time)
+        logger.info(
+            f"Processing complete: {completed_batches:,}/{len(batches):,} batches in {time_str}"
+        )
 
     # If we have batch errors after retry attempts, fail the entire process
     if batch_errors:
@@ -1577,7 +1498,9 @@ def whisp_stats_geojson_to_df_concurrent(
 
         # Retry batch processing with validated image
         results = []
-
+        retry_completed = 0
+        retry_shown = set()
+        retry_start = time.time()
 
         # Suppress fiona logging during batch processing (threads create new loggers)
         fiona_logger = logging.getLogger("fiona")
@@ -1609,13 +1532,28 @@ def whisp_stats_geojson_to_df_concurrent(
                             suffixes=("", "_client"),
                         )
                         results.append(merged)
-
+
+                        # Update retry progress
+                        with progress_lock:
+                            retry_completed += 1
+                            _log_progress(
+                                retry_completed,
+                                len(batches),
+                                milestones,
+                                retry_shown,
+                                retry_start,
+                                logger,
+                            )
                     except Exception as e:
                         logger.error(
                             f"Batch processing error (retry): {str(e)[:100]}"
                         )
 
-
+            # Log retry completion
+            retry_time = time.time() - retry_start
+            logger.info(
+                f"Retry complete: {retry_completed:,}/{len(batches):,} batches in {_format_time(retry_time)}"
+            )
         finally:
             # Restore logger levels
             fiona_logger.setLevel(old_fiona_level)
@@ -1847,8 +1785,7 @@ def whisp_stats_geojson_to_df_concurrent(
         logger.warning(f"{plot_id_column} column missing, regenerating...")
         formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
 
-    #
-    formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+    # Note: Sorting is handled by format_stats_dataframe in the formatted wrapper functions
 
     logger.info(f"Processing complete: {len(formatted):,} features")
     return formatted
@@ -2101,10 +2038,11 @@ def whisp_stats_geojson_to_df_sequential(
         convert_water_flag=True,
     )
 
-    # Ensure plot_id exists
+    # Ensure plot_id exists
     if plot_id_column not in formatted.columns:
         formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
-
+
+    # Note: Sorting is handled by format_stats_dataframe in the formatted wrapper functions
 
     logger.info(f"Processing complete: {len(formatted):,} features")
 
@@ -2138,7 +2076,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
-    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently with automatic formatting and validation.
@@ -2231,7 +2168,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         max_retries=max_retries,
         add_metadata_server=add_metadata_server,
         logger=logger,
-        status_file=status_file,
     )
 
     # Step 2: Format the output
@@ -2276,50 +2212,21 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         custom_bands=custom_bands,
     )
 
-    # Step 2c: Add audit trail
+    # Step 2c: Add audit trail column (AFTER validation to preserve columns)
     if geometry_audit_trail:
-        logger.debug("Adding audit trail
-
-
-
-
-
-
-            # Use plotId from df_validated to maintain mapping
-            df_original_geom = pd.DataFrame(
-                {
-                    "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
-                    "geo_original": gdf_original_geoms["geometry"].apply(
-                        lambda g: json.dumps(mapping(g)) if g is not None else None
-                    ),
-                }
-            )
-
-            # Merge original geometries back
-            df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
-
-            # Store processing metadata
-            df_validated.attrs["processing_metadata"] = {
-                "whisp_version": "3.0.0a1",
-                "processing_date": datetime.now().isoformat(),
-                "processing_mode": "concurrent",
-                "ee_endpoint": "high_volume",
-                "validate_geometries": validate_geometries,
-                "datasets_used": national_codes or [],
-                "geometry_audit_trail": True,
-            }
-
-            logger.info(f"Audit trail added: geo_original column")
-
-        except Exception as e:
-            logger.warning(f"Error adding audit trail: {e}")
-            # Continue without audit trail if something fails
+        logger.debug("Adding geo_original column for audit trail...")
+        df_validated = _add_geometry_audit_trail(
+            df_validated=df_validated,
+            input_geojson_filepath=input_geojson_filepath,
+            gdf_original_geoms=gdf_original_geoms,
+            logger=logger,
+        )
 
     # Add processing metadata column using pd.concat to avoid fragmentation warning
     metadata_dict = {
-        "whisp_version": "
+        "whisp_version": get_version("openforis-whisp"),
         "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
-            "%Y-%m-%d %H:%M:%S
+            "%Y-%m-%d %H:%M:%S%z"
         ),
     }
     metadata_series = pd.Series(
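The metadata hunk swaps the hardcoded version string for the installed package version and adds an explicit UTC offset to the timestamp. A small sketch of the resulting values (format string copied from the diff; the dict itself is illustrative):

from datetime import datetime, timezone
from importlib.metadata import version as get_version

metadata = {
    "whisp_version": get_version("openforis-whisp"),
    # %z appends the offset, e.g. "2025-06-01 12:34:56+0000"
    "processing_timestamp_utc": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S%z"),
}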
@@ -2347,7 +2254,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
-    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON sequentially with automatic formatting and validation.
@@ -2472,49 +2378,21 @@ def whisp_formatted_stats_geojson_to_df_sequential(
         custom_bands=custom_bands,
     )
 
-    # Step 2c: Add audit trail
+    # Step 2c: Add audit trail column (AFTER validation to preserve columns)
     if geometry_audit_trail:
-        logger.debug("Adding audit trail
-
-
-
-
-
-
-            # Use plotId from df_validated to maintain mapping
-            df_original_geom = pd.DataFrame(
-                {
-                    "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
-                    "geo_original": gdf_original_geoms["geometry"].apply(
-                        lambda g: json.dumps(mapping(g)) if g is not None else None
-                    ),
-                }
-            )
-
-            # Merge original geometries back
-            df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
-
-            # Store processing metadata
-            df_validated.attrs["processing_metadata"] = {
-                "whisp_version": "3.0.0a1",
-                "processing_date": datetime.now().isoformat(),
-                "processing_mode": "sequential",
-                "ee_endpoint": "standard",
-                "datasets_used": national_codes or [],
-                "geometry_audit_trail": True,
-            }
-
-            logger.info(f"Audit trail added: geo_original column")
-
-        except Exception as e:
-            logger.warning(f"Error adding audit trail: {e}")
-            # Continue without audit trail if something fails
+        logger.debug("Adding geo_original column for audit trail...")
+        df_validated = _add_geometry_audit_trail(
+            df_validated=df_validated,
+            input_geojson_filepath=input_geojson_filepath,
+            gdf_original_geoms=gdf_original_geoms,
+            logger=logger,
+        )
 
     # Add processing metadata column using pd.concat to avoid fragmentation warning
     metadata_dict = {
-        "whisp_version": "
+        "whisp_version": get_version("openforis-whisp"),
         "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
-            "%Y-%m-%d %H:%M:%S
+            "%Y-%m-%d %H:%M:%S%z"
        ),
     }
     metadata_series = pd.Series(
@@ -2552,7 +2430,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
-    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON to Whisp statistics with optimized fast processing.
@@ -2654,7 +2531,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
             geometry_audit_trail=geometry_audit_trail,
-            status_file=status_file,
         )
     else:  # sequential
         logger.debug("Routing to sequential processing...")
@@ -2672,5 +2548,4 @@ def whisp_formatted_stats_geojson_to_df_fast(
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
             geometry_audit_trail=geometry_audit_trail,
-            status_file=status_file,
         )