openforis-whisp 3.0.0a3__py3-none-any.whl → 3.0.0a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +1 -1
- openforis_whisp/advanced_stats.py +318 -103
- openforis_whisp/data_checks.py +80 -28
- openforis_whisp/datasets.py +14 -0
- openforis_whisp/logger.py +15 -3
- openforis_whisp/parameters/lookup_gee_datasets.csv +3 -2
- openforis_whisp/pd_schemas.py +7 -2
- openforis_whisp/reformat.py +6 -1
- openforis_whisp/stats.py +10 -11
- openforis_whisp/utils.py +19 -0
- {openforis_whisp-3.0.0a3.dist-info → openforis_whisp-3.0.0a4.dist-info}/METADATA +1 -1
- openforis_whisp-3.0.0a4.dist-info/RECORD +20 -0
- openforis_whisp-3.0.0a3.dist-info/RECORD +0 -20
- {openforis_whisp-3.0.0a3.dist-info → openforis_whisp-3.0.0a4.dist-info}/LICENSE +0 -0
- {openforis_whisp-3.0.0a3.dist-info → openforis_whisp-3.0.0a4.dist-info}/WHEEL +0 -0
openforis_whisp/advanced_stats.py
CHANGED
@@ -36,6 +36,24 @@ from typing import Optional, List, Dict, Any, Tuple, Union
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import tempfile
 
+# Configure the "whisp" logger with auto-flush handler for Colab visibility
+_whisp_logger = logging.getLogger("whisp")
+if not _whisp_logger.handlers:
+    _handler = logging.StreamHandler(sys.stdout)
+    _handler.setLevel(logging.DEBUG)
+    _handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
+    # Override emit to force flush after each message for Colab
+    _original_emit = _handler.emit
+
+    def _emit_with_flush(record):
+        _original_emit(record)
+        sys.stdout.flush()
+
+    _handler.emit = _emit_with_flush
+    _whisp_logger.addHandler(_handler)
+    _whisp_logger.setLevel(logging.INFO)
+    _whisp_logger.propagate = False  # Don't propagate to root to avoid duplicates
+
 # ============================================================================
 # STDOUT/STDERR SUPPRESSION CONTEXT MANAGER (for C-level output)
 # ============================================================================
@@ -445,6 +463,16 @@ def join_admin_codes(
         columns=["admin_code_for_join", "gaul1_code"], errors="ignore"
     )
 
+    # Fill NaN values with "Unknown" and "not found" for features outside admin boundaries
+    # (e.g., points in the ocean or international waters)
+    df_joined[iso3_country_column] = df_joined[iso3_country_column].fillna(
+        "Unknown"
+    )
+    df_joined[iso2_country_column] = df_joined[iso2_country_column].fillna(
+        "not found"
+    )
+    df_joined[admin_1_column] = df_joined[admin_1_column].fillna("Unknown")
+
     logger.debug(
         f"Admin codes joined: {iso3_country_column}, {iso2_country_column}, {admin_1_column}"
     )
@@ -461,10 +489,16 @@ class ProgressTracker:
 
     Shows progress at adaptive milestones (more frequent for small datasets,
     less frequent for large datasets) with estimated time remaining based on
-    processing speed.
+    processing speed. Includes time-based heartbeat to prevent long silences.
     """
 
-    def __init__(
+    def __init__(
+        self,
+        total: int,
+        logger: logging.Logger = None,
+        heartbeat_interval: int = 180,
+        status_file: str = None,
+    ):
         """
         Initialize progress tracker.
 
@@ -474,26 +508,147 @@ class ProgressTracker:
             Total number of items to process
         logger : logging.Logger, optional
             Logger for output
+        heartbeat_interval : int, optional
+            Seconds between heartbeat messages (default: 180 = 3 minutes)
+        status_file : str, optional
+            Path to JSON status file for API/web app consumption.
+            Checkpoints auto-save to same directory as status_file.
         """
         self.total = total
         self.completed = 0
         self.lock = threading.Lock()
         self.logger = logger or logging.getLogger("whisp")
+        self.heartbeat_interval = heartbeat_interval
+
+        # Handle status_file: if directory passed, auto-generate filename
+        if status_file:
+            import os
+
+            if os.path.isdir(status_file):
+                self.status_file = os.path.join(
+                    status_file, "whisp_processing_status.json"
+                )
+            else:
+                # Validate that parent directory exists
+                parent_dir = os.path.dirname(status_file)
+                if parent_dir and not os.path.isdir(parent_dir):
+                    self.logger.warning(
+                        f"Status file directory does not exist: {parent_dir}"
+                    )
+                    self.status_file = None
+                else:
+                    self.status_file = status_file
+        else:
+            self.status_file = None
 
         # Adaptive milestones based on dataset size
         # Small datasets (< 50): show every 25% (not too spammy)
         # Medium (50-500): show every 20%
-        # Large (500
+        # Large (500-1000): show every 10%
+        # Very large (1000+): show every 5% (cleaner for long jobs)
         if total < 50:
             self.milestones = {25, 50, 75, 100}
         elif total < 500:
             self.milestones = {20, 40, 60, 80, 100}
-
+        elif total < 1000:
             self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
+        else:
+            self.milestones = {
+                5,
+                10,
+                15,
+                20,
+                25,
+                30,
+                35,
+                40,
+                45,
+                50,
+                55,
+                60,
+                65,
+                70,
+                75,
+                80,
+                85,
+                90,
+                95,
+                100,
+            }
 
         self.shown_milestones = set()
         self.start_time = time.time()
         self.last_update_time = self.start_time
+        self.heartbeat_stop = threading.Event()
+        self.heartbeat_thread = None
+
+    def _write_status_file(self, status: str = "processing") -> None:
+        """Write current progress to JSON status file using atomic write."""
+        if not self.status_file:
+            return
+
+        try:
+            import json
+            import os
+
+            elapsed = time.time() - self.start_time
+            percent = (self.completed / self.total * 100) if self.total > 0 else 0
+            rate = self.completed / elapsed if elapsed > 0 else 0
+            eta = (
+                (self.total - self.completed) / rate * 1.15
+                if rate > 0 and percent >= 5
+                else None
+            )
+
+            # Write to temp file then atomic rename to prevent partial reads
+            from datetime import datetime
+
+            temp_file = self.status_file + ".tmp"
+            with open(temp_file, "w") as f:
+                json.dump(
+                    {
+                        "status": status,
+                        "progress": f"{self.completed}/{self.total}",
+                        "percent": round(percent, 1),
+                        "elapsed_sec": round(elapsed),
+                        "eta_sec": round(eta) if eta else None,
+                        "updated_at": datetime.now().isoformat(),
+                    },
+                    f,
+                )
+            os.replace(temp_file, self.status_file)
+        except Exception:
+            pass
+
+    def start_heartbeat(self) -> None:
+        """Start background heartbeat thread for time-based progress updates."""
+        if self.heartbeat_thread is None or not self.heartbeat_thread.is_alive():
+            self.heartbeat_stop.clear()
+            self.heartbeat_thread = threading.Thread(
+                target=self._heartbeat_loop, daemon=True
+            )
+            self.heartbeat_thread.start()
+            # Write initial status
+            self._write_status_file(status="processing")
+
+    def _heartbeat_loop(self) -> None:
+        """Background loop that logs progress at time intervals."""
+        while not self.heartbeat_stop.wait(self.heartbeat_interval):
+            with self.lock:
+                # Only log if we haven't shown a milestone recently
+                time_since_update = time.time() - self.last_update_time
+                if (
+                    time_since_update >= self.heartbeat_interval
+                    and self.completed < self.total
+                ):
+                    elapsed = time.time() - self.start_time
+                    percent = int((self.completed / self.total) * 100)
+                    elapsed_str = self._format_time(elapsed)
+                    self.logger.info(
+                        f"[Processing] {self.completed:,}/{self.total:,} batches ({percent}%) | "
+                        f"Elapsed: {elapsed_str}"
+                    )
+                    self.last_update_time = time.time()
 
     def update(self, n: int = 1) -> None:
         """
@@ -508,7 +663,7 @@ class ProgressTracker:
             self.completed += n
             percent = int((self.completed / self.total) * 100)
 
-            # Show milestone messages (
+            # Show milestone messages (5%, 10%, 15%... for large datasets)
             for milestone in sorted(self.milestones):
                 if percent >= milestone and milestone not in self.shown_milestones:
                     self.shown_milestones.add(milestone)
@@ -517,20 +672,36 @@ class ProgressTracker:
                     elapsed = time.time() - self.start_time
                     rate = self.completed / elapsed if elapsed > 0 else 0
                     remaining_items = self.total - self.completed
-
+
+                    # Calculate ETA with padding for overhead (loading, joins, etc.)
+                    # Don't show ETA until we have some samples (at least 5% complete)
+                    if rate > 0 and self.completed >= max(5, self.total * 0.05):
+                        eta_seconds = (
+                            remaining_items / rate
+                        ) * 1.15  # Add 15% padding for overhead
+                    else:
+                        eta_seconds = 0
 
                     # Format time strings
-                    eta_str =
+                    eta_str = (
+                        self._format_time(eta_seconds)
+                        if eta_seconds > 0
+                        else "calculating..."
+                    )
                     elapsed_str = self._format_time(elapsed)
 
                     # Build progress message
-                    msg = f"Progress: {self.completed}/{self.total} ({percent}%)"
+                    msg = f"Progress: {self.completed:,}/{self.total:,} batches ({percent}%)"
                     if percent < 100:
                         msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
                     else:
                         msg += f" | Total time: {elapsed_str}"
 
                     self.logger.info(msg)
+                    self.last_update_time = time.time()
+
+            # Update status file for API consumption
+            self._write_status_file()
 
     @staticmethod
     def _format_time(seconds: float) -> str:
@@ -544,14 +715,21 @@ class ProgressTracker:
             hours = seconds / 3600
             return f"{hours:.1f}h"
 
-    def finish(self) -> None:
-        """
+    def finish(self, output_file: str = None) -> None:
+        """Stop heartbeat and log completion."""
+        # Stop heartbeat thread
+        self.heartbeat_stop.set()
+        if self.heartbeat_thread and self.heartbeat_thread.is_alive():
+            self.heartbeat_thread.join(timeout=1)
+
         with self.lock:
             total_time = time.time() - self.start_time
             time_str = self._format_time(total_time)
-            self.
-
-
+            msg = f"Processing complete: {self.completed:,}/{self.total:,} batches in {time_str}"
+            self.logger.info(msg)
+
+            # Write final status
+            self._write_status_file(status="completed")
 
 
 # ============================================================================
@@ -983,7 +1161,6 @@ def process_ee_batch(
 def whisp_stats_geojson_to_df_concurrent(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -996,6 +1173,7 @@ def whisp_stats_geojson_to_df_concurrent(
     logger: logging.Logger = None,
     # Format parameters (auto-detect from config if not provided)
     decimal_places: int = None,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently to compute Whisp statistics with automatic formatting.
@@ -1010,8 +1188,6 @@ def whisp_stats_geojson_to_df_concurrent(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1059,6 +1235,25 @@ def whisp_stats_geojson_to_df_concurrent(
     gdf = _load_geojson_silently(input_geojson_filepath)
     logger.info(f"Loaded {len(gdf):,} features")
 
+    # Validate external_id_column if provided (lightweight client-side check)
+    if external_id_column and external_id_column not in gdf.columns:
+        # Exclude geometry column from available columns list
+        available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
+        raise ValueError(
+            f"Column '{external_id_column}' not found in GeoJSON properties. "
+            f"Available columns: {available_cols}"
+        )
+
+    # Check completeness of external_id_column (warn if nulls exist)
+    if external_id_column and external_id_column in gdf.columns:
+        null_count = gdf[external_id_column].isna().sum()
+        if null_count > 0:
+            null_pct = (null_count / len(gdf)) * 100
+            logger.warning(
+                f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"These features may have missing external IDs in output."
+            )
+
     if validate_geometries:
         gdf = clean_geodataframe(
             gdf, remove_nulls=False, repair_geometries=False, logger=logger
@@ -1101,13 +1296,18 @@ def whisp_stats_geojson_to_df_concurrent(
 
     # Batch the data
     batches = batch_geodataframe(gdf_for_ee, batch_size)
-    logger.info(
+    logger.info(
+        f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches (concurrent mode)..."
+    )
 
     # Setup semaphore for EE concurrency control
     ee_semaphore = threading.BoundedSemaphore(max_concurrent)
 
-    # Progress tracker
-    progress = ProgressTracker(
+    # Progress tracker with heartbeat for long-running jobs
+    progress = ProgressTracker(
+        len(batches), logger=logger, heartbeat_interval=180, status_file=status_file
+    )
+    progress.start_heartbeat()
 
     results = []
 
@@ -1148,73 +1348,77 @@ def whisp_stats_geojson_to_df_concurrent(
     pyogrio_logger.setLevel(logging.CRITICAL)
 
     try:
-
-
-
-
-
-
-
-        # Track which batches failed for retry
-        batch_map = {i: batch for i, batch in enumerate(batches)}
-        batch_futures = {future: i for future, i in futures.items()}
-
-        for future in as_completed(futures):
-            batch_idx = batch_futures[future]
-            try:
-                batch_idx, df_server, df_client = future.result()
-
-                # Merge server and client results
-                if plot_id_column not in df_server.columns:
-                    df_server[plot_id_column] = range(len(df_server))
-
-                # Keep all EE statistics from server (all columns with _sum and _median suffixes)
-                # These are the actual EE processing results
-                df_server_clean = df_server.copy()
-
-                # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
-                # (formatted wrapper handles keep_external_columns parameter)
-                keep_external_columns = [plot_id_column]
-                if (
-                    external_id_column
-                    and external_id_column in df_client.columns
-                ):
-                    keep_external_columns.append(external_id_column)
-                if "geometry" in df_client.columns:
-                    keep_external_columns.append("geometry")
-                # Keep geometry type column (Geometry_type)
-                if geometry_type_column in df_client.columns:
-                    keep_external_columns.append(geometry_type_column)
-                # Also keep centroid columns (Centroid_lon, Centroid_lat)
-                centroid_cols = [
-                    c for c in df_client.columns if c.startswith("Centroid_")
-                ]
-                keep_external_columns.extend(centroid_cols)
-
-                df_client_clean = df_client[
-                    [c for c in keep_external_columns if c in df_client.columns]
-                ].drop_duplicates()
-
-                merged = df_server_clean.merge(
-                    df_client_clean,
-                    on=plot_id_column,
-                    how="left",
-                    suffixes=("_ee", "_client"),
-                )
-                results.append(merged)
-                progress.update()
-
-            except Exception as e:
-                # Batch failed - fail fast with clear guidance
-                error_msg = str(e)
-                logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
-                logger.debug(f"Full error: {error_msg}")
+        # Don't suppress stdout here - we want progress messages to show in Colab
+        with ThreadPoolExecutor(max_workers=pool_workers) as executor:
+            futures = {
+                executor.submit(process_batch, i, batch): i
+                for i, batch in enumerate(batches)
+            }
 
-
-
+            # Track which batches failed for retry
+            batch_map = {i: batch for i, batch in enumerate(batches)}
+            batch_futures = {future: i for future, i in futures.items()}
 
-
-
+            for future in as_completed(futures):
+                batch_idx = batch_futures[future]
+                try:
+                    batch_idx, df_server, df_client = future.result()
+
+                    # Merge server and client results
+                    if plot_id_column not in df_server.columns:
+                        df_server[plot_id_column] = range(len(df_server))
+
+                    # Keep all EE statistics from server (all columns with _sum and _median suffixes)
+                    # These are the actual EE processing results
+                    df_server_clean = df_server.copy()
+
+                    # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
+                    # (formatted wrapper handles keep_external_columns parameter)
+                    keep_external_columns = [plot_id_column]
+                    if external_id_column and external_id_column in df_client.columns:
+                        keep_external_columns.append(external_id_column)
+                    if "geometry" in df_client.columns:
+                        keep_external_columns.append("geometry")
+                    # Keep geometry type column (Geometry_type)
+                    if geometry_type_column in df_client.columns:
+                        keep_external_columns.append(geometry_type_column)
+                    # Also keep centroid columns (Centroid_lon, Centroid_lat)
+                    centroid_cols = [
+                        c for c in df_client.columns if c.startswith("Centroid_")
+                    ]
+                    keep_external_columns.extend(centroid_cols)
+
+                    df_client_clean = df_client[
+                        [c for c in keep_external_columns if c in df_client.columns]
+                    ]
+                    # Don't drop duplicates - we need one row per feature (one per plot_id)
+                    # Each plot_id should have exactly one row with its metadata
+
+                    merged = df_server_clean.merge(
+                        df_client_clean,
+                        on=plot_id_column,
+                        how="left",
+                        suffixes=("_ee", "_client"),
+                    )
+                    results.append(merged)
+                    progress.update()
+
+                except Exception as e:
+                    # Batch failed - fail fast with clear guidance
+                    error_msg = str(e)
+                    logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
+                    logger.debug(f"Full error: {error_msg}")
+
+                    # Get original batch for error reporting
+                    original_batch = batch_map[batch_idx]
+
+                    # Add to batch errors for final reporting
+                    batch_errors.append((batch_idx, original_batch, error_msg))
+    except (KeyboardInterrupt, SystemExit) as interrupt:
+        logger.warning("Processing interrupted by user")
+        # Update status file with interrupted state
+        progress._write_status_file(status="interrupted")
+        raise interrupt
     finally:
         # Restore logger levels
         fiona_logger.setLevel(old_fiona_level)
@@ -1565,7 +1769,7 @@ def whisp_stats_geojson_to_df_concurrent(
         )
         raise retry_e
 
-        logger.info(f"
+        logger.info(f"Processing complete: {len(formatted):,} features")
         return formatted
     else:
         logger.error(" No results produced")
@@ -1580,7 +1784,6 @@ def whisp_stats_geojson_to_df_concurrent(
 def whisp_stats_geojson_to_df_sequential(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -1605,8 +1808,6 @@ def whisp_stats_geojson_to_df_sequential(
         Path to input GeoJSON
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1646,6 +1847,25 @@ def whisp_stats_geojson_to_df_sequential(
     gdf = _load_geojson_silently(input_geojson_filepath)
     logger.info(f"Loaded {len(gdf):,} features")
 
+    # Validate external_id_column if provided (lightweight client-side check)
+    if external_id_column and external_id_column not in gdf.columns:
+        # Exclude geometry column from available columns list
+        available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
+        raise ValueError(
+            f"Column '{external_id_column}' not found in GeoJSON properties. "
+            f"Available columns: {available_cols}"
+        )
+
+    # Check completeness of external_id_column (warn if nulls exist)
+    if external_id_column and external_id_column in gdf.columns:
+        null_count = gdf[external_id_column].isna().sum()
+        if null_count > 0:
+            null_pct = (null_count / len(gdf)) * 100
+            logger.warning(
+                f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"These features may have missing external IDs in output."
+            )
+
     # Clean geometries (preserve both null and invalid geometries by default)
     gdf = clean_geodataframe(
         gdf, remove_nulls=False, repair_geometries=False, logger=logger
@@ -1696,7 +1916,9 @@ def whisp_stats_geojson_to_df_sequential(
     reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
 
     # Process server-side with error handling for bad bands
-    logger.info(
+    logger.info(
+        f"Processing {len(gdf):,} features with Earth Engine (sequential mode)..."
+    )
     try:
         results_fc = whisp_image.reduceRegions(collection=fc, reducer=reducer, scale=10)
         df_server = convert_ee_to_df(results_fc)
@@ -1782,7 +2004,7 @@ def whisp_stats_geojson_to_df_sequential(
         convert_water_flag=True,
     )
 
-    logger.info(f"
+    logger.info(f"Processing complete: {len(formatted):,} features")
 
     # Consolidate external_id_column to standardized 'external_id'
     if external_id_column:
@@ -1815,7 +2037,6 @@ def whisp_stats_geojson_to_df_sequential(
 def whisp_formatted_stats_geojson_to_df_concurrent(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -1833,6 +2054,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently with automatic formatting and validation.
@@ -1848,8 +2070,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1917,7 +2137,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     df_raw = whisp_stats_geojson_to_df_concurrent(
         input_geojson_filepath=input_geojson_filepath,
         external_id_column=external_id_column,
-        remove_geom=remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,
@@ -1928,6 +2147,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         max_retries=max_retries,
         add_metadata_server=add_metadata_server,
         logger=logger,
+        status_file=status_file,
     )
 
     # Step 2: Format the output
@@ -2030,7 +2250,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
 def whisp_formatted_stats_geojson_to_df_sequential(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -2044,6 +2263,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON sequentially with automatic formatting and validation.
@@ -2059,8 +2279,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -2120,7 +2338,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     df_raw = whisp_stats_geojson_to_df_sequential(
         input_geojson_filepath=input_geojson_filepath,
         external_id_column=external_id_column,
-        remove_geom=remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,
@@ -2233,7 +2450,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
 def whisp_formatted_stats_geojson_to_df_fast(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -2252,6 +2468,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON to Whisp statistics with optimized fast processing.
@@ -2267,8 +2484,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -2339,7 +2554,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
         return whisp_formatted_stats_geojson_to_df_concurrent(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -2356,13 +2570,13 @@ def whisp_formatted_stats_geojson_to_df_fast(
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
             geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )
     else:  # sequential
         logger.debug("Routing to sequential processing...")
         return whisp_formatted_stats_geojson_to_df_sequential(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -2374,4 +2588,5 @@ def whisp_formatted_stats_geojson_to_df_fast(
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
             geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )
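
The new status_file plumbing above lets ProgressTracker write a small JSON snapshot that an API or web app can poll while a long concurrent job runs. Below is a minimal polling sketch; the field names (status, progress, percent, elapsed_sec, eta_sec, updated_at) and the auto-generated filename come from the diff, while the polling loop, paths, and interval are illustrative assumptions rather than part of the package.

import json
import time
from pathlib import Path

# Assumed path: passing a directory as status_file makes the tracker write
# "whisp_processing_status.json" inside it (see ProgressTracker.__init__ above).
status_path = Path("/tmp/whisp_run/whisp_processing_status.json")

def poll_status(path: Path, interval_sec: int = 30) -> None:
    # Print progress snapshots until the tracker reports a terminal state.
    while True:
        if path.exists():
            snapshot = json.loads(path.read_text())
            print(
                f"{snapshot['status']}: {snapshot['progress']} "
                f"({snapshot['percent']}%), eta {snapshot.get('eta_sec')} s"
            )
            if snapshot["status"] in ("completed", "interrupted"):
                break
        time.sleep(interval_sec)
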
openforis_whisp/data_checks.py
CHANGED
@@ -750,23 +750,43 @@ def validate_geojson_constraints(
     return results
 
 
-def
+def suggest_processing_mode(
+    feature_count,
+    mean_area_ha=None,
+    mean_vertices=None,
+    feature_type="polygon",
+    verbose=True,
+):
     """
-    Suggest processing
+    Suggest processing mode based on feature characteristics.
+
+    Decision thresholds from comprehensive benchmark data (Nov 2025):
 
-
-    -
-    -
-    -
+    POINTS:
+    - Break-even: 750-1000 features
+    - Sequential faster: < 750 features
+    - Concurrent faster: >= 750 features
+
+    POLYGONS (area-based thresholds):
+    - Tiny (< 1 ha): break-even ~500 features
+    - Small (1-5 ha, simple): break-even ~500 features
+    - Small (1-5 ha, complex 20-50v): break-even ~500 features
+    - Medium (5-20 ha): break-even ~250 features
+    - Large (20-100 ha): break-even ~250 features
+    - Very large (50-200 ha): break-even ~250 features
+
+    Vertex complexity adjustment: High vertex counts (>50) favor concurrent at lower thresholds
 
     Parameters:
     -----------
-
-        Number of polygons
-    mean_area_ha : float
-        Mean area per polygon in hectares
+    feature_count : int
+        Number of features (polygons or points)
+    mean_area_ha : float, optional
+        Mean area per polygon in hectares (required for polygons, ignored for points)
     mean_vertices : float, optional
-        Mean number of vertices per polygon (
+        Mean number of vertices per polygon (influences decision for complex geometries)
+    feature_type : str
+        'polygon', 'multipolygon', or 'point' (default: 'polygon')
     verbose : bool
         Print recommendation explanation
 
@@ -775,31 +795,63 @@ def suggest_method(polygon_count, mean_area_ha, mean_vertices=None, verbose=True
     str: 'concurrent' or 'sequential'
     """
 
-    #
-    if
-        breakeven =
-        method = "concurrent" if
-
-
-
-
+    # Points: simple threshold-based decision
+    if feature_type == "point":
+        breakeven = 750
+        method = "concurrent" if feature_count >= breakeven else "sequential"
+
+        if verbose:
+            print(f"\nMETHOD RECOMMENDATION (Points)")
+            print(f"  Features: {feature_count} points")
+            print(f"  Break-even: {breakeven} features | Method: {method.upper()}")
+
+        return method
+
+    # Polygons and MultiPolygons: area and complexity-based decision
+    # MultiPolygons use same breakpoints as Polygons
+    if mean_area_ha is None:
+        # Default to conservative threshold if area unknown
+        breakeven = 500
+        method = "concurrent" if feature_count >= breakeven else "sequential"
+
+        if verbose:
+            print(f"\nMETHOD RECOMMENDATION (Polygons - area unknown)")
+            print(f"  Features: {feature_count} polygons")
+            print(
+                f"  Break-even: {breakeven} (conservative) | Method: {method.upper()}"
+            )
+
+        return method
+
+    # Area-based thresholds from benchmark data
+    if mean_area_ha >= 20:  # Large to very large polygons
+        breakeven = 250
+    elif mean_area_ha >= 5:  # Medium polygons
         breakeven = 250
-
+    elif mean_area_ha >= 1:  # Small polygons
+        # Vertex complexity matters more for small polygons
+        if mean_vertices is not None and mean_vertices >= 30:
+            breakeven = 500  # Complex small polygons
+        else:
+            breakeven = 500  # Simple small polygons
+    else:  # Tiny polygons (< 1 ha)
+        breakeven = 500
+
+    # Vertex complexity adjustment for high-complexity geometries
+    if mean_vertices is not None and mean_vertices >= 50:
+        # High complexity: reduce breakeven by 20% (concurrent beneficial sooner)
+        breakeven = int(breakeven * 0.8)
 
-
-    if mean_vertices is not None and mean_vertices > 500:
-        # Reduce breakeven by 25% for very complex geometries
-        adjusted_breakeven = int(breakeven * 0.75)
-        method = "concurrent" if polygon_count >= adjusted_breakeven else "sequential"
+    method = "concurrent" if feature_count >= breakeven else "sequential"
 
     if verbose:
-        print(f"\nMETHOD RECOMMENDATION")
+        print(f"\nMETHOD RECOMMENDATION (Polygons)")
         print(
-            f"
+            f"  Features: {feature_count} | Mean Area: {mean_area_ha:.1f} ha", end=""
         )
         if mean_vertices is not None:
            print(f" | Mean Vertices: {mean_vertices:.1f}", end="")
        print()
-        print(f"
+        print(f"  Break-even: {breakeven} features | Method: {method.upper()}")
 
     return method
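
A quick usage sketch of the renamed helper. The function name, parameters, and thresholds come from the hunk above; the sample feature counts and areas are invented for illustration, and the module-level import path is assumed.

from openforis_whisp.data_checks import suggest_processing_mode

# 300 medium polygons (~12 ha each): above the ~250-feature break-even, so concurrent
mode = suggest_processing_mode(feature_count=300, mean_area_ha=12.0, mean_vertices=40)
print(mode)  # expected: "concurrent"

# 400 points: below the 750-feature break-even for points, so sequential
mode = suggest_processing_mode(feature_count=400, feature_type="point", verbose=False)
print(mode)  # expected: "sequential"
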
openforis_whisp/datasets.py
CHANGED
@@ -1160,6 +1160,20 @@ def nci_ocs2020_prep():
     ).selfMask()  # cocoa from national land cover map for Côte d'Ivoire
 
 
+# nCM - Cameroon
+# data from Aurelie Shapiro (FAO) working directly with country experts - info on methods and accuracy assessment to follow
+
+
+def ncm_treecover_2020_prep():
+    return (
+        ee.Image("projects/ee-cocoacmr/assets/land_cover/CMR_TNTMMU_2020")
+        .select("FNF_2020")
+        .eq(1)
+        .rename("nCM_Treecover_2020")
+        .selfMask()
+    )
+
+
 # ============================================================================
 # CONTEXT BANDS (Administrative boundaries and water mask)
 # ============================================================================
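
The new Cameroon layer follows the same pattern as the other national prep functions: select a classification band, threshold it to a binary mask, rename it, and self-mask so only tree-cover pixels carry values. A hedged sketch of calling it is below; it assumes an authenticated Earth Engine session and read access to the referenced asset.

import ee

ee.Initialize()  # assumes ee.Authenticate() has already been run

from openforis_whisp.datasets import ncm_treecover_2020_prep

ncm_img = ncm_treecover_2020_prep()
print(ncm_img.bandNames().getInfo())  # expected: ['nCM_Treecover_2020']
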
openforis_whisp/logger.py
CHANGED
@@ -8,9 +8,21 @@ BASE_MSG_FORMAT = (
 
 class StdoutLogger:
     def __init__(self, name: str, msg_format: str = BASE_MSG_FORMAT) -> None:
-
-
-
+        # Create handler that auto-flushes for Colab/notebook visibility
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setFormatter(logging.Formatter(msg_format))
+        handler.setLevel(logging.DEBUG)
+
+        # Override emit to force flush after each message
+        original_emit = handler.emit
+
+        def emit_with_flush(record):
+            original_emit(record)
+            sys.stdout.flush()
+
+        handler.emit = emit_with_flush
+
+        self.handler = handler
         self.logger = logging.getLogger(name)
         self.logger.addHandler(self.handler)
         self.logger.propagate = False
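
The wrapped emit simply forces a flush after every record so log lines appear immediately in Colab/Jupyter cells instead of sitting in the stdout buffer. A minimal usage sketch follows; the logger name and message are illustrative.

from openforis_whisp.logger import StdoutLogger

log = StdoutLogger("whisp.example").logger
log.info("flushed to stdout as soon as it is emitted")
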
openforis_whisp/parameters/lookup_gee_datasets.csv
CHANGED

@@ -2,9 +2,9 @@ name,order,ISO2_code,theme,theme_timber,use_for_risk,use_for_risk_timber,exclude
 EUFO_2020,10,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_jrc_gfc_2020_prep
 GLAD_Primary,20,,treecover,primary,1,1,0,float32,1,0,g_glad_pht_prep
 TMF_undist,30,,treecover,primary,1,1,0,float32,1,0,g_jrc_tmf_undisturbed_prep
-GFC_TC_2020,50,,treecover,naturally_reg_2020,
+GFC_TC_2020,50,,treecover,naturally_reg_2020,0,0,0,float32,1,0,g_glad_gfc_10pc_prep
 Forest_FDaP,60,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_glad_gfc_10pc_prep
-ESA_TC_2020,70,,treecover,naturally_reg_2020,
+ESA_TC_2020,70,,treecover,naturally_reg_2020,0,0,0,float32,1,0,g_esa_worldcover_trees_prep
 TMF_plant,80,,commodities,NA,1,1,0,float32,1,0,g_jrc_tmf_plantation_prep
 Oil_palm_Descals,90,,commodities,NA,1,1,0,float32,1,0,g_creaf_descals_palm_prep
 Oil_palm_FDaP,100,,commodities,NA,1,1,0,float32,1,0,g_fdap_palm_prep
@@ -197,3 +197,4 @@ nBR_INPE_TCamz_pasture_2020,2422,BR,commodities,NA,1,1,0,float32,1,0,nbr_terracl
 nBR_INPE_TCcer_pasture_2020,2423,BR,commodities,NA,1,1,0,float32,1,0,nbr_terraclass_cer20_ac_prep
 nBR_MapBiomas_col9_pasture_2020,2424,BR,commodities,NA,1,1,0,float32,1,0,nbr_mapbiomasc9_pasture_prep
 nCI_Cocoa_bnetd,3000,CI,commodities,NA,1,1,0,float32,1,0,nci_ocs2020_prep
+nCM_Treecover_2020,3100,CM,treecover,NA,1,0,0,float32,1,0,ncm_treecover_2020_prep
openforis_whisp/pd_schemas.py
CHANGED
@@ -1,5 +1,10 @@
-
-
+# Support both old and new pandera import paths
+try:
+    import pandera.pandas as pa
+    from pandera.typing.pandas import DataFrame, Series
+except (ImportError, ModuleNotFoundError):
+    import pandera as pa
+    from pandera.typing import DataFrame, Series
 
 # Define a schema for validating a DataFrame related to GEE (Google Earth Engine) datasets.
 class DataLookupSchema(pa.DataFrameModel):
openforis_whisp/reformat.py
CHANGED
@@ -1,5 +1,10 @@
 # !pip install pandera[io] # special version used
-
+# Support both old and new pandera import paths
+try:
+    import pandera.pandas as pa
+except (ImportError, ModuleNotFoundError):
+    import pandera as pa
+
 import pandas as pd
 import os
 import logging
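
Both pd_schemas.py and reformat.py now try the newer pandera.pandas entry point first and fall back to the classic top-level import, which keeps the schemas working across older and newer pandera releases. The snippet below is an illustrative way to check which path a given environment resolves; it is not part of the package.

import pandera

try:
    import pandera.pandas as pa  # newer pandera releases expose the pandas backend here
    source = "pandera.pandas"
except (ImportError, ModuleNotFoundError):
    import pandera as pa  # older releases keep everything at the top level
    source = "pandera"

print(f"pandera {pandera.__version__}: building schemas from {source}")
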
openforis_whisp/stats.py
CHANGED
@@ -88,7 +88,6 @@ def get_admin_boundaries_fc():
 def whisp_formatted_stats_geojson_to_df_legacy(
     input_geojson_filepath: Path | str,
     external_id_column=None,
-    remove_geom=False,
     national_codes=None,
     unit_type="ha",
     whisp_image=None,
@@ -147,7 +146,6 @@ def whisp_formatted_stats_geojson_to_df_legacy(
     return whisp_formatted_stats_ee_to_df(
         feature_collection,
         external_id_column,
-        remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,
@@ -167,6 +165,7 @@ def whisp_formatted_stats_geojson_to_df(
     batch_size: int = 10,
     max_concurrent: int = 20,
     geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Main entry point for converting GeoJSON to Whisp statistics.
@@ -188,11 +187,7 @@ def whisp_formatted_stats_geojson_to_df(
         The column in the GeoJSON containing external IDs to be preserved in the output DataFrame.
         This column must exist as a property in ALL features of the GeoJSON file.
         Use debug_feature_collection_properties() to inspect available properties if you encounter errors.
-    remove_geom : bool, default=False
-        If True, the geometry of the GeoJSON is removed from the output DataFrame.
     national_codes : list, optional
-        List of ISO2 country codes to include national datasets.
-    unit_type: str, optional
         Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
     whisp_image : ee.Image, optional
         Pre-combined multiband Earth Engine Image containing all Whisp datasets.
@@ -224,6 +219,13 @@ def whisp_formatted_stats_geojson_to_df(
 
         Processing metadata stored in df.attrs['processing_metadata'].
         These columns enable full transparency for geometry modifications during processing.
+    status_file : str, optional
+        Path to JSON status file or directory for real-time progress tracking.
+        If a directory is provided, creates 'whisp_processing_status.json' in that directory.
+        Updates every 3 minutes and at progress milestones (5%, 10%, etc.).
+        Format: {"status": "processing", "progress": "450/1000", "percent": 45.0,
+        "elapsed_sec": 120, "eta_sec": 145, "updated_at": "2025-11-13T14:23:45"}
+        Most useful for large concurrent jobs. Works in both concurrent and sequential modes.
 
     Returns
     -------
@@ -283,7 +285,6 @@ def whisp_formatted_stats_geojson_to_df(
         return whisp_formatted_stats_geojson_to_df_legacy(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -306,7 +307,6 @@ def whisp_formatted_stats_geojson_to_df(
         return whisp_formatted_stats_geojson_to_df_fast(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -315,6 +315,7 @@ def whisp_formatted_stats_geojson_to_df(
             batch_size=batch_size,
             max_concurrent=max_concurrent,
             geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )
     else:
         raise ValueError(
@@ -473,7 +474,6 @@ def whisp_formatted_stats_ee_to_df(
 def whisp_stats_geojson_to_df(
     input_geojson_filepath: Path | str,
     external_id_column=None,
-    remove_geom=False,
     national_codes=None,
     unit_type="ha",
     whisp_image=None,  # New parameter
@@ -506,7 +506,6 @@ def whisp_stats_geojson_to_df(
     return whisp_stats_ee_to_df(
         feature_collection,
         external_id_column,
-        remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,  # Pass through
@@ -990,7 +989,7 @@ def whisp_stats_ee_to_drive(
         )
         task.start()
         print(
-            "Exporting to Google Drive: '
+            "Exporting to Google Drive: 'whisp_output_table.csv'. To track progress: https://code.earthengine.google.com/tasks"
         )
     except Exception as e:
         print(f"An error occurred during the export: {e}")
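
With status_file wired through the main entry point, a long-running job can be pointed at a writable directory and monitored from outside the process. A hedged example follows; the GeoJSON path, column name, and output directory are placeholders, and it assumes the function is re-exported at the package root (otherwise import it from openforis_whisp.stats).

import openforis_whisp as whisp

df = whisp.whisp_formatted_stats_geojson_to_df(
    input_geojson_filepath="plots.geojson",  # placeholder input file
    external_id_column="farm_id",            # placeholder property name
    status_file="/tmp/whisp_run",            # directory -> whisp_processing_status.json inside it
)
print(df.shape)
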
openforis_whisp/utils.py
CHANGED
@@ -5,6 +5,8 @@ import os
 import pandas as pd
 import random
 import numpy as np
+import logging
+import sys
 
 import urllib.request
 import os
@@ -19,6 +21,23 @@ from shapely.validation import make_valid
 
 from .logger import StdoutLogger
 
+# Configure the "whisp" logger with auto-flush handler for Colab visibility
+_whisp_logger = logging.getLogger("whisp")
+if not _whisp_logger.handlers:
+    _handler = logging.StreamHandler(sys.stdout)
+    _handler.setLevel(logging.DEBUG)
+    _handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
+    # Override emit to force flush after each message for Colab
+    _original_emit = _handler.emit
+
+    def _emit_with_flush(record):
+        _original_emit(record)
+        sys.stdout.flush()
+
+    _handler.emit = _emit_with_flush
+    _whisp_logger.addHandler(_handler)
+    _whisp_logger.setLevel(logging.INFO)
+    _whisp_logger.propagate = False  # Don't propagate to root to avoid duplicates
 
 logger = StdoutLogger(__name__)
 
{openforis_whisp-3.0.0a3.dist-info → openforis_whisp-3.0.0a4.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: openforis-whisp
-Version: 3.0.0a3
+Version: 3.0.0a4
 Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
 License: MIT
 Keywords: whisp,geospatial,data-processing
openforis_whisp-3.0.0a4.dist-info/RECORD
ADDED

@@ -0,0 +1,20 @@
+openforis_whisp/__init__.py,sha256=5zJK84LYnlslxSajdCz6ZIYxRS4xgN3sGxSD6_GXEHs,3547
+openforis_whisp/advanced_stats.py,sha256=FC1YasSZ93jplF1qBgDopzBIsO2ueXnidomQU3rpP_Q,100006
+openforis_whisp/data_checks.py,sha256=ErIKGbCa3R8eYP0sVoAl-ZUl607W1QrG0Jr2SIVgm2I,34056
+openforis_whisp/data_conversion.py,sha256=L2IsiUyQUt3aHgSYGbIhgPGwM7eyS3nLVEoNO9YqQeM,21888
+openforis_whisp/datasets.py,sha256=F1WxXc93mxxmN-WHa0bf-XX-FloSQyEBJKmnrQEHYn8,53855
+openforis_whisp/logger.py,sha256=gFkRTwJDJKIBWcHDOK74Uln3JM7fAybURo7pQpGL790,3395
+openforis_whisp/parameters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+openforis_whisp/parameters/config_runtime.py,sha256=NOo39MAi60XCwEx5pwkS0EHKJBh0XY1q06y4j0HAABg,1421
+openforis_whisp/parameters/lookup_context_and_metadata.csv,sha256=KgK0ik_Gd4t_Nq5cUkGPT4ZFZVO93HWSG82jRrOukt4,1298
+openforis_whisp/parameters/lookup_gaul1_admin.py,sha256=cQr5liRdXi85QieTxrz4VAkn0COvRCp82ZV0dYFWOio,474980
+openforis_whisp/parameters/lookup_gee_datasets.csv,sha256=7KdnFocEgbZO5m8JmWQchzZTurg9rJ96y17z8UyLtI0,17537
+openforis_whisp/pd_schemas.py,sha256=0z-oPmYIDUIn7mNY41W_uUpmTwjoR7e254mOCoHVsOg,2878
+openforis_whisp/reformat.py,sha256=gvhIa-_kTT5BSO8LuVmJ1TQcf_NwheskXboFM9e0KJY,32758
+openforis_whisp/risk.py,sha256=d_Di5XB8BnHdVXG56xdHTcpB4-CIF5vo2ZRMQRG7Pek,34420
+openforis_whisp/stats.py,sha256=pTSYs77ISRBOIglRpq4SUx3lKRkrUZOKROLRX5IP9IY,63941
+openforis_whisp/utils.py,sha256=AISWF-MpfFdYkhd6bei4BViw2Iag20mmq61ykrF9YTk,31287
+openforis_whisp-3.0.0a4.dist-info/LICENSE,sha256=nqyqICO95iw_iwzP1t_IIAf7ZX3DPbL_M9WyQfh2q1k,1085
+openforis_whisp-3.0.0a4.dist-info/METADATA,sha256=ak2Dw632lgOtXEXkl5-haYK7vF3hPaJ6IkaRRJRdH0Y,16684
+openforis_whisp-3.0.0a4.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+openforis_whisp-3.0.0a4.dist-info/RECORD,,
openforis_whisp-3.0.0a3.dist-info/RECORD
REMOVED

@@ -1,20 +0,0 @@
-openforis_whisp/__init__.py,sha256=s42Q0VJdzm8mgnxfYg1hUEJPM2VLWIva2h-mdKyr444,3538
-openforis_whisp/advanced_stats.py,sha256=tvhgNTCGlT3aYecUPP6QCTO0FRrjk0qjs95NoVZvIt4,90935
-openforis_whisp/data_checks.py,sha256=KwgD72FA_n7joiJadGRpzntd2sLo0aqGNbOjRkB8iQI,32293
-openforis_whisp/data_conversion.py,sha256=L2IsiUyQUt3aHgSYGbIhgPGwM7eyS3nLVEoNO9YqQeM,21888
-openforis_whisp/datasets.py,sha256=aGJy0OYN4d0nsH3_IOYlHl-WCB7KFwZwMJ-dBi5Hc5Y,53470
-openforis_whisp/logger.py,sha256=9M6_3mdpoiWfC-pDwM9vKmB2l5Gul6Rb5rNTNh-_nzs,3054
-openforis_whisp/parameters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-openforis_whisp/parameters/config_runtime.py,sha256=NOo39MAi60XCwEx5pwkS0EHKJBh0XY1q06y4j0HAABg,1421
-openforis_whisp/parameters/lookup_context_and_metadata.csv,sha256=KgK0ik_Gd4t_Nq5cUkGPT4ZFZVO93HWSG82jRrOukt4,1298
-openforis_whisp/parameters/lookup_gaul1_admin.py,sha256=cQr5liRdXi85QieTxrz4VAkn0COvRCp82ZV0dYFWOio,474980
-openforis_whisp/parameters/lookup_gee_datasets.csv,sha256=UDvZrQsL5rXJn6CW6P3wofUrPLRmUFZWt6ETbXaxBMs,17454
-openforis_whisp/pd_schemas.py,sha256=W_ocS773LHfc05dJqvWRa-bRdX0wKFoNp0lMxgFx94Y,2681
-openforis_whisp/reformat.py,sha256=MPjP5lb218GTcTpd_Qvbj5ER_8EY4JjLDteQaS5OZCQ,32620
-openforis_whisp/risk.py,sha256=d_Di5XB8BnHdVXG56xdHTcpB4-CIF5vo2ZRMQRG7Pek,34420
-openforis_whisp/stats.py,sha256=nVzQpSu7BoSb2S6HheLeoK_pmguZ9Lyw0ZfbTTMVq4Q,63720
-openforis_whisp/utils.py,sha256=Q-EwhUaohk63WCx7Rr5VuR3X-oGtgILZDc8JsjbWhgg,30538
-openforis_whisp-3.0.0a3.dist-info/LICENSE,sha256=nqyqICO95iw_iwzP1t_IIAf7ZX3DPbL_M9WyQfh2q1k,1085
-openforis_whisp-3.0.0a3.dist-info/METADATA,sha256=6xuNhUpQWyzKU3m13FnJ7SX39jAVry1YEKNAdH0D2to,16684
-openforis_whisp-3.0.0a3.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-openforis_whisp-3.0.0a3.dist-info/RECORD,,
{openforis_whisp-3.0.0a3.dist-info → openforis_whisp-3.0.0a4.dist-info}/LICENSE
File without changes

{openforis_whisp-3.0.0a3.dist-info → openforis_whisp-3.0.0a4.dist-info}/WHEEL
File without changes