openforis-whisp 3.0.0a3__py3-none-any.whl → 3.0.0a5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +1 -1
- openforis_whisp/advanced_stats.py +523 -218
- openforis_whisp/data_checks.py +80 -28
- openforis_whisp/datasets.py +14 -0
- openforis_whisp/logger.py +15 -3
- openforis_whisp/parameters/lookup_context_and_metadata.csv +1 -1
- openforis_whisp/parameters/lookup_gee_datasets.csv +3 -2
- openforis_whisp/pd_schemas.py +7 -2
- openforis_whisp/reformat.py +6 -1
- openforis_whisp/stats.py +10 -11
- openforis_whisp/utils.py +19 -0
- {openforis_whisp-3.0.0a3.dist-info → openforis_whisp-3.0.0a5.dist-info}/METADATA +1 -1
- openforis_whisp-3.0.0a5.dist-info/RECORD +20 -0
- openforis_whisp-3.0.0a3.dist-info/RECORD +0 -20
- {openforis_whisp-3.0.0a3.dist-info → openforis_whisp-3.0.0a5.dist-info}/LICENSE +0 -0
- {openforis_whisp-3.0.0a3.dist-info → openforis_whisp-3.0.0a5.dist-info}/WHEEL +0 -0
--- openforis_whisp/advanced_stats.py (3.0.0a3)
+++ openforis_whisp/advanced_stats.py (3.0.0a5)
@@ -36,6 +36,24 @@ from typing import Optional, List, Dict, Any, Tuple, Union
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import tempfile
 
+# Configure the "whisp" logger with auto-flush handler for Colab visibility
+_whisp_logger = logging.getLogger("whisp")
+if not _whisp_logger.handlers:
+    _handler = logging.StreamHandler(sys.stdout)
+    _handler.setLevel(logging.DEBUG)
+    _handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
+    # Override emit to force flush after each message for Colab
+    _original_emit = _handler.emit
+
+    def _emit_with_flush(record):
+        _original_emit(record)
+        sys.stdout.flush()
+
+    _handler.emit = _emit_with_flush
+    _whisp_logger.addHandler(_handler)
+    _whisp_logger.setLevel(logging.INFO)
+    _whisp_logger.propagate = False  # Don't propagate to root to avoid duplicates
+
 # ============================================================================
 # STDOUT/STDERR SUPPRESSION CONTEXT MANAGER (for C-level output)
 # ============================================================================
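Note: the new module-level block above wires the "whisp" logger to stdout with a flush-on-emit handler so progress messages appear promptly in Colab. A minimal sketch of how a caller could adjust that logger after importing the package, using only the standard logging API (the log file name is an assumption, not part of the package):

    import logging

    whisp_logger = logging.getLogger("whisp")       # same logger the package configures
    whisp_logger.setLevel(logging.WARNING)          # quieten INFO-level progress messages
    whisp_logger.addHandler(logging.FileHandler("whisp_run.log"))  # optionally also capture to a file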
@@ -163,8 +181,25 @@ def _suppress_verbose_output(max_concurrent: int = None):
     reformat_logger.setLevel(logging.ERROR)
 
 
-def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
-
+def _load_and_prepare_geojson(
+    filepath: str, external_id_column: Optional[str] = None
+) -> gpd.GeoDataFrame:
+    """Load GeoJSON file and prepare for processing.
+
+    Suppresses logging output and optionally renames external_id column.
+
+    Parameters
+    ----------
+    filepath : str
+        Path to GeoJSON file
+    external_id_column : str, optional
+        If provided, rename this column to 'external_id' immediately after loading
+
+    Returns
+    -------
+    gpd.GeoDataFrame
+        Loaded GeoDataFrame with external_id renamed if specified
+    """
     fiona_logger = logging.getLogger("fiona")
     pyogrio_logger = logging.getLogger("pyogrio._io")
     old_fiona_level = fiona_logger.level
@@ -175,6 +210,16 @@ def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
     try:
         with redirect_stdout(io.StringIO()):
             gdf = gpd.read_file(filepath)
+
+        # Rename external_id column early and convert to string
+        if external_id_column and external_id_column in gdf.columns:
+            if external_id_column != "external_id":
+                gdf = gdf.rename(
+                    columns={external_id_column: "external_id"}
+                )  # hard coding here to avoid confusion later
+            # Convert to string to ensure consistent type throughout pipeline
+            gdf["external_id"] = gdf["external_id"].astype(str)
+
         return gdf
     finally:
         fiona_logger.setLevel(old_fiona_level)
@@ -445,6 +490,16 @@ join_admin_codes(
         columns=["admin_code_for_join", "gaul1_code"], errors="ignore"
     )
 
+    # Fill NaN values with "Unknown" and "not found" for features outside admin boundaries
+    # (e.g., points in the ocean or international waters)
+    df_joined[iso3_country_column] = df_joined[iso3_country_column].fillna(
+        "Unknown"
+    )
+    df_joined[iso2_country_column] = df_joined[iso2_country_column].fillna(
+        "not found"
+    )
+    df_joined[admin_1_column] = df_joined[admin_1_column].fillna("Unknown")
+
     logger.debug(
         f"Admin codes joined: {iso3_country_column}, {iso2_country_column}, {admin_1_column}"
     )
@@ -461,10 +516,16 @@ class ProgressTracker:
 
     Shows progress at adaptive milestones (more frequent for small datasets,
     less frequent for large datasets) with estimated time remaining based on
-    processing speed.
+    processing speed. Includes time-based heartbeat to prevent long silences.
     """
 
-    def __init__(
+    def __init__(
+        self,
+        total: int,
+        logger: logging.Logger = None,
+        heartbeat_interval: int = 180,
+        status_file: str = None,
+    ):
         """
         Initialize progress tracker.
 
@@ -474,26 +535,147 @@ class ProgressTracker:
             Total number of items to process
         logger : logging.Logger, optional
             Logger for output
+        heartbeat_interval : int, optional
+            Seconds between heartbeat messages (default: 180 = 3 minutes)
+        status_file : str, optional
+            Path to JSON status file for API/web app consumption.
+            Checkpoints auto-save to same directory as status_file.
         """
         self.total = total
         self.completed = 0
         self.lock = threading.Lock()
         self.logger = logger or logging.getLogger("whisp")
+        self.heartbeat_interval = heartbeat_interval
+
+        # Handle status_file: if directory passed, auto-generate filename
+        if status_file:
+            import os
+
+            if os.path.isdir(status_file):
+                self.status_file = os.path.join(
+                    status_file, "whisp_processing_status.json"
+                )
+            else:
+                # Validate that parent directory exists
+                parent_dir = os.path.dirname(status_file)
+                if parent_dir and not os.path.isdir(parent_dir):
+                    self.logger.warning(
+                        f"Status file directory does not exist: {parent_dir}"
+                    )
+                    self.status_file = None
+                else:
+                    self.status_file = status_file
+        else:
+            self.status_file = None
 
         # Adaptive milestones based on dataset size
         # Small datasets (< 50): show every 25% (not too spammy)
         # Medium (50-500): show every 20%
-        # Large (500
+        # Large (500-1000): show every 10%
+        # Very large (1000+): show every 5% (cleaner for long jobs)
         if total < 50:
             self.milestones = {25, 50, 75, 100}
         elif total < 500:
             self.milestones = {20, 40, 60, 80, 100}
-
+        elif total < 1000:
             self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
+        else:
+            self.milestones = {
+                5,
+                10,
+                15,
+                20,
+                25,
+                30,
+                35,
+                40,
+                45,
+                50,
+                55,
+                60,
+                65,
+                70,
+                75,
+                80,
+                85,
+                90,
+                95,
+                100,
+            }
 
         self.shown_milestones = set()
         self.start_time = time.time()
         self.last_update_time = self.start_time
+        self.heartbeat_stop = threading.Event()
+        self.heartbeat_thread = None
+
+    def _write_status_file(self, status: str = "processing") -> None:
+        """Write current progress to JSON status file using atomic write."""
+        if not self.status_file:
+            return
+
+        try:
+            import json
+            import os
+
+            elapsed = time.time() - self.start_time
+            percent = (self.completed / self.total * 100) if self.total > 0 else 0
+            rate = self.completed / elapsed if elapsed > 0 else 0
+            eta = (
+                (self.total - self.completed) / rate * 1.15
+                if rate > 0 and percent >= 5
+                else None
+            )
+
+            # Write to temp file then atomic rename to prevent partial reads
+            from datetime import datetime
+
+            temp_file = self.status_file + ".tmp"
+            with open(temp_file, "w") as f:
+                json.dump(
+                    {
+                        "status": status,
+                        "progress": f"{self.completed}/{self.total}",
+                        "percent": round(percent, 1),
+                        "elapsed_sec": round(elapsed),
+                        "eta_sec": round(eta) if eta else None,
+                        "updated_at": datetime.now().isoformat(),
+                    },
+                    f,
+                )
+            os.replace(temp_file, self.status_file)
+        except Exception:
+            pass
+
+    def start_heartbeat(self) -> None:
+        """Start background heartbeat thread for time-based progress updates."""
+        if self.heartbeat_thread is None or not self.heartbeat_thread.is_alive():
+            self.heartbeat_stop.clear()
+            self.heartbeat_thread = threading.Thread(
+                target=self._heartbeat_loop, daemon=True
+            )
+            self.heartbeat_thread.start()
+            # Write initial status
+            self._write_status_file(status="processing")
+
+    def _heartbeat_loop(self) -> None:
+        """Background loop that logs progress at time intervals."""
+        while not self.heartbeat_stop.wait(self.heartbeat_interval):
+            with self.lock:
+                # Only log if we haven't shown a milestone recently
+                time_since_update = time.time() - self.last_update_time
+                if (
+                    time_since_update >= self.heartbeat_interval
+                    and self.completed < self.total
+                ):
+                    elapsed = time.time() - self.start_time
+                    percent = int((self.completed / self.total) * 100)
+                    elapsed_str = self._format_time(elapsed)
+                    self.logger.info(
+                        f"[Processing] {self.completed:,}/{self.total:,} batches ({percent}%) | "
+                        f"Elapsed: {elapsed_str}"
+                    )
+                    self.last_update_time = time.time()
 
     def update(self, n: int = 1) -> None:
         """
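The _write_status_file() method added above writes progress atomically (temp file then os.replace) so another process can poll it without ever reading partial JSON. A minimal, hypothetical consumer sketch; the path is an assumption, and the keys match the JSON written above:

    import json
    import time

    status_path = "whisp_processing_status.json"  # assumed location of the status file
    while True:
        try:
            with open(status_path) as f:
                status = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            time.sleep(5)  # not written yet
            continue
        print(f"{status['status']}: {status['progress']} ({status['percent']}%), ETA {status.get('eta_sec')}s")
        if status["status"] in ("completed", "interrupted"):
            break
        time.sleep(30)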
@@ -508,7 +690,7 @@ class ProgressTracker:
             self.completed += n
             percent = int((self.completed / self.total) * 100)
 
-            # Show milestone messages (
+            # Show milestone messages (5%, 10%, 15%... for large datasets)
             for milestone in sorted(self.milestones):
                 if percent >= milestone and milestone not in self.shown_milestones:
                     self.shown_milestones.add(milestone)
@@ -517,20 +699,36 @@ class ProgressTracker:
                     elapsed = time.time() - self.start_time
                     rate = self.completed / elapsed if elapsed > 0 else 0
                     remaining_items = self.total - self.completed
-
+
+                    # Calculate ETA with padding for overhead (loading, joins, etc.)
+                    # Don't show ETA until we have some samples (at least 5% complete)
+                    if rate > 0 and self.completed >= max(5, self.total * 0.05):
+                        eta_seconds = (
+                            remaining_items / rate
+                        ) * 1.15  # Add 15% padding for overhead
+                    else:
+                        eta_seconds = 0
 
                     # Format time strings
-                    eta_str =
+                    eta_str = (
+                        self._format_time(eta_seconds)
+                        if eta_seconds > 0
+                        else "calculating..."
+                    )
                     elapsed_str = self._format_time(elapsed)
 
                     # Build progress message
-                    msg = f"Progress: {self.completed}/{self.total} ({percent}%)"
+                    msg = f"Progress: {self.completed:,}/{self.total:,} batches ({percent}%)"
                     if percent < 100:
                         msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
                     else:
                         msg += f" | Total time: {elapsed_str}"
 
                     self.logger.info(msg)
+                    self.last_update_time = time.time()
+
+                    # Update status file for API consumption
+                    self._write_status_file()
 
     @staticmethod
     def _format_time(seconds: float) -> str:
@@ -544,14 +742,21 @@ class ProgressTracker:
             hours = seconds / 3600
             return f"{hours:.1f}h"
 
-    def finish(self) -> None:
-        """
+    def finish(self, output_file: str = None) -> None:
+        """Stop heartbeat and log completion."""
+        # Stop heartbeat thread
+        self.heartbeat_stop.set()
+        if self.heartbeat_thread and self.heartbeat_thread.is_alive():
+            self.heartbeat_thread.join(timeout=1)
+
         with self.lock:
             total_time = time.time() - self.start_time
             time_str = self._format_time(total_time)
-            self.
-
-
+            msg = f"Processing complete: {self.completed:,}/{self.total:,} batches in {time_str}"
+            self.logger.info(msg)
+
+            # Write final status
+            self._write_status_file(status="completed")
 
 
 # ============================================================================
@@ -602,19 +807,17 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
     if not check_ee_endpoint(endpoint_type):
         if endpoint_type == "high-volume":
             msg = (
-                "Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
+                "# Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
                 "ee.Reset()\n"
-                "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
-                "
-                "ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
+                "ee.Initialize(project=gee_project_name, opt_url='https://earthengine-highvolume.googleapis.com')\n"
+                "# where gee_project_name is your GEE project (necessary in Colab)"
             )
         else:  # standard endpoint
             msg = (
                 "Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
                 "ee.Reset()\n"
-                "ee.Initialize()\n"
-                "
-                "ee.Initialize(project='your_cloud_project_name')"
+                "ee.Initialize(project=gee_project_name)\n"
+                "# where gee_project_name is your GEE project (necessary in Colab)"
             )
 
     if raise_error:
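The reworded messages above spell out how to switch endpoints before calling the concurrent or sequential entry points. For example, to move to the high-volume endpoint (gee_project_name is a placeholder for your own Google Cloud project, which Colab requires):

    import ee

    ee.Reset()
    ee.Initialize(
        project="gee_project_name",
        opt_url="https://earthengine-highvolume.googleapis.com",
    )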
@@ -687,13 +890,13 @@ def extract_centroid_and_geomtype_client(
     if plot_id_column in gdf.columns:
         cols.append(plot_id_column)
 
-    # Include
+    # Include external_id if it exists (already renamed during load)
     if (
         external_id_column
-        and
-        and
+        and "external_id" in gdf.columns
+        and "external_id" not in cols
     ):
-        cols.append(
+        cols.append("external_id")
 
     # Always include metadata columns (centroid, geometry type)
     cols.extend([x_col, y_col, type_col])
@@ -787,6 +990,10 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
 
     Preserves the __row_id__ column if present so it can be retrieved after processing.
 
+    IMPORTANT: Drops external_id column before sending to EE to enable query caching.
+    external_id is user metadata that's not needed for EE computation. Including it
+    breaks EE's caching mechanism since each unique external_id creates a different query.
+
     Parameters
     ----------
     batch_gdf : gpd.GeoDataFrame
@@ -795,15 +1002,21 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
     Returns
     -------
     ee.FeatureCollection
-        EE FeatureCollection with __row_id__ as a feature property
+        EE FeatureCollection with __row_id__ as a feature property (no external_id)
     """
+    # Drop external_id before sending to EE to enable caching
+    # (external_id is preserved separately on client side for merging)
+    batch_for_ee = batch_gdf.copy()
+    if "external_id" in batch_for_ee.columns:
+        batch_for_ee = batch_for_ee.drop(columns=["external_id"])
+
     # Pass GeoDataFrame directly to preserve CRS metadata
     # convert_geojson_to_ee will handle:
     # - CRS detection and conversion to WGS84 if needed
     # - Data type sanitization (datetime, object columns)
     # - Geometry validation and Z-coordinate stripping
 
-    fc = convert_geojson_to_ee(
+    fc = convert_geojson_to_ee(batch_for_ee, enforce_wgs84=True, strip_z_coords=True)
 
     # If __row_id__ is in the original GeoDataFrame, it will be preserved
     # as a feature property in the GeoJSON and thus in the EE FeatureCollection
@@ -929,7 +1142,19 @@ def process_ee_batch(
     # Ensure plot_id_column is present for merging
     # It should come from the feature properties (added before EE processing)
     if plot_id_column not in df.columns:
-
+        logger.warning(
+            f"Batch {batch_idx + 1}: plotId column DROPPED by EE. "
+            f"Regenerating with 1-indexed range. "
+            f"Columns from EE: {list(df.columns)}"
+        )
+        # Use 1-indexed range to match client-side assignment
+        df[plot_id_column] = range(1, len(df) + 1)
+
+    # Ensure plotId is integer type (EE may return as string)
+    if plot_id_column in df.columns:
+        df[plot_id_column] = pd.to_numeric(
+            df[plot_id_column], errors="coerce"
+        ).astype("Int64")
 
     # Ensure all column names are strings (fixes pandas .str accessor issues)
     df.columns = df.columns.astype(str)
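The new pd.to_numeric(...).astype("Int64") step relies on pandas' nullable integer dtype, so any plotId value that fails to parse becomes <NA> instead of forcing the whole column to float and breaking the later merges. A quick illustrative sketch (not package code):

    import pandas as pd

    s = pd.Series(["1", "2", "oops"])
    pd.to_numeric(s, errors="coerce").astype("Int64")
    # -> [1, 2, <NA>] with dtype Int64, so merge keys stay integers despite bad values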
@@ -983,7 +1208,6 @@
 def whisp_stats_geojson_to_df_concurrent(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -996,6 +1220,7 @@
     logger: logging.Logger = None,
     # Format parameters (auto-detect from config if not provided)
     decimal_places: int = None,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently to compute Whisp statistics with automatic formatting.
@@ -1010,8 +1235,6 @@
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1055,10 +1278,32 @@
     # Validate endpoint
     validate_ee_endpoint("high-volume", raise_error=True)
 
-    # Load GeoJSON with output suppressed
-    gdf =
+    # Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
+    gdf = _load_and_prepare_geojson(
+        input_geojson_filepath, external_id_column=external_id_column
+    )
     logger.info(f"Loaded {len(gdf):,} features")
 
+    # Validate external_id if provided (lightweight client-side check)
+    # Note: external_id_column already renamed to 'external_id' during load
+    if external_id_column and "external_id" not in gdf.columns:
+        # Exclude geometry column from available columns list
+        available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
+        raise ValueError(
+            f"Column '{external_id_column}' not found in GeoJSON properties. "
+            f"Available columns: {available_cols}"
+        )
+
+    # Check completeness of external_id (warn if nulls exist)
+    if external_id_column and "external_id" in gdf.columns:
+        null_count = gdf["external_id"].isna().sum()
+        if null_count > 0:
+            null_pct = (null_count / len(gdf)) * 100
+            logger.warning(
+                f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"These features may have missing external IDs in output."
+            )
+
     if validate_geometries:
         gdf = clean_geodataframe(
             gdf, remove_nulls=False, repair_geometries=False, logger=logger
@@ -1068,13 +1313,21 @@
     gdf[plot_id_column] = range(1, len(gdf) + 1)
 
     # Strip unnecessary properties before sending to EE
-    # Keep only: geometry, plot_id_column, and
+    # Keep only: geometry, plot_id_column, and external_id
     # This prevents duplication of GeoJSON properties in EE results
     keep_cols = ["geometry", plot_id_column]
-    if
-
+    if (
+        external_id_column and "external_id" in gdf.columns
+    ):  # Already renamed during load
+        keep_cols.append("external_id")
 
     gdf_for_ee = gdf[keep_cols].copy()
+
+    # CRITICAL: Convert external_id to string to prevent EE from confusing it with integer plotId
+    if external_id_column and "external_id" in gdf_for_ee.columns:
+        gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
+        logger.debug(f"Converted external_id column to string type")
+
     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
 
     # Create image if not provided
@@ -1101,13 +1354,18 @@
 
     # Batch the data
     batches = batch_geodataframe(gdf_for_ee, batch_size)
-    logger.info(
+    logger.info(
+        f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches (concurrent mode)..."
+    )
 
     # Setup semaphore for EE concurrency control
     ee_semaphore = threading.BoundedSemaphore(max_concurrent)
 
-    # Progress tracker
-    progress = ProgressTracker(
+    # Progress tracker with heartbeat for long-running jobs
+    progress = ProgressTracker(
+        len(batches), logger=logger, heartbeat_interval=180, status_file=status_file
+    )
+    progress.start_heartbeat()
 
     results = []
 
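The batches are then fanned out to a thread pool while the BoundedSemaphore caps how many Earth Engine requests are in flight at once. A stripped-down sketch of that pattern under stated assumptions (process_one is a placeholder, not the package's function):

    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed

    ee_semaphore = threading.BoundedSemaphore(5)   # max concurrent EE requests

    def process_one(i, batch):
        with ee_semaphore:          # block while too many requests are in flight
            return i, len(batch)    # placeholder for the real EE round trip

    def run_all(batches, pool_workers=10):
        results = {}
        with ThreadPoolExecutor(max_workers=pool_workers) as executor:
            futures = {executor.submit(process_one, i, b): i for i, b in enumerate(batches)}
            for future in as_completed(futures):
                i, out = future.result()
                results[i] = out
        return results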
@@ -1148,73 +1406,97 @@
     pyogrio_logger.setLevel(logging.CRITICAL)
 
     try:
-
-
-
-
-
-
-
-        # Track which batches failed for retry
-        batch_map = {i: batch for i, batch in enumerate(batches)}
-        batch_futures = {future: i for future, i in futures.items()}
+        # Don't suppress stdout here - we want progress messages to show in Colab
+        with ThreadPoolExecutor(max_workers=pool_workers) as executor:
+            futures = {
+                executor.submit(process_batch, i, batch): i
+                for i, batch in enumerate(batches)
+            }
 
-
-
-
-            batch_idx, df_server, df_client = future.result()
-
-            # Merge server and client results
-            if plot_id_column not in df_server.columns:
-                df_server[plot_id_column] = range(len(df_server))
-
-            # Keep all EE statistics from server (all columns with _sum and _median suffixes)
-            # These are the actual EE processing results
-            df_server_clean = df_server.copy()
-
-            # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
-            # (formatted wrapper handles keep_external_columns parameter)
-            keep_external_columns = [plot_id_column]
-            if (
-                external_id_column
-                and external_id_column in df_client.columns
-            ):
-                keep_external_columns.append(external_id_column)
-            if "geometry" in df_client.columns:
-                keep_external_columns.append("geometry")
-            # Keep geometry type column (Geometry_type)
-            if geometry_type_column in df_client.columns:
-                keep_external_columns.append(geometry_type_column)
-            # Also keep centroid columns (Centroid_lon, Centroid_lat)
-            centroid_cols = [
-                c for c in df_client.columns if c.startswith("Centroid_")
-            ]
-            keep_external_columns.extend(centroid_cols)
+            # Track which batches failed for retry
+            batch_map = {i: batch for i, batch in enumerate(batches)}
+            batch_futures = {future: i for future, i in futures.items()}
 
-
-
-
+            for future in as_completed(futures):
+                batch_idx = batch_futures[future]
+                try:
+                    batch_idx, df_server, df_client = future.result()
 
-
-
-
-
-
+                    # Merge server and client results
+                    if plot_id_column not in df_server.columns:
+                        logger.warning(
+                            f"Batch {batch_idx + 1} (concurrent merge): plotId DROPPED by EE. "
+                            f"Regenerating. Columns from EE: {list(df_server.columns)}"
                         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        df_server[plot_id_column] = pd.array(
+                            range(1, len(df_server) + 1), dtype="Int64"
+                        )
+                    else:
+                        df_server[plot_id_column] = pd.to_numeric(
+                            df_server[plot_id_column], errors="coerce"
+                        ).astype("Int64")
+
+                    # Ensure plotId is Int64 in client data too
+                    if plot_id_column in df_client.columns:
+                        df_client[plot_id_column] = pd.to_numeric(
+                            df_client[plot_id_column], errors="coerce"
+                        ).astype("Int64")
+
+                    # Keep all EE statistics from server (all columns with _sum and _median suffixes)
+                    # These are the actual EE processing results
+                    df_server_clean = df_server.copy()
+
+                    # Drop external_id from df_server if it exists (already in df_client)
+                    if "external_id" in df_server_clean.columns:
+                        df_server_clean = df_server_clean.drop(columns=["external_id"])
+
+                    # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
+                    # (formatted wrapper handles keep_external_columns parameter)
+                    keep_external_columns = [plot_id_column]
+                    if external_id_column and "external_id" in df_client.columns:
+                        keep_external_columns.append("external_id")
+                    if "geometry" in df_client.columns:
+                        keep_external_columns.append("geometry")
+                    # Keep geometry type column (Geometry_type)
+                    if geometry_type_column in df_client.columns:
+                        keep_external_columns.append(geometry_type_column)
+                    # Also keep centroid columns (Centroid_lon, Centroid_lat)
+                    centroid_cols = [
+                        c for c in df_client.columns if c.startswith("Centroid_")
+                    ]
+                    keep_external_columns.extend(centroid_cols)
+
+                    df_client_clean = df_client[
+                        [c for c in keep_external_columns if c in df_client.columns]
+                    ]
+                    # Don't drop duplicates - we need one row per feature (one per plot_id)
+                    # Each plot_id should have exactly one row with its metadata
+
+                    merged = df_server_clean.merge(
+                        df_client_clean,
+                        on=plot_id_column,
+                        how="left",
+                        suffixes=("_ee", "_client"),
+                    )
+                    results.append(merged)
+                    progress.update()
+
+                except Exception as e:
+                    # Batch failed - fail fast with clear guidance
+                    error_msg = str(e)
+                    logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
+                    logger.debug(f"Full error: {error_msg}")
+
+                    # Get original batch for error reporting
+                    original_batch = batch_map[batch_idx]
+
+                    # Add to batch errors for final reporting
+                    batch_errors.append((batch_idx, original_batch, error_msg))
+    except (KeyboardInterrupt, SystemExit) as interrupt:
+        logger.warning("Processing interrupted by user")
+        # Update status file with interrupted state
+        progress._write_status_file(status="interrupted")
+        raise interrupt
     finally:
         # Restore logger levels
         fiona_logger.setLevel(old_fiona_level)
@@ -1318,7 +1600,10 @@
         try:
             batch_idx, df_server, df_client = future.result()
             if plot_id_column not in df_server.columns:
-
+                # Use 1-indexed range to match client-side assignment
+                df_server[plot_id_column] = range(
+                    1, len(df_server) + 1
+                )
             merged = df_server.merge(
                 df_client,
                 on=plot_id_column,
@@ -1362,31 +1647,21 @@
     else:
         return pd.DataFrame()
 
-    # Clean up duplicate external_id columns created by merges
-    #
-    if external_id_column:
-        # Find
-
+    # Clean up duplicate external_id columns created by merges (if any exist)
+    # external_id was already renamed during load, so we just need to handle duplicates
+    if external_id_column and "external_id" in combined.columns:
+        # Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
+        duplicate_variants = [
            col
            for col in combined.columns
-            if
+            if col != "external_id" and col.startswith("external_id_")
        ]
 
-        if
-
-
-            external_id_column
-            if external_id_column in combined.columns
-            else external_id_variants[0]
+        if duplicate_variants:
+            logger.debug(
+                f"Dropping duplicate external_id columns: {duplicate_variants}"
            )
-
-        # Rename to standardized 'external_id'
-        if base_col != "external_id":
-            combined = combined.rename(columns={base_col: "external_id"})
-
-        # Drop all other variants
-        cols_to_drop = [c for c in external_id_variants if c != base_col]
-        combined = combined.drop(columns=cols_to_drop, errors="ignore")
+            combined = combined.drop(columns=duplicate_variants, errors="ignore")
 
     # plotId column is already present from batch processing
     # Just ensure it's at position 0
@@ -1469,14 +1744,26 @@
         try:
             batch_idx, df_server, df_client = future.result()
             if plot_id_column not in df_server.columns:
-
-
-
-
-
-
-
-
+                logger.warning(
+                    f"Batch {batch_idx + 1} (retry): plotId DROPPED by EE. "
+                    f"Regenerating. Columns from EE: {list(df_server.columns)}"
+                )
+                # Use 1-indexed range to match client-side assignment
+                df_server[plot_id_column] = range(1, len(df_server) + 1)
+
+            # Ensure plotId is integer type (EE may return as string)
+            if plot_id_column in df_server.columns:
+                df_server[plot_id_column] = pd.to_numeric(
+                    df_server[plot_id_column], errors="coerce"
+                ).astype("Int64")
+            if plot_id_column in df_client.columns:
+                df_client[plot_id_column] = pd.to_numeric(
+                    df_client[plot_id_column], errors="coerce"
+                ).astype("Int64")
+
+            # Drop external_id from df_server if it exists (already in df_client)
+            if "external_id" in df_server.columns:
+                df_server = df_server.drop(columns=["external_id"])
 
             merged = df_server.merge(
                 df_client,
@@ -1498,30 +1785,22 @@
     # Ensure all column names are strings (fixes pandas .str accessor issues later)
     combined.columns = combined.columns.astype(str)
 
-    # Clean up duplicate external_id columns created by merges
-
-
+    # Clean up duplicate external_id columns created by merges (if any exist)
+    # external_id was already renamed during load, so we just need to handle duplicates
+    if external_id_column and "external_id" in combined.columns:
+        # Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
+        duplicate_variants = [
            col
            for col in combined.columns
-            if
+            if col != "external_id" and col.startswith("external_id_")
        ]
 
-        if
-
-
-
-            and external_id_variants
-        ):
-            base_col = external_id_variants[0]
-            combined = combined.rename(
-                columns={base_col: "external_id"}
-            )
-
-            cols_to_drop = [
-                c for c in external_id_variants if c != base_col
-            ]
+        if duplicate_variants:
+            logger.debug(
+                f"Dropping duplicate external_id columns: {duplicate_variants}"
+            )
            combined = combined.drop(
-                columns=
+                columns=duplicate_variants, errors="ignore"
            )
 
     # plotId column is already present, just ensure it's at position 0
@@ -1565,7 +1844,15 @@
             )
             raise retry_e
 
-
+        # Ensure plot_id is present (should already be there from batch processing)
+        if plot_id_column not in formatted.columns:
+            logger.warning(f"{plot_id_column} column missing, regenerating...")
+            formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
+
+        # Sort by plot_id to ensure consistent output order
+        formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+
+        logger.info(f"Processing complete: {len(formatted):,} features")
         return formatted
     else:
         logger.error(" No results produced")
@@ -1580,7 +1867,6 @@
 def whisp_stats_geojson_to_df_sequential(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -1605,8 +1891,6 @@
         Path to input GeoJSON
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1642,10 +1926,32 @@
     # Validate endpoint
     validate_ee_endpoint("standard", raise_error=True)
 
-    # Load GeoJSON with output suppressed
-    gdf =
+    # Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
+    gdf = _load_and_prepare_geojson(
+        input_geojson_filepath, external_id_column=external_id_column
+    )
     logger.info(f"Loaded {len(gdf):,} features")
 
+    # Validate external_id if provided (lightweight client-side check)
+    # Note: external_id_column already renamed to 'external_id' during load
+    if external_id_column and "external_id" not in gdf.columns:
+        # Exclude geometry column from available columns list
+        available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
+        raise ValueError(
+            f"Column '{external_id_column}' not found in GeoJSON properties. "
+            f"Available columns: {available_cols}"
+        )
+
+    # Check completeness of external_id (warn if nulls exist)
+    if external_id_column and "external_id" in gdf.columns:
+        null_count = gdf["external_id"].isna().sum()
+        if null_count > 0:
+            null_pct = (null_count / len(gdf)) * 100
+            logger.warning(
+                f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"These features may have missing external IDs in output."
+            )
+
     # Clean geometries (preserve both null and invalid geometries by default)
     gdf = clean_geodataframe(
         gdf, remove_nulls=False, repair_geometries=False, logger=logger
@@ -1654,18 +1960,22 @@
     # Add stable plotIds for merging (starting from 1, not 0)
     gdf[plot_id_column] = range(1, len(gdf) + 1)
 
-    # Add stable row IDs
-    row_id_col = "__row_id__"
-    gdf[row_id_col] = range(len(gdf))
-
     # Strip unnecessary properties before sending to EE
-    # Keep only: geometry, plot_id_column, and
+    # Keep only: geometry, plot_id_column, and external_id
     # This prevents duplication of GeoJSON properties in EE results
-    keep_cols = ["geometry", plot_id_column
-    if
-
+    keep_cols = ["geometry", plot_id_column]
+    if (
+        external_id_column and "external_id" in gdf.columns
+    ):  # Already renamed during load
+        keep_cols.append("external_id")
 
     gdf_for_ee = gdf[keep_cols].copy()
+
+    # CRITICAL: Convert external_id to string to prevent EE from confusing it with integer plotId
+    if external_id_column and "external_id" in gdf_for_ee.columns:
+        gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
+        logger.debug(f"Converted external_id column to string type")
+
     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
 
     # Create image if not provided
@@ -1687,16 +1997,27 @@
         national_codes=national_codes, validate_bands=True
     )
 
+    # Drop external_id before sending to EE to enable caching
+    # (external_id is preserved separately in gdf for client-side merging)
+    gdf_for_ee_clean = gdf_for_ee.copy()
+    if "external_id" in gdf_for_ee_clean.columns:
+        gdf_for_ee_clean = gdf_for_ee_clean.drop(columns=["external_id"])
+        logger.debug("Dropped external_id from data sent to EE (enables caching)")
+
     # Convert to EE (suppress print statements from convert_geojson_to_ee)
     logger.debug("Converting to EE FeatureCollection...")
     with redirect_stdout(io.StringIO()):
-        fc = convert_geojson_to_ee(
+        fc = convert_geojson_to_ee(
+            gdf_for_ee_clean, enforce_wgs84=True, strip_z_coords=True
+        )
 
     # Create reducer
     reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
 
     # Process server-side with error handling for bad bands
-    logger.info(
+    logger.info(
+        f"Processing {len(gdf):,} features with Earth Engine (sequential mode)..."
+    )
     try:
         results_fc = whisp_image.reduceRegions(collection=fc, reducer=reducer, scale=10)
         df_server = convert_ee_to_df(results_fc)
@@ -1728,11 +2049,13 @@
     else:
         raise
 
-    logger.
+    logger.info("Server-side processing complete")
 
-    #
-    if
-    df_server[
+    # Ensure plotId is Int64 type for fast merges
+    if plot_id_column in df_server.columns:
+        df_server[plot_id_column] = pd.to_numeric(
+            df_server[plot_id_column], errors="coerce"
+        ).astype("Int64")
 
     # Add client-side metadata if requested
     if add_metadata_client_side:
@@ -1743,21 +2066,23 @@
             return_attributes_only=True,
         )
 
-        #
-        if
-        df_client =
+        # Ensure plotId is Int64 type for fast merges
+        if plot_id_column in df_client.columns:
+            df_client[plot_id_column] = pd.to_numeric(
+                df_client[plot_id_column], errors="coerce"
+            ).astype("Int64")
 
-        #
+        # Drop external_id from df_server if it exists (keep from df_client - more reliable)
+        if "external_id" in df_server.columns:
+            df_server = df_server.drop(columns=["external_id"])
+
+        # Merge on plotId (same strategy as concurrent mode)
         result = df_server.merge(
-            df_client, on=
+            df_client, on=plot_id_column, how="left", suffixes=("", "_client")
         )
     else:
         result = df_server
 
-    # Remove internal __row_id__ column if present
-    if row_id_col in result.columns:
-        result = result.drop(columns=[row_id_col])
-
     # Format the output
     # Add admin context (Country, ProducerCountry, Admin_Level_1) from admin_code
     # MUST be done BEFORE formatting (which removes _median columns)
@@ -1782,27 +2107,14 @@
         convert_water_flag=True,
     )
 
-
-
-
-
-
-
-
-
-        ]
-        if variants:
-            base_col = (
-                external_id_column
-                if external_id_column in formatted.columns
-                else variants[0]
-            )
-            if base_col != "external_id":
-                formatted = formatted.rename(columns={base_col: "external_id"})
-            # Drop other variants
-            formatted = formatted.drop(
-                columns=[c for c in variants if c != base_col], errors="ignore"
-            )
+    # Ensure plot_id exists and sort by it
+    if plot_id_column not in formatted.columns:
+        formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
+    formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+
+    logger.info(f"Processing complete: {len(formatted):,} features")
+
+    # external_id_column already renamed to 'external_id' during load - no action needed here
 
     return formatted
 
@@ -1815,7 +2127,6 @@
 def whisp_formatted_stats_geojson_to_df_concurrent(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -1833,6 +2144,7 @@
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently with automatic formatting and validation.
@@ -1848,8 +2160,6 @@
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1910,14 +2220,13 @@
     gdf_original_geoms = None
     if geometry_audit_trail:
         logger.debug("Pre-loading GeoJSON for geometry audit trail...")
-        gdf_original_geoms =
+        gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
 
     # Step 1: Get raw stats
     logger.debug("Step 1/2: Extracting statistics (concurrent)...")
     df_raw = whisp_stats_geojson_to_df_concurrent(
         input_geojson_filepath=input_geojson_filepath,
         external_id_column=external_id_column,
-        remove_geom=remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,
@@ -1928,6 +2237,7 @@
         max_retries=max_retries,
         add_metadata_server=add_metadata_server,
         logger=logger,
+        status_file=status_file,
     )
 
     # Step 2: Format the output
@@ -1979,7 +2289,7 @@
         # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
         if gdf_original_geoms is None:
             logger.warning("Original geometries not pre-loaded, loading now...")
-            gdf_original_geoms =
+            gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
 
         # Use plotId from df_validated to maintain mapping
         df_original_geom = pd.DataFrame(
@@ -2030,7 +2340,6 @@
 def whisp_formatted_stats_geojson_to_df_sequential(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -2044,6 +2353,7 @@
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON sequentially with automatic formatting and validation.
@@ -2059,8 +2369,6 @@
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -2113,14 +2421,13 @@
     gdf_original_geoms = None
     if geometry_audit_trail:
         logger.debug("Pre-loading GeoJSON for geometry audit trail...")
-        gdf_original_geoms =
+        gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
 
     # Step 1: Get raw stats
     logger.debug("Step 1/2: Extracting statistics (sequential)...")
     df_raw = whisp_stats_geojson_to_df_sequential(
         input_geojson_filepath=input_geojson_filepath,
         external_id_column=external_id_column,
-        remove_geom=remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,
@@ -2178,7 +2485,7 @@
         # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
         if gdf_original_geoms is None:
             logger.warning("Original geometries not pre-loaded, loading now...")
-            gdf_original_geoms =
+            gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
 
         # Use plotId from df_validated to maintain mapping
         df_original_geom = pd.DataFrame(
@@ -2233,7 +2540,6 @@
 def whisp_formatted_stats_geojson_to_df_fast(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -2252,6 +2558,7 @@
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON to Whisp statistics with optimized fast processing.
@@ -2267,8 +2574,6 @@
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -2339,7 +2644,6 @@
         return whisp_formatted_stats_geojson_to_df_concurrent(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -2356,13 +2660,13 @@
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
             geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )
     else:  # sequential
         logger.debug("Routing to sequential processing...")
         return whisp_formatted_stats_geojson_to_df_sequential(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -2374,4 +2678,5 @@
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
             geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )
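Taken together, the 3.0.0a5 entry points accept the new status_file parameter and no longer take remove_geom. A hedged usage sketch against the function defined in this file (the input file name and column name are placeholders; Earth Engine must be initialized first as described above):

    from openforis_whisp.advanced_stats import whisp_formatted_stats_geojson_to_df_fast

    df = whisp_formatted_stats_geojson_to_df_fast(
        input_geojson_filepath="plots.geojson",   # placeholder input
        external_id_column="farm_id",             # renamed to 'external_id' in the output
        status_file="whisp_status/",              # directory: filename auto-generated
    )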