openforis-whisp 3.0.0a6__py3-none-any.whl → 3.0.0a7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +3 -1
- openforis_whisp/advanced_stats.py +130 -256
- openforis_whisp/data_checks.py +191 -144
- openforis_whisp/datasets.py +3 -2
- openforis_whisp/stats.py +0 -9
- {openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a7.dist-info}/METADATA +7 -4
- {openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a7.dist-info}/RECORD +9 -9
- {openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a7.dist-info}/WHEEL +1 -1
- {openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a7.dist-info/licenses}/LICENSE +0 -0
openforis_whisp/__init__.py
CHANGED
@@ -101,6 +101,8 @@ from openforis_whisp.utils import (

 from openforis_whisp.data_checks import (
     analyze_geojson,
-    validate_geojson_constraints,
+    check_geojson_limits,
+    screen_geojson,  # Backward compatibility alias
     suggest_processing_mode,
+    validate_geojson_constraints,  # Backward compatibility alias
 )
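A quick illustration of the re-exported names (a minimal sketch; it assumes the plain-assignment aliasing shown in the data_checks.py diff below, where screen_geojson and validate_geojson_constraints are bound to check_geojson_limits):

    # Both legacy names resolve to the same function object as the new one.
    from openforis_whisp import (
        check_geojson_limits,
        screen_geojson,
        validate_geojson_constraints,
    )

    assert screen_geojson is check_geojson_limits
    assert validate_geojson_constraints is check_geojson_limits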
openforis_whisp/advanced_stats.py
CHANGED

@@ -510,253 +510,102 @@ def join_admin_codes(
     return df


-class ProgressTracker:
-    """
-
-    Shows progress at adaptive milestones (more frequent for small datasets,
-    less frequent for large datasets) with estimated time remaining based on
-    processing speed. Includes time-based heartbeat to prevent long silences.
-    """
-
-    def __init__(
-        self,
-        total: int,
-        logger: logging.Logger = None,
-        heartbeat_interval: int = 180,
-        status_file: str = None,
-    ):
-        """
-        Initialize progress tracker.
-
-        Parameters
-        ----------
-        total : int
-            Total number of items to process
-        logger : logging.Logger, optional
-            Logger for output
-        heartbeat_interval : int, optional
-            Seconds between heartbeat messages (default: 180 = 3 minutes)
-        status_file : str, optional
-            Path to JSON status file for API/web app consumption.
-            Checkpoints auto-save to same directory as status_file.
-        """
-        self.total = total
-        self.completed = 0
-        self.lock = threading.Lock()
-        self.logger = logger or logging.getLogger("whisp")
-        self.heartbeat_interval = heartbeat_interval
-
-        # Handle status_file: if directory passed, auto-generate filename
-        if status_file:
-            import os
-
-            if os.path.isdir(status_file):
-                self.status_file = os.path.join(
-                    status_file, "whisp_processing_status.json"
-                )
-            else:
-                # Validate that parent directory exists
-                parent_dir = os.path.dirname(status_file)
-                if parent_dir and not os.path.isdir(parent_dir):
-                    self.logger.warning(
-                        f"Status file directory does not exist: {parent_dir}"
-                    )
-                    self.status_file = None
-                else:
-                    self.status_file = status_file
-        else:
-            self.status_file = None
-
-        # Adaptive milestones based on dataset size
-        # Small datasets (< 50): show every 25% (not too spammy)
-        # Medium (50-500): show every 20%
-        # Large (500-1000): show every 10%
-        # Very large (1000+): show every 5% (cleaner for long jobs)
-        if total < 50:
-            self.milestones = {25, 50, 75, 100}
-        elif total < 500:
-            self.milestones = {20, 40, 60, 80, 100}
-        elif total < 1000:
-            self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
-        else:
-            self.milestones = {
-                5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
-                55, 60, 65, 70, 75, 80, 85, 90, 95, 100,
-            }
-
-        self.heartbeat_thread = None
+def _format_time(seconds: float) -> str:
+    """Format seconds as human-readable string."""
+    if seconds < 60:
+        return f"{seconds:.0f}s"
+    elif seconds < 3600:
+        mins = seconds / 60
+        return f"{mins:.1f}m"
+    else:
+        hours = seconds / 3600
+        return f"{hours:.1f}h"
+
+
+def _get_progress_milestones(total_features: int) -> set:
+    """
+    Get progress milestones based on dataset size.
+
+    Parameters
+    ----------
+    total_features : int
+        Total number of features being processed
+
+    Returns
+    -------
+    set
+        Set of percentage milestones to show
+    """
+    # Set milestones based on feature count
+    if total_features < 250:
+        return set(range(20, 101, 20))  # Every 20%: {20, 40, 60, 80, 100}
+    elif total_features < 1000:
+        return set(range(10, 101, 10))  # Every 10%
+    elif total_features < 10000:
+        return set(range(5, 101, 5))  # Every 5%
+    elif total_features < 50000:
+        return set(range(2, 101, 2))  # Every 2%
+    else:
+        return set(range(1, 101))  # Every 1%

-        try:
-            import json
-            import os
-
-            elapsed = time.time() - self.start_time
-            percent = (self.completed / self.total * 100) if self.total > 0 else 0
-            rate = self.completed / elapsed if elapsed > 0 else 0
-            eta = (
-                (self.total - self.completed) / rate * 1.15
-                if rate > 0 and percent >= 5
-                else None
-            )
-                    "elapsed_sec": round(elapsed),
-                    "eta_sec": round(eta) if eta else None,
-                    "updated_at": datetime.now().isoformat(),
-                },
-                f,
-            )
-            os.replace(temp_file, self.status_file)
-        except Exception:
-            pass
-
-    def start_heartbeat(self) -> None:
-        """Start background heartbeat thread for time-based progress updates."""
-        if self.heartbeat_thread is None or not self.heartbeat_thread.is_alive():
-            self.heartbeat_stop.clear()
-            self.heartbeat_thread = threading.Thread(
-                target=self._heartbeat_loop, daemon=True
-            )
-            self.heartbeat_thread.start()
-            # Write initial status
-            self._write_status_file(status="processing")
-
-    def _heartbeat_loop(self) -> None:
-        """Background loop that logs progress at time intervals."""
-        while not self.heartbeat_stop.wait(self.heartbeat_interval):
-            with self.lock:
-                # Only log if we haven't shown a milestone recently
-                time_since_update = time.time() - self.last_update_time
-                if (
-                    time_since_update >= self.heartbeat_interval
-                    and self.completed < self.total
-                ):
-                    elapsed = time.time() - self.start_time
-                    percent = int((self.completed / self.total) * 100)
-                    elapsed_str = self._format_time(elapsed)
-                    self.logger.info(
-                        f"[Processing] {self.completed:,}/{self.total:,} batches ({percent}%) | "
-                        f"Elapsed: {elapsed_str}"
-                    )
-                    self.last_update_time = time.time()
-
-    def update(self, n: int = 1) -> None:
-        """
-        Update progress count.
-
-        Parameters
-        ----------
-        n : int
-            Number of items completed
-        """
-        with self.lock:
-            self.completed += n
-            percent = int((self.completed / self.total) * 100)
-
-            # Show milestone messages (5%, 10%, 15%... for large datasets)
-            for milestone in sorted(self.milestones):
-                if percent >= milestone and milestone not in self.shown_milestones:
-                    self.shown_milestones.add(milestone)
-
-                    # Calculate time metrics
-                    elapsed = time.time() - self.start_time
-                    rate = self.completed / elapsed if elapsed > 0 else 0
-                    remaining_items = self.total - self.completed
-
-                    # Calculate ETA with padding for overhead (loading, joins, etc.)
-                    # Don't show ETA until we have some samples (at least 5% complete)
-                    if rate > 0 and self.completed >= max(5, self.total * 0.05):
-                        eta_seconds = (
-                            remaining_items / rate
-                        ) * 1.15  # Add 15% padding for overhead
-                    else:
-                        eta_seconds = 0
-
-                    # Format time strings
-                    eta_str = (
-                        self._format_time(eta_seconds)
-                        if eta_seconds > 0
-                        else "calculating..."
-                    )
-                    elapsed_str = self._format_time(elapsed)
-
-            self.heartbeat_stop.set()
-            if self.heartbeat_thread and self.heartbeat_thread.is_alive():
-                self.heartbeat_thread.join(timeout=1)
-
-            self._write_status_file(status="completed")
+def _log_progress(
+    completed: int,
+    total: int,
+    milestones: set,
+    shown_milestones: set,
+    start_time: float,
+    logger: logging.Logger,
+) -> None:
+    """
+    Log progress at milestone percentages.
+
+    Parameters
+    ----------
+    completed : int
+        Number of batches completed
+    total : int
+        Total number of batches
+    milestones : set
+        Set of percentage milestones to show
+    shown_milestones : set
+        Set of milestones already shown (modified in place)
+    start_time : float
+        Start time from time.time()
+    logger : logging.Logger
+        Logger for output
+    """
+    percent = int((completed / total) * 100)
+
+    # Check for new milestones reached
+    for milestone in sorted(milestones):
+        if percent >= milestone and milestone not in shown_milestones:
+            shown_milestones.add(milestone)
+
+            # Calculate time metrics
+            elapsed = time.time() - start_time
+            rate = completed / elapsed if elapsed > 0 else 0
+            remaining_items = total - completed
+
+            # Calculate ETA with padding for overhead (loading, joins, etc.)
+            # Don't show ETA until we have some samples (at least 5% complete)
+            if rate > 0 and completed >= max(5, total * 0.05):
+                eta_seconds = (remaining_items / rate) * 1.15  # Add 15% padding
+            else:
+                eta_seconds = 0
+
+            # Format time strings
+            eta_str = _format_time(eta_seconds) if eta_seconds > 0 else "calculating..."
+            elapsed_str = _format_time(elapsed)
+
+            # Build progress message
+            msg = f"Progress: {completed:,}/{total:,} batches ({percent}%)"
+            if percent < 100:
+                msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
+            else:
+                msg += f" | Total time: {elapsed_str}"
+
+            logger.info(msg)


 # ============================================================================
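For orientation, a small self-contained sketch of how the three new module-level helpers fit together (importing the underscore-prefixed names is illustrative only; they are private to the module):

    import logging
    import time

    from openforis_whisp.advanced_stats import (
        _format_time,
        _get_progress_milestones,
        _log_progress,
    )

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("whisp")

    print(_format_time(42))    # "42s"
    print(_format_time(5400))  # "1.5h"

    milestones = _get_progress_milestones(1_200)  # 1,000-9,999 features -> every 5%
    shown = set()
    start = time.time()
    total_batches = 120
    for completed in range(1, total_batches + 1):
        # shown_milestones is mutated in place, so each milestone logs only once
        _log_progress(completed, total_batches, milestones, shown, start, logger)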
@@ -1218,7 +1067,6 @@ def whisp_stats_geojson_to_df_concurrent(
     logger: logging.Logger = None,
     # Format parameters (auto-detect from config if not provided)
     decimal_places: int = None,
-    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently to compute Whisp statistics with automatic formatting.
@@ -1359,11 +1207,12 @@ def whisp_stats_geojson_to_df_concurrent(
     # Setup semaphore for EE concurrency control
     ee_semaphore = threading.BoundedSemaphore(max_concurrent)

-    # Progress
-    )
+    # Progress tracking setup
+    progress_lock = threading.Lock()
+    completed_batches = 0
+    milestones = _get_progress_milestones(len(gdf_for_ee))
+    shown_milestones = set()
+    start_time = time.time()

     results = []
@@ -1477,7 +1326,18 @@ def whisp_stats_geojson_to_df_concurrent(
                         suffixes=("_ee", "_client"),
                     )
                     results.append(merged)
-
+
+                    # Update progress
+                    with progress_lock:
+                        completed_batches += 1
+                        _log_progress(
+                            completed_batches,
+                            len(batches),
+                            milestones,
+                            shown_milestones,
+                            start_time,
+                            logger,
+                        )

                 except Exception as e:
                     # Batch failed - fail fast with clear guidance
@@ -1492,15 +1352,18 @@ def whisp_stats_geojson_to_df_concurrent(
                     batch_errors.append((batch_idx, original_batch, error_msg))
     except (KeyboardInterrupt, SystemExit) as interrupt:
         logger.warning("Processing interrupted by user")
-        # Update status file with interrupted state
-        progress._write_status_file(status="interrupted")
         raise interrupt
     finally:
         # Restore logger levels
         fiona_logger.setLevel(old_fiona_level)
         pyogrio_logger.setLevel(old_pyogrio_level)

-
+    # Log completion
+    total_time = time.time() - start_time
+    time_str = _format_time(total_time)
+    logger.info(
+        f"Processing complete: {completed_batches:,}/{len(batches):,} batches in {time_str}"
+    )

     # If we have batch errors after retry attempts, fail the entire process
     if batch_errors:
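The hunks above replace the old ProgressTracker object with three plain pieces of shared state: a lock, an integer counter, and a set of already-shown milestones. A minimal sketch of the same lock-plus-counter pattern outside Whisp (all names here are illustrative):

    import threading
    from concurrent.futures import ThreadPoolExecutor

    lock = threading.Lock()
    completed = 0

    def process_batch(batch):
        """Placeholder for real batch work; only the counter update matters here."""
        global completed
        with lock:  # serialize updates coming from worker threads
            completed += 1

    with ThreadPoolExecutor(max_workers=8) as pool:
        list(pool.map(process_batch, range(120)))

    print(completed)  # 120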
@@ -1577,7 +1440,9 @@ def whisp_stats_geojson_to_df_concurrent(

     # Retry batch processing with validated image
     results = []
-
+    retry_completed = 0
+    retry_shown = set()
+    retry_start = time.time()

     # Suppress fiona logging during batch processing (threads create new loggers)
     fiona_logger = logging.getLogger("fiona")
@@ -1609,13 +1474,28 @@ def whisp_stats_geojson_to_df_concurrent(
                         suffixes=("", "_client"),
                     )
                     results.append(merged)
-
+
+                    # Update retry progress
+                    with progress_lock:
+                        retry_completed += 1
+                        _log_progress(
+                            retry_completed,
+                            len(batches),
+                            milestones,
+                            retry_shown,
+                            retry_start,
+                            logger,
+                        )
                 except Exception as e:
                     logger.error(
                         f"Batch processing error (retry): {str(e)[:100]}"
                     )

-
+        # Log retry completion
+        retry_time = time.time() - retry_start
+        logger.info(
+            f"Retry complete: {retry_completed:,}/{len(batches):,} batches in {_format_time(retry_time)}"
+        )
     finally:
         # Restore logger levels
         fiona_logger.setLevel(old_fiona_level)
@@ -2138,7 +2018,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
-    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently with automatic formatting and validation.
@@ -2231,7 +2110,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         max_retries=max_retries,
         add_metadata_server=add_metadata_server,
         logger=logger,
-        status_file=status_file,
     )

     # Step 2: Format the output
@@ -2347,7 +2225,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
-    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON sequentially with automatic formatting and validation.
@@ -2552,7 +2429,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
-    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON to Whisp statistics with optimized fast processing.
@@ -2654,7 +2530,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
             geometry_audit_trail=geometry_audit_trail,
-            status_file=status_file,
         )
     else:  # sequential
         logger.debug("Routing to sequential processing...")
@@ -2672,5 +2547,4 @@ def whisp_formatted_stats_geojson_to_df_fast(
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
             geometry_audit_trail=geometry_audit_trail,
-            status_file=status_file,
         )
openforis_whisp/data_checks.py
CHANGED
@@ -1,8 +1,9 @@
 """
 Data validation and constraint checking functions for WHISP.

-Provides validation functions to check GeoJSON data against defined limits
+Provides validation functions to check GeoJSON data against user defined limits
 and thresholds, raising informative errors when constraints are violated.
+Note: Defaults in each function are not necessarily enforced.
 """

 import json
@@ -13,26 +14,6 @@ from shapely.geometry import Polygon as ShapelyPolygon, shape as shapely_shape
 # (estimation preferred here as allows efficient processing speed and limits overhead of checking file)


-def _convert_projected_area_to_ha(area_sq_units: float, crs: str = None) -> float:
-    """
-    Convert area from projected CRS units to hectares.
-
-    Most projected CRS use meters as units, so:
-    - area_sq_units is in square meters
-    - 1 hectare = 10,000 m²
-
-    Args:
-        area_sq_units: Area in square units of the projection (typically square meters)
-        crs: CRS string for reference (e.g., 'EPSG:3857'). Used for validation.
-
-    Returns:
-        Area in hectares
-    """
-    # Standard conversion: 1 hectare = 10,000 m²
-    # Most projected CRS use meters, so this works universally
-    return area_sq_units / 10000
-
-
 def _estimate_area_from_bounds(coords, area_conversion_factor: float) -> float:
     """
     Estimate area from bounding box when actual area calculation fails.
@@ -75,6 +56,8 @@ def analyze_geojson(
     metrics=[
         "count",
         "geometry_types",
+        "crs",
+        "file_size_mb",
         "min_area_ha",
         "mean_area_ha",
         "median_area_ha",
@@ -107,6 +90,8 @@
         Which metrics to return. Available metrics:
         - 'count': number of polygons
         - 'geometry_types': dict of geometry type counts (e.g., {'Polygon': 95, 'MultiPolygon': 5})
+        - 'crs': coordinate reference system (e.g., 'EPSG:4326') - only available when geojson_data is a file path
+        - 'file_size_mb': file size in megabytes (only available when geojson_data is a file path)
         - 'min_area_ha', 'mean_area_ha', 'median_area_ha', 'max_area_ha': area statistics (hectares) (accurate only at equator)
         - 'area_percentiles': dict with p25, p50 (median), p75, p90 area values (accurate only at equator)
         - 'min_vertices', 'mean_vertices', 'median_vertices', 'max_vertices': vertex count statistics
@@ -123,6 +108,8 @@
     dict with requested metrics:
         - 'count': number of polygons
         - 'geometry_types': {'Polygon': int, 'MultiPolygon': int, ...}
+        - 'crs': coordinate reference system string (e.g., 'EPSG:4326', only when geojson_data is a file path)
+        - 'file_size_mb': file size in megabytes (float, only when geojson_data is a file path)
         - 'min_area_ha': minimum area among all polygons in hectares
         - 'mean_area_ha': mean area per polygon in hectares (calculated from coordinates)
         - 'median_area_ha': median area among all polygons in hectares
@@ -134,8 +121,28 @@
         - 'max_vertices': maximum number of vertices among all polygons
         - 'vertex_percentiles': {'p25': int, 'p50': int, 'p75': int, 'p90': int}
     """
+    # Handle None metrics (use all default metrics)
+    if metrics is None:
+        metrics = [
+            "count",
+            "geometry_types",
+            "crs",
+            "file_size_mb",
+            "min_area_ha",
+            "mean_area_ha",
+            "median_area_ha",
+            "max_area_ha",
+            "area_percentiles",
+            "min_vertices",
+            "mean_vertices",
+            "median_vertices",
+            "max_vertices",
+            "vertex_percentiles",
+        ]
+
     results = {}
     crs_warning = None
+    detected_crs = None
     file_path = None

     try:
@@ -145,6 +152,35 @@
         if not file_path.exists():
             raise FileNotFoundError(f"GeoJSON file not found: {file_path}")

+        # Quick CRS detection BEFORE loading full file (if requested)
+        if "crs" in metrics:
+            try:
+                # Use fiona which only reads file metadata (fast, doesn't load features)
+                import fiona
+
+                with fiona.open(file_path) as src:
+                    if src.crs:
+                        # Convert fiona CRS dict to EPSG string
+                        crs_dict = src.crs
+                        if "init" in crs_dict:
+                            # Old format: {'init': 'epsg:4326'}
+                            detected_crs = (
+                                crs_dict["init"].upper().replace("EPSG:", "EPSG:")
+                            )
+                        elif isinstance(crs_dict, dict) and crs_dict:
+                            # Try to extract EPSG from dict (json already imported at top)
+                            detected_crs = json.dumps(crs_dict)
+                    else:
+                        # No CRS means WGS84 by GeoJSON spec
+                        detected_crs = "EPSG:4326"
+
+                # Check if CRS is WGS84
+                if detected_crs and detected_crs != "EPSG:4326":
+                    crs_warning = f"⚠️ CRS is {detected_crs}, not EPSG:4326. Area metrics will be inaccurate. Data will be auto-reprojected during processing."
+            except Exception as e:
+                # If fiona fails, assume WGS84 (GeoJSON default)
+                detected_crs = "EPSG:4326"
+
         # Try UTF-8 first (most common), then fall back to auto-detection
         try:
             with open(file_path, "r", encoding="utf-8") as f:
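As a standalone illustration of the metadata-only read the new block relies on ("plots.geojson" is a hypothetical input file):

    import fiona

    # fiona.open() reads only the layer header, so this stays cheap for big files.
    with fiona.open("plots.geojson") as src:
        crs = src.crs  # empty/None means WGS84 per the GeoJSON spec
    print(crs or "EPSG:4326")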
@@ -166,26 +202,29 @@
                 with open(file_path, "r", encoding="latin-1") as f:
                     geojson_data = json.load(f)

-        # Detect CRS from file if available
-        try:
-            import geopandas as gpd
-
-            gdf = gpd.read_file(file_path)
-            if gdf.crs and gdf.crs != "EPSG:4326":
-                crs_warning = f"⚠️ CRS is {gdf.crs}, not EPSG:4326. Area metrics will be inaccurate. Data will be auto-reprojected during processing."
-        except Exception:
-            pass  # If we can't detect CRS, continue without warning
-
     features = geojson_data.get("features", [])

-    # Add
-    if
+    # Add file size if requested and available
+    if "file_size_mb" in metrics and file_path is not None:
+        size_bytes = file_path.stat().st_size
+        results["file_size_mb"] = round(size_bytes / (1024 * 1024), 2)
+
+    # Add CRS info if requested and detected
+    if "crs" in metrics and detected_crs:
+        results["crs"] = detected_crs
+        # Add warning if not WGS84
+        if crs_warning:
+            results["crs_warning"] = crs_warning
+            print(crs_warning)

     if "count" in metrics:
         results["count"] = len(features)

+    # Initialize tracking variables (used in quality logging later)
+    bbox_fallback_count = 0
+    geometry_skip_count = 0
+    polygon_type_stats = {}
+
     # Single sweep through features - compute all area/vertex metrics at once
     if any(
         m in metrics
@@ -208,11 +247,6 @@
         geometry_type_counts = {}
         valid_polygons = 0

-        # Tracking for fallback geometries
-        bbox_fallback_count = 0  # Geometries that used bounding box estimate
-        geometry_skip_count = 0  # Geometries completely skipped
-        polygon_type_stats = {}  # Track stats by geometry type
-
         # Detect CRS to determine area conversion factor
         area_conversion_factor = 1232100  # Default: WGS84 (degrees to ha)
         detected_crs = None
@@ -489,6 +523,7 @@ def _check_metric_constraints(
     max_max_area_ha=None,
     max_mean_vertices=None,
     max_max_vertices=10_000,
+    max_file_size_mb=None,
 ):
     """
     Check if computed metrics violate any constraints.
@@ -499,7 +534,7 @@
     -----------
     metrics : dict
         Dictionary of computed metrics with keys: count, mean_area_ha, max_area_ha,
-        mean_vertices, max_vertices
+        mean_vertices, max_vertices, file_size_mb (optional)
     max_polygon_count : int
         Maximum allowed number of polygons
     max_mean_area_ha : float
@@ -510,6 +545,8 @@
         Maximum allowed mean vertices per polygon
     max_max_vertices : int, optional
         Maximum allowed vertices per polygon
+    max_file_size_mb : float, optional
+        Maximum allowed file size in megabytes

     Returns:
     --------
@@ -523,6 +560,7 @@
     max_area = metrics["max_area_ha"]
     mean_vertices = metrics["mean_vertices"]
     max_vertices_value = metrics["max_vertices"]
+    file_size_mb = metrics.get("file_size_mb")

     if polygon_count > max_polygon_count:
         violations.append(
@@ -549,41 +587,63 @@
             f"Max vertices ({max_vertices_value:,}) exceeds limit ({max_max_vertices:,})"
         )

+    if (
+        max_file_size_mb is not None
+        and file_size_mb is not None
+        and file_size_mb > max_file_size_mb
+    ):
+        violations.append(
+            f"File size ({file_size_mb:.2f} MB) exceeds limit ({max_file_size_mb:.2f} MB)"
+        )
+
     return violations


-def validate_geojson_constraints(
-    geojson_data: Path | str | dict,
+def check_geojson_limits(
+    geojson_data: Path | str | dict = None,
+    analysis_results: dict = None,
     max_polygon_count=250_000,
-    max_mean_area_ha=
-    max_max_area_ha=
-    max_mean_vertices=
-    max_max_vertices=
+    max_mean_area_ha=50_000,
+    max_max_area_ha=50_000,
+    max_mean_vertices=50_000,
+    max_max_vertices=50_000,
+    max_file_size_mb=None,
+    allowed_crs=["EPSG:4326"],
     verbose=True,
 ):
     """
-
+    Check GeoJSON data against defined limits for processing readiness.

     Raises ValueError if any metrics exceed the specified limits.
     Uses analyze_geojson to compute metrics efficiently in a single sweep.

     Parameters:
     -----------
-    geojson_data : Path | str | dict
+    geojson_data : Path | str | dict, optional
         GeoJSON FeatureCollection to validate. Can be:
         - dict: GeoJSON FeatureCollection dictionary
         - str: Path to GeoJSON file as string
         - Path: pathlib.Path to GeoJSON file
+        Note: Cannot be used together with analysis_results
+    analysis_results : dict, optional
+        Pre-computed results from analyze_geojson(). Must contain keys:
+        'count', 'mean_area_ha', 'max_area_ha', 'mean_vertices', 'max_vertices'
+        Note: Cannot be used together with geojson_data
     max_polygon_count : int, optional
         Maximum allowed number of polygons (default: 250,000)
     max_mean_area_ha : float, optional
-        Maximum allowed mean area per polygon in hectares (default:
+        Maximum allowed mean area per polygon in hectares (default: 50,000)
     max_max_area_ha : float, optional
-        Maximum allowed maximum area per polygon in hectares (default:
+        Maximum allowed maximum area per polygon in hectares (default: 50,000)
     max_mean_vertices : float, optional
-        Maximum allowed mean vertices per polygon (default:
+        Maximum allowed mean vertices per polygon (default: 50,000)
     max_max_vertices : int, optional
-        Maximum allowed vertices per polygon (default:
+        Maximum allowed vertices per polygon (default: 50,000)
+    max_file_size_mb : float, optional
+        Maximum allowed file size in megabytes (default: None, no limit)
+    allowed_crs : list, optional
+        List of allowed coordinate reference systems (default: ["EPSG:4326"])
+        Set to None to skip CRS validation
     verbose : bool
         Print validation results (default: True)
@@ -603,22 +663,25 @@ def validate_geojson_constraints(
     Raises:
     -------
     ValueError
-        If any constraint is violated
+        If any constraint is violated, or if both geojson_data and analysis_results are provided,
+        or if neither is provided
     """
-    with open(file_path, "r") as f:
-        geojson_data = json.load(f)
+    # Validate input parameters
+    if geojson_data is not None and analysis_results is not None:
+        raise ValueError(
+            "Cannot provide both 'geojson_data' and 'analysis_results'. "
+            "Please provide only one input source."
+        )
+
+    if geojson_data is None and analysis_results is None:
+        raise ValueError(
+            "Must provide either 'geojson_data' or 'analysis_results'. "
+            "Both cannot be None."
+        )

     if verbose:
         print("\n" + "=" * 80)
-        print("GEOJSON
+        print("GEOJSON LIMITS CHECK")
         print("=" * 80)
         print("\nConstraint Limits:")
         print(f"  - Max polygon count: {max_polygon_count:,}")
@@ -629,90 +692,47 @@ def validate_geojson_constraints(
         print(f"  - Max mean vertices: {max_mean_vertices:,}")
     if max_max_vertices is not None:
         print(f"  - Max vertices per polygon: {max_max_vertices:,}")
-            coords = feature["geometry"]["coordinates"]
-            geom_type = feature["geometry"]["type"]
-
-            if geom_type == "Polygon":
-                # Count vertices
-                feature_vertices = 0
-                for ring in coords:
-                    feature_vertices += len(ring)
-                total_vertices += feature_vertices
-                max_vertices_value = max(max_vertices_value, feature_vertices)
-
-                # Calculate area
-                try:
-                    poly = ShapelyPolygon(coords[0])
-                    area_ha = abs(poly.area) * 1232100
-                    total_area += area_ha
-                    max_area = max(max_area, area_ha)
-                except:
-                    pass
-                valid_polygons += 1
-
-            elif geom_type == "MultiPolygon":
-                # Count vertices
-                feature_vertices = 0
-                for polygon in coords:
-                    for ring in polygon:
-                        feature_vertices += len(ring)
-                total_vertices += feature_vertices
-                max_vertices_value = max(max_vertices_value, feature_vertices)
-
-                # Calculate area
-                try:
-                    for polygon in coords:
-                        poly = ShapelyPolygon(polygon[0])
-                        area_ha = abs(poly.area) * 1232100
-                        total_area += area_ha
-                        max_area = max(max_area, area_ha)
-                except:
-                    pass
-                valid_polygons += 1
-
-        except:
-            continue
-
-    # Compute means
-    polygon_count = len(features)
-    mean_area = total_area / valid_polygons if valid_polygons > 0 else 0
-    mean_vertices = total_vertices / valid_polygons if valid_polygons > 0 else 0
-
+    if max_file_size_mb is not None:
+        print(f"  - Max file size (MB): {max_file_size_mb:.2f}")
+
+    # Get metrics either from analysis_results or by analyzing geojson_data
+    if analysis_results is not None:
+        # Use pre-computed analysis results
+        metrics = analysis_results
+    else:
+        # Use analyze_geojson to compute all required metrics in a single sweep
+        metrics_to_compute = [
+            "count",
+            "file_size_mb",
+            "mean_area_ha",
+            "max_area_ha",
+            "mean_vertices",
+            "max_vertices",
+        ]
+        # Add CRS if validation is requested
+        if allowed_crs is not None:
+            metrics_to_compute.append("crs")
+        metrics = analyze_geojson(geojson_data, metrics=metrics_to_compute)
+
+    # Build results dict with required keys
     results = {
-        "count":
-        "
-        "
-        "
-        "
+        "count": metrics.get("count", 0),
+        "file_size_mb": metrics.get("file_size_mb"),
+        "mean_area_ha": metrics.get("mean_area_ha", 0),
+        "max_area_ha": metrics.get("max_area_ha", 0),
+        "mean_vertices": metrics.get("mean_vertices", 0),
+        "max_vertices": metrics.get("max_vertices", 0),
+        "crs": metrics.get("crs"),
         "valid": True,
     }

     if verbose:
         print("\nComputed Metrics:")
         print(f"  - Polygon count: {results['count']:,}")
+        if results.get("file_size_mb") is not None:
+            print(f"  - File size (MB): {results['file_size_mb']:,.2f}")
+        if results.get("crs") is not None:
+            print(f"  - CRS: {results['crs']}")
         print(f"  - Mean area (ha): {results['mean_area_ha']:,}")
         print(f"  - Max area (ha): {results['max_area_ha']:,}")
         print(f"  - Mean vertices: {results['mean_vertices']:,}")
@@ -726,34 +746,48 @@ def validate_geojson_constraints(
         max_max_area_ha=max_max_area_ha,
         max_mean_vertices=max_mean_vertices,
         max_max_vertices=max_max_vertices,
+        max_file_size_mb=max_file_size_mb,
     )

+    # Check CRS if validation is requested
+    if allowed_crs is not None and results.get("crs"):
+        if results["crs"] not in allowed_crs:
+            violations.append(
+                f"CRS '{results['crs']}' is not in allowed list: {allowed_crs}"
+            )
+
     # Report results
     if verbose:
         print("\n" + "=" * 80)
         if violations:
-            print("
+            print("LIMITS CHECK FAILED")
             print("=" * 80)
             for violation in violations:
                 print(f"\n{violation}")
             results["valid"] = False
         else:
-            print("
+            print("LIMITS CHECK PASSED")
             print("=" * 80)
             print("\nAll metrics within acceptable limits")

     # Raise error with detailed message if any constraint violated
     if violations:
-        error_message = "
+        error_message = "GeoJSON limits check failed:\n" + "\n".join(violations)
         raise ValueError(error_message)

     return results


+# Backward compatibility aliases
+screen_geojson = check_geojson_limits
+validate_geojson_constraints = check_geojson_limits
+
+
 def suggest_processing_mode(
     feature_count,
     mean_area_ha=None,
     mean_vertices=None,
+    file_size_mb=None,
     feature_type="polygon",
     verbose=True,
 ):
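A possible call pattern for the renamed checker (sketch only; "plots.geojson" is a placeholder path). Either hand it the data directly or a pre-computed analyze_geojson() result, never both:

    from openforis_whisp import analyze_geojson, check_geojson_limits

    analysis = analyze_geojson("plots.geojson")  # one sweep computes all metrics
    report = check_geojson_limits(
        analysis_results=analysis,  # reuse the sweep instead of re-reading the file
        max_polygon_count=250_000,
        max_file_size_mb=10,
    )
    print(report["valid"])  # a ValueError is raised instead if any limit is exceeded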
@@ -762,6 +796,9 @@ def suggest_processing_mode(

     Decision thresholds from comprehensive benchmark data (Nov 2025):

+    FILE SIZE:
+    - Files >= 10 MB: recommend sequential mode (avoids payload size limits)
+
     POINTS:
     - Break-even: 750-1000 features
     - Sequential faster: < 750 features
@@ -785,6 +822,8 @@
         Mean area per polygon in hectares (required for polygons, ignored for points)
     mean_vertices : float, optional
         Mean number of vertices per polygon (influences decision for complex geometries)
+    file_size_mb : float, optional
+        File size in megabytes (if >= 10 MB, recommends sequential mode)
     feature_type : str
         'polygon', 'multipolygon', or 'point' (default: 'polygon')
     verbose : bool
@@ -795,6 +834,14 @@
         str: 'concurrent' or 'sequential'
     """

+    # File size check: large files should use sequential mode
+    if file_size_mb is not None and file_size_mb >= 10:
+        if verbose:
+            print(f"\nMETHOD RECOMMENDATION (File Size Constraint)")
+            print(f"  File size: {file_size_mb:.2f} MB (>= 10 MB threshold)")
+            print(f"  Method: SEQUENTIAL (avoids payload size limits)")
+        return "sequential"
+
     # Points: simple threshold-based decision
     if feature_type == "point":
         breakeven = 750
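A hedged example of the new short-circuit (the 10 MB threshold comes straight from the hunk above):

    from openforis_whisp import suggest_processing_mode

    mode = suggest_processing_mode(
        feature_count=5_000,
        mean_area_ha=4.0,
        file_size_mb=25.0,  # >= 10 MB, so the size check decides first
    )
    assert mode == "sequential"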
openforis_whisp/datasets.py
CHANGED
@@ -61,8 +61,9 @@ def g_esa_worldcover_trees_prep():

 # EUFO_2020
 def g_jrc_gfc_2020_prep():
-
-
+    # JRC GFC2020 V3 is a single Image with band 'Map'
+    jrc_gfc2020 = ee.Image("JRC/GFC2020/V3").select("Map")
+    return jrc_gfc2020.rename("EUFO_2020").selfMask()


 # GFC_TC_2020
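For reference, the updated prep boils down to this Earth Engine chain (sketch; assumes an authenticated EE session, and ee.Initialize may need a project argument depending on your setup):

    import ee

    ee.Initialize()  # may require project="..." in newer EE configurations
    eufo = ee.Image("JRC/GFC2020/V3").select("Map").rename("EUFO_2020").selfMask()
    print(eufo.bandNames().getInfo())  # ['EUFO_2020']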
openforis_whisp/stats.py
CHANGED
@@ -165,7 +165,6 @@ def whisp_formatted_stats_geojson_to_df(
     batch_size: int = 10,
     max_concurrent: int = 20,
     geometry_audit_trail: bool = False,
-    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Main entry point for converting GeoJSON to Whisp statistics.

@@ -219,13 +218,6 @@

         Processing metadata stored in df.attrs['processing_metadata'].
         These columns enable full transparency for geometry modifications during processing.
-    status_file : str, optional
-        Path to JSON status file or directory for real-time progress tracking.
-        If a directory is provided, creates 'whisp_processing_status.json' in that directory.
-        Updates every 3 minutes and at progress milestones (5%, 10%, etc.).
-        Format: {"status": "processing", "progress": "450/1000", "percent": 45.0,
-        "elapsed_sec": 120, "eta_sec": 145, "updated_at": "2025-11-13T14:23:45"}
-        Most useful for large concurrent jobs. Works in both concurrent and sequential modes.

     Returns
     -------

@@ -315,7 +307,6 @@ def whisp_formatted_stats_geojson_to_df(
             batch_size=batch_size,
             max_concurrent=max_concurrent,
             geometry_audit_trail=geometry_audit_trail,
-            status_file=status_file,
         )
     else:
         raise ValueError(
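Callers that previously passed status_file now simply drop the argument; progress arrives through the "whisp" logger instead. A sketch (placeholder input path; assumes the top-level re-export and an initialized Earth Engine session):

    import logging

    import openforis_whisp as whisp

    logging.getLogger("whisp").setLevel(logging.INFO)  # progress messages land here
    df = whisp.whisp_formatted_stats_geojson_to_df("plots.geojson")
    print(df.head())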
{openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a7.dist-info}/METADATA
CHANGED

@@ -1,8 +1,9 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: openforis-whisp
-Version: 3.0.
+Version: 3.0.0a7
 Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
 License: MIT
+License-File: LICENSE
 Keywords: whisp,geospatial,data-processing
 Author: Andy Arnell
 Author-email: andrew.arnell@fao.org

@@ -16,6 +17,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Dist: country_converter (>=0.7,<2.0.0)
 Requires-Dist: earthengine-api

@@ -69,11 +71,11 @@ Description-Content-Type: text/markdown
 ***Whisp*** can currently be used directly or implemented in your own code through three different pathways:


-1. The Whisp App with its simple interface can be used [right here](https://whisp.openforis.org/) or called from other software by [API](https://whisp.openforis.org/documentation/api-guide). The Whisp App currently supports the processing of up to
+1. The Whisp App with its simple interface can be used [right here](https://whisp.openforis.org/) or called from other software by [API](https://whisp.openforis.org/documentation/api-guide). The Whisp App currently supports the processing of up to 3,000 geometries per job. The original JS & Python code behind the Whisp App and API can be found [here](https://github.com/forestdatapartnership/whisp-app).

 2. [Whisp in Earthmap](https://whisp.earthmap.org/?aoi=WHISP&boundary=plot1&layers=%7B%22CocoaETH%22%3A%7B%22opacity%22%3A1%7D%2C%22JRCForestMask%22%3A%7B%22opacity%22%3A1%7D%2C%22planet_rgb%22%3A%7B%22opacity%22%3A1%2C%22date%22%3A%222020-12%22%7D%7D&map=%7B%22center%22%3A%7B%22lat%22%3A7%2C%22lng%22%3A4%7D%2C%22zoom%22%3A3%2C%22mapType%22%3A%22satellite%22%7D&statisticsOpen=true) supports the visualization of geometries on actual maps with the possibility to toggle different relevant map products around tree cover, commodities and deforestation. It is practical for demonstration purposes and spot checks of single geometries but not recommended for larger datasets.

-3. Datasets of any size, especially when holding more than
+3. Datasets of any size, especially when holding more than 3,000 geometries, can be analyzed with Whisp through the [python package on pip](https://pypi.org/project/openforis-whisp/). See example [Colab Notebook](https://github.com/forestdatapartnership/whisp/blob/main/notebooks/Colab_whisp_geojson_to_csv.ipynb) for implementation with a geojson input. For the detailed procedure please go to the section [Whisp notebooks](#whisp_notebooks).

@@ -365,3 +367,4 @@ Please read the [contributing guidelines](contributing_guidelines.md) for good p
 Users can report violations directly to us by emailing the address listed in the "Contact Us" section of the website:
 https://openforis.org/solutions/whisp/

+
{openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a7.dist-info}/RECORD
CHANGED

@@ -1,8 +1,8 @@
-openforis_whisp/__init__.py,sha256=
-openforis_whisp/advanced_stats.py,sha256=
-openforis_whisp/data_checks.py,sha256=
+openforis_whisp/__init__.py,sha256=YihdNrybfFygwcwa2Bis59V7sYpNR9aAxL-VNO4dqEI,3659
+openforis_whisp/advanced_stats.py,sha256=1ZhIwdlZjephXvXVChVrNmouPgN_urXvYXYGeCs0Ay0,99731
+openforis_whisp/data_checks.py,sha256=jxShBiihtX0rel__Vkzu1bZfqgVQIx_l-uPP1OeCaKY,37015
 openforis_whisp/data_conversion.py,sha256=L2IsiUyQUt3aHgSYGbIhgPGwM7eyS3nLVEoNO9YqQeM,21888
-openforis_whisp/datasets.py,sha256=
+openforis_whisp/datasets.py,sha256=05m-8dj1r11CWTQd5xAStV3JEStmfiNuBm2zjyiTr0Y,53898
 openforis_whisp/logger.py,sha256=gFkRTwJDJKIBWcHDOK74Uln3JM7fAybURo7pQpGL790,3395
 openforis_whisp/parameters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 openforis_whisp/parameters/config_runtime.py,sha256=NOo39MAi60XCwEx5pwkS0EHKJBh0XY1q06y4j0HAABg,1421

@@ -12,9 +12,9 @@ openforis_whisp/parameters/lookup_gee_datasets.csv,sha256=7KdnFocEgbZO5m8JmWQchz
 openforis_whisp/pd_schemas.py,sha256=0z-oPmYIDUIn7mNY41W_uUpmTwjoR7e254mOCoHVsOg,2878
 openforis_whisp/reformat.py,sha256=gvhIa-_kTT5BSO8LuVmJ1TQcf_NwheskXboFM9e0KJY,32758
 openforis_whisp/risk.py,sha256=d_Di5XB8BnHdVXG56xdHTcpB4-CIF5vo2ZRMQRG7Pek,34420
-openforis_whisp/stats.py,sha256=
+openforis_whisp/stats.py,sha256=RJ_PJSXyvz9FnoHeQ3tqrfhhWibXjz9AlX27suSKiO4,63319
 openforis_whisp/utils.py,sha256=AISWF-MpfFdYkhd6bei4BViw2Iag20mmq61ykrF9YTk,31287
-openforis_whisp-3.0.0a6.dist-info/LICENSE,sha256=
-openforis_whisp-3.0.0a6.dist-info/METADATA,sha256=
-openforis_whisp-3.0.0a6.dist-info/WHEEL,sha256=
-openforis_whisp-3.0.0a6.dist-info/RECORD,,
+openforis_whisp-3.0.0a7.dist-info/licenses/LICENSE,sha256=nqyqICO95iw_iwzP1t_IIAf7ZX3DPbL_M9WyQfh2q1k,1085
+openforis_whisp-3.0.0a7.dist-info/METADATA,sha256=U-VC2XOZJ1DIz_Ar8ZIuXqJFhasA7NkzufKP_ykl2NY,16760
+openforis_whisp-3.0.0a7.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+openforis_whisp-3.0.0a7.dist-info/RECORD,,

{openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a7.dist-info/licenses}/LICENSE
File without changes