openforis-whisp 3.0.0a6__py3-none-any.whl → 3.0.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +3 -1
- openforis_whisp/advanced_stats.py +213 -338
- openforis_whisp/data_checks.py +191 -144
- openforis_whisp/datasets.py +4 -5
- openforis_whisp/reformat.py +8 -6
- openforis_whisp/risk.py +113 -29
- openforis_whisp/stats.py +0 -9
- {openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a8.dist-info}/METADATA +37 -120
- {openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a8.dist-info}/RECORD +11 -11
- {openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a8.dist-info}/LICENSE +0 -0
- {openforis_whisp-3.0.0a6.dist-info → openforis_whisp-3.0.0a8.dist-info}/WHEEL +0 -0
openforis_whisp/data_checks.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Data validation and constraint checking functions for WHISP.
|
|
3
3
|
|
|
4
|
-
Provides validation functions to check GeoJSON data against defined limits
|
|
4
|
+
Provides validation functions to check GeoJSON data against user defined limits
|
|
5
5
|
and thresholds, raising informative errors when constraints are violated.
|
|
6
|
+
Note: Defaults in each function are not necessarily enforced.
|
|
6
7
|
"""
|
|
7
8
|
|
|
8
9
|
import json
|
|
@@ -13,26 +14,6 @@ from shapely.geometry import Polygon as ShapelyPolygon, shape as shapely_shape
|
|
|
13
14
|
# (estimation preferred here as allows efficient processing speed and limits overhead of checking file)
|
|
14
15
|
|
|
15
16
|
|
|
16
|
-
def _convert_projected_area_to_ha(area_sq_units: float, crs: str = None) -> float:
|
|
17
|
-
"""
|
|
18
|
-
Convert area from projected CRS units to hectares.
|
|
19
|
-
|
|
20
|
-
Most projected CRS use meters as units, so:
|
|
21
|
-
- area_sq_units is in square meters
|
|
22
|
-
- 1 hectare = 10,000 m²
|
|
23
|
-
|
|
24
|
-
Args:
|
|
25
|
-
area_sq_units: Area in square units of the projection (typically square meters)
|
|
26
|
-
crs: CRS string for reference (e.g., 'EPSG:3857'). Used for validation.
|
|
27
|
-
|
|
28
|
-
Returns:
|
|
29
|
-
Area in hectares
|
|
30
|
-
"""
|
|
31
|
-
# Standard conversion: 1 hectare = 10,000 m²
|
|
32
|
-
# Most projected CRS use meters, so this works universally
|
|
33
|
-
return area_sq_units / 10000
|
|
34
|
-
|
|
35
|
-
|
|
36
17
|
def _estimate_area_from_bounds(coords, area_conversion_factor: float) -> float:
|
|
37
18
|
"""
|
|
38
19
|
Estimate area from bounding box when actual area calculation fails.
|
|
@@ -75,6 +56,8 @@ def analyze_geojson(
|
|
|
75
56
|
metrics=[
|
|
76
57
|
"count",
|
|
77
58
|
"geometry_types",
|
|
59
|
+
"crs",
|
|
60
|
+
"file_size_mb",
|
|
78
61
|
"min_area_ha",
|
|
79
62
|
"mean_area_ha",
|
|
80
63
|
"median_area_ha",
|
|
@@ -107,6 +90,8 @@ def analyze_geojson(
|
|
|
107
90
|
Which metrics to return. Available metrics:
|
|
108
91
|
- 'count': number of polygons
|
|
109
92
|
- 'geometry_types': dict of geometry type counts (e.g., {'Polygon': 95, 'MultiPolygon': 5})
|
|
93
|
+
- 'crs': coordinate reference system (e.g., 'EPSG:4326') - only available when geojson_data is a file path
|
|
94
|
+
- 'file_size_mb': file size in megabytes (only available when geojson_data is a file path)
|
|
110
95
|
- 'min_area_ha', 'mean_area_ha', 'median_area_ha', 'max_area_ha': area statistics (hectares) (accurate only at equator)
|
|
111
96
|
- 'area_percentiles': dict with p25, p50 (median), p75, p90 area values (accurate only at equator)
|
|
112
97
|
- 'min_vertices', 'mean_vertices', 'median_vertices', 'max_vertices': vertex count statistics
|
|
@@ -123,6 +108,8 @@ def analyze_geojson(
|
|
|
123
108
|
dict with requested metrics:
|
|
124
109
|
- 'count': number of polygons
|
|
125
110
|
- 'geometry_types': {'Polygon': int, 'MultiPolygon': int, ...}
|
|
111
|
+
- 'crs': coordinate reference system string (e.g., 'EPSG:4326', only when geojson_data is a file path)
|
|
112
|
+
- 'file_size_mb': file size in megabytes (float, only when geojson_data is a file path)
|
|
126
113
|
- 'min_area_ha': minimum area among all polygons in hectares
|
|
127
114
|
- 'mean_area_ha': mean area per polygon in hectares (calculated from coordinates)
|
|
128
115
|
- 'median_area_ha': median area among all polygons in hectares
|
|
@@ -134,8 +121,28 @@ def analyze_geojson(
|
|
|
134
121
|
- 'max_vertices': maximum number of vertices among all polygons
|
|
135
122
|
- 'vertex_percentiles': {'p25': int, 'p50': int, 'p75': int, 'p90': int}
|
|
136
123
|
"""
|
|
124
|
+
# Handle None metrics (use all default metrics)
|
|
125
|
+
if metrics is None:
|
|
126
|
+
metrics = [
|
|
127
|
+
"count",
|
|
128
|
+
"geometry_types",
|
|
129
|
+
"crs",
|
|
130
|
+
"file_size_mb",
|
|
131
|
+
"min_area_ha",
|
|
132
|
+
"mean_area_ha",
|
|
133
|
+
"median_area_ha",
|
|
134
|
+
"max_area_ha",
|
|
135
|
+
"area_percentiles",
|
|
136
|
+
"min_vertices",
|
|
137
|
+
"mean_vertices",
|
|
138
|
+
"median_vertices",
|
|
139
|
+
"max_vertices",
|
|
140
|
+
"vertex_percentiles",
|
|
141
|
+
]
|
|
142
|
+
|
|
137
143
|
results = {}
|
|
138
144
|
crs_warning = None
|
|
145
|
+
detected_crs = None
|
|
139
146
|
file_path = None
|
|
140
147
|
|
|
141
148
|
try:
|
|
@@ -145,6 +152,35 @@ def analyze_geojson(
|
|
|
145
152
|
if not file_path.exists():
|
|
146
153
|
raise FileNotFoundError(f"GeoJSON file not found: {file_path}")
|
|
147
154
|
|
|
155
|
+
# Quick CRS detection BEFORE loading full file (if requested)
|
|
156
|
+
if "crs" in metrics:
|
|
157
|
+
try:
|
|
158
|
+
# Use fiona which only reads file metadata (fast, doesn't load features)
|
|
159
|
+
import fiona
|
|
160
|
+
|
|
161
|
+
with fiona.open(file_path) as src:
|
|
162
|
+
if src.crs:
|
|
163
|
+
# Convert fiona CRS dict to EPSG string
|
|
164
|
+
crs_dict = src.crs
|
|
165
|
+
if "init" in crs_dict:
|
|
166
|
+
# Old format: {'init': 'epsg:4326'}
|
|
167
|
+
detected_crs = (
|
|
168
|
+
crs_dict["init"].upper().replace("EPSG:", "EPSG:")
|
|
169
|
+
)
|
|
170
|
+
elif isinstance(crs_dict, dict) and crs_dict:
|
|
171
|
+
# Try to extract EPSG from dict (json already imported at top)
|
|
172
|
+
detected_crs = json.dumps(crs_dict)
|
|
173
|
+
else:
|
|
174
|
+
# No CRS means WGS84 by GeoJSON spec
|
|
175
|
+
detected_crs = "EPSG:4326"
|
|
176
|
+
|
|
177
|
+
# Check if CRS is WGS84
|
|
178
|
+
if detected_crs and detected_crs != "EPSG:4326":
|
|
179
|
+
crs_warning = f"⚠️ CRS is {detected_crs}, not EPSG:4326. Area metrics will be inaccurate. Data will be auto-reprojected during processing."
|
|
180
|
+
except Exception as e:
|
|
181
|
+
# If fiona fails, assume WGS84 (GeoJSON default)
|
|
182
|
+
detected_crs = "EPSG:4326"
|
|
183
|
+
|
|
148
184
|
# Try UTF-8 first (most common), then fall back to auto-detection
|
|
149
185
|
try:
|
|
150
186
|
with open(file_path, "r", encoding="utf-8") as f:
|
|
@@ -166,26 +202,29 @@ def analyze_geojson(
|
|
|
166
202
|
with open(file_path, "r", encoding="latin-1") as f:
|
|
167
203
|
geojson_data = json.load(f)
|
|
168
204
|
|
|
169
|
-
# Detect CRS from file if available
|
|
170
|
-
try:
|
|
171
|
-
import geopandas as gpd
|
|
172
|
-
|
|
173
|
-
gdf = gpd.read_file(file_path)
|
|
174
|
-
if gdf.crs and gdf.crs != "EPSG:4326":
|
|
175
|
-
crs_warning = f"⚠️ CRS is {gdf.crs}, not EPSG:4326. Area metrics will be inaccurate. Data will be auto-reprojected during processing."
|
|
176
|
-
except Exception:
|
|
177
|
-
pass # If we can't detect CRS, continue without warning
|
|
178
|
-
|
|
179
205
|
features = geojson_data.get("features", [])
|
|
180
206
|
|
|
181
|
-
# Add
|
|
182
|
-
if
|
|
183
|
-
|
|
184
|
-
|
|
207
|
+
# Add file size if requested and available
|
|
208
|
+
if "file_size_mb" in metrics and file_path is not None:
|
|
209
|
+
size_bytes = file_path.stat().st_size
|
|
210
|
+
results["file_size_mb"] = round(size_bytes / (1024 * 1024), 2)
|
|
211
|
+
|
|
212
|
+
# Add CRS info if requested and detected
|
|
213
|
+
if "crs" in metrics and detected_crs:
|
|
214
|
+
results["crs"] = detected_crs
|
|
215
|
+
# Add warning if not WGS84
|
|
216
|
+
if crs_warning:
|
|
217
|
+
results["crs_warning"] = crs_warning
|
|
218
|
+
print(crs_warning)
|
|
185
219
|
|
|
186
220
|
if "count" in metrics:
|
|
187
221
|
results["count"] = len(features)
|
|
188
222
|
|
|
223
|
+
# Initialize tracking variables (used in quality logging later)
|
|
224
|
+
bbox_fallback_count = 0
|
|
225
|
+
geometry_skip_count = 0
|
|
226
|
+
polygon_type_stats = {}
|
|
227
|
+
|
|
189
228
|
# Single sweep through features - compute all area/vertex metrics at once
|
|
190
229
|
if any(
|
|
191
230
|
m in metrics
|
|
@@ -208,11 +247,6 @@ def analyze_geojson(
|
|
|
208
247
|
geometry_type_counts = {}
|
|
209
248
|
valid_polygons = 0
|
|
210
249
|
|
|
211
|
-
# Tracking for fallback geometries
|
|
212
|
-
bbox_fallback_count = 0 # Geometries that used bounding box estimate
|
|
213
|
-
geometry_skip_count = 0 # Geometries completely skipped
|
|
214
|
-
polygon_type_stats = {} # Track stats by geometry type
|
|
215
|
-
|
|
216
250
|
# Detect CRS to determine area conversion factor
|
|
217
251
|
area_conversion_factor = 1232100 # Default: WGS84 (degrees to ha)
|
|
218
252
|
detected_crs = None
|
|
@@ -489,6 +523,7 @@ def _check_metric_constraints(
|
|
|
489
523
|
max_max_area_ha=None,
|
|
490
524
|
max_mean_vertices=None,
|
|
491
525
|
max_max_vertices=10_000,
|
|
526
|
+
max_file_size_mb=None,
|
|
492
527
|
):
|
|
493
528
|
"""
|
|
494
529
|
Check if computed metrics violate any constraints.
|
|
@@ -499,7 +534,7 @@ def _check_metric_constraints(
|
|
|
499
534
|
-----------
|
|
500
535
|
metrics : dict
|
|
501
536
|
Dictionary of computed metrics with keys: count, mean_area_ha, max_area_ha,
|
|
502
|
-
mean_vertices, max_vertices
|
|
537
|
+
mean_vertices, max_vertices, file_size_mb (optional)
|
|
503
538
|
max_polygon_count : int
|
|
504
539
|
Maximum allowed number of polygons
|
|
505
540
|
max_mean_area_ha : float
|
|
@@ -510,6 +545,8 @@ def _check_metric_constraints(
|
|
|
510
545
|
Maximum allowed mean vertices per polygon
|
|
511
546
|
max_max_vertices : int, optional
|
|
512
547
|
Maximum allowed vertices per polygon
|
|
548
|
+
max_file_size_mb : float, optional
|
|
549
|
+
Maximum allowed file size in megabytes
|
|
513
550
|
|
|
514
551
|
Returns:
|
|
515
552
|
--------
|
|
@@ -523,6 +560,7 @@ def _check_metric_constraints(
|
|
|
523
560
|
max_area = metrics["max_area_ha"]
|
|
524
561
|
mean_vertices = metrics["mean_vertices"]
|
|
525
562
|
max_vertices_value = metrics["max_vertices"]
|
|
563
|
+
file_size_mb = metrics.get("file_size_mb")
|
|
526
564
|
|
|
527
565
|
if polygon_count > max_polygon_count:
|
|
528
566
|
violations.append(
|
|
@@ -549,41 +587,63 @@ def _check_metric_constraints(
|
|
|
549
587
|
f"Max vertices ({max_vertices_value:,}) exceeds limit ({max_max_vertices:,})"
|
|
550
588
|
)
|
|
551
589
|
|
|
590
|
+
if (
|
|
591
|
+
max_file_size_mb is not None
|
|
592
|
+
and file_size_mb is not None
|
|
593
|
+
and file_size_mb > max_file_size_mb
|
|
594
|
+
):
|
|
595
|
+
violations.append(
|
|
596
|
+
f"File size ({file_size_mb:.2f} MB) exceeds limit ({max_file_size_mb:.2f} MB)"
|
|
597
|
+
)
|
|
598
|
+
|
|
552
599
|
return violations
|
|
553
600
|
|
|
554
601
|
|
|
555
|
-
def validate_geojson_constraints(
|
|
556
|
-
geojson_data: Path | str | dict,
|
|
602
|
+
def check_geojson_limits(
|
|
603
|
+
geojson_data: Path | str | dict = None,
|
|
604
|
+
analysis_results: dict = None,
|
|
557
605
|
max_polygon_count=250_000,
|
|
558
|
-
max_mean_area_ha=
|
|
559
|
-
max_max_area_ha=
|
|
560
|
-
max_mean_vertices=
|
|
561
|
-
max_max_vertices=
|
|
606
|
+
max_mean_area_ha=50_000,
|
|
607
|
+
max_max_area_ha=50_000,
|
|
608
|
+
max_mean_vertices=50_000,
|
|
609
|
+
max_max_vertices=50_000,
|
|
610
|
+
max_file_size_mb=None,
|
|
611
|
+
allowed_crs=["EPSG:4326"],
|
|
562
612
|
verbose=True,
|
|
563
613
|
):
|
|
564
614
|
"""
|
|
565
|
-
|
|
615
|
+
Check GeoJSON data against defined limits for processing readiness.
|
|
566
616
|
|
|
567
617
|
Raises ValueError if any metrics exceed the specified limits.
|
|
568
618
|
Uses analyze_geojson to compute metrics efficiently in a single sweep.
|
|
569
619
|
|
|
570
620
|
Parameters:
|
|
571
621
|
-----------
|
|
572
|
-
geojson_data : Path | str | dict
|
|
622
|
+
geojson_data : Path | str | dict, optional
|
|
573
623
|
GeoJSON FeatureCollection to validate. Can be:
|
|
574
624
|
- dict: GeoJSON FeatureCollection dictionary
|
|
575
625
|
- str: Path to GeoJSON file as string
|
|
576
626
|
- Path: pathlib.Path to GeoJSON file
|
|
627
|
+
Note: Cannot be used together with analysis_results
|
|
628
|
+
analysis_results : dict, optional
|
|
629
|
+
Pre-computed results from analyze_geojson(). Must contain keys:
|
|
630
|
+
'count', 'mean_area_ha', 'max_area_ha', 'mean_vertices', 'max_vertices'
|
|
631
|
+
Note: Cannot be used together with geojson_data
|
|
577
632
|
max_polygon_count : int, optional
|
|
578
633
|
Maximum allowed number of polygons (default: 250,000)
|
|
579
634
|
max_mean_area_ha : float, optional
|
|
580
|
-
Maximum allowed mean area per polygon in hectares (default:
|
|
635
|
+
Maximum allowed mean area per polygon in hectares (default: 50,000)
|
|
581
636
|
max_max_area_ha : float, optional
|
|
582
|
-
Maximum allowed maximum area per polygon in hectares (default:
|
|
637
|
+
Maximum allowed maximum area per polygon in hectares (default: 50,000)
|
|
583
638
|
max_mean_vertices : float, optional
|
|
584
|
-
Maximum allowed mean vertices per polygon (default:
|
|
639
|
+
Maximum allowed mean vertices per polygon (default: 50,000)
|
|
585
640
|
max_max_vertices : int, optional
|
|
586
|
-
Maximum allowed vertices per polygon (default:
|
|
641
|
+
Maximum allowed vertices per polygon (default: 50,000)
|
|
642
|
+
max_file_size_mb : float, optional
|
|
643
|
+
Maximum allowed file size in megabytes (default: None, no limit)
|
|
644
|
+
allowed_crs : list, optional
|
|
645
|
+
List of allowed coordinate reference systems (default: ["EPSG:4326"])
|
|
646
|
+
Set to None to skip CRS validation
|
|
587
647
|
verbose : bool
|
|
588
648
|
Print validation results (default: True)
|
|
589
649
|
|
|
@@ -603,22 +663,25 @@ def validate_geojson_constraints(
|
|
|
603
663
|
Raises:
|
|
604
664
|
-------
|
|
605
665
|
ValueError
|
|
606
|
-
If any constraint is violated
|
|
666
|
+
If any constraint is violated, or if both geojson_data and analysis_results are provided,
|
|
667
|
+
or if neither is provided
|
|
607
668
|
"""
|
|
608
|
-
|
|
609
|
-
|
|
669
|
+
# Validate input parameters
|
|
670
|
+
if geojson_data is not None and analysis_results is not None:
|
|
671
|
+
raise ValueError(
|
|
672
|
+
"Cannot provide both 'geojson_data' and 'analysis_results'. "
|
|
673
|
+
"Please provide only one input source."
|
|
674
|
+
)
|
|
610
675
|
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
with open(file_path, "r") as f:
|
|
617
|
-
geojson_data = json.load(f)
|
|
676
|
+
if geojson_data is None and analysis_results is None:
|
|
677
|
+
raise ValueError(
|
|
678
|
+
"Must provide either 'geojson_data' or 'analysis_results'. "
|
|
679
|
+
"Both cannot be None."
|
|
680
|
+
)
|
|
618
681
|
|
|
619
682
|
if verbose:
|
|
620
683
|
print("\n" + "=" * 80)
|
|
621
|
-
print("GEOJSON
|
|
684
|
+
print("GEOJSON LIMITS CHECK")
|
|
622
685
|
print("=" * 80)
|
|
623
686
|
print("\nConstraint Limits:")
|
|
624
687
|
print(f" - Max polygon count: {max_polygon_count:,}")
|
|
@@ -629,90 +692,47 @@ def validate_geojson_constraints(
|
|
|
629
692
|
print(f" - Max mean vertices: {max_mean_vertices:,}")
|
|
630
693
|
if max_max_vertices is not None:
|
|
631
694
|
print(f" - Max vertices per polygon: {max_max_vertices:,}")
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
coords = feature["geometry"]["coordinates"]
|
|
656
|
-
geom_type = feature["geometry"]["type"]
|
|
657
|
-
|
|
658
|
-
if geom_type == "Polygon":
|
|
659
|
-
# Count vertices
|
|
660
|
-
feature_vertices = 0
|
|
661
|
-
for ring in coords:
|
|
662
|
-
feature_vertices += len(ring)
|
|
663
|
-
total_vertices += feature_vertices
|
|
664
|
-
max_vertices_value = max(max_vertices_value, feature_vertices)
|
|
665
|
-
|
|
666
|
-
# Calculate area
|
|
667
|
-
try:
|
|
668
|
-
poly = ShapelyPolygon(coords[0])
|
|
669
|
-
area_ha = abs(poly.area) * 1232100
|
|
670
|
-
total_area += area_ha
|
|
671
|
-
max_area = max(max_area, area_ha)
|
|
672
|
-
except:
|
|
673
|
-
pass
|
|
674
|
-
valid_polygons += 1
|
|
675
|
-
|
|
676
|
-
elif geom_type == "MultiPolygon":
|
|
677
|
-
# Count vertices
|
|
678
|
-
feature_vertices = 0
|
|
679
|
-
for polygon in coords:
|
|
680
|
-
for ring in polygon:
|
|
681
|
-
feature_vertices += len(ring)
|
|
682
|
-
total_vertices += feature_vertices
|
|
683
|
-
max_vertices_value = max(max_vertices_value, feature_vertices)
|
|
684
|
-
|
|
685
|
-
# Calculate area
|
|
686
|
-
try:
|
|
687
|
-
for polygon in coords:
|
|
688
|
-
poly = ShapelyPolygon(polygon[0])
|
|
689
|
-
area_ha = abs(poly.area) * 1232100
|
|
690
|
-
total_area += area_ha
|
|
691
|
-
max_area = max(max_area, area_ha)
|
|
692
|
-
except:
|
|
693
|
-
pass
|
|
694
|
-
valid_polygons += 1
|
|
695
|
-
|
|
696
|
-
except:
|
|
697
|
-
continue
|
|
698
|
-
|
|
699
|
-
# Compute means
|
|
700
|
-
polygon_count = len(features)
|
|
701
|
-
mean_area = total_area / valid_polygons if valid_polygons > 0 else 0
|
|
702
|
-
mean_vertices = total_vertices / valid_polygons if valid_polygons > 0 else 0
|
|
703
|
-
|
|
695
|
+
if max_file_size_mb is not None:
|
|
696
|
+
print(f" - Max file size (MB): {max_file_size_mb:.2f}")
|
|
697
|
+
|
|
698
|
+
# Get metrics either from analysis_results or by analyzing geojson_data
|
|
699
|
+
if analysis_results is not None:
|
|
700
|
+
# Use pre-computed analysis results
|
|
701
|
+
metrics = analysis_results
|
|
702
|
+
else:
|
|
703
|
+
# Use analyze_geojson to compute all required metrics in a single sweep
|
|
704
|
+
metrics_to_compute = [
|
|
705
|
+
"count",
|
|
706
|
+
"file_size_mb",
|
|
707
|
+
"mean_area_ha",
|
|
708
|
+
"max_area_ha",
|
|
709
|
+
"mean_vertices",
|
|
710
|
+
"max_vertices",
|
|
711
|
+
]
|
|
712
|
+
# Add CRS if validation is requested
|
|
713
|
+
if allowed_crs is not None:
|
|
714
|
+
metrics_to_compute.append("crs")
|
|
715
|
+
metrics = analyze_geojson(geojson_data, metrics=metrics_to_compute)
|
|
716
|
+
|
|
717
|
+
# Build results dict with required keys
|
|
704
718
|
results = {
|
|
705
|
-
"count":
|
|
706
|
-
"
|
|
707
|
-
"
|
|
708
|
-
"
|
|
709
|
-
"
|
|
719
|
+
"count": metrics.get("count", 0),
|
|
720
|
+
"file_size_mb": metrics.get("file_size_mb"),
|
|
721
|
+
"mean_area_ha": metrics.get("mean_area_ha", 0),
|
|
722
|
+
"max_area_ha": metrics.get("max_area_ha", 0),
|
|
723
|
+
"mean_vertices": metrics.get("mean_vertices", 0),
|
|
724
|
+
"max_vertices": metrics.get("max_vertices", 0),
|
|
725
|
+
"crs": metrics.get("crs"),
|
|
710
726
|
"valid": True,
|
|
711
727
|
}
|
|
712
728
|
|
|
713
729
|
if verbose:
|
|
714
730
|
print("\nComputed Metrics:")
|
|
715
731
|
print(f" - Polygon count: {results['count']:,}")
|
|
732
|
+
if results.get("file_size_mb") is not None:
|
|
733
|
+
print(f" - File size (MB): {results['file_size_mb']:,.2f}")
|
|
734
|
+
if results.get("crs") is not None:
|
|
735
|
+
print(f" - CRS: {results['crs']}")
|
|
716
736
|
print(f" - Mean area (ha): {results['mean_area_ha']:,}")
|
|
717
737
|
print(f" - Max area (ha): {results['max_area_ha']:,}")
|
|
718
738
|
print(f" - Mean vertices: {results['mean_vertices']:,}")
|
|
@@ -726,34 +746,48 @@ def validate_geojson_constraints(
|
|
|
726
746
|
max_max_area_ha=max_max_area_ha,
|
|
727
747
|
max_mean_vertices=max_mean_vertices,
|
|
728
748
|
max_max_vertices=max_max_vertices,
|
|
749
|
+
max_file_size_mb=max_file_size_mb,
|
|
729
750
|
)
|
|
730
751
|
|
|
752
|
+
# Check CRS if validation is requested
|
|
753
|
+
if allowed_crs is not None and results.get("crs"):
|
|
754
|
+
if results["crs"] not in allowed_crs:
|
|
755
|
+
violations.append(
|
|
756
|
+
f"CRS '{results['crs']}' is not in allowed list: {allowed_crs}"
|
|
757
|
+
)
|
|
758
|
+
|
|
731
759
|
# Report results
|
|
732
760
|
if verbose:
|
|
733
761
|
print("\n" + "=" * 80)
|
|
734
762
|
if violations:
|
|
735
|
-
print("
|
|
763
|
+
print("LIMITS CHECK FAILED")
|
|
736
764
|
print("=" * 80)
|
|
737
765
|
for violation in violations:
|
|
738
766
|
print(f"\n{violation}")
|
|
739
767
|
results["valid"] = False
|
|
740
768
|
else:
|
|
741
|
-
print("
|
|
769
|
+
print("LIMITS CHECK PASSED")
|
|
742
770
|
print("=" * 80)
|
|
743
771
|
print("\nAll metrics within acceptable limits")
|
|
744
772
|
|
|
745
773
|
# Raise error with detailed message if any constraint violated
|
|
746
774
|
if violations:
|
|
747
|
-
error_message = "
|
|
775
|
+
error_message = "GeoJSON limits check failed:\n" + "\n".join(violations)
|
|
748
776
|
raise ValueError(error_message)
|
|
749
777
|
|
|
750
778
|
return results
|
|
751
779
|
|
|
752
780
|
|
|
781
|
+
# Backward compatibility aliases
|
|
782
|
+
screen_geojson = check_geojson_limits
|
|
783
|
+
validate_geojson_constraints = check_geojson_limits
|
|
784
|
+
|
|
785
|
+
|
|
753
786
|
def suggest_processing_mode(
|
|
754
787
|
feature_count,
|
|
755
788
|
mean_area_ha=None,
|
|
756
789
|
mean_vertices=None,
|
|
790
|
+
file_size_mb=None,
|
|
757
791
|
feature_type="polygon",
|
|
758
792
|
verbose=True,
|
|
759
793
|
):
|
|
@@ -762,6 +796,9 @@ def suggest_processing_mode(
|
|
|
762
796
|
|
|
763
797
|
Decision thresholds from comprehensive benchmark data (Nov 2025):
|
|
764
798
|
|
|
799
|
+
FILE SIZE:
|
|
800
|
+
- Files >= 10 MB: recommend sequential mode (avoids payload size limits)
|
|
801
|
+
|
|
765
802
|
POINTS:
|
|
766
803
|
- Break-even: 750-1000 features
|
|
767
804
|
- Sequential faster: < 750 features
|
|
@@ -785,6 +822,8 @@ def suggest_processing_mode(
|
|
|
785
822
|
Mean area per polygon in hectares (required for polygons, ignored for points)
|
|
786
823
|
mean_vertices : float, optional
|
|
787
824
|
Mean number of vertices per polygon (influences decision for complex geometries)
|
|
825
|
+
file_size_mb : float, optional
|
|
826
|
+
File size in megabytes (if >= 10 MB, recommends sequential mode)
|
|
788
827
|
feature_type : str
|
|
789
828
|
'polygon', 'multipolygon', or 'point' (default: 'polygon')
|
|
790
829
|
verbose : bool
|
|
@@ -795,6 +834,14 @@ def suggest_processing_mode(
|
|
|
795
834
|
str: 'concurrent' or 'sequential'
|
|
796
835
|
"""
|
|
797
836
|
|
|
837
|
+
# File size check: large files should use sequential mode
|
|
838
|
+
if file_size_mb is not None and file_size_mb >= 10:
|
|
839
|
+
if verbose:
|
|
840
|
+
print(f"\nMETHOD RECOMMENDATION (File Size Constraint)")
|
|
841
|
+
print(f" File size: {file_size_mb:.2f} MB (>= 10 MB threshold)")
|
|
842
|
+
print(f" Method: SEQUENTIAL (avoids payload size limits)")
|
|
843
|
+
return "sequential"
|
|
844
|
+
|
|
798
845
|
# Points: simple threshold-based decision
|
|
799
846
|
if feature_type == "point":
|
|
800
847
|
breakeven = 750
|
openforis_whisp/datasets.py
CHANGED
|
@@ -61,8 +61,9 @@ def g_esa_worldcover_trees_prep():
|
|
|
61
61
|
|
|
62
62
|
# EUFO_2020
|
|
63
63
|
def g_jrc_gfc_2020_prep():
|
|
64
|
-
|
|
65
|
-
|
|
64
|
+
# JRC GFC2020 V3 is a single Image with band 'Map'
|
|
65
|
+
jrc_gfc2020 = ee.Image("JRC/GFC2020/V3").select("Map")
|
|
66
|
+
return jrc_gfc2020.rename("EUFO_2020").selfMask()
|
|
66
67
|
|
|
67
68
|
|
|
68
69
|
# GFC_TC_2020
|
|
@@ -373,14 +374,12 @@ def g_esri_2020_2023_crop_prep():
|
|
|
373
374
|
|
|
374
375
|
# RADD_year_2019 to RADD_year_< current year >
|
|
375
376
|
def g_radd_year_prep():
|
|
376
|
-
from datetime import datetime
|
|
377
|
-
|
|
378
377
|
radd = ee.ImageCollection("projects/radar-wur/raddalert/v1")
|
|
379
378
|
radd_date = (
|
|
380
379
|
radd.filterMetadata("layer", "contains", "alert").select("Date").mosaic()
|
|
381
380
|
)
|
|
382
381
|
start_year = 19
|
|
383
|
-
current_year =
|
|
382
|
+
current_year = CURRENT_YEAR_2DIGIT
|
|
384
383
|
|
|
385
384
|
def make_band(year, img_stack):
|
|
386
385
|
start = year * 1000
|
openforis_whisp/reformat.py
CHANGED
|
@@ -859,12 +859,14 @@ def format_stats_dataframe(
|
|
|
859
859
|
)
|
|
860
860
|
df.rename(columns={area_col: area_col_stripped}, inplace=True)
|
|
861
861
|
|
|
862
|
-
# 10) reorder by plotId column if present
|
|
863
|
-
|
|
864
|
-
df.
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
862
|
+
# 10) reorder by plotId column numerically if present (column is string but contains int values)
|
|
863
|
+
if sort_column in df.columns:
|
|
864
|
+
df["_sort_key"] = pd.to_numeric(df[sort_column], errors="coerce")
|
|
865
|
+
df = (
|
|
866
|
+
df.sort_values(by="_sort_key")
|
|
867
|
+
.drop(columns=["_sort_key"])
|
|
868
|
+
.reset_index(drop=True)
|
|
869
|
+
)
|
|
868
870
|
|
|
869
871
|
# 11) Defragment final DataFrame and return
|
|
870
872
|
return df.copy()
|