openforis-whisp 2.0.0b2__py3-none-any.whl → 3.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +35 -4
- openforis_whisp/advanced_stats.py +2070 -0
- openforis_whisp/data_checks.py +642 -0
- openforis_whisp/data_conversion.py +86 -44
- openforis_whisp/datasets.py +298 -225
- openforis_whisp/logger.py +26 -0
- openforis_whisp/parameters/__init__.py +0 -0
- openforis_whisp/parameters/lookup_gaul1_admin.py +18663 -0
- openforis_whisp/reformat.py +198 -2
- openforis_whisp/stats.py +488 -68
- {openforis_whisp-2.0.0b2.dist-info → openforis_whisp-3.0.0a1.dist-info}/METADATA +1 -1
- openforis_whisp-3.0.0a1.dist-info/RECORD +20 -0
- openforis_whisp-2.0.0b2.dist-info/RECORD +0 -16
- {openforis_whisp-2.0.0b2.dist-info → openforis_whisp-3.0.0a1.dist-info}/LICENSE +0 -0
- {openforis_whisp-2.0.0b2.dist-info → openforis_whisp-3.0.0a1.dist-info}/WHEEL +0 -0
openforis_whisp/data_checks.py (new file)
@@ -0,0 +1,642 @@
"""
Data validation and constraint checking functions for WHISP.

Provides validation functions to check GeoJSON data against defined limits
and thresholds, raising informative errors when constraints are violated.
"""

import json
from pathlib import Path
from shapely.geometry import Polygon as ShapelyPolygon

# Note: area summary stats are estimations used to decide pathways for analysis
# (estimation is preferred here as it allows efficient processing and limits the overhead of checking the file)
def analyze_geojson(
    geojson_data: Path | str | dict,
    metrics=[
        "count",
        "geometry_types",
        "min_area_ha",
        "mean_area_ha",
        "median_area_ha",
        "max_area_ha",
        "area_percentiles",
        "min_vertices",
        "mean_vertices",
        "median_vertices",
        "max_vertices",
        "vertex_percentiles",
    ],
):
    """
    Analyze GeoJSON polygons with selectable metrics for method selection.

    Fast, lightweight analysis - only computes the requested metrics.
    Works with or without an area_ha property in features.
    All metrics are computed in a single sweep through the data for efficiency.

    Warning: area metrics are estimations computed in EPSG:4326 - they are
    accurate only near the equator (errors grow extreme towards the poles).

    Parameters:
    -----------
    geojson_data : Path | str | dict
        GeoJSON FeatureCollection. Can be:
        - dict: GeoJSON FeatureCollection dictionary
        - str: path to a GeoJSON file as a string
        - Path: pathlib.Path to a GeoJSON file
    metrics : list
        Which metrics to return. Available metrics:
        - 'count': number of polygons
        - 'geometry_types': dict of geometry type counts (e.g., {'Polygon': 95, 'MultiPolygon': 5})
        - 'min_area_ha', 'mean_area_ha', 'median_area_ha', 'max_area_ha': area statistics (hectares) (accurate only near the equator)
        - 'area_percentiles': dict with p25, p50 (median), p75, p90 area values (accurate only near the equator)
        - 'min_vertices', 'mean_vertices', 'median_vertices', 'max_vertices': vertex count statistics
        - 'vertex_percentiles': dict with p25, p50 (median), p75, p90 vertex count values

        Default includes all metrics for comprehensive analysis.
        Examples:
        - ['count'] -> just the polygon count
        - ['count', 'mean_area_ha', 'max_area_ha'] -> subset of metrics
        - Default: all metrics for a full statistical summary

    Returns:
    --------
    dict with requested metrics:
        - 'count': number of polygons
        - 'geometry_types': {'Polygon': int, 'MultiPolygon': int, ...}
        - 'min_area_ha': minimum area among all polygons in hectares
        - 'mean_area_ha': mean area per polygon in hectares (calculated from coordinates)
        - 'median_area_ha': median area among all polygons in hectares
        - 'max_area_ha': maximum area among all polygons in hectares
        - 'area_percentiles': {'p25': float, 'p50': float, 'p75': float, 'p90': float}
        - 'min_vertices': minimum number of vertices among all polygons
        - 'mean_vertices': mean number of vertices per polygon
        - 'median_vertices': median number of vertices among all polygons
        - 'max_vertices': maximum number of vertices among all polygons
        - 'vertex_percentiles': {'p25': int, 'p50': int, 'p75': int, 'p90': int}
    """
    results = {}

    try:
        # Load GeoJSON from file if path provided
        if isinstance(geojson_data, (str, Path)):
            file_path = Path(geojson_data)
            if not file_path.exists():
                raise FileNotFoundError(f"GeoJSON file not found: {file_path}")
            with open(file_path, "r") as f:
                geojson_data = json.load(f)

        features = geojson_data.get("features", [])

        if "count" in metrics:
            results["count"] = len(features)

        # Single sweep through features - compute all area/vertex metrics at once
        if any(
            m in metrics
            for m in [
                "geometry_types",
                "min_area_ha",
                "mean_area_ha",
                "median_area_ha",
                "max_area_ha",
                "area_percentiles",
                "min_vertices",
                "mean_vertices",
                "median_vertices",
                "max_vertices",
                "vertex_percentiles",
            ]
        ):
            areas = []
            vertices_list = []
            geometry_type_counts = {}
            valid_polygons = 0

            for feature in features:
                try:
                    coords = feature["geometry"]["coordinates"]
                    geom_type = feature["geometry"]["type"]
                    properties = feature.get("properties", {})

                    # Count geometry types
                    geometry_type_counts[geom_type] = (
                        geometry_type_counts.get(geom_type, 0) + 1
                    )

                    if geom_type == "Polygon":
                        # Count vertices in this polygon
                        feature_vertices = 0
                        for ring in coords:
                            feature_vertices += len(ring)
                        vertices_list.append(feature_vertices)

                        # Calculate area from coordinates using shapely
                        try:
                            poly = ShapelyPolygon(coords[0])
                            # Convert square degrees to hectares (near equator)
                            # 1 degree latitude ≈ 111 km, so 1 degree² ≈ 111² km² = 12,321 km² = 1,232,100 ha
                            area_ha = abs(poly.area) * 1232100
                            areas.append(area_ha)
                        except:
                            pass  # Skip if calculation fails
                        valid_polygons += 1

                    elif geom_type == "MultiPolygon":
                        # Count vertices in this multipolygon
                        feature_vertices = 0
                        for polygon in coords:
                            for ring in polygon:
                                feature_vertices += len(ring)
                        vertices_list.append(feature_vertices)

                        # Calculate area from coordinates using shapely
                        try:
                            for polygon in coords:
                                poly = ShapelyPolygon(polygon[0])
                                area_ha = abs(poly.area) * 1232100
                                areas.append(area_ha)
                        except:
                            pass  # Skip if calculation fails
                        valid_polygons += 1

                except:
                    continue

            # Calculate statistics and return requested metrics

            # Geometry type counts
            if "geometry_types" in metrics:
                results["geometry_types"] = geometry_type_counts

            if areas or vertices_list:
                # Area statistics
                if areas:
                    if "min_area_ha" in metrics:
                        results["min_area_ha"] = round(min(areas), 2)
                    if "mean_area_ha" in metrics:
                        results["mean_area_ha"] = round(sum(areas) / len(areas), 2)

                    sorted_areas = sorted(areas)  # Sort once for median and percentiles

                    if "median_area_ha" in metrics:
                        mid = len(sorted_areas) // 2
                        results["median_area_ha"] = round(
                            sorted_areas[mid]
                            if len(sorted_areas) % 2 == 1
                            else (sorted_areas[mid - 1] + sorted_areas[mid]) / 2,
                            2,
                        )
                    if "max_area_ha" in metrics:
                        results["max_area_ha"] = round(max(areas), 2)

                    if "area_percentiles" in metrics:
                        n = len(sorted_areas)
                        p25_idx = n // 4
                        p50_idx = n // 2
                        p75_idx = (n * 3) // 4
                        p90_idx = int(n * 0.9)

                        results["area_percentiles"] = {
                            "p25": round(sorted_areas[p25_idx], 2),
                            "p50": round(
                                sorted_areas[p50_idx]
                                if n % 2 == 1
                                else (sorted_areas[p50_idx - 1] + sorted_areas[p50_idx])
                                / 2,
                                2,
                            ),
                            "p75": round(sorted_areas[p75_idx], 2),
                            "p90": round(sorted_areas[p90_idx], 2),
                        }
                else:
                    # Return zeros for no areas
                    if "min_area_ha" in metrics:
                        results["min_area_ha"] = 0
                    if "mean_area_ha" in metrics:
                        results["mean_area_ha"] = 0
                    if "median_area_ha" in metrics:
                        results["median_area_ha"] = 0
                    if "max_area_ha" in metrics:
                        results["max_area_ha"] = 0
                    if "area_percentiles" in metrics:
                        results["area_percentiles"] = {
                            "p25": 0,
                            "p50": 0,
                            "p75": 0,
                            "p90": 0,
                        }

                # Vertex statistics
                if vertices_list:
                    if "min_vertices" in metrics:
                        results["min_vertices"] = min(vertices_list)
                    if "mean_vertices" in metrics:
                        results["mean_vertices"] = round(
                            sum(vertices_list) / len(vertices_list), 2
                        )

                    sorted_vertices = sorted(
                        vertices_list
                    )  # Sort once for median and percentiles

                    if "median_vertices" in metrics:
                        mid = len(sorted_vertices) // 2
                        results["median_vertices"] = (
                            sorted_vertices[mid]
                            if len(sorted_vertices) % 2 == 1
                            else round(
                                (sorted_vertices[mid - 1] + sorted_vertices[mid]) / 2, 0
                            )
                        )
                    if "max_vertices" in metrics:
                        results["max_vertices"] = max(vertices_list)

                    if "vertex_percentiles" in metrics:
                        n = len(sorted_vertices)
                        p25_idx = n // 4
                        p50_idx = n // 2
                        p75_idx = (n * 3) // 4
                        p90_idx = int(n * 0.9)

                        results["vertex_percentiles"] = {
                            "p25": sorted_vertices[p25_idx],
                            "p50": sorted_vertices[p50_idx]
                            if n % 2 == 1
                            else round(
                                (
                                    sorted_vertices[p50_idx - 1]
                                    + sorted_vertices[p50_idx]
                                )
                                / 2,
                                0,
                            ),
                            "p75": sorted_vertices[p75_idx],
                            "p90": sorted_vertices[p90_idx],
                        }
                else:
                    # Return zeros for no vertices
                    if "min_vertices" in metrics:
                        results["min_vertices"] = 0
                    if "mean_vertices" in metrics:
                        results["mean_vertices"] = 0
                    if "median_vertices" in metrics:
                        results["median_vertices"] = 0
                    if "max_vertices" in metrics:
                        results["max_vertices"] = 0
                    if "vertex_percentiles" in metrics:
                        results["vertex_percentiles"] = {
                            "p25": 0,
                            "p50": 0,
                            "p75": 0,
                            "p90": 0,
                        }
            else:
                # Return zeros for empty datasets
                for metric in [
                    "min_area_ha",
                    "mean_area_ha",
                    "median_area_ha",
                    "max_area_ha",
                    "area_percentiles",
                    "min_vertices",
                    "mean_vertices",
                    "median_vertices",
                    "max_vertices",
                    "vertex_percentiles",
                ]:
                    if metric in metrics:
                        results[metric] = (
                            0
                            if metric not in ["area_percentiles", "vertex_percentiles"]
                            else {"p25": 0, "p50": 0, "p75": 0, "p90": 0}
                        )

        return results

    except Exception as e:
        print(f"Error: {str(e)}")
        return {}
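Usage sketch (illustrative, not part of the packaged file; the FeatureCollection below is made-up data): analyze_geojson accepts an in-memory dict and computes only the metrics asked for.

# Sketch only - assumes the module is importable under its path in this wheel.
from openforis_whisp.data_checks import analyze_geojson

fc = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "properties": {},
            "geometry": {
                "type": "Polygon",
                # ~0.01 x 0.01 degree right triangle near the equator
                "coordinates": [[[0.0, 0.0], [0.01, 0.0], [0.01, 0.01], [0.0, 0.0]]],
            },
        }
    ],
}

summary = analyze_geojson(fc, metrics=["count", "geometry_types", "mean_area_ha", "max_vertices"])
# Roughly: {'count': 1, 'geometry_types': {'Polygon': 1}, 'mean_area_ha': ~61.6, 'max_vertices': 4}
print(summary)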
def _check_metric_constraints(
    metrics,
    max_polygon_count=250_000,
    max_mean_area_ha=10_000,
    max_max_area_ha=None,
    max_mean_vertices=None,
    max_max_vertices=10_000,
):
    """
    Check if computed metrics violate any constraints.

    Internal helper function for constraint validation.

    Parameters:
    -----------
    metrics : dict
        Dictionary of computed metrics with keys: count, mean_area_ha, max_area_ha,
        mean_vertices, max_vertices
    max_polygon_count : int
        Maximum allowed number of polygons
    max_mean_area_ha : float
        Maximum allowed mean area per polygon in hectares
    max_max_area_ha : float, optional
        Maximum allowed maximum area per polygon in hectares
    max_mean_vertices : float, optional
        Maximum allowed mean vertices per polygon
    max_max_vertices : int, optional
        Maximum allowed vertices per polygon

    Returns:
    --------
    list
        List of violation strings (empty if all constraints pass)
    """
    violations = []

    polygon_count = metrics["count"]
    mean_area = metrics["mean_area_ha"]
    max_area = metrics["max_area_ha"]
    mean_vertices = metrics["mean_vertices"]
    max_vertices_value = metrics["max_vertices"]

    if polygon_count > max_polygon_count:
        violations.append(
            f"Polygon count ({polygon_count:,}) exceeds limit ({max_polygon_count:,})"
        )

    if mean_area > max_mean_area_ha:
        violations.append(
            f"Mean area ({mean_area:,.2f} ha) exceeds limit ({max_mean_area_ha:,} ha)"
        )

    if max_max_area_ha is not None and max_area > max_max_area_ha:
        violations.append(
            f"Max area ({max_area:,.2f} ha) exceeds limit ({max_max_area_ha:,} ha)"
        )

    if max_mean_vertices is not None and mean_vertices > max_mean_vertices:
        violations.append(
            f"Mean vertices ({mean_vertices:.2f}) exceeds limit ({max_mean_vertices:,})"
        )

    if max_max_vertices is not None and max_vertices_value > max_max_vertices:
        violations.append(
            f"Max vertices ({max_vertices_value:,}) exceeds limit ({max_max_vertices:,})"
        )

    return violations
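Usage sketch (illustrative; the metrics dict is made-up): the helper collects violation strings instead of raising, leaving the reporting decision to the caller.

# Sketch only - illustrative metrics dict with the five required keys.
metrics = {
    "count": 300_000,  # over the default 250,000 limit
    "mean_area_ha": 12.5,
    "max_area_ha": 950.0,
    "mean_vertices": 85.0,
    "max_vertices": 4_200,
}
violations = _check_metric_constraints(metrics)
# -> ['Polygon count (300,000) exceeds limit (250,000)']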
def validate_geojson_constraints(
    geojson_data: Path | str | dict,
    max_polygon_count=250_000,
    max_mean_area_ha=10_000,
    max_max_area_ha=None,
    max_mean_vertices=None,
    max_max_vertices=10_000,
    verbose=True,
):
    """
    Validate GeoJSON data against defined constraints.

    Raises ValueError if any metrics exceed the specified limits.
    Computes the required metrics efficiently in a single sweep
    (the same approach as analyze_geojson).

    Parameters:
    -----------
    geojson_data : Path | str | dict
        GeoJSON FeatureCollection to validate. Can be:
        - dict: GeoJSON FeatureCollection dictionary
        - str: path to a GeoJSON file as a string
        - Path: pathlib.Path to a GeoJSON file
    max_polygon_count : int, optional
        Maximum allowed number of polygons (default: 250,000)
    max_mean_area_ha : float, optional
        Maximum allowed mean area per polygon in hectares (default: 10,000)
    max_max_area_ha : float, optional
        Maximum allowed maximum area per polygon in hectares (default: None, no limit)
    max_mean_vertices : float, optional
        Maximum allowed mean vertices per polygon (default: None, no limit)
    max_max_vertices : int, optional
        Maximum allowed vertices per polygon (default: 10,000)
    verbose : bool
        Print validation results (default: True)

    Returns:
    --------
    dict
        Dictionary containing computed metrics that passed validation:
        {
            'count': int,
            'mean_area_ha': float,
            'max_area_ha': float,
            'mean_vertices': float,
            'max_vertices': int,
            'valid': bool
        }

    Raises:
    -------
    ValueError
        If any constraint is violated
    """
    from openforis_whisp.data_conversion import convert_geojson_to_ee
    from shapely.geometry import Polygon as ShapelyPolygon

    # Load GeoJSON from file if path provided
    if isinstance(geojson_data, (str, Path)):
        file_path = Path(geojson_data)
        if not file_path.exists():
            raise FileNotFoundError(f"GeoJSON file not found: {file_path}")
        with open(file_path, "r") as f:
            geojson_data = json.load(f)

    if verbose:
        print("\n" + "=" * 80)
        print("GEOJSON CONSTRAINT VALIDATION")
        print("=" * 80)
        print("\nConstraint Limits:")
        print(f" - Max polygon count: {max_polygon_count:,}")
        print(f" - Max mean area (ha): {max_mean_area_ha:,}")
        if max_max_area_ha is not None:
            print(f" - Max area per polygon (ha): {max_max_area_ha:,}")
        if max_mean_vertices is not None:
            print(f" - Max mean vertices: {max_mean_vertices:,}")
        if max_max_vertices is not None:
            print(f" - Max vertices per polygon: {max_max_vertices:,}")

    # Collect all metrics we need to compute
    metrics_to_compute = [
        "count",
        "mean_area_ha",
        "max_area_ha",
        "mean_vertices",
        "max_vertices",
    ]

    # Compute the metrics here in a single sweep (mirrors analyze_geojson)
    features = geojson_data.get("features", [])

    # Single sweep computation
    total_area = 0
    total_vertices = 0
    max_area = 0
    max_vertices_value = 0
    valid_polygons = 0

    for feature in features:
        try:
            coords = feature["geometry"]["coordinates"]
            geom_type = feature["geometry"]["type"]

            if geom_type == "Polygon":
                # Count vertices
                feature_vertices = 0
                for ring in coords:
                    feature_vertices += len(ring)
                total_vertices += feature_vertices
                max_vertices_value = max(max_vertices_value, feature_vertices)

                # Calculate area
                try:
                    poly = ShapelyPolygon(coords[0])
                    area_ha = abs(poly.area) * 1232100
                    total_area += area_ha
                    max_area = max(max_area, area_ha)
                except:
                    pass
                valid_polygons += 1

            elif geom_type == "MultiPolygon":
                # Count vertices
                feature_vertices = 0
                for polygon in coords:
                    for ring in polygon:
                        feature_vertices += len(ring)
                total_vertices += feature_vertices
                max_vertices_value = max(max_vertices_value, feature_vertices)

                # Calculate area
                try:
                    for polygon in coords:
                        poly = ShapelyPolygon(polygon[0])
                        area_ha = abs(poly.area) * 1232100
                        total_area += area_ha
                        max_area = max(max_area, area_ha)
                except:
                    pass
                valid_polygons += 1

        except:
            continue

    # Compute means
    polygon_count = len(features)
    mean_area = total_area / valid_polygons if valid_polygons > 0 else 0
    mean_vertices = total_vertices / valid_polygons if valid_polygons > 0 else 0

    results = {
        "count": polygon_count,
        "mean_area_ha": round(mean_area, 2),
        "max_area_ha": round(max_area, 2),
        "mean_vertices": round(mean_vertices, 2),
        "max_vertices": max_vertices_value,
        "valid": True,
    }

    if verbose:
        print("\nComputed Metrics:")
        print(f" - Polygon count: {results['count']:,}")
        print(f" - Mean area (ha): {results['mean_area_ha']:,}")
        print(f" - Max area (ha): {results['max_area_ha']:,}")
        print(f" - Mean vertices: {results['mean_vertices']:,}")
        print(f" - Max vertices: {results['max_vertices']:,}")

    # Check constraints using dedicated method
    violations = _check_metric_constraints(
        results,
        max_polygon_count=max_polygon_count,
        max_mean_area_ha=max_mean_area_ha,
        max_max_area_ha=max_max_area_ha,
        max_mean_vertices=max_mean_vertices,
        max_max_vertices=max_max_vertices,
    )

    # Report results
    if verbose:
        print("\n" + "=" * 80)
        if violations:
            print("VALIDATION FAILED")
            print("=" * 80)
            for violation in violations:
                print(f"\n{violation}")
            results["valid"] = False
        else:
            print("VALIDATION PASSED")
            print("=" * 80)
            print("\nAll metrics within acceptable limits")

    # Raise error with detailed message if any constraint violated
    if violations:
        error_message = "Constraint validation failed:\n" + "\n".join(violations)
        raise ValueError(error_message)

    return results
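Usage sketch (illustrative; "farm_plots.geojson" is a hypothetical path): the function raises ValueError on any violation, so catch it when a rejected input should not abort the run.

# Sketch only.
from openforis_whisp.data_checks import validate_geojson_constraints

try:
    checks = validate_geojson_constraints(
        "farm_plots.geojson",
        max_polygon_count=100_000,
        max_max_vertices=5_000,
        verbose=False,
    )
    print(f"OK: {checks['count']:,} features within limits")
except ValueError as err:
    print(f"Input rejected:\n{err}")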
def suggest_method(polygon_count, mean_area_ha, mean_vertices=None, verbose=True):
    """
    Suggest processing method based on polygon characteristics.

    Decision thresholds from benchmark data (area per polygon × polygon count):
    - Small polygons (10 ha): need 250+ polygons for concurrent
    - Medium polygons (100 ha): breakeven at ~100 polygons
    - Large polygons (500 ha): concurrent wins at 50+ polygons

    Parameters:
    -----------
    polygon_count : int
        Number of polygons
    mean_area_ha : float
        Mean area per polygon in hectares
    mean_vertices : float, optional
        Mean number of vertices per polygon (can influence the decision for complex geometries)
    verbose : bool
        Print recommendation explanation

    Returns:
    --------
    str: 'concurrent' or 'sequential'
    """

    # Primary decision based on area
    if mean_area_ha >= 300:  # Large polygons
        breakeven = 50
        method = "concurrent" if polygon_count >= breakeven else "sequential"
    elif mean_area_ha >= 50:  # Medium polygons
        breakeven = 100
        method = "concurrent" if polygon_count >= breakeven else "sequential"
    else:  # Small polygons
        breakeven = 250
        method = "concurrent" if polygon_count >= breakeven else "sequential"

    # Optional adjustment based on vertex complexity (very high complexity favors concurrent)
    if mean_vertices is not None and mean_vertices > 500:
        # Reduce breakeven by 25% for very complex geometries
        adjusted_breakeven = int(breakeven * 0.75)
        method = "concurrent" if polygon_count >= adjusted_breakeven else "sequential"

    if verbose:
        print("\nMETHOD RECOMMENDATION")
        print(
            f" Polygons: {polygon_count} | Mean Area: {mean_area_ha:.1f} ha", end=""
        )
        if mean_vertices is not None:
            print(f" | Mean Vertices: {mean_vertices:.1f}", end="")
        print()
        print(f" Breakeven: {breakeven} polygons | Method: {method.upper()}")

    return method
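Usage sketch (illustrative; "plots.geojson" is a hypothetical path): the intended pathway-selection flow chains analyze_geojson's cheap estimates into suggest_method.

# Sketch only.
from openforis_whisp.data_checks import analyze_geojson, suggest_method

stats = analyze_geojson("plots.geojson", metrics=["count", "mean_area_ha", "mean_vertices"])
method = suggest_method(
    stats["count"],
    stats["mean_area_ha"],
    mean_vertices=stats["mean_vertices"],
    verbose=False,
)
# method is 'concurrent' or 'sequential'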