openforis-whisp 2.0.0b3__py3-none-any.whl → 3.0.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,805 @@
1
+ """
2
+ Data validation and constraint checking functions for WHISP.
3
+
4
+ Provides validation functions to check GeoJSON data against defined limits
5
+ and thresholds, raising informative errors when constraints are violated.
6
+ """
7
+
8
+ import json
9
+ from pathlib import Path
10
+ from shapely.geometry import Polygon as ShapelyPolygon, shape as shapely_shape
11
+
12
+ # Note: area summary stats are estimations for use in deciding pathways for analysis
13
+ # (estimation is preferred here as it allows efficient processing and limits the overhead of checking the file)
14
+
15
+
16
+ def _convert_projected_area_to_ha(area_sq_units: float, crs: str = None) -> float:
17
+ """
18
+ Convert area from projected CRS units to hectares.
19
+
20
+ Most projected CRS use meters as units, so:
21
+ - area_sq_units is in square meters
22
+ - 1 hectare = 10,000 m²
23
+
24
+ Args:
25
+ area_sq_units: Area in square units of the projection (typically square meters)
26
+ crs: CRS string for reference (e.g., 'EPSG:3857'). Used for validation.
27
+
28
+ Returns:
29
+ Area in hectares
30
+ """
31
+ # Standard conversion: 1 hectare = 10,000 m²
32
+ # Most projected CRS use meters, so this works universally
33
+ return area_sq_units / 10000
34
+
35
+
36
+ def _estimate_area_from_bounds(coords, area_conversion_factor: float) -> float:
37
+ """
38
+ Estimate area from bounding box when actual area calculation fails.
39
+ Extracts bounding box and calculates its area as a fallback estimate.
40
+ Returns area in hectares.
41
+ """
42
+ try:
43
+ # Flatten all coordinates to find bounds
44
+ all_coords = []
45
+
46
+ def flatten_coords(c):
47
+ if isinstance(c[0], (list, tuple)) and isinstance(c[0][0], (list, tuple)):
48
+ for sub in c:
49
+ flatten_coords(sub)
50
+ else:
51
+ all_coords.extend(c)
52
+
53
+ flatten_coords(coords)
54
+ if not all_coords:
55
+ return 0
56
+
57
+ # Extract lon/lat values
58
+ lons = [c[0] for c in all_coords]
59
+ lats = [c[1] for c in all_coords]
60
+
61
+ min_lon, max_lon = min(lons), max(lons)
62
+ min_lat, max_lat = min(lats), max(lats)
63
+
64
+ # Bounding box area
65
+ bbox_area = (max_lon - min_lon) * (max_lat - min_lat)
66
+
67
+ # Apply conversion factor
68
+ return abs(bbox_area) * area_conversion_factor
69
+ except:
70
+ return 0
71
+
72
+
73
+ def analyze_geojson(
74
+ geojson_data: Path | str | dict,
75
+ metrics=[
76
+ "count",
77
+ "geometry_types",
78
+ "min_area_ha",
79
+ "mean_area_ha",
80
+ "median_area_ha",
81
+ "max_area_ha",
82
+ "area_percentiles",
83
+ "min_vertices",
84
+ "mean_vertices",
85
+ "median_vertices",
86
+ "max_vertices",
87
+ "vertex_percentiles",
88
+ ],
89
+ ):
90
+ """
91
+ Analyze GeoJSON polygons with selectable metrics for method selection.
92
+
93
+ Fast lightweight analysis - only computes requested metrics.
94
+ Works with or without area_ha property in features.
95
+ All metrics computed in a single sweep through the data for efficiency.
96
+
97
+ Warning: area metrics are estimations using EPSG:4326 - accuracy at equator only (extreme differences towards poles)
98
+
99
+ Parameters:
100
+ -----------
101
+ geojson_data : Path | str | dict
102
+ GeoJSON FeatureCollection. Can be:
103
+ - dict: GeoJSON FeatureCollection dictionary
104
+ - str: Path to GeoJSON file as string
105
+ - Path: pathlib.Path to GeoJSON file
106
+ metrics : list
107
+ Which metrics to return. Available metrics:
108
+ - 'count': number of polygons
109
+ - 'geometry_types': dict of geometry type counts (e.g., {'Polygon': 95, 'MultiPolygon': 5})
110
+ - 'min_area_ha', 'mean_area_ha', 'median_area_ha', 'max_area_ha': area statistics (hectares) (accurate only at equator)
111
+ - 'area_percentiles': dict with p25, p50 (median), p75, p90 area values (accurate only at equator)
112
+ - 'min_vertices', 'mean_vertices', 'median_vertices', 'max_vertices': vertex count statistics
113
+ - 'vertex_percentiles': dict with p25, p50 (median), p75, p90 vertex count values
114
+
115
+ Default includes all metrics for comprehensive analysis.
116
+ Examples:
117
+ - ['count'] -> just polygon count
118
+ - ['count', 'mean_area_ha', 'max_area_ha'] -> subset of metrics
119
+ - Default: all metrics for full statistical summary
120
+
121
+ Returns:
122
+ --------
123
+ dict with requested metrics:
124
+ - 'count': number of polygons
125
+ - 'geometry_types': {'Polygon': int, 'MultiPolygon': int, ...}
126
+ - 'min_area_ha': minimum area among all polygons in hectares
127
+ - 'mean_area_ha': mean area per polygon in hectares (calculated from coordinates)
128
+ - 'median_area_ha': median area among all polygons in hectares
129
+ - 'max_area_ha': maximum area among all polygons in hectares
130
+ - 'area_percentiles': {'p25': float, 'p50': float, 'p75': float, 'p90': float}
131
+ - 'min_vertices': minimum number of vertices among all polygons
132
+ - 'mean_vertices': mean number of vertices per polygon
133
+ - 'median_vertices': median number of vertices among all polygons
134
+ - 'max_vertices': maximum number of vertices among all polygons
135
+ - 'vertex_percentiles': {'p25': int, 'p50': int, 'p75': int, 'p90': int}
136
+ """
137
+ results = {}
138
+ crs_warning = None
139
+ file_path = None
140
+
141
+ try:
142
+ # Load GeoJSON from file if path provided
143
+ if isinstance(geojson_data, (str, Path)):
144
+ file_path = Path(geojson_data)
145
+ if not file_path.exists():
146
+ raise FileNotFoundError(f"GeoJSON file not found: {file_path}")
147
+
148
+ # Try UTF-8 first (most common), then fall back to auto-detection
149
+ try:
150
+ with open(file_path, "r", encoding="utf-8") as f:
151
+ geojson_data = json.load(f)
152
+ except UnicodeDecodeError:
153
+ # Auto-detect encoding if UTF-8 fails
154
+ try:
155
+ import chardet
156
+
157
+ with open(file_path, "rb") as f:
158
+ raw_data = f.read()
159
+ detected = chardet.detect(raw_data)
160
+ encoding = detected.get("encoding", "latin-1")
161
+
162
+ with open(file_path, "r", encoding=encoding, errors="replace") as f:
163
+ geojson_data = json.load(f)
164
+ except Exception:
165
+ # Final fallback: use latin-1 which accepts all byte values
166
+ with open(file_path, "r", encoding="latin-1") as f:
167
+ geojson_data = json.load(f)
168
+
169
+ # Detect CRS from file if available
170
+ try:
171
+ import geopandas as gpd
172
+
173
+ gdf = gpd.read_file(file_path)
174
+ if gdf.crs and gdf.crs != "EPSG:4326":
175
+ crs_warning = f"⚠️ CRS is {gdf.crs}, not EPSG:4326. Area metrics will be inaccurate. Data will be auto-reprojected during processing."
176
+ except Exception:
177
+ pass # If we can't detect CRS, continue without warning
178
+
179
+ features = geojson_data.get("features", [])
180
+
181
+ # Add CRS warning to results if detected
182
+ if crs_warning:
183
+ results["crs_warning"] = crs_warning
184
+ print(crs_warning)
185
+
186
+ if "count" in metrics:
187
+ results["count"] = len(features)
188
+
189
+ # Single sweep through features - compute all area/vertex metrics at once
190
+ if any(
191
+ m in metrics
192
+ for m in [
193
+ "geometry_types",
194
+ "min_area_ha",
195
+ "mean_area_ha",
196
+ "median_area_ha",
197
+ "max_area_ha",
198
+ "area_percentiles",
199
+ "min_vertices",
200
+ "mean_vertices",
201
+ "median_vertices",
202
+ "max_vertices",
203
+ "vertex_percentiles",
204
+ ]
205
+ ):
206
+ areas = []
207
+ vertices_list = []
208
+ geometry_type_counts = {}
209
+ valid_polygons = 0
210
+
211
+ # Tracking for fallback geometries
212
+ bbox_fallback_count = 0 # Geometries that used bounding box estimate
213
+ geometry_skip_count = 0 # Geometries completely skipped
214
+ polygon_type_stats = {} # Track stats by geometry type
215
+
216
+ # Detect CRS to determine area conversion factor
217
+ area_conversion_factor = 1232100 # Default: WGS84 (degrees to ha)
218
+ detected_crs = None
219
+
220
+ # Try to detect CRS from file if available
221
+ if file_path:
222
+ try:
223
+ import geopandas as gpd
224
+
225
+ gdf_temp = gpd.read_file(str(file_path))
226
+ detected_crs = gdf_temp.crs
227
+ if detected_crs and detected_crs != "EPSG:4326":
228
+ # Projected CRS typically uses meters, so convert m² to ha
229
+ # 1 ha = 10,000 m²
230
+ area_conversion_factor = 1 / 10000
231
+ except Exception:
232
+ pass # Use default if CRS detection fails
233
+
234
+ for feature in features:
235
+ try:
236
+ coords = feature["geometry"]["coordinates"]
237
+ geom_type = feature["geometry"]["type"]
238
+ properties = feature.get("properties", {})
239
+
240
+ # Count geometry types
241
+ geometry_type_counts[geom_type] = (
242
+ geometry_type_counts.get(geom_type, 0) + 1
243
+ )
244
+
245
+ if geom_type == "Polygon":
246
+ # Count vertices in this polygon
247
+ feature_vertices = 0
248
+ for ring in coords:
249
+ feature_vertices += len(ring)
250
+ vertices_list.append(feature_vertices)
251
+
252
+ # Calculate area from coordinates using shapely
253
+ try:
254
+ # Use shapely.geometry.shape to properly handle all geometry components
255
+ geom = shapely_shape(feature["geometry"])
256
+ # Convert using detected CRS
257
+ area_ha = abs(geom.area) * area_conversion_factor
258
+ areas.append(area_ha)
259
+ except Exception as e:
260
+ # Fallback: estimate from bounding box if geometry fails
261
+ bbox_area = _estimate_area_from_bounds(
262
+ coords, area_conversion_factor
263
+ )
264
+ if bbox_area > 0:
265
+ areas.append(bbox_area)
266
+ bbox_fallback_count += 1
267
+ polygon_type_stats["Polygon_bbox"] = (
268
+ polygon_type_stats.get("Polygon_bbox", 0) + 1
269
+ )
270
+ else:
271
+ geometry_skip_count += 1
272
+ polygon_type_stats["Polygon_skipped"] = (
273
+ polygon_type_stats.get("Polygon_skipped", 0) + 1
274
+ )
275
+ valid_polygons += 1
276
+
277
+ elif geom_type == "MultiPolygon":
278
+ # Count vertices in this multipolygon
279
+ feature_vertices = 0
280
+ for polygon in coords:
281
+ for ring in polygon:
282
+ feature_vertices += len(ring)
283
+ vertices_list.append(feature_vertices)
284
+
285
+ # Calculate area from coordinates using shapely
286
+ try:
287
+ # Use shapely.geometry.shape to properly handle MultiPolygon
288
+ geom = shapely_shape(feature["geometry"])
289
+ # Convert using detected CRS - use total area of all parts
290
+ area_ha = abs(geom.area) * area_conversion_factor
291
+ areas.append(area_ha)
292
+ except Exception as e:
293
+ # Fallback: estimate from bounding box if geometry fails
294
+ bbox_area = _estimate_area_from_bounds(
295
+ coords, area_conversion_factor
296
+ )
297
+ if bbox_area > 0:
298
+ areas.append(bbox_area)
299
+ bbox_fallback_count += 1
300
+ polygon_type_stats["MultiPolygon_bbox"] = (
301
+ polygon_type_stats.get("MultiPolygon_bbox", 0) + 1
302
+ )
303
+ else:
304
+ geometry_skip_count += 1
305
+ polygon_type_stats["MultiPolygon_skipped"] = (
306
+ polygon_type_stats.get("MultiPolygon_skipped", 0)
307
+ + 1
308
+ )
309
+ valid_polygons += 1
310
+
311
+ except:
312
+ continue
313
+
314
+ # Calculate statistics and return requested metrics
315
+
316
+ # Geometry type counts
317
+ if "geometry_types" in metrics:
318
+ results["geometry_types"] = geometry_type_counts
319
+
320
+ if areas or vertices_list:
321
+ # Area statistics
322
+ if areas:
323
+ if "min_area_ha" in metrics:
324
+ results["min_area_ha"] = round(min(areas), 2)
325
+ if "mean_area_ha" in metrics:
326
+ results["mean_area_ha"] = round(sum(areas) / len(areas), 2)
327
+
328
+ sorted_areas = sorted(areas) # Sort once for median and percentiles
329
+
330
+ if "median_area_ha" in metrics:
331
+ mid = len(sorted_areas) // 2
332
+ results["median_area_ha"] = round(
333
+ sorted_areas[mid]
334
+ if len(sorted_areas) % 2 == 1
335
+ else (sorted_areas[mid - 1] + sorted_areas[mid]) / 2,
336
+ 2,
337
+ )
338
+ if "max_area_ha" in metrics:
339
+ results["max_area_ha"] = round(max(areas), 2)
340
+
341
+ if "area_percentiles" in metrics:
342
+ n = len(sorted_areas)
343
+ p25_idx = n // 4
344
+ p50_idx = n // 2
345
+ p75_idx = (n * 3) // 4
346
+ p90_idx = int(n * 0.9)
347
+
348
+ results["area_percentiles"] = {
349
+ "p25": round(sorted_areas[p25_idx], 2),
350
+ "p50": round(
351
+ sorted_areas[p50_idx]
352
+ if n % 2 == 1
353
+ else (sorted_areas[p50_idx - 1] + sorted_areas[p50_idx])
354
+ / 2,
355
+ 2,
356
+ ),
357
+ "p75": round(sorted_areas[p75_idx], 2),
358
+ "p90": round(sorted_areas[p90_idx], 2),
359
+ }
360
+ else:
361
+ # Return zeros for no areas
362
+ if "min_area_ha" in metrics:
363
+ results["min_area_ha"] = 0
364
+ if "mean_area_ha" in metrics:
365
+ results["mean_area_ha"] = 0
366
+ if "median_area_ha" in metrics:
367
+ results["median_area_ha"] = 0
368
+ if "max_area_ha" in metrics:
369
+ results["max_area_ha"] = 0
370
+ if "area_percentiles" in metrics:
371
+ results["area_percentiles"] = {
372
+ "p25": 0,
373
+ "p50": 0,
374
+ "p75": 0,
375
+ "p90": 0,
376
+ }
377
+
378
+ # Vertex statistics
379
+ if vertices_list:
380
+ if "min_vertices" in metrics:
381
+ results["min_vertices"] = min(vertices_list)
382
+ if "mean_vertices" in metrics:
383
+ results["mean_vertices"] = round(
384
+ sum(vertices_list) / len(vertices_list), 2
385
+ )
386
+
387
+ sorted_vertices = sorted(
388
+ vertices_list
389
+ ) # Sort once for median and percentiles
390
+
391
+ if "median_vertices" in metrics:
392
+ mid = len(sorted_vertices) // 2
393
+ results["median_vertices"] = (
394
+ sorted_vertices[mid]
395
+ if len(sorted_vertices) % 2 == 1
396
+ else round(
397
+ (sorted_vertices[mid - 1] + sorted_vertices[mid]) / 2, 0
398
+ )
399
+ )
400
+ if "max_vertices" in metrics:
401
+ results["max_vertices"] = max(vertices_list)
402
+
403
+ if "vertex_percentiles" in metrics:
404
+ n = len(sorted_vertices)
405
+ p25_idx = n // 4
406
+ p50_idx = n // 2
407
+ p75_idx = (n * 3) // 4
408
+ p90_idx = int(n * 0.9)
409
+
410
+ results["vertex_percentiles"] = {
411
+ "p25": sorted_vertices[p25_idx],
412
+ "p50": sorted_vertices[p50_idx]
413
+ if n % 2 == 1
414
+ else round(
415
+ (
416
+ sorted_vertices[p50_idx - 1]
417
+ + sorted_vertices[p50_idx]
418
+ )
419
+ / 2,
420
+ 0,
421
+ ),
422
+ "p75": sorted_vertices[p75_idx],
423
+ "p90": sorted_vertices[p90_idx],
424
+ }
425
+ else:
426
+ # Return zeros for no vertices
427
+ if "min_vertices" in metrics:
428
+ results["min_vertices"] = 0
429
+ if "mean_vertices" in metrics:
430
+ results["mean_vertices"] = 0
431
+ if "median_vertices" in metrics:
432
+ results["median_vertices"] = 0
433
+ if "max_vertices" in metrics:
434
+ results["max_vertices"] = 0
435
+ if "vertex_percentiles" in metrics:
436
+ results["vertex_percentiles"] = {
437
+ "p25": 0,
438
+ "p50": 0,
439
+ "p75": 0,
440
+ "p90": 0,
441
+ }
442
+ else:
443
+ # Return zeros for empty datasets
444
+ for metric in [
445
+ "min_area_ha",
446
+ "mean_area_ha",
447
+ "median_area_ha",
448
+ "max_area_ha",
449
+ "area_percentiles",
450
+ "min_vertices",
451
+ "mean_vertices",
452
+ "median_vertices",
453
+ "max_vertices",
454
+ "vertex_percentiles",
455
+ ]:
456
+ if metric in metrics:
457
+ results[metric] = (
458
+ 0
459
+ if metric not in ["area_percentiles", "vertex_percentiles"]
460
+ else {"p25": 0, "p50": 0, "p75": 0, "p90": 0}
461
+ )
462
+
463
+ # Add geometry quality logging to results
464
+ if bbox_fallback_count > 0 or geometry_skip_count > 0:
465
+ geometry_quality_log = (
466
+ f"Geometry quality summary:\n"
467
+ f" - Bounding box fallback used: {bbox_fallback_count} features\n"
468
+ f" - Geometries skipped: {geometry_skip_count} features"
469
+ )
470
+ if polygon_type_stats:
471
+ geometry_quality_log += "\n - Breakdown:"
472
+ for stat_type, count in sorted(polygon_type_stats.items()):
473
+ geometry_quality_log += f"\n - {stat_type}: {count}"
474
+
475
+ results["geometry_quality_note"] = geometry_quality_log
476
+ print(geometry_quality_log)
477
+
478
+ return results
479
+
480
+ except Exception as e:
481
+ print(f"Error: {str(e)}")
482
+ return {}
483
+
484
+
485
+ def _check_metric_constraints(
486
+ metrics,
487
+ max_polygon_count=250_000,
488
+ max_mean_area_ha=10_000,
489
+ max_max_area_ha=None,
490
+ max_mean_vertices=None,
491
+ max_max_vertices=10_000,
492
+ ):
493
+ """
494
+ Check if computed metrics violate any constraints.
495
+
496
+ Internal helper function for constraint validation.
497
+
498
+ Parameters:
499
+ -----------
500
+ metrics : dict
501
+ Dictionary of computed metrics with keys: count, mean_area_ha, max_area_ha,
502
+ mean_vertices, max_vertices
503
+ max_polygon_count : int
504
+ Maximum allowed number of polygons
505
+ max_mean_area_ha : float
506
+ Maximum allowed mean area per polygon in hectares
507
+ max_max_area_ha : float, optional
508
+ Maximum allowed maximum area per polygon in hectares
509
+ max_mean_vertices : float, optional
510
+ Maximum allowed mean vertices per polygon
511
+ max_max_vertices : int, optional
512
+ Maximum allowed vertices per polygon
513
+
514
+ Returns:
515
+ --------
516
+ list
517
+ List of violation strings (empty if all constraints pass)
518
+ """
519
+ violations = []
520
+
521
+ polygon_count = metrics["count"]
522
+ mean_area = metrics["mean_area_ha"]
523
+ max_area = metrics["max_area_ha"]
524
+ mean_vertices = metrics["mean_vertices"]
525
+ max_vertices_value = metrics["max_vertices"]
526
+
527
+ if polygon_count > max_polygon_count:
528
+ violations.append(
529
+ f"Polygon count ({polygon_count:,}) exceeds limit ({max_polygon_count:,})"
530
+ )
531
+
532
+ if mean_area > max_mean_area_ha:
533
+ violations.append(
534
+ f"Mean area ({mean_area:,.2f} ha) exceeds limit ({max_mean_area_ha:,} ha)"
535
+ )
536
+
537
+ if max_max_area_ha is not None and max_area > max_max_area_ha:
538
+ violations.append(
539
+ f"Max area ({max_area:,.2f} ha) exceeds limit ({max_max_area_ha:,} ha)"
540
+ )
541
+
542
+ if max_mean_vertices is not None and mean_vertices > max_mean_vertices:
543
+ violations.append(
544
+ f"Mean vertices ({mean_vertices:.2f}) exceeds limit ({max_mean_vertices:,})"
545
+ )
546
+
547
+ if max_max_vertices is not None and max_vertices_value > max_max_vertices:
548
+ violations.append(
549
+ f"Max vertices ({max_vertices_value:,}) exceeds limit ({max_max_vertices:,})"
550
+ )
551
+
552
+ return violations
553
+
554
+
555
def validate_geojson_constraints(
    geojson_data: Path | str | dict,
    max_polygon_count=250_000,
    max_mean_area_ha=10_000,
    max_max_area_ha=None,
    max_mean_vertices=None,
    max_max_vertices=10_000,
    verbose=True,
):
    """
    Validate GeoJSON data against defined constraints.

    Raises ValueError if any metrics exceed the specified limits.
    Metrics are computed efficiently in a single sweep over the features.

    Parameters:
    -----------
    geojson_data : Path | str | dict
        GeoJSON FeatureCollection to validate. Can be:
        - dict: GeoJSON FeatureCollection dictionary
        - str: Path to GeoJSON file as string
        - Path: pathlib.Path to GeoJSON file
    max_polygon_count : int, optional
        Maximum allowed number of polygons (default: 250,000)
    max_mean_area_ha : float, optional
        Maximum allowed mean area per polygon in hectares (default: 10,000)
    max_max_area_ha : float, optional
        Maximum allowed maximum area per polygon in hectares (default: None, no limit)
    max_mean_vertices : float, optional
        Maximum allowed mean vertices per polygon (default: None, no limit)
    max_max_vertices : int, optional
        Maximum allowed vertices per polygon (default: 10,000)
    verbose : bool
        Print validation results (default: True)

    Returns:
    --------
    dict
        Computed metrics that passed validation:
        {'count': int, 'mean_area_ha': float, 'max_area_ha': float,
         'mean_vertices': float, 'max_vertices': int, 'valid': bool}

    Raises:
    -------
    FileNotFoundError
        If a path is given and the file does not exist.
    ValueError
        If any constraint is violated.
    """
    # Removed two dead local imports present in the original: the
    # openforis_whisp.data_conversion import was never used, and
    # ShapelyPolygon is already imported at module level.

    # Load GeoJSON from file if a path was provided.
    if isinstance(geojson_data, (str, Path)):
        file_path = Path(geojson_data)
        if not file_path.exists():
            raise FileNotFoundError(f"GeoJSON file not found: {file_path}")
        # GeoJSON is UTF-8 by specification (RFC 7946); don't rely on the
        # platform default encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            geojson_data = json.load(f)

    if verbose:
        print("\n" + "=" * 80)
        print("GEOJSON CONSTRAINT VALIDATION")
        print("=" * 80)
        print("\nConstraint Limits:")
        print(f" - Max polygon count: {max_polygon_count:,}")
        print(f" - Max mean area (ha): {max_mean_area_ha:,}")
        if max_max_area_ha is not None:
            print(f" - Max area per polygon (ha): {max_max_area_ha:,}")
        if max_mean_vertices is not None:
            print(f" - Max mean vertices: {max_mean_vertices:,}")
        if max_max_vertices is not None:
            print(f" - Max vertices per polygon: {max_max_vertices:,}")

    features = geojson_data.get("features", [])

    # Single-sweep accumulation of area and vertex statistics.
    # The factor 1232100 converts WGS84 deg² to hectares at the equator
    # (estimate only - same convention as analyze_geojson).
    total_area = 0
    total_vertices = 0
    max_area = 0
    max_vertices_value = 0
    valid_polygons = 0

    for feature in features:
        try:
            coords = feature["geometry"]["coordinates"]
            geom_type = feature["geometry"]["type"]

            if geom_type == "Polygon":
                feature_vertices = sum(len(ring) for ring in coords)
                total_vertices += feature_vertices
                max_vertices_value = max(max_vertices_value, feature_vertices)

                try:
                    # Exterior ring only - a fast estimate is sufficient for
                    # constraint checks.
                    poly = ShapelyPolygon(coords[0])
                    area_ha = abs(poly.area) * 1232100
                    total_area += area_ha
                    max_area = max(max_area, area_ha)
                except Exception:
                    pass  # unparseable geometry: skip area, keep vertex stats
                valid_polygons += 1

            elif geom_type == "MultiPolygon":
                feature_vertices = sum(
                    len(ring) for polygon in coords for ring in polygon
                )
                total_vertices += feature_vertices
                max_vertices_value = max(max_vertices_value, feature_vertices)

                try:
                    for polygon in coords:
                        poly = ShapelyPolygon(polygon[0])
                        area_ha = abs(poly.area) * 1232100
                        total_area += area_ha
                        max_area = max(max_area, area_ha)
                except Exception:
                    pass
                valid_polygons += 1

        # Narrowed from a bare `except:`; malformed features are skipped.
        except Exception:
            continue

    polygon_count = len(features)
    mean_area = total_area / valid_polygons if valid_polygons > 0 else 0
    mean_vertices = total_vertices / valid_polygons if valid_polygons > 0 else 0

    results = {
        "count": polygon_count,
        "mean_area_ha": round(mean_area, 2),
        "max_area_ha": round(max_area, 2),
        "mean_vertices": round(mean_vertices, 2),
        "max_vertices": max_vertices_value,
        "valid": True,
    }

    if verbose:
        print("\nComputed Metrics:")
        print(f" - Polygon count: {results['count']:,}")
        print(f" - Mean area (ha): {results['mean_area_ha']:,}")
        print(f" - Max area (ha): {results['max_area_ha']:,}")
        print(f" - Mean vertices: {results['mean_vertices']:,}")
        print(f" - Max vertices: {results['max_vertices']:,}")

    # Check constraints using the dedicated helper.
    violations = _check_metric_constraints(
        results,
        max_polygon_count=max_polygon_count,
        max_mean_area_ha=max_mean_area_ha,
        max_max_area_ha=max_max_area_ha,
        max_mean_vertices=max_mean_vertices,
        max_max_vertices=max_max_vertices,
    )

    if verbose:
        print("\n" + "=" * 80)
        if violations:
            print("VALIDATION FAILED")
            print("=" * 80)
            for violation in violations:
                print(f"\n{violation}")
        else:
            print("VALIDATION PASSED")
            print("=" * 80)
            print("\nAll metrics within acceptable limits")

    if violations:
        # Set the flag unconditionally (the original only set it when
        # verbose=True), then raise with the full violation list.
        results["valid"] = False
        error_message = "Constraint validation failed:\n" + "\n".join(violations)
        raise ValueError(error_message)

    return results
751
+
752
+
753
def suggest_method(polygon_count, mean_area_ha, mean_vertices=None, verbose=True):
    """
    Suggest processing method based on polygon characteristics.

    Decision thresholds from benchmark data (area per polygon × polygon count):
    - Small polygons (10 ha): need 250+ polygons for concurrent
    - Medium polygons (100 ha): breakeven at ~100 polygons
    - Large polygons (500 ha): concurrent wins at 50+ polygons

    Parameters:
    -----------
    polygon_count : int
        Number of polygons
    mean_area_ha : float
        Mean area per polygon in hectares
    mean_vertices : float, optional
        Mean number of vertices per polygon (can influence decision for
        complex geometries)
    verbose : bool
        Print recommendation explanation

    Returns:
    --------
    str: 'concurrent' or 'sequential'
    """
    # The area class fixes the baseline breakeven polygon count.
    if mean_area_ha >= 300:  # large polygons
        breakeven = 50
    elif mean_area_ha >= 50:  # medium polygons
        breakeven = 100
    else:  # small polygons
        breakeven = 250

    # Very complex geometries tip the balance towards concurrent earlier:
    # the effective breakeven drops by 25%.
    effective_breakeven = breakeven
    if mean_vertices is not None and mean_vertices > 500:
        effective_breakeven = int(breakeven * 0.75)

    method = "concurrent" if polygon_count >= effective_breakeven else "sequential"

    if verbose:
        print("\nMETHOD RECOMMENDATION")
        print(
            f" Polygons: {polygon_count} | Mean Area: {mean_area_ha:.1f} ha", end=""
        )
        if mean_vertices is not None:
            print(f" | Mean Vertices: {mean_vertices:.1f}", end="")
        print()
        # NOTE: reports the unadjusted breakeven, matching prior behavior.
        print(f" Breakeven: {breakeven} polygons | Method: {method.upper()}")

    return method