openforis-whisp 3.0.0a1__py3-none-any.whl → 3.0.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +7 -7
- openforis_whisp/advanced_stats.py +400 -93
- openforis_whisp/data_checks.py +178 -15
- openforis_whisp/data_conversion.py +154 -59
- openforis_whisp/reformat.py +2 -29
- openforis_whisp/stats.py +15 -45
- openforis_whisp/utils.py +449 -80
- {openforis_whisp-3.0.0a1.dist-info → openforis_whisp-3.0.0a3.dist-info}/METADATA +1 -1
- {openforis_whisp-3.0.0a1.dist-info → openforis_whisp-3.0.0a3.dist-info}/RECORD +11 -11
- {openforis_whisp-3.0.0a1.dist-info → openforis_whisp-3.0.0a3.dist-info}/LICENSE +0 -0
- {openforis_whisp-3.0.0a1.dist-info → openforis_whisp-3.0.0a3.dist-info}/WHEEL +0 -0
openforis_whisp/data_checks.py
CHANGED
|
@@ -7,10 +7,69 @@ and thresholds, raising informative errors when constraints are violated.
|
|
|
7
7
|
|
|
8
8
|
import json
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from shapely.geometry import Polygon as ShapelyPolygon
|
|
10
|
+
from shapely.geometry import Polygon as ShapelyPolygon, shape as shapely_shape
|
|
11
11
|
|
|
12
12
|
# Note: area summary stats are estimations for use in deciding pathways for analysis
|
|
13
13
|
# (estimation preferred here as allows efficient processing speed and limits overhead of checking file)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _convert_projected_area_to_ha(area_sq_units: float, crs: str = None) -> float:
|
|
17
|
+
"""
|
|
18
|
+
Convert area from projected CRS units to hectares.
|
|
19
|
+
|
|
20
|
+
Most projected CRS use meters as units, so:
|
|
21
|
+
- area_sq_units is in square meters
|
|
22
|
+
- 1 hectare = 10,000 m²
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
area_sq_units: Area in square units of the projection (typically square meters)
|
|
26
|
+
crs: CRS string for reference (e.g., 'EPSG:3857'). Used for validation.
|
|
27
|
+
|
|
28
|
+
Returns:
|
|
29
|
+
Area in hectares
|
|
30
|
+
"""
|
|
31
|
+
# Standard conversion: 1 hectare = 10,000 m²
|
|
32
|
+
# Most projected CRS use meters, so this works universally
|
|
33
|
+
return area_sq_units / 10000
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _estimate_area_from_bounds(coords, area_conversion_factor: float) -> float:
|
|
37
|
+
"""
|
|
38
|
+
Estimate area from bounding box when actual area calculation fails.
|
|
39
|
+
Extracts bounding box and calculates its area as a fallback estimate.
|
|
40
|
+
Returns area in hectares.
|
|
41
|
+
"""
|
|
42
|
+
try:
|
|
43
|
+
# Flatten all coordinates to find bounds
|
|
44
|
+
all_coords = []
|
|
45
|
+
|
|
46
|
+
def flatten_coords(c):
|
|
47
|
+
if isinstance(c[0], (list, tuple)) and isinstance(c[0][0], (list, tuple)):
|
|
48
|
+
for sub in c:
|
|
49
|
+
flatten_coords(sub)
|
|
50
|
+
else:
|
|
51
|
+
all_coords.extend(c)
|
|
52
|
+
|
|
53
|
+
flatten_coords(coords)
|
|
54
|
+
if not all_coords:
|
|
55
|
+
return 0
|
|
56
|
+
|
|
57
|
+
# Extract lon/lat values
|
|
58
|
+
lons = [c[0] for c in all_coords]
|
|
59
|
+
lats = [c[1] for c in all_coords]
|
|
60
|
+
|
|
61
|
+
min_lon, max_lon = min(lons), max(lons)
|
|
62
|
+
min_lat, max_lat = min(lats), max(lats)
|
|
63
|
+
|
|
64
|
+
# Bounding box area
|
|
65
|
+
bbox_area = (max_lon - min_lon) * (max_lat - min_lat)
|
|
66
|
+
|
|
67
|
+
# Apply conversion factor
|
|
68
|
+
return abs(bbox_area) * area_conversion_factor
|
|
69
|
+
except:
|
|
70
|
+
return 0
|
|
71
|
+
|
|
72
|
+
|
|
14
73
|
def analyze_geojson(
|
|
15
74
|
geojson_data: Path | str | dict,
|
|
16
75
|
metrics=[
|
|
@@ -76,6 +135,8 @@ def analyze_geojson(
|
|
|
76
135
|
- 'vertex_percentiles': {'p25': int, 'p50': int, 'p75': int, 'p90': int}
|
|
77
136
|
"""
|
|
78
137
|
results = {}
|
|
138
|
+
crs_warning = None
|
|
139
|
+
file_path = None
|
|
79
140
|
|
|
80
141
|
try:
|
|
81
142
|
# Load GeoJSON from file if path provided
|
|
@@ -83,11 +144,45 @@ def analyze_geojson(
|
|
|
83
144
|
file_path = Path(geojson_data)
|
|
84
145
|
if not file_path.exists():
|
|
85
146
|
raise FileNotFoundError(f"GeoJSON file not found: {file_path}")
|
|
86
|
-
|
|
87
|
-
|
|
147
|
+
|
|
148
|
+
# Try UTF-8 first (most common), then fall back to auto-detection
|
|
149
|
+
try:
|
|
150
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
|
151
|
+
geojson_data = json.load(f)
|
|
152
|
+
except UnicodeDecodeError:
|
|
153
|
+
# Auto-detect encoding if UTF-8 fails
|
|
154
|
+
try:
|
|
155
|
+
import chardet
|
|
156
|
+
|
|
157
|
+
with open(file_path, "rb") as f:
|
|
158
|
+
raw_data = f.read()
|
|
159
|
+
detected = chardet.detect(raw_data)
|
|
160
|
+
encoding = detected.get("encoding", "latin-1")
|
|
161
|
+
|
|
162
|
+
with open(file_path, "r", encoding=encoding, errors="replace") as f:
|
|
163
|
+
geojson_data = json.load(f)
|
|
164
|
+
except Exception:
|
|
165
|
+
# Final fallback: use latin-1 which accepts all byte values
|
|
166
|
+
with open(file_path, "r", encoding="latin-1") as f:
|
|
167
|
+
geojson_data = json.load(f)
|
|
168
|
+
|
|
169
|
+
# Detect CRS from file if available
|
|
170
|
+
try:
|
|
171
|
+
import geopandas as gpd
|
|
172
|
+
|
|
173
|
+
gdf = gpd.read_file(file_path)
|
|
174
|
+
if gdf.crs and gdf.crs != "EPSG:4326":
|
|
175
|
+
crs_warning = f"⚠️ CRS is {gdf.crs}, not EPSG:4326. Area metrics will be inaccurate. Data will be auto-reprojected during processing."
|
|
176
|
+
except Exception:
|
|
177
|
+
pass # If we can't detect CRS, continue without warning
|
|
88
178
|
|
|
89
179
|
features = geojson_data.get("features", [])
|
|
90
180
|
|
|
181
|
+
# Add CRS warning to results if detected
|
|
182
|
+
if crs_warning:
|
|
183
|
+
results["crs_warning"] = crs_warning
|
|
184
|
+
print(crs_warning)
|
|
185
|
+
|
|
91
186
|
if "count" in metrics:
|
|
92
187
|
results["count"] = len(features)
|
|
93
188
|
|
|
@@ -113,6 +208,29 @@ def analyze_geojson(
|
|
|
113
208
|
geometry_type_counts = {}
|
|
114
209
|
valid_polygons = 0
|
|
115
210
|
|
|
211
|
+
# Tracking for fallback geometries
|
|
212
|
+
bbox_fallback_count = 0 # Geometries that used bounding box estimate
|
|
213
|
+
geometry_skip_count = 0 # Geometries completely skipped
|
|
214
|
+
polygon_type_stats = {} # Track stats by geometry type
|
|
215
|
+
|
|
216
|
+
# Detect CRS to determine area conversion factor
|
|
217
|
+
area_conversion_factor = 1232100 # Default: WGS84 (degrees to ha)
|
|
218
|
+
detected_crs = None
|
|
219
|
+
|
|
220
|
+
# Try to detect CRS from file if available
|
|
221
|
+
if file_path:
|
|
222
|
+
try:
|
|
223
|
+
import geopandas as gpd
|
|
224
|
+
|
|
225
|
+
gdf_temp = gpd.read_file(str(file_path))
|
|
226
|
+
detected_crs = gdf_temp.crs
|
|
227
|
+
if detected_crs and detected_crs != "EPSG:4326":
|
|
228
|
+
# Projected CRS typically uses meters, so convert m² to ha
|
|
229
|
+
# 1 ha = 10,000 m²
|
|
230
|
+
area_conversion_factor = 1 / 10000
|
|
231
|
+
except Exception:
|
|
232
|
+
pass # Use default if CRS detection fails
|
|
233
|
+
|
|
116
234
|
for feature in features:
|
|
117
235
|
try:
|
|
118
236
|
coords = feature["geometry"]["coordinates"]
|
|
@@ -133,13 +251,27 @@ def analyze_geojson(
|
|
|
133
251
|
|
|
134
252
|
# Calculate area from coordinates using shapely
|
|
135
253
|
try:
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
#
|
|
139
|
-
area_ha = abs(
|
|
254
|
+
# Use shapely.geometry.shape to properly handle all geometry components
|
|
255
|
+
geom = shapely_shape(feature["geometry"])
|
|
256
|
+
# Convert using detected CRS
|
|
257
|
+
area_ha = abs(geom.area) * area_conversion_factor
|
|
140
258
|
areas.append(area_ha)
|
|
141
|
-
except:
|
|
142
|
-
|
|
259
|
+
except Exception as e:
|
|
260
|
+
# Fallback: estimate from bounding box if geometry fails
|
|
261
|
+
bbox_area = _estimate_area_from_bounds(
|
|
262
|
+
coords, area_conversion_factor
|
|
263
|
+
)
|
|
264
|
+
if bbox_area > 0:
|
|
265
|
+
areas.append(bbox_area)
|
|
266
|
+
bbox_fallback_count += 1
|
|
267
|
+
polygon_type_stats["Polygon_bbox"] = (
|
|
268
|
+
polygon_type_stats.get("Polygon_bbox", 0) + 1
|
|
269
|
+
)
|
|
270
|
+
else:
|
|
271
|
+
geometry_skip_count += 1
|
|
272
|
+
polygon_type_stats["Polygon_skipped"] = (
|
|
273
|
+
polygon_type_stats.get("Polygon_skipped", 0) + 1
|
|
274
|
+
)
|
|
143
275
|
valid_polygons += 1
|
|
144
276
|
|
|
145
277
|
elif geom_type == "MultiPolygon":
|
|
@@ -152,12 +284,28 @@ def analyze_geojson(
|
|
|
152
284
|
|
|
153
285
|
# Calculate area from coordinates using shapely
|
|
154
286
|
try:
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
287
|
+
# Use shapely.geometry.shape to properly handle MultiPolygon
|
|
288
|
+
geom = shapely_shape(feature["geometry"])
|
|
289
|
+
# Convert using detected CRS - use total area of all parts
|
|
290
|
+
area_ha = abs(geom.area) * area_conversion_factor
|
|
291
|
+
areas.append(area_ha)
|
|
292
|
+
except Exception as e:
|
|
293
|
+
# Fallback: estimate from bounding box if geometry fails
|
|
294
|
+
bbox_area = _estimate_area_from_bounds(
|
|
295
|
+
coords, area_conversion_factor
|
|
296
|
+
)
|
|
297
|
+
if bbox_area > 0:
|
|
298
|
+
areas.append(bbox_area)
|
|
299
|
+
bbox_fallback_count += 1
|
|
300
|
+
polygon_type_stats["MultiPolygon_bbox"] = (
|
|
301
|
+
polygon_type_stats.get("MultiPolygon_bbox", 0) + 1
|
|
302
|
+
)
|
|
303
|
+
else:
|
|
304
|
+
geometry_skip_count += 1
|
|
305
|
+
polygon_type_stats["MultiPolygon_skipped"] = (
|
|
306
|
+
polygon_type_stats.get("MultiPolygon_skipped", 0)
|
|
307
|
+
+ 1
|
|
308
|
+
)
|
|
161
309
|
valid_polygons += 1
|
|
162
310
|
|
|
163
311
|
except:
|
|
@@ -312,6 +460,21 @@ def analyze_geojson(
|
|
|
312
460
|
else {"p25": 0, "p50": 0, "p75": 0, "p90": 0}
|
|
313
461
|
)
|
|
314
462
|
|
|
463
|
+
# Add geometry quality logging to results
|
|
464
|
+
if bbox_fallback_count > 0 or geometry_skip_count > 0:
|
|
465
|
+
geometry_quality_log = (
|
|
466
|
+
f"Geometry quality summary:\n"
|
|
467
|
+
f" - Bounding box fallback used: {bbox_fallback_count} features\n"
|
|
468
|
+
f" - Geometries skipped: {geometry_skip_count} features"
|
|
469
|
+
)
|
|
470
|
+
if polygon_type_stats:
|
|
471
|
+
geometry_quality_log += "\n - Breakdown:"
|
|
472
|
+
for stat_type, count in sorted(polygon_type_stats.items()):
|
|
473
|
+
geometry_quality_log += f"\n - {stat_type}: {count}"
|
|
474
|
+
|
|
475
|
+
results["geometry_quality_note"] = geometry_quality_log
|
|
476
|
+
print(geometry_quality_log)
|
|
477
|
+
|
|
315
478
|
return results
|
|
316
479
|
|
|
317
480
|
except Exception as e:
|
|
@@ -12,67 +12,81 @@ import geopandas as gpd
|
|
|
12
12
|
import ee
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
15
|
+
# ============================================================================
|
|
16
|
+
# HELPER FUNCTIONS FOR UNIFIED PROCESSING PATHWAY
|
|
17
|
+
# ============================================================================
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _sanitize_geodataframe(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
|
|
20
21
|
"""
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
22
|
+
Sanitize GeoDataFrame data types for JSON serialization.
|
|
23
|
+
|
|
24
|
+
Converts problematic data types that cannot be directly serialized:
|
|
25
|
+
- DateTime/Timestamp columns → ISO format strings
|
|
26
|
+
- Object columns → strings
|
|
27
|
+
- Skips geometry column
|
|
25
28
|
|
|
26
29
|
Args:
|
|
27
|
-
|
|
28
|
-
or a GeoJSON dictionary object.
|
|
29
|
-
enforce_wgs84 (bool): Whether to enforce WGS 84 projection (EPSG:4326). Defaults to True.
|
|
30
|
-
Only applies when input is a file path (dicts are assumed to be in WGS84).
|
|
31
|
-
strip_z_coords (bool): Whether to automatically strip Z coordinates from 3D geometries. Defaults to True.
|
|
30
|
+
gdf (gpd.GeoDataFrame): Input GeoDataFrame
|
|
32
31
|
|
|
33
32
|
Returns:
|
|
34
|
-
|
|
33
|
+
gpd.GeoDataFrame: GeoDataFrame with sanitized data types
|
|
34
|
+
"""
|
|
35
|
+
gdf = gdf.copy()
|
|
36
|
+
for col in gdf.columns:
|
|
37
|
+
if col != gdf.geometry.name: # Skip geometry column
|
|
38
|
+
# Handle datetime/timestamp columns
|
|
39
|
+
if pd.api.types.is_datetime64_any_dtype(gdf[col]):
|
|
40
|
+
gdf[col] = gdf[col].dt.strftime("%Y-%m-%d %H:%M:%S").fillna("")
|
|
41
|
+
# Handle other problematic types
|
|
42
|
+
elif gdf[col].dtype == "object":
|
|
43
|
+
# Convert any remaining non-serializable objects to strings
|
|
44
|
+
gdf[col] = gdf[col].astype(str)
|
|
45
|
+
return gdf
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _ensure_wgs84_crs(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
|
|
35
49
|
"""
|
|
36
|
-
|
|
37
|
-
# Input is already a GeoJSON dictionary - skip file reading
|
|
38
|
-
geojson_data = geojson_filepath
|
|
39
|
-
elif isinstance(geojson_filepath, (str, Path)):
|
|
40
|
-
file_path = os.path.abspath(geojson_filepath)
|
|
50
|
+
Ensure GeoDataFrame uses WGS 84 (EPSG:4326) coordinate reference system.
|
|
41
51
|
|
|
42
|
-
|
|
43
|
-
|
|
52
|
+
- If CRS is None, assumes WGS 84
|
|
53
|
+
- If CRS is not WGS 84, converts to WGS 84
|
|
54
|
+
- If already WGS 84, returns unchanged
|
|
44
55
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
if col != gdf.geometry.name: # Skip geometry column
|
|
48
|
-
# Handle datetime/timestamp columns
|
|
49
|
-
if pd.api.types.is_datetime64_any_dtype(gdf[col]):
|
|
50
|
-
gdf[col] = gdf[col].dt.strftime("%Y-%m-%d %H:%M:%S").fillna("")
|
|
51
|
-
# Handle other problematic types
|
|
52
|
-
elif gdf[col].dtype == "object":
|
|
53
|
-
# Convert any remaining non-serializable objects to strings
|
|
54
|
-
gdf[col] = gdf[col].astype(str)
|
|
55
|
-
|
|
56
|
-
# Check and convert CRS if needed
|
|
57
|
-
if enforce_wgs84:
|
|
58
|
-
if gdf.crs is None:
|
|
59
|
-
# Assuming WGS 84 if no CRS defined
|
|
60
|
-
pass
|
|
61
|
-
elif gdf.crs != "EPSG:4326":
|
|
62
|
-
gdf = gdf.to_crs("EPSG:4326")
|
|
63
|
-
|
|
64
|
-
# Convert to GeoJSON
|
|
65
|
-
geojson_data = json.loads(gdf.to_json())
|
|
66
|
-
else:
|
|
67
|
-
raise ValueError(
|
|
68
|
-
"Input must be a file path (str or Path) or a GeoJSON dictionary object (dict)"
|
|
69
|
-
)
|
|
56
|
+
Args:
|
|
57
|
+
gdf (gpd.GeoDataFrame): Input GeoDataFrame
|
|
70
58
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
59
|
+
Returns:
|
|
60
|
+
gpd.GeoDataFrame: GeoDataFrame in WGS 84
|
|
61
|
+
"""
|
|
62
|
+
if gdf.crs is None:
|
|
63
|
+
# Assuming WGS 84 if no CRS defined
|
|
64
|
+
return gdf
|
|
65
|
+
elif gdf.crs != "EPSG:4326":
|
|
66
|
+
return gdf.to_crs("EPSG:4326")
|
|
67
|
+
return gdf
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _create_ee_feature_collection(
|
|
71
|
+
geojson_data: dict, strip_z_coords: bool = True, input_source: str = "input"
|
|
72
|
+
) -> ee.FeatureCollection:
|
|
73
|
+
"""
|
|
74
|
+
Create Earth Engine FeatureCollection from GeoJSON dict with error recovery.
|
|
75
|
+
|
|
76
|
+
Attempts to create EE FeatureCollection. If it fails due to 3D coordinates
|
|
77
|
+
and strip_z_coords is True, automatically strips Z values and retries.
|
|
78
|
+
|
|
79
|
+
Args:
|
|
80
|
+
geojson_data (dict): GeoJSON data dictionary
|
|
81
|
+
strip_z_coords (bool): Whether to retry with 2D geometries on failure
|
|
82
|
+
input_source (str): Description of input source for logging
|
|
83
|
+
|
|
84
|
+
Returns:
|
|
85
|
+
ee.FeatureCollection: Earth Engine FeatureCollection
|
|
74
86
|
|
|
75
|
-
|
|
87
|
+
Raises:
|
|
88
|
+
ee.EEException: If conversion fails even after retries
|
|
89
|
+
"""
|
|
76
90
|
try:
|
|
77
91
|
feature_collection = ee.FeatureCollection(
|
|
78
92
|
create_feature_collection(geojson_data)
|
|
@@ -81,16 +95,16 @@ def convert_geojson_to_ee(
|
|
|
81
95
|
except ee.EEException as e:
|
|
82
96
|
if "Invalid GeoJSON geometry" in str(e) and strip_z_coords:
|
|
83
97
|
# Apply print_once deduplication for Z-coordinate stripping messages
|
|
84
|
-
if not hasattr(
|
|
85
|
-
|
|
98
|
+
if not hasattr(_create_ee_feature_collection, "_printed_z_messages"):
|
|
99
|
+
_create_ee_feature_collection._printed_z_messages = set()
|
|
86
100
|
|
|
87
|
-
z_message_key = f"z_coords_{
|
|
88
|
-
if z_message_key not in
|
|
101
|
+
z_message_key = f"z_coords_{input_source}"
|
|
102
|
+
if z_message_key not in _create_ee_feature_collection._printed_z_messages:
|
|
89
103
|
print(
|
|
90
104
|
"Warning: Invalid GeoJSON geometry detected, likely due to 3D coordinates."
|
|
91
105
|
)
|
|
92
106
|
print("Attempting to fix by stripping Z coordinates...")
|
|
93
|
-
|
|
107
|
+
_create_ee_feature_collection._printed_z_messages.add(z_message_key)
|
|
94
108
|
|
|
95
109
|
# Apply Z-coordinate stripping
|
|
96
110
|
geojson_data_fixed = _strip_z_coordinates_from_geojson(geojson_data)
|
|
@@ -101,10 +115,15 @@ def convert_geojson_to_ee(
|
|
|
101
115
|
create_feature_collection(geojson_data_fixed)
|
|
102
116
|
)
|
|
103
117
|
|
|
104
|
-
success_message_key = f"z_coords_success_{
|
|
105
|
-
if
|
|
118
|
+
success_message_key = f"z_coords_success_{input_source}"
|
|
119
|
+
if (
|
|
120
|
+
success_message_key
|
|
121
|
+
not in _create_ee_feature_collection._printed_z_messages
|
|
122
|
+
):
|
|
106
123
|
print("Successfully converted after stripping Z coordinates")
|
|
107
|
-
|
|
124
|
+
_create_ee_feature_collection._printed_z_messages.add(
|
|
125
|
+
success_message_key
|
|
126
|
+
)
|
|
108
127
|
|
|
109
128
|
return feature_collection
|
|
110
129
|
except Exception as retry_error:
|
|
@@ -115,6 +134,82 @@ def convert_geojson_to_ee(
|
|
|
115
134
|
raise e
|
|
116
135
|
|
|
117
136
|
|
|
137
|
+
def convert_geojson_to_ee(
    geojson_input: Union[str, Path, dict, gpd.GeoDataFrame],
    enforce_wgs84: bool = True,
    strip_z_coords: bool = True,
) -> ee.FeatureCollection:
    """
    Build an Earth Engine FeatureCollection from GeoJSON-like input.

    Every supported input is first turned into a GeoDataFrame so that the
    remaining steps are shared:

    1. Normalize the input (file path, GeoJSON dict or GeoDataFrame).
    2. Sanitize column data types via ``_sanitize_geodataframe``.
    3. Optionally ensure WGS 84 via ``_ensure_wgs84_crs``.
    4. Serialize to a GeoJSON dict and validate it with ``validate_geojson``.
    5. Hand the dict to ``_create_ee_feature_collection``.

    Args:
        geojson_input (Union[str, Path, dict, gpd.GeoDataFrame]):
            - File path (str or Path), read with ``gpd.read_file``
            - GeoJSON dictionary; its ``"features"`` list is used
            - GeoPandas GeoDataFrame; a copy is taken
        enforce_wgs84 (bool): Whether to pass the data through
            ``_ensure_wgs84_crs``. Defaults to True.
        strip_z_coords (bool): Forwarded to ``_create_ee_feature_collection``.
            Defaults to True.

    Returns:
        ee.FeatureCollection: Earth Engine FeatureCollection created from the
        GeoJSON.

    Raises:
        ValueError: If the input type is unsupported or ``validate_geojson``
            reports errors.
    """
    # Step 1: bring every accepted input type to a GeoDataFrame.
    if isinstance(geojson_input, (str, Path)):
        resolved_path = os.path.abspath(geojson_input)
        frame = gpd.read_file(resolved_path)
        input_source = f"file ({resolved_path})"
    elif isinstance(geojson_input, dict):
        feature_list = geojson_input.get("features", [])
        frame = gpd.GeoDataFrame.from_features(feature_list)
        input_source = "dict"
    elif isinstance(geojson_input, gpd.GeoDataFrame):
        frame = geojson_input.copy()
        input_source = "GeoDataFrame"
    else:
        received = type(geojson_input).__name__
        raise ValueError(
            "Input must be a file path (str or Path), GeoJSON dict, or GeoDataFrame. "
            f"Got {received}"
        )

    # Step 2: make column values serializable before converting to JSON.
    frame = _sanitize_geodataframe(frame)

    # Step 3: CRS handling is opt-out.
    if enforce_wgs84:
        frame = _ensure_wgs84_crs(frame)

    # Step 4: round-trip through GeoJSON text to obtain a plain dict, then validate.
    geojson_data = json.loads(frame.to_json())
    problems = validate_geojson(geojson_data)
    if problems:
        raise ValueError(f"GeoJSON validation errors: {problems}")

    # Step 5: create the EE collection; input_source is passed along for its messages.
    return _create_ee_feature_collection(geojson_data, strip_z_coords, input_source)
|
|
211
|
+
|
|
212
|
+
|
|
118
213
|
def _strip_z_coordinates_from_geojson(geojson_data: dict) -> dict:
|
|
119
214
|
"""
|
|
120
215
|
Helper function to strip Z coordinates from GeoJSON data.
|
openforis_whisp/reformat.py
CHANGED
|
@@ -125,7 +125,7 @@ def validate_dataframe(
|
|
|
125
125
|
Returns:
|
|
126
126
|
pd.DataFrame: The validated DataFrame with columns ordered according to the schema, or None if validation fails.
|
|
127
127
|
"""
|
|
128
|
-
|
|
128
|
+
_log_missing_columns(df_stats, schema)
|
|
129
129
|
|
|
130
130
|
# df_stats = df_stats.reindex(schema.columns.keys(), axis=1)
|
|
131
131
|
|
|
@@ -251,7 +251,7 @@ def create_schema_from_dataframe(schema_df: pd.DataFrame) -> pa.DataFrameSchema:
|
|
|
251
251
|
# return logger
|
|
252
252
|
|
|
253
253
|
|
|
254
|
-
def
|
|
254
|
+
def _log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
|
|
255
255
|
# Initialize the logger
|
|
256
256
|
logger = setup_logger(__name__)
|
|
257
257
|
|
|
@@ -675,33 +675,6 @@ def _process_custom_bands(df_extra: pd.DataFrame, custom_bands) -> pd.DataFrame:
|
|
|
675
675
|
|
|
676
676
|
|
|
677
677
|
# Fix the duplicate logging issue
|
|
678
|
-
def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
|
|
679
|
-
# Remove the duplicate logger creation line
|
|
680
|
-
# logger = setup_logger(__name__) # DELETE THIS LINE
|
|
681
|
-
|
|
682
|
-
# Use the existing module-level logger (line 18: logger = StdoutLogger(__name__))
|
|
683
|
-
|
|
684
|
-
# Extract the expected columns from the DataFrameSchema
|
|
685
|
-
template_columns = list(template_schema.columns.keys())
|
|
686
|
-
df_stats_columns = df_stats.columns.tolist()
|
|
687
|
-
|
|
688
|
-
# Find missing and extra columns
|
|
689
|
-
missing_in_df = [col for col in template_columns if col not in df_stats_columns]
|
|
690
|
-
extra_in_df = [col for col in df_stats_columns if col not in template_columns]
|
|
691
|
-
|
|
692
|
-
# Log missing schema columns
|
|
693
|
-
if missing_in_df:
|
|
694
|
-
logger.warning(f"Missing expected schema columns: {missing_in_df}")
|
|
695
|
-
else:
|
|
696
|
-
logger.info("All expected schema columns found in DataFrame.")
|
|
697
|
-
|
|
698
|
-
# Log extra columns (will be preserved)
|
|
699
|
-
if extra_in_df:
|
|
700
|
-
logger.info(f"Extra columns found (will be preserved): {extra_in_df}")
|
|
701
|
-
else:
|
|
702
|
-
logger.info("No extra columns found in DataFrame.")
|
|
703
|
-
|
|
704
|
-
|
|
705
678
|
def format_stats_dataframe(
|
|
706
679
|
df,
|
|
707
680
|
area_col="Area_sum",
|
openforis_whisp/stats.py
CHANGED
|
@@ -93,7 +93,6 @@ def whisp_formatted_stats_geojson_to_df_legacy(
|
|
|
93
93
|
unit_type="ha",
|
|
94
94
|
whisp_image=None,
|
|
95
95
|
custom_bands=None, # New parameter
|
|
96
|
-
validate_geometries: bool = False,
|
|
97
96
|
) -> pd.DataFrame:
|
|
98
97
|
"""
|
|
99
98
|
Legacy function for basic Whisp stats extraction.
|
|
@@ -135,48 +134,15 @@ def whisp_formatted_stats_geojson_to_df_legacy(
|
|
|
135
134
|
- List of band names: ['Aa_test', 'elevation']
|
|
136
135
|
- Dict with types: {'Aa_test': 'float64', 'elevation': 'float32'}
|
|
137
136
|
- None: preserves all extra columns automatically
|
|
138
|
-
validate_geometries : bool, optional
|
|
139
|
-
Whether to validate and fix invalid geometries, by default False.
|
|
140
|
-
Set to True to automatically fix invalid/self-intersecting polygons.
|
|
141
137
|
|
|
142
138
|
Returns
|
|
143
139
|
-------
|
|
144
140
|
df_stats : pd.DataFrame
|
|
145
141
|
The DataFrame containing the Whisp stats for the input ROI.
|
|
146
142
|
"""
|
|
147
|
-
#
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
import geopandas as gpd
|
|
151
|
-
from shapely.validation import make_valid
|
|
152
|
-
import logging as py_logging
|
|
153
|
-
|
|
154
|
-
logger = py_logging.getLogger("whisp-legacy")
|
|
155
|
-
|
|
156
|
-
# Load GeoJSON file
|
|
157
|
-
with open(input_geojson_filepath, "r") as f:
|
|
158
|
-
geojson_data = json.load(f)
|
|
159
|
-
|
|
160
|
-
# Convert to GeoDataFrame
|
|
161
|
-
gdf = gpd.GeoDataFrame.from_features(geojson_data["features"])
|
|
162
|
-
|
|
163
|
-
# Validate and fix invalid geometries
|
|
164
|
-
valid_count = gdf.geometry.is_valid.sum()
|
|
165
|
-
invalid_count = len(gdf) - valid_count
|
|
166
|
-
if invalid_count > 0:
|
|
167
|
-
logger.warning(f"Fixing {invalid_count} invalid geometries")
|
|
168
|
-
gdf["geometry"] = gdf["geometry"].apply(
|
|
169
|
-
lambda g: make_valid(g) if g and not g.is_valid else g
|
|
170
|
-
)
|
|
171
|
-
|
|
172
|
-
# Convert back to GeoJSON dict (stays in memory - no temp files!)
|
|
173
|
-
geojson_cleaned = json.loads(gdf.to_json())
|
|
174
|
-
|
|
175
|
-
# OPTIMIZATION: Pass GeoJSON dict directly - eliminates file I/O overhead
|
|
176
|
-
feature_collection = convert_geojson_to_ee(geojson_cleaned)
|
|
177
|
-
else:
|
|
178
|
-
# Original path - no validation
|
|
179
|
-
feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
|
|
143
|
+
# Convert GeoJSON to Earth Engine FeatureCollection
|
|
144
|
+
# Note: Geometry validation/cleaning should be done before calling this function
|
|
145
|
+
feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
|
|
180
146
|
|
|
181
147
|
return whisp_formatted_stats_ee_to_df(
|
|
182
148
|
feature_collection,
|
|
@@ -200,7 +166,7 @@ def whisp_formatted_stats_geojson_to_df(
|
|
|
200
166
|
mode: str = "sequential",
|
|
201
167
|
batch_size: int = 10,
|
|
202
168
|
max_concurrent: int = 20,
|
|
203
|
-
|
|
169
|
+
geometry_audit_trail: bool = False,
|
|
204
170
|
) -> pd.DataFrame:
|
|
205
171
|
"""
|
|
206
172
|
Main entry point for converting GeoJSON to Whisp statistics.
|
|
@@ -248,11 +214,16 @@ def whisp_formatted_stats_geojson_to_df(
|
|
|
248
214
|
max_concurrent : int, optional
|
|
249
215
|
Maximum concurrent EE calls for concurrent mode, by default 20.
|
|
250
216
|
Only applicable for "concurrent" mode.
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
217
|
+
geometry_audit_trail : bool, default False
|
|
218
|
+
If True, includes audit trail columns:
|
|
219
|
+
- geo_original: Original input geometry
|
|
220
|
+
- geometry_type_original: Original geometry type
|
|
221
|
+
- geometry_type: Processed geometry type (from EE)
|
|
222
|
+
- geometry_type_changed: Boolean flag if geometry changed
|
|
223
|
+
- geometry_degradation_type: Description of how it changed
|
|
224
|
+
|
|
225
|
+
Processing metadata stored in df.attrs['processing_metadata'].
|
|
226
|
+
These columns enable full transparency for geometry modifications during processing.
|
|
256
227
|
|
|
257
228
|
Returns
|
|
258
229
|
-------
|
|
@@ -317,7 +288,6 @@ def whisp_formatted_stats_geojson_to_df(
|
|
|
317
288
|
unit_type=unit_type,
|
|
318
289
|
whisp_image=whisp_image,
|
|
319
290
|
custom_bands=custom_bands,
|
|
320
|
-
validate_geometries=validate_geometries,
|
|
321
291
|
)
|
|
322
292
|
elif mode in ("concurrent", "sequential"):
|
|
323
293
|
# Log info if batch_size or max_concurrent are not used in sequential mode
|
|
@@ -344,7 +314,7 @@ def whisp_formatted_stats_geojson_to_df(
|
|
|
344
314
|
mode=mode, # Pass mode directly (concurrent or sequential)
|
|
345
315
|
batch_size=batch_size,
|
|
346
316
|
max_concurrent=max_concurrent,
|
|
347
|
-
|
|
317
|
+
geometry_audit_trail=geometry_audit_trail,
|
|
348
318
|
)
|
|
349
319
|
else:
|
|
350
320
|
raise ValueError(
|