openforis-whisp 2.0.0b3__py3-none-any.whl → 3.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +35 -4
- openforis_whisp/advanced_stats.py +2070 -0
- openforis_whisp/data_checks.py +642 -0
- openforis_whisp/data_conversion.py +86 -44
- openforis_whisp/datasets.py +124 -36
- openforis_whisp/logger.py +26 -0
- openforis_whisp/parameters/__init__.py +0 -0
- openforis_whisp/parameters/lookup_gaul1_admin.py +18663 -0
- openforis_whisp/reformat.py +198 -2
- openforis_whisp/stats.py +314 -52
- {openforis_whisp-2.0.0b3.dist-info → openforis_whisp-3.0.0a1.dist-info}/METADATA +1 -1
- openforis_whisp-3.0.0a1.dist-info/RECORD +20 -0
- openforis_whisp-2.0.0b3.dist-info/RECORD +0 -16
- {openforis_whisp-2.0.0b3.dist-info → openforis_whisp-3.0.0a1.dist-info}/LICENSE +0 -0
- {openforis_whisp-2.0.0b3.dist-info → openforis_whisp-3.0.0a1.dist-info}/WHEEL +0 -0
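
Before the file-by-file diff, a minimal usage sketch of the two new entry points added in openforis_whisp/advanced_stats.py (whisp_stats_geojson_to_df_concurrent and whisp_stats_geojson_to_df_sequential). This is an illustration only, not part of the package: the input path "plots.geojson" is a hypothetical placeholder, and the endpoint setup follows the guidance printed by the new module's validate_ee_endpoint helper.

    import ee
    from openforis_whisp.advanced_stats import (
        whisp_stats_geojson_to_df_concurrent,
        whisp_stats_geojson_to_df_sequential,
    )

    # Concurrent path: requires the high-volume Earth Engine endpoint.
    ee.Initialize(opt_url="https://earthengine-highvolume.googleapis.com")
    df = whisp_stats_geojson_to_df_concurrent(
        "plots.geojson",        # hypothetical input file
        batch_size=10,
        max_concurrent=20,
        unit_type="ha",
    )

    # Sequential path: expects the standard endpoint (plain ee.Initialize()).
    # ee.Reset(); ee.Initialize()
    # df = whisp_stats_geojson_to_df_sequential("plots.geojson", unit_type="percent")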
|
@@ -0,0 +1,2070 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Advanced statistics processing for WHISP - concurrent and sequential endpoints.
|
|
3
|
+
|
|
4
|
+
This module provides optimized functions for processing GeoJSON FeatureCollections
|
|
5
|
+
with Whisp datasets using concurrent batching (for high-volume processing)
|
|
6
|
+
and standard sequential processing.
|
|
7
|
+
|
|
8
|
+
NOTE: This module is a transition state. The plan is to eventually merge these
|
|
9
|
+
functions into stats.py and replace the standard functions there as the primary
|
|
10
|
+
implementation, deprecating the legacy versions.
|
|
11
|
+
|
|
12
|
+
Key features:
|
|
13
|
+
- whisp_stats_geojson_to_df_concurrent
|
|
14
|
+
- whisp_stats_geojson_to_df_sequential (standard endpoint, sequential)
|
|
15
|
+
- Proper logging at different levels (WARNING, INFO, DEBUG)
|
|
16
|
+
- Progress tracking without external dependencies
|
|
17
|
+
- Client-side and server-side metadata extraction options
|
|
18
|
+
- Endpoint validation and warnings
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import ee
|
|
22
|
+
import pandas as pd
|
|
23
|
+
import geopandas as gpd
|
|
24
|
+
import logging
|
|
25
|
+
import sys
|
|
26
|
+
import threading
|
|
27
|
+
import time
|
|
28
|
+
import warnings
|
|
29
|
+
import json
|
|
30
|
+
import io
|
|
31
|
+
import os
|
|
32
|
+
import subprocess
|
|
33
|
+
from contextlib import redirect_stdout, contextmanager
|
|
34
|
+
from pathlib import Path
|
|
35
|
+
from typing import Optional, List, Dict, Any, Tuple
|
|
36
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
37
|
+
import tempfile
|
|
38
|
+
|
|
39
|
+
# ============================================================================
|
|
40
|
+
# STDOUT/STDERR SUPPRESSION CONTEXT MANAGER (for C-level output)
|
|
41
|
+
# ============================================================================
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@contextmanager
|
|
45
|
+
def suppress_c_level_output():
|
|
46
|
+
"""Suppress C-level stdout/stderr writes from libraries like Fiona."""
|
|
47
|
+
if sys.platform == "win32":
|
|
48
|
+
# Windows doesn't support dup2() reliably for STDOUT/STDERR
|
|
49
|
+
# Fall back to Python-level suppression
|
|
50
|
+
with redirect_stdout(io.StringIO()):
|
|
51
|
+
yield
|
|
52
|
+
else:
|
|
53
|
+
# Unix-like systems: use file descriptor redirection
|
|
54
|
+
saved_stdout = os.dup(1)
|
|
55
|
+
saved_stderr = os.dup(2)
|
|
56
|
+
try:
|
|
57
|
+
devnull = os.open(os.devnull, os.O_WRONLY)
|
|
58
|
+
os.dup2(devnull, 1)
|
|
59
|
+
os.dup2(devnull, 2)
|
|
60
|
+
yield
|
|
61
|
+
finally:
|
|
62
|
+
os.dup2(saved_stdout, 1)
|
|
63
|
+
os.dup2(saved_stderr, 2)
|
|
64
|
+
os.close(devnull)
|
|
65
|
+
os.close(saved_stdout)
|
|
66
|
+
os.close(saved_stderr)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Suppress verbose warnings globally for this module
|
|
70
|
+
# Note: FutureWarnings are kept (they signal important API changes)
|
|
71
|
+
warnings.filterwarnings("ignore", category=UserWarning, message=".*geographic CRS.*")
|
|
72
|
+
warnings.simplefilter("ignore", UserWarning)
|
|
73
|
+
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
|
74
|
+
|
|
75
|
+
# Suppress verbose logging from GeoPandas/Fiona/pyogrio
|
|
76
|
+
logging.getLogger("fiona").setLevel(logging.WARNING)
|
|
77
|
+
logging.getLogger("fiona.ogrext").setLevel(logging.WARNING)
|
|
78
|
+
logging.getLogger("pyogrio").setLevel(logging.WARNING)
|
|
79
|
+
logging.getLogger("pyogrio._io").setLevel(logging.WARNING)
|
|
80
|
+
|
|
81
|
+
from openforis_whisp.parameters.config_runtime import (
|
|
82
|
+
plot_id_column,
|
|
83
|
+
external_id_column,
|
|
84
|
+
geometry_type_column,
|
|
85
|
+
geometry_area_column,
|
|
86
|
+
centroid_x_coord_column,
|
|
87
|
+
centroid_y_coord_column,
|
|
88
|
+
iso3_country_column,
|
|
89
|
+
iso2_country_column,
|
|
90
|
+
admin_1_column,
|
|
91
|
+
water_flag,
|
|
92
|
+
geometry_area_column_formatting,
|
|
93
|
+
stats_area_columns_formatting,
|
|
94
|
+
stats_percent_columns_formatting,
|
|
95
|
+
)
|
|
96
|
+
from openforis_whisp.data_conversion import (
|
|
97
|
+
convert_geojson_to_ee,
|
|
98
|
+
convert_ee_to_df,
|
|
99
|
+
convert_ee_to_geojson,
|
|
100
|
+
)
|
|
101
|
+
from openforis_whisp.datasets import combine_datasets
|
|
102
|
+
from openforis_whisp.reformat import validate_dataframe_using_lookups_flexible
|
|
103
|
+
from openforis_whisp.stats import (
|
|
104
|
+
reformat_geometry_type,
|
|
105
|
+
set_point_geometry_area_to_zero,
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# ============================================================================
|
|
110
|
+
# LOGGING & PROGRESS UTILITIES
|
|
111
|
+
# ============================================================================
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _suppress_verbose_output(max_concurrent: int = None):
|
|
115
|
+
"""
|
|
116
|
+
Suppress verbose warnings and logging from dependencies.
|
|
117
|
+
|
|
118
|
+
Dynamically adjusts urllib3 logger level based on max_concurrent to prevent
|
|
119
|
+
"Connection pool is full" warnings during high-concurrency scenarios.
|
|
120
|
+
|
|
121
|
+
Parameters
|
|
122
|
+
----------
|
|
123
|
+
max_concurrent : int, optional
|
|
124
|
+
Maximum concurrent workers. Adjusts urllib3 logging level:
|
|
125
|
+
- max_concurrent <= 20: WARNING (pool rarely full)
|
|
126
|
+
- max_concurrent 21-35: CRITICAL (suppress pool warnings)
|
|
127
|
+
- max_concurrent >= 36: CRITICAL (maximum suppression)
|
|
128
|
+
"""
|
|
129
|
+
warnings.filterwarnings("ignore", category=UserWarning)
|
|
130
|
+
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
|
131
|
+
|
|
132
|
+
# Suppress urllib3 connection pool warnings via filters
|
|
133
|
+
warnings.filterwarnings("ignore", message=".*Connection pool is full.*")
|
|
134
|
+
warnings.filterwarnings("ignore", message=".*discarding connection.*")
|
|
135
|
+
|
|
136
|
+
# Set logger levels to WARNING to suppress INFO messages
|
|
137
|
+
for mod_name in [
|
|
138
|
+
"openforis_whisp.reformat",
|
|
139
|
+
"openforis_whisp.data_conversion",
|
|
140
|
+
"geopandas",
|
|
141
|
+
"fiona",
|
|
142
|
+
"pyogrio._io",
|
|
143
|
+
"urllib3",
|
|
144
|
+
]:
|
|
145
|
+
logging.getLogger(mod_name).setLevel(logging.WARNING)
|
|
146
|
+
|
|
147
|
+
# ALL urllib3 loggers: use CRITICAL to suppress ALL connection pool warnings
|
|
148
|
+
# (these appear at WARNING level during high concurrency)
|
|
149
|
+
urllib3_loggers = [
|
|
150
|
+
"urllib3.connectionpool",
|
|
151
|
+
"urllib3.poolmanager",
|
|
152
|
+
"urllib3",
|
|
153
|
+
"requests.packages.urllib3.connectionpool",
|
|
154
|
+
"requests.packages.urllib3.poolmanager",
|
|
155
|
+
"requests.packages.urllib3",
|
|
156
|
+
]
|
|
157
|
+
|
|
158
|
+
for logger_name in urllib3_loggers:
|
|
159
|
+
logging.getLogger(logger_name).setLevel(logging.CRITICAL)
|
|
160
|
+
|
|
161
|
+
# Suppress warning logs specifically from reformat module during validation
|
|
162
|
+
reformat_logger = logging.getLogger("openforis_whisp.reformat")
|
|
163
|
+
reformat_logger.setLevel(logging.ERROR)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
|
|
167
|
+
"""Load GeoJSON file with all output suppressed."""
|
|
168
|
+
fiona_logger = logging.getLogger("fiona")
|
|
169
|
+
pyogrio_logger = logging.getLogger("pyogrio._io")
|
|
170
|
+
old_fiona_level = fiona_logger.level
|
|
171
|
+
old_pyogrio_level = pyogrio_logger.level
|
|
172
|
+
fiona_logger.setLevel(logging.CRITICAL)
|
|
173
|
+
pyogrio_logger.setLevel(logging.CRITICAL)
|
|
174
|
+
|
|
175
|
+
try:
|
|
176
|
+
with redirect_stdout(io.StringIO()):
|
|
177
|
+
gdf = gpd.read_file(filepath)
|
|
178
|
+
return gdf
|
|
179
|
+
finally:
|
|
180
|
+
fiona_logger.setLevel(old_fiona_level)
|
|
181
|
+
pyogrio_logger.setLevel(old_pyogrio_level)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _extract_decimal_places(format_string: str) -> int:
|
|
185
|
+
"""
|
|
186
|
+
Extract decimal places from a format string like '%.3f'.
|
|
187
|
+
|
|
188
|
+
Parameters
|
|
189
|
+
----------
|
|
190
|
+
format_string : str
|
|
191
|
+
Format string (e.g., '%.3f' → 3)
|
|
192
|
+
|
|
193
|
+
Returns
|
|
194
|
+
-------
|
|
195
|
+
int
|
|
196
|
+
Number of decimal places
|
|
197
|
+
"""
|
|
198
|
+
import re
|
|
199
|
+
|
|
200
|
+
match = re.search(r"\.(\d+)f", format_string)
|
|
201
|
+
if match:
|
|
202
|
+
return int(match.group(1))
|
|
203
|
+
return 2 # Default to 2 decimal places
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _add_admin_context(
|
|
207
|
+
df: pd.DataFrame, admin_code_col: str = "admin_code_median", debug: bool = False
|
|
208
|
+
) -> pd.DataFrame:
|
|
209
|
+
"""
|
|
210
|
+
Join admin codes to get Country, ProducerCountry, and Admin_Level_1 information.
|
|
211
|
+
|
|
212
|
+
Uses GAUL 2024 Level 1 administrative lookup to map admin codes to country and
|
|
213
|
+
administrative region names.
|
|
214
|
+
|
|
215
|
+
Parameters
|
|
216
|
+
----------
|
|
217
|
+
df : pd.DataFrame
|
|
218
|
+
DataFrame with admin_code_median column from reduceRegions
|
|
219
|
+
admin_code_col : str
|
|
220
|
+
Name of the admin code column (default: "admin_code_median")
|
|
221
|
+
debug : bool
|
|
222
|
+
If True, print detailed debugging information (default: False)
|
|
223
|
+
|
|
224
|
+
Returns
|
|
225
|
+
-------
|
|
226
|
+
pd.DataFrame
|
|
227
|
+
DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
|
|
228
|
+
"""
|
|
229
|
+
logger = logging.getLogger("whisp-concurrent")
|
|
230
|
+
|
|
231
|
+
# Return early if admin code column doesn't exist
|
|
232
|
+
if admin_code_col not in df.columns:
|
|
233
|
+
logger.debug(f"Admin code column '{admin_code_col}' not found in dataframe")
|
|
234
|
+
if debug:
|
|
235
|
+
print(f"DEBUG: Admin code column '{admin_code_col}' not found")
|
|
236
|
+
print(f"DEBUG: Available columns: {df.columns.tolist()}")
|
|
237
|
+
return df
|
|
238
|
+
|
|
239
|
+
try:
|
|
240
|
+
from openforis_whisp.parameters.lookup_gaul1_admin import lookup_dict
|
|
241
|
+
|
|
242
|
+
if debug:
|
|
243
|
+
print(f"DEBUG: Found admin_code_col '{admin_code_col}'")
|
|
244
|
+
print(f"DEBUG: Sample values: {df[admin_code_col].head()}")
|
|
245
|
+
print(f"DEBUG: Value types: {df[admin_code_col].dtype}")
|
|
246
|
+
print(f"DEBUG: Null count: {df[admin_code_col].isna().sum()}")
|
|
247
|
+
|
|
248
|
+
# Create lookup dataframe
|
|
249
|
+
lookup_data = []
|
|
250
|
+
for gaul_code, info in lookup_dict.items():
|
|
251
|
+
lookup_data.append(
|
|
252
|
+
{
|
|
253
|
+
"gaul1_code": gaul_code,
|
|
254
|
+
"gaul1_name": info.get("gaul1_name"),
|
|
255
|
+
"iso3_code": info.get("iso3_code"),
|
|
256
|
+
"iso2_code": info.get("iso2_code"),
|
|
257
|
+
}
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
lookup_df = pd.DataFrame(lookup_data)
|
|
261
|
+
|
|
262
|
+
if debug:
|
|
263
|
+
print(f"DEBUG: Lookup dictionary has {len(lookup_df)} entries")
|
|
264
|
+
print(f"DEBUG: Sample lookup codes: {lookup_df['gaul1_code'].head()}")
|
|
265
|
+
|
|
266
|
+
# Prepare data for join
|
|
267
|
+
df = df.copy()
|
|
268
|
+
df["admin_code_for_join"] = df[admin_code_col].fillna(-9999).astype("int32")
|
|
269
|
+
lookup_df["gaul1_code"] = lookup_df["gaul1_code"].astype("int32")
|
|
270
|
+
|
|
271
|
+
if debug:
|
|
272
|
+
print(
|
|
273
|
+
f"DEBUG: Codes to join (first 10): {df['admin_code_for_join'].unique()[:10]}"
|
|
274
|
+
)
|
|
275
|
+
|
|
276
|
+
# Perform join
|
|
277
|
+
df_joined = df.merge(
|
|
278
|
+
lookup_df, left_on="admin_code_for_join", right_on="gaul1_code", how="left"
|
|
279
|
+
)
|
|
280
|
+
|
|
281
|
+
if debug:
|
|
282
|
+
matched = df_joined["iso3_code"].notna().sum()
|
|
283
|
+
print(f"DEBUG: Merge result - {matched}/{len(df_joined)} rows matched")
|
|
284
|
+
print(f"DEBUG: Sample matched rows:")
|
|
285
|
+
print(
|
|
286
|
+
df_joined[
|
|
287
|
+
["admin_code_for_join", "iso3_code", "iso2_code", "gaul1_name"]
|
|
288
|
+
].head()
|
|
289
|
+
)
|
|
290
|
+
|
|
291
|
+
# Rename columns to match output schema
|
|
292
|
+
df_joined = df_joined.rename(
|
|
293
|
+
columns={
|
|
294
|
+
"iso3_code": iso3_country_column, # 'Country'
|
|
295
|
+
"iso2_code": iso2_country_column, # 'ProducerCountry'
|
|
296
|
+
"gaul1_name": admin_1_column, # 'Admin_Level_1'
|
|
297
|
+
}
|
|
298
|
+
)
|
|
299
|
+
|
|
300
|
+
# Drop temporary columns
|
|
301
|
+
df_joined = df_joined.drop(
|
|
302
|
+
columns=["admin_code_for_join", "gaul1_code"], errors="ignore"
|
|
303
|
+
)
|
|
304
|
+
|
|
305
|
+
logger.debug(
|
|
306
|
+
f"Admin context added: {iso3_country_column}, {iso2_country_column}, {admin_1_column}"
|
|
307
|
+
)
|
|
308
|
+
return df_joined
|
|
309
|
+
|
|
310
|
+
except ImportError:
|
|
311
|
+
logger.warning(
|
|
312
|
+
"Could not import GAUL lookup dictionary - admin context not added"
|
|
313
|
+
)
|
|
314
|
+
if debug:
|
|
315
|
+
print("DEBUG: ImportError - could not load lookup dictionary")
|
|
316
|
+
return df
|
|
317
|
+
except Exception as e:
|
|
318
|
+
logger.warning(f"Error adding admin context: {e}")
|
|
319
|
+
if debug:
|
|
320
|
+
print(f"DEBUG: Exception in _add_admin_context: {e}")
|
|
321
|
+
import traceback
|
|
322
|
+
|
|
323
|
+
traceback.print_exc()
|
|
324
|
+
return df
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def join_admin_codes(
|
|
328
|
+
df: pd.DataFrame, lookup_dict: Dict, id_col: str = "admin_code_median"
|
|
329
|
+
) -> pd.DataFrame:
|
|
330
|
+
"""
|
|
331
|
+
Join admin codes to DataFrame using a lookup dictionary.
|
|
332
|
+
|
|
333
|
+
Converts the admin code column to integer and performs a left join with
|
|
334
|
+
the lookup dictionary to add Country, ProducerCountry, and Admin_Level_1.
|
|
335
|
+
|
|
336
|
+
Parameters
|
|
337
|
+
----------
|
|
338
|
+
df : pd.DataFrame
|
|
339
|
+
DataFrame with admin code column
|
|
340
|
+
lookup_dict : dict
|
|
341
|
+
Dictionary mapping GAUL codes to admin info (iso3_code, iso2_code, gaul1_name)
|
|
342
|
+
id_col : str
|
|
343
|
+
Name of the admin code column (default: "admin_code_median")
|
|
344
|
+
|
|
345
|
+
Returns
|
|
346
|
+
-------
|
|
347
|
+
pd.DataFrame
|
|
348
|
+
DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
|
|
349
|
+
"""
|
|
350
|
+
logger = logging.getLogger("whisp-concurrent")
|
|
351
|
+
|
|
352
|
+
# Return early if admin code column doesn't exist
|
|
353
|
+
if id_col not in df.columns:
|
|
354
|
+
logger.debug(f"Admin code column '{id_col}' not found in dataframe")
|
|
355
|
+
return df
|
|
356
|
+
|
|
357
|
+
try:
|
|
358
|
+
# Create lookup dataframe
|
|
359
|
+
lookup_data = []
|
|
360
|
+
for gaul_code, info in lookup_dict.items():
|
|
361
|
+
lookup_data.append(
|
|
362
|
+
{
|
|
363
|
+
"gaul1_code": gaul_code,
|
|
364
|
+
"gaul1_name": info.get("gaul1_name"),
|
|
365
|
+
"iso3_code": info.get("iso3_code"),
|
|
366
|
+
"iso2_code": info.get("iso2_code"),
|
|
367
|
+
}
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
lookup_df = pd.DataFrame(lookup_data)
|
|
371
|
+
|
|
372
|
+
# Prepare data for join
|
|
373
|
+
df = df.copy()
|
|
374
|
+
# Round to nearest integer (handles float values from EE reducers)
|
|
375
|
+
df["admin_code_for_join"] = df[id_col].fillna(-9999).astype("int32")
|
|
376
|
+
lookup_df["gaul1_code"] = lookup_df["gaul1_code"].astype("int32")
|
|
377
|
+
|
|
378
|
+
# Perform join
|
|
379
|
+
df_joined = df.merge(
|
|
380
|
+
lookup_df, left_on="admin_code_for_join", right_on="gaul1_code", how="left"
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
# Rename columns to match output schema
|
|
384
|
+
df_joined = df_joined.rename(
|
|
385
|
+
columns={
|
|
386
|
+
"iso3_code": iso3_country_column, # 'Country'
|
|
387
|
+
"iso2_code": iso2_country_column, # 'ProducerCountry'
|
|
388
|
+
"gaul1_name": admin_1_column, # 'Admin_Level_1'
|
|
389
|
+
}
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
# Drop temporary columns
|
|
393
|
+
df_joined = df_joined.drop(
|
|
394
|
+
columns=["admin_code_for_join", "gaul1_code"], errors="ignore"
|
|
395
|
+
)
|
|
396
|
+
|
|
397
|
+
logger.debug(
|
|
398
|
+
f"Admin codes joined: {iso3_country_column}, {iso2_country_column}, {admin_1_column}"
|
|
399
|
+
)
|
|
400
|
+
return df_joined
|
|
401
|
+
|
|
402
|
+
except Exception as e:
|
|
403
|
+
logger.warning(f"Error joining admin codes: {e}")
|
|
404
|
+
return df
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
class ProgressTracker:
|
|
408
|
+
"""
|
|
409
|
+
Track batch processing progress with time estimation.
|
|
410
|
+
|
|
411
|
+
Shows progress at key milestones (25%, 50%, 75%, 100%) with estimated
|
|
412
|
+
time remaining based on processing speed.
|
|
413
|
+
"""
|
|
414
|
+
|
|
415
|
+
def __init__(self, total: int, logger: logging.Logger = None):
|
|
416
|
+
"""
|
|
417
|
+
Initialize progress tracker.
|
|
418
|
+
|
|
419
|
+
Parameters
|
|
420
|
+
----------
|
|
421
|
+
total : int
|
|
422
|
+
Total number of items to process
|
|
423
|
+
logger : logging.Logger, optional
|
|
424
|
+
Logger for output
|
|
425
|
+
"""
|
|
426
|
+
self.total = total
|
|
427
|
+
self.completed = 0
|
|
428
|
+
self.lock = threading.Lock()
|
|
429
|
+
self.logger = logger or logging.getLogger("whisp-concurrent")
|
|
430
|
+
self.milestones = {25, 50, 75, 100}
|
|
431
|
+
self.shown_milestones = set()
|
|
432
|
+
self.start_time = time.time()
|
|
433
|
+
self.last_update_time = self.start_time
|
|
434
|
+
|
|
435
|
+
def update(self, n: int = 1) -> None:
|
|
436
|
+
"""
|
|
437
|
+
Update progress count.
|
|
438
|
+
|
|
439
|
+
Parameters
|
|
440
|
+
----------
|
|
441
|
+
n : int
|
|
442
|
+
Number of items completed
|
|
443
|
+
"""
|
|
444
|
+
with self.lock:
|
|
445
|
+
self.completed += n
|
|
446
|
+
percent = int((self.completed / self.total) * 100)
|
|
447
|
+
|
|
448
|
+
# Show milestone messages (25%, 50%, 75%, 100%)
|
|
449
|
+
for milestone in sorted(self.milestones):
|
|
450
|
+
if percent >= milestone and milestone not in self.shown_milestones:
|
|
451
|
+
self.shown_milestones.add(milestone)
|
|
452
|
+
|
|
453
|
+
# Calculate time metrics
|
|
454
|
+
elapsed = time.time() - self.start_time
|
|
455
|
+
rate = self.completed / elapsed if elapsed > 0 else 0
|
|
456
|
+
remaining_items = self.total - self.completed
|
|
457
|
+
eta_seconds = remaining_items / rate if rate > 0 else 0
|
|
458
|
+
|
|
459
|
+
# Format time strings
|
|
460
|
+
eta_str = self._format_time(eta_seconds)
|
|
461
|
+
elapsed_str = self._format_time(elapsed)
|
|
462
|
+
|
|
463
|
+
# Build progress message
|
|
464
|
+
msg = f"Progress: {self.completed}/{self.total} ({percent}%)"
|
|
465
|
+
if percent < 100:
|
|
466
|
+
msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
|
|
467
|
+
else:
|
|
468
|
+
msg += f" | Total time: {elapsed_str}"
|
|
469
|
+
|
|
470
|
+
self.logger.info(msg)
|
|
471
|
+
|
|
472
|
+
@staticmethod
|
|
473
|
+
def _format_time(seconds: float) -> str:
|
|
474
|
+
"""Format seconds as human-readable string."""
|
|
475
|
+
if seconds < 60:
|
|
476
|
+
return f"{seconds:.0f}s"
|
|
477
|
+
elif seconds < 3600:
|
|
478
|
+
mins = seconds / 60
|
|
479
|
+
return f"{mins:.1f}m"
|
|
480
|
+
else:
|
|
481
|
+
hours = seconds / 3600
|
|
482
|
+
return f"{hours:.1f}h"
|
|
483
|
+
|
|
484
|
+
def finish(self) -> None:
|
|
485
|
+
"""Log completion."""
|
|
486
|
+
with self.lock:
|
|
487
|
+
total_time = time.time() - self.start_time
|
|
488
|
+
time_str = self._format_time(total_time)
|
|
489
|
+
self.logger.info(
|
|
490
|
+
f"Processing complete: {self.completed}/{self.total} batches in {time_str}"
|
|
491
|
+
)
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
# ============================================================================
|
|
495
|
+
# ENDPOINT VALIDATION
|
|
496
|
+
# ============================================================================
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
def check_ee_endpoint(endpoint_type: str = "high-volume") -> bool:
|
|
500
|
+
"""
|
|
501
|
+
Check if Earth Engine is using the correct endpoint.
|
|
502
|
+
|
|
503
|
+
Parameters
|
|
504
|
+
----------
|
|
505
|
+
endpoint_type : str
|
|
506
|
+
Expected endpoint type: "high-volume" or "standard"
|
|
507
|
+
|
|
508
|
+
Returns
|
|
509
|
+
-------
|
|
510
|
+
bool
|
|
511
|
+
True if using expected endpoint, False otherwise
|
|
512
|
+
"""
|
|
513
|
+
api_url = str(ee.data._cloud_api_base_url)
|
|
514
|
+
|
|
515
|
+
if endpoint_type == "high-volume":
|
|
516
|
+
return "highvolume" in api_url.lower()
|
|
517
|
+
elif endpoint_type == "standard":
|
|
518
|
+
return "highvolume" not in api_url.lower()
|
|
519
|
+
else:
|
|
520
|
+
return False
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool = True):
|
|
524
|
+
"""
|
|
525
|
+
Validate Earth Engine endpoint and warn/error if incorrect.
|
|
526
|
+
|
|
527
|
+
Parameters
|
|
528
|
+
----------
|
|
529
|
+
endpoint_type : str
|
|
530
|
+
Expected endpoint type
|
|
531
|
+
raise_error : bool
|
|
532
|
+
If True, raise error if incorrect endpoint; if False, warn
|
|
533
|
+
|
|
534
|
+
Raises
|
|
535
|
+
------
|
|
536
|
+
RuntimeError
|
|
537
|
+
If incorrect endpoint and raise_error=True
|
|
538
|
+
"""
|
|
539
|
+
if not check_ee_endpoint(endpoint_type):
|
|
540
|
+
msg = (
|
|
541
|
+
f"Not using {endpoint_type.upper()} endpoint.\n"
|
|
542
|
+
f"Current URL: {ee.data._cloud_api_base_url}\n"
|
|
543
|
+
f"\nTo use {endpoint_type} endpoint, run:\n"
|
|
544
|
+
)
|
|
545
|
+
msg += "ee.Reset()\n"
|
|
546
|
+
if endpoint_type == "high-volume":
|
|
547
|
+
msg += " ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')"
|
|
548
|
+
else:
|
|
549
|
+
msg += " ee.Initialize() # Uses standard endpoint by default"
|
|
550
|
+
|
|
551
|
+
if raise_error:
|
|
552
|
+
raise RuntimeError(msg)
|
|
553
|
+
else:
|
|
554
|
+
logging.warning(msg)
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
# ============================================================================
|
|
558
|
+
# METADATA EXTRACTION (CLIENT & SERVER SIDE)
|
|
559
|
+
# ============================================================================
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def extract_centroid_and_geomtype_client(
|
|
563
|
+
gdf: gpd.GeoDataFrame,
|
|
564
|
+
x_col: str = None,
|
|
565
|
+
y_col: str = None,
|
|
566
|
+
type_col: str = None,
|
|
567
|
+
external_id_column: str = None,
|
|
568
|
+
return_attributes_only: bool = True,
|
|
569
|
+
) -> pd.DataFrame:
|
|
570
|
+
"""
|
|
571
|
+
Extract centroid coordinates and geometry type using GeoPandas (client-side).
|
|
572
|
+
|
|
573
|
+
Parameters
|
|
574
|
+
----------
|
|
575
|
+
gdf : gpd.GeoDataFrame
|
|
576
|
+
Input GeoDataFrame
|
|
577
|
+
x_col : str, optional
|
|
578
|
+
Column name for centroid x. Defaults to config value
|
|
579
|
+
y_col : str, optional
|
|
580
|
+
Column name for centroid y. Defaults to config value
|
|
581
|
+
type_col : str, optional
|
|
582
|
+
Column name for geometry type. Defaults to config value
|
|
583
|
+
external_id_column: : str, optional
|
|
584
|
+
Name of external ID column to preserve
|
|
585
|
+
return_attributes_only : bool
|
|
586
|
+
If True, return only attribute columns (no geometry)
|
|
587
|
+
|
|
588
|
+
Returns
|
|
589
|
+
-------
|
|
590
|
+
pd.DataFrame or gpd.GeoDataFrame
|
|
591
|
+
DataFrame/GeoDataFrame with centroid and geometry type columns
|
|
592
|
+
"""
|
|
593
|
+
x_col = x_col or centroid_x_coord_column
|
|
594
|
+
y_col = y_col or centroid_y_coord_column
|
|
595
|
+
type_col = type_col or geometry_type_column
|
|
596
|
+
|
|
597
|
+
gdf = gdf.copy()
|
|
598
|
+
|
|
599
|
+
# Extract centroid coordinates (suppressing geographic CRS warning from Shapely)
|
|
600
|
+
with warnings.catch_warnings():
|
|
601
|
+
warnings.filterwarnings("ignore", category=UserWarning)
|
|
602
|
+
warnings.simplefilter("ignore", UserWarning) # Additional suppression
|
|
603
|
+
centroid_points = gdf.geometry.centroid
|
|
604
|
+
|
|
605
|
+
gdf[x_col] = centroid_points.x.round(6)
|
|
606
|
+
gdf[y_col] = centroid_points.y.round(6)
|
|
607
|
+
gdf[type_col] = gdf.geometry.geom_type
|
|
608
|
+
|
|
609
|
+
if return_attributes_only:
|
|
610
|
+
# Build column list starting with merge keys
|
|
611
|
+
cols = []
|
|
612
|
+
|
|
613
|
+
# Always include __row_id__ first if present (needed for row-level merging)
|
|
614
|
+
if "__row_id__" in gdf.columns:
|
|
615
|
+
cols.append("__row_id__")
|
|
616
|
+
|
|
617
|
+
# Always include plot_id_column if present (needed for merging batches)
|
|
618
|
+
if plot_id_column in gdf.columns:
|
|
619
|
+
cols.append(plot_id_column)
|
|
620
|
+
|
|
621
|
+
# Include external_id_column if provided and exists
|
|
622
|
+
if (
|
|
623
|
+
external_id_column
|
|
624
|
+
and external_id_column in gdf.columns
|
|
625
|
+
and external_id_column not in cols
|
|
626
|
+
):
|
|
627
|
+
cols.append(external_id_column)
|
|
628
|
+
|
|
629
|
+
# Always include metadata columns (centroid, geometry type)
|
|
630
|
+
cols.extend([x_col, y_col, type_col])
|
|
631
|
+
|
|
632
|
+
# Remove any duplicates while preserving order
|
|
633
|
+
cols = list(dict.fromkeys(cols))
|
|
634
|
+
|
|
635
|
+
return gdf[cols].reset_index(drop=True)
|
|
636
|
+
|
|
637
|
+
return gdf
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
def extract_centroid_and_geomtype_server(
|
|
641
|
+
fc: ee.FeatureCollection,
|
|
642
|
+
x_col: str = None,
|
|
643
|
+
y_col: str = None,
|
|
644
|
+
type_col: str = None,
|
|
645
|
+
max_error: float = 1.0,
|
|
646
|
+
) -> ee.FeatureCollection:
|
|
647
|
+
"""
|
|
648
|
+
Extract centroid coordinates and geometry type using Earth Engine (server-side).
|
|
649
|
+
|
|
650
|
+
Parameters
|
|
651
|
+
----------
|
|
652
|
+
fc : ee.FeatureCollection
|
|
653
|
+
Input FeatureCollection
|
|
654
|
+
x_col : str, optional
|
|
655
|
+
Column name for centroid x
|
|
656
|
+
y_col : str, optional
|
|
657
|
+
Column name for centroid y
|
|
658
|
+
type_col : str, optional
|
|
659
|
+
Column name for geometry type
|
|
660
|
+
max_error : float
|
|
661
|
+
Maximum error for centroid calculation (meters)
|
|
662
|
+
|
|
663
|
+
Returns
|
|
664
|
+
-------
|
|
665
|
+
ee.FeatureCollection
|
|
666
|
+
FeatureCollection with centroid and geometry type properties
|
|
667
|
+
"""
|
|
668
|
+
x_col = x_col or centroid_x_coord_column
|
|
669
|
+
y_col = y_col or centroid_y_coord_column
|
|
670
|
+
type_col = type_col or geometry_type_column
|
|
671
|
+
|
|
672
|
+
def add_metadata(feature):
|
|
673
|
+
centroid = feature.geometry().centroid(max_error)
|
|
674
|
+
coords = centroid.coordinates()
|
|
675
|
+
x = ee.Number(coords.get(0)).multiply(1e6).round().divide(1e6)
|
|
676
|
+
y = ee.Number(coords.get(1)).multiply(1e6).round().divide(1e6)
|
|
677
|
+
return feature.set({x_col: x, y_col: y, type_col: feature.geometry().type()})
|
|
678
|
+
|
|
679
|
+
return fc.map(add_metadata)
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
# ============================================================================
|
|
683
|
+
# BATCH PROCESSING UTILITIES
|
|
684
|
+
# ============================================================================
|
|
685
|
+
|
|
686
|
+
|
|
687
|
+
def batch_geodataframe(
|
|
688
|
+
gdf: gpd.GeoDataFrame,
|
|
689
|
+
batch_size: int,
|
|
690
|
+
) -> List[gpd.GeoDataFrame]:
|
|
691
|
+
"""
|
|
692
|
+
Split a GeoDataFrame into batches.
|
|
693
|
+
|
|
694
|
+
Parameters
|
|
695
|
+
----------
|
|
696
|
+
gdf : gpd.GeoDataFrame
|
|
697
|
+
Input GeoDataFrame
|
|
698
|
+
batch_size : int
|
|
699
|
+
Size of each batch
|
|
700
|
+
|
|
701
|
+
Returns
|
|
702
|
+
-------
|
|
703
|
+
List[gpd.GeoDataFrame]
|
|
704
|
+
List of batch GeoDataFrames
|
|
705
|
+
"""
|
|
706
|
+
batches = []
|
|
707
|
+
for i in range(0, len(gdf), batch_size):
|
|
708
|
+
batches.append(gdf.iloc[i : i + batch_size].copy())
|
|
709
|
+
return batches
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
|
|
713
|
+
"""
|
|
714
|
+
Convert a batch GeoDataFrame to EE FeatureCollection efficiently.
|
|
715
|
+
|
|
716
|
+
OPTIMIZATION: Uses GeoJSON dict input directly to avoid temp file I/O.
|
|
717
|
+
This provides ~67% performance improvement over writing to disk.
|
|
718
|
+
|
|
719
|
+
Preserves the __row_id__ column if present so it can be retrieved after processing.
|
|
720
|
+
|
|
721
|
+
Parameters
|
|
722
|
+
----------
|
|
723
|
+
batch_gdf : gpd.GeoDataFrame
|
|
724
|
+
Input batch (should have __row_id__ column)
|
|
725
|
+
|
|
726
|
+
Returns
|
|
727
|
+
-------
|
|
728
|
+
ee.FeatureCollection
|
|
729
|
+
EE FeatureCollection with __row_id__ as a feature property
|
|
730
|
+
"""
|
|
731
|
+
# OPTIMIZATION: Convert to GeoJSON dict and pass directly
|
|
732
|
+
# This eliminates the need to write to/read from temp files (~67% faster)
|
|
733
|
+
geojson_dict = json.loads(batch_gdf.to_json())
|
|
734
|
+
fc = convert_geojson_to_ee(geojson_dict)
|
|
735
|
+
|
|
736
|
+
# If __row_id__ is in the original GeoDataFrame, it will be preserved
|
|
737
|
+
# as a feature property in the GeoJSON and thus in the EE FeatureCollection
|
|
738
|
+
return fc
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def clean_geodataframe(
|
|
742
|
+
gdf: gpd.GeoDataFrame,
|
|
743
|
+
remove_nulls: bool = True,
|
|
744
|
+
fix_invalid: bool = True,
|
|
745
|
+
logger: logging.Logger = None,
|
|
746
|
+
) -> gpd.GeoDataFrame:
|
|
747
|
+
"""
|
|
748
|
+
Validate and clean GeoDataFrame geometries.
|
|
749
|
+
|
|
750
|
+
Parameters
|
|
751
|
+
----------
|
|
752
|
+
gdf : gpd.GeoDataFrame
|
|
753
|
+
Input GeoDataFrame
|
|
754
|
+
remove_nulls : bool
|
|
755
|
+
Remove null geometries
|
|
756
|
+
fix_invalid : bool
|
|
757
|
+
Fix invalid geometries
|
|
758
|
+
logger : logging.Logger, optional
|
|
759
|
+
Logger for output
|
|
760
|
+
|
|
761
|
+
Returns
|
|
762
|
+
-------
|
|
763
|
+
gpd.GeoDataFrame
|
|
764
|
+
Cleaned GeoDataFrame
|
|
765
|
+
"""
|
|
766
|
+
logger = logger or logging.getLogger("whisp-concurrent")
|
|
767
|
+
|
|
768
|
+
if remove_nulls:
|
|
769
|
+
null_count = gdf.geometry.isna().sum()
|
|
770
|
+
if null_count > 0:
|
|
771
|
+
logger.warning(f"Removing {null_count} null geometries")
|
|
772
|
+
gdf = gdf[~gdf.geometry.isna()].copy()
|
|
773
|
+
|
|
774
|
+
if fix_invalid:
|
|
775
|
+
valid_count = gdf.geometry.is_valid.sum()
|
|
776
|
+
invalid_count = len(gdf) - valid_count
|
|
777
|
+
if invalid_count > 0:
|
|
778
|
+
logger.warning(f"Fixing {invalid_count} invalid geometries")
|
|
779
|
+
from shapely.validation import make_valid
|
|
780
|
+
|
|
781
|
+
gdf = gdf.copy()
|
|
782
|
+
gdf["geometry"] = gdf["geometry"].apply(
|
|
783
|
+
lambda g: make_valid(g) if g and not g.is_valid else g
|
|
784
|
+
)
|
|
785
|
+
|
|
786
|
+
logger.debug(f"Validation complete: {len(gdf):,} geometries ready")
|
|
787
|
+
return gdf
|
|
788
|
+
|
|
789
|
+
|
|
790
|
+
# ============================================================================
|
|
791
|
+
# EE PROCESSING WITH RETRY LOGIC
|
|
792
|
+
# ============================================================================
|
|
793
|
+
|
|
794
|
+
|
|
795
|
+
def process_ee_batch(
|
|
796
|
+
fc: ee.FeatureCollection,
|
|
797
|
+
whisp_image: ee.Image,
|
|
798
|
+
reducer: ee.Reducer,
|
|
799
|
+
batch_idx: int,
|
|
800
|
+
max_retries: int = 3,
|
|
801
|
+
logger: logging.Logger = None,
|
|
802
|
+
) -> pd.DataFrame:
|
|
803
|
+
"""
|
|
804
|
+
Process an EE FeatureCollection with automatic retry logic.
|
|
805
|
+
|
|
806
|
+
Parameters
|
|
807
|
+
----------
|
|
808
|
+
fc : ee.FeatureCollection
|
|
809
|
+
Input FeatureCollection
|
|
810
|
+
whisp_image : ee.Image
|
|
811
|
+
Image containing bands to reduce
|
|
812
|
+
reducer : ee.Reducer
|
|
813
|
+
Reducer to apply
|
|
814
|
+
batch_idx : int
|
|
815
|
+
Batch index (for logging)
|
|
816
|
+
max_retries : int
|
|
817
|
+
Maximum retry attempts
|
|
818
|
+
logger : logging.Logger, optional
|
|
819
|
+
Logger for output
|
|
820
|
+
|
|
821
|
+
Returns
|
|
822
|
+
-------
|
|
823
|
+
pd.DataFrame
|
|
824
|
+
Results as DataFrame
|
|
825
|
+
|
|
826
|
+
Raises
|
|
827
|
+
------
|
|
828
|
+
RuntimeError
|
|
829
|
+
If processing fails after all retries
|
|
830
|
+
"""
|
|
831
|
+
logger = logger or logging.getLogger("whisp-concurrent")
|
|
832
|
+
|
|
833
|
+
for attempt in range(max_retries):
|
|
834
|
+
try:
|
|
835
|
+
results = whisp_image.reduceRegions(
|
|
836
|
+
collection=fc,
|
|
837
|
+
reducer=reducer,
|
|
838
|
+
scale=10,
|
|
839
|
+
)
|
|
840
|
+
df = convert_ee_to_df(results)
|
|
841
|
+
|
|
842
|
+
# Ensure plot_id_column is present for merging
|
|
843
|
+
# It should come from the feature properties (added before EE processing)
|
|
844
|
+
if plot_id_column not in df.columns:
|
|
845
|
+
df[plot_id_column] = range(len(df))
|
|
846
|
+
|
|
847
|
+
# Ensure all column names are strings (fixes pandas .str accessor issues)
|
|
848
|
+
df.columns = df.columns.astype(str)
|
|
849
|
+
|
|
850
|
+
return df
|
|
851
|
+
|
|
852
|
+
except ee.EEException as e:
|
|
853
|
+
error_msg = str(e)
|
|
854
|
+
|
|
855
|
+
if "Quota" in error_msg or "limit" in error_msg.lower():
|
|
856
|
+
if attempt < max_retries - 1:
|
|
857
|
+
wait_time = min(30, 2**attempt)
|
|
858
|
+
logger.warning(
|
|
859
|
+
f"Batch {batch_idx + 1}: Rate limited, waiting {wait_time}s..."
|
|
860
|
+
)
|
|
861
|
+
time.sleep(wait_time)
|
|
862
|
+
else:
|
|
863
|
+
raise RuntimeError(f"Batch {batch_idx + 1}: Quota exhausted")
|
|
864
|
+
|
|
865
|
+
elif "timeout" in error_msg.lower():
|
|
866
|
+
if attempt < max_retries - 1:
|
|
867
|
+
wait_time = min(15, 2**attempt)
|
|
868
|
+
logger.warning(
|
|
869
|
+
f"Batch {batch_idx + 1}: Timeout, retrying in {wait_time}s..."
|
|
870
|
+
)
|
|
871
|
+
time.sleep(wait_time)
|
|
872
|
+
else:
|
|
873
|
+
raise
|
|
874
|
+
|
|
875
|
+
else:
|
|
876
|
+
if attempt < max_retries - 1:
|
|
877
|
+
wait_time = min(5, 2**attempt)
|
|
878
|
+
time.sleep(wait_time)
|
|
879
|
+
else:
|
|
880
|
+
raise
|
|
881
|
+
|
|
882
|
+
except Exception as e:
|
|
883
|
+
if attempt < max_retries - 1:
|
|
884
|
+
time.sleep(min(5, 2**attempt))
|
|
885
|
+
else:
|
|
886
|
+
raise RuntimeError(f"Batch {batch_idx + 1}: {str(e)}")
|
|
887
|
+
|
|
888
|
+
raise RuntimeError(f"Batch {batch_idx + 1}: Failed after {max_retries} attempts")
|
|
889
|
+
|
|
890
|
+
|
|
891
|
+
# ============================================================================
|
|
892
|
+
# CONCURRENT PROCESSING FUNCTIONS
|
|
893
|
+
# ============================================================================
|
|
894
|
+
|
|
895
|
+
|
|
896
|
+
def whisp_stats_geojson_to_df_concurrent(
|
|
897
|
+
input_geojson_filepath: str,
|
|
898
|
+
external_id_column: str = None,
|
|
899
|
+
remove_geom: bool = False,
|
|
900
|
+
national_codes: List[str] = None,
|
|
901
|
+
unit_type: str = "ha",
|
|
902
|
+
whisp_image: ee.Image = None,
|
|
903
|
+
custom_bands: Dict[str, Any] = None,
|
|
904
|
+
batch_size: int = 10,
|
|
905
|
+
max_concurrent: int = 20,
|
|
906
|
+
validate_geometries: bool = True,
|
|
907
|
+
max_retries: int = 3,
|
|
908
|
+
add_metadata_server: bool = False,
|
|
909
|
+
logger: logging.Logger = None,
|
|
910
|
+
# Format parameters (auto-detect from config if not provided)
|
|
911
|
+
decimal_places: int = None,
|
|
912
|
+
) -> pd.DataFrame:
|
|
913
|
+
"""
|
|
914
|
+
Process GeoJSON concurrently to compute Whisp statistics with automatic formatting.
|
|
915
|
+
|
|
916
|
+
Uses high-volume endpoint and concurrent batching. Client-side metadata
|
|
917
|
+
extraction is always applied; optionally add server-side metadata too.
|
|
918
|
+
Automatically formats output (converts units, removes noise columns, etc.).
|
|
919
|
+
|
|
920
|
+
Parameters
|
|
921
|
+
----------
|
|
922
|
+
input_geojson_filepath : str
|
|
923
|
+
Path to input GeoJSON file
|
|
924
|
+
external_id_column : str, optional
|
|
925
|
+
Column name for external IDs
|
|
926
|
+
remove_geom : bool
|
|
927
|
+
Remove geometry column from output
|
|
928
|
+
national_codes : List[str], optional
|
|
929
|
+
ISO2 codes for national datasets
|
|
930
|
+
unit_type : str
|
|
931
|
+
"ha" or "percent"
|
|
932
|
+
whisp_image : ee.Image, optional
|
|
933
|
+
Pre-combined image (created with combine_datasets if None)
|
|
934
|
+
custom_bands : Dict[str, Any], optional
|
|
935
|
+
Custom band information
|
|
936
|
+
batch_size : int
|
|
937
|
+
Features per batch
|
|
938
|
+
max_concurrent : int
|
|
939
|
+
Maximum concurrent EE calls
|
|
940
|
+
validate_geometries : bool
|
|
941
|
+
Validate and clean geometries
|
|
942
|
+
max_retries : int
|
|
943
|
+
Retry attempts per batch
|
|
944
|
+
add_metadata_server : bool
|
|
945
|
+
Add metadata server-side (in addition to client-side)
|
|
946
|
+
logger : logging.Logger, optional
|
|
947
|
+
Logger for output
|
|
948
|
+
decimal_places : int, optional
|
|
949
|
+
Decimal places for formatting. If None, auto-detects from config.
|
|
950
|
+
|
|
951
|
+
Returns
|
|
952
|
+
-------
|
|
953
|
+
pd.DataFrame
|
|
954
|
+
Formatted results DataFrame with Whisp statistics
|
|
955
|
+
"""
|
|
956
|
+
from openforis_whisp.reformat import format_stats_dataframe
|
|
957
|
+
|
|
958
|
+
logger = logger or logging.getLogger("whisp-concurrent")
|
|
959
|
+
|
|
960
|
+
# Suppress verbose output from dependencies (dynamically adjust based on max_concurrent)
|
|
961
|
+
_suppress_verbose_output(max_concurrent=max_concurrent)
|
|
962
|
+
|
|
963
|
+
# Auto-detect decimal places from config if not provided
|
|
964
|
+
if decimal_places is None:
|
|
965
|
+
decimal_places = _extract_decimal_places(stats_area_columns_formatting)
|
|
966
|
+
logger.debug(f"Using decimal_places={decimal_places} from config")
|
|
967
|
+
|
|
968
|
+
# Validate endpoint
|
|
969
|
+
validate_ee_endpoint("high-volume", raise_error=True)
|
|
970
|
+
|
|
971
|
+
# Load GeoJSON with output suppressed
|
|
972
|
+
gdf = _load_geojson_silently(input_geojson_filepath)
|
|
973
|
+
logger.info(f"Loaded {len(gdf):,} features")
|
|
974
|
+
|
|
975
|
+
if validate_geometries:
|
|
976
|
+
gdf = clean_geodataframe(gdf, logger=logger)
|
|
977
|
+
|
|
978
|
+
# Add stable plotIds for merging (starting from 1, not 0)
|
|
979
|
+
gdf[plot_id_column] = range(1, len(gdf) + 1)
|
|
980
|
+
|
|
981
|
+
# Create image if not provided
|
|
982
|
+
if whisp_image is None:
|
|
983
|
+
logger.debug("Creating Whisp image...")
|
|
984
|
+
# Suppress print statements from combine_datasets
|
|
985
|
+
with redirect_stdout(io.StringIO()):
|
|
986
|
+
try:
|
|
987
|
+
# First try without validation
|
|
988
|
+
whisp_image = combine_datasets(
|
|
989
|
+
national_codes=national_codes, validate_bands=False
|
|
990
|
+
)
|
|
991
|
+
except Exception as e:
|
|
992
|
+
logger.warning(
|
|
993
|
+
f"First attempt failed: {str(e)[:100]}. Retrying with validate_bands=True..."
|
|
994
|
+
)
|
|
995
|
+
# Retry with validation to catch and fix bad bands
|
|
996
|
+
whisp_image = combine_datasets(
|
|
997
|
+
national_codes=national_codes, validate_bands=True
|
|
998
|
+
)
|
|
999
|
+
|
|
1000
|
+
# Create reducer
|
|
1001
|
+
reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
|
|
1002
|
+
|
|
1003
|
+
# Batch the data
|
|
1004
|
+
batches = batch_geodataframe(gdf, batch_size)
|
|
1005
|
+
logger.info(f"Processing {len(gdf):,} features in {len(batches)} batches")
|
|
1006
|
+
|
|
1007
|
+
# Setup semaphore for EE concurrency control
|
|
1008
|
+
ee_semaphore = threading.BoundedSemaphore(max_concurrent)
|
|
1009
|
+
|
|
1010
|
+
# Progress tracker
|
|
1011
|
+
progress = ProgressTracker(len(batches), logger=logger)
|
|
1012
|
+
|
|
1013
|
+
results = []
|
|
1014
|
+
|
|
1015
|
+
def process_batch(
|
|
1016
|
+
batch_idx: int, batch: gpd.GeoDataFrame
|
|
1017
|
+
) -> Tuple[int, pd.DataFrame, pd.DataFrame]:
|
|
1018
|
+
"""Process one batch: server EE work + client metadata."""
|
|
1019
|
+
with ee_semaphore:
|
|
1020
|
+
# Server-side: convert to EE, optionally add metadata, reduce
|
|
1021
|
+
fc = convert_batch_to_ee(batch)
|
|
1022
|
+
if add_metadata_server:
|
|
1023
|
+
fc = extract_centroid_and_geomtype_server(fc)
|
|
1024
|
+
df_server = process_ee_batch(
|
|
1025
|
+
fc, whisp_image, reducer, batch_idx, max_retries, logger
|
|
1026
|
+
)
|
|
1027
|
+
|
|
1028
|
+
# Client-side: extract metadata using GeoPandas
|
|
1029
|
+
df_client = extract_centroid_and_geomtype_client(
|
|
1030
|
+
batch,
|
|
1031
|
+
external_id_column=external_id_column,
|
|
1032
|
+
return_attributes_only=True,
|
|
1033
|
+
)
|
|
1034
|
+
|
|
1035
|
+
return batch_idx, df_server, df_client
|
|
1036
|
+
|
|
1037
|
+
# Process batches with thread pool
|
|
1038
|
+
pool_workers = max(2 * max_concurrent, max_concurrent + 2)
|
|
1039
|
+
|
|
1040
|
+
# Track if we had errors that suggest bad bands
|
|
1041
|
+
batch_errors = []
|
|
1042
|
+
|
|
1043
|
+
# Suppress fiona logging during batch processing (threads create new loggers)
|
|
1044
|
+
fiona_logger = logging.getLogger("fiona")
|
|
1045
|
+
pyogrio_logger = logging.getLogger("pyogrio._io")
|
|
1046
|
+
old_fiona_level = fiona_logger.level
|
|
1047
|
+
old_pyogrio_level = pyogrio_logger.level
|
|
1048
|
+
fiona_logger.setLevel(logging.CRITICAL)
|
|
1049
|
+
pyogrio_logger.setLevel(logging.CRITICAL)
|
|
1050
|
+
|
|
1051
|
+
try:
|
|
1052
|
+
with redirect_stdout(io.StringIO()):
|
|
1053
|
+
with ThreadPoolExecutor(max_workers=pool_workers) as executor:
|
|
1054
|
+
futures = {
|
|
1055
|
+
executor.submit(process_batch, i, batch): i
|
|
1056
|
+
for i, batch in enumerate(batches)
|
|
1057
|
+
}
|
|
1058
|
+
|
|
1059
|
+
for future in as_completed(futures):
|
|
1060
|
+
try:
|
|
1061
|
+
batch_idx, df_server, df_client = future.result()
|
|
1062
|
+
|
|
1063
|
+
# Merge server and client results
|
|
1064
|
+
if plot_id_column not in df_server.columns:
|
|
1065
|
+
df_server[plot_id_column] = range(len(df_server))
|
|
1066
|
+
|
|
1067
|
+
merged = df_server.merge(
|
|
1068
|
+
df_client,
|
|
1069
|
+
on=plot_id_column,
|
|
1070
|
+
how="left",
|
|
1071
|
+
suffixes=("_ee", "_client"),
|
|
1072
|
+
)
|
|
1073
|
+
results.append(merged)
|
|
1074
|
+
progress.update()
|
|
1075
|
+
|
|
1076
|
+
except Exception as e:
|
|
1077
|
+
error_msg = str(e)
|
|
1078
|
+
logger.error(f"Batch processing error: {error_msg[:100]}")
|
|
1079
|
+
import traceback
|
|
1080
|
+
|
|
1081
|
+
logger.debug(traceback.format_exc())
|
|
1082
|
+
batch_errors.append(error_msg)
|
|
1083
|
+
finally:
|
|
1084
|
+
# Restore logger levels
|
|
1085
|
+
fiona_logger.setLevel(old_fiona_level)
|
|
1086
|
+
pyogrio_logger.setLevel(old_pyogrio_level)
|
|
1087
|
+
|
|
1088
|
+
progress.finish()
|
|
1089
|
+
|
|
1090
|
+
# Check if we should retry with validation due to band errors
|
|
1091
|
+
if batch_errors and not results:
|
|
1092
|
+
# All batches failed - likely a bad band issue
|
|
1093
|
+
is_band_error = any(
|
|
1094
|
+
keyword in str(batch_errors)
|
|
1095
|
+
for keyword in ["Image.load", "asset", "not found", "does not exist"]
|
|
1096
|
+
)
|
|
1097
|
+
|
|
1098
|
+
if is_band_error:
|
|
1099
|
+
logger.warning(
|
|
1100
|
+
"Detected potential bad band error. Retrying with validate_bands=True..."
|
|
1101
|
+
)
|
|
1102
|
+
try:
|
|
1103
|
+
with redirect_stdout(io.StringIO()):
|
|
1104
|
+
whisp_image = combine_datasets(
|
|
1105
|
+
national_codes=national_codes, validate_bands=True
|
|
1106
|
+
)
|
|
1107
|
+
logger.info(
|
|
1108
|
+
"Image recreated with validation. Retrying batch processing..."
|
|
1109
|
+
)
|
|
1110
|
+
|
|
1111
|
+
# Retry batch processing with validated image
|
|
1112
|
+
results = []
|
|
1113
|
+
progress = ProgressTracker(len(batches), logger=logger)
|
|
1114
|
+
|
|
1115
|
+
# Suppress fiona logging during batch processing (threads create new loggers)
|
|
1116
|
+
fiona_logger = logging.getLogger("fiona")
|
|
1117
|
+
pyogrio_logger = logging.getLogger("pyogrio._io")
|
|
1118
|
+
old_fiona_level = fiona_logger.level
|
|
1119
|
+
old_pyogrio_level = pyogrio_logger.level
|
|
1120
|
+
fiona_logger.setLevel(logging.CRITICAL)
|
|
1121
|
+
pyogrio_logger.setLevel(logging.CRITICAL)
|
|
1122
|
+
|
|
1123
|
+
try:
|
|
1124
|
+
with ThreadPoolExecutor(max_workers=pool_workers) as executor:
|
|
1125
|
+
futures = {
|
|
1126
|
+
executor.submit(process_batch, i, batch): i
|
|
1127
|
+
for i, batch in enumerate(batches)
|
|
1128
|
+
}
|
|
1129
|
+
|
|
1130
|
+
for future in as_completed(futures):
|
|
1131
|
+
try:
|
|
1132
|
+
batch_idx, df_server, df_client = future.result()
|
|
1133
|
+
if plot_id_column not in df_server.columns:
|
|
1134
|
+
df_server[plot_id_column] = range(len(df_server))
|
|
1135
|
+
merged = df_server.merge(
|
|
1136
|
+
df_client,
|
|
1137
|
+
on=plot_id_column,
|
|
1138
|
+
how="left",
|
|
1139
|
+
suffixes=("", "_client"),
|
|
1140
|
+
)
|
|
1141
|
+
results.append(merged)
|
|
1142
|
+
progress.update()
|
|
1143
|
+
except Exception as e:
|
|
1144
|
+
logger.error(
|
|
1145
|
+
f"Batch processing error (retry): {str(e)[:100]}"
|
|
1146
|
+
)
|
|
1147
|
+
|
|
1148
|
+
progress.finish()
|
|
1149
|
+
finally:
|
|
1150
|
+
# Restore logger levels
|
|
1151
|
+
fiona_logger.setLevel(old_fiona_level)
|
|
1152
|
+
pyogrio_logger.setLevel(old_pyogrio_level)
|
|
1153
|
+
except Exception as validation_e:
|
|
1154
|
+
logger.error(
|
|
1155
|
+
f"Failed to recover with validation: {str(validation_e)[:100]}"
|
|
1156
|
+
)
|
|
1157
|
+
return pd.DataFrame()
|
|
1158
|
+
|
|
1159
|
+
if results:
|
|
1160
|
+
# Filter out empty DataFrames and all-NA columns to avoid FutureWarning in pd.concat
|
|
1161
|
+
results_filtered = []
|
|
1162
|
+
for df in results:
|
|
1163
|
+
if not df.empty:
|
|
1164
|
+
# Drop columns that are entirely NA
|
|
1165
|
+
df_clean = df.dropna(axis=1, how="all")
|
|
1166
|
+
if not df_clean.empty:
|
|
1167
|
+
results_filtered.append(df_clean)
|
|
1168
|
+
results = results_filtered
|
|
1169
|
+
|
|
1170
|
+
if results:
|
|
1171
|
+
# Concatenate with explicit dtype handling to suppress FutureWarning
|
|
1172
|
+
combined = pd.concat(results, ignore_index=True, sort=False)
|
|
1173
|
+
# Ensure all column names are strings (fixes pandas .str accessor issues later)
|
|
1174
|
+
combined.columns = combined.columns.astype(str)
|
|
1175
|
+
else:
|
|
1176
|
+
return pd.DataFrame()
|
|
1177
|
+
|
|
1178
|
+
# Clean up duplicate external_id columns created by merges
|
|
1179
|
+
# Rename external_id_column to standardized 'external_id' for schema validation
|
|
1180
|
+
if external_id_column:
|
|
1181
|
+
# Find all columns related to external_id
|
|
1182
|
+
external_id_variants = [
|
|
1183
|
+
col
|
|
1184
|
+
for col in combined.columns
|
|
1185
|
+
if external_id_column.lower() in col.lower()
|
|
1186
|
+
]
|
|
1187
|
+
|
|
1188
|
+
if external_id_variants:
|
|
1189
|
+
# Use the base column name if it exists, otherwise use first variant
|
|
1190
|
+
base_col = (
|
|
1191
|
+
external_id_column
|
|
1192
|
+
if external_id_column in combined.columns
|
|
1193
|
+
else external_id_variants[0]
|
|
1194
|
+
)
|
|
1195
|
+
|
|
1196
|
+
# Rename to standardized 'external_id'
|
|
1197
|
+
if base_col != "external_id":
|
|
1198
|
+
combined = combined.rename(columns={base_col: "external_id"})
|
|
1199
|
+
|
|
1200
|
+
# Drop all other variants
|
|
1201
|
+
cols_to_drop = [c for c in external_id_variants if c != base_col]
|
|
1202
|
+
combined = combined.drop(columns=cols_to_drop, errors="ignore")
|
|
1203
|
+
|
|
1204
|
+
# plotId column is already present from batch processing
|
|
1205
|
+
# Just ensure it's at position 0
|
|
1206
|
+
if plot_id_column in combined.columns:
|
|
1207
|
+
combined = combined[
|
|
1208
|
+
[plot_id_column]
|
|
1209
|
+
+ [col for col in combined.columns if col != plot_id_column]
|
|
1210
|
+
]
|
|
1211
|
+
|
|
1212
|
+
# Add admin context (Country, ProducerCountry, Admin_Level_1) from admin_code
|
|
1213
|
+
# MUST be done BEFORE formatting (which removes _median columns)
|
|
1214
|
+
logger.debug("Adding administrative context...")
|
|
1215
|
+
try:
|
|
1216
|
+
from openforis_whisp.parameters.lookup_gaul1_admin import lookup_dict
|
|
1217
|
+
|
|
1218
|
+
combined = join_admin_codes(
|
|
1219
|
+
df=combined, lookup_dict=lookup_dict, id_col="admin_code_median"
|
|
1220
|
+
)
|
|
1221
|
+
except ImportError:
|
|
1222
|
+
logger.warning(
|
|
1223
|
+
"Could not import lookup dictionary - admin context not added"
|
|
1224
|
+
)
|
|
1225
|
+
|
|
1226
|
+
# Format the output with error handling for bad bands
|
|
1227
|
+
logger.debug("Formatting output...")
|
|
1228
|
+
try:
|
|
1229
|
+
formatted = format_stats_dataframe(
|
|
1230
|
+
df=combined,
|
|
1231
|
+
area_col=f"{geometry_area_column}_sum",
|
|
1232
|
+
decimal_places=decimal_places,
|
|
1233
|
+
unit_type=unit_type,
|
|
1234
|
+
remove_columns=True,
|
|
1235
|
+
convert_water_flag=True,
|
|
1236
|
+
)
|
|
1237
|
+
except Exception as e:
|
|
1238
|
+
# If formatting fails, try recreating the image with validation
|
|
1239
|
+
logger.warning(
|
|
1240
|
+
f"Formatting failed: {str(e)[:100]}. Attempting to recreate image with band validation..."
|
|
1241
|
+
)
|
|
1242
|
+
try:
|
|
1243
|
+
with redirect_stdout(io.StringIO()):
|
|
1244
|
+
whisp_image_validated = combine_datasets(
|
|
1245
|
+
national_codes=national_codes, validate_bands=True
|
|
1246
|
+
)
|
|
1247
|
+
|
|
1248
|
+
# Reprocess batches with validated image - create a local process function
|
|
1249
|
+
logger.info("Reprocessing batches with validated image...")
|
|
1250
|
+
results_validated = []
|
|
1251
|
+
|
|
1252
|
+
def process_batch_validated(
|
|
1253
|
+
batch_idx: int, batch: gpd.GeoDataFrame
|
|
1254
|
+
) -> Tuple[int, pd.DataFrame, pd.DataFrame]:
|
|
1255
|
+
"""Process one batch with validated image."""
|
|
1256
|
+
with ee_semaphore:
|
|
1257
|
+
fc = convert_batch_to_ee(batch)
|
|
1258
|
+
if add_metadata_server:
|
|
1259
|
+
fc = extract_centroid_and_geomtype_server(fc)
|
|
1260
|
+
df_server = process_ee_batch(
|
|
1261
|
+
fc,
|
|
1262
|
+
whisp_image_validated,
|
|
1263
|
+
reducer,
|
|
1264
|
+
batch_idx,
|
|
1265
|
+
max_retries,
|
|
1266
|
+
logger,
|
|
1267
|
+
)
|
|
1268
|
+
df_client = extract_centroid_and_geomtype_client(
|
|
1269
|
+
batch,
|
|
1270
|
+
external_id_column=external_id_column,
|
|
1271
|
+
return_attributes_only=True,
|
|
1272
|
+
)
|
|
1273
|
+
return batch_idx, df_server, df_client
|
|
1274
|
+
|
|
1275
|
+
with ThreadPoolExecutor(max_workers=pool_workers) as executor:
|
|
1276
|
+
futures = {
|
|
1277
|
+
executor.submit(process_batch_validated, i, batch): i
|
|
1278
|
+
for i, batch in enumerate(batches)
|
|
1279
|
+
}
|
|
1280
|
+
|
|
1281
|
+
for future in as_completed(futures):
|
|
1282
|
+
try:
|
|
1283
|
+
batch_idx, df_server, df_client = future.result()
|
|
1284
|
+
if plot_id_column not in df_server.columns:
|
|
1285
|
+
df_server[plot_id_column] = range(len(df_server))
|
|
1286
|
+
|
|
1287
|
+
# Drop external_id_column from df_client if it exists (already in df_server)
|
|
1288
|
+
if (
|
|
1289
|
+
external_id_column
|
|
1290
|
+
and external_id_column in df_client.columns
|
|
1291
|
+
):
|
|
1292
|
+
df_client = df_client.drop(columns=[external_id_column])
|
|
1293
|
+
|
|
1294
|
+
merged = df_server.merge(
|
|
1295
|
+
df_client,
|
|
1296
|
+
on=plot_id_column,
|
|
1297
|
+
how="left",
|
|
1298
|
+
suffixes=("", "_client"),
|
|
1299
|
+
)
|
|
1300
|
+
results_validated.append(merged)
|
|
1301
|
+
except Exception as batch_e:
|
|
1302
|
+
logger.error(
|
|
1303
|
+
f"Batch reprocessing error: {str(batch_e)[:100]}"
|
|
1304
|
+
)
|
|
1305
|
+
|
|
1306
|
+
if results_validated:
|
|
1307
|
+
# Concatenate with explicit dtype handling to suppress FutureWarning
|
|
1308
|
+
combined = pd.concat(
|
|
1309
|
+
results_validated, ignore_index=True, sort=False
|
|
1310
|
+
)
|
|
1311
|
+
# Ensure all column names are strings (fixes pandas .str accessor issues later)
|
|
1312
|
+
combined.columns = combined.columns.astype(str)
|
|
1313
|
+
|
|
1314
|
+
# Clean up duplicate external_id columns created by merges
|
|
1315
|
+
if external_id_column:
|
|
1316
|
+
external_id_variants = [
|
|
1317
|
+
col
|
|
1318
|
+
for col in combined.columns
|
|
1319
|
+
if external_id_column.lower() in col.lower()
|
|
1320
|
+
]
|
|
1321
|
+
|
|
1322
|
+
if external_id_variants:
|
|
1323
|
+
base_col = external_id_column
|
|
1324
|
+
if (
|
|
1325
|
+
base_col not in combined.columns
|
|
1326
|
+
and external_id_variants
|
|
1327
|
+
):
|
|
1328
|
+
base_col = external_id_variants[0]
|
|
1329
|
+
combined = combined.rename(
|
|
1330
|
+
columns={base_col: "external_id"}
|
|
1331
|
+
)
|
|
1332
|
+
|
|
1333
|
+
cols_to_drop = [
|
|
1334
|
+
c for c in external_id_variants if c != base_col
|
|
1335
|
+
]
|
|
1336
|
+
combined = combined.drop(
|
|
1337
|
+
columns=cols_to_drop, errors="ignore"
|
|
1338
|
+
)
|
|
1339
|
+
|
|
1340
|
+
# plotId column is already present, just ensure it's at position 0
|
|
1341
|
+
if plot_id_column in combined.columns:
|
|
1342
|
+
combined = combined[
|
|
1343
|
+
[plot_id_column]
|
|
1344
|
+
+ [col for col in combined.columns if col != plot_id_column]
|
|
1345
|
+
]
|
|
1346
|
+
|
|
1347
|
+
# Add admin context again
|
|
1348
|
+
try:
|
|
1349
|
+
from openforis_whisp.parameters.lookup_gaul1_admin import (
|
|
1350
|
+
lookup_dict,
|
|
1351
|
+
)
|
|
1352
|
+
|
|
1353
|
+
combined = join_admin_codes(
|
|
1354
|
+
df=combined,
|
|
1355
|
+
lookup_dict=lookup_dict,
|
|
1356
|
+
id_col="admin_code_median",
|
|
1357
|
+
)
|
|
1358
|
+
except ImportError:
|
|
1359
|
+
logger.warning(
|
|
1360
|
+
"Could not import lookup dictionary - admin context not added"
|
|
1361
|
+
)
|
|
1362
|
+
|
|
1363
|
+
# Try formatting again with validated data
|
|
1364
|
+
formatted = format_stats_dataframe(
|
|
1365
|
+
df=combined,
|
|
1366
|
+
area_col=f"{geometry_area_column}_sum",
|
|
1367
|
+
decimal_places=decimal_places,
|
|
1368
|
+
unit_type=unit_type,
|
|
1369
|
+
remove_columns=True,
|
|
1370
|
+
convert_water_flag=True,
|
|
1371
|
+
)
|
|
1372
|
+
else:
|
|
1373
|
+
logger.error(" Reprocessing with validation produced no results")
|
|
1374
|
+
return pd.DataFrame()
|
|
1375
|
+
except Exception as retry_e:
|
|
1376
|
+
logger.error(
|
|
1377
|
+
f"Failed to recover from formatting error: {str(retry_e)[:100]}"
|
|
1378
|
+
)
|
|
1379
|
+
raise retry_e
|
|
1380
|
+
|
|
1381
|
+
logger.info(f"Processed {len(formatted):,} features successfully")
|
|
1382
|
+
return formatted
|
|
1383
|
+
else:
|
|
1384
|
+
logger.error(" No results produced")
|
|
1385
|
+
return pd.DataFrame()
|
|
1386
|
+
|
|
1387
|
+
|
|
1388
|
+
# ============================================================================
|
|
1389
|
+
# SEQUENTIAL PROCESSING (STANDARD ENDPOINT)
|
|
1390
|
+
# ============================================================================
|
|
1391
|
+
|
|
1392
|
+
|
|
1393
|
+
def whisp_stats_geojson_to_df_sequential(
    input_geojson_filepath: str,
    external_id_column: str = None,
    remove_geom: bool = False,
    national_codes: List[str] = None,
    unit_type: str = "ha",
    whisp_image: ee.Image = None,
    custom_bands: Dict[str, Any] = None,
    add_metadata_client_side: bool = True,
    logger: logging.Logger = None,
    # Format parameters (auto-detect from config if not provided)
    decimal_places: int = None,
) -> pd.DataFrame:
    """
    Process GeoJSON sequentially using the standard EE endpoint with automatic formatting.

    Uses reduceRegions for server-side processing and client-side metadata
    extraction via GeoPandas. Suitable for smaller datasets or when the
    high-volume endpoint is not available. Automatically formats output.

    Requires: standard EE endpoint (default)

    Parameters
    ----------
    input_geojson_filepath : str
        Path to input GeoJSON
    external_id_column : str, optional
        Column name for external IDs
    remove_geom : bool
        Remove geometry from output
    national_codes : List[str], optional
        ISO2 codes for national datasets
    unit_type : str
        "ha" or "percent"
    whisp_image : ee.Image, optional
        Pre-combined image
    custom_bands : Dict[str, Any], optional
        Custom band information
    add_metadata_client_side : bool
        Add client-side metadata (recommended)
    logger : logging.Logger, optional
        Logger for output
    decimal_places : int, optional
        Decimal places for formatting. If None, auto-detects from config.

    Returns
    -------
    pd.DataFrame
        Formatted results DataFrame
    """
    from openforis_whisp.reformat import format_stats_dataframe

    logger = logger or logging.getLogger("whisp-concurrent")

    # Suppress verbose output from dependencies (sequential processing, so concurrency of 1)
    _suppress_verbose_output(max_concurrent=1)

    # Auto-detect decimal places from config if not provided
    if decimal_places is None:
        decimal_places = _extract_decimal_places(stats_area_columns_formatting)
        logger.debug(f"Using decimal_places={decimal_places} from config")

    # Validate endpoint
    validate_ee_endpoint("standard", raise_error=True)

    # Load GeoJSON with output suppressed
    gdf = _load_geojson_silently(input_geojson_filepath)
    logger.info(f"Loaded {len(gdf):,} features")

    # Clean geometries
    gdf = clean_geodataframe(gdf, logger=logger)

    # Add stable plotIds for merging (starting from 1, not 0)
    gdf[plot_id_column] = range(1, len(gdf) + 1)

    # Add stable row IDs
    row_id_col = "__row_id__"
    gdf[row_id_col] = range(len(gdf))

    # Create image if not provided
    if whisp_image is None:
        logger.debug("Creating Whisp image...")
        # Suppress print statements from combine_datasets
        with redirect_stdout(io.StringIO()):
            try:
                # First try without validation
                whisp_image = combine_datasets(
                    national_codes=national_codes, validate_bands=False
                )
            except Exception as e:
                logger.warning(
                    f"First attempt failed: {str(e)[:100]}. Retrying with validate_bands=True..."
                )
                # Retry with validation to catch and fix bad bands
                whisp_image = combine_datasets(
                    national_codes=national_codes, validate_bands=True
                )

    # Convert to EE (suppress print statements from convert_geojson_to_ee)
    logger.debug("Converting to EE FeatureCollection...")
    with redirect_stdout(io.StringIO()):
        fc = convert_geojson_to_ee(input_geojson_filepath)

    # Create reducer
    reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)

    # Process server-side with error handling for bad bands
    logger.info("Processing with Earth Engine...")
    try:
        results_fc = whisp_image.reduceRegions(collection=fc, reducer=reducer, scale=10)
        df_server = convert_ee_to_df(results_fc)
    except Exception as e:
        # Check if this is a band error
        error_msg = str(e)
        is_band_error = any(
            keyword in error_msg
            for keyword in ["Image.load", "asset", "not found", "does not exist"]
        )

        if is_band_error and whisp_image is not None:
            logger.warning(
                f"Detected bad band error: {error_msg[:100]}. Retrying with validate_bands=True..."
            )
            try:
                with redirect_stdout(io.StringIO()):
                    whisp_image = combine_datasets(
                        national_codes=national_codes, validate_bands=True
                    )
                logger.info("Image recreated with validation. Retrying processing...")
                results_fc = whisp_image.reduceRegions(
                    collection=fc, reducer=reducer, scale=10
                )
                df_server = convert_ee_to_df(results_fc)
            except Exception as retry_e:
                logger.error(f"Retry failed: {str(retry_e)[:100]}")
                raise
        else:
            raise

    logger.debug("Server-side processing complete")

    # Add row_id if missing
    if row_id_col not in df_server.columns:
        df_server[row_id_col] = range(len(df_server))

    # Add client-side metadata if requested
    if add_metadata_client_side:
        logger.debug("Extracting client-side metadata...")
        df_client = extract_centroid_and_geomtype_client(
            gdf,
            external_id_column=external_id_column,
            return_attributes_only=True,
        )

        # Drop external_id_column from df_client if it exists (already in df_server)
        if external_id_column and external_id_column in df_client.columns:
            df_client = df_client.drop(columns=[external_id_column])

        # Merge
        result = df_server.merge(
            df_client, on=row_id_col, how="left", suffixes=("", "_client")
        )
    else:
        result = df_server

    # Remove internal __row_id__ column if present
    if row_id_col in result.columns:
        result = result.drop(columns=[row_id_col])

    # Add admin context (Country, ProducerCountry, Admin_Level_1) from admin_code
    # MUST be done BEFORE formatting (which removes _median columns)
    logger.debug("Adding administrative context...")
    try:
        from openforis_whisp.parameters.lookup_gaul1_admin import lookup_dict

        result = join_admin_codes(
            df=result, lookup_dict=lookup_dict, id_col="admin_code_median"
        )
    except ImportError:
        logger.warning("Could not import lookup dictionary - admin context not added")

    # Format the output
    logger.debug("Formatting output...")
    formatted = format_stats_dataframe(
        df=result,
        area_col=f"{geometry_area_column}_sum",
        decimal_places=decimal_places,
        unit_type=unit_type,
        remove_columns=True,
        convert_water_flag=True,
    )

    logger.info(f"Processed {len(formatted):,} features")

    # Consolidate external_id_column to standardized 'external_id'
    if external_id_column:
        variants = [
            col
            for col in formatted.columns
            if external_id_column.lower() in col.lower()
        ]
        if variants:
            base_col = (
                external_id_column
                if external_id_column in formatted.columns
                else variants[0]
            )
            if base_col != "external_id":
                formatted = formatted.rename(columns={base_col: "external_id"})
            # Drop other variants
            formatted = formatted.drop(
                columns=[c for c in variants if c != base_col], errors="ignore"
            )

    return formatted
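
A minimal usage sketch for the sequential function defined above (illustrative only, not part of the module). It assumes Earth Engine is already authenticated and initialized for your account, and that the file path and ID column name exist:

import ee
from openforis_whisp.advanced_stats import whisp_stats_geojson_to_df_sequential

ee.Initialize()  # project/credential setup depends on your EE configuration

df = whisp_stats_geojson_to_df_sequential(
    input_geojson_filepath="plots.geojson",  # placeholder path
    external_id_column="farm_id",            # placeholder column name
    national_codes=["CI"],                   # optional ISO2 codes
    unit_type="ha",
)
print(df.head())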


# ============================================================================
# FORMATTED WRAPPER FUNCTIONS (STATS + FORMAT)
# ============================================================================

def whisp_formatted_stats_geojson_to_df_concurrent(
    input_geojson_filepath: str,
    external_id_column: str = None,
    remove_geom: bool = False,
    national_codes: List[str] = None,
    unit_type: str = "ha",
    whisp_image: ee.Image = None,
    custom_bands: Dict[str, Any] = None,
    batch_size: int = 10,
    max_concurrent: int = 20,
    validate_geometries: bool = True,
    max_retries: int = 3,
    add_metadata_server: bool = False,
    logger: logging.Logger = None,
    # Format parameters (auto-detect from config if not provided)
    decimal_places: int = None,
    remove_median_columns: bool = True,
    convert_water_flag: bool = True,
    water_flag_threshold: float = 0.5,
    sort_column: str = "plotId",
) -> pd.DataFrame:
    """
    Process GeoJSON concurrently with automatic formatting and validation.

    Combines whisp_stats_geojson_to_df_concurrent + format_stats_dataframe + validation
    for a complete pipeline: extract stats → convert units → format output → validate schema.

    Uses the high-volume endpoint and concurrent batching.

    Parameters
    ----------
    input_geojson_filepath : str
        Path to input GeoJSON file
    external_id_column : str, optional
        Column name for external IDs
    remove_geom : bool
        Remove geometry column from output
    national_codes : List[str], optional
        ISO2 codes for national datasets
    unit_type : str
        "ha" or "percent"
    whisp_image : ee.Image, optional
        Pre-combined image
    custom_bands : Dict[str, Any], optional
        Custom band information
    batch_size : int
        Features per batch (default 10)
    max_concurrent : int
        Maximum concurrent EE calls (default 20)
    validate_geometries : bool
        Validate and clean geometries (default True)
    max_retries : int
        Retry attempts per batch (default 3)
    add_metadata_server : bool
        Add metadata server-side (default False)
    logger : logging.Logger, optional
        Logger for output
    decimal_places : int, optional
        Decimal places for rounding. If None, auto-detects from config:
        - Area columns: geometry_area_column_formatting
        - Percent columns: stats_percent_columns_formatting
        - Other columns: stats_area_columns_formatting
    remove_median_columns : bool
        Remove '_median' columns (default True)
    convert_water_flag : bool
        Convert water flag to boolean (default True)
    water_flag_threshold : float
        Water flag ratio threshold (default 0.5)
    sort_column : str
        Column to sort by (default "plotId", None to skip)

    Returns
    -------
    pd.DataFrame
        Validated, formatted results DataFrame
    """
    from openforis_whisp.reformat import format_stats_dataframe

    logger = logger or logging.getLogger("whisp-concurrent")

    # Auto-detect decimal places from config if not provided
    if decimal_places is None:
        # Use stats_area_columns_formatting as default for most columns
        decimal_places = _extract_decimal_places(stats_area_columns_formatting)
        logger.debug(f"Using decimal_places={decimal_places} from config")

    # Step 1: Get raw stats
    logger.debug("Step 1/3: Extracting statistics (concurrent)...")
    df_raw = whisp_stats_geojson_to_df_concurrent(
        input_geojson_filepath=input_geojson_filepath,
        external_id_column=external_id_column,
        remove_geom=remove_geom,
        national_codes=national_codes,
        unit_type=unit_type,
        whisp_image=whisp_image,
        custom_bands=custom_bands,
        batch_size=batch_size,
        max_concurrent=max_concurrent,
        validate_geometries=validate_geometries,
        max_retries=max_retries,
        add_metadata_server=add_metadata_server,
        logger=logger,
    )

    # Step 2: Format the output
    logger.debug("Step 2/3: Formatting output...")
    median_cols_before = [c for c in df_raw.columns if c.endswith("_median")]
    logger.debug(
        f"Columns ending with '_median' BEFORE formatting: {median_cols_before}"
    )

    df_formatted = format_stats_dataframe(
        df=df_raw,
        area_col=f"{geometry_area_column}_sum",
        decimal_places=decimal_places,
        unit_type=unit_type,
        remove_columns=remove_median_columns,
        convert_water_flag=convert_water_flag,
        water_flag_threshold=water_flag_threshold,
        sort_column=sort_column,
    )

    median_cols_after = [c for c in df_formatted.columns if c.endswith("_median")]
    logger.debug(f"Columns ending with '_median' AFTER formatting: {median_cols_after}")

    # Step 2b: Reformat geometry and handle point areas
    try:
        df_formatted = reformat_geometry_type(df_formatted)
    except Exception as e:
        logger.warning(f"Error reformatting geometry type: {e}")

    try:
        df_formatted = set_point_geometry_area_to_zero(df_formatted)
    except Exception as e:
        logger.warning(f"Error setting point geometry area to zero: {e}")

    # Step 3: Validate against schema
    logger.debug("Step 3/3: Validating against schema...")
    from openforis_whisp.reformat import validate_dataframe_using_lookups_flexible

    df_validated = validate_dataframe_using_lookups_flexible(
        df_stats=df_formatted,
        national_codes=national_codes,
        custom_bands=custom_bands,
    )

    logger.info("Concurrent processing + formatting + validation complete")
    return df_validated
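
A usage sketch for the concurrent wrapper (illustrative, not part of the package). It assumes Earth Engine has been initialized against the high-volume endpoint; the file path is a placeholder, and batch_size / max_concurrent simply echo the defaults above and can be tuned to your quota:

from openforis_whisp.advanced_stats import (
    whisp_formatted_stats_geojson_to_df_concurrent,
)

df = whisp_formatted_stats_geojson_to_df_concurrent(
    input_geojson_filepath="large_plots.geojson",  # placeholder path
    batch_size=10,       # features per EE request
    max_concurrent=20,   # parallel EE calls
    unit_type="percent",
)
df.to_csv("whisp_stats.csv", index=False)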
def whisp_formatted_stats_geojson_to_df_sequential(
    input_geojson_filepath: str,
    external_id_column: str = None,
    remove_geom: bool = False,
    national_codes: List[str] = None,
    unit_type: str = "ha",
    whisp_image: ee.Image = None,
    custom_bands: Dict[str, Any] = None,
    add_metadata_client_side: bool = True,
    logger: logging.Logger = None,
    # Format parameters (auto-detect from config if not provided)
    decimal_places: int = None,
    remove_median_columns: bool = True,
    convert_water_flag: bool = True,
    water_flag_threshold: float = 0.5,
    sort_column: str = "plotId",
) -> pd.DataFrame:
    """
    Process GeoJSON sequentially with automatic formatting and validation.

    Combines whisp_stats_geojson_to_df_sequential + format_stats_dataframe + validation
    for a complete pipeline: extract stats → convert units → format output → validate schema.

    Uses the standard endpoint for sequential processing.

    Parameters
    ----------
    input_geojson_filepath : str
        Path to input GeoJSON file
    external_id_column : str, optional
        Column name for external IDs
    remove_geom : bool
        Remove geometry from output
    national_codes : List[str], optional
        ISO2 codes for national datasets
    unit_type : str
        "ha" or "percent"
    whisp_image : ee.Image, optional
        Pre-combined image
    custom_bands : Dict[str, Any], optional
        Custom band information
    add_metadata_client_side : bool
        Add client-side metadata (default True)
    logger : logging.Logger, optional
        Logger for output
    decimal_places : int, optional
        Decimal places for rounding. If None, auto-detects from config:
        - Area columns: geometry_area_column_formatting
        - Percent columns: stats_percent_columns_formatting
        - Other columns: stats_area_columns_formatting
    remove_median_columns : bool
        Remove '_median' columns (default True)
    convert_water_flag : bool
        Convert water flag to boolean (default True)
    water_flag_threshold : float
        Water flag ratio threshold (default 0.5)
    sort_column : str
        Column to sort by (default "plotId", None to skip)

    Returns
    -------
    pd.DataFrame
        Validated, formatted results DataFrame
    """
    from openforis_whisp.reformat import format_stats_dataframe

    logger = logger or logging.getLogger("whisp-concurrent")

    # Auto-detect decimal places from config if not provided
    if decimal_places is None:
        # Use stats_area_columns_formatting as default for most columns
        decimal_places = _extract_decimal_places(stats_area_columns_formatting)
        logger.debug(f"Using decimal_places={decimal_places} from config")

    # Step 1: Get raw stats
    logger.debug("Step 1/3: Extracting statistics (sequential)...")
    df_raw = whisp_stats_geojson_to_df_sequential(
        input_geojson_filepath=input_geojson_filepath,
        external_id_column=external_id_column,
        remove_geom=remove_geom,
        national_codes=national_codes,
        unit_type=unit_type,
        whisp_image=whisp_image,
        custom_bands=custom_bands,
        add_metadata_client_side=add_metadata_client_side,
        logger=logger,
    )

    # Step 2: Format the output
    logger.debug("Step 2/3: Formatting output...")
    median_cols_before = [c for c in df_raw.columns if c.endswith("_median")]
    logger.debug(
        f"Columns ending with '_median' BEFORE formatting: {median_cols_before}"
    )

    df_formatted = format_stats_dataframe(
        df=df_raw,
        area_col=f"{geometry_area_column}_sum",
        decimal_places=decimal_places,
        unit_type=unit_type,
        remove_columns=remove_median_columns,
        convert_water_flag=convert_water_flag,
        water_flag_threshold=water_flag_threshold,
        sort_column=sort_column,
    )

    median_cols_after = [c for c in df_formatted.columns if c.endswith("_median")]
    logger.debug(f"Columns ending with '_median' AFTER formatting: {median_cols_after}")

    # Step 2b: Reformat geometry and handle point areas
    try:
        df_formatted = reformat_geometry_type(df_formatted)
    except Exception as e:
        logger.warning(f"Error reformatting geometry type: {e}")

    try:
        df_formatted = set_point_geometry_area_to_zero(df_formatted)
    except Exception as e:
        logger.warning(f"Error setting point geometry area to zero: {e}")

    # Step 3: Validate against schema
    logger.debug("Step 3/3: Validating against schema...")
    from openforis_whisp.reformat import validate_dataframe_using_lookups_flexible

    df_validated = validate_dataframe_using_lookups_flexible(
        df_stats=df_formatted,
        national_codes=national_codes,
        custom_bands=custom_bands,
    )

    logger.info("Sequential processing + formatting + validation complete")
    return df_validated
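
Both wrappers report progress through the "whisp-concurrent" logger rather than print statements. A small illustrative sketch of how a caller could surface the per-step DEBUG messages shown above (assumes no other logging configuration is already in place):

import logging

# Basic console output for the application as a whole.
logging.basicConfig(level=logging.INFO)
# Raise only the Whisp logger to DEBUG to see the step-by-step messages.
logging.getLogger("whisp-concurrent").setLevel(logging.DEBUG)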


# ============================================================================
# FAST PROCESSING WITH AUTO-ROUTING
# ============================================================================

def whisp_formatted_stats_geojson_to_df_fast(
    input_geojson_filepath: str,
    external_id_column: str = None,
    remove_geom: bool = False,
    national_codes: List[str] = None,
    unit_type: str = "ha",
    whisp_image: ee.Image = None,
    custom_bands: Dict[str, Any] = None,
    mode: str = "auto",
    # Concurrent-specific parameters
    batch_size: int = 10,
    max_concurrent: int = 20,
    validate_geometries: bool = True,
    max_retries: int = 3,
    add_metadata_server: bool = False,
    # Format parameters (auto-detect from config if not provided)
    decimal_places: int = None,
    remove_median_columns: bool = True,
    convert_water_flag: bool = True,
    water_flag_threshold: float = 0.5,
    sort_column: str = "plotId",
) -> pd.DataFrame:
    """
    Process GeoJSON to Whisp statistics with optimized fast processing.

    Automatically selects between concurrent (high-volume endpoint) and sequential
    (standard endpoint) based on file size, or allows explicit mode selection.

    This is the recommended entry point for most users who want automatic optimization.

    Parameters
    ----------
    input_geojson_filepath : str
        Path to input GeoJSON file
    external_id_column : str, optional
        Column name for external IDs
    remove_geom : bool
        Remove geometry column from output
    national_codes : List[str], optional
        ISO2 codes for national datasets
    unit_type : str
        "ha" or "percent"
    whisp_image : ee.Image, optional
        Pre-combined image
    custom_bands : Dict[str, Any], optional
        Custom band information
    mode : str
        Processing mode:
        - "auto": Choose based on file size (default)
            * <=5MB: sequential
            * >5MB: concurrent
        - "concurrent": Force high-volume endpoint (batch processing)
        - "sequential": Force standard endpoint (single-threaded)
    batch_size : int
        Features per batch (only for concurrent mode)
    max_concurrent : int
        Maximum concurrent EE calls (only for concurrent mode)
    validate_geometries : bool
        Validate and clean geometries
    max_retries : int
        Retry attempts per batch (only for concurrent mode)
    add_metadata_server : bool
        Add metadata server-side (only for concurrent mode)
    decimal_places : int, optional
        Decimal places for rounding. If None, auto-detects from config.
    remove_median_columns : bool
        Remove '_median' columns
    convert_water_flag : bool
        Convert water flag to boolean
    water_flag_threshold : float
        Water flag ratio threshold
    sort_column : str
        Column to sort by

    Returns
    -------
    pd.DataFrame
        Validated, formatted results DataFrame

    Examples
    --------
    >>> # Auto-detect best method based on file size
    >>> df = whisp_formatted_stats_geojson_to_df_fast("data.geojson")

    >>> # Force concurrent processing for large datasets
    >>> df = whisp_formatted_stats_geojson_to_df_fast(
    ...     "large_data.geojson",
    ...     mode="concurrent"
    ... )

    >>> # Use sequential for guaranteed completion
    >>> df = whisp_formatted_stats_geojson_to_df_fast(
    ...     "data.geojson",
    ...     mode="sequential"
    ... )
    """
    logger = logging.getLogger("whisp-concurrent")

    # Determine processing mode
    if mode == "auto":
        try:
            file_size = Path(input_geojson_filepath).stat().st_size
            if file_size > 5_000_000:  # >5MB
                chosen_mode = "concurrent"
                logger.info(
                    f"File size {file_size/1e6:.1f}MB → Using concurrent (high-volume endpoint)"
                )
            else:  # <=5MB
                chosen_mode = "sequential"
                logger.info(
                    f"File size {file_size/1e6:.1f}MB → Using sequential (standard endpoint)"
                )
        except Exception as e:
            logger.warning(
                f"Could not determine file size: {e}. Defaulting to sequential."
            )
            chosen_mode = "sequential"
    elif mode in ("concurrent", "sequential"):
        chosen_mode = mode
        logger.info(f"Mode explicitly set to: {mode}")
    else:
        raise ValueError(
            f"Invalid mode '{mode}'. Must be 'auto', 'concurrent', or 'sequential'."
        )

    # Route to appropriate function
    if chosen_mode == "concurrent":
        logger.debug("Routing to concurrent processing...")
        return whisp_formatted_stats_geojson_to_df_concurrent(
            input_geojson_filepath=input_geojson_filepath,
            external_id_column=external_id_column,
            remove_geom=remove_geom,
            national_codes=national_codes,
            unit_type=unit_type,
            whisp_image=whisp_image,
            custom_bands=custom_bands,
            batch_size=batch_size,
            max_concurrent=max_concurrent,
            validate_geometries=validate_geometries,
            max_retries=max_retries,
            add_metadata_server=add_metadata_server,
            logger=logger,
            decimal_places=decimal_places,
            remove_median_columns=remove_median_columns,
            convert_water_flag=convert_water_flag,
            water_flag_threshold=water_flag_threshold,
            sort_column=sort_column,
        )
    else:  # sequential
        logger.debug("Routing to sequential processing...")
        return whisp_formatted_stats_geojson_to_df_sequential(
            input_geojson_filepath=input_geojson_filepath,
            external_id_column=external_id_column,
            remove_geom=remove_geom,
            national_codes=national_codes,
            unit_type=unit_type,
            whisp_image=whisp_image,
            custom_bands=custom_bands,
            logger=logger,
            decimal_places=decimal_places,
            remove_median_columns=remove_median_columns,
            convert_water_flag=convert_water_flag,
            water_flag_threshold=water_flag_threshold,
            sort_column=sort_column,
        )
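
For reference, the auto-routing decision can be anticipated before calling the function. A tiny illustrative sketch (the path is a placeholder) that applies the same 5 MB threshold used in "auto" mode:

from pathlib import Path

size_bytes = Path("plots.geojson").stat().st_size  # placeholder path
predicted_mode = "concurrent" if size_bytes > 5_000_000 else "sequential"
print(f"{size_bytes / 1e6:.1f} MB -> {predicted_mode}")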