openforis-whisp 3.0.0a4__tar.gz → 3.0.0a5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/PKG-INFO +1 -1
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/pyproject.toml +1 -1
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/advanced_stats.py +221 -131
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/parameters/lookup_context_and_metadata.csv +1 -1
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/LICENSE +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/README.md +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/__init__.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/data_checks.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/data_conversion.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/datasets.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/logger.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/parameters/__init__.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/parameters/config_runtime.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/parameters/lookup_gaul1_admin.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/parameters/lookup_gee_datasets.csv +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/pd_schemas.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/reformat.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/risk.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/stats.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: openforis-whisp
|
|
3
|
-
Version: 3.0.0a4
|
|
3
|
+
Version: 3.0.0a5
|
|
4
4
|
Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: whisp,geospatial,data-processing
|
|
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|
|
4
4
|
|
|
5
5
|
[tool.poetry]
|
|
6
6
|
name = "openforis-whisp"
|
|
7
|
-
version = "3.0.0a4"
|
|
7
|
+
version = "3.0.0a5"
|
|
8
8
|
description = "Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations."
|
|
9
9
|
repository = "https://github.com/forestdatapartnership/whisp"
|
|
10
10
|
authors = ["Andy Arnell <andrew.arnell@fao.org>"]
|
|
@@ -181,8 +181,25 @@ def _suppress_verbose_output(max_concurrent: int = None):
|
|
|
181
181
|
reformat_logger.setLevel(logging.ERROR)
|
|
182
182
|
|
|
183
183
|
|
|
184
|
-
def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
|
|
185
|
-
|
|
184
|
+
def _load_and_prepare_geojson(
|
|
185
|
+
filepath: str, external_id_column: Optional[str] = None
|
|
186
|
+
) -> gpd.GeoDataFrame:
|
|
187
|
+
"""Load GeoJSON file and prepare for processing.
|
|
188
|
+
|
|
189
|
+
Suppresses logging output and optionally renames external_id column.
|
|
190
|
+
|
|
191
|
+
Parameters
|
|
192
|
+
----------
|
|
193
|
+
filepath : str
|
|
194
|
+
Path to GeoJSON file
|
|
195
|
+
external_id_column : str, optional
|
|
196
|
+
If provided, rename this column to 'external_id' immediately after loading
|
|
197
|
+
|
|
198
|
+
Returns
|
|
199
|
+
-------
|
|
200
|
+
gpd.GeoDataFrame
|
|
201
|
+
Loaded GeoDataFrame with external_id renamed if specified
|
|
202
|
+
"""
|
|
186
203
|
fiona_logger = logging.getLogger("fiona")
|
|
187
204
|
pyogrio_logger = logging.getLogger("pyogrio._io")
|
|
188
205
|
old_fiona_level = fiona_logger.level
|
|
@@ -193,6 +210,16 @@ def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
|
|
|
193
210
|
try:
|
|
194
211
|
with redirect_stdout(io.StringIO()):
|
|
195
212
|
gdf = gpd.read_file(filepath)
|
|
213
|
+
|
|
214
|
+
# Rename external_id column early and convert to string
|
|
215
|
+
if external_id_column and external_id_column in gdf.columns:
|
|
216
|
+
if external_id_column != "external_id":
|
|
217
|
+
gdf = gdf.rename(
|
|
218
|
+
columns={external_id_column: "external_id"}
|
|
219
|
+
) # hard coding here to avoid confusion later
|
|
220
|
+
# Convert to string to ensure consistent type throughout pipeline
|
|
221
|
+
gdf["external_id"] = gdf["external_id"].astype(str)
|
|
222
|
+
|
|
196
223
|
return gdf
|
|
197
224
|
finally:
|
|
198
225
|
fiona_logger.setLevel(old_fiona_level)
|
|
@@ -780,19 +807,17 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
|
|
|
780
807
|
if not check_ee_endpoint(endpoint_type):
|
|
781
808
|
if endpoint_type == "high-volume":
|
|
782
809
|
msg = (
|
|
783
|
-
"Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
|
|
810
|
+
"# Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
|
|
784
811
|
"ee.Reset()\n"
|
|
785
|
-
"ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
|
|
786
|
-
"
|
|
787
|
-
"ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
|
|
812
|
+
"ee.Initialize(project=gee_project_name, opt_url='https://earthengine-highvolume.googleapis.com')\n"
|
|
813
|
+
"# where gee_project_name is your GEE project (necessary in Colab)"
|
|
788
814
|
)
|
|
789
815
|
else: # standard endpoint
|
|
790
816
|
msg = (
|
|
791
817
|
"Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
|
|
792
818
|
"ee.Reset()\n"
|
|
793
|
-
"ee.Initialize()\n"
|
|
794
|
-
"
|
|
795
|
-
"ee.Initialize(project='your_cloud_project_name')"
|
|
819
|
+
"ee.Initialize(project=gee_project_name)\n"
|
|
820
|
+
"# where gee_project_name is your GEE project (necessary in Colab)"
|
|
796
821
|
)
|
|
797
822
|
|
|
798
823
|
if raise_error:
|
|
@@ -865,13 +890,13 @@ def extract_centroid_and_geomtype_client(
|
|
|
865
890
|
if plot_id_column in gdf.columns:
|
|
866
891
|
cols.append(plot_id_column)
|
|
867
892
|
|
|
868
|
-
# Include
|
|
893
|
+
# Include external_id if it exists (already renamed during load)
|
|
869
894
|
if (
|
|
870
895
|
external_id_column
|
|
871
|
-
and
|
|
872
|
-
and
|
|
896
|
+
and "external_id" in gdf.columns
|
|
897
|
+
and "external_id" not in cols
|
|
873
898
|
):
|
|
874
|
-
cols.append(
|
|
899
|
+
cols.append("external_id")
|
|
875
900
|
|
|
876
901
|
# Always include metadata columns (centroid, geometry type)
|
|
877
902
|
cols.extend([x_col, y_col, type_col])
|
|
@@ -965,6 +990,10 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
|
|
|
965
990
|
|
|
966
991
|
Preserves the __row_id__ column if present so it can be retrieved after processing.
|
|
967
992
|
|
|
993
|
+
IMPORTANT: Drops external_id column before sending to EE to enable query caching.
|
|
994
|
+
external_id is user metadata that's not needed for EE computation. Including it
|
|
995
|
+
breaks EE's caching mechanism since each unique external_id creates a different query.
|
|
996
|
+
|
|
968
997
|
Parameters
|
|
969
998
|
----------
|
|
970
999
|
batch_gdf : gpd.GeoDataFrame
|
|
@@ -973,15 +1002,21 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
|
|
|
973
1002
|
Returns
|
|
974
1003
|
-------
|
|
975
1004
|
ee.FeatureCollection
|
|
976
|
-
EE FeatureCollection with __row_id__ as a feature property
|
|
1005
|
+
EE FeatureCollection with __row_id__ as a feature property (no external_id)
|
|
977
1006
|
"""
|
|
1007
|
+
# Drop external_id before sending to EE to enable caching
|
|
1008
|
+
# (external_id is preserved separately on client side for merging)
|
|
1009
|
+
batch_for_ee = batch_gdf.copy()
|
|
1010
|
+
if "external_id" in batch_for_ee.columns:
|
|
1011
|
+
batch_for_ee = batch_for_ee.drop(columns=["external_id"])
|
|
1012
|
+
|
|
978
1013
|
# Pass GeoDataFrame directly to preserve CRS metadata
|
|
979
1014
|
# convert_geojson_to_ee will handle:
|
|
980
1015
|
# - CRS detection and conversion to WGS84 if needed
|
|
981
1016
|
# - Data type sanitization (datetime, object columns)
|
|
982
1017
|
# - Geometry validation and Z-coordinate stripping
|
|
983
1018
|
|
|
984
|
-
fc = convert_geojson_to_ee(
|
|
1019
|
+
fc = convert_geojson_to_ee(batch_for_ee, enforce_wgs84=True, strip_z_coords=True)
|
|
985
1020
|
|
|
986
1021
|
# If __row_id__ is in the original GeoDataFrame, it will be preserved
|
|
987
1022
|
# as a feature property in the GeoJSON and thus in the EE FeatureCollection
|
|
@@ -1107,7 +1142,19 @@ def process_ee_batch(
|
|
|
1107
1142
|
# Ensure plot_id_column is present for merging
|
|
1108
1143
|
# It should come from the feature properties (added before EE processing)
|
|
1109
1144
|
if plot_id_column not in df.columns:
|
|
1110
|
-
|
|
1145
|
+
logger.warning(
|
|
1146
|
+
f"Batch {batch_idx + 1}: plotId column DROPPED by EE. "
|
|
1147
|
+
f"Regenerating with 1-indexed range. "
|
|
1148
|
+
f"Columns from EE: {list(df.columns)}"
|
|
1149
|
+
)
|
|
1150
|
+
# Use 1-indexed range to match client-side assignment
|
|
1151
|
+
df[plot_id_column] = range(1, len(df) + 1)
|
|
1152
|
+
|
|
1153
|
+
# Ensure plotId is integer type (EE may return as string)
|
|
1154
|
+
if plot_id_column in df.columns:
|
|
1155
|
+
df[plot_id_column] = pd.to_numeric(
|
|
1156
|
+
df[plot_id_column], errors="coerce"
|
|
1157
|
+
).astype("Int64")
|
|
1111
1158
|
|
|
1112
1159
|
# Ensure all column names are strings (fixes pandas .str accessor issues)
|
|
1113
1160
|
df.columns = df.columns.astype(str)
|
|
@@ -1231,12 +1278,15 @@ def whisp_stats_geojson_to_df_concurrent(
|
|
|
1231
1278
|
# Validate endpoint
|
|
1232
1279
|
validate_ee_endpoint("high-volume", raise_error=True)
|
|
1233
1280
|
|
|
1234
|
-
# Load GeoJSON with output suppressed
|
|
1235
|
-
gdf = _load_geojson_silently(input_geojson_filepath)
|
|
1281
|
+
# Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
|
|
1282
|
+
gdf = _load_and_prepare_geojson(
|
|
1283
|
+
input_geojson_filepath, external_id_column=external_id_column
|
|
1284
|
+
)
|
|
1236
1285
|
logger.info(f"Loaded {len(gdf):,} features")
|
|
1237
1286
|
|
|
1238
|
-
# Validate
|
|
1239
|
-
|
|
1287
|
+
# Validate external_id if provided (lightweight client-side check)
|
|
1288
|
+
# Note: external_id_column already renamed to 'external_id' during load
|
|
1289
|
+
if external_id_column and "external_id" not in gdf.columns:
|
|
1240
1290
|
# Exclude geometry column from available columns list
|
|
1241
1291
|
available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
|
|
1242
1292
|
raise ValueError(
|
|
@@ -1244,13 +1294,13 @@ def whisp_stats_geojson_to_df_concurrent(
|
|
|
1244
1294
|
f"Available columns: {available_cols}"
|
|
1245
1295
|
)
|
|
1246
1296
|
|
|
1247
|
-
# Check completeness of
|
|
1248
|
-
if external_id_column and
|
|
1249
|
-
null_count = gdf[
|
|
1297
|
+
# Check completeness of external_id (warn if nulls exist)
|
|
1298
|
+
if external_id_column and "external_id" in gdf.columns:
|
|
1299
|
+
null_count = gdf["external_id"].isna().sum()
|
|
1250
1300
|
if null_count > 0:
|
|
1251
1301
|
null_pct = (null_count / len(gdf)) * 100
|
|
1252
1302
|
logger.warning(
|
|
1253
|
-
f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
|
|
1303
|
+
f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
|
|
1254
1304
|
f"These features may have missing external IDs in output."
|
|
1255
1305
|
)
|
|
1256
1306
|
|
|
@@ -1263,13 +1313,21 @@ def whisp_stats_geojson_to_df_concurrent(
|
|
|
1263
1313
|
gdf[plot_id_column] = range(1, len(gdf) + 1)
|
|
1264
1314
|
|
|
1265
1315
|
# Strip unnecessary properties before sending to EE
|
|
1266
|
-
# Keep only: geometry, plot_id_column, and
|
|
1316
|
+
# Keep only: geometry, plot_id_column, and external_id
|
|
1267
1317
|
# This prevents duplication of GeoJSON properties in EE results
|
|
1268
1318
|
keep_cols = ["geometry", plot_id_column]
|
|
1269
|
-
if
|
|
1270
|
-
|
|
1319
|
+
if (
|
|
1320
|
+
external_id_column and "external_id" in gdf.columns
|
|
1321
|
+
): # Already renamed during load
|
|
1322
|
+
keep_cols.append("external_id")
|
|
1271
1323
|
|
|
1272
1324
|
gdf_for_ee = gdf[keep_cols].copy()
|
|
1325
|
+
|
|
1326
|
+
# CRITICAL: Convert external_id to string to prevent EE from confusing it with integer plotId
|
|
1327
|
+
if external_id_column and "external_id" in gdf_for_ee.columns:
|
|
1328
|
+
gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
|
|
1329
|
+
logger.debug(f"Converted external_id column to string type")
|
|
1330
|
+
|
|
1273
1331
|
logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
|
|
1274
1332
|
|
|
1275
1333
|
# Create image if not provided
|
|
@@ -1366,17 +1424,37 @@ def whisp_stats_geojson_to_df_concurrent(
|
|
|
1366
1424
|
|
|
1367
1425
|
# Merge server and client results
|
|
1368
1426
|
if plot_id_column not in df_server.columns:
|
|
1369
|
-
|
|
1427
|
+
logger.warning(
|
|
1428
|
+
f"Batch {batch_idx + 1} (concurrent merge): plotId DROPPED by EE. "
|
|
1429
|
+
f"Regenerating. Columns from EE: {list(df_server.columns)}"
|
|
1430
|
+
)
|
|
1431
|
+
df_server[plot_id_column] = pd.array(
|
|
1432
|
+
range(1, len(df_server) + 1), dtype="Int64"
|
|
1433
|
+
)
|
|
1434
|
+
else:
|
|
1435
|
+
df_server[plot_id_column] = pd.to_numeric(
|
|
1436
|
+
df_server[plot_id_column], errors="coerce"
|
|
1437
|
+
).astype("Int64")
|
|
1438
|
+
|
|
1439
|
+
# Ensure plotId is Int64 in client data too
|
|
1440
|
+
if plot_id_column in df_client.columns:
|
|
1441
|
+
df_client[plot_id_column] = pd.to_numeric(
|
|
1442
|
+
df_client[plot_id_column], errors="coerce"
|
|
1443
|
+
).astype("Int64")
|
|
1370
1444
|
|
|
1371
1445
|
# Keep all EE statistics from server (all columns with _sum and _median suffixes)
|
|
1372
1446
|
# These are the actual EE processing results
|
|
1373
1447
|
df_server_clean = df_server.copy()
|
|
1374
1448
|
|
|
1449
|
+
# Drop external_id from df_server if it exists (already in df_client)
|
|
1450
|
+
if "external_id" in df_server_clean.columns:
|
|
1451
|
+
df_server_clean = df_server_clean.drop(columns=["external_id"])
|
|
1452
|
+
|
|
1375
1453
|
# Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
|
|
1376
1454
|
# (formatted wrapper handles keep_external_columns parameter)
|
|
1377
1455
|
keep_external_columns = [plot_id_column]
|
|
1378
|
-
if external_id_column and
|
|
1379
|
-
keep_external_columns.append(
|
|
1456
|
+
if external_id_column and "external_id" in df_client.columns:
|
|
1457
|
+
keep_external_columns.append("external_id")
|
|
1380
1458
|
if "geometry" in df_client.columns:
|
|
1381
1459
|
keep_external_columns.append("geometry")
|
|
1382
1460
|
# Keep geometry type column (Geometry_type)
|
|
@@ -1522,7 +1600,10 @@ def whisp_stats_geojson_to_df_concurrent(
|
|
|
1522
1600
|
try:
|
|
1523
1601
|
batch_idx, df_server, df_client = future.result()
|
|
1524
1602
|
if plot_id_column not in df_server.columns:
|
|
1525
|
-
|
|
1603
|
+
# Use 1-indexed range to match client-side assignment
|
|
1604
|
+
df_server[plot_id_column] = range(
|
|
1605
|
+
1, len(df_server) + 1
|
|
1606
|
+
)
|
|
1526
1607
|
merged = df_server.merge(
|
|
1527
1608
|
df_client,
|
|
1528
1609
|
on=plot_id_column,
|
|
@@ -1566,31 +1647,21 @@ def whisp_stats_geojson_to_df_concurrent(
|
|
|
1566
1647
|
else:
|
|
1567
1648
|
return pd.DataFrame()
|
|
1568
1649
|
|
|
1569
|
-
# Clean up duplicate external_id columns created by merges
|
|
1570
|
-
#
|
|
1571
|
-
if external_id_column:
|
|
1572
|
-
# Find
|
|
1573
|
-
|
|
1650
|
+
# Clean up duplicate external_id columns created by merges (if any exist)
|
|
1651
|
+
# external_id was already renamed during load, so we just need to handle duplicates
|
|
1652
|
+
if external_id_column and "external_id" in combined.columns:
|
|
1653
|
+
# Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
|
|
1654
|
+
duplicate_variants = [
|
|
1574
1655
|
col
|
|
1575
1656
|
for col in combined.columns
|
|
1576
|
-
if
|
|
1657
|
+
if col != "external_id" and col.startswith("external_id_")
|
|
1577
1658
|
]
|
|
1578
1659
|
|
|
1579
|
-
if
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
external_id_column
|
|
1583
|
-
if external_id_column in combined.columns
|
|
1584
|
-
else external_id_variants[0]
|
|
1660
|
+
if duplicate_variants:
|
|
1661
|
+
logger.debug(
|
|
1662
|
+
f"Dropping duplicate external_id columns: {duplicate_variants}"
|
|
1585
1663
|
)
|
|
1586
|
-
|
|
1587
|
-
# Rename to standardized 'external_id'
|
|
1588
|
-
if base_col != "external_id":
|
|
1589
|
-
combined = combined.rename(columns={base_col: "external_id"})
|
|
1590
|
-
|
|
1591
|
-
# Drop all other variants
|
|
1592
|
-
cols_to_drop = [c for c in external_id_variants if c != base_col]
|
|
1593
|
-
combined = combined.drop(columns=cols_to_drop, errors="ignore")
|
|
1664
|
+
combined = combined.drop(columns=duplicate_variants, errors="ignore")
|
|
1594
1665
|
|
|
1595
1666
|
# plotId column is already present from batch processing
|
|
1596
1667
|
# Just ensure it's at position 0
|
|
@@ -1673,14 +1744,26 @@ def whisp_stats_geojson_to_df_concurrent(
|
|
|
1673
1744
|
try:
|
|
1674
1745
|
batch_idx, df_server, df_client = future.result()
|
|
1675
1746
|
if plot_id_column not in df_server.columns:
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1747
|
+
logger.warning(
|
|
1748
|
+
f"Batch {batch_idx + 1} (retry): plotId DROPPED by EE. "
|
|
1749
|
+
f"Regenerating. Columns from EE: {list(df_server.columns)}"
|
|
1750
|
+
)
|
|
1751
|
+
# Use 1-indexed range to match client-side assignment
|
|
1752
|
+
df_server[plot_id_column] = range(1, len(df_server) + 1)
|
|
1753
|
+
|
|
1754
|
+
# Ensure plotId is integer type (EE may return as string)
|
|
1755
|
+
if plot_id_column in df_server.columns:
|
|
1756
|
+
df_server[plot_id_column] = pd.to_numeric(
|
|
1757
|
+
df_server[plot_id_column], errors="coerce"
|
|
1758
|
+
).astype("Int64")
|
|
1759
|
+
if plot_id_column in df_client.columns:
|
|
1760
|
+
df_client[plot_id_column] = pd.to_numeric(
|
|
1761
|
+
df_client[plot_id_column], errors="coerce"
|
|
1762
|
+
).astype("Int64")
|
|
1763
|
+
|
|
1764
|
+
# Drop external_id from df_server if it exists (already in df_client)
|
|
1765
|
+
if "external_id" in df_server.columns:
|
|
1766
|
+
df_server = df_server.drop(columns=["external_id"])
|
|
1684
1767
|
|
|
1685
1768
|
merged = df_server.merge(
|
|
1686
1769
|
df_client,
|
|
@@ -1702,30 +1785,22 @@ def whisp_stats_geojson_to_df_concurrent(
|
|
|
1702
1785
|
# Ensure all column names are strings (fixes pandas .str accessor issues later)
|
|
1703
1786
|
combined.columns = combined.columns.astype(str)
|
|
1704
1787
|
|
|
1705
|
-
# Clean up duplicate external_id columns created by merges
|
|
1706
|
-
|
|
1707
|
-
|
|
1788
|
+
# Clean up duplicate external_id columns created by merges (if any exist)
|
|
1789
|
+
# external_id was already renamed during load, so we just need to handle duplicates
|
|
1790
|
+
if external_id_column and "external_id" in combined.columns:
|
|
1791
|
+
# Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
|
|
1792
|
+
duplicate_variants = [
|
|
1708
1793
|
col
|
|
1709
1794
|
for col in combined.columns
|
|
1710
|
-
if
|
|
1795
|
+
if col != "external_id" and col.startswith("external_id_")
|
|
1711
1796
|
]
|
|
1712
1797
|
|
|
1713
|
-
if
|
|
1714
|
-
|
|
1715
|
-
|
|
1716
|
-
|
|
1717
|
-
and external_id_variants
|
|
1718
|
-
):
|
|
1719
|
-
base_col = external_id_variants[0]
|
|
1720
|
-
combined = combined.rename(
|
|
1721
|
-
columns={base_col: "external_id"}
|
|
1722
|
-
)
|
|
1723
|
-
|
|
1724
|
-
cols_to_drop = [
|
|
1725
|
-
c for c in external_id_variants if c != base_col
|
|
1726
|
-
]
|
|
1798
|
+
if duplicate_variants:
|
|
1799
|
+
logger.debug(
|
|
1800
|
+
f"Dropping duplicate external_id columns: {duplicate_variants}"
|
|
1801
|
+
)
|
|
1727
1802
|
combined = combined.drop(
|
|
1728
|
-
columns=
|
|
1803
|
+
columns=duplicate_variants, errors="ignore"
|
|
1729
1804
|
)
|
|
1730
1805
|
|
|
1731
1806
|
# plotId column is already present, just ensure it's at position 0
|
|
@@ -1769,6 +1844,14 @@ def whisp_stats_geojson_to_df_concurrent(
|
|
|
1769
1844
|
)
|
|
1770
1845
|
raise retry_e
|
|
1771
1846
|
|
|
1847
|
+
# Ensure plot_id is present (should already be there from batch processing)
|
|
1848
|
+
if plot_id_column not in formatted.columns:
|
|
1849
|
+
logger.warning(f"{plot_id_column} column missing, regenerating...")
|
|
1850
|
+
formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
|
|
1851
|
+
|
|
1852
|
+
# Sort by plot_id to ensure consistent output order
|
|
1853
|
+
formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
|
|
1854
|
+
|
|
1772
1855
|
logger.info(f"Processing complete: {len(formatted):,} features")
|
|
1773
1856
|
return formatted
|
|
1774
1857
|
else:
|
|
@@ -1843,12 +1926,15 @@ def whisp_stats_geojson_to_df_sequential(
|
|
|
1843
1926
|
# Validate endpoint
|
|
1844
1927
|
validate_ee_endpoint("standard", raise_error=True)
|
|
1845
1928
|
|
|
1846
|
-
# Load GeoJSON with output suppressed
|
|
1847
|
-
gdf = _load_geojson_silently(input_geojson_filepath)
|
|
1929
|
+
# Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
|
|
1930
|
+
gdf = _load_and_prepare_geojson(
|
|
1931
|
+
input_geojson_filepath, external_id_column=external_id_column
|
|
1932
|
+
)
|
|
1848
1933
|
logger.info(f"Loaded {len(gdf):,} features")
|
|
1849
1934
|
|
|
1850
|
-
# Validate
|
|
1851
|
-
|
|
1935
|
+
# Validate external_id if provided (lightweight client-side check)
|
|
1936
|
+
# Note: external_id_column already renamed to 'external_id' during load
|
|
1937
|
+
if external_id_column and "external_id" not in gdf.columns:
|
|
1852
1938
|
# Exclude geometry column from available columns list
|
|
1853
1939
|
available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
|
|
1854
1940
|
raise ValueError(
|
|
@@ -1856,13 +1942,13 @@ def whisp_stats_geojson_to_df_sequential(
|
|
|
1856
1942
|
f"Available columns: {available_cols}"
|
|
1857
1943
|
)
|
|
1858
1944
|
|
|
1859
|
-
# Check completeness of
|
|
1860
|
-
if external_id_column and
|
|
1861
|
-
null_count = gdf[
|
|
1945
|
+
# Check completeness of external_id (warn if nulls exist)
|
|
1946
|
+
if external_id_column and "external_id" in gdf.columns:
|
|
1947
|
+
null_count = gdf["external_id"].isna().sum()
|
|
1862
1948
|
if null_count > 0:
|
|
1863
1949
|
null_pct = (null_count / len(gdf)) * 100
|
|
1864
1950
|
logger.warning(
|
|
1865
|
-
f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
|
|
1951
|
+
f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
|
|
1866
1952
|
f"These features may have missing external IDs in output."
|
|
1867
1953
|
)
|
|
1868
1954
|
|
|
@@ -1874,18 +1960,22 @@ def whisp_stats_geojson_to_df_sequential(
|
|
|
1874
1960
|
# Add stable plotIds for merging (starting from 1, not 0)
|
|
1875
1961
|
gdf[plot_id_column] = range(1, len(gdf) + 1)
|
|
1876
1962
|
|
|
1877
|
-
# Add stable row IDs
|
|
1878
|
-
row_id_col = "__row_id__"
|
|
1879
|
-
gdf[row_id_col] = range(len(gdf))
|
|
1880
|
-
|
|
1881
1963
|
# Strip unnecessary properties before sending to EE
|
|
1882
|
-
# Keep only: geometry, plot_id_column, and
|
|
1964
|
+
# Keep only: geometry, plot_id_column, and external_id
|
|
1883
1965
|
# This prevents duplication of GeoJSON properties in EE results
|
|
1884
|
-
keep_cols = ["geometry", plot_id_column
|
|
1885
|
-
if
|
|
1886
|
-
|
|
1966
|
+
keep_cols = ["geometry", plot_id_column]
|
|
1967
|
+
if (
|
|
1968
|
+
external_id_column and "external_id" in gdf.columns
|
|
1969
|
+
): # Already renamed during load
|
|
1970
|
+
keep_cols.append("external_id")
|
|
1887
1971
|
|
|
1888
1972
|
gdf_for_ee = gdf[keep_cols].copy()
|
|
1973
|
+
|
|
1974
|
+
# CRITICAL: Convert external_id to string to prevent EE from confusing it with integer plotId
|
|
1975
|
+
if external_id_column and "external_id" in gdf_for_ee.columns:
|
|
1976
|
+
gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
|
|
1977
|
+
logger.debug(f"Converted external_id column to string type")
|
|
1978
|
+
|
|
1889
1979
|
logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
|
|
1890
1980
|
|
|
1891
1981
|
# Create image if not provided
|
|
@@ -1907,10 +1997,19 @@ def whisp_stats_geojson_to_df_sequential(
|
|
|
1907
1997
|
national_codes=national_codes, validate_bands=True
|
|
1908
1998
|
)
|
|
1909
1999
|
|
|
2000
|
+
# Drop external_id before sending to EE to enable caching
|
|
2001
|
+
# (external_id is preserved separately in gdf for client-side merging)
|
|
2002
|
+
gdf_for_ee_clean = gdf_for_ee.copy()
|
|
2003
|
+
if "external_id" in gdf_for_ee_clean.columns:
|
|
2004
|
+
gdf_for_ee_clean = gdf_for_ee_clean.drop(columns=["external_id"])
|
|
2005
|
+
logger.debug("Dropped external_id from data sent to EE (enables caching)")
|
|
2006
|
+
|
|
1910
2007
|
# Convert to EE (suppress print statements from convert_geojson_to_ee)
|
|
1911
2008
|
logger.debug("Converting to EE FeatureCollection...")
|
|
1912
2009
|
with redirect_stdout(io.StringIO()):
|
|
1913
|
-
fc = convert_geojson_to_ee(
|
|
2010
|
+
fc = convert_geojson_to_ee(
|
|
2011
|
+
gdf_for_ee_clean, enforce_wgs84=True, strip_z_coords=True
|
|
2012
|
+
)
|
|
1914
2013
|
|
|
1915
2014
|
# Create reducer
|
|
1916
2015
|
reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
|
|
@@ -1950,11 +2049,13 @@ def whisp_stats_geojson_to_df_sequential(
|
|
|
1950
2049
|
else:
|
|
1951
2050
|
raise
|
|
1952
2051
|
|
|
1953
|
-
logger.
|
|
2052
|
+
logger.info("Server-side processing complete")
|
|
1954
2053
|
|
|
1955
|
-
#
|
|
1956
|
-
if
|
|
1957
|
-
df_server[
|
|
2054
|
+
# Ensure plotId is Int64 type for fast merges
|
|
2055
|
+
if plot_id_column in df_server.columns:
|
|
2056
|
+
df_server[plot_id_column] = pd.to_numeric(
|
|
2057
|
+
df_server[plot_id_column], errors="coerce"
|
|
2058
|
+
).astype("Int64")
|
|
1958
2059
|
|
|
1959
2060
|
# Add client-side metadata if requested
|
|
1960
2061
|
if add_metadata_client_side:
|
|
@@ -1965,21 +2066,23 @@ def whisp_stats_geojson_to_df_sequential(
|
|
|
1965
2066
|
return_attributes_only=True,
|
|
1966
2067
|
)
|
|
1967
2068
|
|
|
1968
|
-
#
|
|
1969
|
-
if
|
|
1970
|
-
df_client =
|
|
2069
|
+
# Ensure plotId is Int64 type for fast merges
|
|
2070
|
+
if plot_id_column in df_client.columns:
|
|
2071
|
+
df_client[plot_id_column] = pd.to_numeric(
|
|
2072
|
+
df_client[plot_id_column], errors="coerce"
|
|
2073
|
+
).astype("Int64")
|
|
2074
|
+
|
|
2075
|
+
# Drop external_id from df_server if it exists (keep from df_client - more reliable)
|
|
2076
|
+
if "external_id" in df_server.columns:
|
|
2077
|
+
df_server = df_server.drop(columns=["external_id"])
|
|
1971
2078
|
|
|
1972
|
-
# Merge
|
|
2079
|
+
# Merge on plotId (same strategy as concurrent mode)
|
|
1973
2080
|
result = df_server.merge(
|
|
1974
|
-
df_client, on=
|
|
2081
|
+
df_client, on=plot_id_column, how="left", suffixes=("", "_client")
|
|
1975
2082
|
)
|
|
1976
2083
|
else:
|
|
1977
2084
|
result = df_server
|
|
1978
2085
|
|
|
1979
|
-
# Remove internal __row_id__ column if present
|
|
1980
|
-
if row_id_col in result.columns:
|
|
1981
|
-
result = result.drop(columns=[row_id_col])
|
|
1982
|
-
|
|
1983
2086
|
# Format the output
|
|
1984
2087
|
# Add admin context (Country, ProducerCountry, Admin_Level_1) from admin_code
|
|
1985
2088
|
# MUST be done BEFORE formatting (which removes _median columns)
|
|
@@ -2004,27 +2107,14 @@ def whisp_stats_geojson_to_df_sequential(
|
|
|
2004
2107
|
convert_water_flag=True,
|
|
2005
2108
|
)
|
|
2006
2109
|
|
|
2110
|
+
# Ensure plot_id exists and sort by it
|
|
2111
|
+
if plot_id_column not in formatted.columns:
|
|
2112
|
+
formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
|
|
2113
|
+
formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
|
|
2114
|
+
|
|
2007
2115
|
logger.info(f"Processing complete: {len(formatted):,} features")
|
|
2008
2116
|
|
|
2009
|
-
#
|
|
2010
|
-
if external_id_column:
|
|
2011
|
-
variants = [
|
|
2012
|
-
col
|
|
2013
|
-
for col in formatted.columns
|
|
2014
|
-
if external_id_column.lower() in col.lower()
|
|
2015
|
-
]
|
|
2016
|
-
if variants:
|
|
2017
|
-
base_col = (
|
|
2018
|
-
external_id_column
|
|
2019
|
-
if external_id_column in formatted.columns
|
|
2020
|
-
else variants[0]
|
|
2021
|
-
)
|
|
2022
|
-
if base_col != "external_id":
|
|
2023
|
-
formatted = formatted.rename(columns={base_col: "external_id"})
|
|
2024
|
-
# Drop other variants
|
|
2025
|
-
formatted = formatted.drop(
|
|
2026
|
-
columns=[c for c in variants if c != base_col], errors="ignore"
|
|
2027
|
-
)
|
|
2117
|
+
# external_id_column already renamed to 'external_id' during load - no action needed here
|
|
2028
2118
|
|
|
2029
2119
|
return formatted
|
|
2030
2120
|
|
|
@@ -2130,7 +2220,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
|
|
|
2130
2220
|
gdf_original_geoms = None
|
|
2131
2221
|
if geometry_audit_trail:
|
|
2132
2222
|
logger.debug("Pre-loading GeoJSON for geometry audit trail...")
|
|
2133
|
-
gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
|
|
2223
|
+
gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
|
|
2134
2224
|
|
|
2135
2225
|
# Step 1: Get raw stats
|
|
2136
2226
|
logger.debug("Step 1/2: Extracting statistics (concurrent)...")
|
|
@@ -2199,7 +2289,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
|
|
|
2199
2289
|
# Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
|
|
2200
2290
|
if gdf_original_geoms is None:
|
|
2201
2291
|
logger.warning("Original geometries not pre-loaded, loading now...")
|
|
2202
|
-
gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
|
|
2292
|
+
gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
|
|
2203
2293
|
|
|
2204
2294
|
# Use plotId from df_validated to maintain mapping
|
|
2205
2295
|
df_original_geom = pd.DataFrame(
|
|
@@ -2331,7 +2421,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
|
|
|
2331
2421
|
gdf_original_geoms = None
|
|
2332
2422
|
if geometry_audit_trail:
|
|
2333
2423
|
logger.debug("Pre-loading GeoJSON for geometry audit trail...")
|
|
2334
|
-
gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
|
|
2424
|
+
gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
|
|
2335
2425
|
|
|
2336
2426
|
# Step 1: Get raw stats
|
|
2337
2427
|
logger.debug("Step 1/2: Extracting statistics (sequential)...")
|
|
@@ -2395,7 +2485,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
|
|
|
2395
2485
|
# Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
|
|
2396
2486
|
if gdf_original_geoms is None:
|
|
2397
2487
|
logger.warning("Original geometries not pre-loaded, loading now...")
|
|
2398
|
-
gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
|
|
2488
|
+
gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
|
|
2399
2489
|
|
|
2400
2490
|
# Use plotId from df_validated to maintain mapping
|
|
2401
2491
|
df_original_geom = pd.DataFrame(
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
name,order,ISO2_code,theme,theme_timber,use_for_risk,use_for_risk_timber,exclude_from_output,col_type,is_nullable,is_required,corresponding_variable
|
|
2
|
-
plotId,-10,,context_and_metadata,context_and_metadata,NA,NA,0,
|
|
2
|
+
plotId,-10,,context_and_metadata,context_and_metadata,NA,NA,0,int64,1,0,plot_id_column
|
|
3
3
|
external_id,-9,,context_and_metadata,context_and_metadata,NA,NA,0,string,1,0,external_id_column
|
|
4
4
|
Area,-8,,context_and_metadata,context_and_metadata,NA,NA,0,float32,1,1,geometry_area_column
|
|
5
5
|
Geometry_type,-7,,context_and_metadata,context_and_metadata,NA,NA,0,string,1,1,geometry_type_column
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/parameters/__init__.py
RENAMED
|
File without changes
|
{openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/parameters/config_runtime.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|