openforis-whisp 3.0.0a4__tar.gz → 3.0.0a6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/PKG-INFO +1 -1
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/pyproject.toml +1 -1
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/advanced_stats.py +217 -133
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/LICENSE +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/README.md +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/__init__.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/data_checks.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/data_conversion.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/datasets.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/logger.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/parameters/__init__.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/parameters/config_runtime.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/parameters/lookup_context_and_metadata.csv +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/parameters/lookup_gaul1_admin.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/parameters/lookup_gee_datasets.csv +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/pd_schemas.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/reformat.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/risk.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/stats.py +0 -0
- {openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/utils.py +0 -0
{openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: openforis-whisp
-Version: 3.0.0a4
+Version: 3.0.0a6
 Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
 License: MIT
 Keywords: whisp,geospatial,data-processing
```
{openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/pyproject.toml

```diff
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "openforis-whisp"
-version = "3.0.0a4"
+version = "3.0.0a6"
 description = "Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations."
 repository = "https://github.com/forestdatapartnership/whisp"
 authors = ["Andy Arnell <andrew.arnell@fao.org>"]
```
{openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a6}/src/openforis_whisp/advanced_stats.py

```diff
@@ -181,8 +181,25 @@ def _suppress_verbose_output(max_concurrent: int = None):
     reformat_logger.setLevel(logging.ERROR)
 
 
-def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
-
+def _load_and_prepare_geojson(
+    filepath: str, external_id_column: Optional[str] = None
+) -> gpd.GeoDataFrame:
+    """Load GeoJSON file and prepare for processing.
+
+    Suppresses logging output and optionally renames external_id column.
+
+    Parameters
+    ----------
+    filepath : str
+        Path to GeoJSON file
+    external_id_column : str, optional
+        If provided, rename this column to 'external_id' immediately after loading
+
+    Returns
+    -------
+    gpd.GeoDataFrame
+        Loaded GeoDataFrame with external_id renamed if specified
+    """
     fiona_logger = logging.getLogger("fiona")
     pyogrio_logger = logging.getLogger("pyogrio._io")
     old_fiona_level = fiona_logger.level
```
```diff
@@ -193,6 +210,16 @@ def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
     try:
         with redirect_stdout(io.StringIO()):
             gdf = gpd.read_file(filepath)
+
+        # Rename external_id column early and convert to string
+        if external_id_column and external_id_column in gdf.columns:
+            if external_id_column != "external_id":
+                gdf = gdf.rename(
+                    columns={external_id_column: "external_id"}
+                )  # hard coding here to avoid confusion later
+            # Convert to string to ensure consistent type throughout pipeline
+            gdf["external_id"] = gdf["external_id"].astype(str)
+
         return gdf
     finally:
         fiona_logger.setLevel(old_fiona_level)
```
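The load-and-rename pattern above normalizes the user-supplied ID column once, at load time, so every later merge sees one canonical name and one dtype. A minimal standalone sketch of the same idea (the `farm_ref` column name is a made-up example, not from the package):

```python
from typing import Optional

import geopandas as gpd


def rename_external_id(
    gdf: gpd.GeoDataFrame, external_id_column: Optional[str]
) -> gpd.GeoDataFrame:
    # Normalize the user-supplied ID column to 'external_id' and cast to str
    # once, up front, mirroring _load_and_prepare_geojson in the diff.
    if external_id_column and external_id_column in gdf.columns:
        if external_id_column != "external_id":
            gdf = gdf.rename(columns={external_id_column: "external_id"})
        gdf["external_id"] = gdf["external_id"].astype(str)
    return gdf


# Usage: rename_external_id(gpd.read_file("plots.geojson"), "farm_ref")
```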
```diff
@@ -780,19 +807,17 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
     if not check_ee_endpoint(endpoint_type):
         if endpoint_type == "high-volume":
             msg = (
-                "Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
+                "# Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
                 "ee.Reset()\n"
-                "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
-                "
-                "ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
+                "ee.Initialize(project=gee_project_name, opt_url='https://earthengine-highvolume.googleapis.com')\n"
+                "# where gee_project_name is your GEE project (necessary in Colab)"
             )
         else:  # standard endpoint
             msg = (
                 "Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
                 "ee.Reset()\n"
-                "ee.Initialize()\n"
-                "
-                "ee.Initialize(project='your_cloud_project_name')"
+                "ee.Initialize(project=gee_project_name)\n"
+                "# where gee_project_name is your GEE project (necessary in Colab)"
             )
 
     if raise_error:
```
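The two messages are instructions for the user to paste; a hedged sketch of the switches they describe, assuming an authenticated earthengine-api session (`your-project` is a placeholder):

```python
import ee


def use_high_volume_endpoint(project: str) -> None:
    # Concurrent mode: reset the session, then reinitialize against the
    # high-volume endpoint (a project name is necessary in Colab).
    ee.Reset()
    ee.Initialize(
        project=project,
        opt_url="https://earthengine-highvolume.googleapis.com",
    )


def use_standard_endpoint(project: str) -> None:
    # Sequential mode: reinitialize against the default (standard) endpoint.
    ee.Reset()
    ee.Initialize(project=project)


# use_high_volume_endpoint("your-project")
```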
```diff
@@ -865,13 +890,13 @@ def extract_centroid_and_geomtype_client(
     if plot_id_column in gdf.columns:
         cols.append(plot_id_column)
 
-    # Include
+    # Include external_id if it exists (already renamed during load)
     if (
         external_id_column
-        and
-        and
+        and "external_id" in gdf.columns
+        and "external_id" not in cols
     ):
-        cols.append(
+        cols.append("external_id")
 
     # Always include metadata columns (centroid, geometry type)
     cols.extend([x_col, y_col, type_col])
```
```diff
@@ -965,6 +990,10 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
 
     Preserves the __row_id__ column if present so it can be retrieved after processing.
 
+    IMPORTANT: Drops external_id column before sending to EE to enable query caching.
+    external_id is user metadata that's not needed for EE computation. Including it
+    breaks EE's caching mechanism since each unique external_id creates a different query.
+
     Parameters
     ----------
     batch_gdf : gpd.GeoDataFrame
```
```diff
@@ -973,15 +1002,21 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
     Returns
     -------
     ee.FeatureCollection
-        EE FeatureCollection with __row_id__ as a feature property
+        EE FeatureCollection with __row_id__ as a feature property (no external_id)
     """
+    # Drop external_id before sending to EE to enable caching
+    # (external_id is preserved separately on client side for merging)
+    batch_for_ee = batch_gdf.copy()
+    if "external_id" in batch_for_ee.columns:
+        batch_for_ee = batch_for_ee.drop(columns=["external_id"])
+
     # Pass GeoDataFrame directly to preserve CRS metadata
     # convert_geojson_to_ee will handle:
     # - CRS detection and conversion to WGS84 if needed
     # - Data type sanitization (datetime, object columns)
     # - Geometry validation and Z-coordinate stripping
 
-    fc = convert_geojson_to_ee(
+    fc = convert_geojson_to_ee(batch_for_ee, enforce_wgs84=True, strip_z_coords=True)
 
     # If __row_id__ is in the original GeoDataFrame, it will be preserved
     # as a feature property in the GeoJSON and thus in the EE FeatureCollection
```
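The caching rationale is easy to demonstrate client-side: the EE request is derived from the serialized features, so a per-feature metadata property makes otherwise identical geometry batches serialize differently. A toy illustration (not the library's code):

```python
import json

import geopandas as gpd
from shapely.geometry import Point

gdf = gpd.GeoDataFrame(
    {"external_id": ["farm-001"], "geometry": [Point(0, 0)]}, crs="EPSG:4326"
)

# Same geometry, with and without the metadata property:
with_id = json.loads(gdf.to_json())
without_id = json.loads(gdf.drop(columns=["external_id"]).to_json())

# Different payloads mean different queries, so no cache hit.
assert with_id != without_id
```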
```diff
@@ -1107,7 +1142,17 @@ def process_ee_batch(
     # Ensure plot_id_column is present for merging
     # It should come from the feature properties (added before EE processing)
     if plot_id_column not in df.columns:
-
+        logger.warning(
+            f"Batch {batch_idx + 1}: plotId column DROPPED by EE. "
+            f"Regenerating with 1-indexed range. "
+            f"Columns from EE: {list(df.columns)}"
+        )
+        # Use 1-indexed range to match client-side assignment
+        df[plot_id_column] = [str(i) for i in range(1, len(df) + 1)]
+
+    # Ensure plotId is string type (consistent with creation)
+    if plot_id_column in df.columns:
+        df[plot_id_column] = df[plot_id_column].astype(str)
 
     # Ensure all column names are strings (fixes pandas .str accessor issues)
     df.columns = df.columns.astype(str)
```
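The string casts matter because pandas refuses to merge an int64 key against a str/object key. A minimal sketch of the discipline the diff enforces (column names here are illustrative):

```python
import pandas as pd

df_server = pd.DataFrame({"plotId": [1, 2], "area_sum": [0.5, 1.2]})
df_client = pd.DataFrame({"plotId": ["1", "2"], "Centroid_lon": [10.0, 11.0]})

# Without this cast, df_server.merge(df_client, on="plotId") raises
# "You are trying to merge on int64 and object columns".
df_server["plotId"] = df_server["plotId"].astype(str)

merged = df_server.merge(df_client, on="plotId", how="left")
assert merged["Centroid_lon"].notna().all()
```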
```diff
@@ -1231,12 +1276,15 @@ def whisp_stats_geojson_to_df_concurrent(
     # Validate endpoint
     validate_ee_endpoint("high-volume", raise_error=True)
 
-    # Load GeoJSON with output suppressed
-    gdf =
+    # Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
+    gdf = _load_and_prepare_geojson(
+        input_geojson_filepath, external_id_column=external_id_column
+    )
     logger.info(f"Loaded {len(gdf):,} features")
 
-    # Validate
-
+    # Validate external_id if provided (lightweight client-side check)
+    # Note: external_id_column already renamed to 'external_id' during load
+    if external_id_column and "external_id" not in gdf.columns:
         # Exclude geometry column from available columns list
         available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
         raise ValueError(
```
```diff
@@ -1244,13 +1292,13 @@ def whisp_stats_geojson_to_df_concurrent(
             f"Available columns: {available_cols}"
         )
 
-    # Check completeness of
-    if external_id_column and
-        null_count = gdf[
+    # Check completeness of external_id (warn if nulls exist)
+    if external_id_column and "external_id" in gdf.columns:
+        null_count = gdf["external_id"].isna().sum()
         if null_count > 0:
             null_pct = (null_count / len(gdf)) * 100
             logger.warning(
-                f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
                 f"These features may have missing external IDs in output."
             )
 
```
```diff
@@ -1260,16 +1308,24 @@ def whisp_stats_geojson_to_df_concurrent(
     )
 
     # Add stable plotIds for merging (starting from 1, not 0)
-    gdf[plot_id_column] = range(1, len(gdf) + 1)
+    gdf[plot_id_column] = [str(i) for i in range(1, len(gdf) + 1)]
 
     # Strip unnecessary properties before sending to EE
-    # Keep only: geometry, plot_id_column, and
+    # Keep only: geometry, plot_id_column, and external_id
     # This prevents duplication of GeoJSON properties in EE results
     keep_cols = ["geometry", plot_id_column]
-    if
-
+    if (
+        external_id_column and "external_id" in gdf.columns
+    ):  # Already renamed during load
+        keep_cols.append("external_id")
 
     gdf_for_ee = gdf[keep_cols].copy()
+
+    # CRITICAL: Convert external_id to string (both plotId and external_id are now strings)
+    if external_id_column and "external_id" in gdf_for_ee.columns:
+        gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
+        logger.debug(f"Converted external_id column to string type")
+
     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
 
     # Create image if not provided
```
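Stripping to `keep_cols` keeps the EE payload minimal and avoids echoing every GeoJSON property back in the results. A small sketch of the step (the `owner` property is a made-up stand-in for arbitrary user columns):

```python
import geopandas as gpd
from shapely.geometry import Point

gdf = gpd.GeoDataFrame(
    {
        "plotId": ["1", "2"],
        "external_id": ["a", "b"],
        "owner": ["x", "y"],  # arbitrary user property, never sent to EE
        "geometry": [Point(0, 0), Point(1, 1)],
    },
    crs="EPSG:4326",
)

keep_cols = ["geometry", "plotId", "external_id"]
gdf_for_ee = gdf[keep_cols].copy()
assert "owner" not in gdf_for_ee.columns
```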
```diff
@@ -1366,17 +1422,37 @@ def whisp_stats_geojson_to_df_concurrent(
 
                 # Merge server and client results
                 if plot_id_column not in df_server.columns:
-
+                    logger.warning(
+                        f"Batch {batch_idx + 1} (concurrent merge): plotId DROPPED by EE. "
+                        f"Regenerating. Columns from EE: {list(df_server.columns)}"
+                    )
+                    df_server[plot_id_column] = pd.array(
+                        range(1, len(df_server) + 1), dtype="Int64"
+                    )
+                else:
+                    df_server[plot_id_column] = df_server[plot_id_column].astype(
+                        str
+                    )
+
+                # Ensure plotId is string in client data too
+                if plot_id_column in df_client.columns:
+                    df_client[plot_id_column] = df_client[plot_id_column].astype(
+                        str
+                    )
 
                 # Keep all EE statistics from server (all columns with _sum and _median suffixes)
                 # These are the actual EE processing results
                 df_server_clean = df_server.copy()
 
+                # Drop external_id from df_server if it exists (already in df_client)
+                if "external_id" in df_server_clean.columns:
+                    df_server_clean = df_server_clean.drop(columns=["external_id"])
+
                 # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
                 # (formatted wrapper handles keep_external_columns parameter)
                 keep_external_columns = [plot_id_column]
-                if external_id_column and
-                    keep_external_columns.append(
+                if external_id_column and "external_id" in df_client.columns:
+                    keep_external_columns.append("external_id")
                 if "geometry" in df_client.columns:
                     keep_external_columns.append("geometry")
                 # Keep geometry type column (Geometry_type)
```
```diff
@@ -1522,7 +1598,10 @@ def whisp_stats_geojson_to_df_concurrent(
                 try:
                     batch_idx, df_server, df_client = future.result()
                     if plot_id_column not in df_server.columns:
-
+                        # Use 1-indexed range to match client-side assignment
+                        df_server[plot_id_column] = range(
+                            1, len(df_server) + 1
+                        )
                     merged = df_server.merge(
                         df_client,
                         on=plot_id_column,
```
|
|
|
1566
1645
|
else:
|
|
1567
1646
|
return pd.DataFrame()
|
|
1568
1647
|
|
|
1569
|
-
# Clean up duplicate external_id columns created by merges
|
|
1570
|
-
#
|
|
1571
|
-
if external_id_column:
|
|
1572
|
-
# Find
|
|
1573
|
-
|
|
1648
|
+
# Clean up duplicate external_id columns created by merges (if any exist)
|
|
1649
|
+
# external_id was already renamed during load, so we just need to handle duplicates
|
|
1650
|
+
if external_id_column and "external_id" in combined.columns:
|
|
1651
|
+
# Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
|
|
1652
|
+
duplicate_variants = [
|
|
1574
1653
|
col
|
|
1575
1654
|
for col in combined.columns
|
|
1576
|
-
if
|
|
1655
|
+
if col != "external_id" and col.startswith("external_id_")
|
|
1577
1656
|
]
|
|
1578
1657
|
|
|
1579
|
-
if
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
external_id_column
|
|
1583
|
-
if external_id_column in combined.columns
|
|
1584
|
-
else external_id_variants[0]
|
|
1658
|
+
if duplicate_variants:
|
|
1659
|
+
logger.debug(
|
|
1660
|
+
f"Dropping duplicate external_id columns: {duplicate_variants}"
|
|
1585
1661
|
)
|
|
1586
|
-
|
|
1587
|
-
# Rename to standardized 'external_id'
|
|
1588
|
-
if base_col != "external_id":
|
|
1589
|
-
combined = combined.rename(columns={base_col: "external_id"})
|
|
1590
|
-
|
|
1591
|
-
# Drop all other variants
|
|
1592
|
-
cols_to_drop = [c for c in external_id_variants if c != base_col]
|
|
1593
|
-
combined = combined.drop(columns=cols_to_drop, errors="ignore")
|
|
1662
|
+
combined = combined.drop(columns=duplicate_variants, errors="ignore")
|
|
1594
1663
|
|
|
1595
1664
|
# plotId column is already present from batch processing
|
|
1596
1665
|
# Just ensure it's at position 0
|
|
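The cleanup replaces the old rename-and-guess logic with a simple rule: the canonical column already exists, so anything matching `external_id_*` is a merge artifact. A toy version:

```python
import pandas as pd

combined = pd.DataFrame(
    {"external_id": ["a"], "external_id_x": ["a"], "external_id_client": ["a"]}
)

# Drop pandas merge artifacts (external_id_x / external_id_y / *_client),
# keeping only the canonical column established at load time.
duplicate_variants = [
    c
    for c in combined.columns
    if c != "external_id" and c.startswith("external_id_")
]
combined = combined.drop(columns=duplicate_variants, errors="ignore")
assert list(combined.columns) == ["external_id"]
```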
```diff
@@ -1673,14 +1742,26 @@ def whisp_stats_geojson_to_df_concurrent(
                 try:
                     batch_idx, df_server, df_client = future.result()
                     if plot_id_column not in df_server.columns:
-
-
-
-
-
-
-
-
+                        logger.warning(
+                            f"Batch {batch_idx + 1} (retry): plotId DROPPED by EE. "
+                            f"Regenerating. Columns from EE: {list(df_server.columns)}"
+                        )
+                        # Use 1-indexed range to match client-side assignment
+                        df_server[plot_id_column] = range(1, len(df_server) + 1)
+
+                    # Ensure plotId is string type (consistent with creation)
+                    if plot_id_column in df_server.columns:
+                        df_server[plot_id_column] = df_server[
+                            plot_id_column
+                        ].astype(str)
+                    if plot_id_column in df_client.columns:
+                        df_client[plot_id_column] = df_client[
+                            plot_id_column
+                        ].astype(str)
+
+                    # Drop external_id from df_server if it exists (already in df_client)
+                    if "external_id" in df_server.columns:
+                        df_server = df_server.drop(columns=["external_id"])
 
                     merged = df_server.merge(
                         df_client,
```
```diff
@@ -1702,30 +1783,22 @@ def whisp_stats_geojson_to_df_concurrent(
             # Ensure all column names are strings (fixes pandas .str accessor issues later)
             combined.columns = combined.columns.astype(str)
 
-            # Clean up duplicate external_id columns created by merges
-
-
+            # Clean up duplicate external_id columns created by merges (if any exist)
+            # external_id was already renamed during load, so we just need to handle duplicates
+            if external_id_column and "external_id" in combined.columns:
+                # Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
+                duplicate_variants = [
                     col
                     for col in combined.columns
-                    if
+                    if col != "external_id" and col.startswith("external_id_")
                 ]
 
-                if
-
-
-
-                    and external_id_variants
-                ):
-                    base_col = external_id_variants[0]
-                    combined = combined.rename(
-                        columns={base_col: "external_id"}
-                    )
-
-                    cols_to_drop = [
-                        c for c in external_id_variants if c != base_col
-                    ]
+                if duplicate_variants:
+                    logger.debug(
+                        f"Dropping duplicate external_id columns: {duplicate_variants}"
+                    )
                     combined = combined.drop(
-                        columns=
+                        columns=duplicate_variants, errors="ignore"
                     )
 
             # plotId column is already present, just ensure it's at position 0
```
```diff
@@ -1769,6 +1842,14 @@ def whisp_stats_geojson_to_df_concurrent(
             )
             raise retry_e
 
+        # Ensure plot_id is present (should already be there from batch processing)
+        if plot_id_column not in formatted.columns:
+            logger.warning(f"{plot_id_column} column missing, regenerating...")
+            formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
+
+        # Sort by plot_id to ensure consistent output order
+        formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+
         logger.info(f"Processing complete: {len(formatted):,} features")
         return formatted
     else:
```
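One caveat worth noting about the new sort: plotId is now a string, and string sorting is lexicographic ("10" sorts before "2"). A toy illustration of a numeric sort key that preserves insertion order; whether this matters here depends on how the formatted output is consumed:

```python
import pandas as pd

formatted = pd.DataFrame({"plotId": ["10", "2", "1"]})

# Plain sort_values(by="plotId") would give ['1', '10', '2'];
# a numeric key restores the intended 1, 2, ..., 10 order.
formatted = formatted.sort_values(
    by="plotId", key=lambda s: s.astype(int)
).reset_index(drop=True)
assert formatted["plotId"].tolist() == ["1", "2", "10"]
```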
```diff
@@ -1843,12 +1924,15 @@ def whisp_stats_geojson_to_df_sequential(
     # Validate endpoint
     validate_ee_endpoint("standard", raise_error=True)
 
-    # Load GeoJSON with output suppressed
-    gdf =
+    # Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
+    gdf = _load_and_prepare_geojson(
+        input_geojson_filepath, external_id_column=external_id_column
+    )
     logger.info(f"Loaded {len(gdf):,} features")
 
-    # Validate
-
+    # Validate external_id if provided (lightweight client-side check)
+    # Note: external_id_column already renamed to 'external_id' during load
+    if external_id_column and "external_id" not in gdf.columns:
         # Exclude geometry column from available columns list
         available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
         raise ValueError(
```
```diff
@@ -1856,13 +1940,13 @@ def whisp_stats_geojson_to_df_sequential(
             f"Available columns: {available_cols}"
         )
 
-    # Check completeness of
-    if external_id_column and
-        null_count = gdf[
+    # Check completeness of external_id (warn if nulls exist)
+    if external_id_column and "external_id" in gdf.columns:
+        null_count = gdf["external_id"].isna().sum()
         if null_count > 0:
             null_pct = (null_count / len(gdf)) * 100
             logger.warning(
-                f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
                 f"These features may have missing external IDs in output."
            )
 
```
```diff
@@ -1872,20 +1956,24 @@ def whisp_stats_geojson_to_df_sequential(
     )
 
     # Add stable plotIds for merging (starting from 1, not 0)
-    gdf[plot_id_column] = range(1, len(gdf) + 1)
-
-    # Add stable row IDs
-    row_id_col = "__row_id__"
-    gdf[row_id_col] = range(len(gdf))
+    gdf[plot_id_column] = [str(i) for i in range(1, len(gdf) + 1)]
 
     # Strip unnecessary properties before sending to EE
-    # Keep only: geometry, plot_id_column, and
+    # Keep only: geometry, plot_id_column, and external_id
     # This prevents duplication of GeoJSON properties in EE results
-    keep_cols = ["geometry", plot_id_column
-    if
-
+    keep_cols = ["geometry", plot_id_column]
+    if (
+        external_id_column and "external_id" in gdf.columns
+    ):  # Already renamed during load
+        keep_cols.append("external_id")
 
     gdf_for_ee = gdf[keep_cols].copy()
+
+    # CRITICAL: Convert external_id to string (both plotId and external_id are now strings)
+    if external_id_column and "external_id" in gdf_for_ee.columns:
+        gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
+        logger.debug(f"Converted external_id column to string type")
+
     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
 
     # Create image if not provided
```
```diff
@@ -1907,10 +1995,19 @@ def whisp_stats_geojson_to_df_sequential(
         national_codes=national_codes, validate_bands=True
     )
 
+    # Drop external_id before sending to EE to enable caching
+    # (external_id is preserved separately in gdf for client-side merging)
+    gdf_for_ee_clean = gdf_for_ee.copy()
+    if "external_id" in gdf_for_ee_clean.columns:
+        gdf_for_ee_clean = gdf_for_ee_clean.drop(columns=["external_id"])
+        logger.debug("Dropped external_id from data sent to EE (enables caching)")
+
     # Convert to EE (suppress print statements from convert_geojson_to_ee)
     logger.debug("Converting to EE FeatureCollection...")
     with redirect_stdout(io.StringIO()):
-        fc = convert_geojson_to_ee(
+        fc = convert_geojson_to_ee(
+            gdf_for_ee_clean, enforce_wgs84=True, strip_z_coords=True
+        )
 
     # Create reducer
     reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
```
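The `enforce_wgs84` and `strip_z_coords` flags are handled inside `convert_geojson_to_ee`; a hedged sketch of the two operations the flag names suggest, using plain geopandas/shapely (an assumption about their behavior, not the library's code):

```python
import geopandas as gpd
from shapely.geometry import Polygon
from shapely.ops import transform

gdf = gpd.GeoDataFrame(
    geometry=[Polygon([(0, 0, 5.0), (10, 0, 5.0), (10, 10, 5.0)])],
    crs="EPSG:3857",
)

# enforce_wgs84: reproject to EPSG:4326 before upload.
gdf = gdf.to_crs("EPSG:4326")

# strip_z_coords: drop the Z dimension, which EE geometries don't use.
gdf["geometry"] = gdf["geometry"].apply(
    lambda geom: transform(lambda x, y, z=None: (x, y), geom)
)
assert not gdf.geometry.iloc[0].has_z
```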
```diff
@@ -1950,11 +2047,11 @@ def whisp_stats_geojson_to_df_sequential(
         else:
             raise
 
-    logger.
+    logger.info("Server-side processing complete")
 
-    #
-    if
-        df_server[
+    # Ensure plotId is string type for consistent merges
+    if plot_id_column in df_server.columns:
+        df_server[plot_id_column] = df_server[plot_id_column].astype(str)
 
     # Add client-side metadata if requested
     if add_metadata_client_side:
```
```diff
@@ -1965,21 +2062,21 @@ def whisp_stats_geojson_to_df_sequential(
             return_attributes_only=True,
         )
 
-        #
-        if
-            df_client = df_client.
+        # Ensure plotId is string type for consistent merges
+        if plot_id_column in df_client.columns:
+            df_client[plot_id_column] = df_client[plot_id_column].astype(str)
+
+        # Drop external_id from df_server if it exists (keep from df_client - more reliable)
+        if "external_id" in df_server.columns:
+            df_server = df_server.drop(columns=["external_id"])
 
-        # Merge
+        # Merge on plotId (same strategy as concurrent mode)
         result = df_server.merge(
-            df_client, on=
+            df_client, on=plot_id_column, how="left", suffixes=("", "_client")
         )
     else:
         result = df_server
 
-    # Remove internal __row_id__ column if present
-    if row_id_col in result.columns:
-        result = result.drop(columns=[row_id_col])
-
     # Format the output
     # Add admin context (Country, ProducerCountry, Admin_Level_1) from admin_code
     # MUST be done BEFORE formatting (which removes _median columns)
```
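The `suffixes=("", "_client")` choice keeps server columns under their bare names and tags any colliding client column instead of producing the pandas default `_x`/`_y` pair. A small demonstration (column names illustrative):

```python
import pandas as pd

df_server = pd.DataFrame({"plotId": ["1"], "Area": [2.0]})
df_client = pd.DataFrame(
    {"plotId": ["1"], "Area": [2.0], "Geometry_type": ["Polygon"]}
)

result = df_server.merge(
    df_client, on="plotId", how="left", suffixes=("", "_client")
)
assert list(result.columns) == ["plotId", "Area", "Area_client", "Geometry_type"]
```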
```diff
@@ -2004,27 +2101,14 @@ def whisp_stats_geojson_to_df_sequential(
         convert_water_flag=True,
     )
 
+    # Ensure plot_id exists and sort by it
+    if plot_id_column not in formatted.columns:
+        formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
+    formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+
     logger.info(f"Processing complete: {len(formatted):,} features")
 
-    #
-    if external_id_column:
-        variants = [
-            col
-            for col in formatted.columns
-            if external_id_column.lower() in col.lower()
-        ]
-        if variants:
-            base_col = (
-                external_id_column
-                if external_id_column in formatted.columns
-                else variants[0]
-            )
-            if base_col != "external_id":
-                formatted = formatted.rename(columns={base_col: "external_id"})
-            # Drop other variants
-            formatted = formatted.drop(
-                columns=[c for c in variants if c != base_col], errors="ignore"
-            )
+    # external_id_column already renamed to 'external_id' during load - no action needed here
 
     return formatted
 
```
```diff
@@ -2130,7 +2214,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     gdf_original_geoms = None
     if geometry_audit_trail:
         logger.debug("Pre-loading GeoJSON for geometry audit trail...")
-        gdf_original_geoms =
+        gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
 
     # Step 1: Get raw stats
     logger.debug("Step 1/2: Extracting statistics (concurrent)...")
```
|
|
|
2199
2283
|
# Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
|
|
2200
2284
|
if gdf_original_geoms is None:
|
|
2201
2285
|
logger.warning("Original geometries not pre-loaded, loading now...")
|
|
2202
|
-
gdf_original_geoms =
|
|
2286
|
+
gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
|
|
2203
2287
|
|
|
2204
2288
|
# Use plotId from df_validated to maintain mapping
|
|
2205
2289
|
df_original_geom = pd.DataFrame(
|
|
@@ -2331,7 +2415,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
|
|
|
2331
2415
|
gdf_original_geoms = None
|
|
2332
2416
|
if geometry_audit_trail:
|
|
2333
2417
|
logger.debug("Pre-loading GeoJSON for geometry audit trail...")
|
|
2334
|
-
gdf_original_geoms =
|
|
2418
|
+
gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
|
|
2335
2419
|
|
|
2336
2420
|
# Step 1: Get raw stats
|
|
2337
2421
|
logger.debug("Step 1/2: Extracting statistics (sequential)...")
|
|
@@ -2395,7 +2479,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
|
|
|
2395
2479
|
# Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
|
|
2396
2480
|
if gdf_original_geoms is None:
|
|
2397
2481
|
logger.warning("Original geometries not pre-loaded, loading now...")
|
|
2398
|
-
gdf_original_geoms =
|
|
2482
|
+
gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
|
|
2399
2483
|
|
|
2400
2484
|
# Use plotId from df_validated to maintain mapping
|
|
2401
2485
|
df_original_geom = pd.DataFrame(
|
|