openforis-whisp 3.0.0a7__py3-none-any.whl → 3.0.0a9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/advanced_stats.py +247 -175
- openforis_whisp/datasets.py +414 -19
- openforis_whisp/parameters/lookup_gee_datasets.csv +194 -171
- openforis_whisp/reformat.py +8 -6
- openforis_whisp/risk.py +113 -29
- {openforis_whisp-3.0.0a7.dist-info → openforis_whisp-3.0.0a9.dist-info}/METADATA +38 -124
- {openforis_whisp-3.0.0a7.dist-info → openforis_whisp-3.0.0a9.dist-info}/RECORD +9 -9
- {openforis_whisp-3.0.0a7.dist-info → openforis_whisp-3.0.0a9.dist-info}/WHEEL +1 -1
- {openforis_whisp-3.0.0a7.dist-info/licenses → openforis_whisp-3.0.0a9.dist-info}/LICENSE +0 -0
openforis_whisp/advanced_stats.py

@@ -33,6 +33,7 @@ import subprocess
 from contextlib import redirect_stdout, contextmanager
 from pathlib import Path
 from typing import Optional, List, Dict, Any, Tuple, Union
+from importlib.metadata import version as get_version
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import tempfile
 
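Note: this new `importlib.metadata` import is what lets the metadata hunks further down stamp output with the installed package version instead of a hardcoded string. A minimal standalone sketch of the pattern (not package code):

```python
# Resolve the installed distribution's version at runtime instead of
# hardcoding it; fall back when running from an uninstalled checkout.
from importlib.metadata import PackageNotFoundError, version as get_version

try:
    whisp_version = get_version("openforis-whisp")
except PackageNotFoundError:
    whisp_version = "unknown"  # e.g. source tree without `pip install -e .`

print(whisp_version)  # e.g. "3.0.0a9"
```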
@@ -924,10 +925,67 @@ def clean_geodataframe(
 
 
 # ============================================================================
-#
+# AUDIT TRAIL HELPER
 # ============================================================================
 
 
+def _add_geometry_audit_trail(
+    df_validated: pd.DataFrame,
+    input_geojson_filepath: str,
+    gdf_original_geoms: gpd.GeoDataFrame = None,
+    logger: logging.Logger = None,
+) -> pd.DataFrame:
+    """
+    Add original input geometries as geo_original column for audit trail.
+
+    Parameters
+    ----------
+    df_validated : pd.DataFrame
+        Validated DataFrame to add audit trail to
+    input_geojson_filepath : str
+        Path to original GeoJSON file
+    gdf_original_geoms : gpd.GeoDataFrame, optional
+        Pre-loaded original geometries (to avoid reloading)
+    logger : logging.Logger, optional
+        Logger for output
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with geo_original column added
+    """
+    import json
+    from shapely.geometry import mapping
+
+    logger = logger or logging.getLogger("whisp")
+
+    try:
+        # Load original geometries if not provided
+        if gdf_original_geoms is None:
+            logger.warning("Original geometries not pre-loaded, loading now...")
+            gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
+
+        # Create DataFrame with plotId and geo_original
+        df_original_geom = pd.DataFrame(
+            {
+                "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                "geo_original": gdf_original_geoms["geometry"].apply(
+                    lambda g: json.dumps(mapping(g)) if g is not None else None
+                ),
+            }
+        )
+
+        # Merge original geometries back
+        df_result = df_validated.merge(df_original_geom, on="plotId", how="left")
+        logger.info("Audit trail added: geo_original column")
+        return df_result
+
+    except Exception as e:
+        logger.warning(f"Error adding audit trail: {e}")
+        # Return original DataFrame if audit trail fails
+        return df_validated
+
+
 # ============================================================================
 # BATCH RETRY HELPER - DEPRECATED (removed due to semaphore deadlock issues)
 # ============================================================================
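To see what the new `_add_geometry_audit_trail` helper actually produces, here is a toy run of its core merge logic with made-up inputs (the `_load_and_prepare_geojson` fallback is not exercised):

```python
# Toy reproduction of the helper's core: serialize original geometries
# to GeoJSON strings and left-merge them onto the validated table.
import json

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, mapping

gdf_original_geoms = gpd.GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)]})
df_validated = pd.DataFrame({"plotId": ["1", "2"], "Area": [0.5, 1.2]})

df_original_geom = pd.DataFrame(
    {
        "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
        "geo_original": gdf_original_geoms["geometry"].apply(
            lambda g: json.dumps(mapping(g)) if g is not None else None
        ),
    }
)
result = df_validated.merge(df_original_geom, on="plotId", how="left")
print(result["geo_original"].iloc[0])
# {"type": "Point", "coordinates": [0.0, 0.0]}
```

Note the positional slice of `plotId` values: the helper assumes the validated rows preserve the input feature order.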
@@ -1244,6 +1302,9 @@ def whisp_stats_geojson_to_df_concurrent(
     # Track if we had errors that suggest bad bands
     batch_errors = []
 
+    # Fail-fast flag for band errors - shared across threads
+    band_error_detected = threading.Event()
+
     # Suppress fiona logging during batch processing (threads create new loggers)
     fiona_logger = logging.getLogger("fiona")
     pyogrio_logger = logging.getLogger("pyogrio._io")
@@ -1252,6 +1313,15 @@
     fiona_logger.setLevel(logging.CRITICAL)
     pyogrio_logger.setLevel(logging.CRITICAL)
 
+    # Keywords that indicate missing asset/band errors
+    BAND_ERROR_KEYWORDS = [
+        "image.load",
+        "asset",
+        "not found",
+        "does not exist",
+        "imagecollection.load",
+    ]
+
     try:
         # Don't suppress stdout here - we want progress messages to show in Colab
         with ThreadPoolExecutor(max_workers=pool_workers) as executor:
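The new keyword list feeds a simple case-insensitive substring test against Earth Engine error messages (the check itself appears in the except handler two hunks down). As a standalone sketch, with `is_band_error` being an illustrative name rather than package code:

```python
# Case-insensitive substring classification of Earth Engine errors,
# mirroring the BAND_ERROR_KEYWORDS check added in this diff.
BAND_ERROR_KEYWORDS = [
    "image.load",
    "asset",
    "not found",
    "does not exist",
    "imagecollection.load",
]


def is_band_error(error: Exception) -> bool:
    msg = str(error).lower()
    return any(keyword in msg for keyword in BAND_ERROR_KEYWORDS)


print(is_band_error(ValueError("Image.load: Asset 'users/x/y' not found.")))  # True
print(is_band_error(TimeoutError("Computation timed out.")))  # False
```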
@@ -1265,6 +1335,13 @@
             batch_futures = {future: i for future, i in futures.items()}
 
             for future in as_completed(futures):
+                # Check if we should abort due to band error
+                if band_error_detected.is_set():
+                    # Cancel remaining futures and skip processing
+                    for f in futures:
+                        f.cancel()
+                    break
+
                 batch_idx = batch_futures[future]
                 try:
                     batch_idx, df_server, df_client = future.result()
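The abort check above is one half of a fail-fast pattern; the other half (setting the event) comes in the next hunk. Reduced to a self-contained sketch with a dummy batch function — note that `Future.cancel()` only stops batches that have not started yet, which is the intended behavior here:

```python
# Minimal fail-fast sketch: the first fatal error sets a shared Event,
# and the consumer loop cancels all still-queued futures.
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

band_error_detected = threading.Event()


def process_batch(i: int) -> int:
    if i == 3:
        raise RuntimeError("Image.load: asset does not exist")
    return i


with ThreadPoolExecutor(max_workers=2) as executor:
    futures = {executor.submit(process_batch, i): i for i in range(10)}
    for future in as_completed(futures):
        if band_error_detected.is_set():
            for f in futures:
                f.cancel()  # no-op for running/finished futures
            break
        try:
            future.result()
        except RuntimeError:
            band_error_detected.set()  # trigger fail-fast on next iteration
```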
@@ -1340,8 +1417,22 @@
                         )
 
                 except Exception as e:
-                    # Batch failed -
+                    # Batch failed - check if it's a band error for fail-fast
                     error_msg = str(e)
+                    error_msg_lower = error_msg.lower()
+
+                    # Check if this is a band/asset error - trigger fail-fast
+                    is_this_band_error = any(
+                        keyword in error_msg_lower for keyword in BAND_ERROR_KEYWORDS
+                    )
+
+                    if is_this_band_error and not band_error_detected.is_set():
+                        band_error_detected.set()
+                        logger.warning(
+                            f"Band/asset error detected in batch {batch_idx}. "
+                            f"Cancelling remaining batches for retry with validation..."
+                        )
+
                     logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
                     logger.debug(f"Full error: {error_msg}")
 
@@ -1361,14 +1452,136 @@ def whisp_stats_geojson_to_df_concurrent(
     # Log completion
     total_time = time.time() - start_time
     time_str = _format_time(total_time)
-
-
-
+    if band_error_detected.is_set():
+        logger.info(
+            f"Processing stopped early due to band error after {completed_batches:,}/{len(batches):,} batches in {time_str}"
+        )
+    else:
+        logger.info(
+            f"Processing complete: {completed_batches:,}/{len(batches):,} batches in {time_str}"
+        )
+
+    # If band error was detected, retry immediately with validation (fail-fast path)
+    if band_error_detected.is_set():
+        logger.warning("Retrying all batches with validate_bands=True...")
+        try:
+            with redirect_stdout(io.StringIO()):
+                whisp_image = combine_datasets(
+                    national_codes=national_codes, validate_bands=True
+                )
+            logger.info("Image recreated with validation. Reprocessing all batches...")
+
+            # Clear state for full retry
+            results = []
+            batch_errors = []
+            completed_batches = 0
+            shown_milestones = set()
+            start_time = time.time()
+
+            # Suppress fiona logging during retry
+            fiona_logger.setLevel(logging.CRITICAL)
+            pyogrio_logger.setLevel(logging.CRITICAL)
+
+            try:
+                with ThreadPoolExecutor(max_workers=pool_workers) as executor:
+                    futures = {
+                        executor.submit(process_batch, i, batch): i
+                        for i, batch in enumerate(batches)
+                    }
+
+                    for future in as_completed(futures):
+                        batch_idx = futures[future]
+                        try:
+                            batch_idx, df_server, df_client = future.result()
+                            if plot_id_column not in df_server.columns:
+                                df_server[plot_id_column] = [
+                                    str(i) for i in range(1, len(df_server) + 1)
+                                ]
+                            else:
+                                df_server[plot_id_column] = df_server[
+                                    plot_id_column
+                                ].astype(str)
+                            if plot_id_column in df_client.columns:
+                                df_client[plot_id_column] = df_client[
+                                    plot_id_column
+                                ].astype(str)
+
+                            # Drop external_id from server if present
+                            df_server_clean = df_server.copy()
+                            if "external_id" in df_server_clean.columns:
+                                df_server_clean = df_server_clean.drop(
+                                    columns=["external_id"]
+                                )
 
-
+                            # Keep essential columns from client
+                            keep_external_columns = [plot_id_column]
+                            if (
+                                external_id_column
+                                and "external_id" in df_client.columns
+                            ):
+                                keep_external_columns.append("external_id")
+                            if "geometry" in df_client.columns:
+                                keep_external_columns.append("geometry")
+                            if geometry_type_column in df_client.columns:
+                                keep_external_columns.append(geometry_type_column)
+                            centroid_cols = [
+                                c
+                                for c in df_client.columns
+                                if c.startswith("Centroid_")
+                            ]
+                            keep_external_columns.extend(centroid_cols)
+
+                            df_client_clean = df_client[
+                                [
+                                    c
+                                    for c in keep_external_columns
+                                    if c in df_client.columns
+                                ]
+                            ]
+
+                            merged = df_server_clean.merge(
+                                df_client_clean,
+                                on=plot_id_column,
+                                how="left",
+                                suffixes=("_ee", "_client"),
+                            )
+                            results.append(merged)
+
+                            with progress_lock:
+                                completed_batches += 1
+                                _log_progress(
+                                    completed_batches,
+                                    len(batches),
+                                    milestones,
+                                    shown_milestones,
+                                    start_time,
+                                    logger,
+                                )
+
+                        except Exception as e:
+                            error_msg = str(e)
+                            logger.error(
+                                f"Retry batch {batch_idx} failed: {error_msg[:100]}"
+                            )
+                            original_batch = batch_map[batch_idx]
+                            batch_errors.append((batch_idx, original_batch, error_msg))
+            finally:
+                fiona_logger.setLevel(old_fiona_level)
+                pyogrio_logger.setLevel(old_pyogrio_level)
+
+            # Log retry completion
+            retry_time = time.time() - start_time
+            logger.info(
+                f"Retry complete: {completed_batches:,}/{len(batches):,} batches in {_format_time(retry_time)}"
+            )
+
+        except Exception as retry_error:
+            logger.error(f"Failed to recreate image with validation: {retry_error}")
+            # Fall through to error handling below
+
+    # If we have batch errors (either from initial run or retry), raise RuntimeError
     if batch_errors:
         total_failed_rows = sum(len(batch) for _, batch, _ in batch_errors)
-        failed_batch_indices = [str(idx) for idx, _, _ in batch_errors]
 
         # Format detailed error information for debugging
         error_details_list = []
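A detail worth calling out in the retry block above: plot ids are normalized to `str` on both the server and client frames before merging, because recent pandas refuses to merge an integer key against a string key. A two-frame demonstration with hypothetical data:

```python
# Why both sides are cast to str before merging: pandas raises on
# int64-vs-object merge keys rather than matching values.
import pandas as pd

df_server = pd.DataFrame({"plotId": [1, 2], "stat": [10, 20]})
df_client = pd.DataFrame({"plotId": ["1", "2"], "geometry": ["g1", "g2"]})

# df_server.merge(df_client, on="plotId")  # ValueError: int64 vs object keys

df_server["plotId"] = df_server["plotId"].astype(str)
print(df_server.merge(df_client, on="plotId", how="left"))
#   plotId  stat geometry
# 0      1    10       g1
# 1      2    20       g2
```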
@@ -1417,94 +1630,10 @@ def whisp_stats_geojson_to_df_concurrent(
             f"Please reduce batch_size and try again."
         )
 
-    # Check
+    # Check we have results
     if not results:
-
-        is_band_error = any(
-            keyword in str(batch_errors)
-            for keyword in ["Image.load", "asset", "not found", "does not exist"]
-        )
-
-        if is_band_error:
-            logger.warning(
-                "Detected potential bad band error. Retrying with validate_bands=True..."
-            )
-            try:
-                with redirect_stdout(io.StringIO()):
-                    whisp_image = combine_datasets(
-                        national_codes=national_codes, validate_bands=True
-                    )
-                logger.info(
-                    "Image recreated with validation. Retrying batch processing..."
-                )
-
-                # Retry batch processing with validated image
-                results = []
-                retry_completed = 0
-                retry_shown = set()
-                retry_start = time.time()
-
-                # Suppress fiona logging during batch processing (threads create new loggers)
-                fiona_logger = logging.getLogger("fiona")
-                pyogrio_logger = logging.getLogger("pyogrio._io")
-                old_fiona_level = fiona_logger.level
-                old_pyogrio_level = pyogrio_logger.level
-                fiona_logger.setLevel(logging.CRITICAL)
-                pyogrio_logger.setLevel(logging.CRITICAL)
-
-                try:
-                    with ThreadPoolExecutor(max_workers=pool_workers) as executor:
-                        futures = {
-                            executor.submit(process_batch, i, batch): i
-                            for i, batch in enumerate(batches)
-                        }
-
-                        for future in as_completed(futures):
-                            try:
-                                batch_idx, df_server, df_client = future.result()
-                                if plot_id_column not in df_server.columns:
-                                    # Use 1-indexed range to match client-side assignment
-                                    df_server[plot_id_column] = range(
-                                        1, len(df_server) + 1
-                                    )
-                                merged = df_server.merge(
-                                    df_client,
-                                    on=plot_id_column,
-                                    how="left",
-                                    suffixes=("", "_client"),
-                                )
-                                results.append(merged)
-
-                                # Update retry progress
-                                with progress_lock:
-                                    retry_completed += 1
-                                    _log_progress(
-                                        retry_completed,
-                                        len(batches),
-                                        milestones,
-                                        retry_shown,
-                                        retry_start,
-                                        logger,
-                                    )
-                            except Exception as e:
-                                logger.error(
-                                    f"Batch processing error (retry): {str(e)[:100]}"
-                                )
-
-                    # Log retry completion
-                    retry_time = time.time() - retry_start
-                    logger.info(
-                        f"Retry complete: {retry_completed:,}/{len(batches):,} batches in {_format_time(retry_time)}"
-                    )
-                finally:
-                    # Restore logger levels
-                    fiona_logger.setLevel(old_fiona_level)
-                    pyogrio_logger.setLevel(old_pyogrio_level)
-            except Exception as validation_e:
-                logger.error(
-                    f"Failed to recover with validation: {str(validation_e)[:100]}"
-                )
-                return pd.DataFrame()
+        logger.error("No results obtained from batch processing")
+        return pd.DataFrame()
 
     if results:
         # Filter out empty DataFrames and all-NA columns to avoid FutureWarning in pd.concat
@@ -1727,8 +1856,7 @@ def whisp_stats_geojson_to_df_concurrent(
         logger.warning(f"{plot_id_column} column missing, regenerating...")
         formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
 
-    #
-    formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+    # Note: Sorting is handled by format_stats_dataframe in the formatted wrapper functions
 
     logger.info(f"Processing complete: {len(formatted):,} features")
     return formatted
@@ -1981,10 +2109,11 @@ def whisp_stats_geojson_to_df_sequential(
         convert_water_flag=True,
     )
 
-    # Ensure plot_id exists
+    # Ensure plot_id exists
     if plot_id_column not in formatted.columns:
         formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
-
+
+    # Note: Sorting is handled by format_stats_dataframe in the formatted wrapper functions
 
     logger.info(f"Processing complete: {len(formatted):,} features")
 
@@ -2154,50 +2283,21 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
             custom_bands=custom_bands,
         )
 
-        # Step 2c: Add audit trail
+        # Step 2c: Add audit trail column (AFTER validation to preserve columns)
         if geometry_audit_trail:
-            logger.debug("Adding audit trail
-
-
-
-
-
-
-                # Use plotId from df_validated to maintain mapping
-                df_original_geom = pd.DataFrame(
-                    {
-                        "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
-                        "geo_original": gdf_original_geoms["geometry"].apply(
-                            lambda g: json.dumps(mapping(g)) if g is not None else None
-                        ),
-                    }
-                )
-
-                # Merge original geometries back
-                df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
-
-                # Store processing metadata
-                df_validated.attrs["processing_metadata"] = {
-                    "whisp_version": "3.0.0a1",
-                    "processing_date": datetime.now().isoformat(),
-                    "processing_mode": "concurrent",
-                    "ee_endpoint": "high_volume",
-                    "validate_geometries": validate_geometries,
-                    "datasets_used": national_codes or [],
-                    "geometry_audit_trail": True,
-                }
-
-                logger.info(f"Audit trail added: geo_original column")
-
-            except Exception as e:
-                logger.warning(f"Error adding audit trail: {e}")
-                # Continue without audit trail if something fails
+            logger.debug("Adding geo_original column for audit trail...")
+            df_validated = _add_geometry_audit_trail(
+                df_validated=df_validated,
+                input_geojson_filepath=input_geojson_filepath,
+                gdf_original_geoms=gdf_original_geoms,
+                logger=logger,
+            )
 
         # Add processing metadata column using pd.concat to avoid fragmentation warning
         metadata_dict = {
-            "whisp_version": "
+            "whisp_version": get_version("openforis-whisp"),
             "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
-                "%Y-%m-%d %H:%M:%S
+                "%Y-%m-%d %H:%M:%S%z"
             ),
         }
         metadata_series = pd.Series(
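Two small but real fixes land in this hunk: the version string is now read from package metadata rather than a stale hardcoded value (the removed attrs block still said "3.0.0a1"), and the timestamp format gains `%z` so the UTC offset is explicit. For illustration, assuming the package is installed:

```python
# Sketch of the corrected metadata fields: live package version plus a
# timezone-explicit UTC timestamp ("%z" renders as "+0000").
from datetime import datetime, timezone
from importlib.metadata import version as get_version

metadata_dict = {
    "whisp_version": get_version("openforis-whisp"),
    "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
        "%Y-%m-%d %H:%M:%S%z"
    ),
}
print(metadata_dict["processing_timestamp_utc"])  # e.g. "2025-06-01 12:00:00+0000"
```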
@@ -2349,49 +2449,21 @@ def whisp_formatted_stats_geojson_to_df_sequential(
             custom_bands=custom_bands,
         )
 
-        # Step 2c: Add audit trail
+        # Step 2c: Add audit trail column (AFTER validation to preserve columns)
         if geometry_audit_trail:
-            logger.debug("Adding audit trail
-
-
-
-
-
-
-                # Use plotId from df_validated to maintain mapping
-                df_original_geom = pd.DataFrame(
-                    {
-                        "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
-                        "geo_original": gdf_original_geoms["geometry"].apply(
-                            lambda g: json.dumps(mapping(g)) if g is not None else None
-                        ),
-                    }
-                )
-
-                # Merge original geometries back
-                df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
-
-                # Store processing metadata
-                df_validated.attrs["processing_metadata"] = {
-                    "whisp_version": "3.0.0a1",
-                    "processing_date": datetime.now().isoformat(),
-                    "processing_mode": "sequential",
-                    "ee_endpoint": "standard",
-                    "datasets_used": national_codes or [],
-                    "geometry_audit_trail": True,
-                }
-
-                logger.info(f"Audit trail added: geo_original column")
-
-            except Exception as e:
-                logger.warning(f"Error adding audit trail: {e}")
-                # Continue without audit trail if something fails
+            logger.debug("Adding geo_original column for audit trail...")
+            df_validated = _add_geometry_audit_trail(
+                df_validated=df_validated,
+                input_geojson_filepath=input_geojson_filepath,
+                gdf_original_geoms=gdf_original_geoms,
+                logger=logger,
+            )
 
         # Add processing metadata column using pd.concat to avoid fragmentation warning
         metadata_dict = {
-            "whisp_version": "
+            "whisp_version": get_version("openforis-whisp"),
             "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
-                "%Y-%m-%d %H:%M:%S
+                "%Y-%m-%d %H:%M:%S%z"
            ),
         }
         metadata_series = pd.Series(