openforis-whisp 3.0.0a7__py3-none-any.whl → 3.0.0a9__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
@@ -33,6 +33,7 @@ import subprocess
 from contextlib import redirect_stdout, contextmanager
 from pathlib import Path
 from typing import Optional, List, Dict, Any, Tuple, Union
+from importlib.metadata import version as get_version
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import tempfile
 
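
The new import above backs the metadata change further down in this diff: the package version is now read from the installed distribution rather than hardcoded. A minimal sketch of the pattern (the printed value is illustrative):

    from importlib.metadata import version as get_version

    # Raises importlib.metadata.PackageNotFoundError if the
    # distribution is not installed.
    print(get_version("openforis-whisp"))  # e.g. "3.0.0a9"
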
@@ -924,10 +925,67 @@ def clean_geodataframe(
 
 
 # ============================================================================
-# BATCH RETRY HELPER
+# AUDIT TRAIL HELPER
 # ============================================================================
 
 
+def _add_geometry_audit_trail(
+    df_validated: pd.DataFrame,
+    input_geojson_filepath: str,
+    gdf_original_geoms: gpd.GeoDataFrame = None,
+    logger: logging.Logger = None,
+) -> pd.DataFrame:
+    """
+    Add original input geometries as geo_original column for audit trail.
+
+    Parameters
+    ----------
+    df_validated : pd.DataFrame
+        Validated DataFrame to add audit trail to
+    input_geojson_filepath : str
+        Path to original GeoJSON file
+    gdf_original_geoms : gpd.GeoDataFrame, optional
+        Pre-loaded original geometries (to avoid reloading)
+    logger : logging.Logger, optional
+        Logger for output
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with geo_original column added
+    """
+    import json
+    from shapely.geometry import mapping
+
+    logger = logger or logging.getLogger("whisp")
+
+    try:
+        # Load original geometries if not provided
+        if gdf_original_geoms is None:
+            logger.warning("Original geometries not pre-loaded, loading now...")
+            gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
+
+        # Create DataFrame with plotId and geo_original
+        df_original_geom = pd.DataFrame(
+            {
+                "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                "geo_original": gdf_original_geoms["geometry"].apply(
+                    lambda g: json.dumps(mapping(g)) if g is not None else None
+                ),
+            }
+        )
+
+        # Merge original geometries back
+        df_result = df_validated.merge(df_original_geom, on="plotId", how="left")
+        logger.info("Audit trail added: geo_original column")
+        return df_result
+
+    except Exception as e:
+        logger.warning(f"Error adding audit trail: {e}")
+        # Return original DataFrame if audit trail fails
+        return df_validated
+
+
 # ============================================================================
 # BATCH RETRY HELPER - DEPRECATED (removed due to semaphore deadlock issues)
 # ============================================================================
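
The helper above pairs plotId values with the reloaded geometries positionally (note the values[: len(gdf_original_geoms)] slice), so it assumes the reloaded file's row order matches the validated frame. The serialization itself is standard shapely plus json; a self-contained sketch:

    import json
    from shapely.geometry import Polygon, mapping

    # mapping() converts a geometry to a GeoJSON-like dict, which
    # json.dumps() turns into the geo_original string stored per row.
    poly = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
    geo_original = json.dumps(mapping(poly))
    print(geo_original[:40])  # '{"type": "Polygon", "coordinates": ...'
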
@@ -1244,6 +1302,9 @@ def whisp_stats_geojson_to_df_concurrent(
     # Track if we had errors that suggest bad bands
     batch_errors = []
 
+    # Fail-fast flag for band errors - shared across threads
+    band_error_detected = threading.Event()
+
     # Suppress fiona logging during batch processing (threads create new loggers)
     fiona_logger = logging.getLogger("fiona")
     pyogrio_logger = logging.getLogger("pyogrio._io")
@@ -1252,6 +1313,15 @@ def whisp_stats_geojson_to_df_concurrent(
     fiona_logger.setLevel(logging.CRITICAL)
     pyogrio_logger.setLevel(logging.CRITICAL)
 
+    # Keywords that indicate missing asset/band errors
+    BAND_ERROR_KEYWORDS = [
+        "image.load",
+        "asset",
+        "not found",
+        "does not exist",
+        "imagecollection.load",
+    ]
+
     try:
         # Don't suppress stdout here - we want progress messages to show in Colab
         with ThreadPoolExecutor(max_workers=pool_workers) as executor:
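
The keywords are lowercase because they are matched against a lowercased copy of the error message (see the exception handler later in this diff), so "image.load" also catches errors such as "Image.load: asset ... not found". A runnable sketch of the same classification:

    BAND_ERROR_KEYWORDS = [
        "image.load", "asset", "not found", "does not exist", "imagecollection.load",
    ]

    def is_band_error(exc: Exception) -> bool:
        # Case-insensitive substring match against the exception text.
        msg = str(exc).lower()
        return any(keyword in msg for keyword in BAND_ERROR_KEYWORDS)

    print(is_band_error(RuntimeError("Image.load: asset 'x/y' not found.")))  # True
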
@@ -1265,6 +1335,13 @@ def whisp_stats_geojson_to_df_concurrent(
             batch_futures = {future: i for future, i in futures.items()}
 
             for future in as_completed(futures):
+                # Check if we should abort due to band error
+                if band_error_detected.is_set():
+                    # Cancel remaining futures and skip processing
+                    for f in futures:
+                        f.cancel()
+                    break
+
                 batch_idx = batch_futures[future]
                 try:
                     batch_idx, df_server, df_client = future.result()
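
One caveat worth knowing about this pattern: Future.cancel() only succeeds for futures that have not started executing, so in-flight batches still run to completion in the background; the break merely stops consuming their results. A minimal, self-contained illustration:

    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed

    stop = threading.Event()

    def work(i: int) -> int:
        return i * i

    with ThreadPoolExecutor(max_workers=2) as pool:
        futures = {pool.submit(work, i): i for i in range(100)}
        for future in as_completed(futures):
            if stop.is_set():
                # cancel() returns False for futures already running or done.
                for f in futures:
                    f.cancel()
                break
            if futures[future] == 3:  # simulate detecting a fatal error
                stop.set()
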
@@ -1340,8 +1417,22 @@ def whisp_stats_geojson_to_df_concurrent(
                         )
 
                 except Exception as e:
-                    # Batch failed - fail fast with clear guidance
+                    # Batch failed - check if it's a band error for fail-fast
                     error_msg = str(e)
+                    error_msg_lower = error_msg.lower()
+
+                    # Check if this is a band/asset error - trigger fail-fast
+                    is_this_band_error = any(
+                        keyword in error_msg_lower for keyword in BAND_ERROR_KEYWORDS
+                    )
+
+                    if is_this_band_error and not band_error_detected.is_set():
+                        band_error_detected.set()
+                        logger.warning(
+                            f"Band/asset error detected in batch {batch_idx}. "
+                            f"Cancelling remaining batches for retry with validation..."
+                        )
+
                     logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
                     logger.debug(f"Full error: {error_msg}")
 
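
Because failures surface in the main thread (via future.result() in the as_completed loop), the check-then-set on the Event cannot actually race here; the guard simply keeps the warning from being logged more than once. Event.set() itself is idempotent and thread-safe, as this small sketch shows:

    import threading

    band_error_detected = threading.Event()

    def record_fatal_error(msg: str) -> None:
        # Only the first caller logs; set() on an already-set Event is a no-op.
        if not band_error_detected.is_set():
            band_error_detected.set()
            print(f"first fatal error: {msg}")
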
@@ -1361,14 +1452,136 @@ def whisp_stats_geojson_to_df_concurrent(
     # Log completion
     total_time = time.time() - start_time
     time_str = _format_time(total_time)
-    logger.info(
-        f"Processing complete: {completed_batches:,}/{len(batches):,} batches in {time_str}"
-    )
+    if band_error_detected.is_set():
+        logger.info(
+            f"Processing stopped early due to band error after {completed_batches:,}/{len(batches):,} batches in {time_str}"
+        )
+    else:
+        logger.info(
+            f"Processing complete: {completed_batches:,}/{len(batches):,} batches in {time_str}"
+        )
+
+    # If band error was detected, retry immediately with validation (fail-fast path)
+    if band_error_detected.is_set():
+        logger.warning("Retrying all batches with validate_bands=True...")
+        try:
+            with redirect_stdout(io.StringIO()):
+                whisp_image = combine_datasets(
+                    national_codes=national_codes, validate_bands=True
+                )
+            logger.info("Image recreated with validation. Reprocessing all batches...")
+
+            # Clear state for full retry
+            results = []
+            batch_errors = []
+            completed_batches = 0
+            shown_milestones = set()
+            start_time = time.time()
+
+            # Suppress fiona logging during retry
+            fiona_logger.setLevel(logging.CRITICAL)
+            pyogrio_logger.setLevel(logging.CRITICAL)
+
+            try:
+                with ThreadPoolExecutor(max_workers=pool_workers) as executor:
+                    futures = {
+                        executor.submit(process_batch, i, batch): i
+                        for i, batch in enumerate(batches)
+                    }
+
+                    for future in as_completed(futures):
+                        batch_idx = futures[future]
+                        try:
+                            batch_idx, df_server, df_client = future.result()
+                            if plot_id_column not in df_server.columns:
+                                df_server[plot_id_column] = [
+                                    str(i) for i in range(1, len(df_server) + 1)
+                                ]
+                            else:
+                                df_server[plot_id_column] = df_server[
+                                    plot_id_column
+                                ].astype(str)
+                            if plot_id_column in df_client.columns:
+                                df_client[plot_id_column] = df_client[
+                                    plot_id_column
+                                ].astype(str)
+
+                            # Drop external_id from server if present
+                            df_server_clean = df_server.copy()
+                            if "external_id" in df_server_clean.columns:
+                                df_server_clean = df_server_clean.drop(
+                                    columns=["external_id"]
+                                )
 
-    # If we have batch errors after retry attempts, fail the entire process
+                            # Keep essential columns from client
+                            keep_external_columns = [plot_id_column]
+                            if (
+                                external_id_column
+                                and "external_id" in df_client.columns
+                            ):
+                                keep_external_columns.append("external_id")
+                            if "geometry" in df_client.columns:
+                                keep_external_columns.append("geometry")
+                            if geometry_type_column in df_client.columns:
+                                keep_external_columns.append(geometry_type_column)
+                            centroid_cols = [
+                                c
+                                for c in df_client.columns
+                                if c.startswith("Centroid_")
+                            ]
+                            keep_external_columns.extend(centroid_cols)
+
+                            df_client_clean = df_client[
+                                [
+                                    c
+                                    for c in keep_external_columns
+                                    if c in df_client.columns
+                                ]
+                            ]
+
+                            merged = df_server_clean.merge(
+                                df_client_clean,
+                                on=plot_id_column,
+                                how="left",
+                                suffixes=("_ee", "_client"),
+                            )
+                            results.append(merged)
+
+                            with progress_lock:
+                                completed_batches += 1
+                                _log_progress(
+                                    completed_batches,
+                                    len(batches),
+                                    milestones,
+                                    shown_milestones,
+                                    start_time,
+                                    logger,
+                                )
+
+                        except Exception as e:
+                            error_msg = str(e)
+                            logger.error(
+                                f"Retry batch {batch_idx} failed: {error_msg[:100]}"
+                            )
+                            original_batch = batch_map[batch_idx]
+                            batch_errors.append((batch_idx, original_batch, error_msg))
+            finally:
+                fiona_logger.setLevel(old_fiona_level)
+                pyogrio_logger.setLevel(old_pyogrio_level)
+
+            # Log retry completion
+            retry_time = time.time() - start_time
+            logger.info(
+                f"Retry complete: {completed_batches:,}/{len(batches):,} batches in {_format_time(retry_time)}"
+            )
+
+        except Exception as retry_error:
+            logger.error(f"Failed to recreate image with validation: {retry_error}")
+            # Fall through to error handling below
+
+    # If we have batch errors (either from initial run or retry), raise RuntimeError
     if batch_errors:
         total_failed_rows = sum(len(batch) for _, batch, _ in batch_errors)
-        failed_batch_indices = [str(idx) for idx, _, _ in batch_errors]
 
         # Format detailed error information for debugging
         error_details_list = []
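
Taken together, the additions above replace the old "retry only when every batch failed" heuristic with an eager one: the first band/asset error cancels the remaining work, the combined image is rebuilt with validate_bands=True, and every batch is reprocessed exactly once; anything still failing falls through to the RuntimeError path. A condensed sketch of that control flow (make_image and process_all_batches are hypothetical stand-ins for combine_datasets and the executor loop):

    def run_with_band_error_retry(batches, make_image, process_all_batches):
        image = make_image(validate_bands=False)
        results, errors, band_error = process_all_batches(image, batches)
        if band_error:
            # Rebuild the image with band validation and retry once, from scratch.
            image = make_image(validate_bands=True)
            results, errors, _ = process_all_batches(image, batches)
        if errors:
            raise RuntimeError(f"{len(errors)} batches failed after retry")
        return results
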
@@ -1417,94 +1630,10 @@ def whisp_stats_geojson_to_df_concurrent(
             f"Please reduce batch_size and try again."
         )
 
-    # Check if we should retry with validation due to band errors (legacy band error handling)
+    # Check we have results
     if not results:
-        # All batches failed - likely a bad band issue
-        is_band_error = any(
-            keyword in str(batch_errors)
-            for keyword in ["Image.load", "asset", "not found", "does not exist"]
-        )
-
-        if is_band_error:
-            logger.warning(
-                "Detected potential bad band error. Retrying with validate_bands=True..."
-            )
-            try:
-                with redirect_stdout(io.StringIO()):
-                    whisp_image = combine_datasets(
-                        national_codes=national_codes, validate_bands=True
-                    )
-                logger.info(
-                    "Image recreated with validation. Retrying batch processing..."
-                )
-
-                # Retry batch processing with validated image
-                results = []
-                retry_completed = 0
-                retry_shown = set()
-                retry_start = time.time()
-
-                # Suppress fiona logging during batch processing (threads create new loggers)
-                fiona_logger = logging.getLogger("fiona")
-                pyogrio_logger = logging.getLogger("pyogrio._io")
-                old_fiona_level = fiona_logger.level
-                old_pyogrio_level = pyogrio_logger.level
-                fiona_logger.setLevel(logging.CRITICAL)
-                pyogrio_logger.setLevel(logging.CRITICAL)
-
-                try:
-                    with ThreadPoolExecutor(max_workers=pool_workers) as executor:
-                        futures = {
-                            executor.submit(process_batch, i, batch): i
-                            for i, batch in enumerate(batches)
-                        }
-
-                        for future in as_completed(futures):
-                            try:
-                                batch_idx, df_server, df_client = future.result()
-                                if plot_id_column not in df_server.columns:
-                                    # Use 1-indexed range to match client-side assignment
-                                    df_server[plot_id_column] = range(
-                                        1, len(df_server) + 1
-                                    )
-                                merged = df_server.merge(
-                                    df_client,
-                                    on=plot_id_column,
-                                    how="left",
-                                    suffixes=("", "_client"),
-                                )
-                                results.append(merged)
-
-                                # Update retry progress
-                                with progress_lock:
-                                    retry_completed += 1
-                                    _log_progress(
-                                        retry_completed,
-                                        len(batches),
-                                        milestones,
-                                        retry_shown,
-                                        retry_start,
-                                        logger,
-                                    )
-                            except Exception as e:
-                                logger.error(
-                                    f"Batch processing error (retry): {str(e)[:100]}"
-                                )
-
-                    # Log retry completion
-                    retry_time = time.time() - retry_start
-                    logger.info(
-                        f"Retry complete: {retry_completed:,}/{len(batches):,} batches in {_format_time(retry_time)}"
-                    )
-                finally:
-                    # Restore logger levels
-                    fiona_logger.setLevel(old_fiona_level)
-                    pyogrio_logger.setLevel(old_pyogrio_level)
-            except Exception as validation_e:
-                logger.error(
-                    f"Failed to recover with validation: {str(validation_e)[:100]}"
-                )
-        return pd.DataFrame()
+        logger.error("No results obtained from batch processing")
+        return pd.DataFrame()
 
     if results:
         # Filter out empty DataFrames and all-NA columns to avoid FutureWarning in pd.concat
@@ -1727,8 +1856,7 @@ def whisp_stats_geojson_to_df_concurrent(
         logger.warning(f"{plot_id_column} column missing, regenerating...")
         formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
 
-    # Sort by plot_id to ensure consistent output order
-    formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+    # Note: Sorting is handled by format_stats_dataframe in the formatted wrapper functions
 
     logger.info(f"Processing complete: {len(formatted):,} features")
     return formatted
@@ -1981,10 +2109,11 @@ def whisp_stats_geojson_to_df_sequential(
         convert_water_flag=True,
     )
 
-    # Ensure plot_id exists and sort by it
+    # Ensure plot_id exists
     if plot_id_column not in formatted.columns:
         formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
-    formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+
+    # Note: Sorting is handled by format_stats_dataframe in the formatted wrapper functions
 
     logger.info(f"Processing complete: {len(formatted):,} features")
 
@@ -2154,50 +2283,21 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         custom_bands=custom_bands,
     )
 
-    # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
+    # Step 2c: Add audit trail column (AFTER validation to preserve columns)
     if geometry_audit_trail:
-        logger.debug("Adding audit trail columns...")
-        try:
-            # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
-            if gdf_original_geoms is None:
-                logger.warning("Original geometries not pre-loaded, loading now...")
-                gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
-
-            # Use plotId from df_validated to maintain mapping
-            df_original_geom = pd.DataFrame(
-                {
-                    "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
-                    "geo_original": gdf_original_geoms["geometry"].apply(
-                        lambda g: json.dumps(mapping(g)) if g is not None else None
-                    ),
-                }
-            )
-
-            # Merge original geometries back
-            df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
-
-            # Store processing metadata
-            df_validated.attrs["processing_metadata"] = {
-                "whisp_version": "3.0.0a1",
-                "processing_date": datetime.now().isoformat(),
-                "processing_mode": "concurrent",
-                "ee_endpoint": "high_volume",
-                "validate_geometries": validate_geometries,
-                "datasets_used": national_codes or [],
-                "geometry_audit_trail": True,
-            }
-
-            logger.info(f"Audit trail added: geo_original column")
-
-        except Exception as e:
-            logger.warning(f"Error adding audit trail: {e}")
-            # Continue without audit trail if something fails
+        logger.debug("Adding geo_original column for audit trail...")
+        df_validated = _add_geometry_audit_trail(
+            df_validated=df_validated,
+            input_geojson_filepath=input_geojson_filepath,
+            gdf_original_geoms=gdf_original_geoms,
+            logger=logger,
+        )
 
     # Add processing metadata column using pd.concat to avoid fragmentation warning
     metadata_dict = {
-        "whisp_version": "3.0.0a1",
+        "whisp_version": get_version("openforis-whisp"),
         "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
-            "%Y-%m-%d %H:%M:%S UTC"
+            "%Y-%m-%d %H:%M:%S%z"
         ),
     }
     metadata_series = pd.Series(
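
Two metadata fixes land here: the version string now comes from the installed distribution via importlib.metadata (so it can no longer drift from the release, as the stale "3.0.0a1" literal did), and the timestamp encodes the offset with %z instead of a hardcoded " UTC" suffix. A sketch of the resulting values (the version shown is illustrative):

    from datetime import datetime, timezone
    from importlib.metadata import version as get_version

    metadata = {
        "whisp_version": get_version("openforis-whisp"),  # e.g. "3.0.0a9"
        # %z renders the UTC offset, giving e.g. "2025-01-01 12:00:00+0000"
        "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
            "%Y-%m-%d %H:%M:%S%z"
        ),
    }
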
@@ -2349,49 +2449,21 @@ def whisp_formatted_stats_geojson_to_df_sequential(
         custom_bands=custom_bands,
     )
 
-    # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
+    # Step 2c: Add audit trail column (AFTER validation to preserve columns)
     if geometry_audit_trail:
-        logger.debug("Adding audit trail columns...")
-        try:
-            # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
-            if gdf_original_geoms is None:
-                logger.warning("Original geometries not pre-loaded, loading now...")
-                gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
-
-            # Use plotId from df_validated to maintain mapping
-            df_original_geom = pd.DataFrame(
-                {
-                    "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
-                    "geo_original": gdf_original_geoms["geometry"].apply(
-                        lambda g: json.dumps(mapping(g)) if g is not None else None
-                    ),
-                }
-            )
-
-            # Merge original geometries back
-            df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
-
-            # Store processing metadata
-            df_validated.attrs["processing_metadata"] = {
-                "whisp_version": "3.0.0a1",
-                "processing_date": datetime.now().isoformat(),
-                "processing_mode": "sequential",
-                "ee_endpoint": "standard",
-                "datasets_used": national_codes or [],
-                "geometry_audit_trail": True,
-            }
-
-            logger.info(f"Audit trail added: geo_original column")
-
-        except Exception as e:
-            logger.warning(f"Error adding audit trail: {e}")
-            # Continue without audit trail if something fails
+        logger.debug("Adding geo_original column for audit trail...")
+        df_validated = _add_geometry_audit_trail(
+            df_validated=df_validated,
+            input_geojson_filepath=input_geojson_filepath,
+            gdf_original_geoms=gdf_original_geoms,
+            logger=logger,
+        )
 
     # Add processing metadata column using pd.concat to avoid fragmentation warning
     metadata_dict = {
-        "whisp_version": "3.0.0a1",
+        "whisp_version": get_version("openforis-whisp"),
         "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
-            "%Y-%m-%d %H:%M:%S UTC"
+            "%Y-%m-%d %H:%M:%S%z"
        ),
     }
     metadata_series = pd.Series(
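
The sequential wrapper gets the same treatment, so both code paths now share a single audit-trail implementation. For consumers of the output, the geo_original strings round-trip back into geometries with shapely.geometry.shape(); a small sketch (the coordinates are illustrative):

    import json
    from shapely.geometry import shape

    geo_original = '{"type": "Point", "coordinates": [30.0, -1.9]}'
    geom = shape(json.loads(geo_original))
    print(geom.geom_type)  # "Point"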