openforis-whisp 3.0.0a4__py3-none-any.whl → 3.0.0a6__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -181,8 +181,25 @@ def _suppress_verbose_output(max_concurrent: int = None):
181
181
  reformat_logger.setLevel(logging.ERROR)
182
182
 
183
183
 
184
- def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
185
- """Load GeoJSON file with all output suppressed."""
184
+ def _load_and_prepare_geojson(
185
+ filepath: str, external_id_column: Optional[str] = None
186
+ ) -> gpd.GeoDataFrame:
187
+ """Load GeoJSON file and prepare for processing.
188
+
189
+ Suppresses logging output and optionally renames external_id column.
190
+
191
+ Parameters
192
+ ----------
193
+ filepath : str
194
+ Path to GeoJSON file
195
+ external_id_column : str, optional
196
+ If provided, rename this column to 'external_id' immediately after loading
197
+
198
+ Returns
199
+ -------
200
+ gpd.GeoDataFrame
201
+ Loaded GeoDataFrame with external_id renamed if specified
202
+ """
186
203
  fiona_logger = logging.getLogger("fiona")
187
204
  pyogrio_logger = logging.getLogger("pyogrio._io")
188
205
  old_fiona_level = fiona_logger.level
@@ -193,6 +210,16 @@ def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
193
210
  try:
194
211
  with redirect_stdout(io.StringIO()):
195
212
  gdf = gpd.read_file(filepath)
213
+
214
+ # Rename external_id column early and convert to string
215
+ if external_id_column and external_id_column in gdf.columns:
216
+ if external_id_column != "external_id":
217
+ gdf = gdf.rename(
218
+ columns={external_id_column: "external_id"}
219
+ ) # hard coding here to avoid confusion later
220
+ # Convert to string to ensure consistent type throughout pipeline
221
+ gdf["external_id"] = gdf["external_id"].astype(str)
222
+
196
223
  return gdf
197
224
  finally:
198
225
  fiona_logger.setLevel(old_fiona_level)
@@ -780,19 +807,17 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
780
807
  if not check_ee_endpoint(endpoint_type):
781
808
  if endpoint_type == "high-volume":
782
809
  msg = (
783
- "Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
810
+ "# Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
784
811
  "ee.Reset()\n"
785
- "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
786
- "Or with project specified (e.g. when in Colab):\n"
787
- "ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
812
+ "ee.Initialize(project=gee_project_name, opt_url='https://earthengine-highvolume.googleapis.com')\n"
813
+ "# where gee_project_name is your GEE project (necessary in Colab)"
788
814
  )
789
815
  else: # standard endpoint
790
816
  msg = (
791
817
  "Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
792
818
  "ee.Reset()\n"
793
- "ee.Initialize()\n"
794
- "Or with project specified (e.g. when in Colab):\n"
795
- "ee.Initialize(project='your_cloud_project_name')"
819
+ "ee.Initialize(project=gee_project_name)\n"
820
+ "# where gee_project_name is your GEE project (necessary in Colab)"
796
821
  )
797
822
 
798
823
  if raise_error:
@@ -865,13 +890,13 @@ def extract_centroid_and_geomtype_client(
865
890
  if plot_id_column in gdf.columns:
866
891
  cols.append(plot_id_column)
867
892
 
868
- # Include external_id_column if provided and exists
893
+ # Include external_id if it exists (already renamed during load)
869
894
  if (
870
895
  external_id_column
871
- and external_id_column in gdf.columns
872
- and external_id_column not in cols
896
+ and "external_id" in gdf.columns
897
+ and "external_id" not in cols
873
898
  ):
874
- cols.append(external_id_column)
899
+ cols.append("external_id")
875
900
 
876
901
  # Always include metadata columns (centroid, geometry type)
877
902
  cols.extend([x_col, y_col, type_col])
@@ -965,6 +990,10 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
965
990
 
966
991
  Preserves the __row_id__ column if present so it can be retrieved after processing.
967
992
 
993
+ IMPORTANT: Drops external_id column before sending to EE to enable query caching.
994
+ external_id is user metadata that's not needed for EE computation. Including it
995
+ breaks EE's caching mechanism since each unique external_id creates a different query.
996
+
968
997
  Parameters
969
998
  ----------
970
999
  batch_gdf : gpd.GeoDataFrame
@@ -973,15 +1002,21 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
973
1002
  Returns
974
1003
  -------
975
1004
  ee.FeatureCollection
976
- EE FeatureCollection with __row_id__ as a feature property
1005
+ EE FeatureCollection with __row_id__ as a feature property (no external_id)
977
1006
  """
1007
+ # Drop external_id before sending to EE to enable caching
1008
+ # (external_id is preserved separately on client side for merging)
1009
+ batch_for_ee = batch_gdf.copy()
1010
+ if "external_id" in batch_for_ee.columns:
1011
+ batch_for_ee = batch_for_ee.drop(columns=["external_id"])
1012
+
978
1013
  # Pass GeoDataFrame directly to preserve CRS metadata
979
1014
  # convert_geojson_to_ee will handle:
980
1015
  # - CRS detection and conversion to WGS84 if needed
981
1016
  # - Data type sanitization (datetime, object columns)
982
1017
  # - Geometry validation and Z-coordinate stripping
983
1018
 
984
- fc = convert_geojson_to_ee(batch_gdf, enforce_wgs84=True, strip_z_coords=True)
1019
+ fc = convert_geojson_to_ee(batch_for_ee, enforce_wgs84=True, strip_z_coords=True)
985
1020
 
986
1021
  # If __row_id__ is in the original GeoDataFrame, it will be preserved
987
1022
  # as a feature property in the GeoJSON and thus in the EE FeatureCollection
@@ -1107,7 +1142,17 @@ def process_ee_batch(
1107
1142
  # Ensure plot_id_column is present for merging
1108
1143
  # It should come from the feature properties (added before EE processing)
1109
1144
  if plot_id_column not in df.columns:
1110
- df[plot_id_column] = range(len(df))
1145
+ logger.warning(
1146
+ f"Batch {batch_idx + 1}: plotId column DROPPED by EE. "
1147
+ f"Regenerating with 1-indexed range. "
1148
+ f"Columns from EE: {list(df.columns)}"
1149
+ )
1150
+ # Use 1-indexed range to match client-side assignment
1151
+ df[plot_id_column] = [str(i) for i in range(1, len(df) + 1)]
1152
+
1153
+ # Ensure plotId is string type (consistent with creation)
1154
+ if plot_id_column in df.columns:
1155
+ df[plot_id_column] = df[plot_id_column].astype(str)
1111
1156
 
1112
1157
  # Ensure all column names are strings (fixes pandas .str accessor issues)
1113
1158
  df.columns = df.columns.astype(str)
@@ -1231,12 +1276,15 @@ def whisp_stats_geojson_to_df_concurrent(
1231
1276
  # Validate endpoint
1232
1277
  validate_ee_endpoint("high-volume", raise_error=True)
1233
1278
 
1234
- # Load GeoJSON with output suppressed
1235
- gdf = _load_geojson_silently(input_geojson_filepath)
1279
+ # Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
1280
+ gdf = _load_and_prepare_geojson(
1281
+ input_geojson_filepath, external_id_column=external_id_column
1282
+ )
1236
1283
  logger.info(f"Loaded {len(gdf):,} features")
1237
1284
 
1238
- # Validate external_id_column if provided (lightweight client-side check)
1239
- if external_id_column and external_id_column not in gdf.columns:
1285
+ # Validate external_id if provided (lightweight client-side check)
1286
+ # Note: external_id_column already renamed to 'external_id' during load
1287
+ if external_id_column and "external_id" not in gdf.columns:
1240
1288
  # Exclude geometry column from available columns list
1241
1289
  available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
1242
1290
  raise ValueError(
@@ -1244,13 +1292,13 @@ def whisp_stats_geojson_to_df_concurrent(
1244
1292
  f"Available columns: {available_cols}"
1245
1293
  )
1246
1294
 
1247
- # Check completeness of external_id_column (warn if nulls exist)
1248
- if external_id_column and external_id_column in gdf.columns:
1249
- null_count = gdf[external_id_column].isna().sum()
1295
+ # Check completeness of external_id (warn if nulls exist)
1296
+ if external_id_column and "external_id" in gdf.columns:
1297
+ null_count = gdf["external_id"].isna().sum()
1250
1298
  if null_count > 0:
1251
1299
  null_pct = (null_count / len(gdf)) * 100
1252
1300
  logger.warning(
1253
- f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
1301
+ f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
1254
1302
  f"These features may have missing external IDs in output."
1255
1303
  )
1256
1304
 
@@ -1260,16 +1308,24 @@ def whisp_stats_geojson_to_df_concurrent(
1260
1308
  )
1261
1309
 
1262
1310
  # Add stable plotIds for merging (starting from 1, not 0)
1263
- gdf[plot_id_column] = range(1, len(gdf) + 1)
1311
+ gdf[plot_id_column] = [str(i) for i in range(1, len(gdf) + 1)]
1264
1312
 
1265
1313
  # Strip unnecessary properties before sending to EE
1266
- # Keep only: geometry, plot_id_column, and external_id_column
1314
+ # Keep only: geometry, plot_id_column, and external_id
1267
1315
  # This prevents duplication of GeoJSON properties in EE results
1268
1316
  keep_cols = ["geometry", plot_id_column]
1269
- if external_id_column and external_id_column in gdf.columns:
1270
- keep_cols.append(external_id_column)
1317
+ if (
1318
+ external_id_column and "external_id" in gdf.columns
1319
+ ): # Already renamed during load
1320
+ keep_cols.append("external_id")
1271
1321
 
1272
1322
  gdf_for_ee = gdf[keep_cols].copy()
1323
+
1324
+ # CRITICAL: Convert external_id to string (both plotId and external_id are now strings)
1325
+ if external_id_column and "external_id" in gdf_for_ee.columns:
1326
+ gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
1327
+ logger.debug(f"Converted external_id column to string type")
1328
+
1273
1329
  logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
1274
1330
 
1275
1331
  # Create image if not provided
@@ -1366,17 +1422,37 @@ def whisp_stats_geojson_to_df_concurrent(
1366
1422
 
1367
1423
  # Merge server and client results
1368
1424
  if plot_id_column not in df_server.columns:
1369
- df_server[plot_id_column] = range(len(df_server))
1425
+ logger.warning(
1426
+ f"Batch {batch_idx + 1} (concurrent merge): plotId DROPPED by EE. "
1427
+ f"Regenerating. Columns from EE: {list(df_server.columns)}"
1428
+ )
1429
+ df_server[plot_id_column] = pd.array(
1430
+ range(1, len(df_server) + 1), dtype="Int64"
1431
+ )
1432
+ else:
1433
+ df_server[plot_id_column] = df_server[plot_id_column].astype(
1434
+ str
1435
+ )
1436
+
1437
+ # Ensure plotId is string in client data too
1438
+ if plot_id_column in df_client.columns:
1439
+ df_client[plot_id_column] = df_client[plot_id_column].astype(
1440
+ str
1441
+ )
1370
1442
 
1371
1443
  # Keep all EE statistics from server (all columns with _sum and _median suffixes)
1372
1444
  # These are the actual EE processing results
1373
1445
  df_server_clean = df_server.copy()
1374
1446
 
1447
+ # Drop external_id from df_server if it exists (already in df_client)
1448
+ if "external_id" in df_server_clean.columns:
1449
+ df_server_clean = df_server_clean.drop(columns=["external_id"])
1450
+
1375
1451
  # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
1376
1452
  # (formatted wrapper handles keep_external_columns parameter)
1377
1453
  keep_external_columns = [plot_id_column]
1378
- if external_id_column and external_id_column in df_client.columns:
1379
- keep_external_columns.append(external_id_column)
1454
+ if external_id_column and "external_id" in df_client.columns:
1455
+ keep_external_columns.append("external_id")
1380
1456
  if "geometry" in df_client.columns:
1381
1457
  keep_external_columns.append("geometry")
1382
1458
  # Keep geometry type column (Geometry_type)
@@ -1522,7 +1598,10 @@ def whisp_stats_geojson_to_df_concurrent(
1522
1598
  try:
1523
1599
  batch_idx, df_server, df_client = future.result()
1524
1600
  if plot_id_column not in df_server.columns:
1525
- df_server[plot_id_column] = range(len(df_server))
1601
+ # Use 1-indexed range to match client-side assignment
1602
+ df_server[plot_id_column] = range(
1603
+ 1, len(df_server) + 1
1604
+ )
1526
1605
  merged = df_server.merge(
1527
1606
  df_client,
1528
1607
  on=plot_id_column,
@@ -1566,31 +1645,21 @@ def whisp_stats_geojson_to_df_concurrent(
1566
1645
  else:
1567
1646
  return pd.DataFrame()
1568
1647
 
1569
- # Clean up duplicate external_id columns created by merges
1570
- # Rename external_id_column to standardized 'external_id' for schema validation
1571
- if external_id_column:
1572
- # Find all columns related to external_id
1573
- external_id_variants = [
1648
+ # Clean up duplicate external_id columns created by merges (if any exist)
1649
+ # external_id was already renamed during load, so we just need to handle duplicates
1650
+ if external_id_column and "external_id" in combined.columns:
1651
+ # Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
1652
+ duplicate_variants = [
1574
1653
  col
1575
1654
  for col in combined.columns
1576
- if external_id_column.lower() in col.lower()
1655
+ if col != "external_id" and col.startswith("external_id_")
1577
1656
  ]
1578
1657
 
1579
- if external_id_variants:
1580
- # Use the base column name if it exists, otherwise use first variant
1581
- base_col = (
1582
- external_id_column
1583
- if external_id_column in combined.columns
1584
- else external_id_variants[0]
1658
+ if duplicate_variants:
1659
+ logger.debug(
1660
+ f"Dropping duplicate external_id columns: {duplicate_variants}"
1585
1661
  )
1586
-
1587
- # Rename to standardized 'external_id'
1588
- if base_col != "external_id":
1589
- combined = combined.rename(columns={base_col: "external_id"})
1590
-
1591
- # Drop all other variants
1592
- cols_to_drop = [c for c in external_id_variants if c != base_col]
1593
- combined = combined.drop(columns=cols_to_drop, errors="ignore")
1662
+ combined = combined.drop(columns=duplicate_variants, errors="ignore")
1594
1663
 
1595
1664
  # plotId column is already present from batch processing
1596
1665
  # Just ensure it's at position 0
@@ -1673,14 +1742,26 @@ def whisp_stats_geojson_to_df_concurrent(
1673
1742
  try:
1674
1743
  batch_idx, df_server, df_client = future.result()
1675
1744
  if plot_id_column not in df_server.columns:
1676
- df_server[plot_id_column] = range(len(df_server))
1677
-
1678
- # Drop external_id_column from df_client if it exists (already in df_server)
1679
- if (
1680
- external_id_column
1681
- and external_id_column in df_client.columns
1682
- ):
1683
- df_client = df_client.drop(columns=[external_id_column])
1745
+ logger.warning(
1746
+ f"Batch {batch_idx + 1} (retry): plotId DROPPED by EE. "
1747
+ f"Regenerating. Columns from EE: {list(df_server.columns)}"
1748
+ )
1749
+ # Use 1-indexed range to match client-side assignment
1750
+ df_server[plot_id_column] = range(1, len(df_server) + 1)
1751
+
1752
+ # Ensure plotId is string type (consistent with creation)
1753
+ if plot_id_column in df_server.columns:
1754
+ df_server[plot_id_column] = df_server[
1755
+ plot_id_column
1756
+ ].astype(str)
1757
+ if plot_id_column in df_client.columns:
1758
+ df_client[plot_id_column] = df_client[
1759
+ plot_id_column
1760
+ ].astype(str)
1761
+
1762
+ # Drop external_id from df_server if it exists (already in df_client)
1763
+ if "external_id" in df_server.columns:
1764
+ df_server = df_server.drop(columns=["external_id"])
1684
1765
 
1685
1766
  merged = df_server.merge(
1686
1767
  df_client,
@@ -1702,30 +1783,22 @@ def whisp_stats_geojson_to_df_concurrent(
1702
1783
  # Ensure all column names are strings (fixes pandas .str accessor issues later)
1703
1784
  combined.columns = combined.columns.astype(str)
1704
1785
 
1705
- # Clean up duplicate external_id columns created by merges
1706
- if external_id_column:
1707
- external_id_variants = [
1786
+ # Clean up duplicate external_id columns created by merges (if any exist)
1787
+ # external_id was already renamed during load, so we just need to handle duplicates
1788
+ if external_id_column and "external_id" in combined.columns:
1789
+ # Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
1790
+ duplicate_variants = [
1708
1791
  col
1709
1792
  for col in combined.columns
1710
- if external_id_column.lower() in col.lower()
1793
+ if col != "external_id" and col.startswith("external_id_")
1711
1794
  ]
1712
1795
 
1713
- if external_id_variants:
1714
- base_col = external_id_column
1715
- if (
1716
- base_col not in combined.columns
1717
- and external_id_variants
1718
- ):
1719
- base_col = external_id_variants[0]
1720
- combined = combined.rename(
1721
- columns={base_col: "external_id"}
1722
- )
1723
-
1724
- cols_to_drop = [
1725
- c for c in external_id_variants if c != base_col
1726
- ]
1796
+ if duplicate_variants:
1797
+ logger.debug(
1798
+ f"Dropping duplicate external_id columns: {duplicate_variants}"
1799
+ )
1727
1800
  combined = combined.drop(
1728
- columns=cols_to_drop, errors="ignore"
1801
+ columns=duplicate_variants, errors="ignore"
1729
1802
  )
1730
1803
 
1731
1804
  # plotId column is already present, just ensure it's at position 0
@@ -1769,6 +1842,14 @@ def whisp_stats_geojson_to_df_concurrent(
1769
1842
  )
1770
1843
  raise retry_e
1771
1844
 
1845
+ # Ensure plot_id is present (should already be there from batch processing)
1846
+ if plot_id_column not in formatted.columns:
1847
+ logger.warning(f"{plot_id_column} column missing, regenerating...")
1848
+ formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
1849
+
1850
+ # Sort by plot_id to ensure consistent output order
1851
+ formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
1852
+
1772
1853
  logger.info(f"Processing complete: {len(formatted):,} features")
1773
1854
  return formatted
1774
1855
  else:
@@ -1843,12 +1924,15 @@ def whisp_stats_geojson_to_df_sequential(
1843
1924
  # Validate endpoint
1844
1925
  validate_ee_endpoint("standard", raise_error=True)
1845
1926
 
1846
- # Load GeoJSON with output suppressed
1847
- gdf = _load_geojson_silently(input_geojson_filepath)
1927
+ # Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
1928
+ gdf = _load_and_prepare_geojson(
1929
+ input_geojson_filepath, external_id_column=external_id_column
1930
+ )
1848
1931
  logger.info(f"Loaded {len(gdf):,} features")
1849
1932
 
1850
- # Validate external_id_column if provided (lightweight client-side check)
1851
- if external_id_column and external_id_column not in gdf.columns:
1933
+ # Validate external_id if provided (lightweight client-side check)
1934
+ # Note: external_id_column already renamed to 'external_id' during load
1935
+ if external_id_column and "external_id" not in gdf.columns:
1852
1936
  # Exclude geometry column from available columns list
1853
1937
  available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
1854
1938
  raise ValueError(
@@ -1856,13 +1940,13 @@ def whisp_stats_geojson_to_df_sequential(
1856
1940
  f"Available columns: {available_cols}"
1857
1941
  )
1858
1942
 
1859
- # Check completeness of external_id_column (warn if nulls exist)
1860
- if external_id_column and external_id_column in gdf.columns:
1861
- null_count = gdf[external_id_column].isna().sum()
1943
+ # Check completeness of external_id (warn if nulls exist)
1944
+ if external_id_column and "external_id" in gdf.columns:
1945
+ null_count = gdf["external_id"].isna().sum()
1862
1946
  if null_count > 0:
1863
1947
  null_pct = (null_count / len(gdf)) * 100
1864
1948
  logger.warning(
1865
- f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
1949
+ f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
1866
1950
  f"These features may have missing external IDs in output."
1867
1951
  )
1868
1952
 
@@ -1872,20 +1956,24 @@ def whisp_stats_geojson_to_df_sequential(
1872
1956
  )
1873
1957
 
1874
1958
  # Add stable plotIds for merging (starting from 1, not 0)
1875
- gdf[plot_id_column] = range(1, len(gdf) + 1)
1876
-
1877
- # Add stable row IDs
1878
- row_id_col = "__row_id__"
1879
- gdf[row_id_col] = range(len(gdf))
1959
+ gdf[plot_id_column] = [str(i) for i in range(1, len(gdf) + 1)]
1880
1960
 
1881
1961
  # Strip unnecessary properties before sending to EE
1882
- # Keep only: geometry, plot_id_column, and external_id_column
1962
+ # Keep only: geometry, plot_id_column, and external_id
1883
1963
  # This prevents duplication of GeoJSON properties in EE results
1884
- keep_cols = ["geometry", plot_id_column, row_id_col]
1885
- if external_id_column and external_id_column in gdf.columns:
1886
- keep_cols.append(external_id_column)
1964
+ keep_cols = ["geometry", plot_id_column]
1965
+ if (
1966
+ external_id_column and "external_id" in gdf.columns
1967
+ ): # Already renamed during load
1968
+ keep_cols.append("external_id")
1887
1969
 
1888
1970
  gdf_for_ee = gdf[keep_cols].copy()
1971
+
1972
+ # CRITICAL: Convert external_id to string (both plotId and external_id are now strings)
1973
+ if external_id_column and "external_id" in gdf_for_ee.columns:
1974
+ gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
1975
+ logger.debug(f"Converted external_id column to string type")
1976
+
1889
1977
  logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
1890
1978
 
1891
1979
  # Create image if not provided
@@ -1907,10 +1995,19 @@ def whisp_stats_geojson_to_df_sequential(
1907
1995
  national_codes=national_codes, validate_bands=True
1908
1996
  )
1909
1997
 
1998
+ # Drop external_id before sending to EE to enable caching
1999
+ # (external_id is preserved separately in gdf for client-side merging)
2000
+ gdf_for_ee_clean = gdf_for_ee.copy()
2001
+ if "external_id" in gdf_for_ee_clean.columns:
2002
+ gdf_for_ee_clean = gdf_for_ee_clean.drop(columns=["external_id"])
2003
+ logger.debug("Dropped external_id from data sent to EE (enables caching)")
2004
+
1910
2005
  # Convert to EE (suppress print statements from convert_geojson_to_ee)
1911
2006
  logger.debug("Converting to EE FeatureCollection...")
1912
2007
  with redirect_stdout(io.StringIO()):
1913
- fc = convert_geojson_to_ee(gdf_for_ee, enforce_wgs84=True, strip_z_coords=True)
2008
+ fc = convert_geojson_to_ee(
2009
+ gdf_for_ee_clean, enforce_wgs84=True, strip_z_coords=True
2010
+ )
1914
2011
 
1915
2012
  # Create reducer
1916
2013
  reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
@@ -1950,11 +2047,11 @@ def whisp_stats_geojson_to_df_sequential(
1950
2047
  else:
1951
2048
  raise
1952
2049
 
1953
- logger.debug("Server-side processing complete")
2050
+ logger.info("Server-side processing complete")
1954
2051
 
1955
- # Add row_id if missing
1956
- if row_id_col not in df_server.columns:
1957
- df_server[row_id_col] = range(len(df_server))
2052
+ # Ensure plotId is string type for consistent merges
2053
+ if plot_id_column in df_server.columns:
2054
+ df_server[plot_id_column] = df_server[plot_id_column].astype(str)
1958
2055
 
1959
2056
  # Add client-side metadata if requested
1960
2057
  if add_metadata_client_side:
@@ -1965,21 +2062,21 @@ def whisp_stats_geojson_to_df_sequential(
1965
2062
  return_attributes_only=True,
1966
2063
  )
1967
2064
 
1968
- # Drop external_id_column from df_client if it exists (already in df_server)
1969
- if external_id_column and external_id_column in df_client.columns:
1970
- df_client = df_client.drop(columns=[external_id_column])
2065
+ # Ensure plotId is string type for consistent merges
2066
+ if plot_id_column in df_client.columns:
2067
+ df_client[plot_id_column] = df_client[plot_id_column].astype(str)
2068
+
2069
+ # Drop external_id from df_server if it exists (keep from df_client - more reliable)
2070
+ if "external_id" in df_server.columns:
2071
+ df_server = df_server.drop(columns=["external_id"])
1971
2072
 
1972
- # Merge
2073
+ # Merge on plotId (same strategy as concurrent mode)
1973
2074
  result = df_server.merge(
1974
- df_client, on=row_id_col, how="left", suffixes=("", "_client")
2075
+ df_client, on=plot_id_column, how="left", suffixes=("", "_client")
1975
2076
  )
1976
2077
  else:
1977
2078
  result = df_server
1978
2079
 
1979
- # Remove internal __row_id__ column if present
1980
- if row_id_col in result.columns:
1981
- result = result.drop(columns=[row_id_col])
1982
-
1983
2080
  # Format the output
1984
2081
  # Add admin context (Country, ProducerCountry, Admin_Level_1) from admin_code
1985
2082
  # MUST be done BEFORE formatting (which removes _median columns)
@@ -2004,27 +2101,14 @@ def whisp_stats_geojson_to_df_sequential(
2004
2101
  convert_water_flag=True,
2005
2102
  )
2006
2103
 
2104
+ # Ensure plot_id exists and sort by it
2105
+ if plot_id_column not in formatted.columns:
2106
+ formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
2107
+ formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
2108
+
2007
2109
  logger.info(f"Processing complete: {len(formatted):,} features")
2008
2110
 
2009
- # Consolidate external_id_column to standardized 'external_id'
2010
- if external_id_column:
2011
- variants = [
2012
- col
2013
- for col in formatted.columns
2014
- if external_id_column.lower() in col.lower()
2015
- ]
2016
- if variants:
2017
- base_col = (
2018
- external_id_column
2019
- if external_id_column in formatted.columns
2020
- else variants[0]
2021
- )
2022
- if base_col != "external_id":
2023
- formatted = formatted.rename(columns={base_col: "external_id"})
2024
- # Drop other variants
2025
- formatted = formatted.drop(
2026
- columns=[c for c in variants if c != base_col], errors="ignore"
2027
- )
2111
+ # external_id_column already renamed to 'external_id' during load - no action needed here
2028
2112
 
2029
2113
  return formatted
2030
2114
 
@@ -2130,7 +2214,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
2130
2214
  gdf_original_geoms = None
2131
2215
  if geometry_audit_trail:
2132
2216
  logger.debug("Pre-loading GeoJSON for geometry audit trail...")
2133
- gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
2217
+ gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
2134
2218
 
2135
2219
  # Step 1: Get raw stats
2136
2220
  logger.debug("Step 1/2: Extracting statistics (concurrent)...")
@@ -2199,7 +2283,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
2199
2283
  # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
2200
2284
  if gdf_original_geoms is None:
2201
2285
  logger.warning("Original geometries not pre-loaded, loading now...")
2202
- gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
2286
+ gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
2203
2287
 
2204
2288
  # Use plotId from df_validated to maintain mapping
2205
2289
  df_original_geom = pd.DataFrame(
@@ -2331,7 +2415,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
2331
2415
  gdf_original_geoms = None
2332
2416
  if geometry_audit_trail:
2333
2417
  logger.debug("Pre-loading GeoJSON for geometry audit trail...")
2334
- gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
2418
+ gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
2335
2419
 
2336
2420
  # Step 1: Get raw stats
2337
2421
  logger.debug("Step 1/2: Extracting statistics (sequential)...")
@@ -2395,7 +2479,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
2395
2479
  # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
2396
2480
  if gdf_original_geoms is None:
2397
2481
  logger.warning("Original geometries not pre-loaded, loading now...")
2398
- gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
2482
+ gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
2399
2483
 
2400
2484
  # Use plotId from df_validated to maintain mapping
2401
2485
  df_original_geom = pd.DataFrame(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: openforis-whisp
3
- Version: 3.0.0a4
3
+ Version: 3.0.0a6
4
4
  Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
5
5
  License: MIT
6
6
  Keywords: whisp,geospatial,data-processing
@@ -1,5 +1,5 @@
1
1
  openforis_whisp/__init__.py,sha256=5zJK84LYnlslxSajdCz6ZIYxRS4xgN3sGxSD6_GXEHs,3547
2
- openforis_whisp/advanced_stats.py,sha256=FC1YasSZ93jplF1qBgDopzBIsO2ueXnidomQU3rpP_Q,100006
2
+ openforis_whisp/advanced_stats.py,sha256=9YOGCkPAPKAS02kUVpJLl8Npr0yMYbFgAKakiqg9dow,105038
3
3
  openforis_whisp/data_checks.py,sha256=ErIKGbCa3R8eYP0sVoAl-ZUl607W1QrG0Jr2SIVgm2I,34056
4
4
  openforis_whisp/data_conversion.py,sha256=L2IsiUyQUt3aHgSYGbIhgPGwM7eyS3nLVEoNO9YqQeM,21888
5
5
  openforis_whisp/datasets.py,sha256=F1WxXc93mxxmN-WHa0bf-XX-FloSQyEBJKmnrQEHYn8,53855
@@ -14,7 +14,7 @@ openforis_whisp/reformat.py,sha256=gvhIa-_kTT5BSO8LuVmJ1TQcf_NwheskXboFM9e0KJY,3
14
14
  openforis_whisp/risk.py,sha256=d_Di5XB8BnHdVXG56xdHTcpB4-CIF5vo2ZRMQRG7Pek,34420
15
15
  openforis_whisp/stats.py,sha256=pTSYs77ISRBOIglRpq4SUx3lKRkrUZOKROLRX5IP9IY,63941
16
16
  openforis_whisp/utils.py,sha256=AISWF-MpfFdYkhd6bei4BViw2Iag20mmq61ykrF9YTk,31287
17
- openforis_whisp-3.0.0a4.dist-info/LICENSE,sha256=nqyqICO95iw_iwzP1t_IIAf7ZX3DPbL_M9WyQfh2q1k,1085
18
- openforis_whisp-3.0.0a4.dist-info/METADATA,sha256=ak2Dw632lgOtXEXkl5-haYK7vF3hPaJ6IkaRRJRdH0Y,16684
19
- openforis_whisp-3.0.0a4.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
20
- openforis_whisp-3.0.0a4.dist-info/RECORD,,
17
+ openforis_whisp-3.0.0a6.dist-info/LICENSE,sha256=nqyqICO95iw_iwzP1t_IIAf7ZX3DPbL_M9WyQfh2q1k,1085
18
+ openforis_whisp-3.0.0a6.dist-info/METADATA,sha256=MHbnrfjkEo-wcvu4G9k7Gqat_KSZ4UaitFuoRahT6no,16684
19
+ openforis_whisp-3.0.0a6.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
20
+ openforis_whisp-3.0.0a6.dist-info/RECORD,,