openforis-whisp 3.0.0a4-py3-none-any.whl → 3.0.0a5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openforis_whisp/advanced_stats.py

@@ -181,8 +181,25 @@ def _suppress_verbose_output(max_concurrent: int = None):
      reformat_logger.setLevel(logging.ERROR)


- def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
-     """Load GeoJSON file with all output suppressed."""
+ def _load_and_prepare_geojson(
+     filepath: str, external_id_column: Optional[str] = None
+ ) -> gpd.GeoDataFrame:
+     """Load GeoJSON file and prepare for processing.
+
+     Suppresses logging output and optionally renames external_id column.
+
+     Parameters
+     ----------
+     filepath : str
+         Path to GeoJSON file
+     external_id_column : str, optional
+         If provided, rename this column to 'external_id' immediately after loading
+
+     Returns
+     -------
+     gpd.GeoDataFrame
+         Loaded GeoDataFrame with external_id renamed if specified
+     """
      fiona_logger = logging.getLogger("fiona")
      pyogrio_logger = logging.getLogger("pyogrio._io")
      old_fiona_level = fiona_logger.level
@@ -193,6 +210,16 @@ def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
      try:
          with redirect_stdout(io.StringIO()):
              gdf = gpd.read_file(filepath)
+
+             # Rename external_id column early and convert to string
+             if external_id_column and external_id_column in gdf.columns:
+                 if external_id_column != "external_id":
+                     gdf = gdf.rename(
+                         columns={external_id_column: "external_id"}
+                     )  # hard coding here to avoid confusion later
+                 # Convert to string to ensure consistent type throughout pipeline
+                 gdf["external_id"] = gdf["external_id"].astype(str)
+
              return gdf
      finally:
          fiona_logger.setLevel(old_fiona_level)
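The helper above is a reusable pattern: raise the fiona/pyogrio logger levels, swallow stdout for the read, then normalize the ID column. A minimal standalone sketch of the same idea (not the package's actual helper; fields.geojson and farm_ref are hypothetical inputs):

    import io
    import logging
    from contextlib import redirect_stdout

    import geopandas as gpd

    def load_quietly(filepath, id_col=None):
        # Temporarily silence driver chatter from fiona/pyogrio
        loggers = [logging.getLogger("fiona"), logging.getLogger("pyogrio._io")]
        old_levels = [lg.level for lg in loggers]
        for lg in loggers:
            lg.setLevel(logging.ERROR)
        try:
            with redirect_stdout(io.StringIO()):
                gdf = gpd.read_file(filepath)
            if id_col and id_col in gdf.columns:
                gdf = gdf.rename(columns={id_col: "external_id"})
                gdf["external_id"] = gdf["external_id"].astype(str)
            return gdf
        finally:
            # Always restore the original log levels
            for lg, lvl in zip(loggers, old_levels):
                lg.setLevel(lvl)

    gdf = load_quietly("fields.geojson", id_col="farm_ref")  # hypothetical file and column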
@@ -780,19 +807,17 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
      if not check_ee_endpoint(endpoint_type):
          if endpoint_type == "high-volume":
              msg = (
-                 "Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
+                 "# Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
                  "ee.Reset()\n"
-                 "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
-                 "Or with project specified (e.g. when in Colab):\n"
-                 "ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
+                 "ee.Initialize(project=gee_project_name, opt_url='https://earthengine-highvolume.googleapis.com')\n"
+                 "# where gee_project_name is your GEE project (necessary in Colab)"
              )
          else:  # standard endpoint
              msg = (
                  "Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
                  "ee.Reset()\n"
-                 "ee.Initialize()\n"
-                 "Or with project specified (e.g. when in Colab):\n"
-                 "ee.Initialize(project='your_cloud_project_name')"
+                 "ee.Initialize(project=gee_project_name)\n"
+                 "# where gee_project_name is your GEE project (necessary in Colab)"
              )

          if raise_error:
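The reworded messages double as copy-pasteable guidance. A sketch of the high-volume switch they describe, with my-gee-project as a placeholder project ID:

    import ee

    ee.Reset()  # drop the current session before re-initializing
    ee.Initialize(
        project="my-gee-project",  # placeholder; a project is required in Colab
        opt_url="https://earthengine-highvolume.googleapis.com",
    )

Switching back to the standard endpoint is the same call without opt_url.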
@@ -865,13 +890,13 @@ def extract_centroid_and_geomtype_client(
      if plot_id_column in gdf.columns:
          cols.append(plot_id_column)

-     # Include external_id_column if provided and exists
+     # Include external_id if it exists (already renamed during load)
      if (
          external_id_column
-         and external_id_column in gdf.columns
-         and external_id_column not in cols
+         and "external_id" in gdf.columns
+         and "external_id" not in cols
      ):
-         cols.append(external_id_column)
+         cols.append("external_id")

      # Always include metadata columns (centroid, geometry type)
      cols.extend([x_col, y_col, type_col])
@@ -965,6 +990,10 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:

      Preserves the __row_id__ column if present so it can be retrieved after processing.

+     IMPORTANT: Drops external_id column before sending to EE to enable query caching.
+     external_id is user metadata that's not needed for EE computation. Including it
+     breaks EE's caching mechanism since each unique external_id creates a different query.
+
      Parameters
      ----------
      batch_gdf : gpd.GeoDataFrame
@@ -973,15 +1002,21 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
      Returns
      -------
      ee.FeatureCollection
-         EE FeatureCollection with __row_id__ as a feature property
+         EE FeatureCollection with __row_id__ as a feature property (no external_id)
      """
+     # Drop external_id before sending to EE to enable caching
+     # (external_id is preserved separately on client side for merging)
+     batch_for_ee = batch_gdf.copy()
+     if "external_id" in batch_for_ee.columns:
+         batch_for_ee = batch_for_ee.drop(columns=["external_id"])
+
      # Pass GeoDataFrame directly to preserve CRS metadata
      # convert_geojson_to_ee will handle:
      # - CRS detection and conversion to WGS84 if needed
      # - Data type sanitization (datetime, object columns)
      # - Geometry validation and Z-coordinate stripping

-     fc = convert_geojson_to_ee(batch_gdf, enforce_wgs84=True, strip_z_coords=True)
+     fc = convert_geojson_to_ee(batch_for_ee, enforce_wgs84=True, strip_z_coords=True)

      # If __row_id__ is in the original GeoDataFrame, it will be preserved
      # as a feature property in the GeoJSON and thus in the EE FeatureCollection
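The caching note rests on EE treating each distinct request payload as a new computation: a per-feature unique property such as external_id makes otherwise identical batches differ, so repeated runs can never hit the cache. A minimal sketch of the drop-before-convert step (strip_for_ee is a hypothetical name, not the package API):

    import geopandas as gpd

    def strip_for_ee(batch_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        # external_id is client-side metadata; removing it lets repeated
        # geometries produce identical EE requests that EE can cache
        if "external_id" in batch_gdf.columns:
            return batch_gdf.drop(columns=["external_id"])
        return batch_gdf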
@@ -1107,7 +1142,19 @@ def process_ee_batch(
      # Ensure plot_id_column is present for merging
      # It should come from the feature properties (added before EE processing)
      if plot_id_column not in df.columns:
-         df[plot_id_column] = range(len(df))
+         logger.warning(
+             f"Batch {batch_idx + 1}: plotId column DROPPED by EE. "
+             f"Regenerating with 1-indexed range. "
+             f"Columns from EE: {list(df.columns)}"
+         )
+         # Use 1-indexed range to match client-side assignment
+         df[plot_id_column] = range(1, len(df) + 1)
+
+     # Ensure plotId is integer type (EE may return as string)
+     if plot_id_column in df.columns:
+         df[plot_id_column] = pd.to_numeric(
+             df[plot_id_column], errors="coerce"
+         ).astype("Int64")

      # Ensure all column names are strings (fixes pandas .str accessor issues)
      df.columns = df.columns.astype(str)
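The coercion added here leans on pandas' nullable integer dtype: pd.to_numeric(..., errors="coerce") turns strings into numbers (and unparseable values into NaN), and .astype("Int64") keeps the column integer-typed with <NA> instead of forcing a float64 upcast. A standalone check:

    import pandas as pd

    df = pd.DataFrame({"plotId": ["1", "2", None, "oops"]})
    df["plotId"] = pd.to_numeric(df["plotId"], errors="coerce").astype("Int64")
    print(df["plotId"].tolist())  # [1, 2, <NA>, <NA>]
    print(df["plotId"].dtype)     # Int64 (nullable), not float64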
@@ -1231,12 +1278,15 @@ def whisp_stats_geojson_to_df_concurrent(
      # Validate endpoint
      validate_ee_endpoint("high-volume", raise_error=True)

-     # Load GeoJSON with output suppressed
-     gdf = _load_geojson_silently(input_geojson_filepath)
+     # Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
+     gdf = _load_and_prepare_geojson(
+         input_geojson_filepath, external_id_column=external_id_column
+     )
      logger.info(f"Loaded {len(gdf):,} features")

-     # Validate external_id_column if provided (lightweight client-side check)
-     if external_id_column and external_id_column not in gdf.columns:
+     # Validate external_id if provided (lightweight client-side check)
+     # Note: external_id_column already renamed to 'external_id' during load
+     if external_id_column and "external_id" not in gdf.columns:
          # Exclude geometry column from available columns list
          available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
          raise ValueError(
@@ -1244,13 +1294,13 @@
              f"Available columns: {available_cols}"
          )

-     # Check completeness of external_id_column (warn if nulls exist)
-     if external_id_column and external_id_column in gdf.columns:
-         null_count = gdf[external_id_column].isna().sum()
+     # Check completeness of external_id (warn if nulls exist)
+     if external_id_column and "external_id" in gdf.columns:
+         null_count = gdf["external_id"].isna().sum()
          if null_count > 0:
              null_pct = (null_count / len(gdf)) * 100
              logger.warning(
-                 f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                 f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
                  f"These features may have missing external IDs in output."
              )

@@ -1263,13 +1313,21 @@
      gdf[plot_id_column] = range(1, len(gdf) + 1)

      # Strip unnecessary properties before sending to EE
-     # Keep only: geometry, plot_id_column, and external_id_column
+     # Keep only: geometry, plot_id_column, and external_id
      # This prevents duplication of GeoJSON properties in EE results
      keep_cols = ["geometry", plot_id_column]
-     if external_id_column and external_id_column in gdf.columns:
-         keep_cols.append(external_id_column)
+     if (
+         external_id_column and "external_id" in gdf.columns
+     ):  # Already renamed during load
+         keep_cols.append("external_id")

      gdf_for_ee = gdf[keep_cols].copy()
+
+     # CRITICAL: Convert external_id to string to prevent EE from confusing it with integer plotId
+     if external_id_column and "external_id" in gdf_for_ee.columns:
+         gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
+         logger.debug(f"Converted external_id column to string type")
+
      logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")

      # Create image if not provided
@@ -1366,17 +1424,37 @@

          # Merge server and client results
          if plot_id_column not in df_server.columns:
-             df_server[plot_id_column] = range(len(df_server))
+             logger.warning(
+                 f"Batch {batch_idx + 1} (concurrent merge): plotId DROPPED by EE. "
+                 f"Regenerating. Columns from EE: {list(df_server.columns)}"
+             )
+             df_server[plot_id_column] = pd.array(
+                 range(1, len(df_server) + 1), dtype="Int64"
+             )
+         else:
+             df_server[plot_id_column] = pd.to_numeric(
+                 df_server[plot_id_column], errors="coerce"
+             ).astype("Int64")
+
+         # Ensure plotId is Int64 in client data too
+         if plot_id_column in df_client.columns:
+             df_client[plot_id_column] = pd.to_numeric(
+                 df_client[plot_id_column], errors="coerce"
+             ).astype("Int64")

          # Keep all EE statistics from server (all columns with _sum and _median suffixes)
          # These are the actual EE processing results
          df_server_clean = df_server.copy()

+         # Drop external_id from df_server if it exists (already in df_client)
+         if "external_id" in df_server_clean.columns:
+             df_server_clean = df_server_clean.drop(columns=["external_id"])
+
          # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
          # (formatted wrapper handles keep_external_columns parameter)
          keep_external_columns = [plot_id_column]
-         if external_id_column and external_id_column in df_client.columns:
-             keep_external_columns.append(external_id_column)
+         if external_id_column and "external_id" in df_client.columns:
+             keep_external_columns.append("external_id")
          if "geometry" in df_client.columns:
              keep_external_columns.append("geometry")
          # Keep geometry type column (Geometry_type)
@@ -1522,7 +1600,10 @@
                      try:
                          batch_idx, df_server, df_client = future.result()
                          if plot_id_column not in df_server.columns:
-                             df_server[plot_id_column] = range(len(df_server))
+                             # Use 1-indexed range to match client-side assignment
+                             df_server[plot_id_column] = range(
+                                 1, len(df_server) + 1
+                             )
                          merged = df_server.merge(
                              df_client,
                              on=plot_id_column,
@@ -1566,31 +1647,21 @@
          else:
              return pd.DataFrame()

-         # Clean up duplicate external_id columns created by merges
-         # Rename external_id_column to standardized 'external_id' for schema validation
-         if external_id_column:
-             # Find all columns related to external_id
-             external_id_variants = [
+         # Clean up duplicate external_id columns created by merges (if any exist)
+         # external_id was already renamed during load, so we just need to handle duplicates
+         if external_id_column and "external_id" in combined.columns:
+             # Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
+             duplicate_variants = [
                  col
                  for col in combined.columns
-                 if external_id_column.lower() in col.lower()
+                 if col != "external_id" and col.startswith("external_id_")
              ]

-             if external_id_variants:
-                 # Use the base column name if it exists, otherwise use first variant
-                 base_col = (
-                     external_id_column
-                     if external_id_column in combined.columns
-                     else external_id_variants[0]
+             if duplicate_variants:
+                 logger.debug(
+                     f"Dropping duplicate external_id columns: {duplicate_variants}"
                  )
-
-                 # Rename to standardized 'external_id'
-                 if base_col != "external_id":
-                     combined = combined.rename(columns={base_col: "external_id"})
-
-                 # Drop all other variants
-                 cols_to_drop = [c for c in external_id_variants if c != base_col]
-                 combined = combined.drop(columns=cols_to_drop, errors="ignore")
+                 combined = combined.drop(columns=duplicate_variants, errors="ignore")

          # plotId column is already present from batch processing
          # Just ensure it's at position 0
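The narrowed filter targets only pandas merge suffixes rather than any column whose name contains the substring, which the old .lower() containment check could over-match. A quick demonstration (assuming the user's column was literally named external_id):

    import pandas as pd

    combined = pd.DataFrame(
        columns=["plotId", "external_id", "external_id_x",
                 "external_id_client", "external_identifier"]
    )
    duplicates = [
        c for c in combined.columns
        if c != "external_id" and c.startswith("external_id_")
    ]
    print(duplicates)  # ['external_id_x', 'external_id_client']
    # 'external_identifier' now survives; the old substring check would have caught it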
@@ -1673,14 +1744,26 @@
                      try:
                          batch_idx, df_server, df_client = future.result()
                          if plot_id_column not in df_server.columns:
-                             df_server[plot_id_column] = range(len(df_server))
-
-                         # Drop external_id_column from df_client if it exists (already in df_server)
-                         if (
-                             external_id_column
-                             and external_id_column in df_client.columns
-                         ):
-                             df_client = df_client.drop(columns=[external_id_column])
+                             logger.warning(
+                                 f"Batch {batch_idx + 1} (retry): plotId DROPPED by EE. "
+                                 f"Regenerating. Columns from EE: {list(df_server.columns)}"
+                             )
+                             # Use 1-indexed range to match client-side assignment
+                             df_server[plot_id_column] = range(1, len(df_server) + 1)
+
+                         # Ensure plotId is integer type (EE may return as string)
+                         if plot_id_column in df_server.columns:
+                             df_server[plot_id_column] = pd.to_numeric(
+                                 df_server[plot_id_column], errors="coerce"
+                             ).astype("Int64")
+                         if plot_id_column in df_client.columns:
+                             df_client[plot_id_column] = pd.to_numeric(
+                                 df_client[plot_id_column], errors="coerce"
+                             ).astype("Int64")
+
+                         # Drop external_id from df_server if it exists (already in df_client)
+                         if "external_id" in df_server.columns:
+                             df_server = df_server.drop(columns=["external_id"])

                          merged = df_server.merge(
                              df_client,
@@ -1702,30 +1785,22 @@
                      # Ensure all column names are strings (fixes pandas .str accessor issues later)
                      combined.columns = combined.columns.astype(str)

-                     # Clean up duplicate external_id columns created by merges
-                     if external_id_column:
-                         external_id_variants = [
+                     # Clean up duplicate external_id columns created by merges (if any exist)
+                     # external_id was already renamed during load, so we just need to handle duplicates
+                     if external_id_column and "external_id" in combined.columns:
+                         # Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
+                         duplicate_variants = [
                              col
                              for col in combined.columns
-                             if external_id_column.lower() in col.lower()
+                             if col != "external_id" and col.startswith("external_id_")
                          ]

-                         if external_id_variants:
-                             base_col = external_id_column
-                             if (
-                                 base_col not in combined.columns
-                                 and external_id_variants
-                             ):
-                                 base_col = external_id_variants[0]
-                             combined = combined.rename(
-                                 columns={base_col: "external_id"}
-                             )
-
-                             cols_to_drop = [
-                                 c for c in external_id_variants if c != base_col
-                             ]
+                         if duplicate_variants:
+                             logger.debug(
+                                 f"Dropping duplicate external_id columns: {duplicate_variants}"
+                             )
                              combined = combined.drop(
-                                 columns=cols_to_drop, errors="ignore"
+                                 columns=duplicate_variants, errors="ignore"
                              )

                      # plotId column is already present, just ensure it's at position 0
@@ -1769,6 +1844,14 @@
                  )
              raise retry_e

+     # Ensure plot_id is present (should already be there from batch processing)
+     if plot_id_column not in formatted.columns:
+         logger.warning(f"{plot_id_column} column missing, regenerating...")
+         formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
+
+     # Sort by plot_id to ensure consistent output order
+     formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+
      logger.info(f"Processing complete: {len(formatted):,} features")
      return formatted
  else:
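The guard-plus-sort added above makes output order deterministic even when concurrent batches complete out of order. The two steps on a toy frame:

    import pandas as pd

    formatted = pd.DataFrame({"plotId": [3, 1, 2], "Area": [0.3, 0.1, 0.2]})
    if "plotId" not in formatted.columns:  # fallback if EE dropped the column
        formatted.insert(0, "plotId", range(1, len(formatted) + 1))
    formatted = formatted.sort_values(by="plotId").reset_index(drop=True)
    print(formatted["plotId"].tolist())  # [1, 2, 3]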
@@ -1843,12 +1926,15 @@ def whisp_stats_geojson_to_df_sequential(
      # Validate endpoint
      validate_ee_endpoint("standard", raise_error=True)

-     # Load GeoJSON with output suppressed
-     gdf = _load_geojson_silently(input_geojson_filepath)
+     # Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
+     gdf = _load_and_prepare_geojson(
+         input_geojson_filepath, external_id_column=external_id_column
+     )
      logger.info(f"Loaded {len(gdf):,} features")

-     # Validate external_id_column if provided (lightweight client-side check)
-     if external_id_column and external_id_column not in gdf.columns:
+     # Validate external_id if provided (lightweight client-side check)
+     # Note: external_id_column already renamed to 'external_id' during load
+     if external_id_column and "external_id" not in gdf.columns:
          # Exclude geometry column from available columns list
          available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
          raise ValueError(
@@ -1856,13 +1942,13 @@
              f"Available columns: {available_cols}"
          )

-     # Check completeness of external_id_column (warn if nulls exist)
-     if external_id_column and external_id_column in gdf.columns:
-         null_count = gdf[external_id_column].isna().sum()
+     # Check completeness of external_id (warn if nulls exist)
+     if external_id_column and "external_id" in gdf.columns:
+         null_count = gdf["external_id"].isna().sum()
          if null_count > 0:
              null_pct = (null_count / len(gdf)) * 100
              logger.warning(
-                 f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                 f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
                  f"These features may have missing external IDs in output."
              )

@@ -1874,18 +1960,22 @@
      # Add stable plotIds for merging (starting from 1, not 0)
      gdf[plot_id_column] = range(1, len(gdf) + 1)

-     # Add stable row IDs
-     row_id_col = "__row_id__"
-     gdf[row_id_col] = range(len(gdf))
-
      # Strip unnecessary properties before sending to EE
-     # Keep only: geometry, plot_id_column, and external_id_column
+     # Keep only: geometry, plot_id_column, and external_id
      # This prevents duplication of GeoJSON properties in EE results
-     keep_cols = ["geometry", plot_id_column, row_id_col]
-     if external_id_column and external_id_column in gdf.columns:
-         keep_cols.append(external_id_column)
+     keep_cols = ["geometry", plot_id_column]
+     if (
+         external_id_column and "external_id" in gdf.columns
+     ):  # Already renamed during load
+         keep_cols.append("external_id")

      gdf_for_ee = gdf[keep_cols].copy()
+
+     # CRITICAL: Convert external_id to string to prevent EE from confusing it with integer plotId
+     if external_id_column and "external_id" in gdf_for_ee.columns:
+         gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
+         logger.debug(f"Converted external_id column to string type")
+
      logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")

      # Create image if not provided
@@ -1907,10 +1997,19 @@
              national_codes=national_codes, validate_bands=True
          )

+     # Drop external_id before sending to EE to enable caching
+     # (external_id is preserved separately in gdf for client-side merging)
+     gdf_for_ee_clean = gdf_for_ee.copy()
+     if "external_id" in gdf_for_ee_clean.columns:
+         gdf_for_ee_clean = gdf_for_ee_clean.drop(columns=["external_id"])
+         logger.debug("Dropped external_id from data sent to EE (enables caching)")
+
      # Convert to EE (suppress print statements from convert_geojson_to_ee)
      logger.debug("Converting to EE FeatureCollection...")
      with redirect_stdout(io.StringIO()):
-         fc = convert_geojson_to_ee(gdf_for_ee, enforce_wgs84=True, strip_z_coords=True)
+         fc = convert_geojson_to_ee(
+             gdf_for_ee_clean, enforce_wgs84=True, strip_z_coords=True
+         )

      # Create reducer
      reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
@@ -1950,11 +2049,13 @@
          else:
              raise

-     logger.debug("Server-side processing complete")
+     logger.info("Server-side processing complete")

-     # Add row_id if missing
-     if row_id_col not in df_server.columns:
-         df_server[row_id_col] = range(len(df_server))
+     # Ensure plotId is Int64 type for fast merges
+     if plot_id_column in df_server.columns:
+         df_server[plot_id_column] = pd.to_numeric(
+             df_server[plot_id_column], errors="coerce"
+         ).astype("Int64")

      # Add client-side metadata if requested
      if add_metadata_client_side:
@@ -1965,21 +2066,23 @@
              return_attributes_only=True,
          )

-         # Drop external_id_column from df_client if it exists (already in df_server)
-         if external_id_column and external_id_column in df_client.columns:
-             df_client = df_client.drop(columns=[external_id_column])
+         # Ensure plotId is Int64 type for fast merges
+         if plot_id_column in df_client.columns:
+             df_client[plot_id_column] = pd.to_numeric(
+                 df_client[plot_id_column], errors="coerce"
+             ).astype("Int64")
+
+         # Drop external_id from df_server if it exists (keep from df_client - more reliable)
+         if "external_id" in df_server.columns:
+             df_server = df_server.drop(columns=["external_id"])

-         # Merge
+         # Merge on plotId (same strategy as concurrent mode)
          result = df_server.merge(
-             df_client, on=row_id_col, how="left", suffixes=("", "_client")
+             df_client, on=plot_id_column, how="left", suffixes=("", "_client")
          )
      else:
          result = df_server

-     # Remove internal __row_id__ column if present
-     if row_id_col in result.columns:
-         result = result.drop(columns=[row_id_col])
-
      # Format the output
      # Add admin context (Country, ProducerCountry, Admin_Level_1) from admin_code
      # MUST be done BEFORE formatting (which removes _median columns)
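Merging with suffixes=("", "_client") keeps server-side column names unchanged and only tags client columns that collide; coercing both plotId keys to Int64 first matters because pandas refuses to merge integer keys against string keys. A sketch:

    import pandas as pd

    df_server = pd.DataFrame(
        {"plotId": pd.array([1, 2], dtype="Int64"), "Area_sum": [4.2, 7.1]}
    )
    df_client = pd.DataFrame(
        {"plotId": pd.array([1, 2], dtype="Int64"), "Geometry_type": ["Polygon", "Point"]}
    )

    result = df_server.merge(df_client, on="plotId", how="left", suffixes=("", "_client"))
    print(result.columns.tolist())  # ['plotId', 'Area_sum', 'Geometry_type']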
@@ -2004,27 +2107,14 @@
          convert_water_flag=True,
      )

+     # Ensure plot_id exists and sort by it
+     if plot_id_column not in formatted.columns:
+         formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
+     formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+
      logger.info(f"Processing complete: {len(formatted):,} features")

-     # Consolidate external_id_column to standardized 'external_id'
-     if external_id_column:
-         variants = [
-             col
-             for col in formatted.columns
-             if external_id_column.lower() in col.lower()
-         ]
-         if variants:
-             base_col = (
-                 external_id_column
-                 if external_id_column in formatted.columns
-                 else variants[0]
-             )
-             if base_col != "external_id":
-                 formatted = formatted.rename(columns={base_col: "external_id"})
-             # Drop other variants
-             formatted = formatted.drop(
-                 columns=[c for c in variants if c != base_col], errors="ignore"
-             )
+     # external_id_column already renamed to 'external_id' during load - no action needed here

      return formatted

@@ -2130,7 +2220,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
      gdf_original_geoms = None
      if geometry_audit_trail:
          logger.debug("Pre-loading GeoJSON for geometry audit trail...")
-         gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+         gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)

      # Step 1: Get raw stats
      logger.debug("Step 1/2: Extracting statistics (concurrent)...")
@@ -2199,7 +2289,7 @@
          # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
          if gdf_original_geoms is None:
              logger.warning("Original geometries not pre-loaded, loading now...")
-             gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+             gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)

          # Use plotId from df_validated to maintain mapping
          df_original_geom = pd.DataFrame(
@@ -2331,7 +2421,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
      gdf_original_geoms = None
      if geometry_audit_trail:
          logger.debug("Pre-loading GeoJSON for geometry audit trail...")
-         gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+         gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)

      # Step 1: Get raw stats
      logger.debug("Step 1/2: Extracting statistics (sequential)...")
@@ -2395,7 +2485,7 @@
          # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
          if gdf_original_geoms is None:
              logger.warning("Original geometries not pre-loaded, loading now...")
-             gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+             gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)

          # Use plotId from df_validated to maintain mapping
          df_original_geom = pd.DataFrame(
openforis_whisp/parameters/lookup_context_and_metadata.csv

@@ -1,5 +1,5 @@
  name,order,ISO2_code,theme,theme_timber,use_for_risk,use_for_risk_timber,exclude_from_output,col_type,is_nullable,is_required,corresponding_variable
- plotId,-10,,context_and_metadata,context_and_metadata,NA,NA,0,string,1,0,plot_id_column
+ plotId,-10,,context_and_metadata,context_and_metadata,NA,NA,0,int64,1,0,plot_id_column
  external_id,-9,,context_and_metadata,context_and_metadata,NA,NA,0,string,1,0,external_id_column
  Area,-8,,context_and_metadata,context_and_metadata,NA,NA,0,float32,1,1,geometry_area_column
  Geometry_type,-7,,context_and_metadata,context_and_metadata,NA,NA,0,string,1,1,geometry_type_column
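The plotId row's col_type change from string to int64 matches the Int64 coercions in advanced_stats.py above. A sketch of how a lookup like this can drive dtype enforcement downstream (the mapping and coercion here are assumptions for illustration, not the package's actual schema code):

    import pandas as pd

    lookup = pd.read_csv("lookup_context_and_metadata.csv")  # hypothetical local copy
    to_pandas = {"int64": "Int64", "string": "string", "float32": "float32"}  # assumed mapping

    df = pd.DataFrame({"plotId": ["1", "2"], "external_id": [10, 20]})
    for _, row in lookup.iterrows():
        name, dtype = row["name"], to_pandas.get(row["col_type"])
        if name in df.columns and dtype:
            if dtype == "Int64":
                df[name] = pd.to_numeric(df[name], errors="coerce").astype("Int64")
            else:
                df[name] = df[name].astype(dtype)
    print(df.dtypes)  # plotId: Int64, external_id: string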
openforis_whisp-3.0.0a5.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: openforis-whisp
- Version: 3.0.0a4
+ Version: 3.0.0a5
  Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
  License: MIT
  Keywords: whisp,geospatial,data-processing
openforis_whisp-3.0.0a5.dist-info/RECORD

@@ -1,12 +1,12 @@
  openforis_whisp/__init__.py,sha256=5zJK84LYnlslxSajdCz6ZIYxRS4xgN3sGxSD6_GXEHs,3547
- openforis_whisp/advanced_stats.py,sha256=FC1YasSZ93jplF1qBgDopzBIsO2ueXnidomQU3rpP_Q,100006
+ openforis_whisp/advanced_stats.py,sha256=qg5Xsx9B3rAIDOsVTeHL-viRrY2KjS5f7CRlLK9WQt8,105329
  openforis_whisp/data_checks.py,sha256=ErIKGbCa3R8eYP0sVoAl-ZUl607W1QrG0Jr2SIVgm2I,34056
  openforis_whisp/data_conversion.py,sha256=L2IsiUyQUt3aHgSYGbIhgPGwM7eyS3nLVEoNO9YqQeM,21888
  openforis_whisp/datasets.py,sha256=F1WxXc93mxxmN-WHa0bf-XX-FloSQyEBJKmnrQEHYn8,53855
  openforis_whisp/logger.py,sha256=gFkRTwJDJKIBWcHDOK74Uln3JM7fAybURo7pQpGL790,3395
  openforis_whisp/parameters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  openforis_whisp/parameters/config_runtime.py,sha256=NOo39MAi60XCwEx5pwkS0EHKJBh0XY1q06y4j0HAABg,1421
- openforis_whisp/parameters/lookup_context_and_metadata.csv,sha256=KgK0ik_Gd4t_Nq5cUkGPT4ZFZVO93HWSG82jRrOukt4,1298
+ openforis_whisp/parameters/lookup_context_and_metadata.csv,sha256=57Nz7t8GPMYpx81IY5RWiJGVMbIT7_0jIgZWkyih7no,1297
  openforis_whisp/parameters/lookup_gaul1_admin.py,sha256=cQr5liRdXi85QieTxrz4VAkn0COvRCp82ZV0dYFWOio,474980
  openforis_whisp/parameters/lookup_gee_datasets.csv,sha256=7KdnFocEgbZO5m8JmWQchzZTurg9rJ96y17z8UyLtI0,17537
  openforis_whisp/pd_schemas.py,sha256=0z-oPmYIDUIn7mNY41W_uUpmTwjoR7e254mOCoHVsOg,2878
@@ -14,7 +14,7 @@ openforis_whisp/reformat.py,sha256=gvhIa-_kTT5BSO8LuVmJ1TQcf_NwheskXboFM9e0KJY,3
  openforis_whisp/risk.py,sha256=d_Di5XB8BnHdVXG56xdHTcpB4-CIF5vo2ZRMQRG7Pek,34420
  openforis_whisp/stats.py,sha256=pTSYs77ISRBOIglRpq4SUx3lKRkrUZOKROLRX5IP9IY,63941
  openforis_whisp/utils.py,sha256=AISWF-MpfFdYkhd6bei4BViw2Iag20mmq61ykrF9YTk,31287
- openforis_whisp-3.0.0a4.dist-info/LICENSE,sha256=nqyqICO95iw_iwzP1t_IIAf7ZX3DPbL_M9WyQfh2q1k,1085
- openforis_whisp-3.0.0a4.dist-info/METADATA,sha256=ak2Dw632lgOtXEXkl5-haYK7vF3hPaJ6IkaRRJRdH0Y,16684
- openforis_whisp-3.0.0a4.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
- openforis_whisp-3.0.0a4.dist-info/RECORD,,
+ openforis_whisp-3.0.0a5.dist-info/LICENSE,sha256=nqyqICO95iw_iwzP1t_IIAf7ZX3DPbL_M9WyQfh2q1k,1085
+ openforis_whisp-3.0.0a5.dist-info/METADATA,sha256=mYugV2a00b1X9D88x5MF7S6CEq86FlW6EjaCY1L1piA,16684
+ openforis_whisp-3.0.0a5.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+ openforis_whisp-3.0.0a5.dist-info/RECORD,,