openforis-whisp 3.0.0a3-py3-none-any.whl → 3.0.0a4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -102,5 +102,5 @@ from openforis_whisp.utils import (
 from openforis_whisp.data_checks import (
     analyze_geojson,
     validate_geojson_constraints,
-    suggest_method,
+    suggest_processing_mode,
 )
@@ -36,6 +36,24 @@ from typing import Optional, List, Dict, Any, Tuple, Union
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import tempfile

+# Configure the "whisp" logger with auto-flush handler for Colab visibility
+_whisp_logger = logging.getLogger("whisp")
+if not _whisp_logger.handlers:
+    _handler = logging.StreamHandler(sys.stdout)
+    _handler.setLevel(logging.DEBUG)
+    _handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
+    # Override emit to force flush after each message for Colab
+    _original_emit = _handler.emit
+
+    def _emit_with_flush(record):
+        _original_emit(record)
+        sys.stdout.flush()
+
+    _handler.emit = _emit_with_flush
+    _whisp_logger.addHandler(_handler)
+    _whisp_logger.setLevel(logging.INFO)
+    _whisp_logger.propagate = False  # Don't propagate to root to avoid duplicates
+
 # ============================================================================
 # STDOUT/STDERR SUPPRESSION CONTEXT MANAGER (for C-level output)
 # ============================================================================
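With this module-level block in place, the package owns a dedicated "whisp" logger (INFO level, flush-on-emit, propagate=False). A minimal sketch of adjusting its verbosity from a notebook, assuming the package import has already run the configuration above (standard library only, not part of the package):

    import logging
    import openforis_whisp  # importing the package configures the "whisp" logger

    logging.getLogger("whisp").setLevel(logging.DEBUG)  # surface per-batch debug output
    logging.getLogger("whisp").info("whisp logger verbosity raised for this session")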
@@ -445,6 +463,16 @@ def join_admin_codes(
         columns=["admin_code_for_join", "gaul1_code"], errors="ignore"
     )

+    # Fill NaN values with "Unknown" and "not found" for features outside admin boundaries
+    # (e.g., points in the ocean or international waters)
+    df_joined[iso3_country_column] = df_joined[iso3_country_column].fillna(
+        "Unknown"
+    )
+    df_joined[iso2_country_column] = df_joined[iso2_country_column].fillna(
+        "not found"
+    )
+    df_joined[admin_1_column] = df_joined[admin_1_column].fillna("Unknown")
+
     logger.debug(
         f"Admin codes joined: {iso3_country_column}, {iso2_country_column}, {admin_1_column}"
     )
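For a feature that falls outside every admin boundary (for example a point at sea), these fills replace NaN in the joined columns. A minimal pandas sketch with hypothetical column names:

    import pandas as pd

    df = pd.DataFrame({"iso3": ["KEN", None], "iso2": ["KE", None], "admin1": ["Nairobi", None]})
    df["iso3"] = df["iso3"].fillna("Unknown")
    df["iso2"] = df["iso2"].fillna("not found")
    df["admin1"] = df["admin1"].fillna("Unknown")
    # the second row now reads Unknown / not found / Unknown instead of NaN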
@@ -461,10 +489,16 @@ class ProgressTracker:

     Shows progress at adaptive milestones (more frequent for small datasets,
     less frequent for large datasets) with estimated time remaining based on
-    processing speed.
+    processing speed. Includes time-based heartbeat to prevent long silences.
     """

-    def __init__(self, total: int, logger: logging.Logger = None):
+    def __init__(
+        self,
+        total: int,
+        logger: logging.Logger = None,
+        heartbeat_interval: int = 180,
+        status_file: str = None,
+    ):
         """
         Initialize progress tracker.

@@ -474,26 +508,147 @@ class ProgressTracker:
             Total number of items to process
         logger : logging.Logger, optional
             Logger for output
+        heartbeat_interval : int, optional
+            Seconds between heartbeat messages (default: 180 = 3 minutes)
+        status_file : str, optional
+            Path to JSON status file for API/web app consumption.
+            Checkpoints auto-save to same directory as status_file.
         """
         self.total = total
         self.completed = 0
         self.lock = threading.Lock()
         self.logger = logger or logging.getLogger("whisp")
+        self.heartbeat_interval = heartbeat_interval
+
+        # Handle status_file: if directory passed, auto-generate filename
+        if status_file:
+            import os
+
+            if os.path.isdir(status_file):
+                self.status_file = os.path.join(
+                    status_file, "whisp_processing_status.json"
+                )
+            else:
+                # Validate that parent directory exists
+                parent_dir = os.path.dirname(status_file)
+                if parent_dir and not os.path.isdir(parent_dir):
+                    self.logger.warning(
+                        f"Status file directory does not exist: {parent_dir}"
+                    )
+                    self.status_file = None
+                else:
+                    self.status_file = status_file
+        else:
+            self.status_file = None

         # Adaptive milestones based on dataset size
         # Small datasets (< 50): show every 25% (not too spammy)
         # Medium (50-500): show every 20%
-        # Large (500+): show every 10% (more frequent feedback on long runs)
+        # Large (500-1000): show every 10%
+        # Very large (1000+): show every 5% (cleaner for long jobs)
         if total < 50:
             self.milestones = {25, 50, 75, 100}
         elif total < 500:
             self.milestones = {20, 40, 60, 80, 100}
-        else:
+        elif total < 1000:
             self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
+        else:
+            self.milestones = {
+                5,
+                10,
+                15,
+                20,
+                25,
+                30,
+                35,
+                40,
+                45,
+                50,
+                55,
+                60,
+                65,
+                70,
+                75,
+                80,
+                85,
+                90,
+                95,
+                100,
+            }

         self.shown_milestones = set()
         self.start_time = time.time()
         self.last_update_time = self.start_time
+        self.heartbeat_stop = threading.Event()
+        self.heartbeat_thread = None
+
+    def _write_status_file(self, status: str = "processing") -> None:
+        """Write current progress to JSON status file using atomic write."""
+        if not self.status_file:
+            return
+
+        try:
+            import json
+            import os
+
+            elapsed = time.time() - self.start_time
+            percent = (self.completed / self.total * 100) if self.total > 0 else 0
+            rate = self.completed / elapsed if elapsed > 0 else 0
+            eta = (
+                (self.total - self.completed) / rate * 1.15
+                if rate > 0 and percent >= 5
+                else None
+            )
+
+            # Write to temp file then atomic rename to prevent partial reads
+            from datetime import datetime
+
+            temp_file = self.status_file + ".tmp"
+            with open(temp_file, "w") as f:
+                json.dump(
+                    {
+                        "status": status,
+                        "progress": f"{self.completed}/{self.total}",
+                        "percent": round(percent, 1),
+                        "elapsed_sec": round(elapsed),
+                        "eta_sec": round(eta) if eta else None,
+                        "updated_at": datetime.now().isoformat(),
+                    },
+                    f,
+                )
+            os.replace(temp_file, self.status_file)
+        except Exception:
+            pass
+
+    def start_heartbeat(self) -> None:
+        """Start background heartbeat thread for time-based progress updates."""
+        if self.heartbeat_thread is None or not self.heartbeat_thread.is_alive():
+            self.heartbeat_stop.clear()
+            self.heartbeat_thread = threading.Thread(
+                target=self._heartbeat_loop, daemon=True
+            )
+            self.heartbeat_thread.start()
+            # Write initial status
+            self._write_status_file(status="processing")
+
+    def _heartbeat_loop(self) -> None:
+        """Background loop that logs progress at time intervals."""
+        while not self.heartbeat_stop.wait(self.heartbeat_interval):
+            with self.lock:
+                # Only log if we haven't shown a milestone recently
+                time_since_update = time.time() - self.last_update_time
+                if (
+                    time_since_update >= self.heartbeat_interval
+                    and self.completed < self.total
+                ):
+                    elapsed = time.time() - self.start_time
+                    percent = int((self.completed / self.total) * 100)
+                    elapsed_str = self._format_time(elapsed)
+                    self.logger.info(
+                        f"[Processing] {self.completed:,}/{self.total:,} batches ({percent}%) | "
+                        f"Elapsed: {elapsed_str}"
+                    )
+                    self.last_update_time = time.time()

     def update(self, n: int = 1) -> None:
         """
@@ -508,7 +663,7 @@ class ProgressTracker:
             self.completed += n
             percent = int((self.completed / self.total) * 100)

-            # Show milestone messages (25%, 50%, 75%, 100%)
+            # Show milestone messages (5%, 10%, 15%... for large datasets)
             for milestone in sorted(self.milestones):
                 if percent >= milestone and milestone not in self.shown_milestones:
                     self.shown_milestones.add(milestone)
@@ -517,20 +672,36 @@ class ProgressTracker:
                     elapsed = time.time() - self.start_time
                     rate = self.completed / elapsed if elapsed > 0 else 0
                     remaining_items = self.total - self.completed
-                    eta_seconds = remaining_items / rate if rate > 0 else 0
+
+                    # Calculate ETA with padding for overhead (loading, joins, etc.)
+                    # Don't show ETA until we have some samples (at least 5% complete)
+                    if rate > 0 and self.completed >= max(5, self.total * 0.05):
+                        eta_seconds = (
+                            remaining_items / rate
+                        ) * 1.15  # Add 15% padding for overhead
+                    else:
+                        eta_seconds = 0

                     # Format time strings
-                    eta_str = self._format_time(eta_seconds)
+                    eta_str = (
+                        self._format_time(eta_seconds)
+                        if eta_seconds > 0
+                        else "calculating..."
+                    )
                     elapsed_str = self._format_time(elapsed)

                     # Build progress message
-                    msg = f"Progress: {self.completed}/{self.total} ({percent}%)"
+                    msg = f"Progress: {self.completed:,}/{self.total:,} batches ({percent}%)"
                     if percent < 100:
                         msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
                     else:
                         msg += f" | Total time: {elapsed_str}"

                     self.logger.info(msg)
+                    self.last_update_time = time.time()
+
+                    # Update status file for API consumption
+                    self._write_status_file()

     @staticmethod
     def _format_time(seconds: float) -> str:
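The padded ETA in the milestone branch is plain arithmetic; a worked example with illustrative numbers:

    # hypothetical run: 40 of 200 batches done after 120 seconds
    completed, total, elapsed = 40, 200, 120.0
    rate = completed / elapsed                          # ~0.33 batches per second
    eta_seconds = (total - completed) / rate * 1.15     # 480 s * 1.15 = 552 s, shown as ~9.2m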
@@ -544,14 +715,21 @@ class ProgressTracker:
         hours = seconds / 3600
         return f"{hours:.1f}h"

-    def finish(self) -> None:
-        """Log completion."""
+    def finish(self, output_file: str = None) -> None:
+        """Stop heartbeat and log completion."""
+        # Stop heartbeat thread
+        self.heartbeat_stop.set()
+        if self.heartbeat_thread and self.heartbeat_thread.is_alive():
+            self.heartbeat_thread.join(timeout=1)
+
         with self.lock:
             total_time = time.time() - self.start_time
             time_str = self._format_time(total_time)
-            self.logger.info(
-                f"Processing complete: {self.completed}/{self.total} batches in {time_str}"
-            )
+            msg = f"Processing complete: {self.completed:,}/{self.total:,} batches in {time_str}"
+            self.logger.info(msg)
+
+            # Write final status
+            self._write_status_file(status="completed")


 # ============================================================================
@@ -983,7 +1161,6 @@ def process_ee_batch(
 def whisp_stats_geojson_to_df_concurrent(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -996,6 +1173,7 @@ def whisp_stats_geojson_to_df_concurrent(
     logger: logging.Logger = None,
     # Format parameters (auto-detect from config if not provided)
     decimal_places: int = None,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently to compute Whisp statistics with automatic formatting.
@@ -1010,8 +1188,6 @@ def whisp_stats_geojson_to_df_concurrent(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1059,6 +1235,25 @@
     gdf = _load_geojson_silently(input_geojson_filepath)
     logger.info(f"Loaded {len(gdf):,} features")

+    # Validate external_id_column if provided (lightweight client-side check)
+    if external_id_column and external_id_column not in gdf.columns:
+        # Exclude geometry column from available columns list
+        available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
+        raise ValueError(
+            f"Column '{external_id_column}' not found in GeoJSON properties. "
+            f"Available columns: {available_cols}"
+        )
+
+    # Check completeness of external_id_column (warn if nulls exist)
+    if external_id_column and external_id_column in gdf.columns:
+        null_count = gdf[external_id_column].isna().sum()
+        if null_count > 0:
+            null_pct = (null_count / len(gdf)) * 100
+            logger.warning(
+                f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"These features may have missing external IDs in output."
+            )
+
     if validate_geometries:
         gdf = clean_geodataframe(
             gdf, remove_nulls=False, repair_geometries=False, logger=logger
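Callers can run the same lightweight check themselves before submitting a large job. A minimal geopandas sketch (file path and column name are placeholders):

    import geopandas as gpd

    gdf = gpd.read_file("plots.geojson")   # hypothetical input
    id_col = "farmer_id"                   # hypothetical external ID column
    if id_col not in gdf.columns:
        non_geom = [c for c in gdf.columns if c != gdf.geometry.name]
        raise ValueError(f"Column '{id_col}' not found; available columns: {non_geom}")
    print(f"{gdf[id_col].isna().sum()} features have a null {id_col}")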
@@ -1101,13 +1296,18 @@

     # Batch the data
     batches = batch_geodataframe(gdf_for_ee, batch_size)
-    logger.info(f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches")
+    logger.info(
+        f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches (concurrent mode)..."
+    )

     # Setup semaphore for EE concurrency control
     ee_semaphore = threading.BoundedSemaphore(max_concurrent)

-    # Progress tracker
-    progress = ProgressTracker(len(batches), logger=logger)
+    # Progress tracker with heartbeat for long-running jobs
+    progress = ProgressTracker(
+        len(batches), logger=logger, heartbeat_interval=180, status_file=status_file
+    )
+    progress.start_heartbeat()

     results = []

@@ -1148,73 +1348,77 @@
     pyogrio_logger.setLevel(logging.CRITICAL)

     try:
-        with redirect_stdout(io.StringIO()):
-            with ThreadPoolExecutor(max_workers=pool_workers) as executor:
-                futures = {
-                    executor.submit(process_batch, i, batch): i
-                    for i, batch in enumerate(batches)
-                }
-
-                # Track which batches failed for retry
-                batch_map = {i: batch for i, batch in enumerate(batches)}
-                batch_futures = {future: i for future, i in futures.items()}
-
-                for future in as_completed(futures):
-                    batch_idx = batch_futures[future]
-                    try:
-                        batch_idx, df_server, df_client = future.result()
-
-                        # Merge server and client results
-                        if plot_id_column not in df_server.columns:
-                            df_server[plot_id_column] = range(len(df_server))
-
-                        # Keep all EE statistics from server (all columns with _sum and _median suffixes)
-                        # These are the actual EE processing results
-                        df_server_clean = df_server.copy()
-
-                        # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
-                        # (formatted wrapper handles keep_external_columns parameter)
-                        keep_external_columns = [plot_id_column]
-                        if (
-                            external_id_column
-                            and external_id_column in df_client.columns
-                        ):
-                            keep_external_columns.append(external_id_column)
-                        if "geometry" in df_client.columns:
-                            keep_external_columns.append("geometry")
-                        # Keep geometry type column (Geometry_type)
-                        if geometry_type_column in df_client.columns:
-                            keep_external_columns.append(geometry_type_column)
-                        # Also keep centroid columns (Centroid_lon, Centroid_lat)
-                        centroid_cols = [
-                            c for c in df_client.columns if c.startswith("Centroid_")
-                        ]
-                        keep_external_columns.extend(centroid_cols)
-
-                        df_client_clean = df_client[
-                            [c for c in keep_external_columns if c in df_client.columns]
-                        ].drop_duplicates()
-
-                        merged = df_server_clean.merge(
-                            df_client_clean,
-                            on=plot_id_column,
-                            how="left",
-                            suffixes=("_ee", "_client"),
-                        )
-                        results.append(merged)
-                        progress.update()
-
-                    except Exception as e:
-                        # Batch failed - fail fast with clear guidance
-                        error_msg = str(e)
-                        logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
-                        logger.debug(f"Full error: {error_msg}")
-
-                        # Get original batch for error reporting
-                        original_batch = batch_map[batch_idx]
-
-                        # Add to batch errors for final reporting
-                        batch_errors.append((batch_idx, original_batch, error_msg))
+        # Don't suppress stdout here - we want progress messages to show in Colab
+        with ThreadPoolExecutor(max_workers=pool_workers) as executor:
+            futures = {
+                executor.submit(process_batch, i, batch): i
+                for i, batch in enumerate(batches)
+            }
+
+            # Track which batches failed for retry
+            batch_map = {i: batch for i, batch in enumerate(batches)}
+            batch_futures = {future: i for future, i in futures.items()}
+
+            for future in as_completed(futures):
+                batch_idx = batch_futures[future]
+                try:
+                    batch_idx, df_server, df_client = future.result()
+
+                    # Merge server and client results
+                    if plot_id_column not in df_server.columns:
+                        df_server[plot_id_column] = range(len(df_server))
+
+                    # Keep all EE statistics from server (all columns with _sum and _median suffixes)
+                    # These are the actual EE processing results
+                    df_server_clean = df_server.copy()
+
+                    # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
+                    # (formatted wrapper handles keep_external_columns parameter)
+                    keep_external_columns = [plot_id_column]
+                    if external_id_column and external_id_column in df_client.columns:
+                        keep_external_columns.append(external_id_column)
+                    if "geometry" in df_client.columns:
+                        keep_external_columns.append("geometry")
+                    # Keep geometry type column (Geometry_type)
+                    if geometry_type_column in df_client.columns:
+                        keep_external_columns.append(geometry_type_column)
+                    # Also keep centroid columns (Centroid_lon, Centroid_lat)
+                    centroid_cols = [
+                        c for c in df_client.columns if c.startswith("Centroid_")
+                    ]
+                    keep_external_columns.extend(centroid_cols)
+
+                    df_client_clean = df_client[
+                        [c for c in keep_external_columns if c in df_client.columns]
+                    ]
+                    # Don't drop duplicates - we need one row per feature (one per plot_id)
+                    # Each plot_id should have exactly one row with its metadata
+
+                    merged = df_server_clean.merge(
+                        df_client_clean,
+                        on=plot_id_column,
+                        how="left",
+                        suffixes=("_ee", "_client"),
+                    )
+                    results.append(merged)
+                    progress.update()
+
+                except Exception as e:
+                    # Batch failed - fail fast with clear guidance
+                    error_msg = str(e)
+                    logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
+                    logger.debug(f"Full error: {error_msg}")
+
+                    # Get original batch for error reporting
+                    original_batch = batch_map[batch_idx]
+
+                    # Add to batch errors for final reporting
+                    batch_errors.append((batch_idx, original_batch, error_msg))
+    except (KeyboardInterrupt, SystemExit) as interrupt:
+        logger.warning("Processing interrupted by user")
+        # Update status file with interrupted state
+        progress._write_status_file(status="interrupted")
+        raise interrupt
     finally:
         # Restore logger levels
         fiona_logger.setLevel(old_fiona_level)
@@ -1565,7 +1769,7 @@ def whisp_stats_geojson_to_df_concurrent(
                 )
                 raise retry_e

-        logger.info(f"Processed {len(formatted):,} features successfully")
+        logger.info(f"Processing complete: {len(formatted):,} features")
         return formatted
     else:
         logger.error(" No results produced")
@@ -1580,7 +1784,6 @@
 def whisp_stats_geojson_to_df_sequential(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -1605,8 +1808,6 @@ def whisp_stats_geojson_to_df_sequential(
         Path to input GeoJSON
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1646,6 +1847,25 @@ def whisp_stats_geojson_to_df_sequential(
     gdf = _load_geojson_silently(input_geojson_filepath)
     logger.info(f"Loaded {len(gdf):,} features")

+    # Validate external_id_column if provided (lightweight client-side check)
+    if external_id_column and external_id_column not in gdf.columns:
+        # Exclude geometry column from available columns list
+        available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
+        raise ValueError(
+            f"Column '{external_id_column}' not found in GeoJSON properties. "
+            f"Available columns: {available_cols}"
+        )
+
+    # Check completeness of external_id_column (warn if nulls exist)
+    if external_id_column and external_id_column in gdf.columns:
+        null_count = gdf[external_id_column].isna().sum()
+        if null_count > 0:
+            null_pct = (null_count / len(gdf)) * 100
+            logger.warning(
+                f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"These features may have missing external IDs in output."
+            )
+
     # Clean geometries (preserve both null and invalid geometries by default)
     gdf = clean_geodataframe(
         gdf, remove_nulls=False, repair_geometries=False, logger=logger
@@ -1696,7 +1916,9 @@ def whisp_stats_geojson_to_df_sequential(
     reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)

     # Process server-side with error handling for bad bands
-    logger.info("Processing with Earth Engine...")
+    logger.info(
+        f"Processing {len(gdf):,} features with Earth Engine (sequential mode)..."
+    )
     try:
         results_fc = whisp_image.reduceRegions(collection=fc, reducer=reducer, scale=10)
         df_server = convert_ee_to_df(results_fc)
@@ -1782,7 +2004,7 @@ def whisp_stats_geojson_to_df_sequential(
         convert_water_flag=True,
     )

-    logger.info(f"Processed {len(formatted):,} features")
+    logger.info(f"Processing complete: {len(formatted):,} features")

     # Consolidate external_id_column to standardized 'external_id'
     if external_id_column:
@@ -1815,7 +2037,6 @@
 def whisp_formatted_stats_geojson_to_df_concurrent(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -1833,6 +2054,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently with automatic formatting and validation.
@@ -1848,8 +2070,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1917,7 +2137,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     df_raw = whisp_stats_geojson_to_df_concurrent(
         input_geojson_filepath=input_geojson_filepath,
         external_id_column=external_id_column,
-        remove_geom=remove_geom,
         national_codes=national_codes,
        unit_type=unit_type,
         whisp_image=whisp_image,
@@ -1928,6 +2147,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         max_retries=max_retries,
         add_metadata_server=add_metadata_server,
         logger=logger,
+        status_file=status_file,
     )

     # Step 2: Format the output
@@ -2030,7 +2250,6 @@
 def whisp_formatted_stats_geojson_to_df_sequential(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -2044,6 +2263,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON sequentially with automatic formatting and validation.
@@ -2059,8 +2279,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -2120,7 +2338,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     df_raw = whisp_stats_geojson_to_df_sequential(
         input_geojson_filepath=input_geojson_filepath,
         external_id_column=external_id_column,
-        remove_geom=remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,
@@ -2233,7 +2450,6 @@
 def whisp_formatted_stats_geojson_to_df_fast(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -2252,6 +2468,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
     geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON to Whisp statistics with optimized fast processing.
@@ -2267,8 +2484,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -2339,7 +2554,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
         return whisp_formatted_stats_geojson_to_df_concurrent(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -2356,13 +2570,13 @@ def whisp_formatted_stats_geojson_to_df_fast(
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
             geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )
     else:  # sequential
         logger.debug("Routing to sequential processing...")
         return whisp_formatted_stats_geojson_to_df_sequential(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -2374,4 +2588,5 @@ def whisp_formatted_stats_geojson_to_df_fast(
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
             geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )
@@ -750,23 +750,43 @@ def validate_geojson_constraints(
     return results


-def suggest_method(polygon_count, mean_area_ha, mean_vertices=None, verbose=True):
+def suggest_processing_mode(
+    feature_count,
+    mean_area_ha=None,
+    mean_vertices=None,
+    feature_type="polygon",
+    verbose=True,
+):
     """
-    Suggest processing method based on polygon characteristics.
+    Suggest processing mode based on feature characteristics.
+
+    Decision thresholds from comprehensive benchmark data (Nov 2025):

-    Decision thresholds from benchmark data (area per polygon × polygon count):
-    - Small polygons (10 ha): need 250+ polygons for concurrent
-    - Medium polygons (100 ha): breakeven at ~100 polygons
-    - Large polygons (500 ha): concurrent wins at 50+ polygons
+    POINTS:
+    - Break-even: 750-1000 features
+    - Sequential faster: < 750 features
+    - Concurrent faster: >= 750 features
+
+    POLYGONS (area-based thresholds):
+    - Tiny (< 1 ha): break-even ~500 features
+    - Small (1-5 ha, simple): break-even ~500 features
+    - Small (1-5 ha, complex 20-50v): break-even ~500 features
+    - Medium (5-20 ha): break-even ~250 features
+    - Large (20-100 ha): break-even ~250 features
+    - Very large (50-200 ha): break-even ~250 features
+
+    Vertex complexity adjustment: High vertex counts (>50) favor concurrent at lower thresholds

     Parameters:
     -----------
-    polygon_count : int
-        Number of polygons
-    mean_area_ha : float
-        Mean area per polygon in hectares
+    feature_count : int
+        Number of features (polygons or points)
+    mean_area_ha : float, optional
+        Mean area per polygon in hectares (required for polygons, ignored for points)
     mean_vertices : float, optional
-        Mean number of vertices per polygon (can influence decision for complex geometries)
+        Mean number of vertices per polygon (influences decision for complex geometries)
+    feature_type : str
+        'polygon', 'multipolygon', or 'point' (default: 'polygon')
     verbose : bool
         Print recommendation explanation

@@ -775,31 +795,63 @@ def suggest_method(polygon_count, mean_area_ha, mean_vertices=None, verbose=True
     str: 'concurrent' or 'sequential'
     """

-    # Primary decision based on area
-    if mean_area_ha >= 300:  # Large polygons
-        breakeven = 50
-        method = "concurrent" if polygon_count >= breakeven else "sequential"
-    elif mean_area_ha >= 50:  # Medium polygons
-        breakeven = 100
-        method = "concurrent" if polygon_count >= breakeven else "sequential"
-    else:  # Small polygons
+    # Points: simple threshold-based decision
+    if feature_type == "point":
+        breakeven = 750
+        method = "concurrent" if feature_count >= breakeven else "sequential"
+
+        if verbose:
+            print(f"\nMETHOD RECOMMENDATION (Points)")
+            print(f" Features: {feature_count} points")
+            print(f" Break-even: {breakeven} features | Method: {method.upper()}")
+
+        return method
+
+    # Polygons and MultiPolygons: area and complexity-based decision
+    # MultiPolygons use same breakpoints as Polygons
+    if mean_area_ha is None:
+        # Default to conservative threshold if area unknown
+        breakeven = 500
+        method = "concurrent" if feature_count >= breakeven else "sequential"
+
+        if verbose:
+            print(f"\nMETHOD RECOMMENDATION (Polygons - area unknown)")
+            print(f" Features: {feature_count} polygons")
+            print(
+                f" Break-even: {breakeven} (conservative) | Method: {method.upper()}"
+            )
+
+        return method
+
+    # Area-based thresholds from benchmark data
+    if mean_area_ha >= 20:  # Large to very large polygons
+        breakeven = 250
+    elif mean_area_ha >= 5:  # Medium polygons
         breakeven = 250
-        method = "concurrent" if polygon_count >= breakeven else "sequential"
+    elif mean_area_ha >= 1:  # Small polygons
+        # Vertex complexity matters more for small polygons
+        if mean_vertices is not None and mean_vertices >= 30:
+            breakeven = 500  # Complex small polygons
+        else:
+            breakeven = 500  # Simple small polygons
+    else:  # Tiny polygons (< 1 ha)
+        breakeven = 500
+
+    # Vertex complexity adjustment for high-complexity geometries
+    if mean_vertices is not None and mean_vertices >= 50:
+        # High complexity: reduce breakeven by 20% (concurrent beneficial sooner)
+        breakeven = int(breakeven * 0.8)

-    # Optional adjustment based on vertex complexity (very high complexity favors concurrent)
-    if mean_vertices is not None and mean_vertices > 500:
-        # Reduce breakeven by 25% for very complex geometries
-        adjusted_breakeven = int(breakeven * 0.75)
-        method = "concurrent" if polygon_count >= adjusted_breakeven else "sequential"
+    method = "concurrent" if feature_count >= breakeven else "sequential"

     if verbose:
-        print(f"\nMETHOD RECOMMENDATION")
+        print(f"\nMETHOD RECOMMENDATION (Polygons)")
         print(
-            f" Polygons: {polygon_count} | Mean Area: {mean_area_ha:.1f} ha", end=""
+            f" Features: {feature_count} | Mean Area: {mean_area_ha:.1f} ha", end=""
        )
         if mean_vertices is not None:
             print(f" | Mean Vertices: {mean_vertices:.1f}", end="")
         print()
-        print(f" Breakeven: {breakeven} polygons | Method: {method.upper()}")
+        print(f" Break-even: {breakeven} features | Method: {method.upper()}")

     return method
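A usage sketch for the renamed helper, with illustrative inputs (the import path follows the first hunk in this diff, which imports suggest_processing_mode from openforis_whisp.data_checks):

    from openforis_whisp.data_checks import suggest_processing_mode

    # 1,200 medium-sized polygons (~12 ha each), moderate vertex counts
    mode = suggest_processing_mode(
        feature_count=1200,
        mean_area_ha=12.0,
        mean_vertices=40,
        feature_type="polygon",
        verbose=True,
    )
    # break-even for 5-20 ha polygons is 250, so this returns "concurrent"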
@@ -1160,6 +1160,20 @@ def nci_ocs2020_prep():
     ).selfMask()  # cocoa from national land cover map for Côte d'Ivoire


+# nCM - Cameroon
+# data from Aurelie Shapiro (FAO) working directly with country experts - info on methods and accuracy assessment to follow
+
+
+def ncm_treecover_2020_prep():
+    return (
+        ee.Image("projects/ee-cocoacmr/assets/land_cover/CMR_TNTMMU_2020")
+        .select("FNF_2020")
+        .eq(1)
+        .rename("nCM_Treecover_2020")
+        .selfMask()
+    )
+
+
 # ============================================================================
 # CONTEXT BANDS (Administrative boundaries and water mask)
 # ============================================================================
openforis_whisp/logger.py CHANGED
@@ -8,9 +8,21 @@ BASE_MSG_FORMAT = (

 class StdoutLogger:
     def __init__(self, name: str, msg_format: str = BASE_MSG_FORMAT) -> None:
-        self.handler = logging.StreamHandler(sys.stdout)
-        self.handler.setFormatter(logging.Formatter(msg_format))
-        self.handler.setLevel(logging.DEBUG)
+        # Create handler that auto-flushes for Colab/notebook visibility
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setFormatter(logging.Formatter(msg_format))
+        handler.setLevel(logging.DEBUG)
+
+        # Override emit to force flush after each message
+        original_emit = handler.emit
+
+        def emit_with_flush(record):
+            original_emit(record)
+            sys.stdout.flush()
+
+        handler.emit = emit_with_flush
+
+        self.handler = handler
         self.logger = logging.getLogger(name)
         self.logger.addHandler(self.handler)
         self.logger.propagate = False
@@ -2,9 +2,9 @@ name,order,ISO2_code,theme,theme_timber,use_for_risk,use_for_risk_timber,exclude
 EUFO_2020,10,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_jrc_gfc_2020_prep
 GLAD_Primary,20,,treecover,primary,1,1,0,float32,1,0,g_glad_pht_prep
 TMF_undist,30,,treecover,primary,1,1,0,float32,1,0,g_jrc_tmf_undisturbed_prep
-GFC_TC_2020,50,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_glad_gfc_10pc_prep
+GFC_TC_2020,50,,treecover,naturally_reg_2020,0,0,0,float32,1,0,g_glad_gfc_10pc_prep
 Forest_FDaP,60,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_glad_gfc_10pc_prep
-ESA_TC_2020,70,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_esa_worldcover_trees_prep
+ESA_TC_2020,70,,treecover,naturally_reg_2020,0,0,0,float32,1,0,g_esa_worldcover_trees_prep
 TMF_plant,80,,commodities,NA,1,1,0,float32,1,0,g_jrc_tmf_plantation_prep
 Oil_palm_Descals,90,,commodities,NA,1,1,0,float32,1,0,g_creaf_descals_palm_prep
 Oil_palm_FDaP,100,,commodities,NA,1,1,0,float32,1,0,g_fdap_palm_prep
@@ -197,3 +197,4 @@ nBR_INPE_TCamz_pasture_2020,2422,BR,commodities,NA,1,1,0,float32,1,0,nbr_terracl
 nBR_INPE_TCcer_pasture_2020,2423,BR,commodities,NA,1,1,0,float32,1,0,nbr_terraclass_cer20_ac_prep
 nBR_MapBiomas_col9_pasture_2020,2424,BR,commodities,NA,1,1,0,float32,1,0,nbr_mapbiomasc9_pasture_prep
 nCI_Cocoa_bnetd,3000,CI,commodities,NA,1,1,0,float32,1,0,nci_ocs2020_prep
+nCM_Treecover_2020,3100,CM,treecover,NA,1,0,0,float32,1,0,ncm_treecover_2020_prep
@@ -1,5 +1,10 @@
-import pandera as pa
-from pandera.typing import DataFrame, Series
+# Support both old and new pandera import paths
+try:
+    import pandera.pandas as pa
+    from pandera.typing.pandas import DataFrame, Series
+except (ImportError, ModuleNotFoundError):
+    import pandera as pa
+    from pandera.typing import DataFrame, Series

 # Define a schema for validating a DataFrame related to GEE (Google Earth Engine) datasets.
 class DataLookupSchema(pa.DataFrameModel):
@@ -1,5 +1,10 @@
 # !pip install pandera[io] # special version used
-import pandera as pa
+# Support both old and new pandera import paths
+try:
+    import pandera.pandas as pa
+except (ImportError, ModuleNotFoundError):
+    import pandera as pa
+
 import pandas as pd
 import os
 import logging
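Both schema modules now use the same guarded import so they work across pandera releases: newer versions expose the pandas-backed API under pandera.pandas, while older ones only provide the top-level module. User code that imports pandera alongside whisp can follow the same pattern; a small sketch:

    try:
        import pandera.pandas as pa  # newer pandera releases
    except (ImportError, ModuleNotFoundError):
        import pandera as pa         # older pandera releases

    print(pa.DataFrameModel)  # the schema base class resolves either way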
openforis_whisp/stats.py CHANGED
@@ -88,7 +88,6 @@ def get_admin_boundaries_fc():
 def whisp_formatted_stats_geojson_to_df_legacy(
     input_geojson_filepath: Path | str,
     external_id_column=None,
-    remove_geom=False,
     national_codes=None,
     unit_type="ha",
     whisp_image=None,
@@ -147,7 +146,6 @@ def whisp_formatted_stats_geojson_to_df_legacy(
     return whisp_formatted_stats_ee_to_df(
         feature_collection,
         external_id_column,
-        remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,
@@ -167,6 +165,7 @@ def whisp_formatted_stats_geojson_to_df(
     batch_size: int = 10,
     max_concurrent: int = 20,
     geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Main entry point for converting GeoJSON to Whisp statistics.
@@ -188,11 +187,7 @@ def whisp_formatted_stats_geojson_to_df(
         The column in the GeoJSON containing external IDs to be preserved in the output DataFrame.
         This column must exist as a property in ALL features of the GeoJSON file.
         Use debug_feature_collection_properties() to inspect available properties if you encounter errors.
-    remove_geom : bool, default=False
-        If True, the geometry of the GeoJSON is removed from the output DataFrame.
     national_codes : list, optional
-        List of ISO2 country codes to include national datasets.
-    unit_type: str, optional
         Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
     whisp_image : ee.Image, optional
         Pre-combined multiband Earth Engine Image containing all Whisp datasets.
@@ -224,6 +219,13 @@ def whisp_formatted_stats_geojson_to_df(

         Processing metadata stored in df.attrs['processing_metadata'].
         These columns enable full transparency for geometry modifications during processing.
+    status_file : str, optional
+        Path to JSON status file or directory for real-time progress tracking.
+        If a directory is provided, creates 'whisp_processing_status.json' in that directory.
+        Updates every 3 minutes and at progress milestones (5%, 10%, etc.).
+        Format: {"status": "processing", "progress": "450/1000", "percent": 45.0,
+        "elapsed_sec": 120, "eta_sec": 145, "updated_at": "2025-11-13T14:23:45"}
+        Most useful for large concurrent jobs. Works in both concurrent and sequential modes.

     Returns
     -------
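Putting the new parameter together with the entry point documented above, a hedged usage sketch (file path, ID column, and output name are placeholders; it assumes Earth Engine has already been authenticated and initialised for the session):

    from openforis_whisp.stats import whisp_formatted_stats_geojson_to_df

    df = whisp_formatted_stats_geojson_to_df(
        "plots.geojson",                    # hypothetical input file
        external_id_column="farmer_id",     # hypothetical ID property
        national_codes=["CI", "CM"],        # include Côte d'Ivoire and Cameroon national layers
        status_file="/tmp/whisp_status",    # directory: status JSON is created inside it
    )
    df.to_csv("whisp_output.csv", index=False)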
@@ -283,7 +285,6 @@ def whisp_formatted_stats_geojson_to_df(
         return whisp_formatted_stats_geojson_to_df_legacy(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -306,7 +307,6 @@ def whisp_formatted_stats_geojson_to_df(
         return whisp_formatted_stats_geojson_to_df_fast(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -315,6 +315,7 @@ def whisp_formatted_stats_geojson_to_df(
             batch_size=batch_size,
             max_concurrent=max_concurrent,
             geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )
     else:
         raise ValueError(
@@ -473,7 +474,6 @@ def whisp_formatted_stats_ee_to_df(
 def whisp_stats_geojson_to_df(
     input_geojson_filepath: Path | str,
     external_id_column=None,
-    remove_geom=False,
     national_codes=None,
     unit_type="ha",
     whisp_image=None,  # New parameter
@@ -506,7 +506,6 @@ def whisp_stats_geojson_to_df(
     return whisp_stats_ee_to_df(
         feature_collection,
         external_id_column,
-        remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,  # Pass through
@@ -990,7 +989,7 @@ def whisp_stats_ee_to_drive(
         )
         task.start()
         print(
-            "Exporting to Google Drive: 'whisp_results/whisp_output_table.csv'. To track progress: https://code.earthengine.google.com/tasks"
+            "Exporting to Google Drive: 'whisp_output_table.csv'. To track progress: https://code.earthengine.google.com/tasks"
        )
     except Exception as e:
         print(f"An error occurred during the export: {e}")
openforis_whisp/utils.py CHANGED
@@ -5,6 +5,8 @@ import os
 import pandas as pd
 import random
 import numpy as np
+import logging
+import sys

 import urllib.request
 import os
@@ -19,6 +21,23 @@ from shapely.validation import make_valid

 from .logger import StdoutLogger

+# Configure the "whisp" logger with auto-flush handler for Colab visibility
+_whisp_logger = logging.getLogger("whisp")
+if not _whisp_logger.handlers:
+    _handler = logging.StreamHandler(sys.stdout)
+    _handler.setLevel(logging.DEBUG)
+    _handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
+    # Override emit to force flush after each message for Colab
+    _original_emit = _handler.emit
+
+    def _emit_with_flush(record):
+        _original_emit(record)
+        sys.stdout.flush()
+
+    _handler.emit = _emit_with_flush
+    _whisp_logger.addHandler(_handler)
+    _whisp_logger.setLevel(logging.INFO)
+    _whisp_logger.propagate = False  # Don't propagate to root to avoid duplicates

 logger = StdoutLogger(__name__)

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: openforis-whisp
-Version: 3.0.0a3
+Version: 3.0.0a4
 Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
 License: MIT
 Keywords: whisp,geospatial,data-processing
@@ -0,0 +1,20 @@
+openforis_whisp/__init__.py,sha256=5zJK84LYnlslxSajdCz6ZIYxRS4xgN3sGxSD6_GXEHs,3547
+openforis_whisp/advanced_stats.py,sha256=FC1YasSZ93jplF1qBgDopzBIsO2ueXnidomQU3rpP_Q,100006
+openforis_whisp/data_checks.py,sha256=ErIKGbCa3R8eYP0sVoAl-ZUl607W1QrG0Jr2SIVgm2I,34056
+openforis_whisp/data_conversion.py,sha256=L2IsiUyQUt3aHgSYGbIhgPGwM7eyS3nLVEoNO9YqQeM,21888
+openforis_whisp/datasets.py,sha256=F1WxXc93mxxmN-WHa0bf-XX-FloSQyEBJKmnrQEHYn8,53855
+openforis_whisp/logger.py,sha256=gFkRTwJDJKIBWcHDOK74Uln3JM7fAybURo7pQpGL790,3395
+openforis_whisp/parameters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+openforis_whisp/parameters/config_runtime.py,sha256=NOo39MAi60XCwEx5pwkS0EHKJBh0XY1q06y4j0HAABg,1421
+openforis_whisp/parameters/lookup_context_and_metadata.csv,sha256=KgK0ik_Gd4t_Nq5cUkGPT4ZFZVO93HWSG82jRrOukt4,1298
+openforis_whisp/parameters/lookup_gaul1_admin.py,sha256=cQr5liRdXi85QieTxrz4VAkn0COvRCp82ZV0dYFWOio,474980
+openforis_whisp/parameters/lookup_gee_datasets.csv,sha256=7KdnFocEgbZO5m8JmWQchzZTurg9rJ96y17z8UyLtI0,17537
+openforis_whisp/pd_schemas.py,sha256=0z-oPmYIDUIn7mNY41W_uUpmTwjoR7e254mOCoHVsOg,2878
+openforis_whisp/reformat.py,sha256=gvhIa-_kTT5BSO8LuVmJ1TQcf_NwheskXboFM9e0KJY,32758
+openforis_whisp/risk.py,sha256=d_Di5XB8BnHdVXG56xdHTcpB4-CIF5vo2ZRMQRG7Pek,34420
+openforis_whisp/stats.py,sha256=pTSYs77ISRBOIglRpq4SUx3lKRkrUZOKROLRX5IP9IY,63941
+openforis_whisp/utils.py,sha256=AISWF-MpfFdYkhd6bei4BViw2Iag20mmq61ykrF9YTk,31287
+openforis_whisp-3.0.0a4.dist-info/LICENSE,sha256=nqyqICO95iw_iwzP1t_IIAf7ZX3DPbL_M9WyQfh2q1k,1085
+openforis_whisp-3.0.0a4.dist-info/METADATA,sha256=ak2Dw632lgOtXEXkl5-haYK7vF3hPaJ6IkaRRJRdH0Y,16684
+openforis_whisp-3.0.0a4.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+openforis_whisp-3.0.0a4.dist-info/RECORD,,
@@ -1,20 +0,0 @@
-openforis_whisp/__init__.py,sha256=s42Q0VJdzm8mgnxfYg1hUEJPM2VLWIva2h-mdKyr444,3538
-openforis_whisp/advanced_stats.py,sha256=tvhgNTCGlT3aYecUPP6QCTO0FRrjk0qjs95NoVZvIt4,90935
-openforis_whisp/data_checks.py,sha256=KwgD72FA_n7joiJadGRpzntd2sLo0aqGNbOjRkB8iQI,32293
-openforis_whisp/data_conversion.py,sha256=L2IsiUyQUt3aHgSYGbIhgPGwM7eyS3nLVEoNO9YqQeM,21888
-openforis_whisp/datasets.py,sha256=aGJy0OYN4d0nsH3_IOYlHl-WCB7KFwZwMJ-dBi5Hc5Y,53470
-openforis_whisp/logger.py,sha256=9M6_3mdpoiWfC-pDwM9vKmB2l5Gul6Rb5rNTNh-_nzs,3054
-openforis_whisp/parameters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-openforis_whisp/parameters/config_runtime.py,sha256=NOo39MAi60XCwEx5pwkS0EHKJBh0XY1q06y4j0HAABg,1421
-openforis_whisp/parameters/lookup_context_and_metadata.csv,sha256=KgK0ik_Gd4t_Nq5cUkGPT4ZFZVO93HWSG82jRrOukt4,1298
-openforis_whisp/parameters/lookup_gaul1_admin.py,sha256=cQr5liRdXi85QieTxrz4VAkn0COvRCp82ZV0dYFWOio,474980
-openforis_whisp/parameters/lookup_gee_datasets.csv,sha256=UDvZrQsL5rXJn6CW6P3wofUrPLRmUFZWt6ETbXaxBMs,17454
-openforis_whisp/pd_schemas.py,sha256=W_ocS773LHfc05dJqvWRa-bRdX0wKFoNp0lMxgFx94Y,2681
-openforis_whisp/reformat.py,sha256=MPjP5lb218GTcTpd_Qvbj5ER_8EY4JjLDteQaS5OZCQ,32620
-openforis_whisp/risk.py,sha256=d_Di5XB8BnHdVXG56xdHTcpB4-CIF5vo2ZRMQRG7Pek,34420
-openforis_whisp/stats.py,sha256=nVzQpSu7BoSb2S6HheLeoK_pmguZ9Lyw0ZfbTTMVq4Q,63720
-openforis_whisp/utils.py,sha256=Q-EwhUaohk63WCx7Rr5VuR3X-oGtgILZDc8JsjbWhgg,30538
-openforis_whisp-3.0.0a3.dist-info/LICENSE,sha256=nqyqICO95iw_iwzP1t_IIAf7ZX3DPbL_M9WyQfh2q1k,1085
-openforis_whisp-3.0.0a3.dist-info/METADATA,sha256=6xuNhUpQWyzKU3m13FnJ7SX39jAVry1YEKNAdH0D2to,16684
-openforis_whisp-3.0.0a3.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-openforis_whisp-3.0.0a3.dist-info/RECORD,,