openforis-whisp 3.0.0a2__py3-none-any.whl → 3.0.0a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +8 -8
- openforis_whisp/advanced_stats.py +476 -312
- openforis_whisp/data_checks.py +80 -28
- openforis_whisp/datasets.py +14 -0
- openforis_whisp/logger.py +15 -3
- openforis_whisp/parameters/lookup_gee_datasets.csv +3 -2
- openforis_whisp/pd_schemas.py +7 -2
- openforis_whisp/reformat.py +8 -30
- openforis_whisp/stats.py +16 -62
- openforis_whisp/utils.py +468 -80
- {openforis_whisp-3.0.0a2.dist-info → openforis_whisp-3.0.0a4.dist-info}/METADATA +1 -1
- openforis_whisp-3.0.0a4.dist-info/RECORD +20 -0
- openforis_whisp-3.0.0a2.dist-info/RECORD +0 -20
- {openforis_whisp-3.0.0a2.dist-info → openforis_whisp-3.0.0a4.dist-info}/LICENSE +0 -0
- {openforis_whisp-3.0.0a2.dist-info → openforis_whisp-3.0.0a4.dist-info}/WHEEL +0 -0
@@ -36,6 +36,24 @@ from typing import Optional, List, Dict, Any, Tuple, Union
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import tempfile

+# Configure the "whisp" logger with auto-flush handler for Colab visibility
+_whisp_logger = logging.getLogger("whisp")
+if not _whisp_logger.handlers:
+    _handler = logging.StreamHandler(sys.stdout)
+    _handler.setLevel(logging.DEBUG)
+    _handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
+    # Override emit to force flush after each message for Colab
+    _original_emit = _handler.emit
+
+    def _emit_with_flush(record):
+        _original_emit(record)
+        sys.stdout.flush()
+
+    _handler.emit = _emit_with_flush
+    _whisp_logger.addHandler(_handler)
+    _whisp_logger.setLevel(logging.INFO)
+    _whisp_logger.propagate = False  # Don't propagate to root to avoid duplicates
+
 # ============================================================================
 # STDOUT/STDERR SUPPRESSION CONTEXT MANAGER (for C-level output)
 # ============================================================================
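The handler wiring above means the "whisp" logger is ready as soon as the package is imported, so downstream code only needs the standard logging API to tune verbosity. A minimal sketch (standard library only; nothing here beyond the logger name "whisp" is part of the package API):

    import logging

    # Quieten Whisp to warnings and errors; the auto-flush stdout handler added
    # at import time stays attached, so no handler reconfiguration is needed.
    logging.getLogger("whisp").setLevel(logging.WARNING)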
@@ -445,6 +463,16 @@ def join_admin_codes(
         columns=["admin_code_for_join", "gaul1_code"], errors="ignore"
     )

+    # Fill NaN values with "Unknown" and "not found" for features outside admin boundaries
+    # (e.g., points in the ocean or international waters)
+    df_joined[iso3_country_column] = df_joined[iso3_country_column].fillna(
+        "Unknown"
+    )
+    df_joined[iso2_country_column] = df_joined[iso2_country_column].fillna(
+        "not found"
+    )
+    df_joined[admin_1_column] = df_joined[admin_1_column].fillna("Unknown")
+
     logger.debug(
         f"Admin codes joined: {iso3_country_column}, {iso2_country_column}, {admin_1_column}"
     )
@@ -461,10 +489,16 @@ class ProgressTracker:

     Shows progress at adaptive milestones (more frequent for small datasets,
     less frequent for large datasets) with estimated time remaining based on
-    processing speed.
+    processing speed. Includes time-based heartbeat to prevent long silences.
     """

-    def __init__(
+    def __init__(
+        self,
+        total: int,
+        logger: logging.Logger = None,
+        heartbeat_interval: int = 180,
+        status_file: str = None,
+    ):
         """
         Initialize progress tracker.

@@ -474,26 +508,147 @@ class ProgressTracker:
             Total number of items to process
         logger : logging.Logger, optional
             Logger for output
+        heartbeat_interval : int, optional
+            Seconds between heartbeat messages (default: 180 = 3 minutes)
+        status_file : str, optional
+            Path to JSON status file for API/web app consumption.
+            Checkpoints auto-save to same directory as status_file.
         """
         self.total = total
         self.completed = 0
         self.lock = threading.Lock()
         self.logger = logger or logging.getLogger("whisp")
+        self.heartbeat_interval = heartbeat_interval
+
+        # Handle status_file: if directory passed, auto-generate filename
+        if status_file:
+            import os
+
+            if os.path.isdir(status_file):
+                self.status_file = os.path.join(
+                    status_file, "whisp_processing_status.json"
+                )
+            else:
+                # Validate that parent directory exists
+                parent_dir = os.path.dirname(status_file)
+                if parent_dir and not os.path.isdir(parent_dir):
+                    self.logger.warning(
+                        f"Status file directory does not exist: {parent_dir}"
+                    )
+                    self.status_file = None
+                else:
+                    self.status_file = status_file
+        else:
+            self.status_file = None

         # Adaptive milestones based on dataset size
         # Small datasets (< 50): show every 25% (not too spammy)
         # Medium (50-500): show every 20%
-        # Large (500
+        # Large (500-1000): show every 10%
+        # Very large (1000+): show every 5% (cleaner for long jobs)
         if total < 50:
             self.milestones = {25, 50, 75, 100}
         elif total < 500:
             self.milestones = {20, 40, 60, 80, 100}
-
+        elif total < 1000:
             self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
+        else:
+            self.milestones = {
+                5,
+                10,
+                15,
+                20,
+                25,
+                30,
+                35,
+                40,
+                45,
+                50,
+                55,
+                60,
+                65,
+                70,
+                75,
+                80,
+                85,
+                90,
+                95,
+                100,
+            }

         self.shown_milestones = set()
         self.start_time = time.time()
         self.last_update_time = self.start_time
+        self.heartbeat_stop = threading.Event()
+        self.heartbeat_thread = None
+
+    def _write_status_file(self, status: str = "processing") -> None:
+        """Write current progress to JSON status file using atomic write."""
+        if not self.status_file:
+            return
+
+        try:
+            import json
+            import os
+
+            elapsed = time.time() - self.start_time
+            percent = (self.completed / self.total * 100) if self.total > 0 else 0
+            rate = self.completed / elapsed if elapsed > 0 else 0
+            eta = (
+                (self.total - self.completed) / rate * 1.15
+                if rate > 0 and percent >= 5
+                else None
+            )
+
+            # Write to temp file then atomic rename to prevent partial reads
+            from datetime import datetime
+
+            temp_file = self.status_file + ".tmp"
+            with open(temp_file, "w") as f:
+                json.dump(
+                    {
+                        "status": status,
+                        "progress": f"{self.completed}/{self.total}",
+                        "percent": round(percent, 1),
+                        "elapsed_sec": round(elapsed),
+                        "eta_sec": round(eta) if eta else None,
+                        "updated_at": datetime.now().isoformat(),
+                    },
+                    f,
+                )
+            os.replace(temp_file, self.status_file)
+        except Exception:
+            pass
+
+    def start_heartbeat(self) -> None:
+        """Start background heartbeat thread for time-based progress updates."""
+        if self.heartbeat_thread is None or not self.heartbeat_thread.is_alive():
+            self.heartbeat_stop.clear()
+            self.heartbeat_thread = threading.Thread(
+                target=self._heartbeat_loop, daemon=True
+            )
+            self.heartbeat_thread.start()
+            # Write initial status
+            self._write_status_file(status="processing")
+
+    def _heartbeat_loop(self) -> None:
+        """Background loop that logs progress at time intervals."""
+        while not self.heartbeat_stop.wait(self.heartbeat_interval):
+            with self.lock:
+                # Only log if we haven't shown a milestone recently
+                time_since_update = time.time() - self.last_update_time
+                if (
+                    time_since_update >= self.heartbeat_interval
+                    and self.completed < self.total
+                ):
+                    elapsed = time.time() - self.start_time
+                    percent = int((self.completed / self.total) * 100)
+                    elapsed_str = self._format_time(elapsed)
+                    self.logger.info(
+                        f"[Processing] {self.completed:,}/{self.total:,} batches ({percent}%) | "
+                        f"Elapsed: {elapsed_str}"
+                    )
+                    self.last_update_time = time.time()

     def update(self, n: int = 1) -> None:
         """
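The status file written by _write_status_file is refreshed atomically (temp file plus os.replace), so an API or notebook can poll it safely while batches are still running. A minimal polling sketch, assuming only the JSON keys shown above (status, progress, percent, elapsed_sec, eta_sec, updated_at) and a hypothetical default path:

    import json
    import time

    def poll_whisp_status(path="whisp_processing_status.json", interval=30):
        # Poll the JSON status file until the run reports a terminal state.
        while True:
            try:
                with open(path) as f:
                    status = json.load(f)
            except FileNotFoundError:
                time.sleep(interval)  # not written yet
                continue
            print(f"{status['status']}: {status['progress']} ({status['percent']}%)")
            if status["status"] in ("completed", "interrupted"):
                return status
            time.sleep(interval)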
@@ -508,7 +663,7 @@ class ProgressTracker:
             self.completed += n
             percent = int((self.completed / self.total) * 100)

-            # Show milestone messages (
+            # Show milestone messages (5%, 10%, 15%... for large datasets)
             for milestone in sorted(self.milestones):
                 if percent >= milestone and milestone not in self.shown_milestones:
                     self.shown_milestones.add(milestone)
@@ -517,20 +672,36 @@ class ProgressTracker:
                     elapsed = time.time() - self.start_time
                     rate = self.completed / elapsed if elapsed > 0 else 0
                     remaining_items = self.total - self.completed
-
+
+                    # Calculate ETA with padding for overhead (loading, joins, etc.)
+                    # Don't show ETA until we have some samples (at least 5% complete)
+                    if rate > 0 and self.completed >= max(5, self.total * 0.05):
+                        eta_seconds = (
+                            remaining_items / rate
+                        ) * 1.15  # Add 15% padding for overhead
+                    else:
+                        eta_seconds = 0

                     # Format time strings
-                    eta_str =
+                    eta_str = (
+                        self._format_time(eta_seconds)
+                        if eta_seconds > 0
+                        else "calculating..."
+                    )
                     elapsed_str = self._format_time(elapsed)

                     # Build progress message
-                    msg = f"Progress: {self.completed}/{self.total} ({percent}%)"
+                    msg = f"Progress: {self.completed:,}/{self.total:,} batches ({percent}%)"
                     if percent < 100:
                         msg += f" | Elapsed: {elapsed_str} | ETA: {eta_str}"
                     else:
                         msg += f" | Total time: {elapsed_str}"

                     self.logger.info(msg)
+                    self.last_update_time = time.time()
+
+                    # Update status file for API consumption
+                    self._write_status_file()

     @staticmethod
     def _format_time(seconds: float) -> str:
@@ -544,14 +715,21 @@ class ProgressTracker:
             hours = seconds / 3600
             return f"{hours:.1f}h"

-    def finish(self) -> None:
-        """
+    def finish(self, output_file: str = None) -> None:
+        """Stop heartbeat and log completion."""
+        # Stop heartbeat thread
+        self.heartbeat_stop.set()
+        if self.heartbeat_thread and self.heartbeat_thread.is_alive():
+            self.heartbeat_thread.join(timeout=1)
+
         with self.lock:
             total_time = time.time() - self.start_time
             time_str = self._format_time(total_time)
-            self.
-
-
+            msg = f"Processing complete: {self.completed:,}/{self.total:,} batches in {time_str}"
+            self.logger.info(msg)
+
+            # Write final status
+            self._write_status_file(status="completed")


 # ============================================================================
@@ -600,18 +778,22 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
         If incorrect endpoint and raise_error=True
     """
     if not check_ee_endpoint(endpoint_type):
-        msg = (
-            f"Not using {endpoint_type.upper()} endpoint.\n"
-            f"Current URL: {ee.data._cloud_api_base_url}\n"
-            f"\nTo use {endpoint_type} endpoint, run:\n"
-        )
-        msg += "ee.Reset()\n"
         if endpoint_type == "high-volume":
-            msg
-            "
+            msg = (
+                "Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
+                "ee.Reset()\n"
+                "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
+                "Or with project specified (e.g. when in Colab):\n"
+                "ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
+            )
+        else:  # standard endpoint
+            msg = (
+                "Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
+                "ee.Reset()\n"
+                "ee.Initialize()\n"
+                "Or with project specified (e.g. when in Colab):\n"
+                "ee.Initialize(project='your_cloud_project_name')"
             )
-        else:
-            msg += "ee.Initialize() # Uses standard endpoint by default"

     if raise_error:
         raise RuntimeError(msg)
@@ -808,8 +990,8 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:

 def clean_geodataframe(
     gdf: gpd.GeoDataFrame,
-    remove_nulls: bool =
-
+    remove_nulls: bool = False,
+    repair_geometries: bool = False,
     logger: logging.Logger = None,
 ) -> gpd.GeoDataFrame:
     """
@@ -820,9 +1002,11 @@ def clean_geodataframe(
     gdf : gpd.GeoDataFrame
         Input GeoDataFrame
     remove_nulls : bool
-        Remove null geometries
-
-
+        Remove null geometries. Defaults to False to preserve data integrity.
+        Set to True only if you explicitly want to drop rows with null geometries.
+    repair_geometries : bool
+        Repair invalid geometries using Shapely's make_valid(). Defaults to False to preserve
+        original geometries. Set to True only if you want to automatically repair invalid geometries.
     logger : logging.Logger, optional
         Logger for output

@@ -839,11 +1023,11 @@ def clean_geodataframe(
             logger.warning(f"Removing {null_count} null geometries")
             gdf = gdf[~gdf.geometry.isna()].copy()

-    if
+    if repair_geometries:
         valid_count = gdf.geometry.is_valid.sum()
         invalid_count = len(gdf) - valid_count
         if invalid_count > 0:
-            logger.warning(f"
+            logger.warning(f"Repairing {invalid_count} invalid geometries")
             from shapely.validation import make_valid

             gdf = gdf.copy()
@@ -855,6 +1039,19 @@ def clean_geodataframe(
     return gdf


+# ============================================================================
+# BATCH RETRY HELPER
+# ============================================================================
+
+
+# ============================================================================
+# BATCH RETRY HELPER - DEPRECATED (removed due to semaphore deadlock issues)
+# ============================================================================
+# Note: Retry logic via sub-batching has been removed. Instead, use fail-fast
+# approach: when a batch fails, reduce batch_size parameter and retry manually.
+# This avoids semaphore deadlocks and provides clearer error messages.
+
+
 # ============================================================================
 # EE PROCESSING WITH RETRY LOGIC
 # ============================================================================
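With the sub-batching retry helper gone, recovery is the caller's job: a failed batch surfaces as a RuntimeError and the suggested fix is to rerun with a smaller batch_size. A hedged sketch of that caller-side loop (the function name and batch_size parameter come from this diff; the halving strategy is illustrative, not part of the package):

    def run_with_smaller_batches(path, batch_size=10, min_batch_size=2):
        # Fail-fast retry: halve batch_size after each failed attempt, as the
        # error message above recommends, until a floor is reached.
        while batch_size >= min_batch_size:
            try:
                return whisp_stats_geojson_to_df_concurrent(
                    input_geojson_filepath=path, batch_size=batch_size
                )
            except RuntimeError as exc:
                print(f"Run failed at batch_size={batch_size}: {exc}")
                batch_size //= 2
        raise RuntimeError("Processing failed even at the minimum batch size")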
@@ -964,7 +1161,6 @@ def process_ee_batch(
 def whisp_stats_geojson_to_df_concurrent(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -977,6 +1173,7 @@ def whisp_stats_geojson_to_df_concurrent(
     logger: logging.Logger = None,
     # Format parameters (auto-detect from config if not provided)
     decimal_places: int = None,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently to compute Whisp statistics with automatic formatting.
@@ -991,8 +1188,6 @@ def whisp_stats_geojson_to_df_concurrent(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1040,8 +1235,29 @@ def whisp_stats_geojson_to_df_concurrent(
     gdf = _load_geojson_silently(input_geojson_filepath)
     logger.info(f"Loaded {len(gdf):,} features")

+    # Validate external_id_column if provided (lightweight client-side check)
+    if external_id_column and external_id_column not in gdf.columns:
+        # Exclude geometry column from available columns list
+        available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
+        raise ValueError(
+            f"Column '{external_id_column}' not found in GeoJSON properties. "
+            f"Available columns: {available_cols}"
+        )
+
+    # Check completeness of external_id_column (warn if nulls exist)
+    if external_id_column and external_id_column in gdf.columns:
+        null_count = gdf[external_id_column].isna().sum()
+        if null_count > 0:
+            null_pct = (null_count / len(gdf)) * 100
+            logger.warning(
+                f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"These features may have missing external IDs in output."
+            )
+
     if validate_geometries:
-        gdf = clean_geodataframe(
+        gdf = clean_geodataframe(
+            gdf, remove_nulls=False, repair_geometries=False, logger=logger
+        )

     # Add stable plotIds for merging (starting from 1, not 0)
     gdf[plot_id_column] = range(1, len(gdf) + 1)
@@ -1080,13 +1296,18 @@ def whisp_stats_geojson_to_df_concurrent(

     # Batch the data
     batches = batch_geodataframe(gdf_for_ee, batch_size)
-    logger.info(
+    logger.info(
+        f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches (concurrent mode)..."
+    )

     # Setup semaphore for EE concurrency control
     ee_semaphore = threading.BoundedSemaphore(max_concurrent)

-    # Progress tracker
-    progress = ProgressTracker(
+    # Progress tracker with heartbeat for long-running jobs
+    progress = ProgressTracker(
+        len(batches), logger=logger, heartbeat_interval=180, status_file=status_file
+    )
+    progress.start_heartbeat()

     results = []

@@ -1127,64 +1348,77 @@ def whisp_stats_geojson_to_df_concurrent(
     pyogrio_logger.setLevel(logging.CRITICAL)

     try:
-
-
-
-
-
-
-
-        for future in as_completed(futures):
-            try:
-                batch_idx, df_server, df_client = future.result()
-
-                # Merge server and client results
-                if plot_id_column not in df_server.columns:
-                    df_server[plot_id_column] = range(len(df_server))
-
-                # Keep all EE statistics from server (all columns with _sum and _median suffixes)
-                # These are the actual EE processing results
-                df_server_clean = df_server.copy()
-
-                # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
-                # (formatted wrapper handles keep_external_columns parameter)
-                keep_external_columns = [plot_id_column]
-                if (
-                    external_id_column
-                    and external_id_column in df_client.columns
-                ):
-                    keep_external_columns.append(external_id_column)
-                if "geometry" in df_client.columns:
-                    keep_external_columns.append("geometry")
-                # Keep geometry type column (Geometry_type)
-                if geometry_type_column in df_client.columns:
-                    keep_external_columns.append(geometry_type_column)
-                # Also keep centroid columns (Centroid_lon, Centroid_lat)
-                centroid_cols = [
-                    c for c in df_client.columns if c.startswith("Centroid_")
-                ]
-                keep_external_columns.extend(centroid_cols)
-
-                df_client_clean = df_client[
-                    [c for c in keep_external_columns if c in df_client.columns]
-                ].drop_duplicates()
-
-                merged = df_server_clean.merge(
-                    df_client_clean,
-                    on=plot_id_column,
-                    how="left",
-                    suffixes=("_ee", "_client"),
-                )
-                results.append(merged)
-                progress.update()
+        # Don't suppress stdout here - we want progress messages to show in Colab
+        with ThreadPoolExecutor(max_workers=pool_workers) as executor:
+            futures = {
+                executor.submit(process_batch, i, batch): i
+                for i, batch in enumerate(batches)
+            }

-
-
-                import traceback
+            # Track which batches failed for retry
+            batch_map = {i: batch for i, batch in enumerate(batches)}
+            batch_futures = {future: i for future, i in futures.items()}

-
-
+            for future in as_completed(futures):
+                batch_idx = batch_futures[future]
+                try:
+                    batch_idx, df_server, df_client = future.result()
+
+                    # Merge server and client results
+                    if plot_id_column not in df_server.columns:
+                        df_server[plot_id_column] = range(len(df_server))
+
+                    # Keep all EE statistics from server (all columns with _sum and _median suffixes)
+                    # These are the actual EE processing results
+                    df_server_clean = df_server.copy()
+
+                    # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
+                    # (formatted wrapper handles keep_external_columns parameter)
+                    keep_external_columns = [plot_id_column]
+                    if external_id_column and external_id_column in df_client.columns:
+                        keep_external_columns.append(external_id_column)
+                    if "geometry" in df_client.columns:
+                        keep_external_columns.append("geometry")
+                    # Keep geometry type column (Geometry_type)
+                    if geometry_type_column in df_client.columns:
+                        keep_external_columns.append(geometry_type_column)
+                    # Also keep centroid columns (Centroid_lon, Centroid_lat)
+                    centroid_cols = [
+                        c for c in df_client.columns if c.startswith("Centroid_")
+                    ]
+                    keep_external_columns.extend(centroid_cols)
+
+                    df_client_clean = df_client[
+                        [c for c in keep_external_columns if c in df_client.columns]
+                    ]
+                    # Don't drop duplicates - we need one row per feature (one per plot_id)
+                    # Each plot_id should have exactly one row with its metadata
+
+                    merged = df_server_clean.merge(
+                        df_client_clean,
+                        on=plot_id_column,
+                        how="left",
+                        suffixes=("_ee", "_client"),
+                    )
+                    results.append(merged)
+                    progress.update()
+
+                except Exception as e:
+                    # Batch failed - fail fast with clear guidance
+                    error_msg = str(e)
+                    logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
+                    logger.debug(f"Full error: {error_msg}")
+
+                    # Get original batch for error reporting
+                    original_batch = batch_map[batch_idx]
+
+                    # Add to batch errors for final reporting
+                    batch_errors.append((batch_idx, original_batch, error_msg))
+    except (KeyboardInterrupt, SystemExit) as interrupt:
+        logger.warning("Processing interrupted by user")
+        # Update status file with interrupted state
+        progress._write_status_file(status="interrupted")
+        raise interrupt
     finally:
         # Restore logger levels
         fiona_logger.setLevel(old_fiona_level)
@@ -1192,8 +1426,60 @@ def whisp_stats_geojson_to_df_concurrent(

     progress.finish()

-    #
-    if batch_errors
+    # If we have batch errors after retry attempts, fail the entire process
+    if batch_errors:
+        total_failed_rows = sum(len(batch) for _, batch, _ in batch_errors)
+        failed_batch_indices = [str(idx) for idx, _, _ in batch_errors]
+
+        # Format detailed error information for debugging
+        error_details_list = []
+        for idx, batch, msg in batch_errors:
+            error_details_list.append(f" Batch {idx} ({len(batch)} features): {msg}")
+        error_details = "\n".join(error_details_list)
+
+        # Analyze error patterns for debugging hints
+        error_patterns = {
+            "memory": any("memory" in msg.lower() for _, _, msg in batch_errors),
+            "request_size": any(
+                keyword in msg.lower()
+                for _, _, msg in batch_errors
+                for keyword in ["too large", "10mb", "payload", "size limit"]
+            ),
+            "quota": any("quota" in msg.lower() for _, _, msg in batch_errors),
+            "timeout": any("timeout" in msg.lower() for _, _, msg in batch_errors),
+        }
+
+        # Build helpful suggestions based on error patterns
+        suggestions = []
+        if error_patterns["memory"]:
+            suggestions.append(
+                f" • Reduce batch_size parameter (currently: {batch_size}). Try: batch_size=5 or lower"
+            )
+        if error_patterns["request_size"]:
+            suggestions.append(
+                " • Request payload too large: reduce batch_size or simplify input geometries"
+            )
+        if error_patterns["quota"]:
+            suggestions.append(" • Earth Engine quota exceeded: wait and retry later")
+        if error_patterns["timeout"]:
+            suggestions.append(
+                " • Processing timeout: reduce batch_size or simplify input geometries"
+            )
+
+        suggestions_text = (
+            "\nDebugging hints:\n" + "\n".join(suggestions) if suggestions else ""
+        )
+
+        raise RuntimeError(
+            f"Failed to process {len(batch_errors)} batch(es):\n"
+            f"\n{error_details}\n"
+            f"\nTotal rows affected: {total_failed_rows}\n"
+            f"{suggestions_text}\n"
+            f"Please reduce batch_size and try again."
+        )
+
+    # Check if we should retry with validation due to band errors (legacy band error handling)
+    if not results:
         # All batches failed - likely a bad band issue
         is_band_error = any(
             keyword in str(batch_errors)
@@ -1483,7 +1769,7 @@ def whisp_stats_geojson_to_df_concurrent(
             )
             raise retry_e

-        logger.info(f"
+        logger.info(f"Processing complete: {len(formatted):,} features")
         return formatted
     else:
         logger.error(" No results produced")
@@ -1498,7 +1784,6 @@ def whisp_stats_geojson_to_df_concurrent(
 def whisp_stats_geojson_to_df_sequential(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -1523,8 +1808,6 @@ def whisp_stats_geojson_to_df_sequential(
         Path to input GeoJSON
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1564,8 +1847,29 @@ def whisp_stats_geojson_to_df_sequential(
     gdf = _load_geojson_silently(input_geojson_filepath)
     logger.info(f"Loaded {len(gdf):,} features")

-    #
-
+    # Validate external_id_column if provided (lightweight client-side check)
+    if external_id_column and external_id_column not in gdf.columns:
+        # Exclude geometry column from available columns list
+        available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
+        raise ValueError(
+            f"Column '{external_id_column}' not found in GeoJSON properties. "
+            f"Available columns: {available_cols}"
+        )
+
+    # Check completeness of external_id_column (warn if nulls exist)
+    if external_id_column and external_id_column in gdf.columns:
+        null_count = gdf[external_id_column].isna().sum()
+        if null_count > 0:
+            null_pct = (null_count / len(gdf)) * 100
+            logger.warning(
+                f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"These features may have missing external IDs in output."
+            )
+
+    # Clean geometries (preserve both null and invalid geometries by default)
+    gdf = clean_geodataframe(
+        gdf, remove_nulls=False, repair_geometries=False, logger=logger
+    )

     # Add stable plotIds for merging (starting from 1, not 0)
     gdf[plot_id_column] = range(1, len(gdf) + 1)
@@ -1612,7 +1916,9 @@ def whisp_stats_geojson_to_df_sequential(
     reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)

     # Process server-side with error handling for bad bands
-    logger.info(
+    logger.info(
+        f"Processing {len(gdf):,} features with Earth Engine (sequential mode)..."
+    )
     try:
         results_fc = whisp_image.reduceRegions(collection=fc, reducer=reducer, scale=10)
         df_server = convert_ee_to_df(results_fc)
@@ -1698,7 +2004,7 @@ def whisp_stats_geojson_to_df_sequential(
         convert_water_flag=True,
     )

-    logger.info(f"
+    logger.info(f"Processing complete: {len(formatted):,} features")

     # Consolidate external_id_column to standardized 'external_id'
     if external_id_column:
@@ -1731,7 +2037,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
 def whisp_formatted_stats_geojson_to_df_concurrent(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -1748,7 +2053,8 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     convert_water_flag: bool = True,
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
-
+    geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON concurrently with automatic formatting and validation.
@@ -1764,8 +2070,6 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -1799,14 +2103,10 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         Water flag ratio threshold (default 0.5)
     sort_column : str
         Column to sort by (default "plotId", None to skip)
-
-        If True, includes
-        - geo_original: Original input geometry (before EE processing)
-
-        - geometry_type: Processed geometry type (from EE)
-        - geometry_type_changed: Boolean flag if geometry changed
-        - geometry_type_transition: Description of how it changed
-        These columns enable full transparency and auditability for compliance tracking.
+    geometry_audit_trail : bool, default False
+        If True, includes original input geometry column:
+        - geo_original: Original input geometry (before EE processing), stored as GeoJSON
+        Enables geometry traceability for compliance and audit purposes.

     Returns
     -------
@@ -1826,15 +2126,17 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     decimal_places = _extract_decimal_places(stats_area_columns_formatting)
     logger.debug(f"Using decimal_places={decimal_places} from config")

-    #
-
+    # Load original geometries once here if needed for audit trail (avoid reloading later)
+    gdf_original_geoms = None
+    if geometry_audit_trail:
+        logger.debug("Pre-loading GeoJSON for geometry audit trail...")
+        gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)

     # Step 1: Get raw stats
     logger.debug("Step 1/2: Extracting statistics (concurrent)...")
     df_raw = whisp_stats_geojson_to_df_concurrent(
         input_geojson_filepath=input_geojson_filepath,
         external_id_column=external_id_column,
-        remove_geom=remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,
@@ -1845,6 +2147,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
         max_retries=max_retries,
         add_metadata_server=add_metadata_server,
         logger=logger,
+        status_file=status_file,
     )

     # Step 2: Format the output
@@ -1890,95 +2193,39 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     )

     # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
-    if
+    if geometry_audit_trail:
         logger.debug("Adding audit trail columns...")
         try:
-            #
-
-
+            # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
+            if gdf_original_geoms is None:
+                logger.warning("Original geometries not pre-loaded, loading now...")
+                gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)

             # Use plotId from df_validated to maintain mapping
             df_original_geom = pd.DataFrame(
                 {
-                    "plotId": df_validated["plotId"].values[: len(
-                    "geo_original":
+                    "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                    "geo_original": gdf_original_geoms["geometry"].apply(
                         lambda g: json.dumps(mapping(g)) if g is not None else None
                     ),
-                    "geometry_type_original": gdf_original["geometry"].geom_type.values,
                 }
             )

             # Merge original geometries back
             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")

-            # Extract geometry type from processed 'geo' column if it exists
-            # Note: 'geo' column may not exist after validation removes extra columns
-            if "geo" in df_validated.columns:
-                # Use geo column from validated dataframe
-                def extract_geom_type(x):
-                    try:
-                        if isinstance(x, dict):
-                            return x.get("type")
-                        elif isinstance(x, str):
-                            # Handle both JSON strings and Python dict string representations
-                            try:
-                                parsed = json.loads(x)
-                            except:
-                                # Try ast.literal_eval for Python dict representations
-                                import ast
-
-                                parsed = ast.literal_eval(x)
-                            return (
-                                parsed.get("type") if isinstance(parsed, dict) else None
-                            )
-                    except:
-                        pass
-                    return None
-
-                df_validated["geometry_type"] = df_validated["geo"].apply(
-                    extract_geom_type
-                )
-            else:
-                # If geo doesn't exist, just use the original type
-                df_validated["geometry_type"] = df_validated["geometry_type_original"]
-
-            # Flag if geometry changed
-            df_validated["geometry_type_changed"] = (
-                df_validated["geometry_type_original"] != df_validated["geometry_type"]
-            )
-
-            # Classify the geometry type transition
-            def classify_transition(orig, proc):
-                if orig == proc:
-                    return "no_change"
-                elif proc == "LineString":
-                    return f"{orig}_simplified_to_linestring"
-                elif proc == "Point":
-                    return f"{orig}_simplified_to_point"
-                else:
-                    return f"{orig}_to_{proc}"
-
-            df_validated["geometry_type_transition"] = df_validated.apply(
-                lambda row: classify_transition(
-                    row["geometry_type_original"], row["geometry_type"]
-                ),
-                axis=1,
-            )
-
             # Store processing metadata
             df_validated.attrs["processing_metadata"] = {
-                "whisp_version": "
+                "whisp_version": "3.0.0a1",
                 "processing_date": datetime.now().isoformat(),
                 "processing_mode": "concurrent",
                 "ee_endpoint": "high_volume",
                 "validate_geometries": validate_geometries,
                 "datasets_used": national_codes or [],
-                "
+                "geometry_audit_trail": True,
             }

-            logger.info(
-                f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
-            )
+            logger.info(f"Audit trail added: geo_original column")

         except Exception as e:
             logger.warning(f"Error adding audit trail: {e}")
@@ -2003,7 +2250,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
 def whisp_formatted_stats_geojson_to_df_sequential(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
@@ -2016,7 +2262,8 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     convert_water_flag: bool = True,
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
-
+    geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON sequentially with automatic formatting and validation.
@@ -2032,8 +2279,6 @@ def whisp_formatted_stats_geojson_to_df_sequential(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -2059,14 +2304,10 @@ def whisp_formatted_stats_geojson_to_df_sequential(
         Water flag ratio threshold (default 0.5)
     sort_column : str
         Column to sort by (default "plotId", None to skip)
-
-        If True, includes
-        - geo_original: Original input geometry (before EE processing)
-
-        - geometry_type: Processed geometry type (from EE)
-        - geometry_type_changed: Boolean flag if geometry changed
-        - geometry_type_transition: Description of how it changed
-        These columns enable full transparency and auditability for EUDR compliance.
+    geometry_audit_trail : bool, default True
+        If True, includes original input geometry column:
+        - geo_original: Original input geometry (before EE processing), stored as GeoJSON
+        Enables geometry traceability for compliance and audit purposes.

     Returns
     -------
@@ -2086,12 +2327,17 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     decimal_places = _extract_decimal_places(stats_area_columns_formatting)
     logger.debug(f"Using decimal_places={decimal_places} from config")

+    # Load original geometries once here if needed for audit trail (avoid reloading later)
+    gdf_original_geoms = None
+    if geometry_audit_trail:
+        logger.debug("Pre-loading GeoJSON for geometry audit trail...")
+        gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+
     # Step 1: Get raw stats
     logger.debug("Step 1/2: Extracting statistics (sequential)...")
     df_raw = whisp_stats_geojson_to_df_sequential(
         input_geojson_filepath=input_geojson_filepath,
         external_id_column=external_id_column,
-        remove_geom=remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,
@@ -2143,94 +2389,38 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     )

     # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
-    if
+    if geometry_audit_trail:
         logger.debug("Adding audit trail columns...")
         try:
-            #
-
-
+            # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
+            if gdf_original_geoms is None:
+                logger.warning("Original geometries not pre-loaded, loading now...")
+                gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)

             # Use plotId from df_validated to maintain mapping
             df_original_geom = pd.DataFrame(
                 {
-                    "plotId": df_validated["plotId"].values[: len(
-                    "geo_original":
+                    "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                    "geo_original": gdf_original_geoms["geometry"].apply(
                         lambda g: json.dumps(mapping(g)) if g is not None else None
                     ),
-                    "geometry_type_original": gdf_original["geometry"].geom_type.values,
                 }
             )

             # Merge original geometries back
             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")

-            # Extract geometry type from processed 'geo' column if it exists
-            # Note: 'geo' column may not exist after validation removes extra columns
-            if "geo" in df_validated.columns:
-                # Use geo column from validated dataframe
-                def extract_geom_type(x):
-                    try:
-                        if isinstance(x, dict):
-                            return x.get("type")
-                        elif isinstance(x, str):
-                            # Handle both JSON strings and Python dict string representations
-                            try:
-                                parsed = json.loads(x)
-                            except:
-                                # Try ast.literal_eval for Python dict representations
-                                import ast
-
-                                parsed = ast.literal_eval(x)
-                            return (
-                                parsed.get("type") if isinstance(parsed, dict) else None
-                            )
-                    except:
-                        pass
-                    return None
-
-                df_validated["geometry_type"] = df_validated["geo"].apply(
-                    extract_geom_type
-                )
-            else:
-                # If geo doesn't exist, just use the original type
-                df_validated["geometry_type"] = df_validated["geometry_type_original"]
-
-            # Flag if geometry changed
-            df_validated["geometry_type_changed"] = (
-                df_validated["geometry_type_original"] != df_validated["geometry_type"]
-            )
-
-            # Classify the geometry type transition
-            def classify_transition(orig, proc):
-                if orig == proc:
-                    return "no_change"
-                elif proc == "LineString":
-                    return f"{orig}_simplified_to_linestring"
-                elif proc == "Point":
-                    return f"{orig}_simplified_to_point"
-                else:
-                    return f"{orig}_to_{proc}"
-
-            df_validated["geometry_type_transition"] = df_validated.apply(
-                lambda row: classify_transition(
-                    row["geometry_type_original"], row["geometry_type"]
-                ),
-                axis=1,
-            )
-
             # Store processing metadata
             df_validated.attrs["processing_metadata"] = {
-                "whisp_version": "
+                "whisp_version": "3.0.0a1",
                 "processing_date": datetime.now().isoformat(),
                 "processing_mode": "sequential",
                 "ee_endpoint": "standard",
                 "datasets_used": national_codes or [],
-                "
+                "geometry_audit_trail": True,
             }

-            logger.info(
-                f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
-            )
+            logger.info(f"Audit trail added: geo_original column")

         except Exception as e:
             logger.warning(f"Error adding audit trail: {e}")
@@ -2260,12 +2450,11 @@ def whisp_formatted_stats_geojson_to_df_sequential(
 def whisp_formatted_stats_geojson_to_df_fast(
     input_geojson_filepath: str,
     external_id_column: str = None,
-    remove_geom: bool = False,
     national_codes: List[str] = None,
     unit_type: str = "ha",
     whisp_image: ee.Image = None,
     custom_bands: Dict[str, Any] = None,
-    mode: str = "
+    mode: str = "sequential",
     # Concurrent-specific parameters
     batch_size: int = 10,
     max_concurrent: int = 20,
@@ -2278,15 +2467,16 @@ def whisp_formatted_stats_geojson_to_df_fast(
     convert_water_flag: bool = True,
     water_flag_threshold: float = 0.5,
     sort_column: str = "plotId",
-
+    geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Process GeoJSON to Whisp statistics with optimized fast processing.

-
-
+    Routes to concurrent (high-volume endpoint) or sequential (standard endpoint)
+    based on explicit mode selection.

-    This is the recommended entry point for most users
+    This is the recommended entry point for most users.

     Parameters
     ----------
@@ -2294,8 +2484,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
         Path to input GeoJSON file
     external_id_column : str, optional
         Column name for external IDs
-    remove_geom : bool
-        Remove geometry column from output
     national_codes : List[str], optional
         ISO2 codes for national datasets
     unit_type : str
@@ -2306,12 +2494,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
         Custom band information
     mode : str
         Processing mode:
-        - "
-
-          * 1-5MB: sequential
-          * >5MB: concurrent
-        - "concurrent": Force high-volume endpoint (batch processing)
-        - "sequential": Force standard endpoint (single-threaded)
+        - "concurrent": Uses high-volume endpoint with batch processing
+        - "sequential": Uses standard endpoint for sequential processing
     batch_size : int
         Features per batch (only for concurrent mode)
     max_concurrent : int
@@ -2332,6 +2516,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
         Water flag ratio threshold
     sort_column : str
         Column to sort by
+    geometry_audit_trail : bool
+        Include geometry modification audit trail columns

     Returns
     -------
@@ -2340,16 +2526,13 @@ def whisp_formatted_stats_geojson_to_df_fast(

     Examples
     --------
-    >>> #
-    >>> df = whisp_formatted_stats_geojson_to_df_fast("data.geojson")
-
-    >>> # Force concurrent processing for large datasets
+    >>> # Use concurrent processing (recommended for most datasets)
     >>> df = whisp_formatted_stats_geojson_to_df_fast(
-    ...     "
+    ...     "data.geojson",
     ...     mode="concurrent"
     ... )

-    >>> # Use sequential for
+    >>> # Use sequential processing for more stable results
     >>> df = whisp_formatted_stats_geojson_to_df_fast(
     ...     "data.geojson",
     ...     mode="sequential"
@@ -2357,40 +2540,20 @@ def whisp_formatted_stats_geojson_to_df_fast(
     """
     logger = logging.getLogger("whisp")

-    #
-    if mode
-        try:
-            file_size = Path(input_geojson_filepath).stat().st_size
-            if file_size > 5_000_000:  # >5MB
-                chosen_mode = "concurrent"
-                logger.info(
-                    f"File size {file_size/1e6:.1f}MB → Using concurrent (high-volume endpoint)"
-                )
-            else:  # <=5MB
-                chosen_mode = "sequential"
-                logger.info(
-                    f"File size {file_size/1e6:.1f}MB → Using sequential (standard endpoint)"
-                )
-        except Exception as e:
-            logger.warning(
-                f"Could not determine file size: {e}. Defaulting to sequential."
-            )
-            chosen_mode = "sequential"
-    elif mode in ("concurrent", "sequential"):
-        chosen_mode = mode
-        logger.info(f"Mode explicitly set to: {mode}")
-    else:
+    # Validate mode parameter
+    if mode not in ("concurrent", "sequential"):
         raise ValueError(
-            f"Invalid mode '{mode}'. Must be '
+            f"Invalid mode '{mode}'. Must be 'concurrent' or 'sequential'."
         )

+    logger.info(f"Mode: {mode}")
+
     # Route to appropriate function
-    if
+    if mode == "concurrent":
         logger.debug("Routing to concurrent processing...")
         return whisp_formatted_stats_geojson_to_df_concurrent(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -2406,14 +2569,14 @@ def whisp_formatted_stats_geojson_to_df_fast(
             convert_water_flag=convert_water_flag,
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
-
+            geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )
     else:  # sequential
         logger.debug("Routing to sequential processing...")
         return whisp_formatted_stats_geojson_to_df_sequential(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
@@ -2424,5 +2587,6 @@ def whisp_formatted_stats_geojson_to_df_fast(
             convert_water_flag=convert_water_flag,
             water_flag_threshold=water_flag_threshold,
             sort_column=sort_column,
-
+            geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )