caption-flow 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff compares two publicly released versions of this package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that public registry.
@@ -4,18 +4,21 @@ import asyncio
  import json
  import logging
  from dataclasses import asdict
- from datetime import datetime
+ from datetime import datetime, timedelta
  from pathlib import Path
  from typing import List, Optional, Set, Dict, Any
  import pyarrow as pa
  import pyarrow.parquet as pq
  from pyarrow import fs
  import pandas as pd
- from collections import defaultdict
+ from collections import defaultdict, deque
+ import time
+ import numpy as np

- from .models import Job, Caption, Contributor, JobStatus
+ from ..models import Job, Caption, Contributor, StorageContents, JobId

  logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)


  class StorageManager:
@@ -60,6 +63,11 @@ class StorageManager:
  self.total_flushes = 0
  self.duplicates_skipped = 0

+ # Rate tracking
+ self.row_additions = deque(maxlen=10000) # Store (timestamp, row_count) tuples
+ self.start_time = time.time()
+ self.last_rate_log_time = time.time()
+
  # Base caption schema without dynamic output fields
  self.base_caption_fields = [
  ("job_id", pa.string()),
@@ -68,6 +76,8 @@ class StorageManager:
  ("chunk_id", pa.string()),
  ("item_key", pa.string()),
  ("item_index", pa.int32()),
+ ("filename", pa.string()),
+ ("url", pa.string()),
  ("caption_count", pa.int32()),
  ("contributor_id", pa.string()),
  ("timestamp", pa.timestamp("us")),
@@ -105,6 +115,137 @@ class StorageManager:
  ]
  )

+ def _is_column_empty(self, df: pd.DataFrame, column_name: str) -> bool:
+ """Check if a column is entirely empty, null, or contains only zeros/empty lists."""
+ if column_name not in df.columns:
+ return True
+
+ col = df[column_name]
+
+ # Check if all values are null/NaN
+ if col.isna().all():
+ return True
+
+ # For numeric columns, check if all non-null values are 0
+ if pd.api.types.is_numeric_dtype(col):
+ non_null_values = col.dropna()
+ if len(non_null_values) > 0 and (non_null_values == 0).all():
+ return True
+
+ # For list columns, check if all are None or empty lists
+ if col.dtype == "object":
+ non_null_values = col.dropna()
+ if len(non_null_values) == 0:
+ return True
+ # Check if all non-null values are empty lists
+ all_empty_lists = True
+ for val in non_null_values:
+ if isinstance(val, list) and len(val) > 0:
+ all_empty_lists = False
+ break
+ elif not isinstance(val, list):
+ all_empty_lists = False
+ break
+ if all_empty_lists:
+ return True
+
+ return False
+
+ def _get_non_empty_columns(
+ self, df: pd.DataFrame, preserve_base_fields: bool = True
+ ) -> List[str]:
+ """Get list of columns that contain actual data.
+
+ Args:
+ df: DataFrame to check
+ preserve_base_fields: If True, always include base fields even if empty
+ """
+ base_field_names = {field[0] for field in self.base_caption_fields}
+ non_empty_columns = []
+
+ for col in df.columns:
+ # Always keep base fields if preserve_base_fields is True
+ if preserve_base_fields and col in base_field_names:
+ non_empty_columns.append(col)
+ elif not self._is_column_empty(df, col):
+ non_empty_columns.append(col)
+
+ return non_empty_columns
+
+ def _calculate_rates(self) -> Dict[str, float]:
+ """Calculate row addition rates over different time windows."""
+ current_time = time.time()
+ rates = {}
+
+ # Define time windows in minutes
+ windows = {"1min": 1, "5min": 5, "15min": 15, "60min": 60}
+
+ # Clean up old entries beyond the largest window
+ cutoff_time = current_time - (60 * 60) # 60 minutes
+ while self.row_additions and self.row_additions[0][0] < cutoff_time:
+ self.row_additions.popleft()
+
+ # Calculate rates for each window
+ for window_name, window_minutes in windows.items():
+ window_seconds = window_minutes * 60
+ window_start = current_time - window_seconds
+
+ # Sum rows added within this window
+ rows_in_window = sum(
+ count for timestamp, count in self.row_additions if timestamp >= window_start
+ )
+
+ # Calculate rate (rows per second)
+ # For windows larger than elapsed time, use elapsed time
+ elapsed = current_time - self.start_time
+ actual_window = min(window_seconds, elapsed)
+
+ if actual_window > 0:
+ rate = rows_in_window / actual_window
+ rates[window_name] = rate
+ else:
+ rates[window_name] = 0.0
+
+ # Calculate instantaneous rate (last minute)
+ instant_window_start = current_time - 60 # Last 60 seconds
+ instant_rows = sum(
+ count for timestamp, count in self.row_additions if timestamp >= instant_window_start
+ )
+ instant_window = min(60, current_time - self.start_time)
+ rates["instant"] = instant_rows / instant_window if instant_window > 0 else 0.0
+
+ # Calculate overall rate since start
+ total_elapsed = current_time - self.start_time
+ if total_elapsed > 0:
+ rates["overall"] = self.total_captions_written / total_elapsed
+ else:
+ rates["overall"] = 0.0
+
+ return rates
+
+ def _log_rates(self, rows_added: int):
+ """Log rate information if enough time has passed."""
+ current_time = time.time()
+
+ # Log rates every 10 seconds or if it's been more than 30 seconds
+ time_since_last_log = current_time - self.last_rate_log_time
+ if time_since_last_log < 10 and rows_added < 50:
+ return
+
+ rates = self._calculate_rates()
+
+ # Format the rate information
+ rate_str = (
+ f"Rate stats - Instant: {rates['instant']:.1f} rows/s | "
+ f"Avg (5m): {rates['5min']:.1f} | "
+ f"Avg (15m): {rates['15min']:.1f} | "
+ f"Avg (60m): {rates['60min']:.1f} | "
+ f"Overall: {rates['overall']:.1f} rows/s"
+ )
+
+ logger.info(rate_str)
+ self.last_rate_log_time = current_time
+
  def _get_existing_output_columns(self) -> Set[str]:
  """Get output field columns that actually exist in the parquet file."""
  if not self.captions_path.exists():
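The new rate-tracking helpers come down to a deque of (timestamp, count) pairs plus a windowed sum divided by the effective window length. A minimal standalone sketch of that calculation, with hypothetical names, not part of the package:

    import time
    from collections import deque

    row_additions = deque(maxlen=10000)
    start_time = time.time()

    def record(count: int) -> None:
        # Mirror _flush_captions: log how many rows just landed on disk.
        row_additions.append((time.time(), count))

    def rate(window_seconds: float) -> float:
        now = time.time()
        rows = sum(c for t, c in row_additions if t >= now - window_seconds)
        # If the process has run for less than the window, divide by the
        # elapsed time instead, mirroring the diff's "actual_window" logic.
        effective = min(window_seconds, now - start_time)
        return rows / effective if effective > 0 else 0.0

    record(128)
    print(f"{rate(60):.1f} rows/s over the last minute")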
@@ -216,9 +357,14 @@ class StorageManager:
  if "outputs" in df.columns:
  df = df.drop(columns=["outputs"])

- # Update known fields and schema
- self.known_output_fields = output_fields
- self.caption_schema = self._build_caption_schema(output_fields)
+ # Remove empty columns before saving (but preserve base fields)
+ non_empty_columns = self._get_non_empty_columns(df, preserve_base_fields=True)
+ df = df[non_empty_columns]
+
+ # Update known fields and schema based on non-empty columns
+ base_field_names = {field[0] for field in self.base_caption_fields}
+ self.known_output_fields = set(non_empty_columns) - base_field_names
+ self.caption_schema = self._build_caption_schema(self.known_output_fields)

  # Write migrated table
  migrated_table = pa.Table.from_pandas(df, schema=self.caption_schema)
@@ -226,8 +372,7 @@ class StorageManager:
  logger.info("Migration complete - outputs now stored in dynamic columns")

  async def save_caption(self, caption: Caption):
- """Save a caption entry with dynamic output columns."""
- # Convert to dict
+ """Save a caption entry, grouping outputs by job_id/item_key (not separating captions)."""
  caption_dict = asdict(caption)

  # Extract item_index from metadata if present
@@ -242,16 +387,61 @@ class StorageManager:
  # Remove old "captions" field if it exists (will be in outputs)
  caption_dict.pop("captions", None)

- new_fields = set()
+ # Grouping key: (job_id, item_key)
+ _job_id = caption_dict.get("job_id")
+ job_id = JobId.from_dict(_job_id).get_sample_str()
+ group_key = job_id
+ logger.debug(
+ f"save_caption: group_key={group_key}, outputs={list(outputs.keys())}, caption_count={caption_dict.get('caption_count')}, item_index={caption_dict.get('item_index')}"
+ )
+
+ # Try to find existing buffered row for this group
+ found_row = False
+ for idx, row in enumerate(self.caption_buffer):
+ check_key = row.get("job_id")
+ logger.debug(f"Checking buffer row {idx}: check_key={check_key}, group_key={group_key}")
+ if check_key == group_key:
+ found_row = True
+ logger.debug(f"Found existing buffer row for group_key={group_key} at index {idx}")
+ # Merge outputs into existing row
+ for field_name, field_values in outputs.items():
+ if field_name not in self.known_output_fields:
+ self.known_output_fields.add(field_name)
+ logger.info(f"New output field detected: {field_name}")
+ if field_name in row and isinstance(row[field_name], list):
+ logger.debug(
+ f"Merging output field '{field_name}' into existing row: before={row[field_name]}, adding={field_values}"
+ )
+ row[field_name].extend(field_values)
+ logger.debug(f"After merge: {row[field_name]}")
+ else:
+ logger.debug(
+ f"Setting new output field '{field_name}' in existing row: {field_values}"
+ )
+ row[field_name] = list(field_values)
+ # Optionally update other fields (e.g., caption_count)
+ if "caption_count" in caption_dict:
+ old_count = row.get("caption_count", 0)
+ row["caption_count"] = old_count + caption_dict["caption_count"]
+ logger.debug(
+ f"Updated caption_count for group_key={group_key}: {old_count} + {caption_dict['caption_count']} = {row['caption_count']}"
+ )
+ return # Already merged, no need to add new row
+ else:
+ logger.debug(f"Caption row not found for group key: {group_key} vs {check_key}")
+
+ if not found_row:
+ logger.debug(
+ f"No existing buffer row found for group_key={group_key}, creating new row."
+ )
+
+ # If not found, create new row
  for field_name, field_values in outputs.items():
- caption_dict[field_name] = field_values
  if field_name not in self.known_output_fields:
- new_fields.add(field_name)
- self.known_output_fields.add(field_name) # Add immediately
-
- if new_fields:
- logger.info(f"New output fields detected: {sorted(new_fields)}")
- logger.info(f"Total known output fields: {sorted(self.known_output_fields)}")
+ self.known_output_fields.add(field_name)
+ logger.info(f"New output field detected: {field_name}")
+ caption_dict[field_name] = list(field_values)
+ logger.debug(f"Adding output field '{field_name}' to new row: {field_values}")

  # Serialize metadata to JSON if present
  if "metadata" in caption_dict:
@@ -259,68 +449,16 @@ class StorageManager:
  else:
  caption_dict["metadata"] = "{}"

- # Add to buffer
- self.caption_buffer.append(caption_dict)
-
- # Log buffer status
- logger.debug(f"Caption buffer size: {len(self.caption_buffer)}/{self.caption_buffer_size}")
-
- # Flush if buffer is large enough
- if len(self.caption_buffer) >= self.caption_buffer_size:
- await self._flush_captions()
-
- async def save_captions(self, caption_data: Dict[str, Any]):
- """Save captions for an image - compatible with dict input."""
- job_id = caption_data["job_id"]
-
- # Check if we already have captions for this job_id
- if job_id in self.existing_caption_job_ids:
- self.duplicates_skipped += 1
- logger.debug(f"Skipping duplicate captions for job_id: {job_id}")
- return
-
- # Check if it's already in the buffer
- for buffered in self.caption_buffer:
- if buffered["job_id"] == job_id:
- logger.debug(f"Captions for job_id {job_id} already in buffer")
- return
-
- # Handle outputs if present
- if "outputs" in caption_data:
- outputs = caption_data.pop("outputs")
- # Add each output field directly to caption_data
- for field_name, field_values in outputs.items():
- caption_data[field_name] = field_values
- if field_name not in self.known_output_fields:
- self.known_output_fields.add(field_name)
- logger.info(f"New output field detected: {field_name}")
-
- # Handle legacy captions field
- if "captions" in caption_data and "captions" not in self.known_output_fields:
- self.known_output_fields.add("captions")
-
- # Count all outputs
- caption_count = 0
- for field_name in self.known_output_fields:
- if field_name in caption_data and isinstance(caption_data[field_name], list):
- caption_count += len(caption_data[field_name])
-
- caption_data["caption_count"] = caption_count
+ if isinstance(caption_dict.get("job_id"), dict):
+ caption_dict["job_id"] = job_id

- # Add default values for optional fields
- if "quality_scores" not in caption_data:
- caption_data["quality_scores"] = None
-
- if "metadata" in caption_data and isinstance(caption_data["metadata"], dict):
- caption_data["metadata"] = json.dumps(caption_data["metadata"])
- elif "metadata" not in caption_data:
- caption_data["metadata"] = "{}"
-
- self.caption_buffer.append(caption_data)
- self.existing_caption_job_ids.add(job_id)
+ self.caption_buffer.append(caption_dict)
+ logger.debug(
+ f"Appended new caption row for group_key={group_key}. Caption buffer size: {len(self.caption_buffer)}/{self.caption_buffer_size}"
+ )

- # Flush if buffer is large enough
  if len(self.caption_buffer) >= self.caption_buffer_size:
+ logger.debug("Caption buffer full, flushing captions.")
  await self._flush_captions()

  async def _flush_captions(self):
@@ -337,25 +475,7 @@ class StorageManager:
  if field_name in row and isinstance(row[field_name], list):
  total_outputs += len(row[field_name])

- logger.info(f"Flushing {num_rows} rows with {total_outputs} total outputs to disk")
-
- # Check if we need to evolve the schema
- current_schema_fields = set(self.caption_schema.names) if self.caption_schema else set()
- all_fields_needed = set(
- self.base_caption_fields[i][0] for i in range(len(self.base_caption_fields))
- )
- all_fields_needed.update(self.known_output_fields)
-
- if all_fields_needed != current_schema_fields:
- # Schema evolution needed
- logger.info(
- f"Evolving schema to include new fields: {all_fields_needed - current_schema_fields}"
- )
- self.caption_schema = self._build_caption_schema(self.known_output_fields)
-
- # If file exists, we need to migrate it
- if self.captions_path.exists():
- await self._evolve_schema_on_disk()
+ logger.debug(f"Flushing {num_rows} rows with {total_outputs} total outputs to disk")

  # Prepare data with all required columns
  prepared_buffer = []
@@ -374,8 +494,9 @@ class StorageManager:

  prepared_buffer.append(prepared_row)

- # Create table from buffer
- table = pa.Table.from_pylist(prepared_buffer, schema=self.caption_schema)
+ # Build schema with all known fields (base + output)
+ schema = self._build_caption_schema(self.known_output_fields)
+ table = pa.Table.from_pylist(prepared_buffer, schema=schema)

  if self.captions_path.exists():
  # Read existing table
@@ -391,45 +512,146 @@ class StorageManager:
  if row["job_id"] not in existing_job_ids:
  new_rows.append(row)
  elif row not in duplicate_rows:
- duplicate_rows.append(row)
+ duplicate_rows.append(
+ {
+ "input": row,
+ "existing_job": existing.to_pandas()[
+ existing.to_pandas()["job_id"] == row["job_id"]
+ ].to_dict(orient="records"),
+ }
+ )

  if duplicate_rows:
  logger.info(f"Example duplicate row: {duplicate_rows[0]}")
+
  if new_rows:
  # Create table from new rows only
- new_table = pa.Table.from_pylist(new_rows, schema=self.caption_schema)
+ new_table = pa.Table.from_pylist(new_rows, schema=schema)

- # Combine tables
- combined = pa.concat_tables([existing, new_table])
+ # Concatenate with promote_options="default" to handle schema differences automatically
+ combined = pa.concat_tables([existing, new_table], promote_options="default")

- # Write with proper preservation
+ # Write combined table
  pq.write_table(combined, self.captions_path, compression="snappy")

- logger.info(
- f"Added {len(new_rows)} new rows (skipped {num_rows - len(new_rows)} duplicates)"
- )
+ self.duplicates_skipped = num_rows - len(new_rows)
  actual_new = len(new_rows)
  else:
- logger.info(f"All {num_rows} rows were duplicates, skipping write")
- actual_new = 0
+ logger.info(f"All {num_rows} rows were duplicates, exiting")
+ raise SystemError("No duplicates can be submitted")
  else:
- # Write new file
+ # Write new file with all fields
  pq.write_table(table, self.captions_path, compression="snappy")
  actual_new = num_rows

+ # Update statistics
  self.total_captions_written += actual_new
  self.total_caption_entries_written += total_outputs
  self.total_flushes += 1
  self.caption_buffer.clear()

+ # Track row additions for rate calculation
+ if actual_new > 0:
+ current_time = time.time()
+ self.row_additions.append((current_time, actual_new))
+
+ # Log rates
+ self._log_rates(actual_new)
+
  logger.info(
- f"Successfully wrote captions (rows: {self.total_captions_written}, "
- f"total outputs: {self.total_caption_entries_written}, "
- f"duplicates skipped: {self.duplicates_skipped})"
+ f"Successfully wrote captions (new rows: {actual_new}, "
+ f"total rows written: {self.total_captions_written}, "
+ f"total captions written: {self.total_caption_entries_written}, "
+ f"duplicates skipped: {self.duplicates_skipped}, "
+ f"output fields: {sorted(list(self.known_output_fields))})"
  )

+ async def optimize_storage(self):
+ """Optimize storage by dropping empty columns. Run this periodically or on-demand."""
+ if not self.captions_path.exists():
+ logger.info("No captions file to optimize")
+ return
+
+ logger.info("Starting storage optimization...")
+
+ # Read the full table
+ backup_path = None
+ table = pq.read_table(self.captions_path)
+ df = table.to_pandas()
+ original_columns = len(df.columns)
+
+ # Find non-empty columns (don't preserve empty base fields)
+ non_empty_columns = self._get_non_empty_columns(df, preserve_base_fields=False)
+
+ # Always keep at least job_id
+ if "job_id" not in non_empty_columns:
+ non_empty_columns.append("job_id")
+
+ if len(non_empty_columns) < original_columns:
+ # We have columns to drop
+ df_optimized = df[non_empty_columns]
+
+ # Rebuild schema for non-empty columns only
+ base_field_names = {f[0] for f in self.base_caption_fields}
+ fields = []
+ output_fields = set()
+
+ # Process columns in a consistent order: base fields first, then output fields
+ for col in non_empty_columns:
+ if col in base_field_names:
+ # Find the base field definition
+ for fname, ftype in self.base_caption_fields:
+ if fname == col:
+ fields.append((fname, ftype))
+ break
+ else:
+ # Output field
+ output_fields.add(col)
+
+ # Add output fields in sorted order
+ for field_name in sorted(output_fields):
+ fields.append((field_name, pa.list_(pa.string())))
+
+ # Create optimized schema and table
+ optimized_schema = pa.schema(fields)
+ optimized_table = pa.Table.from_pandas(df_optimized, schema=optimized_schema)
+
+ # Backup the original file (optional)
+ backup_path = self.captions_path.with_suffix(".parquet.bak")
+ import shutil
+
+ shutil.copy2(self.captions_path, backup_path)
+
+ # Write optimized table
+ pq.write_table(optimized_table, self.captions_path, compression="snappy")
+
+ # Update known output fields
+ self.known_output_fields = output_fields
+
+ # Clean up backup (optional - keep it for safety)
+ # backup_path.unlink()
+
+ logger.info(
+ f"Storage optimization complete: {original_columns} -> {len(non_empty_columns)} columns. "
+ f"Removed columns: {sorted(set(df.columns) - set(non_empty_columns))}"
+ )
+ else:
+ logger.info(f"No optimization needed - all {original_columns} columns contain data")
+
+ # Report file size reduction
+ import os
+
+ if backup_path and backup_path.exists():
+ original_size = os.path.getsize(backup_path)
+ new_size = os.path.getsize(self.captions_path)
+ reduction_pct = (1 - new_size / original_size) * 100
+ logger.info(
+ f"File size: {original_size/1024/1024:.1f}MB -> {new_size/1024/1024:.1f}MB "
+ f"({reduction_pct:.1f}% reduction)"
+ )
+
  async def _evolve_schema_on_disk(self):
- """Evolve the schema of the existing parquet file to include new columns."""
+ """Evolve the schema of the existing parquet file to include new columns, removing empty ones."""
  logger.info("Evolving schema on disk to add new columns...")

  # Read existing data
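The switch to promote_options="default" in the concat call above is what lets the on-disk table and the freshly buffered rows differ in columns: the union of the two schemas is used and absent columns are null-filled instead of raising. A small illustrative sketch, assuming a pyarrow version that accepts the promote_options argument:

    import pyarrow as pa

    # Two tables whose schemas differ by one column, as happens when a new
    # output field appears after the parquet file was first written.
    existing = pa.table({"job_id": ["a", "b"], "captions": [["x"], ["y"]]})
    incoming = pa.table({"job_id": ["c"], "captions": [["z"]], "tags": [["t1"]]})

    # With promote_options="none" the schemas would have to match exactly;
    # with "default" the missing "tags" column in `existing` is filled with nulls.
    combined = pa.concat_tables([existing, incoming], promote_options="default")
    print(combined.schema.names)  # expect ['job_id', 'captions', 'tags']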
@@ -442,63 +664,24 @@ class StorageManager:
  df[field_name] = None
  logger.info(f"Added new column: {field_name}")

- # Recreate table with new schema
- evolved_table = pa.Table.from_pandas(df, schema=self.caption_schema)
- pq.write_table(evolved_table, self.captions_path, compression="snappy")
- logger.info("Schema evolution complete")
+ # Remove empty columns (but preserve base fields)
+ non_empty_columns = self._get_non_empty_columns(df, preserve_base_fields=True)
+ df = df[non_empty_columns]

- async def get_captions(self, job_id: str) -> Optional[Dict[str, List[str]]]:
- """Retrieve all output fields for a specific job_id."""
- # Check buffer first
- for buffered in self.caption_buffer:
- if buffered["job_id"] == job_id:
- outputs = {}
- for field_name in self.known_output_fields:
- if field_name in buffered and buffered[field_name]:
- outputs[field_name] = buffered[field_name]
- return outputs
-
- if not self.captions_path.exists():
- return None
-
- table = pq.read_table(self.captions_path)
- df = table.to_pandas()
+ # Update known output fields
+ base_field_names = {field[0] for field in self.base_caption_fields}
+ self.known_output_fields = set(non_empty_columns) - base_field_names

- row = df[df["job_id"] == job_id]
- if row.empty:
- return None
+ # Recreate schema with only non-empty fields
+ self.caption_schema = self._build_caption_schema(self.known_output_fields)

- # Collect all output fields
- outputs = {}
- for field_name in self.known_output_fields:
- if field_name in row.columns:
- value = row.iloc[0][field_name]
- if pd.notna(value) and value is not None:
- outputs[field_name] = value
-
- return outputs if outputs else None
-
- async def save_job(self, job: Job):
- """Save or update a job - buffers until batch size reached."""
- # For updates, we still add to buffer (will be handled in flush)
- self.job_buffer.append(
- {
- "job_id": job.job_id,
- "dataset": job.dataset,
- "shard": job.shard,
- "item_key": job.item_key,
- "status": job.status.value,
- "assigned_to": job.assigned_to,
- "created_at": job.created_at,
- "updated_at": datetime.utcnow(),
- }
+ # Recreate table with new schema
+ evolved_table = pa.Table.from_pandas(df, schema=self.caption_schema)
+ pq.write_table(evolved_table, self.captions_path, compression="snappy")
+ logger.info(
+ f"Schema evolution complete. Active output fields: {sorted(list(self.known_output_fields))}"
  )

- self.existing_job_ids.add(job.job_id)
-
- if len(self.job_buffer) >= self.job_buffer_size:
- await self._flush_jobs()
-
  async def save_contributor(self, contributor: Contributor):
  """Save or update contributor stats - buffers until batch size reached."""
  self.contributor_buffer.append(asdict(contributor))
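For context, the empty-column pruning that _evolve_schema_on_disk and optimize_storage now share reduces to a per-column emptiness test before the frame is rewritten. A rough standalone sketch of that test, with hypothetical data and simplified relative to _is_column_empty:

    import pandas as pd

    # Stand-in for the captions table: one column has real data, one is
    # all-null, one holds nothing but empty lists.
    df = pd.DataFrame(
        {
            "captions": [["a cat"], ["a dog"]],
            "quality_scores": [None, None],
            "tags": [[], []],
        }
    )

    def is_effectively_empty(col: pd.Series) -> bool:
        # All-null, all-zero numeric, or object columns of empty lists count as empty.
        non_null = col.dropna()
        if non_null.empty:
            return True
        if pd.api.types.is_numeric_dtype(col):
            return bool((non_null == 0).all())
        return all(isinstance(v, list) and len(v) == 0 for v in non_null)

    keep = [c for c in df.columns if not is_effectively_empty(df[c])]
    print(keep)  # ['captions']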
@@ -575,84 +758,134 @@ class StorageManager:
  await self._flush_jobs()
  await self._flush_contributors()

- logger.info(
- f"Checkpoint complete. Total rows: {self.total_captions_written}, "
- f"Total caption entries: {self.total_caption_entries_written}, "
- f"Duplicates skipped: {self.duplicates_skipped}"
- )
+ # Log final rate statistics
+ if self.total_captions_written > 0:
+ rates = self._calculate_rates()
+ logger.info(
+ f"Checkpoint complete. Total rows: {self.total_captions_written}, "
+ f"Total caption entries: {self.total_caption_entries_written}, "
+ f"Duplicates skipped: {self.duplicates_skipped} | "
+ f"Overall rate: {rates['overall']:.1f} rows/s"
+ )
+ else:
+ logger.info(
+ f"Checkpoint complete. Total rows: {self.total_captions_written}, "
+ f"Total caption entries: {self.total_caption_entries_written}, "
+ f"Duplicates skipped: {self.duplicates_skipped}"
+ )

- async def job_exists(self, job_id: str) -> bool:
- """Check if a job already exists in storage or buffer."""
- if job_id in self.existing_job_ids:
- return True
+ def get_all_processed_job_ids(self) -> Set[str]:
+ """Get all processed job_ids - useful for resumption."""
+ if not self.captions_path.exists():
+ logger.info("No captions file found, returning empty processed job_ids set")
+ return set()

- # Check buffer
- for buffered in self.job_buffer:
- if buffered["job_id"] == job_id:
- return True
+ # Read only the job_id column
+ table = pq.read_table(self.captions_path, columns=["job_id"])
+ job_ids = set(table["job_id"].to_pylist())

- return False
+ # Add buffered job_ids
+ for row in self.caption_buffer:
+ if "job_id" in row:
+ job_ids.add(row["job_id"])

- async def get_job(self, job_id: str) -> Optional[Job]:
- """Retrieve a job by ID."""
- # Check buffer first
- for buffered in self.job_buffer:
- if buffered["job_id"] == job_id:
- return Job(
- job_id=buffered["job_id"],
- dataset=buffered["dataset"],
- shard=buffered["shard"],
- item_key=buffered["item_key"],
- status=JobStatus(buffered["status"]),
- assigned_to=buffered["assigned_to"],
- created_at=buffered["created_at"],
- )
+ return job_ids

- if not self.jobs_path.exists():
- return None
+ async def get_storage_contents(
+ self,
+ limit: Optional[int] = None,
+ columns: Optional[List[str]] = None,
+ include_metadata: bool = True,
+ ) -> StorageContents:
+ """Retrieve storage contents for export.
+
+ Args:
+ limit: Maximum number of rows to retrieve
+ columns: Specific columns to include (None for all)
+ include_metadata: Whether to include metadata in the result
+
+ Returns:
+ StorageContents instance with the requested data
+ """
+ if not self.captions_path.exists():
+ return StorageContents(
+ rows=[],
+ columns=[],
+ output_fields=list(self.known_output_fields),
+ total_rows=0,
+ metadata={"message": "No captions file found"},
+ )
+
+ # Flush buffers first to ensure all data is on disk
+ await self.checkpoint()
+
+ # Determine columns to read
+ if columns:
+ # Validate requested columns exist
+ table_metadata = pq.read_metadata(self.captions_path)
+ available_columns = set(table_metadata.schema.names)
+ invalid_columns = set(columns) - available_columns
+ if invalid_columns:
+ raise ValueError(f"Columns not found: {invalid_columns}")
+ columns_to_read = columns
+ else:
+ # Read all columns
+ columns_to_read = None

- table = pq.read_table(self.jobs_path)
+ # Read the table
+ table = pq.read_table(self.captions_path, columns=columns_to_read)
  df = table.to_pandas()

- row = df[df["job_id"] == job_id]
- if row.empty:
- return None
+ # Apply limit if specified
+ if limit:
+ df = df.head(limit)
+
+ # Convert to list of dicts
+ rows = df.to_dict("records")
+
+ # Parse metadata JSON strings back to dicts if present
+ if "metadata" in df.columns:
+ for row in rows:
+ if row.get("metadata"):
+ try:
+ row["metadata"] = json.loads(row["metadata"])
+ except:
+ pass # Keep as string if parsing fails
+
+ # Prepare metadata
+ metadata = {}
+ if include_metadata:
+ stats = await self.get_caption_stats()
+ metadata.update(
+ {
+ "export_timestamp": pd.Timestamp.now().isoformat(),
+ "total_available_rows": stats.get("total_rows", 0),
+ "rows_exported": len(rows),
+ "storage_path": str(self.captions_path),
+ "field_stats": stats.get("field_stats", {}),
+ }
+ )

- return Job(
- job_id=row.iloc[0]["job_id"],
- dataset=row.iloc[0]["dataset"],
- shard=row.iloc[0]["shard"],
- item_key=row.iloc[0]["item_key"],
- status=JobStatus(row.iloc[0]["status"]),
- assigned_to=row.iloc[0]["assigned_to"],
- created_at=row.iloc[0]["created_at"],
+ return StorageContents(
+ rows=rows,
+ columns=list(df.columns),
+ output_fields=list(self.known_output_fields),
+ total_rows=len(df),
+ metadata=metadata,
  )

- async def get_jobs_by_worker(self, worker_id: str) -> List[Job]:
- """Get all jobs assigned to a worker."""
- if not self.jobs_path.exists():
- return []
+ async def get_processed_jobs_for_chunk(self, chunk_id: str) -> Set[str]:
+ """Get all processed job_ids for a given chunk."""
+ if not self.captions_path.exists():
+ return set()

- table = pq.read_table(self.jobs_path)
+ # Read only job_id and chunk_id columns
+ table = pq.read_table(self.captions_path, columns=["job_id", "chunk_id"])
  df = table.to_pandas()

- rows = df[df["assigned_to"] == worker_id]
-
- jobs = []
- for _, row in rows.iterrows():
- jobs.append(
- Job(
- job_id=row["job_id"],
- dataset=row["dataset"],
- shard=row["shard"],
- item_key=row["item_key"],
- status=JobStatus(row["status"]),
- assigned_to=row["assigned_to"],
- created_at=row["created_at"],
- )
- )
-
- return jobs
+ # Filter by chunk_id and return job_ids
+ chunk_jobs = df[df["chunk_id"] == chunk_id]["job_id"].tolist()
+ return set(chunk_jobs)

  async def get_caption_stats(self) -> Dict[str, Any]:
  """Get statistics about stored captions including field-specific stats."""
@@ -683,11 +916,12 @@ class StorageManager:
  field_lengths = []

  for value in df.loc[non_null_mask, field_name]:
+ # list or array-like
  if isinstance(value, list):
  length = len(value)
  field_total += length
  field_lengths.append(length)
- elif pd.notna(value):
+ elif value.any():
  length = 1
  field_total += length
  field_lengths.append(length)
@@ -719,46 +953,6 @@ class StorageManager:
  },
  }

- async def get_sample_captions(self, n: int = 5) -> List[Dict[str, Any]]:
- """Get a sample of caption entries showing all output fields."""
- if not self.captions_path.exists():
- return []
-
- table = pq.read_table(self.captions_path)
- df = table.to_pandas()
-
- if len(df) == 0:
- return []
-
- sample_df = df.sample(min(n, len(df)))
- samples = []
-
- for _, row in sample_df.iterrows():
- # Collect outputs from dynamic columns
- outputs = {}
- total_outputs = 0
-
- for field_name in self.known_output_fields:
- if field_name in row and pd.notna(row[field_name]):
- value = row[field_name]
- outputs[field_name] = value
- if isinstance(value, list):
- total_outputs += len(value)
-
- samples.append(
- {
- "job_id": row["job_id"],
- "item_key": row["item_key"],
- "outputs": outputs,
- "field_count": len(outputs),
- "total_outputs": total_outputs,
- "image_dims": f"{row.get('image_width', 'N/A')}x{row.get('image_height', 'N/A')}",
- "has_metadata": bool(row.get("metadata") and row["metadata"] != "{}"),
- }
- )
-
- return samples
-
  async def count_captions(self) -> int:
  """Count total outputs across all dynamic fields."""
@@ -888,142 +1082,26 @@ class StorageManager:
  "fields": sorted(list(field_counts.keys())),
  }

- async def get_captions_with_field(
- self, field_name: str, limit: int = 100
- ) -> List[Dict[str, Any]]:
- """Get captions that have a specific output field."""
- if not self.captions_path.exists():
- return []
-
- if field_name not in self.known_output_fields:
- logger.warning(f"Field '{field_name}' not found in known output fields")
- return []
-
- # Check if the field actually exists in the file
- existing_output_columns = self._get_existing_output_columns()
- if field_name not in existing_output_columns:
- logger.warning(
- f"Field '{field_name}' exists in known fields but not in parquet file yet"
- )
- return []
-
- # Only read necessary columns
- columns_to_read = ["job_id", "item_key", field_name]
-
- try:
- table = pq.read_table(self.captions_path, columns=columns_to_read)
- except Exception as e:
- logger.error(f"Error reading field '{field_name}': {e}")
- return []
-
- df = table.to_pandas()
-
- # Filter rows where field has data
- mask = df[field_name].notna()
- filtered_df = df[mask].head(limit)
-
- results = []
- for _, row in filtered_df.iterrows():
- results.append(
- {
- "job_id": row["job_id"],
- "item_key": row["item_key"],
- field_name: row[field_name],
- "value_count": len(row[field_name]) if isinstance(row[field_name], list) else 1,
- }
- )
-
- return results
-
- async def export_by_field(self, field_name: str, output_path: Path, format: str = "jsonl"):
- """Export all captions for a specific field."""
- if not self.captions_path.exists():
- logger.warning("No captions to export")
- return 0
-
- if field_name not in self.known_output_fields:
- logger.warning(f"Field '{field_name}' not found in known output fields")
- return 0
-
- # Check if the field actually exists in the file
- existing_output_columns = self._get_existing_output_columns()
- if field_name not in existing_output_columns:
- logger.warning(f"Field '{field_name}' not found in parquet file")
- return 0
-
- # Read only necessary columns
- columns_to_read = ["item_key", "dataset", field_name]
- table = pq.read_table(self.captions_path, columns=columns_to_read)
- df = table.to_pandas()
-
- exported = 0
- with open(output_path, "w") as f:
- for _, row in df.iterrows():
- if pd.notna(row[field_name]) and row[field_name]:
- if format == "jsonl":
- record = {
- "item_key": row["item_key"],
- "dataset": row["dataset"],
- field_name: row[field_name],
- }
- f.write(json.dumps(record) + "\n")
- exported += 1
-
- logger.info(f"Exported {exported} items with field '{field_name}' to {output_path}")
- return exported
-
- async def get_pending_jobs(self) -> List[Job]:
- """Get all pending jobs for restoration on startup."""
- if not self.jobs_path.exists():
- return []
-
- table = pq.read_table(self.jobs_path)
- df = table.to_pandas()
-
- # Get jobs with PENDING or PROCESSING status
- pending_df = df[df["status"].isin([JobStatus.PENDING.value, JobStatus.PROCESSING.value])]
-
- jobs = []
- for _, row in pending_df.iterrows():
- jobs.append(
- Job(
- job_id=row["job_id"],
- dataset=row["dataset"],
- shard=row["shard"],
- item_key=row["item_key"],
- status=JobStatus(row["status"]),
- assigned_to=row.get("assigned_to"),
- created_at=row["created_at"],
- )
- )
-
- return jobs
-
- async def count_jobs(self) -> int:
- """Count total jobs."""
- if not self.jobs_path.exists():
- return 0
-
- table = pq.read_table(self.jobs_path)
- return len(table)
-
- async def count_completed_jobs(self) -> int:
- """Count completed jobs."""
- if not self.jobs_path.exists():
- return 0
-
- table = pq.read_table(self.jobs_path)
- df = table.to_pandas()
- return len(df[df["status"] == JobStatus.COMPLETED.value])
-
  async def close(self):
  """Close storage and flush buffers."""
  await self.checkpoint()
- logger.info(
- f"Storage closed. Total rows: {self.total_captions_written}, "
- f"Total caption entries: {self.total_caption_entries_written}, "
- f"Duplicates skipped: {self.duplicates_skipped}"
- )
+
+ # Log final rate statistics
+ if self.total_captions_written > 0:
+ rates = self._calculate_rates()
+ logger.info(
+ f"Storage closed. Total rows: {self.total_captions_written}, "
+ f"Total caption entries: {self.total_caption_entries_written}, "
+ f"Duplicates skipped: {self.duplicates_skipped} | "
+ f"Final rates - Overall: {rates['overall']:.1f} rows/s, "
+ f"Last hour: {rates['60min']:.1f} rows/s"
+ )
+ else:
+ logger.info(
+ f"Storage closed. Total rows: {self.total_captions_written}, "
+ f"Total caption entries: {self.total_caption_entries_written}, "
+ f"Duplicates skipped: {self.duplicates_skipped}"
+ )

  async def get_storage_stats(self) -> Dict[str, Any]:
  """Get all storage-related statistics."""
@@ -1041,6 +1119,9 @@ class StorageManager:
  field_stats = await self.get_caption_stats()
  total_rows_including_buffer = await self.count_caption_rows() + len(self.caption_buffer)

+ # Calculate rates
+ rates = self._calculate_rates()
+
  return {
  "total_captions": disk_outputs + buffer_outputs,
  "total_rows": total_rows_including_buffer,
@@ -1053,4 +1134,11 @@ class StorageManager:
  "field_breakdown": field_stats.get("field_stats", None),
  "job_buffer_size": len(self.job_buffer),
  "contributor_buffer_size": len(self.contributor_buffer),
+ "rates": {
+ "instant": f"{rates['instant']:.1f} rows/s",
+ "5min": f"{rates['5min']:.1f} rows/s",
+ "15min": f"{rates['15min']:.1f} rows/s",
+ "60min": f"{rates['60min']:.1f} rows/s",
+ "overall": f"{rates['overall']:.1f} rows/s",
+ },
  }