caption-flow 0.2.2-py3-none-any.whl → 0.2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
caption_flow/cli.py CHANGED
@@ -161,6 +161,7 @@ def main(ctx, verbose: bool):
  @click.option("--key", help="SSL key path")
  @click.option("--no-ssl", is_flag=True, help="Disable SSL (development only)")
  @click.option("--vllm", is_flag=True, help="Use vLLM orchestrator for WebDataset/HF datasets")
+ @click.option("--verbose", is_flag=True, help="Enable verbose logging")
  @click.pass_context
  def orchestrator(ctx, config: Optional[str], **kwargs):
      """Start the orchestrator server."""
@@ -366,6 +367,63 @@ def monitor(
          sys.exit(1)


+ # Add this command after the export command in cli.py
+
+
+ @main.command()
+ @click.option("--data-dir", default="./caption_data", help="Storage directory")
+ @click.option("--refresh-rate", default=10, type=int, help="Display refresh rate (Hz)")
+ @click.option("--no-images", is_flag=True, help="Disable image preview")
+ @click.pass_context
+ def view(ctx, data_dir: str, refresh_rate: int, no_images: bool):
+     """Browse captioned dataset with interactive TUI viewer."""
+     from .viewer import DatasetViewer
+
+     data_path = Path(data_dir)
+
+     if not data_path.exists():
+         console.print(f"[red]Storage directory not found: {data_dir}[/red]")
+         sys.exit(1)
+
+     if not (data_path / "captions.parquet").exists():
+         console.print(f"[red]No captions file found in {data_dir}[/red]")
+         console.print("[yellow]Have you exported any captions yet?[/yellow]")
+         sys.exit(1)
+
+     # Check for term-image if images are enabled
+     if not no_images:
+         try:
+             import term_image
+         except ImportError:
+             console.print("[yellow]Warning: term-image not installed[/yellow]")
+             console.print("Install with: pip install term-image")
+             console.print("Running without image preview...")
+             no_images = True
+
+     try:
+         viewer = DatasetViewer(data_path)
+         if no_images:
+             viewer.disable_images = True
+         viewer.refresh_rate = refresh_rate
+
+         console.print(f"[cyan]Starting dataset viewer...[/cyan]")
+         console.print(f"[dim]Data directory: {data_path}[/dim]")
+
+         asyncio.run(viewer.run())
+
+     except FileNotFoundError as e:
+         console.print(f"[red]Error: {e}[/red]")
+         sys.exit(1)
+     except KeyboardInterrupt:
+         console.print("\n[yellow]Viewer closed[/yellow]")
+     except Exception as e:
+         console.print(f"[red]Error: {e}[/red]")
+         import traceback
+
+         traceback.print_exc()
+         sys.exit(1)
+
+
  @main.command()
  @click.option("--config", type=click.Path(exists=True), help="Configuration file")
  @click.option("--server", help="Orchestrator WebSocket URL")
@@ -635,6 +693,256 @@ def scan_chunks(data_dir: str, checkpoint_dir: str, fix: bool, verbose: bool):
      tracker.save_checkpoint()


+ @main.command()
+ @click.option("--data-dir", default="./caption_data", help="Storage directory")
+ @click.option(
+     "--format",
+     type=click.Choice(
+         ["jsonl", "json", "csv", "txt", "huggingface_hub", "all"], case_sensitive=False
+     ),
+     default="jsonl",
+     help="Export format (default: jsonl)",
+ )
+ @click.option("--output", "-o", help="Output path (file for jsonl/csv, directory for json/txt)")
+ @click.option("--limit", type=int, help="Limit number of rows to export")
+ @click.option("--columns", help="Comma-separated list of columns to export (default: all)")
+ @click.option("--export-column", default="captions", help="Column to export for txt format")
+ @click.option("--filename-column", default="filename", help="Column containing filenames")
+ @click.option("--include-empty", is_flag=True, help="Include rows with empty export column")
+ @click.option("--stats-only", is_flag=True, help="Show statistics without exporting")
+ @click.option(
+     "--optimize", is_flag=True, help="Optimize storage before export (remove empty columns)"
+ )
+ @click.option("--verbose", is_flag=True, help="Show detailed export progress")
+ @click.option("--hf-dataset", help="Dataset name on HF Hub (e.g., username/dataset-name)")
+ @click.option("--license", help="License for the dataset (required for new HF datasets)")
+ @click.option("--private", is_flag=True, help="Make HF dataset private")
+ @click.option("--nsfw", is_flag=True, help="Add not-for-all-audiences tag")
+ @click.option("--tags", help="Comma-separated tags for HF dataset")
+ def export(
+     data_dir: str,
+     format: str,
+     output: Optional[str],
+     limit: Optional[int],
+     columns: Optional[str],
+     export_column: str,
+     filename_column: str,
+     include_empty: bool,
+     stats_only: bool,
+     optimize: bool,
+     verbose: bool,
+     hf_dataset: Optional[str],
+     license: Optional[str],
+     private: bool,
+     nsfw: bool,
+     tags: Optional[str],
+ ):
+     """Export caption data to various formats."""
+     from .storage import StorageManager
+     from .storage.exporter import StorageExporter, ExportError
+
+     # Initialize storage manager
+     storage_path = Path(data_dir)
+     if not storage_path.exists():
+         console.print(f"[red]Storage directory not found: {data_dir}[/red]")
+         sys.exit(1)
+
+     storage = StorageManager(storage_path)
+
+     async def run_export():
+         await storage.initialize()
+
+         # Show statistics first
+         stats = await storage.get_caption_stats()
+         console.print("\n[bold cyan]Storage Statistics:[/bold cyan]")
+         console.print(f"[green]Total rows:[/green] {stats['total_rows']:,}")
+         console.print(f"[green]Total outputs:[/green] {stats['total_outputs']:,}")
+         console.print(f"[green]Output fields:[/green] {', '.join(stats['output_fields'])}")
+
+         if stats.get("field_stats"):
+             console.print("\n[cyan]Field breakdown:[/cyan]")
+             for field, field_stat in stats["field_stats"].items():
+                 console.print(
+                     f" • {field}: {field_stat['total_items']:,} items "
+                     f"in {field_stat['rows_with_data']:,} rows"
+                 )
+
+         if stats_only:
+             return
+
+         # Optimize storage if requested
+         if optimize:
+             console.print("\n[yellow]Optimizing storage (removing empty columns)...[/yellow]")
+             await storage.optimize_storage()
+
+         # Prepare columns list
+         column_list = None
+         if columns:
+             column_list = [col.strip() for col in columns.split(",")]
+             console.print(f"\n[cyan]Exporting columns:[/cyan] {', '.join(column_list)}")
+
+         # Get storage contents
+         console.print("\n[yellow]Loading data...[/yellow]")
+         try:
+             contents = await storage.get_storage_contents(
+                 limit=limit, columns=column_list, include_metadata=True
+             )
+         except ValueError as e:
+             console.print(f"[red]Error: {e}[/red]")
+             sys.exit(1)
+
+         if not contents.rows:
+             console.print("[yellow]No data to export![/yellow]")
+             return
+
+         # Filter out empty rows if not including empty
+         if not include_empty and format in ["txt", "json"]:
+             original_count = len(contents.rows)
+             contents.rows = [
+                 row
+                 for row in contents.rows
+                 if row.get(export_column)
+                 and (not isinstance(row[export_column], list) or len(row[export_column]) > 0)
+             ]
+             filtered_count = original_count - len(contents.rows)
+             if filtered_count > 0:
+                 console.print(f"[dim]Filtered {filtered_count} empty rows[/dim]")
+
+         # Create exporter
+         exporter = StorageExporter(contents)
+
+         # Determine output paths
+         if format == "all":
+             # Export to all formats
+             base_name = output or "caption_export"
+             base_path = Path(base_name)
+
+             formats_exported = []
+
+             # JSONL
+             jsonl_path = base_path.with_suffix(".jsonl")
+             console.print(f"\n[cyan]Exporting to JSONL:[/cyan] {jsonl_path}")
+             rows = exporter.to_jsonl(jsonl_path)
+             formats_exported.append(f"JSONL: {rows:,} rows")
+
+             # CSV
+             csv_path = base_path.with_suffix(".csv")
+             console.print(f"[cyan]Exporting to CSV:[/cyan] {csv_path}")
+             try:
+                 rows = exporter.to_csv(csv_path)
+                 formats_exported.append(f"CSV: {rows:,} rows")
+             except ExportError as e:
+                 console.print(f"[yellow]Skipping CSV: {e}[/yellow]")
+
+             # JSON files
+             json_dir = base_path.parent / f"{base_path.stem}_json"
+             console.print(f"[cyan]Exporting to JSON files:[/cyan] {json_dir}/")
+             try:
+                 files = exporter.to_json(json_dir, filename_column)
+                 formats_exported.append(f"JSON: {files:,} files")
+             except ExportError as e:
+                 console.print(f"[yellow]Skipping JSON files: {e}[/yellow]")
+
+             # Text files
+             txt_dir = base_path.parent / f"{base_path.stem}_txt"
+             console.print(f"[cyan]Exporting to text files:[/cyan] {txt_dir}/")
+             try:
+                 files = exporter.to_txt(txt_dir, filename_column, export_column)
+                 formats_exported.append(f"Text: {files:,} files")
+             except ExportError as e:
+                 console.print(f"[yellow]Skipping text files: {e}[/yellow]")
+
+             console.print(f"\n[green]✓ Export complete![/green]")
+             for fmt in formats_exported:
+                 console.print(f" • {fmt}")
+
+         else:
+             # Single format export
+             try:
+                 if format == "jsonl":
+                     output_path = output or "captions.jsonl"
+                     console.print(f"\n[cyan]Exporting to JSONL:[/cyan] {output_path}")
+                     rows = exporter.to_jsonl(output_path)
+                     console.print(f"[green]✓ Exported {rows:,} rows[/green]")
+
+                 elif format == "csv":
+                     output_path = output or "captions.csv"
+                     console.print(f"\n[cyan]Exporting to CSV:[/cyan] {output_path}")
+                     rows = exporter.to_csv(output_path)
+                     console.print(f"[green]✓ Exported {rows:,} rows[/green]")
+
+                 elif format == "json":
+                     output_dir = output or "./json_output"
+                     console.print(f"\n[cyan]Exporting to JSON files:[/cyan] {output_dir}/")
+                     files = exporter.to_json(output_dir, filename_column)
+                     console.print(f"[green]✓ Created {files:,} JSON files[/green]")
+
+                 elif format == "txt":
+                     output_dir = output or "./txt_output"
+                     console.print(f"\n[cyan]Exporting to text files:[/cyan] {output_dir}/")
+                     console.print(f"[dim]Export column: {export_column}[/dim]")
+                     files = exporter.to_txt(output_dir, filename_column, export_column)
+                     console.print(f"[green]✓ Created {files:,} text files[/green]")
+
+                 elif format == "huggingface_hub":
+                     # Validate required parameters
+                     if not hf_dataset:
+                         console.print(
+                             "[red]Error: --hf-dataset required for huggingface_hub format[/red]"
+                         )
+                         console.print(
+                             "[dim]Example: --hf-dataset username/my-caption-dataset[/dim]"
+                         )
+                         sys.exit(1)
+
+                     # Parse tags
+                     tag_list = None
+                     if tags:
+                         tag_list = [tag.strip() for tag in tags.split(",")]
+
+                     console.print(f"\n[cyan]Uploading to Hugging Face Hub:[/cyan] {hf_dataset}")
+                     if private:
+                         console.print("[dim]Privacy: Private dataset[/dim]")
+                     if nsfw:
+                         console.print("[dim]Content: Not for all audiences[/dim]")
+                     if tag_list:
+                         console.print(f"[dim]Tags: {', '.join(tag_list)}[/dim]")
+
+                     url = exporter.to_huggingface_hub(
+                         dataset_name=hf_dataset,
+                         license=license,
+                         private=private,
+                         nsfw=nsfw,
+                         tags=tag_list,
+                     )
+                     console.print(f"[green]✓ Dataset uploaded to: {url}[/green]")
+
+             except ExportError as e:
+                 console.print(f"[red]Export error: {e}[/red]")
+                 sys.exit(1)
+
+         # Show export metadata
+         if verbose and contents.metadata:
+             console.print("\n[dim]Export metadata:[/dim]")
+             console.print(f" Timestamp: {contents.metadata.get('export_timestamp')}")
+             console.print(f" Total available: {contents.metadata.get('total_available_rows'):,}")
+             console.print(f" Rows exported: {contents.metadata.get('rows_exported'):,}")
+
+     # Run the async export
+     try:
+         asyncio.run(run_export())
+     except KeyboardInterrupt:
+         console.print("\n[yellow]Export cancelled[/yellow]")
+         sys.exit(1)
+     except Exception as e:
+         console.print(f"[red]Unexpected error: {e}[/red]")
+         if verbose:
+             import traceback
+
+             traceback.print_exc()
+         sys.exit(1)
+
+
  @main.command()
  @click.option("--domain", help="Domain for Let's Encrypt certificate")
  @click.option("--email", help="Email for Let's Encrypt registration")
caption_flow/models.py CHANGED
@@ -1,9 +1,11 @@
  """Data models for CaptionFlow."""

+ import PIL
  from dataclasses import dataclass, field
  from datetime import datetime
  from enum import Enum
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple
+ from PIL import Image


  class JobStatus(Enum):
@@ -38,6 +40,38 @@ class Job:
          self.created_at = datetime.utcnow()


+ @dataclass
+ class JobId:
+     shard_id: str
+     chunk_id: str
+     sample_id: str
+
+     def get_shard_str(self):
+         return f"{self.shard_id}"
+
+     def get_chunk_str(self):
+         return f"{self.shard_id}:chunk:{self.chunk_id}"
+
+     def get_sample_str(self):
+         return f"{self.shard_id}:chunk:{self.chunk_id}:idx:{self.sample_id}"
+
+     @staticmethod
+     def from_dict(job: dict) -> "JobId":
+         return JobId(shard_id=job["shard_id"], chunk_id=job["chunk_id"], sample_id=job["sample_id"])
+
+     @staticmethod
+     def from_values(shard_id: str, chunk_id: str, sample_id: str) -> "JobId":
+         return JobId(shard_id=shard_id, chunk_id=chunk_id, sample_id=sample_id)
+
+     @staticmethod
+     def from_str(job_id: str):
+         # from data-0000:chunk:0:idx:0
+         parts = job_id.split(":")
+         if len(parts) != 5:
+             raise ValueError(f"Invalid job_id format: {job_id}")
+         return JobId(shard_id=parts[0], chunk_id=parts[2], sample_id=parts[4])
+
+
  @dataclass
  class Caption:
      """Generated caption with attribution and image metadata."""
@@ -61,6 +95,8 @@ class Caption:
      image_height: Optional[int] = None
      image_format: Optional[str] = None
      file_size: Optional[int] = None
+     filename: Optional[str] = None
+     url: Optional[str] = None

      # Processing metadata
      caption_index: Optional[int] = None  # Which caption this is (0, 1, 2...)
@@ -82,3 +118,100 @@ class Contributor:
      name: str
      total_captions: int = 0
      trust_level: int = 1
+
+
+ @dataclass
+ class ProcessingStage:
+     """Configuration for a single processing stage."""
+
+     name: str
+     model: str
+     prompts: List[str]
+     output_field: str
+     requires: List[str] = field(default_factory=list)
+     sampling: Optional[Dict[str, Any]] = None
+
+     # Model-specific overrides
+     tensor_parallel_size: Optional[int] = None
+     max_model_len: Optional[int] = None
+     dtype: Optional[str] = None
+     gpu_memory_utilization: Optional[float] = None
+
+
+ @dataclass
+ class StageResult:
+     """Results from a single stage."""
+
+     stage_name: str
+     output_field: str
+     outputs: List[str]  # Multiple outputs from multiple prompts
+     error: Optional[str] = None
+
+     def is_success(self) -> bool:
+         return self.error is None and bool(self.outputs)
+
+
+ @dataclass
+ class ShardChunk:
+     """Shard chunk assignment with unprocessed ranges."""
+
+     chunk_id: str
+     shard_url: str
+     shard_name: str
+     start_index: int
+     chunk_size: int
+     unprocessed_ranges: List[Tuple[int, int]] = field(default_factory=list)
+
+
+ @dataclass
+ class ProcessingItem:
+     """Item being processed."""
+
+     chunk_id: str
+     item_key: str
+     image: Image.Image
+     image_data: bytes
+     metadata: Dict[str, Any] = field(default_factory=dict)
+     stage_results: Dict[str, StageResult] = field(default_factory=dict)  # Accumulated results
+
+
+ @dataclass
+ class ProcessedResult:
+     """Result with multi-stage outputs."""
+
+     chunk_id: str
+     shard_name: str
+     item_key: str
+     outputs: Dict[str, List[str]]  # field_name -> list of outputs
+     image_width: int
+     image_height: int
+     image_format: str
+     file_size: int
+     processing_time_ms: float
+     metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class StorageContents:
+     """Container for storage data to be exported."""
+
+     rows: List[Dict[str, Any]]
+     columns: List[str]
+     output_fields: List[str]
+     total_rows: int
+     metadata: Dict[str, Any] = field(default_factory=dict)
+
+     def __post_init__(self):
+         """Validate data consistency."""
+         if self.rows and self.columns:
+             # Ensure all rows have the expected columns
+             for row in self.rows:
+                 missing_cols = set(self.columns) - set(row.keys())
+                 if missing_cols:
+                     logger.warning(f"Row missing columns: {missing_cols}")
+
+
+ class ExportError(Exception):
+     """Base exception for export-related errors."""
+
+     pass
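The new dataclasses describe a multi-stage captioning pipeline: each `ProcessingStage` names a model and its prompts, and a `StageResult` carries that stage's outputs or an error. An illustrative construction with made-up values (the model id and strings are placeholders):

```python
from caption_flow.models import ProcessingStage, StageResult

stage = ProcessingStage(
    name="caption",
    model="Qwen/Qwen2-VL-7B-Instruct",  # illustrative model id
    prompts=["Describe this image."],
    output_field="captions",
)

ok = StageResult(stage_name="caption", output_field="captions",
                 outputs=["a cat sleeping on a windowsill"])
failed = StageResult(stage_name="caption", output_field="captions",
                     outputs=[], error="CUDA out of memory")

# is_success requires no error and at least one output
assert ok.is_success() and not failed.is_success()
```

Note that `StorageContents.__post_init__` calls `logger.warning`, so the module presumably defines a `logger` outside the hunks shown here.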
caption_flow/monitor.py CHANGED
@@ -83,7 +83,7 @@ class Monitor:
                  await self._handle_update(data)

          except Exception as e:
-             logger.error(f"Connection error: {e}")
+             logger.error(f"Connection error: {e}", exc_info=True)
              await asyncio.sleep(5)

      async def _handle_update(self, data: Dict):
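Passing `exc_info=True` makes the standard-library logger append the active exception's traceback to the log record, which is what this change buys over the bare message. A self-contained sketch of the behavior:

```python
import logging

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)

try:
    raise ConnectionError("socket closed")
except Exception as e:
    # Logs the message plus the full traceback of the current exception
    logger.error(f"Connection error: {e}", exc_info=True)
```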