caption-flow 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. caption_flow/__init__.py +3 -3
  2. caption_flow/cli.py +937 -416
  3. caption_flow/models.py +45 -3
  4. caption_flow/monitor.py +5 -3
  5. caption_flow/orchestrator.py +186 -116
  6. caption_flow/processors/__init__.py +3 -3
  7. caption_flow/processors/base.py +8 -7
  8. caption_flow/processors/huggingface.py +440 -68
  9. caption_flow/processors/local_filesystem.py +24 -28
  10. caption_flow/processors/webdataset.py +66 -25
  11. caption_flow/storage/exporter.py +420 -339
  12. caption_flow/storage/manager.py +636 -756
  13. caption_flow/utils/__init__.py +1 -1
  14. caption_flow/utils/auth.py +1 -1
  15. caption_flow/utils/caption_utils.py +1 -1
  16. caption_flow/utils/certificates.py +15 -8
  17. caption_flow/utils/checkpoint_tracker.py +41 -19
  18. caption_flow/utils/chunk_tracker.py +200 -65
  19. caption_flow/utils/image_processor.py +9 -9
  20. caption_flow/utils/json_utils.py +37 -20
  21. caption_flow/utils/prompt_template.py +24 -16
  22. caption_flow/utils/vllm_config.py +5 -4
  23. caption_flow/viewer.py +4 -12
  24. caption_flow/workers/base.py +12 -6
  25. caption_flow/workers/caption.py +272 -91
  26. caption_flow/workers/data.py +6 -8
  27. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/METADATA +5 -4
  28. caption_flow-0.4.0.dist-info/RECORD +33 -0
  29. caption_flow-0.3.3.dist-info/RECORD +0 -33
  30. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/WHEEL +0 -0
  31. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/entry_points.txt +0 -0
  32. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
  33. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/top_level.txt +0 -0
caption_flow/cli.py CHANGED
@@ -1,21 +1,22 @@
  """Command-line interface for CaptionFlow with smart configuration handling."""

  import asyncio
+ import datetime as _datetime
  import json
  import logging
  import os
  import sys
+ from datetime import datetime
  from pathlib import Path
- from typing import Optional, Dict, Any, List
+ from typing import Any, Dict, List, Optional

  import click
  import yaml
  from rich.console import Console
  from rich.logging import RichHandler
- from datetime import datetime

- from .orchestrator import Orchestrator
  from .monitor import Monitor
+ from .orchestrator import Orchestrator
  from .utils.certificates import CertificateManager

  console = Console()
@@ -48,8 +49,7 @@ class ConfigManager:
      def find_config(
          cls, component: str, explicit_path: Optional[str] = None
      ) -> Optional[Dict[str, Any]]:
-         """
-         Find and load configuration for a component.
+         """Find and load configuration for a component.

          Search order:
          1. Explicit path if provided
@@ -120,22 +120,76 @@ class ConfigManager:


  def setup_logging(verbose: bool = False):
-     """Configure logging with rich handler, including timestamp."""
+     """Configure logging with rich handler and file output to XDG state directory."""
      level = logging.DEBUG if verbose else logging.INFO
-     logging.basicConfig(
-         level=level,
-         format="%(message)s",
-         datefmt="[%Y-%m-%d %H:%M:%S]",
-         handlers=[
+
+     # Determine log directory based on environment or XDG spec
+     log_dir_env = os.environ.get("CAPTIONFLOW_LOG_DIR")
+     if log_dir_env:
+         log_dir = Path(log_dir_env)
+     else:
+         # Use XDG_STATE_HOME for logs, with platform-specific fallbacks
+         xdg_state_home = os.environ.get("XDG_STATE_HOME")
+         if xdg_state_home:
+             base_dir = Path(xdg_state_home)
+         elif sys.platform == "darwin":
+             base_dir = Path.home() / "Library" / "Logs"
+         else:
+             # Default to ~/.local/state on Linux and other systems
+             base_dir = Path.home() / ".local" / "state"
+         log_dir = base_dir / "caption-flow"
+
+     try:
+         # Ensure log directory exists
+         log_dir.mkdir(parents=True, exist_ok=True)
+         log_file_path = log_dir / "caption_flow.log"
+
+         # Set up handlers
+         handlers: List[logging.Handler] = [
+             RichHandler(
+                 console=console,
+                 rich_tracebacks=True,
+                 show_path=False,
+                 show_time=True,
+             )
+         ]
+
+         # Add file handler
+         file_handler = logging.FileHandler(log_file_path, mode="a")
+         file_handler.setFormatter(
+             logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+         )
+         handlers.append(file_handler)
+         log_msg = f"Logging to {log_file_path}"
+
+     except (OSError, PermissionError) as e:
+         # Fallback to only console logging if file logging fails
+         handlers = [
              RichHandler(
                  console=console,
                  rich_tracebacks=True,
                  show_path=False,
-                 show_time=True,  # Enables timestamp in RichHandler output
+                 show_time=True,
              )
-         ],
+         ]
+         log_file = log_dir / "caption_flow.log"
+         log_msg = f"[yellow]Warning: Could not write to log file {log_file}: {e}[/yellow]"
+
+     logging.basicConfig(
+         level=level,
+         format="%(message)s",  # RichHandler overrides this format for console
+         datefmt="[%Y-%m-%d %H:%M:%S]",
+         handlers=handlers,
      )

+     # Suppress noisy libraries
+     logging.getLogger("websockets").setLevel(logging.WARNING)
+     logging.getLogger("pyarrow").setLevel(logging.WARNING)
+
+     # Use a dedicated logger to print the log file path to avoid format issues
+     if "log_msg" in locals():
+         logging.getLogger("setup").info(log_msg)
+


  def apply_cli_overrides(config: Dict[str, Any], **kwargs) -> Dict[str, Any]:
      """Apply CLI arguments as overrides to config, filtering out None values."""
@@ -189,9 +243,11 @@ def orchestrator(ctx, config: Optional[str], **kwargs):
          config_data["ssl"]["cert"] = kwargs["cert"]
          config_data["ssl"]["key"] = kwargs["key"]
      elif not config_data.get("ssl"):
-         console.print(
-             "[yellow]Warning: Running without SSL. Use --cert and --key for production.[/yellow]"
+         warning_msg = (
+             "[yellow]Warning: Running without SSL. "
+             "Use --cert and --key for production.[/yellow]"
          )
+         console.print(warning_msg)

      if kwargs.get("vllm") and "vllm" not in config_data:
          raise ValueError("Must provide vLLM config.")
@@ -259,33 +315,11 @@ def worker(ctx, config: Optional[str], **kwargs):
          asyncio.run(worker_instance.shutdown())


- @main.command()
- @click.option("--config", type=click.Path(exists=True), help="Configuration file")
- @click.option("--server", help="Orchestrator WebSocket URL")
- @click.option("--token", help="Authentication token")
- @click.option("--no-verify-ssl", is_flag=True, help="Skip SSL verification")
- @click.option("--debug", is_flag=True, help="Enable debug output")
- @click.pass_context
- def monitor(
-     ctx,
-     config: Optional[str],
-     server: Optional[str],
-     token: Optional[str],
-     no_verify_ssl: bool,
-     debug: bool,
- ):
-     """Start the monitoring TUI."""
-
-     # Enable debug logging if requested
-     if debug:
-         setup_logging(verbose=True)
-         console.print("[yellow]Debug mode enabled[/yellow]")
-
-     # Load configuration
+ def _load_monitor_config(config, server, token):
+     """Load monitor configuration from file or fallback to orchestrator config."""
      base_config = ConfigManager.find_config("monitor", config)

      if not base_config:
-         # Try to find monitor config in orchestrator config as fallback
          orch_config = ConfigManager.find_config("orchestrator")
          if orch_config and "monitor" in orch_config:
              base_config = {"monitor": orch_config["monitor"]}
@@ -295,15 +329,11 @@ def monitor(
          if not server or not token:
              console.print("[yellow]No monitor config found, using CLI args[/yellow]")

-     # Handle different config structures
-     # Case 1: Config has top-level 'monitor' section
-     if "monitor" in base_config:
-         config_data = base_config["monitor"]
-     # Case 2: Config IS the monitor config (no wrapper)
-     else:
-         config_data = base_config
+     return base_config.get("monitor", base_config)
+

-     # Apply CLI overrides (CLI always wins)
+ def _apply_monitor_overrides(config_data, server, token, no_verify_ssl):
+     """Apply CLI overrides to monitor configuration."""
      if server:
          config_data["server"] = server
      if token:
@@ -311,17 +341,20 @@ def monitor(
      if no_verify_ssl:
          config_data["verify_ssl"] = False

-     # Debug output
-     if debug:
-         console.print("\n[cyan]Final monitor configuration:[/cyan]")
-         console.print(f" Server: {config_data.get('server', 'NOT SET')}")
-         console.print(
-             f" Token: {'***' + config_data.get('token', '')[-4:] if config_data.get('token') else 'NOT SET'}"
-         )
-         console.print(f" Verify SSL: {config_data.get('verify_ssl', True)}")
-         console.print()

-     # Validate required fields
+ def _debug_monitor_config(config_data):
+     """Print debug information about monitor configuration."""
+     console.print("\n[cyan]Final monitor configuration:[/cyan]")
+     console.print(f" Server: {config_data.get('server', 'NOT SET')}")
+     console.print(
+         f" Token: {'***' + config_data.get('token', '')[-4:] if config_data.get('token') else 'NOT SET'}"
+     )
+     console.print(f" Verify SSL: {config_data.get('verify_ssl', True)}")
+     console.print()
+
+
+ def _validate_monitor_config(config_data):
+     """Validate required monitor configuration fields."""
      if not config_data.get("server"):
          console.print("[red]Error: --server required (or set 'server' in monitor.yaml)[/red]")
          console.print("\n[dim]Example monitor.yaml:[/dim]")
@@ -336,12 +369,43 @@ def monitor(
          console.print("token: your-token-here")
          sys.exit(1)

-     # Set defaults for optional settings
+
+ def _set_monitor_defaults(config_data):
+     """Set default values for optional monitor settings."""
      config_data.setdefault("refresh_interval", 1.0)
      config_data.setdefault("show_inactive_workers", False)
      config_data.setdefault("max_log_lines", 100)

-     # Create and start monitor
+
+ @main.command()
+ @click.option("--config", type=click.Path(exists=True), help="Configuration file")
+ @click.option("--server", help="Orchestrator WebSocket URL")
+ @click.option("--token", help="Authentication token")
+ @click.option("--no-verify-ssl", is_flag=True, help="Skip SSL verification")
+ @click.option("--debug", is_flag=True, help="Enable debug output")
+ @click.pass_context
+ def monitor(
+     ctx,
+     config: Optional[str],
+     server: Optional[str],
+     token: Optional[str],
+     no_verify_ssl: bool,
+     debug: bool,
+ ):
+     """Start the monitoring TUI."""
+     if debug:
+         setup_logging(verbose=True)
+         console.print("[yellow]Debug mode enabled[/yellow]")
+
+     config_data = _load_monitor_config(config, server, token)
+     _apply_monitor_overrides(config_data, server, token, no_verify_ssl)
+
+     if debug:
+         _debug_monitor_config(config_data)
+
+     _validate_monitor_config(config_data)
+     _set_monitor_defaults(config_data)
+
      try:
          monitor_instance = Monitor(config_data)

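The old two-branch handling of wrapped versus flat monitor configs collapses to a single dict.get with the dict itself as the default. Both shapes below resolve to the same section (values are made up):

    wrapped = {"monitor": {"server": "wss://localhost:8765", "token": "example-token"}}
    flat = {"server": "wss://localhost:8765", "token": "example-token"}

    # dict.get falls back to the whole dict when there is no "monitor" wrapper
    assert wrapped.get("monitor", wrapped) == flat
    assert flat.get("monitor", flat) == flat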
@@ -406,7 +470,7 @@ def view(ctx, data_dir: str, refresh_rate: int, no_images: bool):
          viewer.disable_images = True
      viewer.refresh_rate = refresh_rate

-     console.print(f"[cyan]Starting dataset viewer...[/cyan]")
+     console.print("[cyan]Starting dataset viewer...[/cyan]")
      console.print(f"[dim]Data directory: {data_path}[/dim]")

      asyncio.run(viewer.run())
@@ -424,6 +488,400 @@ def view(ctx, data_dir: str, refresh_rate: int, no_images: bool):
          sys.exit(1)


+ def _load_admin_credentials(config, server, token):
+     """Load admin server and token from config if not provided."""
+     if server and token:
+         return server, token
+
+     base_config = ConfigManager.find_config("orchestrator", config) or {}
+     admin_config = base_config.get("admin", {})
+     admin_tokens = base_config.get("orchestrator", {}).get("auth", {}).get("admin_tokens", [])
+
+     final_server = server or admin_config.get("server", "ws://localhost:8765")
+     final_token = token or admin_config.get("token")
+
+     if not final_token and admin_tokens:
+         console.print("Using first admin token.")
+         final_token = admin_tokens[0].get("token")
+
+     return final_server, final_token
+
+
+ def _setup_ssl_context(server, no_verify_ssl):
+     """Setup SSL context for websocket connection."""
+     import ssl
+
+     ssl_context = None
+     if server.startswith("wss://"):
+         ssl_context = ssl.create_default_context()
+         if no_verify_ssl:
+             ssl_context.check_hostname = False
+             ssl_context.verify_mode = ssl.CERT_NONE
+
+     return ssl_context
+
+
+ async def _authenticate_admin(websocket, token):
+     """Authenticate as admin with the websocket."""
+     await websocket.send(json.dumps({"token": token, "role": "admin"}))
+
+     response = await websocket.recv()
+     auth_response = json.loads(response)
+
+     if "error" in auth_response:
+         console.print(f"[red]Authentication failed: {auth_response['error']}[/red]")
+         return False
+
+     console.print("[green]✓ Authenticated as admin[/green]")
+     return True
+
+
+ async def _send_reload_command(websocket, new_cfg):
+     """Send reload command and handle response."""
+     await websocket.send(json.dumps({"type": "reload_config", "config": new_cfg}))
+
+     response = await websocket.recv()
+     reload_response = json.loads(response)
+
+     if reload_response.get("type") == "reload_complete":
+         if "message" in reload_response and "No changes" in reload_response["message"]:
+             console.print(f"[yellow]{reload_response['message']}[/yellow]")
+         else:
+             console.print("[green]✓ Configuration reloaded successfully![/green]")
+
+         if "updated" in reload_response and reload_response["updated"]:
+             console.print("\n[cyan]Updated sections:[/cyan]")
+             for section in reload_response["updated"]:
+                 console.print(f" • {section}")
+
+         if "warnings" in reload_response and reload_response["warnings"]:
+             console.print("\n[yellow]Warnings:[/yellow]")
+             for warning in reload_response["warnings"]:
+                 console.print(f" ⚠ {warning}")
+
+         return True
+     else:
+         error = reload_response.get("error", "Unknown error")
+         console.print(f"[red]Reload failed: {error} ({reload_response=})[/red]")
+         return False
+
+
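The helpers above define the admin wire protocol: send {"token": ..., "role": "admin"}, read one JSON reply, then exchange a command. A minimal standalone client sketch under that assumption (server URL and token are hypothetical):

    import asyncio
    import json

    import websockets  # third-party dependency the CLI already uses

    async def reload_once(server: str, token: str, new_cfg: dict) -> dict:
        async with websockets.connect(server, ping_interval=20, ping_timeout=60) as ws:
            # Authenticate exactly as _authenticate_admin does
            await ws.send(json.dumps({"token": token, "role": "admin"}))
            auth = json.loads(await ws.recv())
            if "error" in auth:
                raise RuntimeError(auth["error"])
            # Then issue the same command _send_reload_command sends
            await ws.send(json.dumps({"type": "reload_config", "config": new_cfg}))
            return json.loads(await ws.recv())

    # asyncio.run(reload_once("ws://localhost:8765", "my-admin-token", {}))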
+ def _add_token_to_config(config_data: Dict[str, Any], role: str, name: str, token: str) -> bool:
+     """Add a new token to the config data."""
+     # Ensure the auth section exists
+     if "orchestrator" not in config_data:
+         config_data["orchestrator"] = {}
+     if "auth" not in config_data["orchestrator"]:
+         config_data["orchestrator"]["auth"] = {}
+
+     auth_config = config_data["orchestrator"]["auth"]
+     token_key = f"{role}_tokens"
+
+     # Initialize token list if it doesn't exist
+     if token_key not in auth_config:
+         auth_config[token_key] = []
+
+     # Check if token already exists
+     for existing_token in auth_config[token_key]:
+         if existing_token.get("token") == token:
+             console.print(f"[yellow]Token already exists for {role}: {name}[/yellow]")
+             return False
+         if existing_token.get("name") == name:
+             console.print(f"[yellow]Name already exists for {role}: {name}[/yellow]")
+             return False
+
+     # Add the new token
+     auth_config[token_key].append({"name": name, "token": token})
+     console.print(f"[green]✓ Added {role} token for {name}[/green]")
+     return True
+
+
+ def _remove_token_from_config(config_data: Dict[str, Any], role: str, identifier: str) -> bool:
+     """Remove a token from the config data by name or token."""
+     auth_config = config_data.get("orchestrator", {}).get("auth", {})
+     token_key = f"{role}_tokens"
+
+     if token_key not in auth_config:
+         console.print(f"[red]No {role} tokens found in config[/red]")
+         return False
+
+     tokens = auth_config[token_key]
+     removed = False
+
+     for i, token_entry in enumerate(tokens):
+         if token_entry.get("name") == identifier or token_entry.get("token") == identifier:
+             removed_entry = tokens.pop(i)
+             console.print(f"[green]✓ Removed {role} token: {removed_entry['name']}[/green]")
+             removed = True
+             break
+
+     if not removed:
+         console.print(f"[red]Token not found for {role}: {identifier}[/red]")
+
+     return removed
+
+
+ def _list_tokens_in_config(config_data: Dict[str, Any], role: Optional[str] = None):
+     """List tokens in the config data."""
+     auth_config = config_data.get("orchestrator", {}).get("auth", {})
+
+     if not auth_config:
+         console.print("[yellow]No auth configuration found[/yellow]")
+         return
+
+     roles_to_show = [role] if role else ["worker", "admin", "monitor"]
+
+     for token_role in roles_to_show:
+         token_key = f"{token_role}_tokens"
+         tokens = auth_config.get(token_key, [])
+
+         if tokens:
+             console.print(f"\n[cyan]{token_role.title()} tokens:[/cyan]")
+             for token_entry in tokens:
+                 name = token_entry.get("name", "Unknown")
+                 token = token_entry.get("token", "")
+                 masked_token = f"***{token[-4:]}" if len(token) > 4 else "***"
+                 console.print(f" • {name}: {masked_token}")
+         else:
+             console.print(f"\n[dim]No {token_role} tokens configured[/dim]")
+
+
+ def _save_config_file(config_data: Dict[str, Any], config_path: Path) -> bool:
+     """Save the config data to a file."""
+     try:
+         with open(config_path, "w") as f:
+             yaml.safe_dump(config_data, f, default_flow_style=False, sort_keys=False)
+         console.print(f"[green]✓ Configuration saved to {config_path}[/green]")
+         return True
+     except Exception as e:
+         console.print(f"[red]Error saving config: {e}[/red]")
+         return False
+
+
+ async def _reload_orchestrator_config(
+     server: str, token: str, config_data: Dict[str, Any], no_verify_ssl: bool
+ ) -> bool:
+     """Reload the orchestrator configuration."""
+     import websockets
+
+     ssl_context = _setup_ssl_context(server, no_verify_ssl)
+
+     try:
+         async with websockets.connect(
+             server, ssl=ssl_context, ping_interval=20, ping_timeout=60, close_timeout=10
+         ) as websocket:
+             if not await _authenticate_admin(websocket, token):
+                 return False
+
+             return await _send_reload_command(websocket, config_data)
+     except Exception as e:
+         console.print(f"[red]Error connecting to orchestrator: {e}[/red]")
+         return False
+
+
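The token helpers all operate on one nested dict shape under orchestrator.auth, with one <role>_tokens list per role. A sketch of that structure and the helpers acting on it (names and token values are hypothetical):

    config_data = {
        "orchestrator": {
            "auth": {
                "worker_tokens": [{"name": "gpu-box-1", "token": "wk-example-123"}],
                "admin_tokens": [{"name": "ops", "token": "adm-example-456"}],
                "monitor_tokens": [],
            }
        }
    }

    # Appends {"name": ..., "token": ...} to monitor_tokens, refusing duplicates
    _add_token_to_config(config_data, "monitor", "dashboard", "mon-example-789")

    # Prints each role's tokens masked to their last four characters
    _list_tokens_in_config(config_data)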
+ @main.group()
+ @click.option("--config", type=click.Path(exists=True), help="Configuration file")
+ @click.option("--server", help="Orchestrator WebSocket URL")
+ @click.option("--token", help="Admin authentication token")
+ @click.option("--no-verify-ssl", is_flag=True, help="Skip SSL verification")
+ @click.pass_context
+ def auth(
+     ctx, config: Optional[str], server: Optional[str], token: Optional[str], no_verify_ssl: bool
+ ):
+     """Manage authentication tokens for the orchestrator."""
+     ctx.ensure_object(dict)
+     ctx.obj.update(
+         {"config": config, "server": server, "token": token, "no_verify_ssl": no_verify_ssl}
+     )
+
+
+ @auth.command()
+ @click.argument("role", type=click.Choice(["worker", "admin", "monitor"]))
+ @click.argument("name")
+ @click.argument("token_value")
+ @click.option(
+     "--no-reload", is_flag=True, help="Don't reload orchestrator config after adding token"
+ )
+ @click.pass_context
+ def add(ctx, role: str, name: str, token_value: str, no_reload: bool):
+     """Add a new authentication token.
+
+     ROLE: Type of token (worker, admin, monitor)
+     NAME: Display name for the token
+     TOKEN_VALUE: The actual token string
+     """
+     config_file = ctx.obj.get("config")
+     server = ctx.obj.get("server")
+     admin_token = ctx.obj.get("token")
+     no_verify_ssl = ctx.obj.get("no_verify_ssl", False)
+
+     # Load config
+     config_data = ConfigManager.find_config("orchestrator", config_file)
+     if not config_data:
+         console.print("[red]No orchestrator config found[/red]")
+         console.print("[dim]Use --config to specify config file path[/dim]")
+         sys.exit(1)
+
+     # Find config file path for saving
+     config_path = None
+     if config_file:
+         config_path = Path(config_file)
+     else:
+         # Try to find the config file that was loaded
+         for search_path in [
+             Path.cwd() / "orchestrator.yaml",
+             Path.cwd() / "config" / "orchestrator.yaml",
+             Path.home() / ".caption-flow" / "orchestrator.yaml",
+             ConfigManager.get_xdg_config_home() / "caption-flow" / "orchestrator.yaml",
+         ]:
+             if search_path.exists():
+                 config_path = search_path
+                 break
+
+     if not config_path:
+         console.print("[red]Could not determine config file to save to[/red]")
+         console.print("[dim]Use --config to specify config file path[/dim]")
+         sys.exit(1)
+
+     # Add token to config
+     if not _add_token_to_config(config_data, role, name, token_value):
+         sys.exit(1)
+
+     # Save config file
+     if not _save_config_file(config_data, config_path):
+         sys.exit(1)
+
+     # Reload orchestrator if requested
+     if not no_reload:
+         server, admin_token = _load_admin_credentials(config_file, server, admin_token)
+
+         if not server:
+             console.print("[yellow]No server specified, skipping orchestrator reload[/yellow]")
+             console.print("[dim]Use --server to reload orchestrator config[/dim]")
+         elif not admin_token:
+             console.print("[yellow]No admin token specified, skipping orchestrator reload[/yellow]")
+             console.print("[dim]Use --token to reload orchestrator config[/dim]")
+         else:
+             console.print(f"[cyan]Reloading orchestrator config...[/cyan]")
+             success = asyncio.run(
+                 _reload_orchestrator_config(server, admin_token, config_data, no_verify_ssl)
+             )
+             if not success:
+                 console.print("[yellow]Config file updated but orchestrator reload failed[/yellow]")
+                 console.print("[dim]You may need to restart the orchestrator manually[/dim]")
+
+
+ @auth.command()
+ @click.argument("role", type=click.Choice(["worker", "admin", "monitor"]))
+ @click.argument("identifier")
+ @click.option(
+     "--no-reload", is_flag=True, help="Don't reload orchestrator config after removing token"
+ )
+ @click.pass_context
+ def remove(ctx, role: str, identifier: str, no_reload: bool):
+     """Remove an authentication token.
+
+     ROLE: Type of token (worker, admin, monitor)
+     IDENTIFIER: Name or token value to remove
+     """
+     config_file = ctx.obj.get("config")
+     server = ctx.obj.get("server")
+     admin_token = ctx.obj.get("token")
+     no_verify_ssl = ctx.obj.get("no_verify_ssl", False)
+
+     # Load config
+     config_data = ConfigManager.find_config("orchestrator", config_file)
+     if not config_data:
+         console.print("[red]No orchestrator config found[/red]")
+         sys.exit(1)
+
+     # Find config file path for saving
+     config_path = None
+     if config_file:
+         config_path = Path(config_file)
+     else:
+         # Try to find the config file that was loaded
+         for search_path in [
+             Path.cwd() / "orchestrator.yaml",
+             Path.cwd() / "config" / "orchestrator.yaml",
+             Path.home() / ".caption-flow" / "orchestrator.yaml",
+             ConfigManager.get_xdg_config_home() / "caption-flow" / "orchestrator.yaml",
+         ]:
+             if search_path.exists():
+                 config_path = search_path
+                 break
+
+     if not config_path:
+         console.print("[red]Could not determine config file to save to[/red]")
+         sys.exit(1)
+
+     # Remove token from config
+     if not _remove_token_from_config(config_data, role, identifier):
+         sys.exit(1)
+
+     # Save config file
+     if not _save_config_file(config_data, config_path):
+         sys.exit(1)
+
+     # Reload orchestrator if requested
+     if not no_reload:
+         server, admin_token = _load_admin_credentials(config_file, server, admin_token)
+
+         if not server:
+             console.print("[yellow]No server specified, skipping orchestrator reload[/yellow]")
+         elif not admin_token:
+             console.print("[yellow]No admin token specified, skipping orchestrator reload[/yellow]")
+         else:
+             console.print(f"[cyan]Reloading orchestrator config...[/cyan]")
+             success = asyncio.run(
+                 _reload_orchestrator_config(server, admin_token, config_data, no_verify_ssl)
+             )
+             if not success:
+                 console.print("[yellow]Config file updated but orchestrator reload failed[/yellow]")
+
+
+ @auth.command()
+ @click.argument("role", type=click.Choice(["worker", "admin", "monitor", "all"]), required=False)
+ @click.pass_context
+ def list(ctx, role: Optional[str]):
+     """List authentication tokens.
+
+     ROLE: Type of tokens to list (worker, admin, monitor, all). Default: all
+     """
+     config_file = ctx.obj.get("config")
+
+     # Load config
+     config_data = ConfigManager.find_config("orchestrator", config_file)
+     if not config_data:
+         console.print("[red]No orchestrator config found[/red]")
+         sys.exit(1)
+
+     # Show tokens
+     if role == "all" or role is None:
+         _list_tokens_in_config(config_data)
+     else:
+         _list_tokens_in_config(config_data, role)
+
+
+ @auth.command()
+ @click.option("--length", default=32, help="Token length (default: 32)")
+ @click.option("--count", default=1, help="Number of tokens to generate (default: 1)")
+ def generate(length: int, count: int):
+     """Generate random authentication tokens."""
+     import secrets
+     import string
+
+     alphabet = string.ascii_letters + string.digits + "-_"
+
+     console.print(
+         f"[cyan]Generated {count} token{'s' if count > 1 else ''} ({length} characters each):[/cyan]\n"
+     )
+
+     for i in range(count):
+         token = "".join(secrets.choice(alphabet) for _ in range(length))
+         console.print(f" {i + 1}: {token}")
+
+
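Taken together this adds a small token-management CLI under the auth group. A quick way to exercise it without a running orchestrator is click's test runner; the sketch below assumes the command group is exported as caption_flow.cli.main, as the decorators above suggest (file paths and token values are hypothetical):

    from click.testing import CliRunner

    from caption_flow.cli import main

    runner = CliRunner()

    # Generate two 48-character candidate tokens
    print(runner.invoke(main, ["auth", "generate", "--count", "2", "--length", "48"]).output)

    # Register one for a worker; --no-reload skips the orchestrator round trip
    result = runner.invoke(
        main,
        ["auth", "--config", "orchestrator.yaml", "add", "worker", "gpu-box-1", "wk-example-123", "--no-reload"],
    )
    print(result.output)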
  @main.command()
  @click.option("--config", type=click.Path(exists=True), help="Configuration file")
  @click.option("--server", help="Orchestrator WebSocket URL")
@@ -441,27 +899,8 @@ def reload_config(
  ):
      """Reload orchestrator configuration via admin connection."""
      import websockets
-     import ssl

-     # Load base config to get server/token if not provided via CLI
-     if not server or not token:
-         base_config = ConfigManager.find_config("orchestrator", config) or {}
-         admin_config = base_config.get("admin", {})
-         admin_tokens = base_config.get("orchestrator", {}).get("auth", {}).get("admin_tokens", [])
-         has_admin_tokens = False
-         if len(admin_tokens) > 0:
-             has_admin_tokens = True
-             first_admin_token = admin_tokens[0].get("token", None)
-             # Do not print sensitive admin token to console.
-
-         if not server:
-             server = admin_config.get("server", "ws://localhost:8765")
-         if not token:
-             token = admin_config.get("token", None)
-         if token is None and has_admin_tokens:
-             # grab the first one, we'll just assume we're localhost.
-             console.print("Using first admin token.")
-             token = first_admin_token
+     server, token = _load_admin_credentials(config, server, token)

      if not server:
          console.print("[red]Error: --server required (or set in config)[/red]")
@@ -472,64 +911,22 @@ def reload_config(

      console.print(f"[cyan]Loading configuration from {new_config}...[/cyan]")

-     # Load the new configuration
      new_cfg = ConfigManager.load_yaml(Path(new_config))
      if not new_cfg:
          console.print("[red]Failed to load configuration[/red]")
          sys.exit(1)

-     # Setup SSL
-     ssl_context = None
-     if server.startswith("wss://"):
-         if no_verify_ssl:
-             ssl_context = ssl.create_default_context()
-             ssl_context.check_hostname = False
-             ssl_context.verify_mode = ssl.CERT_NONE
-         else:
-             ssl_context = ssl.create_default_context()
+     ssl_context = _setup_ssl_context(server, no_verify_ssl)

      async def send_reload():
          try:
-             async with websockets.connect(server, ssl=ssl_context) as websocket:
-                 # Authenticate as admin
-                 await websocket.send(json.dumps({"token": token, "role": "admin"}))
-
-                 response = await websocket.recv()
-                 auth_response = json.loads(response)
-
-                 if "error" in auth_response:
-                     console.print(f"[red]Authentication failed: {auth_response['error']}[/red]")
+             async with websockets.connect(
+                 server, ssl=ssl_context, ping_interval=20, ping_timeout=60, close_timeout=10
+             ) as websocket:
+                 if not await _authenticate_admin(websocket, token):
                      return False

-                 console.print("[green]✓ Authenticated as admin[/green]")
-
-                 # Send reload command
-                 await websocket.send(json.dumps({"type": "reload_config", "config": new_cfg}))
-
-                 response = await websocket.recv()
-                 reload_response = json.loads(response)
-
-                 if reload_response.get("type") == "reload_complete":
-                     if "message" in reload_response and "No changes" in reload_response["message"]:
-                         console.print(f"[yellow]{reload_response['message']}[/yellow]")
-                     else:
-                         console.print("[green]✓ Configuration reloaded successfully![/green]")
-
-                     if "updated" in reload_response and reload_response["updated"]:
-                         console.print("\n[cyan]Updated sections:[/cyan]")
-                         for section in reload_response["updated"]:
-                             console.print(f" • {section}")
-
-                     if "warnings" in reload_response and reload_response["warnings"]:
-                         console.print("\n[yellow]Warnings:[/yellow]")
-                         for warning in reload_response["warnings"]:
-                             console.print(f" ⚠ {warning}")
-
-                     return True
-                 else:
-                     error = reload_response.get("error", "Unknown error")
-                     console.print(f"[red]Reload failed: {error} ({reload_response=})[/red]")
-                     return False
+             return await _send_reload_command(websocket, new_cfg)

          except Exception as e:
              console.print(f"[red]Error: {e}[/red]")
@@ -540,39 +937,20 @@ def reload_config(
      sys.exit(1)


- @main.command()
- @click.option("--data-dir", default="./caption_data", help="Storage directory")
- @click.option("--checkpoint-dir", default="./checkpoints", help="Checkpoint directory")
- @click.option("--fix", is_flag=True, help="Fix issues by resetting abandoned chunks")
- @click.option("--verbose", is_flag=True, help="Show detailed information")
- def scan_chunks(data_dir: str, checkpoint_dir: str, fix: bool, verbose: bool):
-     """Scan for sparse or abandoned chunks and optionally fix them."""
-     from .utils.chunk_tracker import ChunkTracker
-     from .storage import StorageManager
-     import pyarrow.parquet as pq
-
-     console.print("[bold cyan]Scanning for sparse/abandoned chunks...[/bold cyan]\n")
-
-     checkpoint_path = Path(checkpoint_dir) / "chunks.json"
-     if not checkpoint_path.exists():
-         console.print("[red]No chunk checkpoint found![/red]")
-         return
-
-     tracker = ChunkTracker(checkpoint_path)
-     storage = StorageManager(Path(data_dir))
-
-     # Get and display stats
-     stats = tracker.get_stats()
+ def _display_chunk_stats(stats):
+     """Display chunk statistics."""
      console.print(f"[green]Total chunks:[/green] {stats['total']}")
      console.print(f"[green]Completed:[/green] {stats['completed']}")
      console.print(f"[yellow]Pending:[/yellow] {stats['pending']}")
      console.print(f"[yellow]Assigned:[/yellow] {stats['assigned']}")
      console.print(f"[red]Failed:[/red] {stats['failed']}\n")

-     # Find abandoned chunks
+
+ def _find_abandoned_chunks(tracker):
+     """Find chunks that have been assigned for too long."""
      abandoned_chunks = []
      stale_threshold = 3600  # 1 hour
-     current_time = datetime.utcnow()
+     current_time = datetime.now(_datetime.UTC)

      for chunk_id, chunk_state in tracker.chunks.items():
          if chunk_state.status == "assigned" and chunk_state.assigned_at:
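The switch from datetime.utcnow() to datetime.now(_datetime.UTC) trades a naive timestamp for a timezone-aware one; utcnow() is deprecated as of Python 3.12, and datetime.UTC requires Python 3.11+. Since aware and naive datetimes cannot be subtracted, this presumably assumes the tracker's assigned_at values are stored timezone-aware as well. A minimal illustration:

    import datetime as _datetime
    from datetime import datetime

    assigned_at = datetime.now(_datetime.UTC)  # aware; a naive value would raise on subtraction
    age = (datetime.now(_datetime.UTC) - assigned_at).total_seconds()
    is_stale = age > 3600  # the same one-hour stale_threshold used above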
@@ -580,24 +958,31 @@ def scan_chunks(data_dir: str, checkpoint_dir: str, fix: bool, verbose: bool):
              if age > stale_threshold:
                  abandoned_chunks.append((chunk_id, chunk_state, age))

-     if abandoned_chunks:
-         console.print(f"[red]Found {len(abandoned_chunks)} abandoned chunks:[/red]")
-         for chunk_id, chunk_state, age in abandoned_chunks[:10]:
-             age_str = f"{age/3600:.1f} hours" if age > 3600 else f"{age/60:.1f} minutes"
-             console.print(f" • {chunk_id} (assigned to {chunk_state.assigned_to} {age_str} ago)")
+     return abandoned_chunks

-         if len(abandoned_chunks) > 10:
-             console.print(f" ... and {len(abandoned_chunks) - 10} more")

-         if fix:
-             console.print("\n[yellow]Resetting abandoned chunks to pending...[/yellow]")
-             for chunk_id, _, _ in abandoned_chunks:
-                 tracker.mark_failed(chunk_id)
-             console.print(f"[green]✓ Reset {len(abandoned_chunks)} chunks[/green]")
+ def _display_abandoned_chunks(abandoned_chunks, fix, tracker):
+     """Display abandoned chunks and optionally fix them."""
+     if not abandoned_chunks:
+         return

-     # Check for sparse shards
-     console.print("\n[bold cyan]Checking for sparse shards...[/bold cyan]")
+     console.print(f"[red]Found {len(abandoned_chunks)} abandoned chunks:[/red]")
+     for chunk_id, chunk_state, age in abandoned_chunks[:10]:
+         age_str = f"{age / 3600:.1f} hours" if age > 3600 else f"{age / 60:.1f} minutes"
+         console.print(f" • {chunk_id} (assigned to {chunk_state.assigned_to} {age_str} ago)")
+
+     if len(abandoned_chunks) > 10:
+         console.print(f" ... and {len(abandoned_chunks) - 10} more")
+
+     if fix:
+         console.print("\n[yellow]Resetting abandoned chunks to pending...[/yellow]")
+         for chunk_id, _, _ in abandoned_chunks:
+             tracker.mark_failed(chunk_id)
+         console.print(f"[green]✓ Reset {len(abandoned_chunks)} chunks[/green]")

+
+ def _find_sparse_shards(tracker):
+     """Find shards with gaps or issues."""
      shards_summary = tracker.get_shards_summary()
      sparse_shards = []

@@ -616,60 +1001,108 @@ def scan_chunks(data_dir: str, checkpoint_dir: str, fix: bool, verbose: bool):
          if has_gaps or shard_info["failed_chunks"] > 0:
              sparse_shards.append((shard_name, shard_info, has_gaps))

-     if sparse_shards:
-         console.print(f"\n[yellow]Found {len(sparse_shards)} sparse/incomplete shards:[/yellow]")
-         for shard_name, shard_info, has_gaps in sparse_shards[:5]:
-             status = []
-             if shard_info["pending_chunks"] > 0:
-                 status.append(f"{shard_info['pending_chunks']} pending")
-             if shard_info["assigned_chunks"] > 0:
-                 status.append(f"{shard_info['assigned_chunks']} assigned")
-             if shard_info["failed_chunks"] > 0:
-                 status.append(f"{shard_info['failed_chunks']} failed")
-             if has_gaps:
-                 status.append("has gaps")
-
-             console.print(f" {shard_name}: {', '.join(status)}")
+     return sparse_shards
+
+
+ def _display_sparse_shards(sparse_shards):
+     """Display sparse/incomplete shards."""
+     if not sparse_shards:
+         return
+
+     console.print(f"\n[yellow]Found {len(sparse_shards)} sparse/incomplete shards:[/yellow]")
+     for shard_name, shard_info, has_gaps in sparse_shards[:5]:
+         status = []
+         if shard_info["pending_chunks"] > 0:
+             status.append(f"{shard_info['pending_chunks']} pending")
+         if shard_info["assigned_chunks"] > 0:
+             status.append(f"{shard_info['assigned_chunks']} assigned")
+         if shard_info["failed_chunks"] > 0:
+             status.append(f"{shard_info['failed_chunks']} failed")
+         if has_gaps:
+             status.append("has gaps")
+
+         console.print(f" • {shard_name}: {', '.join(status)}")
+         console.print(
+             f" Progress: {shard_info['completed_chunks']}/{shard_info['total_chunks']} chunks"
+         )
+
+     if len(sparse_shards) > 5:
+         console.print(f" ... and {len(sparse_shards) - 5} more")
+
+
+ def _cross_check_storage(storage, tracker, fix):
+     """Cross-check chunk tracker against storage."""
+     import pyarrow.parquet as pq
+
+     console.print("\n[bold cyan]Cross-checking with stored captions...[/bold cyan]")
+
+     try:
+         table = pq.read_table(storage.captions_path, columns=["chunk_id"])
+         stored_chunk_ids = set(c for c in table["chunk_id"].to_pylist() if c)
+
+         tracker_completed = set(c for c, s in tracker.chunks.items() if s.status == "completed")
+
+         missing_in_storage = tracker_completed - stored_chunk_ids
+         missing_in_tracker = stored_chunk_ids - set(tracker.chunks.keys())
+
+         if missing_in_storage:
              console.print(
-                 f" Progress: {shard_info['completed_chunks']}/{shard_info['total_chunks']} chunks"
+                 f"\n[red]Chunks marked complete but missing from storage:[/red] {len(missing_in_storage)}"
              )
+             for chunk_id in list(missing_in_storage)[:5]:
+                 console.print(f" • {chunk_id}")

-         if len(sparse_shards) > 5:
-             console.print(f" ... and {len(sparse_shards) - 5} more")
+             if fix:
+                 console.print("[yellow]Resetting these chunks to pending...[/yellow]")
+                 for chunk_id in missing_in_storage:
+                     tracker.mark_failed(chunk_id)
+                 console.print(f"[green]✓ Reset {len(missing_in_storage)} chunks[/green]")

-     # Cross-check with storage if verbose
-     if storage.captions_path.exists() and verbose:
-         console.print("\n[bold cyan]Cross-checking with stored captions...[/bold cyan]")
+         if missing_in_tracker:
+             console.print(
+                 f"\n[yellow]Chunks in storage but not tracked:[/yellow] {len(missing_in_tracker)}"
+             )

-         try:
-             table = pq.read_table(storage.captions_path, columns=["chunk_id"])
-             stored_chunk_ids = set(c for c in table["chunk_id"].to_pylist() if c)
+     except Exception as e:
+         console.print(f"[red]Error reading storage: {e}[/red]")

-             tracker_completed = set(c for c, s in tracker.chunks.items() if s.status == "completed")

-             missing_in_storage = tracker_completed - stored_chunk_ids
-             missing_in_tracker = stored_chunk_ids - set(tracker.chunks.keys())
+ @main.command()
+ @click.option("--data-dir", default="./caption_data", help="Storage directory")
+ @click.option("--checkpoint-dir", default="./checkpoints", help="Checkpoint directory")
+ @click.option("--fix", is_flag=True, help="Fix issues by resetting abandoned chunks")
+ @click.option("--verbose", is_flag=True, help="Show detailed information")
+ def scan_chunks(data_dir: str, checkpoint_dir: str, fix: bool, verbose: bool):
+     """Scan for sparse or abandoned chunks and optionally fix them."""
+     from .storage import StorageManager
+     from .utils.chunk_tracker import ChunkTracker

-             if missing_in_storage:
-                 console.print(
-                     f"\n[red]Chunks marked complete but missing from storage:[/red] {len(missing_in_storage)}"
-                 )
-                 for chunk_id in list(missing_in_storage)[:5]:
-                     console.print(f" • {chunk_id}")
+     console.print("[bold cyan]Scanning for sparse/abandoned chunks...[/bold cyan]\n")

-                 if fix:
-                     console.print("[yellow]Resetting these chunks to pending...[/yellow]")
-                     for chunk_id in missing_in_storage:
-                         tracker.mark_failed(chunk_id)
-                     console.print(f"[green]✓ Reset {len(missing_in_storage)} chunks[/green]")
+     checkpoint_path = Path(checkpoint_dir) / "chunks.json"
+     if not checkpoint_path.exists():
+         console.print("[red]No chunk checkpoint found![/red]")
+         return

-             if missing_in_tracker:
-                 console.print(
-                     f"\n[yellow]Chunks in storage but not tracked:[/yellow] {len(missing_in_tracker)}"
-                 )
+     tracker = ChunkTracker(checkpoint_path)
+     storage = StorageManager(Path(data_dir))

-         except Exception as e:
-             console.print(f"[red]Error reading storage: {e}[/red]")
+     # Get and display stats
+     stats = tracker.get_stats()
+     _display_chunk_stats(stats)
+
+     # Find and handle abandoned chunks
+     abandoned_chunks = _find_abandoned_chunks(tracker)
+     _display_abandoned_chunks(abandoned_chunks, fix, tracker)
+
+     # Check for sparse shards
+     console.print("\n[bold cyan]Checking for sparse shards...[/bold cyan]")
+     sparse_shards = _find_sparse_shards(tracker)
+     _display_sparse_shards(sparse_shards)
+
+     # Cross-check with storage if verbose
+     if storage.captions_path.exists() and verbose:
+         _cross_check_storage(storage, tracker, fix)

      # Summary
      console.print("\n[bold cyan]Summary:[/bold cyan]")
@@ -693,12 +1126,163 @@ def scan_chunks(data_dir: str, checkpoint_dir: str, fix: bool, verbose: bool):
          tracker.save_checkpoint()


+ def _display_export_stats(stats):
+     """Display storage statistics."""
+     console.print("\n[bold cyan]Storage Statistics:[/bold cyan]")
+     console.print(f"[green]Total rows:[/green] {stats['total_rows']:,}")
+     console.print(f"[green]Total outputs:[/green] {stats['total_outputs']:,}")
+     console.print(f"[green]Shards:[/green] {stats['shard_count']} ({', '.join(stats['shards'])})")
+     console.print(f"[green]Output fields:[/green] {', '.join(stats['output_fields'])}")
+
+     if stats.get("field_stats"):
+         console.print("\n[cyan]Field breakdown:[/cyan]")
+         for field, count in stats["field_stats"].items():
+             console.print(f" • {field}: {count['total_items']:,} items")
+
+
+ def _prepare_export_params(shard, shards, columns):
+     """Prepare shard filter and column list."""
+     shard_filter = None
+     if shard:
+         shard_filter = [shard]
+     elif shards:
+         shard_filter = [s.strip() for s in shards.split(",")]
+
+     column_list = None
+     if columns:
+         column_list = [col.strip() for col in columns.split(",")]
+         console.print(f"\n[cyan]Exporting columns:[/cyan] {', '.join(column_list)}")
+
+     return shard_filter, column_list
+
+
+ async def _export_all_formats(
+     exporter, output, shard_filter, column_list, limit, filename_column, export_column
+ ):
+     """Export to all formats."""
+     base_name = output or "caption_export"
+     base_path = Path(base_name)
+     results = {}
+
+     for export_format in ["jsonl", "csv", "parquet", "json", "txt"]:
+         console.print(f"\n[cyan]Exporting to {export_format.upper()}...[/cyan]")
+         try:
+             format_results = await exporter.export_all_shards(
+                 export_format,
+                 base_path,
+                 columns=column_list,
+                 limit_per_shard=limit,
+                 shard_filter=shard_filter,
+                 filename_column=filename_column,
+                 export_column=export_column,
+             )
+             results[export_format] = sum(format_results.values())
+         except Exception as e:
+             console.print(f"[yellow]Skipping {export_format}: {e}[/yellow]")
+             results[export_format] = 0
+
+     console.print("\n[green]✓ Export complete![/green]")
+     for fmt, count in results.items():
+         if count > 0:
+             console.print(f" • {fmt.upper()}: {count:,} items")
+
+
+ async def _export_to_lance(exporter, output, column_list, shard_filter):
+     """Export to Lance dataset."""
+     output_path = output or "exported_captions.lance"
+     console.print(f"\n[cyan]Exporting to Lance dataset:[/cyan] {output_path}")
+     total_rows = await exporter.export_to_lance(
+         output_path, columns=column_list, shard_filter=shard_filter
+     )
+     console.print(f"[green]✓ Exported {total_rows:,} rows to Lance dataset[/green]")
+
+
+ async def _export_to_huggingface(exporter, hf_dataset, license, private, nsfw, tags, shard_filter):
+     """Export to Hugging Face Hub."""
+     if not hf_dataset:
+         console.print("[red]Error: --hf-dataset required for huggingface_hub format[/red]")
+         console.print("[dim]Example: --hf-dataset username/my-caption-dataset[/dim]")
+         sys.exit(1)
+
+     tag_list = None
+     if tags:
+         tag_list = [tag.strip() for tag in tags.split(",")]
+
+     console.print(f"\n[cyan]Uploading to Hugging Face Hub:[/cyan] {hf_dataset}")
+     if private:
+         console.print("[dim]Privacy: Private dataset[/dim]")
+     if nsfw:
+         console.print("[dim]Content: Not for all audiences[/dim]")
+     if tag_list:
+         console.print(f"[dim]Tags: {', '.join(tag_list)}[/dim]")
+     if shard_filter:
+         console.print(f"[dim]Shards: {', '.join(shard_filter)}[/dim]")
+
+     url = await exporter.export_to_huggingface_hub(
+         dataset_name=hf_dataset,
+         license=license,
+         private=private,
+         nsfw=nsfw,
+         tags=tag_list,
+         shard_filter=shard_filter,
+     )
+     console.print(f"[green]✓ Dataset uploaded to: {url}[/green]")
+
+
+ async def _export_single_format(
+     exporter,
+     format,
+     output,
+     shard_filter,
+     column_list,
+     limit,
+     filename_column,
+     export_column,
+     verbose,
+ ):
+     """Export to a single format."""
+     output_path = output or "export"
+
+     if shard_filter and len(shard_filter) == 1:
+         console.print(f"\n[cyan]Exporting shard {shard_filter[0]} to {format.upper()}...[/cyan]")
+         count = await exporter.export_shard(
+             shard_filter[0],
+             format,
+             output_path,
+             columns=column_list,
+             limit=limit,
+             filename_column=filename_column,
+             export_column=export_column,
+         )
+         console.print(f"[green]✓ Exported {count:,} items[/green]")
+     else:
+         console.print(f"\n[cyan]Exporting to {format.upper()}...[/cyan]")
+         results = await exporter.export_all_shards(
+             format,
+             output_path,
+             columns=column_list,
+             limit_per_shard=limit,
+             shard_filter=shard_filter,
+             filename_column=filename_column,
+             export_column=export_column,
+         )
+
+         total = sum(results.values())
+         console.print(f"[green]✓ Exported {total:,} items total[/green]")
+
+         if verbose and len(results) > 1:
+             console.print("\n[dim]Per-shard breakdown:[/dim]")
+             for shard_name, count in sorted(results.items()):
+                 console.print(f" • {shard_name}: {count:,} items")
+
+
  @main.command()
  @click.option("--data-dir", default="./caption_data", help="Storage directory")
  @click.option(
      "--format",
      type=click.Choice(
-         ["jsonl", "json", "csv", "txt", "huggingface_hub", "all"], case_sensitive=False
+         ["jsonl", "json", "csv", "txt", "parquet", "lance", "huggingface_hub", "all"],
+         case_sensitive=False,
      ),
      default="jsonl",
      help="Export format (default: jsonl)",
@@ -708,17 +1292,117 @@ def scan_chunks(data_dir: str, checkpoint_dir: str, fix: bool, verbose: bool):
  @click.option("--columns", help="Comma-separated list of columns to export (default: all)")
  @click.option("--export-column", default="captions", help="Column to export for txt format")
  @click.option("--filename-column", default="filename", help="Column containing filenames")
+ @click.option("--shard", help="Specific shard to export (e.g., data-0001)")
+ @click.option("--shards", help="Comma-separated list of shards to export")
  @click.option("--include-empty", is_flag=True, help="Include rows with empty export column")
  @click.option("--stats-only", is_flag=True, help="Show statistics without exporting")
- @click.option(
-     "--optimize", is_flag=True, help="Optimize storage before export (remove empty columns)"
- )
+ @click.option("--optimize", is_flag=True, help="Optimize storage before export")
  @click.option("--verbose", is_flag=True, help="Show detailed export progress")
  @click.option("--hf-dataset", help="Dataset name on HF Hub (e.g., username/dataset-name)")
- @click.option("--license", help="License for the dataset (required for new HF datasets)")
+ @click.option("--license", default="apache-2.0", help="License for the dataset")
  @click.option("--private", is_flag=True, help="Make HF dataset private")
  @click.option("--nsfw", is_flag=True, help="Add not-for-all-audiences tag")
  @click.option("--tags", help="Comma-separated tags for HF dataset")
+ def _validate_export_setup(data_dir):
+     """Validate export setup and create storage manager."""
+     from .storage import StorageManager
+
+     storage_path = Path(data_dir)
+     if not storage_path.exists():
+         console.print(f"[red]Storage directory not found: {data_dir}[/red]")
+         sys.exit(1)
+
+     return StorageManager(storage_path)
+
+
+ async def _run_export_process(
+     storage,
+     format,
+     output,
+     shard,
+     shards,
+     columns,
+     limit,
+     filename_column,
+     export_column,
+     verbose,
+     hf_dataset,
+     license,
+     private,
+     nsfw,
+     tags,
+     stats_only,
+     optimize,
+ ):
+     """Execute the main export process."""
+     from .storage.exporter import LanceStorageExporter
+
+     await storage.initialize()
+
+     stats = await storage.get_caption_stats()
+     _display_export_stats(stats)
+
+     if stats_only:
+         return
+
+     if optimize:
+         console.print("\n[yellow]Optimizing storage...[/yellow]")
+         await storage.optimize_storage()
+
+     shard_filter, column_list = _prepare_export_params(shard, shards, columns)
+     exporter = LanceStorageExporter(storage)
+
+     if format == "all":
+         await _export_all_formats(
+             exporter, output, shard_filter, column_list, limit, filename_column, export_column
+         )
+     elif format == "lance":
+         await _export_to_lance(exporter, output, column_list, shard_filter)
+     elif format == "huggingface_hub":
+         await _export_to_huggingface(
+             exporter, hf_dataset, license, private, nsfw, tags, shard_filter
+         )
+     else:
+         await _export_single_format(
+             exporter,
+             format,
+             output,
+             shard_filter,
+             column_list,
+             limit,
+             filename_column,
+             export_column,
+             verbose,
+         )
+
+
+ @main.command()
+ @click.option("--data-dir", default="./caption_data", help="Storage directory")
+ @click.option(
+     "--format",
+     type=click.Choice(
+         ["jsonl", "json", "csv", "txt", "parquet", "lance", "huggingface_hub", "all"],
+         case_sensitive=False,
+     ),
+     default="jsonl",
+     help="Export format (default: jsonl)",
+ )
+ @click.option("--output", help="Output filename or directory")
+ @click.option("--limit", type=int, help="Maximum number of items to export")
+ @click.option("--columns", help="Comma-separated list of columns to include")
+ @click.option("--export-column", default="captions", help="Column to export (default: captions)")
+ @click.option("--filename-column", default="filename", help="Filename column (default: filename)")
+ @click.option("--shard", help="Export only specific shard (e.g., 'data-001')")
+ @click.option("--shards", help="Comma-separated list of shards to export")
+ @click.option("--include-empty", is_flag=True, help="Include items with empty/null export column")
+ @click.option("--stats-only", is_flag=True, help="Show statistics only, don't export")
+ @click.option("--optimize", is_flag=True, help="Optimize storage before export")
+ @click.option("--verbose", is_flag=True, help="Verbose output")
+ @click.option("--hf-dataset", help="HuggingFace Hub dataset name (for huggingface_hub format)")
+ @click.option("--license", default="MIT", help="Dataset license (default: MIT)")
+ @click.option("--private", is_flag=True, help="Make HuggingFace dataset private")
+ @click.option("--nsfw", is_flag=True, help="Mark dataset as NSFW")
+ @click.option("--tags", help="Comma-separated tags for HuggingFace dataset")
  def export(
      data_dir: str,
      format: str,
@@ -727,219 +1411,56 @@ def export(
      columns: Optional[str],
      export_column: str,
      filename_column: str,
+     shard: Optional[str],
+     shards: Optional[str],
      include_empty: bool,
      stats_only: bool,
      optimize: bool,
      verbose: bool,
      hf_dataset: Optional[str],
-     license: Optional[str],
+     license: str,
      private: bool,
      nsfw: bool,
      tags: Optional[str],
  ):
-     """Export caption data to various formats."""
-     from .storage import StorageManager
-     from .storage.exporter import StorageExporter, ExportError
+     """Export caption data to various formats with per-shard support."""
+     from .storage.exporter import ExportError

-     # Initialize storage manager
-     storage_path = Path(data_dir)
-     if not storage_path.exists():
-         console.print(f"[red]Storage directory not found: {data_dir}[/red]")
-         sys.exit(1)
-
-     storage = StorageManager(storage_path)
-
-     async def run_export():
-         await storage.initialize()
-
-         # Show statistics first
-         stats = await storage.get_caption_stats()
-         console.print("\n[bold cyan]Storage Statistics:[/bold cyan]")
-         console.print(f"[green]Total rows:[/green] {stats['total_rows']:,}")
-         console.print(f"[green]Total outputs:[/green] {stats['total_outputs']:,}")
-         console.print(f"[green]Output fields:[/green] {', '.join(stats['output_fields'])}")
-
-         if stats.get("field_stats"):
-             console.print("\n[cyan]Field breakdown:[/cyan]")
-             for field, field_stat in stats["field_stats"].items():
-                 console.print(
-                     f" • {field}: {field_stat['total_items']:,} items "
-                     f"in {field_stat['rows_with_data']:,} rows"
-                 )
-
-         if stats_only:
-             return
-
-         # Optimize storage if requested
-         if optimize:
-             console.print("\n[yellow]Optimizing storage (removing empty columns)...[/yellow]")
-             await storage.optimize_storage()
-
-         # Prepare columns list
-         column_list = None
-         if columns:
-             column_list = [col.strip() for col in columns.split(",")]
-             console.print(f"\n[cyan]Exporting columns:[/cyan] {', '.join(column_list)}")
-
-         # Get storage contents
-         console.print("\n[yellow]Loading data...[/yellow]")
-         try:
-             contents = await storage.get_storage_contents(
-                 limit=limit, columns=column_list, include_metadata=True
-             )
-         except ValueError as e:
-             console.print(f"[red]Error: {e}[/red]")
-             sys.exit(1)
+     storage = _validate_export_setup(data_dir)

-         if not contents.rows:
-             console.print("[yellow]No data to export![/yellow]")
-             return
-
-         # Filter out empty rows if not including empty
-         if not include_empty and format in ["txt", "json"]:
-             original_count = len(contents.rows)
-             contents.rows = [
-                 row
-                 for row in contents.rows
-                 if row.get(export_column)
-                 and (not isinstance(row[export_column], list) or len(row[export_column]) > 0)
-             ]
-             filtered_count = original_count - len(contents.rows)
-             if filtered_count > 0:
-                 console.print(f"[dim]Filtered {filtered_count} empty rows[/dim]")
-
-         # Create exporter
-         exporter = StorageExporter(contents)
-
-         # Determine output paths
-         if format == "all":
-             # Export to all formats
-             base_name = output or "caption_export"
-             base_path = Path(base_name)
-
-             formats_exported = []
-
-             # JSONL
-             jsonl_path = base_path.with_suffix(".jsonl")
-             console.print(f"\n[cyan]Exporting to JSONL:[/cyan] {jsonl_path}")
-             rows = exporter.to_jsonl(jsonl_path)
-             formats_exported.append(f"JSONL: {rows:,} rows")
-
-             # CSV
-             csv_path = base_path.with_suffix(".csv")
-             console.print(f"[cyan]Exporting to CSV:[/cyan] {csv_path}")
-             try:
-                 rows = exporter.to_csv(csv_path)
-                 formats_exported.append(f"CSV: {rows:,} rows")
-             except ExportError as e:
-                 console.print(f"[yellow]Skipping CSV: {e}[/yellow]")
-
-             # JSON files
-             json_dir = base_path.parent / f"{base_path.stem}_json"
-             console.print(f"[cyan]Exporting to JSON files:[/cyan] {json_dir}/")
-             try:
-                 files = exporter.to_json(json_dir, filename_column)
-                 formats_exported.append(f"JSON: {files:,} files")
-             except ExportError as e:
-                 console.print(f"[yellow]Skipping JSON files: {e}[/yellow]")
-
-             # Text files
-             txt_dir = base_path.parent / f"{base_path.stem}_txt"
-             console.print(f"[cyan]Exporting to text files:[/cyan] {txt_dir}/")
-             try:
-                 files = exporter.to_txt(txt_dir, filename_column, export_column)
-                 formats_exported.append(f"Text: {files:,} files")
-             except ExportError as e:
-                 console.print(f"[yellow]Skipping text files: {e}[/yellow]")
-
-             console.print(f"\n[green]✓ Export complete![/green]")
-             for fmt in formats_exported:
-                 console.print(f" • {fmt}")
-
-         else:
-             # Single format export
-             try:
-                 if format == "jsonl":
-                     output_path = output or "captions.jsonl"
-                     console.print(f"\n[cyan]Exporting to JSONL:[/cyan] {output_path}")
-                     rows = exporter.to_jsonl(output_path)
-                     console.print(f"[green]✓ Exported {rows:,} rows[/green]")
-
-                 elif format == "csv":
-                     output_path = output or "captions.csv"
-                     console.print(f"\n[cyan]Exporting to CSV:[/cyan] {output_path}")
-                     rows = exporter.to_csv(output_path)
-                     console.print(f"[green]✓ Exported {rows:,} rows[/green]")
-
-                 elif format == "json":
-                     output_dir = output or "./json_output"
-                     console.print(f"\n[cyan]Exporting to JSON files:[/cyan] {output_dir}/")
-                     files = exporter.to_json(output_dir, filename_column)
-                     console.print(f"[green]✓ Created {files:,} JSON files[/green]")
-
-                 elif format == "txt":
-                     output_dir = output or "./txt_output"
-                     console.print(f"\n[cyan]Exporting to text files:[/cyan] {output_dir}/")
-                     console.print(f"[dim]Export column: {export_column}[/dim]")
-                     files = exporter.to_txt(output_dir, filename_column, export_column)
-                     console.print(f"[green]✓ Created {files:,} text files[/green]")
-
-                 elif format == "huggingface_hub":
-                     # Validate required parameters
-                     if not hf_dataset:
-                         console.print(
-                             "[red]Error: --hf-dataset required for huggingface_hub format[/red]"
-                         )
-                         console.print(
-                             "[dim]Example: --hf-dataset username/my-caption-dataset[/dim]"
-                         )
-                         sys.exit(1)
-
-                     # Parse tags
-                     tag_list = None
-                     if tags:
-                         tag_list = [tag.strip() for tag in tags.split(",")]
-
-                     console.print(f"\n[cyan]Uploading to Hugging Face Hub:[/cyan] {hf_dataset}")
-                     if private:
-                         console.print("[dim]Privacy: Private dataset[/dim]")
-                     if nsfw:
-                         console.print("[dim]Content: Not for all audiences[/dim]")
-                     if tag_list:
-                         console.print(f"[dim]Tags: {', '.join(tag_list)}[/dim]")
-
-                     url = exporter.to_huggingface_hub(
-                         dataset_name=hf_dataset,
-                         license=license,
-                         private=private,
-                         nsfw=nsfw,
-                         tags=tag_list,
-                     )
-                     console.print(f"[green]✓ Dataset uploaded to: {url}[/green]")
-
-             except ExportError as e:
-                 console.print(f"[red]Export error: {e}[/red]")
-                 sys.exit(1)
-
-         # Show export metadata
-         if verbose and contents.metadata:
-             console.print("\n[dim]Export metadata:[/dim]")
-             console.print(f" Timestamp: {contents.metadata.get('export_timestamp')}")
-             console.print(f" Total available: {contents.metadata.get('total_available_rows'):,}")
-             console.print(f" Rows exported: {contents.metadata.get('rows_exported'):,}")
-
-     # Run the async export
      try:
-         asyncio.run(run_export())
+         asyncio.run(
+             _run_export_process(
+                 storage,
+                 format,
+                 output,
+                 shard,
+                 shards,
+                 columns,
+                 limit,
+                 filename_column,
+                 export_column,
+                 verbose,
+                 hf_dataset,
+                 license,
+                 private,
+                 nsfw,
+                 tags,
+                 stats_only,
+                 optimize,
+             )
+         )
+     except ExportError as e:
+         console.print(f"[red]Export error: {e}[/red]")
+         sys.exit(1)
      except KeyboardInterrupt:
          console.print("\n[yellow]Export cancelled[/yellow]")
          sys.exit(1)
      except Exception as e:
          console.print(f"[red]Unexpected error: {e}[/red]")
-         if verbose:
-             import traceback
+         import traceback

-             traceback.print_exc()
+         traceback.print_exc()
          sys.exit(1)


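The 0.4.0 `export` command above delegates to two new helpers, `_validate_export_setup` and `_run_export_process`, and grows `--shard`/`--shards` options for per-shard exports. Below is a minimal sketch of driving the reworked command through click's `CliRunner`; the entry-point name `cli` and the `--data-dir` option spelling are assumptions not visible in this hunk, while `--format`, `--shards`, and `--stats-only` come from the options shown above.

```python
# Hedged sketch: exercising the new per-shard export flags via click's
# test runner. The group object `cli` and the --data-dir spelling are
# assumptions; the remaining flags appear in the diff above.
from click.testing import CliRunner

from caption_flow.cli import cli  # assumed entry-point object

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "export",
        "--data-dir", "./caption_data",         # hypothetical storage dir
        "--format", "jsonl",
        "--shards", "shard-00000,shard-00001",  # comma-separated, per help text
        "--stats-only",                         # per help text: stats, no export
    ],
)
print(result.exit_code)
print(result.output)
```

Since `--stats-only` is documented as showing statistics without exporting, a sketch like this can sanity-check shard selection without writing any files.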
@@ -961,7 +1482,7 @@ def generate_cert(
          cert_path, key_path = cert_manager.generate_self_signed(Path(output_dir), cert_domain)
          console.print(f"[green]✓[/green] Certificate: {cert_path}")
          console.print(f"[green]✓[/green] Key: {key_path}")
-         console.print(f"\n[cyan]Use these paths in your config or CLI:[/cyan]")
+         console.print("\n[cyan]Use these paths in your config or CLI:[/cyan]")
          console.print(f" --cert {cert_path}")
          console.print(f" --key {key_path}")
      elif domain and email:
@@ -978,7 +1499,7 @@ def generate_cert(
          )
          console.print(f"[green]✓[/green] Certificate: {cert_path}")
          console.print(f"[green]✓[/green] Key: {key_path}")
-         console.print(f"\n[cyan]Use these paths in your config or CLI:[/cyan]")
+         console.print("\n[cyan]Use these paths in your config or CLI:[/cyan]")
          console.print(f" --cert {cert_path}")
          console.print(f" --key {key_path}")

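For reference, the self-signed branch in this hunk reduces to a single `generate_self_signed` call. A standalone sketch follows, assuming `CertificateManager` can be constructed without arguments — only the call itself is visible in the diff.

```python
# Sketch of the self-signed branch shown above; the no-argument
# constructor is an assumption, generate_self_signed(...) is the
# call visible in the diff.
from pathlib import Path

from caption_flow.utils.certificates import CertificateManager

cert_manager = CertificateManager()  # assumed construction
cert_path, key_path = cert_manager.generate_self_signed(Path("./certs"), "localhost")
print(f"--cert {cert_path}")
print(f"--key {key_path}")
```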
@@ -1022,13 +1543,13 @@ def inspect_cert(cert_path: str):

          from datetime import datetime

-         if info["not_after"] < datetime.utcnow():
+         if info["not_after"] < datetime.now(_datetime.UTC):
              console.print("[red]✗ Certificate has expired![/red]")
-         elif (info["not_after"] - datetime.utcnow()).days < 30:
-             days_left = (info["not_after"] - datetime.utcnow()).days
+         elif (info["not_after"] - datetime.now(_datetime.UTC)).days < 30:
+             days_left = (info["not_after"] - datetime.now(_datetime.UTC)).days
              console.print(f"[yellow]⚠ Certificate expires in {days_left} days[/yellow]")
          else:
-             days_left = (info["not_after"] - datetime.utcnow()).days
+             days_left = (info["not_after"] - datetime.now(_datetime.UTC)).days
              console.print(f"[green]✓ Certificate valid for {days_left} more days[/green]")

      except Exception as e:
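The `inspect_cert` changes replace the deprecated, naive `datetime.utcnow()` with the timezone-aware `datetime.now(_datetime.UTC)`. Note that `datetime.UTC` is an alias for `timezone.utc` added in Python 3.11, and that an aware "now" only compares cleanly against an aware `not_after`, so the parsed certificate timestamps are presumably timezone-aware as of 0.4.0. A self-contained sketch of the same expiry check, with a hypothetical expiry value:

```python
# Standalone sketch of the aware-datetime expiry check; `not_after` is a
# hypothetical stand-in for the parsed certificate field.
from datetime import datetime, timezone

not_after = datetime(2026, 1, 1, tzinfo=timezone.utc)  # hypothetical expiry
now = datetime.now(timezone.utc)  # aware, unlike deprecated datetime.utcnow()

days_left = (not_after - now).days
if not_after < now:
    print("✗ Certificate has expired!")
elif days_left < 30:
    print(f"⚠ Certificate expires in {days_left} days")
else:
    print(f"✓ Certificate valid for {days_left} more days")
```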