caption-flow 0.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- caption_flow/__init__.py +9 -0
- caption_flow/cli.py +709 -0
- caption_flow/models.py +82 -0
- caption_flow/monitor.py +211 -0
- caption_flow/orchestrator.py +1301 -0
- caption_flow/storage.py +694 -0
- caption_flow/utils/__init__.py +4 -0
- caption_flow/utils/auth.py +67 -0
- caption_flow/utils/caption_utils.py +172 -0
- caption_flow/utils/certificates.py +140 -0
- caption_flow/utils/chunk_tracker.py +365 -0
- caption_flow/utils/dataset_loader.py +186 -0
- caption_flow/utils/image_processor.py +51 -0
- caption_flow/utils/job_queue.py +41 -0
- caption_flow/utils/json_utils.py +201 -0
- caption_flow/utils/vllm_config.py +164 -0
- caption_flow/worker.py +300 -0
- caption_flow/worker_data.py +482 -0
- caption_flow/worker_vllm.py +1028 -0
- caption_flow-0.1.0.dist-info/METADATA +427 -0
- caption_flow-0.1.0.dist-info/RECORD +25 -0
- caption_flow-0.1.0.dist-info/WHEEL +5 -0
- caption_flow-0.1.0.dist-info/entry_points.txt +2 -0
- caption_flow-0.1.0.dist-info/licenses/LICENSE +661 -0
- caption_flow-0.1.0.dist-info/top_level.txt +1 -0
caption_flow/cli.py
ADDED
@@ -0,0 +1,709 @@
"""Command-line interface for CaptionFlow with smart configuration handling."""

import asyncio
import json
import logging
import os
import sys
from pathlib import Path
from typing import Optional, Dict, Any, List

import click
import yaml
from rich.console import Console
from rich.logging import RichHandler
from datetime import datetime

from .orchestrator import Orchestrator
from .worker import Worker
from .monitor import Monitor
from .utils.certificates import CertificateManager

console = Console()


class ConfigManager:
    """Smart configuration discovery and management following XDG Base Directory spec."""

    CONFIG_NAMES = {
        "orchestrator": "orchestrator.yaml",
        "worker": "worker.yaml",
        "monitor": "monitor.yaml",
    }

    @classmethod
    def get_xdg_config_home(cls) -> Path:
        """Get XDG_CONFIG_HOME or default."""
        xdg_config = os.environ.get("XDG_CONFIG_HOME")
        if xdg_config:
            return Path(xdg_config)
        return Path.home() / ".config"

    @classmethod
    def get_xdg_config_dirs(cls) -> List[Path]:
        """Get XDG_CONFIG_DIRS or defaults."""
        xdg_dirs = os.environ.get("XDG_CONFIG_DIRS", "/etc/xdg").split(":")
        return [Path(d) for d in xdg_dirs]

    @classmethod
    def find_config(
        cls, component: str, explicit_path: Optional[str] = None
    ) -> Optional[Dict[str, Any]]:
        """
        Find and load configuration for a component.

        Search order:
        1. Explicit path if provided
        2. Current directory
        3. ~/.caption-flow/<component_config>.yaml
        4. $XDG_CONFIG_HOME/caption-flow/<component_config>.yaml
        5. /etc/caption-flow/<component_config>.yaml (system-wide)
        6. $XDG_CONFIG_DIRS/caption-flow/<component_config>.yaml
        7. ./examples/<component_config>.yaml (fallback)
        """
        config_name = cls.CONFIG_NAMES.get(component, "config.yaml")

        # If explicit path provided, use only that
        if explicit_path:
            path = Path(explicit_path)
            if path.exists():
                console.print(f"[dim]Using config: {path}[/dim]")
                return cls.load_yaml(path)
            console.print(f"[yellow]Config not found: {path}[/yellow]")
            return None

        # Search paths in order
        search_paths = [
            Path.cwd() / config_name,  # Current directory
            Path.cwd() / "config" / config_name,  # Current directory / config subdir
            Path.home() / ".caption-flow" / config_name,  # Home directory
            cls.get_xdg_config_home() / "caption-flow" / config_name,  # XDG config home
            Path("/etc/caption-flow") / config_name,  # System-wide
        ]

        # Add XDG config dirs
        for xdg_dir in cls.get_xdg_config_dirs():
            search_paths.append(xdg_dir / "caption-flow" / config_name)

        # Fallback to examples
        search_paths.append(Path("examples") / config_name)

        # Try each path
        for path in search_paths:
            if path.exists():
                console.print(f"[dim]Found config: {path}[/dim]")
                return cls.load_yaml(path)

        return None

    @classmethod
    def load_yaml(cls, path: Path) -> Optional[Dict[str, Any]]:
        """Load and parse YAML config file."""
        try:
            with open(path) as f:
                return yaml.safe_load(f) or {}
        except Exception as e:
            console.print(f"[red]Error loading {path}: {e}[/red]")
            return None

    @classmethod
    def merge_configs(cls, base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
        """Deep merge override config into base config."""
        result = base.copy()

        for key, value in override.items():
            if key in result and isinstance(result[key], dict) and isinstance(value, dict):
                result[key] = cls.merge_configs(result[key], value)
            else:
                result[key] = value

        return result


def setup_logging(verbose: bool = False):
    """Configure logging with rich handler."""
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(message)s",
        handlers=[
            RichHandler(console=console, rich_tracebacks=True, show_path=False, show_time=False)
        ],
    )


def apply_cli_overrides(config: Dict[str, Any], **kwargs) -> Dict[str, Any]:
    """Apply CLI arguments as overrides to config, filtering out None values."""
    overrides = {k: v for k, v in kwargs.items() if v is not None}
    return ConfigManager.merge_configs(config, overrides)


@click.group()
@click.option("--verbose", is_flag=True, help="Enable verbose logging")
@click.pass_context
def main(ctx, verbose: bool):
    """CaptionFlow - Distributed community captioning system."""
    setup_logging(verbose)
    ctx.obj = {"verbose": verbose}


@main.command()
@click.option("--config", type=click.Path(exists=True), help="Configuration file")
@click.option("--port", type=int, help="WebSocket server port")
@click.option("--host", help="Bind address")
@click.option("--data-dir", help="Storage directory")
@click.option("--cert", help="SSL certificate path")
@click.option("--key", help="SSL key path")
@click.option("--no-ssl", is_flag=True, help="Disable SSL (development only)")
@click.option("--vllm", is_flag=True, help="Use vLLM orchestrator for WebDataset/HF datasets")
@click.pass_context
def orchestrator(ctx, config: Optional[str], **kwargs):
    """Start the orchestrator server."""
    # Load configuration
    base_config = ConfigManager.find_config("orchestrator", config) or {}

    # Extract orchestrator section if it exists
    if "orchestrator" in base_config:
        config_data = base_config["orchestrator"]
    else:
        config_data = base_config

    console.print(f"Config contents: {config_data}")

    # Apply CLI overrides
    if kwargs.get("port"):
        config_data["port"] = kwargs["port"]
    if kwargs.get("host"):
        config_data["host"] = kwargs["host"]
    if kwargs.get("data_dir"):
        config_data.setdefault("storage", {})["data_dir"] = kwargs["data_dir"]

    # Handle SSL configuration
    if not kwargs.get("no_ssl"):
        if kwargs.get("cert") and kwargs.get("key"):
            config_data.setdefault("ssl", {})
            config_data["ssl"]["cert"] = kwargs["cert"]
            config_data["ssl"]["key"] = kwargs["key"]
        elif not config_data.get("ssl"):
            console.print(
                "[yellow]Warning: Running without SSL. Use --cert and --key for production.[/yellow]"
            )

    if kwargs.get("vllm") and "vllm" not in config_data:
        raise ValueError("Must provide vLLM config.")

    orchestrator_instance = Orchestrator(config_data)

    try:
        asyncio.run(orchestrator_instance.start())
    except KeyboardInterrupt:
        console.print("\n[yellow]Shutting down orchestrator...[/yellow]")
        asyncio.run(orchestrator_instance.shutdown())


@main.command()
@click.option("--config", type=click.Path(exists=True), help="Configuration file")
@click.option("--server", help="Orchestrator WebSocket URL")
@click.option("--token", help="Worker authentication token")
@click.option("--name", help="Worker display name")
@click.option("--batch-size", type=int, help="Inference batch size")
@click.option("--no-verify-ssl", is_flag=True, help="Skip SSL verification")
@click.option("--vllm", is_flag=True, help="Use vLLM worker for GPU inference")
@click.option("--gpu-id", type=int, help="GPU device ID (for vLLM)")
@click.option("--precision", help="Model precision (for vLLM)")
@click.option("--model", help="Model name (for vLLM)")
@click.pass_context
def worker(ctx, config: Optional[str], **kwargs):
    """Start a worker node."""
    # Load configuration
    base_config = ConfigManager.find_config("worker", config) or {}

    # Extract worker section if it exists
    if "worker" in base_config:
        config_data = base_config["worker"]
    else:
        config_data = base_config

    # Apply CLI overrides (only non-None values)
    for key in ["server", "token", "name", "batch_size", "gpu_id", "precision", "model"]:
        if kwargs.get(key) is not None:
            config_data[key] = kwargs[key]

    if kwargs.get("no_verify_ssl"):
        config_data["verify_ssl"] = False

    # Validate required fields
    if not config_data.get("server"):
        console.print("[red]Error: --server required (or set in config)[/red]")
        sys.exit(1)
    if not config_data.get("token"):
        console.print("[red]Error: --token required (or set in config)[/red]")
        sys.exit(1)

    # Choose worker type
    if kwargs.get("vllm") or config_data.get("vllm"):
        from .worker_vllm import VLLMWorker

        worker_instance = VLLMWorker(config_data)
    else:
        worker_instance = Worker(config_data)

    try:
        asyncio.run(worker_instance.start())
    except KeyboardInterrupt:
        console.print("\n[yellow]Shutting down worker...[/yellow]")
        asyncio.run(worker_instance.shutdown())


@main.command()
@click.option("--config", type=click.Path(exists=True), help="Configuration file")
@click.option("--server", help="Orchestrator WebSocket URL")
@click.option("--token", help="Authentication token")
@click.option("--no-verify-ssl", is_flag=True, help="Skip SSL verification")
@click.option("--debug", is_flag=True, help="Enable debug output")
@click.pass_context
def monitor(ctx, config: Optional[str], server: Optional[str], token: Optional[str],
            no_verify_ssl: bool, debug: bool):
    """Start the monitoring TUI."""

    # Enable debug logging if requested
    if debug:
        setup_logging(verbose=True)
        console.print("[yellow]Debug mode enabled[/yellow]")

    # Load configuration
    base_config = ConfigManager.find_config('monitor', config)

    if not base_config:
        # Try to find monitor config in orchestrator config as fallback
        orch_config = ConfigManager.find_config('orchestrator')
        if orch_config and 'monitor' in orch_config:
            base_config = {'monitor': orch_config['monitor']}
            console.print("[dim]Using monitor config from orchestrator.yaml[/dim]")
        else:
            base_config = {}
            if not server or not token:
                console.print("[yellow]No monitor config found, using CLI args[/yellow]")

    # Handle different config structures
    # Case 1: Config has top-level 'monitor' section
    if 'monitor' in base_config:
        config_data = base_config['monitor']
    # Case 2: Config IS the monitor config (no wrapper)
    else:
        config_data = base_config

    # Apply CLI overrides (CLI always wins)
    if server:
        config_data['server'] = server
    if token:
        config_data['token'] = token
    if no_verify_ssl:
        config_data['verify_ssl'] = False

    # Debug output
    if debug:
        console.print("\n[cyan]Final monitor configuration:[/cyan]")
        console.print(f"  Server: {config_data.get('server', 'NOT SET')}")
        console.print(f"  Token: {'***' + config_data.get('token', '')[-4:] if config_data.get('token') else 'NOT SET'}")
        console.print(f"  Verify SSL: {config_data.get('verify_ssl', True)}")
        console.print()

    # Validate required fields
    if not config_data.get('server'):
        console.print("[red]Error: --server required (or set 'server' in monitor.yaml)[/red]")
        console.print("\n[dim]Example monitor.yaml:[/dim]")
        console.print("server: wss://localhost:8765")
        console.print("token: your-token-here")
        sys.exit(1)

    if not config_data.get('token'):
        console.print("[red]Error: --token required (or set 'token' in monitor.yaml)[/red]")
        console.print("\n[dim]Example monitor.yaml:[/dim]")
        console.print("server: wss://localhost:8765")
        console.print("token: your-token-here")
        sys.exit(1)

    # Set defaults for optional settings
    config_data.setdefault('refresh_interval', 1.0)
    config_data.setdefault('show_inactive_workers', False)
    config_data.setdefault('max_log_lines', 100)

    # Create and start monitor
    try:
        monitor_instance = Monitor(config_data)

        if debug:
            console.print("[green]Starting monitor...[/green]")
            console.print(f"[dim]Connecting to: {config_data['server']}[/dim]")
            sys.exit(1)

        asyncio.run(monitor_instance.start())

    except KeyboardInterrupt:
        console.print("\n[yellow]Closing monitor...[/yellow]")
    except ConnectionRefusedError:
        console.print(f"\n[red]Error: Cannot connect to {config_data['server']}[/red]")
        console.print("[yellow]Check that the orchestrator is running and accessible[/yellow]")
        sys.exit(1)
    except Exception as e:
        console.print(f"\n[red]Error starting monitor: {e}[/red]")
        if debug:
            import traceback
            traceback.print_exc()
        sys.exit(1)


@main.command()
@click.option("--config", type=click.Path(exists=True), help="Configuration file")
@click.option("--server", help="Orchestrator WebSocket URL")
@click.option("--token", help="Admin authentication token")
@click.option(
    "--new-config", type=click.Path(exists=True), required=True, help="New configuration file"
)
@click.option("--no-verify-ssl", is_flag=True, help="Skip SSL verification")
def reload_config(
    config: Optional[str],
    server: Optional[str],
    token: Optional[str],
    new_config: str,
    no_verify_ssl: bool,
):
    """Reload orchestrator configuration via admin connection."""
    import websockets
    import ssl

    # Load base config to get server/token if not provided via CLI
    if not server or not token:
        base_config = ConfigManager.find_config("orchestrator", config) or {}
        admin_config = base_config.get("admin", {})

        if not server:
            server = admin_config.get("server")
        if not token:
            token = admin_config.get("token")

    if not server:
        console.print("[red]Error: --server required (or set in config)[/red]")
        sys.exit(1)
    if not token:
        console.print("[red]Error: --token required (or set in config)[/red]")
        sys.exit(1)

    console.print(f"[cyan]Loading configuration from {new_config}...[/cyan]")

    # Load the new configuration
    new_cfg = ConfigManager.load_yaml(Path(new_config))
    if not new_cfg:
        console.print("[red]Failed to load configuration[/red]")
        sys.exit(1)

    # Setup SSL
    ssl_context = None
    if server.startswith("wss://"):
        if no_verify_ssl:
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE
        else:
            ssl_context = ssl.create_default_context()

    async def send_reload():
        try:
            async with websockets.connect(server, ssl=ssl_context) as websocket:
                # Authenticate as admin
                await websocket.send(json.dumps({"token": token, "role": "admin"}))

                response = await websocket.recv()
                auth_response = json.loads(response)

                if "error" in auth_response:
                    console.print(f"[red]Authentication failed: {auth_response['error']}[/red]")
                    return False

                console.print("[green]✓ Authenticated as admin[/green]")

                # Send reload command
                await websocket.send(json.dumps({"type": "reload_config", "config": new_cfg}))

                response = await websocket.recv()
                reload_response = json.loads(response)

                if reload_response.get("type") == "reload_complete":
                    if "message" in reload_response and "No changes" in reload_response["message"]:
                        console.print(f"[yellow]{reload_response['message']}[/yellow]")
                    else:
                        console.print("[green]✓ Configuration reloaded successfully![/green]")

                    if "updated" in reload_response and reload_response["updated"]:
                        console.print("\n[cyan]Updated sections:[/cyan]")
                        for section in reload_response["updated"]:
                            console.print(f"  • {section}")

                    if "warnings" in reload_response and reload_response["warnings"]:
                        console.print("\n[yellow]Warnings:[/yellow]")
                        for warning in reload_response["warnings"]:
                            console.print(f"  ⚠ {warning}")

                    return True
                else:
                    error = reload_response.get("error", "Unknown error")
                    console.print(f"[red]Reload failed: {error}[/red]")
                    return False

        except Exception as e:
            console.print(f"[red]Error: {e}[/red]")
            return False

    success = asyncio.run(send_reload())
    if not success:
        sys.exit(1)


@main.command()
@click.option("--data-dir", default="./caption_data", help="Storage directory")
@click.option("--checkpoint-dir", default="./checkpoints", help="Checkpoint directory")
@click.option("--fix", is_flag=True, help="Fix issues by resetting abandoned chunks")
@click.option("--verbose", is_flag=True, help="Show detailed information")
def scan_chunks(data_dir: str, checkpoint_dir: str, fix: bool, verbose: bool):
    """Scan for sparse or abandoned chunks and optionally fix them."""
    from .utils.chunk_tracker import ChunkTracker
    from .storage import StorageManager
    import pyarrow.parquet as pq

    console.print("[bold cyan]Scanning for sparse/abandoned chunks...[/bold cyan]\n")

    checkpoint_path = Path(checkpoint_dir) / "chunks.json"
    if not checkpoint_path.exists():
        console.print("[red]No chunk checkpoint found![/red]")
        return

    tracker = ChunkTracker(checkpoint_path)
    storage = StorageManager(Path(data_dir))

    # Get and display stats
    stats = tracker.get_stats()
    console.print(f"[green]Total chunks:[/green] {stats['total']}")
    console.print(f"[green]Completed:[/green] {stats['completed']}")
    console.print(f"[yellow]Pending:[/yellow] {stats['pending']}")
    console.print(f"[yellow]Assigned:[/yellow] {stats['assigned']}")
    console.print(f"[red]Failed:[/red] {stats['failed']}\n")

    # Find abandoned chunks
    abandoned_chunks = []
    stale_threshold = 3600  # 1 hour
    current_time = datetime.utcnow()

    for chunk_id, chunk_state in tracker.chunks.items():
        if chunk_state.status == "assigned" and chunk_state.assigned_at:
            age = (current_time - chunk_state.assigned_at).total_seconds()
            if age > stale_threshold:
                abandoned_chunks.append((chunk_id, chunk_state, age))

    if abandoned_chunks:
        console.print(f"[red]Found {len(abandoned_chunks)} abandoned chunks:[/red]")
        for chunk_id, chunk_state, age in abandoned_chunks[:10]:
            age_str = f"{age/3600:.1f} hours" if age > 3600 else f"{age/60:.1f} minutes"
            console.print(f"  • {chunk_id} (assigned to {chunk_state.assigned_to} {age_str} ago)")

        if len(abandoned_chunks) > 10:
            console.print(f"  ... and {len(abandoned_chunks) - 10} more")

        if fix:
            console.print("\n[yellow]Resetting abandoned chunks to pending...[/yellow]")
            for chunk_id, _, _ in abandoned_chunks:
                tracker.mark_failed(chunk_id)
            console.print(f"[green]✓ Reset {len(abandoned_chunks)} chunks[/green]")

    # Check for sparse shards
    console.print("\n[bold cyan]Checking for sparse shards...[/bold cyan]")

    shards_summary = tracker.get_shards_summary()
    sparse_shards = []

    for shard_name, shard_info in shards_summary.items():
        if not shard_info["is_complete"]:
            chunks = sorted(shard_info["chunks"], key=lambda c: c.start_index)
            expected_index = 0
            has_gaps = False

            for chunk in chunks:
                if chunk.start_index != expected_index:
                    has_gaps = True
                    break
                expected_index = chunk.start_index + chunk.chunk_size

            if has_gaps or shard_info["failed_chunks"] > 0:
                sparse_shards.append((shard_name, shard_info, has_gaps))

    if sparse_shards:
        console.print(f"\n[yellow]Found {len(sparse_shards)} sparse/incomplete shards:[/yellow]")
        for shard_name, shard_info, has_gaps in sparse_shards[:5]:
            status = []
            if shard_info["pending_chunks"] > 0:
                status.append(f"{shard_info['pending_chunks']} pending")
            if shard_info["assigned_chunks"] > 0:
                status.append(f"{shard_info['assigned_chunks']} assigned")
            if shard_info["failed_chunks"] > 0:
                status.append(f"{shard_info['failed_chunks']} failed")
            if has_gaps:
                status.append("has gaps")

            console.print(f"  • {shard_name}: {', '.join(status)}")
            console.print(
                f"    Progress: {shard_info['completed_chunks']}/{shard_info['total_chunks']} chunks"
            )

        if len(sparse_shards) > 5:
            console.print(f"  ... and {len(sparse_shards) - 5} more")

    # Cross-check with storage if verbose
    if storage.captions_path.exists() and verbose:
        console.print("\n[bold cyan]Cross-checking with stored captions...[/bold cyan]")

        try:
            table = pq.read_table(storage.captions_path, columns=["chunk_id"])
            stored_chunk_ids = set(c for c in table["chunk_id"].to_pylist() if c)

            tracker_completed = set(c for c, s in tracker.chunks.items() if s.status == "completed")

            missing_in_storage = tracker_completed - stored_chunk_ids
            missing_in_tracker = stored_chunk_ids - set(tracker.chunks.keys())

            if missing_in_storage:
                console.print(
                    f"\n[red]Chunks marked complete but missing from storage:[/red] {len(missing_in_storage)}"
                )
                for chunk_id in list(missing_in_storage)[:5]:
                    console.print(f"  • {chunk_id}")

                if fix:
                    console.print("[yellow]Resetting these chunks to pending...[/yellow]")
                    for chunk_id in missing_in_storage:
                        tracker.mark_failed(chunk_id)
                    console.print(f"[green]✓ Reset {len(missing_in_storage)} chunks[/green]")

            if missing_in_tracker:
                console.print(
                    f"\n[yellow]Chunks in storage but not tracked:[/yellow] {len(missing_in_tracker)}"
                )

        except Exception as e:
            console.print(f"[red]Error reading storage: {e}[/red]")

    # Summary
    console.print("\n[bold cyan]Summary:[/bold cyan]")

    total_issues = len(abandoned_chunks) + len(sparse_shards)
    if total_issues == 0:
        console.print("[green]✓ No issues found![/green]")
    else:
        console.print(f"[yellow]Found {total_issues} total issues[/yellow]")

        if not fix:
            console.print(
                "\n[cyan]Run with --fix flag to automatically reset abandoned chunks[/cyan]"
            )
        else:
            console.print(
                "\n[green]✓ Issues have been fixed. Restart orchestrator to reprocess.[/green]"
            )

    if fix:
        tracker.save_checkpoint()


@main.command()
@click.option("--domain", help="Domain for Let's Encrypt certificate")
@click.option("--email", help="Email for Let's Encrypt registration")
@click.option("--self-signed", is_flag=True, help="Generate self-signed certificate")
@click.option("--output-dir", default="./certs", help="Output directory for certificates")
@click.option("--staging", is_flag=True, help="Use Let's Encrypt staging server (for testing)")
def generate_cert(
    domain: Optional[str], email: Optional[str], self_signed: bool, output_dir: str, staging: bool
):
    """Generate SSL certificates."""
    cert_manager = CertificateManager()

    if self_signed:
        console.print("[yellow]Generating self-signed certificate...[/yellow]")
        cert_domain = domain or "localhost"
        cert_path, key_path = cert_manager.generate_self_signed(Path(output_dir), cert_domain)
        console.print(f"[green]✓[/green] Certificate: {cert_path}")
        console.print(f"[green]✓[/green] Key: {key_path}")
        console.print(f"\n[cyan]Use these paths in your config or CLI:[/cyan]")
        console.print(f"  --cert {cert_path}")
        console.print(f"  --key {key_path}")
    elif domain and email:
        mode = "staging" if staging else "production"
        console.print(
            f"[yellow]Requesting Let's Encrypt {mode} certificate for {domain}...[/yellow]"
        )

        le_output = Path(output_dir) if output_dir != "./certs" else None

        try:
            cert_path, key_path = cert_manager.generate_letsencrypt(
                domain, email, output_dir=le_output, staging=staging
            )
            console.print(f"[green]✓[/green] Certificate: {cert_path}")
            console.print(f"[green]✓[/green] Key: {key_path}")
            console.print(f"\n[cyan]Use these paths in your config or CLI:[/cyan]")
            console.print(f"  --cert {cert_path}")
            console.print(f"  --key {key_path}")

            if staging:
                console.print(
                    "\n[yellow]⚠ This is a staging certificate (not trusted by browsers)[/yellow]"
                )
                console.print(
                    "[yellow]  Remove --staging flag for production certificates[/yellow]"
                )
        except RuntimeError as e:
            console.print(f"[red]Error: {e}[/red]")
            console.print("\n[yellow]Troubleshooting:[/yellow]")
            console.print("  • Ensure port 80 is accessible for Let's Encrypt validation")
            console.print("  • Check that the domain points to this server")
            console.print("  • Try --staging flag for testing")
            sys.exit(1)
    else:
        console.print("[red]Error: Specify either --self-signed or --domain with --email[/red]")
        sys.exit(1)


@main.command()
@click.argument("cert_path", type=click.Path(exists=True))
def inspect_cert(cert_path: str):
    """Inspect an SSL certificate."""
    cert_manager = CertificateManager()

    try:
        info = cert_manager.get_cert_info(Path(cert_path))

        console.print("\n[bold cyan]Certificate Information[/bold cyan]")
        console.print(f"[green]Subject:[/green] {info['subject']}")
        console.print(f"[green]Issuer:[/green] {info['issuer']}")
        console.print(f"[green]Valid From:[/green] {info['not_before']}")
        console.print(f"[green]Valid Until:[/green] {info['not_after']}")
        console.print(f"[green]Serial Number:[/green] {info['serial_number']}")

        if info["is_self_signed"]:
            console.print("[yellow]⚠ This is a self-signed certificate[/yellow]")

        from datetime import datetime

        if info["not_after"] < datetime.utcnow():
            console.print("[red]✗ Certificate has expired![/red]")
        elif (info["not_after"] - datetime.utcnow()).days < 30:
            days_left = (info["not_after"] - datetime.utcnow()).days
            console.print(f"[yellow]⚠ Certificate expires in {days_left} days[/yellow]")
        else:
            days_left = (info["not_after"] - datetime.utcnow()).days
            console.print(f"[green]✓ Certificate valid for {days_left} more days[/green]")

    except Exception as e:
        console.print(f"[red]Error reading certificate: {e}[/red]")
        sys.exit(1)


if __name__ == "__main__":
    main()
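
For reference, a minimal sketch of how the deep merge in ConfigManager.merge_configs above combines a discovered config with override values. It assumes the wheel is installed so that ConfigManager can be imported from caption_flow.cli; the dictionary contents are illustrative only, not taken from the package.

from caption_flow.cli import ConfigManager

# A base config as it might be loaded from a monitor.yaml (illustrative values).
base = {"monitor": {"server": "wss://localhost:8765", "verify_ssl": True}}

# Overrides in the same nested shape: nested dicts merge recursively,
# scalar values are replaced, and unknown keys are added.
override = {"monitor": {"verify_ssl": False, "refresh_interval": 2.0}}

merged = ConfigManager.merge_configs(base, override)
print(merged)
# {'monitor': {'server': 'wss://localhost:8765', 'verify_ssl': False, 'refresh_interval': 2.0}}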