caption-flow 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- caption_flow/__init__.py +9 -0
- caption_flow/cli.py +709 -0
- caption_flow/models.py +82 -0
- caption_flow/monitor.py +211 -0
- caption_flow/orchestrator.py +1301 -0
- caption_flow/storage.py +694 -0
- caption_flow/utils/__init__.py +4 -0
- caption_flow/utils/auth.py +67 -0
- caption_flow/utils/caption_utils.py +172 -0
- caption_flow/utils/certificates.py +140 -0
- caption_flow/utils/chunk_tracker.py +365 -0
- caption_flow/utils/dataset_loader.py +186 -0
- caption_flow/utils/image_processor.py +51 -0
- caption_flow/utils/job_queue.py +41 -0
- caption_flow/utils/json_utils.py +201 -0
- caption_flow/utils/vllm_config.py +164 -0
- caption_flow/worker.py +300 -0
- caption_flow/worker_data.py +482 -0
- caption_flow/worker_vllm.py +1028 -0
- caption_flow-0.1.0.dist-info/METADATA +427 -0
- caption_flow-0.1.0.dist-info/RECORD +25 -0
- caption_flow-0.1.0.dist-info/WHEEL +5 -0
- caption_flow-0.1.0.dist-info/entry_points.txt +2 -0
- caption_flow-0.1.0.dist-info/licenses/LICENSE +661 -0
- caption_flow-0.1.0.dist-info/top_level.txt +1 -0
caption_flow/models.py
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
"""Data models for CaptionFlow."""
|
2
|
+
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from datetime import datetime
|
5
|
+
from enum import Enum
|
6
|
+
from typing import Optional
|
7
|
+
|
8
|
+
|
9
|
+
class JobStatus(Enum):
    """Processing state of a captioning job.

    Every member's value is a lowercase string; both ``str(member)`` and
    ``member.to_json()`` return that string unchanged.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"

    def to_json(self):
        """Return the member's string value, suitable for JSON encoding."""
        return self.value

    def __str__(self):
        """Render the member as its bare value instead of ``JobStatus.X``."""
        return self.to_json()
|
22
|
+
|
23
|
+
|
24
|
+
@dataclass
class Job:
    """Captioning job record.

    The first four fields are required; ``status``, ``assigned_to`` and
    ``created_at`` have defaults.  When ``created_at`` is omitted (or passed
    as ``None``) it is set to the current time as a naive UTC ``datetime``.
    """

    job_id: str
    # dataset / shard / item_key presumably locate the item to caption;
    # their exact semantics are defined by the code that builds jobs.
    dataset: str
    shard: str
    item_key: str
    status: JobStatus = JobStatus.PENDING
    # NOTE(review): looks like the id of whoever holds the job - confirm.
    assigned_to: Optional[str] = None
    # None is only a placeholder; __post_init__ replaces it with "now".
    created_at: Optional[datetime] = None

    def __post_init__(self):
        """Fill in ``created_at`` with the current UTC time when not given."""
        if self.created_at is None:
            # datetime.utcnow() is deprecated since Python 3.12.  Building an
            # aware "now" and dropping tzinfo yields the same naive-UTC value
            # existing callers already receive, without the warning.
            from datetime import timezone

            self.created_at = datetime.now(timezone.utc).replace(tzinfo=None)
|
39
|
+
|
40
|
+
|
41
|
+
@dataclass
class Caption:
    """One captioning result, with attribution and optional image metadata.

    A result may carry a single ``caption``, a list of ``captions``, or
    both; constructing one with neither raises ``ValueError``.
    """

    # --- identity and attribution -------------------------------------
    job_id: str
    dataset: str
    shard: str
    item_key: str
    contributor_id: str
    timestamp: datetime
    caption_count: int = 1  # how many captions were generated for the item
    caption: Optional[str] = None
    captions: Optional[list] = None
    quality_score: Optional[float] = None
    quality_scores: Optional[list] = None

    # --- image metadata ------------------------------------------------
    image_width: Optional[int] = None
    image_height: Optional[int] = None
    image_format: Optional[str] = None
    file_size: Optional[int] = None

    # --- processing metadata -------------------------------------------
    caption_index: Optional[int] = None  # position of this caption (0, 1, 2...)
    total_captions: Optional[int] = None  # number of captions for the image
    processing_time_ms: Optional[float] = None
    chunk_id: Optional[str] = None

    def __post_init__(self):
        """Reject instances that carry neither ``caption`` nor ``captions``."""
        has_payload = self.caption is not None or self.captions is not None
        if has_payload:
            return
        raise ValueError("At least one of 'caption' or 'captions' must be provided")
|
73
|
+
|
74
|
+
|
75
|
+
@dataclass
class Contributor:
    """Record describing a contributor.

    ``contributor_id`` and ``name`` are required.  ``total_captions`` starts
    at 0 and ``trust_level`` at 1 unless given explicitly.
    """

    contributor_id: str
    name: str
    total_captions: int = 0  # defaults to zero
    trust_level: int = 1  # defaults to one
|
caption_flow/monitor.py
ADDED
@@ -0,0 +1,211 @@
|
|
1
|
+
"""TUI monitor for CaptionFlow system."""
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
import json
|
5
|
+
import logging
|
6
|
+
import ssl
|
7
|
+
import time
|
8
|
+
from datetime import datetime
|
9
|
+
from typing import Dict, Any, List, Optional
|
10
|
+
|
11
|
+
import websockets
|
12
|
+
from rich.console import Console
|
13
|
+
from rich.layout import Layout
|
14
|
+
from rich.live import Live
|
15
|
+
from rich.panel import Panel
|
16
|
+
from rich.table import Table
|
17
|
+
from rich.text import Text
|
18
|
+
|
19
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
20
|
+
|
21
|
+
|
22
|
+
class Monitor:
    """Real-time terminal dashboard for a CaptionFlow orchestrator.

    The monitor keeps a websocket connection to the orchestrator, caches the
    most recent ``stats``, ``leaderboard`` and ``activity`` updates it
    receives, and redraws a Rich layout from that cached state.
    """

    # Seconds to wait before (re)connecting after the connection ends.
    RECONNECT_DELAY = 5
    # How many activity entries are retained / displayed.
    MAX_ACTIVITY = 20
    ACTIVITY_SHOWN = 10
    # How many leaderboard rows are displayed.
    LEADERBOARD_SHOWN = 10
    # Stats keys that are shown in the dedicated rates panel.
    RATE_KEYS = ("current_rate", "average_rate", "expected_rate")

    def __init__(self, config: Dict[str, Any]):
        """Store connection settings and initialise empty display state.

        Args:
            config: Monitor settings.  ``server`` (the websocket URL) and
                ``token`` are required; ``verify_ssl`` is optional and
                defaults to ``True``.

        Raises:
            KeyError: If ``server`` or ``token`` is missing from ``config``.
        """
        self.config = config
        # The config carries the auth token, so it must never be echoed
        # verbatim; log a redacted copy at debug level instead of printing.
        redacted = {k: ("***" if k == "token" else v) for k, v in config.items()}
        logger.debug("Config loaded: %s", redacted)
        self.server_url = config["server"]
        self.token = config["token"]

        # SSL configuration
        self.ssl_context = self._setup_ssl()

        # Display state
        self.stats = {}
        self.leaderboard = []
        self.recent_activity = []
        self.running = False

        # Rate tracking
        self.rate_info = {key: 0.0 for key in self.RATE_KEYS}

        # Rich console
        self.console = Console()

    @staticmethod
    def _as_float(value: Any, default: float = 0.0) -> float:
        """Coerce ``value`` to ``float``, returning ``default`` if impossible."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    def _setup_ssl(self) -> Optional[ssl.SSLContext]:
        """Build the SSL context used for ``wss://`` connections.

        Returns:
            A default client context.  When ``verify_ssl`` is falsy in the
            config, hostname checking and certificate verification are
            disabled on it.
        """
        context = ssl.create_default_context()
        if not self.config.get("verify_ssl", True):
            # check_hostname must be cleared before verify_mode can be
            # lowered to CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return context

    async def start(self):
        """Run the monitor until the display loop ends or is cancelled.

        The orchestrator connection runs as a background task for the
        lifetime of the display loop and is cancelled when the loop exits.
        """
        self.running = True

        # Keep a reference: the event loop only holds weak references to
        # tasks, so an unreferenced task may be garbage-collected mid-run.
        connection_task = asyncio.create_task(self._connect_to_orchestrator())
        try:
            await self._display_loop()
        finally:
            self.running = False
            connection_task.cancel()
            try:
                await connection_task
            except asyncio.CancelledError:
                # Expected: we just cancelled the task ourselves.
                pass

    async def _connect_to_orchestrator(self):
        """Maintain the websocket connection, reconnecting while running.

        After connecting, the token is sent as the first message and every
        subsequent message is decoded as JSON and passed to
        ``_handle_update``.  Messages that are not valid JSON are skipped.
        """
        while self.running:
            try:
                async with websockets.connect(
                    self.server_url,
                    ssl=self.ssl_context if self.server_url.startswith("wss://") else None,
                ) as websocket:
                    # Authenticate
                    await websocket.send(json.dumps({"token": self.token}))

                    # Receive updates
                    async for message in websocket:
                        try:
                            data = json.loads(message)
                        except ValueError:
                            # One bad frame should not drop the connection.
                            logger.warning("Ignoring malformed message from orchestrator")
                            continue
                        await self._handle_update(data)
            except Exception as e:
                logger.error("Connection error: %s", e)

            # Back off after errors *and* after a clean close; otherwise a
            # server that closes immediately causes a tight reconnect loop.
            if self.running:
                await asyncio.sleep(self.RECONNECT_DELAY)

    async def _handle_update(self, data: Dict):
        """Apply one decoded orchestrator message to the cached state.

        Recognised ``type`` values are ``stats`` (dict payload),
        ``leaderboard`` (list payload) and ``activity`` (any payload).
        Messages that are not dicts, lack a ``data`` payload, or carry a
        payload of the wrong shape are ignored so that a single bad message
        cannot break the connection or the display.
        """
        if not isinstance(data, dict) or "data" not in data:
            return

        msg_type = data.get("type")
        payload = data["data"]

        if msg_type == "stats":
            if not isinstance(payload, dict):
                return
            self.stats = payload
            # Rates are formatted with ':.1f' later, so store real floats.
            for key in self.RATE_KEYS:
                self.rate_info[key] = self._as_float(payload.get(key, 0.0))
        elif msg_type == "leaderboard":
            if isinstance(payload, list):
                self.leaderboard = payload
        elif msg_type == "activity":
            self.recent_activity.append(payload)
            # Keep only recent activity
            self.recent_activity = self.recent_activity[-self.MAX_ACTIVITY:]

    async def _display_loop(self):
        """Redraw the layout every 0.25 s while ``self.running`` is true."""
        layout = self._create_layout()

        with Live(layout, console=self.console, refresh_per_second=4, screen=True):
            while self.running:
                self._update_layout(layout)
                await asyncio.sleep(0.25)

    def _create_layout(self) -> "Layout":
        """Create the layout: header, rates, three-column body, footer."""
        layout = Layout()

        layout.split_column(
            Layout(name="header", size=3),
            Layout(name="rates", size=5),
            Layout(name="body"),
            Layout(name="footer", size=3),
        )

        layout["body"].split_row(
            Layout(name="stats", ratio=1),
            Layout(name="leaderboard", ratio=1),
            Layout(name="activity", ratio=1),
        )

        return layout

    def _update_layout(self, layout: "Layout"):
        """Refresh every region of ``layout`` from the cached state."""
        layout["header"].update(self._header_panel())
        layout["rates"].update(self._rates_panel())
        layout["stats"].update(self._stats_panel())
        layout["leaderboard"].update(self._leaderboard_panel())
        layout["activity"].update(self._activity_panel())
        layout["footer"].update(self._footer_panel())

    def _header_panel(self) -> "Panel":
        """Build the static title panel."""
        return Panel(
            Text("CaptionFlow Monitor", style="bold magenta", justify="center"),
            border_style="bright_blue",
        )

    def _rates_panel(self) -> "Panel":
        """Build the processing-rates panel, with efficiency when computable."""
        rates_table = Table(show_header=False, expand=True)
        rates_table.add_column("Metric", style="bold")
        rates_table.add_column("Value", style="cyan", justify="right")

        rates_table.add_row("Current Rate", f"{self.rate_info['current_rate']:.1f} captions/min")
        rates_table.add_row("Average Rate", f"{self.rate_info['average_rate']:.1f} captions/min")
        rates_table.add_row("Expected Rate", f"{self.rate_info['expected_rate']:.1f} captions/min")

        # Add efficiency percentage if we have expected rate
        if self.rate_info["expected_rate"] > 0:
            efficiency = (self.rate_info["current_rate"] / self.rate_info["expected_rate"]) * 100
            if efficiency >= 80:
                color = "green"
            elif efficiency >= 50:
                color = "yellow"
            else:
                color = "red"
            rates_table.add_row("Efficiency", f"[{color}]{efficiency:.1f}%[/{color}]")

        return Panel(rates_table, title="Processing Rates", border_style="magenta")

    def _stats_panel(self) -> "Panel":
        """Build the statistics panel from every non-rate stats entry."""
        stats_table = Table(show_header=False, expand=True)
        stats_table.add_column("Metric")
        stats_table.add_column("Value", style="cyan")

        # Rate stats are already shown in the rates panel.  Cells are wrapped
        # in Text so server-supplied strings are never parsed as Rich markup.
        for key, value in self.stats.items():
            if key not in self.RATE_KEYS:
                stats_table.add_row(
                    Text(str(key).replace("_", " ").title()),
                    Text(str(value)),
                )

        return Panel(stats_table, title="System Statistics", border_style="green")

    def _leaderboard_panel(self) -> "Panel":
        """Build the top-contributors panel from the cached leaderboard."""
        leaderboard_table = Table(expand=True)
        leaderboard_table.add_column("Rank", style="yellow")
        leaderboard_table.add_column("Contributor")
        leaderboard_table.add_column("Captions", style="cyan")
        leaderboard_table.add_column("Trust", style="green")

        entries = [entry for entry in self.leaderboard if isinstance(entry, dict)]
        for rank, entry in enumerate(entries[: self.LEADERBOARD_SHOWN], 1):
            try:
                stars = int(entry.get("trust_level", 0))
            except (TypeError, ValueError):
                stars = 0
            leaderboard_table.add_row(
                str(rank),
                # Text(): names are external input and must not be treated
                # as markup (an unmatched closing tag would raise).
                Text(str(entry.get("name", "Unknown"))),
                Text(str(entry.get("total_captions", 0))),
                "⭐" * stars,
            )

        return Panel(leaderboard_table, title="Top Contributors", border_style="yellow")

    def _activity_panel(self) -> "Panel":
        """Build the recent-activity panel from the newest cached entries."""
        activity_text = Text()
        for activity in self.recent_activity[-self.ACTIVITY_SHOWN:]:
            activity_text.append(f"{activity}\n", style="dim")

        return Panel(activity_text, title="Recent Activity", border_style="blue")

    def _footer_panel(self) -> "Panel":
        """Build the footer showing the local time of this redraw."""
        return Panel(
            Text(
                f"Updated: {datetime.now().strftime('%H:%M:%S')} | Press Ctrl+C to exit",
                justify="center",
                style="dim",
            ),
            border_style="bright_black",
        )
|