caption-flow 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- caption_flow/cli.py +8 -2
- caption_flow/monitor.py +1 -1
- caption_flow/orchestrator.py +522 -129
- caption_flow/storage.py +5 -0
- caption_flow/utils/chunk_tracker.py +22 -4
- caption_flow/utils/dataset_loader.py +99 -142
- caption_flow/utils/shard_processor.py +100 -36
- {caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/METADATA +2 -1
- {caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/RECORD +13 -13
- {caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/WHEEL +0 -0
- {caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/entry_points.txt +0 -0
- {caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {caption_flow-0.2.0.dist-info → caption_flow-0.2.2.dist-info}/top_level.txt +0 -0
caption_flow/orchestrator.py
CHANGED
@@ -16,7 +16,7 @@ import uuid
 from dataclasses import dataclass, asdict
 from datetime import datetime
 from pathlib import Path
-from typing import Dict, Set, Optional, Any, List, Deque
+from typing import Dict, Set, Optional, Any, List, Deque, Tuple
 from collections import deque, defaultdict
 import threading
 from queue import Queue, Empty
@@ -97,27 +97,9 @@ class ChunkManager:
         self.lock = threading.Lock()
         self.tracker = tracker  # Reference to chunk tracker

-
-
-
-        """Create chunks from a shard."""
-        chunks = []
-
-        for start_idx in range(0, total_items, self.chunk_size):
-            chunk = ShardChunk.create(
-                shard_url=shard_url,
-                shard_name=shard_name,
-                start_index=start_idx,
-                chunk_size=min(self.chunk_size, total_items - start_idx),
-            )
-
-            with self.lock:
-                self.chunks[chunk.chunk_id] = chunk
-                self.pending_chunks.append(chunk.chunk_id)
-
-            chunks.append(chunk)
-
-        return chunks
+        # NEW: Track assigned ranges to prevent double allocation
+        # Format: {chunk_id: {(start, end): worker_id}}
+        self.assigned_ranges: Dict[str, Dict[Tuple[int, int], str]] = defaultdict(dict)

     def get_chunks_for_worker(
         self, worker_id: str, count: int = 1, tracker: Optional["ChunkTracker"] = None
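
Note: the hunk above drops the old create-chunks helper and introduces an ownership map for item ranges. A minimal standalone sketch of the bookkeeping it adds (names mirror the diff, but this is illustrative code, not part of the package):

    from collections import defaultdict
    from typing import Dict, Tuple

    # {chunk_id: {(start, end): worker_id}}, as in the diff
    assigned_ranges: Dict[str, Dict[Tuple[int, int], str]] = defaultdict(dict)

    def claim(chunk_id: str, start: int, end: int, worker_id: str) -> bool:
        """Claim a range unless another worker already owns it."""
        owner = assigned_ranges[chunk_id].get((start, end))
        if owner is not None and owner != worker_id:
            return False  # would be a double allocation
        assigned_ranges[chunk_id][(start, end)] = worker_id
        return True

    def release(chunk_id: str, start: int, end: int) -> None:
        """Drop a range once processed or when its worker disconnects."""
        assigned_ranges[chunk_id].pop((start, end), None)

    assert claim("shard-0:0", 0, 999, "worker-a")
    assert not claim("shard-0:0", 0, 999, "worker-b")  # refused
    release("shard-0:0", 0, 999)
    assert claim("shard-0:0", 0, 999, "worker-b")      # free again
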
@@ -127,7 +109,6 @@ class ChunkManager:

         with self.lock:
             # FIRST PRIORITY: Check if this worker already has assigned chunks
-            # Workers should complete their current chunks before getting new ones
             if worker_id in self.assigned_chunks:
                 existing_chunk_ids = list(self.assigned_chunks[worker_id])
                 for chunk_id in existing_chunk_ids:
@@ -142,12 +123,29 @@ class ChunkManager:
                     if tracker:
                         chunk_info = tracker.get_chunk_with_unprocessed_items(chunk_id)
                         if chunk_info and chunk_info["unprocessed_ranges"]:
-                            assigned
-
-
-
-
-
+                            # Filter out ranges that are assigned to other workers
+                            clean_ranges = []
+                            for start, end in chunk_info["unprocessed_ranges"]:
+                                range_key = (start, end)
+                                if range_key in self.assigned_ranges[chunk_id]:
+                                    assigned_worker = self.assigned_ranges[chunk_id][range_key]
+                                    if assigned_worker != worker_id:
+                                        # Skip this range - it's assigned to another worker
+                                        logger.warning(
+                                            f"Skipping range {start}-{end} in chunk {chunk_id} "
+                                            f"(assigned to {assigned_worker}, not {worker_id})"
+                                        )
+                                        continue
+                                    # else: this worker already owns this range, include it
+                                clean_ranges.append((start, end))
+
+                            if clean_ranges:
+                                assigned.append(
+                                    {
+                                        "chunk": chunk,
+                                        "unprocessed_ranges": clean_ranges,
+                                    }
+                                )
                     else:
                         # No tracker, assume chunk needs processing
                         assigned.append(
@@ -158,7 +156,6 @@ class ChunkManager:
                         )

             # SECOND PRIORITY: Get new pending chunks
-            # Only if worker doesn't have enough chunks already
             while len(assigned) < count and self.pending_chunks:
                 chunk_id = self.pending_chunks.popleft()
                 chunk = self.chunks.get(chunk_id)
@@ -166,7 +163,7 @@
                 if not chunk:
                     continue

-                # Verify chunk is truly pending
+                # Verify chunk is truly pending
                 if chunk.status != "pending" or chunk.assigned_to is not None:
                     logger.warning(
                         f"Chunk {chunk_id} in pending queue but status={chunk.status}, assigned_to={chunk.assigned_to}"
@@ -179,15 +176,48 @@ class ChunkManager:
                 chunk.assigned_at = datetime.utcnow()
                 self.assigned_chunks[worker_id].add(chunk_id)

-                # Get unprocessed ranges
+                # Get unprocessed ranges and filter out any that are somehow already assigned
                 unprocessed_ranges = [(0, chunk.chunk_size - 1)]  # Default
                 if tracker:
                     chunk_info = tracker.get_chunk_with_unprocessed_items(chunk_id)
                     if chunk_info:
-
+                        # Filter out any ranges that are already assigned (shouldn't happen for new chunks)
+                        clean_ranges = []
+                        for start, end in chunk_info["unprocessed_ranges"]:
+                            range_key = (start, end)
+                            if range_key not in self.assigned_ranges[chunk_id]:
+                                clean_ranges.append((start, end))
+                            else:
+                                logger.error(
+                                    f"Range {start}-{end} in newly assigned chunk {chunk_id} "
+                                    f"is already assigned to {self.assigned_ranges[chunk_id][range_key]}!"
+                                )
+                        unprocessed_ranges = clean_ranges if clean_ranges else []
+
                     tracker.mark_assigned(chunk_id, worker_id)

-
+                if unprocessed_ranges:
+                    assigned.append({"chunk": chunk, "unprocessed_ranges": unprocessed_ranges})
+
+            # Track assigned ranges and verify no double allocation
+            for info in assigned:
+                chunk_id = info["chunk"].chunk_id
+                for start, end in info["unprocessed_ranges"]:
+                    range_key = (start, end)
+
+                    # Check if this range is already assigned
+                    if range_key in self.assigned_ranges[chunk_id]:
+                        existing_worker = self.assigned_ranges[chunk_id][range_key]
+                        if existing_worker != worker_id:
+                            # This should never happen - raise assertion
+                            raise AssertionError(
+                                f"CRITICAL: Attempting to assign range {start}-{end} in chunk {chunk_id} "
+                                f"to worker {worker_id}, but it's already assigned to {existing_worker}! "
+                                f"This would cause duplicate processing."
+                            )
+
+                    # Track this assignment
+                    self.assigned_ranges[chunk_id][range_key] = worker_id

             # Log what we're assigning
             if assigned:
@@ -199,6 +229,12 @@ class ChunkManager:
                 )
                 logger.info(f"Assigning to worker {worker_id}: {chunk_summary}")

+                # Detailed range logging for debugging
+                for info in assigned:
+                    chunk_id = info["chunk"].chunk_id
+                    ranges_str = ", ".join([f"{s}-{e}" for s, e in info["unprocessed_ranges"]])
+                    logger.debug(f"  Chunk {chunk_id} ranges: {ranges_str}")
+
             return assigned

     def complete_chunk(self, chunk_id: str, worker_id: str) -> bool:
@@ -210,6 +246,16 @@
                 chunk.status = "completed"
                 chunk.completed_at = datetime.utcnow()
                 self.assigned_chunks[worker_id].discard(chunk_id)
+
+                # Clear assigned ranges for this chunk
+                if chunk_id in self.assigned_ranges:
+                    # Log what ranges we're clearing
+                    ranges_to_clear = list(self.assigned_ranges[chunk_id].keys())
+                    logger.debug(
+                        f"Clearing {len(ranges_to_clear)} assigned ranges for completed chunk {chunk_id}"
+                    )
+                    del self.assigned_ranges[chunk_id]
+
                 return True
             return False

@@ -224,6 +270,20 @@ class ChunkManager:
                 chunk.assigned_at = None
                 self.assigned_chunks[worker_id].discard(chunk_id)
                 self.pending_chunks.append(chunk_id)
+
+                # Clear assigned ranges for this chunk/worker
+                if chunk_id in self.assigned_ranges:
+                    ranges_to_clear = [
+                        range_key
+                        for range_key, assigned_worker in self.assigned_ranges[chunk_id].items()
+                        if assigned_worker == worker_id
+                    ]
+                    for range_key in ranges_to_clear:
+                        del self.assigned_ranges[chunk_id][range_key]
+                    logger.debug(
+                        f"Cleared {len(ranges_to_clear)} assigned ranges for failed chunk {chunk_id}"
+                    )
+
                 return True
             return False

@@ -240,18 +300,62 @@ class ChunkManager:
                 chunk.assigned_at = None
                 self.pending_chunks.append(chunk_id)

+                # Clear assigned ranges for this worker
+                if chunk_id in self.assigned_ranges:
+                    ranges_to_clear = [
+                        range_key
+                        for range_key, assigned_worker in self.assigned_ranges[
+                            chunk_id
+                        ].items()
+                        if assigned_worker == worker_id
+                    ]
+                    for range_key in ranges_to_clear:
+                        del self.assigned_ranges[chunk_id][range_key]
+
+                    if ranges_to_clear:
+                        logger.info(
+                            f"Released {len(ranges_to_clear)} ranges from chunk {chunk_id} "
+                            f"previously assigned to disconnected worker {worker_id}"
+                        )
+
             if worker_id in self.assigned_chunks:
                 del self.assigned_chunks[worker_id]

+    def mark_ranges_processed(
+        self, chunk_id: str, processed_ranges: List[Tuple[int, int]], worker_id: str
+    ):
+        """Remove ranges from assignment tracking once they're processed."""
+        with self.lock:
+            if chunk_id in self.assigned_ranges:
+                for start, end in processed_ranges:
+                    range_key = (start, end)
+                    if range_key in self.assigned_ranges[chunk_id]:
+                        assigned_worker = self.assigned_ranges[chunk_id][range_key]
+                        if assigned_worker == worker_id:
+                            del self.assigned_ranges[chunk_id][range_key]
+                            logger.debug(
+                                f"Cleared assignment of range {start}-{end} in chunk {chunk_id} "
+                                f"after processing by {worker_id}"
+                            )
+                        else:
+                            logger.warning(
+                                f"Worker {worker_id} claims to have processed range {start}-{end} "
+                                f"in chunk {chunk_id}, but it was assigned to {assigned_worker}"
+                            )

     def get_stats(self) -> Dict[str, int]:
         """Get chunk statistics."""
         with self.lock:
+            # Count total assigned ranges
+            total_assigned_ranges = sum(len(ranges) for ranges in self.assigned_ranges.values())
+
             stats = {
                 "total": len(self.chunks),
                 "pending": len(self.pending_chunks),
                 "assigned": sum(len(chunks) for chunks in self.assigned_chunks.values()),
                 "completed": sum(1 for c in self.chunks.values() if c.status == "completed"),
                 "failed": sum(1 for c in self.chunks.values() if c.status == "failed"),
+                "assigned_ranges": total_assigned_ranges,
             }
             return stats

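
Note: mark_ranges_processed above clears assignments by exact (start, end) key, so a worker must report the same tuples it was assigned; a merged or split range will not match. A short sketch of that semantics (assumed from the diff, not package code):

    from collections import defaultdict

    assigned_ranges = defaultdict(dict)
    assigned_ranges["chunk-1"][(0, 49)] = "worker-a"
    assigned_ranges["chunk-1"][(50, 99)] = "worker-a"

    def mark_ranges_processed(chunk_id, processed_ranges, worker_id):
        # Exact-key match, as in the diff: only tuples assigned as-is are cleared.
        for key in processed_ranges:
            if assigned_ranges[chunk_id].get(key) == worker_id:
                del assigned_ranges[chunk_id][key]

    mark_ranges_processed("chunk-1", [(0, 99)], "worker-a")  # merged range: no match
    assert len(assigned_ranges["chunk-1"]) == 2
    mark_ranges_processed("chunk-1", [(0, 49), (50, 99)], "worker-a")
    assert not assigned_ranges["chunk-1"]
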
@@ -363,6 +467,7 @@ class Orchestrator:
         self.ssl_context = self._setup_ssl()

         # Statistics
+        self.is_generating_stats = False
         self.stats = {
             "total_chunks": 0,
             "completed_chunks": 0,
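
Note: is_generating_stats, initialized here, serves in _broadcast_stats below as a reentrancy guard; a plain bool suffices because the orchestrator's handlers run on a single asyncio event loop (our assumption). A sketch of the pattern — the try/finally reset is our hardening, not the diff's, which clears the flag inline and can leave it set if an exception escapes:

    class StatsBroadcaster:
        def __init__(self):
            self.is_generating_stats = False

        async def broadcast(self):
            if self.is_generating_stats:
                return  # a broadcast is already in flight; skip this tick
            self.is_generating_stats = True
            try:
                ...  # gather and send stats
            finally:
                self.is_generating_stats = False
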
@@ -490,13 +595,15 @@ class Orchestrator:
         with self.chunk_manager.lock:
             for chunk_state in shard_info["chunks"]:
                 if chunk_state.status in ["pending", "failed", "assigned"]:
-                    #
+                    # For assigned chunks, reset them to pending since workers don't exist
                     chunk = ShardChunk(
                         chunk_id=chunk_state.chunk_id,
                         shard_url=chunk_state.shard_url,
                         shard_name=chunk_state.shard_name,
                         start_index=chunk_state.start_index,
                         chunk_size=chunk_state.chunk_size,
+                        status="pending",  # Reset to pending
+                        assigned_to=None,  # Clear assignment
                     )
                     self.chunk_manager.chunks[chunk_state.chunk_id] = chunk
                     self.chunk_manager.pending_chunks.append(chunk_state.chunk_id)
@@ -1409,28 +1516,36 @@ class Orchestrator:
         finally:
             del self.data_workers[worker_id]

-    async def
-        """
-
-        logger.info("Monitor connected")
-
+    async def _send_leaderboard_to_monitor(self, websocket: WebSocketServerProtocol):
+        """Send leaderboard data to a specific monitor."""
+        total_start = time.time()
         try:
-
-
-
-            # Send chunk stats
-            chunk_stats = self.chunk_manager.get_stats()
-            await websocket.send(safe_json_dumps({"type": "chunk_stats", "data": chunk_stats}))
+            if websocket not in self.monitors:
+                return

-            #
+            # Get contributors asynchronously
+            contributors_start = time.time()
             contributors = await self.storage.get_top_contributors(10)
+            logger.debug(
+                f"Contributors retrieved in {(time.time() - contributors_start)*1000:.1f}ms"
+            )

-            #
-
-
-
+            # Get worker counts in thread pool
+            worker_counts_start = time.time()
+            loop = asyncio.get_event_loop()
+            worker_counts = await loop.run_in_executor(
+                None,
+                lambda: (
+                    self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+                ),
+            )
+            logger.debug(
+                f"Worker counts retrieved in {(time.time() - worker_counts_start)*1000:.1f}ms"
             )

+            # Build enhanced contributors list
+            build_start = time.time()
+            enhanced_contributors = []
             for contributor in contributors:
                 contrib_dict = {
                     "contributor_id": contributor.contributor_id,
@@ -1442,40 +1557,157 @@ class Orchestrator:
                     ),
                 }
                 enhanced_contributors.append(contrib_dict)
+            logger.debug(f"Enhanced contributors built in {(time.time() - build_start)*1000:.1f}ms")

-
-
+            # Cache for future monitors
+            self._cached_leaderboard = enhanced_contributors
+
+            # Send if still connected
+            if websocket in self.monitors:
+                send_start = time.time()
+                await websocket.send(
+                    safe_json_dumps({"type": "leaderboard", "data": enhanced_contributors})
+                )
+                logger.debug(
+                    f"Leaderboard sent to monitor in {(time.time() - send_start)*1000:.1f}ms"
+                )
+
+            logger.debug(
+                f"Leaderboard send to monitor completed in {(time.time() - total_start)*1000:.1f}ms"
             )

-
-
-
+        except websockets.exceptions.ConnectionClosed:
+            logger.debug("Monitor disconnected during leaderboard send")
+        except Exception as e:
+            logger.error(f"Error sending leaderboard to monitor: {e}")
+
+    async def _send_initial_monitor_data(self, websocket: WebSocketServerProtocol):
+        """Send initial data to monitor in a separate task to avoid blocking."""
+        total_start = time.time()
+        try:
+            # Check if websocket is still in monitors set
+            if websocket not in self.monitors:
+                logger.debug("Monitor disconnected before initial data send")
+                return
+
+            # Send current stats (already in memory)
+            stats_start = time.time()
+            await websocket.send(safe_json_dumps({"type": "stats", "data": self.stats}))
+            logger.debug(f"Monitor stats sent in {(time.time() - stats_start)*1000:.1f}ms")
+
+            # Get chunk stats asynchronously
+            chunk_stats_start = time.time()
+            loop = asyncio.get_event_loop()
+            chunk_stats = await loop.run_in_executor(None, self.chunk_manager.get_stats)
+            logger.debug(f"Chunk stats retrieved in {(time.time() - chunk_stats_start)*1000:.1f}ms")
+
+            if websocket not in self.monitors:
+                return
+
+            chunk_send_start = time.time()
+            await websocket.send(safe_json_dumps({"type": "chunk_stats", "data": chunk_stats}))
+            logger.debug(f"Chunk stats sent in {(time.time() - chunk_send_start)*1000:.1f}ms")
+
+            # For leaderboard, check if we have a cached version first
+            if hasattr(self, "_cached_leaderboard") and self._cached_leaderboard:
+                # Use cached leaderboard if available
+                cache_send_start = time.time()
+                await websocket.send(
+                    safe_json_dumps({"type": "leaderboard", "data": self._cached_leaderboard})
+                )
+                logger.debug(
+                    f"Cached leaderboard sent in {(time.time() - cache_send_start)*1000:.1f}ms"
+                )
+            else:
+                # Schedule leaderboard update separately
+                leaderboard_task_start = time.time()
+                asyncio.create_task(self._send_leaderboard_to_monitor(websocket))
+                logger.debug(
+                    f"Leaderboard task created in {(time.time() - leaderboard_task_start)*1000:.1f}ms"
+                )
+
+            logger.debug(
+                f"Monitor initial data send completed in {(time.time() - total_start)*1000:.1f}ms"
+            )
+
+        except websockets.exceptions.ConnectionClosed:
+            logger.debug("Monitor disconnected during initial data send")
+        except Exception as e:
+            logger.error(f"Error sending initial monitor data: {e}")
+
+    async def _handle_monitor(self, websocket: WebSocketServerProtocol):
+        """Handle monitor connection - truly non-blocking version."""
+        monitor_start = time.time()
+        self.monitors.add(websocket)
+        logger.info(f"Monitor connected (total monitors: {len(self.monitors)})")
+
+        try:
+            # Send welcome message immediately
+            welcome_start = time.time()
+            await websocket.send(safe_json_dumps({"type": "welcome", "role": "monitor"}))
+            logger.debug(f"Monitor welcome sent in {(time.time() - welcome_start)*1000:.1f}ms")
+
+            # Schedule initial data send as a separate task to avoid blocking
+            task_create_start = time.time()
+            asyncio.create_task(self._send_initial_monitor_data(websocket))
+            logger.debug(
+                f"Monitor initial data task created in {(time.time() - task_create_start)*1000:.1f}ms"
+            )
+
+            # Just keep the connection alive - no blocking work here
+            try:
+                async for message in websocket:
+                    # Handle any incoming messages from monitor if needed
+                    # For now, just ignore them
+                    pass
+            except websockets.exceptions.ConnectionClosed:
+                pass  # Normal disconnection

         except websockets.exceptions.ConnectionClosed:
             logger.info("Monitor disconnected")
+        except Exception as e:
+            logger.error(f"Error in monitor handler: {e}")
         finally:
             self.monitors.discard(websocket)
+            logger.debug(f"Monitor handler completed in {(time.time() - monitor_start)*1000:.1f}ms")

     async def _broadcast_stats(self):
-        """Broadcast statistics to all monitors -
+        """Broadcast statistics to all monitors - truly non-blocking version."""
         if not self.monitors:
             return
+        if self.is_generating_stats:
+            return  # Already generating stats, skip this call
+        self.is_generating_stats = True
+        total_start = time.time()

-        #
+        # Prepare all the data first
+        data_prep_start = time.time()
+        loop = asyncio.get_event_loop()
+
+        # Get storage stats (already async)
+        storage_stats_start = time.time()
         storage_stats = await self.storage.get_storage_stats()
+        logger.debug(f"Storage stats retrieved in {(time.time() - storage_stats_start)*1000:.1f}ms")
+
+        caption_stats_start = time.time()
         caption_stats = await self.storage.get_caption_stats()
+        logger.debug(f"Caption stats retrieved in {(time.time() - caption_stats_start)*1000:.1f}ms")

-        #
-
-
+        # Get chunk stats in thread pool
+        chunk_stats_start = time.time()
+        chunk_stats = await loop.run_in_executor(None, self.chunk_manager.get_stats)
+        logger.debug(f"Chunk stats retrieved in {(time.time() - chunk_stats_start)*1000:.1f}ms")

-        #
-
-
-
+        # Build stats dict
+        build_stats_start = time.time()
+        stats_update = self.stats.copy()
+        stats_update.update({f"chunks_{k}": v for k, v in chunk_stats.items()})
+        stats_update.update(storage_stats)
+        stats_update["field_breakdown"] = caption_stats.get("field_stats", {})
+        stats_update["output_fields_list"] = caption_stats.get("output_fields", [])

         # Add rate information
-
+        stats_update.update(
             {
                 "current_rate": self.rate_tracker["current_rate"],
                 "average_rate": self.rate_tracker["average_rate"],
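
Note: the rewritten monitor path splits the slow initial send into a background task so the receive loop starts immediately. A minimal sketch of the same pattern with the websockets library (handler and helper names are ours, not the package's):

    import asyncio
    import json
    import websockets

    monitors: set = set()

    async def send_initial_data(ws):
        try:
            await ws.send(json.dumps({"type": "stats", "data": {}}))
        except websockets.exceptions.ConnectionClosed:
            pass  # monitor went away mid-send

    async def handle_monitor(ws):
        monitors.add(ws)
        try:
            await ws.send(json.dumps({"type": "welcome", "role": "monitor"}))
            asyncio.create_task(send_initial_data(ws))  # don't block the loop below
            async for _message in ws:
                pass  # incoming monitor messages are ignored
        finally:
            monitors.discard(ws)
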
@@ -1483,41 +1715,106 @@ class Orchestrator:
             }
         )

-        # Add vLLM info
-
-
+        # Add vLLM info
+        stats_update["vllm_model"] = self.vllm_config.get("model", "unknown")
+        stats_update["vllm_batch_size"] = self.vllm_config.get("batch_size", 0)

-        #
+        # Add stage information
         stages = self.vllm_config.get("stages", [])
         if stages:
-
-
+            stats_update["stage_count"] = len(stages)
+            stats_update["stage_names"] = [s.get("name", "unnamed") for s in stages]
         else:
-
-
+            stats_update["stage_count"] = 1
+            stats_update["stage_names"] = ["default"]

+        # Get field stats
+        field_stats_start = time.time()
         field_stats = await self.storage.get_output_field_stats()
-
+        stats_update["output_fields"] = field_stats
+        logger.debug(f"Field stats retrieved in {(time.time() - field_stats_start)*1000:.1f}ms")

-
+        # Update our internal stats
+        self.stats = stats_update
+        logger.debug(f"Stats prepared in {(time.time() - build_stats_start)*1000:.1f}ms")

-
-
-
-
+        logger.debug(f"Total data preparation took {(time.time() - data_prep_start)*1000:.1f}ms")
+
+        # Create message once
+        message_create_start = time.time()
+        stats_message = safe_json_dumps({"type": "stats", "data": self.stats})
+        logger.debug(f"Stats message created in {(time.time() - message_create_start)*1000:.1f}ms")
+
+        # Send to all monitors asynchronously in parallel
+        send_start = time.time()
+
+        async def send_to_monitor(monitor):
             try:
-                await monitor.send(
+                await monitor.send(stats_message)
             except websockets.exceptions.ConnectionClosed:
-
+                return monitor  # Return for removal
+            except Exception as e:
+                logger.debug(f"Error sending stats to monitor: {e}")
+                return monitor  # Return for removal
+            return None
+
+        # Send to all monitors in parallel
+        monitors_copy = self.monitors.copy()
+        results = await asyncio.gather(
+            *[send_to_monitor(m) for m in monitors_copy], return_exceptions=True
+        )
+
+        # Remove disconnected monitors
+        disconnected = {
+            m
+            for m, r in zip(monitors_copy, results)
+            if r is not None and not isinstance(r, Exception)
+        }
+        self.monitors -= disconnected
+
+        logger.debug(
+            f"Stats sent to {len(monitors_copy)} monitors in {(time.time() - send_start)*1000:.1f}ms"
+        )
+
+        # Send leaderboard update in a separate task to avoid blocking
+        leaderboard_task_start = time.time()
+        asyncio.create_task(self._broadcast_leaderboard())
+        self.is_generating_stats = False
+        logger.debug(
+            f"Leaderboard broadcast task created in {(time.time() - leaderboard_task_start)*1000:.1f}ms"
+        )
+        logger.debug(f"Stats broadcast completed in {(time.time() - total_start)*1000:.1f}ms")

-
+    async def _broadcast_leaderboard(self):
+        """Send leaderboard updates to monitors - separate from stats to avoid blocking."""
+        if not self.monitors:
+            return
+
+        total_start = time.time()
         try:
+            # Get contributors
+            contributors_start = time.time()
             contributors = await self.storage.get_top_contributors(10)
-
-
-
+            logger.debug(
+                f"Contributors retrieved for broadcast in {(time.time() - contributors_start)*1000:.1f}ms"
+            )
+
+            # Get worker counts
+            worker_counts_start = time.time()
+            loop = asyncio.get_event_loop()
+            worker_counts = await loop.run_in_executor(
+                None,
+                lambda: (
+                    self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+                ),
+            )
+            logger.debug(
+                f"Worker counts retrieved for broadcast in {(time.time() - worker_counts_start)*1000:.1f}ms"
             )

+            # Build enhanced contributors list
+            build_start = time.time()
+            enhanced_contributors = []
             for contributor in contributors:
                 contrib_dict = {
                     "contributor_id": contributor.contributor_id,
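
Note: both broadcast paths now serialize the message once and send it to every monitor concurrently, collecting failed sockets for removal. The same idiom in isolation (a sketch; the diff's version additionally passes return_exceptions=True and filters on isinstance):

    import asyncio

    async def broadcast(monitors: set, message: str) -> None:
        async def send(ws):
            try:
                await ws.send(message)
                return None
            except Exception:
                return ws  # mark for removal

        targets = list(monitors)
        results = await asyncio.gather(*(send(ws) for ws in targets))
        monitors.difference_update(r for r in results if r is not None)
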
@@ -1529,26 +1826,64 @@ class Orchestrator:
                     ),
                 }
                 enhanced_contributors.append(contrib_dict)
+            logger.debug(
+                f"Enhanced contributors built for broadcast in {(time.time() - build_start)*1000:.1f}ms"
+            )

+            # Cache it
+            self._cached_leaderboard = enhanced_contributors
+
+            # Create message once
+            message_create_start = time.time()
             leaderboard_message = safe_json_dumps(
                 {"type": "leaderboard", "data": enhanced_contributors}
             )
+            logger.debug(
+                f"Leaderboard message created in {(time.time() - message_create_start)*1000:.1f}ms"
+            )
+
+            # Send to all monitors in parallel
+            send_start = time.time()

-
-            disconnected = set()
-            for monitor in self.monitors.copy():
+            async def send_leaderboard(monitor):
                 try:
                     await monitor.send(leaderboard_message)
-                except
-
+                except:
+                    return monitor  # Mark for removal
+                return None
+
+            monitors_copy = self.monitors.copy()
+            results = await asyncio.gather(
+                *[send_leaderboard(m) for m in monitors_copy], return_exceptions=True
+            )

+            # Remove disconnected
+            disconnected = {
+                m
+                for m, r in zip(monitors_copy, results)
+                if r is not None and not isinstance(r, Exception)
+            }
             self.monitors -= disconnected

-
-
+            logger.debug(
+                f"Leaderboard sent to {len(monitors_copy)} monitors in {(time.time() - send_start)*1000:.1f}ms"
+            )
+            logger.debug(
+                f"Leaderboard broadcast completed in {(time.time() - total_start)*1000:.1f}ms"
+            )

-
-
+        except Exception as e:
+            logger.error(f"Error broadcasting leaderboard: {e}")
+
+    def _get_queue_stats(self) -> Dict[str, int]:
+        """Get queue statistics - synchronous helper for thread pool."""
+        with self.chunk_manager.lock:
+            return {
+                "pending_chunks": len(self.chunk_manager.pending_chunks),
+                "assigned_chunks": sum(
+                    len(chunks) for chunks in self.chunk_manager.assigned_chunks.values()
+                ),
+            }

     async def _flush_processed_items(self):
         """Flush batched processed items to chunk tracker."""
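
Note: _get_queue_stats exists so the lock-guarded counting can be pushed to the default thread pool instead of blocking the event loop. A self-contained sketch of that offloading (illustrative names):

    import asyncio
    import threading

    lock = threading.Lock()
    pending = ["chunk-1", "chunk-2"]

    def queue_stats() -> dict:
        with lock:  # may block on contention; hence the thread pool
            return {"pending_chunks": len(pending)}

    async def main():
        loop = asyncio.get_event_loop()
        stats = await loop.run_in_executor(None, queue_stats)
        print(stats)  # {'pending_chunks': 2}

    asyncio.run(main())
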
@@ -1582,21 +1917,37 @@ class Orchestrator:
                 # Don't forget the last range
                 ranges.append((start, end))

-            # Mark ranges as processed
+            # Mark ranges as processed
             for start_idx, end_idx in ranges:
                 self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)

+            with self.chunk_manager.lock:
+                if chunk_id in self.chunk_manager.assigned_ranges:
+                    for start_idx, end_idx in ranges:
+                        # Clear any assignments in this range
+                        to_remove = []
+                        for range_start, range_end in self.chunk_manager.assigned_ranges[
+                            chunk_id
+                        ]:
+                            if range_start >= start_idx and range_end <= end_idx:
+                                to_remove.append((range_start, range_end))
+
+                        for range_key in to_remove:
+                            del self.chunk_manager.assigned_ranges[chunk_id][range_key]
+
         # Clear pending items
         self.pending_processed_items.clear()
         self.last_item_batch_flush = time.time()

     def get_workers_by_user_stats(self) -> Dict[str, Any]:
-        """Get statistics about workers grouped by user/token."""
+        """Get statistics about workers grouped by user/token - thread-safe version."""
         if not hasattr(self, "workers_by_user"):
             return {}

+        # Create a copy to avoid issues with concurrent modification
         stats = {}
-
+        workers_snapshot = dict(self.workers_by_user)
+        for user, worker_ids in workers_snapshot.items():
             stats[user] = {"worker_count": len(worker_ids), "worker_ids": list(worker_ids)}
         return stats

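
Note: the context lines above coalesce processed item indices into contiguous (start, end) pairs before handing them to the chunk tracker. A sketch of that coalescing step, inferred from the surrounding context rather than copied from the package:

    def to_ranges(indices):
        ranges = []
        start = end = None
        for idx in sorted(indices):
            if start is None:
                start = end = idx
            elif idx == end + 1:
                end = idx
            else:
                ranges.append((start, end))
                start = end = idx
        if start is not None:
            ranges.append((start, end))  # don't forget the last range
        return ranges

    assert to_ranges([1, 2, 3, 7, 8, 10]) == [(1, 3), (7, 8), (10, 10)]
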
@@ -1621,21 +1972,63 @@ class Orchestrator:
     async def _heartbeat_loop(self):
         """Send periodic heartbeats to maintain connections."""
         while True:
-
+            try:
+                await asyncio.sleep(30)

-
-
-
-
-
-
-
+                # Create a copy of worker items to avoid modification during iteration
+                worker_items = list(self.workers.items())
+                disconnected = []
+
+                for worker_id, ws in worker_items:
+                    try:
+                        # Check if worker still exists before pinging
+                        if worker_id not in self.workers:
+                            continue
+
+                        # Send ping with timeout
+                        pong_waiter = await ws.ping()
+                        try:
+                            await asyncio.wait_for(pong_waiter, timeout=10)
+                        except asyncio.TimeoutError:
+                            logger.warning(f"Worker {worker_id} failed to respond to ping")
+                            disconnected.append(worker_id)
+                    except websockets.exceptions.ConnectionClosed:
+                        logger.info(f"Worker {worker_id} connection already closed")
+                        disconnected.append(worker_id)
+                    except Exception as e:
+                        logger.error(f"Error pinging worker {worker_id}: {e}")
+                        disconnected.append(worker_id)
+
+                # Clean up disconnected workers
+                for worker_id in disconnected:
+                    if worker_id in self.workers:
+                        logger.info(f"Removing unresponsive worker {worker_id}")
+                        del self.workers[worker_id]
+                        self.chunk_manager.release_worker_chunks(worker_id)
+
+                        # Update stats
+                        self.stats["connected_workers"] = len(self.workers)

-
-
-
-
-
+                        # Also clean up from workers_by_user if it exists
+                        if hasattr(self, "workers_by_user"):
+                            worker_user = (
+                                worker_id.rsplit("_", 1)[0] if "_" in worker_id else worker_id
+                            )
+                            if worker_user in self.workers_by_user:
+                                self.workers_by_user[worker_user].discard(worker_id)
+                                if not self.workers_by_user[worker_user]:
+                                    del self.workers_by_user[worker_user]
+
+                        # Notify monitors
+                        await self._broadcast_stats()
+                        await self._send_activity(
+                            f"Worker {worker_id} removed due to heartbeat timeout"
+                        )
+
+            except Exception as e:
+                logger.error(f"Error in heartbeat loop: {e}", exc_info=True)
+                # Continue the loop even if there's an error
+                await asyncio.sleep(5)

     async def _checkpoint_loop(self):
         """Periodically checkpoint storage."""
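
Note: the rewritten heartbeat relies on the two-step ping of the websockets library: ping() returns a waiter that resolves when the pong arrives, so a bounded wait_for distinguishes slow connections from dead ones. The same check in isolation (a sketch, not package code):

    import asyncio
    import websockets

    async def is_alive(ws, timeout: float = 10.0) -> bool:
        try:
            pong_waiter = await ws.ping()
            await asyncio.wait_for(pong_waiter, timeout=timeout)
            return True
        except (asyncio.TimeoutError, websockets.exceptions.ConnectionClosed):
            return False
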
@@ -1663,7 +2056,10 @@ class Orchestrator:
         )

     async def _stats_update_loop(self):
-        """Periodically update and broadcast stats."""
+        """Periodically update and broadcast stats - non-blocking version."""
+        # Get the event loop for running blocking operations
+        loop = asyncio.get_event_loop()
+
         # Track session start values
         storage_stats = await self.storage.get_storage_stats()
         session_start_outputs = storage_stats["total_captions"]  # This now counts ALL outputs
@@ -1675,8 +2071,8 @@ class Orchestrator:
         while True:
             await asyncio.sleep(10)

-            # Update chunk stats
-            chunk_stats = self.chunk_manager.get_stats()
+            # Update chunk stats in thread pool to avoid blocking
+            chunk_stats = await loop.run_in_executor(None, self.chunk_manager.get_stats)
             storage_stats = await self.storage.get_storage_stats()
             current_total_outputs = storage_stats["total_captions"]  # ALL outputs
             if self.chunk_tracker:
@@ -1690,12 +2086,9 @@ class Orchestrator:
             self.stats["total_outputs"] = current_total_outputs
             self.stats["total_captions"] = current_total_outputs  # Keep for backward compatibility

-            #
-
-
-            self.stats["assigned_chunks"] = sum(
-                len(chunks) for chunks in self.chunk_manager.assigned_chunks.values()
-            )
+            # Get queue stats in thread pool to avoid blocking
+            queue_stats = await loop.run_in_executor(None, self._get_queue_stats)
+            self.stats.update(queue_stats)

             # Calculate if we need more chunks
             worker_count = self.stats.get("connected_workers", 0)
@@ -1754,15 +2147,15 @@ class Orchestrator:
             last_known_total = current_total_outputs

             # Log rate information when workers are connected
-            if (
-                worker_count > 0 and self.rate_tracker["current_rate"] >= 0
-            ):  # Only log non-negative rates
-                logger.info(
-                    f"Rate: {self.rate_tracker['current_rate']:.1f} outputs/min "
-                    f"(avg: {self.rate_tracker['average_rate']:.1f}, "
-                    f"expected: {self.rate_tracker['expected_rate']:.1f}) | "
-                    f"Workers: {worker_count}, Chunks: {active_chunks}/{target_buffer}"
-                )
+            # if (
+            #     worker_count > 0 and self.rate_tracker["current_rate"] >= 0
+            # ):  # Only log non-negative rates
+            #     logger.info(
+            #         f"Rate: {self.rate_tracker['current_rate']:.1f} outputs/min "
+            #         f"(avg: {self.rate_tracker['average_rate']:.1f}, "
+            #         f"expected: {self.rate_tracker['expected_rate']:.1f}) | "
+            #         f"Workers: {worker_count}, Chunks: {active_chunks}/{target_buffer}"
+            #     )

             await self._broadcast_stats()