caption-flow 0.2.0-py3-none-any.whl → 0.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,7 +16,7 @@ import uuid
  from dataclasses import dataclass, asdict
  from datetime import datetime
  from pathlib import Path
- from typing import Dict, Set, Optional, Any, List, Deque
+ from typing import Dict, Set, Optional, Any, List, Deque, Tuple
  from collections import deque, defaultdict
  import threading
  from queue import Queue, Empty
@@ -97,27 +97,9 @@ class ChunkManager:
  self.lock = threading.Lock()
  self.tracker = tracker # Reference to chunk tracker

- def create_chunks_from_shard(
- self, shard_url: str, shard_name: str, total_items: int
- ) -> List[ShardChunk]:
- """Create chunks from a shard."""
- chunks = []
-
- for start_idx in range(0, total_items, self.chunk_size):
- chunk = ShardChunk.create(
- shard_url=shard_url,
- shard_name=shard_name,
- start_index=start_idx,
- chunk_size=min(self.chunk_size, total_items - start_idx),
- )
-
- with self.lock:
- self.chunks[chunk.chunk_id] = chunk
- self.pending_chunks.append(chunk.chunk_id)
-
- chunks.append(chunk)
-
- return chunks
+ # NEW: Track assigned ranges to prevent double allocation
+ # Format: {chunk_id: {(start, end): worker_id}}
+ self.assigned_ranges: Dict[str, Dict[Tuple[int, int], str]] = defaultdict(dict)

  def get_chunks_for_worker(
  self, worker_id: str, count: int = 1, tracker: Optional["ChunkTracker"] = None
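
The heart of the 0.2.0 → 0.2.2 change is this new assigned_ranges map: for each chunk it records which (start, end) range is currently held by which worker, and every hand-out, completion, failure, and disconnect path updates it under the manager's lock. A minimal standalone sketch of that bookkeeping, using illustrative names rather than the package's own API, looks like this:

    import threading
    from collections import defaultdict
    from typing import Dict, Tuple

    class RangeLedger:
        """Toy version of the assigned_ranges bookkeeping introduced in this release."""

        def __init__(self) -> None:
            self.lock = threading.Lock()
            # {chunk_id: {(start, end): worker_id}}
            self.assigned_ranges: Dict[str, Dict[Tuple[int, int], str]] = defaultdict(dict)

        def claim(self, chunk_id: str, start: int, end: int, worker_id: str) -> bool:
            """Record the range for worker_id unless another worker already holds it."""
            with self.lock:
                owner = self.assigned_ranges[chunk_id].get((start, end))
                if owner is not None and owner != worker_id:
                    return False  # refused: this would double-allocate the range
                self.assigned_ranges[chunk_id][(start, end)] = worker_id
                return True

        def release_worker(self, chunk_id: str, worker_id: str) -> int:
            """Drop every range in chunk_id held by worker_id; return how many were dropped."""
            with self.lock:
                held = [k for k, w in self.assigned_ranges[chunk_id].items() if w == worker_id]
                for key in held:
                    del self.assigned_ranges[chunk_id][key]
                return len(held)

    ledger = RangeLedger()
    assert ledger.claim("data-0000:chunk_0", 0, 999, "worker-a")
    assert not ledger.claim("data-0000:chunk_0", 0, 999, "worker-b")  # second worker is refused
    assert ledger.release_worker("data-0000:chunk_0", "worker-a") == 1

The real ChunkManager keys the outer dict by chunk_id in exactly this shape and clears entries in complete_chunk, fail_chunk, release_worker_chunks, and the new mark_ranges_processed, as the hunks below show.
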
@@ -127,7 +109,6 @@ class ChunkManager:

  with self.lock:
  # FIRST PRIORITY: Check if this worker already has assigned chunks
- # Workers should complete their current chunks before getting new ones
  if worker_id in self.assigned_chunks:
  existing_chunk_ids = list(self.assigned_chunks[worker_id])
  for chunk_id in existing_chunk_ids:
@@ -142,12 +123,29 @@ class ChunkManager:
  if tracker:
  chunk_info = tracker.get_chunk_with_unprocessed_items(chunk_id)
  if chunk_info and chunk_info["unprocessed_ranges"]:
- assigned.append(
- {
- "chunk": chunk,
- "unprocessed_ranges": chunk_info["unprocessed_ranges"],
- }
- )
+ # Filter out ranges that are assigned to other workers
+ clean_ranges = []
+ for start, end in chunk_info["unprocessed_ranges"]:
+ range_key = (start, end)
+ if range_key in self.assigned_ranges[chunk_id]:
+ assigned_worker = self.assigned_ranges[chunk_id][range_key]
+ if assigned_worker != worker_id:
+ # Skip this range - it's assigned to another worker
+ logger.warning(
+ f"Skipping range {start}-{end} in chunk {chunk_id} "
+ f"(assigned to {assigned_worker}, not {worker_id})"
+ )
+ continue
+ # else: this worker already owns this range, include it
+ clean_ranges.append((start, end))
+
+ if clean_ranges:
+ assigned.append(
+ {
+ "chunk": chunk,
+ "unprocessed_ranges": clean_ranges,
+ }
+ )
  else:
  # No tracker, assume chunk needs processing
  assigned.append(
@@ -158,7 +156,6 @@ class ChunkManager:
  )

  # SECOND PRIORITY: Get new pending chunks
- # Only if worker doesn't have enough chunks already
  while len(assigned) < count and self.pending_chunks:
  chunk_id = self.pending_chunks.popleft()
  chunk = self.chunks.get(chunk_id)
@@ -166,7 +163,7 @@ class ChunkManager:
  if not chunk:
  continue

- # Verify chunk is truly pending (defensive check)
+ # Verify chunk is truly pending
  if chunk.status != "pending" or chunk.assigned_to is not None:
  logger.warning(
  f"Chunk {chunk_id} in pending queue but status={chunk.status}, assigned_to={chunk.assigned_to}"
@@ -179,15 +176,48 @@ class ChunkManager:
  chunk.assigned_at = datetime.utcnow()
  self.assigned_chunks[worker_id].add(chunk_id)

- # Get unprocessed ranges
+ # Get unprocessed ranges and filter out any that are somehow already assigned
  unprocessed_ranges = [(0, chunk.chunk_size - 1)] # Default
  if tracker:
  chunk_info = tracker.get_chunk_with_unprocessed_items(chunk_id)
  if chunk_info:
- unprocessed_ranges = chunk_info["unprocessed_ranges"]
+ # Filter out any ranges that are already assigned (shouldn't happen for new chunks)
+ clean_ranges = []
+ for start, end in chunk_info["unprocessed_ranges"]:
+ range_key = (start, end)
+ if range_key not in self.assigned_ranges[chunk_id]:
+ clean_ranges.append((start, end))
+ else:
+ logger.error(
+ f"Range {start}-{end} in newly assigned chunk {chunk_id} "
+ f"is already assigned to {self.assigned_ranges[chunk_id][range_key]}!"
+ )
+ unprocessed_ranges = clean_ranges if clean_ranges else []
+
  tracker.mark_assigned(chunk_id, worker_id)

- assigned.append({"chunk": chunk, "unprocessed_ranges": unprocessed_ranges})
+ if unprocessed_ranges:
+ assigned.append({"chunk": chunk, "unprocessed_ranges": unprocessed_ranges})
+
+ # Track assigned ranges and verify no double allocation
+ for info in assigned:
+ chunk_id = info["chunk"].chunk_id
+ for start, end in info["unprocessed_ranges"]:
+ range_key = (start, end)
+
+ # Check if this range is already assigned
+ if range_key in self.assigned_ranges[chunk_id]:
+ existing_worker = self.assigned_ranges[chunk_id][range_key]
+ if existing_worker != worker_id:
+ # This should never happen - raise assertion
+ raise AssertionError(
+ f"CRITICAL: Attempting to assign range {start}-{end} in chunk {chunk_id} "
+ f"to worker {worker_id}, but it's already assigned to {existing_worker}! "
+ f"This would cause duplicate processing."
+ )
+
+ # Track this assignment
+ self.assigned_ranges[chunk_id][range_key] = worker_id

  # Log what we're assigning
  if assigned:
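
Taken together, the two filtering passes above implement one rule: a range is handed to a worker only if it is unassigned or already owned by that worker, and anything that slips through still trips the AssertionError before it can be given out twice. A self-contained distillation of that rule (the helper name is ours, not the package's):

    from typing import Dict, List, Tuple

    def filter_ranges(
        unprocessed: List[Tuple[int, int]],
        assigned: Dict[Tuple[int, int], str],
        worker_id: str,
    ) -> List[Tuple[int, int]]:
        """Keep only the ranges that are free or already owned by worker_id."""
        clean = []
        for start, end in unprocessed:
            owner = assigned.get((start, end))
            if owner is None or owner == worker_id:
                clean.append((start, end))
        return clean

    # Worker B asks for a chunk whose first range is still held by worker A.
    assigned = {(0, 499): "worker-a"}
    print(filter_ranges([(0, 499), (500, 999)], assigned, "worker-b"))  # [(500, 999)]
    print(filter_ranges([(0, 499), (500, 999)], assigned, "worker-a"))  # both ranges come back
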
@@ -199,6 +229,12 @@ class ChunkManager:
  )
  logger.info(f"Assigning to worker {worker_id}: {chunk_summary}")

+ # Detailed range logging for debugging
+ for info in assigned:
+ chunk_id = info["chunk"].chunk_id
+ ranges_str = ", ".join([f"{s}-{e}" for s, e in info["unprocessed_ranges"]])
+ logger.debug(f" Chunk {chunk_id} ranges: {ranges_str}")
+
  return assigned

  def complete_chunk(self, chunk_id: str, worker_id: str) -> bool:
@@ -210,6 +246,16 @@ class ChunkManager:
  chunk.status = "completed"
  chunk.completed_at = datetime.utcnow()
  self.assigned_chunks[worker_id].discard(chunk_id)
+
+ # Clear assigned ranges for this chunk
+ if chunk_id in self.assigned_ranges:
+ # Log what ranges we're clearing
+ ranges_to_clear = list(self.assigned_ranges[chunk_id].keys())
+ logger.debug(
+ f"Clearing {len(ranges_to_clear)} assigned ranges for completed chunk {chunk_id}"
+ )
+ del self.assigned_ranges[chunk_id]
+
  return True
  return False

@@ -224,6 +270,20 @@ class ChunkManager:
  chunk.assigned_at = None
  self.assigned_chunks[worker_id].discard(chunk_id)
  self.pending_chunks.append(chunk_id)
+
+ # Clear assigned ranges for this chunk/worker
+ if chunk_id in self.assigned_ranges:
+ ranges_to_clear = [
+ range_key
+ for range_key, assigned_worker in self.assigned_ranges[chunk_id].items()
+ if assigned_worker == worker_id
+ ]
+ for range_key in ranges_to_clear:
+ del self.assigned_ranges[chunk_id][range_key]
+ logger.debug(
+ f"Cleared {len(ranges_to_clear)} assigned ranges for failed chunk {chunk_id}"
+ )
+
  return True
  return False

@@ -240,18 +300,62 @@ class ChunkManager:
  chunk.assigned_at = None
  self.pending_chunks.append(chunk_id)

+ # Clear assigned ranges for this worker
+ if chunk_id in self.assigned_ranges:
+ ranges_to_clear = [
+ range_key
+ for range_key, assigned_worker in self.assigned_ranges[
+ chunk_id
+ ].items()
+ if assigned_worker == worker_id
+ ]
+ for range_key in ranges_to_clear:
+ del self.assigned_ranges[chunk_id][range_key]
+
+ if ranges_to_clear:
+ logger.info(
+ f"Released {len(ranges_to_clear)} ranges from chunk {chunk_id} "
+ f"previously assigned to disconnected worker {worker_id}"
+ )
+
  if worker_id in self.assigned_chunks:
  del self.assigned_chunks[worker_id]

+ def mark_ranges_processed(
+ self, chunk_id: str, processed_ranges: List[Tuple[int, int]], worker_id: str
+ ):
+ """Remove ranges from assignment tracking once they're processed."""
+ with self.lock:
+ if chunk_id in self.assigned_ranges:
+ for start, end in processed_ranges:
+ range_key = (start, end)
+ if range_key in self.assigned_ranges[chunk_id]:
+ assigned_worker = self.assigned_ranges[chunk_id][range_key]
+ if assigned_worker == worker_id:
+ del self.assigned_ranges[chunk_id][range_key]
+ logger.debug(
+ f"Cleared assignment of range {start}-{end} in chunk {chunk_id} "
+ f"after processing by {worker_id}"
+ )
+ else:
+ logger.warning(
+ f"Worker {worker_id} claims to have processed range {start}-{end} "
+ f"in chunk {chunk_id}, but it was assigned to {assigned_worker}"
+ )
+
  def get_stats(self) -> Dict[str, int]:
  """Get chunk statistics."""
  with self.lock:
+ # Count total assigned ranges
+ total_assigned_ranges = sum(len(ranges) for ranges in self.assigned_ranges.values())
+
  stats = {
  "total": len(self.chunks),
  "pending": len(self.pending_chunks),
  "assigned": sum(len(chunks) for chunks in self.assigned_chunks.values()),
  "completed": sum(1 for c in self.chunks.values() if c.status == "completed"),
  "failed": sum(1 for c in self.chunks.values() if c.status == "failed"),
+ "assigned_ranges": total_assigned_ranges,
  }
  return stats

@@ -363,6 +467,7 @@ class Orchestrator:
  self.ssl_context = self._setup_ssl()

  # Statistics
+ self.is_generating_stats = False
  self.stats = {
  "total_chunks": 0,
  "completed_chunks": 0,
@@ -490,13 +595,15 @@ class Orchestrator:
  with self.chunk_manager.lock:
  for chunk_state in shard_info["chunks"]:
  if chunk_state.status in ["pending", "failed", "assigned"]:
- # ChunkState already has shard_url stored
+ # For assigned chunks, reset them to pending since workers don't exist
  chunk = ShardChunk(
  chunk_id=chunk_state.chunk_id,
  shard_url=chunk_state.shard_url,
  shard_name=chunk_state.shard_name,
  start_index=chunk_state.start_index,
  chunk_size=chunk_state.chunk_size,
+ status="pending", # Reset to pending
+ assigned_to=None, # Clear assignment
  )
  self.chunk_manager.chunks[chunk_state.chunk_id] = chunk
  self.chunk_manager.pending_chunks.append(chunk_state.chunk_id)
@@ -1409,28 +1516,36 @@ class Orchestrator:
  finally:
  del self.data_workers[worker_id]

- async def _handle_monitor(self, websocket: WebSocketServerProtocol):
- """Handle monitor connection."""
- self.monitors.add(websocket)
- logger.info("Monitor connected")
-
+ async def _send_leaderboard_to_monitor(self, websocket: WebSocketServerProtocol):
+ """Send leaderboard data to a specific monitor."""
+ total_start = time.time()
  try:
- # Send initial stats
- await websocket.send(safe_json_dumps({"type": "stats", "data": self.stats}))
-
- # Send chunk stats
- chunk_stats = self.chunk_manager.get_stats()
- await websocket.send(safe_json_dumps({"type": "chunk_stats", "data": chunk_stats}))
+ if websocket not in self.monitors:
+ return

- # Send contributor leaderboard with active worker counts
+ # Get contributors asynchronously
+ contributors_start = time.time()
  contributors = await self.storage.get_top_contributors(10)
+ logger.debug(
+ f"Contributors retrieved in {(time.time() - contributors_start)*1000:.1f}ms"
+ )

- # Enhance contributor data with active worker counts
- enhanced_contributors = []
- worker_counts = (
- self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+ # Get worker counts in thread pool
+ worker_counts_start = time.time()
+ loop = asyncio.get_event_loop()
+ worker_counts = await loop.run_in_executor(
+ None,
+ lambda: (
+ self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+ ),
+ )
+ logger.debug(
+ f"Worker counts retrieved in {(time.time() - worker_counts_start)*1000:.1f}ms"
  )

+ # Build enhanced contributors list
+ build_start = time.time()
+ enhanced_contributors = []
  for contributor in contributors:
  contrib_dict = {
  "contributor_id": contributor.contributor_id,
@@ -1442,40 +1557,157 @@ class Orchestrator:
  ),
  }
  enhanced_contributors.append(contrib_dict)
+ logger.debug(f"Enhanced contributors built in {(time.time() - build_start)*1000:.1f}ms")

- await websocket.send(
- safe_json_dumps({"type": "leaderboard", "data": enhanced_contributors})
+ # Cache for future monitors
+ self._cached_leaderboard = enhanced_contributors
+
+ # Send if still connected
+ if websocket in self.monitors:
+ send_start = time.time()
+ await websocket.send(
+ safe_json_dumps({"type": "leaderboard", "data": enhanced_contributors})
+ )
+ logger.debug(
+ f"Leaderboard sent to monitor in {(time.time() - send_start)*1000:.1f}ms"
+ )
+
+ logger.debug(
+ f"Leaderboard send to monitor completed in {(time.time() - total_start)*1000:.1f}ms"
  )

- # Keep connection alive
- async for _ in websocket:
- pass
+ except websockets.exceptions.ConnectionClosed:
+ logger.debug("Monitor disconnected during leaderboard send")
+ except Exception as e:
+ logger.error(f"Error sending leaderboard to monitor: {e}")
+
+ async def _send_initial_monitor_data(self, websocket: WebSocketServerProtocol):
+ """Send initial data to monitor in a separate task to avoid blocking."""
+ total_start = time.time()
+ try:
+ # Check if websocket is still in monitors set
+ if websocket not in self.monitors:
+ logger.debug("Monitor disconnected before initial data send")
+ return
+
+ # Send current stats (already in memory)
+ stats_start = time.time()
+ await websocket.send(safe_json_dumps({"type": "stats", "data": self.stats}))
+ logger.debug(f"Monitor stats sent in {(time.time() - stats_start)*1000:.1f}ms")
+
+ # Get chunk stats asynchronously
+ chunk_stats_start = time.time()
+ loop = asyncio.get_event_loop()
+ chunk_stats = await loop.run_in_executor(None, self.chunk_manager.get_stats)
+ logger.debug(f"Chunk stats retrieved in {(time.time() - chunk_stats_start)*1000:.1f}ms")
+
+ if websocket not in self.monitors:
+ return
+
+ chunk_send_start = time.time()
+ await websocket.send(safe_json_dumps({"type": "chunk_stats", "data": chunk_stats}))
+ logger.debug(f"Chunk stats sent in {(time.time() - chunk_send_start)*1000:.1f}ms")
+
+ # For leaderboard, check if we have a cached version first
+ if hasattr(self, "_cached_leaderboard") and self._cached_leaderboard:
+ # Use cached leaderboard if available
+ cache_send_start = time.time()
+ await websocket.send(
+ safe_json_dumps({"type": "leaderboard", "data": self._cached_leaderboard})
+ )
+ logger.debug(
+ f"Cached leaderboard sent in {(time.time() - cache_send_start)*1000:.1f}ms"
+ )
+ else:
+ # Schedule leaderboard update separately
+ leaderboard_task_start = time.time()
+ asyncio.create_task(self._send_leaderboard_to_monitor(websocket))
+ logger.debug(
+ f"Leaderboard task created in {(time.time() - leaderboard_task_start)*1000:.1f}ms"
+ )
+
+ logger.debug(
+ f"Monitor initial data send completed in {(time.time() - total_start)*1000:.1f}ms"
+ )
+
+ except websockets.exceptions.ConnectionClosed:
+ logger.debug("Monitor disconnected during initial data send")
+ except Exception as e:
+ logger.error(f"Error sending initial monitor data: {e}")
+
+ async def _handle_monitor(self, websocket: WebSocketServerProtocol):
+ """Handle monitor connection - truly non-blocking version."""
+ monitor_start = time.time()
+ self.monitors.add(websocket)
+ logger.info(f"Monitor connected (total monitors: {len(self.monitors)})")
+
+ try:
+ # Send welcome message immediately
+ welcome_start = time.time()
+ await websocket.send(safe_json_dumps({"type": "welcome", "role": "monitor"}))
+ logger.debug(f"Monitor welcome sent in {(time.time() - welcome_start)*1000:.1f}ms")
+
+ # Schedule initial data send as a separate task to avoid blocking
+ task_create_start = time.time()
+ asyncio.create_task(self._send_initial_monitor_data(websocket))
+ logger.debug(
+ f"Monitor initial data task created in {(time.time() - task_create_start)*1000:.1f}ms"
+ )
+
+ # Just keep the connection alive - no blocking work here
+ try:
+ async for message in websocket:
+ # Handle any incoming messages from monitor if needed
+ # For now, just ignore them
+ pass
+ except websockets.exceptions.ConnectionClosed:
+ pass # Normal disconnection

  except websockets.exceptions.ConnectionClosed:
  logger.info("Monitor disconnected")
+ except Exception as e:
+ logger.error(f"Error in monitor handler: {e}")
  finally:
  self.monitors.discard(websocket)
+ logger.debug(f"Monitor handler completed in {(time.time() - monitor_start)*1000:.1f}ms")

  async def _broadcast_stats(self):
- """Broadcast statistics to all monitors - enhanced for multi-stage."""
+ """Broadcast statistics to all monitors - truly non-blocking version."""
  if not self.monitors:
  return
+ if self.is_generating_stats:
+ return # Already generating stats, skip this call
+ self.is_generating_stats = True
+ total_start = time.time()

- # Get storage stats
+ # Prepare all the data first
+ data_prep_start = time.time()
+ loop = asyncio.get_event_loop()
+
+ # Get storage stats (already async)
+ storage_stats_start = time.time()
  storage_stats = await self.storage.get_storage_stats()
+ logger.debug(f"Storage stats retrieved in {(time.time() - storage_stats_start)*1000:.1f}ms")
+
+ caption_stats_start = time.time()
  caption_stats = await self.storage.get_caption_stats()
+ logger.debug(f"Caption stats retrieved in {(time.time() - caption_stats_start)*1000:.1f}ms")

- # Include chunk stats
- chunk_stats = self.chunk_manager.get_stats()
- self.stats.update({f"chunks_{k}": v for k, v in chunk_stats.items()})
+ # Get chunk stats in thread pool
+ chunk_stats_start = time.time()
+ chunk_stats = await loop.run_in_executor(None, self.chunk_manager.get_stats)
+ logger.debug(f"Chunk stats retrieved in {(time.time() - chunk_stats_start)*1000:.1f}ms")

- # Merge storage stats
- self.stats.update(storage_stats)
- self.stats["field_breakdown"] = caption_stats.get("field_stats", {})
- self.stats["output_fields_list"] = caption_stats.get("output_fields", [])
+ # Build stats dict
+ build_stats_start = time.time()
+ stats_update = self.stats.copy()
+ stats_update.update({f"chunks_{k}": v for k, v in chunk_stats.items()})
+ stats_update.update(storage_stats)
+ stats_update["field_breakdown"] = caption_stats.get("field_stats", {})
+ stats_update["output_fields_list"] = caption_stats.get("output_fields", [])

  # Add rate information
- self.stats.update(
+ stats_update.update(
  {
  "current_rate": self.rate_tracker["current_rate"],
  "average_rate": self.rate_tracker["average_rate"],
@@ -1483,41 +1715,106 @@ class Orchestrator:
  }
  )

- # Add vLLM info - now includes stage count
- self.stats["vllm_model"] = self.vllm_config.get("model", "unknown")
- self.stats["vllm_batch_size"] = self.vllm_config.get("batch_size", 0)
+ # Add vLLM info
+ stats_update["vllm_model"] = self.vllm_config.get("model", "unknown")
+ stats_update["vllm_batch_size"] = self.vllm_config.get("batch_size", 0)

- # NEW: Add stage information
+ # Add stage information
  stages = self.vllm_config.get("stages", [])
  if stages:
- self.stats["stage_count"] = len(stages)
- self.stats["stage_names"] = [s.get("name", "unnamed") for s in stages]
+ stats_update["stage_count"] = len(stages)
+ stats_update["stage_names"] = [s.get("name", "unnamed") for s in stages]
  else:
- self.stats["stage_count"] = 1 # Backward compatibility
- self.stats["stage_names"] = ["default"]
+ stats_update["stage_count"] = 1
+ stats_update["stage_names"] = ["default"]

+ # Get field stats
+ field_stats_start = time.time()
  field_stats = await self.storage.get_output_field_stats()
- self.stats["output_fields"] = field_stats
+ stats_update["output_fields"] = field_stats
+ logger.debug(f"Field stats retrieved in {(time.time() - field_stats_start)*1000:.1f}ms")

- message = safe_json_dumps({"type": "stats", "data": self.stats})
+ # Update our internal stats
+ self.stats = stats_update
+ logger.debug(f"Stats prepared in {(time.time() - build_stats_start)*1000:.1f}ms")

- # Send to all monitors
- disconnected = set()
- _monitors = self.monitors.copy()
- for monitor in _monitors:
+ logger.debug(f"Total data preparation took {(time.time() - data_prep_start)*1000:.1f}ms")
+
+ # Create message once
+ message_create_start = time.time()
+ stats_message = safe_json_dumps({"type": "stats", "data": self.stats})
+ logger.debug(f"Stats message created in {(time.time() - message_create_start)*1000:.1f}ms")
+
+ # Send to all monitors asynchronously in parallel
+ send_start = time.time()
+
+ async def send_to_monitor(monitor):
  try:
- await monitor.send(message)
+ await monitor.send(stats_message)
  except websockets.exceptions.ConnectionClosed:
- disconnected.add(monitor)
+ return monitor # Return for removal
+ except Exception as e:
+ logger.debug(f"Error sending stats to monitor: {e}")
+ return monitor # Return for removal
+ return None
+
+ # Send to all monitors in parallel
+ monitors_copy = self.monitors.copy()
+ results = await asyncio.gather(
+ *[send_to_monitor(m) for m in monitors_copy], return_exceptions=True
+ )
+
+ # Remove disconnected monitors
+ disconnected = {
+ m
+ for m, r in zip(monitors_copy, results)
+ if r is not None and not isinstance(r, Exception)
+ }
+ self.monitors -= disconnected
+
+ logger.debug(
+ f"Stats sent to {len(monitors_copy)} monitors in {(time.time() - send_start)*1000:.1f}ms"
+ )
+
+ # Send leaderboard update in a separate task to avoid blocking
+ leaderboard_task_start = time.time()
+ asyncio.create_task(self._broadcast_leaderboard())
+ self.is_generating_stats = False
+ logger.debug(
+ f"Leaderboard broadcast task created in {(time.time() - leaderboard_task_start)*1000:.1f}ms"
+ )
+ logger.debug(f"Stats broadcast completed in {(time.time() - total_start)*1000:.1f}ms")

- # send updated leaderboard
+ async def _broadcast_leaderboard(self):
+ """Send leaderboard updates to monitors - separate from stats to avoid blocking."""
+ if not self.monitors:
+ return
+
+ total_start = time.time()
  try:
+ # Get contributors
+ contributors_start = time.time()
  contributors = await self.storage.get_top_contributors(10)
- enhanced_contributors = []
- worker_counts = (
- self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+ logger.debug(
+ f"Contributors retrieved for broadcast in {(time.time() - contributors_start)*1000:.1f}ms"
+ )
+
+ # Get worker counts
+ worker_counts_start = time.time()
+ loop = asyncio.get_event_loop()
+ worker_counts = await loop.run_in_executor(
+ None,
+ lambda: (
+ self.get_workers_by_user_stats() if hasattr(self, "workers_by_user") else {}
+ ),
+ )
+ logger.debug(
+ f"Worker counts retrieved for broadcast in {(time.time() - worker_counts_start)*1000:.1f}ms"
  )

+ # Build enhanced contributors list
+ build_start = time.time()
+ enhanced_contributors = []
  for contributor in contributors:
  contrib_dict = {
  "contributor_id": contributor.contributor_id,
@@ -1529,26 +1826,64 @@ class Orchestrator:
  ),
  }
  enhanced_contributors.append(contrib_dict)
+ logger.debug(
+ f"Enhanced contributors built for broadcast in {(time.time() - build_start)*1000:.1f}ms"
+ )

+ # Cache it
+ self._cached_leaderboard = enhanced_contributors
+
+ # Create message once
+ message_create_start = time.time()
  leaderboard_message = safe_json_dumps(
  {"type": "leaderboard", "data": enhanced_contributors}
  )
+ logger.debug(
+ f"Leaderboard message created in {(time.time() - message_create_start)*1000:.1f}ms"
+ )
+
+ # Send to all monitors in parallel
+ send_start = time.time()

- # Send to all monitors
- disconnected = set()
- for monitor in self.monitors.copy():
+ async def send_leaderboard(monitor):
  try:
  await monitor.send(leaderboard_message)
- except websockets.exceptions.ConnectionClosed:
- disconnected.add(monitor)
+ except:
+ return monitor # Mark for removal
+ return None
+
+ monitors_copy = self.monitors.copy()
+ results = await asyncio.gather(
+ *[send_leaderboard(m) for m in monitors_copy], return_exceptions=True
+ )

+ # Remove disconnected
+ disconnected = {
+ m
+ for m, r in zip(monitors_copy, results)
+ if r is not None and not isinstance(r, Exception)
+ }
  self.monitors -= disconnected

- except Exception as e:
- logger.error(f"Error sending leaderboard update: {e}")
+ logger.debug(
+ f"Leaderboard sent to {len(monitors_copy)} monitors in {(time.time() - send_start)*1000:.1f}ms"
+ )
+ logger.debug(
+ f"Leaderboard broadcast completed in {(time.time() - total_start)*1000:.1f}ms"
+ )

- # Clean up disconnected monitors
- self.monitors -= disconnected
+ except Exception as e:
+ logger.error(f"Error broadcasting leaderboard: {e}")
+
+ def _get_queue_stats(self) -> Dict[str, int]:
+ """Get queue statistics - synchronous helper for thread pool."""
+ with self.chunk_manager.lock:
+ return {
+ "pending_chunks": len(self.chunk_manager.pending_chunks),
+ "assigned_chunks": sum(
+ len(chunks) for chunks in self.chunk_manager.assigned_chunks.values()
+ ),
+ }

  async def _flush_processed_items(self):
  """Flush batched processed items to chunk tracker."""
@@ -1582,21 +1917,37 @@ class Orchestrator:
  # Don't forget the last range
  ranges.append((start, end))

- # Mark ranges as processed (mark_items_processed expects absolute indices)
+ # Mark ranges as processed
  for start_idx, end_idx in ranges:
  self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)

+ with self.chunk_manager.lock:
+ if chunk_id in self.chunk_manager.assigned_ranges:
+ for start_idx, end_idx in ranges:
+ # Clear any assignments in this range
+ to_remove = []
+ for range_start, range_end in self.chunk_manager.assigned_ranges[
+ chunk_id
+ ]:
+ if range_start >= start_idx and range_end <= end_idx:
+ to_remove.append((range_start, range_end))
+
+ for range_key in to_remove:
+ del self.chunk_manager.assigned_ranges[chunk_id][range_key]
+
  # Clear pending items
  self.pending_processed_items.clear()
  self.last_item_batch_flush = time.time()

  def get_workers_by_user_stats(self) -> Dict[str, Any]:
- """Get statistics about workers grouped by user/token."""
+ """Get statistics about workers grouped by user/token - thread-safe version."""
  if not hasattr(self, "workers_by_user"):
  return {}

+ # Create a copy to avoid issues with concurrent modification
  stats = {}
- for user, worker_ids in self.workers_by_user.items():
+ workers_snapshot = dict(self.workers_by_user)
+ for user, worker_ids in workers_snapshot.items():
  stats[user] = {"worker_count": len(worker_ids), "worker_ids": list(worker_ids)}
  return stats

@@ -1621,21 +1972,63 @@ class Orchestrator:
  async def _heartbeat_loop(self):
  """Send periodic heartbeats to maintain connections."""
  while True:
- await asyncio.sleep(30)
+ try:
+ await asyncio.sleep(30)

- # Ping workers
- disconnected = []
- for worker_id, ws in self.workers.items():
- try:
- await ws.ping()
- except:
- disconnected.append(worker_id)
+ # Create a copy of worker items to avoid modification during iteration
+ worker_items = list(self.workers.items())
+ disconnected = []
+
+ for worker_id, ws in worker_items:
+ try:
+ # Check if worker still exists before pinging
+ if worker_id not in self.workers:
+ continue
+
+ # Send ping with timeout
+ pong_waiter = await ws.ping()
+ try:
+ await asyncio.wait_for(pong_waiter, timeout=10)
+ except asyncio.TimeoutError:
+ logger.warning(f"Worker {worker_id} failed to respond to ping")
+ disconnected.append(worker_id)
+ except websockets.exceptions.ConnectionClosed:
+ logger.info(f"Worker {worker_id} connection already closed")
+ disconnected.append(worker_id)
+ except Exception as e:
+ logger.error(f"Error pinging worker {worker_id}: {e}")
+ disconnected.append(worker_id)
+
+ # Clean up disconnected workers
+ for worker_id in disconnected:
+ if worker_id in self.workers:
+ logger.info(f"Removing unresponsive worker {worker_id}")
+ del self.workers[worker_id]
+ self.chunk_manager.release_worker_chunks(worker_id)
+
+ # Update stats
+ self.stats["connected_workers"] = len(self.workers)

- # Clean up disconnected workers
- for worker_id in disconnected:
- if worker_id in self.workers:
- del self.workers[worker_id]
- self.chunk_manager.release_worker_chunks(worker_id)
+ # Also clean up from workers_by_user if it exists
+ if hasattr(self, "workers_by_user"):
+ worker_user = (
+ worker_id.rsplit("_", 1)[0] if "_" in worker_id else worker_id
+ )
+ if worker_user in self.workers_by_user:
+ self.workers_by_user[worker_user].discard(worker_id)
+ if not self.workers_by_user[worker_user]:
+ del self.workers_by_user[worker_user]
+
+ # Notify monitors
+ await self._broadcast_stats()
+ await self._send_activity(
+ f"Worker {worker_id} removed due to heartbeat timeout"
+ )
+
+ except Exception as e:
+ logger.error(f"Error in heartbeat loop: {e}", exc_info=True)
+ # Continue the loop even if there's an error
+ await asyncio.sleep(5)

  async def _checkpoint_loop(self):
  """Periodically checkpoint storage."""
@@ -1663,7 +2056,10 @@ class Orchestrator:
  )

  async def _stats_update_loop(self):
- """Periodically update and broadcast stats."""
+ """Periodically update and broadcast stats - non-blocking version."""
+ # Get the event loop for running blocking operations
+ loop = asyncio.get_event_loop()
+
  # Track session start values
  storage_stats = await self.storage.get_storage_stats()
  session_start_outputs = storage_stats["total_captions"] # This now counts ALL outputs
@@ -1675,8 +2071,8 @@ class Orchestrator:
  while True:
  await asyncio.sleep(10)

- # Update chunk stats
- chunk_stats = self.chunk_manager.get_stats()
+ # Update chunk stats in thread pool to avoid blocking
+ chunk_stats = await loop.run_in_executor(None, self.chunk_manager.get_stats)
  storage_stats = await self.storage.get_storage_stats()
  current_total_outputs = storage_stats["total_captions"] # ALL outputs
  if self.chunk_tracker:
@@ -1690,12 +2086,9 @@ class Orchestrator:
  self.stats["total_outputs"] = current_total_outputs
  self.stats["total_captions"] = current_total_outputs # Keep for backward compatibility

- # Add queue information
- with self.chunk_manager.lock:
- self.stats["pending_chunks"] = len(self.chunk_manager.pending_chunks)
- self.stats["assigned_chunks"] = sum(
- len(chunks) for chunks in self.chunk_manager.assigned_chunks.values()
- )
+ # Get queue stats in thread pool to avoid blocking
+ queue_stats = await loop.run_in_executor(None, self._get_queue_stats)
+ self.stats.update(queue_stats)

  # Calculate if we need more chunks
  worker_count = self.stats.get("connected_workers", 0)
@@ -1754,15 +2147,15 @@ class Orchestrator:
  last_known_total = current_total_outputs

  # Log rate information when workers are connected
- if (
- worker_count > 0 and self.rate_tracker["current_rate"] >= 0
- ): # Only log non-negative rates
- logger.info(
- f"Rate: {self.rate_tracker['current_rate']:.1f} outputs/min "
- f"(avg: {self.rate_tracker['average_rate']:.1f}, "
- f"expected: {self.rate_tracker['expected_rate']:.1f}) | "
- f"Workers: {worker_count}, Chunks: {active_chunks}/{target_buffer}"
- )
+ # if (
+ # worker_count > 0 and self.rate_tracker["current_rate"] >= 0
+ # ): # Only log non-negative rates
+ # logger.info(
+ # f"Rate: {self.rate_tracker['current_rate']:.1f} outputs/min "
+ # f"(avg: {self.rate_tracker['average_rate']:.1f}, "
+ # f"expected: {self.rate_tracker['expected_rate']:.1f}) | "
+ # f"Workers: {worker_count}, Chunks: {active_chunks}/{target_buffer}"
+ # )

  await self._broadcast_stats()