speedy-utils 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_utils/chat_format/display.py +17 -4
- llm_utils/lm/async_lm/__init__.py +2 -0
- llm_utils/lm/async_lm/_utils.py +198 -0
- llm_utils/lm/async_lm/async_llm_task.py +154 -0
- llm_utils/lm/{async_lm.py → async_lm/async_lm.py} +191 -354
- llm_utils/scripts/vllm_load_balancer.py +220 -135
- {speedy_utils-1.1.5.dist-info → speedy_utils-1.1.6.dist-info}/METADATA +1 -1
- {speedy_utils-1.1.5.dist-info → speedy_utils-1.1.6.dist-info}/RECORD +10 -7
- {speedy_utils-1.1.5.dist-info → speedy_utils-1.1.6.dist-info}/WHEEL +0 -0
- {speedy_utils-1.1.5.dist-info → speedy_utils-1.1.6.dist-info}/entry_points.txt +0 -0
|
@@ -17,6 +17,7 @@ from speedy_utils import setup_logger
|
|
|
17
17
|
|
|
18
18
|
setup_logger(min_interval=5)
|
|
19
19
|
|
|
20
|
+
|
|
20
21
|
# --- CLI Argument Parsing ---
|
|
21
22
|
def parse_args():
|
|
22
23
|
parser = argparse.ArgumentParser(
|
|
@@ -27,14 +28,16 @@ Examples:
|
|
|
27
28
|
python vllm_load_balancer.py 8001 --ports 8140,8150,8160
|
|
28
29
|
python vllm_load_balancer.py 8080 --ports 8140,8150 --host 192.168.1.100
|
|
29
30
|
python vllm_load_balancer.py 8001 --ports 8140,8150 --status-interval 3
|
|
31
|
+
python vllm_load_balancer.py 8001 --ports 8140,8150 --throttle-ms 10
|
|
30
32
|
|
|
31
33
|
Features:
|
|
32
34
|
• Real-time dashboard with color-coded status
|
|
33
35
|
• Automatic health checks and failover
|
|
34
36
|
• Least-connections load balancing
|
|
37
|
+
• Request throttling to prevent server overload
|
|
35
38
|
• Professional terminal interface
|
|
36
39
|
• Connection statistics and monitoring
|
|
37
|
-
"""
|
|
40
|
+
""",
|
|
38
41
|
)
|
|
39
42
|
parser.add_argument(
|
|
40
43
|
"port",
|
|
@@ -71,8 +74,15 @@ Features:
|
|
|
71
74
|
default=None,
|
|
72
75
|
help="Port for the HTTP stats dashboard (default: proxy port + 1)",
|
|
73
76
|
)
|
|
77
|
+
parser.add_argument(
|
|
78
|
+
"--throttle-ms",
|
|
79
|
+
type=float,
|
|
80
|
+
default=30.0,
|
|
81
|
+
help="Minimum milliseconds between requests to same server (default: 30ms)",
|
|
82
|
+
)
|
|
74
83
|
return parser.parse_args()
|
|
75
84
|
|
|
85
|
+
|
|
76
86
|
# --- Configuration (populated from CLI) ---
|
|
77
87
|
LOAD_BALANCER_HOST = "0.0.0.0"
|
|
78
88
|
LOAD_BALANCER_PORT = 8008 # Will be overwritten by CLI
|
|
@@ -81,11 +91,13 @@ BACKEND_HOST = "localhost" # Will be overwritten by CLI
|
|
|
81
91
|
BACKEND_PORTS = [] # Will be overwritten by CLI
|
|
82
92
|
STATUS_PRINT_INTERVAL = 5
|
|
83
93
|
HEALTH_CHECK_TIMEOUT = 2
|
|
94
|
+
THROTTLE_MS = 5.0 # Will be overwritten by CLI
|
|
84
95
|
BUFFER_SIZE = 4096
|
|
85
96
|
|
|
86
97
|
# --- Global Shared State ---
|
|
87
98
|
available_servers = []
|
|
88
99
|
connection_counts = defaultdict(int)
|
|
100
|
+
last_send_times = defaultdict(float) # Track last send time per server
|
|
89
101
|
state_lock = asyncio.Lock()
|
|
90
102
|
start_time = None
|
|
91
103
|
total_connections_served = 0
|
|
@@ -95,10 +107,10 @@ current_active_connections = 0
|
|
|
95
107
|
# --- Terminal Utilities ---
|
|
96
108
|
def clear_terminal():
|
|
97
109
|
"""Clear terminal screen with cross-platform support."""
|
|
98
|
-
if os.name ==
|
|
99
|
-
os.system(
|
|
110
|
+
if os.name == "nt": # Windows
|
|
111
|
+
os.system("cls")
|
|
100
112
|
else: # Unix/Linux/MacOS
|
|
101
|
-
os.system(
|
|
113
|
+
os.system("clear")
|
|
102
114
|
|
|
103
115
|
|
|
104
116
|
def get_terminal_size():
|
|
@@ -114,12 +126,12 @@ def format_uptime(start_time):
|
|
|
114
126
|
"""Format uptime in a human-readable way."""
|
|
115
127
|
if not start_time:
|
|
116
128
|
return "Unknown"
|
|
117
|
-
|
|
129
|
+
|
|
118
130
|
uptime_seconds = time.time() - start_time
|
|
119
131
|
hours = int(uptime_seconds // 3600)
|
|
120
132
|
minutes = int((uptime_seconds % 3600) // 60)
|
|
121
133
|
seconds = int(uptime_seconds % 60)
|
|
122
|
-
|
|
134
|
+
|
|
123
135
|
if hours > 0:
|
|
124
136
|
return f"{hours}h {minutes}m {seconds}s"
|
|
125
137
|
elif minutes > 0:
|
|
@@ -132,7 +144,7 @@ def print_banner():
|
|
|
132
144
|
"""Print a professional startup banner."""
|
|
133
145
|
columns, _ = get_terminal_size()
|
|
134
146
|
banner_width = min(columns - 4, 80)
|
|
135
|
-
|
|
147
|
+
|
|
136
148
|
print("=" * banner_width)
|
|
137
149
|
print(f"{'🚀 vLLM Load Balancer':^{banner_width}}")
|
|
138
150
|
print(f"{'High-Performance Async TCP/HTTP Load Balancer':^{banner_width}}")
|
|
@@ -143,41 +155,42 @@ def print_banner():
|
|
|
143
155
|
print(f"Backend Ports: {', '.join(map(str, BACKEND_PORTS))}")
|
|
144
156
|
print(f"Health Check Interval: 10s (Timeout: {HEALTH_CHECK_TIMEOUT}s)")
|
|
145
157
|
print(f"Status Update Interval: {STATUS_PRINT_INTERVAL}s")
|
|
158
|
+
print(f"Request Throttling: {THROTTLE_MS}ms minimum between requests")
|
|
146
159
|
print("=" * banner_width)
|
|
147
160
|
print()
|
|
148
161
|
|
|
149
162
|
|
|
150
163
|
# --- ANSI Color Codes ---
|
|
151
164
|
class Colors:
|
|
152
|
-
RESET =
|
|
153
|
-
BOLD =
|
|
154
|
-
DIM =
|
|
155
|
-
|
|
165
|
+
RESET = "\033[0m"
|
|
166
|
+
BOLD = "\033[1m"
|
|
167
|
+
DIM = "\033[2m"
|
|
168
|
+
|
|
156
169
|
# Foreground colors
|
|
157
|
-
BLACK =
|
|
158
|
-
RED =
|
|
159
|
-
GREEN =
|
|
160
|
-
YELLOW =
|
|
161
|
-
BLUE =
|
|
162
|
-
MAGENTA =
|
|
163
|
-
CYAN =
|
|
164
|
-
WHITE =
|
|
165
|
-
|
|
170
|
+
BLACK = "\033[30m"
|
|
171
|
+
RED = "\033[31m"
|
|
172
|
+
GREEN = "\033[32m"
|
|
173
|
+
YELLOW = "\033[33m"
|
|
174
|
+
BLUE = "\033[34m"
|
|
175
|
+
MAGENTA = "\033[35m"
|
|
176
|
+
CYAN = "\033[36m"
|
|
177
|
+
WHITE = "\033[37m"
|
|
178
|
+
|
|
166
179
|
# Bright colors
|
|
167
|
-
BRIGHT_BLACK =
|
|
168
|
-
BRIGHT_RED =
|
|
169
|
-
BRIGHT_GREEN =
|
|
170
|
-
BRIGHT_YELLOW =
|
|
171
|
-
BRIGHT_BLUE =
|
|
172
|
-
BRIGHT_MAGENTA =
|
|
173
|
-
BRIGHT_CYAN =
|
|
174
|
-
BRIGHT_WHITE =
|
|
175
|
-
|
|
180
|
+
BRIGHT_BLACK = "\033[90m"
|
|
181
|
+
BRIGHT_RED = "\033[91m"
|
|
182
|
+
BRIGHT_GREEN = "\033[92m"
|
|
183
|
+
BRIGHT_YELLOW = "\033[93m"
|
|
184
|
+
BRIGHT_BLUE = "\033[94m"
|
|
185
|
+
BRIGHT_MAGENTA = "\033[95m"
|
|
186
|
+
BRIGHT_CYAN = "\033[96m"
|
|
187
|
+
BRIGHT_WHITE = "\033[97m"
|
|
188
|
+
|
|
176
189
|
# Background colors
|
|
177
|
-
BG_RED =
|
|
178
|
-
BG_GREEN =
|
|
179
|
-
BG_YELLOW =
|
|
180
|
-
BG_BLUE =
|
|
190
|
+
BG_RED = "\033[41m"
|
|
191
|
+
BG_GREEN = "\033[42m"
|
|
192
|
+
BG_YELLOW = "\033[43m"
|
|
193
|
+
BG_BLUE = "\033[44m"
|
|
181
194
|
|
|
182
195
|
|
|
183
196
|
# --- Helper Functions --- (relay_data and safe_close_writer remain the same)
|
|
@@ -224,7 +237,6 @@ async def safe_close_writer(writer):
|
|
|
224
237
|
logger.debug(f"Error closing writer in context manager: {e}")
|
|
225
238
|
|
|
226
239
|
|
|
227
|
-
|
|
228
240
|
# --- Health Check for Provided Ports ---
|
|
229
241
|
async def check_server_health(session, host, port):
|
|
230
242
|
"""Performs an HTTP GET request to the /health endpoint."""
|
|
@@ -313,6 +325,11 @@ async def scan_and_update_servers():
|
|
|
313
325
|
logger.debug(
|
|
314
326
|
f"Removed connection count entry for unavailable server {server}"
|
|
315
327
|
)
|
|
328
|
+
if server in last_send_times:
|
|
329
|
+
del last_send_times[server]
|
|
330
|
+
logger.debug(
|
|
331
|
+
f"Removed throttling timestamp for unavailable server {server}"
|
|
332
|
+
)
|
|
316
333
|
|
|
317
334
|
available_servers = sorted(list(current_set))
|
|
318
335
|
for server in available_servers:
|
|
@@ -337,7 +354,6 @@ async def scan_and_update_servers():
|
|
|
337
354
|
async def handle_client(client_reader, client_writer):
|
|
338
355
|
"""Handles a single client connection."""
|
|
339
356
|
client_addr = client_writer.get_extra_info("peername")
|
|
340
|
-
logger.info(f"Accepted connection from {client_addr}")
|
|
341
357
|
|
|
342
358
|
backend_server = None
|
|
343
359
|
backend_reader = None
|
|
@@ -376,15 +392,11 @@ async def handle_client(client_reader, client_writer):
|
|
|
376
392
|
connection_counts[selected_server] += 1
|
|
377
393
|
backend_server = selected_server
|
|
378
394
|
server_selected = True
|
|
379
|
-
|
|
395
|
+
|
|
380
396
|
# Update global statistics
|
|
381
397
|
global total_connections_served, current_active_connections
|
|
382
398
|
total_connections_served += 1
|
|
383
399
|
current_active_connections += 1
|
|
384
|
-
|
|
385
|
-
logger.info(
|
|
386
|
-
f"Routing {client_addr} to {backend_server} (Current connections: {connection_counts[backend_server]})"
|
|
387
|
-
)
|
|
388
400
|
else:
|
|
389
401
|
logger.error(
|
|
390
402
|
f"Logic error: No server chosen despite available servers list not being empty for {client_addr}."
|
|
@@ -402,6 +414,29 @@ async def handle_client(client_reader, client_writer):
|
|
|
402
414
|
pass
|
|
403
415
|
server_selected = False
|
|
404
416
|
return
|
|
417
|
+
|
|
418
|
+
# --- Throttling Logic ---
|
|
419
|
+
# Check if we need to throttle requests to avoid overwhelming the backend
|
|
420
|
+
current_time = time.time() * 1000 # Convert to milliseconds
|
|
421
|
+
sleep_time = 0
|
|
422
|
+
async with state_lock:
|
|
423
|
+
last_send_time = last_send_times.get(backend_server, 0)
|
|
424
|
+
time_since_last_send = current_time - last_send_time
|
|
425
|
+
|
|
426
|
+
if time_since_last_send < THROTTLE_MS:
|
|
427
|
+
sleep_time = (THROTTLE_MS - time_since_last_send) / 1000 # Convert to seconds
|
|
428
|
+
logger.debug(
|
|
429
|
+
f"Throttling request to {backend_server} for {sleep_time:.3f}s (last send: {time_since_last_send:.1f}ms ago)"
|
|
430
|
+
)
|
|
431
|
+
|
|
432
|
+
# Sleep outside the lock to avoid blocking other clients
|
|
433
|
+
if sleep_time > 0:
|
|
434
|
+
await asyncio.sleep(sleep_time)
|
|
435
|
+
|
|
436
|
+
# Update last send time after throttling
|
|
437
|
+
async with state_lock:
|
|
438
|
+
last_send_times[backend_server] = time.time() * 1000
|
|
439
|
+
|
|
405
440
|
try:
|
|
406
441
|
logger.debug(
|
|
407
442
|
f"Attempting connection to backend {backend_server} for {client_addr}"
|
|
@@ -473,16 +508,14 @@ async def handle_client(client_reader, client_writer):
|
|
|
473
508
|
except Exception as e:
|
|
474
509
|
logger.error(f"Error handling client {client_addr}: {e}")
|
|
475
510
|
finally:
|
|
476
|
-
logger.info(f"Closing connection for {client_addr}")
|
|
477
511
|
# Decrement connection count only if we successfully selected/incremented
|
|
478
512
|
if backend_server and server_selected:
|
|
479
513
|
async with state_lock:
|
|
480
514
|
if backend_server in connection_counts:
|
|
481
515
|
if connection_counts[backend_server] > 0:
|
|
482
516
|
connection_counts[backend_server] -= 1
|
|
483
|
-
current_active_connections = max(
|
|
484
|
-
|
|
485
|
-
f"Connection closed for {client_addr}. Backend {backend_server} connections: {connection_counts[backend_server]}"
|
|
517
|
+
current_active_connections = max(
|
|
518
|
+
0, current_active_connections - 1
|
|
486
519
|
)
|
|
487
520
|
else:
|
|
488
521
|
logger.warning(
|
|
@@ -491,7 +524,6 @@ async def handle_client(client_reader, client_writer):
|
|
|
491
524
|
connection_counts[backend_server] = 0
|
|
492
525
|
|
|
493
526
|
|
|
494
|
-
|
|
495
527
|
# --- Status Reporting Task ---
|
|
496
528
|
async def print_status_periodically():
|
|
497
529
|
"""Periodically displays a professional real-time status dashboard."""
|
|
@@ -514,35 +546,52 @@ async def display_status_dashboard():
|
|
|
514
546
|
# Get terminal dimensions for responsive layout
|
|
515
547
|
columns, rows = get_terminal_size()
|
|
516
548
|
dash_width = min(columns - 4, 100)
|
|
517
|
-
|
|
549
|
+
|
|
518
550
|
# Header with title and timestamp
|
|
519
551
|
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
520
552
|
print(f"{Colors.BOLD}{Colors.BRIGHT_CYAN}{'=' * dash_width}{Colors.RESET}")
|
|
521
|
-
print(
|
|
522
|
-
|
|
553
|
+
print(
|
|
554
|
+
f"{Colors.BOLD}{Colors.BRIGHT_CYAN}{'🚀 vLLM Load Balancer Dashboard':^{dash_width}}{Colors.RESET}"
|
|
555
|
+
)
|
|
556
|
+
print(
|
|
557
|
+
f"{Colors.BRIGHT_CYAN}{'Real-time Status & Monitoring':^{dash_width}}{Colors.RESET}"
|
|
558
|
+
)
|
|
523
559
|
print(f"{Colors.BOLD}{Colors.BRIGHT_CYAN}{'=' * dash_width}{Colors.RESET}")
|
|
524
560
|
print()
|
|
525
|
-
|
|
561
|
+
|
|
526
562
|
# System Information Section
|
|
527
563
|
uptime = format_uptime(start_time)
|
|
528
564
|
print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}📊 System Information{Colors.RESET}")
|
|
529
565
|
print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 2)}{Colors.RESET}")
|
|
530
566
|
print(f"{Colors.YELLOW}🕐 Current Time:{Colors.RESET} {current_time}")
|
|
531
567
|
print(f"{Colors.YELLOW}⏱️ Uptime:{Colors.RESET} {uptime}")
|
|
532
|
-
print(
|
|
568
|
+
print(
|
|
569
|
+
f"{Colors.YELLOW}🌐 Load Balancer:{Colors.RESET} {LOAD_BALANCER_HOST}:{LOAD_BALANCER_PORT}"
|
|
570
|
+
)
|
|
533
571
|
print(f"{Colors.YELLOW}🎯 Backend Host:{Colors.RESET} {BACKEND_HOST}")
|
|
534
|
-
print(
|
|
572
|
+
print(
|
|
573
|
+
f"{Colors.YELLOW}🔧 Configured Ports:{Colors.RESET} {', '.join(map(str, BACKEND_PORTS))}"
|
|
574
|
+
)
|
|
575
|
+
print(f"{Colors.YELLOW}⚡ Request Throttling:{Colors.RESET} {THROTTLE_MS}ms minimum")
|
|
535
576
|
print()
|
|
536
|
-
|
|
577
|
+
|
|
537
578
|
# Connection Statistics Section
|
|
538
579
|
print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}📈 Connection Statistics{Colors.RESET}")
|
|
539
580
|
print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 2)}{Colors.RESET}")
|
|
540
|
-
print(
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
print(
|
|
581
|
+
print(
|
|
582
|
+
f"{Colors.GREEN}📊 Total Connections Served:{Colors.RESET} {total_connections_served:,}"
|
|
583
|
+
)
|
|
584
|
+
print(
|
|
585
|
+
f"{Colors.GREEN}🔗 Currently Active:{Colors.RESET} {current_active_connections}"
|
|
586
|
+
)
|
|
587
|
+
print(
|
|
588
|
+
f"{Colors.GREEN}⚡ Health Check Timeout:{Colors.RESET} {HEALTH_CHECK_TIMEOUT}s"
|
|
589
|
+
)
|
|
590
|
+
print(
|
|
591
|
+
f"{Colors.GREEN}🔄 Status Update Interval:{Colors.RESET} {STATUS_PRINT_INTERVAL}s"
|
|
592
|
+
)
|
|
544
593
|
print()
|
|
545
|
-
|
|
594
|
+
|
|
546
595
|
# Backend Servers Status
|
|
547
596
|
print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}Backend Servers Status{Colors.RESET}")
|
|
548
597
|
print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 2)}{Colors.RESET}")
|
|
@@ -552,7 +601,7 @@ async def display_status_dashboard():
|
|
|
552
601
|
f"{Colors.BOLD}Host{Colors.RESET}",
|
|
553
602
|
f"{Colors.BOLD}Port{Colors.RESET}",
|
|
554
603
|
f"{Colors.BOLD}Active Conn.{Colors.RESET}",
|
|
555
|
-
f"{Colors.BOLD}Status{Colors.RESET}"
|
|
604
|
+
f"{Colors.BOLD}Status{Colors.RESET}",
|
|
556
605
|
]
|
|
557
606
|
|
|
558
607
|
table_data = []
|
|
@@ -580,13 +629,15 @@ async def display_status_dashboard():
|
|
|
580
629
|
else f"{Colors.BG_RED}{Colors.WHITE} OFFLINE {Colors.RESET}"
|
|
581
630
|
)
|
|
582
631
|
|
|
583
|
-
table_data.append(
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
632
|
+
table_data.append(
|
|
633
|
+
[
|
|
634
|
+
f"{Colors.CYAN}{BACKEND_HOST}:{port}{Colors.RESET}",
|
|
635
|
+
BACKEND_HOST,
|
|
636
|
+
str(port),
|
|
637
|
+
conn_display,
|
|
638
|
+
status_display,
|
|
639
|
+
]
|
|
640
|
+
)
|
|
590
641
|
|
|
591
642
|
try:
|
|
592
643
|
table = tabulate(table_data, headers=headers, tablefmt="fancy_grid")
|
|
@@ -594,13 +645,23 @@ async def display_status_dashboard():
|
|
|
594
645
|
print()
|
|
595
646
|
|
|
596
647
|
# Summary metrics
|
|
597
|
-
online_count = sum(
|
|
598
|
-
|
|
648
|
+
online_count = sum(
|
|
649
|
+
1 for port in BACKEND_PORTS if (BACKEND_HOST, port) in current_available
|
|
650
|
+
)
|
|
651
|
+
avg_connections = (
|
|
652
|
+
total_backend_connections / online_count if online_count else 0
|
|
653
|
+
)
|
|
599
654
|
print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}📋 Summary{Colors.RESET}")
|
|
600
655
|
print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 4)}{Colors.RESET}")
|
|
601
|
-
print(
|
|
602
|
-
|
|
603
|
-
|
|
656
|
+
print(
|
|
657
|
+
f"{Colors.MAGENTA}🟢 Available Servers:{Colors.RESET} {online_count} / {len(BACKEND_PORTS)}"
|
|
658
|
+
)
|
|
659
|
+
print(
|
|
660
|
+
f"{Colors.MAGENTA}📊 Total Backend Connections:{Colors.RESET} {total_backend_connections}"
|
|
661
|
+
)
|
|
662
|
+
print(
|
|
663
|
+
f"{Colors.MAGENTA}📈 Average Load per Online Server:{Colors.RESET} {avg_connections:.1f}"
|
|
664
|
+
)
|
|
604
665
|
|
|
605
666
|
except Exception as e:
|
|
606
667
|
logger.error(f"Error displaying status table: {e}")
|
|
@@ -609,15 +670,17 @@ async def display_status_dashboard():
|
|
|
609
670
|
# Footer with refresh info
|
|
610
671
|
print()
|
|
611
672
|
print(f"{Colors.BRIGHT_BLACK}{'─' * dash_width}{Colors.RESET}")
|
|
612
|
-
print(
|
|
673
|
+
print(
|
|
674
|
+
f"{Colors.DIM}🔄 Auto-refresh every {STATUS_PRINT_INTERVAL}s | Press Ctrl+C to stop{Colors.RESET}"
|
|
675
|
+
)
|
|
613
676
|
print(f"{Colors.BRIGHT_BLACK}{'─' * dash_width}{Colors.RESET}")
|
|
614
677
|
print()
|
|
615
678
|
|
|
616
679
|
|
|
617
|
-
|
|
618
680
|
# --- HTTP Stats Server ---
|
|
619
681
|
from aiohttp import web
|
|
620
682
|
|
|
683
|
+
|
|
621
684
|
async def stats_json(request):
|
|
622
685
|
async with state_lock:
|
|
623
686
|
# Build a list of all configured servers, with status and connections
|
|
@@ -626,12 +689,16 @@ async def stats_json(request):
|
|
|
626
689
|
for port in BACKEND_PORTS:
|
|
627
690
|
server = (BACKEND_HOST, port)
|
|
628
691
|
is_online = server in available_set
|
|
629
|
-
all_servers.append(
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
692
|
+
all_servers.append(
|
|
693
|
+
{
|
|
694
|
+
"host": BACKEND_HOST,
|
|
695
|
+
"port": port,
|
|
696
|
+
"active_connections": connection_counts.get(server, 0)
|
|
697
|
+
if is_online
|
|
698
|
+
else 0,
|
|
699
|
+
"status": "ONLINE" if is_online else "OFFLINE",
|
|
700
|
+
}
|
|
701
|
+
)
|
|
635
702
|
stats = {
|
|
636
703
|
"time": datetime.now().isoformat(),
|
|
637
704
|
"uptime": format_uptime(start_time),
|
|
@@ -643,10 +710,12 @@ async def stats_json(request):
|
|
|
643
710
|
"current_active_connections": current_active_connections,
|
|
644
711
|
"health_check_timeout": HEALTH_CHECK_TIMEOUT,
|
|
645
712
|
"status_update_interval": STATUS_PRINT_INTERVAL,
|
|
713
|
+
"throttle_ms": THROTTLE_MS,
|
|
646
714
|
"servers": all_servers,
|
|
647
715
|
}
|
|
648
716
|
return web.json_response(stats)
|
|
649
717
|
|
|
718
|
+
|
|
650
719
|
async def stats_page(request):
|
|
651
720
|
# High-quality HTML dashboard with auto-refresh and charts
|
|
652
721
|
return web.Response(
|
|
@@ -786,86 +855,101 @@ async def stats_page(request):
|
|
|
786
855
|
</script>
|
|
787
856
|
</body>
|
|
788
857
|
</html>
|
|
789
|
-
"""
|
|
858
|
+
""",
|
|
790
859
|
)
|
|
791
860
|
|
|
861
|
+
|
|
792
862
|
async def start_stats_server(loop):
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
863
|
+
app = web.Application()
|
|
864
|
+
app.router.add_get("/stats", stats_page)
|
|
865
|
+
app.router.add_get("/stats.json", stats_json)
|
|
866
|
+
runner = web.AppRunner(app)
|
|
867
|
+
await runner.setup()
|
|
868
|
+
site = web.TCPSite(runner, LOAD_BALANCER_HOST, STATS_PORT)
|
|
869
|
+
await site.start()
|
|
870
|
+
logger.info(
|
|
871
|
+
f"Stats HTTP server running at http://{LOAD_BALANCER_HOST}:{STATS_PORT}/stats"
|
|
872
|
+
)
|
|
873
|
+
|
|
801
874
|
|
|
802
875
|
async def main():
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
# Start background tasks
|
|
809
|
-
scan_task = asyncio.create_task(scan_and_update_servers())
|
|
810
|
-
status_task = asyncio.create_task(print_status_periodically())
|
|
811
|
-
|
|
812
|
-
# Start HTTP stats server (on STATS_PORT)
|
|
813
|
-
loop = asyncio.get_running_loop()
|
|
814
|
-
await start_stats_server(loop)
|
|
815
|
-
|
|
816
|
-
# Start TCP server (on LOAD_BALANCER_PORT)
|
|
817
|
-
server = await asyncio.start_server(
|
|
818
|
-
handle_client, LOAD_BALANCER_HOST, LOAD_BALANCER_PORT
|
|
819
|
-
)
|
|
876
|
+
global start_time
|
|
877
|
+
start_time = time.time()
|
|
878
|
+
clear_terminal()
|
|
879
|
+
print_banner()
|
|
820
880
|
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
881
|
+
# Start background tasks
|
|
882
|
+
scan_task = asyncio.create_task(scan_and_update_servers())
|
|
883
|
+
status_task = asyncio.create_task(print_status_periodically())
|
|
884
|
+
|
|
885
|
+
# Start HTTP stats server (on STATS_PORT)
|
|
886
|
+
loop = asyncio.get_running_loop()
|
|
887
|
+
await start_stats_server(loop)
|
|
888
|
+
|
|
889
|
+
# Start TCP server (on LOAD_BALANCER_PORT)
|
|
890
|
+
server = await asyncio.start_server(
|
|
891
|
+
handle_client, LOAD_BALANCER_HOST, LOAD_BALANCER_PORT
|
|
892
|
+
)
|
|
893
|
+
|
|
894
|
+
addrs = ", ".join(str(sock.getsockname()) for sock in server.sockets)
|
|
895
|
+
logger.info(f"Load balancer serving on {addrs}")
|
|
896
|
+
logger.info(f"Configured backend ports: {BACKEND_PORTS} on host {BACKEND_HOST}")
|
|
897
|
+
print(f"{Colors.BRIGHT_GREEN}✅ Load balancer started successfully!{Colors.RESET}")
|
|
898
|
+
print(f"{Colors.BRIGHT_GREEN}🌐 Proxy listening on: {addrs}{Colors.RESET}")
|
|
899
|
+
print(
|
|
900
|
+
f"{Colors.BRIGHT_GREEN}📊 Stats dashboard: http://localhost:{STATS_PORT}/stats{Colors.RESET}"
|
|
901
|
+
)
|
|
902
|
+
print(f"{Colors.YELLOW}🔍 Scanning backend servers...{Colors.RESET}")
|
|
903
|
+
print()
|
|
904
|
+
await asyncio.sleep(2)
|
|
832
905
|
|
|
833
|
-
|
|
906
|
+
async with server:
|
|
907
|
+
try:
|
|
908
|
+
await server.serve_forever()
|
|
909
|
+
except asyncio.CancelledError:
|
|
910
|
+
print(f"\n{Colors.YELLOW}🛑 Shutdown signal received...{Colors.RESET}")
|
|
911
|
+
logger.info("Load balancer server shutting down.")
|
|
912
|
+
except KeyboardInterrupt:
|
|
913
|
+
print(f"\n{Colors.YELLOW}🛑 Shutdown requested by user...{Colors.RESET}")
|
|
914
|
+
logger.info("Shutdown requested by user.")
|
|
915
|
+
finally:
|
|
916
|
+
print(f"{Colors.CYAN}🔄 Cleaning up background tasks...{Colors.RESET}")
|
|
917
|
+
logger.info("Cancelling background tasks...")
|
|
918
|
+
scan_task.cancel()
|
|
919
|
+
status_task.cancel()
|
|
834
920
|
try:
|
|
835
|
-
await
|
|
921
|
+
await asyncio.gather(scan_task, status_task, return_exceptions=True)
|
|
836
922
|
except asyncio.CancelledError:
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
logger.info("Shutdown requested by user.")
|
|
842
|
-
finally:
|
|
843
|
-
print(f"{Colors.CYAN}🔄 Cleaning up background tasks...{Colors.RESET}")
|
|
844
|
-
logger.info("Cancelling background tasks...")
|
|
845
|
-
scan_task.cancel()
|
|
846
|
-
status_task.cancel()
|
|
847
|
-
try:
|
|
848
|
-
await asyncio.gather(scan_task, status_task, return_exceptions=True)
|
|
849
|
-
except asyncio.CancelledError:
|
|
850
|
-
pass
|
|
851
|
-
print(f"{Colors.BRIGHT_GREEN}✅ Shutdown complete. Goodbye!{Colors.RESET}")
|
|
852
|
-
logger.info("Background tasks finished.")
|
|
923
|
+
pass
|
|
924
|
+
print(f"{Colors.BRIGHT_GREEN}✅ Shutdown complete. Goodbye!{Colors.RESET}")
|
|
925
|
+
logger.info("Background tasks finished.")
|
|
926
|
+
|
|
853
927
|
|
|
854
928
|
def run_load_balancer():
|
|
855
|
-
global
|
|
929
|
+
global \
|
|
930
|
+
LOAD_BALANCER_PORT, \
|
|
931
|
+
BACKEND_PORTS, \
|
|
932
|
+
BACKEND_HOST, \
|
|
933
|
+
STATUS_PRINT_INTERVAL, \
|
|
934
|
+
HEALTH_CHECK_TIMEOUT, \
|
|
935
|
+
THROTTLE_MS, \
|
|
936
|
+
STATS_PORT
|
|
856
937
|
args = parse_args()
|
|
857
938
|
LOAD_BALANCER_PORT = args.port
|
|
858
939
|
BACKEND_HOST = args.host
|
|
859
940
|
BACKEND_PORTS = [int(p.strip()) for p in args.ports.split(",") if p.strip()]
|
|
860
941
|
STATUS_PRINT_INTERVAL = args.status_interval
|
|
861
942
|
HEALTH_CHECK_TIMEOUT = args.health_timeout
|
|
943
|
+
THROTTLE_MS = args.throttle_ms
|
|
862
944
|
if args.stats_port is not None:
|
|
863
945
|
STATS_PORT = args.stats_port
|
|
864
946
|
else:
|
|
865
947
|
STATS_PORT = LOAD_BALANCER_PORT + 1
|
|
866
948
|
if not BACKEND_PORTS:
|
|
867
949
|
print(f"{Colors.BG_RED}{Colors.WHITE} ❌ ERROR {Colors.RESET}")
|
|
868
|
-
print(
|
|
950
|
+
print(
|
|
951
|
+
f"{Colors.RED}No backend ports specified. Use --ports 8140,8150 ...{Colors.RESET}"
|
|
952
|
+
)
|
|
869
953
|
logger.critical("No backend ports specified. Use --ports 8140,8150 ...")
|
|
870
954
|
sys.exit(1)
|
|
871
955
|
try:
|
|
@@ -878,5 +962,6 @@ def run_load_balancer():
|
|
|
878
962
|
print(f"{Colors.RED}Critical error in main execution: {e}{Colors.RESET}")
|
|
879
963
|
logger.critical(f"Critical error in main execution: {e}")
|
|
880
964
|
|
|
965
|
+
|
|
881
966
|
if __name__ == "__main__":
|
|
882
967
|
run_load_balancer()
|
|
@@ -1,17 +1,20 @@
|
|
|
1
1
|
llm_utils/__init__.py,sha256=SlaCMArn_uKVw4r0psz0q0IOQ1VFGdgCFOAKxQ81WTI,694
|
|
2
2
|
llm_utils/chat_format/__init__.py,sha256=8dBIUqFJvkgQYedxBtcyxt-4tt8JxAKVap2JlTXmgaM,737
|
|
3
|
-
llm_utils/chat_format/display.py,sha256=
|
|
3
|
+
llm_utils/chat_format/display.py,sha256=M-__JpcJSqjqeP4LiW7-yF8fVL37yUEUdaNC4VEgIo8,10181
|
|
4
4
|
llm_utils/chat_format/transform.py,sha256=8TZhvUS5DrjUeMNtDIuWY54B_QZ7jjpXEL9c8F5z79w,5400
|
|
5
5
|
llm_utils/chat_format/utils.py,sha256=xTxN4HrLHcRO2PfCTR43nH1M5zCa7v0kTTdzAcGkZg0,1229
|
|
6
6
|
llm_utils/group_messages.py,sha256=8CU9nKOja3xeuhdrX5CvYVveSqSKb2zQ0eeNzA88aTQ,3621
|
|
7
7
|
llm_utils/lm/__init__.py,sha256=rX36_MsnekM5GHwWS56XELbm4W5x2TDwnPERDTfo0eU,194
|
|
8
|
-
llm_utils/lm/async_lm.py,sha256=
|
|
8
|
+
llm_utils/lm/async_lm/__init__.py,sha256=ouN2z1G24OwFglo2asBZ5w18RYcvTZ5r3ylx1aYp2rQ,70
|
|
9
|
+
llm_utils/lm/async_lm/_utils.py,sha256=16yks9grCmFotuqFKxVBfPvrrrquBfhzFUZ22mno3LY,5946
|
|
10
|
+
llm_utils/lm/async_lm/async_llm_task.py,sha256=JIU18Q7sPu_LCUR-geZmqiQ37stM8CqXmMzzkRUYibM,5352
|
|
11
|
+
llm_utils/lm/async_lm/async_lm.py,sha256=LzX1zMKiAENtqPtQ-laVbUyQ1g6q9Xhuyr_IWIjxdVk,28589
|
|
9
12
|
llm_utils/lm/chat_html.py,sha256=FkGo0Dv_nAHYBMZzXfMu_bGQKaCx302goh3XaT-_ETc,8674
|
|
10
13
|
llm_utils/lm/lm_json.py,sha256=fMt42phzFV2f6ulrtWcDXsWHi8WcG7gGkCzpIq8VSSM,1975
|
|
11
14
|
llm_utils/lm/sync_lm.py,sha256=ANw_m5KiWcRwwoeQ5no6dzPFLc6j9o2oEcJtkMKqrn8,34640
|
|
12
15
|
llm_utils/lm/utils.py,sha256=gUejbVZPYg97g4ftYEptYN52WhH3TAKOFW81sjLvi08,4585
|
|
13
16
|
llm_utils/scripts/README.md,sha256=yuOLnLa2od2jp4wVy3rV0rESeiV3o8zol5MNMsZx0DY,999
|
|
14
|
-
llm_utils/scripts/vllm_load_balancer.py,sha256=
|
|
17
|
+
llm_utils/scripts/vllm_load_balancer.py,sha256=DxZNDGl8tViE0SuGQPneBHqkkagyuEm70bNzWyqa_9Q,36837
|
|
15
18
|
llm_utils/scripts/vllm_serve.py,sha256=4NaqpVs7LBvxtvTCMPsNCAOfqiWkKRttxWMmWY7SitA,14729
|
|
16
19
|
speedy_utils/__init__.py,sha256=YCpiReW22zG4KkQXQe6V9BQ8bn7PtiXolOaW_iL8T4M,5734
|
|
17
20
|
speedy_utils/all.py,sha256=t-HKzDmhF1MTFnmq7xRnPs5nFG_aZaLH9Ua0RM6nQ9Y,4855
|
|
@@ -31,7 +34,7 @@ speedy_utils/multi_worker/thread.py,sha256=u_hTwXh7_FciMa5EukdEA1fDCY_vUC4moDceB
|
|
|
31
34
|
speedy_utils/scripts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
32
35
|
speedy_utils/scripts/mpython.py,sha256=73PHm1jqbCt2APN4xuNjD0VDKwzOj4EZsViEMQiZU2g,3853
|
|
33
36
|
speedy_utils/scripts/openapi_client_codegen.py,sha256=f2125S_q0PILgH5dyzoKRz7pIvNEjCkzpi4Q4pPFRZE,9683
|
|
34
|
-
speedy_utils-1.1.
|
|
35
|
-
speedy_utils-1.1.
|
|
36
|
-
speedy_utils-1.1.
|
|
37
|
-
speedy_utils-1.1.
|
|
37
|
+
speedy_utils-1.1.6.dist-info/METADATA,sha256=vfJU1DO0R5VkRqYrN1omYIgcHNDvvKJ4F87l77wjP2s,7441
|
|
38
|
+
speedy_utils-1.1.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
39
|
+
speedy_utils-1.1.6.dist-info/entry_points.txt,sha256=T1t85jwx8fK6m5msdkBGIXH5R5Kd0zSL0S6erXERPzg,237
|
|
40
|
+
speedy_utils-1.1.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|