speedy-utils 1.1.0__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
llm_utils/lm/async_lm.py CHANGED
@@ -1,84 +1,10 @@
- """
- # ============================================================================= #
- # ASYNCHRONOUS LANGUAGE MODEL WRAPPER WITH CONCURRENT EXECUTION SUPPORT
- # ============================================================================= #
- #
- # Title & Intent:
- # High-performance asynchronous language model interface for concurrent LLM operations
- #
- # High-level Summary:
- # This module provides an async drop-in replacement for the synchronous LM class, designed
- # for high-throughput applications requiring concurrent language model operations. It maintains
- # full API compatibility while adding async/await semantics, connection pooling, and efficient
- # resource management. The AsyncLM class supports batch processing, concurrent request handling,
- # and maintains the same caching and type safety guarantees as the synchronous version.
- #
- # Public API / Data Contracts:
- # • AsyncLM(model, temperature=0.0, max_tokens=2000, host="localhost", port=None, **kwargs) - Async wrapper class
- # • async AsyncLM.__call__(prompt=None, messages=None, response_format=str, cache=None, **kwargs) -> str | BaseModel
- # • async AsyncLM.list_models(port=None) -> List[str] - Enumerate available models
- # • async AsyncLM.count_tokens(messages, model=None) -> int - Token counting utility
- # • async AsyncLM.price(messages, model=None, response_tokens=0) -> float - Cost estimation
- # • AsyncLM.set_model(model_name) -> None - Runtime model switching (sync method)
- # • async AsyncLM.batch_call(requests) -> List[Union[str, BaseModel]] - Concurrent batch processing
- # • TModel = TypeVar("TModel", bound=BaseModel) - Generic type for structured responses
- # • Messages = List[ChatCompletionMessageParam] - Typed message format
- #
- # Invariants / Constraints:
- # • MUST be used within async context (asyncio event loop required)
- # • MUST provide either 'prompt' or 'messages' parameter, but not both
- # • MUST properly await all async method calls
- # • Connection pooling MUST handle concurrent requests efficiently
- # • MUST maintain thread safety across concurrent operations
- # • Rate limit handling MUST use async backoff without blocking event loop
- # • MUST preserve all synchronous LM class behaviors and constraints
- # • Resource cleanup MUST occur on context manager exit or explicit close
- #
- # Usage Example:
- # ```python
- # import asyncio
- # from llm_utils.lm.async_lm import AsyncLM
- # from pydantic import BaseModel
- #
- # class SummaryResponse(BaseModel):
- #     summary: str
- #     key_points: List[str]
- #     confidence: float
- #
- # async def main():
- #     # Single async call
- #     lm = AsyncLM(model="gpt-4o-mini", temperature=0.1)
- #     response = await lm(prompt="Summarize quantum computing")
- #     print(response)
- #
- #     # Concurrent batch processing
- #     texts = ["Text 1 to summarize", "Text 2 to summarize", "Text 3 to summarize"]
- #     tasks = [lm(prompt=f"Summarize: {text}", response_format=SummaryResponse) for text in texts]
- #     summaries = await asyncio.gather(*tasks)
- #
- #     for summary in summaries:
- #         print(f"Summary: {summary.summary}")
- #         print(f"Key points: {summary.key_points}")
- #
- # asyncio.run(main())
- # ```
- #
- # TODO & Future Work:
- # • Add async context manager support for automatic resource cleanup
- # • Implement connection pool size optimization based on usage patterns
- # • Add async streaming response support with async generators
- # • Optimize memory usage for large-scale concurrent operations
- # • Add async rate limiting with priority queuing
- #
- # ============================================================================= #
- """

  import base64
  import hashlib
  import json
  import os
  from abc import ABC
- from functools import lru_cache
+ from functools import cache, lru_cache
  from typing import (
      Any,
      Dict,
@@ -110,7 +36,7 @@ from openai.types.chat import (
  )
  from openai.types.model import Model
  from pydantic import BaseModel
-
+ from pydantic import ValidationError
  from llm_utils.chat_format.display import get_conversation_one_turn

  # --------------------------------------------------------------------------- #
@@ -146,10 +72,13 @@ def _yellow(t):
      return _color(33, t)


- class ParsedOutput(TypedDict):
+ TParsed = TypeVar("TParsed", bound=BaseModel)
+
+
+ class ParsedOutput(TypedDict, Generic[TParsed]):
      messages: List
      completion: Any
-     parsed: BaseModel
+     parsed: TParsed


  class AsyncLM:
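This hunk makes `ParsedOutput` generic over the response model, so `parse()` can advertise the concrete pydantic type it returns instead of a bare `BaseModel`. A minimal sketch of the pattern as it could be used downstream (the `Answer` model and `read_answer` function are invented for illustration; `TypedDict` combined with `Generic` requires Python 3.11+ or `typing_extensions`):

```python
from typing import Any, Generic, List, TypedDict, TypeVar

from pydantic import BaseModel

TParsed = TypeVar("TParsed", bound=BaseModel)


class ParsedOutput(TypedDict, Generic[TParsed]):
    messages: List
    completion: Any
    parsed: TParsed


class Answer(BaseModel):  # hypothetical response model
    text: str


def read_answer(out: ParsedOutput[Answer]) -> str:
    # Type checkers now see out["parsed"] as Answer, not bare BaseModel,
    # so attribute access is checked statically.
    return out["parsed"].text
```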
@@ -460,7 +389,7 @@ class AsyncLM:
      # ------------------------------------------------------------------ #
      async def parse(
          self,
-         response_model: Type[BaseModel],
+         response_model: Type[TParsed],
          instruction: Optional[str] = None,
          prompt: Optional[str] = None,
          messages: Optional[RawMsgs] = None,
@@ -470,7 +399,7 @@ class AsyncLM:
          max_tokens: Optional[int] = None,
          cache: Optional[bool] = True,
          **kwargs,
-     ) -> ParsedOutput:  # -> dict[str, Any]:
+     ) -> ParsedOutput[TParsed]:
          """Parse response using guided JSON generation."""
          if messages is None:
              assert instruction is not None, "Instruction must be provided."
@@ -513,6 +442,7 @@ class AsyncLM:

          use_cache = self.do_cache if cache is None else cache
          cache_key = None
+         completion = None
          if use_cache:
              cache_data = {
                  "messages": messages,
@@ -522,7 +452,7 @@ class AsyncLM:
              }
              cache_key = self._cache_key(cache_data, {}, response_model)
              completion = self._load_cache(cache_key)  # dict
-         else:
+         if not completion:
              completion = await self.client.chat.completions.create(
                  model=self.model,  # type: ignore
                  messages=messages,  # type: ignore
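The `else:` → `if not completion:` change fixes a cache-miss bug: `_load_cache` returns `None` on a miss, and under the old `else:` that `None` flowed onward as the completion. A minimal sketch of the corrected control flow, with stand-in helpers in place of the class internals:

```python
import asyncio
from typing import Any, Optional

_CACHE: dict[str, Any] = {}


def load_cache(key: str) -> Optional[dict]:
    return _CACHE.get(key)  # None on a cache miss


async def call_api() -> dict:
    return {"choices": [{"message": {"content": "hi"}}]}  # stub completion


async def get_completion(use_cache: bool, key: str) -> dict:
    completion = None
    if use_cache:
        completion = load_cache(key)
    # Covers both "caching disabled" and "cache miss"; the old `else:`
    # only covered the first case and let a miss return None.
    if not completion:
        completion = await call_api()
        _CACHE[key] = completion
    return completion


print(asyncio.run(get_completion(True, "k")))
```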
@@ -532,10 +462,12 @@ class AsyncLM:
              completion = completion.model_dump()
              if cache_key:
                  self._dump_cache(cache_key, completion)
-
+         assert isinstance(completion, dict), (
+             "Completion must be a dictionary with OpenAI response format."
+         )
          self.last_log = [prompt, messages, completion]

-         output = self._parse_complete_output(completion, response_model)
+         output = cast(TParsed, self._parse_complete_output(completion, response_model))
          full_messages = messages + [completion]
          return ParsedOutput(
              messages=full_messages,
@@ -555,7 +487,49 @@ class AsyncLM:

          content = completion["choices"][0]["message"]["content"]
          if not content:
-             raise ValueError("Empty content in response")
+             # Enhanced error for debugging: show input tokens and their count
+
+             # Try to extract tokens from the completion for debugging
+             input_tokens = None
+             try:
+                 input_tokens = completion.get('usage', {}).get('prompt_tokens')
+             except Exception:
+                 input_tokens = None
+
+             # Try to get the prompt/messages for tokenization
+             prompt = None
+             try:
+                 prompt = completion.get('messages') or completion.get('prompt')
+             except Exception:
+                 prompt = None
+
+             tokens_preview = ''
+             if prompt is not None:
+                 try:
+                     tokenizer = get_tokenizer(self.model)
+                     if isinstance(prompt, list):
+                         prompt_text = '\n'.join(
+                             m.get('content', '') for m in prompt if isinstance(m, dict)
+                         )
+                     else:
+                         prompt_text = str(prompt)
+                     tokens = tokenizer.encode(prompt_text)
+                     n_tokens = len(tokens)
+                     first_100 = tokens[:100]
+                     last_100 = tokens[-100:] if n_tokens > 100 else []
+                     tokens_preview = (
+                         f'\nInput tokens: {n_tokens}'
+                         f'\nFirst 100 tokens: {first_100}'
+                         f'\nLast 100 tokens: {last_100}'
+                     )
+                 except Exception as exc:
+                     tokens_preview = f'\n[Tokenization failed: {exc}]'
+
+             raise ValueError(
+                 f'Empty content in response.'
+                 f'\nInput tokens (if available): {input_tokens}'
+                 f'{tokens_preview}'
+             )

          try:
              data = json.loads(content)
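The enriched error is meant to make context-overflow and truncation failures diagnosable from the exception alone. One review note: the hunk calls `get_tokenizer(self.model)`, but no corresponding import appears in the hunks shown here, so it presumably comes from elsewhere in the module. A standalone sketch of the same token-preview idea, using `tiktoken` as a stand-in tokenizer:

```python
import tiktoken


def token_preview(prompt_text: str) -> str:
    # o200k_base is the encoding used by the gpt-4o model family; the
    # module's own get_tokenizer() may resolve a different tokenizer.
    enc = tiktoken.get_encoding("o200k_base")
    tokens = enc.encode(prompt_text)
    n = len(tokens)
    head = tokens[:100]
    tail = tokens[-100:] if n > 100 else []
    return f"Input tokens: {n}\nFirst 100 tokens: {head}\nLast 100 tokens: {tail}"


print(token_preview("some long prompt ..."))
```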
@@ -737,6 +711,7 @@ async def inspect_word_probs_async(lm, tokenizer, messages):
      """Async version of inspect_word_probs."""

      import numpy as np
+

      async def compute_word_log_probs(
          tokenizer: Any,
@@ -894,12 +869,14 @@ class AsyncLLMTask(ABC, Generic[InputModelType, OutputModelType]):
      temperature: float = 0.6
      think: bool = False
      add_json_schema: bool = False
+     cache: bool = False

      async def __call__(
          self,
          data: BaseModel | dict,
          temperature: float = 0.1,
          cache: bool = False,
+         think: Optional[bool] = None,  # if not None, overrides self.think
      ) -> tuple[OutputModelType, List[Dict[str, Any]]]:
          # Get the input and output model types from the generic parameters
          type_args = getattr(self.__class__, "__orig_bases__", None)
@@ -940,9 +917,9 @@ class AsyncLLMTask(ABC, Generic[InputModelType, OutputModelType]):
              instruction=self.__doc__ or "",
              response_model=output_model,
              temperature=temperature or self.temperature,
-             think=self.think,
+             think=think if think is not None else self.think,
              add_json_schema_to_instruction=self.add_json_schema,
-             cache=cache,
+             cache=self.cache or cache,
          )

          return (
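Taken together, the `AsyncLLMTask` hunks add a class-level `cache` flag (`cache=self.cache or cache` lets a class default force caching on) and a per-call `think` override. A hypothetical subclass illustrating the intended use; `QAInput`, `QAOutput`, and the task itself are invented, only the `AsyncLLMTask` contract comes from the diff:

```python
from pydantic import BaseModel


class QAInput(BaseModel):
    question: str


class QAOutput(BaseModel):
    answer: str


class AnswerTask(AsyncLLMTask[QAInput, QAOutput]):
    """Answer the user's question concisely."""  # the docstring is the instruction

    cache = True   # new class-level flag: forces caching for every call
    think = False  # class default, overridable per call


# Inside an event loop:
#   output, messages = await AnswerTask()(QAInput(question="2 + 2?"), think=True)
# think=True overrides the class default for this call only.
```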
llm_utils/scripts/vllm_load_balancer.py CHANGED

@@ -1,104 +1,183 @@
- """
- # ============================================================================= #
- # VLLM LOAD BALANCER WITH HEALTH MONITORING AND DYNAMIC ROUTING
- # ============================================================================= #
- #
- # Title & Intent:
- # Production-ready TCP load balancer for vLLM model servers with health checks and connection management
- #
- # High-level Summary:
- # This module implements a high-performance load balancer specifically designed for vLLM model
- # serving infrastructure. It provides intelligent routing across multiple vLLM server instances,
- # continuous health monitoring, automatic failover, and connection pooling. The load balancer
- # uses async TCP proxying to handle concurrent requests efficiently while maintaining session
- # affinity and providing detailed metrics for monitoring and debugging.
- #
- # Public API / Data Contracts:
- # • LOAD_BALANCER_HOST = "0.0.0.0" - Load balancer bind address
- # • LOAD_BALANCER_PORT = 8008 - Load balancer listening port
- # • SCAN_TARGET_HOST = "localhost" - Target server host for health checks
- # • SCAN_PORT_START = 8140, SCAN_PORT_END = 8170 - Port range for server discovery
- # • start_load_balancer() -> None - Main entry point to start the service
- # • scan_for_healthy_servers() -> None - Background health monitoring task
- # • handle_client(reader, writer) -> None - Client connection handler
- # • relay_data(reader, writer, direction) -> None - Bidirectional data relay
- # • get_next_server() -> Optional[Tuple[str, int]] - Round-robin server selection
- #
- # Invariants / Constraints:
- # • MUST continuously monitor server health every SCAN_INTERVAL seconds
- # • MUST handle connection failures gracefully with automatic failover
- # • Health checks MUST complete within HEALTH_CHECK_TIMEOUT seconds
- # • MUST maintain connection counts for load balancing decisions
- # • Server availability MUST be updated atomically using async locks
- # • TCP connections MUST be properly closed on errors or completion
- # • MUST log all connection events and health status changes
- # • Round-robin selection MUST distribute load evenly across healthy servers
- #
- # Usage Example:
- # ```python
- # # Start the load balancer (blocking operation)
- # import asyncio
- # from llm_utils.scripts.vllm_load_balancer import start_load_balancer
- #
- # # Configure environment or modify constants as needed
- # LOAD_BALANCER_HOST = "0.0.0.0"
- # LOAD_BALANCER_PORT = 8008
- # SCAN_TARGET_HOST = "localhost"
- # SCAN_PORT_START = 8140
- # SCAN_PORT_END = 8150
- #
- # # Start the load balancer service
- # asyncio.run(start_load_balancer())
- #
- # # The service will:
- # # 1. Scan for healthy vLLM servers on ports 8140-8150
- # # 2. Accept client connections on port 8008
- # # 3. Route requests to healthy backend servers
- # # 4. Monitor server health continuously
- # # 5. Provide connection statistics
- # ```
- #
- # TODO & Future Work:
- # • Add weighted round-robin based on server capacity metrics
- # • Implement session affinity for stateful model interactions
- # • Add HTTP health check endpoints for better monitoring integration
- # • Support dynamic server registration and deregistration
- # • Add metrics export for Prometheus/Grafana monitoring
- # • Implement graceful shutdown with connection draining
- #
- # ============================================================================= #
- """
+ import argparse
+ import sys
+ import os
+ import time
+ from datetime import datetime
+ from collections import defaultdict

  import asyncio
  import contextlib
  import random
- from collections import defaultdict

- import aiohttp  # <-- Import aiohttp
+ import aiohttp
  from loguru import logger
  from tabulate import tabulate

  from speedy_utils import setup_logger

  setup_logger(min_interval=5)
- # --- Configuration ---
- LOAD_BALANCER_HOST = "0.0.0.0"
- LOAD_BALANCER_PORT = 8008

- SCAN_TARGET_HOST = "localhost"
- SCAN_PORT_START = 8140
- SCAN_PORT_END = 8170  # Inclusive
- SCAN_INTERVAL = 30
- # Timeout applies to the HTTP health check request now
- HEALTH_CHECK_TIMEOUT = 2  # Increased slightly for HTTP requests
+ # --- CLI Argument Parsing ---
+ def parse_args():
+     parser = argparse.ArgumentParser(
+         description="🚀 vLLM Load Balancer - High-Performance Async TCP/HTTP Load Balancer",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   python vllm_load_balancer.py 8001 --ports 8140,8150,8160
+   python vllm_load_balancer.py 8080 --ports 8140,8150 --host 192.168.1.100
+   python vllm_load_balancer.py 8001 --ports 8140,8150 --status-interval 3
+
+ Features:
+   • Real-time dashboard with color-coded status
+   • Automatic health checks and failover
+   • Least-connections load balancing
+   • Professional terminal interface
+   • Connection statistics and monitoring
+         """
+     )
+     parser.add_argument(
+         "port",
+         type=int,
+         help="Port for the load balancer to listen on (e.g., 8001)",
+     )
+     parser.add_argument(
+         "--ports",
+         type=str,
+         required=True,
+         help="Comma-separated list of backend ports to use (e.g., 8140,8150)",
+     )
+     parser.add_argument(
+         "--host",
+         type=str,
+         default="localhost",
+         help="Backend host (default: localhost)",
+     )
+     parser.add_argument(
+         "--status-interval",
+         type=int,
+         default=5,
+         help="Status print interval in seconds (default: 5)",
+     )
+     parser.add_argument(
+         "--health-timeout",
+         type=int,
+         default=2,
+         help="Health check timeout in seconds (default: 2)",
+     )
+     parser.add_argument(
+         "--stats-port",
+         type=int,
+         default=None,
+         help="Port for the HTTP stats dashboard (default: proxy port + 1)",
+     )
+     return parser.parse_args()

+ # --- Configuration (populated from CLI) ---
+ LOAD_BALANCER_HOST = "0.0.0.0"
+ LOAD_BALANCER_PORT = 8008  # Will be overwritten by CLI
+ STATS_PORT = 8009  # Will be overwritten by CLI
+ BACKEND_HOST = "localhost"  # Will be overwritten by CLI
+ BACKEND_PORTS = []  # Will be overwritten by CLI
  STATUS_PRINT_INTERVAL = 5
+ HEALTH_CHECK_TIMEOUT = 2
  BUFFER_SIZE = 4096

  # --- Global Shared State ---
  available_servers = []
  connection_counts = defaultdict(int)
  state_lock = asyncio.Lock()
+ start_time = None
+ total_connections_served = 0
+ current_active_connections = 0
+
+
+ # --- Terminal Utilities ---
+ def clear_terminal():
+     """Clear terminal screen with cross-platform support."""
+     if os.name == 'nt':  # Windows
+         os.system('cls')
+     else:  # Unix/Linux/MacOS
+         os.system('clear')
+
+
+ def get_terminal_size():
+     """Get terminal dimensions."""
+     try:
+         columns, rows = os.get_terminal_size()
+         return columns, rows
+     except OSError:
+         return 80, 24  # Default fallback
+
+
+ def format_uptime(start_time):
+     """Format uptime in a human-readable way."""
+     if not start_time:
+         return "Unknown"
+
+     uptime_seconds = time.time() - start_time
+     hours = int(uptime_seconds // 3600)
+     minutes = int((uptime_seconds % 3600) // 60)
+     seconds = int(uptime_seconds % 60)
+
+     if hours > 0:
+         return f"{hours}h {minutes}m {seconds}s"
+     elif minutes > 0:
+         return f"{minutes}m {seconds}s"
+     else:
+         return f"{seconds}s"
+
+
+ def print_banner():
+     """Print a professional startup banner."""
+     columns, _ = get_terminal_size()
+     banner_width = min(columns - 4, 80)
+
+     print("=" * banner_width)
+     print(f"{'🚀 vLLM Load Balancer':^{banner_width}}")
+     print(f"{'High-Performance Async TCP/HTTP Load Balancer':^{banner_width}}")
+     print("=" * banner_width)
+     print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+     print(f"Load Balancer Port: {LOAD_BALANCER_PORT}")
+     print(f"Backend Host: {BACKEND_HOST}")
+     print(f"Backend Ports: {', '.join(map(str, BACKEND_PORTS))}")
+     print(f"Health Check Interval: 10s (Timeout: {HEALTH_CHECK_TIMEOUT}s)")
+     print(f"Status Update Interval: {STATUS_PRINT_INTERVAL}s")
+     print("=" * banner_width)
+     print()
+
+
+ # --- ANSI Color Codes ---
+ class Colors:
+     RESET = '\033[0m'
+     BOLD = '\033[1m'
+     DIM = '\033[2m'
+
+     # Foreground colors
+     BLACK = '\033[30m'
+     RED = '\033[31m'
+     GREEN = '\033[32m'
+     YELLOW = '\033[33m'
+     BLUE = '\033[34m'
+     MAGENTA = '\033[35m'
+     CYAN = '\033[36m'
+     WHITE = '\033[37m'
+
+     # Bright colors
+     BRIGHT_BLACK = '\033[90m'
+     BRIGHT_RED = '\033[91m'
+     BRIGHT_GREEN = '\033[92m'
+     BRIGHT_YELLOW = '\033[93m'
+     BRIGHT_BLUE = '\033[94m'
+     BRIGHT_MAGENTA = '\033[95m'
+     BRIGHT_CYAN = '\033[96m'
+     BRIGHT_WHITE = '\033[97m'
+
+     # Background colors
+     BG_RED = '\033[41m'
+     BG_GREEN = '\033[42m'
+     BG_YELLOW = '\033[43m'
+     BG_BLUE = '\033[44m'


  # --- Helper Functions --- (relay_data and safe_close_writer remain the same)
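The module now takes its entire configuration from the CLI instead of hard-coded `SCAN_*` constants; `run_load_balancer()` (later in this diff) copies the parsed arguments into the module globals. The `--ports` string is split into integers exactly as in that function, and the stats dashboard defaults to the proxy port plus one when `--stats-port` is omitted:

```python
# Mirrors the parsing in run_load_balancer(); values are examples.
ports_arg = "8140,8150,8160"
backend_ports = [int(p.strip()) for p in ports_arg.split(",") if p.strip()]
print(backend_ports)  # [8140, 8150, 8160]

proxy_port = 8001
stats_port = proxy_port + 1  # default when --stats-port is not given
```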
@@ -145,21 +224,17 @@ async def safe_close_writer(writer):
          logger.debug(f"Error closing writer in context manager: {e}")


- # --- Server Scanning and Health Check (Modified) ---
-

+ # --- Health Check for Provided Ports ---
  async def check_server_health(session, host, port):
      """Performs an HTTP GET request to the /health endpoint."""
      url = f"http://{host}:{port}/health"
      try:
-         # Use the provided aiohttp session to make the GET request
          async with session.get(url, timeout=HEALTH_CHECK_TIMEOUT) as response:
-             # Check for a successful status code (2xx range)
              if 200 <= response.status < 300:
                  logger.debug(
                      f"[{LOAD_BALANCER_PORT=}] Health check success for {url} (Status: {response.status})"
                  )
-                 # Ensure the connection is released back to the pool
                  await response.release()
                  return True
              else:
@@ -172,61 +247,51 @@ async def check_server_health(session, host, port):
          logger.debug(f"Health check HTTP request timeout for {url}")
          return False
      except aiohttp.ClientConnectorError as e:
-         # Handles connection refused, DNS errors etc. - server likely down
          logger.debug(f"Health check connection error for {url}: {e}")
          return False
      except aiohttp.ClientError as e:
-         # Catch other potential client errors (e.g., invalid URL structure, too many redirects)
          logger.warning(f"Health check client error for {url}: {e}")
          return False
      except Exception as e:
-         # Catch any other unexpected errors during the check
          logger.error(f"Unexpected health check error for {url}: {e}")
          return False


  async def scan_and_update_servers():
-     """Periodically scans ports using HTTP /health check and updates available servers."""
+     """Periodically checks the provided backend ports and updates available servers."""
      global available_servers
      logger.debug(
-         f"Starting server scan task (HTTP GET /health on Ports {SCAN_PORT_START}-{SCAN_PORT_END} every {SCAN_INTERVAL}s)"
+         f"Starting server scan task (HTTP GET /health on ports {BACKEND_PORTS} every 10s)"
      )
      while True:
          try:
              current_scan_results = []
              scan_tasks = []
-             ports_to_scan = range(SCAN_PORT_START, SCAN_PORT_END + 1)
+             ports_to_scan = BACKEND_PORTS

-             # Create ONE aiohttp session for all checks within this scan cycle for efficiency
              async with aiohttp.ClientSession() as session:
-                 # Create health check tasks for all ports, passing the shared session
                  for port in ports_to_scan:
                      task = asyncio.create_task(
-                         check_server_health(session, SCAN_TARGET_HOST, port)
+                         check_server_health(session, BACKEND_HOST, port)
                      )
                      scan_tasks.append((task, port))

-                 # Wait for all health checks to complete
-                 # return_exceptions=True prevents gather from stopping if one check fails
                  await asyncio.gather(
                      *(task for task, port in scan_tasks), return_exceptions=True
                  )

-             # Collect results from completed tasks
              for task, port in scan_tasks:
                  try:
-                     # Check if task finished, wasn't cancelled, and returned True
                      if (
                          task.done()
                          and not task.cancelled()
                          and task.result() is True
                      ):
-                         current_scan_results.append((SCAN_TARGET_HOST, port))
+                         current_scan_results.append((BACKEND_HOST, port))
                  except Exception as e:
                      logger.error(
                          f"Error retrieving health check result for port {port}: {e}"
                      )
-             # --- Update Shared State (Locked) ---
              async with state_lock:
                  previous_servers = set(available_servers)
                  current_set = set(current_scan_results)
@@ -263,9 +328,9 @@ async def scan_and_update_servers():
                  break
          except Exception as e:
              logger.error(f"Error in scan_and_update_servers loop: {e}")
-             await asyncio.sleep(SCAN_INTERVAL / 2)  # Avoid tight loop on error
+             await asyncio.sleep(5)  # Avoid tight loop on error

-         await asyncio.sleep(SCAN_INTERVAL)
+         await asyncio.sleep(10)


  # --- Core Load Balancer Logic (handle_client remains the same) ---
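The scan loop now probes only the explicitly configured ports rather than a whole port range, which keeps the 10-second health sweep cheap. A self-contained sketch of the same probe (GET `/health`, 2 s timeout, any 2xx status means healthy); host and port here are examples:

```python
import asyncio

import aiohttp


async def is_healthy(host: str, port: int, timeout: float = 2.0) -> bool:
    url = f"http://{host}:{port}/health"
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(
                url, timeout=aiohttp.ClientTimeout(total=timeout)
            ) as resp:
                return 200 <= resp.status < 300
    except (aiohttp.ClientError, asyncio.TimeoutError):
        return False


print(asyncio.run(is_healthy("localhost", 8140)))
```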
@@ -279,6 +344,7 @@ async def handle_client(client_reader, client_writer):
      backend_writer = None
      server_selected = False

+     global total_connections_served, current_active_connections
      try:
          # --- Select Backend Server (Least Connections from Available) ---
          selected_server = None
@@ -310,6 +376,12 @@ async def handle_client(client_reader, client_writer):
                  connection_counts[selected_server] += 1
                  backend_server = selected_server
                  server_selected = True
+
+                 # Update global statistics
+                 global total_connections_served, current_active_connections
+                 total_connections_served += 1
+                 current_active_connections += 1
+
                  logger.info(
                      f"Routing {client_addr} to {backend_server} (Current connections: {connection_counts[backend_server]})"
                  )
@@ -408,6 +480,7 @@ async def handle_client(client_reader, client_writer):
          if backend_server in connection_counts:
              if connection_counts[backend_server] > 0:
                  connection_counts[backend_server] -= 1
+                 current_active_connections = max(0, current_active_connections - 1)
                  logger.info(
                      f"Connection closed for {client_addr}. Backend {backend_server} connections: {connection_counts[backend_server]}"
                  )
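These counters feed both the least-connections routing named in `handle_client` and the new dashboard statistics. The selection step itself is outside the hunks shown, but a least-connections pick over the shared state would look like this sketch:

```python
from collections import defaultdict

# Shapes mirror the module globals: (host, port) tuples and per-server counts.
available_servers = [("localhost", 8140), ("localhost", 8150)]
connection_counts = defaultdict(int, {("localhost", 8140): 3})

selected = min(available_servers, key=lambda s: connection_counts[s])
print(selected)  # ('localhost', 8150): the server with the fewest connections
```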
@@ -418,92 +491,392 @@ async def handle_client(client_reader, client_writer):
              connection_counts[backend_server] = 0


- # --- Status Reporting Task (print_status_periodically remains the same) ---
+
+ # --- Status Reporting Task ---
  async def print_status_periodically():
-     """Periodically prints the connection status based on available servers."""
+     """Periodically displays a professional real-time status dashboard."""
      while True:
          await asyncio.sleep(STATUS_PRINT_INTERVAL)
-         async with state_lock:
-             headers = ["Backend Server", "Host", "Port", "Active Connections", "Status"]
-             table_data = []
-             total_connections = 0
-             current_available = available_servers[:]
-             current_counts = connection_counts.copy()
-
-             if not current_available:
-                 # clear terminal and print status
-                 print("\033[H\033[J", end="")  # Clear terminal
-                 print("\n----- Load Balancer Status -----")
-                 print("No backend servers currently available (failed /health check).")
-                 print("------------------------------\n")
-                 continue
-
-             for server in current_available:
-                 host, port = server
-                 count = current_counts.get(server, 0)
-                 table_data.append([f"{host}:{port}", host, port, count, "Available"])
-                 total_connections += count
-
-             table_data.sort(key=lambda row: (row[1], row[2]))
+         await display_status_dashboard()
+
+
+ async def display_status_dashboard():
+     """Display a professional real-time status dashboard."""
+     global current_active_connections, total_connections_served
+
+     async with state_lock:
+         current_available = set(available_servers)
+         current_counts = connection_counts.copy()
+
+     # Clear terminal for fresh display
+     clear_terminal()
+
+     # Get terminal dimensions for responsive layout
+     columns, rows = get_terminal_size()
+     dash_width = min(columns - 4, 100)
+
+     # Header with title and timestamp
+     current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+     print(f"{Colors.BOLD}{Colors.BRIGHT_CYAN}{'=' * dash_width}{Colors.RESET}")
+     print(f"{Colors.BOLD}{Colors.BRIGHT_CYAN}{'🚀 vLLM Load Balancer Dashboard':^{dash_width}}{Colors.RESET}")
+     print(f"{Colors.BRIGHT_CYAN}{'Real-time Status & Monitoring':^{dash_width}}{Colors.RESET}")
+     print(f"{Colors.BOLD}{Colors.BRIGHT_CYAN}{'=' * dash_width}{Colors.RESET}")
+     print()
+
+     # System Information Section
+     uptime = format_uptime(start_time)
+     print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}📊 System Information{Colors.RESET}")
+     print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 2)}{Colors.RESET}")
+     print(f"{Colors.YELLOW}🕐 Current Time:{Colors.RESET} {current_time}")
+     print(f"{Colors.YELLOW}⏱️ Uptime:{Colors.RESET} {uptime}")
+     print(f"{Colors.YELLOW}🌐 Load Balancer:{Colors.RESET} {LOAD_BALANCER_HOST}:{LOAD_BALANCER_PORT}")
+     print(f"{Colors.YELLOW}🎯 Backend Host:{Colors.RESET} {BACKEND_HOST}")
+     print(f"{Colors.YELLOW}🔧 Configured Ports:{Colors.RESET} {', '.join(map(str, BACKEND_PORTS))}")
+     print()
+
+     # Connection Statistics Section
+     print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}📈 Connection Statistics{Colors.RESET}")
+     print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 2)}{Colors.RESET}")
+     print(f"{Colors.GREEN}📊 Total Connections Served:{Colors.RESET} {total_connections_served:,}")
+     print(f"{Colors.GREEN}🔗 Currently Active:{Colors.RESET} {current_active_connections}")
+     print(f"{Colors.GREEN}⚡ Health Check Timeout:{Colors.RESET} {HEALTH_CHECK_TIMEOUT}s")
+     print(f"{Colors.GREEN}🔄 Status Update Interval:{Colors.RESET} {STATUS_PRINT_INTERVAL}s")
+     print()
+
+     # Backend Servers Status
+     print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}Backend Servers Status{Colors.RESET}")
+     print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 2)}{Colors.RESET}")
+
+     headers = [
+         f"{Colors.BOLD}Server{Colors.RESET}",
+         f"{Colors.BOLD}Host{Colors.RESET}",
+         f"{Colors.BOLD}Port{Colors.RESET}",
+         f"{Colors.BOLD}Active Conn.{Colors.RESET}",
+         f"{Colors.BOLD}Status{Colors.RESET}"
+     ]
+
+     table_data = []
+     total_backend_connections = 0
+
+     for port in BACKEND_PORTS:
+         server = (BACKEND_HOST, port)
+         is_online = server in current_available
+         count = current_counts.get(server, 0) if is_online else 0
+         total_backend_connections += count
+
+         # Color-code connection count based on load
+         if count == 0:
+             conn_display = f"{Colors.DIM}0{Colors.RESET}"
+         elif count < 5:
+             conn_display = f"{Colors.GREEN}{count}{Colors.RESET}"
+         elif count < 10:
+             conn_display = f"{Colors.YELLOW}{count}{Colors.RESET}"
+         else:
+             conn_display = f"{Colors.RED}{count}{Colors.RESET}"
+
+         status_display = (
+             f"{Colors.BG_GREEN}{Colors.BLACK} ONLINE {Colors.RESET}"
+             if is_online
+             else f"{Colors.BG_RED}{Colors.WHITE} OFFLINE {Colors.RESET}"
+         )
+
+         table_data.append([
+             f"{Colors.CYAN}{BACKEND_HOST}:{port}{Colors.RESET}",
+             BACKEND_HOST,
+             str(port),
+             conn_display,
+             status_display
+         ])

-     try:
-         table = tabulate(table_data, headers=headers, tablefmt="grid")
-         print("\n----- Load Balancer Status -----")
-         print(
-             f"Scanning Ports: {SCAN_PORT_START}-{SCAN_PORT_END} on {SCAN_TARGET_HOST} (using /health endpoint)"
-         )
-         print(
-             f"Scan Interval: {SCAN_INTERVAL}s | Health Check Timeout: {HEALTH_CHECK_TIMEOUT}s"
-         )
-         print(table)
-         print(
-             f"Total Active Connections (on available servers): {total_connections}"
-         )
-         print("------------------------------\n")
-     except Exception as e:
-         logger.error(f"Error printing status table: {e}")
-
-
- # --- Main Execution (main remains the same) ---
- async def main():
-     scan_task = asyncio.create_task(scan_and_update_servers())
-     status_task = asyncio.create_task(print_status_periodically())
+     try:
+         table = tabulate(table_data, headers=headers, tablefmt="fancy_grid")
+         print(table)
+         print()
+
+         # Summary metrics
+         online_count = sum(1 for port in BACKEND_PORTS if (BACKEND_HOST, port) in current_available)
+         avg_connections = total_backend_connections / online_count if online_count else 0
+         print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}📋 Summary{Colors.RESET}")
+         print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 4)}{Colors.RESET}")
+         print(f"{Colors.MAGENTA}🟢 Available Servers:{Colors.RESET} {online_count} / {len(BACKEND_PORTS)}")
+         print(f"{Colors.MAGENTA}📊 Total Backend Connections:{Colors.RESET} {total_backend_connections}")
+         print(f"{Colors.MAGENTA}📈 Average Load per Online Server:{Colors.RESET} {avg_connections:.1f}")

-     server = await asyncio.start_server(
-         handle_client, LOAD_BALANCER_HOST, LOAD_BALANCER_PORT
+     except Exception as e:
+         logger.error(f"Error displaying status table: {e}")
+         print(f"{Colors.RED}Error displaying server table: {e}{Colors.RESET}")
+
+     # Footer with refresh info
+     print()
+     print(f"{Colors.BRIGHT_BLACK}{'─' * dash_width}{Colors.RESET}")
+     print(f"{Colors.DIM}🔄 Auto-refresh every {STATUS_PRINT_INTERVAL}s | Press Ctrl+C to stop{Colors.RESET}")
+     print(f"{Colors.BRIGHT_BLACK}{'─' * dash_width}{Colors.RESET}")
+     print()
+
+
+
+ # --- HTTP Stats Server ---
+ from aiohttp import web
+
+ async def stats_json(request):
+     async with state_lock:
+         # Build a list of all configured servers, with status and connections
+         all_servers = []
+         available_set = set(available_servers)
+         for port in BACKEND_PORTS:
+             server = (BACKEND_HOST, port)
+             is_online = server in available_set
+             all_servers.append({
+                 "host": BACKEND_HOST,
+                 "port": port,
+                 "active_connections": connection_counts.get(server, 0) if is_online else 0,
+                 "status": "ONLINE" if is_online else "OFFLINE",
+             })
+         stats = {
+             "time": datetime.now().isoformat(),
+             "uptime": format_uptime(start_time),
+             "load_balancer_host": LOAD_BALANCER_HOST,
+             "load_balancer_port": LOAD_BALANCER_PORT,
+             "backend_host": BACKEND_HOST,
+             "backend_ports": BACKEND_PORTS,
+             "total_connections_served": total_connections_served,
+             "current_active_connections": current_active_connections,
+             "health_check_timeout": HEALTH_CHECK_TIMEOUT,
+             "status_update_interval": STATUS_PRINT_INTERVAL,
+             "servers": all_servers,
+         }
+     return web.json_response(stats)
+
+ async def stats_page(request):
+     # High-quality HTML dashboard with auto-refresh and charts
+     return web.Response(
+         content_type="text/html",
+         text="""
+ <!DOCTYPE html>
+ <html lang='en'>
+ <head>
+ <meta charset='UTF-8'>
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+ <title>vLLM Load Balancer Stats</title>
+ <link rel='preconnect' href='https://fonts.googleapis.com'>
+ <link rel='preconnect' href='https://fonts.gstatic.com' crossorigin>
+ <link href='https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap' rel='stylesheet'>
+ <script src='https://cdn.jsdelivr.net/npm/chart.js'></script>
+ <style>
+ body { font-family: 'Roboto', sans-serif; background: #181c20; color: #f3f3f3; margin: 0; }
+ .container { max-width: 900px; margin: 32px auto; background: #23272b; border-radius: 12px; box-shadow: 0 2px 16px #0008; padding: 32px; }
+ h1 { text-align: center; font-size: 2.2em; margin-bottom: 0.2em; }
+ .subtitle { text-align: center; color: #7fd7ff; margin-bottom: 1.5em; }
+ .stats-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 24px; margin-bottom: 2em; }
+ .stat-box { background: #20232a; border-radius: 8px; padding: 18px 24px; text-align: center; }
+ .stat-label { color: #7fd7ff; font-size: 1.1em; margin-bottom: 0.2em; }
+ .stat-value { font-size: 2em; font-weight: bold; }
+ .server-table { width: 100%; border-collapse: collapse; margin-top: 1.5em; }
+ .server-table th, .server-table td { padding: 10px 8px; text-align: center; }
+ .server-table th { background: #2c313a; color: #7fd7ff; }
+ .server-table tr:nth-child(even) { background: #23272b; }
+ .server-table tr:nth-child(odd) { background: #1b1e22; }
+ .status-online { color: #00e676; font-weight: bold; }
+ .status-offline { color: #ff5252; font-weight: bold; }
+ .chart-container { background: #20232a; border-radius: 8px; padding: 18px 24px; margin-top: 2em; }
+ @media (max-width: 700px) { .stats-grid { grid-template-columns: 1fr; } }
+ </style>
+ </head>
+ <body>
+ <div class='container'>
+ <h1>🚀 vLLM Load Balancer</h1>
+ <div class='subtitle'>Live Stats Dashboard</div>
+ <div class='stats-grid' id='statsGrid'>
+ <!-- Stats will be injected here -->
+ </div>
+ <div class='chart-container'>
+ <canvas id='connChart' height='80'></canvas>
+ </div>
+ <table class='server-table' id='serverTable'>
+ <thead>
+ <tr>
+ <th>Backend Server</th>
+ <th>Host</th>
+ <th>Port</th>
+ <th>Active Connections</th>
+ <th>Status</th>
+ </tr>
+ </thead>
+ <tbody></tbody>
+ </table>
+ <div style='text-align:center; margin-top:2em; color:#888;'>
+ <span id='lastUpdate'></span> | Auto-refreshing every 1s
+ </div>
+ </div>
+ <script>
+ let connChart;
+ let connHistory = [];
+ let timeHistory = [];
+ async function fetchStats() {
+     const res = await fetch('/stats.json');
+     return await res.json();
+ }
+ function updateStats(stats) {
+     document.getElementById('lastUpdate').textContent = 'Last update: ' + new Date(stats.time).toLocaleTimeString();
+     // Stats grid
+     document.getElementById('statsGrid').innerHTML = `
+         <div class='stat-box'><div class='stat-label'>Uptime</div><div class='stat-value'>${stats.uptime}</div></div>
+         <div class='stat-box'><div class='stat-label'>Total Connections</div><div class='stat-value'>${stats.total_connections_served}</div></div>
+         <div class='stat-box'><div class='stat-label'>Active Connections</div><div class='stat-value'>${stats.current_active_connections}</div></div>
+         <div class='stat-box'><div class='stat-label'>Configured Servers</div><div class='stat-value'>${stats.servers.length}</div></div>
+     `;
+     // Server table
+     let tbody = document.querySelector('#serverTable tbody');
+     tbody.innerHTML = '';
+     for (const s of stats.servers) {
+         tbody.innerHTML += `<tr>
+             <td>${s.host}:${s.port}</td>
+             <td>${s.host}</td>
+             <td>${s.port}</td>
+             <td>${s.active_connections}</td>
+             <td class='${s.status === "ONLINE" ? "status-online" : "status-offline"}'>${s.status}</td>
+         </tr>`;
+     }
+     // Chart (only count online servers for active connections)
+     connHistory.push(stats.current_active_connections);
+     timeHistory.push(new Date(stats.time).toLocaleTimeString());
+     if (connHistory.length > 60) { connHistory.shift(); timeHistory.shift(); }
+     if (!connChart) {
+         connChart = new Chart(document.getElementById('connChart').getContext('2d'), {
+             type: 'line',
+             data: {
+                 labels: timeHistory,
+                 datasets: [{
+                     label: 'Active Connections',
+                     data: connHistory,
+                     borderColor: '#7fd7ff',
+                     backgroundColor: 'rgba(127,215,255,0.1)',
+                     tension: 0.3,
+                     fill: true,
+                     pointRadius: 0
+                 }]
+             },
+             options: {
+                 plugins: { legend: { display: false } },
+                 scales: {
+                     x: { display: false },
+                     y: { beginAtZero: true, grid: { color: '#333' }, ticks: { color: '#7fd7ff' } }
+                 },
+                 animation: false,
+                 responsive: true,
+                 maintainAspectRatio: false
+             }
+         });
+     } else {
+         connChart.data.labels = timeHistory;
+         connChart.data.datasets[0].data = connHistory;
+         connChart.update();
+     }
+ }
+ async function refresh() {
+     try {
+         const stats = await fetchStats();
+         updateStats(stats);
+     } catch (e) {
+         document.getElementById('lastUpdate').textContent = 'Error fetching stats';
+     }
+     setTimeout(refresh, 1000);
+ }
+ refresh();
+ </script>
+ </body>
+ </html>
+ """
      )

-     addrs = ", ".join(str(sock.getsockname()) for sock in server.sockets)
-     logger.info(f"Load balancer serving on {addrs}")
-     logger.info(
-         f"Dynamically discovering servers via HTTP /health on {SCAN_TARGET_HOST}:{SCAN_PORT_START}-{SCAN_PORT_END}"
-     )
+ async def start_stats_server(loop):
+     app = web.Application()
+     app.router.add_get('/stats', stats_page)
+     app.router.add_get('/stats.json', stats_json)
+     runner = web.AppRunner(app)
+     await runner.setup()
+     site = web.TCPSite(runner, LOAD_BALANCER_HOST, STATS_PORT)
+     await site.start()
+     logger.info(f"Stats HTTP server running at http://{LOAD_BALANCER_HOST}:{STATS_PORT}/stats")

-     async with server:
-         try:
-             await server.serve_forever()
-         except asyncio.CancelledError:
-             logger.info("Load balancer server shutting down.")
-         finally:
-             logger.info("Cancelling background tasks...")
-             scan_task.cancel()
-             status_task.cancel()
+ async def main():
+     global start_time
+     start_time = time.time()
+     clear_terminal()
+     print_banner()
+
+     # Start background tasks
+     scan_task = asyncio.create_task(scan_and_update_servers())
+     status_task = asyncio.create_task(print_status_periodically())
+
+     # Start HTTP stats server (on STATS_PORT)
+     loop = asyncio.get_running_loop()
+     await start_stats_server(loop)
+
+     # Start TCP server (on LOAD_BALANCER_PORT)
+     server = await asyncio.start_server(
+         handle_client, LOAD_BALANCER_HOST, LOAD_BALANCER_PORT
+     )
+
+     addrs = ", ".join(str(sock.getsockname()) for sock in server.sockets)
+     logger.info(f"Load balancer serving on {addrs}")
+     logger.info(
+         f"Configured backend ports: {BACKEND_PORTS} on host {BACKEND_HOST}"
+     )
+     print(f"{Colors.BRIGHT_GREEN}✅ Load balancer started successfully!{Colors.RESET}")
+     print(f"{Colors.BRIGHT_GREEN}🌐 Proxy listening on: {addrs}{Colors.RESET}")
+     print(f"{Colors.BRIGHT_GREEN}📊 Stats dashboard: http://localhost:{STATS_PORT}/stats{Colors.RESET}")
+     print(f"{Colors.YELLOW}🔍 Scanning backend servers...{Colors.RESET}")
+     print()
+     await asyncio.sleep(2)
+
+     async with server:
          try:
-             await asyncio.gather(scan_task, status_task, return_exceptions=True)
+             await server.serve_forever()
          except asyncio.CancelledError:
-             pass
-         logger.info("Background tasks finished.")
-
+             print(f"\n{Colors.YELLOW}🛑 Shutdown signal received...{Colors.RESET}")
+             logger.info("Load balancer server shutting down.")
+         except KeyboardInterrupt:
+             print(f"\n{Colors.YELLOW}🛑 Shutdown requested by user...{Colors.RESET}")
+             logger.info("Shutdown requested by user.")
+         finally:
+             print(f"{Colors.CYAN}🔄 Cleaning up background tasks...{Colors.RESET}")
+             logger.info("Cancelling background tasks...")
+             scan_task.cancel()
+             status_task.cancel()
+             try:
+                 await asyncio.gather(scan_task, status_task, return_exceptions=True)
+             except asyncio.CancelledError:
+                 pass
+             print(f"{Colors.BRIGHT_GREEN}✅ Shutdown complete. Goodbye!{Colors.RESET}")
+             logger.info("Background tasks finished.")

  def run_load_balancer():
-     # Make sure to install aiohttp: pip install aiohttp
+     global LOAD_BALANCER_PORT, BACKEND_PORTS, BACKEND_HOST, STATUS_PRINT_INTERVAL, HEALTH_CHECK_TIMEOUT, STATS_PORT
+     args = parse_args()
+     LOAD_BALANCER_PORT = args.port
+     BACKEND_HOST = args.host
+     BACKEND_PORTS = [int(p.strip()) for p in args.ports.split(",") if p.strip()]
+     STATUS_PRINT_INTERVAL = args.status_interval
+     HEALTH_CHECK_TIMEOUT = args.health_timeout
+     if args.stats_port is not None:
+         STATS_PORT = args.stats_port
+     else:
+         STATS_PORT = LOAD_BALANCER_PORT + 1
+     if not BACKEND_PORTS:
+         print(f"{Colors.BG_RED}{Colors.WHITE} ❌ ERROR {Colors.RESET}")
+         print(f"{Colors.RED}No backend ports specified. Use --ports 8140,8150 ...{Colors.RESET}")
+         logger.critical("No backend ports specified. Use --ports 8140,8150 ...")
+         sys.exit(1)
      try:
          asyncio.run(main())
      except KeyboardInterrupt:
-         logger.info("Shutdown requested by user.")
+         # This is handled in the main() function now
+         pass
      except Exception as e:
+         print(f"\n{Colors.BG_RED}{Colors.WHITE} ❌ CRITICAL ERROR {Colors.RESET}")
+         print(f"{Colors.RED}Critical error in main execution: {e}{Colors.RESET}")
          logger.critical(f"Critical error in main execution: {e}")

-
  if __name__ == "__main__":
      run_load_balancer()
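The new `/stats.json` endpoint exposes the same data as the terminal dashboard, which makes the balancer easy to scrape. An example consumer using only the standard library; the URL assumes the default stats port of proxy port + 1 (here a proxy on 8001, so stats on 8002):

```python
import json
import urllib.request

with urllib.request.urlopen("http://localhost:8002/stats.json", timeout=5) as resp:
    stats = json.load(resp)

print(stats["current_active_connections"], "active /",
      stats["total_connections_served"], "served")
for s in stats["servers"]:
    print(f"{s['host']}:{s['port']} -> {s['status']} ({s['active_connections']})")
```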
speedy_utils-1.1.0.dist-info/METADATA → speedy_utils-1.1.3.dist-info/METADATA RENAMED

@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: speedy-utils
- Version: 1.1.0
+ Version: 1.1.3
  Summary: Fast and easy-to-use package for data science
  Author: AnhVTH
  Author-email: anhvth.226@gmail.com
speedy_utils-1.1.0.dist-info/RECORD → speedy_utils-1.1.3.dist-info/RECORD RENAMED

@@ -5,13 +5,13 @@ llm_utils/chat_format/transform.py,sha256=8TZhvUS5DrjUeMNtDIuWY54B_QZ7jjpXEL9c8F
  llm_utils/chat_format/utils.py,sha256=xTxN4HrLHcRO2PfCTR43nH1M5zCa7v0kTTdzAcGkZg0,1229
  llm_utils/group_messages.py,sha256=8CU9nKOja3xeuhdrX5CvYVveSqSKb2zQ0eeNzA88aTQ,3621
  llm_utils/lm/__init__.py,sha256=rX36_MsnekM5GHwWS56XELbm4W5x2TDwnPERDTfo0eU,194
- llm_utils/lm/async_lm.py,sha256=kiWEecrkCTTQFlQj5JiHNziFeLOF1-7G_2xC2Dra1bw,35806
+ llm_utils/lm/async_lm.py,sha256=eTyI9x4iZc4ZhYdwNadTYap5HgBJygiV_EBDZ-Og1cQ,34357
  llm_utils/lm/chat_html.py,sha256=FkGo0Dv_nAHYBMZzXfMu_bGQKaCx302goh3XaT-_ETc,8674
  llm_utils/lm/lm_json.py,sha256=fMt42phzFV2f6ulrtWcDXsWHi8WcG7gGkCzpIq8VSSM,1975
  llm_utils/lm/sync_lm.py,sha256=ANw_m5KiWcRwwoeQ5no6dzPFLc6j9o2oEcJtkMKqrn8,34640
  llm_utils/lm/utils.py,sha256=gUejbVZPYg97g4ftYEptYN52WhH3TAKOFW81sjLvi08,4585
  llm_utils/scripts/README.md,sha256=yuOLnLa2od2jp4wVy3rV0rESeiV3o8zol5MNMsZx0DY,999
- llm_utils/scripts/vllm_load_balancer.py,sha256=GjMdoZrdT9cSLos0qSdkLg2dwZgW1enAMsD3aTZAfNs,20845
+ llm_utils/scripts/vllm_load_balancer.py,sha256=zz5aTaYwy5tYrv2RIhrizrGP-PnPAohgrl9kQvvJywA,35091
  llm_utils/scripts/vllm_serve.py,sha256=4NaqpVs7LBvxtvTCMPsNCAOfqiWkKRttxWMmWY7SitA,14729
  speedy_utils/__init__.py,sha256=YCpiReW22zG4KkQXQe6V9BQ8bn7PtiXolOaW_iL8T4M,5734
  speedy_utils/all.py,sha256=t-HKzDmhF1MTFnmq7xRnPs5nFG_aZaLH9Ua0RM6nQ9Y,4855
@@ -31,7 +31,7 @@ speedy_utils/multi_worker/thread.py,sha256=u_hTwXh7_FciMa5EukdEA1fDCY_vUC4moDceB
  speedy_utils/scripts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  speedy_utils/scripts/mpython.py,sha256=73PHm1jqbCt2APN4xuNjD0VDKwzOj4EZsViEMQiZU2g,3853
  speedy_utils/scripts/openapi_client_codegen.py,sha256=f2125S_q0PILgH5dyzoKRz7pIvNEjCkzpi4Q4pPFRZE,9683
- speedy_utils-1.1.0.dist-info/METADATA,sha256=h1Alzm4q92GSiw5GNZWn6d8sHaSJS4X8RTMXStjkqHY,7441
- speedy_utils-1.1.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- speedy_utils-1.1.0.dist-info/entry_points.txt,sha256=T1t85jwx8fK6m5msdkBGIXH5R5Kd0zSL0S6erXERPzg,237
- speedy_utils-1.1.0.dist-info/RECORD,,
+ speedy_utils-1.1.3.dist-info/METADATA,sha256=zTlCW23Gcdxio26wZ9L5FeWUsrvg5NOkGA0TPFQWRI8,7441
+ speedy_utils-1.1.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ speedy_utils-1.1.3.dist-info/entry_points.txt,sha256=T1t85jwx8fK6m5msdkBGIXH5R5Kd0zSL0S6erXERPzg,237
+ speedy_utils-1.1.3.dist-info/RECORD,,