speedy-utils 1.1.0__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_utils/lm/async_lm.py +63 -86
- llm_utils/scripts/vllm_load_balancer.py +545 -172
- {speedy_utils-1.1.0.dist-info → speedy_utils-1.1.3.dist-info}/METADATA +1 -1
- {speedy_utils-1.1.0.dist-info → speedy_utils-1.1.3.dist-info}/RECORD +6 -6
- {speedy_utils-1.1.0.dist-info → speedy_utils-1.1.3.dist-info}/WHEEL +0 -0
- {speedy_utils-1.1.0.dist-info → speedy_utils-1.1.3.dist-info}/entry_points.txt +0 -0
llm_utils/lm/async_lm.py
CHANGED
@@ -1,84 +1,10 @@
-"""
-# ============================================================================= #
-# ASYNCHRONOUS LANGUAGE MODEL WRAPPER WITH CONCURRENT EXECUTION SUPPORT
-# ============================================================================= #
-#
-# Title & Intent:
-# High-performance asynchronous language model interface for concurrent LLM operations
-#
-# High-level Summary:
-# This module provides an async drop-in replacement for the synchronous LM class, designed
-# for high-throughput applications requiring concurrent language model operations. It maintains
-# full API compatibility while adding async/await semantics, connection pooling, and efficient
-# resource management. The AsyncLM class supports batch processing, concurrent request handling,
-# and maintains the same caching and type safety guarantees as the synchronous version.
-#
-# Public API / Data Contracts:
-# • AsyncLM(model, temperature=0.0, max_tokens=2000, host="localhost", port=None, **kwargs) - Async wrapper class
-# • async AsyncLM.__call__(prompt=None, messages=None, response_format=str, cache=None, **kwargs) -> str | BaseModel
-# • async AsyncLM.list_models(port=None) -> List[str] - Enumerate available models
-# • async AsyncLM.count_tokens(messages, model=None) -> int - Token counting utility
-# • async AsyncLM.price(messages, model=None, response_tokens=0) -> float - Cost estimation
-# • AsyncLM.set_model(model_name) -> None - Runtime model switching (sync method)
-# • async AsyncLM.batch_call(requests) -> List[Union[str, BaseModel]] - Concurrent batch processing
-# • TModel = TypeVar("TModel", bound=BaseModel) - Generic type for structured responses
-# • Messages = List[ChatCompletionMessageParam] - Typed message format
-#
-# Invariants / Constraints:
-# • MUST be used within async context (asyncio event loop required)
-# • MUST provide either 'prompt' or 'messages' parameter, but not both
-# • MUST properly await all async method calls
-# • Connection pooling MUST handle concurrent requests efficiently
-# • MUST maintain thread safety across concurrent operations
-# • Rate limit handling MUST use async backoff without blocking event loop
-# • MUST preserve all synchronous LM class behaviors and constraints
-# • Resource cleanup MUST occur on context manager exit or explicit close
-#
-# Usage Example:
-# ```python
-# import asyncio
-# from llm_utils.lm.async_lm import AsyncLM
-# from pydantic import BaseModel
-#
-# class SummaryResponse(BaseModel):
-#     summary: str
-#     key_points: List[str]
-#     confidence: float
-#
-# async def main():
-#     # Single async call
-#     lm = AsyncLM(model="gpt-4o-mini", temperature=0.1)
-#     response = await lm(prompt="Summarize quantum computing")
-#     print(response)
-#
-#     # Concurrent batch processing
-#     texts = ["Text 1 to summarize", "Text 2 to summarize", "Text 3 to summarize"]
-#     tasks = [lm(prompt=f"Summarize: {text}", response_format=SummaryResponse) for text in texts]
-#     summaries = await asyncio.gather(*tasks)
-#
-#     for summary in summaries:
-#         print(f"Summary: {summary.summary}")
-#         print(f"Key points: {summary.key_points}")
-#
-# asyncio.run(main())
-# ```
-#
-# TODO & Future Work:
-# • Add async context manager support for automatic resource cleanup
-# • Implement connection pool size optimization based on usage patterns
-# • Add async streaming response support with async generators
-# • Optimize memory usage for large-scale concurrent operations
-# • Add async rate limiting with priority queuing
-#
-# ============================================================================= #
-"""
 
 import base64
 import hashlib
 import json
 import os
 from abc import ABC
-from functools import lru_cache
+from functools import cache, lru_cache
 from typing import (
     Any,
     Dict,
@@ -110,7 +36,7 @@ from openai.types.chat import (
 )
 from openai.types.model import Model
 from pydantic import BaseModel
-
+from pydantic import ValidationError
 from llm_utils.chat_format.display import get_conversation_one_turn
 
 # --------------------------------------------------------------------------- #
@@ -146,10 +72,13 @@ def _yellow(t):
     return _color(33, t)
 
 
-
+TParsed = TypeVar("TParsed", bound=BaseModel)
+
+
+class ParsedOutput(TypedDict, Generic[TParsed]):
     messages: List
     completion: Any
-    parsed:
+    parsed: TParsed
 
 
 class AsyncLM:
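The hunk above makes ParsedOutput generic over the parsed Pydantic model. A minimal, self-contained sketch of the same pattern (TParsed and ParsedOutput copied from the diff; City is a made-up example model, and generic TypedDicts need Python 3.11+ or typing_extensions on older versions):

```python
from typing import Any, Generic, List, TypedDict, TypeVar  # 3.11+ for generic TypedDict

from pydantic import BaseModel

TParsed = TypeVar("TParsed", bound=BaseModel)


class ParsedOutput(TypedDict, Generic[TParsed]):
    messages: List
    completion: Any
    parsed: TParsed


class City(BaseModel):  # hypothetical example model, not from the package
    name: str
    population: int


# A value typed ParsedOutput[City] lets checkers see `parsed` as City, not BaseModel:
out: ParsedOutput[City] = {
    "messages": [],
    "completion": None,
    "parsed": City(name="Hanoi", population=8_500_000),
}
print(out["parsed"].name)  # checker-verified str access
```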
@@ -460,7 +389,7 @@ class AsyncLM:
     # ------------------------------------------------------------------ #
     async def parse(
         self,
-        response_model: Type[
+        response_model: Type[TParsed],
         instruction: Optional[str] = None,
         prompt: Optional[str] = None,
         messages: Optional[RawMsgs] = None,
@@ -470,7 +399,7 @@
         max_tokens: Optional[int] = None,
         cache: Optional[bool] = True,
         **kwargs,
-    ) -> ParsedOutput
+    ) -> ParsedOutput[TParsed]:
         """Parse response using guided JSON generation."""
         if messages is None:
             assert instruction is not None, "Instruction must be provided."
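With response_model typed as Type[TParsed] and the return annotated ParsedOutput[TParsed], the parsed result's static type follows the model passed in. A hypothetical call-site sketch (the Invoice model and surrounding function are invented for illustration; lm is assumed to be an AsyncLM constructed elsewhere):

```python
from pydantic import BaseModel


class Invoice(BaseModel):  # hypothetical response model
    total: float
    currency: str


async def extract(lm) -> Invoice:  # lm: AsyncLM, assumed constructed elsewhere
    out = await lm.parse(
        response_model=Invoice,
        instruction="Extract the invoice fields.",
        prompt="Total due: 42 EUR",
    )
    return out["parsed"]  # statically Invoice, thanks to ParsedOutput[TParsed]
```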
@@ -513,6 +442,7 @@ class AsyncLM:
 
         use_cache = self.do_cache if cache is None else cache
         cache_key = None
+        completion = None
         if use_cache:
             cache_data = {
                 "messages": messages,
@@ -522,7 +452,7 @@
             }
             cache_key = self._cache_key(cache_data, {}, response_model)
             completion = self._load_cache(cache_key)  # dict
-
+        if not completion:
             completion = await self.client.chat.completions.create(
                 model=self.model,  # type: ignore
                 messages=messages,  # type: ignore
@@ -532,10 +462,12 @@
             completion = completion.model_dump()
         if cache_key:
             self._dump_cache(cache_key, completion)
-
+        assert isinstance(completion, dict), (
+            "Completion must be a dictionary with OpenAI response format."
+        )
         self.last_log = [prompt, messages, completion]
 
-        output = self._parse_complete_output(completion, response_model)
+        output = cast(TParsed, self._parse_complete_output(completion, response_model))
         full_messages = messages + [completion]
         return ParsedOutput(
             messages=full_messages,
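The cache handling above follows a read-through pattern: derive a key from the request, try the cache, fall back to the API, then persist the dict-dumped completion. A simplified stand-alone sketch of that flow (the in-memory dict stands in for the class's _load_cache/_dump_cache helpers, which this diff does not show):

```python
import hashlib
import json

_cache: dict = {}  # stand-in for AsyncLM._load_cache / _dump_cache


def _key(payload: dict) -> str:
    return hashlib.sha256(json.dumps(payload, sort_keys=True).encode()).hexdigest()


async def cached_completion(client, model: str, messages: list) -> dict:
    key = _key({"model": model, "messages": messages})
    completion = _cache.get(key)
    if not completion:  # miss (or falsy cached value): call the API once
        resp = await client.chat.completions.create(model=model, messages=messages)
        completion = resp.model_dump()
        _cache[key] = completion
    return completion
```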
@@ -555,7 +487,49 @@
 
         content = completion["choices"][0]["message"]["content"]
         if not content:
-
+            # Enhanced error for debugging: show input tokens and their count
+
+            # Try to extract tokens from the completion for debugging
+            input_tokens = None
+            try:
+                input_tokens = completion.get('usage', {}).get('prompt_tokens')
+            except Exception:
+                input_tokens = None
+
+            # Try to get the prompt/messages for tokenization
+            prompt = None
+            try:
+                prompt = completion.get('messages') or completion.get('prompt')
+            except Exception:
+                prompt = None
+
+            tokens_preview = ''
+            if prompt is not None:
+                try:
+                    tokenizer = get_tokenizer(self.model)
+                    if isinstance(prompt, list):
+                        prompt_text = '\n'.join(
+                            m.get('content', '') for m in prompt if isinstance(m, dict)
+                        )
+                    else:
+                        prompt_text = str(prompt)
+                    tokens = tokenizer.encode(prompt_text)
+                    n_tokens = len(tokens)
+                    first_100 = tokens[:100]
+                    last_100 = tokens[-100:] if n_tokens > 100 else []
+                    tokens_preview = (
+                        f'\nInput tokens: {n_tokens}'
+                        f'\nFirst 100 tokens: {first_100}'
+                        f'\nLast 100 tokens: {last_100}'
+                    )
+                except Exception as exc:
+                    tokens_preview = f'\n[Tokenization failed: {exc}]'
+
+            raise ValueError(
+                f'Empty content in response.'
+                f'\nInput tokens (if available): {input_tokens}'
+                f'{tokens_preview}'
+            )
 
         try:
             data = json.loads(content)
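The new empty-content branch relies on a get_tokenizer helper defined elsewhere in the module, which this diff does not show. As a rough stand-in for experimentation, tiktoken provides the same encode-and-count behaviour for OpenAI-style models (the fallback encoding choice here is an assumption, not the package's actual logic):

```python
import tiktoken


def get_tokenizer(model: str):
    try:
        return tiktoken.encoding_for_model(model)
    except KeyError:
        return tiktoken.get_encoding("cl100k_base")  # assumed generic fallback


tok = get_tokenizer("gpt-4o-mini")
tokens = tok.encode("Summarize quantum computing")
print(len(tokens), tokens[:100])  # token count plus the preview the error embeds
```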
@@ -737,6 +711,7 @@ async def inspect_word_probs_async(lm, tokenizer, messages):
     """Async version of inspect_word_probs."""
 
     import numpy as np
+
 
     async def compute_word_log_probs(
         tokenizer: Any,
@@ -894,12 +869,14 @@ class AsyncLLMTask(ABC, Generic[InputModelType, OutputModelType]):
     temperature: float = 0.6
     think: bool = False
     add_json_schema: bool = False
+    cache: bool = False
 
     async def __call__(
         self,
         data: BaseModel | dict,
         temperature: float = 0.1,
         cache: bool = False,
+        think: Optional[bool] = None,  # if not None, overrides self.think
     ) -> tuple[OutputModelType, List[Dict[str, Any]]]:
         # Get the input and output model types from the generic parameters
         type_args = getattr(self.__class__, "__orig_bases__", None)
@@ -940,9 +917,9 @@
             instruction=self.__doc__ or "",
             response_model=output_model,
             temperature=temperature or self.temperature,
-            think=self.think,
+            think=think if think is not None else self.think,
             add_json_schema_to_instruction=self.add_json_schema,
-            cache=cache,
+            cache=self.cache or cache,
         )
 
         return (
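Taken together, these two hunks let AsyncLLMTask subclasses declare cache as a class-level default (OR-ed with the per-call flag) and override think per call. A hypothetical subclass sketch (the input/output models are invented for illustration; AsyncLLMTask comes from this module):

```python
from pydantic import BaseModel


class Inp(BaseModel):  # hypothetical input model
    text: str


class Outp(BaseModel):  # hypothetical output model
    summary: str


class Summarize(AsyncLLMTask[Inp, Outp]):  # AsyncLLMTask imported from this module
    """Summarize the given text in one sentence."""
    cache = True   # class default; OR-ed with the per-call cache flag
    think = False  # class default; a non-None per-call think overrides it


# Inside an event loop:
# result, messages = await Summarize()(Inp(text="..."), think=True)
```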
llm_utils/scripts/vllm_load_balancer.py
CHANGED
@@ -1,104 +1,183 @@
-
-
-
-
-
-
-# Production-ready TCP load balancer for vLLM model servers with health checks and connection management
-#
-# High-level Summary:
-# This module implements a high-performance load balancer specifically designed for vLLM model
-# serving infrastructure. It provides intelligent routing across multiple vLLM server instances,
-# continuous health monitoring, automatic failover, and connection pooling. The load balancer
-# uses async TCP proxying to handle concurrent requests efficiently while maintaining session
-# affinity and providing detailed metrics for monitoring and debugging.
-#
-# Public API / Data Contracts:
-# • LOAD_BALANCER_HOST = "0.0.0.0" - Load balancer bind address
-# • LOAD_BALANCER_PORT = 8008 - Load balancer listening port
-# • SCAN_TARGET_HOST = "localhost" - Target server host for health checks
-# • SCAN_PORT_START = 8140, SCAN_PORT_END = 8170 - Port range for server discovery
-# • start_load_balancer() -> None - Main entry point to start the service
-# • scan_for_healthy_servers() -> None - Background health monitoring task
-# • handle_client(reader, writer) -> None - Client connection handler
-# • relay_data(reader, writer, direction) -> None - Bidirectional data relay
-# • get_next_server() -> Optional[Tuple[str, int]] - Round-robin server selection
-#
-# Invariants / Constraints:
-# • MUST continuously monitor server health every SCAN_INTERVAL seconds
-# • MUST handle connection failures gracefully with automatic failover
-# • Health checks MUST complete within HEALTH_CHECK_TIMEOUT seconds
-# • MUST maintain connection counts for load balancing decisions
-# • Server availability MUST be updated atomically using async locks
-# • TCP connections MUST be properly closed on errors or completion
-# • MUST log all connection events and health status changes
-# • Round-robin selection MUST distribute load evenly across healthy servers
-#
-# Usage Example:
-# ```python
-# # Start the load balancer (blocking operation)
-# import asyncio
-# from llm_utils.scripts.vllm_load_balancer import start_load_balancer
-#
-# # Configure environment or modify constants as needed
-# LOAD_BALANCER_HOST = "0.0.0.0"
-# LOAD_BALANCER_PORT = 8008
-# SCAN_TARGET_HOST = "localhost"
-# SCAN_PORT_START = 8140
-# SCAN_PORT_END = 8150
-#
-# # Start the load balancer service
-# asyncio.run(start_load_balancer())
-#
-# # The service will:
-# # 1. Scan for healthy vLLM servers on ports 8140-8150
-# # 2. Accept client connections on port 8008
-# # 3. Route requests to healthy backend servers
-# # 4. Monitor server health continuously
-# # 5. Provide connection statistics
-# ```
-#
-# TODO & Future Work:
-# • Add weighted round-robin based on server capacity metrics
-# • Implement session affinity for stateful model interactions
-# • Add HTTP health check endpoints for better monitoring integration
-# • Support dynamic server registration and deregistration
-# • Add metrics export for Prometheus/Grafana monitoring
-# • Implement graceful shutdown with connection draining
-#
-# ============================================================================= #
-"""
+import argparse
+import sys
+import os
+import time
+from datetime import datetime
+from collections import defaultdict
 
 import asyncio
 import contextlib
 import random
-from collections import defaultdict
 
-import aiohttp
+import aiohttp
 from loguru import logger
 from tabulate import tabulate
 
 from speedy_utils import setup_logger
 
 setup_logger(min_interval=5)
-# --- Configuration ---
-LOAD_BALANCER_HOST = "0.0.0.0"
-LOAD_BALANCER_PORT = 8008
 
-
-
-
-
-
-
+# --- CLI Argument Parsing ---
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="🚀 vLLM Load Balancer - High-Performance Async TCP/HTTP Load Balancer",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python vllm_load_balancer.py 8001 --ports 8140,8150,8160
+  python vllm_load_balancer.py 8080 --ports 8140,8150 --host 192.168.1.100
+  python vllm_load_balancer.py 8001 --ports 8140,8150 --status-interval 3
+
+Features:
+  • Real-time dashboard with color-coded status
+  • Automatic health checks and failover
+  • Least-connections load balancing
+  • Professional terminal interface
+  • Connection statistics and monitoring
+        """
+    )
+    parser.add_argument(
+        "port",
+        type=int,
+        help="Port for the load balancer to listen on (e.g., 8001)",
+    )
+    parser.add_argument(
+        "--ports",
+        type=str,
+        required=True,
+        help="Comma-separated list of backend ports to use (e.g., 8140,8150)",
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="localhost",
+        help="Backend host (default: localhost)",
+    )
+    parser.add_argument(
+        "--status-interval",
+        type=int,
+        default=5,
+        help="Status print interval in seconds (default: 5)",
+    )
+    parser.add_argument(
+        "--health-timeout",
+        type=int,
+        default=2,
+        help="Health check timeout in seconds (default: 2)",
+    )
+    parser.add_argument(
+        "--stats-port",
+        type=int,
+        default=None,
+        help="Port for the HTTP stats dashboard (default: proxy port + 1)",
+    )
+    return parser.parse_args()
 
+# --- Configuration (populated from CLI) ---
+LOAD_BALANCER_HOST = "0.0.0.0"
+LOAD_BALANCER_PORT = 8008  # Will be overwritten by CLI
+STATS_PORT = 8009  # Will be overwritten by CLI
+BACKEND_HOST = "localhost"  # Will be overwritten by CLI
+BACKEND_PORTS = []  # Will be overwritten by CLI
 STATUS_PRINT_INTERVAL = 5
+HEALTH_CHECK_TIMEOUT = 2
 BUFFER_SIZE = 4096
 
 # --- Global Shared State ---
 available_servers = []
 connection_counts = defaultdict(int)
 state_lock = asyncio.Lock()
+start_time = None
+total_connections_served = 0
+current_active_connections = 0
+
+
+# --- Terminal Utilities ---
+def clear_terminal():
+    """Clear terminal screen with cross-platform support."""
+    if os.name == 'nt':  # Windows
+        os.system('cls')
+    else:  # Unix/Linux/MacOS
+        os.system('clear')
+
+
+def get_terminal_size():
+    """Get terminal dimensions."""
+    try:
+        columns, rows = os.get_terminal_size()
+        return columns, rows
+    except OSError:
+        return 80, 24  # Default fallback
+
+
+def format_uptime(start_time):
+    """Format uptime in a human-readable way."""
+    if not start_time:
+        return "Unknown"
+
+    uptime_seconds = time.time() - start_time
+    hours = int(uptime_seconds // 3600)
+    minutes = int((uptime_seconds % 3600) // 60)
+    seconds = int(uptime_seconds % 60)
+
+    if hours > 0:
+        return f"{hours}h {minutes}m {seconds}s"
+    elif minutes > 0:
+        return f"{minutes}m {seconds}s"
+    else:
+        return f"{seconds}s"
+
+
+def print_banner():
+    """Print a professional startup banner."""
+    columns, _ = get_terminal_size()
+    banner_width = min(columns - 4, 80)
+
+    print("=" * banner_width)
+    print(f"{'🚀 vLLM Load Balancer':^{banner_width}}")
+    print(f"{'High-Performance Async TCP/HTTP Load Balancer':^{banner_width}}")
+    print("=" * banner_width)
+    print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"Load Balancer Port: {LOAD_BALANCER_PORT}")
+    print(f"Backend Host: {BACKEND_HOST}")
+    print(f"Backend Ports: {', '.join(map(str, BACKEND_PORTS))}")
+    print(f"Health Check Interval: 10s (Timeout: {HEALTH_CHECK_TIMEOUT}s)")
+    print(f"Status Update Interval: {STATUS_PRINT_INTERVAL}s")
+    print("=" * banner_width)
+    print()
+
+
+# --- ANSI Color Codes ---
+class Colors:
+    RESET = '\033[0m'
+    BOLD = '\033[1m'
+    DIM = '\033[2m'
+
+    # Foreground colors
+    BLACK = '\033[30m'
+    RED = '\033[31m'
+    GREEN = '\033[32m'
+    YELLOW = '\033[33m'
+    BLUE = '\033[34m'
+    MAGENTA = '\033[35m'
+    CYAN = '\033[36m'
+    WHITE = '\033[37m'
+
+    # Bright colors
+    BRIGHT_BLACK = '\033[90m'
+    BRIGHT_RED = '\033[91m'
+    BRIGHT_GREEN = '\033[92m'
+    BRIGHT_YELLOW = '\033[93m'
+    BRIGHT_BLUE = '\033[94m'
+    BRIGHT_MAGENTA = '\033[95m'
+    BRIGHT_CYAN = '\033[96m'
+    BRIGHT_WHITE = '\033[97m'
+
+    # Background colors
+    BG_RED = '\033[41m'
+    BG_GREEN = '\033[42m'
+    BG_YELLOW = '\033[43m'
+    BG_BLUE = '\033[44m'
 
 
 # --- Helper Functions --- (relay_data and safe_close_writer remain the same)
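The rewrite replaces the hard-coded SCAN_PORT constants with CLI flags. A launch sketch showing the equivalent invocation (the import path mirrors this package; setting sys.argv by hand is only for illustration):

```python
# Shell equivalent: python vllm_load_balancer.py 8001 --ports 8140,8150 --stats-port 9001
import sys

from llm_utils.scripts.vllm_load_balancer import run_load_balancer

# Simulate the CLI arguments parse_args() expects, then start the (blocking) service:
sys.argv = ["vllm_load_balancer", "8001", "--ports", "8140,8150", "--stats-port", "9001"]
run_load_balancer()  # serves the TCP proxy on 8001 and the stats dashboard on 9001
```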
@@ -145,21 +224,17 @@ async def safe_close_writer(writer):
         logger.debug(f"Error closing writer in context manager: {e}")
 
 
-# --- Server Scanning and Health Check (Modified) ---
-
 
+# --- Health Check for Provided Ports ---
 async def check_server_health(session, host, port):
     """Performs an HTTP GET request to the /health endpoint."""
     url = f"http://{host}:{port}/health"
     try:
-        # Use the provided aiohttp session to make the GET request
         async with session.get(url, timeout=HEALTH_CHECK_TIMEOUT) as response:
-            # Check for a successful status code (2xx range)
             if 200 <= response.status < 300:
                 logger.debug(
                     f"[{LOAD_BALANCER_PORT=}] Health check success for {url} (Status: {response.status})"
                 )
-                # Ensure the connection is released back to the pool
                 await response.release()
                 return True
             else:
@@ -172,61 +247,51 @@ async def check_server_health(session, host, port):
         logger.debug(f"Health check HTTP request timeout for {url}")
         return False
     except aiohttp.ClientConnectorError as e:
-        # Handles connection refused, DNS errors etc. - server likely down
         logger.debug(f"Health check connection error for {url}: {e}")
         return False
     except aiohttp.ClientError as e:
-        # Catch other potential client errors (e.g., invalid URL structure, too many redirects)
         logger.warning(f"Health check client error for {url}: {e}")
         return False
     except Exception as e:
-        # Catch any other unexpected errors during the check
         logger.error(f"Unexpected health check error for {url}: {e}")
         return False
 
 
 async def scan_and_update_servers():
-    """Periodically
+    """Periodically checks the provided backend ports and updates available servers."""
     global available_servers
     logger.debug(
-        f"Starting server scan task (HTTP GET /health on
+        f"Starting server scan task (HTTP GET /health on ports {BACKEND_PORTS} every 10s)"
     )
     while True:
         try:
             current_scan_results = []
             scan_tasks = []
-            ports_to_scan =
+            ports_to_scan = BACKEND_PORTS
 
-            # Create ONE aiohttp session for all checks within this scan cycle for efficiency
             async with aiohttp.ClientSession() as session:
-                # Create health check tasks for all ports, passing the shared session
                 for port in ports_to_scan:
                     task = asyncio.create_task(
-                        check_server_health(session,
+                        check_server_health(session, BACKEND_HOST, port)
                     )
                     scan_tasks.append((task, port))
 
-                # Wait for all health checks to complete
-                # return_exceptions=True prevents gather from stopping if one check fails
                 await asyncio.gather(
                     *(task for task, port in scan_tasks), return_exceptions=True
                 )
 
-                # Collect results from completed tasks
                 for task, port in scan_tasks:
                     try:
-                        # Check if task finished, wasn't cancelled, and returned True
                         if (
                             task.done()
                             and not task.cancelled()
                             and task.result() is True
                         ):
-                            current_scan_results.append((
+                            current_scan_results.append((BACKEND_HOST, port))
                     except Exception as e:
                         logger.error(
                             f"Error retrieving health check result for port {port}: {e}"
                         )
-            # --- Update Shared State (Locked) ---
             async with state_lock:
                 previous_servers = set(available_servers)
                 current_set = set(current_scan_results)
@@ -263,9 +328,9 @@ async def scan_and_update_servers():
                 break
         except Exception as e:
             logger.error(f"Error in scan_and_update_servers loop: {e}")
-            await asyncio.sleep(
+            await asyncio.sleep(5)  # Avoid tight loop on error
 
-        await asyncio.sleep(
+        await asyncio.sleep(10)
 
 
 # --- Core Load Balancer Logic (handle_client remains the same) ---
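For reference, the health probe the scanner runs per port boils down to a GET /health with a short timeout, succeeding only on a 2xx response. A stand-alone sketch using the same aiohttp primitives (host and port values are examples):

```python
import asyncio

import aiohttp


async def probe(host: str, port: int, timeout_s: float = 2.0) -> bool:
    url = f"http://{host}:{port}/health"
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=timeout_s)) as resp:
                return 200 <= resp.status < 300  # any 2xx counts as healthy
    except Exception:
        return False  # connection refused, timeout, DNS failure, ...


print(asyncio.run(probe("localhost", 8140)))
```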
@@ -279,6 +344,7 @@ async def handle_client(client_reader, client_writer):
     backend_writer = None
     server_selected = False
 
+    global total_connections_served, current_active_connections
     try:
         # --- Select Backend Server (Least Connections from Available) ---
         selected_server = None
@@ -310,6 +376,12 @@
         connection_counts[selected_server] += 1
         backend_server = selected_server
         server_selected = True
+
+        # Update global statistics
+        global total_connections_served, current_active_connections
+        total_connections_served += 1
+        current_active_connections += 1
+
         logger.info(
             f"Routing {client_addr} to {backend_server} (Current connections: {connection_counts[backend_server]})"
         )
@@ -408,6 +480,7 @@
             if backend_server in connection_counts:
                 if connection_counts[backend_server] > 0:
                     connection_counts[backend_server] -= 1
+                    current_active_connections = max(0, current_active_connections - 1)
                 logger.info(
                     f"Connection closed for {client_addr}. Backend {backend_server} connections: {connection_counts[backend_server]}"
                 )
@@ -418,92 +491,392 @@
                 connection_counts[backend_server] = 0
 
 
-
+
+# --- Status Reporting Task ---
 async def print_status_periodically():
-    """Periodically
+    """Periodically displays a professional real-time status dashboard."""
     while True:
         await asyncio.sleep(STATUS_PRINT_INTERVAL)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        await display_status_dashboard()
+
+
+async def display_status_dashboard():
+    """Display a professional real-time status dashboard."""
+    global current_active_connections, total_connections_served
+
+    async with state_lock:
+        current_available = set(available_servers)
+        current_counts = connection_counts.copy()
+
+    # Clear terminal for fresh display
+    clear_terminal()
+
+    # Get terminal dimensions for responsive layout
+    columns, rows = get_terminal_size()
+    dash_width = min(columns - 4, 100)
+
+    # Header with title and timestamp
+    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(f"{Colors.BOLD}{Colors.BRIGHT_CYAN}{'=' * dash_width}{Colors.RESET}")
+    print(f"{Colors.BOLD}{Colors.BRIGHT_CYAN}{'🚀 vLLM Load Balancer Dashboard':^{dash_width}}{Colors.RESET}")
+    print(f"{Colors.BRIGHT_CYAN}{'Real-time Status & Monitoring':^{dash_width}}{Colors.RESET}")
+    print(f"{Colors.BOLD}{Colors.BRIGHT_CYAN}{'=' * dash_width}{Colors.RESET}")
+    print()
+
+    # System Information Section
+    uptime = format_uptime(start_time)
+    print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}📊 System Information{Colors.RESET}")
+    print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 2)}{Colors.RESET}")
+    print(f"{Colors.YELLOW}🕐 Current Time:{Colors.RESET} {current_time}")
+    print(f"{Colors.YELLOW}⏱️ Uptime:{Colors.RESET} {uptime}")
+    print(f"{Colors.YELLOW}🌐 Load Balancer:{Colors.RESET} {LOAD_BALANCER_HOST}:{LOAD_BALANCER_PORT}")
+    print(f"{Colors.YELLOW}🎯 Backend Host:{Colors.RESET} {BACKEND_HOST}")
+    print(f"{Colors.YELLOW}🔧 Configured Ports:{Colors.RESET} {', '.join(map(str, BACKEND_PORTS))}")
+    print()
+
+    # Connection Statistics Section
+    print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}📈 Connection Statistics{Colors.RESET}")
+    print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 2)}{Colors.RESET}")
+    print(f"{Colors.GREEN}📊 Total Connections Served:{Colors.RESET} {total_connections_served:,}")
+    print(f"{Colors.GREEN}🔗 Currently Active:{Colors.RESET} {current_active_connections}")
+    print(f"{Colors.GREEN}⚡ Health Check Timeout:{Colors.RESET} {HEALTH_CHECK_TIMEOUT}s")
+    print(f"{Colors.GREEN}🔄 Status Update Interval:{Colors.RESET} {STATUS_PRINT_INTERVAL}s")
+    print()
+
+    # Backend Servers Status
+    print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}Backend Servers Status{Colors.RESET}")
+    print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 2)}{Colors.RESET}")
+
+    headers = [
+        f"{Colors.BOLD}Server{Colors.RESET}",
+        f"{Colors.BOLD}Host{Colors.RESET}",
+        f"{Colors.BOLD}Port{Colors.RESET}",
+        f"{Colors.BOLD}Active Conn.{Colors.RESET}",
+        f"{Colors.BOLD}Status{Colors.RESET}"
+    ]
+
+    table_data = []
+    total_backend_connections = 0
+
+    for port in BACKEND_PORTS:
+        server = (BACKEND_HOST, port)
+        is_online = server in current_available
+        count = current_counts.get(server, 0) if is_online else 0
+        total_backend_connections += count
+
+        # Color-code connection count based on load
+        if count == 0:
+            conn_display = f"{Colors.DIM}0{Colors.RESET}"
+        elif count < 5:
+            conn_display = f"{Colors.GREEN}{count}{Colors.RESET}"
+        elif count < 10:
+            conn_display = f"{Colors.YELLOW}{count}{Colors.RESET}"
+        else:
+            conn_display = f"{Colors.RED}{count}{Colors.RESET}"
+
+        status_display = (
+            f"{Colors.BG_GREEN}{Colors.BLACK} ONLINE {Colors.RESET}"
+            if is_online
+            else f"{Colors.BG_RED}{Colors.WHITE} OFFLINE {Colors.RESET}"
+        )
+
+        table_data.append([
+            f"{Colors.CYAN}{BACKEND_HOST}:{port}{Colors.RESET}",
+            BACKEND_HOST,
+            str(port),
+            conn_display,
+            status_display
+        ])
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-            print("------------------------------\n")
-        except Exception as e:
-            logger.error(f"Error printing status table: {e}")
-
-
-# --- Main Execution (main remains the same) ---
-async def main():
-    scan_task = asyncio.create_task(scan_and_update_servers())
-    status_task = asyncio.create_task(print_status_periodically())
+    try:
+        table = tabulate(table_data, headers=headers, tablefmt="fancy_grid")
+        print(table)
+        print()
+
+        # Summary metrics
+        online_count = sum(1 for port in BACKEND_PORTS if (BACKEND_HOST, port) in current_available)
+        avg_connections = total_backend_connections / online_count if online_count else 0
+        print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}📋 Summary{Colors.RESET}")
+        print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 4)}{Colors.RESET}")
+        print(f"{Colors.MAGENTA}🟢 Available Servers:{Colors.RESET} {online_count} / {len(BACKEND_PORTS)}")
+        print(f"{Colors.MAGENTA}📊 Total Backend Connections:{Colors.RESET} {total_backend_connections}")
+        print(f"{Colors.MAGENTA}📈 Average Load per Online Server:{Colors.RESET} {avg_connections:.1f}")
 
-
-
+    except Exception as e:
+        logger.error(f"Error displaying status table: {e}")
+        print(f"{Colors.RED}Error displaying server table: {e}{Colors.RESET}")
+
+    # Footer with refresh info
+    print()
+    print(f"{Colors.BRIGHT_BLACK}{'─' * dash_width}{Colors.RESET}")
+    print(f"{Colors.DIM}🔄 Auto-refresh every {STATUS_PRINT_INTERVAL}s | Press Ctrl+C to stop{Colors.RESET}")
+    print(f"{Colors.BRIGHT_BLACK}{'─' * dash_width}{Colors.RESET}")
+    print()
+
+
+
+# --- HTTP Stats Server ---
+from aiohttp import web
+
+async def stats_json(request):
+    async with state_lock:
+        # Build a list of all configured servers, with status and connections
+        all_servers = []
+        available_set = set(available_servers)
+        for port in BACKEND_PORTS:
+            server = (BACKEND_HOST, port)
+            is_online = server in available_set
+            all_servers.append({
+                "host": BACKEND_HOST,
+                "port": port,
+                "active_connections": connection_counts.get(server, 0) if is_online else 0,
+                "status": "ONLINE" if is_online else "OFFLINE",
+            })
+        stats = {
+            "time": datetime.now().isoformat(),
+            "uptime": format_uptime(start_time),
+            "load_balancer_host": LOAD_BALANCER_HOST,
+            "load_balancer_port": LOAD_BALANCER_PORT,
+            "backend_host": BACKEND_HOST,
+            "backend_ports": BACKEND_PORTS,
+            "total_connections_served": total_connections_served,
+            "current_active_connections": current_active_connections,
+            "health_check_timeout": HEALTH_CHECK_TIMEOUT,
+            "status_update_interval": STATUS_PRINT_INTERVAL,
+            "servers": all_servers,
+        }
+    return web.json_response(stats)
+
+async def stats_page(request):
+    # High-quality HTML dashboard with auto-refresh and charts
+    return web.Response(
+        content_type="text/html",
+        text="""
+<!DOCTYPE html>
+<html lang='en'>
+<head>
+<meta charset='UTF-8'>
+<meta name='viewport' content='width=device-width, initial-scale=1.0'>
+<title>vLLM Load Balancer Stats</title>
+<link rel='preconnect' href='https://fonts.googleapis.com'>
+<link rel='preconnect' href='https://fonts.gstatic.com' crossorigin>
+<link href='https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap' rel='stylesheet'>
+<script src='https://cdn.jsdelivr.net/npm/chart.js'></script>
+<style>
+body { font-family: 'Roboto', sans-serif; background: #181c20; color: #f3f3f3; margin: 0; }
+.container { max-width: 900px; margin: 32px auto; background: #23272b; border-radius: 12px; box-shadow: 0 2px 16px #0008; padding: 32px; }
+h1 { text-align: center; font-size: 2.2em; margin-bottom: 0.2em; }
+.subtitle { text-align: center; color: #7fd7ff; margin-bottom: 1.5em; }
+.stats-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 24px; margin-bottom: 2em; }
+.stat-box { background: #20232a; border-radius: 8px; padding: 18px 24px; text-align: center; }
+.stat-label { color: #7fd7ff; font-size: 1.1em; margin-bottom: 0.2em; }
+.stat-value { font-size: 2em; font-weight: bold; }
+.server-table { width: 100%; border-collapse: collapse; margin-top: 1.5em; }
+.server-table th, .server-table td { padding: 10px 8px; text-align: center; }
+.server-table th { background: #2c313a; color: #7fd7ff; }
+.server-table tr:nth-child(even) { background: #23272b; }
+.server-table tr:nth-child(odd) { background: #1b1e22; }
+.status-online { color: #00e676; font-weight: bold; }
+.status-offline { color: #ff5252; font-weight: bold; }
+.chart-container { background: #20232a; border-radius: 8px; padding: 18px 24px; margin-top: 2em; }
+@media (max-width: 700px) { .stats-grid { grid-template-columns: 1fr; } }
+</style>
+</head>
+<body>
+<div class='container'>
+<h1>🚀 vLLM Load Balancer</h1>
+<div class='subtitle'>Live Stats Dashboard</div>
+<div class='stats-grid' id='statsGrid'>
+<!-- Stats will be injected here -->
+</div>
+<div class='chart-container'>
+<canvas id='connChart' height='80'></canvas>
+</div>
+<table class='server-table' id='serverTable'>
+<thead>
+<tr>
+<th>Backend Server</th>
+<th>Host</th>
+<th>Port</th>
+<th>Active Connections</th>
+<th>Status</th>
+</tr>
+</thead>
+<tbody></tbody>
+</table>
+<div style='text-align:center; margin-top:2em; color:#888;'>
+<span id='lastUpdate'></span> | Auto-refreshing every 1s
+</div>
+</div>
+<script>
+let connChart;
+let connHistory = [];
+let timeHistory = [];
+async function fetchStats() {
+  const res = await fetch('/stats.json');
+  return await res.json();
+}
+function updateStats(stats) {
+  document.getElementById('lastUpdate').textContent = 'Last update: ' + new Date(stats.time).toLocaleTimeString();
+  // Stats grid
+  document.getElementById('statsGrid').innerHTML = `
+    <div class='stat-box'><div class='stat-label'>Uptime</div><div class='stat-value'>${stats.uptime}</div></div>
+    <div class='stat-box'><div class='stat-label'>Total Connections</div><div class='stat-value'>${stats.total_connections_served}</div></div>
+    <div class='stat-box'><div class='stat-label'>Active Connections</div><div class='stat-value'>${stats.current_active_connections}</div></div>
+    <div class='stat-box'><div class='stat-label'>Configured Servers</div><div class='stat-value'>${stats.servers.length}</div></div>
+  `;
+  // Server table
+  let tbody = document.querySelector('#serverTable tbody');
+  tbody.innerHTML = '';
+  for (const s of stats.servers) {
+    tbody.innerHTML += `<tr>
+      <td>${s.host}:${s.port}</td>
+      <td>${s.host}</td>
+      <td>${s.port}</td>
+      <td>${s.active_connections}</td>
+      <td class='${s.status === "ONLINE" ? "status-online" : "status-offline"}'>${s.status}</td>
+    </tr>`;
+  }
+  // Chart (only count online servers for active connections)
+  connHistory.push(stats.current_active_connections);
+  timeHistory.push(new Date(stats.time).toLocaleTimeString());
+  if (connHistory.length > 60) { connHistory.shift(); timeHistory.shift(); }
+  if (!connChart) {
+    connChart = new Chart(document.getElementById('connChart').getContext('2d'), {
+      type: 'line',
+      data: {
+        labels: timeHistory,
+        datasets: [{
+          label: 'Active Connections',
+          data: connHistory,
+          borderColor: '#7fd7ff',
+          backgroundColor: 'rgba(127,215,255,0.1)',
+          tension: 0.3,
+          fill: true,
+          pointRadius: 0
+        }]
+      },
+      options: {
+        plugins: { legend: { display: false } },
+        scales: {
+          x: { display: false },
+          y: { beginAtZero: true, grid: { color: '#333' }, ticks: { color: '#7fd7ff' } }
+        },
+        animation: false,
+        responsive: true,
+        maintainAspectRatio: false
+      }
+    });
+  } else {
+    connChart.data.labels = timeHistory;
+    connChart.data.datasets[0].data = connHistory;
+    connChart.update();
+  }
+}
+async function refresh() {
+  try {
+    const stats = await fetchStats();
+    updateStats(stats);
+  } catch (e) {
+    document.getElementById('lastUpdate').textContent = 'Error fetching stats';
+  }
+  setTimeout(refresh, 1000);
+}
+refresh();
+</script>
+</body>
+</html>
+"""
     )
 
-
-
-
-
-
+async def start_stats_server(loop):
+    app = web.Application()
+    app.router.add_get('/stats', stats_page)
+    app.router.add_get('/stats.json', stats_json)
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, LOAD_BALANCER_HOST, STATS_PORT)
+    await site.start()
+    logger.info(f"Stats HTTP server running at http://{LOAD_BALANCER_HOST}:{STATS_PORT}/stats")
 
-
-
-
-
-
-
-
-
-
+async def main():
+    global start_time
+    start_time = time.time()
+    clear_terminal()
+    print_banner()
+
+    # Start background tasks
+    scan_task = asyncio.create_task(scan_and_update_servers())
+    status_task = asyncio.create_task(print_status_periodically())
+
+    # Start HTTP stats server (on STATS_PORT)
+    loop = asyncio.get_running_loop()
+    await start_stats_server(loop)
+
+    # Start TCP server (on LOAD_BALANCER_PORT)
+    server = await asyncio.start_server(
+        handle_client, LOAD_BALANCER_HOST, LOAD_BALANCER_PORT
+    )
+
+    addrs = ", ".join(str(sock.getsockname()) for sock in server.sockets)
+    logger.info(f"Load balancer serving on {addrs}")
+    logger.info(
+        f"Configured backend ports: {BACKEND_PORTS} on host {BACKEND_HOST}"
+    )
+    print(f"{Colors.BRIGHT_GREEN}✅ Load balancer started successfully!{Colors.RESET}")
+    print(f"{Colors.BRIGHT_GREEN}🌐 Proxy listening on: {addrs}{Colors.RESET}")
+    print(f"{Colors.BRIGHT_GREEN}📊 Stats dashboard: http://localhost:{STATS_PORT}/stats{Colors.RESET}")
+    print(f"{Colors.YELLOW}🔍 Scanning backend servers...{Colors.RESET}")
+    print()
+    await asyncio.sleep(2)
+
+    async with server:
         try:
-            await
+            await server.serve_forever()
         except asyncio.CancelledError:
-
-
-
+            print(f"\n{Colors.YELLOW}🛑 Shutdown signal received...{Colors.RESET}")
+            logger.info("Load balancer server shutting down.")
+        except KeyboardInterrupt:
+            print(f"\n{Colors.YELLOW}🛑 Shutdown requested by user...{Colors.RESET}")
+            logger.info("Shutdown requested by user.")
+        finally:
+            print(f"{Colors.CYAN}🔄 Cleaning up background tasks...{Colors.RESET}")
+            logger.info("Cancelling background tasks...")
+            scan_task.cancel()
+            status_task.cancel()
+            try:
+                await asyncio.gather(scan_task, status_task, return_exceptions=True)
+            except asyncio.CancelledError:
+                pass
+            print(f"{Colors.BRIGHT_GREEN}✅ Shutdown complete. Goodbye!{Colors.RESET}")
+            logger.info("Background tasks finished.")
 
 def run_load_balancer():
-
+    global LOAD_BALANCER_PORT, BACKEND_PORTS, BACKEND_HOST, STATUS_PRINT_INTERVAL, HEALTH_CHECK_TIMEOUT, STATS_PORT
+    args = parse_args()
+    LOAD_BALANCER_PORT = args.port
+    BACKEND_HOST = args.host
+    BACKEND_PORTS = [int(p.strip()) for p in args.ports.split(",") if p.strip()]
+    STATUS_PRINT_INTERVAL = args.status_interval
+    HEALTH_CHECK_TIMEOUT = args.health_timeout
+    if args.stats_port is not None:
+        STATS_PORT = args.stats_port
+    else:
+        STATS_PORT = LOAD_BALANCER_PORT + 1
+    if not BACKEND_PORTS:
+        print(f"{Colors.BG_RED}{Colors.WHITE} ❌ ERROR {Colors.RESET}")
+        print(f"{Colors.RED}No backend ports specified. Use --ports 8140,8150 ...{Colors.RESET}")
+        logger.critical("No backend ports specified. Use --ports 8140,8150 ...")
+        sys.exit(1)
     try:
         asyncio.run(main())
     except KeyboardInterrupt:
-
+        # This is handled in the main() function now
+        pass
     except Exception as e:
+        print(f"\n{Colors.BG_RED}{Colors.WHITE} ❌ CRITICAL ERROR {Colors.RESET}")
+        print(f"{Colors.RED}Critical error in main execution: {e}{Colors.RESET}")
         logger.critical(f"Critical error in main execution: {e}")
 
-
 if __name__ == "__main__":
     run_load_balancer()
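A small consumer sketch for the new /stats.json endpoint added above, polling once and printing a one-line summary per backend (port 8002 is an example; by default the stats port is the proxy port + 1):

```python
import asyncio

import aiohttp


async def show_stats(stats_port: int = 8002) -> None:
    async with aiohttp.ClientSession() as session:
        async with session.get(f"http://localhost:{stats_port}/stats.json") as resp:
            stats = await resp.json()
    print(f"uptime={stats['uptime']} active={stats['current_active_connections']}")
    for s in stats["servers"]:
        print(f"  {s['host']}:{s['port']} {s['status']} conns={s['active_connections']}")


asyncio.run(show_stats())
```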
{speedy_utils-1.1.0.dist-info → speedy_utils-1.1.3.dist-info}/RECORD
CHANGED
@@ -5,13 +5,13 @@ llm_utils/chat_format/transform.py,sha256=8TZhvUS5DrjUeMNtDIuWY54B_QZ7jjpXEL9c8F
 llm_utils/chat_format/utils.py,sha256=xTxN4HrLHcRO2PfCTR43nH1M5zCa7v0kTTdzAcGkZg0,1229
 llm_utils/group_messages.py,sha256=8CU9nKOja3xeuhdrX5CvYVveSqSKb2zQ0eeNzA88aTQ,3621
 llm_utils/lm/__init__.py,sha256=rX36_MsnekM5GHwWS56XELbm4W5x2TDwnPERDTfo0eU,194
-llm_utils/lm/async_lm.py,sha256=
+llm_utils/lm/async_lm.py,sha256=eTyI9x4iZc4ZhYdwNadTYap5HgBJygiV_EBDZ-Og1cQ,34357
 llm_utils/lm/chat_html.py,sha256=FkGo0Dv_nAHYBMZzXfMu_bGQKaCx302goh3XaT-_ETc,8674
 llm_utils/lm/lm_json.py,sha256=fMt42phzFV2f6ulrtWcDXsWHi8WcG7gGkCzpIq8VSSM,1975
 llm_utils/lm/sync_lm.py,sha256=ANw_m5KiWcRwwoeQ5no6dzPFLc6j9o2oEcJtkMKqrn8,34640
 llm_utils/lm/utils.py,sha256=gUejbVZPYg97g4ftYEptYN52WhH3TAKOFW81sjLvi08,4585
 llm_utils/scripts/README.md,sha256=yuOLnLa2od2jp4wVy3rV0rESeiV3o8zol5MNMsZx0DY,999
-llm_utils/scripts/vllm_load_balancer.py,sha256=
+llm_utils/scripts/vllm_load_balancer.py,sha256=zz5aTaYwy5tYrv2RIhrizrGP-PnPAohgrl9kQvvJywA,35091
 llm_utils/scripts/vllm_serve.py,sha256=4NaqpVs7LBvxtvTCMPsNCAOfqiWkKRttxWMmWY7SitA,14729
 speedy_utils/__init__.py,sha256=YCpiReW22zG4KkQXQe6V9BQ8bn7PtiXolOaW_iL8T4M,5734
 speedy_utils/all.py,sha256=t-HKzDmhF1MTFnmq7xRnPs5nFG_aZaLH9Ua0RM6nQ9Y,4855
@@ -31,7 +31,7 @@ speedy_utils/multi_worker/thread.py,sha256=u_hTwXh7_FciMa5EukdEA1fDCY_vUC4moDceB
 speedy_utils/scripts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 speedy_utils/scripts/mpython.py,sha256=73PHm1jqbCt2APN4xuNjD0VDKwzOj4EZsViEMQiZU2g,3853
 speedy_utils/scripts/openapi_client_codegen.py,sha256=f2125S_q0PILgH5dyzoKRz7pIvNEjCkzpi4Q4pPFRZE,9683
-speedy_utils-1.1.
-speedy_utils-1.1.
-speedy_utils-1.1.
-speedy_utils-1.1.
+speedy_utils-1.1.3.dist-info/METADATA,sha256=zTlCW23Gcdxio26wZ9L5FeWUsrvg5NOkGA0TPFQWRI8,7441
+speedy_utils-1.1.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+speedy_utils-1.1.3.dist-info/entry_points.txt,sha256=T1t85jwx8fK6m5msdkBGIXH5R5Kd0zSL0S6erXERPzg,237
+speedy_utils-1.1.3.dist-info/RECORD,,

{speedy_utils-1.1.0.dist-info → speedy_utils-1.1.3.dist-info}/WHEEL
File without changes

{speedy_utils-1.1.0.dist-info → speedy_utils-1.1.3.dist-info}/entry_points.txt
File without changes