PyPI - speedy-utils - Versions diffs - 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl - Mend

speedy-utils 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

llm_utils/chat_format/display.py +17 -4
llm_utils/lm/async_lm/__init__.py +2 -0
llm_utils/lm/async_lm/_utils.py +198 -0
llm_utils/lm/async_lm/async_llm_task.py +154 -0
llm_utils/lm/{async_lm.py → async_lm/async_lm.py} +191 -354
llm_utils/scripts/vllm_load_balancer.py +220 -135
{speedy_utils-1.1.5.dist-info → speedy_utils-1.1.6.dist-info}/METADATA +1 -1
{speedy_utils-1.1.5.dist-info → speedy_utils-1.1.6.dist-info}/RECORD +10 -7
{speedy_utils-1.1.5.dist-info → speedy_utils-1.1.6.dist-info}/WHEEL +0 -0
{speedy_utils-1.1.5.dist-info → speedy_utils-1.1.6.dist-info}/entry_points.txt +0 -0

llm_utils/scripts/vllm_load_balancer.py CHANGED Viewed

@@ -17,6 +17,7 @@ from speedy_utils import setup_logger
 setup_logger(min_interval=5)
 # --- CLI Argument Parsing ---
 def parse_args():
     parser = argparse.ArgumentParser(
@@ -27,14 +28,16 @@ Examples:
   python vllm_load_balancer.py 8001 --ports 8140,8150,8160
   python vllm_load_balancer.py 8080 --ports 8140,8150 --host 192.168.1.100
   python vllm_load_balancer.py 8001 --ports 8140,8150 --status-interval 3
+  python vllm_load_balancer.py 8001 --ports 8140,8150 --throttle-ms 10
 Features:
   • Real-time dashboard with color-coded status
   • Automatic health checks and failover
   • Least-connections load balancing
+  • Request throttling to prevent server overload
   • Professional terminal interface
   • Connection statistics and monitoring
-        """
+        """,
     )
     parser.add_argument(
         "port",
@@ -71,8 +74,15 @@ Features:
         default=None,
         help="Port for the HTTP stats dashboard (default: proxy port + 1)",
     )
+    parser.add_argument(
+        "--throttle-ms",
+        type=float,
+        default=30.0,
+        help="Minimum milliseconds between requests to same server (default: 5ms)",
+    )
     return parser.parse_args()
 # --- Configuration (populated from CLI) ---
 LOAD_BALANCER_HOST = "0.0.0.0"
 LOAD_BALANCER_PORT = 8008  # Will be overwritten by CLI
@@ -81,11 +91,13 @@ BACKEND_HOST = "localhost"  # Will be overwritten by CLI
 BACKEND_PORTS = []  # Will be overwritten by CLI
 STATUS_PRINT_INTERVAL = 5
 HEALTH_CHECK_TIMEOUT = 2
+THROTTLE_MS = 5.0  # Will be overwritten by CLI
 BUFFER_SIZE = 4096
 # --- Global Shared State ---
 available_servers = []
 connection_counts = defaultdict(int)
+last_send_times = defaultdict(float)  # Track last send time per server
 state_lock = asyncio.Lock()
 start_time = None
 total_connections_served = 0
@@ -95,10 +107,10 @@ current_active_connections = 0
 # --- Terminal Utilities ---
 def clear_terminal():
     """Clear terminal screen with cross-platform support."""
-    if os.name == 'nt':  # Windows
-        os.system('cls')
+    if os.name == "nt":  # Windows
+        os.system("cls")
     else:  # Unix/Linux/MacOS
-        os.system('clear')
+        os.system("clear")
 def get_terminal_size():
@@ -114,12 +126,12 @@ def format_uptime(start_time):
     """Format uptime in a human-readable way."""
     if not start_time:
         return "Unknown"
     uptime_seconds = time.time() - start_time
     hours = int(uptime_seconds // 3600)
     minutes = int((uptime_seconds % 3600) // 60)
     seconds = int(uptime_seconds % 60)
     if hours > 0:
         return f"{hours}h {minutes}m {seconds}s"
     elif minutes > 0:
@@ -132,7 +144,7 @@ def print_banner():
     """Print a professional startup banner."""
     columns, _ = get_terminal_size()
     banner_width = min(columns - 4, 80)
     print("=" * banner_width)
     print(f"{'🚀 vLLM Load Balancer':^{banner_width}}")
     print(f"{'High-Performance Async TCP/HTTP Load Balancer':^{banner_width}}")
@@ -143,41 +155,42 @@ def print_banner():
     print(f"Backend Ports: {', '.join(map(str, BACKEND_PORTS))}")
     print(f"Health Check Interval: 10s (Timeout: {HEALTH_CHECK_TIMEOUT}s)")
     print(f"Status Update Interval: {STATUS_PRINT_INTERVAL}s")
+    print(f"Request Throttling: {THROTTLE_MS}ms minimum between requests")
     print("=" * banner_width)
     print()
 # --- ANSI Color Codes ---
 class Colors:
-    RESET = '\033[0m'
-    BOLD = '\033[1m'
-    DIM = '\033[2m'
+    RESET = "\033[0m"
+    BOLD = "\033[1m"
+    DIM = "\033[2m"
     # Foreground colors
-    BLACK = '\033[30m'
-    RED = '\033[31m'
-    GREEN = '\033[32m'
-    YELLOW = '\033[33m'
-    BLUE = '\033[34m'
-    MAGENTA = '\033[35m'
-    CYAN = '\033[36m'
-    WHITE = '\033[37m'
+    BLACK = "\033[30m"
+    RED = "\033[31m"
+    GREEN = "\033[32m"
+    YELLOW = "\033[33m"
+    BLUE = "\033[34m"
+    MAGENTA = "\033[35m"
+    CYAN = "\033[36m"
+    WHITE = "\033[37m"
     # Bright colors
-    BRIGHT_BLACK = '\033[90m'
-    BRIGHT_RED = '\033[91m'
-    BRIGHT_GREEN = '\033[92m'
-    BRIGHT_YELLOW = '\033[93m'
-    BRIGHT_BLUE = '\033[94m'
-    BRIGHT_MAGENTA = '\033[95m'
-    BRIGHT_CYAN = '\033[96m'
-    BRIGHT_WHITE = '\033[97m'
+    BRIGHT_BLACK = "\033[90m"
+    BRIGHT_RED = "\033[91m"
+    BRIGHT_GREEN = "\033[92m"
+    BRIGHT_YELLOW = "\033[93m"
+    BRIGHT_BLUE = "\033[94m"
+    BRIGHT_MAGENTA = "\033[95m"
+    BRIGHT_CYAN = "\033[96m"
+    BRIGHT_WHITE = "\033[97m"
     # Background colors
-    BG_RED = '\033[41m'
-    BG_GREEN = '\033[42m'
-    BG_YELLOW = '\033[43m'
-    BG_BLUE = '\033[44m'
+    BG_RED = "\033[41m"
+    BG_GREEN = "\033[42m"
+    BG_YELLOW = "\033[43m"
+    BG_BLUE = "\033[44m"
 # --- Helper Functions --- (relay_data and safe_close_writer remain the same)
@@ -224,7 +237,6 @@ async def safe_close_writer(writer):
                 logger.debug(f"Error closing writer in context manager: {e}")
 # --- Health Check for Provided Ports ---
 async def check_server_health(session, host, port):
     """Performs an HTTP GET request to the /health endpoint."""
@@ -313,6 +325,11 @@ async def scan_and_update_servers():
                             logger.debug(
                                 f"Removed connection count entry for unavailable server {server}"
                             )
+                        if server in last_send_times:
+                            del last_send_times[server]
+                            logger.debug(
+                                f"Removed throttling timestamp for unavailable server {server}"
+                            )
                 available_servers = sorted(list(current_set))
                 for server in available_servers:
@@ -337,7 +354,6 @@ async def scan_and_update_servers():
 async def handle_client(client_reader, client_writer):
     """Handles a single client connection."""
     client_addr = client_writer.get_extra_info("peername")
-    logger.info(f"Accepted connection from {client_addr}")
     backend_server = None
     backend_reader = None
@@ -376,15 +392,11 @@ async def handle_client(client_reader, client_writer):
                 connection_counts[selected_server] += 1
                 backend_server = selected_server
                 server_selected = True
                 # Update global statistics
                 global total_connections_served, current_active_connections
                 total_connections_served += 1
                 current_active_connections += 1
-                logger.info(
-                    f"Routing {client_addr} to {backend_server} (Current connections: {connection_counts[backend_server]})"
-                )
             else:
                 logger.error(
                     f"Logic error: No server chosen despite available servers list not being empty for {client_addr}."
@@ -402,6 +414,29 @@ async def handle_client(client_reader, client_writer):
                 pass
             server_selected = False
             return
+        # --- Throttling Logic ---
+        # Check if we need to throttle requests to avoid overwhelming the backend
+        current_time = time.time() * 1000  # Convert to milliseconds
+        sleep_time = 0
+        async with state_lock:
+            last_send_time = last_send_times.get(backend_server, 0)
+            time_since_last_send = current_time - last_send_time
+            if time_since_last_send < THROTTLE_MS:
+                sleep_time = (THROTTLE_MS - time_since_last_send) / 1000  # Convert to seconds
+                logger.debug(
+                    f"Throttling request to {backend_server} for {sleep_time:.3f}s (last send: {time_since_last_send:.1f}ms ago)"
+                )
+        # Sleep outside the lock to avoid blocking other clients
+        if sleep_time > 0:
+            await asyncio.sleep(sleep_time)
+        # Update last send time after throttling
+        async with state_lock:
+            last_send_times[backend_server] = time.time() * 1000
         try:
             logger.debug(
                 f"Attempting connection to backend {backend_server} for {client_addr}"
@@ -473,16 +508,14 @@ async def handle_client(client_reader, client_writer):
     except Exception as e:
         logger.error(f"Error handling client {client_addr}: {e}")
     finally:
-        logger.info(f"Closing connection for {client_addr}")
         # Decrement connection count only if we successfully selected/incremented
         if backend_server and server_selected:
             async with state_lock:
                 if backend_server in connection_counts:
                     if connection_counts[backend_server] > 0:
                         connection_counts[backend_server] -= 1
-                        current_active_connections = max(0, current_active_connections - 1)
-                        logger.info(
-                            f"Connection closed for {client_addr}. Backend {backend_server} connections: {connection_counts[backend_server]}"
+                        current_active_connections = max(
+                            0, current_active_connections - 1
                         )
                     else:
                         logger.warning(
@@ -491,7 +524,6 @@ async def handle_client(client_reader, client_writer):
                         connection_counts[backend_server] = 0
 # --- Status Reporting Task ---
 async def print_status_periodically():
     """Periodically displays a professional real-time status dashboard."""
@@ -514,35 +546,52 @@ async def display_status_dashboard():
     # Get terminal dimensions for responsive layout
     columns, rows = get_terminal_size()
     dash_width = min(columns - 4, 100)
     # Header with title and timestamp
     current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     print(f"{Colors.BOLD}{Colors.BRIGHT_CYAN}{'=' * dash_width}{Colors.RESET}")
-    print(f"{Colors.BOLD}{Colors.BRIGHT_CYAN}{'🚀 vLLM Load Balancer Dashboard':^{dash_width}}{Colors.RESET}")
-    print(f"{Colors.BRIGHT_CYAN}{'Real-time Status & Monitoring':^{dash_width}}{Colors.RESET}")
+    print(
+        f"{Colors.BOLD}{Colors.BRIGHT_CYAN}{'🚀 vLLM Load Balancer Dashboard':^{dash_width}}{Colors.RESET}"
+    )
+    print(
+        f"{Colors.BRIGHT_CYAN}{'Real-time Status & Monitoring':^{dash_width}}{Colors.RESET}"
+    )
     print(f"{Colors.BOLD}{Colors.BRIGHT_CYAN}{'=' * dash_width}{Colors.RESET}")
     print()
     # System Information Section
     uptime = format_uptime(start_time)
     print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}📊 System Information{Colors.RESET}")
     print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 2)}{Colors.RESET}")
     print(f"{Colors.YELLOW}🕐 Current Time:{Colors.RESET} {current_time}")
     print(f"{Colors.YELLOW}⏱️  Uptime:{Colors.RESET} {uptime}")
-    print(f"{Colors.YELLOW}🌐 Load Balancer:{Colors.RESET} {LOAD_BALANCER_HOST}:{LOAD_BALANCER_PORT}")
+    print(
+        f"{Colors.YELLOW}🌐 Load Balancer:{Colors.RESET} {LOAD_BALANCER_HOST}:{LOAD_BALANCER_PORT}"
+    )
     print(f"{Colors.YELLOW}🎯 Backend Host:{Colors.RESET} {BACKEND_HOST}")
-    print(f"{Colors.YELLOW}🔧 Configured Ports:{Colors.RESET} {', '.join(map(str, BACKEND_PORTS))}")
+    print(
+        f"{Colors.YELLOW}🔧 Configured Ports:{Colors.RESET} {', '.join(map(str, BACKEND_PORTS))}"
+    )
+    print(f"{Colors.YELLOW}⚡ Request Throttling:{Colors.RESET} {THROTTLE_MS}ms minimum")
     print()
     # Connection Statistics Section
     print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}📈 Connection Statistics{Colors.RESET}")
     print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 2)}{Colors.RESET}")
-    print(f"{Colors.GREEN}📊 Total Connections Served:{Colors.RESET} {total_connections_served:,}")
-    print(f"{Colors.GREEN}🔗 Currently Active:{Colors.RESET} {current_active_connections}")
-    print(f"{Colors.GREEN}⚡ Health Check Timeout:{Colors.RESET} {HEALTH_CHECK_TIMEOUT}s")
-    print(f"{Colors.GREEN}🔄 Status Update Interval:{Colors.RESET} {STATUS_PRINT_INTERVAL}s")
+    print(
+        f"{Colors.GREEN}📊 Total Connections Served:{Colors.RESET} {total_connections_served:,}"
+    )
+    print(
+        f"{Colors.GREEN}🔗 Currently Active:{Colors.RESET} {current_active_connections}"
+    )
+    print(
+        f"{Colors.GREEN}⚡ Health Check Timeout:{Colors.RESET} {HEALTH_CHECK_TIMEOUT}s"
+    )
+    print(
+        f"{Colors.GREEN}🔄 Status Update Interval:{Colors.RESET} {STATUS_PRINT_INTERVAL}s"
+    )
     print()
     # Backend Servers Status
     print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}Backend Servers Status{Colors.RESET}")
     print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 2)}{Colors.RESET}")
@@ -552,7 +601,7 @@ async def display_status_dashboard():
         f"{Colors.BOLD}Host{Colors.RESET}",
         f"{Colors.BOLD}Port{Colors.RESET}",
         f"{Colors.BOLD}Active Conn.{Colors.RESET}",
-        f"{Colors.BOLD}Status{Colors.RESET}"
+        f"{Colors.BOLD}Status{Colors.RESET}",
     ]
     table_data = []
@@ -580,13 +629,15 @@ async def display_status_dashboard():
             else f"{Colors.BG_RED}{Colors.WHITE} OFFLINE {Colors.RESET}"
         )
-        table_data.append([
-            f"{Colors.CYAN}{BACKEND_HOST}:{port}{Colors.RESET}",
-            BACKEND_HOST,
-            str(port),
-            conn_display,
-            status_display
-        ])
+        table_data.append(
+            [
+                f"{Colors.CYAN}{BACKEND_HOST}:{port}{Colors.RESET}",
+                BACKEND_HOST,
+                str(port),
+                conn_display,
+                status_display,
+            ]
+        )
     try:
         table = tabulate(table_data, headers=headers, tablefmt="fancy_grid")
@@ -594,13 +645,23 @@ async def display_status_dashboard():
         print()
         # Summary metrics
-        online_count = sum(1 for port in BACKEND_PORTS if (BACKEND_HOST, port) in current_available)
-        avg_connections = total_backend_connections / online_count if online_count else 0
+        online_count = sum(
+            1 for port in BACKEND_PORTS if (BACKEND_HOST, port) in current_available
+        )
+        avg_connections = (
+            total_backend_connections / online_count if online_count else 0
+        )
         print(f"{Colors.BOLD}{Colors.BRIGHT_WHITE}📋 Summary{Colors.RESET}")
         print(f"{Colors.BRIGHT_BLACK}{'─' * (dash_width // 4)}{Colors.RESET}")
-        print(f"{Colors.MAGENTA}🟢 Available Servers:{Colors.RESET} {online_count} / {len(BACKEND_PORTS)}")
-        print(f"{Colors.MAGENTA}📊 Total Backend Connections:{Colors.RESET} {total_backend_connections}")
-        print(f"{Colors.MAGENTA}📈 Average Load per Online Server:{Colors.RESET} {avg_connections:.1f}")
+        print(
+            f"{Colors.MAGENTA}🟢 Available Servers:{Colors.RESET} {online_count} / {len(BACKEND_PORTS)}"
+        )
+        print(
+            f"{Colors.MAGENTA}📊 Total Backend Connections:{Colors.RESET} {total_backend_connections}"
+        )
+        print(
+            f"{Colors.MAGENTA}📈 Average Load per Online Server:{Colors.RESET} {avg_connections:.1f}"
+        )
     except Exception as e:
         logger.error(f"Error displaying status table: {e}")
@@ -609,15 +670,17 @@ async def display_status_dashboard():
     # Footer with refresh info
     print()
     print(f"{Colors.BRIGHT_BLACK}{'─' * dash_width}{Colors.RESET}")
-    print(f"{Colors.DIM}🔄 Auto-refresh every {STATUS_PRINT_INTERVAL}s | Press Ctrl+C to stop{Colors.RESET}")
+    print(
+        f"{Colors.DIM}🔄 Auto-refresh every {STATUS_PRINT_INTERVAL}s | Press Ctrl+C to stop{Colors.RESET}"
+    )
     print(f"{Colors.BRIGHT_BLACK}{'─' * dash_width}{Colors.RESET}")
     print()
 # --- HTTP Stats Server ---
 from aiohttp import web
 async def stats_json(request):
     async with state_lock:
         # Build a list of all configured servers, with status and connections
@@ -626,12 +689,16 @@ async def stats_json(request):
         for port in BACKEND_PORTS:
             server = (BACKEND_HOST, port)
             is_online = server in available_set
-            all_servers.append({
-                "host": BACKEND_HOST,
-                "port": port,
-                "active_connections": connection_counts.get(server, 0) if is_online else 0,
-                "status": "ONLINE" if is_online else "OFFLINE",
-            })
+            all_servers.append(
+                {
+                    "host": BACKEND_HOST,
+                    "port": port,
+                    "active_connections": connection_counts.get(server, 0)
+                    if is_online
+                    else 0,
+                    "status": "ONLINE" if is_online else "OFFLINE",
+                }
+            )
         stats = {
             "time": datetime.now().isoformat(),
             "uptime": format_uptime(start_time),
@@ -643,10 +710,12 @@ async def stats_json(request):
             "current_active_connections": current_active_connections,
             "health_check_timeout": HEALTH_CHECK_TIMEOUT,
             "status_update_interval": STATUS_PRINT_INTERVAL,
+            "throttle_ms": THROTTLE_MS,
             "servers": all_servers,
         }
     return web.json_response(stats)
 async def stats_page(request):
     # High-quality HTML dashboard with auto-refresh and charts
     return web.Response(
@@ -786,86 +855,101 @@ async def stats_page(request):
     </script>
 </body>
 </html>
-            """
+            """,
     )
 async def start_stats_server(loop):
-        app = web.Application()
-        app.router.add_get('/stats', stats_page)
-        app.router.add_get('/stats.json', stats_json)
-        runner = web.AppRunner(app)
-        await runner.setup()
-        site = web.TCPSite(runner, LOAD_BALANCER_HOST, STATS_PORT)
-        await site.start()
-        logger.info(f"Stats HTTP server running at http://{LOAD_BALANCER_HOST}:{STATS_PORT}/stats")
+    app = web.Application()
+    app.router.add_get("/stats", stats_page)
+    app.router.add_get("/stats.json", stats_json)
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, LOAD_BALANCER_HOST, STATS_PORT)
+    await site.start()
+    logger.info(
+        f"Stats HTTP server running at http://{LOAD_BALANCER_HOST}:{STATS_PORT}/stats"
+    )
 async def main():
-        global start_time
-        start_time = time.time()
-        clear_terminal()
-        print_banner()
-        # Start background tasks
-        scan_task = asyncio.create_task(scan_and_update_servers())
-        status_task = asyncio.create_task(print_status_periodically())
-        # Start HTTP stats server (on STATS_PORT)
-        loop = asyncio.get_running_loop()
-        await start_stats_server(loop)
-        # Start TCP server (on LOAD_BALANCER_PORT)
-        server = await asyncio.start_server(
-            handle_client, LOAD_BALANCER_HOST, LOAD_BALANCER_PORT
-        )
+    global start_time
+    start_time = time.time()
+    clear_terminal()
+    print_banner()
-        addrs = ", ".join(str(sock.getsockname()) for sock in server.sockets)
-        logger.info(f"Load balancer serving on {addrs}")
-        logger.info(
-            f"Configured backend ports: {BACKEND_PORTS} on host {BACKEND_HOST}"
-        )
-        print(f"{Colors.BRIGHT_GREEN}✅ Load balancer started successfully!{Colors.RESET}")
-        print(f"{Colors.BRIGHT_GREEN}🌐 Proxy listening on: {addrs}{Colors.RESET}")
-        print(f"{Colors.BRIGHT_GREEN}📊 Stats dashboard: http://localhost:{STATS_PORT}/stats{Colors.RESET}")
-        print(f"{Colors.YELLOW}🔍 Scanning backend servers...{Colors.RESET}")
-        print()
-        await asyncio.sleep(2)
+    # Start background tasks
+    scan_task = asyncio.create_task(scan_and_update_servers())
+    status_task = asyncio.create_task(print_status_periodically())
+    # Start HTTP stats server (on STATS_PORT)
+    loop = asyncio.get_running_loop()
+    await start_stats_server(loop)
+    # Start TCP server (on LOAD_BALANCER_PORT)
+    server = await asyncio.start_server(
+        handle_client, LOAD_BALANCER_HOST, LOAD_BALANCER_PORT
+    )
+    addrs = ", ".join(str(sock.getsockname()) for sock in server.sockets)
+    logger.info(f"Load balancer serving on {addrs}")
+    logger.info(f"Configured backend ports: {BACKEND_PORTS} on host {BACKEND_HOST}")
+    print(f"{Colors.BRIGHT_GREEN}✅ Load balancer started successfully!{Colors.RESET}")
+    print(f"{Colors.BRIGHT_GREEN}🌐 Proxy listening on: {addrs}{Colors.RESET}")
+    print(
+        f"{Colors.BRIGHT_GREEN}📊 Stats dashboard: http://localhost:{STATS_PORT}/stats{Colors.RESET}"
+    )
+    print(f"{Colors.YELLOW}🔍 Scanning backend servers...{Colors.RESET}")
+    print()
+    await asyncio.sleep(2)
-        async with server:
+    async with server:
+        try:
+            await server.serve_forever()
+        except asyncio.CancelledError:
+            print(f"\n{Colors.YELLOW}🛑 Shutdown signal received...{Colors.RESET}")
+            logger.info("Load balancer server shutting down.")
+        except KeyboardInterrupt:
+            print(f"\n{Colors.YELLOW}🛑 Shutdown requested by user...{Colors.RESET}")
+            logger.info("Shutdown requested by user.")
+        finally:
+            print(f"{Colors.CYAN}🔄 Cleaning up background tasks...{Colors.RESET}")
+            logger.info("Cancelling background tasks...")
+            scan_task.cancel()
+            status_task.cancel()
             try:
-                await server.serve_forever()
+                await asyncio.gather(scan_task, status_task, return_exceptions=True)
             except asyncio.CancelledError:
-                print(f"\n{Colors.YELLOW}🛑 Shutdown signal received...{Colors.RESET}")
-                logger.info("Load balancer server shutting down.")
-            except KeyboardInterrupt:
-                print(f"\n{Colors.YELLOW}🛑 Shutdown requested by user...{Colors.RESET}")
-                logger.info("Shutdown requested by user.")
-            finally:
-                print(f"{Colors.CYAN}🔄 Cleaning up background tasks...{Colors.RESET}")
-                logger.info("Cancelling background tasks...")
-                scan_task.cancel()
-                status_task.cancel()
-                try:
-                    await asyncio.gather(scan_task, status_task, return_exceptions=True)
-                except asyncio.CancelledError:
-                    pass
-                print(f"{Colors.BRIGHT_GREEN}✅ Shutdown complete. Goodbye!{Colors.RESET}")
-                logger.info("Background tasks finished.")
+                pass
+            print(f"{Colors.BRIGHT_GREEN}✅ Shutdown complete. Goodbye!{Colors.RESET}")
+            logger.info("Background tasks finished.")
 def run_load_balancer():
-    global LOAD_BALANCER_PORT, BACKEND_PORTS, BACKEND_HOST, STATUS_PRINT_INTERVAL, HEALTH_CHECK_TIMEOUT, STATS_PORT
+    global \
+        LOAD_BALANCER_PORT, \
+        BACKEND_PORTS, \
+        BACKEND_HOST, \
+        STATUS_PRINT_INTERVAL, \
+        HEALTH_CHECK_TIMEOUT, \
+        THROTTLE_MS, \
+        STATS_PORT
     args = parse_args()
     LOAD_BALANCER_PORT = args.port
     BACKEND_HOST = args.host
     BACKEND_PORTS = [int(p.strip()) for p in args.ports.split(",") if p.strip()]
     STATUS_PRINT_INTERVAL = args.status_interval
     HEALTH_CHECK_TIMEOUT = args.health_timeout
+    THROTTLE_MS = args.throttle_ms
     if args.stats_port is not None:
         STATS_PORT = args.stats_port
     else:
         STATS_PORT = LOAD_BALANCER_PORT + 1
     if not BACKEND_PORTS:
         print(f"{Colors.BG_RED}{Colors.WHITE} ❌ ERROR {Colors.RESET}")
-        print(f"{Colors.RED}No backend ports specified. Use --ports 8140,8150 ...{Colors.RESET}")
+        print(
+            f"{Colors.RED}No backend ports specified. Use --ports 8140,8150 ...{Colors.RESET}"
+        )
         logger.critical("No backend ports specified. Use --ports 8140,8150 ...")
         sys.exit(1)
     try:
@@ -878,5 +962,6 @@ def run_load_balancer():
         print(f"{Colors.RED}Critical error in main execution: {e}{Colors.RESET}")
         logger.critical(f"Critical error in main execution: {e}")
 if __name__ == "__main__":
     run_load_balancer()

{speedy_utils-1.1.5.dist-info → speedy_utils-1.1.6.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: speedy-utils
-Version: 1.1.5
+Version: 1.1.6
 Summary: Fast and easy-to-use package for data science
 Author: AnhVTH
 Author-email: anhvth.226@gmail.com

{speedy_utils-1.1.5.dist-info → speedy_utils-1.1.6.dist-info}/RECORD RENAMED Viewed

@@ -1,17 +1,20 @@
 llm_utils/__init__.py,sha256=SlaCMArn_uKVw4r0psz0q0IOQ1VFGdgCFOAKxQ81WTI,694
 llm_utils/chat_format/__init__.py,sha256=8dBIUqFJvkgQYedxBtcyxt-4tt8JxAKVap2JlTXmgaM,737
-llm_utils/chat_format/display.py,sha256=qaEGADGP8iQFzWOuzEO7_HyrqAFdEnUfkHAH28b0ym0,9772
+llm_utils/chat_format/display.py,sha256=M-__JpcJSqjqeP4LiW7-yF8fVL37yUEUdaNC4VEgIo8,10181
 llm_utils/chat_format/transform.py,sha256=8TZhvUS5DrjUeMNtDIuWY54B_QZ7jjpXEL9c8F5z79w,5400
 llm_utils/chat_format/utils.py,sha256=xTxN4HrLHcRO2PfCTR43nH1M5zCa7v0kTTdzAcGkZg0,1229
 llm_utils/group_messages.py,sha256=8CU9nKOja3xeuhdrX5CvYVveSqSKb2zQ0eeNzA88aTQ,3621
 llm_utils/lm/__init__.py,sha256=rX36_MsnekM5GHwWS56XELbm4W5x2TDwnPERDTfo0eU,194
-llm_utils/lm/async_lm.py,sha256=eTyI9x4iZc4ZhYdwNadTYap5HgBJygiV_EBDZ-Og1cQ,34357
+llm_utils/lm/async_lm/__init__.py,sha256=ouN2z1G24OwFglo2asBZ5w18RYcvTZ5r3ylx1aYp2rQ,70
+llm_utils/lm/async_lm/_utils.py,sha256=16yks9grCmFotuqFKxVBfPvrrrquBfhzFUZ22mno3LY,5946
+llm_utils/lm/async_lm/async_llm_task.py,sha256=JIU18Q7sPu_LCUR-geZmqiQ37stM8CqXmMzzkRUYibM,5352
+llm_utils/lm/async_lm/async_lm.py,sha256=LzX1zMKiAENtqPtQ-laVbUyQ1g6q9Xhuyr_IWIjxdVk,28589
 llm_utils/lm/chat_html.py,sha256=FkGo0Dv_nAHYBMZzXfMu_bGQKaCx302goh3XaT-_ETc,8674
 llm_utils/lm/lm_json.py,sha256=fMt42phzFV2f6ulrtWcDXsWHi8WcG7gGkCzpIq8VSSM,1975
 llm_utils/lm/sync_lm.py,sha256=ANw_m5KiWcRwwoeQ5no6dzPFLc6j9o2oEcJtkMKqrn8,34640
 llm_utils/lm/utils.py,sha256=gUejbVZPYg97g4ftYEptYN52WhH3TAKOFW81sjLvi08,4585
 llm_utils/scripts/README.md,sha256=yuOLnLa2od2jp4wVy3rV0rESeiV3o8zol5MNMsZx0DY,999
-llm_utils/scripts/vllm_load_balancer.py,sha256=zz5aTaYwy5tYrv2RIhrizrGP-PnPAohgrl9kQvvJywA,35091
+llm_utils/scripts/vllm_load_balancer.py,sha256=DxZNDGl8tViE0SuGQPneBHqkkagyuEm70bNzWyqa_9Q,36837
 llm_utils/scripts/vllm_serve.py,sha256=4NaqpVs7LBvxtvTCMPsNCAOfqiWkKRttxWMmWY7SitA,14729
 speedy_utils/__init__.py,sha256=YCpiReW22zG4KkQXQe6V9BQ8bn7PtiXolOaW_iL8T4M,5734
 speedy_utils/all.py,sha256=t-HKzDmhF1MTFnmq7xRnPs5nFG_aZaLH9Ua0RM6nQ9Y,4855
@@ -31,7 +34,7 @@ speedy_utils/multi_worker/thread.py,sha256=u_hTwXh7_FciMa5EukdEA1fDCY_vUC4moDceB
 speedy_utils/scripts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 speedy_utils/scripts/mpython.py,sha256=73PHm1jqbCt2APN4xuNjD0VDKwzOj4EZsViEMQiZU2g,3853
 speedy_utils/scripts/openapi_client_codegen.py,sha256=f2125S_q0PILgH5dyzoKRz7pIvNEjCkzpi4Q4pPFRZE,9683
-speedy_utils-1.1.5.dist-info/METADATA,sha256=j4-glLlCDTANN-zB9g7NMG3viJwBH2FhYGWu_HpD4kc,7441
-speedy_utils-1.1.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-speedy_utils-1.1.5.dist-info/entry_points.txt,sha256=T1t85jwx8fK6m5msdkBGIXH5R5Kd0zSL0S6erXERPzg,237
-speedy_utils-1.1.5.dist-info/RECORD,,
+speedy_utils-1.1.6.dist-info/METADATA,sha256=vfJU1DO0R5VkRqYrN1omYIgcHNDvvKJ4F87l77wjP2s,7441
+speedy_utils-1.1.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+speedy_utils-1.1.6.dist-info/entry_points.txt,sha256=T1t85jwx8fK6m5msdkBGIXH5R5Kd0zSL0S6erXERPzg,237
+speedy_utils-1.1.6.dist-info/RECORD,,

{speedy_utils-1.1.5.dist-info → speedy_utils-1.1.6.dist-info}/WHEEL RENAMED Viewed

File without changes

{speedy_utils-1.1.5.dist-info → speedy_utils-1.1.6.dist-info}/entry_points.txt RENAMED Viewed

File without changes

speedy-utils 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl

speedy-utils 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl