lumen-app 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lumen_app/__init__.py +7 -0
- lumen_app/core/__init__.py +0 -0
- lumen_app/core/config.py +661 -0
- lumen_app/core/installer.py +274 -0
- lumen_app/core/loader.py +45 -0
- lumen_app/core/router.py +87 -0
- lumen_app/core/server.py +389 -0
- lumen_app/core/service.py +49 -0
- lumen_app/core/tests/__init__.py +1 -0
- lumen_app/core/tests/test_core_integration.py +561 -0
- lumen_app/core/tests/test_env_checker.py +487 -0
- lumen_app/proto/README.md +12 -0
- lumen_app/proto/ml_service.proto +88 -0
- lumen_app/proto/ml_service_pb2.py +66 -0
- lumen_app/proto/ml_service_pb2.pyi +136 -0
- lumen_app/proto/ml_service_pb2_grpc.py +251 -0
- lumen_app/server.py +362 -0
- lumen_app/utils/env_checker.py +752 -0
- lumen_app/utils/installation/__init__.py +25 -0
- lumen_app/utils/installation/env_manager.py +152 -0
- lumen_app/utils/installation/micromamba_installer.py +459 -0
- lumen_app/utils/installation/package_installer.py +149 -0
- lumen_app/utils/installation/verifier.py +95 -0
- lumen_app/utils/logger.py +181 -0
- lumen_app/utils/mamba/cuda.yaml +12 -0
- lumen_app/utils/mamba/default.yaml +6 -0
- lumen_app/utils/mamba/openvino.yaml +7 -0
- lumen_app/utils/mamba/tensorrt.yaml +13 -0
- lumen_app/utils/package_resolver.py +309 -0
- lumen_app/utils/preset_registry.py +219 -0
- lumen_app/web/__init__.py +3 -0
- lumen_app/web/api/__init__.py +1 -0
- lumen_app/web/api/config.py +229 -0
- lumen_app/web/api/hardware.py +201 -0
- lumen_app/web/api/install.py +608 -0
- lumen_app/web/api/server.py +253 -0
- lumen_app/web/core/__init__.py +1 -0
- lumen_app/web/core/server_manager.py +348 -0
- lumen_app/web/core/state.py +264 -0
- lumen_app/web/main.py +145 -0
- lumen_app/web/models/__init__.py +28 -0
- lumen_app/web/models/config.py +63 -0
- lumen_app/web/models/hardware.py +64 -0
- lumen_app/web/models/install.py +134 -0
- lumen_app/web/models/server.py +95 -0
- lumen_app/web/static/assets/index-CGuhGHC9.css +1 -0
- lumen_app/web/static/assets/index-DN6HmxWS.js +56 -0
- lumen_app/web/static/index.html +14 -0
- lumen_app/web/static/vite.svg +1 -0
- lumen_app/web/websockets/__init__.py +1 -0
- lumen_app/web/websockets/logs.py +159 -0
- lumen_app-0.4.2.dist-info/METADATA +23 -0
- lumen_app-0.4.2.dist-info/RECORD +56 -0
- lumen_app-0.4.2.dist-info/WHEEL +5 -0
- lumen_app-0.4.2.dist-info/entry_points.txt +3 -0
- lumen_app-0.4.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
"""Server management API endpoints."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from fastapi import APIRouter, HTTPException
|
|
6
|
+
|
|
7
|
+
from lumen_app.utils.logger import get_logger
|
|
8
|
+
from lumen_app.web.core.state import app_state
|
|
9
|
+
from lumen_app.web.models.server import (
|
|
10
|
+
ServerLogs,
|
|
11
|
+
ServerRestartRequest,
|
|
12
|
+
ServerStartRequest,
|
|
13
|
+
ServerStatus,
|
|
14
|
+
ServerStopRequest,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
# Module-level logger shared by all server-management endpoints in this file.
logger = get_logger("lumen.web.api.server")
|
|
18
|
+
# Router that the endpoint decorators below register their routes on.
router = APIRouter()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@router.get("/status", response_model=ServerStatus)
async def get_server_status():
    """
    Report the current state of the managed ML server.

    The running flag, PID, uptime, port and config path are read from the
    shared server manager. A health check is awaited only while the process
    is running; otherwise health is reported as "unknown".

    Several fields are fixed placeholders for now: host, service name,
    environment and last error are not read from any configuration, and the
    port falls back to 50051 when the manager has no port recorded.

    Returns:
        A ServerStatus snapshot.
    """
    server = app_state.server_manager

    # Snapshot the cheap properties first, before anything is awaited.
    is_up = server.is_running
    process_id = server.pid
    uptime = server.uptime_seconds

    # Only a live process is asked about its health.
    health_state = await server.health_check() if is_up else "unknown"

    snapshot = {
        "running": is_up,
        "pid": process_id,
        "port": server.port or 50051,
        "host": "0.0.0.0",
        "uptime_seconds": uptime,
        "service_name": "lumen-ai",  # TODO: Get from config
        "config_path": server.config_path,
        "environment": "lumen_env",  # TODO: Get from config
        "health": health_state,
        "last_error": None,  # TODO: Track last error
    }
    return ServerStatus(**snapshot)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@router.post("/start", response_model=ServerStatus)
async def start_server(request: ServerStartRequest):
    """
    Start the ML server with the requested configuration.

    Args:
        request: Start parameters. The fields used here are:
            - config_path: path to the Lumen YAML configuration
            - port: optional port override
            - environment: environment name, forwarded to the manager
          The log level is currently fixed to "INFO".

    Returns:
        The server status as reported right after startup.

    Raises:
        HTTPException 400: If the server is already running, or the manager
            raises RuntimeError.
        HTTPException 404: If the manager raises FileNotFoundError.
        HTTPException 500: If the manager reports a failed start or any
            other error occurs.
    """
    logger.info(f"Starting ML server with config: {request.config_path}")

    manager = app_state.server_manager

    # Refuse to start a second instance.
    if manager.is_running:
        raise HTTPException(
            status_code=400,
            detail="Server is already running. Stop it first or use restart.",
        )

    try:
        success = await manager.start(
            config_path=request.config_path,
            port=request.port,
            log_level="INFO",  # TODO: Make configurable
            environment=request.environment,
        )

        if not success:
            raise HTTPException(
                status_code=500,
                detail="Server failed to start. Check logs for details.",
            )

        logger.info("✓ ML server started successfully")

        return await get_server_status()

    except HTTPException:
        # Already a well-formed HTTP error (e.g. the failed-start 500 above).
        # Let it through unchanged so the generic handler below does not log
        # it as "unexpected" or re-wrap its detail message.
        raise

    except FileNotFoundError as e:
        logger.error(f"Config file not found: {e}")
        raise HTTPException(status_code=404, detail=str(e)) from e

    except RuntimeError as e:
        logger.error(f"Runtime error starting server: {e}")
        raise HTTPException(status_code=400, detail=str(e)) from e

    except Exception as e:
        logger.error(f"Unexpected error starting server: {e}", exc_info=True)
        raise HTTPException(
            status_code=500, detail=f"Failed to start server: {str(e)}"
        ) from e
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@router.post("/stop", response_model=ServerStatus)
async def stop_server(request: ServerStopRequest):
    """
    Stop the running ML server.

    Args:
        request: Stop parameters. The fields used here are:
            - force: passed to the manager as its ``force`` flag
            - timeout: passed to the manager as its ``timeout`` (seconds)

    Returns:
        The server status as reported right after shutdown.

    Raises:
        HTTPException 400: If the server is not running.
        HTTPException 500: If the manager reports a failed stop or any
            other error occurs.
    """
    logger.info("Stopping ML server")

    manager = app_state.server_manager

    # Nothing to do when no server is running.
    if not manager.is_running:
        raise HTTPException(
            status_code=400, detail="Server is not running. Nothing to stop."
        )

    try:
        success = await manager.stop(timeout=request.timeout, force=request.force)

        if not success:
            raise HTTPException(
                status_code=500,
                detail="Server failed to stop gracefully. Check logs for details.",
            )

        logger.info("✓ ML server stopped successfully")

        return await get_server_status()

    except HTTPException:
        # Keep the failed-stop 500 above intact instead of letting the
        # generic handler log a traceback and re-wrap its detail message.
        raise

    except Exception as e:
        logger.error(f"Error stopping server: {e}", exc_info=True)
        raise HTTPException(
            status_code=500, detail=f"Failed to stop server: {str(e)}"
        ) from e
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@router.post("/restart", response_model=ServerStatus)
async def restart_server(request: ServerRestartRequest):
    """
    Restart the ML server, optionally with a new configuration.

    The stop/start sequencing is delegated to the server manager.

    Args:
        request: Restart parameters. Only these fields are forwarded to the
            manager:
            - config_path: optional new config path
            - port: optional new port
            - timeout: maximum seconds to wait for the stop phase
          Any other request fields are not passed on by this endpoint, and
          the log level is currently fixed to "INFO".

    Returns:
        The server status as reported right after the restart.

    Raises:
        HTTPException 400: If the manager raises ValueError (invalid restart
            configuration).
        HTTPException 500: If the manager reports a failed restart or any
            other error occurs.
    """
    logger.info("Restarting ML server")

    manager = app_state.server_manager

    try:
        success = await manager.restart(
            config_path=request.config_path,
            port=request.port,
            log_level="INFO",  # TODO: Make configurable
            timeout=request.timeout,
        )

        if not success:
            raise HTTPException(
                status_code=500,
                detail="Server failed to restart. Check logs for details.",
            )

        logger.info("✓ ML server restarted successfully")

        return await get_server_status()

    except HTTPException:
        # Keep the failed-restart 500 above intact instead of letting the
        # generic handler log a traceback and re-wrap its detail message.
        raise

    except ValueError as e:
        logger.error(f"Invalid restart configuration: {e}")
        raise HTTPException(status_code=400, detail=str(e)) from e

    except Exception as e:
        logger.error(f"Error restarting server: {e}", exc_info=True)
        raise HTTPException(
            status_code=500, detail=f"Failed to restart server: {str(e)}"
        ) from e
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
@router.get("/logs", response_model=ServerLogs)
async def get_server_logs(lines: int = 100, since: float | None = None):
    """
    Return recent lines captured from the ML server's output.

    Args:
        lines: How many of the most recent lines to return (default 100).
            The value is handed to the manager's ``get_logs`` as ``tail``.
        since: Accepted but not used yet; reserved for future filtering.

    Returns:
        A ServerLogs object holding the returned lines, their count as
        ``total_lines``, and ``new_lines`` fixed at 0 until incremental
        fetching is implemented.
    """
    buffered = app_state.server_manager.get_logs(tail=lines)
    line_count = len(buffered)

    return ServerLogs(
        logs=buffered,
        total_lines=line_count,
        new_lines=0,  # TODO: Implement incremental log fetching
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Core modules for Lumen Web API."""
|
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Server Manager for gRPC ML Server Process.
|
|
3
|
+
|
|
4
|
+
This module provides lifecycle management for the gRPC ML server subprocess,
|
|
5
|
+
including starting, stopping, health checking, and log capture.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
import time
|
|
12
|
+
from collections import deque
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Literal
|
|
15
|
+
|
|
16
|
+
from lumen_app.utils.logger import get_logger
|
|
17
|
+
|
|
18
|
+
# Module-level logger used by ServerManager for lifecycle and log-capture messages.
logger = get_logger("lumen.web.server_manager")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ServerManager:
|
|
22
|
+
"""
|
|
23
|
+
Manages the gRPC ML server as a subprocess.
|
|
24
|
+
|
|
25
|
+
This class handles:
|
|
26
|
+
- Starting the server with proper configuration
|
|
27
|
+
- Monitoring server health and status
|
|
28
|
+
- Capturing and buffering logs
|
|
29
|
+
- Graceful shutdown with timeout handling
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
def __init__(self, max_log_lines: int = 1000):
|
|
33
|
+
"""
|
|
34
|
+
Initialize the server manager.
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
max_log_lines: Maximum number of log lines to keep in memory
|
|
38
|
+
"""
|
|
39
|
+
self.process: asyncio.subprocess.Process | None = None
|
|
40
|
+
self.config_path: str | None = None
|
|
41
|
+
self.port: int | None = None
|
|
42
|
+
self.start_time: float | None = None
|
|
43
|
+
self.log_buffer: deque[str] = deque(maxlen=max_log_lines)
|
|
44
|
+
self._log_task: asyncio.Task | None = None
|
|
45
|
+
self._shutdown_event = asyncio.Event()
|
|
46
|
+
|
|
47
|
+
@property
|
|
48
|
+
def is_running(self) -> bool:
|
|
49
|
+
"""Check if the server process is running."""
|
|
50
|
+
if self.process is None:
|
|
51
|
+
return False
|
|
52
|
+
|
|
53
|
+
# Check if process is still alive
|
|
54
|
+
return self.process.returncode is None
|
|
55
|
+
|
|
56
|
+
@property
|
|
57
|
+
def pid(self) -> int | None:
|
|
58
|
+
"""Get the server process PID."""
|
|
59
|
+
if self.process:
|
|
60
|
+
return self.process.pid
|
|
61
|
+
return None
|
|
62
|
+
|
|
63
|
+
@property
|
|
64
|
+
def uptime_seconds(self) -> float | None:
|
|
65
|
+
"""Get server uptime in seconds."""
|
|
66
|
+
if self.start_time and self.is_running:
|
|
67
|
+
return time.time() - self.start_time
|
|
68
|
+
return None
|
|
69
|
+
|
|
70
|
+
async def start(
|
|
71
|
+
self,
|
|
72
|
+
config_path: str,
|
|
73
|
+
port: int | None = None,
|
|
74
|
+
log_level: str = "INFO",
|
|
75
|
+
environment: str | None = None,
|
|
76
|
+
) -> bool:
|
|
77
|
+
"""
|
|
78
|
+
Start the gRPC ML server as a subprocess.
|
|
79
|
+
|
|
80
|
+
Args:
|
|
81
|
+
config_path: Path to the Lumen configuration YAML file
|
|
82
|
+
port: Port number (overrides config file)
|
|
83
|
+
log_level: Logging level for the server
|
|
84
|
+
environment: Conda environment name (if using conda)
|
|
85
|
+
|
|
86
|
+
Returns:
|
|
87
|
+
True if server started successfully, False otherwise
|
|
88
|
+
|
|
89
|
+
Raises:
|
|
90
|
+
RuntimeError: If server is already running or config is invalid
|
|
91
|
+
"""
|
|
92
|
+
if self.is_running:
|
|
93
|
+
raise RuntimeError("Server is already running")
|
|
94
|
+
|
|
95
|
+
# Validate config path
|
|
96
|
+
config_file = Path(config_path).expanduser()
|
|
97
|
+
if not config_file.exists():
|
|
98
|
+
raise FileNotFoundError(f"Config file not found: {config_path}")
|
|
99
|
+
|
|
100
|
+
logger.info(f"Starting ML server with config: {config_path}")
|
|
101
|
+
|
|
102
|
+
# Build command
|
|
103
|
+
# We'll use python -m to run the server module
|
|
104
|
+
cmd = [
|
|
105
|
+
"python",
|
|
106
|
+
"-m",
|
|
107
|
+
"lumen_app.core.server",
|
|
108
|
+
"--config",
|
|
109
|
+
str(config_file),
|
|
110
|
+
"--log-level",
|
|
111
|
+
log_level,
|
|
112
|
+
]
|
|
113
|
+
|
|
114
|
+
if port:
|
|
115
|
+
cmd.extend(["--port", str(port)])
|
|
116
|
+
|
|
117
|
+
logger.debug(f"Server command: {' '.join(cmd)}")
|
|
118
|
+
|
|
119
|
+
try:
|
|
120
|
+
# Start the subprocess
|
|
121
|
+
self.process = await asyncio.create_subprocess_exec(
|
|
122
|
+
*cmd,
|
|
123
|
+
stdout=asyncio.subprocess.PIPE,
|
|
124
|
+
stderr=asyncio.subprocess.STDOUT, # Merge stderr into stdout
|
|
125
|
+
stdin=asyncio.subprocess.DEVNULL,
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
self.config_path = config_path
|
|
129
|
+
self.port = port
|
|
130
|
+
self.start_time = time.time()
|
|
131
|
+
|
|
132
|
+
logger.info(f"Server process started with PID: {self.process.pid}")
|
|
133
|
+
|
|
134
|
+
# Start log capture task
|
|
135
|
+
self._log_task = asyncio.create_task(self._capture_logs())
|
|
136
|
+
|
|
137
|
+
# Wait for server to be ready (with timeout)
|
|
138
|
+
ready = await self._wait_for_ready(timeout=30.0)
|
|
139
|
+
|
|
140
|
+
if not ready:
|
|
141
|
+
logger.error("Server failed to start within timeout")
|
|
142
|
+
await self.stop(force=True)
|
|
143
|
+
return False
|
|
144
|
+
|
|
145
|
+
logger.info("✓ ML server is ready")
|
|
146
|
+
return True
|
|
147
|
+
|
|
148
|
+
except Exception as e:
|
|
149
|
+
logger.error(f"Failed to start server: {e}", exc_info=True)
|
|
150
|
+
if self.process:
|
|
151
|
+
await self.stop(force=True)
|
|
152
|
+
return False
|
|
153
|
+
|
|
154
|
+
async def stop(self, timeout: float = 30.0, force: bool = False) -> bool:
|
|
155
|
+
"""
|
|
156
|
+
Stop the gRPC ML server.
|
|
157
|
+
|
|
158
|
+
Args:
|
|
159
|
+
timeout: Maximum time to wait for graceful shutdown
|
|
160
|
+
force: If True, skip graceful shutdown and kill immediately
|
|
161
|
+
|
|
162
|
+
Returns:
|
|
163
|
+
True if server stopped successfully, False otherwise
|
|
164
|
+
"""
|
|
165
|
+
if not self.process:
|
|
166
|
+
logger.warning("No server process to stop")
|
|
167
|
+
return True
|
|
168
|
+
|
|
169
|
+
if not self.is_running:
|
|
170
|
+
logger.info("Server process already stopped")
|
|
171
|
+
self._cleanup()
|
|
172
|
+
return True
|
|
173
|
+
|
|
174
|
+
logger.info(f"Stopping ML server (PID: {self.process.pid})")
|
|
175
|
+
|
|
176
|
+
try:
|
|
177
|
+
if force:
|
|
178
|
+
# Force kill immediately
|
|
179
|
+
logger.warning("Force killing server process")
|
|
180
|
+
self.process.kill()
|
|
181
|
+
else:
|
|
182
|
+
# Try graceful shutdown first
|
|
183
|
+
logger.info("Sending SIGTERM for graceful shutdown")
|
|
184
|
+
self.process.terminate()
|
|
185
|
+
|
|
186
|
+
# Wait for graceful shutdown with timeout
|
|
187
|
+
try:
|
|
188
|
+
await asyncio.wait_for(self.process.wait(), timeout=timeout)
|
|
189
|
+
logger.info("Server stopped gracefully")
|
|
190
|
+
except asyncio.TimeoutError:
|
|
191
|
+
logger.warning(
|
|
192
|
+
f"Server did not stop within {timeout}s, force killing"
|
|
193
|
+
)
|
|
194
|
+
self.process.kill()
|
|
195
|
+
await self.process.wait()
|
|
196
|
+
|
|
197
|
+
self._cleanup()
|
|
198
|
+
logger.info("✓ Server stopped")
|
|
199
|
+
return True
|
|
200
|
+
|
|
201
|
+
except Exception as e:
|
|
202
|
+
logger.error(f"Error stopping server: {e}", exc_info=True)
|
|
203
|
+
return False
|
|
204
|
+
|
|
205
|
+
async def restart(
|
|
206
|
+
self,
|
|
207
|
+
config_path: str | None = None,
|
|
208
|
+
port: int | None = None,
|
|
209
|
+
log_level: str = "INFO",
|
|
210
|
+
timeout: float = 30.0,
|
|
211
|
+
) -> bool:
|
|
212
|
+
"""
|
|
213
|
+
Restart the server.
|
|
214
|
+
|
|
215
|
+
Args:
|
|
216
|
+
config_path: New config path (or use existing)
|
|
217
|
+
port: New port (or use existing)
|
|
218
|
+
log_level: Logging level
|
|
219
|
+
timeout: Timeout for stop operation
|
|
220
|
+
|
|
221
|
+
Returns:
|
|
222
|
+
True if restart successful, False otherwise
|
|
223
|
+
"""
|
|
224
|
+
logger.info("Restarting ML server")
|
|
225
|
+
|
|
226
|
+
# Use existing config if not provided
|
|
227
|
+
config_path = config_path or self.config_path
|
|
228
|
+
port = port or self.port
|
|
229
|
+
|
|
230
|
+
if not config_path:
|
|
231
|
+
raise ValueError("No config path specified for restart")
|
|
232
|
+
|
|
233
|
+
# Stop if running
|
|
234
|
+
if self.is_running:
|
|
235
|
+
success = await self.stop(timeout=timeout)
|
|
236
|
+
if not success:
|
|
237
|
+
logger.error("Failed to stop server for restart")
|
|
238
|
+
return False
|
|
239
|
+
|
|
240
|
+
# Wait a bit for resources to be freed
|
|
241
|
+
await asyncio.sleep(1.0)
|
|
242
|
+
|
|
243
|
+
# Start with new/existing config
|
|
244
|
+
return await self.start(config_path=config_path, port=port, log_level=log_level)
|
|
245
|
+
|
|
246
|
+
async def health_check(self) -> Literal["healthy", "unhealthy", "unknown"]:
|
|
247
|
+
"""
|
|
248
|
+
Perform a health check on the server.
|
|
249
|
+
|
|
250
|
+
Returns:
|
|
251
|
+
"healthy" if server is running and responsive
|
|
252
|
+
"unhealthy" if server process exists but not responsive
|
|
253
|
+
"unknown" if server is not running
|
|
254
|
+
"""
|
|
255
|
+
if not self.is_running:
|
|
256
|
+
return "unknown"
|
|
257
|
+
|
|
258
|
+
# TODO: Implement actual gRPC health check
|
|
259
|
+
# For now, just check if process is running
|
|
260
|
+
return "healthy"
|
|
261
|
+
|
|
262
|
+
def get_logs(self, tail: int = 100) -> list[str]:
|
|
263
|
+
"""
|
|
264
|
+
Get recent log lines.
|
|
265
|
+
|
|
266
|
+
Args:
|
|
267
|
+
tail: Number of recent lines to return (0 for all)
|
|
268
|
+
|
|
269
|
+
Returns:
|
|
270
|
+
List of log lines
|
|
271
|
+
"""
|
|
272
|
+
if tail > 0:
|
|
273
|
+
# Return last N lines
|
|
274
|
+
return list(self.log_buffer)[-tail:]
|
|
275
|
+
else:
|
|
276
|
+
# Return all lines
|
|
277
|
+
return list(self.log_buffer)
|
|
278
|
+
|
|
279
|
+
async def _capture_logs(self):
|
|
280
|
+
"""Background task to capture server stdout/stderr."""
|
|
281
|
+
if not self.process or not self.process.stdout:
|
|
282
|
+
return
|
|
283
|
+
|
|
284
|
+
logger.debug("Starting log capture")
|
|
285
|
+
|
|
286
|
+
try:
|
|
287
|
+
async for line in self.process.stdout:
|
|
288
|
+
try:
|
|
289
|
+
log_line = line.decode("utf-8").rstrip()
|
|
290
|
+
self.log_buffer.append(log_line)
|
|
291
|
+
|
|
292
|
+
# Also log to our logger for debugging
|
|
293
|
+
logger.debug(f"[Server] {log_line}")
|
|
294
|
+
|
|
295
|
+
except Exception as e:
|
|
296
|
+
logger.warning(f"Error decoding log line: {e}")
|
|
297
|
+
|
|
298
|
+
except asyncio.CancelledError:
|
|
299
|
+
logger.debug("Log capture cancelled")
|
|
300
|
+
except Exception as e:
|
|
301
|
+
logger.error(f"Error capturing logs: {e}", exc_info=True)
|
|
302
|
+
|
|
303
|
+
async def _wait_for_ready(self, timeout: float = 30.0) -> bool:
|
|
304
|
+
"""
|
|
305
|
+
Wait for the server to be ready.
|
|
306
|
+
|
|
307
|
+
Args:
|
|
308
|
+
timeout: Maximum time to wait
|
|
309
|
+
|
|
310
|
+
Returns:
|
|
311
|
+
True if server is ready, False if timeout or error
|
|
312
|
+
"""
|
|
313
|
+
logger.info(f"Waiting for server to be ready (timeout: {timeout}s)...")
|
|
314
|
+
|
|
315
|
+
start_time = time.time()
|
|
316
|
+
|
|
317
|
+
while time.time() - start_time < timeout:
|
|
318
|
+
# Check if process died
|
|
319
|
+
if not self.is_running:
|
|
320
|
+
logger.error("Server process died during startup")
|
|
321
|
+
return False
|
|
322
|
+
|
|
323
|
+
# Look for startup success indicators in logs
|
|
324
|
+
recent_logs = self.get_logs(tail=10)
|
|
325
|
+
for log_line in recent_logs:
|
|
326
|
+
if (
|
|
327
|
+
"listening on" in log_line.lower()
|
|
328
|
+
or "server running" in log_line.lower()
|
|
329
|
+
):
|
|
330
|
+
logger.info("Server startup detected in logs")
|
|
331
|
+
# Give it a moment to fully initialize
|
|
332
|
+
await asyncio.sleep(1.0)
|
|
333
|
+
return True
|
|
334
|
+
|
|
335
|
+
# Wait a bit before checking again
|
|
336
|
+
await asyncio.sleep(0.5)
|
|
337
|
+
|
|
338
|
+
logger.error(f"Server did not start within {timeout}s")
|
|
339
|
+
return False
|
|
340
|
+
|
|
341
|
+
def _cleanup(self):
|
|
342
|
+
"""Clean up server state after shutdown."""
|
|
343
|
+
if self._log_task and not self._log_task.done():
|
|
344
|
+
self._log_task.cancel()
|
|
345
|
+
|
|
346
|
+
self.process = None
|
|
347
|
+
self.start_time = None
|
|
348
|
+
self._log_task = None
|