mantisdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mantisdk might be problematic. Click here for more details.
- mantisdk/__init__.py +22 -0
- mantisdk/adapter/__init__.py +15 -0
- mantisdk/adapter/base.py +94 -0
- mantisdk/adapter/messages.py +270 -0
- mantisdk/adapter/triplet.py +1028 -0
- mantisdk/algorithm/__init__.py +39 -0
- mantisdk/algorithm/apo/__init__.py +5 -0
- mantisdk/algorithm/apo/apo.py +889 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
- mantisdk/algorithm/base.py +162 -0
- mantisdk/algorithm/decorator.py +264 -0
- mantisdk/algorithm/fast.py +250 -0
- mantisdk/algorithm/gepa/__init__.py +59 -0
- mantisdk/algorithm/gepa/adapter.py +459 -0
- mantisdk/algorithm/gepa/gepa.py +364 -0
- mantisdk/algorithm/gepa/lib/__init__.py +18 -0
- mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
- mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
- mantisdk/algorithm/gepa/lib/api.py +375 -0
- mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
- mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
- mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
- mantisdk/algorithm/gepa/lib/core/result.py +233 -0
- mantisdk/algorithm/gepa/lib/core/state.py +636 -0
- mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
- mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
- mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
- mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
- mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
- mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
- mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
- mantisdk/algorithm/gepa/lib/py.typed +0 -0
- mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
- mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
- mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
- mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
- mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
- mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
- mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
- mantisdk/algorithm/gepa/tracing.py +105 -0
- mantisdk/algorithm/utils.py +177 -0
- mantisdk/algorithm/verl/__init__.py +5 -0
- mantisdk/algorithm/verl/interface.py +202 -0
- mantisdk/cli/__init__.py +56 -0
- mantisdk/cli/prometheus.py +115 -0
- mantisdk/cli/store.py +131 -0
- mantisdk/cli/vllm.py +29 -0
- mantisdk/client.py +408 -0
- mantisdk/config.py +348 -0
- mantisdk/emitter/__init__.py +43 -0
- mantisdk/emitter/annotation.py +370 -0
- mantisdk/emitter/exception.py +54 -0
- mantisdk/emitter/message.py +61 -0
- mantisdk/emitter/object.py +117 -0
- mantisdk/emitter/reward.py +320 -0
- mantisdk/env_var.py +156 -0
- mantisdk/execution/__init__.py +15 -0
- mantisdk/execution/base.py +64 -0
- mantisdk/execution/client_server.py +443 -0
- mantisdk/execution/events.py +69 -0
- mantisdk/execution/inter_process.py +16 -0
- mantisdk/execution/shared_memory.py +282 -0
- mantisdk/instrumentation/__init__.py +119 -0
- mantisdk/instrumentation/agentops.py +314 -0
- mantisdk/instrumentation/agentops_langchain.py +45 -0
- mantisdk/instrumentation/litellm.py +83 -0
- mantisdk/instrumentation/vllm.py +81 -0
- mantisdk/instrumentation/weave.py +500 -0
- mantisdk/litagent/__init__.py +11 -0
- mantisdk/litagent/decorator.py +536 -0
- mantisdk/litagent/litagent.py +252 -0
- mantisdk/llm_proxy.py +1890 -0
- mantisdk/logging.py +370 -0
- mantisdk/reward.py +7 -0
- mantisdk/runner/__init__.py +11 -0
- mantisdk/runner/agent.py +845 -0
- mantisdk/runner/base.py +182 -0
- mantisdk/runner/legacy.py +309 -0
- mantisdk/semconv.py +170 -0
- mantisdk/server.py +401 -0
- mantisdk/store/__init__.py +23 -0
- mantisdk/store/base.py +897 -0
- mantisdk/store/client_server.py +2092 -0
- mantisdk/store/collection/__init__.py +30 -0
- mantisdk/store/collection/base.py +587 -0
- mantisdk/store/collection/memory.py +970 -0
- mantisdk/store/collection/mongo.py +1412 -0
- mantisdk/store/collection_based.py +1823 -0
- mantisdk/store/insight.py +648 -0
- mantisdk/store/listener.py +58 -0
- mantisdk/store/memory.py +396 -0
- mantisdk/store/mongo.py +165 -0
- mantisdk/store/sqlite.py +3 -0
- mantisdk/store/threading.py +357 -0
- mantisdk/store/utils.py +142 -0
- mantisdk/tracer/__init__.py +16 -0
- mantisdk/tracer/agentops.py +242 -0
- mantisdk/tracer/base.py +287 -0
- mantisdk/tracer/dummy.py +106 -0
- mantisdk/tracer/otel.py +555 -0
- mantisdk/tracer/weave.py +677 -0
- mantisdk/trainer/__init__.py +6 -0
- mantisdk/trainer/init_utils.py +263 -0
- mantisdk/trainer/legacy.py +367 -0
- mantisdk/trainer/registry.py +12 -0
- mantisdk/trainer/trainer.py +618 -0
- mantisdk/types/__init__.py +6 -0
- mantisdk/types/core.py +553 -0
- mantisdk/types/resources.py +204 -0
- mantisdk/types/tracer.py +515 -0
- mantisdk/types/tracing.py +218 -0
- mantisdk/utils/__init__.py +1 -0
- mantisdk/utils/id.py +18 -0
- mantisdk/utils/metrics.py +1025 -0
- mantisdk/utils/otel.py +578 -0
- mantisdk/utils/otlp.py +536 -0
- mantisdk/utils/server_launcher.py +1045 -0
- mantisdk/utils/system_snapshot.py +81 -0
- mantisdk/verl/__init__.py +8 -0
- mantisdk/verl/__main__.py +6 -0
- mantisdk/verl/async_server.py +46 -0
- mantisdk/verl/config.yaml +27 -0
- mantisdk/verl/daemon.py +1154 -0
- mantisdk/verl/dataset.py +44 -0
- mantisdk/verl/entrypoint.py +248 -0
- mantisdk/verl/trainer.py +549 -0
- mantisdk-0.1.0.dist-info/METADATA +119 -0
- mantisdk-0.1.0.dist-info/RECORD +190 -0
- mantisdk-0.1.0.dist-info/WHEEL +4 -0
- mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
- mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
|
@@ -0,0 +1,1045 @@
|
|
|
1
|
+
# Copyright (c) Microsoft. All rights reserved.
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import inspect
|
|
7
|
+
import logging
|
|
8
|
+
import multiprocessing
|
|
9
|
+
import os
|
|
10
|
+
import queue
|
|
11
|
+
import signal
|
|
12
|
+
import socket
|
|
13
|
+
import threading
|
|
14
|
+
import time
|
|
15
|
+
import traceback
|
|
16
|
+
from contextlib import asynccontextmanager, suppress
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from multiprocessing.process import BaseProcess
|
|
19
|
+
from typing import Any, AsyncContextManager, AsyncIterator, Dict, Literal, Optional, cast
|
|
20
|
+
|
|
21
|
+
import aiohttp
|
|
22
|
+
import requests
|
|
23
|
+
import uvicorn
|
|
24
|
+
from fastapi import FastAPI
|
|
25
|
+
from gunicorn.app.base import BaseApplication
|
|
26
|
+
from gunicorn.arbiter import Arbiter
|
|
27
|
+
from portpicker import pick_unused_port
|
|
28
|
+
|
|
29
|
+
# Names exported by `from ... import *`.
__all__ = ["PythonServerLauncher", "PythonServerLauncherArgs", "LaunchMode"]


# Closed set of launch strategies accepted by `PythonServerLauncherArgs.launch_mode`.
LaunchMode = Literal["asyncio", "thread", "mp"]
"""The launch mode for the server."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class PythonServerLauncherArgs:
    """Configuration bundle describing how a Python server should be launched.

    Every field has a default, so `PythonServerLauncherArgs()` is a valid configuration.
    """

    port: Optional[int] = None
    """TCP port to listen on. When omitted, a random available port is used."""
    host: Optional[str] = None
    """Hostname or IP address the server binds to."""
    access_host: Optional[str] = None
    """Hostname or IP address advertised to clients. When omitted, the default outbound IPv4 address of this machine is used."""
    launch_mode: LaunchMode = "asyncio"
    """How the server is launched. `asyncio` (the default) runs the server in the current thread,
    `thread` runs it in a separate thread, and `mp` runs it in a separate process."""
    n_workers: int = 1
    """Number of server workers. Only applicable in `mp` mode.
    With `n_workers > 1` the server is run using Gunicorn.
    """
    healthcheck_url: Optional[str] = None
    """URL probed to verify the server is healthy.
    When omitted, no health check is performed after startup.
    """
    log_level: int = logging.INFO
    """Log level for the server."""
    access_log: bool = False
    """Whether access logs are turned on."""
    startup_timeout: float = 60.0
    """Seconds to wait for the server to start up."""
    kill_unhealthy_server: bool = True
    """Whether to kill the server if it is not healthy after startup.
    Ignored unless `launch_mode` is `asyncio`.
    """
    thread_join_timeout: float = 10.0
    """Seconds to wait for the server thread to join."""
    process_join_timeout: float = 10.0
    """Seconds to wait for the server process to join."""
    timeout_keep_alive: int = 30
    """Timeout for keeping a connection alive."""
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass
class ChildEvent:
    """Status notification posted by a launched server back to its launcher.

    A `ready` event carries no further data; an `error` event describes the
    exception through the optional string fields.
    """

    kind: Literal["ready", "error"]
    """Which kind of notification this is."""
    exc_type: Optional[str] = None
    """Exception type name; only set on error events."""
    message: Optional[str] = None
    """Exception message; only set on error events."""
    traceback: Optional[str] = None
    """Formatted traceback of the exception; only set on error events."""
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# Module-level logger used by the server launch helpers defined in this file.
logger = logging.getLogger(__name__)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class GunicornApp(BaseApplication):
    """Gunicorn application built in code from a `FastAPI` app and an options dict.

    - `load()` hands back the `FastAPI` app object that was passed in.
    - `load_config()` copies the recognised, non-`None` options into the Gunicorn config.
    - The worker class (the original notes mention `uvicorn_worker.UvicornWorker`)
      is not set in this class.
    """

    def __init__(self, app: FastAPI, options: Dict[str, Any]):
        # Both attributes are assigned before the base initializer runs, as in the
        # original; presumably the base class reads them during its own setup
        # (verify against gunicorn's BaseApplication).
        self.application = app
        self.options = options
        super().__init__()  # type: ignore

    def load_config(self):
        """Apply every option that Gunicorn knows about and that has a value."""
        config = self.cfg
        known_settings = config.settings.keys()  # type: ignore
        for name, value in (self.options or {}).items():
            # Skip unset values and keys Gunicorn does not define.
            if value is None or name not in known_settings:
                continue
            config.set(name, value)  # type: ignore

    def load(self):
        """Return the application object to be served."""
        return self.application
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
async def shutdown_uvicorn_server(server: uvicorn.Server, task: asyncio.Task[None], timeout: float = 5.0) -> None:
    """Ask a uvicorn server to exit and wait for its serving task to finish.

    Args:
        server: The server to stop; its `should_exit` flag is set.
        task: The task that is running the server.
        timeout: Seconds to wait for a graceful exit before cancelling `task`.
    """
    logger.debug("Requesting graceful shutdown of uvicorn server.")
    server.should_exit = True

    # Allow the server a short grace period to wind down on its own.
    logger.debug("Waiting for graceful shutdown of uvicorn server.")
    try:
        await asyncio.wait_for(task, timeout=timeout)
    except asyncio.TimeoutError:
        logger.error("Graceful shutdown of uvicorn server timed out.")
        # Fallback path: force the task to stop and absorb the resulting cancellation.
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
        logger.warning("Uvicorn server forced to stop.")
    else:
        logger.debug("Graceful shutdown of uvicorn server completed.")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@asynccontextmanager
async def noop_context() -> AsyncIterator[None]:
    """Async context manager with no setup and no teardown.

    Usable wherever a `serve_context` argument is required but nothing needs
    to wrap the server. The value bound by `async with ... as x` is `None`.
    """
    yield None
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
async def run_uvicorn_asyncio(
    uvicorn_server: uvicorn.Server,
    serve_context: AsyncContextManager[Any],
    timeout: float = 60.0,
    health_url: Optional[str] = None,
    wait_for_serve: bool = True,
    kill_unhealthy_server: bool = True,
) -> asyncio.Task[None]:
    """Serve `uvicorn_server` on the running loop while a watcher verifies startup.

    Two asyncio tasks run in parallel:

    - A watcher task that waits for the server to start up and then checks for healthiness.
    - A server task that enters `serve_context` and awaits `uvicorn_server.serve()`.

    Args:
        uvicorn_server: The server to run.
        serve_context: Async context manager held open for the whole `serve()` call.
        timeout: Startup window in seconds. It covers both waiting for
            `uvicorn_server.started` and the health probing that follows.
        health_url: URL that must answer HTTP 200 for the server to count as healthy.
            When `None`, the health check is skipped.
        wait_for_serve: When true, return only after both the watcher and the server
            task have finished. When false, return as soon as the watcher finishes and
            leave the server task running in the background.
        kill_unhealthy_server: When true, a server that never becomes healthy is asked
            to exit and a `RuntimeError` is raised. When false, the failure is only logged.

    Returns:
        The task that runs the server.

    Raises:
        RuntimeError: If the server does not start within `timeout`, if it is unhealthy
            and `kill_unhealthy_server` is true, or (with `wait_for_serve`) if
            `serve()` itself fails.
    """
    server_start_exception: Optional[BaseException] = None

    # watcher: when server.started flips True, announce READY once
    async def _watch_server() -> None:
        start_time = time.time()
        deadline = start_time + timeout  # child-side startup window
        logger.debug(f"Waiting for server to start up for {timeout:.2f} seconds...")
        # Wait for the server to start up or the deadline to be reached, or an exception to be raised.
        while time.time() < deadline and not uvicorn_server.started and server_start_exception is None:
            await asyncio.sleep(0.1)

        if not uvicorn_server.started:
            # Normally, the program will not reach this point, as the server will throw the exception itself earlier.
            raise RuntimeError(
                f"Server did not start up within {time.time() - start_time:.2f} seconds."
            ) from server_start_exception

        logger.info(f"Server started up in {time.time() - start_time:.2f} seconds.")

        # Check for health endpoint status if provided
        if health_url is not None:
            logger.info(f"Probing health endpoint {health_url}...")
            async with aiohttp.ClientSession() as session:
                while time.time() < deadline:
                    # Bound every probe by the time left in the startup window. Without this,
                    # an endpoint that accepts the connection but never answers would keep the
                    # watcher blocked on aiohttp's default timeout, far past `deadline`.
                    # A small positive floor is used because a zero total disables the timeout.
                    remaining = max(deadline - time.time(), 0.1)
                    try:
                        async with session.get(
                            health_url, timeout=aiohttp.ClientTimeout(total=remaining)
                        ) as resp:
                            if resp.status == 200:
                                logger.info(
                                    f"Server is healthy at {health_url} in {time.time() - start_time:.2f} seconds."
                                )
                                return
                            else:
                                logger.debug(
                                    f"Server is NOT healthy at {health_url} in {time.time() - start_time:.2f} seconds. Got status {resp.status}."
                                )
                    except Exception as e:
                        # Connection errors and probe timeouts are expected while the app warms up.
                        logger.debug(f"Error probing health endpoint {health_url}: {str(e)}")
                    await asyncio.sleep(0.1)

            # If the server is not healthy, kill it if requested.
            health_failed_seconds = time.time() - start_time
            if kill_unhealthy_server:
                logger.error(
                    f"Server is not healthy at {health_url} after {health_failed_seconds:.2f} seconds. Shutting down server gracefully."
                )
                uvicorn_server.should_exit = True
                # `serve_task` is bound in the enclosing scope before this coroutine first runs.
                await serve_task

                raise RuntimeError(
                    f"Server is not healthy at {health_url} after {health_failed_seconds:.2f} seconds. It has been killed."
                )
            else:
                logger.error(
                    f"Server is not healthy at {health_url} after {health_failed_seconds:.2f} seconds. It has been left running."
                )

        else:
            logger.info("Server does not provide a health check endpoint. Skipping health check.")

    async def _serve_server() -> None:
        nonlocal server_start_exception
        async with serve_context:
            try:
                await uvicorn_server.serve()
            except (asyncio.CancelledError, KeyboardInterrupt):
                # Normal shutdown path; propagate without rewrapping
                raise
            except BaseException as exc:
                # Publish the failure so the watcher stops waiting for `started`.
                server_start_exception = exc
                if wait_for_serve:
                    # This probably sends out earlier than watcher exception; but either one is fine.
                    raise RuntimeError("Uvicorn server failed to serve") from exc
                else:
                    # If the caller is not waiting for this coroutine, we just log the error.
                    # It will be handled by the watch task.
                    logger.exception("Uvicorn server failed to serve. Inspect the logs for details.")

    serve_task = asyncio.create_task(_serve_server())
    watch_task = asyncio.create_task(_watch_server())

    if wait_for_serve:
        await asyncio.gather(watch_task, serve_task)
    else:
        # Wait for watch only, the serve task will run in the background.
        await watch_task
    return serve_task
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def run_uvicorn_thread(
    uvicorn_server: uvicorn.Server,
    serve_context: AsyncContextManager[Any],
    event_queue: queue.Queue[ChildEvent],
    stop_event: threading.Event,
    timeout: float = 60.0,
    health_url: Optional[str] = None,
):
    """Thread entry point that runs a uvicorn server on its own event loop.

    Lifecycle:

    - Start the server and wait for startup (and health, when `health_url` is given).
    - Post `ChildEvent(kind="ready")` on success, or an `error` event on failure.
    - Poll `stop_event` until the caller sets it.
    - Shut the server down before the thread returns.

    Args:
        uvicorn_server: The server to run.
        serve_context: Async context manager wrapped around the serving call.
        event_queue: Queue on which `ready` / `error` events are reported.
        stop_event: Set by the caller to make this thread stop the server and return.
        timeout: Startup window in seconds.
        health_url: Optional health endpoint probed after startup.
    """

    def _error_event(error: Exception) -> ChildEvent:
        # Must be called from inside an `except` block so the traceback is available.
        return ChildEvent(
            kind="error", exc_type=type(error).__name__, message=str(error), traceback=traceback.format_exc()
        )

    async def _run_in_thread_loop() -> None:
        # Phase 1: bring the server up; the call returns once startup/health is settled.
        server_task: Optional[asyncio.Task[None]] = None
        try:
            server_task = await run_uvicorn_asyncio(
                uvicorn_server=uvicorn_server,
                serve_context=serve_context,
                timeout=timeout,
                health_url=health_url,
                wait_for_serve=False,  # do not block on the full server lifecycle
                kill_unhealthy_server=True,  # an unhealthy server is a startup failure here
            )
            event_queue.put(ChildEvent(kind="ready"))
        except Exception as startup_error:
            # Startup or health check failed; report it and leave.
            logger.exception("Uvicorn failed to start or was unhealthy.")
            event_queue.put(_error_event(startup_error))
            return

        logger.debug("Thread server started and ready.")

        # Phase 2: the server now runs on this thread's loop; idle until asked to stop.
        try:
            while True:
                if stop_event.is_set():
                    break
                await asyncio.sleep(0.1)
        except asyncio.CancelledError:
            # Cancellation still goes through the shutdown in `finally`.
            logger.warning(
                "Thread server received asyncio cancellation signal. Shutting down gracefully. This is not the recommended way to stop the server."
            )
            raise
        except Exception as wait_error:
            logger.exception("Exception during the thread event waiting loop.")
            event_queue.put(_error_event(wait_error))
        finally:
            logger.info("Requesting graceful shutdown of uvicorn server.")
            await shutdown_uvicorn_server(uvicorn_server, server_task)
            logger.info("Uvicorn server shut down gracefully.")

    # A fresh event loop per thread, created and torn down by asyncio.run.
    try:
        asyncio.run(_run_in_thread_loop())
    except Exception:
        # Already reported above; log once more and let the thread end quietly
        # instead of propagating out of the thread target.
        logger.exception("Exception within the thread server loop. Inspect the logs for details.")
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def run_uvicorn_subprocess(
    uvicorn_server: uvicorn.Server,
    serve_context: AsyncContextManager[Any],
    event_queue: multiprocessing.Queue[ChildEvent],
    timeout: float = 60.0,
    health_url: Optional[str] = None,
):
    """Process entry point that runs a uvicorn server until a termination signal arrives.

    Lifecycle:

    - Install SIGTERM/SIGINT handlers that request a stop.
    - Start uvicorn and wait for startup (and health, when `health_url` is given).
    - Post `ChildEvent(kind="ready")` once the server is up.
    - Sleep until a signal is received, then shut the server down gracefully.
    - Any failure is reported to the parent as `ChildEvent(kind="error")`.

    This must be used with forked multiprocessing.Process.

    Args:
        uvicorn_server: The server to run.
        serve_context: Async context manager wrapped around the serving call.
        event_queue: Queue on which `ready` / `error` events are sent to the parent.
        timeout: Startup window in seconds.
        health_url: Optional health endpoint probed after startup.
    """

    def _error_event(error: Exception) -> ChildEvent:
        # Must be called from inside an `except` block so the traceback is available.
        return ChildEvent(
            kind="error",
            exc_type=type(error).__name__,
            message=str(error),
            traceback=traceback.format_exc(),
        )

    async def _run_until_signalled() -> None:
        shutdown_requested = asyncio.Event()

        # Either signal simply flips the event awaited below.
        running_loop = asyncio.get_running_loop()
        running_loop.add_signal_handler(signal.SIGTERM, shutdown_requested.set)
        running_loop.add_signal_handler(signal.SIGINT, shutdown_requested.set)
        logger.debug("Subprocess signal handlers registered.")

        server_task: Optional[asyncio.Task[None]] = None

        try:
            # Returns once the startup watcher is done; the server keeps running in the background.
            server_task = await run_uvicorn_asyncio(
                uvicorn_server=uvicorn_server,
                serve_context=serve_context,
                timeout=timeout,
                health_url=health_url,
                wait_for_serve=False,  # do not block on the full server lifecycle
                kill_unhealthy_server=True,  # an unhealthy server fails fast in the child
            )

            # Readiness is announced only after the watcher succeeded.
            event_queue.put(ChildEvent(kind="ready"))

            logger.debug("Subprocess server started and ready.")

            # Park here until a termination signal arrives.
            await shutdown_requested.wait()

        except Exception as failure:
            # Let the parent know about startup/health errors.
            event_queue.put(_error_event(failure))
            logger.exception("Subprocess server failed to start or was unhealthy.")

        finally:
            # Only a server that actually came up needs to be stopped.
            if server_task is None:
                logger.info("Subprocess server was not running. Nothing to stop.")
            else:
                logger.info("Requesting graceful shutdown of subprocess server.")
                await shutdown_uvicorn_server(uvicorn_server, server_task)
                logger.info("Subprocess server shut down gracefully.")

    try:
        asyncio.run(_run_until_signalled())
    except Exception as escaped:
        # Anything that escaped the coroutine is still reported to the parent.
        event_queue.put(_error_event(escaped))
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def run_gunicorn(
    gunicorn_app: GunicornApp,
    serve_context: AsyncContextManager[Any],
    event_queue: multiprocessing.Queue[ChildEvent],
    timeout: float = 60.0,
    health_url: Optional[str] = None,
):
    """Run a gunicorn server in a subprocess.

    The master arbiter will reside in a non-daemon subprocess,
    and the workers will be forked from the arbiter.

    Behavior:

    - Start Arbiter.run() (blocking) in this process.
    - A watchdog thread waits for workers to spawn, then (optionally) verifies a health URL.
    - On success: put `ChildEvent(kind="ready")`.
    - On failure/timeout: put `ChildEvent(kind="error")` and request a graceful shutdown.

    `serve_context` will be applied around the `arbiter.run()` call.

    Args:
        gunicorn_app: The application handed to the `Arbiter`.
        serve_context: Async context manager entered before and exited after `arbiter.run()`.
        event_queue: Queue on which `ready` / `error` events are reported.
        timeout: Seconds the watchdog allows for workers to appear and, if requested,
            for the health URL to answer 200. Both phases share the same deadline.
        health_url: Optional URL polled by the watchdog; a falsy value skips the health check.
    """
    # Create the arbiter up-front so the watchdog can inspect it.
    try:
        arbiter = Arbiter(gunicorn_app)
    except Exception as exc:
        logger.exception("Failed to initialize Gunicorn Arbiter.")
        event_queue.put(
            ChildEvent(
                kind="error",
                exc_type=type(exc).__name__,
                message=str(exc),
                traceback=traceback.format_exc(),
            )
        )
        return

    # Written by `_serve()` when `arbiter.run()` raises; read by the watchdog thread
    # to tell "arbiter failed" apart from "still starting".
    runtime_error: Optional[BaseException] = None

    def _watchdog() -> None:
        start = time.time()
        deadline = start + timeout

        # First, wait for `arbiter.WORKERS` to become non-empty.
        while time.time() < deadline and not arbiter.WORKERS:  # type: ignore
            # If arbiter died early, abort quickly.
            if runtime_error is not None:
                logger.error("Gunicorn arbiter exited during startup. Watchdog exiting.")
                return
            time.sleep(0.1)

        if not arbiter.WORKERS:  # type: ignore
            elapsed_time = time.time() - start
            logger.error("Gunicorn workers did not start within %.2f seconds.", elapsed_time)
            if runtime_error is None:
                # Timeout case: arbiter throws no exception.
                event_queue.put(
                    ChildEvent(
                        kind="error",
                        exc_type="RuntimeError",
                        message=f"Gunicorn workers did not start within {elapsed_time:.2f} seconds.",
                        traceback=None,
                    )
                )
                logger.info("Halting Gunicorn arbiter.")
                # Ask arbiter to stop if it's still alive.
                # It will make the watchdog exit too.
                # NOTE(review): this calls the arbiter's signal handler directly instead of
                # sending an OS signal; presumably it queues SIGTERM for the arbiter's main
                # loop. Confirm against gunicorn.
                arbiter.signal(signal.SIGTERM, inspect.currentframe())  # type: ignore
            else:
                # Timeout case: arbiter has thrown an exception.
                # `_serve()` already posted the error event, so only log here.
                logger.error("Gunicorn arbiter exited during startup. Watchdog exiting.")
            return

        # Second, check for health endpoint status if provided
        if health_url:
            while time.time() < deadline:
                # If arbiter died early, abort.
                if runtime_error is not None:
                    logger.error("Gunicorn arbiter exited during health check. Watchdog exiting.")
                    return

                # Check if the server is healthy.
                try:
                    # Each probe is capped at 2 seconds so a hung endpoint cannot stall the loop.
                    resp = requests.get(health_url, timeout=2.0)
                    if resp.status_code == 200:
                        logger.debug(f"Server is healthy at {health_url} in {time.time() - start:.2f} seconds.")
                        # Check arbiter status again.
                        if runtime_error is None:
                            event_queue.put(ChildEvent(kind="ready"))
                        else:
                            logger.error(
                                "Response status is 200 but arbiter has thrown an exception. This should not happen."
                            )
                        return
                except Exception:
                    # Probe failures are retried until the deadline.
                    logger.debug(
                        f"Server is still not healthy at {health_url} in {time.time() - start:.2f} seconds.",
                        exc_info=True,
                    )
                time.sleep(0.1)

            # Health failed: report and shut down.
            elapsed = time.time() - start
            logger.error(
                "Server is not healthy at %s after %.2f seconds. Shutting down.",
                health_url,
                elapsed,
            )
            if runtime_error is None:
                # Arbiter throws no exception. This is a simple timeout case.
                event_queue.put(
                    ChildEvent(
                        kind="error",
                        exc_type="RuntimeError",
                        message=(
                            f"Server is not healthy at {health_url} after "
                            f"{elapsed:.2f} seconds. It will be killed by the watchdog."
                        ),
                        traceback=None,
                    )
                )
                logger.info("Halting Gunicorn arbiter.")
                # Ask arbiter to stop if it's still alive.
                arbiter.signal(signal.SIGTERM, inspect.currentframe())  # type: ignore
            else:
                # The arbiter has already raised; `_serve()` posted the error event, so only log.
                logger.error("Gunicorn arbiter exited during health check. Watchdog exiting.")

        else:
            # No health check; workers up => ready.
            if runtime_error is None:
                event_queue.put(ChildEvent(kind="ready"))
            else:
                # The arbiter has already raised; `_serve()` posted the error event, so only log.
                logger.error("Gunicorn arbiter exited unexpectedly before health check. Watchdog exiting.")

    def _watchdog_with_exception() -> None:
        # Thread target: turns an unexpected watchdog exception into an error event
        # instead of letting it escape the thread unreported.
        try:
            _watchdog()
        except Exception as exc:
            logger.exception("Exception in watchdog thread.")
            event_queue.put(
                ChildEvent(
                    kind="error", exc_type=type(exc).__name__, message=str(exc), traceback=traceback.format_exc()
                )
            )

    # The watchdog is started before the blocking `arbiter.run()` call below.
    watchdog_thread = threading.Thread(target=_watchdog_with_exception, daemon=True)
    watchdog_thread.start()

    async def _serve() -> None:
        # A coroutine is used so the async `serve_context` can wrap the blocking `arbiter.run()`.
        nonlocal runtime_error
        try:
            async with serve_context:
                arbiter.run()
        except Exception as exc:
            # Record the failure first so the watchdog sees it, then notify the parent.
            runtime_error = exc
            event_queue.put(
                ChildEvent(
                    kind="error",
                    exc_type=type(exc).__name__,
                    message=str(exc),
                    traceback=traceback.format_exc(),
                )
            )
            logger.exception("Gunicorn server failed to start.")

    try:
        asyncio.run(_serve())
        # Most exceptions should have been caught within the _serve() coroutine.
    finally:
        # Ensure watchdog doesn't try to act on a dead arbiter for long.
        watchdog_thread.join(timeout=5.0)
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
def _get_default_ipv4_address() -> str:
    """Return the local IPv4 address the OS would pick for outbound traffic.

    Implementation:
        A UDP socket is "connected" to a public address so that the OS
        selects a route, and the local end of that socket is read back.
        No packets are sent.

    Returns:
        str: Best-guess IPv4 such as `192.168.x.y`, or `127.0.0.1` when the
        address cannot be determined.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
        try:
            # 8.8.8.8 is never contacted; connect() only makes the OS choose a route.
            probe.connect(("8.8.8.8", 80))
            local_ip = probe.getsockname()[0]
        except Exception:
            return "127.0.0.1"
        return local_ip
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
class PythonServerLauncher:
|
|
598
|
+
"""Unified launcher for FastAPI, using uvicorn or gunicorn per mode/worker count.
|
|
599
|
+
|
|
600
|
+
See [`PythonServerLauncherArgs`][mantisdk.utils.server_launcher.PythonServerLauncherArgs] for configuration options.
|
|
601
|
+
|
|
602
|
+
Args:
|
|
603
|
+
app: The FastAPI app to launch.
|
|
604
|
+
args: The configuration for the server.
|
|
605
|
+
serve_context: An optional context manager to apply around the server startup.
|
|
606
|
+
"""
|
|
607
|
+
|
|
608
|
+
def __init__(
    self, app: FastAPI, args: PythonServerLauncherArgs, serve_context: Optional[AsyncContextManager[Any]] = None
):
    """Store the app, its launch configuration and the optional serve context.

    Host, port and access host are seeded from `args`; `initialize()` then
    fills in whichever of them were left unset and resets the runtime state.
    """
    self.serve_context = serve_context
    self.args = args
    self.app = app
    # Seed the resolved address fields from the supplied configuration.
    self._access_host: Optional[str] = args.access_host
    self._port: Optional[int] = args.port
    self._host: Optional[str] = args.host
    self.initialize()
|
|
619
|
+
|
|
620
|
+
def initialize(self):
    """Resolve host, port and access host, then reset every runtime handle.

    After this call the launcher holds no server, thread, process or queue
    and reports itself as not running.
    """
    # Resolve the bind host, the port and the access host (defaults are applied if unset).
    self._ensure_host()
    self._ensure_port()
    self._ensure_access_host()

    # Nothing has been started yet.
    self._is_running: bool = False

    # Handles used by the in-process asyncio mode.
    self._uvicorn_task: Optional[asyncio.Task[None]] = None  # returned by run_uvicorn_asyncio()
    self._uvicorn_server: Optional[uvicorn.Server] = None

    # Handles used by the thread mode.
    self._thread_stop_event: Optional[threading.Event] = None
    self._thread_event_queue: Optional[queue.Queue[ChildEvent]] = None
    self._thread: Optional[threading.Thread] = None

    # Handles used by the subprocess mode (uvicorn / gunicorn).
    self._gunicorn_app: Optional[GunicornApp] = None  # programmatic gunicorn wrapper
    self._mp_event_queue: Optional[multiprocessing.Queue[ChildEvent]] = None
    self._proc: Optional[BaseProcess] = None
|
|
642
|
+
|
|
643
|
+
def __getstate__(self):
    """Return only the configuration fields when the launcher is pickled.

    Runtime handles (servers, threads, processes, queues) are left out so
    that server state is not sent to subprocesses.
    """
    kept = ("app", "args", "serve_context", "_host", "_port", "_access_host")
    return {name: getattr(self, name) for name in kept}
|
|
653
|
+
|
|
654
|
+
def __setstate__(self, state: Dict[str, Any]):
    """Restore the pickled configuration fields, then reset runtime state via `initialize()`."""
    self.app = state["app"]
    self.args = cast(PythonServerLauncherArgs, state["args"])
    # The remaining fields are copied verbatim, in the order they were pickled.
    for name in ("serve_context", "_host", "_port", "_access_host"):
        setattr(self, name, state[name])
    self.initialize()
|
|
662
|
+
|
|
663
|
+
@property
def endpoint(self) -> str:
    """Return the `http://host:port` URL built from the bind host and port.

    The URL is built from the bind address regardless of whether that
    address is reachable. IPv6 literal hosts (for example `::1`) are wrapped
    in square brackets, as URL syntax requires.
    """
    host = self._ensure_host()
    # A bare IPv6 literal contains ":" and would collide with the port separator.
    if ":" in host and not host.startswith("["):
        host = f"[{host}]"
    return f"http://{host}:{self._ensure_port()}"
|
|
667
|
+
|
|
668
|
+
@property
def access_endpoint(self) -> str:
    """Return the `http://host:port` URL built from the access host and port.

    This is the base URL the health-check URL is derived from, so it uses
    the access host rather than the bind host (which may be `0.0.0.0`).
    IPv6 literal hosts (for example `::1`) are wrapped in square brackets,
    as URL syntax requires.
    """
    host = self._ensure_access_host()
    # A bare IPv6 literal contains ":" and would collide with the port separator.
    if ":" in host and not host.startswith("["):
        host = f"[{host}]"
    return f"http://{host}:{self._ensure_port()}"
|
|
672
|
+
|
|
673
|
+
@property
def health_url(self) -> Optional[str]:
    """Return the absolute health-check URL, or `None` when none is configured.

    The configured path is appended to `access_endpoint`; a leading `/` is
    added when the path does not already start with one.
    """
    configured = self.args.healthcheck_url
    if not configured:
        return None
    route = configured if configured.startswith("/") else "/" + configured
    return f"{self.access_endpoint}{route}"
|
|
682
|
+
|
|
683
|
+
async def start(self):
    """Start the server with the routine matching `args.launch_mode`.

    Raises:
        ValueError: If the launch mode is not `mp`, `thread` or `asyncio`.
    """
    app_ref = self._normalize_app_ref(self.app)
    logger.info(f"Starting server {app_ref}...")
    mode = self.args.launch_mode
    # Pick the start routine first, then run it in a single place.
    if mode == "mp":
        starter = self._start_serving_process
    elif mode == "thread":
        starter = self._start_uvicorn_thread
    elif mode == "asyncio":
        starter = self._start_uvicorn_asyncio
    else:
        raise ValueError(f"Unsupported launch mode: {mode}")
    await starter()
    logger.info(f"Server {app_ref} started at {self.endpoint}")
|
|
696
|
+
|
|
697
|
+
async def stop(self):
    """Stop the server with the shutdown routine matching `args.launch_mode`.

    Raises:
        ValueError: If the launch mode is not `mp`, `thread` or `asyncio`.
    """
    app_ref = self._normalize_app_ref(self.app)
    logger.info(f"Stopping server {app_ref}...")
    mode = self.args.launch_mode
    # Pick the stop routine first, then run it in a single place.
    if mode == "mp":
        stopper = self._stop_serving_process
    elif mode == "thread":
        stopper = self._stop_uvicorn_thread
    elif mode == "asyncio":
        stopper = self._stop_uvicorn_asyncio
    else:
        raise ValueError(f"Unsupported launch mode: {mode}")
    await stopper()
    logger.info(f"Server {app_ref} stopped")
|
|
710
|
+
|
|
711
|
+
async def reload(self):
    """Restart the server: stop it first when it is running, then start it."""
    currently_running = self.is_running()
    if currently_running:
        await self.stop()
    await self.start()
|
|
716
|
+
|
|
717
|
+
async def run_forever(self):
    """Start the server and wait until it exits, according to `args.launch_mode`.

    On cancellation or `KeyboardInterrupt` the server is stopped and the
    exception is re-raised.

    Raises:
        ValueError: If the launch mode is not `asyncio`, `thread` or `mp`.
    """
    mode = self.args.launch_mode

    if mode == "asyncio":
        await self._start_uvicorn_asyncio()
        try:
            server_task = self._uvicorn_task
            if server_task is not None:
                # shield() keeps an outer cancel from directly cancelling the server task.
                await asyncio.shield(server_task)
        except (asyncio.CancelledError, KeyboardInterrupt):
            logger.warning("Server received cancellation signal. Shutting down gracefully.")
            await self._stop_uvicorn_asyncio()
            raise
        return

    if mode == "thread":
        await self._start_uvicorn_thread()
        try:
            # Poll until the server thread has exited.
            while self._thread and self._thread.is_alive():
                await asyncio.sleep(0.5)
        except (asyncio.CancelledError, KeyboardInterrupt):
            logger.warning("Server thread received cancellation signal. Shutting down gracefully.")
            await self._stop_uvicorn_thread()
            raise
        return

    if mode == "mp":
        await self._start_serving_process()
        try:
            # Poll until the server process has exited.
            while self._proc and self._proc.is_alive():
                await asyncio.sleep(0.5)
        except (asyncio.CancelledError, KeyboardInterrupt):
            logger.warning("Server process received cancellation signal. Shutting down gracefully.")
            await self._stop_serving_process()
            raise
        return

    raise ValueError(f"Unsupported launch mode: {mode}")
|
|
756
|
+
|
|
757
|
+
def is_running(self) -> bool:
    """Report whether the launcher has started the server and not yet stopped it."""
    running = self._is_running
    return running
|
|
760
|
+
|
|
761
|
+
@staticmethod
def _normalize_app_ref(app: FastAPI) -> str:
    """Build a `module:app` label for the app.

    Returns `unknown:app` when the app has no `__module__` or its module is
    `__main__`.
    """
    module = getattr(app, "__module__", None)
    if not module or module == "__main__":
        return "unknown:app"
    return f"{module}:app"
|
|
767
|
+
|
|
768
|
+
def _ensure_host(self) -> str:
    """Return the bind host, defaulting to `0.0.0.0` (with a warning) when unset."""
    if self._host is not None:
        return self._host
    logger.warning("No host provided, using 0.0.0.0.")
    self._host = "0.0.0.0"
    return self._host
|
|
773
|
+
|
|
774
|
+
def _ensure_port(self) -> int:
    """Return the port, picking an unused one (with a warning) when unset."""
    if self._port is not None:
        return self._port
    logger.warning("No port provided, using pick_unused_port to pick a random unused port.")
    self._port = pick_unused_port()
    return self._port
|
|
779
|
+
|
|
780
|
+
def _ensure_access_host(self) -> str:
    """Return the access host, resolving and caching it when unset.

    Resolution order: the value from `args.access_host`; otherwise, when the
    bind host is a wildcard (`0.0.0.0` or `::`), the machine's default
    outbound IPv4 address; otherwise the bind host itself.
    """
    if self._access_host is not None:
        return self._access_host  # type: ignore

    configured = self.args.access_host
    if configured is not None:
        self._access_host = configured
    elif self._ensure_host() in ("0.0.0.0", "::"):
        # A wildcard bind address is not something to connect to, so probe for a concrete one.
        logger.warning("No access host provided, using default outbound IPv4 address for this machine.")
        self._access_host = _get_default_ipv4_address()
    else:
        logger.warning("No access host provided, using the host provided.")
        self._access_host = self._ensure_host()
    return self._access_host  # type: ignore
|
|
793
|
+
|
|
794
|
+
def _create_uvicorn_server(self) -> uvicorn.Server:
    """Build a `uvicorn.Server` for the app from the resolved host/port and launcher args."""
    settings = {
        "app": self.app,
        "host": self._ensure_host(),
        "port": self._ensure_port(),
        "log_level": self.args.log_level,
        "access_log": self.args.access_log,
        "loop": "asyncio",
        "timeout_keep_alive": self.args.timeout_keep_alive,
    }
    return uvicorn.Server(uvicorn.Config(**settings))
|
|
805
|
+
|
|
806
|
+
def _ctx(self) -> AsyncContextManager[Any]:
    """Return the configured serve context, or a fresh no-op async context manager."""
    if self.serve_context is not None:
        return self.serve_context
    logger.info("No serve_context provided, using noop_context.")
    return noop_context()
|
|
812
|
+
|
|
813
|
+
# --- Mode 1: asyncio (in-proc) using run_uvicorn_asyncio ---
|
|
814
|
+
|
|
815
|
+
async def _start_uvicorn_asyncio(self):
    """Start uvicorn inside the current event loop and mark the launcher as running.

    `run_uvicorn_asyncio` is awaited with `wait_for_serve=False`; the value
    it returns is kept as the server task.

    Raises:
        RuntimeError: If the launcher already reports a running server.
    """
    if self.is_running():
        raise RuntimeError("Server is already running. Stopping it first.")

    logger.info("Starting uvicorn asyncio server...")
    server = self._create_uvicorn_server()
    self._uvicorn_server = server
    # Start server; return after health passes; keep serving in background task
    self._uvicorn_task = await run_uvicorn_asyncio(
        uvicorn_server=server,
        serve_context=self._ctx(),
        timeout=self.args.startup_timeout,
        health_url=self.health_url,
        wait_for_serve=False,  # return once startup/health OK
        kill_unhealthy_server=self.args.kill_unhealthy_server,
    )
    self._is_running = True
    logger.info("Uvicorn asyncio server started")
|
|
832
|
+
|
|
833
|
+
async def _stop_uvicorn_asyncio(self):
    """Shut down the in-process uvicorn server task, if any, and clear its handles."""
    logger.info("Stopping uvicorn asyncio server...")
    server, task = self._uvicorn_server, self._uvicorn_task
    if server and task:
        await shutdown_uvicorn_server(server, task)
    # Drop the handles and the running flag whether or not a server was present.
    self._uvicorn_task = None
    self._uvicorn_server = None
    self._is_running = False
    logger.info("Uvicorn asyncio server stopped")
|
|
842
|
+
|
|
843
|
+
# --- Mode 2: thread (in-proc) using run_uvicorn_thread ---
|
|
844
|
+
|
|
845
|
+
async def _start_uvicorn_thread(self):
    """Start uvicorn in a daemon thread and wait for its ready/error event.

    Raises:
        RuntimeError: If the launcher already reports a running server, if
            the thread reports an error event, or if no event arrives
            within twice `args.startup_timeout`.
    """
    if self.is_running():
        raise RuntimeError("Server is already running. Stopping it first.")

    logger.info("Starting uvicorn thread server...")
    self._uvicorn_server = self._create_uvicorn_server()
    self._thread_event_queue = queue.Queue()
    self._thread_stop_event = threading.Event()

    thread_kwargs = {
        "uvicorn_server": self._uvicorn_server,
        "serve_context": self._ctx(),
        "event_queue": self._thread_event_queue,
        "stop_event": self._thread_stop_event,
        "timeout": self.args.startup_timeout,
        "health_url": self.health_url,
    }
    self._thread = threading.Thread(target=run_uvicorn_thread, kwargs=thread_kwargs, daemon=True)
    self._thread.start()

    # Wait for ready or error event from the thread; allow twice the startup timeout for it to arrive.
    wait_seconds = self.args.startup_timeout * 2
    try:
        evt: ChildEvent = await asyncio.to_thread(self._thread_event_queue.get, True, wait_seconds)
    except queue.Empty:
        if self._thread.is_alive():
            logger.error(
                "Threaded server failed to start and sends no event. This should not happen. Shutting down server."
            )
            await self._stop_uvicorn_thread()
            raise RuntimeError("Threaded server failed to start and sends no event. This should not happen.")
        raise RuntimeError("Threaded server failed to start and is not alive. No error event was received.")

    if evt.kind != "error":
        logger.info("Threaded server started successfully.")
        self._is_running = True
        return

    logger.error("Threaded server failed to start (%s): %s\n%s", evt.exc_type, evt.message, evt.traceback)
    # Give the failed thread a chance to finish before reporting the failure.
    await asyncio.to_thread(self._thread.join, self.args.thread_join_timeout)
    if self._thread.is_alive():
        logger.error("Threaded server failed to start and refused to shut down.")
    raise RuntimeError(evt.message)
|
|
890
|
+
|
|
891
|
+
async def _stop_uvicorn_thread(self):
    """Signal the server thread to stop, wait for it, and clear the thread-mode handles.

    Raises:
        RuntimeError: If the thread is still alive after
            `args.thread_join_timeout`.
    """
    logger.info("Stopping uvicorn thread server...")
    stop_signal = self._thread_stop_event
    if stop_signal:
        stop_signal.set()

    worker = self._thread
    if not worker:
        logger.info("Uvicorn thread server was not running. Nothing to stop.")
    else:
        await asyncio.to_thread(worker.join, self.args.thread_join_timeout)
        if worker.is_alive():
            raise RuntimeError("Threaded server refused to shut down.")

    # Reached only when there was no thread or it has exited.
    self._thread = None
    self._thread_event_queue = None
    self._thread_stop_event = None
    self._uvicorn_server = None
    self._is_running = False
    logger.info("Uvicorn thread server stopped")
|
|
908
|
+
|
|
909
|
+
# --- Mode 3: subprocess (uvicorn / gunicorn) using run_uvicorn_subprocess or run_gunicorn ---
|
|
910
|
+
|
|
911
|
+
async def _start_serving_process(self):
    """Launch the server in a forked child process and wait for its ready/error event.

    Gunicorn with uvicorn workers is used when `args.n_workers > 1`;
    otherwise a single uvicorn server runs in the child process.

    Raises:
        RuntimeError: If the launcher already reports a running server, if
            the `fork` start method is unavailable, if the child reports an
            error event, or if no event arrives within twice
            `args.startup_timeout`.
    """
    if self.is_running():
        raise RuntimeError("Server process is already running. Stopping it first.")

    host = self._ensure_host()
    port = self._ensure_port()

    try:
        ctx = multiprocessing.get_context("fork")
    except ValueError as e:
        raise RuntimeError(
            "Process launch requires 'fork' start method (Linux/macOS). "
            "On Windows, use 'thread' or 'asyncio' modes."
        ) from e
    self._mp_event_queue = ctx.Queue()

    # Gunicorn path when n_workers > 1
    if self.args.n_workers > 1:
        logger.info("Starting Gunicorn server...")
        options = {
            "bind": f"{host}:{port}",
            "workers": int(self.args.n_workers),
            "worker_class": "uvicorn_worker.UvicornWorker",
            "loglevel": logging.getLevelName(self.args.log_level).lower(),
            "accesslog": "-" if self.args.access_log else None,
            "errorlog": "-",
            "preload_app": True,
            "graceful_timeout": int(
                self.args.process_join_timeout / 2
            ),  # Allow half the timeout for graceful shutdown
        }
        if "PROMETHEUS_MULTIPROC_DIR" in os.environ:
            from mantisdk.utils.metrics import shutdown_metrics

            options["child_exit"] = shutdown_metrics  # type: ignore

        self._gunicorn_app = GunicornApp(self.app, options)

        self._proc = ctx.Process(
            target=run_gunicorn,
            kwargs={
                "gunicorn_app": self._gunicorn_app,
                "serve_context": self._ctx(),
                "event_queue": self._mp_event_queue,
                "timeout": self.args.startup_timeout,
                "health_url": self.health_url,
            },
            daemon=False,
        )
        self._proc.start()

    else:
        # Single-worker subprocess uvicorn
        logger.info("Starting uvicorn subprocess server...")
        self._uvicorn_server = self._create_uvicorn_server()

        self._proc = ctx.Process(
            target=run_uvicorn_subprocess,
            kwargs={
                "uvicorn_server": self._uvicorn_server,
                "serve_context": self._ctx(),
                "event_queue": self._mp_event_queue,
                "timeout": self.args.startup_timeout,
                "health_url": self.health_url,
            },
            daemon=True,
        )
        self._proc.start()

    # Wait for ready or error event from the child process
    timeout = self.args.startup_timeout * 2  # Allows twice the timeout for the process to report the event
    try:
        evt: ChildEvent = await asyncio.to_thread(self._mp_event_queue.get, True, timeout)
    except queue.Empty:
        if not self._proc.is_alive():
            raise RuntimeError("Server process failed to start and is not alive. No error event was received.")
        logger.error(
            "Server process failed to start and sends no event. This should not happen. Shutting down server."
        )
        await self._stop_serving_process()
        raise RuntimeError("Server process failed to start and sends no event. This should not happen.")

    if evt.kind == "error":
        logger.error(
            "Server process (%s) failed to start (%s): %s\n%s",
            "gunicorn" if self.args.n_workers > 1 else "uvicorn",
            evt.exc_type,
            evt.message,
            evt.traceback,
        )
        # Give the failed child a chance to exit on its own first.
        await asyncio.to_thread(self._proc.join, self.args.process_join_timeout)
        if self._proc.is_alive():
            logger.error("Server process failed to start and refused to shut down.")
            # Do not leave the child behind (the Gunicorn child is non-daemonic):
            # escalate through the regular stop path, but never mask the startup error.
            try:
                await self._stop_serving_process()
            except Exception:
                logger.exception("Error cleaning up server process that failed to start.")
        raise RuntimeError(evt.message)
    else:
        logger.info("Subprocess server started successfully.")
        self._is_running = True
|
|
1008
|
+
|
|
1009
|
+
async def _stop_serving_process(self):
    """Terminate the server process, close its event queue, and clear the subprocess handles.

    The process is first asked to terminate and given
    `args.process_join_timeout` to exit; if it is still alive it is killed
    and given a further 5 seconds.

    Raises:
        RuntimeError: If the process is still alive after both attempts.
    """
    logger.info("Stopping subprocess server...")
    proc = self._proc
    if proc is None:
        logger.info("Subprocess server was not running. Nothing to stop.")
    else:
        if proc.is_alive():
            # Prefer graceful: SIGTERM, then wait
            try:
                proc.terminate()
            except Exception:
                logger.exception("Error sending SIGTERM to server process.")
            await asyncio.to_thread(proc.join, self.args.process_join_timeout)

        if proc.is_alive():
            # Still alive, send SIGKILL
            try:
                proc.kill()
            except Exception:
                logger.exception("Error sending SIGKILL to server process.")
            await asyncio.to_thread(proc.join, 5.0)  # Use a constant timeout for SIGKILL

        if proc.is_alive():
            raise RuntimeError("Server process failed to shut down after SIGTERM and SIGKILL.")

    events = self._mp_event_queue
    if events is not None:
        events.close()
        try:
            events.join_thread()
        except Exception:
            logger.exception("Error joining event queue thread.")

    self._proc = None
    self._mp_event_queue = None
    self._gunicorn_app = None
    self._uvicorn_server = None
    self._is_running = False
    logger.info("Subprocess server stopped")
|