mantisdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mantisdk might be problematic. Click here for more details.
- mantisdk/__init__.py +22 -0
- mantisdk/adapter/__init__.py +15 -0
- mantisdk/adapter/base.py +94 -0
- mantisdk/adapter/messages.py +270 -0
- mantisdk/adapter/triplet.py +1028 -0
- mantisdk/algorithm/__init__.py +39 -0
- mantisdk/algorithm/apo/__init__.py +5 -0
- mantisdk/algorithm/apo/apo.py +889 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
- mantisdk/algorithm/base.py +162 -0
- mantisdk/algorithm/decorator.py +264 -0
- mantisdk/algorithm/fast.py +250 -0
- mantisdk/algorithm/gepa/__init__.py +59 -0
- mantisdk/algorithm/gepa/adapter.py +459 -0
- mantisdk/algorithm/gepa/gepa.py +364 -0
- mantisdk/algorithm/gepa/lib/__init__.py +18 -0
- mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
- mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
- mantisdk/algorithm/gepa/lib/api.py +375 -0
- mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
- mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
- mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
- mantisdk/algorithm/gepa/lib/core/result.py +233 -0
- mantisdk/algorithm/gepa/lib/core/state.py +636 -0
- mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
- mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
- mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
- mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
- mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
- mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
- mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
- mantisdk/algorithm/gepa/lib/py.typed +0 -0
- mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
- mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
- mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
- mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
- mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
- mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
- mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
- mantisdk/algorithm/gepa/tracing.py +105 -0
- mantisdk/algorithm/utils.py +177 -0
- mantisdk/algorithm/verl/__init__.py +5 -0
- mantisdk/algorithm/verl/interface.py +202 -0
- mantisdk/cli/__init__.py +56 -0
- mantisdk/cli/prometheus.py +115 -0
- mantisdk/cli/store.py +131 -0
- mantisdk/cli/vllm.py +29 -0
- mantisdk/client.py +408 -0
- mantisdk/config.py +348 -0
- mantisdk/emitter/__init__.py +43 -0
- mantisdk/emitter/annotation.py +370 -0
- mantisdk/emitter/exception.py +54 -0
- mantisdk/emitter/message.py +61 -0
- mantisdk/emitter/object.py +117 -0
- mantisdk/emitter/reward.py +320 -0
- mantisdk/env_var.py +156 -0
- mantisdk/execution/__init__.py +15 -0
- mantisdk/execution/base.py +64 -0
- mantisdk/execution/client_server.py +443 -0
- mantisdk/execution/events.py +69 -0
- mantisdk/execution/inter_process.py +16 -0
- mantisdk/execution/shared_memory.py +282 -0
- mantisdk/instrumentation/__init__.py +119 -0
- mantisdk/instrumentation/agentops.py +314 -0
- mantisdk/instrumentation/agentops_langchain.py +45 -0
- mantisdk/instrumentation/litellm.py +83 -0
- mantisdk/instrumentation/vllm.py +81 -0
- mantisdk/instrumentation/weave.py +500 -0
- mantisdk/litagent/__init__.py +11 -0
- mantisdk/litagent/decorator.py +536 -0
- mantisdk/litagent/litagent.py +252 -0
- mantisdk/llm_proxy.py +1890 -0
- mantisdk/logging.py +370 -0
- mantisdk/reward.py +7 -0
- mantisdk/runner/__init__.py +11 -0
- mantisdk/runner/agent.py +845 -0
- mantisdk/runner/base.py +182 -0
- mantisdk/runner/legacy.py +309 -0
- mantisdk/semconv.py +170 -0
- mantisdk/server.py +401 -0
- mantisdk/store/__init__.py +23 -0
- mantisdk/store/base.py +897 -0
- mantisdk/store/client_server.py +2092 -0
- mantisdk/store/collection/__init__.py +30 -0
- mantisdk/store/collection/base.py +587 -0
- mantisdk/store/collection/memory.py +970 -0
- mantisdk/store/collection/mongo.py +1412 -0
- mantisdk/store/collection_based.py +1823 -0
- mantisdk/store/insight.py +648 -0
- mantisdk/store/listener.py +58 -0
- mantisdk/store/memory.py +396 -0
- mantisdk/store/mongo.py +165 -0
- mantisdk/store/sqlite.py +3 -0
- mantisdk/store/threading.py +357 -0
- mantisdk/store/utils.py +142 -0
- mantisdk/tracer/__init__.py +16 -0
- mantisdk/tracer/agentops.py +242 -0
- mantisdk/tracer/base.py +287 -0
- mantisdk/tracer/dummy.py +106 -0
- mantisdk/tracer/otel.py +555 -0
- mantisdk/tracer/weave.py +677 -0
- mantisdk/trainer/__init__.py +6 -0
- mantisdk/trainer/init_utils.py +263 -0
- mantisdk/trainer/legacy.py +367 -0
- mantisdk/trainer/registry.py +12 -0
- mantisdk/trainer/trainer.py +618 -0
- mantisdk/types/__init__.py +6 -0
- mantisdk/types/core.py +553 -0
- mantisdk/types/resources.py +204 -0
- mantisdk/types/tracer.py +515 -0
- mantisdk/types/tracing.py +218 -0
- mantisdk/utils/__init__.py +1 -0
- mantisdk/utils/id.py +18 -0
- mantisdk/utils/metrics.py +1025 -0
- mantisdk/utils/otel.py +578 -0
- mantisdk/utils/otlp.py +536 -0
- mantisdk/utils/server_launcher.py +1045 -0
- mantisdk/utils/system_snapshot.py +81 -0
- mantisdk/verl/__init__.py +8 -0
- mantisdk/verl/__main__.py +6 -0
- mantisdk/verl/async_server.py +46 -0
- mantisdk/verl/config.yaml +27 -0
- mantisdk/verl/daemon.py +1154 -0
- mantisdk/verl/dataset.py +44 -0
- mantisdk/verl/entrypoint.py +248 -0
- mantisdk/verl/trainer.py +549 -0
- mantisdk-0.1.0.dist-info/METADATA +119 -0
- mantisdk-0.1.0.dist-info/RECORD +190 -0
- mantisdk-0.1.0.dist-info/WHEEL +4 -0
- mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
- mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
|
@@ -0,0 +1,443 @@
|
|
|
1
|
+
# Copyright (c) Microsoft. All rights reserved.
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
import multiprocessing
|
|
6
|
+
import os
|
|
7
|
+
import signal
|
|
8
|
+
import time
|
|
9
|
+
from multiprocessing.context import BaseContext
|
|
10
|
+
from typing import Callable, Iterable, Literal, cast
|
|
11
|
+
|
|
12
|
+
from mantisdk.env_var import LightningEnvVar, resolve_bool_env_var, resolve_int_env_var, resolve_str_env_var
|
|
13
|
+
from mantisdk.store.base import LightningStore
|
|
14
|
+
from mantisdk.store.client_server import LightningStoreClient, LightningStoreServer
|
|
15
|
+
|
|
16
|
+
from .base import AlgorithmBundle, ExecutionStrategy, RunnerBundle
|
|
17
|
+
from .events import ExecutionEvent, MultiprocessingEvent
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ClientServerExecutionStrategy(ExecutionStrategy):
    """Run algorithm and runner bundles as separate processes over HTTP.

    Execution Roles:

    - `"algorithm"`: Start [`LightningStoreServer`][mantisdk.LightningStoreServer]
      in-process and execute the algorithm bundle against it.
    - `"runner"`: Connect to an existing server with
      [`LightningStoreClient`][mantisdk.LightningStoreClient] and run the
      runner bundle locally (spawning multiple processes when requested).
    - `"both"`: Spawn runner processes first, then execute the algorithm and
      server on the same machine. This mode orchestrates the full loop locally.

    When `role == "both"` you may choose which side runs on the main process
    via `main_process`. The runner-on-main option is limited to
    `n_runners == 1` because each additional runner requires its own event
    loop and process.

    !!! warning
        When `main_process == "runner"` the algorithm and HTTP server execute
        in a child process. Store mutations remain isolated inside that process,
        so the original store instance passed to
        [execute()][mantisdk.ExecutionStrategy.execute] is not updated.

    Abort Model (four-step escalation):

    1. Cooperative stop. Every bundle receives a shared
       [`MultiprocessingEvent`][mantisdk.MultiprocessingEvent] (`stop_evt`).
       Any failure flips the event so peers can exit cleanly. Ctrl+C on the main
       process also sets the flag.
    2. KeyboardInterrupt synthesis. Remaining subprocesses receive ``SIGINT`` to
       trigger `KeyboardInterrupt` handlers.
    3. Termination. Stubborn processes are asked to ``terminate()``
       (`SIGTERM` on POSIX).
    4. Kill. As a last resort `kill()` is invoked (`SIGKILL` on POSIX).

    This mirrors the semantics implemented in
    [`SharedMemoryExecutionStrategy`][mantisdk.SharedMemoryExecutionStrategy]
    but adapts them to multiple processes and the HTTP client/server boundary.

    Note:
        Child processes are started with bound methods of this strategy as
        their targets. Under the `spawn` and `forkserver` start methods the
        target and its arguments are pickled, so the strategy instance, the
        bundles, the store and the stop event must all be picklable there.
    """

    alias: str = "cs"

    def __init__(
        self,
        role: Literal["algorithm", "runner", "both"] | None = None,
        server_host: str | None = None,
        server_port: int | None = None,
        n_runners: int = 1,
        graceful_timeout: float = 10.0,
        terminate_timeout: float = 10.0,
        main_process: Literal["algorithm", "runner"] = "algorithm",
        managed_store: bool | None = None,
        allowed_exit_codes: Iterable[int] = (0, -15),
    ) -> None:
        """Configure the strategy.

        Args:
            role: Which side(s) to run in this process. When omitted, the
                `AGL_CURRENT_ROLE` environment variable is used, with
                `"both"` as the final fallback.
            server_host: Interface the HTTP server binds to when running the
                algorithm bundle locally. Defaults to `AGL_SERVER_HOST`
                or `"localhost"` if unset.
            server_port: Port for the HTTP server in "algorithm"/"both" modes.
                Defaults to `AGL_SERVER_PORT` or `4747` if unset.
            n_runners: Number of runner processes to spawn in "runner"/"both".
            graceful_timeout: How long to wait (seconds) after setting the stop
                event before escalating to signals.
            terminate_timeout: How long to wait between escalation steps beyond
                the cooperative phase (re-used for SIGINT, terminate, and kill).
            main_process: Which bundle runs on the main process when
                `role == "both"`. `"runner"` requires `n_runners == 1` and is
                primarily intended for debugging.
            managed_store: When `True` (default) the strategy constructs
                LightningStore client/server wrappers automatically. When
                `False` the provided `store` is passed directly to the
                bundles, allowing callers to manage store wrappers manually.
            allowed_exit_codes: Allowed exit codes for subprocesses.
                By default, runner can exit gracefully with code 0 or terminated
                by SIGTERM (-15).

        Raises:
            ValueError: If `role` or `main_process` is not one of the accepted
                values, or if `main_process == "runner"` is combined with a
                role other than `"both"` or with `n_runners != 1`.
        """
        resolved_role = resolve_str_env_var(LightningEnvVar.AGL_CURRENT_ROLE, override=role, fallback="both")
        if resolved_role not in ("algorithm", "runner", "both"):
            raise ValueError("role must be one of 'algorithm', 'runner', or 'both'")
        self.role: Literal["algorithm", "runner", "both"] = resolved_role
        self.n_runners = n_runners
        self.server_host = resolve_str_env_var(
            LightningEnvVar.AGL_SERVER_HOST, override=server_host, fallback="localhost"
        )
        self.server_port = resolve_int_env_var(LightningEnvVar.AGL_SERVER_PORT, override=server_port, fallback=4747)
        self.graceful_timeout = graceful_timeout
        self.terminate_timeout = terminate_timeout
        if main_process not in ("algorithm", "runner"):
            raise ValueError("main_process must be 'algorithm' or 'runner'")
        if main_process == "runner":
            if self.role != "both":
                raise ValueError("main_process='runner' is only supported when role='both'")
            if n_runners != 1:
                raise ValueError("main_process='runner' requires n_runners to be 1")
        self.main_process = main_process
        self.managed_store = resolve_bool_env_var(
            LightningEnvVar.AGL_MANAGED_STORE, override=managed_store, fallback=True
        )
        # Materialize once so a one-shot iterable can be checked repeatedly.
        self.allowed_exit_codes = tuple(allowed_exit_codes)

    async def _execute_algorithm(
        self, algorithm: AlgorithmBundle, store: LightningStore, stop_evt: ExecutionEvent
    ) -> None:
        """Run the algorithm bundle, hosting the store server when managed.

        Args:
            algorithm: Bundle awaited with `(store, stop_evt)`.
            store: Store handed to the bundle; wrapped in a
                `LightningStoreServer` when `managed_store` is true.
            stop_evt: Shared stop flag; set here whenever the bundle fails or
                is cancelled/interrupted so peers can unwind.

        Raises:
            BaseException: Whatever the bundle (or server start) raised is
                re-raised after the stop event has been set.
        """
        wrapper_store: LightningStore
        server_started = False
        if self.managed_store:
            logger.info("Starting LightningStore server on %s:%s", self.server_host, self.server_port)
            wrapper_store = LightningStoreServer(store, host=self.server_host, port=self.server_port)
        else:
            wrapper_store = store

        try:
            if self.managed_store and isinstance(wrapper_store, LightningStoreServer):
                await wrapper_store.start()
                # Only mark as started after start() returned, so the cleanup
                # below never stops a server that failed to come up.
                server_started = True
                logger.debug("Algorithm bundle starting against endpoint %s", wrapper_store.endpoint)
            await algorithm(wrapper_store, stop_evt)
            logger.debug("Algorithm bundle completed successfully")
        except asyncio.CancelledError:
            logger.info("Algorithm received CancelledError; signaling stop event")
            stop_evt.set()
            raise
        except KeyboardInterrupt:
            logger.warning("Algorithm received KeyboardInterrupt; signaling stop event")
            stop_evt.set()
            raise
        except BaseException:
            logger.exception("Algorithm bundle crashed; signaling stop event")
            stop_evt.set()
            raise
        finally:
            if self.managed_store and isinstance(wrapper_store, LightningStoreServer) and server_started:
                try:
                    await wrapper_store.stop()
                except Exception:
                    # Best effort: a failing shutdown must not mask the
                    # outcome of the algorithm itself.
                    logger.exception("Error stopping LightningStore server")
                else:
                    logger.debug("LightningStore server shutdown completed")

    async def _execute_runner(
        self,
        runner: RunnerBundle,
        worker_id: int,
        store: LightningStore,
        stop_evt: ExecutionEvent,
    ) -> None:
        """Run one runner bundle, connecting through HTTP when managed.

        Args:
            runner: Bundle awaited with `(store, worker_id, stop_evt)`.
            worker_id: Index of this runner, used for logging and passed on.
            store: Used only when `managed_store` is false; otherwise a
                `LightningStoreClient` pointing at the configured server
                replaces it.
            stop_evt: Shared stop flag; set here whenever the bundle fails or
                is cancelled/interrupted.

        Raises:
            BaseException: Whatever the bundle raised is re-raised after the
                stop event has been set.
        """
        if self.managed_store:
            # If managed, we actually do not use the provided store
            client_store = LightningStoreClient(f"http://{self.server_host}:{self.server_port}")
        else:
            client_store = store
        try:
            if self.managed_store:
                logger.debug("Runner %s connecting to server at %s:%s", worker_id, self.server_host, self.server_port)
            else:
                logger.debug("Runner %s executing with provided store", worker_id)
            await runner(client_store, worker_id, stop_evt)
            logger.debug("Runner %s completed successfully", worker_id)
        except asyncio.CancelledError:
            logger.debug("Runner %s received CancelledError; signaling stop event", worker_id)
            stop_evt.set()
            raise
        except KeyboardInterrupt:
            logger.warning("Runner %s received KeyboardInterrupt; signaling stop event", worker_id)
            stop_evt.set()
            raise
        except BaseException:
            logger.exception("Runner %s crashed; signaling stop event", worker_id)
            stop_evt.set()
            raise
        finally:
            # Only close clients this method created itself.
            if self.managed_store and isinstance(client_store, LightningStoreClient):
                try:
                    await client_store.close()
                except Exception:
                    logger.exception("Error closing LightningStore client for runner %s", worker_id)
                else:
                    logger.debug("Runner %s closed LightningStore client", worker_id)

    def _runner_process_main(
        self, runner: RunnerBundle, worker_id: int, store: LightningStore, stop_evt: ExecutionEvent
    ) -> None:
        """Synchronous entry point of a runner child process.

        Kept as a method (not a nested function) so the process target can be
        pickled when the start method is `spawn` or `forkserver`.
        """
        # Runners are executed in child processes; each process owns its own
        # event loop to keep the asyncio scheduler isolated.
        try:
            asyncio.run(self._execute_runner(runner, worker_id, store, stop_evt))
        except KeyboardInterrupt:
            # SIGINT is part of the shutdown escalation; exit with status 0.
            logger.warning("Runner (asyncio) %s received KeyboardInterrupt; exiting gracefully", worker_id)
        except BaseException as exc:
            logger.exception("Runner (asyncio) %s crashed by %s; signaling stop event", worker_id, exc)
            raise

    def _algorithm_process_main(
        self, algorithm: AlgorithmBundle, store: LightningStore, stop_evt: ExecutionEvent
    ) -> None:
        """Synchronous entry point of the algorithm child process.

        Kept as a method (not a nested function) so the process target can be
        pickled when the start method is `spawn` or `forkserver`.
        """
        try:
            asyncio.run(self._execute_algorithm(algorithm, store, stop_evt))
        except KeyboardInterrupt:
            # SIGINT is part of the shutdown escalation; exit with status 0.
            logger.warning("Algorithm (asyncio.run) received KeyboardInterrupt; exiting gracefully")
        except BaseException as exc:
            logger.exception("Algorithm (asyncio.run) crashed by %s; signaling stop event", exc)
            raise

    def _spawn_runners(
        self,
        runner: RunnerBundle,
        store: LightningStore,
        stop_evt: ExecutionEvent,
        *,
        ctx: BaseContext,
    ) -> list[multiprocessing.Process]:
        """Start `n_runners` runner processes and return them.

        Used when `role == "runner"` with `n_runners != 1`, and when
        `role == "both"` with the algorithm on the main process.

        Args:
            runner: Runner bundle executed in every child.
            store: Store forwarded to each child.
            stop_evt: Shared stop flag forwarded to each child.
            ctx: Multiprocessing context used to create the processes.

        Returns:
            The started processes, named `runner-<index>`.
        """
        processes: list[multiprocessing.Process] = []

        for i in range(self.n_runners):
            process = cast(
                multiprocessing.Process,
                ctx.Process(  # type: ignore
                    target=self._runner_process_main, args=(runner, i, store, stop_evt), name=f"runner-{i}"
                ),
            )
            process.start()
            logger.debug("Spawned runner process %s (pid=%s)", process.name, process.pid)
            processes.append(process)

        return processes

    def _spawn_algorithm_process(
        self,
        algorithm: AlgorithmBundle,
        store: LightningStore,
        stop_evt: ExecutionEvent,
        *,
        ctx: BaseContext,
    ) -> multiprocessing.Process:
        """Start the algorithm (and its server) in a child process.

        Used when `main_process == "runner"`.

        Args:
            algorithm: Algorithm bundle executed in the child.
            store: Store forwarded to the child.
            stop_evt: Shared stop flag forwarded to the child.
            ctx: Multiprocessing context used to create the process.

        Returns:
            The started process, named `algorithm`.
        """
        process = cast(
            multiprocessing.Process,
            ctx.Process(  # type: ignore
                target=self._algorithm_process_main, args=(algorithm, store, stop_evt), name="algorithm"
            ),
        )
        process.start()
        logger.debug("Spawned algorithm process %s (pid=%s)", process.name, process.pid)
        return process

    def _join_until_deadline(
        self,
        processes: Iterable[multiprocessing.Process],
        timeout: float,
    ) -> list[multiprocessing.Process]:
        """Join ``processes`` until ``timeout`` elapses, returning those still alive.

        The timeout is a shared budget for the whole batch, not per process.
        """
        deadline = time.monotonic() + timeout
        still_alive: list[multiprocessing.Process] = []
        for process in processes:
            # Once the budget is spent, join(0) still reaps finished children
            # without blocking.
            process.join(max(deadline - time.monotonic(), 0))
            if process.is_alive():
                still_alive.append(process)
        return still_alive

    def _signal_processes(
        self,
        processes: Iterable[multiprocessing.Process],
        action: Callable[[multiprocessing.Process], None],
    ) -> None:
        """Invoke ``action`` on each process while suppressing individual failures."""
        for process in processes:
            try:
                action(process)
            except Exception:
                # Keep signaling the remaining processes even if one fails.
                logger.exception("Error signaling process %s (pid=%s)", process.name, process.pid)

    def _shutdown_processes(
        self,
        processes: list[multiprocessing.Process],
        stop_evt: ExecutionEvent,
    ) -> None:
        """4-step escalation shutdown of ``processes``.

        Sets the stop event and waits `graceful_timeout`; survivors then get
        SIGINT, `terminate()` and `kill()` in turn, each followed by a wait
        of `terminate_timeout`. Returns as soon as every process has exited.
        """
        if not processes:
            logger.debug("No subprocesses to shutdown")
            return

        if not stop_evt.is_set():
            logger.debug("Sending cooperative stop signal to subprocesses")
            stop_evt.set()
        else:
            logger.debug("Stop event already set; waiting for subprocesses to exit")

        alive = self._join_until_deadline(processes, self.graceful_timeout)
        if not alive:
            return

        logger.warning(
            "Subprocesses still alive after cooperative wait; sending SIGINT to %s",
            ", ".join(p.name or str(p.pid) for p in alive),
        )
        # SIGINT is not reliable on Windows, but we do not consider such case yet.
        self._signal_processes(alive, lambda p: os.kill(cast(int, p.pid), signal.SIGINT))
        alive = self._join_until_deadline(alive, self.terminate_timeout)
        if not alive:
            return

        logger.warning(
            "Subprocesses still alive after SIGINT wait; sending terminate() to %s",
            ", ".join(p.name or str(p.pid) for p in alive),
        )
        self._signal_processes(alive, lambda p: p.terminate())

        alive = self._join_until_deadline(alive, self.terminate_timeout)
        if not alive:
            return

        logger.error(
            "Subprocesses still alive after terminate(); sending kill() to %s",
            ", ".join(p.name or str(p.pid) for p in alive),
        )
        self._signal_processes(alive, lambda p: p.kill())
        alive = self._join_until_deadline(alive, self.terminate_timeout)

        if alive:
            logger.error(
                "Subprocesses failed to exit even after kill(): %s", ", ".join(p.name or str(p.pid) for p in alive)
            )

    def _check_process_exitcodes(self, processes: Iterable[multiprocessing.Process]) -> None:
        """Raise if any process exited with a code outside `allowed_exit_codes`.

        Processes whose `exitcode` is still `None` (not finished) are ignored.

        Raises:
            RuntimeError: Listing every process with an unexpected exit code.
        """
        acceptable = (*self.allowed_exit_codes, None)
        failed = [p for p in processes if p.exitcode not in acceptable]
        if failed:
            formatted = ", ".join(f"{p.name or p.pid} (exitcode={p.exitcode})" for p in failed)
            raise RuntimeError(f"Subprocesses failed with unexpected exit codes: {formatted}")

    def execute(self, algorithm: AlgorithmBundle, runner: RunnerBundle, store: LightningStore) -> None:
        """Run the bundles according to `role` and `main_process`.

        Args:
            algorithm: Algorithm bundle; used for roles `"algorithm"`/`"both"`.
            runner: Runner bundle; used for roles `"runner"`/`"both"`.
            store: Store passed to the bundles (see `managed_store`).

        Raises:
            ValueError: On an unknown role, or `main_process == "runner"`
                with more than one runner.
            RuntimeError: If a subprocess ended with an exit code outside
                `allowed_exit_codes` and no earlier failure is propagating.
            BaseException: Any other failure from the bundles is re-raised
                after subprocesses have been shut down. `KeyboardInterrupt`
                is the exception: it triggers shutdown and is not re-raised.
        """
        logger.info(
            "Starting client-server execution with %d runner(s) [role=%s, main_process=%s]",
            self.n_runners,
            self.role,
            self.main_process,
        )

        # Re-use the active multiprocessing context so the event and processes
        # agree on the start method (fork/spawn/forkserver).
        ctx = multiprocessing.get_context()
        stop_evt = MultiprocessingEvent(ctx=ctx)
        # Track spawned processes so we can enforce termination ordering and
        # surface non-zero exit codes back to the caller.
        processes: list[multiprocessing.Process] = []

        exception: BaseException | None = None
        keyboard_interrupt = False

        try:
            if self.role == "algorithm":
                logger.info("Running algorithm solely...")
                asyncio.run(self._execute_algorithm(algorithm, store, stop_evt))
            elif self.role == "runner":
                if self.n_runners == 1:
                    logger.info("Running runner solely...")
                    asyncio.run(self._execute_runner(runner, 0, store, stop_evt))
                else:
                    logger.info("Spawning runner processes...")
                    processes = self._spawn_runners(runner, store, stop_evt, ctx=ctx)
                    # Wait for the processes to finish naturally.
                    for process in processes:
                        process.join()
                    self._check_process_exitcodes(processes)
            elif self.role == "both":
                if self.main_process == "algorithm":
                    logger.info("Spawning runner processes...")
                    processes = self._spawn_runners(runner, store, stop_evt, ctx=ctx)
                    try:
                        logger.info("Running algorithm...")
                        asyncio.run(self._execute_algorithm(algorithm, store, stop_evt))
                    finally:
                        # Always request the runner side to unwind once the
                        # algorithm/server portion finishes (successfully or not).
                        stop_evt.set()
                else:  # main_process == "runner"
                    if self.n_runners > 1:
                        raise ValueError("main_process='runner' requires n_runners to be 1")

                    logger.info("Spawning algorithm process...")
                    algorithm_process = self._spawn_algorithm_process(algorithm, store, stop_evt, ctx=ctx)
                    processes = [algorithm_process]

                    # Run the lone runner cooperatively in-process so users can
                    # attach a debugger. The algorithm + HTTP server live in
                    # the background process spawned above (the provided
                    # store must therefore be picklable when using spawn).
                    logger.info("Running runner...")
                    asyncio.run(self._execute_runner(runner, 0, store, stop_evt))

                    # Wait for the algorithm process to finish.
                    algorithm_process.join()
            else:
                raise ValueError(f"Unknown role: {self.role}")
        except KeyboardInterrupt:
            logger.warning("KeyboardInterrupt received; initiating shutdown")
            stop_evt.set()
            keyboard_interrupt = True
        except BaseException as exc:
            logger.exception("Unhandled exception in execute method")
            stop_evt.set()
            # Preserve the original exception so we can avoid masking it during
            # the cleanup phase.
            exception = exc
            raise
        finally:
            logger.info("Shutting down subprocesses")
            self._shutdown_processes(processes, stop_evt)
            if processes:
                try:
                    self._check_process_exitcodes(processes)
                except RuntimeError as err:
                    if exception is not None or keyboard_interrupt:
                        # We already propagate/handled a different failure, so
                        # emit a warning instead of raising a secondary error.
                        logger.warning("Subprocesses ended abnormally during shutdown: %s", err)
                    else:
                        raise
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# Copyright (c) Microsoft. All rights reserved.
|
|
2
|
+
|
|
3
|
+
import multiprocessing as mp
|
|
4
|
+
import threading
|
|
5
|
+
from multiprocessing.context import BaseContext
|
|
6
|
+
from typing import Optional, Protocol
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ExecutionEvent(Protocol):
    """Structural type for the cooperative stop flag shared by strategies.

    The four methods follow the API of ``threading.Event`` and
    ``multiprocessing.Event``, which lets the execution layer stay agnostic
    to the concurrency primitive that actually backs the flag.
    """

    def set(self) -> None:
        """Signal cancellation. The call must be idempotent."""

    def clear(self) -> None:
        """Reset the event to the unsignaled state."""

    def is_set(self) -> bool:
        """Return ``True`` when cancellation has been requested."""

    def wait(self, timeout: Optional[float] = None) -> bool:
        """Block until the event is signaled or an optional timeout elapses."""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ThreadingEvent:
    """Thread-safe implementation of [`ExecutionEvent`][mantisdk.ExecutionEvent].

    Every call is forwarded to a private ``threading.Event``.
    """

    __slots__ = ("_evt",)

    def __init__(self) -> None:
        """Create the underlying ``threading.Event`` in the unsignaled state."""
        flag = threading.Event()
        self._evt = flag

    def set(self) -> None:
        """Signal the event."""
        self._evt.set()

    def clear(self) -> None:
        """Return the event to the unsignaled state."""
        self._evt.clear()

    def is_set(self) -> bool:
        """Report whether the event is currently signaled."""
        signaled = self._evt.is_set()
        return signaled

    def wait(self, timeout: Optional[float] = None) -> bool:
        """Block until signaled or until ``timeout`` seconds pass.

        Returns whatever the underlying ``threading.Event.wait`` returns.
        """
        outcome = self._evt.wait(timeout)
        return outcome
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class MultiprocessingEvent:
    """Process-safe implementation of [`ExecutionEvent`][mantisdk.ExecutionEvent].

    Every call is forwarded to a private multiprocessing ``Event``.
    """

    __slots__ = ("_evt",)

    def __init__(self, *, ctx: Optional[BaseContext] = None) -> None:
        """Create the underlying event.

        Args:
            ctx: Multiprocessing context used to build the event. When it is
                omitted (or falsy) the ``multiprocessing`` module itself is
                used as the factory.
        """
        factory = ctx or mp
        self._evt = factory.Event()

    def set(self) -> None:
        """Signal the event."""
        self._evt.set()

    def clear(self) -> None:
        """Return the event to the unsignaled state."""
        self._evt.clear()

    def is_set(self) -> bool:
        """Report whether the event is currently signaled."""
        signaled = self._evt.is_set()
        return signaled

    def wait(self, timeout: Optional[float] = None) -> bool:
        """Block until signaled or until ``timeout`` seconds pass.

        Returns whatever the underlying event's ``wait`` returns.
        """
        outcome = self._evt.wait(timeout)
        return outcome
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Copyright (c) Microsoft. All rights reserved.
|
|
2
|
+
|
|
3
|
+
from .base import ExecutionStrategy
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class InterProcessExecutionStrategy(ExecutionStrategy):
    """Placeholder strategy for future inter-process primitives.

    The class exists to reserve the `ipc` alias and make the planned
    implementation discoverable. It defines no behavior of its own yet: only
    the alias is set, and everything else is inherited from
    `ExecutionStrategy`.

    NOTE(review): the earlier wording said that using this strategy raises
    `NotImplementedError`; nothing in this class raises it, so that would
    have to come from the base class — confirm against `ExecutionStrategy`.
    """

    # Short name reserved for this strategy.
    alias: str = "ipc"

    # TODO: to be implemented
|