mantisdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mantisdk might be problematic. Click here for more details.

Files changed (190) hide show
  1. mantisdk/__init__.py +22 -0
  2. mantisdk/adapter/__init__.py +15 -0
  3. mantisdk/adapter/base.py +94 -0
  4. mantisdk/adapter/messages.py +270 -0
  5. mantisdk/adapter/triplet.py +1028 -0
  6. mantisdk/algorithm/__init__.py +39 -0
  7. mantisdk/algorithm/apo/__init__.py +5 -0
  8. mantisdk/algorithm/apo/apo.py +889 -0
  9. mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
  10. mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
  11. mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
  12. mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
  13. mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
  14. mantisdk/algorithm/base.py +162 -0
  15. mantisdk/algorithm/decorator.py +264 -0
  16. mantisdk/algorithm/fast.py +250 -0
  17. mantisdk/algorithm/gepa/__init__.py +59 -0
  18. mantisdk/algorithm/gepa/adapter.py +459 -0
  19. mantisdk/algorithm/gepa/gepa.py +364 -0
  20. mantisdk/algorithm/gepa/lib/__init__.py +18 -0
  21. mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
  22. mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
  23. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
  24. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
  25. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
  26. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
  27. mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
  28. mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
  29. mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
  30. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
  31. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
  32. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
  33. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
  34. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
  35. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
  36. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
  37. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
  38. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
  39. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
  40. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
  41. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
  42. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
  43. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
  44. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
  45. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
  46. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
  47. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
  48. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
  49. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
  50. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
  51. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
  52. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
  53. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
  54. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
  55. mantisdk/algorithm/gepa/lib/api.py +375 -0
  56. mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
  57. mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
  58. mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
  59. mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
  60. mantisdk/algorithm/gepa/lib/core/result.py +233 -0
  61. mantisdk/algorithm/gepa/lib/core/state.py +636 -0
  62. mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
  63. mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
  64. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
  65. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
  66. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
  67. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
  68. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
  69. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
  70. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
  71. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
  72. mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
  73. mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
  74. mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
  75. mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
  76. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
  77. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
  78. mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
  79. mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
  80. mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
  81. mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
  82. mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
  83. mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
  84. mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
  85. mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
  86. mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
  87. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
  88. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
  89. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
  90. mantisdk/algorithm/gepa/lib/py.typed +0 -0
  91. mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
  92. mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
  93. mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
  94. mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
  95. mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
  96. mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
  97. mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
  98. mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
  99. mantisdk/algorithm/gepa/tracing.py +105 -0
  100. mantisdk/algorithm/utils.py +177 -0
  101. mantisdk/algorithm/verl/__init__.py +5 -0
  102. mantisdk/algorithm/verl/interface.py +202 -0
  103. mantisdk/cli/__init__.py +56 -0
  104. mantisdk/cli/prometheus.py +115 -0
  105. mantisdk/cli/store.py +131 -0
  106. mantisdk/cli/vllm.py +29 -0
  107. mantisdk/client.py +408 -0
  108. mantisdk/config.py +348 -0
  109. mantisdk/emitter/__init__.py +43 -0
  110. mantisdk/emitter/annotation.py +370 -0
  111. mantisdk/emitter/exception.py +54 -0
  112. mantisdk/emitter/message.py +61 -0
  113. mantisdk/emitter/object.py +117 -0
  114. mantisdk/emitter/reward.py +320 -0
  115. mantisdk/env_var.py +156 -0
  116. mantisdk/execution/__init__.py +15 -0
  117. mantisdk/execution/base.py +64 -0
  118. mantisdk/execution/client_server.py +443 -0
  119. mantisdk/execution/events.py +69 -0
  120. mantisdk/execution/inter_process.py +16 -0
  121. mantisdk/execution/shared_memory.py +282 -0
  122. mantisdk/instrumentation/__init__.py +119 -0
  123. mantisdk/instrumentation/agentops.py +314 -0
  124. mantisdk/instrumentation/agentops_langchain.py +45 -0
  125. mantisdk/instrumentation/litellm.py +83 -0
  126. mantisdk/instrumentation/vllm.py +81 -0
  127. mantisdk/instrumentation/weave.py +500 -0
  128. mantisdk/litagent/__init__.py +11 -0
  129. mantisdk/litagent/decorator.py +536 -0
  130. mantisdk/litagent/litagent.py +252 -0
  131. mantisdk/llm_proxy.py +1890 -0
  132. mantisdk/logging.py +370 -0
  133. mantisdk/reward.py +7 -0
  134. mantisdk/runner/__init__.py +11 -0
  135. mantisdk/runner/agent.py +845 -0
  136. mantisdk/runner/base.py +182 -0
  137. mantisdk/runner/legacy.py +309 -0
  138. mantisdk/semconv.py +170 -0
  139. mantisdk/server.py +401 -0
  140. mantisdk/store/__init__.py +23 -0
  141. mantisdk/store/base.py +897 -0
  142. mantisdk/store/client_server.py +2092 -0
  143. mantisdk/store/collection/__init__.py +30 -0
  144. mantisdk/store/collection/base.py +587 -0
  145. mantisdk/store/collection/memory.py +970 -0
  146. mantisdk/store/collection/mongo.py +1412 -0
  147. mantisdk/store/collection_based.py +1823 -0
  148. mantisdk/store/insight.py +648 -0
  149. mantisdk/store/listener.py +58 -0
  150. mantisdk/store/memory.py +396 -0
  151. mantisdk/store/mongo.py +165 -0
  152. mantisdk/store/sqlite.py +3 -0
  153. mantisdk/store/threading.py +357 -0
  154. mantisdk/store/utils.py +142 -0
  155. mantisdk/tracer/__init__.py +16 -0
  156. mantisdk/tracer/agentops.py +242 -0
  157. mantisdk/tracer/base.py +287 -0
  158. mantisdk/tracer/dummy.py +106 -0
  159. mantisdk/tracer/otel.py +555 -0
  160. mantisdk/tracer/weave.py +677 -0
  161. mantisdk/trainer/__init__.py +6 -0
  162. mantisdk/trainer/init_utils.py +263 -0
  163. mantisdk/trainer/legacy.py +367 -0
  164. mantisdk/trainer/registry.py +12 -0
  165. mantisdk/trainer/trainer.py +618 -0
  166. mantisdk/types/__init__.py +6 -0
  167. mantisdk/types/core.py +553 -0
  168. mantisdk/types/resources.py +204 -0
  169. mantisdk/types/tracer.py +515 -0
  170. mantisdk/types/tracing.py +218 -0
  171. mantisdk/utils/__init__.py +1 -0
  172. mantisdk/utils/id.py +18 -0
  173. mantisdk/utils/metrics.py +1025 -0
  174. mantisdk/utils/otel.py +578 -0
  175. mantisdk/utils/otlp.py +536 -0
  176. mantisdk/utils/server_launcher.py +1045 -0
  177. mantisdk/utils/system_snapshot.py +81 -0
  178. mantisdk/verl/__init__.py +8 -0
  179. mantisdk/verl/__main__.py +6 -0
  180. mantisdk/verl/async_server.py +46 -0
  181. mantisdk/verl/config.yaml +27 -0
  182. mantisdk/verl/daemon.py +1154 -0
  183. mantisdk/verl/dataset.py +44 -0
  184. mantisdk/verl/entrypoint.py +248 -0
  185. mantisdk/verl/trainer.py +549 -0
  186. mantisdk-0.1.0.dist-info/METADATA +119 -0
  187. mantisdk-0.1.0.dist-info/RECORD +190 -0
  188. mantisdk-0.1.0.dist-info/WHEEL +4 -0
  189. mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
  190. mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
@@ -0,0 +1,443 @@
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ import asyncio
4
+ import logging
5
+ import multiprocessing
6
+ import os
7
+ import signal
8
+ import time
9
+ from multiprocessing.context import BaseContext
10
+ from typing import Callable, Iterable, Literal, cast
11
+
12
+ from mantisdk.env_var import LightningEnvVar, resolve_bool_env_var, resolve_int_env_var, resolve_str_env_var
13
+ from mantisdk.store.base import LightningStore
14
+ from mantisdk.store.client_server import LightningStoreClient, LightningStoreServer
15
+
16
+ from .base import AlgorithmBundle, ExecutionStrategy, RunnerBundle
17
+ from .events import ExecutionEvent, MultiprocessingEvent
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
class ClientServerExecutionStrategy(ExecutionStrategy):
    """Run algorithm and runner bundles as separate processes over HTTP.

    Execution Roles:

    - `"algorithm"`: Start [`LightningStoreServer`][mantisdk.LightningStoreServer]
      in-process and execute the algorithm bundle against it.
    - `"runner"`: Connect to an existing server with
      [`LightningStoreClient`][mantisdk.LightningStoreClient] and run the
      runner bundle locally (spawning multiple processes when requested).
    - `"both"`: Spawn runner processes first, then execute the algorithm and
      server on the same machine. This mode orchestrates the full loop locally.

    When `role == "both"` you may choose which side runs on the main process
    via `main_process`. The runner-on-main option is limited to
    `n_runners == 1` because each additional runner requires its own event
    loop and process.

    !!! warning
        When `main_process == "runner"` the algorithm and HTTP server execute
        in a child process. Store mutations remain isolated inside that process,
        so the original store instance passed to
        [execute()][mantisdk.ExecutionStrategy.execute] is not updated.

    !!! note
        Child processes are started with bound methods of this strategy as
        their targets. Under the `spawn` and `forkserver` start methods the
        strategy instance, the bundles, the store and the stop event are
        pickled, so all of them must be picklable in that configuration.

    Abort Model (four-step escalation):

    1. Cooperative stop. Every bundle receives a shared
       [`MultiprocessingEvent`][mantisdk.MultiprocessingEvent] (`stop_evt`).
       Any failure flips the event so peers can exit cleanly. Ctrl+C on the main
       process also sets the flag.
    2. KeyboardInterrupt synthesis. Remaining subprocesses receive ``SIGINT`` to
       trigger `KeyboardInterrupt` handlers.
    3. Termination. Stubborn processes are asked to ``terminate()``
       (`SIGTERM` on POSIX).
    4. Kill. As a last resort `kill()` is invoked (`SIGKILL` on POSIX).

    This mirrors the semantics implemented in
    [`SharedMemoryExecutionStrategy`][mantisdk.SharedMemoryExecutionStrategy]
    but adapts them to multiple processes and the HTTP client/server boundary.
    """

    alias: str = "cs"

    def __init__(
        self,
        role: Literal["algorithm", "runner", "both"] | None = None,
        server_host: str | None = None,
        server_port: int | None = None,
        n_runners: int = 1,
        graceful_timeout: float = 10.0,
        terminate_timeout: float = 10.0,
        main_process: Literal["algorithm", "runner"] = "algorithm",
        managed_store: bool | None = None,
        allowed_exit_codes: Iterable[int] = (0, -15),
    ) -> None:
        """Configure the strategy.

        Args:
            role: Which side(s) to run in this process. When omitted, the
                `AGL_CURRENT_ROLE` environment variable is used.
            server_host: Interface the HTTP server binds to when running the
                algorithm bundle locally. Defaults to `AGL_SERVER_HOST`
                or `"localhost"` if unset.
            server_port: Port for the HTTP server in "algorithm"/"both" modes.
                Defaults to `AGL_SERVER_PORT` or `4747` if unset.
            n_runners: Number of runner processes to spawn in "runner"/"both".
            graceful_timeout: How long to wait (seconds) after setting the stop
                event before escalating to signals.
            terminate_timeout: How long to wait between escalation steps beyond
                the cooperative phase (re-used for SIGINT, terminate, and kill).
            main_process: Which bundle runs on the main process when
                `role == "both"`. `"runner"` requires `n_runners == 1` and is
                primarily intended for debugging.
            managed_store: When `True` (default) the strategy constructs
                LightningStore client/server wrappers automatically. When
                `False` the provided `store` is passed directly to the
                bundles, allowing callers to manage store wrappers manually.
            allowed_exit_codes: Allowed exit codes for subprocesses.
                By default, runner can exit gracefully with code 0 or terminated
                by SIGTERM (-15).

        Raises:
            ValueError: If the resolved role or `main_process` is not one of
                the supported values, or if `main_process == "runner"` is
                combined with a role other than `"both"` or with
                `n_runners != 1`.
        """
        resolved_role = resolve_str_env_var(LightningEnvVar.AGL_CURRENT_ROLE, override=role, fallback="both")
        if resolved_role not in ("algorithm", "runner", "both"):
            raise ValueError("role must be one of 'algorithm', 'runner', or 'both'")
        self.role: Literal["algorithm", "runner", "both"] = resolved_role
        self.n_runners = n_runners
        self.server_host = resolve_str_env_var(
            LightningEnvVar.AGL_SERVER_HOST, override=server_host, fallback="localhost"
        )
        self.server_port = resolve_int_env_var(LightningEnvVar.AGL_SERVER_PORT, override=server_port, fallback=4747)
        self.graceful_timeout = graceful_timeout
        self.terminate_timeout = terminate_timeout
        if main_process not in ("algorithm", "runner"):
            raise ValueError("main_process must be 'algorithm' or 'runner'")
        if main_process == "runner":
            if self.role != "both":
                raise ValueError("main_process='runner' is only supported when role='both'")
            if n_runners != 1:
                raise ValueError("main_process='runner' requires n_runners to be 1")
        self.main_process = main_process
        self.managed_store = resolve_bool_env_var(
            LightningEnvVar.AGL_MANAGED_STORE, override=managed_store, fallback=True
        )
        # Materialize once: the iterable may be a one-shot generator and the
        # value is consulted every time exit codes are checked.
        self.allowed_exit_codes = tuple(allowed_exit_codes)

    async def _execute_algorithm(
        self, algorithm: AlgorithmBundle, store: LightningStore, stop_evt: ExecutionEvent
    ) -> None:
        """Run the algorithm bundle, hosting the store server when managed.

        Args:
            algorithm: The algorithm bundle to await.
            store: The backing store. Wrapped in a `LightningStoreServer` when
                `managed_store` is enabled, otherwise handed over untouched.
            stop_evt: Shared stop flag; set whenever the bundle is cancelled,
                interrupted, or raises.

        Raises:
            BaseException: Whatever the bundle (or server start-up) raises is
                re-raised after the stop event has been set.
        """
        wrapper_store: LightningStore = store
        # Only stop the server in `finally` if `start()` actually succeeded.
        server_started = False
        if self.managed_store:
            logger.info("Starting LightningStore server on %s:%s", self.server_host, self.server_port)
            wrapper_store = LightningStoreServer(store, host=self.server_host, port=self.server_port)

        try:
            if self.managed_store and isinstance(wrapper_store, LightningStoreServer):
                await wrapper_store.start()
                server_started = True
                logger.debug("Algorithm bundle starting against endpoint %s", wrapper_store.endpoint)
            await algorithm(wrapper_store, stop_evt)
            logger.debug("Algorithm bundle completed successfully")
        except asyncio.CancelledError:
            logger.info("Algorithm received CancelledError; signaling stop event")
            stop_evt.set()
            raise
        except KeyboardInterrupt:
            logger.warning("Algorithm received KeyboardInterrupt; signaling stop event")
            stop_evt.set()
            raise
        except BaseException:
            logger.exception("Algorithm bundle crashed; signaling stop event")
            stop_evt.set()
            raise
        finally:
            if self.managed_store and isinstance(wrapper_store, LightningStoreServer) and server_started:
                try:
                    await wrapper_store.stop()
                except Exception:
                    # Best effort: a failing shutdown must not mask the
                    # outcome of the algorithm itself.
                    logger.exception("Error stopping LightningStore server")
                else:
                    logger.debug("LightningStore server shutdown completed")

    async def _execute_runner(
        self,
        runner: RunnerBundle,
        worker_id: int,
        store: LightningStore,
        stop_evt: ExecutionEvent,
    ) -> None:
        """Run one runner bundle against the store (or an HTTP client to it).

        Args:
            runner: The runner bundle to await.
            worker_id: Index passed through to the bundle and used in logs.
            store: Used only when `managed_store` is disabled; otherwise a
                `LightningStoreClient` pointing at `server_host:server_port`
                is created and the provided store is ignored.
            stop_evt: Shared stop flag; set whenever the bundle is cancelled,
                interrupted, or raises.

        Raises:
            BaseException: Whatever the bundle raises is re-raised after the
                stop event has been set.
        """
        if self.managed_store:
            # If managed, we actually do not use the provided store
            client_store = LightningStoreClient(f"http://{self.server_host}:{self.server_port}")
        else:
            client_store = store
        try:
            if self.managed_store:
                logger.debug("Runner %s connecting to server at %s:%s", worker_id, self.server_host, self.server_port)
            else:
                logger.debug("Runner %s executing with provided store", worker_id)
            await runner(client_store, worker_id, stop_evt)
            logger.debug("Runner %s completed successfully", worker_id)
        except asyncio.CancelledError:
            logger.debug("Runner %s received CancelledError; signaling stop event", worker_id)
            stop_evt.set()
            raise
        except KeyboardInterrupt:
            logger.warning("Runner %s received KeyboardInterrupt; signaling stop event", worker_id)
            stop_evt.set()
            raise
        except BaseException:
            logger.exception("Runner %s crashed; signaling stop event", worker_id)
            stop_evt.set()
            raise
        finally:
            # Only close clients this method created itself.
            if self.managed_store and isinstance(client_store, LightningStoreClient):
                try:
                    await client_store.close()
                except Exception:
                    logger.exception("Error closing LightningStore client for runner %s", worker_id)
                else:
                    logger.debug("Runner %s closed LightningStore client", worker_id)

    def _runner_sync(
        self, runner: RunnerBundle, worker_id: int, store: LightningStore, stop_evt: ExecutionEvent
    ) -> None:
        """Child-process entry point for a single runner.

        This is a method rather than a function nested inside
        `_spawn_runners` because process targets must be picklable under the
        `spawn`/`forkserver` start methods, and local functions are not.
        """
        # Runners are executed in child processes; each process owns its own
        # event loop to keep the asyncio scheduler isolated.
        try:
            asyncio.run(self._execute_runner(runner, worker_id, store, stop_evt))
        except KeyboardInterrupt:
            # Swallowed on purpose so an interrupted runner exits with code 0.
            logger.warning("Runner (asyncio) %s received KeyboardInterrupt; exiting gracefully", worker_id)
        except BaseException as exc:
            logger.exception("Runner (asyncio) %s crashed by %s; signaling stop event", worker_id, exc)
            raise

    def _algorithm_sync(self, algorithm: AlgorithmBundle, store: LightningStore, stop_evt: ExecutionEvent) -> None:
        """Child-process entry point for the algorithm (and its HTTP server).

        Kept as a method for the same picklability reason as `_runner_sync`.
        """
        try:
            asyncio.run(self._execute_algorithm(algorithm, store, stop_evt))
        except KeyboardInterrupt:
            # Swallowed on purpose so an interrupted algorithm exits with code 0.
            logger.warning("Algorithm (asyncio.run) received KeyboardInterrupt; exiting gracefully")
        except BaseException as exc:
            logger.exception("Algorithm (asyncio.run) crashed by %s; signaling stop event", exc)
            raise

    def _spawn_runners(
        self,
        runner: RunnerBundle,
        store: LightningStore,
        stop_evt: ExecutionEvent,
        *,
        ctx: BaseContext,
    ) -> list[multiprocessing.Process]:
        """Start `n_runners` runner processes and return them.

        Used when `role == "runner"` with `n_runners > 1`, or when
        `role == "both"` and the algorithm runs on the main process.

        Args:
            runner: The runner bundle each child executes.
            store: Forwarded to each child (ignored there when the store is
                managed).
            stop_evt: Shared stop flag forwarded to each child.
            ctx: Multiprocessing context used to create the processes.

        Returns:
            The started processes, named `runner-<index>`.
        """
        processes: list[multiprocessing.Process] = []

        for i in range(self.n_runners):
            process = cast(
                multiprocessing.Process,
                ctx.Process(target=self._runner_sync, args=(runner, i, store, stop_evt), name=f"runner-{i}"),  # type: ignore
            )
            process.start()
            logger.debug("Spawned runner process %s (pid=%s)", process.name, process.pid)
            processes.append(process)

        return processes

    def _spawn_algorithm_process(
        self,
        algorithm: AlgorithmBundle,
        store: LightningStore,
        stop_evt: ExecutionEvent,
        *,
        ctx: BaseContext,
    ) -> multiprocessing.Process:
        """Start the algorithm in a child process and return it.

        Used when `main_process == "runner"`.

        Args:
            algorithm: The algorithm bundle the child executes.
            store: The store the child serves or uses directly.
            stop_evt: Shared stop flag forwarded to the child.
            ctx: Multiprocessing context used to create the process.

        Returns:
            The started process, named `algorithm`.
        """
        process = cast(
            multiprocessing.Process,
            ctx.Process(target=self._algorithm_sync, args=(algorithm, store, stop_evt), name="algorithm"),  # type: ignore
        )
        process.start()
        logger.debug("Spawned algorithm process %s (pid=%s)", process.name, process.pid)
        return process

    def _join_until_deadline(
        self,
        processes: Iterable[multiprocessing.Process],
        timeout: float,
    ) -> list[multiprocessing.Process]:
        """Join ``processes`` until ``timeout`` elapses, returning those still alive.

        The timeout is a single budget shared by all processes, not a
        per-process allowance. Once it is exhausted the remaining processes
        are polled with a zero-second join.
        """
        deadline = time.monotonic() + timeout
        still_alive: list[multiprocessing.Process] = []
        for process in processes:
            process.join(max(deadline - time.monotonic(), 0.0))
            if process.is_alive():
                still_alive.append(process)
        return still_alive

    def _signal_processes(
        self,
        processes: Iterable[multiprocessing.Process],
        action: Callable[[multiprocessing.Process], None],
    ) -> None:
        """Invoke ``action`` on each process while suppressing individual failures.

        A failure for one process is logged and does not prevent the action
        from being applied to the remaining processes.
        """
        for process in processes:
            try:
                action(process)
            except Exception:
                logger.exception("Error signaling process %s (pid=%s)", process.name, process.pid)

    @staticmethod
    def _describe_processes(processes: Iterable[multiprocessing.Process]) -> str:
        """Return a comma-separated list of process names (pid as fallback) for logs."""
        return ", ".join(p.name or str(p.pid) for p in processes)

    def _shutdown_processes(
        self,
        processes: list[multiprocessing.Process],
        stop_evt: ExecutionEvent,
    ) -> None:
        """4-step escalation shutdown of ``processes``.

        The steps are: set the stop event, send ``SIGINT``, call
        ``terminate()``, call ``kill()``. After each step the processes get a
        bounded amount of time to exit and the method returns as soon as none
        is left alive.

        Args:
            processes: Child processes to stop. An empty list is a no-op.
            stop_evt: Shared stop flag; set here if it is not set already.
        """
        if not processes:
            logger.debug("No subprocesses to shutdown")
            return

        # Step 1: cooperative stop.
        if not stop_evt.is_set():
            logger.debug("Sending cooperative stop signal to subprocesses")
            stop_evt.set()
        else:
            logger.debug("Stop event already set; waiting for subprocesses to exit")

        alive = self._join_until_deadline(processes, self.graceful_timeout)
        if not alive:
            return

        # Step 2: synthesize KeyboardInterrupt in the children.
        logger.warning(
            "Subprocesses still alive after cooperative wait; sending SIGINT to %s",
            self._describe_processes(alive),
        )
        # SIGINT is not reliable on Windows, but we do not consider such case yet.
        self._signal_processes(alive, lambda p: os.kill(cast(int, p.pid), signal.SIGINT))
        alive = self._join_until_deadline(alive, self.terminate_timeout)
        if not alive:
            return

        # Step 3: terminate().
        logger.warning(
            "Subprocesses still alive after SIGINT wait; sending terminate() to %s",
            self._describe_processes(alive),
        )
        self._signal_processes(alive, lambda p: p.terminate())

        alive = self._join_until_deadline(alive, self.terminate_timeout)
        if not alive:
            return

        # Step 4: kill().
        logger.error(
            "Subprocesses still alive after terminate(); sending kill() to %s",
            self._describe_processes(alive),
        )
        self._signal_processes(alive, lambda p: p.kill())
        alive = self._join_until_deadline(alive, self.terminate_timeout)

        if alive:
            logger.error("Subprocesses failed to exit even after kill(): %s", self._describe_processes(alive))

    def _check_process_exitcodes(self, processes: Iterable[multiprocessing.Process]) -> None:
        """Raise if any process exited with a code outside `allowed_exit_codes`.

        Processes whose exit code is still `None` (not finished, or never
        reaped) are not treated as failures.

        Raises:
            RuntimeError: Listing every process with an unexpected exit code.
        """
        acceptable = self.allowed_exit_codes + (None,)
        failed = [p for p in processes if p.exitcode not in acceptable]
        if failed:
            formatted = ", ".join(f"{p.name or p.pid} (exitcode={p.exitcode})" for p in failed)
            raise RuntimeError(f"Subprocesses failed with unexpected exit codes: {formatted}")

    def execute(self, algorithm: AlgorithmBundle, runner: RunnerBundle, store: LightningStore) -> None:
        """Run the bundles according to `role` and `main_process`.

        Child processes spawned along the way are always shut down (with the
        four-step escalation) before this method returns or raises.

        Args:
            algorithm: The algorithm bundle (used for roles "algorithm"/"both").
            runner: The runner bundle (used for roles "runner"/"both").
            store: The store handed to the bundles, directly or via the
                managed client/server wrappers.

        Raises:
            ValueError: For an unknown role, or when `main_process == "runner"`
                is combined with more than one runner.
            RuntimeError: When a child process ended with an exit code outside
                `allowed_exit_codes` and no other failure is being reported.
            BaseException: Any other exception raised by the bundles is
                re-raised. A `KeyboardInterrupt` is the exception: it triggers
                shutdown and is then swallowed.
        """
        logger.info(
            "Starting client-server execution with %d runner(s) [role=%s, main_process=%s]",
            self.n_runners,
            self.role,
            self.main_process,
        )

        # Re-use the active multiprocessing context so the event and processes
        # agree on the start method (fork/spawn/forkserver).
        ctx = multiprocessing.get_context()
        stop_evt = MultiprocessingEvent(ctx=ctx)
        # Track spawned processes so we can enforce termination ordering and
        # surface non-zero exit codes back to the caller.
        processes: list[multiprocessing.Process] = []

        exception: BaseException | None = None
        keyboard_interrupt = False

        try:
            if self.role == "algorithm":
                logger.info("Running algorithm solely...")
                asyncio.run(self._execute_algorithm(algorithm, store, stop_evt))
            elif self.role == "runner":
                if self.n_runners == 1:
                    logger.info("Running runner solely...")
                    asyncio.run(self._execute_runner(runner, 0, store, stop_evt))
                else:
                    logger.info("Spawning runner processes...")
                    processes = self._spawn_runners(runner, store, stop_evt, ctx=ctx)
                    # Wait for the processes to finish naturally.
                    for process in processes:
                        process.join()
                    self._check_process_exitcodes(processes)
            elif self.role == "both":
                if self.main_process == "algorithm":
                    logger.info("Spawning runner processes...")
                    processes = self._spawn_runners(runner, store, stop_evt, ctx=ctx)
                    try:
                        logger.info("Running algorithm...")
                        asyncio.run(self._execute_algorithm(algorithm, store, stop_evt))
                    finally:
                        # Always request the runner side to unwind once the
                        # algorithm/server portion finishes (successfully or not).
                        stop_evt.set()
                else:  # main_process == "runner"
                    # Re-checked here because `n_runners` is a plain attribute
                    # that may have been changed after construction.
                    if self.n_runners > 1:
                        raise ValueError("main_process='runner' requires n_runners to be 1")

                    logger.info("Spawning algorithm process...")
                    algorithm_process = self._spawn_algorithm_process(algorithm, store, stop_evt, ctx=ctx)
                    processes = [algorithm_process]

                    # Run the lone runner cooperatively in-process so users can
                    # attach a debugger. The algorithm + HTTP server live in
                    # the background process spawned above (the provided
                    # store must therefore be picklable when using spawn).
                    logger.info("Running runner...")
                    asyncio.run(self._execute_runner(runner, 0, store, stop_evt))

                    # Wait for the algorithm process to finish.
                    algorithm_process.join()
            else:
                raise ValueError(f"Unknown role: {self.role}")
        except KeyboardInterrupt:
            logger.warning("KeyboardInterrupt received; initiating shutdown")
            stop_evt.set()
            keyboard_interrupt = True
        except BaseException as exc:
            logger.exception("Unhandled exception in execute method")
            stop_evt.set()
            # Preserve the original exception so we can avoid masking it during
            # the cleanup phase.
            exception = exc
            raise
        finally:
            logger.info("Shutting down subprocesses")
            self._shutdown_processes(processes, stop_evt)
            if processes:
                try:
                    self._check_process_exitcodes(processes)
                except RuntimeError as err:
                    if exception is not None or keyboard_interrupt:
                        # We already propagate/handled a different failure, so
                        # emit a warning instead of raising a secondary error.
                        logger.warning("Subprocesses ended abnormally during shutdown: %s", err)
                    else:
                        raise
@@ -0,0 +1,69 @@
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ import multiprocessing as mp
4
+ import threading
5
+ from multiprocessing.context import BaseContext
6
+ from typing import Optional, Protocol
7
+
8
+
9
class ExecutionEvent(Protocol):
    """Structural type for the cooperative stop flag used by execution strategies.

    The four methods have the same names and signatures as those of
    ``threading.Event`` and ``multiprocessing.Event``, so code written against
    this protocol does not need to know which concurrency primitive backs the
    flag.
    """

    def set(self) -> None:
        """Signal cancellation. The call must be idempotent."""

    def clear(self) -> None:
        """Reset the event to the unsignaled state."""

    def is_set(self) -> bool:
        """Return ``True`` when cancellation has been requested."""

    def wait(self, timeout: Optional[float] = None) -> bool:
        """Block until the event is signaled or an optional timeout elapses."""
28
+
29
+
30
class ThreadingEvent:
    """[`ExecutionEvent`][mantisdk.ExecutionEvent] backed by a ``threading.Event``.

    Every method forwards to the wrapped event unchanged.
    """

    __slots__ = ("_evt",)

    def __init__(self) -> None:
        """Create the underlying event in the unsignaled state."""
        self._evt = threading.Event()

    def set(self) -> None:
        """Signal the event."""
        self._evt.set()

    def clear(self) -> None:
        """Reset the event to unsignaled."""
        self._evt.clear()

    def is_set(self) -> bool:
        """Return whether the event is currently signaled."""
        signaled = self._evt.is_set()
        return signaled

    def wait(self, timeout: Optional[float] = None) -> bool:
        """Block until signaled or until ``timeout`` seconds pass; return the flag."""
        signaled = self._evt.wait(timeout)
        return signaled
49
+
50
+
51
class MultiprocessingEvent:
    """[`ExecutionEvent`][mantisdk.ExecutionEvent] backed by a multiprocessing event.

    Every method forwards to the wrapped event unchanged.
    """

    __slots__ = ("_evt",)

    def __init__(self, *, ctx: Optional[BaseContext] = None) -> None:
        """Create the underlying event.

        Args:
            ctx: Multiprocessing context to create the event from. When
                omitted (or falsy) the ``multiprocessing`` module itself is
                used as the factory.
        """
        factory = ctx or mp
        self._evt = factory.Event()

    def set(self) -> None:
        """Signal the event."""
        self._evt.set()

    def clear(self) -> None:
        """Reset the event to unsignaled."""
        self._evt.clear()

    def is_set(self) -> bool:
        """Return whether the event is currently signaled."""
        signaled = self._evt.is_set()
        return signaled

    def wait(self, timeout: Optional[float] = None) -> bool:
        """Block until signaled or until ``timeout`` seconds pass; return the flag."""
        signaled = self._evt.wait(timeout)
        return signaled
@@ -0,0 +1,16 @@
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ from .base import ExecutionStrategy
4
+
5
+
6
class InterProcessExecutionStrategy(ExecutionStrategy):
    """Reserved, not-yet-implemented strategy registered under the `ipc` alias.

    The class currently defines nothing beyond its alias: it exists so the
    name is taken and the planned inter-process implementation is
    discoverable. Whatever happens when it is used is inherited from
    `ExecutionStrategy`.
    """

    # NOTE(review): the earlier docstring said that using this class raises
    # `NotImplementedError`; that depends on the base class, which is not
    # defined in this module -- confirm against `ExecutionStrategy`.

    alias: str = "ipc"

    # TODO: to be implemented