mantisdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mantisdk might be problematic. Click here for more details.
- mantisdk/__init__.py +22 -0
- mantisdk/adapter/__init__.py +15 -0
- mantisdk/adapter/base.py +94 -0
- mantisdk/adapter/messages.py +270 -0
- mantisdk/adapter/triplet.py +1028 -0
- mantisdk/algorithm/__init__.py +39 -0
- mantisdk/algorithm/apo/__init__.py +5 -0
- mantisdk/algorithm/apo/apo.py +889 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
- mantisdk/algorithm/base.py +162 -0
- mantisdk/algorithm/decorator.py +264 -0
- mantisdk/algorithm/fast.py +250 -0
- mantisdk/algorithm/gepa/__init__.py +59 -0
- mantisdk/algorithm/gepa/adapter.py +459 -0
- mantisdk/algorithm/gepa/gepa.py +364 -0
- mantisdk/algorithm/gepa/lib/__init__.py +18 -0
- mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
- mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
- mantisdk/algorithm/gepa/lib/api.py +375 -0
- mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
- mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
- mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
- mantisdk/algorithm/gepa/lib/core/result.py +233 -0
- mantisdk/algorithm/gepa/lib/core/state.py +636 -0
- mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
- mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
- mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
- mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
- mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
- mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
- mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
- mantisdk/algorithm/gepa/lib/py.typed +0 -0
- mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
- mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
- mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
- mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
- mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
- mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
- mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
- mantisdk/algorithm/gepa/tracing.py +105 -0
- mantisdk/algorithm/utils.py +177 -0
- mantisdk/algorithm/verl/__init__.py +5 -0
- mantisdk/algorithm/verl/interface.py +202 -0
- mantisdk/cli/__init__.py +56 -0
- mantisdk/cli/prometheus.py +115 -0
- mantisdk/cli/store.py +131 -0
- mantisdk/cli/vllm.py +29 -0
- mantisdk/client.py +408 -0
- mantisdk/config.py +348 -0
- mantisdk/emitter/__init__.py +43 -0
- mantisdk/emitter/annotation.py +370 -0
- mantisdk/emitter/exception.py +54 -0
- mantisdk/emitter/message.py +61 -0
- mantisdk/emitter/object.py +117 -0
- mantisdk/emitter/reward.py +320 -0
- mantisdk/env_var.py +156 -0
- mantisdk/execution/__init__.py +15 -0
- mantisdk/execution/base.py +64 -0
- mantisdk/execution/client_server.py +443 -0
- mantisdk/execution/events.py +69 -0
- mantisdk/execution/inter_process.py +16 -0
- mantisdk/execution/shared_memory.py +282 -0
- mantisdk/instrumentation/__init__.py +119 -0
- mantisdk/instrumentation/agentops.py +314 -0
- mantisdk/instrumentation/agentops_langchain.py +45 -0
- mantisdk/instrumentation/litellm.py +83 -0
- mantisdk/instrumentation/vllm.py +81 -0
- mantisdk/instrumentation/weave.py +500 -0
- mantisdk/litagent/__init__.py +11 -0
- mantisdk/litagent/decorator.py +536 -0
- mantisdk/litagent/litagent.py +252 -0
- mantisdk/llm_proxy.py +1890 -0
- mantisdk/logging.py +370 -0
- mantisdk/reward.py +7 -0
- mantisdk/runner/__init__.py +11 -0
- mantisdk/runner/agent.py +845 -0
- mantisdk/runner/base.py +182 -0
- mantisdk/runner/legacy.py +309 -0
- mantisdk/semconv.py +170 -0
- mantisdk/server.py +401 -0
- mantisdk/store/__init__.py +23 -0
- mantisdk/store/base.py +897 -0
- mantisdk/store/client_server.py +2092 -0
- mantisdk/store/collection/__init__.py +30 -0
- mantisdk/store/collection/base.py +587 -0
- mantisdk/store/collection/memory.py +970 -0
- mantisdk/store/collection/mongo.py +1412 -0
- mantisdk/store/collection_based.py +1823 -0
- mantisdk/store/insight.py +648 -0
- mantisdk/store/listener.py +58 -0
- mantisdk/store/memory.py +396 -0
- mantisdk/store/mongo.py +165 -0
- mantisdk/store/sqlite.py +3 -0
- mantisdk/store/threading.py +357 -0
- mantisdk/store/utils.py +142 -0
- mantisdk/tracer/__init__.py +16 -0
- mantisdk/tracer/agentops.py +242 -0
- mantisdk/tracer/base.py +287 -0
- mantisdk/tracer/dummy.py +106 -0
- mantisdk/tracer/otel.py +555 -0
- mantisdk/tracer/weave.py +677 -0
- mantisdk/trainer/__init__.py +6 -0
- mantisdk/trainer/init_utils.py +263 -0
- mantisdk/trainer/legacy.py +367 -0
- mantisdk/trainer/registry.py +12 -0
- mantisdk/trainer/trainer.py +618 -0
- mantisdk/types/__init__.py +6 -0
- mantisdk/types/core.py +553 -0
- mantisdk/types/resources.py +204 -0
- mantisdk/types/tracer.py +515 -0
- mantisdk/types/tracing.py +218 -0
- mantisdk/utils/__init__.py +1 -0
- mantisdk/utils/id.py +18 -0
- mantisdk/utils/metrics.py +1025 -0
- mantisdk/utils/otel.py +578 -0
- mantisdk/utils/otlp.py +536 -0
- mantisdk/utils/server_launcher.py +1045 -0
- mantisdk/utils/system_snapshot.py +81 -0
- mantisdk/verl/__init__.py +8 -0
- mantisdk/verl/__main__.py +6 -0
- mantisdk/verl/async_server.py +46 -0
- mantisdk/verl/config.yaml +27 -0
- mantisdk/verl/daemon.py +1154 -0
- mantisdk/verl/dataset.py +44 -0
- mantisdk/verl/entrypoint.py +248 -0
- mantisdk/verl/trainer.py +549 -0
- mantisdk-0.1.0.dist-info/METADATA +119 -0
- mantisdk-0.1.0.dist-info/RECORD +190 -0
- mantisdk-0.1.0.dist-info/WHEEL +4 -0
- mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
- mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
# Copyright (c) Microsoft. All rights reserved.
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
import threading
|
|
6
|
+
from contextlib import suppress
|
|
7
|
+
from queue import SimpleQueue
|
|
8
|
+
from typing import Any, Awaitable, Callable, List, Literal, Optional, Tuple
|
|
9
|
+
|
|
10
|
+
from mantisdk.env_var import LightningEnvVar, resolve_bool_env_var
|
|
11
|
+
from mantisdk.store.base import LightningStore
|
|
12
|
+
from mantisdk.store.threading import LightningStoreThreaded
|
|
13
|
+
|
|
14
|
+
from .base import AlgorithmBundle, ExecutionStrategy, RunnerBundle
|
|
15
|
+
from .events import ExecutionEvent, ThreadingEvent
|
|
16
|
+
|
|
17
|
+
# Module-level logger used by the execution strategy defined in this file.
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class SharedMemoryExecutionStrategy(ExecutionStrategy):
    """Execute bundles in a single process with cooperative worker threads.

    Stop Model:

    - All bundles share one [`ThreadingEvent`][mantisdk.ThreadingEvent]
      named `stop_evt`.
    - Only the main thread receives `KeyboardInterrupt`. When Ctrl+C occurs we
      set `stop_evt`.
    - Any exception raised inside a bundle sets `stop_evt` so other threads can
      unwind cooperatively.
    - Once the bundle running on the main thread exits successfully the
      treatment depends on `main_thread`:
        - `"algorithm"`: the runners are asked to stop by setting `stop_evt`.
        - `"runner"`: the algorithm keeps running until it exits naturally.
    - Background threads are marked as daemons. We join them briefly and log any
      stragglers before shutting down.

    !!! note
        Signals other than `SIGINT` (such as `SIGTERM`) are not intercepted;
        Python's default behavior for those signals is preserved.
    """

    alias: str = "shm"

    def __init__(
        self,
        n_runners: int = 1,
        main_thread: Literal["algorithm", "runner"] = "runner",
        join_timeout: float = 15.0,
        graceful_delay: float = 5.0,
        poll_interval: float = 0.05,
        managed_store: bool | None = None,
    ) -> None:
        """Configure the strategy.

        Args:
            n_runners: Number of runner threads started when the algorithm
                owns the main thread. Must be 1 when `main_thread` is
                `"runner"`.
            main_thread: Which bundle runs on the main thread; the other one
                runs on background daemon thread(s).
            join_timeout: Seconds to wait for each background thread when
                joining during shutdown.
            graceful_delay: Seconds a bundle is given to finish on its own
                after `stop_evt` is set, and again after it has been canceled.
            poll_interval: Seconds between two checks of `stop_evt` by the
                watcher task.
            managed_store: Passed as the override to `resolve_bool_env_var`
                for `LightningEnvVar.AGL_MANAGED_STORE` (fallback `True`).
                When the resolved value is truthy, `execute` wraps the store
                in `LightningStoreThreaded`.

        Raises:
            ValueError: If `main_thread` is not one of the two accepted values,
                or if `main_thread` is `"runner"` and `n_runners` is not 1.
        """
        if main_thread not in ("algorithm", "runner"):
            raise ValueError("main_thread must be 'algorithm' or 'runner'")
        if main_thread == "runner" and n_runners != 1:
            raise ValueError(
                "When main_thread is 'runner', n_runners must be 1. "
                "Either use 'algorithm' on the main thread or set n_runners to 1."
            )
        self.n_runners = n_runners
        self.main_thread = main_thread
        self.join_timeout = join_timeout
        self.graceful_delay = graceful_delay
        self.poll_interval = poll_interval
        self.managed_store = resolve_bool_env_var(
            LightningEnvVar.AGL_MANAGED_STORE, override=managed_store, fallback=True
        )

    async def _run_until_completed_or_canceled(self, coro: Awaitable[Any], stop_evt: ExecutionEvent) -> Any:
        """Run `coro` until it finishes or a cooperative stop is requested.

        Control flow:

        1. Start the bundle coroutine as `task`.
        2. Launch a watcher that polls `stop_evt` without blocking the loop.
        3. When the stop event flips:
            a. Give the bundle `graceful_delay` seconds to finish on its own,
               because well-behaved bundles will check the event and return.
            b. Cancel the bundle task if it is still running after the grace
               period.
        4. Await both tasks and swallow `CancelledError` where appropriate.

        This is a *backup* mechanism for bundles that might not poll the event
        frequently; cooperative shutdown (checking `stop_evt` inside the
        bundle) remains the preferred approach.

        Args:
            coro: The bundle coroutine to drive.
            stop_evt: Shared stop flag polled by the watcher.

        Returns:
            The value produced by `coro`, or `None` if the bundle was canceled
            or had to be abandoned.

        Raises:
            BaseException: Whatever the bundle itself raised.
            asyncio.CancelledError: If the bundle is canceled while this
                coroutine is giving it its second chance to finish.
        """
        task: asyncio.Task[Any] = asyncio.create_task(coro)  # type: ignore
        task_exception: Optional[BaseException] = None

        async def watcher() -> None:
            # Poll the threading event without blocking the event loop. Using a
            # background thread via ``asyncio.to_thread`` makes cancellation
            # difficult because ``ThreadingEvent.wait`` is not interruptible.
            # Instead we cooperatively check the flag from the loop so the
            # watcher task stays cancellable and tests don't hang when the
            # bundle finishes naturally before the stop event is set.
            while not stop_evt.is_set():
                await asyncio.sleep(self.poll_interval)

            # Grace period: let a cooperative bundle exit on its own.
            try:
                # At this point of waiting, the main task should already see the stop event.
                # ``shield`` keeps the timeout from canceling the bundle itself.
                await asyncio.wait_for(asyncio.shield(task), timeout=self.graceful_delay)  # type: ignore
                logger.debug("Bundle finished by itself during grace period.")
                return  # bundle finished by itself during grace period
            except asyncio.TimeoutError:
                # Still running after the grace window.
                pass
            except asyncio.CancelledError:
                # If someone else canceled the task already, we're done.
                logger.debug("Bundle already canceled by someone else; exiting watcher.")
                return

            # Still running after the grace window: cancel it.
            if not task.done():
                logger.debug("Graceful delay elapsed; canceling bundle task...")
                task.cancel()

        watcher_task = asyncio.create_task(watcher())
        result: Any = None

        try:
            # Waiting with FIRST_COMPLETED still honours the grace window: the
            # watcher only completes after stop_evt has flipped and it has
            # either seen the bundle end or requested its cancellation.
            await asyncio.wait(
                {task, watcher_task}, return_when=asyncio.FIRST_COMPLETED
            )  # pyright: ignore[reportUnknownArgumentType]
        finally:
            # If the main task hasn't completed yet (e.g., watcher scheduled cancel),
            # finish the cancellation handshake.
            if not task.done():
                try:
                    # Second chance. Keep the value in case the bundle absorbed
                    # the cancellation and still returned something.
                    result = await asyncio.wait_for(task, timeout=self.graceful_delay)
                except asyncio.TimeoutError:
                    logger.error(
                        "Bundle task did not stop after cancellation; abandoning task. "
                        "This thread could live until the process exits."
                    )
                    # We return without awaiting it. asyncio.run will still try to cancel
                    # pending tasks on loop close; if the task ignores cancellation, this
                    # thread may still stick. It's the best we can do in Python.
                    # We don't raise an exception here, but the thread could be a zombie.
                    return result
            else:
                # Task completed naturally; retrieve result.
                try:
                    result = await task  # type: ignore
                except asyncio.CancelledError:
                    pass
                except BaseException as exc:
                    # Defer the raise until the watcher has been cleaned up.
                    task_exception = exc

            # The watcher may still be polling; stop it before leaving.
            watcher_task.cancel()
            with suppress(asyncio.CancelledError):
                await watcher_task

            if task_exception is not None:
                raise task_exception

        return result  # type: ignore

    def _run_algorithm(
        self,
        algorithm: AlgorithmBundle,
        store: LightningStore,
        stop_evt: ExecutionEvent,
        thread_exceptions: Optional[SimpleQueue[BaseException]],
    ) -> None:
        """Drive the algorithm bundle on a fresh event loop in the calling thread.

        Args:
            algorithm: Bundle called as `algorithm(store, stop_evt)`.
            store: Store handed to the bundle.
            stop_evt: Shared stop flag; set here if the bundle crashes.
            thread_exceptions: Queue that receives the crash when running on a
                background thread; `None` when running on the main thread.

        Raises:
            BaseException: Re-raises anything other than `CancelledError`.
        """
        try:
            asyncio.run(self._run_until_completed_or_canceled(algorithm(store, stop_evt), stop_evt))
        except asyncio.CancelledError:
            logger.info("Algorithm bundle canceled due to stop signal.")
        except BaseException as exc:
            logger.exception("Algorithm bundle crashed; signaling stop to others.")
            if thread_exceptions is not None:
                thread_exceptions.put(exc)
            # Signal the other bundles regardless of which thread crashed.
            stop_evt.set()
            raise

    def _run_runner(
        self,
        runner: RunnerBundle,
        store: LightningStore,
        worker_id: int,
        stop_evt: ExecutionEvent,
        thread_exceptions: Optional[SimpleQueue[BaseException]],
    ) -> None:
        """Drive one runner bundle on a fresh event loop in the calling thread.

        Args:
            runner: Bundle called as `runner(store, worker_id, stop_evt)`.
            store: Store handed to the bundle.
            worker_id: Identifier passed to the bundle and used in log lines.
            stop_evt: Shared stop flag; set here if the bundle crashes.
            thread_exceptions: Queue that receives the crash when running on a
                background thread; `None` when running on the main thread.

        Raises:
            BaseException: Re-raises anything other than `CancelledError`.
        """
        try:
            asyncio.run(self._run_until_completed_or_canceled(runner(store, worker_id, stop_evt), stop_evt))
        except asyncio.CancelledError:
            logger.info("Runner bundle (worker_id=%s) canceled due to stop signal.", worker_id)
        except BaseException as exc:
            logger.exception("Runner bundle crashed (worker_id=%s); signaling stop to others.", worker_id)
            if thread_exceptions is not None:
                thread_exceptions.put(exc)
            # Signal the other bundles regardless of which thread crashed.
            stop_evt.set()
            raise

    def execute(self, algorithm: AlgorithmBundle, runner: RunnerBundle, store: LightningStore) -> None:
        """Run the algorithm and runner bundles to completion in this process.

        One bundle runs on the calling thread and the other on daemon
        thread(s), as selected by `main_thread`. Background threads are joined
        for up to `join_timeout` seconds each before returning.

        Args:
            algorithm: The algorithm bundle.
            runner: The runner bundle; started `n_runners` times when the
                algorithm owns the main thread.
            store: The store shared by all bundles; wrapped in
                `LightningStoreThreaded` when `managed_store` is truthy.

        Raises:
            BaseException: An exception raised by the main-thread bundle
                (other than `KeyboardInterrupt`, which triggers a cooperative
                shutdown), or the first exception collected from a background
                thread.
        """
        logger.info(
            "Starting shm execution with %d runner(s); main thread runs '%s'",
            self.n_runners,
            self.main_thread,
        )

        # Create stop event and thread-safe store.
        stop_evt = ThreadingEvent()
        if self.managed_store:
            thread_safe_store = LightningStoreThreaded(store)
        else:
            thread_safe_store = store

        # Background threads report their crashes here.
        thread_exceptions: SimpleQueue[BaseException] = SimpleQueue()
        raised_from_thread: Optional[BaseException] = None

        def make_thread(name: str, target: Callable[..., Any], args: Tuple[Any, ...]) -> threading.Thread:
            # Daemon threads so a stuck bundle cannot keep the process alive.
            t = threading.Thread(name=name, target=target, args=args, daemon=True)
            t.start()
            return t

        threads: List[threading.Thread] = []

        try:
            if self.main_thread == "algorithm":
                # Start runner threads; algorithm runs on main thread.
                for i in range(self.n_runners):
                    thread = make_thread(
                        name=f"runner-{i}",
                        target=self._run_runner,
                        args=(runner, thread_safe_store, i, stop_evt, thread_exceptions),
                    )
                    threads.append(thread)

                # Ctrl+C here raises KeyboardInterrupt on this stack.
                # Main thread doesn't need to collect exceptions.
                self._run_algorithm(algorithm, thread_safe_store, stop_evt, None)

                # If algo finishes naturally, request runners to stop.
                stop_evt.set()

            else:  # main_thread == "runner"
                # Start algorithm in background; runner runs on main thread.
                thread = make_thread(
                    name="algorithm",
                    target=self._run_algorithm,
                    args=(algorithm, thread_safe_store, stop_evt, thread_exceptions),
                )
                threads.append(thread)

                # Ctrl+C here raises KeyboardInterrupt on this stack.
                # Main thread doesn't need to collect exceptions.
                self._run_runner(runner, thread_safe_store, 0, stop_evt, None)

                # If runner finishes naturally, WAIT FOR ALGORITHM TO FINISH.
                thread.join()

            if not thread_exceptions.empty():
                raised_from_thread = thread_exceptions.get()

        except KeyboardInterrupt:
            logger.warning("KeyboardInterrupt received on main thread; initiating cooperative shutdown...")
            stop_evt.set()
        finally:
            # Attempt a clean join; if some threads don't comply, log and move on.
            for t in threads:
                logger.debug("Joining thread %s...", t.name)
                t.join(timeout=self.join_timeout)

            alive = [t.name for t in threads if t.is_alive()]
            if alive:
                logger.error(
                    "Threads still alive after %.1fs: %s. They are daemons; continuing shutdown.",
                    self.join_timeout,
                    ", ".join(alive),
                )

        # A background thread may have crashed while we were joining.
        if raised_from_thread is None and not thread_exceptions.empty():
            raised_from_thread = thread_exceptions.get()

        # Only the first collected background exception is surfaced.
        if raised_from_thread is not None:
            raise raised_from_thread
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# Copyright (c) Microsoft. All rights reserved.
|
|
2
|
+
|
|
3
|
+
import warnings
|
|
4
|
+
|
|
5
|
+
# Availability flags for the optional integrations. Each one starts out False
# and is flipped to True below only when the matching import succeeds.
AGENTOPS_INSTALLED: bool = False
AGENTOPS_LANGCHAIN_INSTALLED: bool = False
LITELLM_INSTALLED: bool = False
VLLM_INSTALLED: bool = False
# NOTE(review): nothing in this module sets WEAVE_INSTALLED to True; confirm
# whether weave detection is expected to happen elsewhere.
WEAVE_INSTALLED: bool = False
OPENAI_OTEL_INSTALLED: bool = False

try:
    from . import agentops  # type: ignore

    AGENTOPS_INSTALLED = True  # type: ignore
except ImportError:
    pass

try:
    from . import litellm  # type: ignore

    LITELLM_INSTALLED = True  # type: ignore
except ImportError:
    pass

# Without the two probes below, the vllm and agentops-langchain branches of
# instrument_all() / uninstrument_all() could never run.
try:
    from . import vllm  # type: ignore

    VLLM_INSTALLED = True  # type: ignore
except ImportError:
    pass

try:
    from . import agentops_langchain  # type: ignore

    AGENTOPS_LANGCHAIN_INSTALLED = True  # type: ignore
except ImportError:
    pass

try:
    from opentelemetry.instrumentation.openai import OpenAIInstrumentor  # type: ignore

    OPENAI_OTEL_INSTALLED = True
except ImportError:
    pass
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def instrument_all():
    """Enable every optional instrumentation whose availability flag is set.

    The integrations are handled in a fixed order: agentops, litellm, vllm,
    agentops-langchain, then the OpenAI SDK instrumentor. For each of the first
    four, the helper is imported at call time and invoked when the flag is
    true; otherwise a warning is emitted through `warnings.warn`.
    """
    if not AGENTOPS_INSTALLED:
        warnings.warn("agentops is not installed. It's therefore not instrumented.")
    else:
        from .agentops import instrument_agentops

        instrument_agentops()

    if not LITELLM_INSTALLED:
        warnings.warn("litellm is not installed. It's therefore not instrumented.")
    else:
        from .litellm import instrument_litellm

        instrument_litellm()

    if not VLLM_INSTALLED:
        warnings.warn("vllm is not installed. It's therefore not instrumented.")
    else:
        from .vllm import instrument_vllm

        instrument_vllm()

    if not AGENTOPS_LANGCHAIN_INSTALLED:
        warnings.warn("Agentops-langchain integration is not installed. It's therefore not instrumented.")
    else:
        from .agentops_langchain import instrument_agentops_langchain

        instrument_agentops_langchain()

    # The OpenAI instrumentor is optional and silent when absent: no warning is
    # emitted if it could not be imported.
    if not OPENAI_OTEL_INSTALLED:
        return

    # Standard OpenAI instrumentation lets client-side spans be generated for
    # OpenAI SDK calls; a failure here is reported but never raised.
    try:
        OpenAIInstrumentor().instrument()  # type: ignore
    except Exception as e:
        warnings.warn(f"Failed to instrument OpenAI SDK: {e}")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def uninstrument_all():
    """Uninstrument all the instrumentation libraries.

    For each optional integration (agentops, litellm, vllm, agentops-langchain)
    whose availability flag is set, the matching `uninstrument_*` helper is
    imported at call time and invoked. A warning is emitted when the helper
    cannot be imported or when the integration is flagged as not installed.

    Finally, the OpenAI SDK instrumentation is removed when its instrumentor
    was importable. A failure there is reported as a warning rather than being
    discarded, matching how `instrument_all` reports instrumentation failures.
    """
    if AGENTOPS_INSTALLED:
        try:
            from .agentops import uninstrument_agentops

            uninstrument_agentops()
        except ImportError:
            warnings.warn("agentops is installed but uninstrument_agentops could not be imported.")
    else:
        warnings.warn("agentops is not installed. It's therefore not uninstrumented.")

    if LITELLM_INSTALLED:
        try:
            from .litellm import uninstrument_litellm

            uninstrument_litellm()
        except ImportError:
            warnings.warn("litellm is installed but uninstrument_litellm could not be imported.")
    else:
        warnings.warn("litellm is not installed. It's therefore not uninstrumented.")

    if VLLM_INSTALLED:
        try:
            from .vllm import uninstrument_vllm

            uninstrument_vllm()
        except ImportError:
            warnings.warn("vllm is installed but uninstrument_vllm could not be imported.")
    else:
        warnings.warn("vllm is not installed. It's therefore not uninstrumented.")

    if AGENTOPS_LANGCHAIN_INSTALLED:
        try:
            from .agentops_langchain import uninstrument_agentops_langchain

            uninstrument_agentops_langchain()
        except ImportError:
            warnings.warn("agentops_langchain is installed but uninstrument_agentops_langchain could not be imported.")
    else:
        warnings.warn("Agentops-langchain integration is not installed. It's therefore not uninstrumented.")

    if OPENAI_OTEL_INSTALLED:
        try:
            OpenAIInstrumentor().uninstrument()  # type: ignore
        except Exception as e:
            # Best effort: never raise from teardown, but leave a trace so a
            # half-removed instrumentation can be diagnosed.
            warnings.warn(f"Failed to uninstrument OpenAI SDK: {e}")
|