mantisdk-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mantisdk might be problematic. Click here for more details.
- mantisdk/__init__.py +22 -0
- mantisdk/adapter/__init__.py +15 -0
- mantisdk/adapter/base.py +94 -0
- mantisdk/adapter/messages.py +270 -0
- mantisdk/adapter/triplet.py +1028 -0
- mantisdk/algorithm/__init__.py +39 -0
- mantisdk/algorithm/apo/__init__.py +5 -0
- mantisdk/algorithm/apo/apo.py +889 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
- mantisdk/algorithm/base.py +162 -0
- mantisdk/algorithm/decorator.py +264 -0
- mantisdk/algorithm/fast.py +250 -0
- mantisdk/algorithm/gepa/__init__.py +59 -0
- mantisdk/algorithm/gepa/adapter.py +459 -0
- mantisdk/algorithm/gepa/gepa.py +364 -0
- mantisdk/algorithm/gepa/lib/__init__.py +18 -0
- mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
- mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
- mantisdk/algorithm/gepa/lib/api.py +375 -0
- mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
- mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
- mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
- mantisdk/algorithm/gepa/lib/core/result.py +233 -0
- mantisdk/algorithm/gepa/lib/core/state.py +636 -0
- mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
- mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
- mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
- mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
- mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
- mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
- mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
- mantisdk/algorithm/gepa/lib/py.typed +0 -0
- mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
- mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
- mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
- mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
- mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
- mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
- mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
- mantisdk/algorithm/gepa/tracing.py +105 -0
- mantisdk/algorithm/utils.py +177 -0
- mantisdk/algorithm/verl/__init__.py +5 -0
- mantisdk/algorithm/verl/interface.py +202 -0
- mantisdk/cli/__init__.py +56 -0
- mantisdk/cli/prometheus.py +115 -0
- mantisdk/cli/store.py +131 -0
- mantisdk/cli/vllm.py +29 -0
- mantisdk/client.py +408 -0
- mantisdk/config.py +348 -0
- mantisdk/emitter/__init__.py +43 -0
- mantisdk/emitter/annotation.py +370 -0
- mantisdk/emitter/exception.py +54 -0
- mantisdk/emitter/message.py +61 -0
- mantisdk/emitter/object.py +117 -0
- mantisdk/emitter/reward.py +320 -0
- mantisdk/env_var.py +156 -0
- mantisdk/execution/__init__.py +15 -0
- mantisdk/execution/base.py +64 -0
- mantisdk/execution/client_server.py +443 -0
- mantisdk/execution/events.py +69 -0
- mantisdk/execution/inter_process.py +16 -0
- mantisdk/execution/shared_memory.py +282 -0
- mantisdk/instrumentation/__init__.py +119 -0
- mantisdk/instrumentation/agentops.py +314 -0
- mantisdk/instrumentation/agentops_langchain.py +45 -0
- mantisdk/instrumentation/litellm.py +83 -0
- mantisdk/instrumentation/vllm.py +81 -0
- mantisdk/instrumentation/weave.py +500 -0
- mantisdk/litagent/__init__.py +11 -0
- mantisdk/litagent/decorator.py +536 -0
- mantisdk/litagent/litagent.py +252 -0
- mantisdk/llm_proxy.py +1890 -0
- mantisdk/logging.py +370 -0
- mantisdk/reward.py +7 -0
- mantisdk/runner/__init__.py +11 -0
- mantisdk/runner/agent.py +845 -0
- mantisdk/runner/base.py +182 -0
- mantisdk/runner/legacy.py +309 -0
- mantisdk/semconv.py +170 -0
- mantisdk/server.py +401 -0
- mantisdk/store/__init__.py +23 -0
- mantisdk/store/base.py +897 -0
- mantisdk/store/client_server.py +2092 -0
- mantisdk/store/collection/__init__.py +30 -0
- mantisdk/store/collection/base.py +587 -0
- mantisdk/store/collection/memory.py +970 -0
- mantisdk/store/collection/mongo.py +1412 -0
- mantisdk/store/collection_based.py +1823 -0
- mantisdk/store/insight.py +648 -0
- mantisdk/store/listener.py +58 -0
- mantisdk/store/memory.py +396 -0
- mantisdk/store/mongo.py +165 -0
- mantisdk/store/sqlite.py +3 -0
- mantisdk/store/threading.py +357 -0
- mantisdk/store/utils.py +142 -0
- mantisdk/tracer/__init__.py +16 -0
- mantisdk/tracer/agentops.py +242 -0
- mantisdk/tracer/base.py +287 -0
- mantisdk/tracer/dummy.py +106 -0
- mantisdk/tracer/otel.py +555 -0
- mantisdk/tracer/weave.py +677 -0
- mantisdk/trainer/__init__.py +6 -0
- mantisdk/trainer/init_utils.py +263 -0
- mantisdk/trainer/legacy.py +367 -0
- mantisdk/trainer/registry.py +12 -0
- mantisdk/trainer/trainer.py +618 -0
- mantisdk/types/__init__.py +6 -0
- mantisdk/types/core.py +553 -0
- mantisdk/types/resources.py +204 -0
- mantisdk/types/tracer.py +515 -0
- mantisdk/types/tracing.py +218 -0
- mantisdk/utils/__init__.py +1 -0
- mantisdk/utils/id.py +18 -0
- mantisdk/utils/metrics.py +1025 -0
- mantisdk/utils/otel.py +578 -0
- mantisdk/utils/otlp.py +536 -0
- mantisdk/utils/server_launcher.py +1045 -0
- mantisdk/utils/system_snapshot.py +81 -0
- mantisdk/verl/__init__.py +8 -0
- mantisdk/verl/__main__.py +6 -0
- mantisdk/verl/async_server.py +46 -0
- mantisdk/verl/config.yaml +27 -0
- mantisdk/verl/daemon.py +1154 -0
- mantisdk/verl/dataset.py +44 -0
- mantisdk/verl/entrypoint.py +248 -0
- mantisdk/verl/trainer.py +549 -0
- mantisdk-0.1.0.dist-info/METADATA +119 -0
- mantisdk-0.1.0.dist-info/RECORD +190 -0
- mantisdk-0.1.0.dist-info/WHEEL +4 -0
- mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
- mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
mantisdk/store/threading.py
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
# Copyright (c) Microsoft. All rights reserved.
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import threading
|
|
6
|
+
from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple
|
|
7
|
+
|
|
8
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
|
9
|
+
|
|
10
|
+
from mantisdk.types import (
|
|
11
|
+
Attempt,
|
|
12
|
+
AttemptedRollout,
|
|
13
|
+
AttemptStatus,
|
|
14
|
+
EnqueueRolloutRequest,
|
|
15
|
+
NamedResources,
|
|
16
|
+
ResourcesUpdate,
|
|
17
|
+
Rollout,
|
|
18
|
+
RolloutConfig,
|
|
19
|
+
RolloutStatus,
|
|
20
|
+
Span,
|
|
21
|
+
TaskInput,
|
|
22
|
+
Worker,
|
|
23
|
+
WorkerStatus,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
from .base import UNSET, LightningStore, LightningStoreCapabilities, LightningStoreStatistics, Unset
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class LightningStoreThreaded(LightningStore):
    """Facade that delegates all store operations to an underlying store instance.

    Every delegated call, except the ones explicitly noted on the individual
    methods, runs while holding a single ``threading.Lock``, so calls issued
    from different threads are serialized against each other.
    Make sure the threaded stores are instantiated before initializing the threads.

    NOTE(review): the lock is a blocking ``threading.Lock`` that stays held
    across the ``await`` on the underlying store. If two coroutines on the
    *same* event loop call into this facade and the first one suspends while
    holding the lock, the second acquisition would block the loop thread.
    Confirm that callers only have one in-flight call per event loop.
    """

    def __init__(self, store: LightningStore) -> None:
        """Wrap ``store`` and create the lock that guards access to it."""
        super().__init__()  # watchdog relies on the underlying store
        self.store = store
        self._lock = threading.Lock()

    @property
    def capabilities(self) -> LightningStoreCapabilities:
        """Return the underlying store's capabilities, marked async- and thread-safe."""
        capabilities = self.store.capabilities
        return {
            **capabilities,
            "async_safe": True,
            "thread_safe": True,
        }

    async def statistics(self) -> LightningStoreStatistics:
        """Return the statistics of the underlying store (lock held)."""
        with self._lock:
            return await self.store.statistics()

    def otlp_traces_endpoint(self) -> str:
        """Return the OTLP/HTTP traces endpoint of the underlying store (no lock taken)."""
        return self.store.otlp_traces_endpoint()

    def get_otlp_headers(self) -> Dict[str, str]:
        """Return the OTLP authentication headers from the underlying store.

        Falls back to an empty dict when the underlying store does not expose
        ``get_otlp_headers``. No lock is taken.
        """
        if hasattr(self.store, "get_otlp_headers"):
            return self.store.get_otlp_headers()
        return {}

    async def start_rollout(
        self,
        input: TaskInput,
        mode: Literal["train", "val", "test"] | None = None,
        resources_id: str | None = None,
        config: RolloutConfig | None = None,
        metadata: Dict[str, Any] | None = None,
        worker_id: Optional[str] = None,
    ) -> AttemptedRollout:
        """Delegate ``start_rollout`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.start_rollout(
                input,
                mode,
                resources_id,
                config,
                metadata,
                worker_id,
            )

    async def enqueue_rollout(
        self,
        input: TaskInput,
        mode: Literal["train", "val", "test"] | None = None,
        resources_id: str | None = None,
        config: RolloutConfig | None = None,
        metadata: Dict[str, Any] | None = None,
    ) -> Rollout:
        """Delegate ``enqueue_rollout`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.enqueue_rollout(input, mode, resources_id, config, metadata)

    async def enqueue_many_rollouts(self, rollouts: Sequence[EnqueueRolloutRequest]) -> Sequence[Rollout]:
        """Delegate ``enqueue_many_rollouts`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.enqueue_many_rollouts(rollouts)

    async def dequeue_rollout(self, worker_id: Optional[str] = None) -> Optional[AttemptedRollout]:
        """Delegate ``dequeue_rollout`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.dequeue_rollout(worker_id=worker_id)

    async def dequeue_many_rollouts(
        self,
        *,
        limit: int = 1,
        worker_id: Optional[str] = None,
    ) -> Sequence[AttemptedRollout]:
        """Delegate ``dequeue_many_rollouts`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.dequeue_many_rollouts(limit=limit, worker_id=worker_id)

    async def start_attempt(self, rollout_id: str, worker_id: Optional[str] = None) -> AttemptedRollout:
        """Delegate ``start_attempt`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.start_attempt(rollout_id, worker_id)

    async def query_rollouts(
        self,
        *,
        status_in: Optional[Sequence[RolloutStatus]] = None,
        rollout_id_in: Optional[Sequence[str]] = None,
        rollout_id_contains: Optional[str] = None,
        filter_logic: Literal["and", "or"] = "and",
        sort_by: Optional[str] = None,
        sort_order: Literal["asc", "desc"] = "asc",
        limit: int = -1,
        offset: int = 0,
        status: Optional[Sequence[RolloutStatus]] = None,
        rollout_ids: Optional[Sequence[str]] = None,
    ) -> Sequence[Rollout]:
        """Delegate ``query_rollouts`` to the underlying store (lock held).

        Every filter, sort and paging argument is forwarded unchanged.
        """
        with self._lock:
            return await self.store.query_rollouts(
                status_in=status_in,
                rollout_id_in=rollout_id_in,
                rollout_id_contains=rollout_id_contains,
                filter_logic=filter_logic,
                sort_by=sort_by,
                sort_order=sort_order,
                limit=limit,
                offset=offset,
                status=status,
                rollout_ids=rollout_ids,
            )

    async def query_attempts(
        self,
        rollout_id: str,
        *,
        sort_by: Optional[str] = "sequence_id",
        sort_order: Literal["asc", "desc"] = "asc",
        limit: int = -1,
        offset: int = 0,
    ) -> Sequence[Attempt]:
        """Delegate ``query_attempts`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.query_attempts(
                rollout_id,
                sort_by=sort_by,
                sort_order=sort_order,
                limit=limit,
                offset=offset,
            )

    async def get_rollout_by_id(self, rollout_id: str) -> Optional[Rollout]:
        """Delegate ``get_rollout_by_id`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.get_rollout_by_id(rollout_id)

    async def get_latest_attempt(self, rollout_id: str) -> Optional[Attempt]:
        """Delegate ``get_latest_attempt`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.get_latest_attempt(rollout_id)

    async def query_resources(
        self,
        *,
        resources_id: Optional[str] = None,
        resources_id_contains: Optional[str] = None,
        sort_by: Optional[str] = None,
        sort_order: Literal["asc", "desc"] = "asc",
        limit: int = -1,
        offset: int = 0,
    ) -> Sequence[ResourcesUpdate]:
        """Delegate ``query_resources`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.query_resources(
                resources_id=resources_id,
                resources_id_contains=resources_id_contains,
                sort_by=sort_by,
                sort_order=sort_order,
                limit=limit,
                offset=offset,
            )

    async def add_resources(self, resources: NamedResources) -> ResourcesUpdate:
        """Delegate ``add_resources`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.add_resources(resources)

    async def update_resources(self, resources_id: str, resources: NamedResources) -> ResourcesUpdate:
        """Delegate ``update_resources`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.update_resources(resources_id, resources)

    async def get_resources_by_id(self, resources_id: str) -> Optional[ResourcesUpdate]:
        """Delegate ``get_resources_by_id`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.get_resources_by_id(resources_id)

    async def get_latest_resources(self) -> Optional[ResourcesUpdate]:
        """Delegate ``get_latest_resources`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.get_latest_resources()

    async def add_many_spans(self, spans: Sequence[Span]) -> Sequence[Span]:
        """Delegate ``add_many_spans`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.add_many_spans(spans)

    async def add_span(self, span: Span) -> Optional[Span]:
        """Delegate ``add_span`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.add_span(span)

    async def add_otel_span(
        self,
        rollout_id: str,
        attempt_id: str,
        readable_span: ReadableSpan,
        sequence_id: int | None = None,
    ) -> Optional[Span]:
        """Delegate ``add_otel_span`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.add_otel_span(rollout_id, attempt_id, readable_span, sequence_id)

    async def wait_for_rollouts(self, *, rollout_ids: List[str], timeout: Optional[float] = None) -> List[Rollout]:
        """Delegate ``wait_for_rollouts`` to the underlying store WITHOUT taking the lock."""
        # This method does not change the state of the store, and it's not thread-safe.
        # NOTE(review): presumably left unlocked because the wait can last up to
        # `timeout` and would otherwise stall every other caller - confirm.
        return await self.store.wait_for_rollouts(rollout_ids=rollout_ids, timeout=timeout)

    async def get_next_span_sequence_id(self, rollout_id: str, attempt_id: str) -> int:
        """Delegate ``get_next_span_sequence_id`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.get_next_span_sequence_id(rollout_id, attempt_id)

    async def get_many_span_sequence_ids(self, rollout_attempt_ids: Sequence[Tuple[str, str]]) -> Sequence[int]:
        """Delegate ``get_many_span_sequence_ids`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.get_many_span_sequence_ids(rollout_attempt_ids)

    async def query_spans(
        self,
        rollout_id: str,
        attempt_id: str | Literal["latest"] | None = None,
        *,
        trace_id: Optional[str] = None,
        trace_id_contains: Optional[str] = None,
        span_id: Optional[str] = None,
        span_id_contains: Optional[str] = None,
        parent_id: Optional[str] = None,
        parent_id_contains: Optional[str] = None,
        name: Optional[str] = None,
        name_contains: Optional[str] = None,
        filter_logic: Literal["and", "or"] = "and",
        limit: int = -1,
        offset: int = 0,
        sort_by: Optional[str] = "sequence_id",
        sort_order: Literal["asc", "desc"] = "asc",
    ) -> Sequence[Span]:
        """Delegate ``query_spans`` to the underlying store (lock held).

        Every filter, sort and paging argument is forwarded unchanged.
        """
        with self._lock:
            return await self.store.query_spans(
                rollout_id,
                attempt_id,
                trace_id=trace_id,
                trace_id_contains=trace_id_contains,
                span_id=span_id,
                span_id_contains=span_id_contains,
                parent_id=parent_id,
                parent_id_contains=parent_id_contains,
                name=name,
                name_contains=name_contains,
                filter_logic=filter_logic,
                limit=limit,
                offset=offset,
                sort_by=sort_by,
                sort_order=sort_order,
            )

    async def update_rollout(
        self,
        rollout_id: str,
        input: TaskInput | Unset = UNSET,
        mode: Optional[Literal["train", "val", "test"]] | Unset = UNSET,
        resources_id: Optional[str] | Unset = UNSET,
        status: RolloutStatus | Unset = UNSET,
        config: RolloutConfig | Unset = UNSET,
        metadata: Optional[Dict[str, Any]] | Unset = UNSET,
    ) -> Rollout:
        """Delegate ``update_rollout`` to the underlying store (lock held).

        All arguments, including ``UNSET`` placeholders, are forwarded as-is.
        """
        with self._lock:
            return await self.store.update_rollout(
                rollout_id=rollout_id,
                input=input,
                mode=mode,
                resources_id=resources_id,
                status=status,
                config=config,
                metadata=metadata,
            )

    async def update_attempt(
        self,
        rollout_id: str,
        attempt_id: str | Literal["latest"],
        status: AttemptStatus | Unset = UNSET,
        worker_id: str | Unset = UNSET,
        last_heartbeat_time: float | Unset = UNSET,
        metadata: Optional[Dict[str, Any]] | Unset = UNSET,
    ) -> Attempt:
        """Delegate ``update_attempt`` to the underlying store (lock held).

        All arguments, including ``UNSET`` placeholders, are forwarded as-is.
        """
        with self._lock:
            return await self.store.update_attempt(
                rollout_id=rollout_id,
                attempt_id=attempt_id,
                status=status,
                worker_id=worker_id,
                last_heartbeat_time=last_heartbeat_time,
                metadata=metadata,
            )

    async def query_workers(
        self,
        *,
        status_in: Optional[Sequence[WorkerStatus]] = None,
        worker_id_contains: Optional[str] = None,
        filter_logic: Literal["and", "or"] = "and",
        sort_by: Optional[str] = None,
        sort_order: Literal["asc", "desc"] = "asc",
        limit: int = -1,
        offset: int = 0,
    ) -> Sequence[Worker]:
        """Delegate ``query_workers`` to the underlying store (lock held).

        Every filter, sort and paging argument is forwarded unchanged.
        """
        with self._lock:
            return await self.store.query_workers(
                status_in=status_in,
                worker_id_contains=worker_id_contains,
                # Forward the caller's choice; omitting it would silently fall
                # back to the underlying store's default filter logic.
                filter_logic=filter_logic,
                sort_by=sort_by,
                sort_order=sort_order,
                limit=limit,
                offset=offset,
            )

    async def get_worker_by_id(self, worker_id: str) -> Optional[Worker]:
        """Delegate ``get_worker_by_id`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.get_worker_by_id(worker_id)

    async def update_worker(
        self,
        worker_id: str,
        heartbeat_stats: Dict[str, Any] | Unset = UNSET,
    ) -> Worker:
        """Delegate ``update_worker`` to the underlying store (lock held)."""
        with self._lock:
            return await self.store.update_worker(
                worker_id=worker_id,
                heartbeat_stats=heartbeat_stats,
            )

    def complete_job(self, summary: Optional[Dict[str, Any]] = None) -> None:
        """Delegate job completion to the underlying store (lock held).

        Does nothing when the underlying store has no ``complete_job`` attribute.
        """
        with self._lock:
            # We check if the underlying store has the method to be safe,
            # though base LightningStore now has it.
            if hasattr(self.store, "complete_job"):
                self.store.complete_job(summary)
|
mantisdk/store/utils.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# Copyright (c) Microsoft. All rights reserved.
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from typing import Awaitable, Callable, Dict, List, Tuple
|
|
5
|
+
|
|
6
|
+
from mantisdk.types import Attempt, AttemptedRollout, AttemptStatus, Rollout, RolloutConfig, RolloutStatus
|
|
7
|
+
|
|
8
|
+
# Signature of an async callback that receives a string and a rollout status
# and resolves to the resulting Rollout.
# NOTE(review): the string is presumably the rollout id - confirm against callers.
UpdateRolloutStatus = Callable[[str, RolloutStatus], Awaitable[Rollout]]

# Signature of an async callback that receives two strings and an attempt
# status and resolves to the resulting Attempt.
# NOTE(review): the strings are presumably (rollout id, attempt id) - confirm.
UpdateAttemptStatus = Callable[[str, str, AttemptStatus], Awaitable[Attempt]]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Strictly ascending latency bucket boundaries, from 0.000001 up to 300.0.
# NOTE(review): values are presumably in seconds (1 microsecond .. 5 minutes) - confirm.
LATENCY_BUCKETS = [
    # below one thousandth
    0.000001, 0.000002, 0.000005,
    0.00001, 0.00002, 0.00005,
    0.0001, 0.0002, 0.0005,
    # thousandths and hundredths
    0.001, 0.002, 0.003, 0.005, 0.007,
    0.01, 0.015, 0.02, 0.03, 0.05, 0.07,
    # tenths
    0.1, 0.2, 0.3, 0.5, 0.7,
    # units
    1.0, 2.0, 3.0, 5.0, 7.0,
    # tens
    10.0, 12.0, 15.0, 20.0, 25.0, 30.0, 40.0, 50.0, 60.0, 90.0,
    # hundreds
    120.0, 180.0, 240.0, 300.0,
]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
async def rollout_status_from_attempt(
    attempt: Attempt,
    config: RolloutConfig,
) -> RolloutStatus:
    """
    Derive the rollout status implied by the status of one of its attempts.

    Args:
        attempt: The attempt whose ``status`` and ``sequence_id`` are inspected.
        config: Supplies ``retry_condition`` (attempt statuses that may be
            retried) and ``max_attempts``.

    Returns:
        The attempt status itself for "preparing", "running" and "succeeded";
        "requeuing" when a failed/timeout/unresponsive attempt is listed in
        ``config.retry_condition`` and ``attempt.sequence_id`` is still below
        ``config.max_attempts``; "failed" otherwise.

    Raises:
        ValueError: If the attempt status is none of the six handled values.
    """
    attempt_status = attempt.status

    # In-progress and successful attempts map one-to-one onto the rollout.
    if attempt_status in ("preparing", "running", "succeeded"):
        return attempt_status

    # Anything that is not a known failure mode is rejected outright.
    if attempt_status not in ("failed", "timeout", "unresponsive"):
        raise ValueError(f"Invalid attempt status: {attempt.status}")

    # Retry only when this failure mode is retryable and attempts remain.
    if attempt_status in config.retry_condition and attempt.sequence_id < config.max_attempts:
        return "requeuing"

    return "failed"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
async def scan_unhealthy_rollouts(
    rollouts: List[AttemptedRollout],
) -> Dict[Tuple[str, str], AttemptStatus]:
    """
    Perform health check on all running rollouts in the store.

    This method should be called periodically to:

    1. Check for timed-out attempts (running longer than
       ``config.timeout_seconds`` since the attempt's ``start_time``)
    2. Check for unresponsive attempts (no heartbeat for longer than
       ``config.unresponsive_seconds``; when no heartbeat was ever recorded,
       the attempt's ``start_time`` is used as the reference instead)

    A timeout takes precedence over unresponsiveness. Rollouts without an
    attempt are skipped.

    This operation is completely unlocked. The caller is responsible for locking the store.

    Args:
        rollouts: The list of running rollouts to check.

    Returns:
        A dictionary mapping ``(rollout_id, attempt_id)`` to the new attempt
        status ("timeout" or "unresponsive"). Healthy attempts are absent.
    """
    current_time = time.time()
    updates: Dict[Tuple[str, str], AttemptStatus] = {}

    for rollout in rollouts:
        config = rollout.config  # policy for retry and timeout

        # Get the latest attempt for this rollout
        latest_attempt = rollout.attempt
        if not latest_attempt:
            # This should not happen
            continue

        key = (latest_attempt.rollout_id, latest_attempt.attempt_id)

        # Check for timeout condition (based on attempt start_time, instead of rollout start_time)
        if config.timeout_seconds is not None and current_time - latest_attempt.start_time > config.timeout_seconds:
            updates[key] = "timeout"
            continue

        # Unresponsiveness detection is disabled for this rollout.
        if config.unresponsive_seconds is None:
            continue

        # Compare against None explicitly: a heartbeat timestamp of 0.0 is a
        # real value and must not be mistaken for "no heartbeat recorded".
        last_heartbeat = latest_attempt.last_heartbeat_time
        last_activity = latest_attempt.start_time if last_heartbeat is None else last_heartbeat
        if current_time - last_activity > config.unresponsive_seconds:
            updates[key] = "unresponsive"

    return updates
|
mantisdk/tracer/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Copyright (c) Microsoft. All rights reserved.
|
|
2
|
+
|
|
3
|
+
from .agentops import AgentOpsTracer
|
|
4
|
+
from .base import Tracer, clear_active_tracer, get_active_tracer, set_active_tracer
|
|
5
|
+
from .dummy import DummyTracer
|
|
6
|
+
from .otel import OtelTracer
|
|
7
|
+
|
|
8
|
+
# Public names re-exported by this package.
__all__ = [
    # Tracer implementations and their base class
    "AgentOpsTracer",
    "Tracer",
    "OtelTracer",
    "DummyTracer",
    # Helpers for reading, installing and clearing the active tracer
    "get_active_tracer",
    "set_active_tracer",
    "clear_active_tracer",
]
|