mantisdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mantisdk might be problematic. Click here for more details.

Files changed (190) hide show
  1. mantisdk/__init__.py +22 -0
  2. mantisdk/adapter/__init__.py +15 -0
  3. mantisdk/adapter/base.py +94 -0
  4. mantisdk/adapter/messages.py +270 -0
  5. mantisdk/adapter/triplet.py +1028 -0
  6. mantisdk/algorithm/__init__.py +39 -0
  7. mantisdk/algorithm/apo/__init__.py +5 -0
  8. mantisdk/algorithm/apo/apo.py +889 -0
  9. mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
  10. mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
  11. mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
  12. mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
  13. mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
  14. mantisdk/algorithm/base.py +162 -0
  15. mantisdk/algorithm/decorator.py +264 -0
  16. mantisdk/algorithm/fast.py +250 -0
  17. mantisdk/algorithm/gepa/__init__.py +59 -0
  18. mantisdk/algorithm/gepa/adapter.py +459 -0
  19. mantisdk/algorithm/gepa/gepa.py +364 -0
  20. mantisdk/algorithm/gepa/lib/__init__.py +18 -0
  21. mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
  22. mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
  23. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
  24. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
  25. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
  26. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
  27. mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
  28. mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
  29. mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
  30. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
  31. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
  32. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
  33. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
  34. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
  35. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
  36. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
  37. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
  38. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
  39. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
  40. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
  41. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
  42. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
  43. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
  44. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
  45. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
  46. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
  47. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
  48. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
  49. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
  50. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
  51. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
  52. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
  53. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
  54. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
  55. mantisdk/algorithm/gepa/lib/api.py +375 -0
  56. mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
  57. mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
  58. mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
  59. mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
  60. mantisdk/algorithm/gepa/lib/core/result.py +233 -0
  61. mantisdk/algorithm/gepa/lib/core/state.py +636 -0
  62. mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
  63. mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
  64. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
  65. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
  66. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
  67. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
  68. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
  69. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
  70. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
  71. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
  72. mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
  73. mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
  74. mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
  75. mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
  76. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
  77. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
  78. mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
  79. mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
  80. mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
  81. mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
  82. mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
  83. mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
  84. mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
  85. mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
  86. mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
  87. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
  88. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
  89. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
  90. mantisdk/algorithm/gepa/lib/py.typed +0 -0
  91. mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
  92. mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
  93. mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
  94. mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
  95. mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
  96. mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
  97. mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
  98. mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
  99. mantisdk/algorithm/gepa/tracing.py +105 -0
  100. mantisdk/algorithm/utils.py +177 -0
  101. mantisdk/algorithm/verl/__init__.py +5 -0
  102. mantisdk/algorithm/verl/interface.py +202 -0
  103. mantisdk/cli/__init__.py +56 -0
  104. mantisdk/cli/prometheus.py +115 -0
  105. mantisdk/cli/store.py +131 -0
  106. mantisdk/cli/vllm.py +29 -0
  107. mantisdk/client.py +408 -0
  108. mantisdk/config.py +348 -0
  109. mantisdk/emitter/__init__.py +43 -0
  110. mantisdk/emitter/annotation.py +370 -0
  111. mantisdk/emitter/exception.py +54 -0
  112. mantisdk/emitter/message.py +61 -0
  113. mantisdk/emitter/object.py +117 -0
  114. mantisdk/emitter/reward.py +320 -0
  115. mantisdk/env_var.py +156 -0
  116. mantisdk/execution/__init__.py +15 -0
  117. mantisdk/execution/base.py +64 -0
  118. mantisdk/execution/client_server.py +443 -0
  119. mantisdk/execution/events.py +69 -0
  120. mantisdk/execution/inter_process.py +16 -0
  121. mantisdk/execution/shared_memory.py +282 -0
  122. mantisdk/instrumentation/__init__.py +119 -0
  123. mantisdk/instrumentation/agentops.py +314 -0
  124. mantisdk/instrumentation/agentops_langchain.py +45 -0
  125. mantisdk/instrumentation/litellm.py +83 -0
  126. mantisdk/instrumentation/vllm.py +81 -0
  127. mantisdk/instrumentation/weave.py +500 -0
  128. mantisdk/litagent/__init__.py +11 -0
  129. mantisdk/litagent/decorator.py +536 -0
  130. mantisdk/litagent/litagent.py +252 -0
  131. mantisdk/llm_proxy.py +1890 -0
  132. mantisdk/logging.py +370 -0
  133. mantisdk/reward.py +7 -0
  134. mantisdk/runner/__init__.py +11 -0
  135. mantisdk/runner/agent.py +845 -0
  136. mantisdk/runner/base.py +182 -0
  137. mantisdk/runner/legacy.py +309 -0
  138. mantisdk/semconv.py +170 -0
  139. mantisdk/server.py +401 -0
  140. mantisdk/store/__init__.py +23 -0
  141. mantisdk/store/base.py +897 -0
  142. mantisdk/store/client_server.py +2092 -0
  143. mantisdk/store/collection/__init__.py +30 -0
  144. mantisdk/store/collection/base.py +587 -0
  145. mantisdk/store/collection/memory.py +970 -0
  146. mantisdk/store/collection/mongo.py +1412 -0
  147. mantisdk/store/collection_based.py +1823 -0
  148. mantisdk/store/insight.py +648 -0
  149. mantisdk/store/listener.py +58 -0
  150. mantisdk/store/memory.py +396 -0
  151. mantisdk/store/mongo.py +165 -0
  152. mantisdk/store/sqlite.py +3 -0
  153. mantisdk/store/threading.py +357 -0
  154. mantisdk/store/utils.py +142 -0
  155. mantisdk/tracer/__init__.py +16 -0
  156. mantisdk/tracer/agentops.py +242 -0
  157. mantisdk/tracer/base.py +287 -0
  158. mantisdk/tracer/dummy.py +106 -0
  159. mantisdk/tracer/otel.py +555 -0
  160. mantisdk/tracer/weave.py +677 -0
  161. mantisdk/trainer/__init__.py +6 -0
  162. mantisdk/trainer/init_utils.py +263 -0
  163. mantisdk/trainer/legacy.py +367 -0
  164. mantisdk/trainer/registry.py +12 -0
  165. mantisdk/trainer/trainer.py +618 -0
  166. mantisdk/types/__init__.py +6 -0
  167. mantisdk/types/core.py +553 -0
  168. mantisdk/types/resources.py +204 -0
  169. mantisdk/types/tracer.py +515 -0
  170. mantisdk/types/tracing.py +218 -0
  171. mantisdk/utils/__init__.py +1 -0
  172. mantisdk/utils/id.py +18 -0
  173. mantisdk/utils/metrics.py +1025 -0
  174. mantisdk/utils/otel.py +578 -0
  175. mantisdk/utils/otlp.py +536 -0
  176. mantisdk/utils/server_launcher.py +1045 -0
  177. mantisdk/utils/system_snapshot.py +81 -0
  178. mantisdk/verl/__init__.py +8 -0
  179. mantisdk/verl/__main__.py +6 -0
  180. mantisdk/verl/async_server.py +46 -0
  181. mantisdk/verl/config.yaml +27 -0
  182. mantisdk/verl/daemon.py +1154 -0
  183. mantisdk/verl/dataset.py +44 -0
  184. mantisdk/verl/entrypoint.py +248 -0
  185. mantisdk/verl/trainer.py +549 -0
  186. mantisdk-0.1.0.dist-info/METADATA +119 -0
  187. mantisdk-0.1.0.dist-info/RECORD +190 -0
  188. mantisdk-0.1.0.dist-info/WHEEL +4 -0
  189. mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
  190. mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
@@ -0,0 +1,357 @@
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ from __future__ import annotations
4
+
5
+ import threading
6
+ from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple
7
+
8
+ from opentelemetry.sdk.trace import ReadableSpan
9
+
10
+ from mantisdk.types import (
11
+ Attempt,
12
+ AttemptedRollout,
13
+ AttemptStatus,
14
+ EnqueueRolloutRequest,
15
+ NamedResources,
16
+ ResourcesUpdate,
17
+ Rollout,
18
+ RolloutConfig,
19
+ RolloutStatus,
20
+ Span,
21
+ TaskInput,
22
+ Worker,
23
+ WorkerStatus,
24
+ )
25
+
26
+ from .base import UNSET, LightningStore, LightningStoreCapabilities, LightningStoreStatistics, Unset
27
+
28
+
29
class LightningStoreThreaded(LightningStore):
    """Facade that delegates all store operations to an underlying store instance.

    Most delegated operations are serialized through a single `threading.Lock`
    so that several threads can share one underlying store. The exceptions are
    `capabilities`, `otlp_traces_endpoint`, `get_otlp_headers` and
    `wait_for_rollouts`, which call the underlying store without taking the lock.

    Make sure the threaded stores are instantiated before initializing the threads.

    NOTE(review): the lock is a blocking `threading.Lock` that stays held while
    the delegated coroutine is awaited. Presumably each thread drives its own
    event loop; two coroutines contending for the lock on the *same* loop would
    block that loop -- confirm against the callers.
    """

    def __init__(self, store: LightningStore) -> None:
        super().__init__()  # watchdog relies on the underlying store
        self.store = store
        self._lock = threading.Lock()

    @property
    def capabilities(self) -> LightningStoreCapabilities:
        """Return the underlying store's capabilities, marked async- and thread-safe."""
        capabilities = self.store.capabilities
        return {
            **capabilities,
            "async_safe": True,
            "thread_safe": True,
        }

    async def statistics(self) -> LightningStoreStatistics:
        """Return the statistics of the underlying store (locked)."""
        with self._lock:
            return await self.store.statistics()

    def otlp_traces_endpoint(self) -> str:
        """Return the OTLP/HTTP traces endpoint of the underlying store."""
        return self.store.otlp_traces_endpoint()

    def get_otlp_headers(self) -> Dict[str, str]:
        """Return the OTLP authentication headers from the underlying store.

        Returns an empty dict when the underlying store has no `get_otlp_headers`.
        """
        if hasattr(self.store, "get_otlp_headers"):
            return self.store.get_otlp_headers()
        return {}

    async def start_rollout(
        self,
        input: TaskInput,
        mode: Literal["train", "val", "test"] | None = None,
        resources_id: str | None = None,
        config: RolloutConfig | None = None,
        metadata: Dict[str, Any] | None = None,
        worker_id: Optional[str] = None,
    ) -> AttemptedRollout:
        """Delegate `start_rollout` to the underlying store (locked)."""
        with self._lock:
            return await self.store.start_rollout(
                input,
                mode,
                resources_id,
                config,
                metadata,
                worker_id,
            )

    async def enqueue_rollout(
        self,
        input: TaskInput,
        mode: Literal["train", "val", "test"] | None = None,
        resources_id: str | None = None,
        config: RolloutConfig | None = None,
        metadata: Dict[str, Any] | None = None,
    ) -> Rollout:
        """Delegate `enqueue_rollout` to the underlying store (locked)."""
        with self._lock:
            return await self.store.enqueue_rollout(input, mode, resources_id, config, metadata)

    async def enqueue_many_rollouts(self, rollouts: Sequence[EnqueueRolloutRequest]) -> Sequence[Rollout]:
        """Delegate `enqueue_many_rollouts` to the underlying store (locked)."""
        with self._lock:
            return await self.store.enqueue_many_rollouts(rollouts)

    async def dequeue_rollout(self, worker_id: Optional[str] = None) -> Optional[AttemptedRollout]:
        """Delegate `dequeue_rollout` to the underlying store (locked)."""
        with self._lock:
            return await self.store.dequeue_rollout(worker_id=worker_id)

    async def dequeue_many_rollouts(
        self,
        *,
        limit: int = 1,
        worker_id: Optional[str] = None,
    ) -> Sequence[AttemptedRollout]:
        """Delegate `dequeue_many_rollouts` to the underlying store (locked)."""
        with self._lock:
            return await self.store.dequeue_many_rollouts(limit=limit, worker_id=worker_id)

    async def start_attempt(self, rollout_id: str, worker_id: Optional[str] = None) -> AttemptedRollout:
        """Delegate `start_attempt` to the underlying store (locked)."""
        with self._lock:
            return await self.store.start_attempt(rollout_id, worker_id)

    async def query_rollouts(
        self,
        *,
        status_in: Optional[Sequence[RolloutStatus]] = None,
        rollout_id_in: Optional[Sequence[str]] = None,
        rollout_id_contains: Optional[str] = None,
        filter_logic: Literal["and", "or"] = "and",
        sort_by: Optional[str] = None,
        sort_order: Literal["asc", "desc"] = "asc",
        limit: int = -1,
        offset: int = 0,
        status: Optional[Sequence[RolloutStatus]] = None,
        rollout_ids: Optional[Sequence[str]] = None,
    ) -> Sequence[Rollout]:
        """Delegate `query_rollouts` to the underlying store (locked), forwarding every filter."""
        with self._lock:
            return await self.store.query_rollouts(
                status_in=status_in,
                rollout_id_in=rollout_id_in,
                rollout_id_contains=rollout_id_contains,
                filter_logic=filter_logic,
                sort_by=sort_by,
                sort_order=sort_order,
                limit=limit,
                offset=offset,
                status=status,
                rollout_ids=rollout_ids,
            )

    async def query_attempts(
        self,
        rollout_id: str,
        *,
        sort_by: Optional[str] = "sequence_id",
        sort_order: Literal["asc", "desc"] = "asc",
        limit: int = -1,
        offset: int = 0,
    ) -> Sequence[Attempt]:
        """Delegate `query_attempts` to the underlying store (locked)."""
        with self._lock:
            return await self.store.query_attempts(
                rollout_id,
                sort_by=sort_by,
                sort_order=sort_order,
                limit=limit,
                offset=offset,
            )

    async def get_rollout_by_id(self, rollout_id: str) -> Optional[Rollout]:
        """Delegate `get_rollout_by_id` to the underlying store (locked)."""
        with self._lock:
            return await self.store.get_rollout_by_id(rollout_id)

    async def get_latest_attempt(self, rollout_id: str) -> Optional[Attempt]:
        """Delegate `get_latest_attempt` to the underlying store (locked)."""
        with self._lock:
            return await self.store.get_latest_attempt(rollout_id)

    async def query_resources(
        self,
        *,
        resources_id: Optional[str] = None,
        resources_id_contains: Optional[str] = None,
        sort_by: Optional[str] = None,
        sort_order: Literal["asc", "desc"] = "asc",
        limit: int = -1,
        offset: int = 0,
    ) -> Sequence[ResourcesUpdate]:
        """Delegate `query_resources` to the underlying store (locked)."""
        with self._lock:
            return await self.store.query_resources(
                resources_id=resources_id,
                resources_id_contains=resources_id_contains,
                sort_by=sort_by,
                sort_order=sort_order,
                limit=limit,
                offset=offset,
            )

    async def add_resources(self, resources: NamedResources) -> ResourcesUpdate:
        """Delegate `add_resources` to the underlying store (locked)."""
        with self._lock:
            return await self.store.add_resources(resources)

    async def update_resources(self, resources_id: str, resources: NamedResources) -> ResourcesUpdate:
        """Delegate `update_resources` to the underlying store (locked)."""
        with self._lock:
            return await self.store.update_resources(resources_id, resources)

    async def get_resources_by_id(self, resources_id: str) -> Optional[ResourcesUpdate]:
        """Delegate `get_resources_by_id` to the underlying store (locked)."""
        with self._lock:
            return await self.store.get_resources_by_id(resources_id)

    async def get_latest_resources(self) -> Optional[ResourcesUpdate]:
        """Delegate `get_latest_resources` to the underlying store (locked)."""
        with self._lock:
            return await self.store.get_latest_resources()

    async def add_many_spans(self, spans: Sequence[Span]) -> Sequence[Span]:
        """Delegate `add_many_spans` to the underlying store (locked)."""
        with self._lock:
            return await self.store.add_many_spans(spans)

    async def add_span(self, span: Span) -> Optional[Span]:
        """Delegate `add_span` to the underlying store (locked)."""
        with self._lock:
            return await self.store.add_span(span)

    async def add_otel_span(
        self,
        rollout_id: str,
        attempt_id: str,
        readable_span: ReadableSpan,
        sequence_id: int | None = None,
    ) -> Optional[Span]:
        """Delegate `add_otel_span` to the underlying store (locked)."""
        with self._lock:
            return await self.store.add_otel_span(rollout_id, attempt_id, readable_span, sequence_id)

    async def wait_for_rollouts(self, *, rollout_ids: List[str], timeout: Optional[float] = None) -> List[Rollout]:
        """Delegate `wait_for_rollouts` to the underlying store WITHOUT taking the lock."""
        # This method does not change the state of the store, and it's not thread-safe.
        return await self.store.wait_for_rollouts(rollout_ids=rollout_ids, timeout=timeout)

    async def get_next_span_sequence_id(self, rollout_id: str, attempt_id: str) -> int:
        """Delegate `get_next_span_sequence_id` to the underlying store (locked)."""
        with self._lock:
            return await self.store.get_next_span_sequence_id(rollout_id, attempt_id)

    async def get_many_span_sequence_ids(self, rollout_attempt_ids: Sequence[Tuple[str, str]]) -> Sequence[int]:
        """Delegate `get_many_span_sequence_ids` to the underlying store (locked)."""
        with self._lock:
            return await self.store.get_many_span_sequence_ids(rollout_attempt_ids)

    async def query_spans(
        self,
        rollout_id: str,
        attempt_id: str | Literal["latest"] | None = None,
        *,
        trace_id: Optional[str] = None,
        trace_id_contains: Optional[str] = None,
        span_id: Optional[str] = None,
        span_id_contains: Optional[str] = None,
        parent_id: Optional[str] = None,
        parent_id_contains: Optional[str] = None,
        name: Optional[str] = None,
        name_contains: Optional[str] = None,
        filter_logic: Literal["and", "or"] = "and",
        limit: int = -1,
        offset: int = 0,
        sort_by: Optional[str] = "sequence_id",
        sort_order: Literal["asc", "desc"] = "asc",
    ) -> Sequence[Span]:
        """Delegate `query_spans` to the underlying store (locked), forwarding every filter."""
        with self._lock:
            return await self.store.query_spans(
                rollout_id,
                attempt_id,
                trace_id=trace_id,
                trace_id_contains=trace_id_contains,
                span_id=span_id,
                span_id_contains=span_id_contains,
                parent_id=parent_id,
                parent_id_contains=parent_id_contains,
                name=name,
                name_contains=name_contains,
                filter_logic=filter_logic,
                limit=limit,
                offset=offset,
                sort_by=sort_by,
                sort_order=sort_order,
            )

    async def update_rollout(
        self,
        rollout_id: str,
        input: TaskInput | Unset = UNSET,
        mode: Optional[Literal["train", "val", "test"]] | Unset = UNSET,
        resources_id: Optional[str] | Unset = UNSET,
        status: RolloutStatus | Unset = UNSET,
        config: RolloutConfig | Unset = UNSET,
        metadata: Optional[Dict[str, Any]] | Unset = UNSET,
    ) -> Rollout:
        """Delegate `update_rollout` to the underlying store (locked).

        Arguments left as `UNSET` are forwarded unchanged.
        """
        with self._lock:
            return await self.store.update_rollout(
                rollout_id=rollout_id,
                input=input,
                mode=mode,
                resources_id=resources_id,
                status=status,
                config=config,
                metadata=metadata,
            )

    async def update_attempt(
        self,
        rollout_id: str,
        attempt_id: str | Literal["latest"],
        status: AttemptStatus | Unset = UNSET,
        worker_id: str | Unset = UNSET,
        last_heartbeat_time: float | Unset = UNSET,
        metadata: Optional[Dict[str, Any]] | Unset = UNSET,
    ) -> Attempt:
        """Delegate `update_attempt` to the underlying store (locked).

        Arguments left as `UNSET` are forwarded unchanged.
        """
        with self._lock:
            return await self.store.update_attempt(
                rollout_id=rollout_id,
                attempt_id=attempt_id,
                status=status,
                worker_id=worker_id,
                last_heartbeat_time=last_heartbeat_time,
                metadata=metadata,
            )

    async def query_workers(
        self,
        *,
        status_in: Optional[Sequence[WorkerStatus]] = None,
        worker_id_contains: Optional[str] = None,
        filter_logic: Literal["and", "or"] = "and",
        sort_by: Optional[str] = None,
        sort_order: Literal["asc", "desc"] = "asc",
        limit: int = -1,
        offset: int = 0,
    ) -> Sequence[Worker]:
        """Delegate `query_workers` to the underlying store (locked), forwarding every filter."""
        with self._lock:
            return await self.store.query_workers(
                status_in=status_in,
                worker_id_contains=worker_id_contains,
                # Forward the caller's choice; dropping it would silently make
                # the underlying store fall back to its own default.
                filter_logic=filter_logic,
                sort_by=sort_by,
                sort_order=sort_order,
                limit=limit,
                offset=offset,
            )

    async def get_worker_by_id(self, worker_id: str) -> Optional[Worker]:
        """Delegate `get_worker_by_id` to the underlying store (locked)."""
        with self._lock:
            return await self.store.get_worker_by_id(worker_id)

    async def update_worker(
        self,
        worker_id: str,
        heartbeat_stats: Dict[str, Any] | Unset = UNSET,
    ) -> Worker:
        """Delegate `update_worker` to the underlying store (locked)."""
        with self._lock:
            return await self.store.update_worker(
                worker_id=worker_id,
                heartbeat_stats=heartbeat_stats,
            )

    def complete_job(self, summary: Optional[Dict[str, Any]] = None) -> None:
        """Delegate job completion to the underlying store (locked).

        Does nothing when the underlying store has no `complete_job` method.
        """
        with self._lock:
            # We check if the underlying store has the method to be safe,
            # though base LightningStore now has it.
            if hasattr(self.store, "complete_job"):
                self.store.complete_job(summary)
@@ -0,0 +1,142 @@
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ import time
4
+ from typing import Awaitable, Callable, Dict, List, Tuple
5
+
6
+ from mantisdk.types import Attempt, AttemptedRollout, AttemptStatus, Rollout, RolloutConfig, RolloutStatus
7
+
8
# Type of an awaitable callback that takes a string and a RolloutStatus and
# resolves to a Rollout. The string is presumably the rollout id -- confirm
# against the callers.
UpdateRolloutStatus = Callable[[str, RolloutStatus], Awaitable[Rollout]]
# Type of an awaitable callback that takes two strings and an AttemptStatus and
# resolves to an Attempt. The strings are presumably (rollout_id, attempt_id)
# -- confirm against the callers.
UpdateAttemptStatus = Callable[[str, str, AttemptStatus], Awaitable[Attempt]]
10
+
11
+
12
# Ascending bucket boundaries spanning 1e-6 up to 300.
# NOTE(review): presumably latency histogram bounds in seconds -- confirm
# against the metrics code that consumes this list.
LATENCY_BUCKETS = [
    # 1e-6 .. 5e-4
    0.000001, 0.000002, 0.000005,
    0.00001, 0.00002, 0.00005,
    0.0001, 0.0002, 0.0005,
    # 1e-3 .. 7e-2
    0.001, 0.002, 0.003, 0.005, 0.007,
    0.01, 0.015, 0.02, 0.03, 0.05, 0.07,
    # 0.1 .. 7
    0.1, 0.2, 0.3, 0.5, 0.7,
    1.0, 2.0, 3.0, 5.0, 7.0,
    # 10 .. 300
    10.0, 12.0, 15.0, 20.0, 25.0,
    30.0, 40.0, 50.0, 60.0, 90.0,
    120.0, 180.0, 240.0, 300.0,
]
58
+
59
+
60
async def rollout_status_from_attempt(
    attempt: Attempt,
    config: RolloutConfig,
) -> RolloutStatus:
    """
    Derive the rollout status implied by the status of one of its attempts.

    Args:
        attempt: The attempt whose `status` and `sequence_id` are inspected.
        config: Supplies `retry_condition` (statuses that may be retried) and
            `max_attempts`.

    Returns:
        The attempt's own status for "preparing", "running" and "succeeded";
        "requeuing" when a failed/timeout/unresponsive attempt is listed in
        `retry_condition` and its `sequence_id` is below `max_attempts`;
        "failed" otherwise.

    Raises:
        ValueError: If the attempt status is none of the six handled values.
    """
    current = attempt.status

    # Live and successful states map one-to-one onto the rollout.
    if current in ("preparing", "running", "succeeded"):
        return current

    # Anything that is not one of the known failure modes is rejected outright.
    if current not in ("failed", "timeout", "unresponsive"):
        raise ValueError(f"Invalid attempt status: {attempt.status}")

    # Retry only when the policy lists this failure mode and attempts remain.
    if current in config.retry_condition and attempt.sequence_id < config.max_attempts:
        return "requeuing"

    return "failed"
85
+
86
+
87
async def scan_unhealthy_rollouts(
    rollouts: List[AttemptedRollout],
) -> Dict[Tuple[str, str], AttemptStatus]:
    """
    Perform health check on all running rollouts in the store.

    This method should be called periodically to:

    1. Check for timed-out attempts (running longer than `config.timeout_seconds`
       since the attempt's `start_time`)
    2. Check for unresponsive attempts (no heartbeat for longer than
       `config.unresponsive_seconds`; when no heartbeat was ever recorded the
       attempt's `start_time` is used as the reference instead)

    The timeout check takes precedence: an attempt that is both timed out and
    unresponsive is reported as "timeout".

    This operation is completely unlocked. The caller is responsible for locking the store.

    Args:
        rollouts: The list of running rollouts to check.

    Returns:
        A dictionary mapping `(rollout_id, attempt_id)` to the new attempt
        status ("timeout" or "unresponsive"). Healthy attempts are omitted.
    """
    current_time = time.time()
    updates: Dict[Tuple[str, str], AttemptStatus] = {}

    for rollout in rollouts:
        config = rollout.config  # policy for retry and timeout

        # Get the latest attempt for this rollout
        latest_attempt = rollout.attempt
        if not latest_attempt:
            # This should not happen
            continue

        key = (latest_attempt.rollout_id, latest_attempt.attempt_id)

        # Check for timeout condition (based on attempt start_time, instead of rollout start_time)
        if config.timeout_seconds is not None and current_time - latest_attempt.start_time > config.timeout_seconds:
            updates[key] = "timeout"
            continue

        if config.unresponsive_seconds is None:
            # Unresponsiveness detection is disabled for this rollout.
            continue

        # Use the last heartbeat when one exists, otherwise fall back to the
        # attempt start. Compare against None explicitly: a heartbeat timestamp
        # of 0.0 is falsy but is still a recorded heartbeat.
        last_heartbeat = latest_attempt.last_heartbeat_time
        last_seen = latest_attempt.start_time if last_heartbeat is None else last_heartbeat
        if current_time - last_seen > config.unresponsive_seconds:
            updates[key] = "unresponsive"

    return updates
@@ -0,0 +1,16 @@
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ from .agentops import AgentOpsTracer
4
+ from .base import Tracer, clear_active_tracer, get_active_tracer, set_active_tracer
5
+ from .dummy import DummyTracer
6
+ from .otel import OtelTracer
7
+
8
# Names exported by a star-import of this package; each one is imported from a
# submodule at the top of this file.
__all__ = [
    "AgentOpsTracer",
    "Tracer",
    "OtelTracer",
    "DummyTracer",
    "get_active_tracer",
    "set_active_tracer",
    "clear_active_tracer",
]