mantisdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mantisdk might be problematic. Click here for more details.
- mantisdk/__init__.py +22 -0
- mantisdk/adapter/__init__.py +15 -0
- mantisdk/adapter/base.py +94 -0
- mantisdk/adapter/messages.py +270 -0
- mantisdk/adapter/triplet.py +1028 -0
- mantisdk/algorithm/__init__.py +39 -0
- mantisdk/algorithm/apo/__init__.py +5 -0
- mantisdk/algorithm/apo/apo.py +889 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
- mantisdk/algorithm/base.py +162 -0
- mantisdk/algorithm/decorator.py +264 -0
- mantisdk/algorithm/fast.py +250 -0
- mantisdk/algorithm/gepa/__init__.py +59 -0
- mantisdk/algorithm/gepa/adapter.py +459 -0
- mantisdk/algorithm/gepa/gepa.py +364 -0
- mantisdk/algorithm/gepa/lib/__init__.py +18 -0
- mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
- mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
- mantisdk/algorithm/gepa/lib/api.py +375 -0
- mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
- mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
- mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
- mantisdk/algorithm/gepa/lib/core/result.py +233 -0
- mantisdk/algorithm/gepa/lib/core/state.py +636 -0
- mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
- mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
- mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
- mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
- mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
- mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
- mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
- mantisdk/algorithm/gepa/lib/py.typed +0 -0
- mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
- mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
- mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
- mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
- mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
- mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
- mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
- mantisdk/algorithm/gepa/tracing.py +105 -0
- mantisdk/algorithm/utils.py +177 -0
- mantisdk/algorithm/verl/__init__.py +5 -0
- mantisdk/algorithm/verl/interface.py +202 -0
- mantisdk/cli/__init__.py +56 -0
- mantisdk/cli/prometheus.py +115 -0
- mantisdk/cli/store.py +131 -0
- mantisdk/cli/vllm.py +29 -0
- mantisdk/client.py +408 -0
- mantisdk/config.py +348 -0
- mantisdk/emitter/__init__.py +43 -0
- mantisdk/emitter/annotation.py +370 -0
- mantisdk/emitter/exception.py +54 -0
- mantisdk/emitter/message.py +61 -0
- mantisdk/emitter/object.py +117 -0
- mantisdk/emitter/reward.py +320 -0
- mantisdk/env_var.py +156 -0
- mantisdk/execution/__init__.py +15 -0
- mantisdk/execution/base.py +64 -0
- mantisdk/execution/client_server.py +443 -0
- mantisdk/execution/events.py +69 -0
- mantisdk/execution/inter_process.py +16 -0
- mantisdk/execution/shared_memory.py +282 -0
- mantisdk/instrumentation/__init__.py +119 -0
- mantisdk/instrumentation/agentops.py +314 -0
- mantisdk/instrumentation/agentops_langchain.py +45 -0
- mantisdk/instrumentation/litellm.py +83 -0
- mantisdk/instrumentation/vllm.py +81 -0
- mantisdk/instrumentation/weave.py +500 -0
- mantisdk/litagent/__init__.py +11 -0
- mantisdk/litagent/decorator.py +536 -0
- mantisdk/litagent/litagent.py +252 -0
- mantisdk/llm_proxy.py +1890 -0
- mantisdk/logging.py +370 -0
- mantisdk/reward.py +7 -0
- mantisdk/runner/__init__.py +11 -0
- mantisdk/runner/agent.py +845 -0
- mantisdk/runner/base.py +182 -0
- mantisdk/runner/legacy.py +309 -0
- mantisdk/semconv.py +170 -0
- mantisdk/server.py +401 -0
- mantisdk/store/__init__.py +23 -0
- mantisdk/store/base.py +897 -0
- mantisdk/store/client_server.py +2092 -0
- mantisdk/store/collection/__init__.py +30 -0
- mantisdk/store/collection/base.py +587 -0
- mantisdk/store/collection/memory.py +970 -0
- mantisdk/store/collection/mongo.py +1412 -0
- mantisdk/store/collection_based.py +1823 -0
- mantisdk/store/insight.py +648 -0
- mantisdk/store/listener.py +58 -0
- mantisdk/store/memory.py +396 -0
- mantisdk/store/mongo.py +165 -0
- mantisdk/store/sqlite.py +3 -0
- mantisdk/store/threading.py +357 -0
- mantisdk/store/utils.py +142 -0
- mantisdk/tracer/__init__.py +16 -0
- mantisdk/tracer/agentops.py +242 -0
- mantisdk/tracer/base.py +287 -0
- mantisdk/tracer/dummy.py +106 -0
- mantisdk/tracer/otel.py +555 -0
- mantisdk/tracer/weave.py +677 -0
- mantisdk/trainer/__init__.py +6 -0
- mantisdk/trainer/init_utils.py +263 -0
- mantisdk/trainer/legacy.py +367 -0
- mantisdk/trainer/registry.py +12 -0
- mantisdk/trainer/trainer.py +618 -0
- mantisdk/types/__init__.py +6 -0
- mantisdk/types/core.py +553 -0
- mantisdk/types/resources.py +204 -0
- mantisdk/types/tracer.py +515 -0
- mantisdk/types/tracing.py +218 -0
- mantisdk/utils/__init__.py +1 -0
- mantisdk/utils/id.py +18 -0
- mantisdk/utils/metrics.py +1025 -0
- mantisdk/utils/otel.py +578 -0
- mantisdk/utils/otlp.py +536 -0
- mantisdk/utils/server_launcher.py +1045 -0
- mantisdk/utils/system_snapshot.py +81 -0
- mantisdk/verl/__init__.py +8 -0
- mantisdk/verl/__main__.py +6 -0
- mantisdk/verl/async_server.py +46 -0
- mantisdk/verl/config.yaml +27 -0
- mantisdk/verl/daemon.py +1154 -0
- mantisdk/verl/dataset.py +44 -0
- mantisdk/verl/entrypoint.py +248 -0
- mantisdk/verl/trainer.py +549 -0
- mantisdk-0.1.0.dist-info/METADATA +119 -0
- mantisdk-0.1.0.dist-info/RECORD +190 -0
- mantisdk-0.1.0.dist-info/WHEEL +4 -0
- mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
- mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
mantisdk/store/base.py
ADDED
|
@@ -0,0 +1,897 @@
|
|
|
1
|
+
# Copyright (c) Microsoft. All rights reserved.
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple, TypedDict, TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
|
8
|
+
|
|
9
|
+
from mantisdk.types import (
|
|
10
|
+
Attempt,
|
|
11
|
+
AttemptedRollout,
|
|
12
|
+
AttemptStatus,
|
|
13
|
+
EnqueueRolloutRequest,
|
|
14
|
+
NamedResources,
|
|
15
|
+
ResourcesUpdate,
|
|
16
|
+
Rollout,
|
|
17
|
+
RolloutConfig,
|
|
18
|
+
RolloutMode,
|
|
19
|
+
RolloutStatus,
|
|
20
|
+
Span,
|
|
21
|
+
TaskInput,
|
|
22
|
+
Worker,
|
|
23
|
+
WorkerStatus,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from .listener import StorageListener
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def is_queuing(rollout: Rollout) -> bool:
    """Tell whether the rollout is still waiting to be picked up.

    A rollout counts as queued while its status is `"queuing"` or `"requeuing"`.
    """
    return rollout.status in ("queuing", "requeuing")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def is_running(rollout: Rollout) -> bool:
    """Tell whether the rollout is currently being executed.

    A rollout counts as running while its status is `"preparing"` or `"running"`.
    """
    return rollout.status in ("preparing", "running")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def is_finished(rollout: Rollout) -> bool:
    """Tell whether the rollout has reached a terminal status.

    The terminal statuses are `"failed"`, `"succeeded"` and `"cancelled"`.
    """
    return rollout.status in ("failed", "succeeded", "cancelled")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class _UnsetType:
    """A sentinel type to indicate an unset value."""

    # Empty slots: instances carry no attributes and get no per-instance `__dict__`.
    __slots__ = ()

    def __repr__(self) -> str:
        """Render the sentinel as the bare word `UNSET`."""
        return "UNSET"

    def __reduce__(self):
        """Pickle the sentinel as a call to `_get_unset()`.

        Unpickling therefore returns whatever `_get_unset` returns instead of
        building a fresh `_UnsetType` instance.
        """
        return (_get_unset, ())
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _get_unset() -> _UnsetType:
    """Return the module-level `UNSET` object.

    Used as the reconstructor in `_UnsetType.__reduce__`.
    """
    return UNSET
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Module-level sentinel instance; `_get_unset()` returns this same object.
UNSET = _UnsetType()
Unset = _UnsetType  # Alias for convenience
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class LightningStoreCapabilities(TypedDict, total=False):
    """Capability flags advertised by a LightningStore implementation.

    The TypedDict is declared with `total=False`, so all keys are optional;
    a key that is absent is to be read as false.
    """

    thread_safe: bool
    """Whether the store is thread-safe."""
    async_safe: bool
    """Whether the store is async-safe."""
    zero_copy: bool
    """Whether the store has only one copy across all threads/processes."""
    otlp_traces: bool
    """Whether the store supports OTLP/HTTP traces."""
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class LightningStoreStatistics(TypedDict, total=False):
    """Statistics of a LightningStore implementation.

    The TypedDict is declared with `total=False`, so a store may report only
    the subset of keys it can compute.
    """

    name: str
    """Name of the store implementation."""
    total_rollouts: int
    """Total number of rollouts in the store."""
    total_attempts: int
    """Total number of attempts in the store."""
    total_spans: int
    """Total number of spans in the store."""
    total_resources: int
    """Total number of resources in the store."""
    total_workers: int
    """Total number of workers in the store."""
    uptime: float
    """Uptime since the store was started."""

    # Memory-related statistics
    total_span_bytes: int
    """Total number of bytes of spans in the store."""
    eviction_threshold_bytes: int
    """Eviction threshold for spans in bytes."""
    safe_threshold_bytes: int
    """Safe threshold for spans in bytes."""
    memory_capacity_bytes: int
    """Memory capacity of the store in bytes."""
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class LightningStore:
|
|
108
|
+
"""Contract for the persistent control-plane that coordinates training rollouts.
|
|
109
|
+
|
|
110
|
+
A `LightningStore` mediates every interaction between algorithms and runners:
|
|
111
|
+
|
|
112
|
+
- **Rollout lifecycle:** accept new rollouts, queue them for execution, create attempts,
|
|
113
|
+
and drive the rollout status machine (`"queuing"` → `"preparing"` → `"running"` →
|
|
114
|
+
`{"succeeded","failed","cancelled"}` or `"requeuing"` when a retry is justified).
|
|
115
|
+
- **Attempt tracking:** record each execution attempt, including progress heartbeats,
|
|
116
|
+
retry sequencing, and terminal states such as `"timeout"` or `"unresponsive"`.
|
|
117
|
+
- **Span ingest:** capture structured telemetry emitted by runners (either as native
|
|
118
|
+
[`Span`][mantisdk.Span] objects or as `opentelemetry.sdk.trace.ReadableSpan`
|
|
119
|
+
instances) so that algorithms can reconstruct trajectories and rewards.
|
|
120
|
+
- **Resource versioning:** manage immutable snapshots of named resources
|
|
121
|
+
(prompt templates, model checkpoints, proxy endpoints, …) and expose a single
|
|
122
|
+
"latest" snapshot that runners can fetch just after claiming work.
|
|
123
|
+
|
|
124
|
+
Implementations must provide thread-safe/async-safe semantics: each coroutine should
|
|
125
|
+
appear atomic to callers even when multiple algorithms or runners call the API concurrently.
|
|
126
|
+
Unless stated otherwise, missing identifiers should result in a `ValueError`.
|
|
127
|
+
"""
|
|
128
|
+
|
|
129
|
+
def __init__(self, listeners: Optional[Sequence[StorageListener]] = None):
|
|
130
|
+
self.listeners: Sequence[StorageListener] = listeners or []
|
|
131
|
+
|
|
132
|
+
@property
def capabilities(self) -> LightningStoreCapabilities:
    """Report the store's capability flags merged with those of its listeners.

    Every flag starts out as `False`. Each listener's `capabilities` mapping
    is then applied in listener order, so a later listener overwrites the
    values set by an earlier one.
    """
    merged = LightningStoreCapabilities(thread_safe=False, async_safe=False, zero_copy=False, otlp_traces=False)
    for attached in self.listeners:
        merged.update(attached.capabilities)
    return merged
|
|
147
|
+
|
|
148
|
+
async def statistics(self) -> LightningStoreStatistics:
    """Report statistics about the store.

    This base implementation only fills in `name`, using the name of the
    instance's class.
    """
    stats: LightningStoreStatistics = {"name": self.__class__.__name__}
    return stats
|
|
153
|
+
|
|
154
|
+
def otlp_traces_endpoint(self) -> str:
    """Return the OTLP/HTTP traces endpoint of the store.

    Traces can carry the rollout ID and attempt ID (and optionally the
    sequence ID) in the "resource" of the spans. A store that supports OTLP
    should be able to receive such traces and save them via
    [`add_span`][mantisdk.LightningStore.add_span] or
    [`add_otel_span`][mantisdk.LightningStore.add_otel_span].

    The endpoint should be compatible with the
    [OTLP HTTP protocol](https://opentelemetry.io/docs/specs/otlp/).
    It is not necessarily compatible with the OTLP gRPC protocol.

    The returned endpoint usually ends with `/v1/traces`.

    Returns:
        The first non-empty endpoint reported by a listener, in listener order.

    Raises:
        NotImplementedError: When no listener reports an endpoint.
    """
    # Ask each listener in turn; the first one that reports an endpoint wins.
    for attached in self.listeners:
        candidate = attached.otlp_traces_endpoint()
        if not candidate:
            continue
        return candidate
    raise NotImplementedError()
|
|
174
|
+
|
|
175
|
+
def get_otlp_headers(self) -> Dict[str, str]:
    """Return the authentication headers for OTLP export.

    Returns:
        The first non-empty header mapping reported by a listener, in listener
        order, or an empty dict when no listener reports any headers.
    """
    # Ask each listener in turn; the first one that reports headers wins.
    for attached in self.listeners:
        candidate = attached.get_otlp_headers()
        if not candidate:
            continue
        return candidate
    return {}
|
|
183
|
+
|
|
184
|
+
def complete_job(self, summary: Optional[Dict[str, Any]] = None) -> None:
    """Complete the job with an optional summary.

    Delegates to every listener that exposes a callable `complete` attribute
    (e.g., InsightTracker) so it can send the job.completed event with the
    provided summary. Listeners without such a hook are skipped.

    Args:
        summary: Optional dictionary containing job summary data (e.g., GEPA results).
    """
    for listener in self.listeners:
        complete = getattr(listener, "complete", None)
        # A bare `hasattr` check would also accept a non-callable `complete`
        # attribute (e.g. a `None` placeholder); calling it raises TypeError
        # and keeps the remaining listeners from being notified.
        if callable(complete):
            complete(summary)
|
|
196
|
+
|
|
197
|
+
async def start_rollout(
    self,
    input: TaskInput,
    mode: RolloutMode | None = None,
    resources_id: str | None = None,
    config: RolloutConfig | None = None,
    metadata: Dict[str, Any] | None = None,
    worker_id: str | None = None,
) -> AttemptedRollout:
    """Register a rollout and immediately create its first attempt.

    !!! note
        Use [`enqueue_rollout()`][mantisdk.LightningStore.enqueue_rollout] when the
        caller only wants to submit work for later scheduling.

    The rollout must be persisted with `status="preparing"` and an initial attempt
    with `sequence_id == 1` so the caller can begin execution without visiting the
    public queue. Implementations are expected to:

    1. Generate a unique `rollout_id` and `attempt_id`.
    2. Record `start_time` for both rollout and attempt based on the current clock.
    3. Copy `config` and `metadata` so later mutations do not leak shared references.
    4. Resolve `resources_id` to the latest resource snapshot when `None` is supplied.

    Args:
        input: Arbitrary task payload supplied by an algorithm.
        mode: Optional semantic mode for downstream analytics (`"train"`, `"val"`, `"test"`).
        resources_id: Concrete resource snapshot to execute against; defaults to the latest stored snapshot.
        config: Rollout retry/timeout policy. Should default to a fresh [`RolloutConfig`][mantisdk.RolloutConfig].
        metadata: Free-form metadata persisted verbatim with the rollout.
        worker_id: Optional worker identifier to associate the new attempt with.

    Returns:
        The fully-populated [`AttemptedRollout`][mantisdk.AttemptedRollout] including
        the just-created attempt.

    Raises:
        NotImplementedError: Subclasses must provide durable storage for the rollout.
        ValueError: Implementations should raise when `resources_id` does not exist.
    """
    # Interface stub: concrete stores override this method.
    raise NotImplementedError()
|
|
238
|
+
|
|
239
|
+
async def enqueue_rollout(
    self,
    input: TaskInput,
    mode: Literal["train", "val", "test"] | None = None,
    resources_id: str | None = None,
    config: RolloutConfig | None = None,
    metadata: Dict[str, Any] | None = None,
) -> Rollout:
    """Persist a rollout in `queuing` state so runners can claim it later.

    !!! note
        Different from [`start_rollout()`][mantisdk.LightningStore.start_rollout],
        this method is called when the caller only wants to submit work for later scheduling.

    Implementations must generate a unique `rollout_id`, stamp `start_time` with
    the current time, default `config` to a fresh [`RolloutConfig`][mantisdk.RolloutConfig],
    and insert the rollout at the tail of the scheduling queue. No attempt is created yet.

    Args:
        input: Arbitrary task payload supplied by an algorithm.
        mode: Optional semantic mode indicator (`"train"`, `"val"`, `"test"`).
        resources_id: Resource snapshot used when a runner eventually executes the rollout.
        config: Fine-grained retry/timeout parameters to persist with the rollout.
        metadata: Free-form metadata stored verbatim with the rollout record.

    Returns:
        The stored [`Rollout`][mantisdk.Rollout] in `queuing` status.

    Raises:
        NotImplementedError: Subclasses must persist the rollout.
        ValueError: Implementations should raise when `resources_id` does not exist.
    """
    # Interface stub: concrete stores override this method.
    raise NotImplementedError()
|
|
272
|
+
|
|
273
|
+
async def enqueue_many_rollouts(self, rollouts: Sequence[EnqueueRolloutRequest]) -> Sequence[Rollout]:
    """Persist multiple rollouts in `queuing` state.

    The implementation can delegate to [`enqueue_rollout()`][mantisdk.LightningStore.enqueue_rollout]
    per request, preserving the input ordering. Subclasses can override to provide
    more efficient bulk enqueue semantics.

    Args:
        rollouts: Rollout submission payloads mirroring [`enqueue_rollout()`][mantisdk.LightningStore.enqueue_rollout]'s
            parameters. Each entry requires `input` and can optionally include other fields.

    Returns:
        Rollouts enqueued in the same order as `rollouts`.

    Raises:
        NotImplementedError: This base method provides no implementation.
    """
    # Interface stub: concrete stores override this method.
    raise NotImplementedError()
|
|
288
|
+
|
|
289
|
+
async def dequeue_rollout(self, worker_id: Optional[str] = None) -> Optional[AttemptedRollout]:
    """Claim the oldest queued rollout and transition it to `preparing`.

    This method does not block.

    Retrieval must be FIFO across rollouts that remain in `queuing` or `requeuing`
    state. When a rollout is claimed, implementations must:

    * Transition its status to `"preparing"`.
    * Create a new attempt with `status="preparing"` and `sequence_id` equal to
      the number of attempts already registered for the rollout plus one.
    * Return an [`AttemptedRollout`][mantisdk.AttemptedRollout] snapshot so the
      runner knows both rollout metadata and the attempt identifier.
    * Optionally refresh the caller's [`Worker`][mantisdk.Worker] telemetry
      (e.g., `last_dequeue_time`) when `worker_id` is provided.

    Args:
        worker_id: Optional worker identifier to associate the claimed attempt with.

    Returns:
        The next attempt to execute, or `None` when no eligible rollouts are queued.

    Raises:
        NotImplementedError: Subclasses must implement queue retrieval.
    """
    # Interface stub: concrete stores override this method.
    raise NotImplementedError()
|
|
315
|
+
|
|
316
|
+
async def dequeue_many_rollouts(
    self,
    *,
    limit: int = 1,
    worker_id: Optional[str] = None,
) -> Sequence[AttemptedRollout]:
    """Claim up to `limit` queued rollouts without blocking.

    The implementation can repeatedly invoke
    [`dequeue_rollout()`][mantisdk.LightningStore.dequeue_rollout] until reaching
    the requested limit or the queue is empty. Subclasses can override it to fetch
    multiple rollouts atomically.

    Args:
        limit: Maximum number of rollouts to claim. Non-positive values return an empty list.
        worker_id: Optional worker identifier passed through to each dequeue call.

    Returns:
        Attempted rollouts claimed in FIFO order. May contain fewer than `limit` entries
        when the queue is exhausted.

    Raises:
        NotImplementedError: This base method provides no implementation.
    """
    # Interface stub: concrete stores override this method.
    raise NotImplementedError()
|
|
338
|
+
|
|
339
|
+
async def start_attempt(self, rollout_id: str, worker_id: Optional[str] = None) -> AttemptedRollout:
    """Create a manual retry attempt for an existing rollout.

    This is typically invoked by runners that wish to retry outside of the
    normal queue flow (for example in an online RL setup).
    Implementations must validate that the rollout exists, allocate a fresh `attempt_id`,
    increment the `sequence_id` monotonically, stamp the new attempt with `status="preparing"`,
    and return an up-to-date [`AttemptedRollout`][mantisdk.AttemptedRollout].

    Args:
        rollout_id: Unique identifier of the rollout receiving a new attempt.
        worker_id: Optional worker identifier to associate the new attempt with.

    Returns:
        The rollout paired with its newly-created attempt.

    Raises:
        NotImplementedError: Subclasses must implement attempt creation.
        ValueError: Implementations must raise when `rollout_id` is unknown.
    """
    # Interface stub: concrete stores override this method.
    raise NotImplementedError()
|
|
360
|
+
|
|
361
|
+
async def add_many_spans(self, spans: Sequence[Span]) -> Sequence[Span]:
    """Persist a sequence of pre-constructed spans emitted during rollout execution.

    Implementations can simply delegate to [`add_span()`][mantisdk.LightningStore.add_span] for each span.
    However, if the store supports bulk insertion, it can implement this method to improve performance.

    Args:
        spans: Spans to persist.

    Raises:
        NotImplementedError: This base method provides no implementation.
    """
    # Interface stub: concrete stores override this method.
    raise NotImplementedError()
|
|
368
|
+
|
|
369
|
+
async def add_span(self, span: Span) -> Optional[Span]:
    """Persist a pre-constructed span emitted during rollout execution.

    The provided [`Span`][mantisdk.Span] must already contain the `rollout_id`,
    `attempt_id`, and `sequence_id`. Implementations must:

    * Verify that both rollout and attempt exist.
    * Ensure span ordering remains strictly increasing per attempt (rejecting or keeping duplicates).
    * Treat the span arrival as a heartbeat: update the attempt's `last_heartbeat_time`
      and transition both attempt and rollout to `"running"` if they were still
      `"preparing"` or `"requeuing"`.

    Args:
        span: Fully populated span to persist.

    Returns:
        The stored span record (implementations may return a copy).
        Return `None` if the span was not added due to a duplicate.

    Raises:
        NotImplementedError: Subclasses must implement span persistence.
        ValueError: Implementations must raise when the referenced rollout or attempt is missing.
    """
    # Interface stub: concrete stores override this method.
    raise NotImplementedError()
|
|
393
|
+
|
|
394
|
+
async def add_otel_span(
    self,
    rollout_id: str,
    attempt_id: str,
    readable_span: ReadableSpan,
    sequence_id: int | None = None,
) -> Optional[Span]:
    """Convert and persist an OpenTelemetry span for a particular attempt.

    Implementations must transform the `readable_span` into a [`Span`][mantisdk.Span]
    (typically via [`Span.from_opentelemetry()`][mantisdk.Span.from_opentelemetry]),
    assign a strictly increasing `sequence_id` when one is not provided, and persist it
    using the same semantics as [`add_span()`][mantisdk.LightningStore.add_span].

    Args:
        rollout_id: Identifier of the rollout that produced the span.
        attempt_id: Attempt identifier the span belongs to.
        readable_span: OpenTelemetry span in SDK form.
        sequence_id: Optional explicit ordering hint. When omitted, call
            [`get_next_span_sequence_id()`][mantisdk.LightningStore.get_next_span_sequence_id]
            automatically.

    Returns:
        The stored span record. Return `None` if the span was not added due to a duplicate.

    Raises:
        NotImplementedError: Subclasses must implement span persistence.
        ValueError: Implementations must raise when the rollout or attempt is unknown.
    """
    # Interface stub: concrete stores override this method.
    raise NotImplementedError()
|
|
424
|
+
|
|
425
|
+
async def query_rollouts(
    self,
    *,
    status_in: Optional[Sequence[RolloutStatus]] = None,
    rollout_id_in: Optional[Sequence[str]] = None,
    rollout_id_contains: Optional[str] = None,
    filter_logic: Literal["and", "or"] = "and",
    sort_by: Optional[str] = None,
    sort_order: Literal["asc", "desc"] = "asc",
    limit: int = -1,
    offset: int = 0,
    # Deprecated fields
    status: Optional[Sequence[RolloutStatus]] = None,
    rollout_ids: Optional[Sequence[str]] = None,
) -> Sequence[Rollout]:
    """Retrieve rollouts filtered by status and/or explicit identifiers.

    This interface supports structured filtering, sorting, and pagination so
    callers can build simple dashboards without copying data out of the
    store. The legacy parameters `status` and `rollout_ids` remain valid and
    are treated as aliases for `status_in` and `rollout_id_in`
    respectively—when both the new and deprecated parameters are supplied
    the new parameters take precedence.

    Args:
        status_in: Optional whitelist of [`RolloutStatus`][mantisdk.RolloutStatus] values.
        rollout_id_in: Optional whitelist of rollout identifiers to include.
        rollout_id_contains: Optional substring match for rollout identifiers.
        filter_logic: Logical operator to combine filters.
        sort_by: Optional field to sort by. Must reference a numeric or string
            field on [`Rollout`][mantisdk.Rollout].
        sort_order: Direction to sort when `sort_by` is provided.
        limit: Maximum number of rows to return. Use `-1` for "no limit".
        offset: Number of rows to skip before returning results.
        status: Deprecated field. Use `status_in` instead.
        rollout_ids: Deprecated field. Use `rollout_id_in` instead.

    Returns:
        A sequence of matching rollouts (or [`AttemptedRollout`][mantisdk.AttemptedRollout]
        when attempts exist). Ordering is deterministic when `sort_by` is set.
        The return value is not guaranteed to be a list.

    Raises:
        NotImplementedError: Subclasses must implement the query.
    """
    # Interface stub: concrete stores override this method.
    raise NotImplementedError()
|
|
471
|
+
|
|
472
|
+
async def query_attempts(
    self,
    rollout_id: str,
    *,
    sort_by: Optional[str] = "sequence_id",
    sort_order: Literal["asc", "desc"] = "asc",
    limit: int = -1,
    offset: int = 0,
) -> Sequence[Attempt]:
    """List the attempts recorded for `rollout_id`.

    With the default arguments every attempt is returned, ordered by ascending
    `sequence_id` (oldest first). The keyword arguments let callers re-order or
    paginate the attempts so that large retry histories can be consumed lazily.

    Args:
        rollout_id: Identifier of the rollout whose attempts are requested.
        sort_by: Numeric or string field of [`Attempt`][mantisdk.Attempt] to
            order by. Defaults to `sequence_id`.
        sort_order: Direction of the sort, `"asc"` or `"desc"`.
        limit: Maximum number of results; `-1` means unlimited.
        offset: Offset into the results.

    Returns:
        The matching attempts, or an empty sequence when none exist.
        Callers must not assume the returned sequence is a `list`.

    Raises:
        NotImplementedError: Raised by this default body; subclasses must implement the query.
        ValueError: Implementations must raise when the rollout does not exist.
    """
    raise NotImplementedError()
|
|
503
|
+
|
|
504
|
+
async def get_rollout_by_id(self, rollout_id: str) -> Optional[Rollout]:
    """Look up a single rollout by identifier; the lookup must not mutate its state.

    Args:
        rollout_id: Identifier of the rollout to fetch.

    Returns:
        The matching rollout, or `None` when it is not found.

    Raises:
        NotImplementedError: Raised by this default body; subclasses must implement retrieval.
    """
    raise NotImplementedError()
|
|
517
|
+
|
|
518
|
+
async def get_latest_attempt(self, rollout_id: str) -> Optional[Attempt]:
    """Return the attempt of `rollout_id` that carries the highest `sequence_id`.

    Args:
        rollout_id: Identifier of the rollout to inspect.

    Returns:
        The most recent attempt, or `None` when the rollout has no attempts yet.

    Raises:
        NotImplementedError: Raised by this default body; subclasses must implement retrieval.
        ValueError: Implementations must raise when the rollout does not exist.
    """
    raise NotImplementedError()
|
|
532
|
+
|
|
533
|
+
async def query_resources(
    self,
    *,
    resources_id: Optional[str] = None,
    resources_id_contains: Optional[str] = None,
    # `filter_logic` is intentionally not offered here; no need for it has come up.
    sort_by: Optional[str] = None,
    sort_order: Literal["asc", "desc"] = "asc",
    limit: int = -1,
    offset: int = 0,
) -> Sequence[ResourcesUpdate]:
    """List stored resource snapshots, with optional filtering, sorting, and pagination.

    The query surface is kept lightweight so results are easy to embed in
    dashboards.

    Args:
        resources_id: Optional identifier of the resources to include.
        resources_id_contains: Optional substring that resources identifiers must contain.
        sort_by: Optional numeric or string field of
            [`ResourcesUpdate`][mantisdk.ResourcesUpdate] to order by.
        sort_order: Direction of the sort, `"asc"` or `"desc"`.
        limit: Maximum number of results; `-1` means unlimited.
        offset: Offset into the results.

    Returns:
        The matching [`ResourcesUpdate`][mantisdk.ResourcesUpdate] objects.
        By default the order is deterministic but otherwise undefined.
        Callers must not assume the returned sequence is a `list`.

    Raises:
        NotImplementedError: Raised by this default body; subclasses must implement retrieval.
    """
    raise NotImplementedError()
|
|
567
|
+
|
|
568
|
+
async def get_resources_by_id(self, resources_id: str) -> Optional[ResourcesUpdate]:
    """Fetch one named resource snapshot by its identifier.

    Args:
        resources_id: Identifier of the snapshot to fetch.

    Returns:
        The stored [`ResourcesUpdate`][mantisdk.ResourcesUpdate], or `None` when it is missing.

    Raises:
        NotImplementedError: Raised by this default body; subclasses must implement retrieval.
    """
    raise NotImplementedError()
|
|
581
|
+
|
|
582
|
+
async def get_latest_resources(self) -> Optional[ResourcesUpdate]:
    """Return the resource snapshot currently marked as the latest (global default).

    Returns:
        The current latest [`ResourcesUpdate`][mantisdk.ResourcesUpdate], or `None`
        when no resources have been registered yet.

    Raises:
        NotImplementedError: Raised by this default body; subclasses must implement retrieval.
    """
    raise NotImplementedError()
|
|
593
|
+
|
|
594
|
+
async def get_next_span_sequence_id(self, rollout_id: str, attempt_id: str) -> int:
    """Hand out the next strictly increasing sequence number used to order spans.

    Implementations must keep their counters so that repeated calls yield
    `1, 2, ...` with no gaps, unless spans were explicitly inserted with a custom
    `sequence_id`. A counter may be scoped per rollout or per attempt; either
    way, the numbers seen by spans of the specified attempt must be strictly
    increasing so that traces remain totally ordered.

    See [Distributed Tracing][distributed-tracing] for detailed motivations.

    Args:
        rollout_id: Identifier of the rollout emitting spans.
        attempt_id: Identifier of the attempt the upcoming span belongs to.

    Returns:
        The next integer sequence identifier, unique within the attempt.

    Raises:
        NotImplementedError: Raised by this default body; subclasses must provide the allocator.
        ValueError: Implementations must raise when the rollout or attempt does not exist.
    """
    raise NotImplementedError()
|
|
617
|
+
|
|
618
|
+
async def get_many_span_sequence_ids(self, rollout_attempt_ids: Sequence[Tuple[str, str]]) -> Sequence[int]:
    """Allocate, in bulk, the next strictly increasing sequence numbers used to order spans.

    Implementations may simply delegate to
    [`get_next_span_sequence_id()`][mantisdk.LightningStore.get_next_span_sequence_id]
    for each rollout and attempt.

    Args:
        rollout_attempt_ids: Tuples of rollout identifier and attempt identifier.

    Returns:
        The allocated sequence numbers.

    Raises:
        NotImplementedError: Raised by this default body.
    """
    raise NotImplementedError()
|
|
631
|
+
|
|
632
|
+
async def wait_for_rollouts(self, *, rollout_ids: List[str], timeout: Optional[float] = None) -> List[Rollout]:
    """Wait until the given rollouts reach a terminal status or the timeout expires.

    The terminal statuses are `"succeeded"`, `"failed"`, and `"cancelled"`. Once
    the timeout elapses, implementations should return only the rollouts that are
    already terminal and omit the rest.

    !!! warning
        Calling this method with a long timeout is dangerous and might block
        the event loop. It is a good idea to poll it instead, checking whether
        newly completed rollouts are coming in. Implementations should write
        their sleep logic carefully to avoid busy-waiting.

    Args:
        rollout_ids: Identifiers of the rollouts to watch.
        timeout: Maximum number of seconds to wait. `None` waits indefinitely.

    Returns:
        The rollouts that finished before the deadline, in arbitrary order.

    Raises:
        NotImplementedError: Raised by this default body; subclasses must implement waiting semantics.
        ValueError: Implementations must raise when a rollout identifier is unknown.
    """
    raise NotImplementedError()
|
|
657
|
+
|
|
658
|
+
async def query_spans(
    self,
    rollout_id: str,
    attempt_id: str | Literal["latest"] | None = None,
    *,
    # Filtering
    trace_id: Optional[str] = None,
    trace_id_contains: Optional[str] = None,
    span_id: Optional[str] = None,
    span_id_contains: Optional[str] = None,
    parent_id: Optional[str] = None,
    parent_id_contains: Optional[str] = None,
    name: Optional[str] = None,
    name_contains: Optional[str] = None,
    filter_logic: Literal["and", "or"] = "and",
    # Pagination
    limit: int = -1,
    offset: int = 0,
    # Sorting
    sort_by: Optional[str] = "sequence_id",
    sort_order: Literal["asc", "desc"] = "asc",
) -> Sequence[Span]:
    """Fetch the spans stored for a rollout, optionally narrowed to a single attempt.

    `attempt_id` selects the scope: a concrete identifier targets that attempt,
    `"latest"` is a convenience that resolves the most recent attempt before any
    filter is evaluated, and `None` makes spans across every attempt eligible.
    The optional filters cover the most common debugging scenarios (matching
    `trace_id`/`span_id`/`parent_id`, or substring matches on the span name).
    Unless told otherwise, results are sorted by `sequence_id`, oldest first.
    Implementations may raise a `RuntimeError` when spans were evicted or expired.

    Args:
        rollout_id: Identifier of the rollout being inspected.
        attempt_id: Attempt identifier to filter by. Pass `"latest"` to retrieve only
            the most recent attempt, or `None` to return spans across all attempts.
        trace_id: Optional trace ID to filter by.
        trace_id_contains: Optional substring that trace IDs must contain.
        span_id: Optional span ID to filter by.
        span_id_contains: Optional substring that span IDs must contain.
        parent_id: Optional parent span ID to filter by.
        parent_id_contains: Optional substring that parent span IDs must contain.
        name: Optional span name to filter by.
        name_contains: Optional substring that span names must contain.
        filter_logic: Logical operator used to combine the optional filters above.
            The `rollout_id` argument is always applied with AND semantics.
        limit: Maximum number of results; `-1` means unlimited.
        offset: Offset into the results.
        sort_by: Numeric or string field of [`Span`][mantisdk.Span] to order by.
        sort_order: Direction of the sort, `"asc"` or `"desc"`.

    Returns:
        The matching spans in order (possibly empty).
        Callers must not assume the returned sequence is a `list`.

    Raises:
        NotImplementedError: Raised by this default body; subclasses must implement the query.
        ValueError: Implementations must raise when the rollout or attempt is unknown.
    """
    raise NotImplementedError()
|
|
719
|
+
|
|
720
|
+
async def add_resources(self, resources: NamedResources) -> ResourcesUpdate:
    """Store a new immutable snapshot of named resources and make it the latest one.

    Implementations must assign a fresh `resources_id` and guarantee that later
    calls to [`get_latest_resources()`][mantisdk.LightningStore.get_latest_resources]
    return the snapshot produced here.

    Args:
        resources: Mapping of resource names to their serialized payloads.

    Returns:
        The stored [`ResourcesUpdate`][mantisdk.ResourcesUpdate], including its generated id.

    Raises:
        NotImplementedError: Raised by this default body; subclasses must implement resource persistence.
    """
    raise NotImplementedError()
|
|
737
|
+
|
|
738
|
+
async def update_resources(self, resources_id: str, resources: NamedResources) -> ResourcesUpdate:
    """Overwrite or extend an existing resource snapshot and make it the latest one.

    Algorithms that keep mutable resources (e.g., model checkpoints) under a
    stable identifier are the typical users of this API.

    Args:
        resources_id: Identifier of the snapshot to replace.
        resources: Updated mapping of resource names to payloads.

    Returns:
        The persisted [`ResourcesUpdate`][mantisdk.ResourcesUpdate].

    Raises:
        NotImplementedError: Raised by this default body; subclasses must implement resource persistence.
        ValueError: Implementations must raise when `resources_id` does not exist.
    """
    raise NotImplementedError()
|
|
756
|
+
|
|
757
|
+
async def update_rollout(
    self,
    rollout_id: str,
    input: TaskInput | Unset = UNSET,
    mode: Optional[Literal["train", "val", "test"]] | Unset = UNSET,
    resources_id: Optional[str] | Unset = UNSET,
    status: RolloutStatus | Unset = UNSET,
    config: RolloutConfig | Unset = UNSET,
    metadata: Optional[Dict[str, Any]] | Unset = UNSET,
) -> Rollout:
    """Modify fields of an existing rollout and, when `status` is given, drive its status transition.

    Every optional parameter defaults to the sentinel
    [`UNSET`][mantisdk.store.base.UNSET] so that an omitted field can be told
    apart from an explicit `None` assignment. Implementations must:

    * Validate that the rollout exists before mutating it.
    * Replace each property for which a concrete value (including `None`) is supplied.
    * Set `end_time` and signal any waiters when the status switches into a terminal state.
    * Ensure the rollout is enqueued exactly once when the status re-enters a queueing state.

    Args:
        rollout_id: Identifier of the rollout to update.
        input: Replacement task payload; pass `None` to explicitly clear the input.
        mode: Replacement rollout mode.
        resources_id: Replacement reference to a resources snapshot.
        status: Target rollout status.
        config: Replacement retry/timeout configuration.
        metadata: Replacement metadata dictionary.

    Returns:
        The updated rollout record.

    Raises:
        NotImplementedError: Raised by this default body; subclasses must implement mutation logic.
        ValueError: Implementations must raise when the rollout is unknown or the update is invalid.
    """
    raise NotImplementedError()
|
|
794
|
+
|
|
795
|
+
async def update_attempt(
    self,
    rollout_id: str,
    attempt_id: str | Literal["latest"],
    status: AttemptStatus | Unset = UNSET,
    worker_id: str | Unset = UNSET,
    last_heartbeat_time: float | Unset = UNSET,
    metadata: Optional[Dict[str, Any]] | Unset = UNSET,
) -> Attempt:
    """Update the bookkeeping of an attempt: status, worker ownership, and heartbeats.

    An `attempt_id` of `"latest"` must target the attempt with the highest
    `sequence_id`; any other value must target that specific attempt. Once the
    latest attempt transitions to a terminal state, implementations should
    propagate the status change to the rollout (for example via
    [`rollout_status_from_attempt()`][mantisdk.store.utils.rollout_status_from_attempt]).

    As with [`update_rollout()`][mantisdk.LightningStore.update_rollout], the
    optional parameters default to the sentinel [`UNSET`][mantisdk.store.base.UNSET].

    If `worker_id` is present, the worker status will be updated following these rules:

    1. If attempt status is "succeeded" or "failed", the corresponding worker status will be set to "idle".
    2. If attempt status is "unresponsive" or "timeout", the corresponding worker status will be set to "unknown".
    3. Otherwise, the worker status will be set to "busy".

    Args:
        rollout_id: Identifier of the rollout whose attempt will be updated.
        attempt_id: Attempt identifier, or `"latest"` as a convenience.
        status: Replacement attempt status. Terminal statuses must set `end_time`.
        worker_id: Identifier of the worker currently processing the attempt.
        last_heartbeat_time: Wall-clock timestamp (seconds) of the latest heartbeat/span.
        metadata: Replacement metadata dictionary.

    Returns:
        The updated attempt record.

    Raises:
        NotImplementedError: Raised by this default body; subclasses must implement mutation logic.
        ValueError: Implementations must raise when the rollout or attempt is unknown.
    """
    raise NotImplementedError()
|
|
837
|
+
|
|
838
|
+
async def query_workers(
    self,
    *,
    status_in: Optional[Sequence[WorkerStatus]] = None,
    worker_id_contains: Optional[str] = None,
    filter_logic: Literal["and", "or"] = "and",
    sort_by: Optional[str] = None,
    sort_order: Literal["asc", "desc"] = "asc",
    limit: int = -1,
    offset: int = 0,
) -> Sequence[Worker]:
    """Query the workers known to the system, with optional filtering, sorting, and pagination.

    Args:
        status_in: Optional whitelist of [`WorkerStatus`][mantisdk.WorkerStatus] values.
        worker_id_contains: Optional substring that worker identifiers must contain.
        filter_logic: Logical operator used to combine the optional filters above.
        sort_by: Numeric or string field of [`Worker`][mantisdk.Worker] to order by.
        sort_order: Direction of the sort, `"asc"` or `"desc"`.
        limit: Maximum number of results; `-1` means unlimited.
        offset: Offset into the results.

    Returns:
        The matching workers, or an empty sequence when none exist.
        Callers must not assume the returned sequence is a `list`.

    Raises:
        NotImplementedError: Raised by this default body.
    """
    raise NotImplementedError()
|
|
865
|
+
|
|
866
|
+
async def get_worker_by_id(self, worker_id: str) -> Optional[Worker]:
    """Look up a single worker by its identifier.

    Args:
        worker_id: Identifier of the worker to fetch.

    Returns:
        The worker record when it exists, otherwise `None`.

    Raises:
        NotImplementedError: Raised by this default body; subclasses must implement lookup semantics.
    """
    raise NotImplementedError()
|
|
879
|
+
|
|
880
|
+
async def update_worker(
    self,
    worker_id: str,
    heartbeat_stats: Dict[str, Any] | Unset = UNSET,
) -> Worker:
    """Register a heartbeat for `worker_id` and refresh its telemetry.

    This API must be treated as heartbeat-only. Implementations should snapshot
    the latest stats when they are provided and stamp `last_heartbeat_time` with
    the current wall clock. The worker's busy/idle status, assignment, and
    activity timestamps are driven by other store mutations (`dequeue_rollout`,
    `update_attempt`, etc.), not by this call.

    Args:
        worker_id: Identifier of the worker to update.
        heartbeat_stats: Replacement worker heartbeat statistics (non-null when provided).

    Returns:
        The worker record as a [`Worker`][mantisdk.Worker].

    Raises:
        NotImplementedError: Raised by this default body.
    """
    raise NotImplementedError()
|