mantisdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mantisdk might be problematic. Click here for more details.
- mantisdk/__init__.py +22 -0
- mantisdk/adapter/__init__.py +15 -0
- mantisdk/adapter/base.py +94 -0
- mantisdk/adapter/messages.py +270 -0
- mantisdk/adapter/triplet.py +1028 -0
- mantisdk/algorithm/__init__.py +39 -0
- mantisdk/algorithm/apo/__init__.py +5 -0
- mantisdk/algorithm/apo/apo.py +889 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
- mantisdk/algorithm/base.py +162 -0
- mantisdk/algorithm/decorator.py +264 -0
- mantisdk/algorithm/fast.py +250 -0
- mantisdk/algorithm/gepa/__init__.py +59 -0
- mantisdk/algorithm/gepa/adapter.py +459 -0
- mantisdk/algorithm/gepa/gepa.py +364 -0
- mantisdk/algorithm/gepa/lib/__init__.py +18 -0
- mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
- mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
- mantisdk/algorithm/gepa/lib/api.py +375 -0
- mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
- mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
- mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
- mantisdk/algorithm/gepa/lib/core/result.py +233 -0
- mantisdk/algorithm/gepa/lib/core/state.py +636 -0
- mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
- mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
- mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
- mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
- mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
- mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
- mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
- mantisdk/algorithm/gepa/lib/py.typed +0 -0
- mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
- mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
- mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
- mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
- mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
- mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
- mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
- mantisdk/algorithm/gepa/tracing.py +105 -0
- mantisdk/algorithm/utils.py +177 -0
- mantisdk/algorithm/verl/__init__.py +5 -0
- mantisdk/algorithm/verl/interface.py +202 -0
- mantisdk/cli/__init__.py +56 -0
- mantisdk/cli/prometheus.py +115 -0
- mantisdk/cli/store.py +131 -0
- mantisdk/cli/vllm.py +29 -0
- mantisdk/client.py +408 -0
- mantisdk/config.py +348 -0
- mantisdk/emitter/__init__.py +43 -0
- mantisdk/emitter/annotation.py +370 -0
- mantisdk/emitter/exception.py +54 -0
- mantisdk/emitter/message.py +61 -0
- mantisdk/emitter/object.py +117 -0
- mantisdk/emitter/reward.py +320 -0
- mantisdk/env_var.py +156 -0
- mantisdk/execution/__init__.py +15 -0
- mantisdk/execution/base.py +64 -0
- mantisdk/execution/client_server.py +443 -0
- mantisdk/execution/events.py +69 -0
- mantisdk/execution/inter_process.py +16 -0
- mantisdk/execution/shared_memory.py +282 -0
- mantisdk/instrumentation/__init__.py +119 -0
- mantisdk/instrumentation/agentops.py +314 -0
- mantisdk/instrumentation/agentops_langchain.py +45 -0
- mantisdk/instrumentation/litellm.py +83 -0
- mantisdk/instrumentation/vllm.py +81 -0
- mantisdk/instrumentation/weave.py +500 -0
- mantisdk/litagent/__init__.py +11 -0
- mantisdk/litagent/decorator.py +536 -0
- mantisdk/litagent/litagent.py +252 -0
- mantisdk/llm_proxy.py +1890 -0
- mantisdk/logging.py +370 -0
- mantisdk/reward.py +7 -0
- mantisdk/runner/__init__.py +11 -0
- mantisdk/runner/agent.py +845 -0
- mantisdk/runner/base.py +182 -0
- mantisdk/runner/legacy.py +309 -0
- mantisdk/semconv.py +170 -0
- mantisdk/server.py +401 -0
- mantisdk/store/__init__.py +23 -0
- mantisdk/store/base.py +897 -0
- mantisdk/store/client_server.py +2092 -0
- mantisdk/store/collection/__init__.py +30 -0
- mantisdk/store/collection/base.py +587 -0
- mantisdk/store/collection/memory.py +970 -0
- mantisdk/store/collection/mongo.py +1412 -0
- mantisdk/store/collection_based.py +1823 -0
- mantisdk/store/insight.py +648 -0
- mantisdk/store/listener.py +58 -0
- mantisdk/store/memory.py +396 -0
- mantisdk/store/mongo.py +165 -0
- mantisdk/store/sqlite.py +3 -0
- mantisdk/store/threading.py +357 -0
- mantisdk/store/utils.py +142 -0
- mantisdk/tracer/__init__.py +16 -0
- mantisdk/tracer/agentops.py +242 -0
- mantisdk/tracer/base.py +287 -0
- mantisdk/tracer/dummy.py +106 -0
- mantisdk/tracer/otel.py +555 -0
- mantisdk/tracer/weave.py +677 -0
- mantisdk/trainer/__init__.py +6 -0
- mantisdk/trainer/init_utils.py +263 -0
- mantisdk/trainer/legacy.py +367 -0
- mantisdk/trainer/registry.py +12 -0
- mantisdk/trainer/trainer.py +618 -0
- mantisdk/types/__init__.py +6 -0
- mantisdk/types/core.py +553 -0
- mantisdk/types/resources.py +204 -0
- mantisdk/types/tracer.py +515 -0
- mantisdk/types/tracing.py +218 -0
- mantisdk/utils/__init__.py +1 -0
- mantisdk/utils/id.py +18 -0
- mantisdk/utils/metrics.py +1025 -0
- mantisdk/utils/otel.py +578 -0
- mantisdk/utils/otlp.py +536 -0
- mantisdk/utils/server_launcher.py +1045 -0
- mantisdk/utils/system_snapshot.py +81 -0
- mantisdk/verl/__init__.py +8 -0
- mantisdk/verl/__main__.py +6 -0
- mantisdk/verl/async_server.py +46 -0
- mantisdk/verl/config.yaml +27 -0
- mantisdk/verl/daemon.py +1154 -0
- mantisdk/verl/dataset.py +44 -0
- mantisdk/verl/entrypoint.py +248 -0
- mantisdk/verl/trainer.py +549 -0
- mantisdk-0.1.0.dist-info/METADATA +119 -0
- mantisdk-0.1.0.dist-info/RECORD +190 -0
- mantisdk-0.1.0.dist-info/WHEEL +4 -0
- mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
- mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
mantisdk/server.py
ADDED
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
# Copyright (c) Microsoft. All rights reserved.
|
|
2
|
+
|
|
3
|
+
"""Legacy HTTP server compatible with the original Mantisdk protocol.
|
|
4
|
+
|
|
5
|
+
The implementation in this module predates the modern store-powered runtime and
|
|
6
|
+
is kept for backwards compatibility with older deployments. New applications
|
|
7
|
+
should migrate to the store architecture where possible.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import logging
|
|
14
|
+
import threading
|
|
15
|
+
import time
|
|
16
|
+
import uuid
|
|
17
|
+
import warnings
|
|
18
|
+
from contextlib import asynccontextmanager
|
|
19
|
+
from typing import Any, Dict, List, Literal, Optional
|
|
20
|
+
|
|
21
|
+
import uvicorn
|
|
22
|
+
from fastapi import FastAPI, HTTPException, Path
|
|
23
|
+
|
|
24
|
+
from .types import (
|
|
25
|
+
GenericResponse,
|
|
26
|
+
NamedResources,
|
|
27
|
+
ResourcesUpdate,
|
|
28
|
+
RolloutLegacy,
|
|
29
|
+
Task,
|
|
30
|
+
TaskIfAny,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ServerDataStore:
    """Async-safe container for in-memory server state.

    The store tracks queued tasks, claimed tasks, uploaded rollouts, and the
    currently published resources. Mutations are guarded by asyncio locks so
    that the FastAPI handlers can safely interleave on the same event loop.

    !!! warning "Deprecated"
        [`ServerDataStore`][mantisdk.server.ServerDataStore] is part of
        the legacy client/server stack. Use [`LightningStore`][mantisdk.LightningStore] instead.
    """

    def __init__(self):
        self._task_queue: asyncio.Queue[Task] = asyncio.Queue()
        # Tasks that were handed out but have no stored rollout yet, keyed by rollout_id.
        self._processing_tasks: Dict[str, Task] = {}
        self._completed_rollouts: Dict[str, RolloutLegacy] = {}

        # Store for versioned resources
        self._resource_versions: Dict[str, NamedResources] = {}
        self._latest_resources_id: Optional[str] = None

        # asyncio locks that serialize coroutine access to the state above.
        # `_results_lock` covers the queue/processing/completed containers,
        # `_resources_lock` covers the resource versions.
        self._results_lock = asyncio.Lock()
        self._resources_lock = asyncio.Lock()

    async def add_task(
        self,
        sample: Any,
        mode: Literal["train", "val", "test"] | None = None,
        resources_id: str | None = None,
        metadata: Dict[str, Any] | None = None,
    ) -> str:
        """Enqueue a new task and return the generated rollout identifier.

        Args:
            sample: Payload that describes the task input.
            mode: Phase in which the sample should be executed (`"train"`, `"val"`, or
                `"test"`).
            resources_id: Identifier of a resource bundle that the executor should
                load before running the task.
            metadata: Optional metadata forwarded to the executor. ``None`` is
                stored as an empty dict.

        Returns:
            Unique rollout identifier assigned to the task.
        """
        rollout_id = f"rollout-{uuid.uuid4()}"
        task = Task(
            rollout_id=rollout_id,
            input=sample,
            mode=mode,
            resources_id=resources_id,
            create_time=time.time(),
            num_claims=0,
            metadata=metadata or {},
        )
        await self._task_queue.put(task)
        logger.info(f"Task queued: {rollout_id} (mode: {mode}, resources_id: {resources_id})")
        return rollout_id

    async def get_next_task(self) -> Optional[Task]:
        """Retrieve the next task from the queue without blocking.

        The returned task is a copy with `last_claim_time` set to now and
        `num_claims` incremented; that copy is also recorded as processing.

        Returns:
            Next [`Task`][mantisdk.Task] ready to execute, or ``None``
            when the queue is empty.
        """
        try:
            async with self._results_lock:
                task = self._task_queue.get_nowait()
                task = task.model_copy(
                    update={
                        "last_claim_time": time.time(),
                        "num_claims": (task.num_claims or 0) + 1,
                    }
                )
                self._processing_tasks[task.rollout_id] = task
                if task.num_claims == 1:
                    logger.debug(f"Next task retrieved: {task.rollout_id}")
                else:
                    logger.info(f"Task {task.rollout_id} re-claimed (attempt {task.num_claims})")
                return task
        except asyncio.QueueEmpty:
            return None

    async def update_resources(self, update: ResourcesUpdate):
        """Persist a new resource bundle and mark it as the latest version.

        Args:
            update: Resource payload received from a client.
        """
        # TODO: evict old resources if necessary.
        async with self._resources_lock:
            self._resource_versions[update.resources_id] = update.resources
            self._latest_resources_id = update.resources_id
            logger.info(f"Resources updated. New version '{update.resources_id}' is now latest.")

    async def get_resources_by_id(self, resources_id: str) -> Optional[ResourcesUpdate]:
        """Retrieve a specific resource bundle by identifier.

        Only the resources themselves are stored, so the returned envelope is
        rebuilt on every call: its timestamps reflect the time of the lookup
        and its version is always ``1``.

        Args:
            resources_id: Identifier that was previously published to the store.

        Returns:
            Matching [`ResourcesUpdate`][mantisdk.ResourcesUpdate]
            instance, or ``None`` when the identifier is unknown.
        """
        async with self._resources_lock:
            resources = self._resource_versions.get(resources_id)
            # Compare against None explicitly: a published-but-empty bundle is
            # falsy and must not be mistaken for an unknown identifier.
            if resources is None:
                return None
            now = time.time()
            return ResourcesUpdate(
                resources_id=resources_id,
                resources=resources,
                create_time=now,
                update_time=now,
                version=1,
            )

    async def get_latest_resources(self) -> Optional[ResourcesUpdate]:
        """Return the most recent resource bundle, if one exists."""
        if self._latest_resources_id:
            return await self.get_resources_by_id(self._latest_resources_id)
        return None

    async def store_rollout(self, rollout: RolloutLegacy):
        """Persist a completed rollout for later inspection.

        The matching entry, if any, is removed from the processing tasks.

        Args:
            rollout: Rollout returned by a client.
        """
        async with self._results_lock:
            self._processing_tasks.pop(rollout.rollout_id, None)
            self._completed_rollouts[rollout.rollout_id] = rollout
            logger.info(f"Rollout received and stored: {rollout.rollout_id}")

    async def retrieve_rollout(self, rollout_id: str) -> Optional[RolloutLegacy]:
        """Retrieve and remove a stored rollout by identifier.

        Args:
            rollout_id: Identifier of the rollout to fetch.

        Returns:
            Stored [`RolloutLegacy`][mantisdk.RolloutLegacy], or ``None``
            when the identifier is unknown.
        """
        async with self._results_lock:
            return self._completed_rollouts.pop(rollout_id, None)

    async def retrieve_completed_rollouts(self) -> List[RolloutLegacy]:
        """Return all completed rollouts and clear the internal buffer."""
        async with self._results_lock:
            rollouts = list(self._completed_rollouts.values())
            self._completed_rollouts.clear()
            return rollouts

    def get_processing_tasks(self) -> Dict[str, Task]:
        """Return a shallow copy of currently processing tasks for timeout checking."""
        return self._processing_tasks.copy()

    async def requeue_task(self, task: Task):
        """Requeue a task that timed out while being processed.

        Args:
            task: Task to drop from the processing set and put back on the queue.
        """
        logger.warning(f"Requeuing task {task.rollout_id} after timeout (attempt {task.num_claims})")
        async with self._results_lock:
            # Remove from processing tasks
            self._processing_tasks.pop(task.rollout_id, None)
            self._task_queue.put_nowait(task)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class MantisdkServer:
    """High-level controller for the legacy Mantisdk FastAPI server.

    The controller orchestrates server start-up, task queueing, resource updates,
    and retrieval of client rollouts. It is primarily used by existing systems that
    still rely on the HTTP-based workflow.

    !!! warning "Deprecated"
        [`MantisdkServer`][mantisdk.server.MantisdkServer] is part of
        the legacy client/server stack. Prefer the store-based runtime for new
        integrations.
    """

    def __init__(self, host: str = "127.0.0.1", port: int = 8000, task_timeout_seconds: float = 300.0):
        """Initialize the controller.

        Args:
            host: Hostname or IP address to bind the HTTP server to.
            port: TCP port exposed by the server.
            task_timeout_seconds: Seconds before a claimed task is considered stale and
                re-queued.
        """
        warnings.warn(
            "MantisdkServer is deprecated. Please use LightningStoreServer instead.",
            DeprecationWarning,
            stacklevel=2,  # Attribute the warning to the caller, not to this module.
        )
        self.host = host
        self.port = port
        self.endpoint = f"http://{host}:{port}"
        self._task_timeout_seconds = task_timeout_seconds

        # Defer initialization and use event for cross-thread communication
        self._store: Optional[ServerDataStore] = None
        self.loop: Optional[asyncio.AbstractEventLoop] = None
        self.startup_event = threading.Event()

        # Background task created by `start()`; kept so it is not garbage-collected.
        self._serve_task: Optional[asyncio.Task[None]] = None

        # Create FastAPI app instance with a lifespan manager
        self._app = FastAPI(lifespan=self._lifespan)
        self._setup_routes()

        self._uvicorn_config = uvicorn.Config(self._app, host=self.host, port=self.port, log_level="info")
        self._uvicorn_server = uvicorn.Server(self._uvicorn_config)

    @asynccontextmanager
    async def _lifespan(self, app: FastAPI):
        """Manage server start-up and shutdown within the event loop.

        The data store is created here (not in `__init__`) so that it is built
        while the serving event loop is running.
        """
        logger.info("Server is starting up...")
        self.loop = asyncio.get_running_loop()
        self._store = ServerDataStore()  # Initialize data store here
        self.startup_event.set()  # Signal that the server is ready

        try:
            yield
        finally:
            # Run the teardown even when an exception is thrown in at `yield`,
            # so that a failed shutdown never leaves stale "ready" state behind.
            logger.info("Server is shutting down.")
            self._store = None
            self.startup_event.clear()  # Clear the startup event
            self.loop = None

    async def _check_and_requeue_stale_tasks(self):
        """Check for stale tasks and requeue them when they exceed the timeout."""
        current_time = time.time()
        # Ensure store is initialized before checking
        if not self._store:
            return
        processing_tasks = self._store.get_processing_tasks()

        for task in processing_tasks.values():
            if task.last_claim_time and current_time - task.last_claim_time > self._task_timeout_seconds:
                await self._store.requeue_task(task)
                logger.warning(
                    f"Task {task.rollout_id} timed out after {self._task_timeout_seconds}s, requeued (attempt {task.num_claims})"
                )

    def _setup_routes(self):
        """Configure the FastAPI routes that make up the legacy HTTP API."""

        @self._app.get("/task", response_model=TaskIfAny)
        async def next_task() -> TaskIfAny:  # type: ignore
            """Provide the next available task to a client."""
            # Stale tasks are swept lazily, each time a client asks for work.
            await self._check_and_requeue_stale_tasks()

            if not self._store:
                return TaskIfAny(is_available=False)

            task = await self._store.get_next_task()
            if task:
                logger.debug(f"Serving task {task.rollout_id} to a client.")
                return TaskIfAny(is_available=True, task=task)
            else:
                logger.debug("No task available for client.")
                return TaskIfAny(is_available=False)

        # NOTE(review): "/resources/latest" is declared before the parameterized
        # route below, presumably so "latest" is not captured as a resource_id.
        @self._app.get("/resources/latest", response_model=ResourcesUpdate)
        async def fetch_latest_resources() -> ResourcesUpdate:  # type: ignore
            """Return the most recent resource bundle published to the server."""
            if not self._store:
                raise HTTPException(status_code=503, detail="Server not fully initialized.")
            resources_update = await self._store.get_latest_resources()
            if not resources_update:
                raise HTTPException(status_code=404, detail="No resources have been set on the server.")
            logger.debug(f"Serving latest resources '{resources_update.resources_id}' to a client.")
            return resources_update

        @self._app.get("/resources/{resource_id}", response_model=ResourcesUpdate)
        async def fetch_resources_by_id(  # type: ignore
            resource_id: str = Path(..., description="The unique identifier for the resource version.")
        ) -> ResourcesUpdate:
            """Return a specific version of resources by identifier."""
            if not self._store:
                raise HTTPException(status_code=503, detail="Server not fully initialized.")
            resources_update = await self._store.get_resources_by_id(resource_id)
            if not resources_update:
                raise HTTPException(status_code=404, detail=f"Resource ID '{resource_id}' not found.")
            logger.debug(f"Serving resources for ID '{resource_id}' to a client.")
            return resources_update

        @self._app.post("/rollout", response_model=GenericResponse)
        async def post_rollout(payload: RolloutLegacy) -> GenericResponse:  # type: ignore
            """Persist the rollout reported by a client."""
            if not self._store:
                raise HTTPException(status_code=503, detail="Server not fully initialized.")
            await self._store.store_rollout(payload)
            return GenericResponse(
                status="ok",
                message=f"Rollout {payload.rollout_id} received and stored.",
            )

    async def start(self):
        """Start the FastAPI server in the background."""
        logger.info(f"Starting server at {self.endpoint}")
        # Keep a reference to the task: the event loop only holds weak
        # references, so an unreferenced task may be garbage-collected mid-run.
        self._serve_task = asyncio.create_task(self._uvicorn_server.serve())
        await asyncio.sleep(1)  # Allow time for server to start up.

    async def stop(self):
        """Stop the FastAPI server and wait for a graceful shutdown."""
        if self._uvicorn_server.started:
            logger.info("Stopping server...")
            self._uvicorn_server.should_exit = True
            await asyncio.sleep(1)  # Allow time for graceful shutdown.
            logger.info("Server stopped.")

    async def run_forever(self):
        """Run the server indefinitely until `stop()` is invoked."""
        await self._uvicorn_server.serve()

    async def queue_task(
        self,
        sample: Any,
        mode: Literal["train", "val", "test"] | None = None,
        resources_id: str | None = None,
        metadata: Dict[str, Any] | None = None,
    ) -> str:
        """Add a task to the queue for a client to process.

        Returns:
            Rollout identifier generated for the queued task.

        Raises:
            RuntimeError: If the data store has not been initialized.
        """
        if not self._store:
            raise RuntimeError("Store not initialized. The server may not be running.")
        return await self._store.add_task(sample, mode=mode, resources_id=resources_id, metadata=metadata)

    async def update_resources(self, resources: NamedResources) -> str:
        """Publish a new resource bundle and return its generated identifier.

        Raises:
            RuntimeError: If the data store has not been initialized.
        """
        if not self._store:
            raise RuntimeError("Store not initialized. The server may not be running.")
        resources_id = f"res-{uuid.uuid4()}"
        now = time.time()
        update = ResourcesUpdate(
            resources_id=resources_id, resources=resources, create_time=now, update_time=now, version=1
        )
        await self._store.update_resources(update)
        return resources_id

    async def get_completed_rollout(self, rollout_id: str) -> Optional[RolloutLegacy]:
        """Retrieve a specific completed rollout by identifier.

        Raises:
            RuntimeError: If the data store has not been initialized.
        """
        if not self._store:
            raise RuntimeError("Store not initialized. The server may not be running.")
        return await self._store.retrieve_rollout(rollout_id)

    async def poll_completed_rollout(self, rollout_id: str, timeout: Optional[float] = None) -> Optional[RolloutLegacy]:
        """Poll for a completed rollout until it becomes available or a timeout expires.

        The store is always checked at least once, then re-checked about once
        per second.

        Args:
            rollout_id: Identifier of the rollout to wait for.
            timeout: Maximum number of seconds to wait. ``None`` waits indefinitely;
                ``0`` performs a single check and returns immediately.

        Returns:
            Retrieved rollout, or ``None`` when the timeout is reached without success.

        Raises:
            RuntimeError: If the data store has not been initialized.
        """
        # Compare against None (not truthiness) so that timeout=0 is honored,
        # and use a monotonic clock so wall-clock adjustments cannot skew the wait.
        deadline = None if timeout is None else time.monotonic() + timeout
        while True:
            rollout = await self.get_completed_rollout(rollout_id)
            if rollout:
                return rollout
            if deadline is None:
                await asyncio.sleep(1)
                continue
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return None
            # Never sleep past the deadline.
            await asyncio.sleep(min(1.0, remaining))

    async def retrieve_completed_rollouts(self) -> List[RolloutLegacy]:
        """Return every completed rollout and clear the internal buffer.

        Raises:
            RuntimeError: If the data store has not been initialized.
        """
        if not self._store:
            raise RuntimeError("Store not initialized. The server may not be running.")
        return await self._store.retrieve_completed_rollouts()
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Copyright (c) Microsoft. All rights reserved.
|
|
2
|
+
|
|
3
|
+
from .base import LightningStore, LightningStoreCapabilities, LightningStoreStatistics
|
|
4
|
+
from .client_server import LightningStoreClient, LightningStoreServer
|
|
5
|
+
from .collection_based import CollectionBasedLightningStore
|
|
6
|
+
from .insight import InsightLightningStore, InsightTracker
|
|
7
|
+
from .listener import StorageListener
|
|
8
|
+
from .memory import InMemoryLightningStore
|
|
9
|
+
from .threading import LightningStoreThreaded
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"LightningStore",
|
|
13
|
+
"LightningStoreCapabilities",
|
|
14
|
+
"LightningStoreStatistics",
|
|
15
|
+
"LightningStoreClient",
|
|
16
|
+
"LightningStoreServer",
|
|
17
|
+
"InMemoryLightningStore",
|
|
18
|
+
"InsightLightningStore",
|
|
19
|
+
"InsightTracker",
|
|
20
|
+
"StorageListener",
|
|
21
|
+
"CollectionBasedLightningStore",
|
|
22
|
+
"LightningStoreThreaded",
|
|
23
|
+
]
|