PyPI - mantisdk - Versions diffs - 0.1.0__py3-none-any.whl - Mend

mantisdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mantisdk might be problematic. Click here for more details.

Files changed (190) hide show

mantisdk/__init__.py +22 -0
mantisdk/adapter/__init__.py +15 -0
mantisdk/adapter/base.py +94 -0
mantisdk/adapter/messages.py +270 -0
mantisdk/adapter/triplet.py +1028 -0
mantisdk/algorithm/__init__.py +39 -0
mantisdk/algorithm/apo/__init__.py +5 -0
mantisdk/algorithm/apo/apo.py +889 -0
mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
mantisdk/algorithm/base.py +162 -0
mantisdk/algorithm/decorator.py +264 -0
mantisdk/algorithm/fast.py +250 -0
mantisdk/algorithm/gepa/__init__.py +59 -0
mantisdk/algorithm/gepa/adapter.py +459 -0
mantisdk/algorithm/gepa/gepa.py +364 -0
mantisdk/algorithm/gepa/lib/__init__.py +18 -0
mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
mantisdk/algorithm/gepa/lib/api.py +375 -0
mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
mantisdk/algorithm/gepa/lib/core/result.py +233 -0
mantisdk/algorithm/gepa/lib/core/state.py +636 -0
mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
mantisdk/algorithm/gepa/lib/py.typed +0 -0
mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
mantisdk/algorithm/gepa/tracing.py +105 -0
mantisdk/algorithm/utils.py +177 -0
mantisdk/algorithm/verl/__init__.py +5 -0
mantisdk/algorithm/verl/interface.py +202 -0
mantisdk/cli/__init__.py +56 -0
mantisdk/cli/prometheus.py +115 -0
mantisdk/cli/store.py +131 -0
mantisdk/cli/vllm.py +29 -0
mantisdk/client.py +408 -0
mantisdk/config.py +348 -0
mantisdk/emitter/__init__.py +43 -0
mantisdk/emitter/annotation.py +370 -0
mantisdk/emitter/exception.py +54 -0
mantisdk/emitter/message.py +61 -0
mantisdk/emitter/object.py +117 -0
mantisdk/emitter/reward.py +320 -0
mantisdk/env_var.py +156 -0
mantisdk/execution/__init__.py +15 -0
mantisdk/execution/base.py +64 -0
mantisdk/execution/client_server.py +443 -0
mantisdk/execution/events.py +69 -0
mantisdk/execution/inter_process.py +16 -0
mantisdk/execution/shared_memory.py +282 -0
mantisdk/instrumentation/__init__.py +119 -0
mantisdk/instrumentation/agentops.py +314 -0
mantisdk/instrumentation/agentops_langchain.py +45 -0
mantisdk/instrumentation/litellm.py +83 -0
mantisdk/instrumentation/vllm.py +81 -0
mantisdk/instrumentation/weave.py +500 -0
mantisdk/litagent/__init__.py +11 -0
mantisdk/litagent/decorator.py +536 -0
mantisdk/litagent/litagent.py +252 -0
mantisdk/llm_proxy.py +1890 -0
mantisdk/logging.py +370 -0
mantisdk/reward.py +7 -0
mantisdk/runner/__init__.py +11 -0
mantisdk/runner/agent.py +845 -0
mantisdk/runner/base.py +182 -0
mantisdk/runner/legacy.py +309 -0
mantisdk/semconv.py +170 -0
mantisdk/server.py +401 -0
mantisdk/store/__init__.py +23 -0
mantisdk/store/base.py +897 -0
mantisdk/store/client_server.py +2092 -0
mantisdk/store/collection/__init__.py +30 -0
mantisdk/store/collection/base.py +587 -0
mantisdk/store/collection/memory.py +970 -0
mantisdk/store/collection/mongo.py +1412 -0
mantisdk/store/collection_based.py +1823 -0
mantisdk/store/insight.py +648 -0
mantisdk/store/listener.py +58 -0
mantisdk/store/memory.py +396 -0
mantisdk/store/mongo.py +165 -0
mantisdk/store/sqlite.py +3 -0
mantisdk/store/threading.py +357 -0
mantisdk/store/utils.py +142 -0
mantisdk/tracer/__init__.py +16 -0
mantisdk/tracer/agentops.py +242 -0
mantisdk/tracer/base.py +287 -0
mantisdk/tracer/dummy.py +106 -0
mantisdk/tracer/otel.py +555 -0
mantisdk/tracer/weave.py +677 -0
mantisdk/trainer/__init__.py +6 -0
mantisdk/trainer/init_utils.py +263 -0
mantisdk/trainer/legacy.py +367 -0
mantisdk/trainer/registry.py +12 -0
mantisdk/trainer/trainer.py +618 -0
mantisdk/types/__init__.py +6 -0
mantisdk/types/core.py +553 -0
mantisdk/types/resources.py +204 -0
mantisdk/types/tracer.py +515 -0
mantisdk/types/tracing.py +218 -0
mantisdk/utils/__init__.py +1 -0
mantisdk/utils/id.py +18 -0
mantisdk/utils/metrics.py +1025 -0
mantisdk/utils/otel.py +578 -0
mantisdk/utils/otlp.py +536 -0
mantisdk/utils/server_launcher.py +1045 -0
mantisdk/utils/system_snapshot.py +81 -0
mantisdk/verl/__init__.py +8 -0
mantisdk/verl/__main__.py +6 -0
mantisdk/verl/async_server.py +46 -0
mantisdk/verl/config.yaml +27 -0
mantisdk/verl/daemon.py +1154 -0
mantisdk/verl/dataset.py +44 -0
mantisdk/verl/entrypoint.py +248 -0
mantisdk/verl/trainer.py +549 -0
mantisdk-0.1.0.dist-info/METADATA +119 -0
mantisdk-0.1.0.dist-info/RECORD +190 -0
mantisdk-0.1.0.dist-info/WHEEL +4 -0
mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0

mantisdk/store/listener.py ADDED Viewed

@@ -0,0 +1,58 @@
+# Copyright (c) Microsoft. All rights reserved.
+from __future__ import annotations
+from typing import Any, Dict, Optional, Protocol, runtime_checkable
+from mantisdk.types import Attempt, AttemptedRollout, NamedResources, ResourcesUpdate, Rollout, Span
+@runtime_checkable
+class StorageListener(Protocol):
+    """Protocol for listening to storage events.
+    Listeners can be attached to a LightningStore to observe state changes
+    and perform side effects (logging, tracking, etc.) without modifying
+    the core storage logic.
+    The protocol is ``runtime_checkable``, so ``isinstance`` checks against it
+    are possible; such checks only verify that the members exist, not that
+    their signatures match.
+    All ``on_*`` hooks are coroutines and return nothing.
+    """
+    @property
+    def capabilities(self) -> Dict[str, bool]:
+        """Return the capabilities of the listener (e.g., {"otlp_traces": True}).
+        Returns:
+            A mapping from capability name to a boolean flag.
+        """
+        ...
+    def otlp_traces_endpoint(self) -> Optional[str]:
+        """Return OTLP endpoint if supported, else None."""
+        ...
+    def get_otlp_headers(self) -> Dict[str, str]:
+        """Return OTLP headers if supported, else empty dict."""
+        ...
+    async def on_job_created(self, job_id: str, project_id: Optional[str] = None) -> None:
+        """Called when the store/job is initialized.
+        Args:
+            job_id: Identifier of the job being initialized.
+            project_id: Optional project identifier; defaults to None.
+        """
+        ...
+    async def on_rollout_created(self, rollout: Rollout) -> None:
+        """Called when a rollout is created (start or enqueue)."""
+        ...
+    async def on_rollout_updated(self, rollout: Rollout) -> None:
+        """Called when a rollout is updated (status change, etc.)."""
+        ...
+    async def on_attempt_created(self, attempt: Attempt) -> None:
+        """Called when an attempt is created."""
+        ...
+    async def on_attempt_updated(self, attempt: Attempt, rollout_id: str) -> None:
+        """Called when an attempt is updated.
+        Args:
+            attempt: The updated attempt.
+            rollout_id: Presumably the ID of the rollout that owns the attempt;
+                confirm against the store that invokes this hook.
+        """
+        ...
+    async def on_span_created(self, span: Span) -> None:
+        """Called when a span is added."""
+        ...
+    async def on_resource_registered(self, resource: ResourcesUpdate) -> None:
+        """Called when a resource snapshot is registered/updated."""
+        ...

mantisdk/store/memory.py ADDED Viewed

@@ -0,0 +1,396 @@
+# Copyright (c) Microsoft. All rights reserved.
+from __future__ import annotations
+import asyncio
+import logging
+import sys
+from collections.abc import Iterable
+from collections.abc import Mapping as MappingABC
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Counter,
+    Dict,
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    TypeVar,
+    Union,
+    cast,
+)
+if TYPE_CHECKING:
+    from .listener import StorageListener
+import aiologic
+from pydantic import BaseModel
+from mantisdk.types import AttemptedRollout, NamedResources, PaginatedResult, ResourcesUpdate, Rollout, Span
+from mantisdk.utils.metrics import MetricsBackend
+from .base import UNSET, LightningStoreCapabilities, LightningStoreStatistics, Unset, is_finished, is_running
+from .collection import InMemoryLightningCollections
+from .collection_based import CollectionBasedLightningStore, tracked
+# Type variable constrained to callables of any signature.
+# NOTE(review): not referenced in the visible part of this module - confirm it is still needed.
+T_callable = TypeVar("T_callable", bound=Callable[..., Any])
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+def estimate_model_size(obj: Any) -> int:
+    """Rough recursive size estimate for Pydantic BaseModel instances."""
+    if isinstance(obj, BaseModel):
+        values = cast(Iterable[Any], obj.__dict__.values())
+        return sum(estimate_model_size(value) for value in values) + sys.getsizeof(cast(object, obj))
+    if isinstance(obj, MappingABC):
+        mapping = cast(Mapping[Any, Any], obj)
+        return sum(estimate_model_size(value) for value in mapping.values()) + sys.getsizeof(cast(object, obj))
+    if isinstance(obj, (list, tuple, set)):
+        iterable = cast(Iterable[Any], obj)
+        return sum(estimate_model_size(value) for value in iterable) + sys.getsizeof(cast(object, obj))
+    return sys.getsizeof(cast(object, obj))
+def _detect_total_memory_bytes() -> int:
+    """Best-effort detection of the total available system memory in bytes."""
+    try:
+        import psutil
+        return int(psutil.virtual_memory().total)
+    except ImportError:
+        # Fallback to 8GB if memory cannot be detected.
+        logger.error("psutil is not installed. Falling back to 8GB of memory in total.")
+        return 8 * 1024**3
+class InMemoryLightningStore(CollectionBasedLightningStore[InMemoryLightningCollections]):
+    """
+    In-memory implementation of LightningStore using Python data structures.
+    Async-compatible; thread safety depends on ``thread_safe``. Data is not persistent.
+    The store tracks an estimated byte size of the spans held per rollout and,
+    once the total exceeds the eviction threshold, evicts spans rollout by
+    rollout (oldest start time first) until the total is at or below the safe
+    threshold.
+    Args:
+        thread_safe: Whether the store is thread-safe. Selects the "thread" lock type
+            for the underlying collections; otherwise the "asyncio" lock type is used.
+        eviction_memory_threshold: The threshold for evicting spans. A float is a ratio
+            of the detected total system memory; an int is a size in bytes.
+            By default, it's 70% of the detected total system memory (RAM).
+        safe_memory_threshold: The threshold for safe memory usage. A float is a ratio
+            of the detected total system memory; an int is a size in bytes.
+            By default, it's 80% of the eviction threshold.
+        span_size_estimator: A function to estimate the size of a span in bytes.
+            By default, it's a simple size estimator that uses sys.getsizeof.
+        tracker: The metrics tracker to use.
+        scan_debounce_seconds: The debounce time for the scan for unhealthy rollouts.
+            Set to 0 to disable debouncing.
+        listeners: Optional storage listeners, forwarded to the base store. Their
+            capabilities are merged into ``capabilities``.
+    Raises:
+        ValueError: If the detected memory capacity is not positive, if a threshold
+            is out of range, or if the safe threshold is not strictly smaller than
+            the eviction threshold.
+    """
+    def __init__(
+        self,
+        *,
+        thread_safe: bool = False,
+        eviction_memory_threshold: float | int | None = None,
+        safe_memory_threshold: float | int | None = None,
+        span_size_estimator: Callable[[Span], int] | None = None,
+        tracker: MetricsBackend | None = None,
+        scan_debounce_seconds: float = 10.0,
+        listeners: Optional[Sequence["StorageListener"]] = None,
+    ):
+        super().__init__(
+            collections=InMemoryLightningCollections(lock_type="thread" if thread_safe else "asyncio", tracker=tracker),
+            tracker=tracker,
+            scan_debounce_seconds=scan_debounce_seconds,
+            listeners=listeners,
+        )
+        self._thread_safe = thread_safe
+        # First-seen start time per rollout; used to order eviction candidates.
+        self._start_time_by_rollout: Dict[str, float] = {}
+        # Estimated span bytes per rollout. A Counter, so ``+=`` on an unseen rollout id starts from zero.
+        self._span_bytes_by_rollout: Dict[str, int] = Counter()
+        # Running total of the estimated span bytes across all rollouts.
+        self._total_span_bytes: int = 0
+        # Rollout ids whose spans were evicted; query_spans refuses to serve them.
+        self._evicted_rollout_span_sets: Set[str] = set()
+        self._memory_capacity_bytes = _detect_total_memory_bytes()
+        if self._memory_capacity_bytes <= 0:
+            raise ValueError("Detected memory capacity must be positive")
+        self._eviction_threshold_bytes = self._resolve_memory_threshold(
+            eviction_memory_threshold,
+            default_ratio=0.7,
+            capacity_bytes=self._memory_capacity_bytes,
+            name="eviction_memory_threshold",
+            minimum=1,
+        )
+        if safe_memory_threshold is None:
+            # Default: 80% of the resolved eviction threshold, expressed in bytes.
+            safe_memory_threshold = max(int(self._eviction_threshold_bytes * 0.8), 0)
+        # NOTE(review): safe_memory_threshold is never None at this point (defaulted just above),
+        # so the default_ratio passed below is not consulted.
+        self._safe_threshold_bytes = self._resolve_memory_threshold(
+            safe_memory_threshold,
+            default_ratio=self._eviction_threshold_bytes / self._memory_capacity_bytes,
+            capacity_bytes=self._memory_capacity_bytes,
+            name="safe_memory_threshold",
+            minimum=0,
+        )
+        # The safe threshold must be strictly below the eviction threshold so eviction can make progress.
+        if not (0 <= self._safe_threshold_bytes < self._eviction_threshold_bytes):
+            raise ValueError("safe_memory_threshold must be smaller than eviction_memory_threshold")
+        self._custom_span_size_estimator = span_size_estimator
+        # Completion tracking for wait_for_rollouts (cross-loop safe)
+        self._completion_events: Dict[str, aiologic.Event] = {}
+        # Running rollouts cache, including preparing and running rollouts
+        self._running_rollout_ids: Set[str] = set()
+        # Caches the latest resources ID. UNSET means nothing has been cached yet.
+        self._latest_resources_id: Union[str, None, Unset] = UNSET
+    @property
+    def capabilities(self) -> LightningStoreCapabilities:
+        """Return the capabilities of the store.
+        Merges base store capabilities with listener capabilities (e.g., InsightTracker).
+        Listener values are applied in listener order via ``update``, so a later
+        listener overrides an earlier one for the same key.
+        """
+        base_caps = LightningStoreCapabilities(
+            thread_safe=self._thread_safe,
+            async_safe=True,
+            zero_copy=False,
+            otlp_traces=False,
+        )
+        # Merge capabilities from listeners (e.g., InsightTracker provides otlp_traces=True)
+        for listener in self.listeners:
+            if hasattr(listener, "capabilities"):
+                base_caps.update(listener.capabilities)
+        return base_caps
+    async def statistics(self) -> LightningStoreStatistics:
+        """Return the statistics of the store.
+        Extends the base-class statistics with the span byte total, both memory
+        thresholds and the detected memory capacity.
+        """
+        return {
+            **(await super().statistics()),
+            "total_span_bytes": self._total_span_bytes,
+            "eviction_threshold_bytes": self._eviction_threshold_bytes,
+            "safe_threshold_bytes": self._safe_threshold_bytes,
+            "memory_capacity_bytes": self._memory_capacity_bytes,
+        }
+    @tracked("wait_for_rollout")
+    async def wait_for_rollout(self, rollout_id: str, timeout: Optional[float] = None) -> Optional[Rollout]:
+        """Wait for a specific rollout to complete with a timeout.
+        Args:
+            rollout_id: The rollout to wait for.
+            timeout: Maximum seconds to wait. None waits indefinitely; a value
+                less than or equal to zero only checks the current state.
+        Returns:
+            The rollout if it is finished. None if the timeout elapsed, if no
+            completion event is registered for ``rollout_id``, or if the rollout
+            is not finished when re-read after the event fired.
+        """
+        # Fast path: the rollout may already be finished.
+        async with self.collections.atomic(mode="r", snapshot=self._read_snapshot, labels=["rollouts"]) as collections:
+            rollout = await collections.rollouts.get({"rollout_id": {"exact": rollout_id}})
+            if rollout and is_finished(rollout):
+                return rollout
+        if timeout is not None and timeout <= 0:
+            return None
+        # If not completed and we have an event, wait for completion
+        if rollout_id in self._completion_events:
+            evt = self._completion_events[rollout_id]
+            # Wait for the event with proper timeout handling
+            # evt.wait() returns True if event was set, False if timeout occurred
+            if timeout is None:
+                # Wait indefinitely by polling with finite timeouts
+                # This allows threads to exit cleanly on shutdown
+                while True:
+                    result = await asyncio.to_thread(evt.wait, 10.0)  # Poll every 10 seconds
+                    if result:  # Event was set
+                        break
+                    # Loop and check again (continues indefinitely since timeout=None)
+            else:
+                # Wait with the specified timeout
+                result = await asyncio.to_thread(evt.wait, timeout)
+            # If event was set (not timeout), check if rollout is finished
+            if result:
+                async with self.collections.atomic(
+                    mode="r", snapshot=self._read_snapshot, labels=["rollouts"]
+                ) as collections:
+                    rollout = await collections.rollouts.get({"rollout_id": {"exact": rollout_id}})
+                    if rollout and is_finished(rollout):
+                        return rollout
+        return None
+    @tracked("add_resources_inmemory")
+    async def add_resources(self, resources: NamedResources) -> ResourcesUpdate:
+        """Add resources via the base store and cache the returned resources id as the latest."""
+        ret = await super().add_resources(resources)
+        # The cache write happens inside a read-write atomic section on the resources collection.
+        async with self.collections.atomic(mode="rw", snapshot=self._read_snapshot, labels=["resources"]):
+            self._latest_resources_id = ret.resources_id
+        return ret
+    @tracked("update_resources_inmemory")
+    async def update_resources(self, resources_id: str, resources: NamedResources) -> ResourcesUpdate:
+        """Update resources via the base store and cache the returned resources id as the latest."""
+        ret = await super().update_resources(resources_id, resources)
+        # The cache write happens inside a read-write atomic section on the resources collection.
+        async with self.collections.atomic(mode="rw", snapshot=self._read_snapshot, labels=["resources"]):
+            self._latest_resources_id = ret.resources_id
+        return ret
+    @tracked("_post_update_rollout_inmemory")
+    async def _post_update_rollout(
+        self, rollouts: Sequence[Tuple[Rollout, Sequence[str]]], skip_enqueue: bool = False
+    ) -> None:
+        """Update the running rollout ids set when the rollout updates.
+        After delegating to the base implementation, this also makes sure a
+        completion event exists for every rollout (set only when the rollout is
+        finished) and records the rollout's start time the first time it is seen.
+        Args:
+            rollouts: Pairs of (rollout, sequence of strings); only the rollout
+                is used here, the second element is ignored.
+            skip_enqueue: Forwarded unchanged to the base implementation.
+        """
+        await super()._post_update_rollout(rollouts, skip_enqueue=skip_enqueue)
+        async with self.collections.atomic(mode="rw", snapshot=self._read_snapshot, labels=["rollouts"]):
+            for rollout, _ in rollouts:
+                if is_running(rollout):
+                    self._running_rollout_ids.add(rollout.rollout_id)
+                else:
+                    self._running_rollout_ids.discard(rollout.rollout_id)
+                # Both branches create the event if missing; only a finished rollout sets it.
+                if is_finished(rollout):
+                    self._completion_events.setdefault(rollout.rollout_id, aiologic.Event())
+                    self._completion_events[rollout.rollout_id].set()
+                else:
+                    self._completion_events.setdefault(rollout.rollout_id, aiologic.Event())
+                # Rollout status can never transition from finished to running (unlike attempt)
+                # so we don't need to clear the completion event even in case of retrying.
+                if rollout.rollout_id not in self._start_time_by_rollout:
+                    self._start_time_by_rollout[rollout.rollout_id] = rollout.start_time
+    @tracked("_unlocked_query_rollouts_by_rollout_ids")
+    async def _unlocked_query_rollouts_by_rollout_ids(
+        self, collections: InMemoryLightningCollections, rollout_ids: Sequence[str]
+    ) -> List[Rollout]:
+        """Always use exact. This is faster than within filter for in-memory store.
+        Looks up each id individually, in the given order, and drops ids for
+        which no rollout is found.
+        """
+        if len(rollout_ids) == 0:
+            return []
+        rollouts = [await collections.rollouts.get({"rollout_id": {"exact": rollout_id}}) for rollout_id in rollout_ids]
+        return [rollout for rollout in rollouts if rollout is not None]
+    @tracked("_unlocked_get_running_rollouts")
+    async def _unlocked_get_running_rollouts(self, collections: InMemoryLightningCollections) -> List[AttemptedRollout]:
+        """Accelerated version of `_unlocked_get_running_rollouts` for in-memory store. Used for healthcheck.
+        Uses the cached set of running rollout ids and pairs each rollout with
+        its attempt that has the highest ``sequence_id``. Rollouts without any
+        attempt are logged as errors and skipped.
+        """
+        # NOTE(review): the ``collections`` argument is shadowed by the collections yielded by this
+        # new read-mode atomic context, so the passed-in value is never used - confirm this is intended.
+        async with self.collections.atomic(
+            mode="r", snapshot=self._read_snapshot, labels=["rollouts", "attempts"]
+        ) as collections:
+            rollouts = await self._unlocked_query_rollouts_by_rollout_ids(collections, list(self._running_rollout_ids))
+            running_rollouts: List[AttemptedRollout] = []
+            for rollout in rollouts:
+                # Sorting by sequence_id descending makes ``get`` return the newest attempt.
+                latest_attempt = await collections.attempts.get(
+                    filter={"rollout_id": {"exact": rollout.rollout_id}},
+                    sort={"name": "sequence_id", "order": "desc"},
+                )
+                if not latest_attempt:
+                    # The rollout is running but has no attempts, this should not happen
+                    logger.error(f"Rollout {rollout.rollout_id} is running but has no attempts")
+                    continue
+                running_rollouts.append(AttemptedRollout(**rollout.model_dump(), attempt=latest_attempt))
+        return running_rollouts
+    @tracked("query_spans_inmemory")  # Since this method calls super, we need to track it separately
+    async def query_spans(
+        self,
+        rollout_id: str,
+        attempt_id: str | Literal["latest"] | None = None,
+        **kwargs: Any,
+    ) -> PaginatedResult[Span]:
+        """Query spans for a rollout through the base store.
+        Raises:
+            RuntimeError: If the spans of ``rollout_id`` have been evicted from memory.
+        """
+        if rollout_id in self._evicted_rollout_span_sets:
+            raise RuntimeError(f"Spans for rollout {rollout_id} have been evicted")
+        return await super().query_spans(rollout_id, attempt_id, **kwargs)
+    @tracked("_post_add_spans")
+    async def _post_add_spans(self, spans: Sequence[Span], rollout_id: str, attempt_id: str) -> None:
+        """In-memory store needs to maintain the span data in memory, and evict spans when memory is low."""
+        await super()._post_add_spans(spans, rollout_id, attempt_id)
+        async with self.collections.atomic(
+            mode="rw", snapshot=self._read_snapshot, labels=["rollouts", "spans"]
+        ) as collections:
+            # Account for every new span first, then check the eviction threshold once.
+            for span in spans:
+                await self._account_span_size(span)
+            await self._maybe_evict_spans(collections)
+    @tracked("_get_latest_resources_inmemory")
+    async def _get_latest_resources(self) -> Optional[ResourcesUpdate]:
+        """Return the latest resources, using the cached resources id when available.
+        Falls back to the base implementation while nothing has been cached, and
+        returns None when the cached id is None.
+        """
+        if isinstance(self._latest_resources_id, Unset):
+            return await super()._get_latest_resources()
+        if self._latest_resources_id is not None:
+            async with self.collections.atomic(
+                mode="r", snapshot=self._read_snapshot, labels=["resources"]
+            ) as collections:
+                return await collections.resources.get(filter={"resources_id": {"exact": self._latest_resources_id}})
+        return None
+    @staticmethod
+    def _resolve_memory_threshold(
+        value: float | int | None,
+        *,
+        default_ratio: float,
+        capacity_bytes: int,
+        name: str,
+        minimum: int,
+    ) -> int:
+        """Convert a user-supplied memory threshold into a byte count.
+        Args:
+            value: None to use ``default_ratio``; a float is a ratio of
+                ``capacity_bytes``; any other value is taken as bytes.
+            default_ratio: Ratio of ``capacity_bytes`` used when ``value`` is None.
+            capacity_bytes: Total capacity that ratios are applied to.
+            name: Parameter name used in error messages.
+            minimum: Smallest accepted result in bytes. When it is 0 a ratio of
+                exactly 0 is allowed; otherwise the ratio must be greater than 0.
+        Returns:
+            The resolved threshold in bytes.
+        Raises:
+            ValueError: If a ratio is out of range, a byte value is negative, or
+                the resolved value is below ``minimum``.
+        """
+        if value is None:
+            resolved = int(capacity_bytes * default_ratio)
+        elif isinstance(value, float):
+            if minimum == 0:
+                if not (0 <= value <= 1):
+                    raise ValueError(f"{name} ratio must be between 0 and 1 inclusive")
+            else:
+                if not (0 < value <= 1):
+                    raise ValueError(f"{name} ratio must be greater than 0 and at most 1")
+            resolved = int(capacity_bytes * value)
+        else:
+            value_int = value
+            if value_int < 0:
+                raise ValueError(f"{name} must be non-negative")
+            resolved = value_int
+        if resolved < minimum:
+            raise ValueError(f"{name} must be at least {minimum} bytes")
+        return resolved
+    @tracked("_account_span_size")
+    async def _account_span_size(self, span: Span) -> int:
+        """Add the estimated size of ``span`` to the per-rollout and total byte counters.
+        Uses the custom estimator when one was supplied (its result is clamped
+        to be non-negative), otherwise ``estimate_model_size``.
+        Returns:
+            The size in bytes that was added.
+        """
+        if self._custom_span_size_estimator is not None:
+            size = max(int(self._custom_span_size_estimator(span)), 0)
+        else:
+            size = estimate_model_size(span)
+        self._span_bytes_by_rollout[span.rollout_id] += size
+        self._total_span_bytes += size
+        return size
+    @tracked("_maybe_evict_spans")
+    async def _maybe_evict_spans(self, collections: InMemoryLightningCollections) -> None:
+        """Evict spans, oldest rollout first, once the eviction threshold is exceeded.
+        Does nothing while the total is at or below the eviction threshold.
+        Otherwise rollouts are visited in ascending start-time order and evicted
+        until the total is at or below the safe threshold.
+        """
+        if self._total_span_bytes <= self._eviction_threshold_bytes:
+            return
+        logger.info(
+            f"Total span bytes: {self._total_span_bytes}, eviction threshold: {self._eviction_threshold_bytes}, "
+            f"safe threshold: {self._safe_threshold_bytes}. Evicting spans..."
+        )
+        # Every rollout with a recorded start time is a candidate, including ones that
+        # currently hold no accounted span bytes; the count logged below is the number
+        # of candidates, not the number of rollouts that end up evicted.
+        candidates: List[tuple[float, str]] = [
+            (start_time, rollout_id) for rollout_id, start_time in self._start_time_by_rollout.items()
+        ]
+        # Tuples sort by start time first, then rollout id, so the oldest rollouts come first.
+        candidates.sort()
+        logger.info(f"Evicting spans for {len(candidates)} rollouts to free up memory...")
+        memory_consumed_before = self._total_span_bytes
+        for _, rollout_id in candidates:
+            if self._total_span_bytes <= self._safe_threshold_bytes:
+                break
+            logger.debug(f"Evicting spans for rollout {rollout_id} to free up memory...")
+            await self._evict_spans_for_rollout(collections, rollout_id)
+        logger.info(f"Freed up {memory_consumed_before - self._total_span_bytes} bytes of memory")
+    @tracked("_evict_spans_for_rollout")
+    async def _evict_spans_for_rollout(self, collections: InMemoryLightningCollections, rollout_id: str) -> None:
+        """Evict the spans of one rollout and release its accounted bytes.
+        The rollout is marked as evicted only when it had a positive accounted
+        byte count.
+        """
+        await collections.evict_spans_for_rollout(rollout_id)
+        removed_bytes = self._span_bytes_by_rollout.pop(rollout_id, 0)
+        if removed_bytes > 0:
+            # There is something removed for real
+            self._total_span_bytes = max(self._total_span_bytes - removed_bytes, 0)
+            self._evicted_rollout_span_sets.add(rollout_id)

mantisdk/store/mongo.py ADDED Viewed

@@ -0,0 +1,165 @@
+# Copyright (c) Microsoft. All rights reserved.
+from __future__ import annotations
+import asyncio
+import hashlib
+import logging
+import time
+import uuid
+from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, TypeVar, Union
+from mantisdk.types import Attempt, AttemptedRollout, Rollout
+from mantisdk.utils.metrics import MetricsBackend
+from .base import LightningStoreCapabilities, is_finished
+from .collection.mongo import MongoClientPool, MongoLightningCollections
+from .collection_based import CollectionBasedLightningStore, healthcheck_before, tracked
+# Type variable bound to callables of any signature.
+T_callable = TypeVar("T_callable", bound=Callable[..., Any])
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+def _generate_partition_id() -> str:
+    return "pt-" + hashlib.sha1(uuid.uuid4().bytes).hexdigest()[:12]
+class MongoLightningStore(CollectionBasedLightningStore[MongoLightningCollections]):
+    """
+    MongoDB implementation of LightningStore using MongoDB collections.
+    Data is persistent and can be shared between multiple processes.
+    Args:
+        mongo_uri: MongoDB connection string (defaults to local replica set).
+        mongo_client_kwargs: Extra keyword arguments forwarded to `AsyncMongoClient`.
+        database_name: The MongoDB database name. Defaults to ``mantisdk``.
+        partition_id: The partition id. Useful when sharing the database among multiple Mantisdk trainers.
+        tracker: The metrics tracker to use.
+        scan_debounce_seconds: The debounce time for the scan for unhealthy rollouts.
+            Set to 0 to disable debouncing.
+    """
+    def __init__(
+        self,
+        *,
+        mongo_uri: str = "mongodb://localhost:27017/?replicaSet=rs0",
+        mongo_client_kwargs: Mapping[str, Any] | None = None,
+        database_name: str | None = None,
+        partition_id: str | None = None,
+        tracker: MetricsBackend | None = None,
+        scan_debounce_seconds: float = 10.0,
+    ) -> None:
+        self._mongo_uri = mongo_uri
+        self._mongo_client_kwargs = dict(mongo_client_kwargs or {})
+        if database_name is None:
+            database_name = "mantisdk"
+            logger.info("No database name provided, using default 'mantisdk'")
+        if partition_id is None:
+            partition_id = _generate_partition_id()
+            logger.info("No partition id provided, generated a new one: %s", partition_id)
+        self._client_pool = MongoClientPool[Mapping[str, Any]](
+            mongo_uri=self._mongo_uri,
+            mongo_client_kwargs=self._mongo_client_kwargs,
+        )
+        super().__init__(
+            collections=MongoLightningCollections(
+                self._client_pool,
+                database_name,
+                partition_id,
+                tracker=tracker,
+            ),
+            tracker=tracker,
+            scan_debounce_seconds=scan_debounce_seconds,
+        )
+    @property
+    def capabilities(self) -> LightningStoreCapabilities:
+        """Return the capabilities of the store."""
+        return LightningStoreCapabilities(
+            thread_safe=True,
+            async_safe=True,
+            zero_copy=True,
+            otlp_traces=False,
+        )
+    async def close(self) -> None:
+        """Close the store by closing the client pool."""
+        await self._client_pool.close()
+    @tracked("wait_for_rollouts")
+    @healthcheck_before
+    async def wait_for_rollouts(self, *, rollout_ids: List[str], timeout: Optional[float] = None) -> List[Rollout]:
+        """Wait for specified rollouts to complete with a timeout.
+        Concurrently wait for all rollouts to complete with a timeout.
+        """
+        start_time = time.time()
+        current_time = start_time
+        deadline = start_time + timeout if timeout is not None else None
+        finished_rollouts: Dict[str, Rollout] = {}
+        unfinished_rollout_ids = set(rollout_ids)
+        while deadline is None or current_time <= deadline:
+            async with self.collections.atomic(
+                mode="r", snapshot=self._read_snapshot, labels=["rollouts"]
+            ) as collections:
+                # Query the rollouts that are not finished in a single query
+                rollouts = await collections.rollouts.query(
+                    filter={"rollout_id": {"within": list(unfinished_rollout_ids)}}
+                )
+            for rollout in rollouts.items:
+                if is_finished(rollout):
+                    finished_rollouts[rollout.rollout_id] = rollout
+                    unfinished_rollout_ids.remove(rollout.rollout_id)
+            if not unfinished_rollout_ids:
+                break
+            # Poll every 10 seconds by default
+            # Minus 0.1 to make sure the time is still sufficient for another call
+            rest_time = max(0.01, min(deadline - time.time() - 0.1, 10.0)) if deadline is not None else 10.0
+            await asyncio.sleep(rest_time)
+            current_time = time.time()
+        # Logging will help debugging when there are stuck rollouts.
+        logger.debug(
+            "Waiting for rollouts. Number of finished rollouts: %d; number of unfinished rollouts: %d",
+            len(finished_rollouts),
+            len(unfinished_rollout_ids),
+        )
+        if len(unfinished_rollout_ids) < 30:
+            logger.debug("Unfinished rollouts: %s", unfinished_rollout_ids)
+        # Reorder the rollouts to match the input order
+        return [finished_rollouts[rollout_id] for rollout_id in rollout_ids if rollout_id in finished_rollouts]
+    @tracked("_unlocked_many_rollouts_to_attempted_rollouts")
+    async def _unlocked_many_rollouts_to_attempted_rollouts(
+        self, collections: MongoLightningCollections, rollouts: Sequence[Rollout]
+    ) -> List[Union[Rollout, AttemptedRollout]]:
+        """Query the latest attempts for the rollouts, and attach them to the rollout objects."""
+        async with collections.atomic(mode="r", snapshot=self._read_snapshot, labels=["attempts"]) as collections:
+            attempts = await collections.attempts.query(
+                filter={"rollout_id": {"within": [rollout.rollout_id for rollout in rollouts]}},
+                sort={"name": "sequence_id", "order": "desc"},
+            )
+        latest_attempts: Dict[str, Attempt] = {}
+        for attempt in attempts:
+            if attempt.rollout_id not in latest_attempts:
+                latest_attempts[attempt.rollout_id] = attempt
+            # Otherwise we ignore the attempt because there's already a newer attempt
+        return [
+            (
+                AttemptedRollout(**rollout.model_dump(), attempt=latest_attempts[rollout.rollout_id])
+                if rollout.rollout_id in latest_attempts
+                else rollout
+            )
+            for rollout in rollouts
+        ]

mantisdk/store/sqlite.py ADDED Viewed

@@ -0,0 +1,3 @@
+# Copyright (c) Microsoft. All rights reserved.
+# TODO: Implement this