mantisdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mantisdk might be problematic.
- mantisdk/__init__.py +22 -0
- mantisdk/adapter/__init__.py +15 -0
- mantisdk/adapter/base.py +94 -0
- mantisdk/adapter/messages.py +270 -0
- mantisdk/adapter/triplet.py +1028 -0
- mantisdk/algorithm/__init__.py +39 -0
- mantisdk/algorithm/apo/__init__.py +5 -0
- mantisdk/algorithm/apo/apo.py +889 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
- mantisdk/algorithm/base.py +162 -0
- mantisdk/algorithm/decorator.py +264 -0
- mantisdk/algorithm/fast.py +250 -0
- mantisdk/algorithm/gepa/__init__.py +59 -0
- mantisdk/algorithm/gepa/adapter.py +459 -0
- mantisdk/algorithm/gepa/gepa.py +364 -0
- mantisdk/algorithm/gepa/lib/__init__.py +18 -0
- mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
- mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
- mantisdk/algorithm/gepa/lib/api.py +375 -0
- mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
- mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
- mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
- mantisdk/algorithm/gepa/lib/core/result.py +233 -0
- mantisdk/algorithm/gepa/lib/core/state.py +636 -0
- mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
- mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
- mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
- mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
- mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
- mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
- mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
- mantisdk/algorithm/gepa/lib/py.typed +0 -0
- mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
- mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
- mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
- mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
- mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
- mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
- mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
- mantisdk/algorithm/gepa/tracing.py +105 -0
- mantisdk/algorithm/utils.py +177 -0
- mantisdk/algorithm/verl/__init__.py +5 -0
- mantisdk/algorithm/verl/interface.py +202 -0
- mantisdk/cli/__init__.py +56 -0
- mantisdk/cli/prometheus.py +115 -0
- mantisdk/cli/store.py +131 -0
- mantisdk/cli/vllm.py +29 -0
- mantisdk/client.py +408 -0
- mantisdk/config.py +348 -0
- mantisdk/emitter/__init__.py +43 -0
- mantisdk/emitter/annotation.py +370 -0
- mantisdk/emitter/exception.py +54 -0
- mantisdk/emitter/message.py +61 -0
- mantisdk/emitter/object.py +117 -0
- mantisdk/emitter/reward.py +320 -0
- mantisdk/env_var.py +156 -0
- mantisdk/execution/__init__.py +15 -0
- mantisdk/execution/base.py +64 -0
- mantisdk/execution/client_server.py +443 -0
- mantisdk/execution/events.py +69 -0
- mantisdk/execution/inter_process.py +16 -0
- mantisdk/execution/shared_memory.py +282 -0
- mantisdk/instrumentation/__init__.py +119 -0
- mantisdk/instrumentation/agentops.py +314 -0
- mantisdk/instrumentation/agentops_langchain.py +45 -0
- mantisdk/instrumentation/litellm.py +83 -0
- mantisdk/instrumentation/vllm.py +81 -0
- mantisdk/instrumentation/weave.py +500 -0
- mantisdk/litagent/__init__.py +11 -0
- mantisdk/litagent/decorator.py +536 -0
- mantisdk/litagent/litagent.py +252 -0
- mantisdk/llm_proxy.py +1890 -0
- mantisdk/logging.py +370 -0
- mantisdk/reward.py +7 -0
- mantisdk/runner/__init__.py +11 -0
- mantisdk/runner/agent.py +845 -0
- mantisdk/runner/base.py +182 -0
- mantisdk/runner/legacy.py +309 -0
- mantisdk/semconv.py +170 -0
- mantisdk/server.py +401 -0
- mantisdk/store/__init__.py +23 -0
- mantisdk/store/base.py +897 -0
- mantisdk/store/client_server.py +2092 -0
- mantisdk/store/collection/__init__.py +30 -0
- mantisdk/store/collection/base.py +587 -0
- mantisdk/store/collection/memory.py +970 -0
- mantisdk/store/collection/mongo.py +1412 -0
- mantisdk/store/collection_based.py +1823 -0
- mantisdk/store/insight.py +648 -0
- mantisdk/store/listener.py +58 -0
- mantisdk/store/memory.py +396 -0
- mantisdk/store/mongo.py +165 -0
- mantisdk/store/sqlite.py +3 -0
- mantisdk/store/threading.py +357 -0
- mantisdk/store/utils.py +142 -0
- mantisdk/tracer/__init__.py +16 -0
- mantisdk/tracer/agentops.py +242 -0
- mantisdk/tracer/base.py +287 -0
- mantisdk/tracer/dummy.py +106 -0
- mantisdk/tracer/otel.py +555 -0
- mantisdk/tracer/weave.py +677 -0
- mantisdk/trainer/__init__.py +6 -0
- mantisdk/trainer/init_utils.py +263 -0
- mantisdk/trainer/legacy.py +367 -0
- mantisdk/trainer/registry.py +12 -0
- mantisdk/trainer/trainer.py +618 -0
- mantisdk/types/__init__.py +6 -0
- mantisdk/types/core.py +553 -0
- mantisdk/types/resources.py +204 -0
- mantisdk/types/tracer.py +515 -0
- mantisdk/types/tracing.py +218 -0
- mantisdk/utils/__init__.py +1 -0
- mantisdk/utils/id.py +18 -0
- mantisdk/utils/metrics.py +1025 -0
- mantisdk/utils/otel.py +578 -0
- mantisdk/utils/otlp.py +536 -0
- mantisdk/utils/server_launcher.py +1045 -0
- mantisdk/utils/system_snapshot.py +81 -0
- mantisdk/verl/__init__.py +8 -0
- mantisdk/verl/__main__.py +6 -0
- mantisdk/verl/async_server.py +46 -0
- mantisdk/verl/config.yaml +27 -0
- mantisdk/verl/daemon.py +1154 -0
- mantisdk/verl/dataset.py +44 -0
- mantisdk/verl/entrypoint.py +248 -0
- mantisdk/verl/trainer.py +549 -0
- mantisdk-0.1.0.dist-info/METADATA +119 -0
- mantisdk-0.1.0.dist-info/RECORD +190 -0
- mantisdk-0.1.0.dist-info/WHEEL +4 -0
- mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
- mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
mantisdk/llm_proxy.py
ADDED
@@ -0,0 +1,1890 @@
# Copyright (c) Microsoft. All rights reserved.

from __future__ import annotations

import ast
import asyncio
import json
import logging
import os
import re
import tempfile
import threading
import time
from contextlib import asynccontextmanager
from contextvars import ContextVar
from datetime import datetime
from typing import (
    Any,
    AsyncGenerator,
    Awaitable,
    Callable,
    Dict,
    Iterable,
    List,
    Literal,
    Optional,
    Sequence,
    Tuple,
    Type,
    TypedDict,
    Union,
    cast,
)

import litellm
import opentelemetry.trace as trace_api
import yaml
from fastapi import Request, Response
from fastapi.responses import StreamingResponse
from litellm.integrations.custom_logger import CustomLogger
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
from litellm.proxy.proxy_server import app, save_worker_config  # pyright: ignore[reportUnknownVariableType]
from litellm.types.utils import CallTypes
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import ReadableSpan, SpanContext
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
from opentelemetry.trace import Link, Status
from opentelemetry.util.types import Attributes
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.types import Scope

from mantisdk.semconv import LightningResourceAttributes
from mantisdk.types import LLM, ProxyLLM
from mantisdk.utils.server_launcher import (
    LaunchMode,
    PythonServerLauncher,
    PythonServerLauncherArgs,
    noop_context,
)

from .store.base import LightningStore

logger = logging.getLogger(__name__)

# Context variable to store HTTP request headers for LiteLLM callback access
_request_headers_context: ContextVar[Optional[Dict[str, str]]] = ContextVar("request_headers", default=None)

__all__ = [
    "LLMProxy",
]


class ModelConfig(TypedDict):
    """LiteLLM model registration entry.

    This mirrors the items in LiteLLM's `model_list` section.

    Attributes:
        model_name: Logical model name exposed by the proxy.
        litellm_params: Parameters passed to LiteLLM for this model
            (e.g., backend model id, api_base, additional options).
    """  # Google style kept concise.

    model_name: str
    litellm_params: Dict[str, Any]
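# Illustrative sketch (not taken from this package): what a ModelConfig entry might look
# like when registering an OpenAI-compatible backend (such as a vLLM server) with the
# proxy. The model name, backend id, URL, and key below are hypothetical placeholders,
# following LiteLLM's documented model_list shape.
_example_model: ModelConfig = {
    "model_name": "policy-model",
    "litellm_params": {
        # LiteLLM addresses OpenAI-compatible servers with an "openai/<name>" model id
        # plus an api_base pointing at that server.
        "model": "openai/policy-model",
        "api_base": "http://localhost:8000/v1",
        "api_key": "dummy",
    },
}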


def _get_pre_call_data(args: Any, kwargs: Any) -> Dict[str, Any]:
    """Extract LiteLLM request payload from hook args.

    The LiteLLM logger hooks receive `(*args, **kwargs)` whose third positional
    argument or `data=` kwarg contains the request payload.

    Args:
        args: Positional arguments from the hook.
        kwargs: Keyword arguments from the hook.

    Returns:
        The request payload dict.

    Raises:
        ValueError: If the payload cannot be located or is not a dict.
    """
    if kwargs.get("data"):
        data = kwargs["data"]
    elif len(args) >= 3:
        data = args[2]
    else:
        raise ValueError(f"Unable to get request data from args or kwargs: {args}, {kwargs}")
    if not isinstance(data, dict):
        raise ValueError(f"Request data is not a dictionary: {data}")
    return cast(Dict[str, Any], data)


def _reset_litellm_logging_worker() -> None:
    """Reset LiteLLM's global logging worker to the current event loop.

    LiteLLM keeps a module-level ``GLOBAL_LOGGING_WORKER`` singleton that owns an
    ``asyncio.Queue``. The queue is bound to the event loop where it was created.
    When the proxy is restarted, Uvicorn spins up a brand new event loop in a new
    thread. If the existing logging worker (and its queue) are reused, LiteLLM
    raises ``RuntimeError: <Queue ...> is bound to a different event loop`` the
    next time it tries to log. Recreating the worker ensures that LiteLLM will
    lazily initialise a fresh queue on the new loop.
    """

    # ``GLOBAL_LOGGING_WORKER`` is imported in a few LiteLLM modules at runtime.
    # Update any already-imported references so future calls use the fresh worker.
    try:
        import litellm.utils as litellm_utils
        from litellm.litellm_core_utils import logging_worker as litellm_logging_worker

        litellm_logging_worker.GLOBAL_LOGGING_WORKER = litellm_logging_worker.LoggingWorker()
        litellm_utils.GLOBAL_LOGGING_WORKER = litellm_logging_worker.GLOBAL_LOGGING_WORKER  # type: ignore[reportAttributeAccessIssue]
    except Exception:  # pragma: no cover - best-effort hygiene
        logger.warning("Unable to propagate LiteLLM logging worker reset.", exc_info=True)


def _reset_litellm_logging_callback_manager() -> None:
    """Reset LiteLLM's global callback manager.

    To get rid of the warning message: "Cannot add callback - would exceed MAX_CALLBACKS limit of 30."
    when litellm is restarted multiple times in the same process.

    It does not respect existing input/output callbacks.
    """

    try:
        litellm.logging_callback_manager._reset_all_callbacks()  # pyright: ignore[reportPrivateUsage]
    except Exception:  # pragma: no cover - best-effort hygiene
        logger.warning("Unable to reset LiteLLM logging callback manager.", exc_info=True)


class AddReturnTokenIds(CustomLogger):
    """LiteLLM logger hook to request token ids from vLLM.

    This mutates the outgoing request payload to include `return_token_ids=True`
    for backends that support token id return (e.g., vLLM).

    See also:
        [vLLM PR #22587](https://github.com/vllm-project/vllm/pull/22587)
    """

    async def async_pre_call_hook(self, *args: Any, **kwargs: Any) -> Optional[Union[Exception, str, Dict[str, Any]]]:
        """Async pre-call hook to adjust request payload.

        Args:
            args: Positional args from LiteLLM.
            kwargs: Keyword args from LiteLLM.

        Returns:
            Either an updated payload dict or an Exception to short-circuit.
        """
        try:
            data = _get_pre_call_data(args, kwargs)
        except Exception as e:
            return e

        # Ensure token ids are requested from the backend when supported.
        return {**data, "return_token_ids": True}


class AddLogprobs(CustomLogger):
    """LiteLLM logger hook to request logprobs from vLLM.

    This mutates the outgoing request payload to include `logprobs=1`
    for backends that support logprobs return (e.g., vLLM).
    """

    async def async_pre_call_hook(self, *args: Any, **kwargs: Any) -> Optional[Union[Exception, str, Dict[str, Any]]]:
        """Async pre-call hook to adjust request payload."""
        try:
            data = _get_pre_call_data(args, kwargs)
        except Exception as e:
            return e

        # Ensure logprobs are requested from the backend when supported.
        return {**data, "logprobs": 1}
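# Illustrative sketch (not taken from this file): CustomLogger hooks such as the two
# classes above are commonly attached by assigning instances to litellm.callbacks; how
# this proxy actually wires them up is defined elsewhere, so treat this as an assumed
# usage pattern rather than the package's own wiring.
litellm.callbacks = [AddReturnTokenIds(), AddLogprobs()]
# Once registered, each outgoing completion request passes through async_pre_call_hook,
# so a vLLM backend receives `return_token_ids=True` and `logprobs=1` in its payload.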


class SpanWithExtraAttributes(ReadableSpan):
    """Wrapper around ReadableSpan that adds extra span attributes.

    Since ReadableSpan is immutable, this wrapper intercepts the attributes
    property to include additional attributes for Langfuse integration
    (environment, tags, etc.).
    """

    def __init__(self, wrapped_span: ReadableSpan, extra_attributes: Dict[str, Any]):
        """Initialize wrapper with a span and extra attributes to inject.

        Args:
            wrapped_span: The original ReadableSpan to wrap.
            extra_attributes: Dictionary of extra attributes to add.
                These should use Langfuse conventions (e.g., "langfuse.environment").
        """
        self._wrapped = wrapped_span
        self._extra_attributes = extra_attributes

    @property
    def name(self) -> str:
        return self._wrapped.name

    @property
    def context(self) -> Optional[SpanContext]:
        return self._wrapped.context

    def get_span_context(self) -> SpanContext:
        return self._wrapped.get_span_context()

    @property
    def parent(self) -> Optional[SpanContext]:
        return self._wrapped.parent

    @property
    def start_time(self) -> int:
        return self._wrapped.start_time

    @property
    def end_time(self) -> int:
        return self._wrapped.end_time

    @property
    def status(self) -> Status:
        return self._wrapped.status

    @property
    def attributes(self) -> Attributes:
        """Return original attributes merged with extra attributes."""
        original_attrs = self._wrapped.attributes or {}
        # Create a merged dict with original attrs and extra attrs
        merged = dict(original_attrs)
        merged.update(self._extra_attributes)
        return merged

    @property
    def events(self) -> tuple:
        return self._wrapped.events

    @property
    def links(self) -> tuple:
        return self._wrapped.links

    @property
    def resource(self) -> Resource:
        return self._wrapped.resource

    @property
    def instrumentation_scope(self):
        return self._wrapped.instrumentation_scope

    # Also expose _resource for direct modification if needed
    @property
    def _resource(self) -> Resource:
        return self._wrapped._resource  # pyright: ignore[reportPrivateUsage]

    @_resource.setter
    def _resource(self, value: Resource):
        self._wrapped._resource = value  # pyright: ignore[reportPrivateUsage]

    def __getattr__(self, name: str):
        """Delegate all other attribute access to the wrapped span.

        This ensures OTLPSpanExporter can access all private attributes (_events, _links, etc.)
        that it needs for serialization without us having to enumerate them all.
        """
        return getattr(self._wrapped, name)
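# Illustrative sketch (not part of llm_proxy.py): because ReadableSpan is immutable, the
# extra Langfuse attributes are layered on by wrapping a finished span instead of mutating
# it. The span name and attribute values below are hypothetical.
_finished_span = ReadableSpan(name="llm.call", attributes={"gen_ai.request.model": "policy-model"})
_annotated = SpanWithExtraAttributes(_finished_span, {"langfuse.environment": "dev", "session.id": "sess-123"})
# The wrapper returns the merged view; the wrapped span itself is left untouched.
assert _annotated.attributes["langfuse.environment"] == "dev"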


class LightningSpanExporter(SpanExporter):
    """Buffered OTEL span exporter with subtree flushing and training-store sink.

    Design:

    * Spans are buffered until a root span's entire subtree is available.
    * A private event loop on a daemon thread runs async flush logic.
    * Rollout/attempt/sequence metadata is reconstructed by merging headers
      from any span within a subtree.

    Thread-safety:

    * Buffer access is protected by a re-entrant lock.
    * Export is synchronous to the caller yet schedules an async flush on the
      internal loop, then waits for completion.
    """

    def __init__(
        self,
        _store: Optional[LightningStore] = None,
        otlp_endpoint: Optional[str] = None,
        otlp_headers: Optional[Dict[str, str]] = None,
    ):
        self._store: Optional[LightningStore] = _store  # this is only for testing purposes
        self._otlp_endpoint: Optional[str] = otlp_endpoint  # Direct OTLP export endpoint
        self._buffer: List[ReadableSpan] = []
        self._lock: Optional[threading.Lock] = None
        self._loop_lock_pid: Optional[int] = None

        # Single dedicated event loop running in a daemon thread.
        # This decouples OTEL SDK threads from our async store I/O.
        # Deferred creation until first use.
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._loop_thread: Optional[threading.Thread] = None

        # Initialize OTLP exporter with custom endpoint and headers if provided
        if otlp_endpoint:
            self._otlp_exporter = OTLPSpanExporter(endpoint=otlp_endpoint, headers=otlp_headers or {})
        else:
            self._otlp_exporter = OTLPSpanExporter()

    def _ensure_loop(self) -> asyncio.AbstractEventLoop:
        """Lazily initialize the event loop and thread on first use.

        Returns:
            asyncio.AbstractEventLoop: The initialized event loop.
        """
        self._clear_loop_and_lock()
        if self._loop is None:
            self._loop = asyncio.new_event_loop()
            self._loop_thread = threading.Thread(target=self._run_loop, name="LightningSpanExporterLoop", daemon=True)
            self._loop_thread.start()
        return self._loop

    def _ensure_lock(self) -> threading.Lock:
        """Lazily initialize the lock on first use.

        Returns:
            threading.Lock: The initialized lock.
        """
        self._clear_loop_and_lock()
        if self._lock is None:
            self._lock = threading.Lock()
        return self._lock

    def _clear_loop_and_lock(self) -> None:
        """Clear the loop and lock.
        This happens if the exporter was used in a process then used in another process.

        This should only happen in CI.
        """
        if os.getpid() != self._loop_lock_pid:
            logger.warning("Loop and lock are not owned by the current process. Clearing them.")
            self._loop = None
            self._loop_thread = None
            self._lock = None
            self._loop_lock_pid = os.getpid()
        elif self._loop_lock_pid is None:
            self._loop_lock_pid = os.getpid()

    def _run_loop(self) -> None:
        """Run the private asyncio loop forever on the exporter thread."""
        assert self._loop is not None, "Loop should be initialized before thread starts"
        asyncio.set_event_loop(self._loop)
        self._loop.run_forever()

    def shutdown(self) -> None:
        """Shut down the exporter event loop.

        Safe to call at process exit.

        """
        if self._loop is None:
            return

        try:

            def _stop():
                assert self._loop is not None
                self._loop.stop()

            self._loop.call_soon_threadsafe(_stop)
            if self._loop_thread is not None:
                self._loop_thread.join(timeout=2.0)
            self._loop.close()
        except Exception:
            logger.exception("Error during exporter shutdown")

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Export spans via buffered subtree flush.

        Appends spans to the internal buffer, then triggers an async flush on the
        private event loop. Blocks until that flush completes.

        Args:
            spans: Sequence of spans to export.

        Returns:
            SpanExportResult: SUCCESS on flush success, else FAILURE.
        """
        # Buffer append under lock to protect against concurrent exporters.
        with self._ensure_lock():
            for span in spans:
                self._buffer.append(span)
            default_endpoint = self._otlp_exporter._endpoint  # pyright: ignore[reportPrivateUsage]
            try:
                self._maybe_flush()
            except Exception as e:
                logger.exception("Export flush failed: %s", e)
                return SpanExportResult.FAILURE
            finally:
                self._otlp_exporter._endpoint = default_endpoint  # pyright: ignore[reportPrivateUsage]

        return SpanExportResult.SUCCESS

    def _get_job_id_from_store(self, store: Any) -> Optional[str]:
        """Get the job_id from the store's listeners (if any InsightTracker is attached)."""
        if hasattr(store, "listeners"):
            for listener in store.listeners:
                if hasattr(listener, "job_id"):
                    return listener.job_id
        return None

    def _get_tracing_metadata_from_rollout(
        self, store: Any, rollout_id: str
    ) -> Tuple[Optional[str], Optional[List[str]], Optional[str]]:
        """Fetch tracing metadata (environment, tags, session_id) from a rollout.

        Args:
            store: The LightningStore instance.
            rollout_id: The rollout ID to fetch.

        Returns:
            Tuple of (environment, tags, session_id). All may be None if not set.
        """
        print(f"[TracingMetadata] Fetching metadata for rollout {rollout_id}")
        logger.info(f"[TracingMetadata] Fetching metadata for rollout {rollout_id}")
        try:
            loop = self._ensure_loop()
            get_rollout_task = store.get_rollout_by_id(rollout_id)
            fut = asyncio.run_coroutine_threadsafe(get_rollout_task, loop)
            rollout = fut.result(timeout=5.0)  # Short timeout for metadata fetch

            if rollout is None:
                logger.warning(f"[TracingMetadata] Rollout {rollout_id} not found in store")
                return None, None, None

            logger.info(f"[TracingMetadata] Rollout {rollout_id} found, metadata={rollout.metadata}")

            if rollout.metadata is None:
                logger.warning(f"[TracingMetadata] Rollout {rollout_id} has no metadata (None)")
                return None, None, None

            if not rollout.metadata:
                logger.warning(f"[TracingMetadata] Rollout {rollout_id} has empty metadata dict")
                return None, None, None

            environment = rollout.metadata.get("environment")
            tags = rollout.metadata.get("tags")
            session_id = rollout.metadata.get("session_id")

            logger.info(f"[TracingMetadata] Rollout {rollout_id}: environment={environment}, tags={tags}, session_id={session_id}")

            return environment, tags, session_id
        except Exception as e:
            logger.warning(f"[TracingMetadata] Failed to fetch rollout metadata for {rollout_id}: {e}")
            import traceback
            traceback.print_exc()
            return None, None, None

    def _maybe_flush(self):
        """Flush ready subtrees from the buffer.

        Strategy:
            We consider a subtree "ready" if we can identify a root span. We
            then take that root and all its descendants out of the buffer and
            try to reconstruct rollout/attempt/sequence headers by merging any
            span's `metadata.requester_custom_headers` within the subtree.

        Span types:
            - Rollout spans: Have rollout_id/attempt_id/sequence_id headers
            - Job spans: No rollout context, tagged with job_id for experiment tracking

        Direct OTLP mode:
            When `otlp_endpoint` is configured, spans are exported directly to the
            endpoint without requiring a store or header validation.

        Raises:
            None directly. Logs and skips malformed spans.

        """
        # Iterate over current roots. Each iteration pops a whole subtree.
        for root_span_id in self._get_root_span_ids():
            subtree_spans = self._pop_subtrees(root_span_id)
            if not subtree_spans:
                continue

            # Merge all custom headers found in the subtree.
            # This must happen BEFORE the direct OTLP check so both paths can apply tags.
            headers_merged: Dict[str, Any] = {}

            for span in subtree_spans:
                if span.attributes is None:
                    continue
                headers_str = span.attributes.get("metadata.requester_custom_headers")
                if headers_str is None:
                    continue
                if not isinstance(headers_str, str):
                    logger.debug(f"metadata.requester_custom_headers is not a string: {headers_str}")
                    continue
                if not headers_str.strip():
                    continue
                try:
                    # Use literal_eval to parse the stringified dict safely.
                    headers = ast.literal_eval(headers_str)
                except Exception as e:
                    logger.debug(f"Failed to parse metadata.requester_custom_headers: {e}")
                    continue
                if isinstance(headers, dict):
                    headers_merged.update(cast(Dict[str, Any], headers))

            # Extract rollout context if available
            rollout_id = headers_merged.get("x-rollout-id")
            attempt_id = headers_merged.get("x-attempt-id")
            sequence_id = headers_merged.get("x-sequence-id")

            # Determine if we're using OTLP export (either direct or store-based)
            store = self._store or get_active_llm_proxy().get_store()
            otlp_enabled = bool(self._otlp_endpoint) or (store and store.capabilities.get("otlp_traces", False))

            has_rollout_context = (
                rollout_id
                and attempt_id
                and sequence_id
                and isinstance(rollout_id, str)
                and isinstance(attempt_id, str)
                and isinstance(sequence_id, str)
                and sequence_id.isdigit()
            )

            if has_rollout_context:
                # Rollout-scoped spans: tag with rollout/attempt/sequence
                sequence_id_decimal = int(sequence_id)
                print(f"[TracingMetadata] Processing rollout {rollout_id} with {len(subtree_spans)} spans, otlp_enabled={otlp_enabled}")
                logger.info(f"[TracingMetadata] Processing rollout {rollout_id} with {len(subtree_spans)} spans, otlp_enabled={otlp_enabled}")

                # Fetch tracing metadata (environment, tags, session_id) from the rollout
                environment, tags, session_id = self._get_tracing_metadata_from_rollout(store, rollout_id)
                logger.info(f"[TracingMetadata] Fetched: environment={environment}, tags={tags}, session_id={session_id}")

                if otlp_enabled:
                    # Build resource attributes for Mantisdk metadata
                    resource_attrs: Dict[str, Any] = {
                        LightningResourceAttributes.ROLLOUT_ID.value: rollout_id,
                        LightningResourceAttributes.ATTEMPT_ID.value: attempt_id,
                        LightningResourceAttributes.SPAN_SEQUENCE_ID.value: sequence_id_decimal,
                        LightningResourceAttributes.SPAN_TYPE.value: "rollout",
                    }

                    # Build span attributes for Langfuse-expected metadata
                    # Per Langfuse docs, use langfuse.* namespace for environment and tags
                    span_extra_attrs: Dict[str, Any] = {}
                    if session_id:
                        span_extra_attrs["session.id"] = session_id
                        logger.info(f"[TracingMetadata] Setting session.id={session_id}")
                    if environment:
                        span_extra_attrs["langfuse.environment"] = environment
                        logger.info(f"[TracingMetadata] Setting langfuse.environment={environment}")

                    # Extract call type from headers (set by @gepa.judge, @gepa.agent decorators)
                    call_type = headers_merged.get("x-mantis-call-type")

                    # Build final tags list, including call_type if present
                    final_tags = list(tags) if tags else []
                    if call_type and call_type not in final_tags:
                        final_tags.append(call_type)

                    if final_tags:
                        # Insight's OTEL ingestion expects tags on the *resource* under `langfuse.trace.tags`.
                        # Span-level `langfuse.tags` is not reliably ingested into `traces.tags`.
                        resource_attrs["langfuse.trace.tags"] = final_tags
                        # Keep span-level tags too for backwards-compat/debuggability
                        span_extra_attrs["langfuse.tags"] = final_tags
                        logger.info(f"[TracingMetadata] Setting langfuse.trace.tags={final_tags}")

                    # Prepare spans for export
                    spans_to_export: List[ReadableSpan] = []
                    for span in subtree_spans:
                        # Add resource attributes
                        span._resource = span._resource.merge(  # pyright: ignore[reportPrivateUsage]
                            Resource.create(resource_attrs)
                        )
                        # Wrap with extra span attributes if we have any
                        if span_extra_attrs:
                            wrapped_span = SpanWithExtraAttributes(span, span_extra_attrs)
                            spans_to_export.append(wrapped_span)
                        else:
                            spans_to_export.append(span)

                    export_result = self._otlp_exporter.export(spans_to_export)
                    if export_result != SpanExportResult.SUCCESS:
                        logger.error(f"Failed to export rollout spans via OTLP. Result: {export_result}")
                else:
                    # The old way: store does not support OTLP endpoint
                    for span in subtree_spans:
                        loop = self._ensure_loop()
                        add_otel_span_task = store.add_otel_span(
                            rollout_id=rollout_id,
                            attempt_id=attempt_id,
                            sequence_id=sequence_id_decimal,
                            readable_span=span,
                        )
                        fut = asyncio.run_coroutine_threadsafe(add_otel_span_task, loop)
                        fut.result()

            elif otlp_enabled:
                # Job-scoped spans (no rollout context): tag with job_id for experiment tracking
                job_id = self._get_job_id_from_store(store)

                # Extract Mantis tracing metadata from x-mantis-* headers
                mantis_session_id = headers_merged.get("x-mantis-session-id")
                mantis_environment = headers_merged.get("x-mantis-environment")
                mantis_tags_str = headers_merged.get("x-mantis-tags")
                mantis_call_type = headers_merged.get("x-mantis-call-type")
                mantis_tags = None
                if mantis_tags_str:
                    try:
                        mantis_tags = ast.literal_eval(mantis_tags_str) if mantis_tags_str.startswith("[") else None
                    except Exception:
                        pass

                # Add call_type to tags if present
                if mantis_call_type:
                    if mantis_tags is None:
                        mantis_tags = []
                    if mantis_call_type not in mantis_tags:
                        mantis_tags.append(mantis_call_type)

                # Build span attributes for Mantis/Langfuse metadata
                span_extra_attrs: Dict[str, Any] = {}
                job_resource_attrs: Dict[str, Any] = {}

                if mantis_session_id:
                    span_extra_attrs["session.id"] = mantis_session_id
                    logger.info(f"[TracingMetadata] Job span: session.id={mantis_session_id}")
                if mantis_environment:
                    span_extra_attrs["langfuse.environment"] = mantis_environment
                    logger.info(f"[TracingMetadata] Job span: langfuse.environment={mantis_environment}")
                if mantis_tags:
                    # Set tags as RESOURCE attributes (required for Insight ingestion)
                    job_resource_attrs["langfuse.trace.tags"] = mantis_tags
                    # Also set as span attributes for backwards-compat
                    span_extra_attrs["langfuse.tags"] = mantis_tags
                    logger.info(f"[TracingMetadata] Job span: langfuse.trace.tags={mantis_tags}")

                # Prepare spans for export
                spans_to_export: List[ReadableSpan] = []
                for span in subtree_spans:
                    if job_id:
                        span._resource = span._resource.merge(  # pyright: ignore[reportPrivateUsage]
                            Resource.create(
                                {
                                    LightningResourceAttributes.JOB_ID.value: job_id,
                                    LightningResourceAttributes.SPAN_TYPE.value: "job",
                                }
                            )
                        )
                    # Merge job-level langfuse resource attrs (tags) if present
                    try:
                        if "job_resource_attrs" in locals() and job_resource_attrs:
                            span._resource = span._resource.merge(  # pyright: ignore[reportPrivateUsage]
                                Resource.create(job_resource_attrs)
                            )
                    except Exception:
                        pass
                    # Wrap with extra span attributes if we have any
                    if span_extra_attrs:
                        wrapped_span = SpanWithExtraAttributes(span, span_extra_attrs)
                        spans_to_export.append(wrapped_span)
                    else:
                        spans_to_export.append(span)

                export_result = self._otlp_exporter.export(spans_to_export)
                if export_result != SpanExportResult.SUCCESS:
                    logger.error(f"Failed to export job spans via OTLP. Result: {export_result}")
                else:
                    logger.debug(f"Exported {len(spans_to_export)} job-scoped spans (job_id={job_id}, has_mantis_metadata={bool(span_extra_attrs)})")
            else:
                # No OTLP and no rollout context - skip with warning
                logger.debug(
                    f"Skipping {len(subtree_spans)} spans: no rollout context and OTLP not enabled"
                )

    def _get_root_span_ids(self) -> Iterable[int]:
        """Yield span_ids for root spans currently in the buffer.

        A root span is defined as one with `parent is None`.

        Yields:
            int: Span id for each root span found.
        """
        for span in self._buffer:
            if span.parent is None:
                span_context = span.get_span_context()
                if span_context is not None:
                    yield span_context.span_id

    def _get_subtrees(self, root_span_id: int) -> Iterable[int]:
        """Yield span_ids in the subtree rooted at `root_span_id`.

        Depth-first traversal over the current buffer.

        Args:
            root_span_id: The span id of the root.

        Yields:
            int: Span ids including the root and all descendants found.
        """
        # Yield the root span id first.
        yield root_span_id
        for span in self._buffer:
            # Check whether the span's parent is the root_span_id.
            if span.parent is not None and span.parent.span_id == root_span_id:
                span_context = span.get_span_context()
                if span_context is not None:
                    # Recursively get child spans.
                    yield from self._get_subtrees(span_context.span_id)

    def _pop_subtrees(self, root_span_id: int) -> List[ReadableSpan]:
        """Remove and return the subtree for a particular root from the buffer.

        Args:
            root_span_id: Root span id identifying the subtree.

        Returns:
            list[ReadableSpan]: Spans that were part of the subtree. Order follows buffer order.
        """
        subtree_span_ids = set(self._get_subtrees(root_span_id))
        subtree_spans: List[ReadableSpan] = []
        new_buffer: List[ReadableSpan] = []
        for span in self._buffer:
            span_context = span.get_span_context()
            if span_context is not None and span_context.span_id in subtree_span_ids:
                subtree_spans.append(span)
            else:
                new_buffer.append(span)
        # Replace buffer with remaining spans to avoid re-processing.
        self._buffer = new_buffer
        return subtree_spans


class LightningOpenTelemetry(OpenTelemetry):
    """OpenTelemetry integration that exports spans to the Lightning store.

    Responsibilities:

    * Ensures each request is annotated with a per-attempt sequence id so spans
      are ordered deterministically even with clock skew across nodes.
    * Uses [`LightningSpanExporter`][mantisdk.llm_proxy.LightningSpanExporter] to persist spans for analytics and training.
    * Adds Mantisdk-specific attributes (session_id, tags, environment) to spans.

    Args:
        otlp_endpoint: Optional OTLP endpoint URL for direct trace export to external
            collectors (e.g., Langfuse/Insight). When set, spans are exported directly without
            requiring a store or rollout/attempt headers.
        otlp_headers: Optional dict of HTTP headers for OTLP authentication (e.g., Basic Auth).
    """

    def __init__(self, otlp_endpoint: Optional[str] = None, otlp_headers: Optional[Dict[str, str]] = None):
        exporter = LightningSpanExporter(otlp_endpoint=otlp_endpoint, otlp_headers=otlp_headers)
        config = OpenTelemetryConfig(exporter=exporter)

        # Check for tracer initialization
        if _check_tracer_provider():
            logger.error("Tracer is already initialized. OpenTelemetry may not work as expected.")

        super().__init__(config=config)  # pyright: ignore[reportUnknownMemberType]

        # Store exporter reference for debugging
        self._custom_exporter = exporter

    def _init_tracing(self, tracer_provider):
        """Override to ensure our span processor is added even when reusing existing TracerProvider.

        LiteLLM's parent _init_tracing reuses existing TracerProviders but doesn't add
        our custom span processor to them. We override to force adding our processor.
        """
        from opentelemetry import trace as otel_trace_api
        from opentelemetry.sdk.trace import TracerProvider as TracerProviderSDK
        from opentelemetry.trace import SpanKind

        # Call parent to set up tracer
        super()._init_tracing(tracer_provider)  # pyright: ignore[reportUnknownMemberType]

        # If an existing provider was reused, add our span processor to it
        current_provider = otel_trace_api.get_tracer_provider()
        if isinstance(current_provider, TracerProviderSDK):
            # Check if our processor is already added
            has_our_processor = False
            if hasattr(current_provider, "_active_span_processor"):
                active_processor = current_provider._active_span_processor  # pyright: ignore[reportPrivateUsage]
                if hasattr(active_processor, "_span_processors"):
                    for proc in active_processor._span_processors:  # pyright: ignore[reportPrivateUsage]
                        if hasattr(proc, "_exporter") and isinstance(proc._exporter, LightningSpanExporter):  # pyright: ignore[reportPrivateUsage]
                            has_our_processor = True
                            break

            if not has_our_processor:
                # Add our span processor
                span_processor = self._get_span_processor()
                current_provider.add_span_processor(span_processor)

        self.span_kind = SpanKind

    def _get_span_processor(self, dynamic_headers: Optional[dict] = None):
        """Override to ensure our custom exporter is used.

        LiteLLM's parent class checks if OTEL_EXPORTER has export() method and wraps it
        in SimpleSpanProcessor. We override to add logging and ensure this happens.
        """
        from opentelemetry.sdk.trace.export import SimpleSpanProcessor

        # Use our custom exporter directly
        if hasattr(self.OTEL_EXPORTER, "export"):
            processor = SimpleSpanProcessor(self.OTEL_EXPORTER)
            return processor

        # Fallback to parent implementation
        return super()._get_span_processor(dynamic_headers)  # pyright: ignore[reportUnknownMemberType]

    def set_attributes(self, span: Any, kwargs: Dict[str, Any], response_obj: Optional[Any]) -> None:
        """Override to add Mantisdk-specific attributes from metadata.

        Extracts session_id, tags, and environment from kwargs["metadata"] and sets
        them as OTEL span attributes for visibility in Insight/Langfuse.

        Also extracts extra_headers from kwargs and sets them as metadata.requester_custom_headers
        so the proxy can read them for tagging.
        """
        # Call parent implementation first
        super().set_attributes(span, kwargs, response_obj)  # pyright: ignore[reportUnknownMemberType]

        # Extract extra_headers from kwargs and set as metadata.requester_custom_headers
        # This allows the proxy to read x-mantis-* headers for tagging
        # Check multiple sources: extra_headers (from OpenAI SDK), kwargs["headers"], and context variable
        extra_headers = kwargs.get("extra_headers") or kwargs.get("extraHeaders")
        request_headers = kwargs.get("headers") or {}
        context_headers = _request_headers_context.get() or {}

        # Merge all sources: extra_headers, request_headers, and context headers
        merged_headers = {}
        if extra_headers and isinstance(extra_headers, dict):
            merged_headers.update(extra_headers)
        if request_headers and isinstance(request_headers, dict):
            # Extract x-mantis-* headers from HTTP request headers passed by LiteLLM
            for key, value in request_headers.items():
                if isinstance(key, str) and key.lower().startswith("x-mantis-"):
                    merged_headers[key] = value
        # Also check context variable (set by MantisHeadersMiddleware)
        if context_headers:
            merged_headers.update(context_headers)

        if merged_headers:
            # LiteLLM stores this as a stringified dict in metadata.requester_custom_headers
            span.set_attribute("metadata.requester_custom_headers", str(merged_headers))

        # Extract Mantisdk tracing metadata from kwargs
        metadata = kwargs.get("metadata", {}) if kwargs else {}
        if not metadata:
            return

        # Set session_id as span attribute (standard Langfuse/Insight attribute)
        session_id = metadata.get("session_id")
        if session_id:
            span.set_attribute("session.id", session_id)

        # Set tags as span attribute
        tags = metadata.get("tags")
        if tags and isinstance(tags, list):
            # Set as JSON array for Langfuse compatibility
            import json
            span.set_attribute("tags", json.dumps(tags))
            # Also set individual tag attributes for filtering
            for i, tag in enumerate(tags):
                span.set_attribute(f"tag.{i}", str(tag))

        # Set environment as span attribute
        environment = metadata.get("environment")
        if environment:
            span.set_attribute("environment", environment)

    async def async_pre_call_deployment_hook(
        self, kwargs: Dict[str, Any], call_type: Optional[CallTypes] = None
    ) -> Optional[Dict[str, Any]]:
        """The root span is sometimes missing (e.g., when Anthropic endpoint is used).
        It is created in an auth module in LiteLLM. If it's missing, we create it here.
        """
        if "metadata" not in kwargs or "litellm_parent_otel_span" not in kwargs["metadata"]:
            parent_otel_span = self.create_litellm_proxy_request_started_span(  # type: ignore
                start_time=datetime.now(),
                headers=kwargs.get("headers", {}),
            )
            updated_metadata = {**kwargs.get("metadata", {}), "litellm_parent_otel_span": parent_otel_span}

            return {**kwargs, "metadata": updated_metadata}
        else:
            return kwargs


class RolloutAttemptMiddleware(BaseHTTPMiddleware):
    """
    Rewrites /rollout/{rid}/attempt/{aid}/... -> /...
    and injects x-rollout-id, x-attempt-id, x-sequence-id headers.

    LLMProxy can update store later without rebuilding middleware.
    """

    async def dispatch(self, request: Request, call_next: Callable[[Request], Awaitable[Response]]) -> Response:
        # Decode rollout and attempt from the URL prefix. Example:
        # /rollout/r123/attempt/a456/v1/chat/completions
        # becomes
        # /v1/chat/completions
        # while adding request-scoped headers for trace attribution.
        path = request.url.path

        match = re.match(r"^/rollout/([^/]+)/attempt/([^/]+)(/.*)?$", path)
        if match:
            rollout_id = match.group(1)
            attempt_id = match.group(2)
            new_path = match.group(3) if match.group(3) is not None else "/"

            # Rewrite the ASGI scope path so downstream sees a clean OpenAI path.
            request.scope["path"] = new_path
            request.scope["raw_path"] = new_path.encode()

            store = get_active_llm_proxy().get_store()
            if store is not None:
                # Allocate a monotonic sequence id per (rollout, attempt).
                sequence_id = await store.get_next_span_sequence_id(rollout_id, attempt_id)

                # Inject headers so downstream components and exporters can retrieve them.
                request.scope["headers"] = list(request.scope["headers"]) + [
                    (b"x-rollout-id", rollout_id.encode()),
                    (b"x-attempt-id", attempt_id.encode()),
                    (b"x-sequence-id", str(sequence_id).encode()),
                ]
            else:
                logger.warning("Store is not set. Skipping sequence id allocation and header injection.")

        response = await call_next(request)
        return response
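# Illustrative sketch (not part of llm_proxy.py): one plausible way a client could address
# the rollout-scoped route this middleware rewrites. The proxy port, rollout id, attempt
# id, and model name are hypothetical placeholders.
from openai import OpenAI

_rollout_client = OpenAI(
    base_url="http://localhost:4000/rollout/r123/attempt/a456/v1",
    api_key="dummy",
)
# The request hits /rollout/r123/attempt/a456/v1/chat/completions; the middleware strips
# the prefix back to /v1/chat/completions and injects x-rollout-id, x-attempt-id, and
# x-sequence-id before LiteLLM handles the call.
_rollout_client.chat.completions.create(
    model="policy-model",
    messages=[{"role": "user", "content": "hi"}],
)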


class MantisHeadersMiddleware(BaseHTTPMiddleware):
    """Middleware to intercept x-mantis-* HTTP headers and store them in context.

    This allows LiteLLM callbacks to access custom headers (like x-mantis-call-type)
    even if LiteLLM doesn't pass them through kwargs.
    """

    async def dispatch(self, request: Request, call_next: Callable[[Request], Awaitable[Response]]) -> Response:
        # Extract x-mantis-* headers from HTTP request
        mantis_headers: Dict[str, str] = {}
        for header_name, header_value in request.headers.items():
            if isinstance(header_name, str) and header_name.lower().startswith("x-mantis-"):
                mantis_headers[header_name] = header_value

        # Store in context variable for callback access
        if mantis_headers:
            _request_headers_context.set(mantis_headers)

        response = await call_next(request)
        return response
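# Illustrative sketch (not part of llm_proxy.py): supplying x-mantis-* metadata per request
# so the middleware above can capture it. The header names match those this module reads
# (x-mantis-call-type, x-mantis-session-id); the values and endpoint are hypothetical.
from openai import OpenAI

_tagged_client = OpenAI(base_url="http://localhost:4000/v1", api_key="dummy")
_tagged_client.chat.completions.create(
    model="policy-model",
    messages=[{"role": "user", "content": "score this answer"}],
    # extra_headers rides along on the HTTP request, so the middleware stashes the
    # x-mantis-* keys in _request_headers_context for the OTEL callback to pick up.
    extra_headers={"x-mantis-call-type": "judge", "x-mantis-session-id": "sess-123"},
)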
|
|
984
|
+
|
|
985
|
+
|
|
986
|
+
class MessageInspectionMiddleware(BaseHTTPMiddleware):
    """Middleware to inspect the request and response bodies.

    It's for debugging purposes. Add it via the "message_inspection" middleware alias.
    """

    async def dispatch(self, request: Request, call_next: Callable[[Request], Awaitable[Response]]) -> Response:
        ti = time.time()
        logger.info(f"Received request with scope: {request.scope}")
        logger.info(f"Received request with body: {await request.body()}")
        response = await call_next(request)
        elapsed = time.time() - ti
        logger.info(f"Response to request took {elapsed} seconds")
        logger.info(f"Received response with status code: {response.status_code}")
        # call_next may return a streaming response without a `body` attribute, so fall back gracefully.
        logger.info(f"Received response with body: {getattr(response, 'body', b'<streaming body not buffered>')}")
        return response


class StreamConversionMiddleware(BaseHTTPMiddleware):
    """Middleware to convert streaming responses to non-streaming responses.

    Useful for backends that only support non-streaming responses.

    LiteLLM's OpenTelemetry is also buggy with streaming responses.
    The conversion will hopefully bypass the bug.
    """

    async def dispatch(self, request: Request, call_next: Callable[[Request], Awaitable[Response]]) -> Response:
        # Only process POST requests to completion endpoints
        if request.method != "POST":
            return await call_next(request)

        # Check if it's a chat completions or messages endpoint
        endpoint_format: Literal["openai", "anthropic", "unknown"] = "unknown"
        if request.url.path.endswith("/chat/completions") or "/chat/completions?" in request.url.path:
            endpoint_format = "openai"
        elif request.url.path.endswith("/messages") or "/messages?" in request.url.path:
            endpoint_format = "anthropic"
        else:
            endpoint_format = "unknown"

        if endpoint_format == "unknown":
            # Directly bypass the middleware
            return await call_next(request)

        # Read the request body
        try:
            json_body = await request.json()
        except json.JSONDecodeError:
            # The body has already been read (and cached) by request.json(), so log the raw bytes.
            logger.warning(f"Request body is not valid JSON: {await request.body()}")
            return await call_next(request)

        # Check if streaming is requested
        is_streaming = json_body.get("stream", False)

        # Simple case: no streaming requested, just return the response
        if not is_streaming:
            return await call_next(request)

        # Now the stream case
        return await self._handle_stream_case(request, json_body, endpoint_format, call_next)

    async def _handle_stream_case(
        self,
        request: Request,
        json_body: Dict[str, Any],
        endpoint_format: Literal["openai", "anthropic"],
        call_next: Callable[[Request], Awaitable[Response]],
    ) -> Response:
        # 1) Modify the request body to force stream=False
        modified_json = dict(json_body)
        modified_json["stream"] = False
        modified_body = json.dumps(modified_json).encode("utf-8")

        # 2) Build a new scope + receive that yields our modified body
        scope: Scope = dict(request.scope)
        # rewrite headers for accept/content-length
        new_headers: List[Tuple[bytes, bytes]] = []
        saw_accept = False
        for k, v in scope["headers"]:
            kl = k.lower()
            if kl == b"accept":
                saw_accept = True
                new_headers.append((k, b"application/json"))
            elif kl == b"content-length":
                # replace with new length
                continue
            else:
                new_headers.append((k, v))
        if not saw_accept:
            new_headers.append((b"accept", b"application/json"))
        new_headers.append((b"content-length", str(len(modified_body)).encode("ascii")))
        scope["headers"] = new_headers

        # Directly modify the request body
        # Creating a new request won't work because request is cached in the base class
        request._body = modified_body  # type: ignore

        response = await call_next(request)

        buffered: Optional[bytes] = None
        # 4) If OK, buffer the response body (it should be JSON because we forced stream=False)
        if 200 <= response.status_code < 300:
            try:
                if hasattr(response, "body_iterator"):
                    # Buffer body safely
                    body_chunks: List[bytes] = []
                    async for chunk in response.body_iterator:  # type: ignore
                        body_chunks.append(chunk)  # type: ignore
                    buffered = b"".join(body_chunks)
                else:
                    buffered = response.body  # type: ignore

                data = json.loads(buffered or b"{}")

                if endpoint_format == "anthropic":
                    return StreamingResponse(
                        self.anthropic_stream_generator(data),
                        media_type="text/event-stream",
                        headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"},
                    )
                else:
                    # openai format
                    return StreamingResponse(
                        self.openai_stream_generator(data),
                        media_type="text/event-stream",
                        headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"},
                    )
            except Exception as e:
                # If anything goes wrong, fall back to non-streaming JSON
                logger.exception(f"Error converting to stream; returning non-stream response: {e}")
                # Rebuild the consumed response
                return Response(
                    content=buffered if buffered is not None else b"",
                    status_code=response.status_code,
                    headers=dict(response.headers),
                    media_type=response.media_type,
                    background=response.background,
                )
        else:
            return response

    async def anthropic_stream_generator(self, original_response: Dict[str, Any]):
        """Generate Anthropic SSE-formatted chunks from complete content blocks.

        This is a dirty hack for Anthropic-style streaming from a non-streaming response.
        The SSE format is subject to change based on Anthropic's implementation.
        If so, try to use `MessageInspectionMiddleware` to inspect the update and fix accordingly.
        """
        # Anthropic format - handle multiple content blocks (text + tool_use)
        content_blocks: List[Dict[str, Any]] = original_response.get("content", [])
        message_id = original_response.get("id", f"msg_{int(time.time() * 1000)}")
        model = original_response.get("model", "claude")

        # Send message_start event
        message_start: Dict[str, Any] = {
            "type": "message_start",
            "message": {
                "id": message_id,
                "type": "message",
                "role": "assistant",
                "content": [],
                "model": model,
                "stop_reason": None,
                "stop_sequence": None,
                "usage": original_response.get("usage", {"input_tokens": 0, "output_tokens": 0}),
            },
        }
        yield f"event: message_start\ndata: {json.dumps(message_start)}\n\n"

        # Send ping to keep connection alive
        ping = {"type": "ping"}
        yield f"event: ping\ndata: {json.dumps(ping)}\n\n"

        # Process each content block
        for block_index, block in enumerate(content_blocks):
            block_type = block.get("type", "text")

            if block_type == "text":
                # Handle text block
                content = block.get("text", "")

                # Send content_block_start event
                content_block_start = {
                    "type": "content_block_start",
                    "index": block_index,
                    "content_block": {"type": "text", "text": ""},
                }
                yield f"event: content_block_start\ndata: {json.dumps(content_block_start)}\n\n"

                # Stream text content in chunks
                if content:
                    words = content.split()
                    chunk_size = 5

                    for i in range(0, len(words), chunk_size):
                        chunk_words = words[i : i + chunk_size]
                        text_chunk = " ".join(chunk_words)

                        # Add space after chunk unless it's the last one
                        if i + chunk_size < len(words):
                            text_chunk += " "

                        content_block_delta = {
                            "type": "content_block_delta",
                            "index": block_index,
                            "delta": {"type": "text_delta", "text": text_chunk},
                        }
                        yield f"event: content_block_delta\ndata: {json.dumps(content_block_delta)}\n\n"
                        await asyncio.sleep(0.02)

                # Send content_block_stop event
                content_block_stop = {"type": "content_block_stop", "index": block_index}
                yield f"event: content_block_stop\ndata: {json.dumps(content_block_stop)}\n\n"

            elif block_type == "tool_use":
                # Handle tool_use block
                tool_name = block.get("name", "")
                tool_input = block.get("input", {})
                tool_id = block.get("id", f"toolu_{int(time.time() * 1000)}")

                # Send content_block_start event for tool use
                content_block_start = {
                    "type": "content_block_start",
                    "index": block_index,
                    "content_block": {"type": "tool_use", "id": tool_id, "name": tool_name, "input": {}},
                }
                yield f"event: content_block_start\ndata: {json.dumps(content_block_start)}\n\n"

                # Stream tool input as JSON string chunks
                input_json = json.dumps(tool_input)
                chunk_size = 20  # characters per chunk for JSON

                for i in range(0, len(input_json), chunk_size):
                    json_chunk = input_json[i : i + chunk_size]

                    content_block_delta = {
                        "type": "content_block_delta",
                        "index": block_index,
                        "delta": {"type": "input_json_delta", "partial_json": json_chunk},
                    }
                    yield f"event: content_block_delta\ndata: {json.dumps(content_block_delta)}\n\n"
                    await asyncio.sleep(0.01)

                # Send content_block_stop event
                content_block_stop = {"type": "content_block_stop", "index": block_index}
                yield f"event: content_block_stop\ndata: {json.dumps(content_block_stop)}\n\n"

        # Send message_delta event with stop reason
        message_delta = {
            "type": "message_delta",
            "delta": {"stop_reason": original_response.get("stop_reason", "end_turn"), "stop_sequence": None},
            "usage": {"output_tokens": original_response.get("usage", {}).get("output_tokens", 0)},
        }
        yield f"event: message_delta\ndata: {json.dumps(message_delta)}\n\n"

        # Send message_stop event
        message_stop = {"type": "message_stop"}
        yield f"event: message_stop\ndata: {json.dumps(message_stop)}\n\n"

    async def openai_stream_generator(self, response_json: Dict[str, Any]) -> AsyncGenerator[str, Any]:
        """
        Convert a *complete* OpenAI chat.completions choice into a stream of
        OpenAI-compatible SSE chunks.

        This emits:

        - an initial delta with the role ("assistant"),
        - a sequence of deltas for message.content (split into small chunks),
        - deltas for any tool_calls (including id/name and chunked arguments),
        - a terminal chunk with finish_reason,
        - and finally the literal '[DONE]'.

        Notes:

        - We only handle a *single* choice (index 0 typically).
        - We purposefully don't attempt to stream logprobs.
        - Chunking strategy is simple and conservative to avoid splitting
          multi-byte characters: we slice on spaces where possible, then fall
          back to fixed-size substrings.
        """
        choice = cast(Dict[str, Any], (response_json.get("choices") or [{}])[0])
        model = response_json.get("model", "unknown")
        created: int = int(time.time())
        index: int = choice.get("index", 0)

        message: Dict[str, Any] = choice.get("message", {}) or {}
        role: str = message.get("role", "assistant")
        content: str = message.get("content") or ""
        tool_calls: List[Any] = message.get("tool_calls") or []
        finish_reason: Optional[str] = choice.get(
            "finish_reason"
        )  # e.g., "stop", "length", "tool_calls", "content_filter"

        def sse_chunk(obj: Dict[str, Any]) -> str:
            return f"data: {json.dumps(obj, ensure_ascii=False)}\n\n"

        # 1) initial chunk with the role
        yield sse_chunk(
            {
                "id": f"chatcmpl-{created}",
                "object": "chat.completion.chunk",
                "created": created,
                "model": model,
                "choices": [{"index": index, "delta": {"role": role}, "finish_reason": None}],
            }
        )

        # 2) stream textual content as small deltas
        async def stream_content(text: str):
            if not text:
                return
            # prefer splitting on spaces in ~20–40 char pieces
            approx = 28
            start = 0
            n = len(text)
            while start < n:
                end = min(start + approx, n)
                if end < n:
                    # try to break on a space going forward
                    space = text.rfind(" ", start, end)
                    if space > start:
                        end = space + 1
                delta_text = text[start:end]
                start = end
                if not delta_text:
                    break
                yield sse_chunk(
                    {
                        "id": f"chatcmpl-{created}",
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": model,
                        "choices": [{"index": index, "delta": {"content": delta_text}, "finish_reason": None}],
                    }
                )
                # tiny pause helps some UIs animate smoothly; keep very small
                await asyncio.sleep(0.0)

        async for piece in stream_content(content):  # type: ignore[misc]
            yield piece  # pass through the produced chunks

        # 3) stream tool_calls if present (id/name first, then arguments piecemeal)
        for tc_index, tc in enumerate(tool_calls):
            tc_type = tc.get("type", "function")
            tc_id = tc.get("id") or f"call_{created}_{tc_index}"
            fn: Dict[str, Any] = (tc.get("function") or {}) if tc_type == "function" else {}
            fn_name: str = fn.get("name", "")
            fn_args: str = fn.get("arguments", "") or ""

            # (a) delta that announces the tool call id/type/name
            yield sse_chunk(
                {
                    "id": f"chatcmpl-{created}",
                    "object": "chat.completion.chunk",
                    "created": created,
                    "model": model,
                    "choices": [
                        {
                            "index": index,
                            "delta": {
                                "tool_calls": [
                                    {"index": tc_index, "id": tc_id, "type": tc_type, "function": {"name": fn_name}}
                                ]
                            },
                            "finish_reason": None,
                        }
                    ],
                }
            )

            # (b) stream arguments in small substrings
            arg_chunk_size = 40
            for pos in range(0, len(fn_args), arg_chunk_size):
                partial = fn_args[pos : pos + arg_chunk_size]
                yield sse_chunk(
                    {
                        "id": f"chatcmpl-{created}",
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": model,
                        "choices": [
                            {
                                "index": index,
                                "delta": {"tool_calls": [{"index": tc_index, "function": {"arguments": partial}}]},
                                "finish_reason": None,
                            }
                        ],
                    }
                )
                await asyncio.sleep(0.0)

        # 4) terminal chunk with finish_reason (default to "stop" if missing)
        yield sse_chunk(
            {
                "id": f"chatcmpl-{created}",
                "object": "chat.completion.chunk",
                "created": created,
                "model": model,
                "choices": [
                    {
                        "index": index,
                        "delta": {},
                        "finish_reason": finish_reason or ("tool_calls" if tool_calls else "stop"),
                    }
                ],
            }
        )

        # 5) literal DONE sentinel
        yield "data: [DONE]\n\n"

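# Illustrative sketch (not part of the mantisdk source): replaying a complete (non-streaming)
# chat.completions payload through openai_stream_generator above. The payload and the
# app=None constructor argument are hypothetical and exist only for this example.
async def _example_openai_stream_replay() -> None:
    middleware = StreamConversionMiddleware(app=None)
    buffered_response = {
        "model": "example-model",
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": "Hello from a buffered response."},
                "finish_reason": "stop",
            }
        ],
    }
    async for sse_line in middleware.openai_stream_generator(buffered_response):
        print(sse_line, end="")  # role delta, content deltas, terminal chunk, then [DONE]
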
_MIDDLEWARE_REGISTRY: Dict[str, Type[BaseHTTPMiddleware]] = {
    "rollout_attempt": RolloutAttemptMiddleware,
    "stream_conversion": StreamConversionMiddleware,
    "message_inspection": MessageInspectionMiddleware,
    "mantis_headers": MantisHeadersMiddleware,
}


_CALLBACK_REGISTRY = {
    "return_token_ids": AddReturnTokenIds,
    "logprobs": AddLogprobs,
    "opentelemetry": LightningOpenTelemetry,
}

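# Illustrative sketch (not part of the mantisdk source): how the string aliases accepted by
# LLMProxy(middlewares=..., callbacks=...) resolve against the registries above. Passing a
# class directly is equivalent to passing its alias.
def _example_alias_resolution() -> None:
    assert _MIDDLEWARE_REGISTRY["rollout_attempt"] is RolloutAttemptMiddleware
    assert _MIDDLEWARE_REGISTRY["mantis_headers"] is MantisHeadersMiddleware
    assert _CALLBACK_REGISTRY["opentelemetry"] is LightningOpenTelemetry
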
class LLMProxy:
    """Host a LiteLLM OpenAI-compatible proxy bound to a LightningStore.

    The proxy:

    * Serves an OpenAI-compatible API via uvicorn.
    * Adds rollout/attempt routing and headers via middleware.
    * Registers OTEL export and token-id callbacks.
    * Writes a LiteLLM worker config file with `model_list` and settings.

    Lifecycle:

    * [`start()`][mantisdk.LLMProxy.start] writes the config, starts the server via the configured launcher, and waits until ready.
    * [`stop()`][mantisdk.LLMProxy.stop] tears down the server and removes the temp config file.
    * [`restart()`][mantisdk.LLMProxy.restart] is a convenience wrapper to stop then start.

    !!! note

        As the LLM Proxy sets up an OpenTelemetry tracer, it's recommended to run it in a different
        process from the main runner (i.e., tracer from agents). See `launch_mode` for how to change that.

    !!! warning

        By default (or when the "stream_conversion" middleware is enabled), the LLM Proxy will convert OpenAI and Anthropic requests with `stream=True`
        to non-streaming requests before they go through the LiteLLM proxy. This is because the OpenTelemetry tracer provided by
        LiteLLM is buggy with streaming responses. You can disable this by removing the "stream_conversion" middleware.
        In that case, you might lose some tracing information like token IDs.

    !!! danger

        Do not run the LLM proxy in the same process as the main runner. It's easy to cause conflicts in the tracer provider
        with tracers like [`AgentOpsTracer`][mantisdk.AgentOpsTracer].

    Args:
        port: TCP port to bind. Will bind to a random port if not provided.
        model_list: LiteLLM `model_list` entries.
        store: LightningStore used for span sequence and persistence.
        host: Publicly reachable host used in resource endpoints. See `host` of `launcher_args` for more details.
        litellm_config: Extra LiteLLM proxy config merged with `model_list`.
        num_retries: Default LiteLLM retry count injected into `litellm_settings`.
        num_workers: Number of workers to run in the server. Only applicable for "mp" launch mode. Ignored if launcher_args is provided.
            When `num_workers > 1`, the server will be run using [gunicorn](https://gunicorn.org/).
        launch_mode: Launch mode for the server. Defaults to "mp". Cannot be used together with launcher_args. Ignored if launcher_args is provided.
            It's recommended to use `launch_mode="mp"`, which launches the proxy server in a separate process.
            `launch_mode="thread"` can also be used with caution; it launches the server in a separate thread.
            `launch_mode="asyncio"` launches the server in the current thread as an asyncio task.
            It is NOT recommended because it often causes hanging requests. Only use it if you know what you are doing.
        launcher_args: Arguments for the server launcher. Cannot be used together with port, host, launch_mode, or num_workers;
            a ValueError is raised if both are provided.
        middlewares: List of FastAPI middleware classes or strings to register. You can specify the class aliases or classes that have been imported.
            If not provided, the default middlewares (MantisHeadersMiddleware, RolloutAttemptMiddleware, and StreamConversionMiddleware) will be used.
            Available middleware aliases are: "rollout_attempt", "stream_conversion", "message_inspection", "mantis_headers".
            Middlewares are the **first layer** of request processing. They are applied to all requests before the LiteLLM proxy.
        callbacks: List of LiteLLM callback classes or strings to register. You can specify the class aliases or classes that have been imported.
            If not provided, the default callbacks (AddReturnTokenIds and LightningOpenTelemetry) will be used.
            Available callback aliases are: "return_token_ids", "opentelemetry", "logprobs".
        otlp_endpoint: Optional OTLP endpoint URL for direct trace export to external
            collectors (e.g., Langfuse/Insight). When set, spans are exported directly without
            requiring rollout/attempt headers. Format: "http://host:port/api/public/otel/v1/traces"
        otlp_headers: Optional dict of HTTP headers for OTLP authentication.
            For Langfuse/Insight, use Basic Auth: {"Authorization": "Basic base64(publicKey:secretKey)"}
    """

    def __init__(
        self,
        port: int | None = None,
        model_list: List[ModelConfig] | None = None,
        store: Optional[LightningStore] = None,
        host: str | None = None,
        litellm_config: Dict[str, Any] | None = None,
        num_retries: int = 0,
        num_workers: int = 1,
        launch_mode: LaunchMode = "mp",
        launcher_args: PythonServerLauncherArgs | None = None,
        middlewares: Sequence[Union[Type[BaseHTTPMiddleware], str]] | None = None,
        callbacks: Sequence[Union[Type[CustomLogger], str]] | None = None,
        otlp_endpoint: Optional[str] = None,
        otlp_headers: Optional[Dict[str, str]] = None,
    ):
        self.store = store
        self._otlp_endpoint = otlp_endpoint
        self._otlp_headers = otlp_headers

        # Log OTLP configuration for diagnostics
        if otlp_endpoint:
            logger.info(f"LLMProxy initialized with OTLP endpoint: {otlp_endpoint}")
            if otlp_headers:
                logger.info(f"LLMProxy OTLP headers configured: {list(otlp_headers.keys())}")
        else:
            logger.debug("LLMProxy initialized without OTLP endpoint (will use store-based export)")

        if launcher_args is not None and (
            port is not None or host is not None or launch_mode != "mp" or num_workers != 1
        ):
            raise ValueError("port, host, launch_mode, and num_workers cannot be set when launcher_args is provided.")

        self.server_launcher_args = launcher_args or PythonServerLauncherArgs(
            port=port,
            host=host,
            launch_mode=launch_mode,
            n_workers=num_workers,
            # NOTE: This /health endpoint can be slow sometimes because it actually probes the backend LLM service.
            healthcheck_url="/health",
            startup_timeout=60.0,
        )

        if self.server_launcher_args.healthcheck_url is None:
            logger.warning("healthcheck_url is not set. LLM Proxy will not be checked for healthiness after starting.")

        self.model_list = model_list or []
        self.litellm_config = litellm_config or {}

        # Ensure num_retries is present inside the litellm_settings block.
        self.litellm_config.setdefault("litellm_settings", {})
        self.litellm_config["litellm_settings"].setdefault("num_retries", num_retries)
        self.server_launcher = PythonServerLauncher(app, self.server_launcher_args, noop_context())

        self._config_file = None

        self.middlewares: List[Type[BaseHTTPMiddleware]] = []
        if middlewares is None:
            middlewares = ["mantis_headers", "rollout_attempt", "stream_conversion"]
        for middleware in middlewares:
            if isinstance(middleware, str):
                if middleware not in _MIDDLEWARE_REGISTRY:
                    raise ValueError(
                        f"Invalid middleware alias: {middleware}. Available aliases are: {list(_MIDDLEWARE_REGISTRY.keys())}"
                    )
                middleware = _MIDDLEWARE_REGISTRY[middleware]
                self.middlewares.append(middleware)
            else:
                self.middlewares.append(middleware)

        self.callbacks: List[Type[CustomLogger]] = []
        if callbacks is None:
            callbacks = ["return_token_ids", "opentelemetry"]
        for callback in callbacks:
            if isinstance(callback, str):
                if callback not in _CALLBACK_REGISTRY:
                    raise ValueError(
                        f"Invalid callback alias: {callback}. Available aliases are: {list(_CALLBACK_REGISTRY.keys())}"
                    )
                callback = _CALLBACK_REGISTRY[callback]
                self.callbacks.append(callback)
            else:
                self.callbacks.append(callback)

    def get_store(self) -> Optional[LightningStore]:
        """Get the store used by the proxy.

        Returns:
            The store used by the proxy.
        """
        return self.store

    def set_store(self, store: LightningStore) -> None:
        """Set the store for the proxy.

        Args:
            store: The store to use for the proxy.
        """
        self.store = store

    def update_model_list(self, model_list: List[ModelConfig]) -> None:
        """Replace the in-memory model list.

        Args:
            model_list: New list of model entries.
        """
        self.model_list = model_list
        logger.info(f"Updating LLMProxy model list to: {model_list}")
        # Do nothing if the server is not running.

    def initialize(self):
        """Initialize global middleware and LiteLLM callbacks.

        Installs:

        * A FastAPI middleware that rewrites /rollout/{rid}/attempt/{aid}/... paths,
          injects rollout/attempt/sequence headers, and forwards downstream.
        * LiteLLM callbacks for token ids and OpenTelemetry export.

        The middleware can only be installed once because once the FastAPI app has started,
        the middleware cannot be changed any more.

        This function does not start any server. It only wires global hooks.
        """
        if self.store is None:
            raise ValueError("Store is not set. Please set the store before initializing the LLMProxy.")

        if _global_llm_proxy is not None:
            logger.warning("A global LLMProxy is already set. Overwriting it with the new instance.")

        # Patch for LiteLLM v1.80.6+: https://github.com/BerriAI/litellm/issues/17243
        os.environ["USE_OTEL_LITELLM_REQUEST_SPAN"] = "true"

        # Set the global LLMProxy reference for middleware/exporter access.
        set_active_llm_proxy(self)

        # Install middleware if it's not already installed.
        installation_status: Dict[Any, bool] = {}
        for mw in app.user_middleware:
            installation_status[mw.cls] = True

        for mw in self.middlewares:
            if mw not in installation_status:
                logger.info(f"Adding middleware {mw} to the FastAPI app.")
                app.add_middleware(mw)
            else:
                logger.info(f"Middleware {mw} is already installed. Will not install a new one.")

        if not initialize_llm_callbacks(self.callbacks, otlp_endpoint=self._otlp_endpoint, otlp_headers=self._otlp_headers):
            # If it's not the first time to initialize the callbacks, also
            # reset LiteLLM's logging worker so its asyncio.Queue binds to the new loop.
            _reset_litellm_logging_worker()

    @asynccontextmanager
    async def _serve_context(self) -> AsyncGenerator[None, None]:
        """Context manager to serve the proxy server.

        See [`start`][mantisdk.LLMProxy.start] and [`stop`][mantisdk.LLMProxy.stop] for more details.
        """

        if not self.store:
            raise ValueError("Store is not set. Please set the store before starting the LLMProxy.")

        # Initialize global middleware and callbacks.
        self.initialize()

        # Persist a temp worker config for LiteLLM and point the proxy at it.
        self._config_file = tempfile.NamedTemporaryFile(suffix=".yaml", delete=False).name
        with open(self._config_file, "w") as fp:
            yaml.safe_dump(
                {
                    "model_list": self.model_list,
                    **self.litellm_config,
                },
                fp,
            )

        save_worker_config(config=self._config_file)

        # NOTE: When running the _serve_context in current process, you might encounter the following problems:
        # Problem 1: in litellm worker, <Queue at 0x70f1d028cd90 maxsize=50000> is bound to a different event loop
        # Problem 2: Proxy has conflicted opentelemetry setup with the main process.

        # Ready
        logger.info("LLMProxy preparation is done. Will start the server.")
        yield

        # Clean up

        logger.info("LLMProxy server is cleaning up.")

        # Remove worker config to avoid stale references.
        if self._config_file and os.path.exists(self._config_file):
            os.unlink(self._config_file)

        logger.info("LLMProxy server finishes.")

    async def start(self):
        """Start the proxy server and initialize global wiring.

        Side effects:

        * Sets the module-level global proxy reference for middleware/exporter access.
        * Calls `initialize()` once to register middleware and callbacks.
        * Writes a temporary YAML config consumed by the LiteLLM worker.
        * Launches the server via the configured launcher and waits for readiness.
        """
        # Refresh the serve context
        self.server_launcher.serve_context = self._serve_context()

        if self.store is None:
            raise ValueError("Store is not set. Please set the store before starting the LLMProxy.")

        store_capabilities = self.store.capabilities
        if self.server_launcher.args.launch_mode == "mp" and not store_capabilities.get("zero_copy", False):
            raise RuntimeError(
                "The store does not support zero-copy. Please use another store, or use asyncio or thread mode to launch the server."
            )
        elif self.server_launcher.args.launch_mode == "thread" and not store_capabilities.get("thread_safe", False):
            raise RuntimeError(
                "The store is not thread-safe. Please use another store, or use asyncio mode to launch the server."
            )
        elif self.server_launcher.args.launch_mode == "asyncio" and not store_capabilities.get("async_safe", False):
            raise RuntimeError("The store is not async-safe. Please use another store.")

        logger.info(
            f"Starting LLMProxy server in {self.server_launcher.args.launch_mode} mode with store capabilities: {store_capabilities}"
        )

        await self.server_launcher.start()

    async def stop(self):
        """Stop the proxy server and clean up temporary artifacts.

        This is a best-effort graceful shutdown with a bounded join timeout.
        """
        if not self.is_running():
            logger.warning("LLMProxy is not running. Nothing to stop.")
            return

        await self.server_launcher.stop()

    async def restart(self, *, _port: int | None = None) -> None:
        """Restart the proxy if running, else start it.

        Convenience wrapper calling `stop()` followed by `start()`.
        """
        logger.info("Restarting LLMProxy server...")
        if self.is_running():
            await self.stop()
        if _port is not None:
            self.server_launcher_args.port = _port
        await self.start()

    def is_running(self) -> bool:
        """Return whether the uvicorn server is active.

        Returns:
            bool: True if server was started and did not signal exit.
        """
        return self.server_launcher.is_running()

    def as_resource(
        self,
        rollout_id: str | None = None,
        attempt_id: str | None = None,
        model: str | None = None,
        sampling_parameters: Dict[str, Any] | None = None,
    ) -> LLM:
        """Create an `LLM` resource pointing at this proxy with rollout context.

        The returned endpoint is:
        `http://{host}:{port}/rollout/{rollout_id}/attempt/{attempt_id}`

        Args:
            rollout_id: Rollout identifier used for span attribution. If None, will instantiate a ProxyLLM resource.
            attempt_id: Attempt identifier used for span attribution. If None, will instantiate a ProxyLLM resource.
            model: Logical model name to use. If omitted and exactly one model
                is configured or all models have the same name, that model is used.
            sampling_parameters: Optional default sampling parameters.

        Returns:
            LLM: Configured resource ready for OpenAI-compatible calls.

        Raises:
            ValueError: If `model` is omitted and zero or multiple models are configured.
        """
        if model is None:
            if len(self.model_list) == 1:
                model = self.model_list[0]["model_name"]
            elif len(self.model_list) == 0:
                raise ValueError("No models found in model_list. Please specify the model.")
            else:
                first_model_name = self.model_list[0]["model_name"]
                if all(model_config["model_name"] == first_model_name for model_config in self.model_list):
                    model = first_model_name
                else:
                    raise ValueError(
                        f"Multiple models found in model_list: {self.model_list}. Please specify the model."
                    )

        if rollout_id is None and attempt_id is None:
            return ProxyLLM(
                endpoint=self.server_launcher.access_endpoint,
                model=model,
                sampling_parameters=dict(sampling_parameters or {}),
            )
        elif rollout_id is not None and attempt_id is not None:
            return LLM(
                endpoint=f"{self.server_launcher.access_endpoint}/rollout/{rollout_id}/attempt/{attempt_id}",
                model=model,
                sampling_parameters=dict(sampling_parameters or {}),
            )
        else:
            raise ValueError("Either rollout_id and attempt_id must be provided, or neither.")

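# Illustrative sketch (not part of the mantisdk source): a typical LLMProxy lifecycle. The
# model entry, store, and rollout/attempt ids are hypothetical; the model_list follows the
# usual LiteLLM format.
async def _example_llm_proxy_usage(my_store: LightningStore) -> None:
    proxy = LLMProxy(
        model_list=[
            {
                "model_name": "my-model",
                "litellm_params": {"model": "openai/my-model", "api_base": "http://localhost:8000/v1"},
            }
        ],
        store=my_store,
    )
    await proxy.start()
    try:
        llm = proxy.as_resource(rollout_id="ro-123", attempt_id="at-1")
        # llm.endpoint looks like http://<host>:<port>/rollout/ro-123/attempt/at-1
        print(llm.endpoint, llm.model)
    finally:
        await proxy.stop()
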
_global_llm_proxy: Optional[LLMProxy] = None
_callbacks_before_litellm_start: Optional[List[Any]] = None

def get_active_llm_proxy() -> LLMProxy:
    """Get the current global LLMProxy instance.

    Returns:
        LLMProxy: The current global LLMProxy.

    Raises:
        ValueError: If no global LLMProxy has been set yet.
    """
    if _global_llm_proxy is None:
        raise ValueError("Global LLMProxy is not set. Please call llm_proxy.start() first.")
    return _global_llm_proxy

def set_active_llm_proxy(proxy: LLMProxy) -> None:
    """Set the current global LLMProxy instance.

    Args:
        proxy: The LLMProxy instance to set as global.
    """
    global _global_llm_proxy
    _global_llm_proxy = proxy

def initialize_llm_callbacks(
    callback_classes: List[Type[CustomLogger]],
    otlp_endpoint: Optional[str] = None,
    otlp_headers: Optional[Dict[str, str]] = None,
) -> bool:
    """Restore `litellm.callbacks` to a state that is just initialized by mantisdk.

    When litellm is restarted multiple times in the same process, more and more callbacks
    will be appended to `litellm.callbacks`, which may exceed the MAX_CALLBACKS limit.
    This function remembers the initial state of `litellm.callbacks` and always restores to that state.

    Args:
        callback_classes: List of callback classes to register.
        otlp_endpoint: Optional OTLP endpoint URL for direct trace export.
        otlp_headers: Optional dict of HTTP headers for OTLP authentication.

    Returns:
        Whether the callbacks are initialized for the first time.
    """
    global _callbacks_before_litellm_start

    def _instantiate_callback(cls: Type[CustomLogger]) -> CustomLogger:
        """Instantiate callback with appropriate arguments."""
        if cls is LightningOpenTelemetry:
            return LightningOpenTelemetry(otlp_endpoint=otlp_endpoint, otlp_headers=otlp_headers)
        return cls()

    if _callbacks_before_litellm_start is None:
        litellm.callbacks.extend([_instantiate_callback(cls) for cls in callback_classes])  # type: ignore
        _callbacks_before_litellm_start = [*litellm.callbacks]  # type: ignore
        return True
    else:
        # Add whatever is missing from the new callback classes to the existing callbacks.
        for cls in callback_classes:
            if not any(isinstance(cb, cls) for cb in _callbacks_before_litellm_start):
                logger.info(f"Adding missing callback {cls} to the existing callbacks.")
                _callbacks_before_litellm_start.append(_instantiate_callback(cls))

        _reset_litellm_logging_callback_manager()

        if LightningOpenTelemetry in callback_classes:
            # Check if tracer provider is malformed due to global tracer clear in tests.
            if not _check_tracer_provider():
                logger.warning(
                    "Global tracer provider might have been cleared outside. Re-initializing OpenTelemetry callback."
                )
                _callbacks_before_litellm_start = [
                    cb for cb in _callbacks_before_litellm_start if not isinstance(cb, LightningOpenTelemetry)
                ] + [LightningOpenTelemetry(otlp_endpoint=otlp_endpoint, otlp_headers=otlp_headers)]
            else:
                logger.debug("Global tracer provider is valid. Reusing existing OpenTelemetry callback.")
                # Otherwise, we just skip the check for opentelemetry and use the existing callback.

        litellm.callbacks.clear()  # type: ignore
        litellm.callbacks.extend(_callbacks_before_litellm_start)  # type: ignore
        return False

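# Illustrative sketch (not part of the mantisdk source): initialize_llm_callbacks is idempotent
# across proxy restarts. The first call registers the callbacks and returns True; later calls
# restore litellm.callbacks to that remembered state and return False.
def _example_callback_reinitialization() -> None:
    first = initialize_llm_callbacks([AddReturnTokenIds])
    second = initialize_llm_callbacks([AddReturnTokenIds])
    print(first, second)  # True False (assuming no prior initialization in this process)
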
def _check_tracer_provider() -> bool:
    """Check if the global tracer provider is properly initialized.

    We don't guarantee the tracer provider is our tracer provider.

    Returns:
        bool: True if the tracer provider is valid, else False.
    """
    if (
        hasattr(trace_api, "_TRACER_PROVIDER")
        and trace_api._TRACER_PROVIDER is not None  # pyright: ignore[reportPrivateUsage]
    ):
        return True
    return False