mantisdk-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mantisdk has been flagged as potentially problematic.
- mantisdk/__init__.py +22 -0
- mantisdk/adapter/__init__.py +15 -0
- mantisdk/adapter/base.py +94 -0
- mantisdk/adapter/messages.py +270 -0
- mantisdk/adapter/triplet.py +1028 -0
- mantisdk/algorithm/__init__.py +39 -0
- mantisdk/algorithm/apo/__init__.py +5 -0
- mantisdk/algorithm/apo/apo.py +889 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
- mantisdk/algorithm/base.py +162 -0
- mantisdk/algorithm/decorator.py +264 -0
- mantisdk/algorithm/fast.py +250 -0
- mantisdk/algorithm/gepa/__init__.py +59 -0
- mantisdk/algorithm/gepa/adapter.py +459 -0
- mantisdk/algorithm/gepa/gepa.py +364 -0
- mantisdk/algorithm/gepa/lib/__init__.py +18 -0
- mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
- mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
- mantisdk/algorithm/gepa/lib/api.py +375 -0
- mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
- mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
- mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
- mantisdk/algorithm/gepa/lib/core/result.py +233 -0
- mantisdk/algorithm/gepa/lib/core/state.py +636 -0
- mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
- mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
- mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
- mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
- mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
- mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
- mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
- mantisdk/algorithm/gepa/lib/py.typed +0 -0
- mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
- mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
- mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
- mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
- mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
- mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
- mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
- mantisdk/algorithm/gepa/tracing.py +105 -0
- mantisdk/algorithm/utils.py +177 -0
- mantisdk/algorithm/verl/__init__.py +5 -0
- mantisdk/algorithm/verl/interface.py +202 -0
- mantisdk/cli/__init__.py +56 -0
- mantisdk/cli/prometheus.py +115 -0
- mantisdk/cli/store.py +131 -0
- mantisdk/cli/vllm.py +29 -0
- mantisdk/client.py +408 -0
- mantisdk/config.py +348 -0
- mantisdk/emitter/__init__.py +43 -0
- mantisdk/emitter/annotation.py +370 -0
- mantisdk/emitter/exception.py +54 -0
- mantisdk/emitter/message.py +61 -0
- mantisdk/emitter/object.py +117 -0
- mantisdk/emitter/reward.py +320 -0
- mantisdk/env_var.py +156 -0
- mantisdk/execution/__init__.py +15 -0
- mantisdk/execution/base.py +64 -0
- mantisdk/execution/client_server.py +443 -0
- mantisdk/execution/events.py +69 -0
- mantisdk/execution/inter_process.py +16 -0
- mantisdk/execution/shared_memory.py +282 -0
- mantisdk/instrumentation/__init__.py +119 -0
- mantisdk/instrumentation/agentops.py +314 -0
- mantisdk/instrumentation/agentops_langchain.py +45 -0
- mantisdk/instrumentation/litellm.py +83 -0
- mantisdk/instrumentation/vllm.py +81 -0
- mantisdk/instrumentation/weave.py +500 -0
- mantisdk/litagent/__init__.py +11 -0
- mantisdk/litagent/decorator.py +536 -0
- mantisdk/litagent/litagent.py +252 -0
- mantisdk/llm_proxy.py +1890 -0
- mantisdk/logging.py +370 -0
- mantisdk/reward.py +7 -0
- mantisdk/runner/__init__.py +11 -0
- mantisdk/runner/agent.py +845 -0
- mantisdk/runner/base.py +182 -0
- mantisdk/runner/legacy.py +309 -0
- mantisdk/semconv.py +170 -0
- mantisdk/server.py +401 -0
- mantisdk/store/__init__.py +23 -0
- mantisdk/store/base.py +897 -0
- mantisdk/store/client_server.py +2092 -0
- mantisdk/store/collection/__init__.py +30 -0
- mantisdk/store/collection/base.py +587 -0
- mantisdk/store/collection/memory.py +970 -0
- mantisdk/store/collection/mongo.py +1412 -0
- mantisdk/store/collection_based.py +1823 -0
- mantisdk/store/insight.py +648 -0
- mantisdk/store/listener.py +58 -0
- mantisdk/store/memory.py +396 -0
- mantisdk/store/mongo.py +165 -0
- mantisdk/store/sqlite.py +3 -0
- mantisdk/store/threading.py +357 -0
- mantisdk/store/utils.py +142 -0
- mantisdk/tracer/__init__.py +16 -0
- mantisdk/tracer/agentops.py +242 -0
- mantisdk/tracer/base.py +287 -0
- mantisdk/tracer/dummy.py +106 -0
- mantisdk/tracer/otel.py +555 -0
- mantisdk/tracer/weave.py +677 -0
- mantisdk/trainer/__init__.py +6 -0
- mantisdk/trainer/init_utils.py +263 -0
- mantisdk/trainer/legacy.py +367 -0
- mantisdk/trainer/registry.py +12 -0
- mantisdk/trainer/trainer.py +618 -0
- mantisdk/types/__init__.py +6 -0
- mantisdk/types/core.py +553 -0
- mantisdk/types/resources.py +204 -0
- mantisdk/types/tracer.py +515 -0
- mantisdk/types/tracing.py +218 -0
- mantisdk/utils/__init__.py +1 -0
- mantisdk/utils/id.py +18 -0
- mantisdk/utils/metrics.py +1025 -0
- mantisdk/utils/otel.py +578 -0
- mantisdk/utils/otlp.py +536 -0
- mantisdk/utils/server_launcher.py +1045 -0
- mantisdk/utils/system_snapshot.py +81 -0
- mantisdk/verl/__init__.py +8 -0
- mantisdk/verl/__main__.py +6 -0
- mantisdk/verl/async_server.py +46 -0
- mantisdk/verl/config.yaml +27 -0
- mantisdk/verl/daemon.py +1154 -0
- mantisdk/verl/dataset.py +44 -0
- mantisdk/verl/entrypoint.py +248 -0
- mantisdk/verl/trainer.py +549 -0
- mantisdk-0.1.0.dist-info/METADATA +119 -0
- mantisdk-0.1.0.dist-info/RECORD +190 -0
- mantisdk-0.1.0.dist-info/WHEEL +4 -0
- mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
- mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
mantisdk/tracer/otel.py
ADDED
@@ -0,0 +1,555 @@
# Copyright (c) Microsoft. All rights reserved.

from __future__ import annotations

import asyncio
import logging
import threading
import warnings
from contextlib import asynccontextmanager, contextmanager
from typing import Any, AsyncGenerator, Awaitable, Iterator, List, Optional

import opentelemetry.trace as trace_api
from opentelemetry.instrumentation.utils import suppress_instrumentation
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
from opentelemetry.sdk.trace import TracerProvider as TracerProviderImpl
from opentelemetry.sdk.trace.export import BatchSpanProcessor, SimpleSpanProcessor

from mantisdk.semconv import LightningResourceAttributes
from mantisdk.store.base import LightningStore
from mantisdk.types import Attributes, Span, SpanCoreFields, SpanRecordingContext, StatusCode, TraceStatus
from mantisdk.types.tracer import convert_timestamp
from mantisdk.utils.otel import get_tracer_provider
from mantisdk.utils.otlp import LightningStoreOTLPExporter

from .base import Tracer, with_active_tracer_context

logger = logging.getLogger(__name__)

STORE_WRITE_TIMEOUT_SECONDS = 10.0


def to_otel_status_code(status_code: StatusCode) -> trace_api.StatusCode:
    if status_code == "UNSET":
        return trace_api.StatusCode.UNSET
    elif status_code == "ERROR":
        return trace_api.StatusCode.ERROR
    else:
        return trace_api.StatusCode.OK


class OtelSpanRecordingContext(SpanRecordingContext):
    def __init__(self, span: trace_api.Span) -> None:
        self._span = span

    def record_exception(self, exception: BaseException) -> None:
        self._span.record_exception(exception)
        self.record_status("ERROR", str(exception))

    def record_attributes(self, attributes: Attributes) -> None:
        self._span.set_attributes(attributes)

    def record_status(self, status_code: StatusCode, description: Optional[str] = None) -> None:
        otel_status_code = to_otel_status_code(status_code)
        self._span.set_status(otel_status_code, description)

    def get_otel_span(self) -> trace_api.Span:
        return self._span

    def get_recorded_span(self) -> SpanCoreFields:
        if isinstance(self._span, ReadableSpan):
            return SpanCoreFields(
                name=self._span.name,
                attributes=dict(self._span.attributes) if self._span.attributes else {},
                start_time=convert_timestamp(self._span.start_time),
                end_time=convert_timestamp(self._span.end_time),
                status=TraceStatus.from_opentelemetry(self._span.status),
            )
        else:
            raise ValueError(f"Span is not a ReadableSpan: {self._span}")


class OtelTracer(Tracer):
    """Tracer that provides a basic OpenTelemetry tracer provider.

    You should be able to collect mantisdk signals like rewards with this tracer,
    but no other function instrumentations like `openai.chat.completion`.
    """

    def __init__(self):
        super().__init__()
        # This provider is only initialized when the worker is initialized.
        self._tracer_provider: Optional[trace_api.TracerProvider] = None
        self._lightning_span_processor: Optional[LightningSpanProcessor] = None
        self._simple_span_processor: Optional[SimpleSpanProcessor] = None
        self._otlp_span_exporter: Optional[LightningStoreOTLPExporter] = None
        self._initialized: bool = False

    def init_worker(self, worker_id: int, store: Optional[LightningStore] = None):
        super().init_worker(worker_id, store)
        self._initialize_tracer_provider(worker_id)

    def _initialize_tracer_provider(self, worker_id: int):
        logger.info(f"[Worker {worker_id}] Setting up OpenTelemetry tracer...")

        if self._initialized:
            logger.info(f"[Worker {worker_id}] Tracer provider is already initialized. Skipping initialization.")
            return

        try:
            get_tracer_provider()
            logger.error(
                f"[Worker {worker_id}] Tracer provider is already initialized but not by OtelTracer. OpenTelemetry may not work as expected."
            )
        except RuntimeError:
            logger.debug(f"[Worker {worker_id}] Tracer provider is not initialized by OtelTracer. Initializing it now.")

        self._tracer_provider = TracerProviderImpl()
        trace_api.set_tracer_provider(self._tracer_provider)

        # Note: Call-type tagging is handled via x-mantis-call-type headers
        # passed to the LLM proxy, not via client-side span processing.
        # This avoids tagging internal SDK spans like mantisdk.annotation.

        self._lightning_span_processor = LightningSpanProcessor()
        self._tracer_provider.add_span_processor(self._lightning_span_processor)
        self._otlp_span_exporter = LightningStoreOTLPExporter()
        self._simple_span_processor = SimpleSpanProcessor(self._otlp_span_exporter)
        self._tracer_provider.add_span_processor(self._simple_span_processor)
        self._initialized = True

        logger.info(f"[Worker {worker_id}] OpenTelemetry tracer provider initialized.")

    def teardown_worker(self, worker_id: int):
        super().teardown_worker(worker_id)
        logger.info(f"[Worker {worker_id}] Tearing down OpenTelemetry tracer does NOT remove the tracer provider.")

    @with_active_tracer_context
    @asynccontextmanager
    async def trace_context(
        self,
        name: Optional[str] = None,
        *,
        store: Optional[LightningStore] = None,
        rollout_id: Optional[str] = None,
        attempt_id: Optional[str] = None,
    ) -> AsyncGenerator[trace_api.Tracer, None]:
        """
        Starts a new tracing context. This should be used as a context manager.

        Args:
            name: Optional name for the tracing context.
            store: Optional store to add the spans to.
            rollout_id: Optional rollout ID to add the spans to.
            attempt_id: Optional attempt ID to add the spans to.

        Yields:
            The OpenTelemetry tracer instance to collect spans.
        """
        if not self._lightning_span_processor:
            raise RuntimeError("LightningSpanProcessor is not initialized. Call init_worker() first.")

        if store is not None:
            warnings.warn(
                "store is deprecated in favor of init_worker(). It will be removed in the future.",
                DeprecationWarning,
                stacklevel=3,
            )
        else:
            store = self._store

        if rollout_id is not None and attempt_id is not None:
            if store is None:
                raise ValueError("store is required to be initialized when rollout_id and attempt_id are provided")
            if store.capabilities.get("otlp_traces", False) is True:
                logger.debug(f"Tracing to LightningStore rollout_id={rollout_id}, attempt_id={attempt_id}")
                await self._enable_native_otlp_exporter(store, rollout_id, attempt_id)
            else:
                self._disable_native_otlp_exporter()
            ctx = self._lightning_span_processor.with_context(store=store, rollout_id=rollout_id, attempt_id=attempt_id)
            with ctx:
                yield trace_api.get_tracer(__name__, tracer_provider=self._tracer_provider)
        elif rollout_id is None and attempt_id is None:
            self._disable_native_otlp_exporter()
            with self._lightning_span_processor:
                yield trace_api.get_tracer(__name__, tracer_provider=self._tracer_provider)
        else:
            raise ValueError("rollout_id and attempt_id must be either all provided or all None")

    def create_span(
        self,
        name: str,
        attributes: Optional[Attributes] = None,
        timestamp: Optional[float] = None,
        status: Optional[TraceStatus] = None,
    ) -> SpanCoreFields:
        # Fire the span to the current active tracer provider.
        tracer_provider = self._get_tracer_provider()
        tracer = tracer_provider.get_tracer(__name__)
        span = tracer.start_span(
            name, attributes=attributes, start_time=int(timestamp * 1_000_000_000) if timestamp else None
        )
        if status is not None:
            span.set_status(to_otel_status_code(status.status_code), status.description)
        span.end(int(timestamp * 1_000_000_000) if timestamp else None)

        # The span should have been auto-created by now.
        # Return the core fields of the span.
        if isinstance(span, ReadableSpan):
            return SpanCoreFields(
                name=name,
                attributes=dict(span.attributes) if span.attributes else {},
                start_time=convert_timestamp(span.start_time),
                end_time=convert_timestamp(span.end_time),
                status=TraceStatus.from_opentelemetry(span.status),
            )
        else:
            raise ValueError(f"Span is not a ReadableSpan: {span}")

    @contextmanager
    def operation_context(
        self,
        name: str,
        attributes: Optional[Attributes] = None,
        start_time: Optional[float] = None,
        end_time: Optional[float] = None,
    ) -> Iterator[SpanRecordingContext]:
        if end_time is not None:
            logger.warning("OpenTelemetry doesn't support customizing the end time of a span. End time is ignored.")
        # Record the span to the current active tracer provider.
        tracer_provider = self._get_tracer_provider()
        tracer = tracer_provider.get_tracer(__name__)

        # Activate the span as the current span within otel.
        with tracer.start_as_current_span(
            name, attributes=attributes, start_time=int(start_time * 1_000_000_000) if start_time else None
        ) as span:
            recording_context = OtelSpanRecordingContext(span)
            try:
                yield recording_context
            except Exception as exc:
                recording_context.record_exception(exc)
                raise

        # No need to retrieve the span here. It's already been sent to otel processor.

    def get_last_trace(self) -> List[Span]:
        """
        Retrieves the raw list of captured spans from the most recent trace.

        Returns:
            A list of [`Span`][mantisdk.Span] objects captured during the most recent trace.
        """
        if not self._lightning_span_processor:
            raise RuntimeError("LightningSpanProcessor is not initialized. Call init_worker() first.")
        return self._lightning_span_processor.spans()

    def _get_tracer_provider(self) -> TracerProviderImpl:
        if self._tracer_provider is None:
            raise RuntimeError("TracerProvider is not initialized. Call init_worker() first.")
        if not isinstance(self._tracer_provider, TracerProviderImpl):
            raise TypeError(f"TracerProvider is not an opentelemetry.sdk.trace.TracerProvider: {self._tracer_provider}")
        return self._tracer_provider

    async def _enable_native_otlp_exporter(self, store: LightningStore, rollout_id: str, attempt_id: str):
        tracer_provider = self._get_tracer_provider()
        active_span_processor = tracer_provider._active_span_processor  # pyright: ignore[reportPrivateUsage]

        # Override the resources so that the server knows where the request comes from.
        tracer_provider._resource = tracer_provider._resource.merge(  # pyright: ignore[reportPrivateUsage]
            Resource.create(
                {
                    LightningResourceAttributes.ROLLOUT_ID.value: rollout_id,
                    LightningResourceAttributes.ATTEMPT_ID.value: attempt_id,
                }
            )
        )

        # Fetch rollout metadata for tracing configuration (environment, tags)
        environment = None
        tags = None
        try:
            rollout = await store.get_rollout_by_id(rollout_id)
            if rollout and rollout.metadata:
                environment = rollout.metadata.get("environment")
                tags = rollout.metadata.get("tags")
                logger.debug(f"Fetched tracing metadata for rollout {rollout_id}: environment={environment}, tags={tags}")
        except Exception as e:
            logger.warning(f"Failed to fetch rollout metadata for {rollout_id}: {e}")

        instrumented = False
        candidates: List[str] = []
        for processor in active_span_processor._span_processors:  # pyright: ignore[reportPrivateUsage]
            if isinstance(processor, LightningSpanProcessor):
                # We don't need the LightningSpanProcessor any more.
                logger.debug("LightningSpanProcessor already present in TracerProvider, disabling it.")
                processor.disable_store_submission = True
            elif isinstance(processor, (SimpleSpanProcessor, BatchSpanProcessor)):
                # Instead, we rely on the OTLPSpanExporter to send spans to the store.
                if isinstance(processor.span_exporter, LightningStoreOTLPExporter):
                    # Get optional auth headers from store
                    otlp_headers = None
                    if hasattr(store, "get_otlp_headers"):
                        otlp_headers = store.get_otlp_headers()
                    processor.span_exporter.enable_store_otlp(
                        store.otlp_traces_endpoint(), rollout_id, attempt_id,
                        headers=otlp_headers, environment=environment, tags=tags
                    )
                    logger.debug(f"Set LightningStoreOTLPExporter endpoint to {store.otlp_traces_endpoint()}")
                    instrumented = True
                else:
                    candidates.append(
                        f"{processor.__class__.__name__} with {processor.span_exporter.__class__.__name__}"
                    )
            else:
                candidates.append(f"{processor.__class__.__name__}")

        if not instrumented:
            raise RuntimeError(
                "Failed to enable native OTLP exporter: no BatchSpanProcessor or SimpleSpanProcessor with "
                "LightningStoreOTLPExporter found in TracerProvider. Please try using a non-OTLP store. "
                "Candidates are: " + ", ".join(candidates)
            )

    def _disable_native_otlp_exporter(self):
        tracer_provider = self._get_tracer_provider()
        active_span_processor = tracer_provider._active_span_processor  # pyright: ignore[reportPrivateUsage]
        tracer_provider._resource = tracer_provider._resource.merge(  # pyright: ignore[reportPrivateUsage]
            Resource.create(
                {
                    LightningResourceAttributes.ROLLOUT_ID.value: "",
                    LightningResourceAttributes.ATTEMPT_ID.value: "",
                }
            )
        )  # reset resource
        for processor in active_span_processor._span_processors:  # pyright: ignore[reportPrivateUsage]
            if isinstance(processor, LightningSpanProcessor):
                # We will be in need of the LightningSpanProcessor again.
                logger.debug("Enabling LightningSpanProcessor in TracerProvider.")
                processor.disable_store_submission = False


class LightningSpanProcessor(SpanProcessor):
    """Span processor that subclasses OpenTelemetry's `SpanProcessor` and adds support for dumping traces
    to a [`LightningStore`][mantisdk.LightningStore].

    It serves two purposes:

    1. Records all the spans in a local buffer.
    2. Submits the spans to the event loop to be added to the store.
    """

    def __init__(self, disable_store_submission: bool = False):
        self._disable_store_submission: bool = disable_store_submission
        self._spans: List[Span] = []

        # Store related context and states
        self._store: Optional[LightningStore] = None
        self._rollout_id: Optional[str] = None
        self._attempt_id: Optional[str] = None
        self._local_sequence_id: int = 0
        self._lock = threading.Lock()

        # private asyncio loop running in a daemon thread
        self._loop_ready = threading.Event()
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._loop_thread: Optional[threading.Thread] = None
        self._loop_init_lock = threading.Lock()

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}("
            + f"disable_store_submission={self.disable_store_submission}, "
            + f"store={self.store!r}, "
            + f"rollout_id={self.rollout_id!r}, "
            + f"attempt_id={self.attempt_id!r})"
        )

    @property
    def store(self) -> Optional[LightningStore]:
        """The store to submit the spans to."""
        return self._store

    @property
    def rollout_id(self) -> Optional[str]:
        """The rollout ID to submit the spans to."""
        return self._rollout_id

    @property
    def attempt_id(self) -> Optional[str]:
        """The attempt ID to submit the spans to."""
        return self._attempt_id

    @property
    def disable_store_submission(self) -> bool:
        """Whether to disable submitting spans to the store."""
        return self._disable_store_submission

    @disable_store_submission.setter
    def disable_store_submission(self, value: bool) -> None:
        self._disable_store_submission = value

    def _ensure_loop(self) -> None:
        # Fast path: loop already initialized
        if self._loop_thread is not None and self._loop is not None:
            return

        with self._loop_init_lock:
            # Double-check after acquiring lock
            if self._loop_thread is not None and self._loop is not None:
                return
            self._loop_ready.clear()
            self._loop_thread = threading.Thread(target=self._loop_runner, name="otel-loop", daemon=True)
            self._loop_thread.start()
            if not self._loop_ready.wait(timeout=30.0):
                raise RuntimeError("Timed out waiting for otel-loop thread to start")

    def _loop_runner(self):
        loop = asyncio.new_event_loop()
        self._loop = loop
        asyncio.set_event_loop(loop)
        self._loop_ready.set()
        loop.run_forever()
        loop.close()

    def __enter__(self):
        self._last_trace = None
        self._spans = []
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any):
        self._store = None
        self._rollout_id = None
        self._attempt_id = None

    def _await_in_loop(self, coro: Awaitable[Any], timeout: Optional[float] = None) -> Any:
        # submit to the dedicated loop and wait synchronously
        self._ensure_loop()
        if self._loop is None:
            raise RuntimeError("Loop is not initialized. This should not happen.")

        # If already on the exporter loop thread, schedule and return immediately.
        # ---------------------------------------------------------------------------
        # WHY THIS CONDITIONAL EXISTS:
        # In rare cases, span.end() is triggered from a LangchainCallbackHandler.__del__
        # (or another finalizer) while the Python garbage collector is running on the
        # *same thread* that owns our exporter event loop ("otel-loop").
        #
        # When that happens, on_end() executes on the exporter loop thread itself.
        # If we were to call `asyncio.run_coroutine_threadsafe(...).result()` here,
        # it would deadlock immediately, because the loop cannot both wait on and run
        # the same coroutine. The Future stays pending forever and the loop stops
        # processing scheduled callbacks.
        #
        # To avoid that self-deadlock, we detect when on_end() runs on the exporter
        # loop thread. If so, we *schedule* the coroutine on the loop (fire-and-forget)
        # instead of blocking with .result().
        #
        # This situation can occur because Python calls __del__ in whatever thread
        # releases the last reference, which can easily be our loop thread if the
        # object is dereferenced during loop._run_once().
        # ---------------------------------------------------------------------------
        if threading.current_thread() is self._loop_thread:
            self._loop.call_soon_threadsafe(asyncio.create_task, coro)  # type: ignore
            return None

        fut = asyncio.run_coroutine_threadsafe(coro, self._loop)  # type: ignore
        return fut.result(timeout=timeout)  # raises on error  # type: ignore

    def shutdown(self) -> None:
        if self._loop:
            self._loop.call_soon_threadsafe(self._loop.stop)
            self._loop = None
        if self._loop_thread:
            self._loop_thread.join(timeout=5)

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        return True

    def spans(self) -> List[Span]:
        """
        Get the list of spans collected by this processor.
        This is useful for debugging and testing purposes.

        Returns:
            List of [`Span`][mantisdk.Span] objects collected during tracing.
        """
        return self._spans

    def with_context(self, store: LightningStore, rollout_id: str, attempt_id: str):
        # simple context manager without nesting into asyncio
        class _Ctx:
            def __enter__(_):  # type: ignore
                # Use _ instead of self to avoid shadowing the enclosing processor instance.
                with self._lock:
                    self._store, self._rollout_id, self._attempt_id = store, rollout_id, attempt_id
                    self._last_trace = None
                    self._spans = []
                return self

            def __exit__(_, exc_type, exc, tb):  # type: ignore
                with self._lock:
                    self._store = self._rollout_id = self._attempt_id = None

        return _Ctx()

    def on_end(self, span: ReadableSpan) -> None:
        """
        Process a span when it ends.

        Args:
            span: The span that has ended.
        """
        # Skip if span is not sampled
        if not span.context or not span.context.trace_flags.sampled:
            return

        if not self._disable_store_submission and self._store and self._rollout_id and self._attempt_id:
            try:
                # Submit add_otel_span to the event loop and wait for it to complete
                with suppress_instrumentation():
                    self._ensure_loop()
                    uploaded_span = self._await_in_loop(
                        self._store.add_otel_span(self._rollout_id, self._attempt_id, span),
                        timeout=STORE_WRITE_TIMEOUT_SECONDS,
                    )
                    if uploaded_span is not None:
                        self._spans.append(uploaded_span)
            except TimeoutError:
                logger.warning(
                    "Timed out adding span %s to store after %.1f seconds. The span will be stored locally "
                    "but it's not guaranteed to be persisted.",
                    span.name,
                    STORE_WRITE_TIMEOUT_SECONDS,
                )
                self._spans.append(
                    Span.from_opentelemetry(
                        span,
                        rollout_id=self._rollout_id,
                        attempt_id=self._attempt_id,
                        sequence_id=self._local_sequence_id,
                    )
                )
            except Exception:
                # log; on_end MUST NOT raise
                logger.exception(f"Error adding span to store: {span.name}. The span will be stored locally only.")
                self._spans.append(
                    Span.from_opentelemetry(
                        span,
                        rollout_id=self._rollout_id,
                        attempt_id=self._attempt_id,
                        sequence_id=self._local_sequence_id,
                    )
                )

        else:
            # Fallback path
            created_span = Span.from_opentelemetry(
                span,
                rollout_id=self._rollout_id or "rollout-dummy",
                attempt_id=self._attempt_id or "attempt-dummy",
                sequence_id=self._local_sequence_id,
            )
            self._local_sequence_id += 1
            self._spans.append(created_span)