mantisdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mantisdk might be problematic. Click here for more details.

Files changed (190) hide show
  1. mantisdk/__init__.py +22 -0
  2. mantisdk/adapter/__init__.py +15 -0
  3. mantisdk/adapter/base.py +94 -0
  4. mantisdk/adapter/messages.py +270 -0
  5. mantisdk/adapter/triplet.py +1028 -0
  6. mantisdk/algorithm/__init__.py +39 -0
  7. mantisdk/algorithm/apo/__init__.py +5 -0
  8. mantisdk/algorithm/apo/apo.py +889 -0
  9. mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
  10. mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
  11. mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
  12. mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
  13. mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
  14. mantisdk/algorithm/base.py +162 -0
  15. mantisdk/algorithm/decorator.py +264 -0
  16. mantisdk/algorithm/fast.py +250 -0
  17. mantisdk/algorithm/gepa/__init__.py +59 -0
  18. mantisdk/algorithm/gepa/adapter.py +459 -0
  19. mantisdk/algorithm/gepa/gepa.py +364 -0
  20. mantisdk/algorithm/gepa/lib/__init__.py +18 -0
  21. mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
  22. mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
  23. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
  24. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
  25. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
  26. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
  27. mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
  28. mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
  29. mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
  30. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
  31. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
  32. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
  33. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
  34. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
  35. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
  36. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
  37. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
  38. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
  39. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
  40. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
  41. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
  42. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
  43. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
  44. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
  45. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
  46. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
  47. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
  48. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
  49. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
  50. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
  51. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
  52. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
  53. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
  54. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
  55. mantisdk/algorithm/gepa/lib/api.py +375 -0
  56. mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
  57. mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
  58. mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
  59. mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
  60. mantisdk/algorithm/gepa/lib/core/result.py +233 -0
  61. mantisdk/algorithm/gepa/lib/core/state.py +636 -0
  62. mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
  63. mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
  64. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
  65. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
  66. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
  67. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
  68. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
  69. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
  70. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
  71. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
  72. mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
  73. mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
  74. mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
  75. mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
  76. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
  77. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
  78. mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
  79. mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
  80. mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
  81. mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
  82. mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
  83. mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
  84. mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
  85. mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
  86. mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
  87. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
  88. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
  89. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
  90. mantisdk/algorithm/gepa/lib/py.typed +0 -0
  91. mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
  92. mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
  93. mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
  94. mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
  95. mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
  96. mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
  97. mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
  98. mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
  99. mantisdk/algorithm/gepa/tracing.py +105 -0
  100. mantisdk/algorithm/utils.py +177 -0
  101. mantisdk/algorithm/verl/__init__.py +5 -0
  102. mantisdk/algorithm/verl/interface.py +202 -0
  103. mantisdk/cli/__init__.py +56 -0
  104. mantisdk/cli/prometheus.py +115 -0
  105. mantisdk/cli/store.py +131 -0
  106. mantisdk/cli/vllm.py +29 -0
  107. mantisdk/client.py +408 -0
  108. mantisdk/config.py +348 -0
  109. mantisdk/emitter/__init__.py +43 -0
  110. mantisdk/emitter/annotation.py +370 -0
  111. mantisdk/emitter/exception.py +54 -0
  112. mantisdk/emitter/message.py +61 -0
  113. mantisdk/emitter/object.py +117 -0
  114. mantisdk/emitter/reward.py +320 -0
  115. mantisdk/env_var.py +156 -0
  116. mantisdk/execution/__init__.py +15 -0
  117. mantisdk/execution/base.py +64 -0
  118. mantisdk/execution/client_server.py +443 -0
  119. mantisdk/execution/events.py +69 -0
  120. mantisdk/execution/inter_process.py +16 -0
  121. mantisdk/execution/shared_memory.py +282 -0
  122. mantisdk/instrumentation/__init__.py +119 -0
  123. mantisdk/instrumentation/agentops.py +314 -0
  124. mantisdk/instrumentation/agentops_langchain.py +45 -0
  125. mantisdk/instrumentation/litellm.py +83 -0
  126. mantisdk/instrumentation/vllm.py +81 -0
  127. mantisdk/instrumentation/weave.py +500 -0
  128. mantisdk/litagent/__init__.py +11 -0
  129. mantisdk/litagent/decorator.py +536 -0
  130. mantisdk/litagent/litagent.py +252 -0
  131. mantisdk/llm_proxy.py +1890 -0
  132. mantisdk/logging.py +370 -0
  133. mantisdk/reward.py +7 -0
  134. mantisdk/runner/__init__.py +11 -0
  135. mantisdk/runner/agent.py +845 -0
  136. mantisdk/runner/base.py +182 -0
  137. mantisdk/runner/legacy.py +309 -0
  138. mantisdk/semconv.py +170 -0
  139. mantisdk/server.py +401 -0
  140. mantisdk/store/__init__.py +23 -0
  141. mantisdk/store/base.py +897 -0
  142. mantisdk/store/client_server.py +2092 -0
  143. mantisdk/store/collection/__init__.py +30 -0
  144. mantisdk/store/collection/base.py +587 -0
  145. mantisdk/store/collection/memory.py +970 -0
  146. mantisdk/store/collection/mongo.py +1412 -0
  147. mantisdk/store/collection_based.py +1823 -0
  148. mantisdk/store/insight.py +648 -0
  149. mantisdk/store/listener.py +58 -0
  150. mantisdk/store/memory.py +396 -0
  151. mantisdk/store/mongo.py +165 -0
  152. mantisdk/store/sqlite.py +3 -0
  153. mantisdk/store/threading.py +357 -0
  154. mantisdk/store/utils.py +142 -0
  155. mantisdk/tracer/__init__.py +16 -0
  156. mantisdk/tracer/agentops.py +242 -0
  157. mantisdk/tracer/base.py +287 -0
  158. mantisdk/tracer/dummy.py +106 -0
  159. mantisdk/tracer/otel.py +555 -0
  160. mantisdk/tracer/weave.py +677 -0
  161. mantisdk/trainer/__init__.py +6 -0
  162. mantisdk/trainer/init_utils.py +263 -0
  163. mantisdk/trainer/legacy.py +367 -0
  164. mantisdk/trainer/registry.py +12 -0
  165. mantisdk/trainer/trainer.py +618 -0
  166. mantisdk/types/__init__.py +6 -0
  167. mantisdk/types/core.py +553 -0
  168. mantisdk/types/resources.py +204 -0
  169. mantisdk/types/tracer.py +515 -0
  170. mantisdk/types/tracing.py +218 -0
  171. mantisdk/utils/__init__.py +1 -0
  172. mantisdk/utils/id.py +18 -0
  173. mantisdk/utils/metrics.py +1025 -0
  174. mantisdk/utils/otel.py +578 -0
  175. mantisdk/utils/otlp.py +536 -0
  176. mantisdk/utils/server_launcher.py +1045 -0
  177. mantisdk/utils/system_snapshot.py +81 -0
  178. mantisdk/verl/__init__.py +8 -0
  179. mantisdk/verl/__main__.py +6 -0
  180. mantisdk/verl/async_server.py +46 -0
  181. mantisdk/verl/config.yaml +27 -0
  182. mantisdk/verl/daemon.py +1154 -0
  183. mantisdk/verl/dataset.py +44 -0
  184. mantisdk/verl/entrypoint.py +248 -0
  185. mantisdk/verl/trainer.py +549 -0
  186. mantisdk-0.1.0.dist-info/METADATA +119 -0
  187. mantisdk-0.1.0.dist-info/RECORD +190 -0
  188. mantisdk-0.1.0.dist-info/WHEEL +4 -0
  189. mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
  190. mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
@@ -0,0 +1,555 @@
1
+ # Copyright (c) Microsoft. All rights reserved.
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ import threading
8
+ import warnings
9
+ from contextlib import asynccontextmanager, contextmanager
10
+ from typing import Any, AsyncGenerator, Awaitable, Iterator, List, Optional
11
+
12
+ import opentelemetry.trace as trace_api
13
+ from opentelemetry.instrumentation.utils import suppress_instrumentation
14
+ from opentelemetry.sdk.resources import Resource
15
+ from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
16
+ from opentelemetry.sdk.trace import TracerProvider as TracerProviderImpl
17
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor, SimpleSpanProcessor
18
+
19
+ from mantisdk.semconv import LightningResourceAttributes
20
+ from mantisdk.store.base import LightningStore
21
+ from mantisdk.types import Attributes, Span, SpanCoreFields, SpanRecordingContext, StatusCode, TraceStatus
22
+ from mantisdk.types.tracer import convert_timestamp
23
+ from mantisdk.utils.otel import get_tracer_provider
24
+ from mantisdk.utils.otlp import LightningStoreOTLPExporter
25
+
26
+ from .base import Tracer, with_active_tracer_context
27
+
28
+ # Module-level logger named after this module.
+ logger = logging.getLogger(__name__)
29
+
30
+ # Timeout, in seconds, passed by LightningSpanProcessor.on_end when it waits for a store write.
+ STORE_WRITE_TIMEOUT_SECONDS = 10.0
31
+
32
+
33
def to_otel_status_code(status_code: StatusCode) -> trace_api.StatusCode:
    """Translate a mantisdk status code into the OpenTelemetry status enum.

    Args:
        status_code: The status code to translate.

    Returns:
        `StatusCode.UNSET` for `"UNSET"`, `StatusCode.ERROR` for `"ERROR"`,
        and `StatusCode.OK` for every other value.
    """
    if status_code == "UNSET":
        return trace_api.StatusCode.UNSET
    if status_code == "ERROR":
        return trace_api.StatusCode.ERROR
    # Anything that is neither unset nor an error is reported as OK.
    return trace_api.StatusCode.OK
40
+
41
+
42
class OtelSpanRecordingContext(SpanRecordingContext):
    """Recording context that forwards every call to a wrapped OpenTelemetry span."""

    def __init__(self, span: trace_api.Span) -> None:
        self._span = span

    def record_exception(self, exception: BaseException) -> None:
        """Record the exception on the span and set the span status to ERROR."""
        self._span.record_exception(exception)
        # The exception text doubles as the status description.
        self.record_status("ERROR", str(exception))

    def record_attributes(self, attributes: Attributes) -> None:
        """Set the given attributes on the span."""
        self._span.set_attributes(attributes)

    def record_status(self, status_code: StatusCode, description: Optional[str] = None) -> None:
        """Set the span status after converting the code to its OpenTelemetry form."""
        self._span.set_status(to_otel_status_code(status_code), description)

    def get_otel_span(self) -> trace_api.Span:
        """Return the wrapped OpenTelemetry span."""
        return self._span

    def get_recorded_span(self) -> SpanCoreFields:
        """Build the core fields (name, attributes, timestamps, status) from the span.

        Raises:
            ValueError: If the wrapped span is not a `ReadableSpan`.
        """
        current = self._span
        if not isinstance(current, ReadableSpan):
            raise ValueError(f"Span is not a ReadableSpan: {self._span}")

        return SpanCoreFields(
            name=current.name,
            attributes=dict(current.attributes) if current.attributes else {},
            start_time=convert_timestamp(current.start_time),
            end_time=convert_timestamp(current.end_time),
            status=TraceStatus.from_opentelemetry(current.status),
        )
71
+
72
+
73
class OtelTracer(Tracer):
    """Tracer that provides a basic OpenTelemetry tracer provider.

    You should be able to collect mantisdk signals like rewards with this tracer,
    but no other function instrumentations like `openai.chat.completion`.
    """

    def __init__(self):
        super().__init__()
        # This provider is only initialized when the worker is initialized.
        self._tracer_provider: Optional[trace_api.TracerProvider] = None
        self._lightning_span_processor: Optional[LightningSpanProcessor] = None
        self._simple_span_processor: Optional[SimpleSpanProcessor] = None
        self._otlp_span_exporter: Optional[LightningStoreOTLPExporter] = None
        self._initialized: bool = False

    def init_worker(self, worker_id: int, store: Optional[LightningStore] = None):
        """Initialize the base tracer state, then set up the tracer provider for this worker."""
        super().init_worker(worker_id, store)
        self._initialize_tracer_provider(worker_id)

    def _initialize_tracer_provider(self, worker_id: int):
        """Create the tracer provider and register the span processors on it.

        Does nothing beyond logging when this instance has already been initialized.

        Args:
            worker_id: Worker identifier, used only in log messages.
        """
        logger.info(f"[Worker {worker_id}] Setting up OpenTelemetry tracer...")

        if self._initialized:
            logger.info(f"[Worker {worker_id}] Tracer provider is already initialized. Skipping initialization.")
            return

        try:
            # get_tracer_provider() raising RuntimeError is the expected case here:
            # it means nobody installed a provider before us.
            get_tracer_provider()
            logger.error(
                f"[Worker {worker_id}] Tracer provider is already initialized but not by OtelTracer. OpenTelemetry may not work as expected."
            )
        except RuntimeError:
            logger.debug(f"[Worker {worker_id}] Tracer provider is not initialized by OtelTracer. Initializing it now.")

        self._tracer_provider = TracerProviderImpl()
        trace_api.set_tracer_provider(self._tracer_provider)

        # Note: Call-type tagging is handled via x-mantis-call-type headers
        # passed to the LLM proxy, not via client-side span processing.
        # This avoids tagging internal SDK spans like mantisdk.annotation.

        self._lightning_span_processor = LightningSpanProcessor()
        self._tracer_provider.add_span_processor(self._lightning_span_processor)
        self._otlp_span_exporter = LightningStoreOTLPExporter()
        self._simple_span_processor = SimpleSpanProcessor(self._otlp_span_exporter)
        self._tracer_provider.add_span_processor(self._simple_span_processor)
        self._initialized = True

        logger.info(f"[Worker {worker_id}] OpenTelemetry tracer provider initialized.")

    def teardown_worker(self, worker_id: int):
        """Tear down the base tracer state; the tracer provider itself is left in place."""
        super().teardown_worker(worker_id)
        logger.info(f"[Worker {worker_id}] Tearing down OpenTelemetry tracer does NOT remove the tracer provider.")

    @with_active_tracer_context
    @asynccontextmanager
    async def trace_context(
        self,
        name: Optional[str] = None,
        *,
        store: Optional[LightningStore] = None,
        rollout_id: Optional[str] = None,
        attempt_id: Optional[str] = None,
    ) -> AsyncGenerator[trace_api.Tracer, None]:
        """
        Starts a new tracing context. This should be used as a context manager.

        Args:
            name: Optional name for the tracing context.
            store: Optional store to add the spans to. Deprecated in favor of `init_worker()`.
            rollout_id: Optional rollout ID to add the spans to.
            attempt_id: Optional attempt ID to add the spans to.

        Yields:
            The OpenTelemetry tracer instance to collect spans.

        Raises:
            RuntimeError: If `init_worker()` has not been called yet.
            ValueError: If only one of `rollout_id` / `attempt_id` is given, or both are
                given but no store is available.
        """
        if not self._lightning_span_processor:
            raise RuntimeError("LightningSpanProcessor is not initialized. Call init_worker() first.")

        if store is not None:
            warnings.warn(
                "store is deprecated in favor of init_worker(). It will be removed in the future.",
                DeprecationWarning,
                stacklevel=3,
            )
        else:
            store = self._store

        if rollout_id is not None and attempt_id is not None:
            if store is None:
                raise ValueError("store is required to be initialized when rollout_id and attempt_id are provided")
            if store.capabilities.get("otlp_traces", False) is True:
                # The store accepts OTLP traces: let the OTLP exporter deliver the spans.
                logger.debug(f"Tracing to LightningStore rollout_id={rollout_id}, attempt_id={attempt_id}")
                await self._enable_native_otlp_exporter(store, rollout_id, attempt_id)
            else:
                self._disable_native_otlp_exporter()
            ctx = self._lightning_span_processor.with_context(store=store, rollout_id=rollout_id, attempt_id=attempt_id)
            with ctx:
                yield trace_api.get_tracer(__name__, tracer_provider=self._tracer_provider)
        elif rollout_id is None and attempt_id is None:
            # No rollout to attribute spans to: spans are only collected locally.
            self._disable_native_otlp_exporter()
            with self._lightning_span_processor:
                yield trace_api.get_tracer(__name__, tracer_provider=self._tracer_provider)
        else:
            raise ValueError("rollout_id and attempt_id must be either all provided or all None")

    def create_span(
        self,
        name: str,
        attributes: Optional[Attributes] = None,
        timestamp: Optional[float] = None,
        status: Optional[TraceStatus] = None,
    ) -> SpanCoreFields:
        """Create a span that starts and ends at the same instant.

        Args:
            name: Name of the span.
            attributes: Optional attributes to attach to the span.
            timestamp: Optional time in seconds, used for both start and end. When omitted,
                OpenTelemetry picks the current time.
            status: Optional status to set on the span.

        Returns:
            The core fields of the span that was created.

        Raises:
            ValueError: If the created span is not a `ReadableSpan`.
        """
        # Fire the span to the current active tracer provider.
        tracer_provider = self._get_tracer_provider()
        tracer = tracer_provider.get_tracer(__name__)

        # Compare against None rather than truthiness so that an explicit 0.0 is honored.
        timestamp_ns = int(timestamp * 1_000_000_000) if timestamp is not None else None

        span = tracer.start_span(name, attributes=attributes, start_time=timestamp_ns)
        if status is not None:
            span.set_status(to_otel_status_code(status.status_code), status.description)
        span.end(timestamp_ns)

        # The span should have been auto-created by now.
        # Return the core fields of the span.
        if isinstance(span, ReadableSpan):
            return SpanCoreFields(
                name=name,
                attributes=dict(span.attributes) if span.attributes else {},
                start_time=convert_timestamp(span.start_time),
                end_time=convert_timestamp(span.end_time),
                status=TraceStatus.from_opentelemetry(span.status),
            )
        else:
            raise ValueError(f"Span is not a ReadableSpan: {span}")

    @contextmanager
    def operation_context(
        self,
        name: str,
        attributes: Optional[Attributes] = None,
        start_time: Optional[float] = None,
        end_time: Optional[float] = None,
    ) -> Iterator[SpanRecordingContext]:
        """Run a block of code inside a span that is active for its duration.

        Args:
            name: Name of the span.
            attributes: Optional attributes to attach to the span.
            start_time: Optional start time in seconds.
            end_time: Ignored; a warning is logged when it is provided.

        Yields:
            A recording context wrapping the active span. Exceptions raised inside the
            block are recorded on the span and then re-raised.
        """
        if end_time is not None:
            logger.warning("OpenTelemetry doesn't support customizing the end time of a span. End time is ignored.")
        # Record the span to the current active tracer provider.
        tracer_provider = self._get_tracer_provider()
        tracer = tracer_provider.get_tracer(__name__)

        # Compare against None rather than truthiness so that an explicit 0.0 is honored.
        start_time_ns = int(start_time * 1_000_000_000) if start_time is not None else None

        # Activate the span as the current span within otel.
        with tracer.start_as_current_span(name, attributes=attributes, start_time=start_time_ns) as span:
            recording_context = OtelSpanRecordingContext(span)
            try:
                yield recording_context
            except Exception as exc:
                recording_context.record_exception(exc)
                raise

        # No need to retrieve the span here. It's already been sent to otel processor.

    def get_last_trace(self) -> List[Span]:
        """
        Retrieves the raw list of captured spans from the most recent trace.

        Returns:
            A list of [`Span`][mantisdk.Span] objects captured during the most recent trace.

        Raises:
            RuntimeError: If `init_worker()` has not been called yet.
        """
        if not self._lightning_span_processor:
            raise RuntimeError("LightningSpanProcessor is not initialized. Call init_worker() first.")
        return self._lightning_span_processor.spans()

    def _get_tracer_provider(self) -> TracerProviderImpl:
        """Return the SDK tracer provider, raising if it is missing or of the wrong type."""
        if self._tracer_provider is None:
            raise RuntimeError("TracerProvider is not initialized. Call init_worker() first.")
        if not isinstance(self._tracer_provider, TracerProviderImpl):
            raise TypeError(f"TracerProvider is not a opentelemetry.sdk.trace.TracerProvider: {self._tracer_provider}")
        return self._tracer_provider

    async def _enable_native_otlp_exporter(self, store: LightningStore, rollout_id: str, attempt_id: str):
        """Point the OTLP exporter at the store and stop the span processor from submitting.

        Args:
            store: Store that exposes an OTLP traces endpoint.
            rollout_id: Rollout the upcoming spans belong to.
            attempt_id: Attempt the upcoming spans belong to.

        Raises:
            RuntimeError: If no registered processor carries a `LightningStoreOTLPExporter`.
        """
        tracer_provider = self._get_tracer_provider()
        active_span_processor = tracer_provider._active_span_processor  # pyright: ignore[reportPrivateUsage]

        # Override the resources so that the server knows where the request comes from.
        tracer_provider._resource = tracer_provider._resource.merge(  # pyright: ignore[reportPrivateUsage]
            Resource.create(
                {
                    LightningResourceAttributes.ROLLOUT_ID.value: rollout_id,
                    LightningResourceAttributes.ATTEMPT_ID.value: attempt_id,
                }
            )
        )

        # Fetch rollout metadata for tracing configuration (environment, tags).
        # This is best-effort: a failure only costs us the extra metadata.
        environment = None
        tags = None
        try:
            rollout = await store.get_rollout_by_id(rollout_id)
            if rollout and rollout.metadata:
                environment = rollout.metadata.get("environment")
                tags = rollout.metadata.get("tags")
                logger.debug(f"Fetched tracing metadata for rollout {rollout_id}: environment={environment}, tags={tags}")
        except Exception as e:
            logger.warning(f"Failed to fetch rollout metadata for {rollout_id}: {e}")

        instrumented = False
        # Descriptions of the processors we saw but could not use, for the error message.
        candidates: List[str] = []
        for processor in active_span_processor._span_processors:  # pyright: ignore[reportPrivateUsage]
            if isinstance(processor, LightningSpanProcessor):
                # We don't need the LightningSpanProcessor any more.
                logger.debug("LightningSpanProcessor already present in TracerProvider, disabling it.")
                processor.disable_store_submission = True
            elif isinstance(processor, (SimpleSpanProcessor, BatchSpanProcessor)):
                # Instead, we rely on the OTLPSpanExporter to send spans to the store.
                if isinstance(processor.span_exporter, LightningStoreOTLPExporter):
                    # Get optional auth headers from store
                    otlp_headers = None
                    if hasattr(store, "get_otlp_headers"):
                        otlp_headers = store.get_otlp_headers()
                    endpoint = store.otlp_traces_endpoint()
                    processor.span_exporter.enable_store_otlp(
                        endpoint,
                        rollout_id,
                        attempt_id,
                        headers=otlp_headers,
                        environment=environment,
                        tags=tags,
                    )
                    logger.debug(f"Set LightningStoreOTLPExporter endpoint to {endpoint}")
                    instrumented = True
                else:
                    candidates.append(
                        f"{processor.__class__.__name__} with {processor.span_exporter.__class__.__name__}"
                    )
            else:
                candidates.append(f"{processor.__class__.__name__}")

        if not instrumented:
            raise RuntimeError(
                "Failed to enable native OTLP exporter: no BatchSpanProcessor or SimpleSpanProcessor with "
                "LightningStoreOTLPExporter found in TracerProvider. Please try using a non-OTLP store. "
                "Candidates are: " + ", ".join(candidates)
            )

    def _disable_native_otlp_exporter(self):
        """Clear the rollout/attempt resource attributes and re-enable store submission."""
        tracer_provider = self._get_tracer_provider()
        active_span_processor = tracer_provider._active_span_processor  # pyright: ignore[reportPrivateUsage]
        tracer_provider._resource = tracer_provider._resource.merge(  # pyright: ignore[reportPrivateUsage]
            Resource.create(
                {
                    LightningResourceAttributes.ROLLOUT_ID.value: "",
                    LightningResourceAttributes.ATTEMPT_ID.value: "",
                }
            )
        )  # reset resource
        for processor in active_span_processor._span_processors:  # pyright: ignore[reportPrivateUsage]
            if isinstance(processor, LightningSpanProcessor):
                # We will be in need of the LightningSpanProcessor again.
                logger.debug("Enabling LightningSpanProcessor in TracerProvider.")
                processor.disable_store_submission = False
331
+
332
+
333
class LightningSpanProcessor(SpanProcessor):
    """Span processor that subclasses OpenTelemetry's `SpanProcessor` and adds support to dump traces
    to a [`LightningStore`][mantisdk.LightningStore].

    It serves two purposes:

    1. Records all the spans in a local buffer.
    2. Submits the spans to the event loop to be added to the store.
    """

    def __init__(self, disable_store_submission: bool = False):
        self._disable_store_submission: bool = disable_store_submission
        self._spans: List[Span] = []
        # Reset whenever a tracing context is entered; declared here so it always exists.
        self._last_trace: Optional[Any] = None

        # Store related context and states
        self._store: Optional[LightningStore] = None
        self._rollout_id: Optional[str] = None
        self._attempt_id: Optional[str] = None
        self._local_sequence_id: int = 0
        self._lock = threading.Lock()

        # private asyncio loop running in a daemon thread
        self._loop_ready = threading.Event()
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._loop_thread: Optional[threading.Thread] = None
        self._loop_init_lock = threading.Lock()

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}("
            + f"disable_store_submission={self.disable_store_submission}, "
            + f"store={self.store!r}, "
            + f"rollout_id={self.rollout_id!r}, "
            + f"attempt_id={self.attempt_id!r})"
        )

    @property
    def store(self) -> Optional[LightningStore]:
        """The store to submit the spans to."""
        return self._store

    @property
    def rollout_id(self) -> Optional[str]:
        """The rollout ID to submit the spans to."""
        return self._rollout_id

    @property
    def attempt_id(self) -> Optional[str]:
        """The attempt ID to submit the spans to."""
        return self._attempt_id

    @property
    def disable_store_submission(self) -> bool:
        """Whether to disable submitting spans to the store."""
        return self._disable_store_submission

    @disable_store_submission.setter
    def disable_store_submission(self, value: bool) -> None:
        self._disable_store_submission = value

    def _ensure_loop(self) -> None:
        """Start the background event-loop thread if it is not running yet.

        Raises:
            RuntimeError: If the thread does not signal readiness within 30 seconds.
        """
        # Fast path: loop already initialized
        if self._loop_thread is not None and self._loop is not None:
            return

        with self._loop_init_lock:
            # Double-check after acquiring lock
            if self._loop_thread is not None and self._loop is not None:
                return
            self._loop_ready.clear()
            self._loop_thread = threading.Thread(target=self._loop_runner, name="otel-loop", daemon=True)
            self._loop_thread.start()
            if not self._loop_ready.wait(timeout=30.0):
                raise RuntimeError("Timed out waiting for otel-loop thread to start")

    def _loop_runner(self):
        """Thread target: create an event loop, publish it, and run it until stopped."""
        loop = asyncio.new_event_loop()
        self._loop = loop
        asyncio.set_event_loop(loop)
        self._loop_ready.set()
        try:
            loop.run_forever()
        finally:
            # Close the loop even if run_forever() exits with an exception.
            loop.close()

    def __enter__(self):
        # Entering without a store context: start from an empty span buffer.
        self._last_trace = None
        self._spans = []
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any):
        self._store = None
        self._rollout_id = None
        self._attempt_id = None

    def _await_in_loop(self, coro: Awaitable[Any], timeout: Optional[float] = None) -> Any:
        """Run a coroutine on the background loop and wait for its result.

        Args:
            coro: The awaitable to run.
            timeout: Maximum number of seconds to wait; `None` waits indefinitely.

        Returns:
            The coroutine's result, or `None` when called from the loop thread itself
            (the coroutine is then only scheduled, not awaited).

        Raises:
            RuntimeError: If the loop is unexpectedly missing.
            Exception: Whatever the coroutine raises, or a timeout error from the future.
        """
        # submit to the dedicated loop and wait synchronously
        self._ensure_loop()
        if self._loop is None:
            raise RuntimeError("Loop is not initialized. This should not happen.")

        # If already on the exporter loop thread, schedule and return immediately.
        # ---------------------------------------------------------------------------
        # WHY THIS CONDITIONAL EXISTS:
        # In rare cases, span.end() is triggered from a LangchainCallbackHandler.__del__
        # (or another finalizer) while the Python garbage collector is running on the
        # *same thread* that owns our exporter event loop ("otel-loop").
        #
        # When that happens, on_end() executes on the exporter loop thread itself.
        # If we were to call `asyncio.run_coroutine_threadsafe(...).result()` here,
        # it would deadlock immediately — because the loop cannot both wait on and run
        # the same coroutine. The Future stays pending forever and the loop stops
        # processing scheduled callbacks.
        #
        # To avoid that self-deadlock, we detect when on_end() runs on the exporter
        # loop thread. If so, we *schedule* the coroutine on the loop (fire-and-forget)
        # instead of blocking with .result().
        #
        # This situation can occur because Python calls __del__ in whatever thread
        # releases the last reference, which can easily be our loop thread if the
        # object is dereferenced during loop._run_once().
        # ---------------------------------------------------------------------------
        if threading.current_thread() is self._loop_thread:
            self._loop.call_soon_threadsafe(asyncio.create_task, coro)  # type: ignore
            return None

        fut = asyncio.run_coroutine_threadsafe(coro, self._loop)  # type: ignore
        return fut.result(timeout=timeout)  # raises on error  # type: ignore

    def shutdown(self) -> None:
        """Ask the background loop to stop and wait up to 5 seconds for its thread."""
        if self._loop:
            self._loop.call_soon_threadsafe(self._loop.stop)
            self._loop = None
        if self._loop_thread:
            self._loop_thread.join(timeout=5)

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Report success immediately; this method performs no flushing work."""
        return True

    def spans(self) -> List[Span]:
        """
        Get the list of spans collected by this processor.
        This is useful for debugging and testing purposes.

        Returns:
            List of [`Span`][mantisdk.Span] objects collected during tracing.
        """
        return self._spans

    def with_context(self, store: LightningStore, rollout_id: str, attempt_id: str):
        """Return a context manager that binds this processor to a store, rollout and attempt.

        On entry the span buffer is reset and the binding is installed; on exit the
        binding is cleared. Entering yields this processor.
        """

        # simple context manager without nesting into asyncio
        class _Ctx:
            def __enter__(_):  # type: ignore
                # Use _ instead of self to avoid shadowing the instance method.
                with self._lock:
                    self._store, self._rollout_id, self._attempt_id = store, rollout_id, attempt_id
                    self._last_trace = None
                    self._spans = []
                return self

            def __exit__(_, exc_type, exc, tb):  # type: ignore
                with self._lock:
                    self._store = self._rollout_id = self._attempt_id = None

        return _Ctx()

    def _append_local_copy(self, span: ReadableSpan, rollout_id: str, attempt_id: str) -> None:
        """Buffer a locally converted copy of a span whose store write did not complete."""
        # NOTE(review): the local sequence id is read but not advanced here, matching the
        # previous behavior of the error paths — confirm whether it should be incremented.
        self._spans.append(
            Span.from_opentelemetry(
                span,
                rollout_id=rollout_id,
                attempt_id=attempt_id,
                sequence_id=self._local_sequence_id,
            )
        )

    def on_end(self, span: ReadableSpan) -> None:
        """
        Process a span when it ends.

        When store submission is enabled and a store, rollout ID and attempt ID are bound,
        the span is written to the store and the stored span is buffered. Otherwise, or
        when the write fails or times out, a locally converted copy is buffered instead.

        Args:
            span: The span that has ended.
        """
        # concurrent.futures.TimeoutError is only an alias of the builtin TimeoutError
        # from Python 3.11 on; on older versions it must be caught explicitly.
        from concurrent.futures import TimeoutError as FutureTimeoutError

        # Skip if span is not sampled
        if not span.context or not span.context.trace_flags.sampled:
            return

        # Take one snapshot of the binding so the check and the use see the same values.
        store, rollout_id, attempt_id = self._store, self._rollout_id, self._attempt_id

        if self._disable_store_submission or not (store and rollout_id and attempt_id):
            # Fallback path
            created_span = Span.from_opentelemetry(
                span,
                rollout_id=rollout_id or "rollout-dummy",
                attempt_id=attempt_id or "attempt-dummy",
                sequence_id=self._local_sequence_id,
            )
            self._local_sequence_id += 1
            self._spans.append(created_span)
            return

        try:
            # Submit add_otel_span to the event loop and wait for it to complete
            with suppress_instrumentation():
                self._ensure_loop()
                uploaded_span = self._await_in_loop(
                    store.add_otel_span(rollout_id, attempt_id, span),
                    timeout=STORE_WRITE_TIMEOUT_SECONDS,
                )
            if uploaded_span is not None:
                self._spans.append(uploaded_span)
        except (TimeoutError, FutureTimeoutError):
            logger.warning(
                "Timed out adding span %s to store after %.1f seconds. The span will be stored locally "
                "but it's not guaranteed to be persisted.",
                span.name,
                STORE_WRITE_TIMEOUT_SECONDS,
            )
            self._append_local_copy(span, rollout_id, attempt_id)
        except Exception:
            # log; on_end MUST NOT raise
            logger.exception(f"Error adding span to store: {span.name}. The span will be store locally only.")
            self._append_local_copy(span, rollout_id, attempt_id)