by-framework 0.2.2.dev1__py3-none-any.whl → 0.2.2.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- by_framework/client/client.py +2 -6
- by_framework/{observability/metrics.py → metrics/__init__.py} +24 -1
- by_framework/metrics/read_client.py +257 -0
- by_framework/{observability → metrics}/snapshot.py +3 -3
- by_framework/trace/__init__.py +55 -0
- by_framework/{observability → trace}/external_trace.py +1 -1
- by_framework/{observability → trace}/span_recorder.py +116 -55
- by_framework/trace/trace_schema.py +329 -0
- by_framework/trace/trace_writer.py +157 -0
- by_framework/worker/_control_handling.py +1 -1
- by_framework/worker/context.py +208 -129
- by_framework/worker/runner.py +29 -9
- by_framework/worker/worker.py +116 -9
- {by_framework-0.2.2.dev1.dist-info → by_framework-0.2.2.dev2.dist-info}/METADATA +5 -4
- {by_framework-0.2.2.dev1.dist-info → by_framework-0.2.2.dev2.dist-info}/RECORD +17 -24
- by_framework/observability/__init__.py +0 -62
- by_framework/observability/dashboard.py +0 -1145
- by_framework/observability/frontend/index.html +0 -12
- by_framework/observability/frontend/package-lock.json +0 -1696
- by_framework/observability/frontend/package.json +0 -18
- by_framework/observability/frontend/src/main.jsx +0 -1351
- by_framework/observability/frontend/src/styles.css +0 -1214
- by_framework/observability/frontend/vite.config.js +0 -18
- by_framework/observability/static/app.js +0 -115
- by_framework/observability/static/index.html +0 -13
- by_framework/observability/static/styles.css +0 -1
- {by_framework-0.2.2.dev1.dist-info → by_framework-0.2.2.dev2.dist-info}/WHEEL +0 -0
- {by_framework-0.2.2.dev1.dist-info → by_framework-0.2.2.dev2.dist-info}/licenses/LICENSE +0 -0
by_framework/client/client.py
CHANGED
|
@@ -43,11 +43,7 @@ from by_framework.core.protocol.responses import (
|
|
|
43
43
|
)
|
|
44
44
|
from by_framework.core.registry import WorkerRegistry
|
|
45
45
|
from by_framework.errors import WorkerRegistryNotSetError
|
|
46
|
-
from by_framework.
|
|
47
|
-
SpanRecorder,
|
|
48
|
-
TraceSpan,
|
|
49
|
-
str_to_uint64,
|
|
50
|
-
)
|
|
46
|
+
from by_framework.trace.span_recorder import (SpanRecorder, TraceSpan, str_to_uint64)
|
|
51
47
|
|
|
52
48
|
if TYPE_CHECKING:
|
|
53
49
|
pass
|
|
@@ -688,7 +684,7 @@ class GatewayClient:
|
|
|
688
684
|
)
|
|
689
685
|
)
|
|
690
686
|
try:
|
|
691
|
-
from by_framework.
|
|
687
|
+
from by_framework.metrics import record_availability_metrics
|
|
692
688
|
|
|
693
689
|
record_availability_metrics(
|
|
694
690
|
agent_type=params["target_agent_type"],
|
|
@@ -4,7 +4,12 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Any, Optional
|
|
6
6
|
|
|
7
|
-
from by_framework.
|
|
7
|
+
from by_framework.metrics.read_client import (
|
|
8
|
+
MetricsDiagnostic,
|
|
9
|
+
MetricsReadClient,
|
|
10
|
+
MetricsReadResult,
|
|
11
|
+
MetricsWindow,
|
|
12
|
+
)
|
|
8
13
|
|
|
9
14
|
try:
|
|
10
15
|
from prometheus_client import REGISTRY, Counter, Histogram # type: ignore
|
|
@@ -39,6 +44,10 @@ class DummyMetric:
|
|
|
39
44
|
del value
|
|
40
45
|
|
|
41
46
|
|
|
47
|
+
def _escape_label(value: str) -> str:
|
|
48
|
+
return value.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
|
|
49
|
+
|
|
50
|
+
|
|
42
51
|
def get_registry() -> Any:
|
|
43
52
|
"""Get the active Prometheus collector registry."""
|
|
44
53
|
return REGISTRY
|
|
@@ -193,3 +202,17 @@ def build_observability_diagnostics_metrics(diagnostics: dict[str, Any]) -> str:
|
|
|
193
202
|
f'{{exporter="{_escape_label(str(exporter))}"}} {int(count)}'
|
|
194
203
|
)
|
|
195
204
|
return "\n".join(lines) + "\n"
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
__all__ = [
|
|
208
|
+
"MetricsDiagnostic",
|
|
209
|
+
"MetricsReadClient",
|
|
210
|
+
"MetricsReadResult",
|
|
211
|
+
"MetricsWindow",
|
|
212
|
+
"PROMETHEUS_AVAILABLE",
|
|
213
|
+
"build_observability_diagnostics_metrics",
|
|
214
|
+
"generate_latest_metrics",
|
|
215
|
+
"get_registry",
|
|
216
|
+
"record_availability_metrics",
|
|
217
|
+
"record_execution_metrics",
|
|
218
|
+
]
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
"""Read SDK for correlating metrics history with trace time windows."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import asdict, dataclass, field
|
|
7
|
+
from typing import Any, Optional
|
|
8
|
+
|
|
9
|
+
from by_framework.common.redis_client import Redis, get_redis
|
|
10
|
+
from by_framework.metrics.snapshot import (
|
|
11
|
+
REDIS_HISTORY_KEY,
|
|
12
|
+
build_history_point,
|
|
13
|
+
build_observability_snapshot,
|
|
14
|
+
load_history_from_redis,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(frozen=True)
class MetricsDiagnostic:
    """One immutable note attached to the result of a metrics read."""

    # Machine-readable identifier, e.g. "metrics_history_missing".
    code: str
    # Human-readable explanation.
    message: str
    # Defaults to "info"; this file also uses "warning".
    severity: str = "info"

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a dict, leaving out empty-string/None values."""
        rendered: dict[str, Any] = {}
        for key, value in asdict(self).items():
            if value in ("", None):
                continue
            rendered[key] = value
        return rendered
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True)
class MetricsWindow:
    """A time window used to correlate metrics with trace spans."""

    # Window bounds; compared against points' "generated_at" values elsewhere
    # in this module.
    start_ts: int
    end_ts: int
    # Padding applied on both sides by ``expanded()``.
    buffer_ms: int = 0

    def expanded(self) -> "MetricsWindow":
        """Return a new window widened by ``buffer_ms`` on each side.

        The start is clamped at 0 and the end is never allowed to fall below
        the original (unexpanded) start. ``buffer_ms`` is carried over
        unchanged, so calling ``expanded()`` on the result widens it again.
        """
        widened_start = self.start_ts - self.buffer_ms
        if widened_start < 0:
            widened_start = 0
        widened_end = self.end_ts + self.buffer_ms
        if widened_end < self.start_ts:
            widened_end = self.start_ts
        return MetricsWindow(
            start_ts=widened_start,
            end_ts=widened_end,
            buffer_ms=self.buffer_ms,
        )

    def to_dict(self) -> dict[str, int]:
        """Return the three window fields as a plain dict."""
        return {
            "start_ts": self.start_ts,
            "end_ts": self.end_ts,
            "buffer_ms": self.buffer_ms,
        }
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass(frozen=True)
class MetricsReadResult:
    """Metrics samples plus a compact summary for one time window."""

    # The window the read was performed for.
    window: MetricsWindow
    # History points found for the window.
    samples: list[dict[str, int]] = field(default_factory=list)
    # Aggregated view of ``samples``.
    summary: dict[str, Any] = field(default_factory=dict)
    # Notes collected while reading.
    diagnostics: list[MetricsDiagnostic] = field(default_factory=list)
    # Defaults to "ok".
    status: str = "ok"

    def to_dict(self) -> dict[str, Any]:
        """Return a dict form, delegating to ``to_dict`` on nested objects.

        ``samples`` and ``summary`` are included by reference, not copied.
        """
        rendered_diagnostics = []
        for entry in self.diagnostics:
            rendered_diagnostics.append(entry.to_dict())
        result: dict[str, Any] = {"window": self.window.to_dict()}
        result["samples"] = self.samples
        result["summary"] = self.summary
        result["diagnostics"] = rendered_diagnostics
        result["status"] = self.status
        return result
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class MetricsReadClient:
    """Read metrics snapshots and history from by-framework Redis storage.

    History is fetched with ``zrangebyscore`` on ``REDIS_HISTORY_KEY`` when
    the client exposes that method, otherwise through
    ``load_history_from_redis``. Stored points are treated defensively:
    entries that cannot be decoded are skipped and malformed numeric fields
    count as ``0`` instead of raising.
    """

    # Numeric history-point fields folded into min/max/last by ``_summarize``.
    _SUMMARY_FIELDS = (
        "workers_online",
        "active_executions",
        "queued_executions",
        "failed_executions",
        "queue_depth_total",
        "consumer_pending_total",
        "alert_count",
        "latency_p95_ms",
        "queue_latency_p95_ms",
        "total_latency_p95_ms",
    )

    # (summary field, diagnostic code, message): a warning is produced when
    # the field's maximum over the window is above zero.
    _SUMMARY_CHECKS = (
        ("queue_depth_total", "metrics_queue_backlog", "Queue depth was non-zero."),
        (
            "consumer_pending_total",
            "metrics_consumer_pending",
            "Consumer pending messages were non-zero.",
        ),
        ("alert_count", "metrics_alerts_present", "System alerts were present."),
        (
            "failed_executions",
            "metrics_failures_present",
            "Failed executions were present.",
        ),
    )

    def __init__(self, redis_client: Optional[Redis] = None) -> None:
        """Use ``redis_client`` when given, else the client from ``get_redis()``."""
        self.redis = redis_client or get_redis()

    async def get_snapshot(self) -> dict[str, Any]:
        """Return the current observability snapshot."""
        return await build_observability_snapshot(self.redis)

    async def get_history(
        self,
        *,
        start_ts: int,
        end_ts: int,
        limit: int = 120,
    ) -> list[dict[str, int]]:
        """Return history points inside the window, ordered by ``generated_at``.

        Args:
            start_ts: Inclusive lower bound of the window.
            end_ts: Inclusive upper bound; raised to ``start_ts`` if smaller.
            limit: Maximum number of points to load (at least 1 is requested).

        Returns:
            The matching points, or ``[]`` when the window ends at or before 0.
        """
        window = MetricsWindow(start_ts=start_ts, end_ts=end_ts).expanded()
        if window.end_ts <= 0:
            return []
        samples = await self._load_history_between(
            window.start_ts,
            window.end_ts,
            limit=max(1, limit),
        )
        return sorted(samples, key=self._generated_at)

    async def explain_window(
        self,
        *,
        start_ts: int,
        end_ts: int,
        buffer_ms: int = 5_000,
        limit: int = 120,
    ) -> MetricsReadResult:
        """Summarize metrics conditions around a trace/span time window.

        The window is widened by ``buffer_ms`` on both sides before history is
        read. When no history point is found, the current snapshot is used as
        a single sample if its ``generated_at`` falls inside the window.

        Returns:
            A ``MetricsReadResult`` whose ``status`` is ``"partial"`` when any
            diagnostic was produced and ``"ok"`` otherwise.
        """
        window = MetricsWindow(
            start_ts=int(start_ts or 0),
            end_ts=max(int(start_ts or 0), int(end_ts or 0)),
            buffer_ms=max(0, int(buffer_ms or 0)),
        ).expanded()
        diagnostics: list[MetricsDiagnostic] = []
        samples = await self.get_history(
            start_ts=window.start_ts,
            end_ts=window.end_ts,
            limit=limit,
        )
        if not samples:
            diagnostics.append(
                MetricsDiagnostic(
                    code="metrics_history_missing",
                    message="No metrics history points were found for this window.",
                    severity="warning",
                )
            )
            try:
                current_point = build_history_point(await self.get_snapshot())
                if self._point_in_window(current_point, window):
                    samples = [current_point]
                    diagnostics.append(
                        MetricsDiagnostic(
                            code="metrics_current_snapshot_used",
                            message="Current metrics snapshot was used as fallback.",
                        )
                    )
            # The fallback is best-effort: any failure is reported as a
            # diagnostic rather than failing the whole read.
            except Exception as err:  # pylint: disable=broad-exception-caught
                diagnostics.append(
                    MetricsDiagnostic(
                        code="metrics_snapshot_failed",
                        message=f"Current metrics snapshot fallback failed: {err}",
                        severity="warning",
                    )
                )
        summary = self._summarize(samples)
        diagnostics.extend(self._diagnose_summary(summary))
        status = "partial" if diagnostics else "ok"
        return MetricsReadResult(
            window=window,
            samples=samples,
            summary=summary,
            diagnostics=diagnostics,
            status=status,
        )

    async def _load_history_between(
        self,
        start_ts: int,
        end_ts: int,
        *,
        limit: int,
    ) -> list[dict[str, int]]:
        """Load up to ``limit`` points with ``start_ts <= generated_at <= end_ts``."""
        zrangebyscore = getattr(self.redis, "zrangebyscore", None)
        if callable(zrangebyscore):
            try:
                raw_entries = await zrangebyscore(
                    REDIS_HISTORY_KEY,
                    start_ts,
                    end_ts,
                    start=0,
                    num=limit,
                )
                return [
                    point for point in map(self._decode_point, raw_entries) if point
                ]
            except TypeError:
                # NOTE(review): presumably covers clients whose zrangebyscore
                # has a different signature or returns a non-awaitable; in that
                # case the generic loader below is used instead - confirm.
                pass
        points = await load_history_from_redis(self.redis, limit=limit)
        return [
            point
            for point in points
            if start_ts <= self._generated_at(point) <= end_ts
        ]

    @staticmethod
    def _coerce_int(value: Any) -> int:
        """Convert ``value`` to ``int``; falsy or unconvertible values give 0."""
        try:
            return int(value or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    @staticmethod
    def _generated_at(point: dict[str, int]) -> int:
        """Return the point's ``generated_at`` field as an int (0 if absent)."""
        return MetricsReadClient._coerce_int(point.get("generated_at", 0))

    @staticmethod
    def _decode_point(raw: Any) -> dict[str, int]:
        """Decode one stored entry into a dict; return ``{}`` if it is unusable.

        Accepts UTF-8 ``bytes``, a JSON ``str`` or an already-decoded dict.
        """
        if isinstance(raw, bytes):
            try:
                raw = raw.decode("utf-8")
            except UnicodeDecodeError:
                # A corrupt entry must not abort the whole history read.
                return {}
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except json.JSONDecodeError:
                return {}
        return raw if isinstance(raw, dict) else {}

    @staticmethod
    def _point_in_window(point: dict[str, int], window: MetricsWindow) -> bool:
        """Return whether the point's ``generated_at`` lies inside ``window``."""
        generated_at = MetricsReadClient._generated_at(point)
        return window.start_ts <= generated_at <= window.end_ts

    @staticmethod
    def _summarize(samples: list[dict[str, int]]) -> dict[str, Any]:
        """Return ``sample_count`` plus min/max/last for each summary field.

        ``last`` is the value from the final element of ``samples``. Missing
        or malformed values are counted as 0.
        """
        if not samples:
            return {"sample_count": 0}
        coerce = MetricsReadClient._coerce_int
        summary: dict[str, Any] = {"sample_count": len(samples)}
        for field_name in MetricsReadClient._SUMMARY_FIELDS:
            values = [coerce(sample.get(field_name, 0)) for sample in samples]
            summary[field_name] = {
                "min": min(values),
                "max": max(values),
                "last": values[-1],
            }
        return summary

    @staticmethod
    def _diagnose_summary(summary: dict[str, Any]) -> list[MetricsDiagnostic]:
        """Return a warning for each checked field whose maximum is above zero.

        An empty summary, or one with a non-positive ``sample_count``, yields
        no diagnostics.
        """
        diagnostics: list[MetricsDiagnostic] = []
        coerce = MetricsReadClient._coerce_int
        if not summary or coerce(summary.get("sample_count", 0)) <= 0:
            return diagnostics
        for field_name, code, message in MetricsReadClient._SUMMARY_CHECKS:
            stats = summary.get(field_name)
            # Tolerate summaries where a field is missing or not a stats dict.
            max_value = coerce(stats.get("max", 0)) if isinstance(stats, dict) else 0
            if max_value > 0:
                diagnostics.append(
                    MetricsDiagnostic(
                        code=code,
                        message=message,
                        severity="warning",
                    )
                )
        return diagnostics
|
|
@@ -523,12 +523,12 @@ def build_prometheus_metrics(snapshot: dict[str, Any]) -> str:
|
|
|
523
523
|
)
|
|
524
524
|
lines.append(
|
|
525
525
|
"by_framework_stream_pending_messages"
|
|
526
|
-
f"{{{labels}}} {int(group.get(
|
|
526
|
+
f"{{{labels}}} {int(group.get('pending', 0) or 0)}"
|
|
527
527
|
)
|
|
528
528
|
lag = group.get("lag")
|
|
529
529
|
if lag is not None:
|
|
530
530
|
lines.append(
|
|
531
|
-
"by_framework_stream_consumer_lag
|
|
531
|
+
f"by_framework_stream_consumer_lag{{{labels}}} {int(lag or 0)}"
|
|
532
532
|
)
|
|
533
533
|
|
|
534
534
|
lines.extend(
|
|
@@ -1424,7 +1424,7 @@ def _build_queue_alerts(
|
|
|
1424
1424
|
"code": "QUEUE_BACKLOG",
|
|
1425
1425
|
"severity": "warning",
|
|
1426
1426
|
"message": (
|
|
1427
|
-
f"{length} messages queued for agent type
|
|
1427
|
+
f"{length} messages queued for agent type {agent_type}."
|
|
1428
1428
|
),
|
|
1429
1429
|
"value": length,
|
|
1430
1430
|
"threshold": policy.queue_backlog_threshold,
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Trace helpers, context propagation, and write-side observability APIs."""
|
|
2
|
+
|
|
3
|
+
from .external_trace import (
|
|
4
|
+
ExternalTraceContext,
|
|
5
|
+
build_langfuse_trace_context,
|
|
6
|
+
build_otel_parent_context,
|
|
7
|
+
extract_external_trace_context,
|
|
8
|
+
start_langfuse_observation,
|
|
9
|
+
to_langfuse_trace_id,
|
|
10
|
+
)
|
|
11
|
+
from .span_recorder import (
|
|
12
|
+
LiveSpanHandle,
|
|
13
|
+
ObservabilityConfig,
|
|
14
|
+
SpanRecorder,
|
|
15
|
+
TraceSpan,
|
|
16
|
+
build_observability_config,
|
|
17
|
+
get_observability_diagnostics,
|
|
18
|
+
live_execution_otel_span,
|
|
19
|
+
reset_observability_diagnostics,
|
|
20
|
+
)
|
|
21
|
+
from .trace_schema import (
|
|
22
|
+
EventRecord,
|
|
23
|
+
ExecutionRecord,
|
|
24
|
+
SpanNode,
|
|
25
|
+
SpanRecord,
|
|
26
|
+
TraceDiagnostic,
|
|
27
|
+
TraceReadResult,
|
|
28
|
+
TraceRecord,
|
|
29
|
+
)
|
|
30
|
+
from .trace_writer import TraceWriteClient
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
"ExternalTraceContext",
|
|
34
|
+
"EventRecord",
|
|
35
|
+
"ExecutionRecord",
|
|
36
|
+
"LiveSpanHandle",
|
|
37
|
+
"ObservabilityConfig",
|
|
38
|
+
"SpanNode",
|
|
39
|
+
"SpanRecord",
|
|
40
|
+
"SpanRecorder",
|
|
41
|
+
"TraceDiagnostic",
|
|
42
|
+
"TraceReadResult",
|
|
43
|
+
"TraceRecord",
|
|
44
|
+
"TraceSpan",
|
|
45
|
+
"TraceWriteClient",
|
|
46
|
+
"build_langfuse_trace_context",
|
|
47
|
+
"build_observability_config",
|
|
48
|
+
"build_otel_parent_context",
|
|
49
|
+
"extract_external_trace_context",
|
|
50
|
+
"get_observability_diagnostics",
|
|
51
|
+
"live_execution_otel_span",
|
|
52
|
+
"reset_observability_diagnostics",
|
|
53
|
+
"start_langfuse_observation",
|
|
54
|
+
"to_langfuse_trace_id",
|
|
55
|
+
]
|
|
@@ -7,7 +7,7 @@ from typing import Any
|
|
|
7
7
|
|
|
8
8
|
from by_framework.core.protocol.commands import BaseCommand
|
|
9
9
|
from by_framework.core.protocol.message_header import MessageHeader
|
|
10
|
-
from by_framework.
|
|
10
|
+
from by_framework.trace.span_recorder import str_to_uint128
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
@dataclass(frozen=True)
|
|
@@ -6,6 +6,7 @@ import contextvars
|
|
|
6
6
|
import hashlib
|
|
7
7
|
import json
|
|
8
8
|
import os
|
|
9
|
+
import threading
|
|
9
10
|
import time
|
|
10
11
|
from contextlib import asynccontextmanager
|
|
11
12
|
from dataclasses import asdict, dataclass, field, replace
|
|
@@ -77,7 +78,27 @@ try:
|
|
|
77
78
|
except ImportError:
|
|
78
79
|
|
|
79
80
|
class ContextIdGenerator:  # type: ignore
    """Fallback ID generator used when OpenTelemetry is not installed."""

    def generate_trace_id(self) -> int:
        """Return the trace id held in the current context, else a random one.

        A random id is drawn from 128 bits; a zero draw is replaced by 1.
        """
        pinned = current_trace_id_var.get()
        if pinned is None:
            import secrets

            # NOTE(review): zero is presumably avoided because it is not a
            # valid trace id - confirm.
            return secrets.randbits(128) or 1
        return pinned

    def generate_span_id(self) -> int:
        """Return the span id held in the current context, else a random one.

        A random id is drawn from 64 bits; a zero draw is replaced by 1.
        """
        pinned = current_span_id_var.get()
        if pinned is None:
            import secrets

            return secrets.randbits(64) or 1
        return pinned
|
|
81
102
|
|
|
82
103
|
|
|
83
104
|
def configure_otel_id_generator() -> None:
|
|
@@ -125,6 +146,9 @@ _OBSERVABILITY_DIAGNOSTICS: dict[str, Any] = {
|
|
|
125
146
|
"export_failures_total": 0,
|
|
126
147
|
"export_failures_by_exporter": {},
|
|
127
148
|
}
|
|
149
|
+
_OBSERVABILITY_DIAGNOSTICS_LOCK = threading.Lock()
|
|
150
|
+
_LANGFUSE_PROCESSOR_PROVIDER_IDS: set[int] = set()
|
|
151
|
+
_LANGFUSE_PROCESSOR_LOCK = threading.Lock()
|
|
128
152
|
|
|
129
153
|
|
|
130
154
|
@dataclass(frozen=True)
|
|
@@ -171,38 +195,44 @@ def build_observability_config() -> ObservabilityConfig:
|
|
|
171
195
|
|
|
172
196
|
def get_observability_diagnostics() -> dict[str, Any]:
    """Return a copy of the trace exporter self-diagnostics.

    The counters are read while holding the diagnostics lock. The two
    per-key mappings are shallow-copied, so the returned dict can be changed
    without touching the module-level state.
    """
    with _OBSERVABILITY_DIAGNOSTICS_LOCK:
        state = _OBSERVABILITY_DIAGNOSTICS
        dropped_total = int(state["dropped_spans_total"])
        dropped_by_reason = dict(state["dropped_spans_by_reason"])
        failures_total = int(state["export_failures_total"])
        failures_by_exporter = dict(state["export_failures_by_exporter"])
    return {
        "dropped_spans_total": dropped_total,
        "dropped_spans_by_reason": dropped_by_reason,
        "export_failures_total": failures_total,
        "export_failures_by_exporter": failures_by_exporter,
    }
|
|
186
213
|
|
|
187
214
|
|
|
188
215
|
def reset_observability_diagnostics() -> None:
    """Reset trace exporter self-diagnostics for tests.

    Both totals go back to 0 and both per-key mappings are replaced with new
    empty dicts, all while holding the diagnostics lock.
    """
    with _OBSERVABILITY_DIAGNOSTICS_LOCK:
        _OBSERVABILITY_DIAGNOSTICS.update(
            {
                "dropped_spans_total": 0,
                "dropped_spans_by_reason": {},
                "export_failures_total": 0,
                "export_failures_by_exporter": {},
            }
        )
|
|
194
222
|
|
|
195
223
|
|
|
196
224
|
def _record_drop(reason: str) -> None:
    """Count one dropped span, both overall and under ``reason``."""
    with _OBSERVABILITY_DIAGNOSTICS_LOCK:
        _OBSERVABILITY_DIAGNOSTICS["dropped_spans_total"] += 1
        per_reason = _OBSERVABILITY_DIAGNOSTICS["dropped_spans_by_reason"]
        previous = int(per_reason.get(reason, 0))
        per_reason[reason] = previous + 1
|
|
200
229
|
|
|
201
230
|
|
|
202
231
|
def _record_export_failure(exporter_name: str) -> None:
    """Count one failed export, both overall and under ``exporter_name``."""
    with _OBSERVABILITY_DIAGNOSTICS_LOCK:
        _OBSERVABILITY_DIAGNOSTICS["export_failures_total"] += 1
        per_exporter = _OBSERVABILITY_DIAGNOSTICS["export_failures_by_exporter"]
        previous = int(per_exporter.get(exporter_name, 0))
        per_exporter[exporter_name] = previous + 1
|
|
206
236
|
|
|
207
237
|
|
|
208
238
|
def _clean_env(value: str | None) -> str:
|
|
@@ -287,6 +317,13 @@ class TraceSpan:
|
|
|
287
317
|
start_ts: int
|
|
288
318
|
end_ts: int
|
|
289
319
|
status: str
|
|
320
|
+
name: str = ""
|
|
321
|
+
kind: str = ""
|
|
322
|
+
source: str = "redis"
|
|
323
|
+
input: Any = None
|
|
324
|
+
output: Any = None
|
|
325
|
+
tokens: dict[str, Any] = field(default_factory=dict)
|
|
326
|
+
cost: dict[str, Any] = field(default_factory=dict)
|
|
290
327
|
session_id: str = ""
|
|
291
328
|
execution_id: str = ""
|
|
292
329
|
message_id: str = ""
|
|
@@ -312,17 +349,14 @@ class TraceSpan:
|
|
|
312
349
|
payload["start_ts"] = int(self.start_ts or 0)
|
|
313
350
|
payload["end_ts"] = max(payload["start_ts"], int(self.end_ts or 0))
|
|
314
351
|
payload["duration_ms"] = max(0, payload["end_ts"] - payload["start_ts"])
|
|
352
|
+
payload["name"] = self.name or self.operation
|
|
315
353
|
if payload.get("error_message"):
|
|
316
354
|
payload["error_message"] = _sanitize_value(
|
|
317
355
|
"error_message", payload["error_message"]
|
|
318
356
|
)
|
|
319
357
|
if payload.get("metadata"):
|
|
320
358
|
payload["metadata"] = _sanitize_value("metadata", payload["metadata"])
|
|
321
|
-
return {
|
|
322
|
-
key: value
|
|
323
|
-
for key, value in payload.items()
|
|
324
|
-
if value not in ("", None) and value is not False
|
|
325
|
-
}
|
|
359
|
+
return {key: value for key, value in payload.items() if value not in ("", None)}
|
|
326
360
|
|
|
327
361
|
|
|
328
362
|
@runtime_checkable
|
|
@@ -351,8 +385,17 @@ class RedisSpanExporter:
|
|
|
351
385
|
payload = span.to_payload()
|
|
352
386
|
trace_id = str(payload["trace_id"])
|
|
353
387
|
start_ts = int(payload.get("start_ts", 0) or 0)
|
|
388
|
+
end_ts = int(payload.get("end_ts", start_ts) or start_ts)
|
|
354
389
|
meta_key = RedisKeys.trace_meta(trace_id)
|
|
355
390
|
spans_key = RedisKeys.trace_spans(trace_id)
|
|
391
|
+
existing_start_ts = await self._read_hash_int(meta_key, "start_ts")
|
|
392
|
+
existing_updated_at = await self._read_hash_int(meta_key, "updated_at")
|
|
393
|
+
trace_start_ts = (
|
|
394
|
+
min(value for value in (existing_start_ts, start_ts) if value > 0)
|
|
395
|
+
if existing_start_ts or start_ts
|
|
396
|
+
else 0
|
|
397
|
+
)
|
|
398
|
+
updated_at = max(existing_updated_at, end_ts)
|
|
356
399
|
pipe = self.redis.pipeline()
|
|
357
400
|
if isawaitable(pipe):
|
|
358
401
|
pipe = await pipe
|
|
@@ -363,14 +406,29 @@ class RedisSpanExporter:
|
|
|
363
406
|
await self._call_pipeline(
|
|
364
407
|
pipe, "hset", meta_key, "status", str(payload.get("status", ""))
|
|
365
408
|
)
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
409
|
+
operation = str(payload.get("operation", ""))
|
|
410
|
+
if payload.get("name") and operation.startswith("client.dispatch"):
|
|
411
|
+
await self._call_pipeline(
|
|
412
|
+
pipe, "hset", meta_key, "name", str(payload.get("name", ""))
|
|
413
|
+
)
|
|
414
|
+
if payload.get("target_agent_type") and operation.startswith("client.dispatch"):
|
|
415
|
+
await self._call_pipeline(
|
|
416
|
+
pipe,
|
|
417
|
+
"hset",
|
|
418
|
+
meta_key,
|
|
419
|
+
"root_agent_type",
|
|
420
|
+
str(payload.get("target_agent_type", "")),
|
|
421
|
+
)
|
|
422
|
+
if payload.get("message_id") and operation.startswith("client.dispatch"):
|
|
423
|
+
await self._call_pipeline(
|
|
424
|
+
pipe,
|
|
425
|
+
"hset",
|
|
426
|
+
meta_key,
|
|
427
|
+
"root_message_id",
|
|
428
|
+
str(payload.get("message_id", "")),
|
|
429
|
+
)
|
|
430
|
+
await self._call_pipeline(pipe, "hset", meta_key, "start_ts", trace_start_ts)
|
|
431
|
+
await self._call_pipeline(pipe, "hset", meta_key, "updated_at", updated_at)
|
|
374
432
|
await self._call_pipeline(
|
|
375
433
|
pipe, "rpush", spans_key, json.dumps(payload, ensure_ascii=False)
|
|
376
434
|
)
|
|
@@ -413,6 +471,20 @@ class RedisSpanExporter:
|
|
|
413
471
|
if isawaitable(result):
|
|
414
472
|
await result
|
|
415
473
|
|
|
474
|
+
async def _read_hash_int(self, name: str, field_name: str) -> int:
|
|
475
|
+
hget = getattr(self.redis, "hget", None)
|
|
476
|
+
if not callable(hget):
|
|
477
|
+
return 0
|
|
478
|
+
try:
|
|
479
|
+
value = hget(name, field_name) # pylint: disable=not-callable
|
|
480
|
+
if isawaitable(value):
|
|
481
|
+
value = await value
|
|
482
|
+
if isinstance(value, bytes):
|
|
483
|
+
value = value.decode("utf-8")
|
|
484
|
+
return int(value or 0)
|
|
485
|
+
except (TypeError, ValueError):
|
|
486
|
+
return 0
|
|
487
|
+
|
|
416
488
|
|
|
417
489
|
class OTelSpanExporter:
|
|
418
490
|
"""Export TraceSpan objects into OpenTelemetry's global tracer."""
|
|
@@ -667,30 +739,18 @@ def register_langfuse_span_processor() -> None:
|
|
|
667
739
|
if secret_key and public_key and base_url:
|
|
668
740
|
# 1. Ensure the global TracerProvider exists and patch the ID generator.
|
|
669
741
|
provider = trace.get_tracer_provider()
|
|
670
|
-
if hasattr(provider, "_delegate") and provider._delegate is not None:
|
|
671
|
-
provider = provider._delegate
|
|
672
|
-
|
|
673
742
|
if not isinstance(provider, TracerProvider):
|
|
674
743
|
provider = TracerProvider()
|
|
675
744
|
trace.set_tracer_provider(provider)
|
|
676
745
|
configure_otel_id_generator()
|
|
677
746
|
|
|
678
|
-
# 2. Avoid
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
else:
|
|
686
|
-
processors = [active_processor]
|
|
687
|
-
|
|
688
|
-
for p in processors:
|
|
689
|
-
if p.__class__.__name__ == "LangfuseSpanProcessor":
|
|
690
|
-
has_processor = True
|
|
691
|
-
break
|
|
692
|
-
|
|
693
|
-
if not has_processor:
|
|
747
|
+
# 2. Avoid duplicate registration from this integration without
|
|
748
|
+
# relying on OpenTelemetry SDK private provider internals.
|
|
749
|
+
provider_id = id(provider)
|
|
750
|
+
with _LANGFUSE_PROCESSOR_LOCK:
|
|
751
|
+
if provider_id in _LANGFUSE_PROCESSOR_PROVIDER_IDS:
|
|
752
|
+
return
|
|
753
|
+
|
|
694
754
|
# 3. Dynamically import and attach LangfuseSpanProcessor.
|
|
695
755
|
langfuse_processor_mod = import_module(
|
|
696
756
|
"langfuse._client.span_processor"
|
|
@@ -725,6 +785,7 @@ def register_langfuse_span_processor() -> None:
|
|
|
725
785
|
should_export_span=should_export_span,
|
|
726
786
|
)
|
|
727
787
|
provider.add_span_processor(processor)
|
|
788
|
+
_LANGFUSE_PROCESSOR_PROVIDER_IDS.add(provider_id)
|
|
728
789
|
logger.info(
|
|
729
790
|
"LangfuseSpanProcessor registered successfully to global OTel "
|
|
730
791
|
"TracerProvider. Base URL: %s",
|