spanforge 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. spanforge/__init__.py +695 -0
  2. spanforge/_batch_exporter.py +322 -0
  3. spanforge/_cli.py +3081 -0
  4. spanforge/_hooks.py +340 -0
  5. spanforge/_server.py +953 -0
  6. spanforge/_span.py +1015 -0
  7. spanforge/_store.py +287 -0
  8. spanforge/_stream.py +654 -0
  9. spanforge/_trace.py +334 -0
  10. spanforge/_tracer.py +253 -0
  11. spanforge/actor.py +141 -0
  12. spanforge/alerts.py +464 -0
  13. spanforge/auto.py +181 -0
  14. spanforge/baseline.py +336 -0
  15. spanforge/config.py +460 -0
  16. spanforge/consent.py +227 -0
  17. spanforge/consumer.py +379 -0
  18. spanforge/core/__init__.py +5 -0
  19. spanforge/core/compliance_mapping.py +1060 -0
  20. spanforge/cost.py +597 -0
  21. spanforge/debug.py +514 -0
  22. spanforge/drift.py +488 -0
  23. spanforge/egress.py +63 -0
  24. spanforge/eval.py +575 -0
  25. spanforge/event.py +1052 -0
  26. spanforge/exceptions.py +246 -0
  27. spanforge/explain.py +181 -0
  28. spanforge/export/__init__.py +50 -0
  29. spanforge/export/append_only.py +342 -0
  30. spanforge/export/cloud.py +349 -0
  31. spanforge/export/datadog.py +495 -0
  32. spanforge/export/grafana.py +331 -0
  33. spanforge/export/jsonl.py +198 -0
  34. spanforge/export/otel_bridge.py +291 -0
  35. spanforge/export/otlp.py +817 -0
  36. spanforge/export/otlp_bridge.py +231 -0
  37. spanforge/export/redis_backend.py +282 -0
  38. spanforge/export/webhook.py +302 -0
  39. spanforge/exporters/__init__.py +29 -0
  40. spanforge/exporters/console.py +271 -0
  41. spanforge/exporters/jsonl.py +144 -0
  42. spanforge/hitl.py +297 -0
  43. spanforge/inspect.py +429 -0
  44. spanforge/integrations/__init__.py +39 -0
  45. spanforge/integrations/_pricing.py +277 -0
  46. spanforge/integrations/anthropic.py +388 -0
  47. spanforge/integrations/bedrock.py +306 -0
  48. spanforge/integrations/crewai.py +251 -0
  49. spanforge/integrations/gemini.py +349 -0
  50. spanforge/integrations/groq.py +444 -0
  51. spanforge/integrations/langchain.py +349 -0
  52. spanforge/integrations/llamaindex.py +370 -0
  53. spanforge/integrations/ollama.py +286 -0
  54. spanforge/integrations/openai.py +370 -0
  55. spanforge/integrations/together.py +485 -0
  56. spanforge/metrics.py +393 -0
  57. spanforge/metrics_export.py +342 -0
  58. spanforge/migrate.py +278 -0
  59. spanforge/model_registry.py +282 -0
  60. spanforge/models.py +407 -0
  61. spanforge/namespaces/__init__.py +215 -0
  62. spanforge/namespaces/audit.py +253 -0
  63. spanforge/namespaces/cache.py +209 -0
  64. spanforge/namespaces/chain.py +74 -0
  65. spanforge/namespaces/confidence.py +69 -0
  66. spanforge/namespaces/consent.py +85 -0
  67. spanforge/namespaces/cost.py +175 -0
  68. spanforge/namespaces/decision.py +135 -0
  69. spanforge/namespaces/diff.py +146 -0
  70. spanforge/namespaces/drift.py +79 -0
  71. spanforge/namespaces/eval_.py +232 -0
  72. spanforge/namespaces/fence.py +180 -0
  73. spanforge/namespaces/guard.py +104 -0
  74. spanforge/namespaces/hitl.py +92 -0
  75. spanforge/namespaces/latency.py +69 -0
  76. spanforge/namespaces/prompt.py +185 -0
  77. spanforge/namespaces/redact.py +172 -0
  78. spanforge/namespaces/template.py +197 -0
  79. spanforge/namespaces/tool_call.py +76 -0
  80. spanforge/namespaces/trace.py +1006 -0
  81. spanforge/normalizer.py +183 -0
  82. spanforge/presidio_backend.py +149 -0
  83. spanforge/processor.py +258 -0
  84. spanforge/prompt_registry.py +415 -0
  85. spanforge/py.typed +0 -0
  86. spanforge/redact.py +780 -0
  87. spanforge/sampling.py +500 -0
  88. spanforge/schemas/v1.0/schema.json +170 -0
  89. spanforge/schemas/v2.0/schema.json +536 -0
  90. spanforge/signing.py +1152 -0
  91. spanforge/stream.py +559 -0
  92. spanforge/testing.py +376 -0
  93. spanforge/trace.py +199 -0
  94. spanforge/types.py +696 -0
  95. spanforge/ulid.py +304 -0
  96. spanforge/validate.py +383 -0
  97. spanforge-2.0.0.dist-info/METADATA +1777 -0
  98. spanforge-2.0.0.dist-info/RECORD +101 -0
  99. spanforge-2.0.0.dist-info/WHEEL +4 -0
  100. spanforge-2.0.0.dist-info/entry_points.txt +5 -0
  101. spanforge-2.0.0.dist-info/licenses/LICENSE +21 -0
spanforge/metrics.py ADDED
@@ -0,0 +1,393 @@
1
+ """spanforge.metrics — Programmatic metrics extraction from SpanForge traces.
2
+
3
+ Provides aggregation functions that accept any ``Iterable[Event]`` — such as
4
+ an in-memory list, an ``EventStream.from_file(...)`` iterator, or a
5
+ :class:`~spanforge._store.TraceStore` query result — and return structured
6
+ :class:`MetricsSummary` / :class:`LatencyStats` objects.
7
+
8
+ Usage::
9
+
10
+ import spanforge.metrics as metrics
11
+ from spanforge.stream import iter_file
12
+
13
+ events = list(iter_file("events.jsonl"))
14
+ summary = metrics.aggregate(events)
15
+ print(f"Success rate: {summary.agent_success_rate:.1%}")
16
+ print(f"p95 LLM latency: {summary.llm_latency_ms.p95:.1f} ms")
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import statistics
22
+ from collections import defaultdict
23
+ from dataclasses import dataclass, field
24
+ from typing import Iterable, TYPE_CHECKING
25
+
26
+ if TYPE_CHECKING:
27
+ from spanforge.event import Event
28
+ from spanforge.namespaces.trace import TokenUsage
29
+
30
+ __all__ = [
31
+ "LatencyStats",
32
+ "MetricsSummary",
33
+ "aggregate",
34
+ "agent_success_rate",
35
+ "llm_latency",
36
+ "tool_failure_rate",
37
+ "token_usage",
38
+ ]
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # EventType string constants (avoid circular import)
42
+ # ---------------------------------------------------------------------------
43
+
44
# Event types are compared as plain strings here instead of importing the
# EventType enum (see the banner above: this avoids a circular import).
_SPAN_COMPLETED = "llm.trace.span.completed"
_SPAN_FAILED = "llm.trace.span.failed"
_AGENT_COMPLETED = "llm.trace.agent.completed"

# Event types that are treated as span events by the aggregation code.
_SPAN_EVENT_TYPES = frozenset((_SPAN_COMPLETED, _SPAN_FAILED))
49
+
50
+ # ---------------------------------------------------------------------------
51
+ # Value objects
52
+ # ---------------------------------------------------------------------------
53
+
54
+
55
@dataclass(frozen=True)
class LatencyStats:
    """Immutable latency distribution summary for LLM calls.

    Every field is expressed in milliseconds.

    Attributes:
        min: Smallest sample.
        max: Largest sample.
        p50: 50th percentile.
        p95: 95th percentile.
        p99: 99th percentile.
    """

    min: float
    max: float
    p50: float
    p95: float
    p99: float

    @classmethod
    def _from_samples(cls, samples: list[float]) -> "LatencyStats":
        """Build the summary from raw samples.

        Args:
            samples: Latency samples in any order; the list is not mutated.

        Returns:
            Stats computed over a sorted copy of *samples*, or an all-zero
            instance when *samples* is empty.
        """
        if samples:
            ordered = sorted(samples)
            lowest, highest = ordered[0], ordered[-1]
            return cls(
                min=lowest,
                max=highest,
                p50=_percentile(ordered, 50),
                p95=_percentile(ordered, 95),
                p99=_percentile(ordered, 99),
            )
        return cls(min=0.0, max=0.0, p50=0.0, p95=0.0, p99=0.0)
77
+
78
+
79
@dataclass
class MetricsSummary:
    """Result record produced by aggregating a collection of SpanForge events.

    Attributes:
        trace_count: How many distinct ``trace_id`` values were seen.
        span_count: How many span events were seen in total.
        agent_success_rate: Share of traces (0.0 – 1.0) that contain no
            error spans.
        avg_trace_duration_ms: Mean duration over all agent-run events.
        p50_trace_duration_ms: Median trace duration.
        p95_trace_duration_ms: 95th-percentile trace duration.
        total_input_tokens: Sum of input/prompt tokens over all spans.
        total_output_tokens: Sum of output/completion tokens over all spans.
        total_cost_usd: Sum of inferred cost in USD.
        llm_latency_ms: :class:`LatencyStats` for LLM-type spans.
        tool_failure_rate: Share of tool-call spans with ``status="error"``.
        token_usage_by_model: Per-model ``TokenUsage``-like dict
            (input/output/total).
        cost_by_model: Per-model total cost in USD.
        drift_incidents: Number of ``drift.threshold_breach`` events.
        confidence_trend: Mean confidence score of each consecutive,
            non-overlapping chunk of 50 ``confidence.sample`` scores; empty
            when no such events are present.
        baseline_deviation_pct: Coefficient of variation of the observed
            confidence scores (``stddev / mean * 100``); 0.0 when unavailable.
    """

    # --- volume -----------------------------------------------------------
    trace_count: int = 0
    span_count: int = 0

    # --- outcome and trace timing ------------------------------------------
    agent_success_rate: float = 1.0
    avg_trace_duration_ms: float = 0.0
    p50_trace_duration_ms: float = 0.0
    p95_trace_duration_ms: float = 0.0

    # --- token and cost totals ---------------------------------------------
    total_input_tokens: int = 0
    total_output_tokens: int = 0
    total_cost_usd: float = 0.0

    # --- LLM latency and tool reliability ----------------------------------
    llm_latency_ms: LatencyStats = field(
        default_factory=lambda: LatencyStats(0, 0, 0, 0, 0)
    )
    tool_failure_rate: float = 0.0

    # --- per-model breakdowns (fresh containers per instance) --------------
    token_usage_by_model: dict[str, dict[str, int]] = field(default_factory=dict)
    cost_by_model: dict[str, float] = field(default_factory=dict)

    # --- drift and confidence ----------------------------------------------
    drift_incidents: int = 0
    confidence_trend: list[float] = field(default_factory=list)
    baseline_deviation_pct: float = 0.0
121
+
122
+
123
+ # ---------------------------------------------------------------------------
124
+ # Helpers
125
+ # ---------------------------------------------------------------------------
126
+
127
+
128
def _percentile(sorted_data: list[float], pct: float) -> float:
    """Return the *pct*-th percentile of an already-sorted list.

    The value is linearly interpolated between the two neighbouring ranks
    at fractional position ``pct / 100 * (len(sorted_data) - 1)``.

    Args:
        sorted_data: Samples in ascending order. The list is not re-sorted
            or validated here.
        pct: Requested percentile. Values outside ``[0, 100]`` are clamped,
            so a negative request yields the minimum and a request above 100
            yields the maximum.

    Returns:
        The interpolated percentile as a float, or ``0.0`` when
        *sorted_data* is empty.
    """
    if not sorted_data:
        return 0.0
    last = len(sorted_data) - 1
    # Clamp first: an unclamped negative pct produces a negative rank, and
    # negative list indices silently wrap around to the largest samples.
    rank = (min(max(pct, 0.0), 100.0) / 100.0) * last
    lo = int(rank)
    if lo >= last:
        # Covers pct == 100 and the single-element list.
        return float(sorted_data[last])
    frac = rank - lo
    return float(sorted_data[lo] * (1 - frac) + sorted_data[lo + 1] * frac)
141
+
142
+
143
def _event_type_str(event: "Event") -> str:
    """Return ``event.event_type`` as a string.

    When the event type exposes a ``value`` attribute (as enum members do),
    that value is returned; otherwise the object is converted with ``str``.
    """
    kind = event.event_type
    try:
        return kind.value
    except AttributeError:
        return str(kind)
147
+
148
+
149
def _is_span_event(event: "Event") -> bool:
    """Return True when *event*'s type is one of ``_SPAN_EVENT_TYPES``."""
    type_name = _event_type_str(event)
    return type_name in _SPAN_EVENT_TYPES
151
+
152
+
153
def _is_agent_completed(event: "Event") -> bool:
    """Return True when *event*'s type equals ``_AGENT_COMPLETED``."""
    type_name = _event_type_str(event)
    return type_name == _AGENT_COMPLETED
155
+
156
+
157
def _is_llm_span(payload: dict) -> bool:
    """Return True when the payload's ``operation`` names an LLM call.

    A payload without an ``operation`` key is not an LLM span.
    """
    llm_operations = ("chat", "completion", "embedding", "chat_completion", "generate")
    return payload.get("operation", "") in llm_operations
160
+
161
+
162
def _is_tool_span(payload: dict) -> bool:
    """Return True when the payload's ``operation`` is ``"tool_call"``."""
    return payload.get("operation", "") == "tool_call"
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Public API
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
def _process_llm_span(
    payload: dict[str, object],
    duration_ms: float,
    llm_latencies: list[float],
    token_by_model: dict[str, dict[str, int]],
    cost_by_model: dict[str, float],
) -> tuple[int, int, float]:
    """Fold one LLM span into the running latency, token and cost tallies.

    Args:
        payload: Span payload. The keys read are ``model``, ``token_usage``
            and ``cost``.
        duration_ms: Span duration; appended to *llm_latencies* unless it is
            negative.
        llm_latencies: Mutated in place with the span's duration.
        token_by_model: Mutated in place. Must supply a zeroed
            ``input_tokens`` / ``output_tokens`` / ``total_tokens`` entry for
            unseen models (the caller passes a ``defaultdict``).
        cost_by_model: Mutated in place. Must supply ``0.0`` for unseen
            models (the caller passes a ``defaultdict``).

    Returns:
        ``(input_tokens, output_tokens, cost_usd)`` contributed by this span.
    """
    if duration_ms >= 0:
        llm_latencies.append(duration_ms)

    # Resolve the model name once. ``model`` may be a mapping with a "name"
    # key, a bare string, or missing/null; anything without a usable name is
    # bucketed under "unknown" rather than raising or producing a None key.
    model = payload.get("model")
    if isinstance(model, str):
        model_name = model
    else:
        lookup = getattr(model, "get", None)
        model_name = lookup("name") if callable(lookup) else None
    model_name = model_name or "unknown"

    inp = out = 0
    cost_usd = 0.0

    usage = payload.get("token_usage")
    if usage:
        # ``or 0`` tolerates keys that are present but null (for example from
        # JSON), which a ``.get(key, 0)`` default does not cover.
        inp = int(usage.get("input_tokens") or 0)  # type: ignore[union-attr]
        out = int(usage.get("output_tokens") or 0)  # type: ignore[union-attr]
        tot = int(usage.get("total_tokens") or 0)  # type: ignore[union-attr]
        bucket = token_by_model[model_name]
        bucket["input_tokens"] += inp
        bucket["output_tokens"] += out
        bucket["total_tokens"] += tot

    cost = payload.get("cost")
    if cost:
        cost_usd = float(cost.get("total_cost_usd") or 0.0)  # type: ignore[union-attr]
        cost_by_model[model_name] += cost_usd

    return inp, out, cost_usd
199
+
200
+
201
def _process_span_event(
    event: "Event",
    span_count: int,
    trace_errors: dict[str, bool],
    llm_latencies: list[float],
    token_by_model: dict[str, dict[str, int]],
    cost_by_model: dict[str, float],
    tool_total: int,
    tool_errors: int,
    total_input_tokens: int,
    total_output_tokens: int,
    total_cost_usd: float,
) -> tuple[int, int, int, int, int, float]:
    """Fold a single span event into the running aggregation state.

    The mutable containers (*trace_errors*, *llm_latencies*,
    *token_by_model*, *cost_by_model*) are updated in place. The scalar
    counters are passed by value, so their updated values are returned.

    Args:
        event: The span event. Its payload keys ``status``, ``trace_id``,
            ``duration_ms`` and ``operation`` are read.
        span_count: Spans seen so far.
        trace_errors: Maps ``trace_id`` to whether any of its spans errored.
        llm_latencies: Durations of the LLM spans seen so far.
        token_by_model: Per-model token totals.
        cost_by_model: Per-model cost totals.
        tool_total: Tool-call spans seen so far.
        tool_errors: Tool-call spans with ``status == "error"`` so far.
        total_input_tokens: Running input-token total.
        total_output_tokens: Running output-token total.
        total_cost_usd: Running cost total.

    Returns:
        ``(span_count, tool_total, tool_errors, total_input_tokens,
        total_output_tokens, total_cost_usd)`` after this event.
    """
    payload = event.payload
    span_count += 1
    status = payload.get("status", "ok")
    trace_id = payload.get("trace_id", "")
    # ``or 0.0`` also covers a duration that is present but null, which
    # ``float()`` would reject.
    duration_ms = float(payload.get("duration_ms") or 0.0)
    errored = status == "error"

    if trace_id:
        # Register the trace on first sight; an error flag, once set, sticks.
        trace_errors[trace_id] = trace_errors.get(trace_id, False) or errored

    if _is_llm_span(payload):  # type: ignore[arg-type]
        inp, out, cost_usd = _process_llm_span(
            payload, duration_ms, llm_latencies, token_by_model, cost_by_model  # type: ignore[arg-type]
        )
        total_input_tokens += inp
        total_output_tokens += out
        total_cost_usd += cost_usd

    if _is_tool_span(payload):  # type: ignore[arg-type]
        tool_total += 1
        if errored:
            tool_errors += 1

    return (
        span_count,
        tool_total,
        tool_errors,
        total_input_tokens,
        total_output_tokens,
        total_cost_usd,
    )
241
+
242
+
243
def aggregate(events: Iterable["Event"]) -> MetricsSummary:
    """Aggregate a collection of SpanForge events into a :class:`MetricsSummary`.

    The iterable is consumed in a single pass and is never copied into a
    list, so lazily produced event streams are not held in memory as a whole.

    Events are dispatched on their event-type string:

    * span completed/failed events update span, token, cost, latency and
      tool counters;
    * ``llm.trace.agent.completed`` events contribute a trace duration;
    * ``drift.threshold_breach`` events are counted;
    * ``confidence.sample`` events contribute their ``score`` when present.

    Any other event type is ignored.

    Args:
        events: Any iterable of :class:`~spanforge.event.Event` objects.

    Returns:
        A fully-populated :class:`MetricsSummary`. With no span events the
        success rate is ``1.0`` and the tool failure rate is ``0.0``.
    """
    # trace_id -> True once any span of that trace has status "error".
    trace_errors: dict[str, bool] = {}
    trace_durations: list[float] = []

    span_count = 0
    llm_latencies: list[float] = []
    tool_total = 0
    tool_errors = 0
    total_input_tokens = 0
    total_output_tokens = 0
    total_cost_usd = 0.0
    token_by_model: dict[str, dict[str, int]] = defaultdict(
        lambda: {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
    )
    cost_by_model: dict[str, float] = defaultdict(float)

    drift_incidents = 0
    confidence_scores: list[float] = []

    for event in events:
        payload = event.payload
        # Resolve the type string once per event instead of once per branch.
        event_type = _event_type_str(event)

        if event_type in _SPAN_EVENT_TYPES:
            (
                span_count,
                tool_total,
                tool_errors,
                total_input_tokens,
                total_output_tokens,
                total_cost_usd,
            ) = _process_span_event(  # type: ignore[assignment]
                event, span_count, trace_errors, llm_latencies,
                token_by_model, cost_by_model,  # type: ignore[arg-type]
                tool_total, tool_errors, total_input_tokens,
                total_output_tokens, total_cost_usd,
            )

        elif event_type == _AGENT_COMPLETED:
            # ``or 0.0`` also covers a duration that is present but null.
            trace_durations.append(float(payload.get("duration_ms") or 0.0))

        elif event_type == "drift.threshold_breach":
            drift_incidents += 1

        elif event_type == "confidence.sample":
            score = payload.get("score")
            if score is not None:
                confidence_scores.append(float(score))

    # Success rate: share of traces without any error span.
    if trace_errors:
        success_count = sum(1 for has_err in trace_errors.values() if not has_err)
        success_rate = success_count / len(trace_errors)
    else:
        success_rate = 1.0

    # Trace duration stats.
    sorted_durations = sorted(trace_durations)
    avg_dur = statistics.mean(sorted_durations) if sorted_durations else 0.0
    p50_dur = _percentile(sorted_durations, 50)
    p95_dur = _percentile(sorted_durations, 95)

    # Confidence trend: mean of each consecutive, non-overlapping chunk of
    # 50 scores (the final chunk may be shorter).
    window_size = 50
    confidence_trend: list[float] = [
        statistics.mean(confidence_scores[start : start + window_size])
        for start in range(0, len(confidence_scores), window_size)
    ]

    # Baseline deviation: coefficient of variation (stddev / mean * 100).
    # ``statistics.stdev`` needs at least two samples, and a non-positive
    # mean would make the ratio meaningless, so both cases report 0.0.
    baseline_deviation_pct = 0.0
    if len(confidence_scores) >= 2:
        mean_conf = statistics.mean(confidence_scores)
        if mean_conf > 0:
            baseline_deviation_pct = (
                statistics.stdev(confidence_scores) / mean_conf
            ) * 100.0

    return MetricsSummary(
        trace_count=len(trace_errors),
        span_count=span_count,
        agent_success_rate=success_rate,
        avg_trace_duration_ms=avg_dur,
        p50_trace_duration_ms=p50_dur,
        p95_trace_duration_ms=p95_dur,
        total_input_tokens=total_input_tokens,
        total_output_tokens=total_output_tokens,
        total_cost_usd=total_cost_usd,
        llm_latency_ms=LatencyStats._from_samples(llm_latencies),
        tool_failure_rate=tool_errors / tool_total if tool_total > 0 else 0.0,
        token_usage_by_model=dict(token_by_model),
        cost_by_model=dict(cost_by_model),
        drift_incidents=drift_incidents,
        confidence_trend=confidence_trend,
        baseline_deviation_pct=baseline_deviation_pct,
    )
344
+
345
+
346
def agent_success_rate(events: Iterable["Event"]) -> float:
    """Compute the share of traces that contain no error spans.

    This runs a full :func:`aggregate` pass and returns one field of the
    result.

    Args:
        events: Any iterable of :class:`~spanforge.event.Event` objects.

    Returns:
        A value between 0.0 and 1.0. When there are no span events the
        result is ``1.0`` (nothing to interpret as a failure).
    """
    summary = aggregate(events)
    return summary.agent_success_rate
357
+
358
+
359
def llm_latency(events: Iterable["Event"]) -> LatencyStats:
    """Compute :class:`LatencyStats` over all LLM-operation spans.

    This runs a full :func:`aggregate` pass and returns one field of the
    result.

    Args:
        events: Any iterable of :class:`~spanforge.event.Event` objects.

    Returns:
        Latency percentiles in milliseconds.
    """
    summary = aggregate(events)
    return summary.llm_latency_ms
369
+
370
+
371
def tool_failure_rate(events: Iterable["Event"]) -> float:
    """Compute the share of tool-call spans that ended with ``status="error"``.

    This runs a full :func:`aggregate` pass and returns one field of the
    result.

    Args:
        events: Any iterable of :class:`~spanforge.event.Event` objects.

    Returns:
        A value between 0.0 and 1.0.
    """
    summary = aggregate(events)
    return summary.tool_failure_rate
381
+
382
+
383
def token_usage(events: Iterable["Event"]) -> dict[str, dict[str, int]]:
    """Compute token usage totals grouped by model.

    This runs a full :func:`aggregate` pass and returns one field of the
    result.

    Args:
        events: Any iterable of :class:`~spanforge.event.Event` objects.

    Returns:
        Dict mapping model name → ``{"input_tokens": int, "output_tokens": int,
        "total_tokens": int}``.
    """
    summary = aggregate(events)
    return summary.token_usage_by_model