spanforge 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spanforge/__init__.py +695 -0
- spanforge/_batch_exporter.py +322 -0
- spanforge/_cli.py +3081 -0
- spanforge/_hooks.py +340 -0
- spanforge/_server.py +953 -0
- spanforge/_span.py +1015 -0
- spanforge/_store.py +287 -0
- spanforge/_stream.py +654 -0
- spanforge/_trace.py +334 -0
- spanforge/_tracer.py +253 -0
- spanforge/actor.py +141 -0
- spanforge/alerts.py +464 -0
- spanforge/auto.py +181 -0
- spanforge/baseline.py +336 -0
- spanforge/config.py +460 -0
- spanforge/consent.py +227 -0
- spanforge/consumer.py +379 -0
- spanforge/core/__init__.py +5 -0
- spanforge/core/compliance_mapping.py +1060 -0
- spanforge/cost.py +597 -0
- spanforge/debug.py +514 -0
- spanforge/drift.py +488 -0
- spanforge/egress.py +63 -0
- spanforge/eval.py +575 -0
- spanforge/event.py +1052 -0
- spanforge/exceptions.py +246 -0
- spanforge/explain.py +181 -0
- spanforge/export/__init__.py +50 -0
- spanforge/export/append_only.py +342 -0
- spanforge/export/cloud.py +349 -0
- spanforge/export/datadog.py +495 -0
- spanforge/export/grafana.py +331 -0
- spanforge/export/jsonl.py +198 -0
- spanforge/export/otel_bridge.py +291 -0
- spanforge/export/otlp.py +817 -0
- spanforge/export/otlp_bridge.py +231 -0
- spanforge/export/redis_backend.py +282 -0
- spanforge/export/webhook.py +302 -0
- spanforge/exporters/__init__.py +29 -0
- spanforge/exporters/console.py +271 -0
- spanforge/exporters/jsonl.py +144 -0
- spanforge/hitl.py +297 -0
- spanforge/inspect.py +429 -0
- spanforge/integrations/__init__.py +39 -0
- spanforge/integrations/_pricing.py +277 -0
- spanforge/integrations/anthropic.py +388 -0
- spanforge/integrations/bedrock.py +306 -0
- spanforge/integrations/crewai.py +251 -0
- spanforge/integrations/gemini.py +349 -0
- spanforge/integrations/groq.py +444 -0
- spanforge/integrations/langchain.py +349 -0
- spanforge/integrations/llamaindex.py +370 -0
- spanforge/integrations/ollama.py +286 -0
- spanforge/integrations/openai.py +370 -0
- spanforge/integrations/together.py +485 -0
- spanforge/metrics.py +393 -0
- spanforge/metrics_export.py +342 -0
- spanforge/migrate.py +278 -0
- spanforge/model_registry.py +282 -0
- spanforge/models.py +407 -0
- spanforge/namespaces/__init__.py +215 -0
- spanforge/namespaces/audit.py +253 -0
- spanforge/namespaces/cache.py +209 -0
- spanforge/namespaces/chain.py +74 -0
- spanforge/namespaces/confidence.py +69 -0
- spanforge/namespaces/consent.py +85 -0
- spanforge/namespaces/cost.py +175 -0
- spanforge/namespaces/decision.py +135 -0
- spanforge/namespaces/diff.py +146 -0
- spanforge/namespaces/drift.py +79 -0
- spanforge/namespaces/eval_.py +232 -0
- spanforge/namespaces/fence.py +180 -0
- spanforge/namespaces/guard.py +104 -0
- spanforge/namespaces/hitl.py +92 -0
- spanforge/namespaces/latency.py +69 -0
- spanforge/namespaces/prompt.py +185 -0
- spanforge/namespaces/redact.py +172 -0
- spanforge/namespaces/template.py +197 -0
- spanforge/namespaces/tool_call.py +76 -0
- spanforge/namespaces/trace.py +1006 -0
- spanforge/normalizer.py +183 -0
- spanforge/presidio_backend.py +149 -0
- spanforge/processor.py +258 -0
- spanforge/prompt_registry.py +415 -0
- spanforge/py.typed +0 -0
- spanforge/redact.py +780 -0
- spanforge/sampling.py +500 -0
- spanforge/schemas/v1.0/schema.json +170 -0
- spanforge/schemas/v2.0/schema.json +536 -0
- spanforge/signing.py +1152 -0
- spanforge/stream.py +559 -0
- spanforge/testing.py +376 -0
- spanforge/trace.py +199 -0
- spanforge/types.py +696 -0
- spanforge/ulid.py +304 -0
- spanforge/validate.py +383 -0
- spanforge-2.0.0.dist-info/METADATA +1777 -0
- spanforge-2.0.0.dist-info/RECORD +101 -0
- spanforge-2.0.0.dist-info/WHEEL +4 -0
- spanforge-2.0.0.dist-info/entry_points.txt +5 -0
- spanforge-2.0.0.dist-info/licenses/LICENSE +21 -0
spanforge/auto.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""spanforge.auto — Automatic integration discovery and patching.
|
|
2
|
+
|
|
3
|
+
Call :func:`setup` to automatically detect and patch all SpanForge-supported
|
|
4
|
+
LLM libraries that are installed in the current environment. This eliminates
|
|
5
|
+
the need to manually import each integration module.
|
|
6
|
+
|
|
7
|
+
Usage \u2014 fastest path to value::
|
|
8
|
+
|
|
9
|
+
import spanforge.auto
|
|
10
|
+
spanforge.auto.setup() # patches everything installed
|
|
11
|
+
|
|
12
|
+
Or call explicitly for programmatic control::
|
|
13
|
+
|
|
14
|
+
from spanforge.auto import setup
|
|
15
|
+
patched = setup(verbose=True)
|
|
16
|
+
# patched = {"openai", "anthropic"}
|
|
17
|
+
|
|
18
|
+
Note
|
|
19
|
+
----
|
|
20
|
+
:func:`setup` is **not** called automatically on import. You must call it
|
|
21
|
+
explicitly so that importing :mod:`spanforge` never silently monkey-patches
|
|
22
|
+
third-party libraries without your consent.
|
|
23
|
+
|
|
24
|
+
Supported libraries (patched when installed):
|
|
25
|
+
* **openai** — :mod:`spanforge.integrations.openai`
|
|
26
|
+
* **anthropic** — :mod:`spanforge.integrations.anthropic`
|
|
27
|
+
* **groq** — :mod:`spanforge.integrations.groq`
|
|
28
|
+
* **ollama** — :mod:`spanforge.integrations.ollama`
|
|
29
|
+
* **together** — :mod:`spanforge.integrations.together`
|
|
30
|
+
|
|
31
|
+
Callback-based integrations (register manually):
|
|
32
|
+
* **LangChain** — use :class:`~spanforge.integrations.langchain.LLMSchemaCallbackHandler`
|
|
33
|
+
* **LlamaIndex** — use :class:`~spanforge.integrations.llamaindex.LLMSchemaEventHandler`
|
|
34
|
+
* **CrewAI** — use :func:`~spanforge.integrations.crewai.patch`
|
|
35
|
+
|
|
36
|
+
Security note
|
|
37
|
+
-------------
|
|
38
|
+
Monkey-patching is only applied when the target library is already installed.
|
|
39
|
+
The patching flag ``_spanforge_patched`` prevents double-patching. Each
|
|
40
|
+
integration is wrapped in a ``try/except`` so a broken integration never
|
|
41
|
+
prevents the others from loading.
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
from __future__ import annotations
|
|
45
|
+
|
|
46
|
+
import importlib.util
|
|
47
|
+
import threading
|
|
48
|
+
import warnings
|
|
49
|
+
|
|
50
|
+
__all__ = ["setup", "teardown", "patched_integrations"]
|
|
51
|
+
|
|
52
|
+
# Internal registry: import names of the libraries that setup() has successfully patched.
|
|
53
|
+
_PATCHED: set[str] = set()
|
|
54
|
+
_PATCHED_LOCK = threading.Lock()
|
|
55
|
+
|
|
56
|
+
# Ordered list of (library import name, integration module path, patch fn name, unpatch fn name).
|
|
57
|
+
_INTEGRATIONS: list[tuple[str, str, str, str]] = [
|
|
58
|
+
("openai", "spanforge.integrations.openai", "patch", "unpatch"),
|
|
59
|
+
("anthropic", "spanforge.integrations.anthropic", "patch", "unpatch"),
|
|
60
|
+
("groq", "spanforge.integrations.groq", "patch", "unpatch"),
|
|
61
|
+
("ollama", "spanforge.integrations.ollama", "patch", "unpatch"),
|
|
62
|
+
("together", "spanforge.integrations.together", "patch", "unpatch"),
|
|
63
|
+
]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _try_patch_integration(lib_name: str, integration_module: str, patch_fn: str, verbose: bool) -> bool:
    """Attempt to patch one integration; return ``True`` if it was patched.

    Imports *integration_module*, calls its *patch_fn* with no arguments and,
    on success, records *lib_name* in the module-level ``_PATCHED`` registry.

    Args:
        lib_name: Import name of the third-party library (e.g. ``"openai"``).
        integration_module: Dotted path of the integration module to import.
        patch_fn: Name of the zero-argument patch callable in that module.
        verbose: When ``True``, print a status line describing the outcome.

    Returns:
        ``True`` when the patch call completed and the library was recorded;
        ``False`` when an exception was raised along the way.  Such exceptions
        are reported as a :class:`UserWarning` instead of propagating.
    """
    try:
        mod = importlib.import_module(integration_module)
        getattr(mod, patch_fn)()
        # Mutate the shared registry under the same lock that teardown() and
        # patched_integrations() take, so concurrent readers never observe
        # the set mid-update.
        with _PATCHED_LOCK:
            _PATCHED.add(lib_name)
        if verbose:
            print(f" {lib_name}: patched \u2713")
        return True
    except Exception as exc:
        warnings.warn(
            f"spanforge.auto: failed to patch {lib_name!r}: {exc}",
            UserWarning,
            # Skip this helper and its caller (setup() in this module) so the
            # warning is attributed to the code that invoked setup().
            stacklevel=3,
        )
        if verbose:
            print(f" {lib_name}: patch failed — {exc}")
        return False
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def setup(*, verbose: bool = False) -> set[str]:
    """Detect and patch all installed SpanForge-supported LLM libraries.

    Iterates over the supported integrations and calls their ``patch()``
    function if the underlying library is installed.  Integrations that are
    already recorded as patched are skipped, which makes repeated calls
    idempotent (a status line is printed for them only when *verbose* is set).

    Args:
        verbose: When ``True``, print a status line for each integration
            attempted.

    Returns:
        Set of library names that were newly patched in this call (does not
        include libraries already patched in previous calls).

    Example::

        from spanforge.auto import setup
        patched = setup(verbose=True)
        #   openai     patched ✓
        #   anthropic  not installed, skipped

    Note:
        Callback-based integrations (LangChain, LlamaIndex, CrewAI) are not
        auto-patched because they require manual handler registration.  See
        their respective integration guides.
    """
    newly_patched: set[str] = set()

    for lib_name, integration_module, patch_fn, _unpatch_fn in _INTEGRATIONS:
        # Read the shared registry under the lock, but release it before
        # patching: the helper below takes care of recording the result.
        with _PATCHED_LOCK:
            already_patched = lib_name in _PATCHED
        if already_patched:
            if verbose:
                print(f" {lib_name}: already patched, skipped")
            continue

        try:
            installed = importlib.util.find_spec(lib_name) is not None
        except ValueError:
            # find_spec() raises ValueError when the name is already in
            # sys.modules but its __spec__ is None/unset.  The module is
            # importable in that case, so attempt the patch; any failure is
            # turned into a warning by the helper instead of aborting the
            # remaining integrations.
            installed = True
        except ImportError:
            installed = False

        if not installed:
            if verbose:
                print(f" {lib_name}: not installed, skipped")
            continue

        if _try_patch_integration(lib_name, integration_module, patch_fn, verbose):
            newly_patched.add(lib_name)

    return newly_patched
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def teardown(*, verbose: bool = False) -> set[str]:
    """Unpatch all auto-patched integrations and reset the auto-patch registry.

    Calls ``unpatch()`` on every integration that is currently recorded as
    patched via :func:`setup`.  Safe to call even if :func:`setup` was never
    called, in which case nothing is unpatched.

    Args:
        verbose: When ``True``, print a status line for each integration that
            was unpatched or whose unpatch attempt failed.

    Returns:
        Set of library names that were unpatched.  A library whose ``unpatch()``
        raised stays in the registry and is reported through a
        :class:`UserWarning` instead.
    """
    unpatched: set[str] = set()

    for lib_name, integration_module, _patch_fn, unpatch_fn in _INTEGRATIONS:
        with _PATCHED_LOCK:
            if lib_name not in _PATCHED:
                continue
        try:
            mod = importlib.import_module(integration_module)
            getattr(mod, unpatch_fn)()
            with _PATCHED_LOCK:
                _PATCHED.discard(lib_name)
            unpatched.add(lib_name)
            if verbose:
                print(f" {lib_name}: unpatched \u2713")
        except Exception as exc:
            warnings.warn(
                f"spanforge.auto: failed to unpatch {lib_name!r}: {exc}",
                UserWarning,
                # Attribute the warning to the code that called teardown().
                stacklevel=2,
            )
            # Mirror the patch path: verbose callers get a status line for
            # failures too, not only for successes.
            if verbose:
                print(f" {lib_name}: unpatch failed — {exc}")

    return unpatched
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def patched_integrations() -> set[str]:
    """Report which libraries are currently patched via :func:`setup`.

    Returns:
        A new set holding the patched library names; mutating it does not
        affect the internal registry.
    """
    with _PATCHED_LOCK:
        # Copy while holding the lock so the caller gets a consistent snapshot.
        snapshot = _PATCHED.copy()
    return snapshot
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# NOTE: setup() is NOT called automatically on import.
|
|
179
|
+
# Call spanforge.auto.setup() explicitly to patch installed integrations.
|
|
180
|
+
# This is intentional: importing spanforge should never monkey-patch
|
|
181
|
+
# third-party libraries without explicit user consent.
|
spanforge/baseline.py
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
"""spanforge.baseline — Behavioural baseline construction for drift detection.
|
|
2
|
+
|
|
3
|
+
:class:`BehaviouralBaseline` captures the statistical summary of an agent's
|
|
4
|
+
typical behaviour over an initial traffic window (default: up to 1 000 events
|
|
5
|
+
or 24 hours). The baseline is serialisable to JSON so it can be persisted and
|
|
6
|
+
reloaded across restarts.
|
|
7
|
+
|
|
8
|
+
Usage::
|
|
9
|
+
|
|
10
|
+
from spanforge.baseline import BehaviouralBaseline
|
|
11
|
+
from spanforge.stream import iter_file
|
|
12
|
+
|
|
13
|
+
events = list(iter_file("events.jsonl"))
|
|
14
|
+
baseline = BehaviouralBaseline.from_events(events)
|
|
15
|
+
baseline.save("baseline.json")
|
|
16
|
+
|
|
17
|
+
# — on restart —
|
|
18
|
+
baseline = BehaviouralBaseline.load("baseline.json")
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import datetime
|
|
24
|
+
import json
|
|
25
|
+
import pathlib
|
|
26
|
+
import statistics
|
|
27
|
+
from dataclasses import dataclass, field
|
|
28
|
+
from typing import TYPE_CHECKING, Any, Iterable
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from spanforge.event import Event
|
|
32
|
+
|
|
33
|
+
__all__ = ["BehaviouralBaseline", "DistributionStats"]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# Statistical helpers
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _percentile(sorted_data: list[float], pct: float) -> float:
|
|
42
|
+
"""Return the *pct*-th percentile of an already-sorted list."""
|
|
43
|
+
if not sorted_data:
|
|
44
|
+
return 0.0
|
|
45
|
+
if len(sorted_data) == 1:
|
|
46
|
+
return float(sorted_data[0])
|
|
47
|
+
idx = (pct / 100.0) * (len(sorted_data) - 1)
|
|
48
|
+
lo = int(idx)
|
|
49
|
+
hi = lo + 1
|
|
50
|
+
if hi >= len(sorted_data):
|
|
51
|
+
return float(sorted_data[-1])
|
|
52
|
+
frac = idx - lo
|
|
53
|
+
return sorted_data[lo] * (1.0 - frac) + sorted_data[hi] * frac
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _event_type_str(event: "Event") -> str:
|
|
57
|
+
et = event.event_type
|
|
58
|
+
return et.value if hasattr(et, "value") else str(et)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
# Value object
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(frozen=True)
class DistributionStats:
    """Summary statistics (mean, spread and percentiles) of a numeric metric.

    Attributes:
        mean: Arithmetic mean of the samples.
        stddev: Sample standard deviation (0.0 when fewer than 2 samples).
        p50: 50th percentile (median).
        p95: 95th percentile.
        p99: 99th percentile.
        sample_count: Number of observations the statistics were computed from.
    """

    mean: float
    stddev: float
    p50: float
    p95: float
    p99: float
    sample_count: int

    # ------------------------------------------------------------------
    # Factory
    # ------------------------------------------------------------------

    @classmethod
    def from_samples(cls, samples: list[float]) -> "DistributionStats":
        """Compute a :class:`DistributionStats` from raw observations.

        An empty *samples* yields an all-zero instance with
        ``sample_count == 0``.
        """
        if not samples:
            return cls(0.0, 0.0, 0.0, 0.0, 0.0, 0)

        ordered = sorted(samples)
        size = len(ordered)
        centre = statistics.mean(ordered)
        # statistics.stdev() needs at least two data points.
        spread = 0.0 if size < 2 else statistics.stdev(ordered)
        median, pct95, pct99 = (_percentile(ordered, rank) for rank in (50, 95, 99))
        return cls(
            mean=centre,
            stddev=spread,
            p50=median,
            p95=pct95,
            p99=pct99,
            sample_count=size,
        )

    # ------------------------------------------------------------------
    # Serialisation
    # ------------------------------------------------------------------

    def to_dict(self) -> dict[str, Any]:
        """Return the statistics as a plain ``dict`` keyed by attribute name."""
        names = ("mean", "stddev", "p50", "p95", "p99", "sample_count")
        return {name: getattr(self, name) for name in names}

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "DistributionStats":
        """Rebuild an instance from a mapping shaped like :meth:`to_dict` output.

        All six keys are required; values are coerced with ``float()``
        (``int()`` for ``sample_count``).
        """
        mean = float(d["mean"])
        stddev = float(d["stddev"])
        p50 = float(d["p50"])
        p95 = float(d["p95"])
        p99 = float(d["p99"])
        sample_count = int(d["sample_count"])
        return cls(mean, stddev, p50, p95, p99, sample_count)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# ---------------------------------------------------------------------------
|
|
134
|
+
# Baseline
|
|
135
|
+
# ---------------------------------------------------------------------------
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@dataclass
class BehaviouralBaseline:
    """Statistical summary of an agent's typical behaviour.

    Built from an initial traffic window and used by :class:`~spanforge.drift.DriftDetector`
    to detect statistically significant deviations at runtime.

    Attributes:
        tokens: Token count distribution across all LLM spans.
        confidence_by_type: Per-decision-type confidence score distributions.
        latency_by_operation: Per-operation latency distributions (milliseconds).
        tool_rate_per_hour: Observed tool invocation rate per tool name (calls/h).
        decision_rate_per_hour: Observed decision rate per decision type (decisions/h).
        event_count: Number of events consumed to build this baseline.
        window_seconds: Duration of the baseline traffic window in seconds.
        recorded_at: ISO 8601 UTC timestamp when the baseline was created.
    """

    tokens: DistributionStats
    confidence_by_type: dict[str, DistributionStats] = field(default_factory=dict)
    latency_by_operation: dict[str, DistributionStats] = field(default_factory=dict)
    tool_rate_per_hour: dict[str, float] = field(default_factory=dict)
    decision_rate_per_hour: dict[str, float] = field(default_factory=dict)
    event_count: int = 0
    window_seconds: float = 86400.0
    recorded_at: str = ""

    # ------------------------------------------------------------------
    # Factory
    # ------------------------------------------------------------------

    @classmethod
    def from_events(
        cls,
        events: Iterable["Event"],
        max_events: int = 1000,
        window_seconds: float = 86400.0,
    ) -> "BehaviouralBaseline":
        """Build a baseline from a stream of events.

        Consumes at most *max_events* events from *events* (or the whole
        iterable, whichever comes first) and computes statistical distributions
        for the following metric groups:

        - **Tokens** — ``total_tokens`` from the ``token_usage`` dict of
          ``llm.trace.span.completed`` / ``llm.trace.span.failed`` payloads
          (non-positive totals are ignored).
        - **Confidence** — per-decision-type ``score`` from
          ``confidence.sample`` events.
        - **Latency** — per-operation ``duration_ms`` from the LLM span events
          above, and ``latency_ms`` from ``tool_call.*`` and
          ``latency.sample`` events.
        - **Tool invocation rates** — calls per hour from ``tool_call.*`` events
          (plus LLM span events whose ``operation`` is ``"tool_call"``).
        - **Decision rates** — decisions per hour from ``decision.made`` events.

        Args:
            events: Source iterable of :class:`~spanforge.event.Event`.
            max_events: Upper bound on events consumed (default 1 000).  When
                it is zero or negative, nothing is read from *events*.
            window_seconds: Denominator for rate calculations (default
                86 400 s = 24 h).  A non-positive value is treated as one hour.

        Returns:
            A fully-populated :class:`BehaviouralBaseline`.
        """
        token_samples: list[float] = []
        confidence_samples: dict[str, list[float]] = {}
        latency_samples: dict[str, list[float]] = {}
        tool_counts: dict[str, int] = {}
        decision_counts: dict[str, int] = {}

        count = 0
        # Guard up front so a non-positive limit never pulls from the source.
        if max_events > 0:
            for event in events:
                count += 1
                etype = _event_type_str(event)
                payload = event.payload

                # LLM span events — tokens + latency
                if etype in ("llm.trace.span.completed", "llm.trace.span.failed"):
                    tu = payload.get("token_usage")
                    if tu:
                        total = int(tu.get("total_tokens", 0) or 0)
                        if total > 0:
                            token_samples.append(float(total))
                    dur = payload.get("duration_ms")
                    if dur is not None:
                        op = str(payload.get("operation", "unknown"))
                        latency_samples.setdefault(op, []).append(float(dur))
                        if op == "tool_call":
                            tool_counts[op] = tool_counts.get(op, 0) + 1

                # Confidence namespace events
                elif etype == "confidence.sample":
                    dtype = str(payload.get("decision_type", "unknown"))
                    score = payload.get("score")
                    if score is not None:
                        confidence_samples.setdefault(dtype, []).append(float(score))

                # Decision namespace events
                elif etype == "decision.made":
                    dtype = str(payload.get("decision_type", "unknown"))
                    decision_counts[dtype] = decision_counts.get(dtype, 0) + 1

                # Tool call namespace events
                elif etype.startswith("tool_call."):
                    tool_name = str(payload.get("tool_name", "unknown"))
                    tool_counts[tool_name] = tool_counts.get(tool_name, 0) + 1
                    lat = payload.get("latency_ms")
                    if lat is not None:
                        latency_samples.setdefault(tool_name, []).append(float(lat))

                # Latency namespace events
                elif etype == "latency.sample":
                    op = str(payload.get("operation", "unknown"))
                    lat = payload.get("latency_ms")
                    if lat is not None:
                        latency_samples.setdefault(op, []).append(float(lat))

                # Test the limit *after* processing: checking at the top of the
                # loop would first pull (and then discard) one more event from
                # a one-shot iterator or live stream.
                if count >= max_events:
                    break

        hours = (window_seconds / 3600.0) if window_seconds > 0 else 1.0

        return cls(
            tokens=DistributionStats.from_samples(token_samples),
            confidence_by_type={
                dt: DistributionStats.from_samples(samples)
                for dt, samples in confidence_samples.items()
            },
            latency_by_operation={
                op: DistributionStats.from_samples(samples)
                for op, samples in latency_samples.items()
            },
            tool_rate_per_hour={
                op: cnt / hours for op, cnt in tool_counts.items()
            },
            decision_rate_per_hour={
                dt: cnt / hours for dt, cnt in decision_counts.items()
            },
            event_count=count,
            window_seconds=window_seconds,
            recorded_at=datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f") + "Z",
        )

    # ------------------------------------------------------------------
    # Serialisation
    # ------------------------------------------------------------------

    def to_dict(self) -> dict[str, Any]:
        """Return the baseline as a ``dict`` of plain values.

        Nested distributions are converted with their own ``to_dict()`` and
        the rate mappings are shallow-copied.
        """
        return {
            "tokens": self.tokens.to_dict(),
            "confidence_by_type": {
                k: v.to_dict() for k, v in self.confidence_by_type.items()
            },
            "latency_by_operation": {
                k: v.to_dict() for k, v in self.latency_by_operation.items()
            },
            "tool_rate_per_hour": dict(self.tool_rate_per_hour),
            "decision_rate_per_hour": dict(self.decision_rate_per_hour),
            "event_count": self.event_count,
            "window_seconds": self.window_seconds,
            "recorded_at": self.recorded_at,
        }

    def to_json(self) -> str:
        """Serialise to a JSON string (keys sorted, 2-space indentation)."""
        return json.dumps(self.to_dict(), sort_keys=True, indent=2)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "BehaviouralBaseline":
        """Rebuild a baseline from a mapping shaped like :meth:`to_dict` output.

        Only ``"tokens"`` is required; every other key falls back to the
        corresponding field default when absent.
        """
        return cls(
            tokens=DistributionStats.from_dict(d["tokens"]),
            confidence_by_type={
                k: DistributionStats.from_dict(v)
                for k, v in d.get("confidence_by_type", {}).items()
            },
            latency_by_operation={
                k: DistributionStats.from_dict(v)
                for k, v in d.get("latency_by_operation", {}).items()
            },
            tool_rate_per_hour={
                k: float(v) for k, v in d.get("tool_rate_per_hour", {}).items()
            },
            decision_rate_per_hour={
                k: float(v) for k, v in d.get("decision_rate_per_hour", {}).items()
            },
            event_count=int(d.get("event_count", 0)),
            window_seconds=float(d.get("window_seconds", 86400.0)),
            recorded_at=str(d.get("recorded_at", "")),
        )

    @classmethod
    def from_json(cls, s: str) -> "BehaviouralBaseline":
        """Deserialise from a JSON string produced by :meth:`to_json`."""
        return cls.from_dict(json.loads(s))

    def save(self, path: str | pathlib.Path) -> None:
        """Write the baseline to *path* as UTF-8 JSON."""
        pathlib.Path(path).write_text(self.to_json(), encoding="utf-8")

    @classmethod
    def load(cls, path: str | pathlib.Path) -> "BehaviouralBaseline":
        """Load a baseline previously saved with :meth:`save`."""
        return cls.from_json(pathlib.Path(path).read_text(encoding="utf-8"))
|