spanforge 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spanforge/__init__.py +695 -0
- spanforge/_batch_exporter.py +322 -0
- spanforge/_cli.py +3081 -0
- spanforge/_hooks.py +340 -0
- spanforge/_server.py +953 -0
- spanforge/_span.py +1015 -0
- spanforge/_store.py +287 -0
- spanforge/_stream.py +654 -0
- spanforge/_trace.py +334 -0
- spanforge/_tracer.py +253 -0
- spanforge/actor.py +141 -0
- spanforge/alerts.py +464 -0
- spanforge/auto.py +181 -0
- spanforge/baseline.py +336 -0
- spanforge/config.py +460 -0
- spanforge/consent.py +227 -0
- spanforge/consumer.py +379 -0
- spanforge/core/__init__.py +5 -0
- spanforge/core/compliance_mapping.py +1060 -0
- spanforge/cost.py +597 -0
- spanforge/debug.py +514 -0
- spanforge/drift.py +488 -0
- spanforge/egress.py +63 -0
- spanforge/eval.py +575 -0
- spanforge/event.py +1052 -0
- spanforge/exceptions.py +246 -0
- spanforge/explain.py +181 -0
- spanforge/export/__init__.py +50 -0
- spanforge/export/append_only.py +342 -0
- spanforge/export/cloud.py +349 -0
- spanforge/export/datadog.py +495 -0
- spanforge/export/grafana.py +331 -0
- spanforge/export/jsonl.py +198 -0
- spanforge/export/otel_bridge.py +291 -0
- spanforge/export/otlp.py +817 -0
- spanforge/export/otlp_bridge.py +231 -0
- spanforge/export/redis_backend.py +282 -0
- spanforge/export/webhook.py +302 -0
- spanforge/exporters/__init__.py +29 -0
- spanforge/exporters/console.py +271 -0
- spanforge/exporters/jsonl.py +144 -0
- spanforge/hitl.py +297 -0
- spanforge/inspect.py +429 -0
- spanforge/integrations/__init__.py +39 -0
- spanforge/integrations/_pricing.py +277 -0
- spanforge/integrations/anthropic.py +388 -0
- spanforge/integrations/bedrock.py +306 -0
- spanforge/integrations/crewai.py +251 -0
- spanforge/integrations/gemini.py +349 -0
- spanforge/integrations/groq.py +444 -0
- spanforge/integrations/langchain.py +349 -0
- spanforge/integrations/llamaindex.py +370 -0
- spanforge/integrations/ollama.py +286 -0
- spanforge/integrations/openai.py +370 -0
- spanforge/integrations/together.py +485 -0
- spanforge/metrics.py +393 -0
- spanforge/metrics_export.py +342 -0
- spanforge/migrate.py +278 -0
- spanforge/model_registry.py +282 -0
- spanforge/models.py +407 -0
- spanforge/namespaces/__init__.py +215 -0
- spanforge/namespaces/audit.py +253 -0
- spanforge/namespaces/cache.py +209 -0
- spanforge/namespaces/chain.py +74 -0
- spanforge/namespaces/confidence.py +69 -0
- spanforge/namespaces/consent.py +85 -0
- spanforge/namespaces/cost.py +175 -0
- spanforge/namespaces/decision.py +135 -0
- spanforge/namespaces/diff.py +146 -0
- spanforge/namespaces/drift.py +79 -0
- spanforge/namespaces/eval_.py +232 -0
- spanforge/namespaces/fence.py +180 -0
- spanforge/namespaces/guard.py +104 -0
- spanforge/namespaces/hitl.py +92 -0
- spanforge/namespaces/latency.py +69 -0
- spanforge/namespaces/prompt.py +185 -0
- spanforge/namespaces/redact.py +172 -0
- spanforge/namespaces/template.py +197 -0
- spanforge/namespaces/tool_call.py +76 -0
- spanforge/namespaces/trace.py +1006 -0
- spanforge/normalizer.py +183 -0
- spanforge/presidio_backend.py +149 -0
- spanforge/processor.py +258 -0
- spanforge/prompt_registry.py +415 -0
- spanforge/py.typed +0 -0
- spanforge/redact.py +780 -0
- spanforge/sampling.py +500 -0
- spanforge/schemas/v1.0/schema.json +170 -0
- spanforge/schemas/v2.0/schema.json +536 -0
- spanforge/signing.py +1152 -0
- spanforge/stream.py +559 -0
- spanforge/testing.py +376 -0
- spanforge/trace.py +199 -0
- spanforge/types.py +696 -0
- spanforge/ulid.py +304 -0
- spanforge/validate.py +383 -0
- spanforge-2.0.0.dist-info/METADATA +1777 -0
- spanforge-2.0.0.dist-info/RECORD +101 -0
- spanforge-2.0.0.dist-info/WHEEL +4 -0
- spanforge-2.0.0.dist-info/entry_points.txt +5 -0
- spanforge-2.0.0.dist-info/licenses/LICENSE +21 -0
spanforge/ulid.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
"""Zero-dependency ULID (Universally Unique Lexicographically Sortable Identifier).
|
|
2
|
+
|
|
3
|
+
Specification: https://github.com/ulid/spec
|
|
4
|
+
|
|
5
|
+
Format (26 Crockford Base32 characters, 128 bits)
|
|
6
|
+
--------------------------------------------------
|
|
7
|
+
::
|
|
8
|
+
|
|
9
|
+
01ARZ3NDEKTSV4RRFFQ69G5FAV
|
|
10
|
+
├──────────┤├────────────────┤
|
|
11
|
+
Timestamp (ms) Random (80 bits)
|
|
12
|
+
48 bits, 10 chars 16 chars
|
|
13
|
+
|
|
14
|
+
Properties
|
|
15
|
+
----------
|
|
16
|
+
* **Lexicographically sortable** — events can be sorted by ULID without parsing
|
|
17
|
+
the timestamp field.
|
|
18
|
+
* **Monotonic within the same millisecond** — the random component is
|
|
19
|
+
incremented rather than regenerated when two ULIDs are requested within the
|
|
20
|
+
same millisecond clock tick, preserving ordering.
|
|
21
|
+
* **URL and filename safe** — only uppercase alphanumerics (Crockford Base32).
|
|
22
|
+
* **Zero external dependencies** — uses only the standard library (:mod:`os`, :mod:`threading`, :mod:`time`, :mod:`typing`).
|
|
23
|
+
|
|
24
|
+
Security note
|
|
25
|
+
-------------
|
|
26
|
+
The random component is seeded from :func:`os.urandom` (CSPRNG), making ULIDs
|
|
27
|
+
safe for use as non-guessable identifiers in audit chains.
|
|
28
|
+
|
|
29
|
+
Performance note
|
|
30
|
+
----------------
|
|
31
|
+
The module-level :class:`_ULIDGenerator` instance is thread-safe via the GIL
|
|
32
|
+
for standard CPython but is explicitly protected with :class:`threading.Lock`
|
|
33
|
+
for correctness on alternative runtimes and as documentation of intent.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
import os
|
|
39
|
+
import threading
|
|
40
|
+
import time
|
|
41
|
+
from typing import Final
|
|
42
|
+
|
|
43
|
+
from spanforge.exceptions import ULIDError
|
|
44
|
+
|
|
45
|
+
__all__ = ["ULID_REGEX", "generate", "validate"]
|
|
46
|
+
|
|
47
|
+
# ---------------------------------------------------------------------------
# Crockford Base32 alphabet (the letters I, L, O and U are deliberately absent)
# ---------------------------------------------------------------------------
_ALPHABET: Final[str] = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
_ALPHABET_LEN: Final[int] = 32  # 2**5 symbols, so each character carries 5 bits

# Character → 5-bit value lookup table, built once at import time.
# Canonical (uppercase) symbols first ...
_DECODE: Final[dict[str, int]] = dict(zip(_ALPHABET, range(len(_ALPHABET))))
# ... then their lowercase spellings ...
_DECODE.update(zip(_ALPHABET.lower(), range(len(_ALPHABET))))
# ... then the look-alike aliases: I/L decode as 1 and O decodes as 0.
# (U has no alias and therefore stays undecodable.)
_DECODE.update(dict.fromkeys("iIlL", 1))
_DECODE.update(dict.fromkeys("oO", 0))

# Characters accepted by validate(): the 32 alphabet symbols in either case.
# The I/L/O aliases are intentionally NOT part of this set — generate() never
# emits them, so validate() rejects them.
_VALID_CHARS: Final[frozenset[str]] = frozenset(_ALPHABET) | frozenset(_ALPHABET.lower())

ULID_LENGTH: Final[int] = 26
# RFC-0001 §6.3 — the leading character is limited to 0-7 so that the 10
# timestamp characters (50 bits of capacity) never encode more than 48 bits.
ULID_REGEX: Final[str] = r"^[0-7][0-9A-HJKMNP-TV-Z]{25}$"

_MAX_TIMESTAMP: Final[int] = 2**48 - 1  # 281 474 976 710 655 ms
|
|
70
|
+
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
# Monotonic generator
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class _ULIDGenerator:
    """Lock-protected ULID source that never hands out a smaller value twice.

    The generator remembers the last ``(timestamp_ms, random)`` pair it issued.
    A call in a later millisecond draws fresh randomness; a call in the same
    (or an earlier) millisecond reuses the remembered timestamp and bumps the
    random segment by one so the new ULID still sorts after the previous one.
    """

    __slots__ = ("_last_ms", "_last_rand", "_lock")

    # Largest value the 80-bit random segment can hold.
    _rand_max: Final[int] = (1 << 80) - 1  # type: ignore[misc]

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._last_ms: int = 0
        self._last_rand: int = 0

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    def generate(self) -> str:
        """Return the next ULID as a 26-character string.

        Raises:
            ULIDError: If the random segment overflows while the clock is
                running backwards, or if the clock fails to advance while
                waiting out an overflow.
        """
        return _encode_ulid(*self._next_ms_rand())

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _next_ms_rand(self) -> tuple[int, int]:
        """Return the next ``(timestamp_ms, random_int)`` pair in sort order."""
        with self._lock:
            now = _now_ms()

            if now > self._last_ms:
                # Clock moved forward: start over with fresh randomness.
                stamp = now
                entropy = _secure_random_80()
            else:
                # Same tick or backwards clock: keep the remembered timestamp
                # and step the random segment.
                stamp = self._last_ms
                entropy = self._last_rand + 1
                if entropy > self._rand_max:
                    if now < self._last_ms:
                        # Cannot wait for a clock that is behind us.
                        raise ULIDError(
                            "Random segment overflow with backwards clock — "
                            "cannot guarantee monotonicity"
                        )
                    # Same tick and out of room: wait for the next tick.
                    stamp = _spin_until_next_ms(now)
                    entropy = _secure_random_80()

            self._last_ms = stamp
            self._last_rand = entropy
            return stamp, entropy
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
# Module-level helpers
|
|
147
|
+
# ---------------------------------------------------------------------------
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _now_ms() -> int:
    """Return the wall-clock Unix time, truncated to whole milliseconds."""
    seconds = time.time()
    return int(seconds * 1_000)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _secure_random_80() -> int:
    """Return an integer built from 10 bytes (80 bits) of :func:`os.urandom`."""
    raw = os.urandom(10)
    return int.from_bytes(raw, byteorder="big")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _spin_until_next_ms(current_ms: int) -> int:
    """Block until the wall clock reads later than *current_ms*.

    The wait is bounded to one second, measured on the monotonic clock so the
    bound itself is unaffected by wall-clock adjustments.

    Args:
        current_ms: The millisecond reading that must be exceeded.

    Returns:
        The first observed millisecond reading greater than *current_ms*.

    Raises:
        ULIDError: If the wall clock has not moved past *current_ms* after
            one second.
    """
    give_up_at = time.monotonic() + 1.0
    observed = _now_ms()
    while observed <= current_ms:
        if time.monotonic() > give_up_at:
            raise ULIDError(
                "Clock did not advance within 1 s — possible system clock freeze"
            )
        # Sleep briefly instead of busy-looping so other threads can run.
        time.sleep(0.001)
        observed = _now_ms()
    return observed
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _encode_ulid(timestamp_ms: int, random_int: int) -> str:
    """Encode (timestamp_ms, random_int) into a 26-character ULID string.

    Args:
        timestamp_ms: Millisecond timestamp in the range ``0 .. 2**48 - 1``.
        random_int: Random value in the range ``0 .. 2**80 - 1``.

    Returns:
        26-character Crockford Base32 ULID string (uppercase): 10 timestamp
        characters followed by 16 random characters.

    Raises:
        ULIDError: If *timestamp_ms* is negative or exceeds the 48-bit
            maximum, or if *random_int* does not fit in 80 bits.
    """
    if timestamp_ms > _MAX_TIMESTAMP:
        raise ULIDError(
            f"Timestamp {timestamp_ms} ms exceeds ULID maximum "
            f"({_MAX_TIMESTAMP} ms ≈ year 10889)"
        )
    # Out-of-range inputs would otherwise be silently masked by the 5-bit
    # extraction below, yielding a ULID that does not round-trip (and, for a
    # negative timestamp, one whose leading character is outside 0-7).
    if timestamp_ms < 0:
        raise ULIDError(
            f"Timestamp {timestamp_ms} ms is negative — ULIDs cannot encode "
            "times before the Unix epoch"
        )
    if not 0 <= random_int < (1 << 80):
        raise ULIDError(
            f"Random component {random_int} does not fit in 80 bits"
        )

    # Treat the ULID as one 128-bit integer (timestamp in the high 48 bits)
    # and read it out 5 bits at a time, most significant group first.
    # 26 groups cover 130 bits; the top two bits are always zero.
    packed = (timestamp_ms << 80) | random_int
    return "".join(
        _ALPHABET[(packed >> shift) & 0x1F] for shift in range(125, -1, -5)
    )
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# ---------------------------------------------------------------------------
|
|
217
|
+
# Public API
|
|
218
|
+
# ---------------------------------------------------------------------------
|
|
219
|
+
|
|
220
|
+
# Single shared generator so every caller of generate() participates in the
# same monotonic sequence.
_generator = _ULIDGenerator()


def generate() -> str:
    """Generate a new ULID string.

    The returned value is:

    * 26 characters long
    * Composed of Crockford Base32 characters (``[0-9A-HJKMNP-TV-Z]``)
    * Lexicographically sortable (earlier ULIDs < later ULIDs as strings)
    * Monotonic within the same millisecond
    * Seeded from :func:`os.urandom` (CSPRNG)

    Returns:
        A 26-character uppercase ULID string.

    Raises:
        ULIDError: On the astronomically unlikely event of internal state
            overflow or backwards-clock exhaustion.

    Example::

        from spanforge.ulid import generate
        event_id = generate()  # e.g. "01ARZ3NDEKTSV4RRFFQ69G5FAV"
    """
    return _generator.generate()
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def validate(value: str) -> bool:
    """Return ``True`` if *value* is a syntactically valid ULID string.

    Validation checks:

    1. *value* is a :class:`str` of exactly 26 characters.
    2. Every character is one of the 32 Crockford Base32 symbols, in either
       case. The look-alike letters I, L, O and U are rejected.
    3. The timestamp component (first 10 characters) does not overflow the
       48-bit range.

    Args:
        value: The string to validate.

    Returns:
        ``True`` if valid, ``False`` otherwise.

    Example::

        validate("01ARZ3NDEKTSV4RRFFQ69G5FAV")  # True
        validate("not-a-ulid")                  # False
    """
    if not isinstance(value, str) or len(value) != ULID_LENGTH:
        return False
    # Test the raw characters rather than value.upper(): str.upper() can map
    # non-ASCII characters onto alphabet symbols (e.g. "ß" -> "SS",
    # "ſ" -> "S"), which would let non-ASCII strings pass and shift the
    # timestamp slice. _VALID_CHARS already contains both cases.
    if not all(ch in _VALID_CHARS for ch in value):
        return False
    # Decode the 10 timestamp characters (the lookup table accepts both
    # cases) and make sure the result fits in 48 bits.
    timestamp = 0
    for ch in value[:10]:
        timestamp = (timestamp << 5) | _DECODE[ch]
    return timestamp <= _MAX_TIMESTAMP
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def extract_timestamp_ms(ulid: str) -> int:
    """Extract the embedded millisecond timestamp from a ULID.

    Args:
        ulid: A valid 26-character ULID string (either case).

    Returns:
        Unix timestamp in milliseconds.

    Raises:
        ULIDError: If *ulid* is not a valid ULID.

    Example::

        from datetime import datetime, timezone

        ms = extract_timestamp_ms("01ARZ3NDEKTSV4RRFFQ69G5FAV")
        print(datetime.fromtimestamp(ms / 1000, tz=timezone.utc))
    """
    # The ASCII guard keeps the slice below aligned: str.upper() may expand a
    # non-ASCII character into several characters, which would shift the
    # timestamp window and decode the wrong symbols.
    if not validate(ulid) or not ulid.isascii():
        raise ULIDError(f"Cannot extract timestamp from invalid ULID: {ulid!r}")
    timestamp = 0
    for ch in ulid[:10].upper():
        timestamp = (timestamp << 5) | _DECODE[ch]
    return timestamp
|
spanforge/validate.py
ADDED
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
"""spanforge.validate — JSON Schema validation for Event envelopes.
|
|
2
|
+
|
|
3
|
+
This module validates :class:`~spanforge.event.Event` instances against the
|
|
4
|
+
published JSON Schema specification. Schema version is selected automatically
|
|
5
|
+
from the event's ``schema_version`` field:
|
|
6
|
+
|
|
7
|
+
* ``"1.0"`` → ``schemas/v1.0/schema.json``
|
|
8
|
+
* ``"2.0"`` (default) → ``schemas/v2.0/schema.json``
|
|
9
|
+
|
|
10
|
+
It uses the optional ``jsonschema`` library when available for full Draft 2020-12
|
|
11
|
+
validation. If ``jsonschema`` is not installed, a lightweight structural check
|
|
12
|
+
is performed using only the Python standard library — external dependencies are
|
|
13
|
+
strictly optional in line with *spanforge*'s zero-required-dependency policy.
|
|
14
|
+
|
|
15
|
+
Usage
|
|
16
|
+
-----
|
|
17
|
+
::
|
|
18
|
+
|
|
19
|
+
from spanforge import Event, EventType
|
|
20
|
+
from spanforge.validate import validate_event
|
|
21
|
+
|
|
22
|
+
event = Event(
|
|
23
|
+
event_type=EventType.TRACE_SPAN_COMPLETED,
|
|
24
|
+
source="llm-trace@0.3.1",
|
|
25
|
+
payload={"span_name": "run", "status": "ok"},
|
|
26
|
+
)
|
|
27
|
+
validate_event(event) # raises SchemaValidationError if invalid
|
|
28
|
+
|
|
29
|
+
Public API
|
|
30
|
+
----------
|
|
31
|
+
* :func:`validate_event` — validate an :class:`~spanforge.event.Event`
|
|
32
|
+
against the matching envelope schema (version-aware).
|
|
33
|
+
* :func:`load_schema` — load a specific schema version by key.
|
|
34
|
+
* :exc:`~spanforge.exceptions.SchemaValidationError` — raised on validation
|
|
35
|
+
failure (re-exported from :mod:`spanforge.exceptions`).
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
from __future__ import annotations
|
|
39
|
+
|
|
40
|
+
import json
|
|
41
|
+
import pathlib
|
|
42
|
+
import re
|
|
43
|
+
from typing import Any
|
|
44
|
+
|
|
45
|
+
from spanforge.event import Event
|
|
46
|
+
from spanforge.exceptions import EventTypeError, SchemaValidationError
|
|
47
|
+
from spanforge.types import is_registered, validate_custom
|
|
48
|
+
|
|
49
|
+
__all__: list[str] = ["load_schema", "validate_event"]
|
|
50
|
+
|
|
51
|
+
# ---------------------------------------------------------------------------
# Schema paths — version-aware (RFC-0001 §15.5)
# ---------------------------------------------------------------------------

# Directory holding the bundled JSON Schema files, next to this module.
_SCHEMAS_DIR: pathlib.Path = pathlib.Path(__file__).parent.joinpath("schemas")

#: Map of schema-version strings to their JSON Schema file paths
#: (``schemas/v<version>/schema.json``).
_SCHEMA_PATHS: dict[str, pathlib.Path] = {
    version: _SCHEMAS_DIR / f"v{version}" / "schema.json"
    for version in ("1.0", "2.0")
}

#: Schema version used when none is requested
#: (RFC-0001-SPANFORGE-Enterprise-2.0).
_DEFAULT_SCHEMA_VERSION: str = "2.0"

# Legacy single-path alias kept for backwards-compatible callers; it still
# refers to the 1.0 schema file.
_SCHEMA_PATH: pathlib.Path = _SCHEMA_PATHS["1.0"]
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------------------
# Compiled patterns from schema (stdlib fallback)
# ---------------------------------------------------------------------------

# RFC-0001 §6.3 — a ULID whose first character is 0-7 (timestamp MSB limit).
_ULID_RE: re.Pattern[str] = re.compile(r"^[0-7][0-9A-HJKMNP-TV-Z]{25}$")

# RFC-0001 §15.5 — only 1.0 and 2.0 are accepted schema versions.
_ACCEPTED_SCHEMA_VERSIONS: frozenset[str] = frozenset(("1.0", "2.0"))

# RFC §7 event-type grammar. Two alternatives:
#   1. "llm.<namespace>.<name>" or "llm.<namespace>.<name>.<name>" where
#      <namespace> is one of the names listed in the pattern;
#   2. any dotted name that does not start with "llm." and ends with two
#      "<name>.<name>" segments after at least two dash-friendly segments.
_EVENT_TYPE_RE: re.Pattern[str] = re.compile(
    r"^(?:"
    r"llm\.(?:trace|cost|cache|eval|guard|fence|prompt|redact|diff|template|audit)"
    r"\.(?:[a-z][a-z0-9_]*|[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*)"
    r"|(?!llm\.)[a-z][a-z0-9-]*(?:\.[a-z][a-z0-9-]*)+\.[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*"
    r")$"  # NOSONAR — RFC §7 grammar with registered llm namespaces
)

# RFC-0001 §6.1 — UTC timestamp with exactly six fractional-second digits.
_TIMESTAMP_RE: re.Pattern[str] = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}Z$")

# RFC-0001 §5.1 — source: a name starting with a letter (letters, digits,
# ".", "_", "-"), then "@" and a semver-like version.
_SOURCE_RE: re.Pattern[str] = re.compile(r"^[a-zA-Z][a-zA-Z0-9._\-]*@\d+\.\d+\.\d+(?:[.\-][a-zA-Z0-9.]+)?$")

# Lowercase-hex identifiers: 32 characters for traces, 16 for spans.
_TRACE_ID_RE: re.Pattern[str] = re.compile(r"^[0-9a-f]{32}$")
_SPAN_ID_RE: re.Pattern[str] = re.compile(r"^[0-9a-f]{16}$")

# Checksum and signature carry distinct prefix indicators set by signing.py.
_CHECKSUM_RE: re.Pattern[str] = re.compile(r"^sha256:[0-9a-f]{64}$")
_SIGNATURE_RE: re.Pattern[str] = re.compile(r"^hmac-sha256:[0-9a-f]{64}$")

# Upper bound on the number of entries in an event's ``tags`` object.
_MAX_TAG_KEYS: int = 50

# RFC-0001 §6.3 — ULID max length is 26 characters; 1 MB payload cap.
_MAX_EVENT_ID_LEN: int = 26
_MAX_PAYLOAD_BYTES: int = 1_000_000
|
|
98
|
+
|
|
99
|
+
# ---------------------------------------------------------------------------
|
|
100
|
+
# Schema loader
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
|
|
103
|
+
# Parsed schemas keyed by version string; filled lazily by load_schema().
_CACHED_SCHEMAS: dict[str, dict[str, Any]] = {}

# Legacy alias kept for call sites that used the old single-schema API.
_CACHED_SCHEMA: dict[str, Any] | None = None


def load_schema(version: str | None = None) -> dict[str, Any]:
    """Return the parsed JSON Schema for *version*, reading it at most once.

    Parameters
    ----------
    version:
        Schema version string, e.g. ``"1.0"`` or ``"2.0"``. A falsy value
        (``None`` or ``""``) selects the default version (``"2.0"``;
        RFC §15.5).

    Returns
    -------
    dict
        The parsed schema. The same object is returned on every call for a
        given version, because results are kept in a module-level cache.

    Raises
    ------
    ValueError
        If *version* is not one of the known schema versions.
    FileNotFoundError
        If the schema file for a known version is not present on disk.
    """
    resolved = version if version else _DEFAULT_SCHEMA_VERSION

    try:
        return _CACHED_SCHEMAS[resolved]
    except KeyError:
        pass  # not loaded yet — fall through to the disk read

    # RFC-0001 §15.5: unknown schema versions MUST raise and stop processing.
    schema_file = _SCHEMA_PATHS.get(resolved)
    if schema_file is None:
        raise ValueError(
            f"Unknown schema version {resolved!r}. "
            f"Available versions: {list(_SCHEMA_PATHS)}"
        )

    if not schema_file.is_file():
        raise FileNotFoundError(
            f"JSON Schema not found at {schema_file}. "
            "Ensure the 'schemas/' directory is included in the "
            "installed package."
        )

    parsed = json.loads(schema_file.read_text(encoding="utf-8"))
    _CACHED_SCHEMAS[resolved] = parsed
    return parsed
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# ---------------------------------------------------------------------------
|
|
157
|
+
# Internal: stdlib structural validation
|
|
158
|
+
# ---------------------------------------------------------------------------
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _check_string_field(
    doc: dict[str, Any],
    field: str,
    *,
    required: bool = True,
    pattern: re.Pattern[str] | None = None,
    min_length: int = 1,
) -> None:
    """Validate a single string field in *doc*.

    Parameters
    ----------
    doc:
        The serialised event envelope.
    field:
        Key to look up in *doc*.
    required:
        When ``True`` a missing key is an error; when ``False`` a missing
        key is silently accepted.
    pattern:
        Optional compiled regex that the whole value must match.
    min_length:
        Minimum number of characters the value must have.

    Raises
    ------
    SchemaValidationError
        If the field is missing while required, is not a string, is shorter
        than *min_length*, or does not match *pattern*.
    """
    if field not in doc:
        if required:
            raise SchemaValidationError(
                field=field,
                received=None,
                reason=f"required field '{field}' is missing",
            )
        return
    value = doc[field]
    if not isinstance(value, str):
        raise SchemaValidationError(
            field=field,
            received=value,
            reason=f"'{field}' must be a string",
        )
    if len(value) < min_length:
        raise SchemaValidationError(
            field=field,
            received=value,
            reason=f"'{field}' must be at least {min_length} character(s)",
        )
    # fullmatch, not match: with match() a "$" anchor also succeeds just
    # before a trailing newline, so "<valid value>\n" would be accepted.
    if pattern is not None and not pattern.fullmatch(value):
        raise SchemaValidationError(
            field=field,
            received=value,
            reason=f"'{field}' does not match pattern {pattern.pattern!r}",
        )
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _validate_tags(tags: Any) -> None:
    """Check that *tags* is a small mapping of non-empty strings.

    Raises :exc:`SchemaValidationError` for the first problem found: *tags*
    is not a dict, it has more than ``_MAX_TAG_KEYS`` entries, or one of its
    keys or values is not a non-empty string. Entries are examined in
    iteration order, the key before its value.
    """
    if not isinstance(tags, dict):
        raise SchemaValidationError(
            field="tags",
            received=tags,
            reason="'tags' must be an object",
        )

    if len(tags) > _MAX_TAG_KEYS:
        raise SchemaValidationError(
            field="tags",
            received=tags,
            reason=f"'tags' must contain at most {_MAX_TAG_KEYS} keys",
        )

    for tag_key, tag_value in tags.items():
        key_ok = isinstance(tag_key, str) and bool(tag_key)
        if not key_ok:
            raise SchemaValidationError(
                field=f"tags.{tag_key!r}",
                received=tag_key,
                reason="tag key must be a non-empty string",
            )

        value_ok = isinstance(tag_value, str) and bool(tag_value)
        if not value_ok:
            raise SchemaValidationError(
                field=f"tags.{tag_key}",
                received=tag_value,
                reason="tag value must be a non-empty string",
            )
|
|
225
|
+
)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _stdlib_validate(doc: dict[str, Any]) -> None:
    """Validate an event envelope using only the standard library.

    This is the fallback used when ``jsonschema`` is unavailable. It checks
    the required fields, their types and their regex patterns, then the
    optional tracing, context, integrity and tag fields. Checks run in a
    fixed order and :exc:`~spanforge.exceptions.SchemaValidationError` is
    raised for the first violation encountered.
    """
    if not isinstance(doc, dict):
        raise SchemaValidationError(
            field="<root>",
            received=doc,
            reason="event must serialise to a JSON object",
        )

    # --- required envelope fields ---------------------------------------
    _check_string_field(doc, "schema_version")
    declared_version = doc["schema_version"]
    if declared_version not in _ACCEPTED_SCHEMA_VERSIONS:
        raise SchemaValidationError(
            field="schema_version",
            received=declared_version,
            reason=f"'schema_version' must be one of {sorted(_ACCEPTED_SCHEMA_VERSIONS)!r}",
        )

    _check_string_field(doc, "event_id", pattern=_ULID_RE)

    _check_string_field(doc, "event_type", pattern=_EVENT_TYPE_RE)
    event_type = doc["event_type"]
    if not is_registered(event_type):
        # Unregistered types must still pass the custom-type rules.
        try:
            validate_custom(event_type)
        except EventTypeError as exc:
            raise SchemaValidationError(
                field="event_type",
                received=event_type,
                reason=str(exc),
            ) from exc

    _check_string_field(doc, "timestamp", pattern=_TIMESTAMP_RE)
    _check_string_field(doc, "source", pattern=_SOURCE_RE)

    # --- payload: must be present and a non-empty object ------------------
    if "payload" not in doc:
        raise SchemaValidationError(
            field="payload",
            received=None,
            reason="required field 'payload' is missing",
        )
    payload = doc["payload"]
    if not (isinstance(payload, dict) and payload):
        raise SchemaValidationError(
            field="payload",
            received=payload,
            reason="'payload' must be a non-empty object",
        )

    # --- optional string fields, checked in this order --------------------
    # A pattern of None means "any non-empty string".
    optional_fields: tuple[tuple[str, re.Pattern[str] | None], ...] = (
        # tracing
        ("span_id", _SPAN_ID_RE),
        ("parent_span_id", _SPAN_ID_RE),
        ("trace_id", _TRACE_ID_RE),
        # context
        ("org_id", None),
        ("team_id", None),
        ("actor_id", None),
        ("session_id", None),
        # integrity — checksum and signature use distinct prefix patterns
        ("checksum", _CHECKSUM_RE),
        ("signature", _SIGNATURE_RE),
        ("prev_id", _ULID_RE),
    )
    for name, field_pattern in optional_fields:
        _check_string_field(doc, name, required=False, pattern=field_pattern)

    # --- tags -------------------------------------------------------------
    if "tags" in doc:
        _validate_tags(doc["tags"])
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
# ---------------------------------------------------------------------------
|
|
297
|
+
# Public API
|
|
298
|
+
# ---------------------------------------------------------------------------
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def validate_event(event: Event) -> None:
    """Validate *event* against the JSON Schema matching its schema version.

    Serialises *event* to a plain dict, applies two size limits (event ID
    length and serialised payload size), then validates the envelope
    structure. The schema is chosen from the envelope's ``schema_version``
    field, falling back to the default version when that field is absent or
    empty. When the optional ``jsonschema`` package is importable, it
    performs the validation. Otherwise a stdlib-only structural check is run
    that covers required fields, types, and regex patterns.

    Parameters
    ----------
    event:
        The :class:`~spanforge.event.Event` instance to validate.

    Raises
    ------
    TypeError
        If *event* is not an :class:`~spanforge.event.Event`.
    SchemaValidationError
        If the event does not conform to the envelope schema or exceeds a
        size limit.
    ValueError
        If ``jsonschema`` is installed and the event declares a schema
        version for which no schema file is registered (raised by
        :func:`load_schema`).
    FileNotFoundError
        If the schema file is missing from the installed distribution.

    Examples
    --------
    ::

        from spanforge import Event, EventType
        from spanforge.validate import validate_event

        event = Event(
            event_type=EventType.TRACE_SPAN_COMPLETED,
            source="llm-trace@0.3.1",
            payload={"span_name": "run", "status": "ok"},
        )
        validate_event(event)  # passes silently
    """
    if not isinstance(event, Event):
        raise TypeError(f"validate_event() expects an Event instance, got {type(event)!r}")

    doc = event.to_dict()

    # H9: bound-check event_id length and payload wire size before schema validation.
    event_id_val = doc.get("event_id", "")
    # Only measure strings here; a non-string event_id is left for the schema
    # checks below, which report it as a SchemaValidationError instead of
    # letting len() raise a bare TypeError.
    if isinstance(event_id_val, str) and len(event_id_val) > _MAX_EVENT_ID_LEN:
        raise SchemaValidationError(
            field="event_id",
            received=event_id_val,
            reason=(
                f"event_id length {len(event_id_val)} exceeds maximum "
                f"{_MAX_EVENT_ID_LEN} characters"
            ),
        )
    # default=str lets non-JSON-native payload values be measured instead of
    # raising during serialisation.
    _payload_bytes = len(json.dumps(doc.get("payload", {}), default=str).encode())
    if _payload_bytes > _MAX_PAYLOAD_BYTES:
        raise SchemaValidationError(
            field="payload",
            received=None,
            reason=(
                f"payload size {_payload_bytes} bytes exceeds maximum "
                f"{_MAX_PAYLOAD_BYTES} bytes"
            ),
        )

    # Select schema version from event envelope (RFC §15.5).
    schema_version: str = doc.get("schema_version") or _DEFAULT_SCHEMA_VERSION

    # Keep the try body limited to the optional import: an ImportError raised
    # later (while loading the schema or validating) is a real failure and
    # must not be mistaken for "jsonschema is not installed".
    try:
        import jsonschema  # noqa: PLC0415 (optional import)
        import jsonschema.exceptions  # noqa: PLC0415
    except ImportError:
        # jsonschema not installed — fall back to stdlib structural check.
        _stdlib_validate(doc)
        return

    schema = load_schema(schema_version)
    try:
        jsonschema.validate(instance=doc, schema=schema)
    except jsonschema.exceptions.ValidationError as exc:
        # Convert jsonschema's error into our domain error.
        field_path = ".".join(str(part) for part in exc.absolute_path) or "<root>"
        raise SchemaValidationError(
            field=field_path,
            received=exc.instance,
            reason=exc.message,
        ) from exc
|