spanforge 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. spanforge/__init__.py +695 -0
  2. spanforge/_batch_exporter.py +322 -0
  3. spanforge/_cli.py +3081 -0
  4. spanforge/_hooks.py +340 -0
  5. spanforge/_server.py +953 -0
  6. spanforge/_span.py +1015 -0
  7. spanforge/_store.py +287 -0
  8. spanforge/_stream.py +654 -0
  9. spanforge/_trace.py +334 -0
  10. spanforge/_tracer.py +253 -0
  11. spanforge/actor.py +141 -0
  12. spanforge/alerts.py +464 -0
  13. spanforge/auto.py +181 -0
  14. spanforge/baseline.py +336 -0
  15. spanforge/config.py +460 -0
  16. spanforge/consent.py +227 -0
  17. spanforge/consumer.py +379 -0
  18. spanforge/core/__init__.py +5 -0
  19. spanforge/core/compliance_mapping.py +1060 -0
  20. spanforge/cost.py +597 -0
  21. spanforge/debug.py +514 -0
  22. spanforge/drift.py +488 -0
  23. spanforge/egress.py +63 -0
  24. spanforge/eval.py +575 -0
  25. spanforge/event.py +1052 -0
  26. spanforge/exceptions.py +246 -0
  27. spanforge/explain.py +181 -0
  28. spanforge/export/__init__.py +50 -0
  29. spanforge/export/append_only.py +342 -0
  30. spanforge/export/cloud.py +349 -0
  31. spanforge/export/datadog.py +495 -0
  32. spanforge/export/grafana.py +331 -0
  33. spanforge/export/jsonl.py +198 -0
  34. spanforge/export/otel_bridge.py +291 -0
  35. spanforge/export/otlp.py +817 -0
  36. spanforge/export/otlp_bridge.py +231 -0
  37. spanforge/export/redis_backend.py +282 -0
  38. spanforge/export/webhook.py +302 -0
  39. spanforge/exporters/__init__.py +29 -0
  40. spanforge/exporters/console.py +271 -0
  41. spanforge/exporters/jsonl.py +144 -0
  42. spanforge/hitl.py +297 -0
  43. spanforge/inspect.py +429 -0
  44. spanforge/integrations/__init__.py +39 -0
  45. spanforge/integrations/_pricing.py +277 -0
  46. spanforge/integrations/anthropic.py +388 -0
  47. spanforge/integrations/bedrock.py +306 -0
  48. spanforge/integrations/crewai.py +251 -0
  49. spanforge/integrations/gemini.py +349 -0
  50. spanforge/integrations/groq.py +444 -0
  51. spanforge/integrations/langchain.py +349 -0
  52. spanforge/integrations/llamaindex.py +370 -0
  53. spanforge/integrations/ollama.py +286 -0
  54. spanforge/integrations/openai.py +370 -0
  55. spanforge/integrations/together.py +485 -0
  56. spanforge/metrics.py +393 -0
  57. spanforge/metrics_export.py +342 -0
  58. spanforge/migrate.py +278 -0
  59. spanforge/model_registry.py +282 -0
  60. spanforge/models.py +407 -0
  61. spanforge/namespaces/__init__.py +215 -0
  62. spanforge/namespaces/audit.py +253 -0
  63. spanforge/namespaces/cache.py +209 -0
  64. spanforge/namespaces/chain.py +74 -0
  65. spanforge/namespaces/confidence.py +69 -0
  66. spanforge/namespaces/consent.py +85 -0
  67. spanforge/namespaces/cost.py +175 -0
  68. spanforge/namespaces/decision.py +135 -0
  69. spanforge/namespaces/diff.py +146 -0
  70. spanforge/namespaces/drift.py +79 -0
  71. spanforge/namespaces/eval_.py +232 -0
  72. spanforge/namespaces/fence.py +180 -0
  73. spanforge/namespaces/guard.py +104 -0
  74. spanforge/namespaces/hitl.py +92 -0
  75. spanforge/namespaces/latency.py +69 -0
  76. spanforge/namespaces/prompt.py +185 -0
  77. spanforge/namespaces/redact.py +172 -0
  78. spanforge/namespaces/template.py +197 -0
  79. spanforge/namespaces/tool_call.py +76 -0
  80. spanforge/namespaces/trace.py +1006 -0
  81. spanforge/normalizer.py +183 -0
  82. spanforge/presidio_backend.py +149 -0
  83. spanforge/processor.py +258 -0
  84. spanforge/prompt_registry.py +415 -0
  85. spanforge/py.typed +0 -0
  86. spanforge/redact.py +780 -0
  87. spanforge/sampling.py +500 -0
  88. spanforge/schemas/v1.0/schema.json +170 -0
  89. spanforge/schemas/v2.0/schema.json +536 -0
  90. spanforge/signing.py +1152 -0
  91. spanforge/stream.py +559 -0
  92. spanforge/testing.py +376 -0
  93. spanforge/trace.py +199 -0
  94. spanforge/types.py +696 -0
  95. spanforge/ulid.py +304 -0
  96. spanforge/validate.py +383 -0
  97. spanforge-2.0.0.dist-info/METADATA +1777 -0
  98. spanforge-2.0.0.dist-info/RECORD +101 -0
  99. spanforge-2.0.0.dist-info/WHEEL +4 -0
  100. spanforge-2.0.0.dist-info/entry_points.txt +5 -0
  101. spanforge-2.0.0.dist-info/licenses/LICENSE +21 -0
spanforge/ulid.py ADDED
@@ -0,0 +1,304 @@
1
+ """Zero-dependency ULID (Universally Unique Lexicographically Sortable Identifier).
2
+
3
+ Specification: https://github.com/ulid/spec
4
+
5
+ Format (26 Crockford Base32 characters, 128 bits)
6
+ --------------------------------------------------
7
+ ::
8
+
9
+ 01ARZ3NDEKTSV4RRFFQ69G5FAV
10
+ ├──────────┤├────────────────┤
11
+ Timestamp (ms) Random (80 bits)
12
+ 48 bits, 10 chars 16 chars
13
+
14
+ Properties
15
+ ----------
16
+ * **Lexicographically sortable** — events can be sorted by ULID without parsing
17
+ the timestamp field.
18
+ * **Monotonic within the same millisecond** — the random component is
19
+ incremented rather than regenerated when two ULIDs are requested within the
20
+ same millisecond clock tick, preserving ordering.
21
+ * **URL and filename safe** — only uppercase alphanumerics (Crockford Base32).
22
+ * **Zero external dependencies** — uses only the standard library (:mod:`os`, :mod:`threading`, :mod:`time`, :mod:`typing`).
23
+
24
+ Security note
25
+ -------------
26
+ The random component is seeded from :func:`os.urandom` (CSPRNG), making ULIDs
27
+ safe for use as non-guessable identifiers in audit chains.
28
+
29
+ Performance note
30
+ ----------------
31
+ The module-level :class:`_ULIDGenerator` instance is thread-safe via the GIL
32
+ for standard CPython but is explicitly protected with :class:`threading.Lock`
33
+ for correctness on alternative runtimes and as documentation of intent.
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import os
39
+ import threading
40
+ import time
41
+ from typing import Final
42
+
43
+ from spanforge.exceptions import ULIDError
44
+
45
# Public names of this module.  ``ULID_LENGTH`` and ``extract_timestamp_ms``
# are defined in this module without a leading underscore, so they are listed
# here together with the rest of the public API.
__all__ = ["ULID_LENGTH", "ULID_REGEX", "extract_timestamp_ms", "generate", "validate"]

# ---------------------------------------------------------------------------
# Crockford Base32 alphabet (excludes I, L, O, U to avoid confusion)
# ---------------------------------------------------------------------------
_ALPHABET: Final[str] = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
_ALPHABET_LEN: Final[int] = 32  # exactly 2^5 — one char encodes 5 bits

# Decode lookup table for O(1) character → value conversion.  Built in a
# single expression so the ``Final`` mapping is never mutated afterwards.
# It contains:
#   * the 32 canonical upper-case characters,
#   * their lower-case forms,
#   * the look-alike aliases I/i and L/l (decoded as 1) and O/o (decoded as 0).
# There is no alias for U: it is simply absent from the table.
_DECODE: Final[dict[str, int]] = {
    **{ch: idx for idx, ch in enumerate(_ALPHABET)},
    **{ch.lower(): idx for idx, ch in enumerate(_ALPHABET)},
    "i": 1,
    "I": 1,
    "l": 1,
    "L": 1,
    "o": 0,
    "O": 0,
}

# Strict charset for validation — excludes the I/L/O aliases and U (generate()
# never emits them; validate() must reject them for canonical-form compliance).
# Both letter cases of the canonical alphabet are included.
_VALID_CHARS: Final[frozenset[str]] = frozenset(_ALPHABET + _ALPHABET.lower())

ULID_LENGTH: Final[int] = 26
# RFC-0001 §6.3 — first character must be 0-7 (timestamp MSBs, max value
# «0111» ensures the 48-bit timestamp fits in 10 Crockford characters).
# Note: this pattern is upper-case only.
ULID_REGEX: Final[str] = r"^[0-7][0-9A-HJKMNP-TV-Z]{25}$"

_MAX_TIMESTAMP: Final[int] = (1 << 48) - 1  # 281 474 976 710 655 ms
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # Monotonic generator
73
+ # ---------------------------------------------------------------------------
74
+
75
+
76
class _ULIDGenerator:
    """Stateful ULID source that stays monotonic inside a single millisecond.

    The generator remembers the last timestamp and random value it handed
    out.  A request that lands in the same millisecond (or while the clock
    reads earlier than the remembered timestamp) reuses that timestamp and
    returns the previous random value plus one, so successive ULIDs keep
    their lexicographic order.  If the 80-bit random value cannot be
    incremented any further, the generator waits for the clock to advance
    (same millisecond) or raises (clock behind the remembered timestamp).
    """

    __slots__ = ("_last_ms", "_last_rand", "_lock")

    # Largest value that fits in the 80-bit random segment.
    _rand_max: Final[int] = (1 << 80) - 1  # type: ignore[misc]

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._last_ms: int = 0
        self._last_rand: int = 0

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    def generate(self) -> str:
        """Return a new ULID string.

        Raises:
            ULIDError: Propagated from the helpers this method calls — the
                random segment overflowed while the clock was behind the
                last issued timestamp, the clock did not advance while
                waiting for the next millisecond, or the timestamp does not
                fit the ULID format.
        """
        return _encode_ulid(*self._next_ms_rand())

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _next_ms_rand(self) -> tuple[int, int]:
        """Return ``(timestamp_ms, random_int)`` for the next ULID.

        The whole read-modify-write of the remembered state happens under
        ``self._lock``.
        """
        with self._lock:
            now = _now_ms()
            previous_ms = self._last_ms

            # Clock moved forward: start over with a fresh random segment.
            if now > previous_ms:
                fresh = _secure_random_80()
                self._last_ms = now
                self._last_rand = fresh
                return now, fresh

            # Same millisecond, or the clock reads earlier than before.  In
            # both cases the remembered timestamp is reused and the random
            # segment is bumped by one, as long as it still fits in 80 bits.
            bumped = self._last_rand + 1
            if bumped <= self._rand_max:
                self._last_rand = bumped
                return previous_ms, bumped

            # Random segment exhausted while the clock is behind: there is
            # no later timestamp to move to, so ordering cannot be kept.
            if now < previous_ms:
                raise ULIDError(
                    "Random segment overflow with backwards clock — "
                    "cannot guarantee monotonicity"
                )

            # Random segment exhausted within the current millisecond: wait
            # for the next tick, then start over with fresh randomness.
            advanced = _spin_until_next_ms(now)
            fresh = _secure_random_80()
            self._last_ms = advanced
            self._last_rand = fresh
            return advanced, fresh
143
+
144
+
145
+ # ---------------------------------------------------------------------------
146
+ # Module-level helpers
147
+ # ---------------------------------------------------------------------------
148
+
149
+
150
+ def _now_ms() -> int:
151
+ """Return current Unix time in milliseconds as an integer."""
152
+ return int(time.time() * 1_000)
153
+
154
+
155
+ def _secure_random_80() -> int:
156
+ """Return 80 cryptographically-secure random bits as an integer."""
157
+ return int.from_bytes(os.urandom(10), "big")
158
+
159
+
160
def _spin_until_next_ms(current_ms: int) -> int:
    """Block until ``_now_ms()`` reports a value greater than *current_ms*.

    The wait is bounded by a 1-second deadline measured with
    :func:`time.monotonic`.

    Args:
        current_ms: The millisecond timestamp that must be exceeded.

    Returns:
        The first observed millisecond timestamp greater than *current_ms*.

    Raises:
        ULIDError: If the deadline passes before the clock reading exceeds
            *current_ms* (e.g. clock is frozen or running backwards).
    """
    give_up_at = time.monotonic() + 1.0
    observed = _now_ms()
    while observed <= current_ms:
        if time.monotonic() > give_up_at:
            raise ULIDError(
                "Clock did not advance within 1 s — possible system clock freeze"
            )
        # Sleep briefly instead of busy-waiting so other threads can run.
        time.sleep(0.001)
        observed = _now_ms()
    return observed
178
+
179
+
180
def _encode_ulid(timestamp_ms: int, random_int: int) -> str:
    """Encode (timestamp_ms, random_int) into a 26-character ULID string.

    Args:
        timestamp_ms: Millisecond timestamp; must satisfy
            ``0 <= timestamp_ms <= _MAX_TIMESTAMP`` (48 bits).
        random_int: Random value; must satisfy ``0 <= random_int < 2**80``.

    Returns:
        26-character Crockford Base32 ULID string (uppercase): 10 timestamp
        characters followed by 16 random characters.

    Raises:
        ULIDError: If either argument is outside its range.  Without these
            checks a negative timestamp would be encoded as characters that
            do not form a valid ULID, and an oversized random value would be
            silently truncated.
    """
    if timestamp_ms > _MAX_TIMESTAMP:
        raise ULIDError(
            f"Timestamp {timestamp_ms} ms exceeds ULID maximum "
            f"({_MAX_TIMESTAMP} ms ≈ year 10889)"
        )
    if timestamp_ms < 0:
        raise ULIDError(f"Timestamp {timestamp_ms} ms is negative")
    if not 0 <= random_int < (1 << 80):
        raise ULIDError("Random component does not fit in 80 unsigned bits")

    # Pack both fields into one 128-bit integer (timestamp in the high 48
    # bits) and emit 26 five-bit groups, most significant first.  26 groups
    # cover 130 bits; the two extra leading bits are always zero, which is
    # why the first character never exceeds "7".
    packed = (timestamp_ms << 80) | random_int
    return "".join(
        _ALPHABET[(packed >> shift) & 0x1F] for shift in range(125, -1, -5)
    )
214
+
215
+
216
+ # ---------------------------------------------------------------------------
217
+ # Public API
218
+ # ---------------------------------------------------------------------------
219
+
220
# Shared module-level generator instance; generate() delegates to it.
_generator = _ULIDGenerator()


def generate() -> str:
    """Generate a new ULID string.

    The returned value is:

    * 26 characters long
    * Composed of Crockford Base32 characters (``[0-9A-HJKMNP-TV-Z]``)
    * Lexicographically sortable (earlier ULIDs < later ULIDs as strings)
    * Monotonic within the same millisecond
    * Seeded from :func:`os.urandom` (CSPRNG)

    Returns:
        A 26-character uppercase ULID string.

    Raises:
        ULIDError: On the astronomically unlikely event of internal state
            overflow or backwards-clock exhaustion.

    Example::

        from spanforge.ulid import generate
        event_id = generate()  # e.g. "01ARZ3NDEKTSV4RRFFQ69G5FAV"
    """
    return _generator.generate()
247
+
248
+
249
def validate(value: str) -> bool:
    """Return ``True`` if *value* is a syntactically valid ULID string.

    Validation checks:

    1. *value* is a :class:`str` of exactly 26 characters.
    2. Every character belongs to the Crockford Base32 alphabet.  Upper- and
       lower-case letters are both accepted; the letters I, L, O and U are
       rejected.
    3. The timestamp component does not overflow the 48-bit range.

    Args:
        value: The string to validate.

    Returns:
        ``True`` if valid, ``False`` otherwise.

    Example::

        validate("01ARZ3NDEKTSV4RRFFQ69G5FAV")  # True
        validate("not-a-ulid")                  # False
    """
    if not isinstance(value, str) or len(value) != ULID_LENGTH:
        return False
    # Test the characters exactly as given instead of ``value.upper()``:
    # upper-casing maps some non-ASCII characters onto alphabet letters
    # (e.g. "ſ" -> "S", "ß" -> "SS"), which would accept non-ULID text and
    # can change the string length after it was checked.  ``_VALID_CHARS``
    # already contains both letter cases.
    if not _VALID_CHARS.issuperset(value):
        return False
    # Decode the 10 timestamp characters (5 bits each) and check the range.
    # ``_DECODE`` has entries for both letter cases.
    timestamp = 0
    for ch in value[:10]:
        timestamp = (timestamp << 5) | _DECODE[ch]
    return timestamp <= _MAX_TIMESTAMP
280
+
281
+
282
def extract_timestamp_ms(ulid: str) -> int:
    """Extract the embedded millisecond timestamp from a ULID.

    Args:
        ulid: A valid 26-character ULID string.

    Returns:
        Unix timestamp in milliseconds.

    Raises:
        ULIDError: If *ulid* is not a valid ULID.

    Example::

        from datetime import datetime, timezone

        ms = extract_timestamp_ms("01ARZ3NDEKTSV4RRFFQ69G5FAV")
        print(datetime.fromtimestamp(ms / 1000, tz=timezone.utc))
    """
    # Every alphabet character is ASCII.  The explicit isascii() guard keeps
    # characters whose upper-case form happens to be an alphabet letter
    # (e.g. "ſ" -> "S") from being decoded as if they were ULID characters.
    if not validate(ulid) or not ulid.isascii():
        raise ULIDError(f"Cannot extract timestamp from invalid ULID: {ulid!r}")
    # The first 10 characters carry the timestamp, 5 bits per character.
    t = 0
    for ch in ulid.upper()[:10]:
        t = (t << 5) | _DECODE[ch]
    return t
spanforge/validate.py ADDED
@@ -0,0 +1,383 @@
1
+ """spanforge.validate — JSON Schema validation for Event envelopes.
2
+
3
+ This module validates :class:`~spanforge.event.Event` instances against the
4
+ published JSON Schema specification. Schema version is selected automatically
5
+ from the event's ``schema_version`` field:
6
+
7
+ * ``"1.0"`` → ``schemas/v1.0/schema.json``
8
+ * ``"2.0"`` (default) → ``schemas/v2.0/schema.json``
9
+
10
+ It uses the optional ``jsonschema`` library when available for full Draft 2020-12
11
+ validation. If ``jsonschema`` is not installed, a lightweight structural check
12
+ is performed using only the Python standard library — external dependencies are
13
+ strictly optional in line with *spanforge*'s zero-required-dependency policy.
14
+
15
+ Usage
16
+ -----
17
+ ::
18
+
19
+ from spanforge import Event, EventType
20
+ from spanforge.validate import validate_event
21
+
22
+ event = Event(
23
+ event_type=EventType.TRACE_SPAN_COMPLETED,
24
+ source="llm-trace@0.3.1",
25
+ payload={"span_name": "run", "status": "ok"},
26
+ )
27
+ validate_event(event) # raises SchemaValidationError if invalid
28
+
29
+ Public API
30
+ ----------
31
+ * :func:`validate_event` — validate an :class:`~spanforge.event.Event`
32
+ against the matching envelope schema (version-aware).
33
+ * :func:`load_schema` — load a specific schema version by key.
34
+ * :exc:`~spanforge.exceptions.SchemaValidationError` — raised on validation
35
+ failure (re-exported from :mod:`spanforge.exceptions`).
36
+ """
37
+
38
+ from __future__ import annotations
39
+
40
+ import json
41
+ import pathlib
42
+ import re
43
+ from typing import Any
44
+
45
+ from spanforge.event import Event
46
+ from spanforge.exceptions import EventTypeError, SchemaValidationError
47
+ from spanforge.types import is_registered, validate_custom
48
+
49
# Names exported by ``from spanforge.validate import *``.
__all__: list[str] = ["load_schema", "validate_event"]

# ---------------------------------------------------------------------------
# Schema paths — version-aware (RFC-0001 §15.5)
# ---------------------------------------------------------------------------

# Directory holding the bundled JSON Schema files, next to this module.
_SCHEMAS_DIR: pathlib.Path = pathlib.Path(__file__).parent / "schemas"

#: Map of schema-version strings to their JSON Schema file paths.
_SCHEMA_PATHS: dict[str, pathlib.Path] = {
    "1.0": _SCHEMAS_DIR / "v1.0" / "schema.json",
    "2.0": _SCHEMAS_DIR / "v2.0" / "schema.json",
}

#: Default (current) schema version (RFC-0001-SPANFORGE-Enterprise-2.0).
_DEFAULT_SCHEMA_VERSION: str = "2.0"

# Legacy single-path alias kept for backwards-compatible callers.
# NOTE: it points at the v1.0 schema, not at the current default ("2.0").
_SCHEMA_PATH: pathlib.Path = _SCHEMA_PATHS["1.0"]

# ---------------------------------------------------------------------------
# Compiled patterns from schema (stdlib fallback)
# ---------------------------------------------------------------------------
# NOTE: every pattern below is anchored with ``^...$``.  In Python, ``$`` also
# matches just before a single trailing newline, so code that needs an exact
# match should use ``Pattern.fullmatch`` rather than ``Pattern.match``.

# RFC-0001 §6.3 — first char 0-7 (timestamp MSB constraint)
_ULID_RE: re.Pattern[str] = re.compile(r"^[0-7][0-9A-HJKMNP-TV-Z]{25}$")
# RFC-0001 §15.5 — only 1.0 and 2.0 are accepted schema versions.
_ACCEPTED_SCHEMA_VERSIONS: frozenset[str] = frozenset({"1.0", "2.0"})
# Two alternatives: ``llm.<registered namespace>.<name>[.<name>]``, or a
# non-``llm.`` dotted prefix followed by two further dotted name segments.
_EVENT_TYPE_RE: re.Pattern[str] = re.compile(
    r"^(?:llm\.(?:trace|cost|cache|eval|guard|fence|prompt|redact|diff|template|audit)\.(?:[a-z][a-z0-9_]*|[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*)|(?!llm\.)[a-z][a-z0-9-]*(?:\.[a-z][a-z0-9-]*)+\.[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*)$"  # NOSONAR — RFC §7 grammar with registered llm namespaces
)
# RFC-0001 §6.1 — microsecond precision mandatory (exactly 6 decimal places)
_TIMESTAMP_RE: re.Pattern[str] = re.compile(
    r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}Z$"
)
# RFC-0001 §5.1 — source: letter start, letters/digits/._-, then @semver
_SOURCE_RE: re.Pattern[str] = re.compile(
    r"^[a-zA-Z][a-zA-Z0-9._\-]*@\d+\.\d+\.\d+(?:[.\-][a-zA-Z0-9.]+)?$"
)
# Trace IDs are 32 lowercase hex characters; span IDs are 16.
_TRACE_ID_RE: re.Pattern[str] = re.compile(r"^[0-9a-f]{32}$")
_SPAN_ID_RE: re.Pattern[str] = re.compile(r"^[0-9a-f]{16}$")
# Checksum and signature carry distinct prefix indicators set by signing.py.
_CHECKSUM_RE: re.Pattern[str] = re.compile(r"^sha256:[0-9a-f]{64}$")
_SIGNATURE_RE: re.Pattern[str] = re.compile(r"^hmac-sha256:[0-9a-f]{64}$")
# Upper bound on the number of keys allowed in an event's ``tags`` object.
_MAX_TAG_KEYS: int = 50

# RFC-0001 §6.3 — ULID max length is 26 characters; 1 MB payload cap.
# The payload cap is 1 000 000 bytes, compared against the UTF-8 encoded
# JSON serialisation of the payload.
_MAX_EVENT_ID_LEN: int = 26
_MAX_PAYLOAD_BYTES: int = 1_000_000

# ---------------------------------------------------------------------------
# Schema loader
# ---------------------------------------------------------------------------

# Parsed schemas keyed by version string; filled lazily by load_schema().
_CACHED_SCHEMAS: dict[str, dict[str, Any]] = {}

# Legacy alias kept for call sites that used the old single-schema API.
_CACHED_SCHEMA: dict[str, Any] | None = None
107
+
108
+
109
def load_schema(version: str | None = None) -> dict[str, Any]:
    """Return the parsed JSON Schema for *version*, reading it at most once.

    Parameters
    ----------
    version:
        Schema version string, e.g. ``"1.0"`` or ``"2.0"``.  A falsy value
        selects the current SDK schema version (``"2.0"``; RFC §15.5).

    Returns
    -------
    dict
        Parsed JSON Schema as a plain Python dict.  The same object is
        returned on every call for a given version.

    Raises
    ------
    ValueError
        If an unknown schema version is requested.
    FileNotFoundError
        If the schema file for a known version is not present on disk.
        This should never happen in a correctly installed distribution.
    """
    key = version or _DEFAULT_SCHEMA_VERSION

    # Serve from the in-memory cache when this version was loaded before.
    try:
        return _CACHED_SCHEMAS[key]
    except KeyError:
        pass

    # RFC-0001 §15.5: unknown schema versions MUST raise and stop processing.
    schema_file = _SCHEMA_PATHS.get(key)
    if schema_file is None:
        raise ValueError(
            f"Unknown schema version {key!r}. "
            f"Available versions: {list(_SCHEMA_PATHS)}"
        )

    if not schema_file.is_file():
        raise FileNotFoundError(
            f"JSON Schema not found at {schema_file}. "
            "Ensure the 'schemas/' directory is included in the "
            "installed package."
        )

    parsed = json.loads(schema_file.read_text(encoding="utf-8"))
    _CACHED_SCHEMAS[key] = parsed
    return parsed
154
+
155
+
156
+ # ---------------------------------------------------------------------------
157
+ # Internal: stdlib structural validation
158
+ # ---------------------------------------------------------------------------
159
+
160
+
161
+ def _check_string_field(
162
+ doc: dict[str, Any],
163
+ field: str,
164
+ *,
165
+ required: bool = True,
166
+ pattern: re.Pattern[str] | None = None,
167
+ min_length: int = 1,
168
+ ) -> None:
169
+ """Validate a single string field in *doc*."""
170
+ if field not in doc:
171
+ if required:
172
+ raise SchemaValidationError(
173
+ field=field,
174
+ received=None,
175
+ reason=f"required field '{field}' is missing",
176
+ )
177
+ return
178
+ value = doc[field]
179
+ if not isinstance(value, str):
180
+ raise SchemaValidationError(
181
+ field=field,
182
+ received=value,
183
+ reason=f"'{field}' must be a string",
184
+ )
185
+ if len(value) < min_length:
186
+ raise SchemaValidationError(
187
+ field=field,
188
+ received=value,
189
+ reason=f"'{field}' must be at least {min_length} character(s)",
190
+ )
191
+ if pattern is not None and not pattern.match(value):
192
+ raise SchemaValidationError(
193
+ field=field,
194
+ received=value,
195
+ reason=f"'{field}' does not match pattern {pattern.pattern!r}",
196
+ )
197
+
198
+
199
def _validate_tags(tags: Any) -> None:
    """Check the ``tags`` value of an event document.

    *tags* must be a dict with at most ``_MAX_TAG_KEYS`` entries whose keys
    and values are all non-empty strings.  Entries are checked in iteration
    order (key first, then value) and the first violation is reported.

    Raises:
        SchemaValidationError: On the first violation found.
    """
    if not isinstance(tags, dict):
        raise SchemaValidationError(
            field="tags",
            received=tags,
            reason="'tags' must be an object",
        )

    if len(tags) > _MAX_TAG_KEYS:
        raise SchemaValidationError(
            field="tags",
            received=tags,
            reason=f"'tags' must contain at most {_MAX_TAG_KEYS} keys",
        )

    for key, val in tags.items():
        key_is_valid = isinstance(key, str) and bool(key)
        if not key_is_valid:
            raise SchemaValidationError(
                field=f"tags.{key!r}",
                received=key,
                reason="tag key must be a non-empty string",
            )
        val_is_valid = isinstance(val, str) and bool(val)
        if not val_is_valid:
            raise SchemaValidationError(
                field=f"tags.{key}",
                received=val,
                reason="tag value must be a non-empty string",
            )
226
+
227
+
228
def _stdlib_validate(doc: dict[str, Any]) -> None:
    """Structurally validate an event document using only the standard library.

    Used when the ``jsonschema`` package is unavailable.  Required fields,
    value types and regex patterns are checked in a fixed order, and
    :exc:`~spanforge.exceptions.SchemaValidationError` is raised for the
    first violation encountered.
    """
    if not isinstance(doc, dict):
        raise SchemaValidationError(
            field="<root>",
            received=doc,
            reason="event must serialise to a JSON object",
        )

    # --- required envelope fields -------------------------------------
    _check_string_field(doc, "schema_version")
    version = doc["schema_version"]
    if version not in _ACCEPTED_SCHEMA_VERSIONS:
        raise SchemaValidationError(
            field="schema_version",
            received=version,
            reason=f"'schema_version' must be one of {sorted(_ACCEPTED_SCHEMA_VERSIONS)!r}",
        )

    _check_string_field(doc, "event_id", pattern=_ULID_RE)

    _check_string_field(doc, "event_type", pattern=_EVENT_TYPE_RE)
    event_type = doc["event_type"]
    if not is_registered(event_type):
        # Unregistered types must additionally pass the custom-type rules.
        try:
            validate_custom(event_type)
        except EventTypeError as exc:
            raise SchemaValidationError(
                field="event_type",
                received=event_type,
                reason=str(exc),
            ) from exc

    _check_string_field(doc, "timestamp", pattern=_TIMESTAMP_RE)
    _check_string_field(doc, "source", pattern=_SOURCE_RE)

    # --- payload: must be present and a non-empty dict ------------------
    if "payload" not in doc:
        raise SchemaValidationError(
            field="payload",
            received=None,
            reason="required field 'payload' is missing",
        )
    payload = doc["payload"]
    if not (isinstance(payload, dict) and payload):
        raise SchemaValidationError(
            field="payload",
            received=payload,
            reason="'payload' must be a non-empty object",
        )

    # --- optional string fields, in the order they are checked ----------
    # A ``None`` pattern means only the type and the default minimum length
    # of one character are enforced.
    optional_fields = (
        # tracing
        ("span_id", _SPAN_ID_RE),
        ("parent_span_id", _SPAN_ID_RE),
        ("trace_id", _TRACE_ID_RE),
        # context
        ("org_id", None),
        ("team_id", None),
        ("actor_id", None),
        ("session_id", None),
        # integrity — checksum and signature use distinct prefix patterns
        ("checksum", _CHECKSUM_RE),
        ("signature", _SIGNATURE_RE),
        ("prev_id", _ULID_RE),
    )
    for name, regex in optional_fields:
        _check_string_field(doc, name, required=False, pattern=regex)

    # --- tags -------------------------------------------------------------
    if "tags" in doc:
        _validate_tags(doc["tags"])
294
+
295
+
296
+ # ---------------------------------------------------------------------------
297
+ # Public API
298
+ # ---------------------------------------------------------------------------
299
+
300
+
301
def validate_event(event: Event) -> None:
    """Validate *event* against the envelope JSON Schema for its version.

    Serialises *event* to a plain dict and validates the envelope structure.
    The schema is selected from the serialised ``schema_version`` value,
    falling back to the default version when that value is missing or empty.
    When the optional ``jsonschema`` package can be imported, the document is
    validated with ``jsonschema.validate``.  Otherwise a stdlib-only
    structural check is run that covers required fields, types, and regex
    patterns.

    Parameters
    ----------
    event:
        The :class:`~spanforge.event.Event` instance to validate.

    Raises
    ------
    TypeError
        If *event* is not an :class:`~spanforge.event.Event` instance.
    SchemaValidationError
        If the event does not conform to the envelope schema, or if its
        ``event_id`` or payload exceeds the size limits.
    FileNotFoundError
        If the schema file is missing from the installed distribution.
    ValueError
        If ``jsonschema`` is installed and the event names a schema version
        for which no schema file is registered (raised by
        :func:`load_schema`).

    Examples
    --------
    ::

        from spanforge import Event, EventType
        from spanforge.validate import validate_event

        event = Event(
            event_type=EventType.TRACE_SPAN_COMPLETED,
            source="llm-trace@0.3.1",
            payload={"span_name": "run", "status": "ok"},
        )
        validate_event(event)  # passes silently
    """
    if not isinstance(event, Event):
        raise TypeError(f"validate_event() expects an Event instance, got {type(event)!r}")

    doc = event.to_dict()

    # H9: bound-check event_id length and payload wire size before schema validation.
    # Only strings are length-checked here; a non-string event_id is left for
    # the schema validation below to report, instead of failing in len().
    event_id_val = doc.get("event_id", "")
    if isinstance(event_id_val, str) and len(event_id_val) > _MAX_EVENT_ID_LEN:
        raise SchemaValidationError(
            field="event_id",
            received=event_id_val,
            reason=(
                f"event_id length {len(event_id_val)} exceeds maximum "
                f"{_MAX_EVENT_ID_LEN} characters"
            ),
        )
    payload_bytes = len(json.dumps(doc.get("payload", {}), default=str).encode())
    if payload_bytes > _MAX_PAYLOAD_BYTES:
        raise SchemaValidationError(
            field="payload",
            received=None,
            reason=(
                f"payload size {payload_bytes} bytes exceeds maximum "
                f"{_MAX_PAYLOAD_BYTES} bytes"
            ),
        )

    # Select schema version from event envelope (RFC §15.5).
    schema_version: str = doc.get("schema_version") or _DEFAULT_SCHEMA_VERSION

    # Only the import itself is guarded.  An ImportError raised later, while
    # loading the schema or validating, must propagate instead of silently
    # downgrading to the weaker structural check.
    try:
        import jsonschema  # noqa: PLC0415 (optional import)
        import jsonschema.exceptions  # noqa: PLC0415
    except ImportError:
        # jsonschema not installed — fall back to stdlib structural check.
        _stdlib_validate(doc)
        return

    schema = load_schema(schema_version)
    try:
        jsonschema.validate(instance=doc, schema=schema)
    except jsonschema.exceptions.ValidationError as exc:
        # Convert jsonschema's error into our domain error.
        field_path = ".".join(str(part) for part in exc.absolute_path) or "<root>"
        raise SchemaValidationError(
            field=field_path,
            received=exc.instance,
            reason=exc.message,
        ) from exc