spanforge 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. spanforge/__init__.py +815 -0
  2. spanforge/_ansi.py +93 -0
  3. spanforge/_batch_exporter.py +409 -0
  4. spanforge/_cli.py +2094 -0
  5. spanforge/_cli_audit.py +639 -0
  6. spanforge/_cli_compliance.py +711 -0
  7. spanforge/_cli_cost.py +243 -0
  8. spanforge/_cli_ops.py +791 -0
  9. spanforge/_cli_phase11.py +356 -0
  10. spanforge/_hooks.py +337 -0
  11. spanforge/_server.py +1708 -0
  12. spanforge/_span.py +1036 -0
  13. spanforge/_store.py +288 -0
  14. spanforge/_stream.py +664 -0
  15. spanforge/_trace.py +335 -0
  16. spanforge/_tracer.py +254 -0
  17. spanforge/actor.py +141 -0
  18. spanforge/alerts.py +469 -0
  19. spanforge/auto.py +464 -0
  20. spanforge/baseline.py +335 -0
  21. spanforge/cache.py +635 -0
  22. spanforge/compliance.py +325 -0
  23. spanforge/config.py +532 -0
  24. spanforge/consent.py +228 -0
  25. spanforge/consumer.py +377 -0
  26. spanforge/core/__init__.py +5 -0
  27. spanforge/core/compliance_mapping.py +1254 -0
  28. spanforge/cost.py +600 -0
  29. spanforge/debug.py +548 -0
  30. spanforge/deprecations.py +205 -0
  31. spanforge/drift.py +482 -0
  32. spanforge/egress.py +58 -0
  33. spanforge/eval.py +648 -0
  34. spanforge/event.py +1064 -0
  35. spanforge/exceptions.py +240 -0
  36. spanforge/explain.py +178 -0
  37. spanforge/export/__init__.py +69 -0
  38. spanforge/export/append_only.py +337 -0
  39. spanforge/export/cloud.py +357 -0
  40. spanforge/export/datadog.py +497 -0
  41. spanforge/export/grafana.py +320 -0
  42. spanforge/export/jsonl.py +195 -0
  43. spanforge/export/openinference.py +158 -0
  44. spanforge/export/otel_bridge.py +294 -0
  45. spanforge/export/otlp.py +811 -0
  46. spanforge/export/otlp_bridge.py +233 -0
  47. spanforge/export/redis_backend.py +282 -0
  48. spanforge/export/siem_schema.py +98 -0
  49. spanforge/export/siem_splunk.py +264 -0
  50. spanforge/export/siem_syslog.py +212 -0
  51. spanforge/export/webhook.py +299 -0
  52. spanforge/exporters/__init__.py +30 -0
  53. spanforge/exporters/console.py +271 -0
  54. spanforge/exporters/jsonl.py +144 -0
  55. spanforge/exporters/sqlite.py +142 -0
  56. spanforge/gate.py +1150 -0
  57. spanforge/governance.py +181 -0
  58. spanforge/hitl.py +295 -0
  59. spanforge/http.py +187 -0
  60. spanforge/inspect.py +427 -0
  61. spanforge/integrations/__init__.py +45 -0
  62. spanforge/integrations/_pricing.py +280 -0
  63. spanforge/integrations/anthropic.py +388 -0
  64. spanforge/integrations/azure_openai.py +133 -0
  65. spanforge/integrations/bedrock.py +292 -0
  66. spanforge/integrations/crewai.py +251 -0
  67. spanforge/integrations/gemini.py +351 -0
  68. spanforge/integrations/groq.py +442 -0
  69. spanforge/integrations/langchain.py +349 -0
  70. spanforge/integrations/langgraph.py +306 -0
  71. spanforge/integrations/llamaindex.py +373 -0
  72. spanforge/integrations/ollama.py +287 -0
  73. spanforge/integrations/openai.py +368 -0
  74. spanforge/integrations/together.py +483 -0
  75. spanforge/io.py +214 -0
  76. spanforge/lint.py +322 -0
  77. spanforge/metrics.py +417 -0
  78. spanforge/metrics_export.py +343 -0
  79. spanforge/migrate.py +402 -0
  80. spanforge/model_registry.py +278 -0
  81. spanforge/models.py +389 -0
  82. spanforge/namespaces/__init__.py +254 -0
  83. spanforge/namespaces/audit.py +256 -0
  84. spanforge/namespaces/cache.py +237 -0
  85. spanforge/namespaces/chain.py +77 -0
  86. spanforge/namespaces/confidence.py +72 -0
  87. spanforge/namespaces/consent.py +92 -0
  88. spanforge/namespaces/cost.py +179 -0
  89. spanforge/namespaces/decision.py +143 -0
  90. spanforge/namespaces/diff.py +157 -0
  91. spanforge/namespaces/drift.py +80 -0
  92. spanforge/namespaces/eval_.py +251 -0
  93. spanforge/namespaces/feedback.py +241 -0
  94. spanforge/namespaces/fence.py +193 -0
  95. spanforge/namespaces/guard.py +105 -0
  96. spanforge/namespaces/hitl.py +91 -0
  97. spanforge/namespaces/latency.py +72 -0
  98. spanforge/namespaces/prompt.py +190 -0
  99. spanforge/namespaces/redact.py +173 -0
  100. spanforge/namespaces/retrieval.py +379 -0
  101. spanforge/namespaces/runtime_governance.py +494 -0
  102. spanforge/namespaces/template.py +208 -0
  103. spanforge/namespaces/tool_call.py +77 -0
  104. spanforge/namespaces/trace.py +1029 -0
  105. spanforge/normalizer.py +171 -0
  106. spanforge/plugins.py +82 -0
  107. spanforge/presidio_backend.py +349 -0
  108. spanforge/processor.py +258 -0
  109. spanforge/prompt_registry.py +418 -0
  110. spanforge/py.typed +0 -0
  111. spanforge/redact.py +914 -0
  112. spanforge/regression.py +192 -0
  113. spanforge/runtime_policy.py +159 -0
  114. spanforge/sampling.py +511 -0
  115. spanforge/schema.py +183 -0
  116. spanforge/schemas/v1.0/schema.json +170 -0
  117. spanforge/schemas/v2.0/schema.json +536 -0
  118. spanforge/sdk/__init__.py +625 -0
  119. spanforge/sdk/_base.py +584 -0
  120. spanforge/sdk/_base.pyi +71 -0
  121. spanforge/sdk/_exceptions.py +1096 -0
  122. spanforge/sdk/_types.py +2184 -0
  123. spanforge/sdk/alert.py +1514 -0
  124. spanforge/sdk/alert.pyi +56 -0
  125. spanforge/sdk/audit.py +1196 -0
  126. spanforge/sdk/audit.pyi +67 -0
  127. spanforge/sdk/cec.py +1215 -0
  128. spanforge/sdk/cec.pyi +37 -0
  129. spanforge/sdk/config.py +641 -0
  130. spanforge/sdk/config.pyi +55 -0
  131. spanforge/sdk/enterprise.py +714 -0
  132. spanforge/sdk/enterprise.pyi +79 -0
  133. spanforge/sdk/explain.py +170 -0
  134. spanforge/sdk/fallback.py +432 -0
  135. spanforge/sdk/feedback.py +351 -0
  136. spanforge/sdk/gate.py +874 -0
  137. spanforge/sdk/gate.pyi +51 -0
  138. spanforge/sdk/identity.py +2114 -0
  139. spanforge/sdk/identity.pyi +47 -0
  140. spanforge/sdk/lineage.py +175 -0
  141. spanforge/sdk/observe.py +1065 -0
  142. spanforge/sdk/observe.pyi +50 -0
  143. spanforge/sdk/operator.py +338 -0
  144. spanforge/sdk/pii.py +1473 -0
  145. spanforge/sdk/pii.pyi +119 -0
  146. spanforge/sdk/pipelines.py +458 -0
  147. spanforge/sdk/pipelines.pyi +39 -0
  148. spanforge/sdk/policy.py +930 -0
  149. spanforge/sdk/rag.py +594 -0
  150. spanforge/sdk/rbac.py +280 -0
  151. spanforge/sdk/registry.py +430 -0
  152. spanforge/sdk/registry.pyi +46 -0
  153. spanforge/sdk/scope.py +279 -0
  154. spanforge/sdk/secrets.py +293 -0
  155. spanforge/sdk/secrets.pyi +25 -0
  156. spanforge/sdk/security.py +560 -0
  157. spanforge/sdk/security.pyi +57 -0
  158. spanforge/sdk/trust.py +472 -0
  159. spanforge/sdk/trust.pyi +41 -0
  160. spanforge/secrets.py +799 -0
  161. spanforge/signing.py +1179 -0
  162. spanforge/stats.py +100 -0
  163. spanforge/stream.py +560 -0
  164. spanforge/testing.py +378 -0
  165. spanforge/testing_mocks.py +1052 -0
  166. spanforge/trace.py +199 -0
  167. spanforge/types.py +696 -0
  168. spanforge/ulid.py +300 -0
  169. spanforge/validate.py +379 -0
  170. spanforge-1.0.0.dist-info/METADATA +1509 -0
  171. spanforge-1.0.0.dist-info/RECORD +174 -0
  172. spanforge-1.0.0.dist-info/WHEEL +4 -0
  173. spanforge-1.0.0.dist-info/entry_points.txt +5 -0
  174. spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
spanforge/redact.py ADDED
@@ -0,0 +1,914 @@
1
+ """PII redaction framework for spanforge.
2
+
3
+ Provides a layered, policy-driven approach to PII identification and redaction
4
+ in event payloads. Redaction is **opt-in per field** — fields must be
5
+ explicitly wrapped in :class:`Redactable` to participate in the lifecycle.
6
+
7
+ Sensitivity ladder
8
+ ------------------
9
+
10
+ ``low`` < ``medium`` < ``high`` < ``pii`` < ``phi``
11
+
12
+ A :class:`RedactionPolicy` is configured with a ``min_sensitivity`` level.
13
+ Only fields whose sensitivity is **≥ min_sensitivity** are scrubbed when
14
+ :meth:`RedactionPolicy.apply` is called.
15
+
16
+ Usage example
17
+ -------------
18
+ ::
19
+
20
+ from spanforge.redact import Redactable, RedactionPolicy, Sensitivity, contains_pii
21
+ from spanforge import Event, EventType
22
+
23
+ policy = RedactionPolicy(
24
+ min_sensitivity=Sensitivity.PII,
25
+ redacted_by="policy:corp-default",
26
+ )
27
+
28
+ event = Event(
29
+ event_type=EventType.PROMPT_SAVED,
30
+ source="promptlock@1.0.0",
31
+ payload={
32
+ "version": "v3",
33
+ "author": Redactable("alice@example.com", Sensitivity.PII, {"email"}),
34
+ },
35
+ )
36
+
37
+ result = policy.apply(event)
38
+ # result.event.payload["author"] == "[REDACTED:pii]"
39
+ # result.redaction_count == 1
40
+ # contains_pii(result.event) == False
41
+
42
+ Security guarantees
43
+ -------------------
44
+ * :class:`Redactable` never exposes its wrapped value in ``__repr__``,
45
+ ``__str__``, or any exception message.
46
+ * Exception messages only reveal the *sensitivity level* and *field depth*,
47
+ never the content of the wrapped value.
48
+ * The literal replacement strings (``"[REDACTED:pii]"`` etc.) are safe to
49
+ log, export, or include in error messages.
50
+ * :meth:`RedactionPolicy.apply` rebuilds the payload recursively so nested
51
+ structures are fully scanned even in deeply-nested payloads.
52
+ """
53
+
54
+ from __future__ import annotations
55
+
56
+ import datetime
57
+ import hashlib
58
+ import re
59
+ from collections.abc import Mapping
60
+ from dataclasses import dataclass
61
+ from enum import Enum
62
+ from typing import TYPE_CHECKING, Any, Final
63
+
64
+ from spanforge.exceptions import LLMSchemaError
65
+
66
+ if TYPE_CHECKING:
67
+ from spanforge.event import Event
68
+
69
# Public API of this module.  ``assert_redacted`` is the strict, raising
# counterpart of ``contains_pii`` and is referenced from its documentation,
# so it is exported alongside it.
__all__ = [
    "PII_TYPES",
    "PIINotRedactedError",
    "PIIScanResult",
    "Redactable",
    "RedactionPolicy",
    "RedactionResult",
    "Sensitivity",
    "assert_redacted",
    "contains_pii",
    "scan_payload",
]
80
+
81
+ # ---------------------------------------------------------------------------
82
+ # Known PII type label constants
83
+ # ---------------------------------------------------------------------------
84
+
85
#: Well-known PII type labels that can be attached to a wrapped value.
PII_TYPES: Final[frozenset[str]] = frozenset(
    {
        "address",
        "credit_card",
        "date_of_birth",
        "email",
        "financial_id",
        "ip_address",
        "medical_id",
        "name",
        "phone",
        "ssn",
    }
)
99
+
100
+ # ---------------------------------------------------------------------------
101
+ # Sensitivity ordering
102
+ # ---------------------------------------------------------------------------
103
+
104
+ #: Numeric ordering for each sensitivity level (ascending sensitivity).
105
+ _SENSITIVITY_ORDER: Final[dict[str, int]] = {
106
+ "low": 0,
107
+ "medium": 1,
108
+ "high": 2,
109
+ "pii": 3,
110
+ "phi": 4,
111
+ }
112
+
113
+
114
class Sensitivity(str, Enum):
    """Ranked classification of how sensitive a payload value is.

    From least to most sensitive: ``LOW`` < ``MEDIUM`` < ``HIGH`` < ``PII`` <
    ``PHI``.

    * **LOW** — Non-sensitive; informational or operational metadata.
    * **MEDIUM** — Pseudonymous or indirectly identifying data.
    * **HIGH** — Directly identifying but non-regulated (e.g. usernames).
    * **PII** — Directly identifying, regulated personal data (GDPR / CCPA).
    * **PHI** — Protected health information (HIPAA).  Most restrictive.

    Members are also strings, so ``Sensitivity.PII == "pii"`` holds.  The
    rich comparison operators rank members by sensitivity rather than
    alphabetically::

        Sensitivity.PII > Sensitivity.HIGH      # True
        Sensitivity.PHI >= Sensitivity.PII      # True
        Sensitivity.LOW < Sensitivity.MEDIUM    # True
    """

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    PII = "pii"
    PHI = "phi"

    @property
    def _order(self) -> int:
        """Rank looked up in the module-level order table (internal use only)."""
        return _SENSITIVITY_ORDER[self.value]

    # Ordering is only defined between two Sensitivity members; for anything
    # else NotImplemented is returned so Python can try the reflected
    # operation or raise TypeError.

    def __lt__(self, other: object) -> bool:
        if isinstance(other, Sensitivity):
            return self._order < other._order
        return NotImplemented

    def __le__(self, other: object) -> bool:
        if isinstance(other, Sensitivity):
            return self._order <= other._order
        return NotImplemented

    def __gt__(self, other: object) -> bool:
        if isinstance(other, Sensitivity):
            return self._order > other._order
        return NotImplemented

    def __ge__(self, other: object) -> bool:
        if isinstance(other, Sensitivity):
            return self._order >= other._order
        return NotImplemented

    def __eq__(self, other: object) -> bool:
        # Plain strings are compared by value; everything else (including
        # other members) goes through the Enum comparison.
        if isinstance(other, Sensitivity) or not isinstance(other, str):
            return Enum.__eq__(self, other)
        return str.__eq__(self, other)

    def __hash__(self) -> int:
        # Defining __eq__ would otherwise drop __hash__; hash like the
        # underlying string so a member and its value hash alike.
        return str.__hash__(self)
174
+
175
+
176
+ # ---------------------------------------------------------------------------
177
+ # Redactable wrapper
178
+ # ---------------------------------------------------------------------------
179
+
180
+
181
class Redactable:
    """Immutable wrapper that marks a payload value as PII-sensitive.

    Wrapping a value does **not** redact it.  The value is only replaced when
    :meth:`RedactionPolicy.apply` processes the event that contains it.

    Security: the wrapped value never appears in ``__repr__`` or ``__str__``;
    only the sensitivity level and the PII type labels are rendered.

    Args:
        value:        The raw PII-sensitive value.
        sensitivity:  How sensitive the value is.
        pii_types:    Labels describing what type of PII this is.  Use
                      constants from :data:`PII_TYPES` or custom strings.
                      The argument is copied into a ``frozenset``, so any
                      iterable of labels works at runtime.  Defaults to an
                      empty frozenset.

    Example::

        field = Redactable("alice@example.com", Sensitivity.PII, {"email"})
        str(field)   # "<Redactable:pii>"  — value hidden
        repr(field)  # "<Redactable sensitivity='pii' pii_types={'email'}>"
    """

    __slots__ = ("_pii_types", "_sensitivity", "_value")

    def __init__(
        self,
        value: Any,
        sensitivity: Sensitivity,
        pii_types: frozenset[str] = frozenset(),
    ) -> None:
        # __setattr__ is blocked below, so the slots are filled through
        # object.__setattr__ directly.
        object.__setattr__(self, "_value", value)
        object.__setattr__(self, "_sensitivity", sensitivity)
        object.__setattr__(self, "_pii_types", frozenset(pii_types))

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    @property
    def sensitivity(self) -> Sensitivity:
        """The sensitivity level of this field."""
        return self._sensitivity  # type: ignore[no-any-return,attr-defined]

    @property
    def pii_types(self) -> frozenset[str]:
        """Set of PII type labels (e.g. ``{'email', 'pii_identifier'}``)."""
        return self._pii_types  # type: ignore[no-any-return,attr-defined]

    def reveal(self) -> Any:
        """Return the raw unredacted value.

        Use with extreme care.  Access to raw values should be restricted to
        trusted internal code paths.  Ensure the returned value is never
        logged or included in any observable output.

        Returns:
            The original unwrapped value passed to the constructor.
        """
        return self._value  # type: ignore[attr-defined]

    # ------------------------------------------------------------------
    # Immutability guards
    # ------------------------------------------------------------------

    def __setattr__(self, name: str, value: object) -> None:
        raise AttributeError("Redactable is immutable — use a new instance to change values")

    def __delattr__(self, name: str) -> None:
        # Without this guard ``del obj._value`` would succeed on a slotted
        # class and leave the wrapper in a broken, mutated state.
        raise AttributeError("Redactable is immutable — attributes cannot be deleted")

    # ------------------------------------------------------------------
    # Safe string representations — value intentionally hidden
    # ------------------------------------------------------------------

    def _sensitivity_label(self) -> str:
        """Return the plain level string (``'pii'``) for display purposes.

        Formatting a str-mixin Enum member directly is Python-version
        dependent (newer versions render ``Sensitivity.PII``), so the
        ``value`` attribute is read explicitly.  A plain string passed as
        the sensitivity is used as-is.
        """
        level = self._sensitivity  # type: ignore[attr-defined]
        return str(getattr(level, "value", level))

    def __repr__(self) -> str:
        # Labels are sorted so the output is deterministic for multi-label sets.
        labels = sorted(repr(label) for label in self._pii_types)  # type: ignore[attr-defined]
        rendered = "{" + ", ".join(labels) + "}" if labels else "set()"
        return f"<Redactable sensitivity={self._sensitivity_label()!r} pii_types={rendered}>"

    def __str__(self) -> str:
        return f"<Redactable:{self._sensitivity_label()}>"
263
+
264
+
265
+ # ---------------------------------------------------------------------------
266
+ # Redaction result
267
+ # ---------------------------------------------------------------------------
268
+
269
+
270
@dataclass(frozen=True)
class RedactionResult:
    """Frozen record describing the outcome of :meth:`RedactionPolicy.apply`.

    Attributes:
        event:            The rebuilt event carrying the redacted payload.
        redaction_count:  Number of :class:`Redactable` fields replaced.
        redacted_at:      UTC ISO-8601 timestamp of the redaction.
        redacted_by:      Identifier of the policy that performed it.
    """

    event: Event
    redaction_count: int
    redacted_at: str
    redacted_by: str
285
+
286
+
287
+ # ---------------------------------------------------------------------------
288
+ # PIINotRedactedError
289
+ # ---------------------------------------------------------------------------
290
+
291
+
292
class PIINotRedactedError(LLMSchemaError):
    """Signals that unredacted PII is still present in an event.

    Raised by :func:`assert_redacted` when :class:`Redactable` wrappers (or
    raw PII pattern hits) remain in a payload.

    Security: the message carries only the number of findings and, when a
    context label was supplied, a truncated SHA-256 digest of that label —
    never the label or any payload content.

    Args:
        count:   Number of unredacted findings.
        context: Optional short label for where the check was done.

    Attributes:
        count: Number of unredacted findings.
    """

    count: int

    def __init__(self, count: int, context: str = "") -> None:
        self.count = count
        # M11: the raw context string may itself contain PII, so only an
        # 8-hex-character hash prefix is embedded for correlation.
        suffix = (
            f" [context-hash:{hashlib.sha256(context.encode()).hexdigest()[:8]}]"
            if context
            else ""
        )
        message = (
            f"Found {count} unredacted PII field(s){suffix}. "
            "Apply a RedactionPolicy before serialising or exporting this event."
        )
        super().__init__(message)
323
+
324
+
325
+ # ---------------------------------------------------------------------------
326
+ # RedactionPolicy
327
+ # ---------------------------------------------------------------------------
328
+
329
+
330
@dataclass(frozen=True)
class RedactionPolicy:
    """Policy that defines which fields to scrub and how to label redactions.

    A policy is immutable; create a new instance to change configuration.
    Apply it to an event via :meth:`apply`, which returns a
    :class:`RedactionResult` containing a new event with PII removed.

    Args:
        min_sensitivity:       Fields with sensitivity **≥** this level are
                               redacted.  Defaults to :attr:`Sensitivity.PII`.
        redacted_by:           Identifier embedded in the redaction metadata
                               (e.g. ``"policy:corp-default"``).
        replacement_template:  String template for the redaction marker.
                               The ``{sensitivity}`` placeholder is replaced
                               with the field's sensitivity level value.
                               Defaults to ``"[REDACTED:{sensitivity}]"``.

    Example::

        policy = RedactionPolicy(
            min_sensitivity=Sensitivity.HIGH,
            redacted_by="policy:strict",
        )
        result = policy.apply(event)
    """

    min_sensitivity: Sensitivity = Sensitivity.PII
    redacted_by: str = "policy:default"
    replacement_template: str = "[REDACTED:{sensitivity}]"

    def _make_marker(self, sensitivity: Sensitivity) -> str:
        """Format the replacement string for a given sensitivity level."""
        return self.replacement_template.format(sensitivity=sensitivity.value)

    def _should_redact(self, r: Redactable) -> bool:
        """Return True if the Redactable field meets the policy threshold."""
        return r.sensitivity >= self.min_sensitivity

    def _redact_value(self, value: Any, counter: list[int], _depth: int = 0) -> Any:
        """Recursively replace Redactable instances in *value*.

        Args:
            value:   Any Python value (mapping, list, tuple, Redactable, or
                     scalar).
            counter: Single-element list used as a mutable integer counter;
                     ``counter[0]`` is incremented once per replaced field.
            _depth:  Current recursion depth (internal).

        Returns:
            The value with any qualifying Redactable instances replaced by
            their marker strings.  Mappings are rebuilt as plain ``dict``,
            lists as ``list`` and tuples as ``tuple``; every other value is
            returned as-is.

        Raises:
            RecursionError: If nesting exceeds 100 levels.
        """
        if _depth > 100:
            raise RecursionError(
                "RedactionPolicy._redact_value: maximum nesting depth (100) exceeded"
            )
        if isinstance(value, Redactable):
            if self._should_redact(value):
                counter[0] += 1
                return self._make_marker(value.sensitivity)
            # Below threshold — leave the wrapper in place;
            # contains_pii() will detect it post-apply if needed.
            return value
        # Match the detection helpers (_has_redactable / _count_redactable),
        # which descend into any Mapping: checking only ``dict`` here would
        # leave a Redactable nested in e.g. a read-only mapping view
        # detectable but never redacted.
        if isinstance(value, Mapping):
            return {k: self._redact_value(v, counter, _depth + 1) for k, v in value.items()}
        if isinstance(value, list):
            return [self._redact_value(v, counter, _depth + 1) for v in value]
        if isinstance(value, tuple):
            return tuple(self._redact_value(v, counter, _depth + 1) for v in value)
        return value

    def apply(self, event: Event) -> RedactionResult:
        """Apply this policy to *event*, returning a new redacted event.

        All :class:`Redactable` fields in the payload whose sensitivity is ≥
        :attr:`min_sensitivity` are replaced with safe marker strings.
        When at least one field was replaced, redaction metadata is added
        under the reserved ``__redacted_at``, ``__redacted_by`` and
        ``__redaction_count`` payload keys.

        The original event is **not** mutated; a new :class:`Event` is
        returned inside the :class:`RedactionResult`.

        Args:
            event: The event whose payload should be scanned and redacted.

        Returns:
            A :class:`RedactionResult` with the new event and redaction stats.

        Raises:
            LLMSchemaError: If reconstruction of the redacted event fails for
                structural reasons.
            RecursionError: If the payload is nested deeper than 100 levels.
        """
        # Import here to avoid circular dependency at module load time.
        from spanforge.event import Event

        counter: list[int] = [0]
        # Copy the top level first so the caller's payload is never touched.
        redacted_payload = self._redact_value(dict(event.payload), counter)

        now = _utcnow_iso()

        if isinstance(redacted_payload, dict) and counter[0] > 0:
            redacted_payload["__redacted_at"] = now
            redacted_payload["__redacted_by"] = self.redacted_by
            redacted_payload["__redaction_count"] = counter[0]

        # Every envelope field is carried over unchanged; only the payload
        # differs from the source event.
        new_event = Event(
            schema_version=event.schema_version,
            event_id=event.event_id,
            event_type=event.event_type,
            timestamp=event.timestamp,
            source=event.source,
            payload=redacted_payload,
            trace_id=event.trace_id,
            span_id=event.span_id,
            parent_span_id=event.parent_span_id,
            org_id=event.org_id,
            team_id=event.team_id,
            actor_id=event.actor_id,
            session_id=event.session_id,
            tags=event.tags,
            checksum=event.checksum,
            signature=event.signature,
            prev_id=event.prev_id,
        )

        return RedactionResult(
            event=new_event,
            redaction_count=counter[0],
            redacted_at=now,
            redacted_by=self.redacted_by,
        )
460
+
461
+
462
+ # ---------------------------------------------------------------------------
463
+ # Public helpers
464
+ # ---------------------------------------------------------------------------
465
+
466
+
467
def contains_pii(event: Event, *, scan_raw: bool = True) -> bool:
    """Report whether *event* still carries unredacted PII.

    Intended as a post-:meth:`RedactionPolicy.apply` check before an event is
    serialised or exported.  This function never raises on a finding; use
    :func:`assert_redacted` for the strict, raising variant.

    .. versionchanged:: 2.1
        Default for *scan_raw* changed from ``False`` to ``True`` so that
        raw-string PII is caught by default.  Pass ``scan_raw=False``
        explicitly to restore the old behaviour.

    Args:
        event:    The event to inspect.
        scan_raw: When ``True`` (default) and the payload is a mapping, the
                  payload is additionally passed to :func:`scan_payload`
                  for regex-based detection.

    Returns:
        ``True`` if a :class:`Redactable` instance is found anywhere in the
        payload, or if the raw scan ran and did not come back clean;
        ``False`` otherwise.

    Example::

        if contains_pii(event):
            raise RuntimeError("Unredacted PII detected — cannot export")
    """
    if _has_redactable(event.payload):
        return True
    if not (scan_raw and isinstance(event.payload, Mapping)):
        return False
    scan = scan_payload(event.payload)  # type: ignore[arg-type]
    return not scan.clean
503
+
504
+
505
def assert_redacted(event: Event, context: str = "", *, scan_raw: bool = True) -> None:
    """Raise if *event* still carries unredacted PII.

    Strict counterpart of :func:`contains_pii`.  Wrapper instances are
    checked first; the regex scan only runs when no :class:`Redactable`
    remains.

    .. versionchanged:: 2.1
        Default for *scan_raw* changed from ``False`` to ``True``.

    Args:
        event:    The event to inspect.
        context:  Optional short label forwarded to the error (e.g. filename).
        scan_raw: When ``True`` (default) and the payload is a mapping, also
                  run :func:`scan_payload` on it.

    Raises:
        PIINotRedactedError: If any :class:`Redactable` instances remain
            (``count`` is the number of wrappers), or if the raw scan
            reports hits (``count`` is the number of hits).

    Example::

        assert_redacted(event, context="export_to_otlp", scan_raw=True)
    """
    wrapped = _count_redactable(event.payload)
    if wrapped > 0:
        raise PIINotRedactedError(count=wrapped, context=context)
    if not scan_raw or not isinstance(event.payload, Mapping):
        return
    scan = scan_payload(event.payload)  # type: ignore[arg-type]
    if not scan.clean:
        raise PIINotRedactedError(count=len(scan.hits), context=context)
535
+
536
+
537
+ # ---------------------------------------------------------------------------
538
+ # Internal helpers (module-private)
539
+ # ---------------------------------------------------------------------------
540
+
541
+
542
def _has_redactable(value: Any) -> bool:
    """Tell whether a Redactable is present anywhere inside *value*.

    Mappings are searched through their values and lists/tuples through
    their items; any other type is treated as a leaf.
    """
    if isinstance(value, Redactable):
        return True
    if isinstance(value, Mapping):
        children = value.values()
    elif isinstance(value, (list, tuple)):
        children = value
    else:
        return False
    # Stop at the first hit, like any() over a generator.
    for child in children:
        if _has_redactable(child):
            return True
    return False
551
+
552
+
553
def _count_redactable(value: Any, _depth: int = 0) -> int:
    """Return how many Redactable instances are nested inside *value*.

    Mappings are walked through their values and lists/tuples through their
    items; any other type counts as zero.  ``_depth`` is threaded through
    the recursion but is not otherwise used here.
    """
    if isinstance(value, Redactable):
        return 1
    if isinstance(value, Mapping):
        children = value.values()
    elif isinstance(value, (list, tuple)):
        children = value
    else:
        return 0
    total = 0
    for child in children:
        total += _count_redactable(child, _depth + 1)
    return total
562
+
563
+
564
+ def _utcnow_iso() -> str:
565
+ """Return current UTC time as an ISO-8601 string (same format as Event)."""
566
+ now = datetime.datetime.now(tz=datetime.timezone.utc)
567
+ return now.strftime("%Y-%m-%dT%H:%M:%S.") + f"{now.microsecond:06d}Z"
568
+
569
+
570
+ # ---------------------------------------------------------------------------
571
+ # GA-03: Deep PII scanning — regex-based detection
572
+ # ---------------------------------------------------------------------------
573
+
574
+ _PII_PATTERNS: Final[dict[str, re.Pattern[str]]] = {
575
+ "email": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}", re.ASCII),
576
+ "phone": re.compile(r"(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"),
577
+ "ssn": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
578
+ "credit_card": re.compile(r"\b(?:\d[ -]?){13,19}\b"),
579
+ "ip_address": re.compile(
580
+ r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}"
581
+ r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b"
582
+ ),
583
+ "uk_national_insurance": re.compile(
584
+ r"\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b",
585
+ re.IGNORECASE,
586
+ ),
587
+ # Date of birth — numeric (/, -, .) and written-month forms covering
588
+ # ISO/YMD, US MDY, day-first DMY (Europe/Asia/Australia/etc.), and
589
+ # long/short written-month variants. Years restricted to 19xx-20xx to
590
+ # limit false positives. _is_valid_date() provides secondary calendar-
591
+ # correctness check (leap-year rules, month lengths, etc.).
592
+ "date_of_birth": re.compile(
593
+ # ISO / YMD: YYYY-MM-DD, YYYY/MM/DD, YYYY.MM.DD
594
+ r"\b(?:19|20)\d{2}[-/.](?:0?[1-9]|1[0-2])[-/.](?:0?[1-9]|[12]\d|3[01])\b"
595
+ r"|"
596
+ # US MDY: MM/DD/YYYY, MM-DD-YYYY, MM.DD.YYYY
597
+ r"\b(?:0?[1-9]|1[0-2])[-/.](?:0?[1-9]|[12]\d|3[01])[-/.](?:19|20)\d{2}\b"
598
+ r"|"
599
+ # Day-first DMY: DD/MM/YYYY, DD-MM-YYYY, DD.MM.YYYY (UK, EU, Asia, etc.)
600
+ r"\b(?:0?[1-9]|[12]\d|3[01])[-/.](?:0?[1-9]|1[0-2])[-/.](?:19|20)\d{2}\b"
601
+ r"|"
602
+ # Written DMY: "15 Jan 2000", "15-Jan-2000", "15 January 2000"
603
+ r"\b(?:0?[1-9]|[12]\d|3[01])[\s\-]"
604
+ r"(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?"
605
+ r"|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)"
606
+ r"[\s\-](?:19|20)\d{2}\b"
607
+ r"|"
608
+ # Written MDY: "Jan 15, 2000", "January 15 2000"
609
+ r"\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?"
610
+ r"|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)"
611
+ r"\s+(?:0?[1-9]|[12]\d|3[01]),?\s+(?:19|20)\d{2}\b",
612
+ re.IGNORECASE,
613
+ ),
614
+ # Street address — house number + street name + recognised suffix
615
+ "address": re.compile(
616
+ r"\b\d{1,5}\s+(?:[A-Za-z0-9'.#\-]+\s+){1,5}"
617
+ r"(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Lane|Ln|"
618
+ r"Court|Ct|Way|Place|Pl|Circle|Cir|Trail|Trl|Terrace|Ter|"
619
+ r"Parkway|Pkwy|Highway|Hwy|Route|Rte)\.?\b",
620
+ re.IGNORECASE,
621
+ ),
622
+ }
623
+
624
+
625
+ # ---------------------------------------------------------------------------
626
+ # GA-03-IN: India PII patterns — DPDP Act (Digital Personal Data Protection)
627
+ # ---------------------------------------------------------------------------
628
+
629
+ # Verhoeff checksum tables for Aadhaar validation
630
+ _VERHOEFF_D = (
631
+ (0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
632
+ (1, 2, 3, 4, 0, 6, 7, 8, 9, 5),
633
+ (2, 3, 4, 0, 1, 7, 8, 9, 5, 6),
634
+ (3, 4, 0, 1, 2, 8, 9, 5, 6, 7),
635
+ (4, 0, 1, 2, 3, 9, 5, 6, 7, 8),
636
+ (5, 9, 8, 7, 6, 0, 4, 3, 2, 1),
637
+ (6, 5, 9, 8, 7, 1, 0, 4, 3, 2),
638
+ (7, 6, 5, 9, 8, 2, 1, 0, 4, 3),
639
+ (8, 7, 6, 5, 9, 3, 2, 1, 0, 4),
640
+ (9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
641
+ )
642
+
643
+ _VERHOEFF_P = (
644
+ (0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
645
+ (1, 5, 7, 6, 2, 8, 3, 0, 9, 4),
646
+ (5, 8, 0, 3, 7, 9, 6, 1, 4, 2),
647
+ (8, 9, 1, 6, 0, 4, 3, 5, 2, 7),
648
+ (9, 4, 5, 3, 1, 2, 6, 8, 7, 0),
649
+ (4, 2, 8, 6, 5, 7, 3, 9, 0, 1),
650
+ (2, 7, 9, 3, 8, 0, 6, 4, 1, 5),
651
+ (7, 0, 4, 6, 9, 1, 3, 2, 5, 8),
652
+ )
653
+
654
+ _VERHOEFF_INV = (0, 4, 3, 2, 1, 5, 6, 7, 8, 9)
655
+
656
+
657
+ def _verhoeff_check(number_str: str) -> bool:
658
+ """Validate a number string using the Verhoeff checksum algorithm."""
659
+ digits = [int(d) for d in number_str if d.isdigit()]
660
+ c = 0
661
+ for i, d in enumerate(reversed(digits)):
662
+ c = _VERHOEFF_D[c][_VERHOEFF_P[i % 8][d]]
663
+ return c == 0
664
+
665
+
666
# India-specific detectors, compiled once at import time.
#   aadhaar: 12 digits in 4-4-4 groups, first digit 2-9, each group
#            optionally separated by one whitespace character or hyphen.
#   pan:     five uppercase letters, four digits, one uppercase letter.
DPDP_PATTERNS: Final[dict[str, re.Pattern[str]]] = {
    label: re.compile(expression)
    for label, expression in (
        ("aadhaar", r"\b[2-9]\d{3}[\s-]?\d{4}[\s-]?\d{4}\b"),
        ("pan", r"\b[A-Z]{5}\d{4}[A-Z]\b"),
    )
}
670
+
671
+
672
@dataclass(frozen=True)
class PIIScanHit:
    """One PII detection, reported without the matched text itself.

    Attributes:
        pii_type: Label of the detector that fired (e.g. ``"email"``,
            ``"ssn"``).
        path: Dot-separated location of the offending field within the
            scanned payload.
        match_count: How many matches of ``pii_type`` were found at
            ``path``. Defaults to ``1``.
        sensitivity: Severity tier of the detection: ``"high"`` for
            SSN/credit_card, ``"medium"`` for email/phone, ``"low"`` for
            IP/NI. Defaults to ``"medium"``.
    """

    pii_type: str
    path: str
    match_count: int = 1
    sensitivity: str = "medium"
688
+
689
+
690
+ _SENSITIVITY_MAP: dict[str, str] = {
691
+ "ssn": "high",
692
+ "credit_card": "high",
693
+ "aadhaar": "high",
694
+ "pan": "high",
695
+ "date_of_birth": "high",
696
+ "email": "medium",
697
+ "phone": "medium",
698
+ "address": "medium",
699
+ "ip_address": "low",
700
+ "uk_national_insurance": "low",
701
+ }
702
+
703
+
704
+ @dataclass(frozen=True)
705
+ class PIIScanResult:
706
+ """Result of a deep PII scan on a payload dictionary.
707
+
708
+ Attributes:
709
+ hits: List of :class:`PIIScanHit` instances found.
710
+ scanned: Number of string values scanned.
711
+ clean: ``True`` if no PII was detected.
712
+ """
713
+
714
+ hits: list[PIIScanHit]
715
+ scanned: int
716
+
717
+ @property
718
+ def clean(self) -> bool:
719
+ """Return True if no PII hits were found."""
720
+ return len(self.hits) == 0
721
+
722
+
723
+ def _luhn_check(number_str: str) -> bool:
724
+ """Validate a credit card number using the Luhn algorithm."""
725
+ digits = [int(d) for d in number_str if d.isdigit()]
726
+ if len(digits) < 13 or len(digits) > 19:
727
+ return False
728
+ total = 0
729
+ for i, d in enumerate(reversed(digits)):
730
+ if i % 2 == 1:
731
+ d *= 2
732
+ if d > 9:
733
+ d -= 9
734
+ total += d
735
+ return total % 10 == 0
736
+
737
+
738
+ def _is_valid_ssn(ssn_str: str) -> bool:
739
+ """Return ``False`` for SSNs in known-invalid SSA number ranges.
740
+
741
+ Filters out the following ranges that the SSA has *never* assigned:
742
+
743
+ * Area ``000`` — never issued.
744
+ * Area ``666`` — explicitly excluded by SSA policy.
745
+ * Areas ``900``-``999`` — reserved for Individual Taxpayer
746
+ Identification Numbers (ITINs); never used as SSNs.
747
+ * Group ``00`` — never issued within any valid area.
748
+ * Serial ``0000`` — never issued within any valid area/group.
749
+
750
+ Args:
751
+ ssn_str: Raw match string from :data:`_PII_PATTERNS` ``"ssn"``
752
+ regex (e.g. ``"123-45-6789"``).
753
+
754
+ Returns:
755
+ ``True`` if the SSN passes all range checks; ``False`` otherwise.
756
+ """
757
+ digits = "".join(c for c in ssn_str if c.isdigit())
758
+ if len(digits) != 9:
759
+ return False
760
+ area = int(digits[:3])
761
+ group = int(digits[3:5])
762
+ serial = int(digits[5:])
763
+ if area in {0, 666} or area >= 900:
764
+ return False
765
+ if group == 0:
766
+ return False
767
+ return serial != 0
768
+
769
+
770
+ def _is_valid_date(date_str: str) -> bool:
771
+ """Return ``True`` if *date_str* is a valid calendar date.
772
+
773
+ Accepts all numeric and written-month formats produced by the
774
+ ``"date_of_birth"`` regex in :data:`_PII_PATTERNS`.
775
+
776
+ Numeric formats (separators ``/``, ``-``, ``.``):
777
+
778
+ * ``YYYY/MM/DD``, ``YYYY-MM-DD``, ``YYYY.MM.DD`` — ISO / year-first
779
+ * ``MM/DD/YYYY``, ``MM-DD-YYYY``, ``MM.DD.YYYY`` — US month-first
780
+ * ``DD/MM/YYYY``, ``DD-MM-YYYY``, ``DD.MM.YYYY`` — day-first (Europe,
781
+ Asia, Australia, Latin America, etc.)
782
+
783
+ Written-month formats:
784
+
785
+ * ``DD Mon YYYY``, ``DD-Mon-YYYY``, ``DD Month YYYY`` (e.g. 15 Jan 2000)
786
+ * ``Mon DD, YYYY``, ``Mon DD YYYY``, ``Month DD, YYYY`` (e.g. Jan 15, 2000)
787
+
788
+ Delegates to :func:`datetime.datetime.strptime` so leap-year rules and
789
+ month-length limits are enforced (e.g. ``31/04/1990`` is rejected).
790
+
791
+ Args:
792
+ date_str: Raw match string from the ``"date_of_birth"`` regex.
793
+
794
+ Returns:
795
+ ``True`` if the string represents a real calendar date in any of the
796
+ recognised formats; ``False`` otherwise.
797
+ """
798
+ _formats = (
799
+ # ISO / YMD
800
+ "%Y/%m/%d",
801
+ "%Y-%m-%d",
802
+ "%Y.%m.%d",
803
+ # US MDY
804
+ "%m/%d/%Y",
805
+ "%m-%d-%Y",
806
+ "%m.%d.%Y",
807
+ # Day-first DMY (Europe, Asia, Australia, Latin America, etc.)
808
+ "%d/%m/%Y",
809
+ "%d-%m-%Y",
810
+ "%d.%m.%Y",
811
+ # Written DMY: "15 Jan 2000", "15-Jan-2000", "15 January 2000"
812
+ "%d %b %Y",
813
+ "%d-%b-%Y",
814
+ "%d %B %Y",
815
+ "%d-%B-%Y",
816
+ # Written MDY: "Jan 15, 2000", "Jan 15 2000", "January 15, 2000"
817
+ "%b %d, %Y",
818
+ "%b %d %Y",
819
+ "%B %d, %Y",
820
+ "%B %d %Y",
821
+ )
822
+ for fmt in _formats:
823
+ try:
824
+ datetime.datetime.strptime(date_str.strip(), fmt)
825
+ except ValueError:
826
+ continue
827
+ else:
828
+ return True
829
+ return False
830
+
831
+
832
def scan_payload(
    payload: dict[str, Any],
    *,
    extra_patterns: dict[str, re.Pattern[str]] | None = None,
    max_depth: int = 10,
) -> PIIScanResult:
    """Recursively search a payload dict for PII with regex detectors.

    Every string value reachable through nested mappings, lists and tuples
    (down to *max_depth* levels) is tested against the built-in detectors in
    :data:`_PII_PATTERNS` and :data:`DPDP_PATTERNS`, plus any caller-supplied
    patterns. A caller-supplied pattern replaces a built-in one that has the
    same label.

    Matches for the ``"credit_card"``, ``"aadhaar"``, ``"ssn"`` and
    ``"date_of_birth"`` labels are additionally filtered through their
    checksum / range / calendar validators; a hit is only recorded when at
    least one match survives.

    **Security**: matched values are never returned — only the PII type, path,
    match count, and sensitivity level.

    Args:
        payload: The dictionary to scan.
        extra_patterns: Additional ``{label: compiled_regex}`` detectors.
        max_depth: Maximum nesting depth to scan (default 10).

    Returns:
        A :class:`PIIScanResult` summarising all detections.
    """
    detectors = dict(_PII_PATTERNS)
    detectors.update(DPDP_PATTERNS)
    if extra_patterns:
        detectors.update(extra_patterns)

    # Labels whose raw regex matches need a second-stage check.
    validators = {
        "credit_card": _luhn_check,  # Luhn checksum
        "aadhaar": _verhoeff_check,  # Verhoeff checksum
        "ssn": _is_valid_ssn,  # drop known-invalid SSA ranges
        "date_of_birth": _is_valid_date,  # must be a real calendar date
    }

    findings: list[PIIScanHit] = []
    strings_seen = 0

    def _scan_text(text: str, path: str) -> None:
        """Run every detector over one string and record surviving hits."""
        for label, pattern in detectors.items():
            found = [match.group() for match in pattern.finditer(text)]
            validator = validators.get(label)
            if validator is not None:
                found = [value for value in found if validator(value)]
            if not found:
                continue
            findings.append(
                PIIScanHit(
                    pii_type=label,
                    path=path,
                    match_count=len(found),
                    sensitivity=_SENSITIVITY_MAP.get(label, "medium"),
                )
            )

    def _visit(node: Any, path: str, depth: int) -> None:
        """Descend into *node*, scanning strings and recursing into containers."""
        nonlocal strings_seen
        if depth > max_depth:
            return
        if isinstance(node, str):
            strings_seen += 1
            _scan_text(node, path)
            return
        if isinstance(node, Mapping):
            for key, child in node.items():
                # Top-level keys have no parent path to prefix.
                child_path = f"{path}.{key}" if path else str(key)
                _visit(child, child_path, depth + 1)
            return
        if isinstance(node, (list, tuple)):
            for index, child in enumerate(node):
                _visit(child, f"{path}[{index}]", depth + 1)

    _visit(payload, "", 0)
    return PIIScanResult(hits=findings, scanned=strings_seen)