spanforge 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. spanforge/__init__.py +695 -0
  2. spanforge/_batch_exporter.py +322 -0
  3. spanforge/_cli.py +3081 -0
  4. spanforge/_hooks.py +340 -0
  5. spanforge/_server.py +953 -0
  6. spanforge/_span.py +1015 -0
  7. spanforge/_store.py +287 -0
  8. spanforge/_stream.py +654 -0
  9. spanforge/_trace.py +334 -0
  10. spanforge/_tracer.py +253 -0
  11. spanforge/actor.py +141 -0
  12. spanforge/alerts.py +464 -0
  13. spanforge/auto.py +181 -0
  14. spanforge/baseline.py +336 -0
  15. spanforge/config.py +460 -0
  16. spanforge/consent.py +227 -0
  17. spanforge/consumer.py +379 -0
  18. spanforge/core/__init__.py +5 -0
  19. spanforge/core/compliance_mapping.py +1060 -0
  20. spanforge/cost.py +597 -0
  21. spanforge/debug.py +514 -0
  22. spanforge/drift.py +488 -0
  23. spanforge/egress.py +63 -0
  24. spanforge/eval.py +575 -0
  25. spanforge/event.py +1052 -0
  26. spanforge/exceptions.py +246 -0
  27. spanforge/explain.py +181 -0
  28. spanforge/export/__init__.py +50 -0
  29. spanforge/export/append_only.py +342 -0
  30. spanforge/export/cloud.py +349 -0
  31. spanforge/export/datadog.py +495 -0
  32. spanforge/export/grafana.py +331 -0
  33. spanforge/export/jsonl.py +198 -0
  34. spanforge/export/otel_bridge.py +291 -0
  35. spanforge/export/otlp.py +817 -0
  36. spanforge/export/otlp_bridge.py +231 -0
  37. spanforge/export/redis_backend.py +282 -0
  38. spanforge/export/webhook.py +302 -0
  39. spanforge/exporters/__init__.py +29 -0
  40. spanforge/exporters/console.py +271 -0
  41. spanforge/exporters/jsonl.py +144 -0
  42. spanforge/hitl.py +297 -0
  43. spanforge/inspect.py +429 -0
  44. spanforge/integrations/__init__.py +39 -0
  45. spanforge/integrations/_pricing.py +277 -0
  46. spanforge/integrations/anthropic.py +388 -0
  47. spanforge/integrations/bedrock.py +306 -0
  48. spanforge/integrations/crewai.py +251 -0
  49. spanforge/integrations/gemini.py +349 -0
  50. spanforge/integrations/groq.py +444 -0
  51. spanforge/integrations/langchain.py +349 -0
  52. spanforge/integrations/llamaindex.py +370 -0
  53. spanforge/integrations/ollama.py +286 -0
  54. spanforge/integrations/openai.py +370 -0
  55. spanforge/integrations/together.py +485 -0
  56. spanforge/metrics.py +393 -0
  57. spanforge/metrics_export.py +342 -0
  58. spanforge/migrate.py +278 -0
  59. spanforge/model_registry.py +282 -0
  60. spanforge/models.py +407 -0
  61. spanforge/namespaces/__init__.py +215 -0
  62. spanforge/namespaces/audit.py +253 -0
  63. spanforge/namespaces/cache.py +209 -0
  64. spanforge/namespaces/chain.py +74 -0
  65. spanforge/namespaces/confidence.py +69 -0
  66. spanforge/namespaces/consent.py +85 -0
  67. spanforge/namespaces/cost.py +175 -0
  68. spanforge/namespaces/decision.py +135 -0
  69. spanforge/namespaces/diff.py +146 -0
  70. spanforge/namespaces/drift.py +79 -0
  71. spanforge/namespaces/eval_.py +232 -0
  72. spanforge/namespaces/fence.py +180 -0
  73. spanforge/namespaces/guard.py +104 -0
  74. spanforge/namespaces/hitl.py +92 -0
  75. spanforge/namespaces/latency.py +69 -0
  76. spanforge/namespaces/prompt.py +185 -0
  77. spanforge/namespaces/redact.py +172 -0
  78. spanforge/namespaces/template.py +197 -0
  79. spanforge/namespaces/tool_call.py +76 -0
  80. spanforge/namespaces/trace.py +1006 -0
  81. spanforge/normalizer.py +183 -0
  82. spanforge/presidio_backend.py +149 -0
  83. spanforge/processor.py +258 -0
  84. spanforge/prompt_registry.py +415 -0
  85. spanforge/py.typed +0 -0
  86. spanforge/redact.py +780 -0
  87. spanforge/sampling.py +500 -0
  88. spanforge/schemas/v1.0/schema.json +170 -0
  89. spanforge/schemas/v2.0/schema.json +536 -0
  90. spanforge/signing.py +1152 -0
  91. spanforge/stream.py +559 -0
  92. spanforge/testing.py +376 -0
  93. spanforge/trace.py +199 -0
  94. spanforge/types.py +696 -0
  95. spanforge/ulid.py +304 -0
  96. spanforge/validate.py +383 -0
  97. spanforge-2.0.0.dist-info/METADATA +1777 -0
  98. spanforge-2.0.0.dist-info/RECORD +101 -0
  99. spanforge-2.0.0.dist-info/WHEEL +4 -0
  100. spanforge-2.0.0.dist-info/entry_points.txt +5 -0
  101. spanforge-2.0.0.dist-info/licenses/LICENSE +21 -0
spanforge/redact.py ADDED
@@ -0,0 +1,780 @@
1
+ """PII redaction framework for spanforge.
2
+
3
+ Provides a layered, policy-driven approach to PII identification and redaction
4
+ in event payloads. Redaction is **opt-in per field** — fields must be
5
+ explicitly wrapped in :class:`Redactable` to participate in the lifecycle.
6
+
7
+ Sensitivity ladder
8
+ ------------------
9
+
10
+ ``low`` < ``medium`` < ``high`` < ``pii`` < ``phi``
11
+
12
+ A :class:`RedactionPolicy` is configured with a ``min_sensitivity`` level.
13
+ Only fields whose sensitivity is **≥ min_sensitivity** are scrubbed when
14
+ :meth:`RedactionPolicy.apply` is called.
15
+
16
+ Usage example
17
+ -------------
18
+ ::
19
+
20
+ from spanforge.redact import Redactable, RedactionPolicy, Sensitivity, contains_pii
21
+ from spanforge import Event, EventType
22
+
23
+ policy = RedactionPolicy(
24
+ min_sensitivity=Sensitivity.PII,
25
+ redacted_by="policy:corp-default",
26
+ )
27
+
28
+ event = Event(
29
+ event_type=EventType.PROMPT_SAVED,
30
+ source="promptlock@1.0.0",
31
+ payload={
32
+ "version": "v3",
33
+ "author": Redactable("alice@example.com", Sensitivity.PII, {"email"}),
34
+ },
35
+ )
36
+
37
+ result = policy.apply(event)
38
+ # result.event.payload["author"] == "[REDACTED:pii]"
39
+ # result.redaction_count == 1
40
+ # contains_pii(result.event) == False
41
+
42
+ Security guarantees
43
+ -------------------
44
+ * :class:`Redactable` never exposes its wrapped value in ``__repr__``,
45
+ ``__str__``, or any exception message.
46
+ * Exception messages only reveal the *sensitivity level* and *field depth*,
47
+ never the content of the wrapped value.
48
+ * The literal replacement strings (``"[REDACTED:pii]"`` etc.) are safe to
49
+ log, export, or include in error messages.
50
+ * :meth:`RedactionPolicy.apply` rebuilds the payload recursively so nested
51
+ structures are fully scanned even in deeply-nested payloads.
52
+ """
53
+
54
+ from __future__ import annotations
55
+
56
+ import datetime
57
+ import hashlib
58
+ import re
59
+ from collections.abc import Mapping
60
+ from dataclasses import dataclass
61
+ from enum import Enum
62
+ from typing import TYPE_CHECKING, Any, Final
63
+
64
+ from spanforge.exceptions import LLMSchemaError
65
+
66
+ if TYPE_CHECKING:
67
+ from spanforge.event import Event
68
+
69
# Public API of this module.  ``assert_redacted`` is the documented strict
# counterpart of ``contains_pii``, ``PIIScanHit`` is the element type of
# ``PIIScanResult.hits``, and ``DPDP_PATTERNS`` is a public (non-underscore)
# constant, so all three are exported alongside the original names.
__all__ = [
    "DPDP_PATTERNS",
    "PII_TYPES",
    "PIINotRedactedError",
    "PIIScanHit",
    "PIIScanResult",
    "Redactable",
    "RedactionPolicy",
    "RedactionResult",
    "Sensitivity",
    "assert_redacted",
    "contains_pii",
    "scan_payload",
]
80
+
81
+ # ---------------------------------------------------------------------------
82
+ # Known PII type label constants
83
+ # ---------------------------------------------------------------------------
84
+
85
# Well-known labels for the ``pii_types`` argument of :class:`Redactable`.
# Custom label strings are accepted there as well; nothing in this module
# validates labels against this set.
PII_TYPES: Final[frozenset[str]] = frozenset(
    [
        "credit_card",
        "date_of_birth",
        "email",
        "financial_id",
        "ip_address",
        "medical_id",
        "name",
        "phone",
        "ssn",
        "address",
    ]
)
99
+
100
+ # ---------------------------------------------------------------------------
101
+ # Sensitivity ordering
102
+ # ---------------------------------------------------------------------------
103
+
104
#: Numeric ordering for each sensitivity level (ascending sensitivity).
_SENSITIVITY_ORDER: Final[dict[str, int]] = {
    "low": 0,
    "medium": 1,
    "high": 2,
    "pii": 3,
    "phi": 4,
}


class Sensitivity(str, Enum):
    """Ordered sensitivity levels for PII classification.

    Levels increase in sensitivity: LOW < MEDIUM < HIGH < PII < PHI.

    * **LOW** — Non-sensitive; informational or operational metadata.
    * **MEDIUM** — Pseudonymous or indirectly identifying data.
    * **HIGH** — Directly identifying but non-regulated (e.g. usernames).
    * **PII** — Directly identifying, regulated personal data (GDPR / CCPA).
    * **PHI** — Protected health information (HIPAA).  Most restrictive.

    Comparison operators (``<``, ``<=``, ``>``, ``>=``) work as expected::

        Sensitivity.PII > Sensitivity.HIGH    # True
        Sensitivity.PHI >= Sensitivity.PII    # True
        Sensitivity.LOW < Sensitivity.MEDIUM  # True

    Because members are also ``str`` instances and ``==`` accepts plain
    strings, the ordering operators accept the plain level names too and
    rank them on the same ladder::

        Sensitivity.PHI >= "pii"   # True (rank 4 >= rank 3)
        Sensitivity.HIGH < "low"   # False

    Strings that are not a known level name are not ranked: the operator
    returns ``NotImplemented`` and Python falls back to ordinary ``str``
    comparison.
    """

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    PII = "pii"
    PHI = "phi"

    # ------------------------------------------------------------------
    # Ordered comparisons (delegated to integer order table)
    # ------------------------------------------------------------------

    @property
    def _order(self) -> int:
        """Integer rank — for comparison only; not part of the public API."""
        return _SENSITIVITY_ORDER[self.value]

    @staticmethod
    def _rank_of(other: object) -> int | None:
        """Return the ladder rank of *other*, or ``None`` if it has none.

        Accepts :class:`Sensitivity` members and plain strings naming a
        level.  Without the plain-string branch a comparison such as
        ``Sensitivity.PHI >= "pii"`` would fall through to lexicographic
        ``str`` ordering and silently give the wrong answer.
        """
        if isinstance(other, Sensitivity):
            return other._order
        if isinstance(other, str):
            return _SENSITIVITY_ORDER.get(other)
        return None

    def __lt__(self, other: object) -> bool:
        rank = self._rank_of(other)
        if rank is None:
            return NotImplemented  # type: ignore[return-value]
        return self._order < rank

    def __le__(self, other: object) -> bool:
        rank = self._rank_of(other)
        if rank is None:
            return NotImplemented  # type: ignore[return-value]
        return self._order <= rank

    def __gt__(self, other: object) -> bool:
        rank = self._rank_of(other)
        if rank is None:
            return NotImplemented  # type: ignore[return-value]
        return self._order > rank

    def __ge__(self, other: object) -> bool:
        rank = self._rank_of(other)
        if rank is None:
            return NotImplemented  # type: ignore[return-value]
        return self._order >= rank

    def __eq__(self, other: object) -> bool:
        # Plain strings compare by value; everything else uses Enum identity.
        if isinstance(other, str) and not isinstance(other, Sensitivity):
            return str.__eq__(self, other)
        return Enum.__eq__(self, other)

    def __hash__(self) -> int:
        # Must agree with __eq__: a member hashes like its string value.
        return str.__hash__(self)
174
+
175
+
176
+ # ---------------------------------------------------------------------------
177
+ # Redactable wrapper
178
+ # ---------------------------------------------------------------------------
179
+
180
+
181
class Redactable:
    """Immutable wrapper that marks a payload value as PII-sensitive.

    Wrapping a value in :class:`Redactable` does **not** redact it immediately.
    The value is redacted only when :meth:`RedactionPolicy.apply` is called on
    the event that contains it.

    Security: :class:`Redactable` never surfaces its wrapped value in
    ``__repr__``, ``__str__``, or exceptions.  Only the sensitivity level and
    PII type labels are visible in any string representation.

    Args:
        value:       The raw PII-sensitive value.
        sensitivity: How sensitive the value is.
        pii_types:   Labels describing what type of PII this is.  Use
                     constants from :data:`PII_TYPES` or custom strings.
                     Any iterable of strings is accepted; a single bare
                     string is treated as one label.  Defaults to an
                     empty frozenset.

    Example::

        field = Redactable("alice@example.com", Sensitivity.PII, {"email"})
        str(field)   # "<Redactable:pii>"  — value hidden
        repr(field)  # "<Redactable sensitivity=<Sensitivity.PII: 'pii'> pii_types={'email'}>"
    """

    __slots__ = ("_pii_types", "_sensitivity", "_value")

    def __init__(
        self,
        value: Any,  # noqa: ANN401
        sensitivity: Sensitivity,
        pii_types: frozenset[str] = frozenset(),
    ) -> None:
        # A bare string would otherwise be split into single characters by
        # frozenset(); treat it as one label instead.
        labels = (pii_types,) if isinstance(pii_types, str) else pii_types
        # object.__setattr__ bypasses the immutability guard below.
        object.__setattr__(self, "_value", value)
        object.__setattr__(self, "_sensitivity", sensitivity)
        object.__setattr__(self, "_pii_types", frozenset(labels))

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    @property
    def sensitivity(self) -> Sensitivity:
        """The sensitivity level of this field."""
        return self._sensitivity  # type: ignore[return-value]

    @property
    def pii_types(self) -> frozenset[str]:
        """Set of PII type labels (e.g. ``{'email', 'pii_identifier'}``)."""
        return self._pii_types  # type: ignore[return-value]

    def reveal(self) -> Any:  # noqa: ANN401
        """Return the raw unredacted value.

        Use with extreme care.  Access to raw values should be restricted to
        trusted internal code paths.  Ensure the returned value is never
        logged or included in any observable output.

        Returns:
            The original unwrapped value passed to the constructor.
        """
        return self._value  # type: ignore[return-value]

    # ------------------------------------------------------------------
    # Immutability guard
    # ------------------------------------------------------------------

    def __setattr__(self, name: str, value: object) -> None:  # type: ignore[override]
        raise AttributeError("Redactable is immutable — use a new instance to change values")

    def __delattr__(self, name: str) -> None:
        # Deleting a slot would leave a half-initialised wrapper behind.
        raise AttributeError("Redactable is immutable — use a new instance to change values")

    # ------------------------------------------------------------------
    # Safe string representations — value intentionally hidden
    # ------------------------------------------------------------------

    def _sensitivity_label(self) -> str:
        """Return the sensitivity as its plain value string (e.g. ``"pii"``).

        Formatting a ``(str, Enum)`` member inside an f-string renders the
        member name (``Sensitivity.PII``) on newer Python versions and the
        value on older ones; reading ``.value`` keeps the output stable.
        """
        level = self._sensitivity
        return str(getattr(level, "value", level))

    def __repr__(self) -> str:
        return (
            f"<Redactable sensitivity={self._sensitivity!r} "  # type: ignore[misc]
            f"pii_types={set(self._pii_types)!r}>"  # type: ignore[misc]
        )

    def __str__(self) -> str:
        return f"<Redactable:{self._sensitivity_label()}>"
263
+
264
+
265
+ # ---------------------------------------------------------------------------
266
+ # Redaction result
267
+ # ---------------------------------------------------------------------------
268
+
269
+
270
@dataclass(frozen=True)
class RedactionResult:
    """Immutable result returned by :meth:`RedactionPolicy.apply`.

    Attributes:
        event:            The newly constructed event with PII removed.
        redaction_count:  How many :class:`Redactable` fields were scrubbed.
                          ``0`` means the policy replaced nothing.
        redacted_at:      UTC ISO-8601 timestamp when redaction was applied
                          (set even when ``redaction_count`` is ``0``).
        redacted_by:      The policy identifier string.
    """

    event: Event
    redaction_count: int
    redacted_at: str
    redacted_by: str
285
+
286
+
287
+ # ---------------------------------------------------------------------------
288
+ # PIINotRedactedError
289
+ # ---------------------------------------------------------------------------
290
+
291
+
292
class PIINotRedactedError(LLMSchemaError):
    """Signal that an event still carries PII that should have been scrubbed.

    Raised by :func:`assert_redacted` when :class:`Redactable` wrappers (or
    raw PII patterns) are still present in an event payload.

    Security: the message carries only a count and, optionally, a short hash
    of the caller-supplied context — never the PII value or the raw context.

    Args:
        count:   Number of unredacted findings.
        context: Optional short label for where the check was done.  Only
                 the first 8 hex characters of its SHA-256 digest are
                 embedded in the message.

    Attributes:
        count: Number of outstanding findings reported by the caller.
    """

    count: int

    def __init__(self, count: int, context: str = "") -> None:
        self.count = count
        # M11: the context label may itself contain PII, so only a truncated
        # digest is exposed — enough to correlate, not enough to disclose.
        suffix = ""
        if context:
            digest = hashlib.sha256(context.encode()).hexdigest()
            suffix = f" [context-hash:{digest[:8]}]"
        message = (
            f"Found {count} unredacted PII field(s){suffix}. "
            "Apply a RedactionPolicy before serialising or exporting this event."
        )
        super().__init__(message)
323
+
324
+
325
+ # ---------------------------------------------------------------------------
326
+ # RedactionPolicy
327
+ # ---------------------------------------------------------------------------
328
+
329
+
330
@dataclass(frozen=True)
class RedactionPolicy:
    """Policy that defines which fields to scrub and how to label redactions.

    A policy is immutable; create a new instance to change configuration.
    Apply it to an event via :meth:`apply`, which returns a :class:`RedactionResult`
    containing a new event with PII removed.

    Args:
        min_sensitivity:       Fields with sensitivity **≥** this level are
                               redacted.  Defaults to :attr:`Sensitivity.PII`.
        redacted_by:           Identifier embedded in the redaction metadata
                               (e.g. ``"policy:corp-default"``).
        replacement_template:  String template for the redaction marker.
                               The ``{sensitivity}`` placeholder is replaced
                               with the field's sensitivity level value.
                               Defaults to ``"[REDACTED:{sensitivity}]"``.

    Example::

        policy = RedactionPolicy(
            min_sensitivity=Sensitivity.HIGH,
            redacted_by="policy:strict",
        )
        result = policy.apply(event)
    """

    min_sensitivity: Sensitivity = Sensitivity.PII
    redacted_by: str = "policy:default"
    replacement_template: str = "[REDACTED:{sensitivity}]"

    def _make_marker(self, sensitivity: Sensitivity) -> str:
        """Format the replacement string for a given sensitivity level."""
        return self.replacement_template.format(sensitivity=sensitivity.value)

    def _should_redact(self, r: Redactable) -> bool:
        """Return True if the Redactable field meets the policy threshold."""
        return r.sensitivity >= self.min_sensitivity

    def _redact_value(self, value: Any, counter: list[int], _depth: int = 0) -> Any:  # noqa: ANN401
        """Recursively replace Redactable instances in *value*.

        Containers are rebuilt rather than mutated: every mapping becomes a
        new ``dict``, every list a new ``list``, every tuple a new ``tuple``.

        Args:
            value:    Any Python value (mapping, list, tuple, Redactable, or
                      scalar).
            counter:  Single-element list used as a mutable integer counter.
            _depth:   Current recursion depth (internal; raises at > 100).

        Returns:
            The value with any qualifying Redactable instances replaced by
            their marker strings.  Other values are returned as-is.

        Raises:
            RecursionError: If nesting exceeds 100 levels.
        """
        if _depth > 100:
            raise RecursionError(
                "RedactionPolicy._redact_value: maximum nesting depth (100) exceeded"
            )
        if isinstance(value, Redactable):
            if self._should_redact(value):
                counter[0] += 1
                return self._make_marker(value.sensitivity)
            # Below threshold — leave as-is for now;
            # contains_pii() will detect it post-apply if needed.
            return value
        # Any Mapping, not just dict: the detection helpers in this module
        # walk every Mapping, so a nested read-only or custom mapping must be
        # scrubbed here too or its Redactable values would survive apply().
        if isinstance(value, Mapping):
            return {k: self._redact_value(v, counter, _depth + 1) for k, v in value.items()}
        if isinstance(value, list):
            return [self._redact_value(v, counter, _depth + 1) for v in value]
        if isinstance(value, tuple):
            return tuple(self._redact_value(v, counter, _depth + 1) for v in value)
        return value

    def apply(self, event: Event) -> RedactionResult:
        """Apply this policy to *event*, returning a new redacted event.

        All :class:`Redactable` fields in the payload whose sensitivity is ≥
        :attr:`min_sensitivity` are replaced with safe marker strings.
        When at least one field was replaced, redaction metadata is added
        under the reserved ``__redacted_at``, ``__redacted_by`` and
        ``__redaction_count`` payload keys.

        The original event is **not** mutated; a new :class:`Event` is returned
        inside the :class:`RedactionResult`.

        Args:
            event: The event whose payload should be scanned and redacted.

        Returns:
            A :class:`RedactionResult` with the new event and redaction stats.

        Raises:
            LLMSchemaError: If reconstruction of the redacted event fails for
                            structural reasons.
            RecursionError: If the payload is nested more than 100 levels.
        """
        # Import here to avoid circular dependency at module load time.
        from spanforge.event import Event  # noqa: PLC0415

        counter: list[int] = [0]
        redacted_payload = self._redact_value(dict(event.payload), counter)

        now = _utcnow_iso()

        # Metadata is only stamped when something was actually replaced.
        if isinstance(redacted_payload, dict) and counter[0] > 0:
            redacted_payload["__redacted_at"] = now
            redacted_payload["__redacted_by"] = self.redacted_by
            redacted_payload["__redaction_count"] = counter[0]

        # NOTE(review): checksum and signature are copied verbatim even though
        # the payload may have changed — confirm Event handles or expects this.
        new_event = Event(
            schema_version=event.schema_version,
            event_id=event.event_id,
            event_type=event.event_type,
            timestamp=event.timestamp,
            source=event.source,
            payload=redacted_payload,
            trace_id=event.trace_id,
            span_id=event.span_id,
            parent_span_id=event.parent_span_id,
            org_id=event.org_id,
            team_id=event.team_id,
            actor_id=event.actor_id,
            session_id=event.session_id,
            tags=event.tags,
            checksum=event.checksum,
            signature=event.signature,
            prev_id=event.prev_id,
        )

        return RedactionResult(
            event=new_event,
            redaction_count=counter[0],
            redacted_at=now,
            redacted_by=self.redacted_by,
        )
460
+
461
+
462
+ # ---------------------------------------------------------------------------
463
+ # Public helpers
464
+ # ---------------------------------------------------------------------------
465
+
466
+
467
def contains_pii(event: Event, *, scan_raw: bool = True) -> bool:
    """Report whether *event* still carries unredacted PII.

    Intended as a post-:meth:`RedactionPolicy.apply` check before an event is
    serialised or exported.  This function never raises on a finding; use
    :func:`assert_redacted` for the strict, raising variant.

    .. versionchanged:: 2.1
        Default for *scan_raw* changed from ``False`` to ``True`` so that
        raw-string PII is caught by default.  Pass ``scan_raw=False``
        explicitly to restore the old behaviour.

    Args:
        event:    The event to inspect.
        scan_raw: When ``True`` (default), payload strings are additionally
                  checked with the regex detectors of :func:`scan_payload`.
                  The regex pass only runs when the payload is a mapping.

    Returns:
        ``True`` if a :class:`Redactable` instance is found anywhere in the
        payload, or if the regex pass is enabled and reports a hit;
        ``False`` otherwise.

    Example::

        if contains_pii(event):
            raise RuntimeError("Unredacted PII detected — cannot export")
    """
    payload = event.payload
    # Wrapper check first: it is cheap and makes the regex pass unnecessary.
    if _has_redactable(payload):
        return True
    if not scan_raw or not isinstance(payload, Mapping):
        return False
    scan = scan_payload(payload)  # type: ignore[arg-type]
    return not scan.clean
503
+
504
+
505
def assert_redacted(event: Event, context: str = "", *, scan_raw: bool = True) -> None:
    """Raise if *event* still carries unredacted PII.

    Strict counterpart of :func:`contains_pii`.  Remaining
    :class:`Redactable` wrappers are reported first; only when none are left
    is the optional regex pass run.

    .. versionchanged:: 2.1
        Default for *scan_raw* changed from ``False`` to ``True``.

    Args:
        event:    The event to inspect.
        context:  Optional short label for the error message (e.g. filename).
        scan_raw: When ``True`` (default), also run regex-based PII scanning.
                  The regex pass only runs when the payload is a mapping.

    Raises:
        PIINotRedactedError: If any :class:`Redactable` instances or raw PII
                             patterns are found.  ``count`` is the number of
                             wrappers, or the number of regex hits.

    Example::

        assert_redacted(event, context="export_to_otlp", scan_raw=True)
    """
    payload = event.payload
    wrapped = _count_redactable(payload)
    if wrapped > 0:
        raise PIINotRedactedError(count=wrapped, context=context)
    if not (scan_raw and isinstance(payload, Mapping)):
        return
    scan = scan_payload(payload)  # type: ignore[arg-type]
    if not scan.clean:
        raise PIINotRedactedError(count=len(scan.hits), context=context)
535
+
536
+
537
+ # ---------------------------------------------------------------------------
538
+ # Internal helpers (module-private)
539
+ # ---------------------------------------------------------------------------
540
+
541
+
542
def _has_redactable(value: Any) -> bool:  # noqa: ANN401
    """Tell whether a Redactable occurs anywhere inside *value*.

    Descends into mapping values, lists and tuples; mapping keys and every
    other type are treated as leaves.  Stops at the first wrapper found.
    """
    if isinstance(value, Redactable):
        return True
    if isinstance(value, Mapping):
        children = value.values()
    elif isinstance(value, (list, tuple)):
        children = value
    else:
        return False
    for child in children:
        if _has_redactable(child):
            return True
    return False
551
+
552
+
553
def _count_redactable(value: Any, _depth: int = 0) -> int:  # noqa: ANN401
    """Return how many Redactable instances occur inside *value*.

    Descends into mapping values, lists and tuples; anything else counts as
    zero.  ``_depth`` is threaded through the recursion but is not used to
    limit it.
    """
    if isinstance(value, Redactable):
        return 1
    if isinstance(value, Mapping):
        children = value.values()
    elif isinstance(value, (list, tuple)):
        children = value
    else:
        return 0
    total = 0
    for child in children:
        total += _count_redactable(child, _depth + 1)
    return total
562
+
563
+
564
+ def _utcnow_iso() -> str:
565
+ """Return current UTC time as an ISO-8601 string (same format as Event)."""
566
+ now = datetime.datetime.now(tz=datetime.timezone.utc)
567
+ return now.strftime("%Y-%m-%dT%H:%M:%S.") + f"{now.microsecond:06d}Z"
568
+
569
+
570
+ # ---------------------------------------------------------------------------
571
+ # GA-03: Deep PII scanning — regex-based detection
572
+ # ---------------------------------------------------------------------------
573
+
574
# Built-in regex detectors, keyed by the PII label reported in scan hits.
# scan_payload() merges these with DPDP_PATTERNS and any caller-supplied
# patterns, and runs each one over every string value it visits.
_PII_PATTERNS: Final[dict[str, re.Pattern[str]]] = {
    # ASCII-only address: local part, "@", domain label, ".", 2+ letter TLD.
    "email": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}", re.ASCII),
    # Optional "+1"/"1" prefix, then 3-3-4 digit groups with optional
    # "-", "." or whitespace separators and optional parentheses.
    "phone": re.compile(
        r"(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
    ),
    # Hyphenated 3-2-4 digit form only.
    "ssn": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
    # 13-19 digits, each optionally followed by one space or hyphen.
    # scan_payload() additionally requires a passing Luhn checksum.
    "credit_card": re.compile(r"\b(?:\d[ -]?){13,19}\b"),
    # Dotted-quad IPv4 with every octet limited to 0-255.
    "ip_address": re.compile(
        r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}"
        r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b"
    ),
    # Two prefix letters, six digits in optional space-separated pairs,
    # suffix letter A-D; matched case-insensitively.
    "uk_national_insurance": re.compile(
        r"\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b",
        re.IGNORECASE,
    ),
}
590
+
591
+
592
+ # ---------------------------------------------------------------------------
593
+ # GA-03-IN: India PII patterns — DPDP Act (Digital Personal Data Protection)
594
+ # ---------------------------------------------------------------------------
595
+
596
+ # Verhoeff checksum tables for Aadhaar validation
597
+ _VERHOEFF_D = (
598
+ (0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
599
+ (1, 2, 3, 4, 0, 6, 7, 8, 9, 5),
600
+ (2, 3, 4, 0, 1, 7, 8, 9, 5, 6),
601
+ (3, 4, 0, 1, 2, 8, 9, 5, 6, 7),
602
+ (4, 0, 1, 2, 3, 9, 5, 6, 7, 8),
603
+ (5, 9, 8, 7, 6, 0, 4, 3, 2, 1),
604
+ (6, 5, 9, 8, 7, 1, 0, 4, 3, 2),
605
+ (7, 6, 5, 9, 8, 2, 1, 0, 4, 3),
606
+ (8, 7, 6, 5, 9, 3, 2, 1, 0, 4),
607
+ (9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
608
+ )
609
+
610
+ _VERHOEFF_P = (
611
+ (0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
612
+ (1, 5, 7, 6, 2, 8, 3, 0, 9, 4),
613
+ (5, 8, 0, 3, 7, 9, 6, 1, 4, 2),
614
+ (8, 9, 1, 6, 0, 4, 3, 5, 2, 7),
615
+ (9, 4, 5, 3, 1, 2, 6, 8, 7, 0),
616
+ (4, 2, 8, 6, 5, 7, 3, 9, 0, 1),
617
+ (2, 7, 9, 3, 8, 0, 6, 4, 1, 5),
618
+ (7, 0, 4, 6, 9, 1, 3, 2, 5, 8),
619
+ )
620
+
621
+ _VERHOEFF_INV = (0, 4, 3, 2, 1, 5, 6, 7, 8, 9)
622
+
623
+
624
+ def _verhoeff_check(number_str: str) -> bool:
625
+ """Validate a number string using the Verhoeff checksum algorithm."""
626
+ digits = [int(d) for d in number_str if d.isdigit()]
627
+ c = 0
628
+ for i, d in enumerate(reversed(digits)):
629
+ c = _VERHOEFF_D[c][_VERHOEFF_P[i % 8][d]]
630
+ return c == 0
631
+
632
+
633
# India-specific detectors, keyed by PII label.  scan_payload() always merges
# these into its built-in pattern set.
DPDP_PATTERNS: Final[dict[str, re.Pattern[str]]] = {
    # 12 digits in 4-4-4 groups with optional space/hyphen separators; the
    # first digit must be 2-9.  scan_payload() additionally requires a
    # passing Verhoeff checksum.
    "aadhaar": re.compile(
        r"\b[2-9]\d{3}[\s-]?\d{4}[\s-]?\d{4}\b"
    ),
    # Five uppercase letters, four digits, one uppercase letter.
    "pan": re.compile(
        r"\b[A-Z]{5}\d{4}[A-Z]\b"
    ),
}
641
+
642
+
643
@dataclass(frozen=True)
class PIIScanHit:
    """Single PII detection hit.

    The matched text itself is never stored — only where and what kind.

    Attributes:
        pii_type:     The type of PII detected (e.g. ``"email"``, ``"ssn"``).
        path:         Path to the field in the payload: mapping keys joined
                      with ``.`` and sequence positions written as ``[i]``.
        match_count:  Number of matches of this type at this path.
        sensitivity:  Sensitivity level: ``"high"`` for SSN/credit_card
                      (and Aadhaar/PAN), ``"medium"`` for email/phone,
                      ``"low"`` for IP/NI.  Labels without an entry in the
                      sensitivity map are reported as ``"medium"``.
    """

    pii_type: str
    path: str
    match_count: int = 1
    sensitivity: str = "medium"
659
+
660
+
661
# Sensitivity string reported on a PIIScanHit for each detector label.
# scan_payload() falls back to "medium" for labels not listed here
# (e.g. caller-supplied extra patterns).
_SENSITIVITY_MAP: dict[str, str] = {
    "ssn": "high",
    "credit_card": "high",
    "aadhaar": "high",
    "pan": "high",
    "email": "medium",
    "phone": "medium",
    "ip_address": "low",
    "uk_national_insurance": "low",
}
671
+
672
+
673
@dataclass(frozen=True)
class PIIScanResult:
    """Outcome of one deep PII scan over a payload.

    Attributes:
        hits:     The :class:`PIIScanHit` records collected during the scan.
        scanned:  How many string values were examined.
        clean:    Derived flag — ``True`` exactly when ``hits`` is empty.
    """

    hits: list[PIIScanHit]
    scanned: int

    @property
    def clean(self) -> bool:
        """``True`` when the scan recorded no hits."""
        return not self.hits
689
+
690
+
691
+ def _luhn_check(number_str: str) -> bool:
692
+ """Validate a credit card number using the Luhn algorithm."""
693
+ digits = [int(d) for d in number_str if d.isdigit()]
694
+ if len(digits) < 13 or len(digits) > 19:
695
+ return False
696
+ total = 0
697
+ for i, d in enumerate(reversed(digits)):
698
+ if i % 2 == 1:
699
+ d *= 2
700
+ if d > 9:
701
+ d -= 9
702
+ total += d
703
+ return total % 10 == 0
704
+
705
+
706
def scan_payload(
    payload: dict[str, Any],
    *,
    extra_patterns: dict[str, re.Pattern[str]] | None = None,
    max_depth: int = 10,
) -> PIIScanResult:
    """Run the regex PII detectors over every string inside *payload*.

    The payload is walked recursively through mapping values, lists and
    tuples.  Each string value is tested against the built-in detectors
    (email, phone, SSN, credit card, IP address, UK National Insurance
    number, Aadhaar, PAN) plus any caller-supplied ones.  Credit-card
    candidates must also pass the Luhn check and Aadhaar candidates the
    Verhoeff check before they count as hits.

    **Security**: matched values are never returned — only the PII type, path,
    match count, and sensitivity level.

    Args:
        payload:        The dictionary to scan.
        extra_patterns: Additional ``{label: compiled_regex}`` detectors.  A
                        label equal to a built-in one replaces that detector.
        max_depth:      Maximum nesting depth to scan (default 10).  Values
                        nested deeper are skipped silently.

    Returns:
        A :class:`PIIScanResult` summarising all detections.
    """
    detectors = dict(_PII_PATTERNS)
    detectors.update(DPDP_PATTERNS)
    if extra_patterns:
        detectors.update(extra_patterns)

    # Labels whose regex matches must additionally pass a checksum.
    checksum_validators = {
        "credit_card": _luhn_check,
        "aadhaar": _verhoeff_check,
    }

    found: list[PIIScanHit] = []
    strings_seen = 0

    def _scan_string(text: str, path: str) -> None:
        """Record one hit per detector that matches *text*."""
        for label, regex in detectors.items():
            candidates = [m.group() for m in regex.finditer(text)]
            validator = checksum_validators.get(label)
            if validator is not None:
                candidates = [c for c in candidates if validator(c)]
            if not candidates:
                continue
            found.append(
                PIIScanHit(
                    pii_type=label,
                    path=path,
                    match_count=len(candidates),
                    sensitivity=_SENSITIVITY_MAP.get(label, "medium"),
                )
            )

    def _walk(node: Any, path: str, depth: int) -> None:  # noqa: ANN401
        """Visit *node*, descending into containers up to *max_depth*."""
        nonlocal strings_seen
        if depth > max_depth:
            return
        if isinstance(node, str):
            strings_seen += 1
            _scan_string(node, path)
            return
        if isinstance(node, Mapping):
            for key, child in node.items():
                child_path = f"{path}.{key}" if path else str(key)
                _walk(child, child_path, depth + 1)
            return
        if isinstance(node, (list, tuple)):
            for index, child in enumerate(node):
                _walk(child, f"{path}[{index}]", depth + 1)

    _walk(payload, "", 0)
    return PIIScanResult(hits=found, scanned=strings_seen)