spanforge 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spanforge/__init__.py +695 -0
- spanforge/_batch_exporter.py +322 -0
- spanforge/_cli.py +3081 -0
- spanforge/_hooks.py +340 -0
- spanforge/_server.py +953 -0
- spanforge/_span.py +1015 -0
- spanforge/_store.py +287 -0
- spanforge/_stream.py +654 -0
- spanforge/_trace.py +334 -0
- spanforge/_tracer.py +253 -0
- spanforge/actor.py +141 -0
- spanforge/alerts.py +464 -0
- spanforge/auto.py +181 -0
- spanforge/baseline.py +336 -0
- spanforge/config.py +460 -0
- spanforge/consent.py +227 -0
- spanforge/consumer.py +379 -0
- spanforge/core/__init__.py +5 -0
- spanforge/core/compliance_mapping.py +1060 -0
- spanforge/cost.py +597 -0
- spanforge/debug.py +514 -0
- spanforge/drift.py +488 -0
- spanforge/egress.py +63 -0
- spanforge/eval.py +575 -0
- spanforge/event.py +1052 -0
- spanforge/exceptions.py +246 -0
- spanforge/explain.py +181 -0
- spanforge/export/__init__.py +50 -0
- spanforge/export/append_only.py +342 -0
- spanforge/export/cloud.py +349 -0
- spanforge/export/datadog.py +495 -0
- spanforge/export/grafana.py +331 -0
- spanforge/export/jsonl.py +198 -0
- spanforge/export/otel_bridge.py +291 -0
- spanforge/export/otlp.py +817 -0
- spanforge/export/otlp_bridge.py +231 -0
- spanforge/export/redis_backend.py +282 -0
- spanforge/export/webhook.py +302 -0
- spanforge/exporters/__init__.py +29 -0
- spanforge/exporters/console.py +271 -0
- spanforge/exporters/jsonl.py +144 -0
- spanforge/hitl.py +297 -0
- spanforge/inspect.py +429 -0
- spanforge/integrations/__init__.py +39 -0
- spanforge/integrations/_pricing.py +277 -0
- spanforge/integrations/anthropic.py +388 -0
- spanforge/integrations/bedrock.py +306 -0
- spanforge/integrations/crewai.py +251 -0
- spanforge/integrations/gemini.py +349 -0
- spanforge/integrations/groq.py +444 -0
- spanforge/integrations/langchain.py +349 -0
- spanforge/integrations/llamaindex.py +370 -0
- spanforge/integrations/ollama.py +286 -0
- spanforge/integrations/openai.py +370 -0
- spanforge/integrations/together.py +485 -0
- spanforge/metrics.py +393 -0
- spanforge/metrics_export.py +342 -0
- spanforge/migrate.py +278 -0
- spanforge/model_registry.py +282 -0
- spanforge/models.py +407 -0
- spanforge/namespaces/__init__.py +215 -0
- spanforge/namespaces/audit.py +253 -0
- spanforge/namespaces/cache.py +209 -0
- spanforge/namespaces/chain.py +74 -0
- spanforge/namespaces/confidence.py +69 -0
- spanforge/namespaces/consent.py +85 -0
- spanforge/namespaces/cost.py +175 -0
- spanforge/namespaces/decision.py +135 -0
- spanforge/namespaces/diff.py +146 -0
- spanforge/namespaces/drift.py +79 -0
- spanforge/namespaces/eval_.py +232 -0
- spanforge/namespaces/fence.py +180 -0
- spanforge/namespaces/guard.py +104 -0
- spanforge/namespaces/hitl.py +92 -0
- spanforge/namespaces/latency.py +69 -0
- spanforge/namespaces/prompt.py +185 -0
- spanforge/namespaces/redact.py +172 -0
- spanforge/namespaces/template.py +197 -0
- spanforge/namespaces/tool_call.py +76 -0
- spanforge/namespaces/trace.py +1006 -0
- spanforge/normalizer.py +183 -0
- spanforge/presidio_backend.py +149 -0
- spanforge/processor.py +258 -0
- spanforge/prompt_registry.py +415 -0
- spanforge/py.typed +0 -0
- spanforge/redact.py +780 -0
- spanforge/sampling.py +500 -0
- spanforge/schemas/v1.0/schema.json +170 -0
- spanforge/schemas/v2.0/schema.json +536 -0
- spanforge/signing.py +1152 -0
- spanforge/stream.py +559 -0
- spanforge/testing.py +376 -0
- spanforge/trace.py +199 -0
- spanforge/types.py +696 -0
- spanforge/ulid.py +304 -0
- spanforge/validate.py +383 -0
- spanforge-2.0.0.dist-info/METADATA +1777 -0
- spanforge-2.0.0.dist-info/RECORD +101 -0
- spanforge-2.0.0.dist-info/WHEEL +4 -0
- spanforge-2.0.0.dist-info/entry_points.txt +5 -0
- spanforge-2.0.0.dist-info/licenses/LICENSE +21 -0
spanforge/redact.py
ADDED
|
@@ -0,0 +1,780 @@
|
|
|
1
|
+
"""PII redaction framework for spanforge.
|
|
2
|
+
|
|
3
|
+
Provides a layered, policy-driven approach to PII identification and redaction
|
|
4
|
+
in event payloads. Redaction is **opt-in per field** — fields must be
|
|
5
|
+
explicitly wrapped in :class:`Redactable` to participate in the lifecycle.
|
|
6
|
+
|
|
7
|
+
Sensitivity ladder
|
|
8
|
+
------------------
|
|
9
|
+
|
|
10
|
+
``low`` < ``medium`` < ``high`` < ``pii`` < ``phi``
|
|
11
|
+
|
|
12
|
+
A :class:`RedactionPolicy` is configured with a ``min_sensitivity`` level.
|
|
13
|
+
Only fields whose sensitivity is **≥ min_sensitivity** are scrubbed when
|
|
14
|
+
:meth:`RedactionPolicy.apply` is called.
|
|
15
|
+
|
|
16
|
+
Usage example
|
|
17
|
+
-------------
|
|
18
|
+
::
|
|
19
|
+
|
|
20
|
+
from spanforge.redact import Redactable, RedactionPolicy, Sensitivity, contains_pii
|
|
21
|
+
from spanforge import Event, EventType
|
|
22
|
+
|
|
23
|
+
policy = RedactionPolicy(
|
|
24
|
+
min_sensitivity=Sensitivity.PII,
|
|
25
|
+
redacted_by="policy:corp-default",
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
event = Event(
|
|
29
|
+
event_type=EventType.PROMPT_SAVED,
|
|
30
|
+
source="promptlock@1.0.0",
|
|
31
|
+
payload={
|
|
32
|
+
"version": "v3",
|
|
33
|
+
"author": Redactable("alice@example.com", Sensitivity.PII, {"email"}),
|
|
34
|
+
},
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
result = policy.apply(event)
|
|
38
|
+
# result.event.payload["author"] == "[REDACTED:pii]"
|
|
39
|
+
# result.redaction_count == 1
|
|
40
|
+
# contains_pii(result.event) == False
|
|
41
|
+
|
|
42
|
+
Security guarantees
|
|
43
|
+
-------------------
|
|
44
|
+
* :class:`Redactable` never exposes its wrapped value in ``__repr__``,
|
|
45
|
+
``__str__``, or any exception message.
|
|
46
|
+
* Exception messages only reveal the *sensitivity level* and *field depth*,
|
|
47
|
+
never the content of the wrapped value.
|
|
48
|
+
* The literal replacement strings (``"[REDACTED:pii]"`` etc.) are safe to
|
|
49
|
+
log, export, or include in error messages.
|
|
50
|
+
* :meth:`RedactionPolicy.apply` rebuilds the payload recursively so nested
|
|
51
|
+
structures are fully scanned even in deeply-nested payloads.
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
from __future__ import annotations
|
|
55
|
+
|
|
56
|
+
import datetime
|
|
57
|
+
import hashlib
|
|
58
|
+
import re
|
|
59
|
+
from collections.abc import Mapping
|
|
60
|
+
from dataclasses import dataclass
|
|
61
|
+
from enum import Enum
|
|
62
|
+
from typing import TYPE_CHECKING, Any, Final
|
|
63
|
+
|
|
64
|
+
from spanforge.exceptions import LLMSchemaError
|
|
65
|
+
|
|
66
|
+
if TYPE_CHECKING:
|
|
67
|
+
from spanforge.event import Event
|
|
68
|
+
|
|
69
|
+
__all__ = [
|
|
70
|
+
"PII_TYPES",
|
|
71
|
+
"PIINotRedactedError",
|
|
72
|
+
"PIIScanResult",
|
|
73
|
+
"Redactable",
|
|
74
|
+
"RedactionPolicy",
|
|
75
|
+
"RedactionResult",
|
|
76
|
+
"Sensitivity",
|
|
77
|
+
"contains_pii",
|
|
78
|
+
"scan_payload",
|
|
79
|
+
]
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# Known PII type label constants
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
# Well-known labels for ``Redactable.pii_types``.  The set is advisory:
# ``Redactable`` also accepts custom label strings.
PII_TYPES: Final[frozenset[str]] = frozenset(
    {
        "address",
        "credit_card",
        "date_of_birth",
        "email",
        "financial_id",
        "ip_address",
        "medical_id",
        "name",
        "phone",
        "ssn",
    }
)
|
|
99
|
+
|
|
100
|
+
# ---------------------------------------------------------------------------
|
|
101
|
+
# Sensitivity ordering
|
|
102
|
+
# ---------------------------------------------------------------------------
|
|
103
|
+
|
|
104
|
+
#: Numeric ordering for each sensitivity level (ascending sensitivity).
|
|
105
|
+
_SENSITIVITY_ORDER: Final[dict[str, int]] = {
|
|
106
|
+
"low": 0,
|
|
107
|
+
"medium": 1,
|
|
108
|
+
"high": 2,
|
|
109
|
+
"pii": 3,
|
|
110
|
+
"phi": 4,
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class Sensitivity(str, Enum):
    """Classification ladder used to decide which fields get redacted.

    Members, from least to most sensitive:

    * **LOW** — Non-sensitive; informational or operational metadata.
    * **MEDIUM** — Pseudonymous or indirectly identifying data.
    * **HIGH** — Directly identifying but non-regulated (e.g. usernames).
    * **PII** — Directly identifying, regulated personal data (GDPR / CCPA).
    * **PHI** — Protected health information (HIPAA).  Most restrictive.

    Members are also ``str`` instances, so they compare equal to (and hash
    like) their plain string value.  Ordering between two members follows
    the ladder rather than alphabetical string order::

        Sensitivity.PII > Sensitivity.HIGH    # True
        Sensitivity.PHI >= Sensitivity.PII    # True
        Sensitivity.LOW < Sensitivity.MEDIUM  # True

    Ordering against anything that is not a :class:`Sensitivity` returns
    ``NotImplemented``.
    """

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    PII = "pii"
    PHI = "phi"

    # ------------------------------------------------------------------
    # Rank lookup and rich comparisons
    # ------------------------------------------------------------------

    @property
    def _order(self) -> int:
        """Position of this level on the ladder (internal, comparison only)."""
        return _SENSITIVITY_ORDER[self.value]

    def __lt__(self, other: object) -> bool:
        if isinstance(other, Sensitivity):
            return self._order < other._order
        return NotImplemented  # type: ignore[return-value]

    def __le__(self, other: object) -> bool:
        if isinstance(other, Sensitivity):
            return self._order <= other._order
        return NotImplemented  # type: ignore[return-value]

    def __gt__(self, other: object) -> bool:
        if isinstance(other, Sensitivity):
            return self._order > other._order
        return NotImplemented  # type: ignore[return-value]

    def __ge__(self, other: object) -> bool:
        if isinstance(other, Sensitivity):
            return self._order >= other._order
        return NotImplemented  # type: ignore[return-value]

    def __eq__(self, other: object) -> bool:
        # Plain strings are compared by value; everything else (including
        # other members) goes through the Enum comparison.
        is_plain_str = isinstance(other, str) and not isinstance(other, Sensitivity)
        if is_plain_str:
            return str.__eq__(self, other)
        return Enum.__eq__(self, other)

    def __hash__(self) -> int:
        # Defining __eq__ drops the inherited hash; restore the string hash
        # so a member and its plain value land in the same dict/set slot.
        return str.__hash__(self)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# ---------------------------------------------------------------------------
|
|
177
|
+
# Redactable wrapper
|
|
178
|
+
# ---------------------------------------------------------------------------
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class Redactable:
    """Immutable wrapper that marks a payload value as PII-sensitive.

    Wrapping a value in :class:`Redactable` does **not** redact it immediately.
    The value is redacted only when :meth:`RedactionPolicy.apply` is called on
    the event that contains it.

    Security: :class:`Redactable` never surfaces its wrapped value in
    ``__repr__`` or ``__str__``.  Only the sensitivity level and PII type
    labels are visible in any string representation.

    Args:
        value:       The raw PII-sensitive value.
        sensitivity: How sensitive the value is.
        pii_types:   Labels describing what type of PII this is.  Use
                     constants from :data:`PII_TYPES` or custom strings.
                     Any iterable of strings is copied into a frozenset.
                     Defaults to an empty frozenset.

    Example::

        field = Redactable("alice@example.com", Sensitivity.PII, {"email"})
        str(field)   # "<Redactable:pii>"  — value hidden
        repr(field)  # "<Redactable sensitivity='pii' pii_types={'email'}>"
    """

    __slots__ = ("_pii_types", "_sensitivity", "_value")

    def __init__(
        self,
        value: Any,  # noqa: ANN401
        sensitivity: Sensitivity,
        pii_types: frozenset[str] = frozenset(),
    ) -> None:
        # __setattr__ is blocked below, so initial state is written through
        # object.__setattr__ directly.
        object.__setattr__(self, "_value", value)
        object.__setattr__(self, "_sensitivity", sensitivity)
        object.__setattr__(self, "_pii_types", frozenset(pii_types))

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    @property
    def sensitivity(self) -> Sensitivity:
        """The sensitivity level of this field."""
        return self._sensitivity  # type: ignore[return-value]

    @property
    def pii_types(self) -> frozenset[str]:
        """Set of PII type labels (e.g. ``{'email', 'pii_identifier'}``)."""
        return self._pii_types  # type: ignore[return-value]

    def reveal(self) -> Any:  # noqa: ANN401
        """Return the raw unredacted value.

        Use with extreme care.  Access to raw values should be restricted to
        trusted internal code paths.  Ensure the returned value is never
        logged or included in any observable output.

        Returns:
            The original unwrapped value passed to the constructor.
        """
        return self._value  # type: ignore[return-value]

    # ------------------------------------------------------------------
    # Immutability guard
    # ------------------------------------------------------------------

    def __setattr__(self, name: str, value: object) -> None:  # type: ignore[override]
        raise AttributeError("Redactable is immutable — use a new instance to change values")

    def __delattr__(self, name: str) -> None:
        # Without this, ``del obj._value`` would succeed on a slotted class
        # and break the immutability promise.
        raise AttributeError("Redactable is immutable — use a new instance to change values")

    # ------------------------------------------------------------------
    # Safe string representations — value intentionally hidden
    # ------------------------------------------------------------------

    def _level_label(self) -> str:
        """Return the sensitivity as its plain string value (e.g. ``"pii"``).

        Formatting the Enum member directly is not stable: ``repr`` gives
        ``<Sensitivity.PII: 'pii'>`` and, on Python 3.11+, f-string formatting
        of a ``str``-mixin Enum gives ``Sensitivity.PII``.  Reading ``.value``
        yields the documented output on every version; a plain string passed
        as *sensitivity* is used as-is.
        """
        level = self._sensitivity
        return str(getattr(level, "value", level))

    def __repr__(self) -> str:
        return (
            f"<Redactable sensitivity={self._level_label()!r} "
            f"pii_types={set(self._pii_types)!r}>"  # type: ignore[misc]
        )

    def __str__(self) -> str:
        return f"<Redactable:{self._level_label()}>"
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
# ---------------------------------------------------------------------------
|
|
266
|
+
# Redaction result
|
|
267
|
+
# ---------------------------------------------------------------------------
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
@dataclass(frozen=True)
class RedactionResult:
    """Frozen record describing one :meth:`RedactionPolicy.apply` call.

    Attributes:
        event:            The rebuilt event produced by the policy.
        redaction_count:  Number of :class:`Redactable` values replaced.
        redacted_at:      UTC ISO-8601 timestamp taken when the policy ran.
        redacted_by:      Identifier of the policy that produced the result.
    """

    event: Event
    redaction_count: int
    redacted_at: str
    redacted_by: str
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
# ---------------------------------------------------------------------------
|
|
288
|
+
# PIINotRedactedError
|
|
289
|
+
# ---------------------------------------------------------------------------
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
class PIINotRedactedError(LLMSchemaError):
    """Signals that an event still carries unredacted PII.

    Raised by :func:`assert_redacted` when :class:`Redactable` wrappers (or
    raw PII pattern hits) are still present in an event payload.

    Security: the message contains only the number of offending fields and,
    when a context label is supplied, a short hash of that label — never the
    label itself and never any payload content.

    Args:
        count:   Number of unredacted fields found.
        context: Optional short label for where the check was done.

    Attributes:
        count: Number of unredacted fields found.
    """

    count: int

    def __init__(self, count: int, context: str = "") -> None:
        self.count = count
        # M11: the caller-supplied context may itself contain PII, so only
        # the first 8 hex characters of its SHA-256 digest are embedded.
        suffix = (
            f" [context-hash:{hashlib.sha256(context.encode()).hexdigest()[:8]}]"
            if context
            else ""
        )
        super().__init__(
            f"Found {count} unredacted PII field(s){suffix}. "
            "Apply a RedactionPolicy before serialising or exporting this event."
        )
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
# ---------------------------------------------------------------------------
|
|
326
|
+
# RedactionPolicy
|
|
327
|
+
# ---------------------------------------------------------------------------
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
@dataclass(frozen=True)
class RedactionPolicy:
    """Policy that defines which fields to scrub and how to label redactions.

    A policy is immutable; create a new instance to change configuration.
    Apply it to an event via :meth:`apply`, which returns a :class:`RedactionResult`
    containing a new event with PII removed.

    Args:
        min_sensitivity:      Fields with sensitivity **≥** this level are
                              redacted.  Defaults to :attr:`Sensitivity.PII`.
        redacted_by:          Identifier embedded in the redaction metadata
                              (e.g. ``"policy:corp-default"``).
        replacement_template: String template for the redaction marker.
                              The ``{sensitivity}`` placeholder is replaced
                              with the field's sensitivity level value.
                              Defaults to ``"[REDACTED:{sensitivity}]"``.

    Example::

        policy = RedactionPolicy(
            min_sensitivity=Sensitivity.HIGH,
            redacted_by="policy:strict",
        )
        result = policy.apply(event)
    """

    min_sensitivity: Sensitivity = Sensitivity.PII
    redacted_by: str = "policy:default"
    replacement_template: str = "[REDACTED:{sensitivity}]"

    def _make_marker(self, sensitivity: Sensitivity) -> str:
        """Format the replacement string for a given sensitivity level."""
        return self.replacement_template.format(sensitivity=sensitivity.value)

    def _should_redact(self, r: Redactable) -> bool:
        """Return True if the Redactable field meets the policy threshold."""
        return r.sensitivity >= self.min_sensitivity

    def _redact_value(self, value: Any, counter: list[int], _depth: int = 0) -> Any:  # noqa: ANN401
        """Recursively replace Redactable instances in *value*.

        Containers are rebuilt rather than mutated: every mapping becomes a
        new ``dict``, every list a new ``list`` and every tuple a new plain
        ``tuple``.  Mapping keys are left untouched.

        Args:
            value:   Any Python value (mapping, list, tuple, Redactable, or
                     scalar).
            counter: Single-element list used as a mutable integer counter;
                     incremented once per replaced Redactable.
            _depth:  Current recursion depth (internal; raises at > 100).

        Returns:
            The value with any qualifying Redactable instances replaced by
            their marker strings.  Redactables below the policy threshold
            and all other non-container values are returned as-is.

        Raises:
            RecursionError: If nesting exceeds 100 levels.
        """
        if _depth > 100:
            raise RecursionError(
                "RedactionPolicy._redact_value: maximum nesting depth (100) exceeded"
            )
        if isinstance(value, Redactable):
            if self._should_redact(value):
                counter[0] += 1
                return self._make_marker(value.sensitivity)
            # Below threshold — leave as-is for now;
            # contains_pii() will detect it post-apply if needed.
            return value
        # Match on Mapping rather than dict so the traversal agrees with
        # _has_redactable() / _count_redactable(): a Redactable nested in a
        # non-dict mapping would otherwise be detected but never scrubbed.
        if isinstance(value, Mapping):
            return {k: self._redact_value(v, counter, _depth + 1) for k, v in value.items()}
        if isinstance(value, list):
            return [self._redact_value(v, counter, _depth + 1) for v in value]
        if isinstance(value, tuple):
            return tuple(self._redact_value(v, counter, _depth + 1) for v in value)
        return value

    def apply(self, event: Event) -> RedactionResult:
        """Apply this policy to *event*, returning a new redacted event.

        All :class:`Redactable` fields in the payload whose sensitivity is ≥
        :attr:`min_sensitivity` are replaced with safe marker strings.
        When at least one field was replaced, redaction metadata is added to
        the payload under the reserved ``__redacted_at``, ``__redacted_by``
        and ``__redaction_count`` keys.

        The original event is **not** mutated; a new :class:`Event` is returned
        inside the :class:`RedactionResult`.

        Args:
            event: The event whose payload should be scanned and redacted.

        Returns:
            A :class:`RedactionResult` with the new event and redaction stats.

        Raises:
            LLMSchemaError: If reconstruction of the redacted event fails for
                structural reasons.
            RecursionError: If the payload is nested more than 100 levels deep.
        """
        # Import here to avoid circular dependency at module load time.
        from spanforge.event import Event  # noqa: PLC0415

        counter: list[int] = [0]
        redacted_payload = self._redact_value(dict(event.payload), counter)

        now = _utcnow_iso()

        # Metadata is stamped only when something was actually scrubbed, so
        # an untouched payload round-trips without extra keys.
        if isinstance(redacted_payload, dict) and counter[0] > 0:
            redacted_payload["__redacted_at"] = now
            redacted_payload["__redacted_by"] = self.redacted_by
            redacted_payload["__redaction_count"] = counter[0]

        # NOTE(review): checksum and signature are copied from the source
        # event even though the payload may have changed — confirm whether
        # callers are expected to re-sign redacted events.
        new_event = Event(
            schema_version=event.schema_version,
            event_id=event.event_id,
            event_type=event.event_type,
            timestamp=event.timestamp,
            source=event.source,
            payload=redacted_payload,
            trace_id=event.trace_id,
            span_id=event.span_id,
            parent_span_id=event.parent_span_id,
            org_id=event.org_id,
            team_id=event.team_id,
            actor_id=event.actor_id,
            session_id=event.session_id,
            tags=event.tags,
            checksum=event.checksum,
            signature=event.signature,
            prev_id=event.prev_id,
        )

        return RedactionResult(
            event=new_event,
            redaction_count=counter[0],
            redacted_at=now,
            redacted_by=self.redacted_by,
        )
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
# ---------------------------------------------------------------------------
|
|
463
|
+
# Public helpers
|
|
464
|
+
# ---------------------------------------------------------------------------
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def contains_pii(event: Event, *, scan_raw: bool = True) -> bool:
    """Report whether *event* still carries PII in its payload.

    Intended as a post-:meth:`RedactionPolicy.apply` check before an event is
    serialised or exported.  This function never raises on a positive result;
    :func:`assert_redacted` is the raising counterpart.

    .. versionchanged:: 2.1
        Default for *scan_raw* changed from ``False`` to ``True`` so that
        raw-string PII is caught by default.  Pass ``scan_raw=False``
        explicitly to restore the old behaviour.

    Args:
        event:    The event to inspect.
        scan_raw: When ``True`` (default) and the payload is a mapping, the
                  payload is additionally passed to :func:`scan_payload` for
                  regex-based detection, not just checked for
                  :class:`Redactable` wrappers.

    Returns:
        ``True`` if a :class:`Redactable` instance is found anywhere in the
        payload, or if the raw scan is enabled and reports a hit; ``False``
        otherwise.

    Example::

        if contains_pii(event):
            raise RuntimeError("Unredacted PII detected — cannot export")
    """
    payload = event.payload
    # Wrapper check first: any surviving Redactable is conclusive.
    if _has_redactable(payload):
        return True
    # The regex scan only applies when requested and to mapping payloads.
    if not (scan_raw and isinstance(payload, Mapping)):
        return False
    return not scan_payload(payload).clean  # type: ignore[arg-type]
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def assert_redacted(event: Event, context: str = "", *, scan_raw: bool = True) -> None:
    """Raise if *event* still carries unredacted PII.

    Strict counterpart of :func:`contains_pii`: instead of returning a
    boolean it raises :exc:`PIINotRedactedError`.  Leftover
    :class:`Redactable` wrappers are checked first; the regex scan runs only
    if none were found.

    .. versionchanged:: 2.1
        Default for *scan_raw* changed from ``False`` to ``True``.

    Args:
        event:    The event to inspect.
        context:  Optional short label forwarded to the error (e.g. filename).
        scan_raw: When ``True`` (default) and the payload is a mapping, also
                  run regex-based PII scanning via :func:`scan_payload`.

    Raises:
        PIINotRedactedError: If any :class:`Redactable` instances remain
            (``count`` is the number of wrappers), or if the raw scan finds
            PII patterns (``count`` is the number of scan hits).

    Example::

        assert_redacted(event, context="export_to_otlp", scan_raw=True)
    """
    payload = event.payload

    wrapper_total = _count_redactable(payload)
    if wrapper_total > 0:
        raise PIINotRedactedError(count=wrapper_total, context=context)

    if not (scan_raw and isinstance(payload, Mapping)):
        return

    scan = scan_payload(payload)  # type: ignore[arg-type]
    if not scan.clean:
        raise PIINotRedactedError(count=len(scan.hits), context=context)
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
# ---------------------------------------------------------------------------
|
|
538
|
+
# Internal helpers (module-private)
|
|
539
|
+
# ---------------------------------------------------------------------------
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
def _has_redactable(value: Any) -> bool:  # noqa: ANN401
    """Tell whether a Redactable occurs anywhere inside *value*.

    Descends into mapping values, lists and tuples; any other type is
    treated as a leaf.  Stops at the first Redactable found.
    """
    if isinstance(value, Redactable):
        return True
    if isinstance(value, Mapping):
        children = value.values()
    elif isinstance(value, (list, tuple)):
        children = value
    else:
        return False
    return any(_has_redactable(child) for child in children)
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def _count_redactable(value: Any, _depth: int = 0) -> int:  # noqa: ANN401
    """Tally every Redactable found inside *value*.

    Descends into mapping values, lists and tuples; any other type counts as
    zero.  ``_depth`` is threaded through the recursion but is not used to
    limit it.
    """
    if isinstance(value, Redactable):
        return 1
    if isinstance(value, Mapping):
        children = value.values()
    elif isinstance(value, (list, tuple)):
        children = value
    else:
        return 0
    return sum(_count_redactable(child, _depth + 1) for child in children)
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def _utcnow_iso() -> str:
|
|
565
|
+
"""Return current UTC time as an ISO-8601 string (same format as Event)."""
|
|
566
|
+
now = datetime.datetime.now(tz=datetime.timezone.utc)
|
|
567
|
+
return now.strftime("%Y-%m-%dT%H:%M:%S.") + f"{now.microsecond:06d}Z"
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
# ---------------------------------------------------------------------------
|
|
571
|
+
# GA-03: Deep PII scanning — regex-based detection
|
|
572
|
+
# ---------------------------------------------------------------------------
|
|
573
|
+
|
|
574
|
+
_PII_PATTERNS: Final[dict[str, re.Pattern[str]]] = {
|
|
575
|
+
"email": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}", re.ASCII),
|
|
576
|
+
"phone": re.compile(
|
|
577
|
+
r"(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
|
|
578
|
+
),
|
|
579
|
+
"ssn": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
|
|
580
|
+
"credit_card": re.compile(r"\b(?:\d[ -]?){13,19}\b"),
|
|
581
|
+
"ip_address": re.compile(
|
|
582
|
+
r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}"
|
|
583
|
+
r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b"
|
|
584
|
+
),
|
|
585
|
+
"uk_national_insurance": re.compile(
|
|
586
|
+
r"\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b",
|
|
587
|
+
re.IGNORECASE,
|
|
588
|
+
),
|
|
589
|
+
}
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
# ---------------------------------------------------------------------------
|
|
593
|
+
# GA-03-IN: India PII patterns — DPDP Act (Digital Personal Data Protection)
|
|
594
|
+
# ---------------------------------------------------------------------------
|
|
595
|
+
|
|
596
|
+
# Verhoeff checksum tables for Aadhaar validation
|
|
597
|
+
_VERHOEFF_D = (
|
|
598
|
+
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
|
|
599
|
+
(1, 2, 3, 4, 0, 6, 7, 8, 9, 5),
|
|
600
|
+
(2, 3, 4, 0, 1, 7, 8, 9, 5, 6),
|
|
601
|
+
(3, 4, 0, 1, 2, 8, 9, 5, 6, 7),
|
|
602
|
+
(4, 0, 1, 2, 3, 9, 5, 6, 7, 8),
|
|
603
|
+
(5, 9, 8, 7, 6, 0, 4, 3, 2, 1),
|
|
604
|
+
(6, 5, 9, 8, 7, 1, 0, 4, 3, 2),
|
|
605
|
+
(7, 6, 5, 9, 8, 2, 1, 0, 4, 3),
|
|
606
|
+
(8, 7, 6, 5, 9, 3, 2, 1, 0, 4),
|
|
607
|
+
(9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
|
|
608
|
+
)
|
|
609
|
+
|
|
610
|
+
_VERHOEFF_P = (
|
|
611
|
+
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
|
|
612
|
+
(1, 5, 7, 6, 2, 8, 3, 0, 9, 4),
|
|
613
|
+
(5, 8, 0, 3, 7, 9, 6, 1, 4, 2),
|
|
614
|
+
(8, 9, 1, 6, 0, 4, 3, 5, 2, 7),
|
|
615
|
+
(9, 4, 5, 3, 1, 2, 6, 8, 7, 0),
|
|
616
|
+
(4, 2, 8, 6, 5, 7, 3, 9, 0, 1),
|
|
617
|
+
(2, 7, 9, 3, 8, 0, 6, 4, 1, 5),
|
|
618
|
+
(7, 0, 4, 6, 9, 1, 3, 2, 5, 8),
|
|
619
|
+
)
|
|
620
|
+
|
|
621
|
+
_VERHOEFF_INV = (0, 4, 3, 2, 1, 5, 6, 7, 8, 9)
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def _verhoeff_check(number_str: str) -> bool:
|
|
625
|
+
"""Validate a number string using the Verhoeff checksum algorithm."""
|
|
626
|
+
digits = [int(d) for d in number_str if d.isdigit()]
|
|
627
|
+
c = 0
|
|
628
|
+
for i, d in enumerate(reversed(digits)):
|
|
629
|
+
c = _VERHOEFF_D[c][_VERHOEFF_P[i % 8][d]]
|
|
630
|
+
return c == 0
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
# India-specific identifier regexes, keyed by PII type label.
DPDP_PATTERNS: Final[dict[str, re.Pattern[str]]] = {
    # 12 digits in 4-4-4 groups (optional space/hyphen separators); the
    # first digit must be 2-9.
    "aadhaar": re.compile(r"\b[2-9]\d{3}[\s-]?\d{4}[\s-]?\d{4}\b"),
    # Five uppercase letters, four digits, one uppercase letter.
    "pan": re.compile(r"\b[A-Z]{5}\d{4}[A-Z]\b"),
}
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
@dataclass(frozen=True)
class PIIScanHit:
    """One regex-scan finding: a PII type detected at a payload location.

    Attributes:
        pii_type:     Label of the detected PII type (e.g. ``"email"``,
                      ``"ssn"``).
        path:         Dot-separated path to the field in the payload.
        match_count:  Number of matches of this type at this path
                      (defaults to 1).
        sensitivity:  Sensitivity label for the hit: ``"high"`` for
                      SSN/credit_card, ``"medium"`` for email/phone,
                      ``"low"`` for IP/NI.  Defaults to ``"medium"``.
    """

    pii_type: str
    path: str
    match_count: int = 1
    sensitivity: str = "medium"
|
|
659
|
+
|
|
660
|
+
|
|
661
|
+
_SENSITIVITY_MAP: dict[str, str] = {
|
|
662
|
+
"ssn": "high",
|
|
663
|
+
"credit_card": "high",
|
|
664
|
+
"aadhaar": "high",
|
|
665
|
+
"pan": "high",
|
|
666
|
+
"email": "medium",
|
|
667
|
+
"phone": "medium",
|
|
668
|
+
"ip_address": "low",
|
|
669
|
+
"uk_national_insurance": "low",
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
@dataclass(frozen=True)
class PIIScanResult:
    """Outcome of a deep PII scan over a payload dictionary.

    Attributes:
        hits: The :class:`PIIScanHit` instances recorded by the scan.
        scanned: How many string values were scanned.
        clean: ``True`` when the scan recorded no hits.
    """

    hits: list[PIIScanHit]
    scanned: int

    @property
    def clean(self) -> bool:
        """Return ``True`` if no PII was detected."""
        return not self.hits
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def _luhn_check(number_str: str) -> bool:
|
|
692
|
+
"""Validate a credit card number using the Luhn algorithm."""
|
|
693
|
+
digits = [int(d) for d in number_str if d.isdigit()]
|
|
694
|
+
if len(digits) < 13 or len(digits) > 19:
|
|
695
|
+
return False
|
|
696
|
+
total = 0
|
|
697
|
+
for i, d in enumerate(reversed(digits)):
|
|
698
|
+
if i % 2 == 1:
|
|
699
|
+
d *= 2
|
|
700
|
+
if d > 9:
|
|
701
|
+
d -= 9
|
|
702
|
+
total += d
|
|
703
|
+
return total % 10 == 0
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
def scan_payload(
    payload: dict[str, Any],
    *,
    extra_patterns: dict[str, re.Pattern[str]] | None = None,
    max_depth: int = 10,
) -> PIIScanResult:
    """Scan a payload dict for PII using regex detectors.

    The payload is walked recursively through mappings, lists and tuples.
    Every string value reached is tested against the built-in pattern set,
    the India ``DPDP_PATTERNS`` (Aadhaar, PAN) and any caller-supplied
    patterns. ``credit_card`` matches must also pass the Luhn check and
    ``aadhaar`` matches the Verhoeff check before they are counted.

    One :class:`PIIScanHit` is recorded per detector per string value, with
    ``match_count`` giving the number of accepted matches. Paths join
    mapping keys with ``.`` and append ``[i]`` for sequence items.

    **Security**: matched values are never returned — only the PII type, path,
    match count, and sensitivity level.

    Args:
        payload: The dictionary to scan.
        extra_patterns: Additional ``{label: compiled_regex}`` detectors. A
            label that equals an existing one replaces that detector.
        max_depth: Maximum nesting depth to scan (default 10). The payload
            itself is depth 0; anything deeper than *max_depth* is skipped.

    Returns:
        A :class:`PIIScanResult` summarising all detections.
    """
    detectors = {**_PII_PATTERNS, **DPDP_PATTERNS}
    if extra_patterns:
        detectors.update(extra_patterns)

    # Labels whose regex matches only count once a checksum confirms them.
    checksum_for = {"credit_card": _luhn_check, "aadhaar": _verhoeff_check}

    found: list[PIIScanHit] = []
    strings_seen = 0

    def _scan_text(text: str, where: str) -> None:
        """Run every detector over one string and record the hits."""
        for label, regex in detectors.items():
            accepted = list(regex.finditer(text))
            verify = checksum_for.get(label)
            if verify is not None:
                accepted = [m for m in accepted if verify(m.group())]
            if not accepted:
                continue
            found.append(
                PIIScanHit(
                    pii_type=label,
                    path=where,
                    match_count=len(accepted),
                    sensitivity=_SENSITIVITY_MAP.get(label, "medium"),
                )
            )

    def _walk(node: Any, where: str, level: int) -> None:  # noqa: ANN401
        """Visit *node*, descending into mappings, lists and tuples."""
        nonlocal strings_seen
        if level > max_depth:
            return
        if isinstance(node, str):
            strings_seen += 1
            _scan_text(node, where)
            return
        if isinstance(node, Mapping):
            for key, child in node.items():
                child_path = f"{where}.{key}" if where else str(key)
                _walk(child, child_path, level + 1)
            return
        if isinstance(node, (list, tuple)):
            for index, child in enumerate(node):
                _walk(child, f"{where}[{index}]", level + 1)

    _walk(payload, "", 0)
    return PIIScanResult(hits=found, scanned=strings_seen)
|