spanforge 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spanforge/__init__.py +815 -0
- spanforge/_ansi.py +93 -0
- spanforge/_batch_exporter.py +409 -0
- spanforge/_cli.py +2094 -0
- spanforge/_cli_audit.py +639 -0
- spanforge/_cli_compliance.py +711 -0
- spanforge/_cli_cost.py +243 -0
- spanforge/_cli_ops.py +791 -0
- spanforge/_cli_phase11.py +356 -0
- spanforge/_hooks.py +337 -0
- spanforge/_server.py +1708 -0
- spanforge/_span.py +1036 -0
- spanforge/_store.py +288 -0
- spanforge/_stream.py +664 -0
- spanforge/_trace.py +335 -0
- spanforge/_tracer.py +254 -0
- spanforge/actor.py +141 -0
- spanforge/alerts.py +469 -0
- spanforge/auto.py +464 -0
- spanforge/baseline.py +335 -0
- spanforge/cache.py +635 -0
- spanforge/compliance.py +325 -0
- spanforge/config.py +532 -0
- spanforge/consent.py +228 -0
- spanforge/consumer.py +377 -0
- spanforge/core/__init__.py +5 -0
- spanforge/core/compliance_mapping.py +1254 -0
- spanforge/cost.py +600 -0
- spanforge/debug.py +548 -0
- spanforge/deprecations.py +205 -0
- spanforge/drift.py +482 -0
- spanforge/egress.py +58 -0
- spanforge/eval.py +648 -0
- spanforge/event.py +1064 -0
- spanforge/exceptions.py +240 -0
- spanforge/explain.py +178 -0
- spanforge/export/__init__.py +69 -0
- spanforge/export/append_only.py +337 -0
- spanforge/export/cloud.py +357 -0
- spanforge/export/datadog.py +497 -0
- spanforge/export/grafana.py +320 -0
- spanforge/export/jsonl.py +195 -0
- spanforge/export/openinference.py +158 -0
- spanforge/export/otel_bridge.py +294 -0
- spanforge/export/otlp.py +811 -0
- spanforge/export/otlp_bridge.py +233 -0
- spanforge/export/redis_backend.py +282 -0
- spanforge/export/siem_schema.py +98 -0
- spanforge/export/siem_splunk.py +264 -0
- spanforge/export/siem_syslog.py +212 -0
- spanforge/export/webhook.py +299 -0
- spanforge/exporters/__init__.py +30 -0
- spanforge/exporters/console.py +271 -0
- spanforge/exporters/jsonl.py +144 -0
- spanforge/exporters/sqlite.py +142 -0
- spanforge/gate.py +1150 -0
- spanforge/governance.py +181 -0
- spanforge/hitl.py +295 -0
- spanforge/http.py +187 -0
- spanforge/inspect.py +427 -0
- spanforge/integrations/__init__.py +45 -0
- spanforge/integrations/_pricing.py +280 -0
- spanforge/integrations/anthropic.py +388 -0
- spanforge/integrations/azure_openai.py +133 -0
- spanforge/integrations/bedrock.py +292 -0
- spanforge/integrations/crewai.py +251 -0
- spanforge/integrations/gemini.py +351 -0
- spanforge/integrations/groq.py +442 -0
- spanforge/integrations/langchain.py +349 -0
- spanforge/integrations/langgraph.py +306 -0
- spanforge/integrations/llamaindex.py +373 -0
- spanforge/integrations/ollama.py +287 -0
- spanforge/integrations/openai.py +368 -0
- spanforge/integrations/together.py +483 -0
- spanforge/io.py +214 -0
- spanforge/lint.py +322 -0
- spanforge/metrics.py +417 -0
- spanforge/metrics_export.py +343 -0
- spanforge/migrate.py +402 -0
- spanforge/model_registry.py +278 -0
- spanforge/models.py +389 -0
- spanforge/namespaces/__init__.py +254 -0
- spanforge/namespaces/audit.py +256 -0
- spanforge/namespaces/cache.py +237 -0
- spanforge/namespaces/chain.py +77 -0
- spanforge/namespaces/confidence.py +72 -0
- spanforge/namespaces/consent.py +92 -0
- spanforge/namespaces/cost.py +179 -0
- spanforge/namespaces/decision.py +143 -0
- spanforge/namespaces/diff.py +157 -0
- spanforge/namespaces/drift.py +80 -0
- spanforge/namespaces/eval_.py +251 -0
- spanforge/namespaces/feedback.py +241 -0
- spanforge/namespaces/fence.py +193 -0
- spanforge/namespaces/guard.py +105 -0
- spanforge/namespaces/hitl.py +91 -0
- spanforge/namespaces/latency.py +72 -0
- spanforge/namespaces/prompt.py +190 -0
- spanforge/namespaces/redact.py +173 -0
- spanforge/namespaces/retrieval.py +379 -0
- spanforge/namespaces/runtime_governance.py +494 -0
- spanforge/namespaces/template.py +208 -0
- spanforge/namespaces/tool_call.py +77 -0
- spanforge/namespaces/trace.py +1029 -0
- spanforge/normalizer.py +171 -0
- spanforge/plugins.py +82 -0
- spanforge/presidio_backend.py +349 -0
- spanforge/processor.py +258 -0
- spanforge/prompt_registry.py +418 -0
- spanforge/py.typed +0 -0
- spanforge/redact.py +914 -0
- spanforge/regression.py +192 -0
- spanforge/runtime_policy.py +159 -0
- spanforge/sampling.py +511 -0
- spanforge/schema.py +183 -0
- spanforge/schemas/v1.0/schema.json +170 -0
- spanforge/schemas/v2.0/schema.json +536 -0
- spanforge/sdk/__init__.py +625 -0
- spanforge/sdk/_base.py +584 -0
- spanforge/sdk/_base.pyi +71 -0
- spanforge/sdk/_exceptions.py +1096 -0
- spanforge/sdk/_types.py +2184 -0
- spanforge/sdk/alert.py +1514 -0
- spanforge/sdk/alert.pyi +56 -0
- spanforge/sdk/audit.py +1196 -0
- spanforge/sdk/audit.pyi +67 -0
- spanforge/sdk/cec.py +1215 -0
- spanforge/sdk/cec.pyi +37 -0
- spanforge/sdk/config.py +641 -0
- spanforge/sdk/config.pyi +55 -0
- spanforge/sdk/enterprise.py +714 -0
- spanforge/sdk/enterprise.pyi +79 -0
- spanforge/sdk/explain.py +170 -0
- spanforge/sdk/fallback.py +432 -0
- spanforge/sdk/feedback.py +351 -0
- spanforge/sdk/gate.py +874 -0
- spanforge/sdk/gate.pyi +51 -0
- spanforge/sdk/identity.py +2114 -0
- spanforge/sdk/identity.pyi +47 -0
- spanforge/sdk/lineage.py +175 -0
- spanforge/sdk/observe.py +1065 -0
- spanforge/sdk/observe.pyi +50 -0
- spanforge/sdk/operator.py +338 -0
- spanforge/sdk/pii.py +1473 -0
- spanforge/sdk/pii.pyi +119 -0
- spanforge/sdk/pipelines.py +458 -0
- spanforge/sdk/pipelines.pyi +39 -0
- spanforge/sdk/policy.py +930 -0
- spanforge/sdk/rag.py +594 -0
- spanforge/sdk/rbac.py +280 -0
- spanforge/sdk/registry.py +430 -0
- spanforge/sdk/registry.pyi +46 -0
- spanforge/sdk/scope.py +279 -0
- spanforge/sdk/secrets.py +293 -0
- spanforge/sdk/secrets.pyi +25 -0
- spanforge/sdk/security.py +560 -0
- spanforge/sdk/security.pyi +57 -0
- spanforge/sdk/trust.py +472 -0
- spanforge/sdk/trust.pyi +41 -0
- spanforge/secrets.py +799 -0
- spanforge/signing.py +1179 -0
- spanforge/stats.py +100 -0
- spanforge/stream.py +560 -0
- spanforge/testing.py +378 -0
- spanforge/testing_mocks.py +1052 -0
- spanforge/trace.py +199 -0
- spanforge/types.py +696 -0
- spanforge/ulid.py +300 -0
- spanforge/validate.py +379 -0
- spanforge-1.0.0.dist-info/METADATA +1509 -0
- spanforge-1.0.0.dist-info/RECORD +174 -0
- spanforge-1.0.0.dist-info/WHEEL +4 -0
- spanforge-1.0.0.dist-info/entry_points.txt +5 -0
- spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
spanforge/sdk/pii.py
ADDED
|
@@ -0,0 +1,1473 @@
|
|
|
1
|
+
"""spanforge.sdk.pii — SpanForge sf-pii client.
|
|
2
|
+
|
|
3
|
+
Implements the full sf-pii API surface for Phase 3 (PII Service Hardening) of
|
|
4
|
+
the SpanForge roadmap, extending the Phase 2 foundation.
|
|
5
|
+
|
|
6
|
+
All operations run locally in-process (zero external dependencies) when
|
|
7
|
+
``config.endpoint`` is empty or when the remote service is unreachable and
|
|
8
|
+
``local_fallback_enabled`` is ``True``.
|
|
9
|
+
|
|
10
|
+
Local-mode feature parity
|
|
11
|
+
--------------------------
|
|
12
|
+
* :meth:`scan` — deep regex PII scan (dict payload).
|
|
13
|
+
* :meth:`scan_text` — Presidio-backed text scan (PII-001).
|
|
14
|
+
* :meth:`anonymise` — recursive dict anonymisation (PII-002).
|
|
15
|
+
* :meth:`scan_batch` — async parallel text scan (PII-003).
|
|
16
|
+
* :meth:`apply_pipeline_action` — pii_action routing hook (PII-010/011/012).
|
|
17
|
+
* :meth:`get_status` — sf_pii status contribution (PII-005).
|
|
18
|
+
* :meth:`redact` — apply RedactionPolicy to an event.
|
|
19
|
+
* :meth:`contains_pii` — check for unredacted PII.
|
|
20
|
+
* :meth:`assert_redacted` — raise if unredacted PII found.
|
|
21
|
+
* :meth:`anonymize` — replace PII in raw text strings.
|
|
22
|
+
* :meth:`wrap` — Redactable factory.
|
|
23
|
+
* :meth:`make_policy` — RedactionPolicy factory.
|
|
24
|
+
* :meth:`erase_subject` — GDPR Article 17 erasure (PII-021).
|
|
25
|
+
* :meth:`export_subject_data` — CCPA DSAR export (PII-022).
|
|
26
|
+
* :meth:`safe_harbor_deidentify` — HIPAA Safe Harbor (PII-023).
|
|
27
|
+
* :meth:`audit_training_data` — EU AI Act Article 10 audit (PII-025).
|
|
28
|
+
* :meth:`get_pii_stats` — PII heat map data (PII-032).
|
|
29
|
+
|
|
30
|
+
Security requirements
|
|
31
|
+
---------------------
|
|
32
|
+
* Scan and anonymize results **never** include matched PII values — only
|
|
33
|
+
type labels, field paths, counts, and anonymized replacement text.
|
|
34
|
+
* :exc:`~spanforge.sdk._exceptions.SFPIINotRedactedError` messages never
|
|
35
|
+
contain raw PII; context strings are SHA-256-hashed before inclusion.
|
|
36
|
+
* ``SecretStr`` API keys are never written to logs.
|
|
37
|
+
* Redaction manifest entries hash original values with SHA-256; raw values
|
|
38
|
+
are never stored.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
from __future__ import annotations
|
|
42
|
+
|
|
43
|
+
import asyncio
|
|
44
|
+
import concurrent.futures
|
|
45
|
+
import datetime
|
|
46
|
+
import hashlib
|
|
47
|
+
import json
|
|
48
|
+
import re
|
|
49
|
+
import time
|
|
50
|
+
import uuid
|
|
51
|
+
from pathlib import Path
|
|
52
|
+
from typing import TYPE_CHECKING, Any
|
|
53
|
+
|
|
54
|
+
from spanforge.sdk._base import SFClientConfig, SFServiceClient
|
|
55
|
+
from spanforge.sdk._exceptions import (
|
|
56
|
+
SFPIIBlockedError,
|
|
57
|
+
SFPIIError,
|
|
58
|
+
SFPIINotRedactedError,
|
|
59
|
+
SFPIIPolicyError,
|
|
60
|
+
SFPIIScanError,
|
|
61
|
+
)
|
|
62
|
+
from spanforge.sdk._types import (
|
|
63
|
+
DSARExport,
|
|
64
|
+
ErasureReceipt,
|
|
65
|
+
PIIAnonymisedResult,
|
|
66
|
+
PIIEntity,
|
|
67
|
+
PIIHeatMapEntry,
|
|
68
|
+
PIIPipelineResult,
|
|
69
|
+
PIIRedactionManifestEntry,
|
|
70
|
+
PIIStatusInfo,
|
|
71
|
+
PIITextScanResult,
|
|
72
|
+
SafeHarborResult,
|
|
73
|
+
SFPIIAnonymizeResult,
|
|
74
|
+
SFPIIHit,
|
|
75
|
+
SFPIIRedactResult,
|
|
76
|
+
SFPIIScanResult,
|
|
77
|
+
TrainingDataPIIReport,
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
if TYPE_CHECKING:
|
|
81
|
+
from spanforge.event import Event
|
|
82
|
+
from spanforge.redact import Redactable, RedactionPolicy
|
|
83
|
+
|
|
84
|
+
__all__ = ["SFPIIClient"]
|
|
85
|
+
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
# Valid sensitivity levels — mirrors spanforge.redact.Sensitivity enum values
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
_VALID_SENSITIVITY: frozenset[str] = frozenset({"low", "medium", "high", "pii", "phi"})
|
|
91
|
+
|
|
92
|
+
# Validation labels for which secondary validators are applied in anonymize()
|
|
93
|
+
_CC_LABEL = "credit_card"
|
|
94
|
+
_AADHAAR_LABEL = "aadhaar"
|
|
95
|
+
_SSN_LABEL = "ssn"
|
|
96
|
+
_DOB_LABEL = "date_of_birth"
|
|
97
|
+
|
|
98
|
+
# ---------------------------------------------------------------------------
|
|
99
|
+
# Phase 3 constants
|
|
100
|
+
# ---------------------------------------------------------------------------
|
|
101
|
+
|
|
102
|
+
#: Default confidence threshold for pipeline action routing (PII-011).
|
|
103
|
+
_DEFAULT_PIPELINE_THRESHOLD: float = 0.85
|
|
104
|
+
|
|
105
|
+
#: Valid pipeline action values (PII-010).
|
|
106
|
+
_VALID_PIPELINE_ACTIONS: frozenset[str] = frozenset({"flag", "redact", "block"})
|
|
107
|
+
|
|
108
|
+
#: DPDP-regulated entity type labels (India DPDP Act).
|
|
109
|
+
_DPDP_ENTITY_TYPES: frozenset[str] = frozenset({"aadhaar", "pan"})
|
|
110
|
+
|
|
111
|
+
#: PIPL-sensitive entity type labels (China PIPL).
|
|
112
|
+
_PIPL_ENTITY_TYPES: frozenset[str] = frozenset({"cn_national_id", "cn_mobile", "cn_bank_card"})
|
|
113
|
+
|
|
114
|
+
# ---------------------------------------------------------------------------
|
|
115
|
+
# HIPAA Safe Harbor — 18 PHI identifier patterns (45 CFR §164.514(b)(2))
|
|
116
|
+
# ---------------------------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
#: Mapping of PHI identifier label → compiled regex for Safe Harbor de-identification.
|
|
119
|
+
_SAFE_HARBOR_PATTERNS: dict[str, re.Pattern[str]] = {
|
|
120
|
+
# 1. Names
|
|
121
|
+
"name": re.compile(
|
|
122
|
+
r"\b(?:Mr\.?|Mrs\.?|Ms\.?|Dr\.?|Prof\.?)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b"
|
|
123
|
+
r"|\b[A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\b",
|
|
124
|
+
),
|
|
125
|
+
# 2. Geographic subdivisions smaller than state — zip codes
|
|
126
|
+
"zip": re.compile(r"\b(\d{5})(?:-\d{4})?\b"),
|
|
127
|
+
# 3. Dates (other than year)
|
|
128
|
+
"date": re.compile(
|
|
129
|
+
r"\b(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12]\d|3[01])[/-](?:19|20)\d{2}\b"
|
|
130
|
+
r"|\b(?:0?[1-9]|[12]\d|3[01])[/-](?:0?[1-9]|1[0-2])[/-](?:19|20)\d{2}\b"
|
|
131
|
+
r"|\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?"
|
|
132
|
+
r"|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)"
|
|
133
|
+
r"\s+(?:0?[1-9]|[12]\d|3[01]),?\s+(?:19|20)\d{2}\b",
|
|
134
|
+
re.IGNORECASE,
|
|
135
|
+
),
|
|
136
|
+
# 4. Phone numbers
|
|
137
|
+
"phone": re.compile(r"(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"),
|
|
138
|
+
# 5. Fax numbers — same pattern as phone
|
|
139
|
+
"fax": re.compile(r"(?i)fax\s*:?\s*(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"),
|
|
140
|
+
# 6. Email addresses
|
|
141
|
+
"email": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}", re.ASCII),
|
|
142
|
+
# 7. Social security numbers
|
|
143
|
+
"ssn": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
|
|
144
|
+
# 8. Medical record numbers
|
|
145
|
+
"medical_record": re.compile(r"\bMRN?[\s#:]\s*\d{6,10}\b", re.IGNORECASE),
|
|
146
|
+
# 9. Health plan beneficiary numbers
|
|
147
|
+
"health_plan": re.compile(r"\b(?:HP|HB)[\s#:]\s*\d{6,12}\b", re.IGNORECASE),
|
|
148
|
+
# 10. Account numbers
|
|
149
|
+
"account": re.compile(r"\b(?:Acct?|Account)[\s#:.]\s*\d{6,16}\b", re.IGNORECASE),
|
|
150
|
+
# 11. Certificate/license numbers
|
|
151
|
+
"license": re.compile(r"\bLIC(?:ENSE)?[\s#:]\s*[A-Z0-9]{5,15}\b", re.IGNORECASE),
|
|
152
|
+
# 12. Vehicle identifiers (VIN)
|
|
153
|
+
"vin": re.compile(r"\b[A-HJ-NPR-Z0-9]{17}\b"),
|
|
154
|
+
# 13. Device identifiers (serial numbers — heuristic)
|
|
155
|
+
"device_serial": re.compile(r"\b(?:S/N|SN|Serial)[\s#:]\s*[A-Z0-9]{8,20}\b", re.IGNORECASE),
|
|
156
|
+
# 14. Web URLs
|
|
157
|
+
"url": re.compile(r"https?://[^\s\"'<>]{4,}", re.IGNORECASE),
|
|
158
|
+
# 15. IP addresses
|
|
159
|
+
"ip_address": re.compile(
|
|
160
|
+
r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}"
|
|
161
|
+
r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b"
|
|
162
|
+
),
|
|
163
|
+
# 16. Biometric identifiers — fingerprint reference IDs (heuristic)
|
|
164
|
+
"biometric": re.compile(r"\b(?:FP|BIO)[\s#:]\s*[A-Z0-9]{8,20}\b", re.IGNORECASE),
|
|
165
|
+
# 17. Full face photos — placeholder (cannot regex-detect images)
|
|
166
|
+
# 18. Age > 89 — handled in safe_harbor_deidentify() as post-processing
|
|
167
|
+
"age_over_89": re.compile(
|
|
168
|
+
r"\b(9[0-9]|1[0-9]{2})\s*(?:years?(?:\s+old)?|yo|y/o)\b", re.IGNORECASE
|
|
169
|
+
),
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
class SFPIIClient(SFServiceClient):
|
|
174
|
+
"""SpanForge PII redaction service client.
|
|
175
|
+
|
|
176
|
+
Provides scanning, redaction, containment checks, and text anonymization.
|
|
177
|
+
All operations run in-process when no ``endpoint`` is configured (local
|
|
178
|
+
mode) or when the remote service is unavailable and
|
|
179
|
+
``local_fallback_enabled`` is ``True``.
|
|
180
|
+
|
|
181
|
+
Args:
|
|
182
|
+
config: Client configuration. Use :class:`~spanforge.sdk._base.SFClientConfig`
|
|
183
|
+
or :func:`~spanforge.sdk._base.SFClientConfig.from_env`.
|
|
184
|
+
|
|
185
|
+
Example::
|
|
186
|
+
|
|
187
|
+
from spanforge.sdk import sf_pii
|
|
188
|
+
|
|
189
|
+
# Scan a payload for PII
|
|
190
|
+
result = sf_pii.scan({"message": "Call me on 555-867-5309"})
|
|
191
|
+
if not result.clean:
|
|
192
|
+
for hit in result.hits:
|
|
193
|
+
print(hit.pii_type, hit.path, hit.match_count)
|
|
194
|
+
|
|
195
|
+
# Anonymize raw text
|
|
196
|
+
anon = sf_pii.anonymize("My email is alice@example.com")
|
|
197
|
+
print(anon.text) # "My email is [REDACTED:email]"
|
|
198
|
+
"""
|
|
199
|
+
|
|
200
|
+
def __init__(self, config: SFClientConfig) -> None:
    """Initialise the client and register it under the ``"pii"`` service name.

    Args:
        config: Client configuration, forwarded unchanged to the base-class
            initialiser.
    """
    super().__init__(config, service_name="pii")
    #: ISO-8601 timestamp of the most recent scan_text() call; None until first call.
    self._last_scan_at: str | None = None
|
|
204
|
+
|
|
205
|
+
# ------------------------------------------------------------------
|
|
206
|
+
# scan
|
|
207
|
+
# ------------------------------------------------------------------
|
|
208
|
+
|
|
209
|
+
def scan(
    self,
    payload: dict[str, Any],
    *,
    extra_patterns: dict[str, re.Pattern[str]] | None = None,
    max_depth: int = 10,
) -> SFPIIScanResult:
    """Scan a dict *payload* for PII and report what was found.

    This method only validates the argument type and picks a backend; the
    detection work is done by :meth:`_scan_local` or :meth:`_scan_remote`.
    The in-process backend is chosen when the client is in local mode **or**
    ``local_fallback_enabled`` is set on the config; otherwise the request
    goes to the remote service.

    Security: hits carry a type label, field path, match count and
    sensitivity level — never the matched value itself.

    Args:
        payload: Dictionary to scan. Must be a :class:`dict`.
        extra_patterns: Optional ``{label: compiled_regex}`` detectors to
            use in addition to the built-in ones.
        max_depth: Maximum nesting depth to descend into (default 10).

    Returns:
        :class:`~spanforge.sdk._types.SFPIIScanResult`.

    Raises:
        SFPIIScanError: If *payload* is not a ``dict`` or the scan fails.
        SFServiceUnavailableError: Circuit breaker open, fallback disabled.
    """
    # Guard clause: reject anything that is not a mapping up front.
    if not isinstance(payload, dict):
        msg = f"scan() requires a dict payload; got {type(payload).__name__}"
        raise SFPIIScanError(msg)

    run_in_process = self._is_local_mode() or self._config.local_fallback_enabled
    backend = self._scan_local if run_in_process else self._scan_remote
    return backend(payload, extra_patterns=extra_patterns, max_depth=max_depth)
|
|
246
|
+
|
|
247
|
+
def _scan_local(
    self,
    payload: dict[str, Any],
    *,
    extra_patterns: dict[str, re.Pattern[str]] | None,
    max_depth: int,
) -> SFPIIScanResult:
    """Run the scan in-process and convert the hits to SDK types.

    Uses the Presidio backend when it reports itself available, otherwise
    the regex scanner from :mod:`spanforge.redact`.  The Presidio call does
    not receive *extra_patterns*, so when the caller supplied any, a second
    regex pass is run and only the hits whose type is one of the caller's
    labels are appended to the Presidio result.

    Args:
        payload: Dictionary to scan.
        extra_patterns: Optional ``{label: compiled_regex}`` detectors.
        max_depth: Maximum nesting depth passed to the scanners.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIScanResult` built from the
        backend's hits and its ``scanned`` counter.

    Raises:
        SFPIIScanError: If a scanner raises :class:`RecursionError`.
    """
    from spanforge.presidio_backend import is_available as _presidio_available
    from spanforge.presidio_backend import presidio_scan_payload
    from spanforge.redact import scan_payload

    try:
        if not _presidio_available():
            # Regex-only path: built-in and caller patterns in a single pass.
            outcome = scan_payload(payload, extra_patterns=extra_patterns, max_depth=max_depth)
        else:
            outcome = presidio_scan_payload(
                payload, max_depth=max_depth
            )
            if extra_patterns:
                # Supplementary regex pass; keep only caller-labelled hits so
                # built-in detections are not reported twice.
                regex_pass = scan_payload(
                    payload, extra_patterns=extra_patterns, max_depth=max_depth
                )
                caller_hits = [
                    hit for hit in regex_pass.hits if hit.pii_type in extra_patterns
                ]
                if caller_hits:
                    from spanforge.redact import PIIScanResult

                    outcome = PIIScanResult(
                        hits=outcome.hits + caller_hits,
                        scanned=outcome.scanned,
                    )
    except RecursionError as exc:
        raise SFPIIScanError(str(exc)) from exc

    # Re-wrap backend hits as SDK hit objects (label, path, count, level).
    converted = []
    for hit in outcome.hits:
        converted.append(
            SFPIIHit(
                pii_type=hit.pii_type,
                path=hit.path,
                match_count=hit.match_count,
                sensitivity=hit.sensitivity,
            )
        )
    return SFPIIScanResult(hits=converted, scanned=outcome.scanned)
|
|
292
|
+
|
|
293
|
+
def _scan_remote(
    self,
    payload: dict[str, Any],
    *,
    extra_patterns: dict[str, re.Pattern[str]] | None,
    max_depth: int,
) -> SFPIIScanResult:
    """Scan *payload* via the remote service and convert the response.

    The request body carries only the payload and ``max_depth``; compiled
    regex objects in *extra_patterns* are not sent.  So that caller-supplied
    detectors are not silently ignored, a regex pass is run in-process for
    them and the hits whose type is one of the caller's labels are appended
    to the remote hits — the same merge that :meth:`_scan_local` performs
    for its Presidio branch.

    Args:
        payload: Dictionary to scan.
        extra_patterns: Optional ``{label: compiled_regex}`` detectors,
            evaluated in-process.
        max_depth: Maximum nesting depth; sent to the service and used for
            the in-process pass.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIScanResult`; ``scanned`` is the
        count reported by the remote service (0 when absent).

    Raises:
        SFPIIScanError: If the in-process pass raises
            :class:`RecursionError`.
    """
    body: dict[str, Any] = {"payload": payload, "max_depth": max_depth}
    raw = self._request("POST", "/pii/scan", body=body)
    # ``or`` guards tolerate an explicit null for "hits"/"scanned".
    hits = [
        SFPIIHit(
            pii_type=str(h.get("pii_type", "")),
            path=str(h.get("path", "")),
            match_count=int(h.get("match_count", 1)),
            sensitivity=str(h.get("sensitivity", "medium")),
        )
        for h in (raw.get("hits") or [])
    ]

    if extra_patterns:
        from spanforge.redact import scan_payload

        try:
            extra_result = scan_payload(
                payload, extra_patterns=extra_patterns, max_depth=max_depth
            )
        except RecursionError as exc:
            raise SFPIIScanError(str(exc)) from exc
        # Keep only caller-labelled hits; built-ins are the service's job.
        hits.extend(
            SFPIIHit(
                pii_type=h.pii_type,
                path=h.path,
                match_count=h.match_count,
                sensitivity=h.sensitivity,
            )
            for h in extra_result.hits
            if h.pii_type in extra_patterns
        )

    return SFPIIScanResult(hits=hits, scanned=int(raw.get("scanned") or 0))
|
|
312
|
+
|
|
313
|
+
# ------------------------------------------------------------------
|
|
314
|
+
# redact
|
|
315
|
+
# ------------------------------------------------------------------
|
|
316
|
+
|
|
317
|
+
def redact(
    self,
    event: Event,
    *,
    policy: RedactionPolicy | None = None,
) -> SFPIIRedactResult:
    """Redact *event* according to *policy* and return the outcome.

    Dispatches to :meth:`_redact_local` when the client is in local mode or
    ``local_fallback_enabled`` is set, and to :meth:`_redact_remote`
    otherwise.  Fields wrapped in :class:`~spanforge.redact.Redactable`
    whose sensitivity meets the policy threshold are replaced with marker
    strings (e.g. ``"[REDACTED:pii]"``); the input event is **not** mutated.

    Args:
        event: The :class:`~spanforge.event.Event` to redact.
        policy: :class:`~spanforge.redact.RedactionPolicy` to apply.  When
            ``None`` the backends use
            ``RedactionPolicy(redacted_by="policy:sf-pii")``.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIRedactResult` holding the
        sanitised event and redaction statistics.

    Raises:
        SFServiceUnavailableError: Circuit breaker open, fallback disabled.
    """
    run_in_process = self._is_local_mode() or self._config.local_fallback_enabled
    backend = self._redact_local if run_in_process else self._redact_remote
    return backend(event, policy=policy)
|
|
347
|
+
|
|
348
|
+
def _redact_local(
    self,
    event: Event,
    *,
    policy: RedactionPolicy | None,
) -> SFPIIRedactResult:
    """Apply the redaction policy in-process.

    Args:
        event: Event to redact.
        policy: Policy to apply; ``None`` selects
            ``RedactionPolicy(redacted_by="policy:sf-pii")``.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIRedactResult` copied field by
        field from the policy's own result object.
    """
    from spanforge.redact import RedactionPolicy

    if policy is None:
        active_policy = RedactionPolicy(redacted_by="policy:sf-pii")
    else:
        active_policy = policy

    applied = active_policy.apply(event)
    return SFPIIRedactResult(
        event=applied.event,
        redaction_count=applied.redaction_count,
        redacted_at=applied.redacted_at,
        redacted_by=applied.redacted_by,
    )
|
|
364
|
+
|
|
365
|
+
def _redact_remote(
    self,
    event: Event,
    *,
    policy: RedactionPolicy | None,
) -> SFPIIRedactResult:
    """Ask the remote service to redact and convert its response.

    Args:
        event: Event to redact.
        policy: Policy whose threshold and ``redacted_by`` label are sent;
            ``None`` selects ``RedactionPolicy(redacted_by="policy:sf-pii")``.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIRedactResult` populated from the
        response, with defaults for any missing key.
    """
    from spanforge.redact import RedactionPolicy, Sensitivity

    if policy is None:
        active_policy = RedactionPolicy(redacted_by="policy:sf-pii")
    else:
        active_policy = policy

    # Send the enum's value when the threshold is a Sensitivity member,
    # otherwise its string form.
    if isinstance(active_policy.min_sensitivity, Sensitivity):
        threshold = active_policy.min_sensitivity.value
    else:
        threshold = str(active_policy.min_sensitivity)

    # NOTE(review): *event* is never placed in the request body, so the
    # service receives only the policy settings — confirm how the remote
    # side is expected to obtain the event to redact.
    body: dict[str, Any] = {
        "min_sensitivity": threshold,
        "redacted_by": active_policy.redacted_by,
    }
    raw = self._request("POST", "/pii/redact", body=body)

    redacted_event = raw.get("event")
    count = int(raw.get("redaction_count", 0))
    stamp = str(raw.get("redacted_at", ""))
    author = str(raw.get("redacted_by", active_policy.redacted_by))
    return SFPIIRedactResult(
        event=redacted_event,
        redaction_count=count,
        redacted_at=stamp,
        redacted_by=author,
    )
|
|
387
|
+
|
|
388
|
+
# ------------------------------------------------------------------
|
|
389
|
+
# contains_pii
|
|
390
|
+
# ------------------------------------------------------------------
|
|
391
|
+
|
|
392
|
+
def contains_pii(
    self,
    event: Event,
    *,
    scan_raw: bool = True,
) -> bool:
    """Report whether *event* still holds unredacted PII.

    In local mode, or when ``local_fallback_enabled`` is set, the check is
    delegated to :func:`spanforge.redact.contains_pii`.  Otherwise the
    remote ``/pii/contains`` endpoint is queried and its ``contains_pii``
    field is coerced to ``bool`` (``False`` when the field is absent).

    Args:
        event: The :class:`~spanforge.event.Event` to inspect.
        scan_raw: When ``True`` (default), raw string values are also run
            through the regex detectors, not just
            :class:`~spanforge.redact.Redactable` wrappers.

    Returns:
        ``True`` if PII is detected; ``False`` if the payload is clean.
    """
    if not (self._is_local_mode() or self._config.local_fallback_enabled):
        # NOTE(review): only the scan_raw flag is sent — the event itself is
        # not part of the request body; confirm against the service contract.
        response = self._request("POST", "/pii/contains", body={"scan_raw": scan_raw})
        return bool(response.get("contains_pii", False))

    from spanforge.redact import contains_pii as _cp

    return _cp(event, scan_raw=scan_raw)
|
|
418
|
+
|
|
419
|
+
# ------------------------------------------------------------------
|
|
420
|
+
# assert_redacted
|
|
421
|
+
# ------------------------------------------------------------------
|
|
422
|
+
|
|
423
|
+
def assert_redacted(
    self,
    event: Event,
    *,
    context: str = "",
    scan_raw: bool = True,
) -> None:
    """Fail loudly when *event* still contains unredacted PII.

    The raising counterpart of :meth:`contains_pii`, intended for export or
    serialisation boundaries.  In local mode, or when
    ``local_fallback_enabled`` is set, the check runs through
    :meth:`_assert_redacted_local`; otherwise the remote
    ``/pii/assert-redacted`` endpoint is queried and a truthy ``has_pii``
    field triggers the error (with ``count`` defaulting to 1).

    Args:
        event: The :class:`~spanforge.event.Event` to verify.
        context: Optional call-site label for correlation (SHA-256-hashed
            before use — never included raw).
        scan_raw: When ``True`` (default), also run regex scanning.

    Raises:
        SFPIINotRedactedError: If unredacted PII is detected.
    """
    if not (self._is_local_mode() or self._config.local_fallback_enabled):
        # NOTE(review): only the scan_raw flag is sent — the event itself is
        # not part of the request body; confirm against the service contract.
        verdict = self._request(
            "POST",
            "/pii/assert-redacted",
            body={"scan_raw": scan_raw},
        )
        if verdict.get("has_pii"):
            raise SFPIINotRedactedError(int(verdict.get("count", 1)), context)
        return

    self._assert_redacted_local(event, context=context, scan_raw=scan_raw)
|
|
455
|
+
|
|
456
|
+
def _assert_redacted_local(
    self,
    event: Event,
    *,
    context: str,
    scan_raw: bool,
) -> None:
    """Run the in-process redaction assertion and translate its error.

    Calls :func:`spanforge.redact.assert_redacted` and converts its
    ``PIINotRedactedError`` into the SDK's :exc:`SFPIINotRedactedError`,
    carrying over the hit count and chaining the original exception.

    Args:
        event: Event to verify.
        context: Call-site label passed to both the check and the error.
        scan_raw: Whether raw strings are regex-scanned as well.

    Raises:
        SFPIINotRedactedError: If the underlying check finds PII.
    """
    import spanforge.redact as _redact

    # Resolve both names before the call, as a from-import would.
    core_error = _redact.PIINotRedactedError
    core_check = _redact.assert_redacted

    try:
        core_check(event, context, scan_raw=scan_raw)
    except core_error as leak:
        raise SFPIINotRedactedError(leak.count, context) from leak
|
|
469
|
+
|
|
470
|
+
# ------------------------------------------------------------------
|
|
471
|
+
# anonymize
|
|
472
|
+
# ------------------------------------------------------------------
|
|
473
|
+
|
|
474
|
+
def anonymize(
    self,
    text: str,
    *,
    extra_patterns: dict[str, re.Pattern[str]] | None = None,
) -> SFPIIAnonymizeResult:
    """Replace all detected PII in *text* with type-tagged markers.

    Each confirmed match is replaced with ``[REDACTED:<pii_type>]``.  In
    local mode, or when ``local_fallback_enabled`` is set, the work is done
    by :meth:`_anonymize_local`.  Otherwise the text is sent to the remote
    ``/pii/anonymize`` endpoint.

    The remote request carries only the text, so caller-supplied
    *extra_patterns* are applied afterwards by running
    :meth:`_anonymize_local` over the remotely anonymized text; its
    replacement count and any new type labels are merged into the result.
    (That local pass also re-runs the built-in patterns, which can only add
    redactions.)

    Security: the original matched values are **never** returned — only
    the anonymized text, replacement count, and a list of PII type labels.

    Args:
        text: Plain text string to anonymize.
        extra_patterns: Optional ``{label: compiled_regex}`` detectors to
            run in addition to the built-in patterns.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIAnonymizeResult`.

    Raises:
        SFPIIScanError: If *text* is not a ``str``.
    """
    if not isinstance(text, str):
        msg = f"anonymize() requires a str; got {type(text).__name__}"
        raise SFPIIScanError(msg)
    if self._is_local_mode() or self._config.local_fallback_enabled:
        return self._anonymize_local(text, extra_patterns=extra_patterns)

    raw = self._request("POST", "/pii/anonymize", body={"text": text})
    # NOTE(review): a response without a "text" key yields the *input* text
    # unchanged (fail-open) — confirm whether that should be an error.
    anonymized = str(raw.get("text", text))
    replacements = int(raw.get("replacements", 0))
    pii_types_found = list(raw.get("pii_types_found", []))

    if extra_patterns:
        # The service never saw the caller's detectors; apply them here so
        # they are not silently dropped on the remote path.
        local_pass = self._anonymize_local(anonymized, extra_patterns=extra_patterns)
        anonymized = local_pass.text
        replacements += local_pass.replacements
        for label in local_pass.pii_types_found:
            if label not in pii_types_found:
                pii_types_found.append(label)

    return SFPIIAnonymizeResult(
        text=anonymized,
        replacements=replacements,
        pii_types_found=pii_types_found,
    )
|
|
514
|
+
|
|
515
|
+
def _anonymize_local(
    self,
    text: str,
    *,
    extra_patterns: dict[str, re.Pattern[str]] | None,
) -> SFPIIAnonymizeResult:
    """Anonymize *text* in-process with regex substitution.

    The detector table is built from ``_PII_PATTERNS`` and ``DPDP_PATTERNS``
    in :mod:`spanforge.redact` (each treated as empty when the attribute is
    missing) and then updated with *extra_patterns*, so a caller label that
    collides with a built-in one replaces it.  Patterns are applied one
    after another to the progressively rewritten text.

    Matches labelled credit card, Aadhaar, SSN or date of birth are replaced
    only if the corresponding validator from :mod:`spanforge.redact` accepts
    them; a missing validator defaults to accepting everything.

    Args:
        text: Text to anonymize.
        extra_patterns: Optional ``{label: compiled_regex}`` detectors.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIAnonymizeResult` with the
        rewritten text, the number of replacements made, and the labels
        that produced at least one replacement (in application order).
    """
    import spanforge.redact as _redact

    # Built-in detector tables; a missing attribute degrades to "no patterns".
    detectors: dict[str, re.Pattern[str]] = {
        **dict(getattr(_redact, "_PII_PATTERNS", {})),
        **dict(getattr(_redact, "DPDP_PATTERNS", {})),
    }
    if extra_patterns:
        detectors.update(extra_patterns)

    def _accept_all(_s: str) -> bool:
        return True

    # Label -> secondary validator; labels not listed are always replaced.
    validators = {
        _CC_LABEL: getattr(_redact, "_luhn_check", _accept_all),
        _AADHAAR_LABEL: getattr(_redact, "_verhoeff_check", _accept_all),
        _SSN_LABEL: getattr(_redact, "_is_valid_ssn", _accept_all),
        _DOB_LABEL: getattr(_redact, "_is_valid_date", _accept_all),
    }

    anonymized = text
    total = 0
    labels_hit: list[str] = []

    for label, pattern in detectors.items():
        validate = validators.get(label)
        marker = f"[REDACTED:{label}]"
        accepted = 0

        def _substitute(match: re.Match[str]) -> str:
            # Called synchronously by pattern.sub() below, so reading the
            # loop variables from the enclosing scope is safe here.
            nonlocal accepted
            found = match.group()
            if validate is not None and not validate(found):
                # Rejected by the validator: leave the text untouched.
                return found
            accepted += 1
            return marker

        anonymized = pattern.sub(_substitute, anonymized)
        if accepted:
            total += accepted
            labels_hit.append(label)

    return SFPIIAnonymizeResult(
        text=anonymized,
        replacements=total,
        pii_types_found=labels_hit,
    )
|
|
576
|
+
|
|
577
|
+
# ------------------------------------------------------------------
|
|
578
|
+
# wrap
|
|
579
|
+
# ------------------------------------------------------------------
|
|
580
|
+
|
|
581
|
+
def wrap(
    self,
    value: object,
    sensitivity: str,
    pii_types: frozenset[str] = frozenset(),
) -> Redactable:
    """Build a :class:`~spanforge.redact.Redactable` around *value*.

    The sensitivity string is checked against ``_VALID_SENSITIVITY`` and
    then converted to a :class:`~spanforge.redact.Sensitivity` before the
    wrapper is constructed.

    Args:
        value:       The value to protect.
        sensitivity: Sensitivity level name; must be a member of
                     ``_VALID_SENSITIVITY``.
        pii_types:   Labels describing the PII category
                     (e.g. ``frozenset({"email"})``).

    Returns:
        :class:`~spanforge.redact.Redactable` wrapping *value*.

    Raises:
        SFPIIPolicyError: If *sensitivity* is not a recognised level.

    Example::

        wrapped = sf_pii.wrap("alice@example.com", "pii", frozenset({"email"}))
    """
    from spanforge.redact import Redactable, Sensitivity

    if sensitivity in _VALID_SENSITIVITY:
        level = Sensitivity(sensitivity)
        return Redactable(value, level, pii_types)

    allowed = sorted(_VALID_SENSITIVITY)
    msg = f"Invalid sensitivity level {sensitivity!r}. Must be one of: {allowed}"
    raise SFPIIPolicyError(msg)
|
|
618
|
+
|
|
619
|
+
# ------------------------------------------------------------------
|
|
620
|
+
# make_policy
|
|
621
|
+
# ------------------------------------------------------------------
|
|
622
|
+
|
|
623
|
+
def make_policy(
    self,
    *,
    min_sensitivity: str = "pii",
    redacted_by: str = "policy:sf-pii",
    replacement_template: str = "[REDACTED:{sensitivity}]",
) -> RedactionPolicy:
    """Construct a :class:`~spanforge.redact.RedactionPolicy`.

    Both arguments that can be malformed are validated up front, in this
    order: *min_sensitivity* first, then *replacement_template*.

    Args:
        min_sensitivity:      Threshold level name; must be a member of
                              ``_VALID_SENSITIVITY``.  Defaults to ``"pii"``.
        redacted_by:          Identifier passed through to the policy.
                              Defaults to ``"policy:sf-pii"``.
        replacement_template: Marker template; must contain the literal
                              ``{sensitivity}`` placeholder.  Defaults to
                              ``"[REDACTED:{sensitivity}]"``.

    Returns:
        Configured :class:`~spanforge.redact.RedactionPolicy`.

    Raises:
        SFPIIPolicyError: If *min_sensitivity* is not recognised or
                          *replacement_template* lacks ``{sensitivity}``.

    Example::

        policy = sf_pii.make_policy(min_sensitivity="high",
                                    redacted_by="my-service")
    """
    from spanforge.redact import RedactionPolicy, Sensitivity

    problem: str | None = None
    if min_sensitivity not in _VALID_SENSITIVITY:
        allowed = sorted(_VALID_SENSITIVITY)
        problem = f"Invalid min_sensitivity {min_sensitivity!r}. Must be one of: {allowed}"
    elif "{sensitivity}" not in replacement_template:
        problem = (
            "replacement_template must contain the '{sensitivity}' placeholder; "
            f"received: {replacement_template!r}"
        )
    if problem is not None:
        raise SFPIIPolicyError(problem)

    threshold = Sensitivity(min_sensitivity)
    return RedactionPolicy(
        min_sensitivity=threshold,
        redacted_by=redacted_by,
        replacement_template=replacement_template,
    )
|
|
675
|
+
|
|
676
|
+
# ==================================================================
|
|
677
|
+
# Phase 3 — PII Service Hardening
|
|
678
|
+
# ==================================================================
|
|
679
|
+
|
|
680
|
+
# ------------------------------------------------------------------
|
|
681
|
+
# scan_text (PII-001)
|
|
682
|
+
# ------------------------------------------------------------------
|
|
683
|
+
|
|
684
|
+
def scan_text(
    self,
    text: str,
    *,
    language: str = "en",
    score_threshold: float = 0.5,
) -> PIITextScanResult:
    """Scan one plain-text string for PII (PII-001).

    Validates the input, records the current UTC time on the instance as
    ``_last_scan_at`` (ISO-8601 string), and delegates the scan to
    :meth:`_scan_text_local`, which prefers Presidio and otherwise uses the
    regex scanner.

    Reported entities carry only type, start/end offsets and a score; the
    matched values themselves are not part of the entity records.

    Args:
        text:            Plain text to scan.
        language:        Language code forwarded to the scanner
                         (default ``"en"``).
        score_threshold: Minimum confidence score forwarded to the scanner
                         (default 0.5).

    Returns:
        :class:`~spanforge.sdk._types.PIITextScanResult`.

    Raises:
        SFPIIScanError: If *text* is not a ``str``.
    """
    if isinstance(text, str):
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        self._last_scan_at = now_utc.isoformat()
        return self._scan_text_local(
            text,
            language=language,
            score_threshold=score_threshold,
        )

    msg = f"scan_text() requires a str; got {type(text).__name__}"
    raise SFPIIScanError(msg)
|
|
718
|
+
|
|
719
|
+
def _scan_text_local(
    self,
    text: str,
    *,
    language: str,
    score_threshold: float,
) -> PIITextScanResult:
    """Scan *text* with Presidio when possible, else with the regex scanner.

    The regex fallback is used when ``is_available()`` is false or when the
    Presidio path raises :class:`ImportError`.  Any other exception from the
    Presidio path propagates to the caller.

    Args:
        text:            Text to scan.
        language:        Language code passed to ``presidio_scan_text``.
        score_threshold: Minimum score passed to ``presidio_scan_text``.

    Returns:
        :class:`PIITextScanResult` from whichever backend ran.
    """
    from spanforge.presidio_backend import is_available, presidio_scan_text

    if not is_available():
        # Regex fallback: entities are synthesised from pattern matches.
        return self._scan_text_regex_fallback(text)

    try:
        found, redacted_text, detected = presidio_scan_text(
            text, language=language, score_threshold=score_threshold
        )
        converted = []
        for item in found:
            converted.append(
                PIIEntity(
                    type=item["type"],
                    start=item["start"],
                    end=item["end"],
                    score=item["score"],
                )
            )
        return PIITextScanResult(
            entities=converted,
            redacted_text=redacted_text,
            detected=detected,
        )
    except ImportError:
        # Presidio reported itself available but could not be imported.
        return self._scan_text_regex_fallback(text)
|
|
752
|
+
|
|
753
|
+
def _scan_text_regex_fallback(self, text: str) -> PIITextScanResult:
    """Regex-based fallback for scan_text() when Presidio is unavailable.

    Runs every pattern from ``spanforge.redact`` (``_PII_PATTERNS``,
    ``DPDP_PATTERNS``) and ``spanforge.presidio_backend.PIPL_PATTERNS``
    over *text*.  Matches for the credit-card, Aadhaar, SSN and
    date-of-birth labels are kept only when the corresponding secondary
    validator accepts them.  Every kept match becomes a
    :class:`PIIEntity` with ``score=1.0``.

    Args:
        text: Text to scan.

    Returns:
        :class:`PIITextScanResult` whose ``entities`` are sorted by start
        offset (all matches, including overlapping ones), whose
        ``redacted_text`` has each matched region replaced by an
        ``<TYPE>`` placeholder, and whose ``detected`` is true when at
        least one entity was found.
    """
    import spanforge.redact as _redact
    from spanforge.presidio_backend import PIPL_PATTERNS

    pii_patterns: dict[str, re.Pattern[str]] = dict(
        getattr(_redact, "_PII_PATTERNS", {}),
    )
    dpdp_patterns: dict[str, re.Pattern[str]] = dict(
        getattr(_redact, "DPDP_PATTERNS", {}),
    )
    all_patterns = {**pii_patterns, **dpdp_patterns, **PIPL_PATTERNS}

    # Secondary validators default to always-pass if internals are missing.
    _luhn = getattr(_redact, "_luhn_check", lambda _s: True)
    _verhoeff = getattr(_redact, "_verhoeff_check", lambda _s: True)
    _valid_ssn = getattr(_redact, "_is_valid_ssn", lambda _s: True)
    _valid_date = getattr(_redact, "_is_valid_date", lambda _s: True)

    entities: list[PIIEntity] = []
    for label, pat in all_patterns.items():
        for m in pat.finditer(text):
            val = m.group()
            if label == _CC_LABEL and not _luhn(val):
                continue
            if label == _AADHAAR_LABEL and not _verhoeff(val):
                continue
            if label == _SSN_LABEL and not _valid_ssn(val):
                continue
            if label == _DOB_LABEL and not _valid_date(val):
                continue
            entities.append(PIIEntity(type=label, start=m.start(), end=m.end(), score=1.0))

    entities.sort(key=lambda e: e.start)

    # Build the redacted text in one left-to-right pass over the ORIGINAL
    # offsets.  Different patterns can match overlapping spans; splicing
    # each span independently would apply stale offsets to already-modified
    # text and corrupt the output.  Overlapping spans are merged into one
    # region that gets a single placeholder (from the span that starts
    # first, longest first on ties).
    pieces: list[str] = []
    cursor = 0
    for ent in sorted(entities, key=lambda e: (e.start, -e.end)):
        if ent.start >= cursor:
            pieces.append(text[cursor : ent.start])
            pieces.append(f"<{ent.type.upper()}>")
            cursor = ent.end
        elif ent.end > cursor:
            # Overlaps the previous region: extend it so no matched
            # character survives, but do not emit a second placeholder.
            cursor = ent.end
    pieces.append(text[cursor:])
    redacted = "".join(pieces)

    return PIITextScanResult(
        entities=entities,
        redacted_text=redacted,
        detected=bool(entities),
    )
|
|
797
|
+
|
|
798
|
+
# ------------------------------------------------------------------
|
|
799
|
+
# anonymise (PII-002) — British spelling, dict input
|
|
800
|
+
# ------------------------------------------------------------------
|
|
801
|
+
|
|
802
|
+
def anonymise(
    self,
    payload: dict[str, Any],
    *,
    max_depth: int = 10,
) -> PIIAnonymisedResult:
    """Recursively anonymise the string fields of *payload* (PII-002).

    Walks nested dicts and lists via :meth:`_anonymise_walk`.  Each string
    is scanned (language ``"en"``, score threshold 0.5) and, when PII is
    detected, replaced by its redacted form with ``<TYPE>`` placeholders.
    For every detected entity a manifest entry is recorded containing the
    field path, the entity type, the SHA-256 hex digest of the original
    substring, and the placeholder used.

    Values nested deeper than *max_depth* are returned unchanged.

    Args:
        payload:   Dictionary to anonymise.  Must be a :class:`dict`.
        max_depth: Maximum nesting depth to descend into (default 10).

    Returns:
        :class:`~spanforge.sdk._types.PIIAnonymisedResult` with
        ``clean_payload`` and ``redaction_manifest``.

    Raises:
        SFPIIScanError: If *payload* is not a ``dict``.
    """
    if isinstance(payload, dict):
        entries: list[PIIRedactionManifestEntry] = []
        cleaned = self._anonymise_walk(
            payload,
            path="",
            depth=0,
            max_depth=max_depth,
            manifest=entries,
        )
        return PIIAnonymisedResult(clean_payload=cleaned, redaction_manifest=entries)

    msg = f"anonymise() requires a dict payload; got {type(payload).__name__}"
    raise SFPIIScanError(msg)
|
|
840
|
+
|
|
841
|
+
def _anonymise_walk(
    self,
    obj: Any,
    *,
    path: str,
    depth: int,
    max_depth: int,
    manifest: list[PIIRedactionManifestEntry],
) -> Any:
    """Return an anonymised copy of *obj*, appending entries to *manifest*.

    Strings are scanned with :meth:`_scan_text_local` (``"en"``, 0.5);
    dicts and lists are rebuilt with their children processed one level
    deeper; everything else is returned as-is.  Once *depth* exceeds
    *max_depth* the value is returned untouched.

    Args:
        obj:       Value to process.
        path:      Dotted/indexed path of *obj* within the root payload
                   (``""`` for the root).
        depth:     Current nesting depth.
        max_depth: Depth beyond which values are not inspected.
        manifest:  List mutated in place with one entry per detected entity.

    Returns:
        The cleaned value (new containers for dict/list, redacted text for
        strings with detections, otherwise *obj* itself).
    """
    if depth > max_depth:
        return obj

    if isinstance(obj, str):
        scan = self._scan_text_local(obj, language="en", score_threshold=0.5)
        if not scan.detected:
            return obj
        for entity in scan.entities:
            # Only a digest of the original substring is recorded.
            matched = obj[entity.start : entity.end]
            digest = hashlib.sha256(matched.encode()).hexdigest()
            entry = PIIRedactionManifestEntry(
                field_path=path,
                type=entity.type,
                original_hash=digest,
                replacement=f"<{entity.type.upper()}>",
            )
            manifest.append(entry)
        return scan.redacted_text

    if isinstance(obj, dict):
        cleaned_map = {}
        for key, child in obj.items():
            child_path = f"{path}.{key}" if path else str(key)
            cleaned_map[key] = self._anonymise_walk(
                child,
                path=child_path,
                depth=depth + 1,
                max_depth=max_depth,
                manifest=manifest,
            )
        return cleaned_map

    if isinstance(obj, list):
        cleaned_items = []
        for index, child in enumerate(obj):
            cleaned_items.append(
                self._anonymise_walk(
                    child,
                    path=f"{path}[{index}]",
                    depth=depth + 1,
                    max_depth=max_depth,
                    manifest=manifest,
                )
            )
        return cleaned_items

    return obj
|
|
892
|
+
|
|
893
|
+
# ------------------------------------------------------------------
|
|
894
|
+
# scan_batch (PII-003)
|
|
895
|
+
# ------------------------------------------------------------------
|
|
896
|
+
|
|
897
|
+
def scan_batch(
    self,
    texts: list[str],
    *,
    language: str = "en",
    score_threshold: float = 0.5,
    max_workers: int = 8,
) -> list[PIITextScanResult]:
    """Scan a list of texts for PII in parallel (PII-003).

    Each text is scanned with :meth:`_scan_text_local` on a thread pool.
    The pool never has more workers than there are texts, and a
    non-positive *max_workers* is treated as 1 instead of surfacing the
    executor's ``ValueError``.  Like :meth:`scan_text`, a non-empty batch
    records the current UTC time in ``_last_scan_at``.

    Args:
        texts:           List of plain text strings to scan.
        language:        Language code (default ``"en"``).
        score_threshold: Minimum confidence score (default 0.5).
        max_workers:     Upper bound on thread pool size (default 8).

    Returns:
        List of :class:`~spanforge.sdk._types.PIITextScanResult` in the
        same order as *texts*; ``[]`` for an empty input.

    Raises:
        SFPIIScanError: If *texts* is not a list or any element is not a
                        ``str``.
    """
    if not isinstance(texts, list):
        msg = f"scan_batch() requires a list; got {type(texts).__name__}"
        raise SFPIIScanError(msg)
    for i, t in enumerate(texts):
        if not isinstance(t, str):
            msg = f"scan_batch() element [{i}] must be str; got {type(t).__name__}"
            raise SFPIIScanError(msg)

    if not texts:
        return []

    # Keep the status timestamp in step with scan_text().
    self._last_scan_at = datetime.datetime.now(datetime.timezone.utc).isoformat()

    def _scan_one(text: str) -> PIITextScanResult:
        return self._scan_text_local(text, language=language, score_threshold=score_threshold)

    # ThreadPoolExecutor rejects max_workers <= 0, so clamp to at least 1.
    pool_size = max(1, min(max_workers, len(texts)))
    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as ex:
        # Executor.map yields results in input order.
        return list(ex.map(_scan_one, texts))
|
|
941
|
+
|
|
942
|
+
# ------------------------------------------------------------------
|
|
943
|
+
# apply_pipeline_action (PII-010 / PII-011 / PII-012)
|
|
944
|
+
# ------------------------------------------------------------------
|
|
945
|
+
|
|
946
|
+
def apply_pipeline_action(
    self,
    text: str,
    *,
    action: str = "flag",
    threshold: float = _DEFAULT_PIPELINE_THRESHOLD,
    language: str = "en",
) -> PIIPipelineResult:
    """Apply pipeline pii_action routing to *text* (PII-010/011/012).

    The text is scanned with a score threshold of 0.0 so that every
    candidate entity is seen, then the entities are split at *threshold*:

    * ``score >= threshold`` — count as detections and drive the action.
    * ``score < threshold`` — returned in ``low_confidence_hits`` only.

    Actions:

    * ``"flag"``   — the original text is returned; ``detected`` reports
      whether anything was found.
    * ``"redact"`` — the returned ``text`` is the redacted form when
      something was detected.
    * ``"block"``  — raises
      :exc:`~spanforge.sdk._exceptions.SFPIIBlockedError` when something
      was detected (HTTP 422 ``PII_DETECTED``).

    ``redacted_text`` is always built from above-threshold entities only.
    Overlapping entities are merged into one region with a single
    ``<TYPE>`` placeholder so that offsets are never applied to
    already-modified text.

    Args:
        text:      Input text to scan.
        action:    One of ``_VALID_PIPELINE_ACTIONS``.  Default ``"flag"``.
        threshold: Confidence threshold (default
                   ``_DEFAULT_PIPELINE_THRESHOLD``).
        language:  Language code forwarded to the scanner (default ``"en"``).

    Returns:
        :class:`~spanforge.sdk._types.PIIPipelineResult`.

    Raises:
        SFPIIScanError:    If *text* is not a ``str`` or *action* is
                           invalid.
        SFPIIBlockedError: If *action* is ``"block"`` and PII is detected
                           at or above *threshold*.
    """
    if not isinstance(text, str):
        msg = f"apply_pipeline_action() requires a str; got {type(text).__name__}"
        raise SFPIIScanError(msg)
    if action not in _VALID_PIPELINE_ACTIONS:
        valid = sorted(_VALID_PIPELINE_ACTIONS)
        msg = f"Invalid action {action!r}. Must be one of: {valid}"
        raise SFPIIScanError(msg)

    scan_result = self._scan_text_local(text, language=language, score_threshold=0.0)

    above = [e for e in scan_result.entities if e.score >= threshold]
    below = [e for e in scan_result.entities if e.score < threshold]
    detected = bool(above)
    entity_types = sorted({e.type for e in above})

    # Build redacted text from above-threshold entities only, in a single
    # left-to-right pass over the original offsets.  Overlapping spans are
    # merged: the first (longest on ties) supplies the placeholder and any
    # overlapping span only extends the redacted region.
    pieces: list[str] = []
    cursor = 0
    for ent in sorted(above, key=lambda e: (e.start, -e.end)):
        if ent.start >= cursor:
            pieces.append(text[cursor : ent.start])
            pieces.append(f"<{ent.type.upper()}>")
            cursor = ent.end
        elif ent.end > cursor:
            cursor = ent.end
    pieces.append(text[cursor:])
    redacted = "".join(pieces)

    if action == "block" and detected:
        raise SFPIIBlockedError(entity_types=entity_types, count=len(above))

    effective_text = redacted if action == "redact" and detected else text

    return PIIPipelineResult(
        text=effective_text,
        action=action,
        detected=detected,
        entity_types=entity_types,
        low_confidence_hits=below,
        redacted_text=redacted,
        blocked=False,
    )
|
|
1017
|
+
|
|
1018
|
+
# ------------------------------------------------------------------
|
|
1019
|
+
# scan_async (F-10)
|
|
1020
|
+
# ------------------------------------------------------------------
|
|
1021
|
+
|
|
1022
|
+
async def scan_async(
    self,
    text: str,
    *,
    language: str = "en",
    score_threshold: float = 0.5,
) -> PIITextScanResult:
    """Async variant of :meth:`scan_text` (F-10).

    Runs :meth:`scan_text` on the running event loop's default executor
    via ``loop.run_in_executor``, so the scan does not run on the event
    loop thread.

    Args:
        text:            Plain text to scan.
        language:        Language code passed to :meth:`scan_text`.
        score_threshold: Minimum confidence score passed to :meth:`scan_text`.

    Returns:
        :class:`~spanforge.sdk._types.PIITextScanResult` — same as
        :meth:`scan_text`.

    Raises:
        SFPIIScanError: Propagated from :meth:`scan_text` when *text* is
                        not a ``str``.
    """
    import functools

    # Inside a coroutine a loop is always running; get_running_loop() is
    # the documented way to obtain it.
    loop = asyncio.get_running_loop()
    call = functools.partial(
        self.scan_text, text, language=language, score_threshold=score_threshold
    )
    # ``None`` selects the loop's default executor.
    return await loop.run_in_executor(None, call)
|
|
1053
|
+
|
|
1054
|
+
# ------------------------------------------------------------------
|
|
1055
|
+
# get_status (PII-005)
|
|
1056
|
+
# ------------------------------------------------------------------
|
|
1057
|
+
|
|
1058
|
+
def get_status(self) -> PIIStatusInfo:
    """Report the sf-pii service status (PII-005).

    The result carries a fixed ``status`` of ``"ok"``, whether Presidio
    is available, the sorted union of pattern labels from
    ``spanforge.redact`` (``_PII_PATTERNS``, ``DPDP_PATTERNS``) and
    ``PIPL_PATTERNS``, and the instance's ``_last_scan_at`` value
    (``None`` when the attribute has never been set).

    Returns:
        :class:`~spanforge.sdk._types.PIIStatusInfo`.
    """
    from spanforge.presidio_backend import PIPL_PATTERNS, is_available

    presidio_available = is_available()

    import spanforge.redact as _redact

    # Missing tables in spanforge.redact are treated as empty.
    labels: set[Any] = set()
    for table_name in ("_PII_PATTERNS", "DPDP_PATTERNS"):
        table: dict[str, Any] = dict(getattr(_redact, table_name, {}))
        labels.update(table)
    labels.update(PIPL_PATTERNS)

    return PIIStatusInfo(
        status="ok",
        presidio_available=presidio_available,
        entity_types_loaded=sorted(labels),
        last_scan_at=getattr(self, "_last_scan_at", None),
    )
|
|
1083
|
+
|
|
1084
|
+
# ------------------------------------------------------------------
|
|
1085
|
+
# erase_subject (PII-021 — GDPR Article 17)
|
|
1086
|
+
# ------------------------------------------------------------------
|
|
1087
|
+
|
|
1088
|
+
def erase_subject(
    self,
    subject_id: str,
    project_id: str,
) -> ErasureReceipt:
    """Process a GDPR Article 17 erasure request for *subject_id* (PII-021).

    Generates a fresh erasure id and UTC timestamp, asks
    :meth:`_local_erase_subject` to erase matching records from the
    in-process store, and returns a receipt.  The receipt's ``exceptions``
    list is always empty here.

    Args:
        subject_id: Opaque data subject identifier.
        project_id: Project scope for the erasure.

    Returns:
        :class:`~spanforge.sdk._types.ErasureReceipt`.

    Raises:
        SFPIIError: If *subject_id* or *project_id* is empty.
    """
    if not (subject_id and project_id):
        msg = "erase_subject() requires non-empty subject_id and project_id"
        raise SFPIIError(msg)

    receipt_id = str(uuid.uuid4())
    timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()

    # Erasure is performed against the in-process store.
    erased_count = self._local_erase_subject(subject_id, project_id)

    return ErasureReceipt(
        subject_id=subject_id,
        project_id=project_id,
        records_erased=erased_count,
        erasure_id=receipt_id,
        erased_at=timestamp,
        exceptions=[],
    )
|
|
1131
|
+
|
|
1132
|
+
def _local_erase_subject(self, subject_id: str, project_id: str) -> int:
    """Attempt local store erasure; returns count of matching records.

    Iterates every event in the default ``TraceStore`` while holding the
    store lock.  For each event whose payload has both a matching
    ``subject_id`` and ``project_id``, the ``subject_id`` key is removed
    from the payload and the event is counted.

    This is best-effort: any failure (store unavailable, unexpected
    payload type, ...) yields ``0``.  The failure is no longer silent —
    the exception class is logged at debug level.  The subject identifier
    and the exception message are deliberately kept out of the log.

    Args:
        subject_id: Data subject identifier to match.
        project_id: Project identifier to match.

    Returns:
        Number of events whose ``subject_id`` was removed, or ``0`` on
        any error.
    """
    import logging

    try:
        from spanforge._store import TraceStore

        store = TraceStore.get_default()  # type: ignore[attr-defined]
        erased = 0
        with store._lock:
            for trace_events in store._traces.values():
                for ev in trace_events:
                    payload = getattr(ev, "payload", {}) or {}
                    if (
                        payload.get("subject_id") == subject_id
                        and payload.get("project_id") == project_id
                    ):
                        # Remove the identifying field from the stored payload.
                        payload.pop("subject_id", None)
                        erased += 1
    except Exception as exc:
        logging.getLogger(__name__).debug(
            "local subject erasure skipped (%s)", type(exc).__name__
        )
        return 0
    else:
        return erased
|
|
1154
|
+
|
|
1155
|
+
# ------------------------------------------------------------------
|
|
1156
|
+
# export_subject_data (PII-022 — CCPA DSAR)
|
|
1157
|
+
# ------------------------------------------------------------------
|
|
1158
|
+
|
|
1159
|
+
def export_subject_data(
    self,
    subject_id: str,
    project_id: str,
) -> DSARExport:
    """Build a CCPA DSAR export for *subject_id* (PII-022).

    Generates a fresh export id and UTC timestamp, gathers matching event
    summaries through :meth:`_local_collect_subject_events`, and packages
    them together with their count.

    Args:
        subject_id: Opaque data subject identifier.
        project_id: Project scope.

    Returns:
        :class:`~spanforge.sdk._types.DSARExport`.

    Raises:
        SFPIIError: If *subject_id* or *project_id* is empty.
    """
    if not (subject_id and project_id):
        msg = "export_subject_data() requires non-empty subject_id and project_id"
        raise SFPIIError(msg)

    package_id = str(uuid.uuid4())
    timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()
    matching = self._local_collect_subject_events(subject_id, project_id)

    return DSARExport(
        subject_id=subject_id,
        project_id=project_id,
        event_count=len(matching),
        export_id=package_id,
        exported_at=timestamp,
        events=matching,
    )
|
|
1196
|
+
|
|
1197
|
+
def _local_collect_subject_events(
    self, subject_id: str, project_id: str
) -> list[dict[str, Any]]:
    """Collect events referencing subject_id from the local store.

    Iterates every event in the default ``TraceStore`` while holding the
    store lock.  For each event whose payload has both a matching
    ``subject_id`` and ``project_id``, a summary dict with ``event_id``,
    ``event_type``, ``timestamp`` (each stringified, ``""`` when the
    attribute is absent) and ``project_id`` is collected.

    This is best-effort: any failure yields an empty list.  The failure
    is no longer silent — the exception class is logged at debug level.
    The subject identifier and the exception message are deliberately
    kept out of the log.

    Args:
        subject_id: Data subject identifier to match.
        project_id: Project identifier to match.

    Returns:
        List of event summary dicts, or ``[]`` on any error.
    """
    import logging

    try:
        from spanforge._store import TraceStore

        store = TraceStore.get_default()  # type: ignore[attr-defined]
        collected: list[dict[str, Any]] = []
        with store._lock:
            for trace_events in store._traces.values():
                for ev in trace_events:
                    payload = getattr(ev, "payload", {}) or {}
                    if (
                        payload.get("subject_id") == subject_id
                        and payload.get("project_id") == project_id
                    ):
                        collected.append(
                            {
                                "event_id": str(getattr(ev, "event_id", "")),
                                "event_type": str(getattr(ev, "event_type", "")),
                                "timestamp": str(getattr(ev, "timestamp", "")),
                                "project_id": project_id,
                            }
                        )
    except Exception as exc:
        logging.getLogger(__name__).debug(
            "local subject event collection skipped (%s)", type(exc).__name__
        )
        return []
    else:
        return collected
|
|
1226
|
+
|
|
1227
|
+
# ------------------------------------------------------------------
|
|
1228
|
+
# safe_harbor_deidentify (PII-023 — HIPAA Safe Harbor)
|
|
1229
|
+
# ------------------------------------------------------------------
|
|
1230
|
+
|
|
1231
|
+
def safe_harbor_deidentify(self, text: str) -> SafeHarborResult:
    """De-identify *text* using the ``_SAFE_HARBOR_PATTERNS`` table (PII-023).

    Intended to implement HIPAA Safe Harbor (45 CFR §164.514(b)(2)).
    Substitutions run in this order, each on the output of the previous:

    1. ``age_over_89`` matches → ``"90+"``
    2. ``zip`` matches → first three characters of capture group 1 + ``"XX"``
    3. ``date`` matches → the first ``19xx``/``20xx`` year found inside the
       match, or ``"[DATE]"`` when none is present
    4. every other pattern in the table → ``"[REMOVED]"``

    Args:
        text: Input text.

    Returns:
        :class:`~spanforge.sdk._types.SafeHarborResult` with the rewritten
        text, the total substitution count, and the labels that matched at
        least once (in the order they were applied).

    Raises:
        SFPIIScanError: If *text* is not a ``str``.
    """
    if not isinstance(text, str):
        msg = f"safe_harbor_deidentify() requires a str; got {type(text).__name__}"
        raise SFPIIScanError(msg)

    def _year_only(match: re.Match[str]) -> str:
        # Keep just a 4-digit year from the matched date, if there is one.
        year = re.search(r"(19|20)\d{2}", match.group())
        if year:
            return year.group()
        return "[DATE]"

    def _age_bucket(_match: re.Match[str]) -> str:
        return "90+"

    def _zip_prefix(match: re.Match[str]) -> str:
        return match.group(1)[:3] + "XX"

    # Labels that are generalised rather than removed, in application order.
    generalised = (
        ("age_over_89", _age_bucket),
        ("zip", _zip_prefix),
        ("date", _year_only),
    )

    scrubbed = text
    total = 0
    labels_hit: list[str] = []

    for label, replacer in generalised:
        pattern = _SAFE_HARBOR_PATTERNS[label]
        scrubbed, count = re.subn(pattern, replacer, scrubbed)
        if count:
            total += count
            if label not in labels_hit:
                labels_hit.append(label)

    # Everything else in the table is removed outright.
    already_handled = {"age_over_89", "zip", "date"}
    for label, pattern in _SAFE_HARBOR_PATTERNS.items():
        if label in already_handled:
            continue
        scrubbed, count = re.subn(pattern, "[REMOVED]", scrubbed)
        if count:
            total += count
            if label not in labels_hit:
                labels_hit.append(label)

    return SafeHarborResult(
        text=scrubbed,
        replacements=total,
        phi_types_found=labels_hit,
    )
|
|
1317
|
+
|
|
1318
|
+
# ------------------------------------------------------------------
|
|
1319
|
+
# audit_training_data (PII-025 — EU AI Act Article 10)
|
|
1320
|
+
# ------------------------------------------------------------------
|
|
1321
|
+
|
|
1322
|
+
def audit_training_data(
    self,
    dataset_path: str | Path,
    *,
    max_records: int = 100_000,
) -> TrainingDataPIIReport:
    """Batch-scan a dataset file for PII prevalence (PII-025).

    Supports JSONL (one JSON object per line) and plain-text files (one
    record per line).  Produces a PII prevalence report for use as
    compliance evidence for EU AI Act Article 10 training-data audits.

    For a line that parses as a JSON object, every string value is scanned,
    including strings nested inside child objects and arrays (for example
    chat-style ``{"messages": [{"content": "..."}]}`` records).  Object keys
    and non-string scalars are not scanned.  A line that starts with ``{``
    but cannot be decoded as JSON is scanned verbatim, exactly like a
    plain-text line.  Blank lines are skipped and do not count as records.

    Args:
        dataset_path: Path to the dataset file.
        max_records: Maximum number of records to scan (default 100 000).

    Returns:
        :class:`~spanforge.sdk._types.TrainingDataPIIReport`.

    Raises:
        SFPIIScanError: If *dataset_path* does not exist or the file
            cannot be opened or read.
    """

    def _collect_strings(value):
        """Return every string nested inside a decoded JSON value, in document order."""
        found = []
        pending = [value]
        # Explicit stack instead of recursion so that deep nesting cannot
        # exhaust the interpreter's recursion limit.  Children are pushed in
        # reverse so they are popped in their original order.
        while pending:
            current = pending.pop()
            if isinstance(current, str):
                found.append(current)
            elif isinstance(current, dict):
                pending.extend(reversed(list(current.values())))
            elif isinstance(current, list):
                pending.extend(reversed(current))
        return found

    path = Path(dataset_path)
    if not path.exists():
        msg = f"audit_training_data(): file not found: {path}"
        raise SFPIIScanError(msg)

    report_id = str(uuid.uuid4())
    generated_at = datetime.datetime.now(datetime.timezone.utc).isoformat()
    total_records = 0
    pii_records = 0
    entity_counts: dict[str, int] = {}

    try:
        # errors="replace": undecodable bytes become U+FFFD instead of
        # aborting the whole audit.
        with path.open(encoding="utf-8", errors="replace") as fh:
            for line in fh:
                if total_records >= max_records:
                    break
                line = line.strip()
                if not line:
                    continue
                total_records += 1

                # Determine text to scan
                text = line
                if line.startswith("{"):
                    try:
                        record = json.loads(line)
                    except (json.JSONDecodeError, RecursionError):
                        # Not decodable as JSON: keep the raw line so the
                        # record is still scanned.
                        pass
                    else:
                        text = " ".join(_collect_strings(record))

                result = self._scan_text_local(text, language="en", score_threshold=0.5)
                if result.detected:
                    pii_records += 1
                    for ent in result.entities:
                        entity_counts[ent.type] = entity_counts.get(ent.type, 0) + 1
    except OSError as exc:
        msg = f"audit_training_data(): cannot read {path}: {exc}"
        raise SFPIIScanError(msg) from exc

    # Percentage of scanned records with at least one detection; 0.0 for an
    # empty dataset to avoid dividing by zero.
    prevalence = round(pii_records / total_records * 100, 2) if total_records else 0.0

    return TrainingDataPIIReport(
        dataset_path=str(path),
        total_records=total_records,
        pii_records=pii_records,
        prevalence_pct=prevalence,
        entity_counts=entity_counts,
        report_id=report_id,
        generated_at=generated_at,
    )
|
|
1396
|
+
|
|
1397
|
+
# ------------------------------------------------------------------
|
|
1398
|
+
# get_pii_stats (PII-032 — PII heat map)
|
|
1399
|
+
# ------------------------------------------------------------------
|
|
1400
|
+
|
|
1401
|
+
def get_pii_stats(
    self,
    project_id: str,
    *,
    entity_type: str | None = None,
    days: int = 30,
) -> list[PIIHeatMapEntry]:
    """Return PII detection stats for the dashboard heat map (PII-032).

    Aggregates PII detection events per entity type per day for
    *project_id* over the last *days* days.  Exposed via
    ``GET /v1/pii/stats`` (Team+ tier).

    Events are read from the default ``TraceStore``.  An event is counted
    when its payload has a matching ``project_id``, an ``event_class`` of
    ``"pii_detection"`` and a parseable ISO-8601 timestamp inside the
    look-back window.  Each event contributes ``payload["count"]``
    (default 1) to the bucket for its date and ``payload["entity_type"]``
    (default ``"unknown"``).

    The lookup is best-effort: if the store cannot be imported or read,
    the failure is logged at debug level and whatever was aggregated so
    far (possibly nothing) is returned.

    Args:
        project_id: Project to aggregate stats for.
        entity_type: Optional filter — only return entries for this type.
        days: Look-back window in days (default 30).

    Returns:
        Ordered list of :class:`~spanforge.sdk._types.PIIHeatMapEntry`
        items sorted by (date desc, entity_type asc).

    Raises:
        SFPIIError: If *project_id* is empty.
    """
    if not project_id:
        msg = "get_pii_stats() requires a non-empty project_id"
        raise SFPIIError(msg)

    import logging

    utc = datetime.timezone.utc
    cutoff = datetime.datetime.now(utc) - datetime.timedelta(days=days)
    aggregated: dict[tuple[str, str], int] = {}

    try:
        from spanforge._store import TraceStore

        store = TraceStore.get_default()  # type: ignore[attr-defined]
        with store._lock:
            for trace_events in store._traces.values():
                for ev in trace_events:
                    payload = getattr(ev, "payload", {}) or {}
                    if payload.get("project_id") != project_id:
                        continue
                    if payload.get("event_class") != "pii_detection":
                        continue
                    ts_str = str(getattr(ev, "timestamp", ""))
                    try:
                        ts = datetime.datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
                    except (ValueError, TypeError):
                        continue
                    if ts.utcoffset() is None:
                        # Comparing a naive datetime with the aware cutoff
                        # raises TypeError, which previously aborted the
                        # whole aggregation.  NOTE(review): offset-less
                        # timestamps are assumed to be UTC — confirm.
                        ts = ts.replace(tzinfo=utc)
                    if ts < cutoff:
                        continue
                    date_str = ts.strftime("%Y-%m-%d")
                    etype = str(payload.get("entity_type", "unknown"))
                    if entity_type and etype != entity_type:
                        continue
                    key = (date_str, etype)
                    aggregated[key] = aggregated.get(key, 0) + int(payload.get("count", 1))
    except Exception:
        # Deliberately best-effort: the heat map must not fail because the
        # trace store is missing or malformed.  Leave a trace for debugging
        # instead of swallowing the error silently.
        logging.getLogger(__name__).debug(
            "get_pii_stats(): trace store lookup failed", exc_info=True
        )

    # Two stable passes give "date descending, entity_type ascending": sort
    # by the secondary key first, then by the primary key in reverse.  A
    # single reverse sort on the tuple would reverse entity_type as well.
    buckets = sorted(aggregated.items(), key=lambda item: item[0][1])
    buckets.sort(key=lambda item: item[0][0], reverse=True)

    return [
        PIIHeatMapEntry(
            project_id=project_id,
            entity_type=etype,
            date=date_str,
            count=count,
        )
        for (date_str, etype), count in buckets
    ]
|