spanforge 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. spanforge/__init__.py +815 -0
  2. spanforge/_ansi.py +93 -0
  3. spanforge/_batch_exporter.py +409 -0
  4. spanforge/_cli.py +2094 -0
  5. spanforge/_cli_audit.py +639 -0
  6. spanforge/_cli_compliance.py +711 -0
  7. spanforge/_cli_cost.py +243 -0
  8. spanforge/_cli_ops.py +791 -0
  9. spanforge/_cli_phase11.py +356 -0
  10. spanforge/_hooks.py +337 -0
  11. spanforge/_server.py +1708 -0
  12. spanforge/_span.py +1036 -0
  13. spanforge/_store.py +288 -0
  14. spanforge/_stream.py +664 -0
  15. spanforge/_trace.py +335 -0
  16. spanforge/_tracer.py +254 -0
  17. spanforge/actor.py +141 -0
  18. spanforge/alerts.py +469 -0
  19. spanforge/auto.py +464 -0
  20. spanforge/baseline.py +335 -0
  21. spanforge/cache.py +635 -0
  22. spanforge/compliance.py +325 -0
  23. spanforge/config.py +532 -0
  24. spanforge/consent.py +228 -0
  25. spanforge/consumer.py +377 -0
  26. spanforge/core/__init__.py +5 -0
  27. spanforge/core/compliance_mapping.py +1254 -0
  28. spanforge/cost.py +600 -0
  29. spanforge/debug.py +548 -0
  30. spanforge/deprecations.py +205 -0
  31. spanforge/drift.py +482 -0
  32. spanforge/egress.py +58 -0
  33. spanforge/eval.py +648 -0
  34. spanforge/event.py +1064 -0
  35. spanforge/exceptions.py +240 -0
  36. spanforge/explain.py +178 -0
  37. spanforge/export/__init__.py +69 -0
  38. spanforge/export/append_only.py +337 -0
  39. spanforge/export/cloud.py +357 -0
  40. spanforge/export/datadog.py +497 -0
  41. spanforge/export/grafana.py +320 -0
  42. spanforge/export/jsonl.py +195 -0
  43. spanforge/export/openinference.py +158 -0
  44. spanforge/export/otel_bridge.py +294 -0
  45. spanforge/export/otlp.py +811 -0
  46. spanforge/export/otlp_bridge.py +233 -0
  47. spanforge/export/redis_backend.py +282 -0
  48. spanforge/export/siem_schema.py +98 -0
  49. spanforge/export/siem_splunk.py +264 -0
  50. spanforge/export/siem_syslog.py +212 -0
  51. spanforge/export/webhook.py +299 -0
  52. spanforge/exporters/__init__.py +30 -0
  53. spanforge/exporters/console.py +271 -0
  54. spanforge/exporters/jsonl.py +144 -0
  55. spanforge/exporters/sqlite.py +142 -0
  56. spanforge/gate.py +1150 -0
  57. spanforge/governance.py +181 -0
  58. spanforge/hitl.py +295 -0
  59. spanforge/http.py +187 -0
  60. spanforge/inspect.py +427 -0
  61. spanforge/integrations/__init__.py +45 -0
  62. spanforge/integrations/_pricing.py +280 -0
  63. spanforge/integrations/anthropic.py +388 -0
  64. spanforge/integrations/azure_openai.py +133 -0
  65. spanforge/integrations/bedrock.py +292 -0
  66. spanforge/integrations/crewai.py +251 -0
  67. spanforge/integrations/gemini.py +351 -0
  68. spanforge/integrations/groq.py +442 -0
  69. spanforge/integrations/langchain.py +349 -0
  70. spanforge/integrations/langgraph.py +306 -0
  71. spanforge/integrations/llamaindex.py +373 -0
  72. spanforge/integrations/ollama.py +287 -0
  73. spanforge/integrations/openai.py +368 -0
  74. spanforge/integrations/together.py +483 -0
  75. spanforge/io.py +214 -0
  76. spanforge/lint.py +322 -0
  77. spanforge/metrics.py +417 -0
  78. spanforge/metrics_export.py +343 -0
  79. spanforge/migrate.py +402 -0
  80. spanforge/model_registry.py +278 -0
  81. spanforge/models.py +389 -0
  82. spanforge/namespaces/__init__.py +254 -0
  83. spanforge/namespaces/audit.py +256 -0
  84. spanforge/namespaces/cache.py +237 -0
  85. spanforge/namespaces/chain.py +77 -0
  86. spanforge/namespaces/confidence.py +72 -0
  87. spanforge/namespaces/consent.py +92 -0
  88. spanforge/namespaces/cost.py +179 -0
  89. spanforge/namespaces/decision.py +143 -0
  90. spanforge/namespaces/diff.py +157 -0
  91. spanforge/namespaces/drift.py +80 -0
  92. spanforge/namespaces/eval_.py +251 -0
  93. spanforge/namespaces/feedback.py +241 -0
  94. spanforge/namespaces/fence.py +193 -0
  95. spanforge/namespaces/guard.py +105 -0
  96. spanforge/namespaces/hitl.py +91 -0
  97. spanforge/namespaces/latency.py +72 -0
  98. spanforge/namespaces/prompt.py +190 -0
  99. spanforge/namespaces/redact.py +173 -0
  100. spanforge/namespaces/retrieval.py +379 -0
  101. spanforge/namespaces/runtime_governance.py +494 -0
  102. spanforge/namespaces/template.py +208 -0
  103. spanforge/namespaces/tool_call.py +77 -0
  104. spanforge/namespaces/trace.py +1029 -0
  105. spanforge/normalizer.py +171 -0
  106. spanforge/plugins.py +82 -0
  107. spanforge/presidio_backend.py +349 -0
  108. spanforge/processor.py +258 -0
  109. spanforge/prompt_registry.py +418 -0
  110. spanforge/py.typed +0 -0
  111. spanforge/redact.py +914 -0
  112. spanforge/regression.py +192 -0
  113. spanforge/runtime_policy.py +159 -0
  114. spanforge/sampling.py +511 -0
  115. spanforge/schema.py +183 -0
  116. spanforge/schemas/v1.0/schema.json +170 -0
  117. spanforge/schemas/v2.0/schema.json +536 -0
  118. spanforge/sdk/__init__.py +625 -0
  119. spanforge/sdk/_base.py +584 -0
  120. spanforge/sdk/_base.pyi +71 -0
  121. spanforge/sdk/_exceptions.py +1096 -0
  122. spanforge/sdk/_types.py +2184 -0
  123. spanforge/sdk/alert.py +1514 -0
  124. spanforge/sdk/alert.pyi +56 -0
  125. spanforge/sdk/audit.py +1196 -0
  126. spanforge/sdk/audit.pyi +67 -0
  127. spanforge/sdk/cec.py +1215 -0
  128. spanforge/sdk/cec.pyi +37 -0
  129. spanforge/sdk/config.py +641 -0
  130. spanforge/sdk/config.pyi +55 -0
  131. spanforge/sdk/enterprise.py +714 -0
  132. spanforge/sdk/enterprise.pyi +79 -0
  133. spanforge/sdk/explain.py +170 -0
  134. spanforge/sdk/fallback.py +432 -0
  135. spanforge/sdk/feedback.py +351 -0
  136. spanforge/sdk/gate.py +874 -0
  137. spanforge/sdk/gate.pyi +51 -0
  138. spanforge/sdk/identity.py +2114 -0
  139. spanforge/sdk/identity.pyi +47 -0
  140. spanforge/sdk/lineage.py +175 -0
  141. spanforge/sdk/observe.py +1065 -0
  142. spanforge/sdk/observe.pyi +50 -0
  143. spanforge/sdk/operator.py +338 -0
  144. spanforge/sdk/pii.py +1473 -0
  145. spanforge/sdk/pii.pyi +119 -0
  146. spanforge/sdk/pipelines.py +458 -0
  147. spanforge/sdk/pipelines.pyi +39 -0
  148. spanforge/sdk/policy.py +930 -0
  149. spanforge/sdk/rag.py +594 -0
  150. spanforge/sdk/rbac.py +280 -0
  151. spanforge/sdk/registry.py +430 -0
  152. spanforge/sdk/registry.pyi +46 -0
  153. spanforge/sdk/scope.py +279 -0
  154. spanforge/sdk/secrets.py +293 -0
  155. spanforge/sdk/secrets.pyi +25 -0
  156. spanforge/sdk/security.py +560 -0
  157. spanforge/sdk/security.pyi +57 -0
  158. spanforge/sdk/trust.py +472 -0
  159. spanforge/sdk/trust.pyi +41 -0
  160. spanforge/secrets.py +799 -0
  161. spanforge/signing.py +1179 -0
  162. spanforge/stats.py +100 -0
  163. spanforge/stream.py +560 -0
  164. spanforge/testing.py +378 -0
  165. spanforge/testing_mocks.py +1052 -0
  166. spanforge/trace.py +199 -0
  167. spanforge/types.py +696 -0
  168. spanforge/ulid.py +300 -0
  169. spanforge/validate.py +379 -0
  170. spanforge-1.0.0.dist-info/METADATA +1509 -0
  171. spanforge-1.0.0.dist-info/RECORD +174 -0
  172. spanforge-1.0.0.dist-info/WHEEL +4 -0
  173. spanforge-1.0.0.dist-info/entry_points.txt +5 -0
  174. spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
spanforge/sdk/pii.py ADDED
@@ -0,0 +1,1473 @@
1
+ """spanforge.sdk.pii — SpanForge sf-pii client.
2
+
3
+ Implements the full sf-pii API surface for Phase 3 (PII Service Hardening) of
4
+ the SpanForge roadmap, extending the Phase 2 foundation.
5
+
6
+ All operations run locally in-process (zero external dependencies) when
7
+ ``config.endpoint`` is empty or when the remote service is unreachable and
8
+ ``local_fallback_enabled`` is ``True``.
9
+
10
+ Local-mode feature parity
11
+ --------------------------
12
+ * :meth:`scan` — deep regex PII scan (dict payload).
13
+ * :meth:`scan_text` — Presidio-backed text scan (PII-001).
14
+ * :meth:`anonymise` — recursive dict anonymisation (PII-002).
15
+ * :meth:`scan_batch` — async parallel text scan (PII-003).
16
+ * :meth:`apply_pipeline_action` — pii_action routing hook (PII-010/011/012).
17
+ * :meth:`get_status` — sf_pii status contribution (PII-005).
18
+ * :meth:`redact` — apply RedactionPolicy to an event.
19
+ * :meth:`contains_pii` — check for unredacted PII.
20
+ * :meth:`assert_redacted` — raise if unredacted PII found.
21
+ * :meth:`anonymize` — replace PII in raw text strings.
22
+ * :meth:`wrap` — Redactable factory.
23
+ * :meth:`make_policy` — RedactionPolicy factory.
24
+ * :meth:`erase_subject` — GDPR Article 17 erasure (PII-021).
25
+ * :meth:`export_subject_data` — CCPA DSAR export (PII-022).
26
+ * :meth:`safe_harbor_deidentify` — HIPAA Safe Harbor (PII-023).
27
+ * :meth:`audit_training_data` — EU AI Act Article 10 audit (PII-025).
28
+ * :meth:`get_pii_stats` — PII heat map data (PII-032).
29
+
30
+ Security requirements
31
+ ---------------------
32
+ * Scan and anonymize results **never** include matched PII values — only
33
+ type labels, field paths, counts, and anonymized replacement text.
34
+ * :exc:`~spanforge.sdk._exceptions.SFPIINotRedactedError` messages never
35
+ contain raw PII; context strings are SHA-256-hashed before inclusion.
36
+ * ``SecretStr`` API keys are never written to logs.
37
+ * Redaction manifest entries hash original values with SHA-256; raw values
38
+ are never stored.
39
+ """
40
+
41
+ from __future__ import annotations
42
+
43
+ import asyncio
44
+ import concurrent.futures
45
+ import datetime
46
+ import hashlib
47
+ import json
48
+ import re
49
+ import time
50
+ import uuid
51
+ from pathlib import Path
52
+ from typing import TYPE_CHECKING, Any
53
+
54
+ from spanforge.sdk._base import SFClientConfig, SFServiceClient
55
+ from spanforge.sdk._exceptions import (
56
+ SFPIIBlockedError,
57
+ SFPIIError,
58
+ SFPIINotRedactedError,
59
+ SFPIIPolicyError,
60
+ SFPIIScanError,
61
+ )
62
+ from spanforge.sdk._types import (
63
+ DSARExport,
64
+ ErasureReceipt,
65
+ PIIAnonymisedResult,
66
+ PIIEntity,
67
+ PIIHeatMapEntry,
68
+ PIIPipelineResult,
69
+ PIIRedactionManifestEntry,
70
+ PIIStatusInfo,
71
+ PIITextScanResult,
72
+ SafeHarborResult,
73
+ SFPIIAnonymizeResult,
74
+ SFPIIHit,
75
+ SFPIIRedactResult,
76
+ SFPIIScanResult,
77
+ TrainingDataPIIReport,
78
+ )
79
+
80
+ if TYPE_CHECKING:
81
+ from spanforge.event import Event
82
+ from spanforge.redact import Redactable, RedactionPolicy
83
+
84
+ __all__ = ["SFPIIClient"]
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # Valid sensitivity levels — mirrors spanforge.redact.Sensitivity enum values
88
+ # ---------------------------------------------------------------------------
89
+
90
# Sensitivity level strings accepted by SFPIIClient.wrap() and
# SFPIIClient.make_policy(); anything else is rejected with SFPIIPolicyError.
_VALID_SENSITIVITY: frozenset[str] = frozenset({"low", "medium", "high", "pii", "phi"})

# Pattern labels that get a secondary validator in the local anonymize path:
# a match under one of these labels is only replaced when its validator
# accepts the matched text.
_CC_LABEL = "credit_card"
_AADHAAR_LABEL = "aadhaar"
_SSN_LABEL = "ssn"
_DOB_LABEL = "date_of_birth"

# ---------------------------------------------------------------------------
# Phase 3 constants
# NOTE(review): none of the four constants below is referenced in the part of
# the file reviewed here; presumably they are used by the Phase 3 methods
# further down — confirm.
# ---------------------------------------------------------------------------

#: Default confidence threshold for pipeline action routing (PII-011).
_DEFAULT_PIPELINE_THRESHOLD: float = 0.85

#: Valid pipeline action values (PII-010).
_VALID_PIPELINE_ACTIONS: frozenset[str] = frozenset({"flag", "redact", "block"})

#: DPDP-regulated entity type labels (India DPDP Act).
_DPDP_ENTITY_TYPES: frozenset[str] = frozenset({"aadhaar", "pan"})

#: PIPL-sensitive entity type labels (China PIPL).
_PIPL_ENTITY_TYPES: frozenset[str] = frozenset({"cn_national_id", "cn_mobile", "cn_bank_card"})
113
+
114
+ # ---------------------------------------------------------------------------
115
+ # HIPAA Safe Harbor — 18 PHI identifier patterns (45 CFR §164.514(b)(2))
116
+ # ---------------------------------------------------------------------------
117
+
118
#: Mapping of PHI identifier label → compiled regex for Safe Harbor de-identification.
#:
#: The numbered comments follow the 18 Safe Harbor identifier categories, but
#: the table holds 17 entries: category 17 (full-face photographs) has no
#: pattern because images cannot be detected with a text regex.
_SAFE_HARBOR_PATTERNS: dict[str, re.Pattern[str]] = {
    # 1. Names — either an honorific (Mr/Mrs/Ms/Dr/Prof) followed by one or
    #    more capitalised words, or two to three consecutive capitalised words.
    "name": re.compile(
        r"\b(?:Mr\.?|Mrs\.?|Ms\.?|Dr\.?|Prof\.?)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b"
        r"|\b[A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\b",
    ),
    # 2. Geographic subdivisions smaller than state — zip codes
    #    (five digits, captured as group 1, with an optional "-NNNN" suffix).
    "zip": re.compile(r"\b(\d{5})(?:-\d{4})?\b"),
    # 3. Dates (other than year) — numeric month/day/year and day/month/year
    #    with "/" or "-" separators and a 19xx/20xx year, plus
    #    "<month name> <day>[,] <year>"; matching is case-insensitive.
    "date": re.compile(
        r"\b(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12]\d|3[01])[/-](?:19|20)\d{2}\b"
        r"|\b(?:0?[1-9]|[12]\d|3[01])[/-](?:0?[1-9]|1[0-2])[/-](?:19|20)\d{2}\b"
        r"|\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?"
        r"|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)"
        r"\s+(?:0?[1-9]|[12]\d|3[01]),?\s+(?:19|20)\d{2}\b",
        re.IGNORECASE,
    ),
    # 4. Phone numbers — 3-3-4 digit groups with an optional "+1"/"1" prefix
    #    and optional parentheses around the first group. There is no leading
    #    word boundary, so a match may begin inside a longer digit run.
    "phone": re.compile(r"(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"),
    # 5. Fax numbers — the phone pattern preceded by a literal "fax" label
    #    (case-insensitive via the inline (?i) flag) and an optional colon.
    "fax": re.compile(r"(?i)fax\s*:?\s*(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"),
    # 6. Email addresses — the domain part matches a single label followed by
    #    one alphabetic suffix, so only the first two labels of a longer
    #    domain are covered by the match.
    "email": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}", re.ASCII),
    # 7. Social security numbers (NNN-NN-NNNN, hyphens required).
    "ssn": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
    # 8. Medical record numbers ("MR"/"MRN", one separator, 6-10 digits).
    "medical_record": re.compile(r"\bMRN?[\s#:]\s*\d{6,10}\b", re.IGNORECASE),
    # 9. Health plan beneficiary numbers ("HP"/"HB", one separator, 6-12 digits).
    "health_plan": re.compile(r"\b(?:HP|HB)[\s#:]\s*\d{6,12}\b", re.IGNORECASE),
    # 10. Account numbers ("Acc"/"Acct"/"Account", one separator, 6-16 digits).
    "account": re.compile(r"\b(?:Acct?|Account)[\s#:.]\s*\d{6,16}\b", re.IGNORECASE),
    # 11. Certificate/license numbers ("LIC"/"LICENSE", 5-15 alphanumerics).
    "license": re.compile(r"\bLIC(?:ENSE)?[\s#:]\s*[A-Z0-9]{5,15}\b", re.IGNORECASE),
    # 12. Vehicle identifiers (VIN) — 17 characters from a class that omits
    #     the letters I, O and Q; case-sensitive (upper-case only).
    "vin": re.compile(r"\b[A-HJ-NPR-Z0-9]{17}\b"),
    # 13. Device identifiers (serial numbers — heuristic)
    "device_serial": re.compile(r"\b(?:S/N|SN|Serial)[\s#:]\s*[A-Z0-9]{8,20}\b", re.IGNORECASE),
    # 14. Web URLs (http/https followed by at least four non-space characters).
    "url": re.compile(r"https?://[^\s\"'<>]{4,}", re.IGNORECASE),
    # 15. IP addresses (dotted-quad IPv4, each octet limited to 0-255).
    "ip_address": re.compile(
        r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}"
        r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b"
    ),
    # 16. Biometric identifiers — fingerprint reference IDs (heuristic)
    "biometric": re.compile(r"\b(?:FP|BIO)[\s#:]\s*[A-Z0-9]{8,20}\b", re.IGNORECASE),
    # 17. Full face photos — placeholder (cannot regex-detect images)
    # 18. Age > 89 — handled in safe_harbor_deidentify() as post-processing.
    #     The number (captured as group 1) is 90-99 or any three-digit value
    #     100-199, followed by "year(s) [old]", "yo" or "y/o".
    "age_over_89": re.compile(
        r"\b(9[0-9]|1[0-9]{2})\s*(?:years?(?:\s+old)?|yo|y/o)\b", re.IGNORECASE
    ),
}
171
+
172
+
173
+ class SFPIIClient(SFServiceClient):
174
+ """SpanForge PII redaction service client.
175
+
176
+ Provides scanning, redaction, containment checks, and text anonymization.
177
+ All operations run in-process when no ``endpoint`` is configured (local
178
+ mode) or when the remote service is unavailable and
179
+ ``local_fallback_enabled`` is ``True``.
180
+
181
+ Args:
182
+ config: Client configuration. Use :class:`~spanforge.sdk._base.SFClientConfig`
183
+ or :func:`~spanforge.sdk._base.SFClientConfig.from_env`.
184
+
185
+ Example::
186
+
187
+ from spanforge.sdk import sf_pii
188
+
189
+ # Scan a payload for PII
190
+ result = sf_pii.scan({"message": "Call me on 555-867-5309"})
191
+ if not result.clean:
192
+ for hit in result.hits:
193
+ print(hit.pii_type, hit.path, hit.match_count)
194
+
195
+ # Anonymize raw text
196
+ anon = sf_pii.anonymize("My email is alice@example.com")
197
+ print(anon.text) # "My email is [REDACTED:email]"
198
+ """
199
+
200
def __init__(self, config: SFClientConfig) -> None:
    """Initialise the client for the ``"pii"`` service.

    Args:
        config: Client configuration, forwarded unchanged to the
            :class:`SFServiceClient` base initialiser.
    """
    # The base class receives the fixed service name "pii".
    super().__init__(config, service_name="pii")
    #: ISO-8601 timestamp of the most recent scan_text() call; None until first call.
    self._last_scan_at: str | None = None
204
+
205
+ # ------------------------------------------------------------------
206
+ # scan
207
+ # ------------------------------------------------------------------
208
+
209
def scan(
    self,
    payload: dict[str, Any],
    *,
    extra_patterns: dict[str, re.Pattern[str]] | None = None,
    max_depth: int = 10,
) -> SFPIIScanResult:
    """Scan a dictionary payload for PII and report what was found.

    The payload is validated and then handed to either the in-process
    scanner or the remote service. The result carries only type labels,
    field paths, match counts and sensitivity levels — never the matched
    values themselves.

    Args:
        payload: Dictionary to scan. Anything that is not a :class:`dict`
            is rejected.
        extra_patterns: Optional ``{label: compiled_regex}`` detectors to
            run in addition to the built-in ones.
        max_depth: Maximum nesting depth to walk (default 10).

    Returns:
        :class:`~spanforge.sdk._types.SFPIIScanResult`.

    Raises:
        SFPIIScanError: If *payload* is not a ``dict`` or the scan fails.
        SFServiceUnavailableError: Circuit breaker open, fallback disabled.
    """
    if not isinstance(payload, dict):
        raise SFPIIScanError(
            f"scan() requires a dict payload; got {type(payload).__name__}"
        )

    # NOTE(review): because the fallback flag is OR-ed in here, the remote
    # path is never attempted while local_fallback_enabled is set, even when
    # an endpoint is configured — confirm that this is the intended routing.
    run_locally = self._is_local_mode() or self._config.local_fallback_enabled
    backend = self._scan_local if run_locally else self._scan_remote
    return backend(payload, extra_patterns=extra_patterns, max_depth=max_depth)
246
+
247
def _scan_local(
    self,
    payload: dict[str, Any],
    *,
    extra_patterns: dict[str, re.Pattern[str]] | None,
    max_depth: int,
) -> SFPIIScanResult:
    """Run the scan in-process and convert the hits to SDK types.

    When the Presidio backend reports itself available it performs the main
    scan; caller-supplied regex patterns, which Presidio does not accept,
    are then evaluated by a separate regex pass and only the hits whose
    label is one of those custom patterns are merged in. Without Presidio
    the regex scanner handles everything in a single pass.

    Args:
        payload: Dictionary to scan.
        extra_patterns: Optional ``{label: compiled_regex}`` detectors.
        max_depth: Maximum nesting depth passed to the scanners.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIScanResult` built from the
        backend hits and the backend's ``scanned`` count.

    Raises:
        SFPIIScanError: If a scanner raises :class:`RecursionError`.
    """
    from spanforge.presidio_backend import is_available as _presidio_available
    from spanforge.presidio_backend import presidio_scan_payload
    from spanforge.redact import scan_payload

    try:
        if not _presidio_available():
            result = scan_payload(payload, extra_patterns=extra_patterns, max_depth=max_depth)
        else:
            result = presidio_scan_payload(payload, max_depth=max_depth)
            if extra_patterns:
                # Second, regex-only pass so custom detectors are not lost.
                regex_result = scan_payload(
                    payload, extra_patterns=extra_patterns, max_depth=max_depth
                )
                custom_hits = [
                    hit for hit in regex_result.hits if hit.pii_type in extra_patterns
                ]
                if custom_hits:
                    from spanforge.redact import PIIScanResult

                    # The scanned count stays that of the Presidio pass.
                    result = PIIScanResult(
                        hits=result.hits + custom_hits,
                        scanned=result.scanned,
                    )
    except RecursionError as exc:
        raise SFPIIScanError(str(exc)) from exc

    converted = []
    for hit in result.hits:
        converted.append(
            SFPIIHit(
                pii_type=hit.pii_type,
                path=hit.path,
                match_count=hit.match_count,
                sensitivity=hit.sensitivity,
            )
        )
    return SFPIIScanResult(hits=converted, scanned=result.scanned)
292
+
293
def _scan_remote(
    self,
    payload: dict[str, Any],
    *,
    extra_patterns: dict[str, re.Pattern[str]] | None,
    max_depth: int,
) -> SFPIIScanResult:
    """POST the payload to ``/pii/scan`` and decode the response.

    Missing response fields fall back to defaults: empty strings for
    ``pii_type`` and ``path``, ``1`` for ``match_count``, ``"medium"`` for
    ``sensitivity`` and ``0`` for ``scanned``.

    Args:
        payload: Dictionary to scan; sent as the ``payload`` body field.
        extra_patterns: Accepted for signature parity with the local path.
        max_depth: Sent as the ``max_depth`` body field.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIScanResult`.
    """
    # NOTE(review): extra_patterns is not included in the request body, so
    # custom detectors have no effect on the remote path — confirm intended.
    raw = self._request(
        "POST",
        "/pii/scan",
        body={"payload": payload, "max_depth": max_depth},
    )

    hits = []
    for entry in raw.get("hits", []):
        hits.append(
            SFPIIHit(
                pii_type=str(entry.get("pii_type", "")),
                path=str(entry.get("path", "")),
                match_count=int(entry.get("match_count", 1)),
                sensitivity=str(entry.get("sensitivity", "medium")),
            )
        )
    scanned = int(raw.get("scanned", 0))
    return SFPIIScanResult(hits=hits, scanned=scanned)
312
+
313
+ # ------------------------------------------------------------------
314
+ # redact
315
+ # ------------------------------------------------------------------
316
+
317
def redact(
    self,
    event: Event,
    *,
    policy: RedactionPolicy | None = None,
) -> SFPIIRedactResult:
    """Return a sanitised copy of *event* with a redaction policy applied.

    Fields wrapped in :class:`~spanforge.redact.Redactable` whose
    sensitivity meets the policy threshold are replaced with safe marker
    strings (e.g. ``"[REDACTED:pii]"``). The original event is **not**
    mutated; the redacted event is returned inside the result.

    Args:
        event: The :class:`~spanforge.event.Event` to redact.
        policy: :class:`~spanforge.redact.RedactionPolicy` to apply. When
            omitted, ``RedactionPolicy(redacted_by="policy:sf-pii")`` is
            used.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIRedactResult` with the sanitised
        event and redaction statistics.

    Raises:
        SFServiceUnavailableError: Circuit breaker open, fallback disabled.
    """
    # Local mode is consulted first; the fallback flag alone also selects
    # the in-process implementation.
    run_locally = self._is_local_mode() or self._config.local_fallback_enabled
    handler = self._redact_local if run_locally else self._redact_remote
    return handler(event, policy=policy)
347
+
348
def _redact_local(
    self,
    event: Event,
    *,
    policy: RedactionPolicy | None,
) -> SFPIIRedactResult:
    """Apply the redaction policy in-process.

    Args:
        event: The event to redact.
        policy: Policy to apply; ``None`` selects
            ``RedactionPolicy(redacted_by="policy:sf-pii")``.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIRedactResult` copied field by
        field from the result of ``policy.apply(event)``.
    """
    from spanforge.redact import RedactionPolicy

    if policy is None:
        policy = RedactionPolicy(redacted_by="policy:sf-pii")

    outcome = policy.apply(event)
    return SFPIIRedactResult(
        event=outcome.event,
        redaction_count=outcome.redaction_count,
        redacted_at=outcome.redacted_at,
        redacted_by=outcome.redacted_by,
    )
364
+
365
def _redact_remote(
    self,
    event: Event,
    *,
    policy: RedactionPolicy | None,
) -> SFPIIRedactResult:
    """POST the policy settings to ``/pii/redact`` and decode the response.

    Args:
        event: The event to redact.
        policy: Policy whose ``min_sensitivity`` and ``redacted_by`` are
            sent; ``None`` selects
            ``RedactionPolicy(redacted_by="policy:sf-pii")``.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIRedactResult` built from the
        response; missing fields default to ``0`` / ``""`` and
        ``redacted_by`` defaults to the policy's own value.
    """
    from spanforge.redact import RedactionPolicy, Sensitivity

    effective = policy
    if effective is None:
        effective = RedactionPolicy(redacted_by="policy:sf-pii")

    # Enum thresholds are sent by value; anything else is stringified.
    threshold = effective.min_sensitivity
    if isinstance(threshold, Sensitivity):
        level = threshold.value
    else:
        level = str(threshold)

    # NOTE(review): *event* is not part of the request body, so the service
    # receives only the policy settings — confirm how it obtains the event.
    body: dict[str, Any] = {
        "min_sensitivity": level,
        "redacted_by": effective.redacted_by,
    }
    raw = self._request("POST", "/pii/redact", body=body)

    return SFPIIRedactResult(
        event=raw.get("event"),
        redaction_count=int(raw.get("redaction_count", 0)),
        redacted_at=str(raw.get("redacted_at", "")),
        redacted_by=str(raw.get("redacted_by", effective.redacted_by)),
    )
387
+
388
+ # ------------------------------------------------------------------
389
+ # contains_pii
390
+ # ------------------------------------------------------------------
391
+
392
def contains_pii(
    self,
    event: Event,
    *,
    scan_raw: bool = True,
) -> bool:
    """Report whether any unredacted PII remains in *event*.

    Both explicit :class:`~spanforge.redact.Redactable` wrappers and, when
    *scan_raw* is ``True``, raw string values (via the built-in regex
    detectors) are considered.

    Args:
        event: The :class:`~spanforge.event.Event` to inspect.
        scan_raw: When ``True`` (default), also run regex PII scanning on
            string values in the payload.

    Returns:
        ``True`` if PII is detected; ``False`` if the payload is clean.
    """
    use_remote = not (self._is_local_mode() or self._config.local_fallback_enabled)
    if use_remote:
        # NOTE(review): only scan_raw is sent; *event* is not part of the
        # request body — confirm how the service identifies the event.
        raw = self._request("POST", "/pii/contains", body={"scan_raw": scan_raw})
        return bool(raw.get("contains_pii", False))

    from spanforge.redact import contains_pii as _cp

    return _cp(event, scan_raw=scan_raw)
418
+
419
+ # ------------------------------------------------------------------
420
+ # assert_redacted
421
+ # ------------------------------------------------------------------
422
+
423
def assert_redacted(
    self,
    event: Event,
    *,
    context: str = "",
    scan_raw: bool = True,
) -> None:
    """Raise :exc:`SFPIINotRedactedError` if *event* still contains PII.

    A stricter counterpart to :meth:`contains_pii`, meant for export or
    serialisation boundaries where every piece of PII must already have
    been scrubbed before the event leaves a trusted context.

    Args:
        event: The :class:`~spanforge.event.Event` to verify.
        context: Optional label identifying the call site for correlation
            (SHA-256-hashed before use — never included raw).
        scan_raw: When ``True`` (default), also run regex scanning.

    Raises:
        SFPIINotRedactedError: If unredacted PII is detected.
    """
    if self._is_local_mode() or self._config.local_fallback_enabled:
        self._assert_redacted_local(event, context=context, scan_raw=scan_raw)
        return

    # NOTE(review): only scan_raw is sent; *event* is not part of the
    # request body — confirm how the service identifies the event.
    raw = self._request("POST", "/pii/assert-redacted", body={"scan_raw": scan_raw})
    if not raw.get("has_pii"):
        return
    # A response without a count is reported as a single finding.
    raise SFPIINotRedactedError(int(raw.get("count", 1)), context)
455
+
456
def _assert_redacted_local(
    self,
    event: Event,
    *,
    context: str,
    scan_raw: bool,
) -> None:
    """Run the in-process redaction assertion and translate its error.

    Args:
        event: The event to verify.
        context: Call-site label, passed positionally to the core check
            and again to the SDK exception.
        scan_raw: Whether the core check should also regex-scan raw strings.

    Raises:
        SFPIINotRedactedError: Raised in place of the core
            ``PIINotRedactedError``, carrying the same ``count``.
    """
    from spanforge.redact import PIINotRedactedError
    from spanforge.redact import assert_redacted as _core_assert

    try:
        _core_assert(event, context, scan_raw=scan_raw)
    except PIINotRedactedError as err:
        # Re-raise as the SDK-level exception type, chaining the original.
        raise SFPIINotRedactedError(err.count, context) from err
469
+
470
+ # ------------------------------------------------------------------
471
+ # anonymize
472
+ # ------------------------------------------------------------------
473
+
474
def anonymize(
    self,
    text: str,
    *,
    extra_patterns: dict[str, re.Pattern[str]] | None = None,
) -> SFPIIAnonymizeResult:
    """Replace detected PII in *text* with ``[REDACTED:<pii_type>]`` markers.

    The built-in pattern set (plus any *extra_patterns*) is run against the
    text. Secondary validators (Luhn for credit cards, Verhoeff for Aadhaar,
    SSA range checks for SSNs, calendar validation for dates of birth) are
    applied so that false positives are left untouched.

    Security: the original matched values are **never** returned — only the
    anonymized text, the replacement count and the list of PII type labels.

    Args:
        text: Plain text string to anonymize.
        extra_patterns: Optional ``{label: compiled_regex}`` detectors to
            run in addition to the built-in patterns.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIAnonymizeResult`.

    Raises:
        SFPIIScanError: If *text* is not a ``str``.
    """
    if not isinstance(text, str):
        raise SFPIIScanError(f"anonymize() requires a str; got {type(text).__name__}")

    if self._is_local_mode() or self._config.local_fallback_enabled:
        return self._anonymize_local(text, extra_patterns=extra_patterns)

    # NOTE(review): the remote request carries only the text, so
    # extra_patterns is ignored on this path; and a response without a
    # "text" field yields the original, un-anonymized input — confirm both.
    response = self._request("POST", "/pii/anonymize", body={"text": text})
    anonymized = str(response.get("text", text))
    count = int(response.get("replacements", 0))
    labels = list(response.get("pii_types_found", []))
    return SFPIIAnonymizeResult(
        text=anonymized,
        replacements=count,
        pii_types_found=labels,
    )
514
+
515
def _anonymize_local(
    self,
    text: str,
    *,
    extra_patterns: dict[str, re.Pattern[str]] | None,
) -> SFPIIAnonymizeResult:
    """Anonymize *text* in-process, one pattern at a time.

    Patterns are applied sequentially in dictionary order — built-in PII
    patterns, then DPDP patterns, then *extra_patterns* (later sources
    override earlier ones that share a label). Each pattern runs on the
    output of the previous one. For the credit-card, Aadhaar, SSN and
    date-of-birth labels a match is only replaced when the corresponding
    validator accepts it; rejected matches are left in the text unchanged.

    Args:
        text: Text to anonymize.
        extra_patterns: Optional ``{label: compiled_regex}`` detectors.

    Returns:
        :class:`~spanforge.sdk._types.SFPIIAnonymizeResult` with the
        rewritten text, the total number of replacements and the labels
        that produced at least one replacement, in application order.
    """
    import spanforge.redact as _redact

    def _accept(_value: str) -> bool:
        # Stand-in used when spanforge.redact lacks a validator.
        return True

    # Internal names are read with getattr so that a renamed or missing
    # attribute yields an empty pattern table instead of an error.
    patterns: dict[str, re.Pattern[str]] = dict(getattr(_redact, "_PII_PATTERNS", {}))
    patterns.update(dict(getattr(_redact, "DPDP_PATTERNS", {})))
    if extra_patterns:
        patterns.update(extra_patterns)

    # (label, validator) pairs, compared with == in the listed order.
    checks = (
        (_CC_LABEL, getattr(_redact, "_luhn_check", _accept)),
        (_AADHAAR_LABEL, getattr(_redact, "_verhoeff_check", _accept)),
        (_SSN_LABEL, getattr(_redact, "_is_valid_ssn", _accept)),
        (_DOB_LABEL, getattr(_redact, "_is_valid_date", _accept)),
    )

    redacted = text
    total = 0
    found: list[str] = []

    for label, pattern in patterns.items():
        marker = f"[REDACTED:{label}]"
        validator = next((fn for name, fn in checks if label == name), None)
        hits = 0

        def _substitute(match: re.Match[str]) -> str:
            # Invoked by Pattern.sub within this iteration only, so the
            # enclosing loop variables still refer to the current label.
            nonlocal hits
            value = match.group()
            if validator is not None and not validator(value):
                return value
            hits += 1
            return marker

        candidate = pattern.sub(_substitute, redacted)
        if hits:
            redacted = candidate
            total += hits
            # Labels are dictionary keys, so each is recorded at most once.
            found.append(label)

    return SFPIIAnonymizeResult(
        text=redacted,
        replacements=total,
        pii_types_found=found,
    )
576
+
577
+ # ------------------------------------------------------------------
578
+ # wrap
579
+ # ------------------------------------------------------------------
580
+
581
def wrap(
    self,
    value: object,
    sensitivity: str,
    pii_types: frozenset[str] = frozenset(),
) -> Redactable:
    """Wrap *value* as a :class:`~spanforge.redact.Redactable` sentinel.

    Convenience factory that creates a :class:`~spanforge.redact.Redactable`
    instance ready to embed in an event payload. The value will be replaced
    by a safe marker string when a
    :class:`~spanforge.redact.RedactionPolicy` is applied.

    Args:
        value: The PII-sensitive value to protect.
        sensitivity: Sensitivity level string: ``"low"``, ``"medium"``,
            ``"high"``, ``"pii"``, or ``"phi"``.
        pii_types: Labels describing the PII category
            (e.g. ``frozenset({"email"})``).

    Returns:
        :class:`~spanforge.redact.Redactable` wrapping *value*.

    Raises:
        SFPIIPolicyError: If *sensitivity* is not a recognised level,
            including when it is not a string at all.

    Example::

        wrapped = sf_pii.wrap("alice@example.com", "pii", frozenset({"email"}))
    """
    from spanforge.redact import Redactable, Sensitivity

    # The isinstance guard runs first so that an unhashable argument (a list
    # or dict, say) is reported as the documented SFPIIPolicyError instead of
    # the TypeError the frozenset membership test would raise.
    if not isinstance(sensitivity, str) or sensitivity not in _VALID_SENSITIVITY:
        valid = sorted(_VALID_SENSITIVITY)
        msg = f"Invalid sensitivity level {sensitivity!r}. Must be one of: {valid}"
        raise SFPIIPolicyError(msg)
    return Redactable(value, Sensitivity(sensitivity), pii_types)
618
+
619
+ # ------------------------------------------------------------------
620
+ # make_policy
621
+ # ------------------------------------------------------------------
622
+
623
def make_policy(
    self,
    *,
    min_sensitivity: str = "pii",
    redacted_by: str = "policy:sf-pii",
    replacement_template: str = "[REDACTED:{sensitivity}]",
) -> RedactionPolicy:
    """Create a configured :class:`~spanforge.redact.RedactionPolicy`.

    Args:
        min_sensitivity: Sensitivity threshold; fields at or above this
            level are redacted. Must be one of ``"low"``, ``"medium"``,
            ``"high"``, ``"pii"``, or ``"phi"``. Defaults to ``"pii"``.
        redacted_by: Identifier embedded in the redaction metadata
            (e.g. ``"policy:corp-default"``). Defaults to
            ``"policy:sf-pii"``.
        replacement_template: Marker template. Must be a string containing
            ``{sensitivity}``, which is replaced with the field's
            sensitivity level value. Defaults to
            ``"[REDACTED:{sensitivity}]"``.

    Returns:
        Configured :class:`~spanforge.redact.RedactionPolicy`.

    Raises:
        SFPIIPolicyError: If *min_sensitivity* is not recognised (including
            non-string values) or *replacement_template* is not a string
            containing ``{sensitivity}``.

    Example::

        policy = sf_pii.make_policy(min_sensitivity="high",
                                    redacted_by="my-service")
    """
    from spanforge.redact import RedactionPolicy, Sensitivity

    # Type-check before the membership test: an unhashable value would
    # otherwise escape as TypeError instead of the documented policy error.
    if not isinstance(min_sensitivity, str) or min_sensitivity not in _VALID_SENSITIVITY:
        valid = sorted(_VALID_SENSITIVITY)
        msg = f"Invalid min_sensitivity {min_sensitivity!r}. Must be one of: {valid}"
        raise SFPIIPolicyError(msg)

    # Require a real string: `in` on None raises TypeError, and on a list it
    # is an element test that a non-template value could pass.
    if not isinstance(replacement_template, str) or "{sensitivity}" not in replacement_template:
        msg = (
            "replacement_template must contain the '{sensitivity}' placeholder; "
            f"received: {replacement_template!r}"
        )
        raise SFPIIPolicyError(msg)

    return RedactionPolicy(
        min_sensitivity=Sensitivity(min_sensitivity),
        redacted_by=redacted_by,
        replacement_template=replacement_template,
    )
675
+
676
+ # ==================================================================
677
+ # Phase 3 — PII Service Hardening
678
+ # ==================================================================
679
+
680
+ # ------------------------------------------------------------------
681
+ # scan_text (PII-001)
682
+ # ------------------------------------------------------------------
683
+
684
def scan_text(
    self,
    text: str,
    *,
    language: str = "en",
    score_threshold: float = 0.5,
) -> PIITextScanResult:
    """Scan a plain-text string for PII (PII-001).

    Validates the input, records the current UTC time on
    ``self._last_scan_at`` (ISO-8601 string) and hands the work to
    :meth:`_scan_text_local`, which picks between the Presidio backend and
    the built-in regex scanner.

    Args:
        text:            Plain text to scan.
        language:        Language code forwarded to the scanner (default
                         ``"en"``).
        score_threshold: Minimum confidence score forwarded to the scanner
                         (default 0.5).

    Returns:
        :class:`~spanforge.sdk._types.PIITextScanResult`.

    Raises:
        SFPIIScanError: If *text* is not a ``str``.
    """
    if isinstance(text, str):
        # Stamp the scan time before delegating; get_status() reports it.
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        self._last_scan_at = now_utc.isoformat()
        return self._scan_text_local(
            text,
            language=language,
            score_threshold=score_threshold,
        )

    raise SFPIIScanError(f"scan_text() requires a str; got {type(text).__name__}")
718
+
719
def _scan_text_local(
    self,
    text: str,
    *,
    language: str,
    score_threshold: float,
) -> PIITextScanResult:
    """Scan *text* with Presidio when it reports itself available, else regex.

    When ``is_available()`` is true the Presidio helper is called and its
    raw entity dicts (``type``/``start``/``end``/``score`` keys) are
    converted to :class:`PIIEntity` objects. An ``ImportError`` raised on
    that path is swallowed and the regex fallback is used instead.

    Args:
        text:            Text to scan.
        language:        Language code passed to the Presidio helper.
        score_threshold: Minimum score passed to the Presidio helper.

    Returns:
        :class:`~spanforge.sdk._types.PIITextScanResult`.
    """
    from spanforge.presidio_backend import is_available, presidio_scan_text

    if is_available():
        try:
            hits, masked, found = presidio_scan_text(
                text, language=language, score_threshold=score_threshold
            )
            converted = []
            for hit in hits:
                converted.append(
                    PIIEntity(
                        type=hit["type"],
                        start=hit["start"],
                        end=hit["end"],
                        score=hit["score"],
                    )
                )
            return PIITextScanResult(
                entities=converted,
                redacted_text=masked,
                detected=found,
            )
        except ImportError:
            # Presidio could not actually be loaded: use the regex scanner.
            pass

    # Regex fallback — entities are synthesised from pattern matches.
    return self._scan_text_regex_fallback(text)
752
+
753
def _scan_text_regex_fallback(self, text: str) -> PIITextScanResult:
    """Regex-based fallback for scan_text() when Presidio is unavailable.

    Runs every pattern from ``spanforge.redact`` (``_PII_PATTERNS``,
    ``DPDP_PATTERNS``) and ``PIPL_PATTERNS`` over *text*. Credit-card,
    Aadhaar, SSN and date-of-birth matches must also pass their validator
    from ``spanforge.redact``; a missing validator accepts every match.

    Every surviving match is reported as an entity with ``score=1.0``,
    sorted by start offset. For ``redacted_text``, matches that overlap
    (different patterns hitting the same characters) are merged into one
    span and replaced by a single ``<TYPE>`` tag. Splicing overlapping
    spans one by one would use offsets that earlier replacements had
    already invalidated, corrupting the output.

    Args:
        text: Text to scan.

    Returns:
        :class:`~spanforge.sdk._types.PIITextScanResult`.
    """
    import spanforge.redact as _redact
    from spanforge.presidio_backend import PIPL_PATTERNS

    pii_patterns: dict[str, re.Pattern[str]] = dict(
        getattr(_redact, "_PII_PATTERNS", {}),
    )
    dpdp_patterns: dict[str, re.Pattern[str]] = dict(
        getattr(_redact, "DPDP_PATTERNS", {}),
    )
    all_patterns = {**pii_patterns, **dpdp_patterns, **PIPL_PATTERNS}

    # Validators are optional in spanforge.redact; default to "accept".
    _luhn = getattr(_redact, "_luhn_check", lambda _s: True)
    _verhoeff = getattr(_redact, "_verhoeff_check", lambda _s: True)
    _valid_ssn = getattr(_redact, "_is_valid_ssn", lambda _s: True)
    _valid_date = getattr(_redact, "_is_valid_date", lambda _s: True)

    entities: list[PIIEntity] = []
    for label, pat in all_patterns.items():
        for m in pat.finditer(text):
            val = m.group()
            if label == _CC_LABEL and not _luhn(val):
                continue
            if label == _AADHAAR_LABEL and not _verhoeff(val):
                continue
            if label == _SSN_LABEL and not _valid_ssn(val):
                continue
            if label == _DOB_LABEL and not _valid_date(val):
                continue
            entities.append(PIIEntity(type=label, start=m.start(), end=m.end(), score=1.0))

    entities.sort(key=lambda e: e.start)

    # Merge overlapping matches into [start, end, label] spans. On equal
    # starts the longest match comes first, so it supplies the label.
    spans: list[list[Any]] = []
    for ent in sorted(entities, key=lambda e: (e.start, -e.end)):
        if spans and ent.start < spans[-1][1]:
            spans[-1][1] = max(spans[-1][1], ent.end)
        else:
            spans.append([ent.start, ent.end, ent.type])

    # Single left-to-right pass over the ORIGINAL text: offsets never go stale.
    pieces: list[str] = []
    cursor = 0
    for start, end, label in spans:
        pieces.append(text[cursor:start])
        pieces.append(f"<{label.upper()}>")
        cursor = end
    pieces.append(text[cursor:])

    return PIITextScanResult(
        entities=entities,
        redacted_text="".join(pieces),
        detected=bool(entities),
    )
797
+
798
+ # ------------------------------------------------------------------
799
+ # anonymise (PII-002) — British spelling, dict input
800
+ # ------------------------------------------------------------------
801
+
802
def anonymise(
    self,
    payload: dict[str, Any],
    *,
    max_depth: int = 10,
) -> PIIAnonymisedResult:
    """Recursively anonymise the string fields of *payload* (PII-002).

    The traversal is done by :meth:`_anonymise_walk`, which returns a
    cleaned copy and appends one manifest entry per detected entity. The
    manifest stores a SHA-256 hash of each original value, never the value
    itself.

    Per the original documentation, this method replaces the custom
    Presidio pipeline in HallucCheck v5.0 §14 (leaderboard anonymisation).

    Args:
        payload:   Dictionary to anonymise. Must be a :class:`dict`.
        max_depth: Maximum nesting depth (default 10).

    Returns:
        :class:`~spanforge.sdk._types.PIIAnonymisedResult` with
        ``clean_payload`` and ``redaction_manifest``.

    Raises:
        SFPIIScanError: If *payload* is not a ``dict``.
    """
    if isinstance(payload, dict):
        # The walk fills this list in place while building the clean copy.
        entries: list[PIIRedactionManifestEntry] = []
        scrubbed = self._anonymise_walk(
            payload,
            path="",
            depth=0,
            max_depth=max_depth,
            manifest=entries,
        )
        return PIIAnonymisedResult(
            clean_payload=scrubbed,
            redaction_manifest=entries,
        )

    raise SFPIIScanError(
        f"anonymise() requires a dict payload; got {type(payload).__name__}"
    )
840
+
841
def _anonymise_walk(
    self,
    obj: Any,
    *,
    path: str,
    depth: int,
    max_depth: int,
    manifest: list[PIIRedactionManifestEntry],
) -> Any:
    """Return an anonymised copy of *obj*, recording redactions in *manifest*.

    Dicts and lists are rebuilt with each child processed one level
    deeper. Strings are scanned with :meth:`_scan_text_local`. Any other
    value is returned as is, as is anything found once *depth* exceeds
    *max_depth* (NOTE: such subtrees are therefore not scanned).

    Args:
        obj:       Value to process.
        path:      Dotted/indexed location of *obj* (``a.b[0]``); empty at
                   the root.
        depth:     Current nesting depth.
        max_depth: Depth beyond which values are returned untouched.
        manifest:  List extended in place with one entry per entity found.

    Returns:
        The cleaned value.
    """
    if depth > max_depth:
        return obj

    def _descend(child: Any, child_path: str) -> Any:
        # One level deeper, sharing the same manifest and depth budget.
        return self._anonymise_walk(
            child,
            path=child_path,
            depth=depth + 1,
            max_depth=max_depth,
            manifest=manifest,
        )

    if isinstance(obj, dict):
        cleaned: dict[Any, Any] = {}
        for key, child in obj.items():
            child_path = f"{path}.{key}" if path else str(key)
            cleaned[key] = _descend(child, child_path)
        return cleaned

    if isinstance(obj, list):
        return [_descend(child, f"{path}[{idx}]") for idx, child in enumerate(obj)]

    if not isinstance(obj, str):
        return obj

    scan = self._scan_text_local(obj, language="en", score_threshold=0.5)
    if not scan.detected:
        return obj

    masked = scan.redacted_text
    for hit in scan.entities:
        # Only a digest of the original substring goes into the manifest.
        digest = hashlib.sha256(obj[hit.start : hit.end].encode()).hexdigest()
        manifest.append(
            PIIRedactionManifestEntry(
                field_path=path,
                type=hit.type,
                original_hash=digest,
                replacement=f"<{hit.type.upper()}>",
            )
        )
    return masked
892
+
893
+ # ------------------------------------------------------------------
894
+ # scan_batch (PII-003)
895
+ # ------------------------------------------------------------------
896
+
897
def scan_batch(
    self,
    texts: list[str],
    *,
    language: str = "en",
    score_threshold: float = 0.5,
    max_workers: int = 8,
) -> list[PIITextScanResult]:
    """Scan a list of texts for PII in parallel (PII-003).

    Each text is scanned with :meth:`_scan_text_local` on a thread pool.
    The pool is sized to ``min(max_workers, len(texts))`` and never below
    one worker, so a non-positive *max_workers* degrades to a serial scan
    instead of raising ``ValueError`` from ``ThreadPoolExecutor``.

    Args:
        texts:           List of plain text strings to scan.
        language:        Language code (default ``"en"``).
        score_threshold: Minimum confidence score (default 0.5).
        max_workers:     Upper bound on thread pool size (default 8).

    Returns:
        List of :class:`~spanforge.sdk._types.PIITextScanResult` in the
        same order as *texts*; an empty list for empty input.

    Raises:
        SFPIIScanError: If *texts* is not a list or any element is not a
            ``str``.
    """
    if not isinstance(texts, list):
        msg = f"scan_batch() requires a list; got {type(texts).__name__}"
        raise SFPIIScanError(msg)
    for i, t in enumerate(texts):
        if not isinstance(t, str):
            msg = f"scan_batch() element [{i}] must be str; got {type(t).__name__}"
            raise SFPIIScanError(msg)

    if not texts:
        return []

    def _scan_one(text: str) -> PIITextScanResult:
        return self._scan_text_local(text, language=language, score_threshold=score_threshold)

    # ThreadPoolExecutor rejects max_workers <= 0, so clamp to at least one.
    worker_count = max(1, min(max_workers, len(texts)))
    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as ex:
        futures = [ex.submit(_scan_one, t) for t in texts]
        # Collect in submission order so results line up with *texts*.
        return [f.result() for f in futures]
941
+
942
+ # ------------------------------------------------------------------
943
+ # apply_pipeline_action (PII-010 / PII-011 / PII-012)
944
+ # ------------------------------------------------------------------
945
+
946
def apply_pipeline_action(
    self,
    text: str,
    *,
    action: str = "flag",
    threshold: float = _DEFAULT_PIPELINE_THRESHOLD,
    language: str = "en",
) -> PIIPipelineResult:
    """Apply pipeline pii_action routing to *text* (PII-010/011/012).

    The text is scanned with a score threshold of 0.0 so that every hit is
    seen. Hits are then split on *threshold* and *action* is enforced:

    * ``"flag"``   — the original text is returned; ``detected`` reports
      whether any hit reached *threshold*.
    * ``"redact"`` — the redacted text is returned as ``text`` when PII
      was detected.
    * ``"block"``  — raise :exc:`~spanforge.sdk._exceptions.SFPIIBlockedError`
      (HTTP 422 ``PII_DETECTED``) when PII was detected.

    Only entities with ``score >= threshold`` trigger the action.
    Sub-threshold hits are returned in ``low_confidence_hits`` for audit.

    ``redacted_text`` is built from above-threshold entities only.
    Overlapping entities are merged into one span and replaced by a single
    ``<TYPE>`` tag. Splicing overlapping spans individually would use
    stale offsets and corrupt the text.

    Args:
        text:      Input text to scan.
        action:    Pipeline action: ``"flag"``, ``"redact"``, or
                   ``"block"``. Default: ``"flag"``.
        threshold: Confidence threshold (default
                   ``_DEFAULT_PIPELINE_THRESHOLD``). Entities below this
                   score are recorded but do not trigger the action.
        language:  Language code for the scanner (default ``"en"``).

    Returns:
        :class:`~spanforge.sdk._types.PIIPipelineResult`.

    Raises:
        SFPIIScanError:    If *text* is not a ``str`` or *action* is
            invalid.
        SFPIIBlockedError: If *action* is ``"block"`` and PII is
            detected above *threshold*.
    """
    if not isinstance(text, str):
        msg = f"apply_pipeline_action() requires a str; got {type(text).__name__}"
        raise SFPIIScanError(msg)
    if action not in _VALID_PIPELINE_ACTIONS:
        valid = sorted(_VALID_PIPELINE_ACTIONS)
        msg = f"Invalid action {action!r}. Must be one of: {valid}"
        raise SFPIIScanError(msg)

    # Threshold 0.0: collect everything, then partition locally.
    scan_result = self._scan_text_local(text, language=language, score_threshold=0.0)

    above = [e for e in scan_result.entities if e.score >= threshold]
    below = [e for e in scan_result.entities if e.score < threshold]
    detected = bool(above)
    entity_types = sorted({e.type for e in above})

    # Merge overlapping above-threshold hits into [start, end, type] spans.
    # On equal starts the longest hit comes first, so it supplies the tag.
    spans: list[list[Any]] = []
    for ent in sorted(above, key=lambda e: (e.start, -e.end)):
        if spans and ent.start < spans[-1][1]:
            spans[-1][1] = max(spans[-1][1], ent.end)
        else:
            spans.append([ent.start, ent.end, ent.type])

    # One left-to-right pass over the original text keeps offsets valid.
    pieces: list[str] = []
    cursor = 0
    for start, end, etype in spans:
        pieces.append(text[cursor:start])
        pieces.append(f"<{etype.upper()}>")
        cursor = end
    pieces.append(text[cursor:])
    redacted = "".join(pieces)

    if action == "block" and detected:
        raise SFPIIBlockedError(entity_types=entity_types, count=len(above))

    effective_text = redacted if action == "redact" and detected else text

    return PIIPipelineResult(
        text=effective_text,
        action=action,
        detected=detected,
        entity_types=entity_types,
        low_confidence_hits=below,
        redacted_text=redacted,
        blocked=False,
    )
1017
+
1018
+ # ------------------------------------------------------------------
1019
+ # scan_async (F-10)
1020
+ # ------------------------------------------------------------------
1021
+
1022
async def scan_async(
    self,
    text: str,
    *,
    language: str = "en",
    score_threshold: float = 0.5,
) -> PIITextScanResult:
    """Async variant of :meth:`scan_text` (F-10).

    Runs :meth:`scan_text` on the running loop's default executor via
    ``loop.run_in_executor``, so the call can be awaited without blocking
    the event loop.

    Args:
        text:            Plain text to scan.
        language:        Language code passed to :meth:`scan_text`.
        score_threshold: Minimum confidence score passed to :meth:`scan_text`.

    Returns:
        :class:`~spanforge.sdk._types.PIITextScanResult` — same as
        :meth:`scan_text`.

    Raises:
        SFPIIScanError: Propagated from :meth:`scan_text` when *text* is
            not a ``str``.
    """
    import functools

    # Inside a coroutine a loop is always running; get_running_loop() is
    # the recommended accessor and never creates a loop implicitly.
    loop = asyncio.get_running_loop()
    call = functools.partial(
        self.scan_text, text, language=language, score_threshold=score_threshold
    )
    return await loop.run_in_executor(None, call)
1053
+
1054
+ # ------------------------------------------------------------------
1055
+ # get_status (PII-005)
1056
+ # ------------------------------------------------------------------
1057
+
1058
def get_status(self) -> PIIStatusInfo:
    """Return sf-pii service status (PII-005).

    Reports whether the Presidio backend is available, the sorted union of
    pattern labels found in ``spanforge.redact`` (``_PII_PATTERNS``,
    ``DPDP_PATTERNS``) and ``PIPL_PATTERNS``, and the timestamp stored on
    ``self._last_scan_at`` (``None`` when the attribute is not set). The
    ``status`` field is always ``"ok"``.

    Per the original documentation, this contributes the ``sf_pii`` field
    for ``GET /v1/spanforge/status``.

    Returns:
        :class:`~spanforge.sdk._types.PIIStatusInfo`.
    """
    from spanforge.presidio_backend import PIPL_PATTERNS, is_available

    backend_ready = is_available()

    import spanforge.redact as _redact

    # Collect pattern labels; a missing table simply contributes nothing.
    labels: set[str] = set()
    for table_name in ("_PII_PATTERNS", "DPDP_PATTERNS"):
        labels.update(dict(getattr(_redact, table_name, {})))
    labels.update(PIPL_PATTERNS)

    last_scan = getattr(self, "_last_scan_at", None)
    return PIIStatusInfo(
        status="ok",
        presidio_available=backend_ready,
        entity_types_loaded=sorted(labels),
        last_scan_at=last_scan,
    )
1083
+
1084
+ # ------------------------------------------------------------------
1085
+ # erase_subject (PII-021 — GDPR Article 17)
1086
+ # ------------------------------------------------------------------
1087
+
1088
def erase_subject(
    self,
    subject_id: str,
    project_id: str,
) -> ErasureReceipt:
    """Issue a GDPR Article 17 Right to Erasure for *subject_id* (PII-021).

    Generates an erasure id and a UTC timestamp, asks
    :meth:`_local_erase_subject` to scrub matching records from the
    in-process store, and returns a receipt carrying the count. The
    receipt's ``exceptions`` list is always empty here.

    NOTE(review): the receipt is built with *subject_id* as passed in;
    confirm whether ``ErasureReceipt`` hashes it before it is logged or
    persisted.

    Args:
        subject_id: Opaque data subject identifier.
        project_id: Project scope for the erasure.

    Returns:
        :class:`~spanforge.sdk._types.ErasureReceipt`.

    Raises:
        SFPIIError: If *subject_id* or *project_id* is empty.
    """
    if not (subject_id and project_id):
        raise SFPIIError("erase_subject() requires non-empty subject_id and project_id")

    # Id and timestamp are taken before the store is touched.
    receipt_id = str(uuid.uuid4())
    stamped_at = datetime.datetime.now(datetime.timezone.utc).isoformat()

    # In local mode, events are located in the in-process store.
    scrubbed = self._local_erase_subject(subject_id, project_id)

    return ErasureReceipt(
        subject_id=subject_id,
        project_id=project_id,
        records_erased=scrubbed,
        erasure_id=receipt_id,
        erased_at=stamped_at,
        exceptions=[],
    )
1131
+
1132
def _local_erase_subject(self, subject_id: str, project_id: str) -> int:
    """Scrub ``subject_id`` from matching events in the local trace store.

    Walks every event in the default ``TraceStore`` while holding its
    lock. For each event whose payload has both the given ``subject_id``
    and ``project_id``, the ``subject_id`` key is removed from the payload.

    The operation is best-effort and never raises. If the store cannot be
    loaded or the walk fails part-way, the failure is logged (without the
    subject id) and the number of records scrubbed so far is returned, so
    the caller's receipt reflects what was actually changed.

    Args:
        subject_id: Data subject identifier to match.
        project_id: Project scope to match.

    Returns:
        Number of event payloads from which ``subject_id`` was removed.
    """
    import logging

    erased = 0
    try:
        from spanforge._store import TraceStore

        store = TraceStore.get_default()  # type: ignore[attr-defined]
        with store._lock:
            for trace_events in store._traces.values():
                for ev in trace_events:
                    payload = getattr(ev, "payload", {}) or {}
                    if (
                        payload.get("subject_id") == subject_id
                        and payload.get("project_id") == project_id
                    ):
                        # Mark for erasure — drop the identifying field.
                        payload.pop("subject_id", None)
                        erased += 1
    except Exception:
        # Stay best-effort, but do not hide the failure or lose the count.
        logging.getLogger(__name__).warning(
            "erase_subject: local store erasure failed after %d record(s)",
            erased,
            exc_info=True,
        )
    return erased
1154
+
1155
+ # ------------------------------------------------------------------
1156
+ # export_subject_data (PII-022 — CCPA DSAR)
1157
+ # ------------------------------------------------------------------
1158
+
1159
def export_subject_data(
    self,
    subject_id: str,
    project_id: str,
) -> DSARExport:
    """Export all data for *subject_id* for a CCPA DSAR request (PII-022).

    Generates an export id and a UTC timestamp, gathers event summaries
    through :meth:`_local_collect_subject_events`, and packages them with
    their count. Per the original documentation this backs
    ``GET /v1/privacy/dsar/{subject_id}``.

    Args:
        subject_id: Opaque data subject identifier.
        project_id: Project scope.

    Returns:
        :class:`~spanforge.sdk._types.DSARExport`.

    Raises:
        SFPIIError: If *subject_id* or *project_id* is empty.
    """
    if not (subject_id and project_id):
        raise SFPIIError(
            "export_subject_data() requires non-empty subject_id and project_id"
        )

    # Id and timestamp are fixed before the store is read.
    package_id = str(uuid.uuid4())
    stamped_at = datetime.datetime.now(datetime.timezone.utc).isoformat()
    matches = self._local_collect_subject_events(subject_id, project_id)

    return DSARExport(
        subject_id=subject_id,
        project_id=project_id,
        event_count=len(matches),
        export_id=package_id,
        exported_at=stamped_at,
        events=matches,
    )
1196
+
1197
def _local_collect_subject_events(
    self, subject_id: str, project_id: str
) -> list[dict[str, Any]]:
    """Summarise local-store events whose payload references the subject.

    Walks the default ``TraceStore`` under its lock and, for every event
    whose payload carries both *subject_id* and *project_id*, emits a dict
    with the stringified ``event_id``, ``event_type`` and ``timestamp``
    attributes plus the project id. Any exception yields an empty list.

    Args:
        subject_id: Data subject identifier to match.
        project_id: Project scope to match.

    Returns:
        List of event summary dicts (empty on failure or no match).
    """

    def _belongs(event: Any) -> bool:
        # A missing or empty payload is treated as an empty mapping.
        body = getattr(event, "payload", {}) or {}
        if body.get("subject_id") != subject_id:
            return False
        return body.get("project_id") == project_id

    try:
        from spanforge._store import TraceStore

        store = TraceStore.get_default()  # type: ignore[attr-defined]
        summaries: list[dict[str, Any]] = []
        with store._lock:
            for bucket in store._traces.values():
                for event in bucket:
                    if not _belongs(event):
                        continue
                    summaries.append(
                        {
                            "event_id": str(getattr(event, "event_id", "")),
                            "event_type": str(getattr(event, "event_type", "")),
                            "timestamp": str(getattr(event, "timestamp", "")),
                            "project_id": project_id,
                        }
                    )
    except Exception:
        return []
    return summaries
1226
+
1227
+ # ------------------------------------------------------------------
1228
+ # safe_harbor_deidentify (PII-023 — HIPAA Safe Harbor)
1229
+ # ------------------------------------------------------------------
1230
+
1231
def safe_harbor_deidentify(self, text: str) -> SafeHarborResult:
    """Apply HIPAA Safe Harbor de-identification to *text* (PII-023).

    Runs the patterns in ``_SAFE_HARBOR_PATTERNS`` over the text in a
    fixed order, each pass working on the output of the previous one:

    1. ``age_over_89`` matches → ``"90+"``
    2. ``zip`` matches → first 3 characters of capture group 1 + ``"XX"``
    3. ``date`` matches → the first 19xx/20xx year found in the match, or
       ``"[DATE]"`` when there is none
    4. every remaining pattern, in table order → ``"[REMOVED]"``

    Args:
        text: Input text.

    Returns:
        :class:`~spanforge.sdk._types.SafeHarborResult` with the rewritten
        text, the total substitution count and the labels that matched (in
        processing order).

    Raises:
        SFPIIScanError: If *text* is not a ``str``.
    """
    if not isinstance(text, str):
        raise SFPIIScanError(
            f"safe_harbor_deidentify() requires a str; got {type(text).__name__}"
        )

    def _year_only(m: re.Match[str]) -> str:
        # Keep just a 4-digit year from the matched date, if one exists.
        found = re.search(r"(19|20)\d{2}", m.group())
        return found.group() if found else "[DATE]"

    # Generalising replacements, in the order they must run.
    special = {
        "age_over_89": lambda _m: "90+",
        "zip": lambda m: m.group(1)[:3] + "XX",
        "date": _year_only,
    }

    scrubbed = text
    total = 0
    matched_labels: list[str] = []

    def _run(label: str, pattern: Any, replacement: Any) -> None:
        nonlocal scrubbed, total
        scrubbed, hits = re.subn(pattern, replacement, scrubbed)
        if hits:
            total += hits
            matched_labels.append(label)

    for label, replacement in special.items():
        _run(label, _SAFE_HARBOR_PATTERNS[label], replacement)

    # Everything else is removed outright.
    for label, pattern in _SAFE_HARBOR_PATTERNS.items():
        if label not in special:
            _run(label, pattern, "[REMOVED]")

    return SafeHarborResult(
        text=scrubbed,
        replacements=total,
        phi_types_found=matched_labels,
    )
1317
+
1318
+ # ------------------------------------------------------------------
1319
+ # audit_training_data (PII-025 — EU AI Act Article 10)
1320
+ # ------------------------------------------------------------------
1321
+
1322
def audit_training_data(
    self,
    dataset_path: str | Path,
    *,
    max_records: int = 100_000,
) -> TrainingDataPIIReport:
    """Batch-scan a dataset file for PII prevalence (PII-025).

    Reads the file line by line as UTF-8 (undecodable bytes replaced),
    skipping blank lines and stopping once *max_records* records have been
    counted. A line starting with ``{`` is parsed as JSON and the
    top-level string values are joined with spaces for scanning; if that
    fails the raw line is scanned. Any other line is scanned as plain
    text. Scanning uses :meth:`_scan_text_local` with language ``"en"``
    and threshold 0.5.

    Args:
        dataset_path: Path to the dataset file.
        max_records:  Maximum number of records to scan (default 100 000).

    Returns:
        :class:`~spanforge.sdk._types.TrainingDataPIIReport` with record
        totals, per-entity-type counts and the prevalence percentage
        rounded to two decimals (0.0 for an empty dataset).

    Raises:
        SFPIIScanError: If the file does not exist or cannot be read.
    """
    source = Path(dataset_path)
    if not source.exists():
        raise SFPIIScanError(f"audit_training_data(): file not found: {source}")

    report_id = str(uuid.uuid4())
    generated_at = datetime.datetime.now(datetime.timezone.utc).isoformat()

    def _scannable(raw: str) -> str:
        # JSONL record: scan only its top-level string values.
        if not raw.startswith("{"):
            return raw
        try:
            record = json.loads(raw)
            return " ".join(str(v) for v in record.values() if isinstance(v, str))
        except (json.JSONDecodeError, AttributeError):
            return raw

    seen = 0
    flagged = 0
    tally: dict[str, int] = {}

    try:
        with source.open(encoding="utf-8", errors="replace") as fh:
            for raw_line in fh:
                if seen >= max_records:
                    break
                stripped = raw_line.strip()
                if not stripped:
                    continue
                seen += 1

                outcome = self._scan_text_local(
                    _scannable(stripped), language="en", score_threshold=0.5
                )
                if not outcome.detected:
                    continue
                flagged += 1
                for hit in outcome.entities:
                    tally[hit.type] = tally.get(hit.type, 0) + 1
    except OSError as exc:
        raise SFPIIScanError(
            f"audit_training_data(): cannot read {source}: {exc}"
        ) from exc

    share = round(flagged / seen * 100, 2) if seen else 0.0

    return TrainingDataPIIReport(
        dataset_path=str(source),
        total_records=seen,
        pii_records=flagged,
        prevalence_pct=share,
        entity_counts=tally,
        report_id=report_id,
        generated_at=generated_at,
    )
1396
+
1397
+ # ------------------------------------------------------------------
1398
+ # get_pii_stats (PII-032 — PII heat map)
1399
+ # ------------------------------------------------------------------
1400
+
1401
def get_pii_stats(
    self,
    project_id: str,
    *,
    entity_type: str | None = None,
    days: int = 30,
) -> list[PIIHeatMapEntry]:
    """Return PII detection stats for the dashboard heat map (PII-032).

    Walks the default ``TraceStore`` under its lock and sums the ``count``
    payload field (default 1) of every event that belongs to *project_id*,
    has ``event_class == "pii_detection"`` and a timestamp within the last
    *days* days, grouped by ``(date, entity_type)``. Events with an
    unparseable timestamp are skipped. Timestamps without a UTC offset are
    assumed to be UTC, so they can be compared with the timezone-aware
    cutoff instead of aborting the aggregation.

    The store walk is best-effort: any unexpected error ends it and
    whatever was aggregated so far is returned.

    Per the original documentation this is exposed via
    ``GET /v1/pii/stats`` (Team+ tier).

    Args:
        project_id:  Project to aggregate stats for.
        entity_type: Optional filter — only return entries for this type.
        days:        Look-back window in days (default 30).

    Returns:
        List of :class:`~spanforge.sdk._types.PIIHeatMapEntry` items
        sorted by date descending, then entity_type ascending.

    Raises:
        SFPIIError: If *project_id* is empty.
    """
    if not project_id:
        msg = "get_pii_stats() requires a non-empty project_id"
        raise SFPIIError(msg)

    utc = datetime.timezone.utc
    cutoff = datetime.datetime.now(utc) - datetime.timedelta(days=days)
    aggregated: dict[tuple[str, str], int] = {}

    try:
        from spanforge._store import TraceStore

        store = TraceStore.get_default()  # type: ignore[attr-defined]
        with store._lock:
            for trace_events in store._traces.values():
                for ev in trace_events:
                    payload = getattr(ev, "payload", {}) or {}
                    if payload.get("project_id") != project_id:
                        continue
                    if payload.get("event_class") != "pii_detection":
                        continue
                    ts_str = str(getattr(ev, "timestamp", ""))
                    try:
                        ts = datetime.datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
                    except (ValueError, TypeError):
                        continue
                    if ts.tzinfo is None:
                        # Naive vs aware comparison raises TypeError;
                        # treat offset-less timestamps as UTC.
                        ts = ts.replace(tzinfo=utc)
                    if ts < cutoff:
                        continue
                    date_str = ts.strftime("%Y-%m-%d")
                    etype = str(payload.get("entity_type", "unknown"))
                    if entity_type and etype != entity_type:
                        continue
                    key = (date_str, etype)
                    aggregated[key] = aggregated.get(key, 0) + int(payload.get("count", 1))
    except Exception:  # nosec B110
        # Deliberate best-effort: stats must not break the caller.
        pass

    # Two stable passes: entity_type ascending first, then date descending.
    # A reversed stable sort keeps the entity_type order within each date.
    ordered = sorted(aggregated.items(), key=lambda item: item[0][1])
    ordered.sort(key=lambda item: item[0][0], reverse=True)

    return [
        PIIHeatMapEntry(
            project_id=project_id,
            entity_type=etype,
            date=date_str,
            count=count,
        )
        for (date_str, etype), count in ordered
    ]