spanforge 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. spanforge/__init__.py +815 -0
  2. spanforge/_ansi.py +93 -0
  3. spanforge/_batch_exporter.py +409 -0
  4. spanforge/_cli.py +2094 -0
  5. spanforge/_cli_audit.py +639 -0
  6. spanforge/_cli_compliance.py +711 -0
  7. spanforge/_cli_cost.py +243 -0
  8. spanforge/_cli_ops.py +791 -0
  9. spanforge/_cli_phase11.py +356 -0
  10. spanforge/_hooks.py +337 -0
  11. spanforge/_server.py +1708 -0
  12. spanforge/_span.py +1036 -0
  13. spanforge/_store.py +288 -0
  14. spanforge/_stream.py +664 -0
  15. spanforge/_trace.py +335 -0
  16. spanforge/_tracer.py +254 -0
  17. spanforge/actor.py +141 -0
  18. spanforge/alerts.py +469 -0
  19. spanforge/auto.py +464 -0
  20. spanforge/baseline.py +335 -0
  21. spanforge/cache.py +635 -0
  22. spanforge/compliance.py +325 -0
  23. spanforge/config.py +532 -0
  24. spanforge/consent.py +228 -0
  25. spanforge/consumer.py +377 -0
  26. spanforge/core/__init__.py +5 -0
  27. spanforge/core/compliance_mapping.py +1254 -0
  28. spanforge/cost.py +600 -0
  29. spanforge/debug.py +548 -0
  30. spanforge/deprecations.py +205 -0
  31. spanforge/drift.py +482 -0
  32. spanforge/egress.py +58 -0
  33. spanforge/eval.py +648 -0
  34. spanforge/event.py +1064 -0
  35. spanforge/exceptions.py +240 -0
  36. spanforge/explain.py +178 -0
  37. spanforge/export/__init__.py +69 -0
  38. spanforge/export/append_only.py +337 -0
  39. spanforge/export/cloud.py +357 -0
  40. spanforge/export/datadog.py +497 -0
  41. spanforge/export/grafana.py +320 -0
  42. spanforge/export/jsonl.py +195 -0
  43. spanforge/export/openinference.py +158 -0
  44. spanforge/export/otel_bridge.py +294 -0
  45. spanforge/export/otlp.py +811 -0
  46. spanforge/export/otlp_bridge.py +233 -0
  47. spanforge/export/redis_backend.py +282 -0
  48. spanforge/export/siem_schema.py +98 -0
  49. spanforge/export/siem_splunk.py +264 -0
  50. spanforge/export/siem_syslog.py +212 -0
  51. spanforge/export/webhook.py +299 -0
  52. spanforge/exporters/__init__.py +30 -0
  53. spanforge/exporters/console.py +271 -0
  54. spanforge/exporters/jsonl.py +144 -0
  55. spanforge/exporters/sqlite.py +142 -0
  56. spanforge/gate.py +1150 -0
  57. spanforge/governance.py +181 -0
  58. spanforge/hitl.py +295 -0
  59. spanforge/http.py +187 -0
  60. spanforge/inspect.py +427 -0
  61. spanforge/integrations/__init__.py +45 -0
  62. spanforge/integrations/_pricing.py +280 -0
  63. spanforge/integrations/anthropic.py +388 -0
  64. spanforge/integrations/azure_openai.py +133 -0
  65. spanforge/integrations/bedrock.py +292 -0
  66. spanforge/integrations/crewai.py +251 -0
  67. spanforge/integrations/gemini.py +351 -0
  68. spanforge/integrations/groq.py +442 -0
  69. spanforge/integrations/langchain.py +349 -0
  70. spanforge/integrations/langgraph.py +306 -0
  71. spanforge/integrations/llamaindex.py +373 -0
  72. spanforge/integrations/ollama.py +287 -0
  73. spanforge/integrations/openai.py +368 -0
  74. spanforge/integrations/together.py +483 -0
  75. spanforge/io.py +214 -0
  76. spanforge/lint.py +322 -0
  77. spanforge/metrics.py +417 -0
  78. spanforge/metrics_export.py +343 -0
  79. spanforge/migrate.py +402 -0
  80. spanforge/model_registry.py +278 -0
  81. spanforge/models.py +389 -0
  82. spanforge/namespaces/__init__.py +254 -0
  83. spanforge/namespaces/audit.py +256 -0
  84. spanforge/namespaces/cache.py +237 -0
  85. spanforge/namespaces/chain.py +77 -0
  86. spanforge/namespaces/confidence.py +72 -0
  87. spanforge/namespaces/consent.py +92 -0
  88. spanforge/namespaces/cost.py +179 -0
  89. spanforge/namespaces/decision.py +143 -0
  90. spanforge/namespaces/diff.py +157 -0
  91. spanforge/namespaces/drift.py +80 -0
  92. spanforge/namespaces/eval_.py +251 -0
  93. spanforge/namespaces/feedback.py +241 -0
  94. spanforge/namespaces/fence.py +193 -0
  95. spanforge/namespaces/guard.py +105 -0
  96. spanforge/namespaces/hitl.py +91 -0
  97. spanforge/namespaces/latency.py +72 -0
  98. spanforge/namespaces/prompt.py +190 -0
  99. spanforge/namespaces/redact.py +173 -0
  100. spanforge/namespaces/retrieval.py +379 -0
  101. spanforge/namespaces/runtime_governance.py +494 -0
  102. spanforge/namespaces/template.py +208 -0
  103. spanforge/namespaces/tool_call.py +77 -0
  104. spanforge/namespaces/trace.py +1029 -0
  105. spanforge/normalizer.py +171 -0
  106. spanforge/plugins.py +82 -0
  107. spanforge/presidio_backend.py +349 -0
  108. spanforge/processor.py +258 -0
  109. spanforge/prompt_registry.py +418 -0
  110. spanforge/py.typed +0 -0
  111. spanforge/redact.py +914 -0
  112. spanforge/regression.py +192 -0
  113. spanforge/runtime_policy.py +159 -0
  114. spanforge/sampling.py +511 -0
  115. spanforge/schema.py +183 -0
  116. spanforge/schemas/v1.0/schema.json +170 -0
  117. spanforge/schemas/v2.0/schema.json +536 -0
  118. spanforge/sdk/__init__.py +625 -0
  119. spanforge/sdk/_base.py +584 -0
  120. spanforge/sdk/_base.pyi +71 -0
  121. spanforge/sdk/_exceptions.py +1096 -0
  122. spanforge/sdk/_types.py +2184 -0
  123. spanforge/sdk/alert.py +1514 -0
  124. spanforge/sdk/alert.pyi +56 -0
  125. spanforge/sdk/audit.py +1196 -0
  126. spanforge/sdk/audit.pyi +67 -0
  127. spanforge/sdk/cec.py +1215 -0
  128. spanforge/sdk/cec.pyi +37 -0
  129. spanforge/sdk/config.py +641 -0
  130. spanforge/sdk/config.pyi +55 -0
  131. spanforge/sdk/enterprise.py +714 -0
  132. spanforge/sdk/enterprise.pyi +79 -0
  133. spanforge/sdk/explain.py +170 -0
  134. spanforge/sdk/fallback.py +432 -0
  135. spanforge/sdk/feedback.py +351 -0
  136. spanforge/sdk/gate.py +874 -0
  137. spanforge/sdk/gate.pyi +51 -0
  138. spanforge/sdk/identity.py +2114 -0
  139. spanforge/sdk/identity.pyi +47 -0
  140. spanforge/sdk/lineage.py +175 -0
  141. spanforge/sdk/observe.py +1065 -0
  142. spanforge/sdk/observe.pyi +50 -0
  143. spanforge/sdk/operator.py +338 -0
  144. spanforge/sdk/pii.py +1473 -0
  145. spanforge/sdk/pii.pyi +119 -0
  146. spanforge/sdk/pipelines.py +458 -0
  147. spanforge/sdk/pipelines.pyi +39 -0
  148. spanforge/sdk/policy.py +930 -0
  149. spanforge/sdk/rag.py +594 -0
  150. spanforge/sdk/rbac.py +280 -0
  151. spanforge/sdk/registry.py +430 -0
  152. spanforge/sdk/registry.pyi +46 -0
  153. spanforge/sdk/scope.py +279 -0
  154. spanforge/sdk/secrets.py +293 -0
  155. spanforge/sdk/secrets.pyi +25 -0
  156. spanforge/sdk/security.py +560 -0
  157. spanforge/sdk/security.pyi +57 -0
  158. spanforge/sdk/trust.py +472 -0
  159. spanforge/sdk/trust.pyi +41 -0
  160. spanforge/secrets.py +799 -0
  161. spanforge/signing.py +1179 -0
  162. spanforge/stats.py +100 -0
  163. spanforge/stream.py +560 -0
  164. spanforge/testing.py +378 -0
  165. spanforge/testing_mocks.py +1052 -0
  166. spanforge/trace.py +199 -0
  167. spanforge/types.py +696 -0
  168. spanforge/ulid.py +300 -0
  169. spanforge/validate.py +379 -0
  170. spanforge-1.0.0.dist-info/METADATA +1509 -0
  171. spanforge-1.0.0.dist-info/RECORD +174 -0
  172. spanforge-1.0.0.dist-info/WHEEL +4 -0
  173. spanforge-1.0.0.dist-info/entry_points.txt +5 -0
  174. spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
@@ -0,0 +1,171 @@
1
+ """spanforge.normalizer — ProviderNormalizer Protocol and GenericNormalizer.
2
+
3
+ Defines the :class:`ProviderNormalizer` structural protocol (RFC-0001 §10.4)
4
+ that provider-specific integration modules must satisfy, plus a
5
+ :class:`GenericNormalizer` fallback that handles OpenAI-compatible,
6
+ Anthropic-compatible, and raw ``dict`` response shapes without requiring
7
+ any vendored SDK.
8
+
9
+ Usage
10
+ -----
11
+ ::
12
+
13
+ from spanforge.normalizer import GenericNormalizer
14
+
15
+ normalizer = GenericNormalizer()
16
+ token_usage, model_info, cost = normalizer.normalize_response(raw_response)
17
+
18
+ RFC reference
19
+ -------------
20
+ RFC-0001-SPANFORGE §10.4 — Provider Normalizer interface mandate.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ from typing import Any, Protocol, runtime_checkable
26
+
27
+ from spanforge.namespaces.trace import CostBreakdown, ModelInfo, TokenUsage
28
+
29
+ __all__: list[str] = ["GenericNormalizer", "ProviderNormalizer"]
30
+
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Protocol
34
+ # ---------------------------------------------------------------------------
35
+
36
+
37
@runtime_checkable
class ProviderNormalizer(Protocol):
    """Structural interface for provider-specific response normalizers.

    This is the protocol referenced as RFC-0001 §10.4. Conformance is
    structural: any object exposing a compatible ``normalize_response``
    method qualifies, and no inheritance from this class is needed.
    Because the class is decorated with ``runtime_checkable``,
    ``isinstance(obj, ProviderNormalizer)`` can be used; that check only
    verifies the method is present, not its signature.

    Known implementors
    ------------------
    * :class:`GenericNormalizer` — zero-dependency fallback for
      OpenAI-compatible and Anthropic-compatible shapes.
    * ``spanforge.integrations.openai.OpenAINormalizer`` (when available)
    * ``spanforge.integrations.anthropic.AnthropicNormalizer`` (when available)
    """

    def normalize_response(
        self,
        response: object,
    ) -> tuple[TokenUsage, ModelInfo, CostBreakdown | None]:
        """Convert a raw LLM response into typed SpanForge value objects.

        Parameters
        ----------
        response:
            Raw response object or dict produced by a provider SDK call.

        Returns
        -------
        tuple[TokenUsage, ModelInfo, CostBreakdown | None]
            Token usage, model information and, when pricing data is
            available, a cost breakdown (``None`` otherwise).
        """
        ...  # pragma: no cover — Protocol method, never called directly.
71
+
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Generic fallback implementation
75
+ # ---------------------------------------------------------------------------
76
+
77
+ _UNKNOWN = "_custom"
78
+
79
+
80
def _get(obj: Any, *keys: str, default: Any = None) -> Any:
    """Walk *keys* through nested dicts and/or objects, one level per key.

    At each level a ``dict`` is read with ``.get(key)``; anything else is
    read with ``getattr(value, key, None)``.  The walk stops early and
    returns *default* as soon as an intermediate value is ``None``; a final
    value of ``None`` is likewise replaced by *default*.  Falsy but
    non-``None`` results (``0``, ``""``) are returned unchanged.
    """
    current = obj
    for name in keys:
        if current is None:
            return default
        if isinstance(current, dict):
            current = current.get(name)
        else:
            current = getattr(current, name, None)
    if current is None:
        return default
    return current
87
+
88
+
89
class GenericNormalizer:
    """Zero-dependency fallback normalizer for common LLM response shapes.

    Supports three structural layouts without requiring any provider SDK:

    1. **OpenAI-compatible** — ``response.usage.{prompt_tokens,
       completion_tokens, total_tokens}``, ``response.model``.  Cached and
       reasoning token counts are also read from the nested
       ``prompt_tokens_details`` / ``completion_tokens_details`` objects
       (and ``input_tokens_details`` / ``output_tokens_details``) when the
       flat keys are absent.
    2. **Anthropic-compatible** — ``response.usage.{input_tokens,
       output_tokens, cache_read_input_tokens,
       cache_creation_input_tokens}``, ``response.model``.
    3. **Raw dict** — any dict with keys from either layout above.

    When nothing matches, every count falls back to zero and the model name
    to ``"unknown"``, so the caller always receives a
    :class:`~spanforge.namespaces.trace.TokenUsage` and
    :class:`~spanforge.namespaces.trace.ModelInfo`.
    """

    def normalize_response(
        self,
        response: object,
    ) -> tuple[TokenUsage, ModelInfo, CostBreakdown | None]:
        """Normalise *response* into typed SpanForge value objects.

        Parameters
        ----------
        response:
            Raw provider response — may be a dataclass, SDK response object,
            or plain ``dict``.

        Returns
        -------
        tuple[TokenUsage, ModelInfo, CostBreakdown | None]
            Typed value objects; the third element is always ``None``
            because this generic normalizer has no pricing data.

        Raises
        ------
        ValueError, TypeError
            Propagated from ``int()`` if a usage field holds a value that
            cannot be converted to an integer.
        """
        usage = _get(response, "usage")

        def _first_count(*paths: tuple[str, ...]) -> int:
            """Return the first truthy count found along *paths*, else 0."""
            for path in paths:
                value = _get(usage, *path, default=0)
                if value:
                    return int(value)
            return 0

        # ---------- token counts ----------
        # OpenAI layout: prompt_tokens / completion_tokens / total_tokens
        # Anthropic layout: input_tokens / output_tokens
        input_tokens = _first_count(("prompt_tokens",), ("input_tokens",))
        output_tokens = _first_count(("completion_tokens",), ("output_tokens",))
        # Derive the total when the provider does not report one.
        total_tokens = _first_count(("total_tokens",)) or (input_tokens + output_tokens)

        # Flat keys first (unchanged precedence), then the nested detail
        # objects used by OpenAI-style usage payloads.
        cached_tokens = _first_count(
            ("cached_tokens",),
            ("cache_read_input_tokens",),
            ("prompt_tokens_details", "cached_tokens"),
            ("input_tokens_details", "cached_tokens"),
        )
        cache_creation_tokens = _first_count(("cache_creation_input_tokens",))
        reasoning_tokens = _first_count(
            ("reasoning_tokens",),
            ("completion_tokens_details", "reasoning_tokens"),
            ("output_tokens_details", "reasoning_tokens"),
        )

        # Optional counters are passed as None (not 0) when absent.
        token_usage = TokenUsage(
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            total_tokens=total_tokens,
            cached_tokens=cached_tokens or None,
            cache_creation_tokens=cache_creation_tokens or None,
            reasoning_tokens=reasoning_tokens or None,
        )

        # ---------- model info ----------
        model_name: str = str(
            _get(response, "model", default="")
            or _get(response, "model_id", default="")
            or "unknown"
        )

        model_info = ModelInfo(
            system=_UNKNOWN,
            name=model_name,
            response_model=model_name,
            custom_system_name="generic",
        )

        return token_usage, model_info, None
spanforge/plugins.py ADDED
@@ -0,0 +1,82 @@
1
+ """spanforge.plugins — Entry-point plugin discovery.
2
+
3
+ Provides a single function :func:`discover` that loads objects registered via
4
+ Python packaging entry points. Handles the ``importlib.metadata`` API split
5
+ between Python 3.9 (returns a ``dict``) and Python 3.10+ (returns a
6
+ ``SelectableGroups`` object with ``.select()``), so callers never need to
7
+ write the version-gate themselves.
8
+
9
+ Usage::
10
+
11
+ from spanforge.plugins import discover
12
+
13
+ # Load all scorers registered under the "spanforge.scorers" group
14
+ scorers = discover("spanforge.scorers")
15
+
16
+ # Typical pattern: build a name → instance registry
17
+ registry = {}
18
+ for obj in discover("my_tool.plugins"):
19
+ if callable(obj):
20
+ instance = obj()
21
+ registry[getattr(instance, "name", type(instance).__name__)] = instance
22
+
23
+ Entry-point registration example (``pyproject.toml``)::
24
+
25
+ [project.entry-points."spanforge.scorers"]
26
+ my_scorer = "my_package.scorers:MyScorer"
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import sys
32
+ from typing import Any
33
+
34
+ __all__ = ["discover"]
35
+
36
+
37
def discover(group: str) -> list[Any]:
    """Discover and load all entry points registered under *group*.

    Each registered entry point is loaded (its object is imported and
    returned).  Entry points that fail to load are skipped so that a broken
    third-party plugin cannot crash the host application; each failure is
    reported at ``DEBUG`` level on this module's logger so it can still be
    diagnosed.

    Args:
        group: The entry-point group name (e.g. ``"spanforge.scorers"``).

    Returns:
        A list of loaded objects (classes, instances, functions — whatever the
        entry point points at), in the order yielded by
        ``importlib.metadata``.  An empty list is returned when the group
        is unknown or the entry-point lookup itself fails.

    Example::

        for scorer_cls in discover("spanforge.scorers"):
            print(scorer_cls.__name__)
    """
    # Imported locally so this function does not depend on a module-level
    # logging import.
    import logging

    logger = logging.getLogger(__name__)

    try:
        from importlib.metadata import entry_points

        if sys.version_info >= (3, 10):
            # Selection by keyword is supported from Python 3.10 onwards.
            eps = entry_points(group=group)
        else:
            # Python 3.9: entry_points() returns a plain dict keyed by group.
            all_eps = entry_points()
            eps = all_eps.get(group, []) if isinstance(all_eps, dict) else []
    except Exception:
        logger.debug("Entry-point lookup failed for group %r", group, exc_info=True)
        return []

    loaded: list[Any] = []
    for ep in eps:
        try:
            loaded.append(ep.load())
        except Exception:
            # Best-effort by design: record the failure and carry on.
            logger.debug(
                "Skipping entry point %r in group %r: load failed",
                getattr(ep, "name", ep),
                group,
                exc_info=True,
            )
    return loaded
@@ -0,0 +1,349 @@
1
+ """spanforge.presidio_backend — Optional Presidio-powered PII detection backend.
2
+
3
+ Wraps Microsoft Presidio AnalyzerEngine to provide entity recognition that
4
+ is more accurate than regex-only scanning. Falls back gracefully if the
5
+ ``presidio-analyzer`` package is not installed.
6
+
7
+ Install with::
8
+
9
+ pip install "spanforge[presidio]"
10
+
11
+ Usage::
12
+
13
+ from spanforge.presidio_backend import presidio_scan_payload, is_available
14
+
15
+ if is_available():
16
+ result = presidio_scan_payload({"message": "My SSN is 123-45-6789"})
17
+ print(result.clean) # False
18
+
19
+ The result is a standard :class:`~spanforge.redact.PIIScanResult`, fully
20
+ compatible with the built-in regex scanner.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import os
26
+ import re
27
+ from collections.abc import Mapping
28
+ from typing import Any
29
+
30
+ # Prevent transformers (pulled in as a Presidio optional dep) from importing
31
+ # TensorFlow. TF has a protobuf registration bug on Python 3.13 that raises
32
+ # ValueError at import time and breaks the entire Presidio initialisation.
33
+ os.environ.setdefault("USE_TF", "0")
34
+ os.environ.setdefault("TRANSFORMERS_NO_TF", "1")
35
+
36
+ from spanforge.redact import PIIScanHit, PIIScanResult
37
+
38
+ __all__ = [
39
+ "PIPL_PATTERNS",
40
+ "is_available",
41
+ "presidio_scan_payload",
42
+ "presidio_scan_text",
43
+ ]
44
+
45
+ # ---------------------------------------------------------------------------
46
+ # Availability check
47
+ # ---------------------------------------------------------------------------
48
+
49
+ # Module-level cached AnalyzerEngine — built once on first successful call.
50
+ _analyzer: Any = None
51
+ _analyzer_available: bool | None = None # None = not yet tested
52
+
53
+
54
def _get_analyzer() -> Any:
    """Return a lazily-created Presidio AnalyzerEngine configured for spaCy.

    Explicitly configures ``en_core_web_lg`` so that Presidio never falls back
    to the transformers NLP engine (which would trigger a TensorFlow import and
    crash on Python 3.13 due to a protobuf double-registration bug).

    The engine is stored in the module-level ``_analyzer`` cache only after
    every custom recognizer has been registered, so a failure part-way
    through never leaves a half-configured engine behind for later callers.

    Raises:
        ImportError: If ``presidio-analyzer`` is not installed.
        OSError: If ``en_core_web_lg`` is not installed (raised by the NLP
            engine; not visible in this module — confirm against spaCy).
    """
    global _analyzer
    if _analyzer is not None:
        return _analyzer

    from presidio_analyzer import AnalyzerEngine, PatternRecognizer
    from presidio_analyzer.nlp_engine import NlpEngineProvider
    from presidio_analyzer.pattern import Pattern

    configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
    }
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()
    engine = AnalyzerEngine(nlp_engine=nlp_engine)

    # Custom high-precision pattern recognizers that supplement the built-in
    # recognizers where Presidio's default confidence is too low.
    custom_recognizers = [
        # US phone formats that the built-in recognizer scores at 0.4 (below
        # the default 0.5 threshold).
        PatternRecognizer(
            supported_entity="PHONE_NUMBER",
            patterns=[
                Pattern("US_PHONE_INTL", r"\+1[-.\s]\d{3}[-.\s]\d{3}[-.\s]\d{4}\b", 0.75),
                Pattern("US_PHONE_PAREN", r"\(\d{3}\)\s*\d{3}[-.\s]\d{4}\b", 0.75),
                Pattern("US_PHONE_PLAIN", r"\b\d{3}[-.\s]\d{3}[-.\s]\d{4}\b", 0.60),
            ],
            supported_language="en",
        ),
        # Indian Aadhaar (12-digit UID in groups of 4) for English-locale corpora.
        PatternRecognizer(
            supported_entity="IN_AADHAAR",
            patterns=[Pattern("AADHAAR", r"\b\d{4}[ \-]\d{4}[ \-]\d{4}\b", 0.85)],
            supported_language="en",
        ),
        # Indian PAN (Permanent Account Number: AAAAA9999A format).
        PatternRecognizer(
            supported_entity="IN_PAN",
            patterns=[Pattern("IN_PAN", r"\b[A-Z]{5}\d{4}[A-Z]\b", 0.85)],
            supported_language="en",
        ),
        # UK National Insurance Number (e.g. AB 12 34 56 C / AB123456C).
        PatternRecognizer(
            supported_entity="UK_NATIONAL_INSURANCE",
            patterns=[
                Pattern(
                    "UK_NI",
                    r"\b[A-Z]{2}[\s]?\d{2}[\s]?\d{2}[\s]?\d{2}[\s]?[A-D]\b",
                    0.85,
                )
            ],
            supported_language="en",
        ),
    ]
    for recognizer in custom_recognizers:
        engine.registry.add_recognizer(recognizer)

    # Publish to the cache only once the engine is fully configured.
    _analyzer = engine
    return engine
135
+
136
+
137
def is_available() -> bool:
    """Report whether the Presidio analyzer can be constructed.

    The first call attempts to build the analyzer via ``_get_analyzer()``;
    any exception is treated as "not available".  The outcome is stored in
    the module-level ``_analyzer_available`` flag and reused by every later
    call, so the probe runs at most once per process.
    """
    global _analyzer_available
    if _analyzer_available is None:
        try:
            _get_analyzer()
        except Exception:  # noqa: BLE001 — ImportError, OSError, ValueError, etc.
            _analyzer_available = False
        else:
            _analyzer_available = True
    return _analyzer_available
148
+
149
+
150
+ # ---------------------------------------------------------------------------
151
+ # PII-024 — China PIPL sensitive personal information patterns
152
+ # ---------------------------------------------------------------------------
153
+
154
#: Compiled regexes for China PIPL sensitive personal information, keyed by
#: the label reported for a match.
PIPL_PATTERNS: dict[str, re.Pattern[str]] = {
    label: re.compile(expression)
    for label, expression in (
        # Resident Identity Card: 17 digits followed by a digit or 'X'/'x'.
        ("cn_national_id", r"\b\d{17}[\dXx]\b"),
        # Mobile number: '1', then a digit 3-9, then nine more digits.
        ("cn_mobile", r"\b1[3-9]\d{9}\b"),
        # Bank card: 16-19 digits, each optionally followed by a space or
        # hyphen.  The regex itself performs no checksum validation.
        ("cn_bank_card", r"\b(?:\d[ -]?){15,18}\d\b"),
    )
}

#: Labels classified as PIPL-sensitive (exactly the keys of PIPL_PATTERNS).
PIPL_SENSITIVE_TYPES: frozenset[str] = frozenset(PIPL_PATTERNS)

# Presidio entity type -> (SpanForge PII label, sensitivity level).
# DATE_TIME, LOCATION, NRP (nationality), and URL are deliberately left out:
# they fire excessively on technical log strings (timestamps, cloud regions,
# registry paths), producing unacceptable false-positive rates in production.
_ENTITY_MAP: dict[str, tuple[str, str]] = dict(
    [
        ("CREDIT_CARD", ("credit_card", "high")),
        ("CRYPTO", ("crypto_address", "medium")),
        ("EMAIL_ADDRESS", ("email", "medium")),
        ("IBAN_CODE", ("iban", "high")),
        ("IP_ADDRESS", ("ip_address", "low")),
        ("PERSON", ("person_name", "medium")),
        ("PHONE_NUMBER", ("phone", "medium")),
        ("US_SSN", ("ssn", "high")),
        ("UK_NHS", ("uk_nhs", "high")),
        ("US_DRIVER_LICENSE", ("us_driver_license", "high")),
        ("US_PASSPORT", ("us_passport", "high")),
        ("IN_AADHAAR", ("aadhaar", "high")),
        ("IN_PAN", ("pan", "high")),
        ("MEDICAL_LICENSE", ("medical_license", "medium")),
        ("UK_NATIONAL_INSURANCE", ("uk_national_insurance", "high")),
    ]
)

# Entity allow-list handed to every AnalyzerEngine.analyze() call, in the
# same order as _ENTITY_MAP; anything not mapped above is never requested.
_SCAN_ENTITIES: list[str] = list(_ENTITY_MAP)
193
+
194
+
195
+ # ---------------------------------------------------------------------------
196
+ # Public API
197
+ # ---------------------------------------------------------------------------
198
+
199
+
200
# ASCII digits only — str.isdigit() would also accept other Unicode digits.
_ASCII_DIGITS = frozenset("0123456789")


def _in_longer_dotted_sequence(text: str, start: int, end: int) -> bool:
    """Return True if ``text[start:end]`` is glued to more dotted-decimal digits.

    A neighbouring digit always counts.  A neighbouring ``"."`` counts only
    when a digit sits on its far side, so ordinary punctuation such as a
    sentence-ending period does not disqualify the match.
    """
    if start > 0:
        before = text[start - 1]
        if before in _ASCII_DIGITS:
            return True
        if before == "." and start > 1 and text[start - 2] in _ASCII_DIGITS:
            return True
    if end < len(text):
        after = text[end]
        if after in _ASCII_DIGITS:
            return True
        if after == "." and end + 1 < len(text) and text[end + 1] in _ASCII_DIGITS:
            return True
    return False


def _keep_presidio_result(text: str, result: Any) -> bool:
    """Decide whether one analyzer result survives the false-positive filters.

    * ``PERSON`` — rejected when the matched text equals its own lowercase
      form (lowercase technical identifiers such as ``"tenant_id"``).
    * ``IP_ADDRESS`` without a ``":"`` — must be exactly four integer
      segments in ``[0, 255]`` and must not be part of a longer
      dotted-decimal sequence such as an OID (``2.16.840.1.101.3.4.2.1``).
    """
    matched = text[result.start : result.end]
    if result.entity_type == "PERSON" and matched == matched.lower():
        return False
    if result.entity_type == "IP_ADDRESS" and ":" not in matched:
        parts = matched.split(".")
        try:
            if len(parts) != 4 or not all(0 <= int(p) <= 255 for p in parts):
                return False
        except ValueError:
            return False
        if _in_longer_dotted_sequence(text, result.start, result.end):
            return False
    return True


def presidio_scan_payload(
    payload: dict[str, Any],
    *,
    language: str = "en",
    score_threshold: float = 0.5,
    max_depth: int = 10,
) -> PIIScanResult:
    """Scan a payload dict for PII using Microsoft Presidio.

    Walks the payload recursively (up to *max_depth*), analysing every string
    value with the Presidio ``AnalyzerEngine``.  Mappings, lists and tuples
    are descended into; other value types and mapping keys are not scanned.

    **Security**: detected values are never returned — only the entity type,
    path, count, and sensitivity level.

    Args:
        payload: The dictionary to scan.
        language: Language code for analysis (default: ``"en"``).
        score_threshold: Minimum Presidio confidence score (default: 0.5).
        max_depth: Maximum nesting depth (default: 10).

    Returns:
        A :class:`~spanforge.redact.PIIScanResult` summarising detections,
        with one hit per (path, entity type) pair.

    Raises:
        ImportError: If ``presidio-analyzer`` is not installed.
    """
    analyzer = _get_analyzer()
    hits: list[PIIScanHit] = []
    scanned = 0

    def _scan_string(text: str, path: str) -> None:
        """Analyse one string value and append a hit per entity type found."""
        results = analyzer.analyze(
            text=text,
            language=language,
            score_threshold=score_threshold,
            entities=_SCAN_ENTITIES,
        )
        # Count surviving results per entity type (first-seen order).
        entity_counts: dict[str, int] = {}
        for result in results:
            if _keep_presidio_result(text, result):
                entity_counts[result.entity_type] = entity_counts.get(result.entity_type, 0) + 1
        for entity_type, count in entity_counts.items():
            label, sensitivity = _ENTITY_MAP.get(entity_type, (entity_type.lower(), "medium"))
            hits.append(
                PIIScanHit(
                    pii_type=label,
                    path=path,
                    match_count=count,
                    sensitivity=sensitivity,
                )
            )

    def _walk(obj: Any, path: str, depth: int) -> None:
        nonlocal scanned
        if depth > max_depth:
            return
        if isinstance(obj, str):
            scanned += 1
            _scan_string(obj, path)
        elif isinstance(obj, Mapping):
            for k, v in obj.items():
                _walk(v, f"{path}.{k}" if path else str(k), depth + 1)
        elif isinstance(obj, (list, tuple)):
            for i, v in enumerate(obj):
                _walk(v, f"{path}[{i}]", depth + 1)

    _walk(payload, "", 0)
    return PIIScanResult(hits=hits, scanned=scanned)
298
+
299
+
300
def presidio_scan_text(
    text: str,
    *,
    language: str = "en",
    score_threshold: float = 0.5,
) -> tuple[list[dict[str, Any]], str, bool]:
    """Scan a plain text string for PII using Microsoft Presidio.

    Returns a tuple of ``(entities, redacted_text, detected)`` where
    *entities* is a list of ``{"type", "start", "end", "score"}`` dicts
    sorted by start offset, and *redacted_text* replaces each detected
    entity with ``<TYPE>``.

    Overlapping detections are handled safely: a span fully covered by an
    earlier one adds no extra placeholder, and a span that only partially
    overlaps still has its uncovered tail replaced, so no character inside
    any detected span appears in *redacted_text*.  Every detection is still
    listed in *entities*.

    **Security**: raw entity values are never included — only type, position,
    and confidence score.

    Args:
        text: The text to scan.
        language: Language code for analysis (default: ``"en"``).
        score_threshold: Minimum Presidio confidence score (default: 0.5).

    Returns:
        ``(entities, redacted_text, detected)`` tuple.

    Raises:
        ImportError: If ``presidio-analyzer`` is not installed.
    """
    analyzer = _get_analyzer()
    results = analyzer.analyze(
        text=text,
        language=language,
        score_threshold=score_threshold,
        entities=_SCAN_ENTITIES,
    )

    entities: list[dict[str, Any]] = [
        {
            "type": _ENTITY_MAP.get(r.entity_type, (r.entity_type.lower(), "medium"))[0],
            "start": r.start,
            "end": r.end,
            "score": round(float(r.score), 4),
        }
        for r in sorted(results, key=lambda r: r.start)
    ]

    # Assemble the redacted text in one left-to-right pass.  ``cursor`` marks
    # the end of the text already emitted or redacted; at equal start
    # offsets the longest span goes first so shorter duplicates are skipped.
    pieces: list[str] = []
    cursor = 0
    for ent in sorted(entities, key=lambda e: (e["start"], -e["end"])):
        start, end = ent["start"], ent["end"]
        if end <= cursor and start < cursor:
            continue  # entirely inside a span that is already redacted
        if start > cursor:
            pieces.append(text[cursor:start])
        pieces.append(f"<{ent['type'].upper()}>")
        cursor = max(cursor, end)
    pieces.append(text[cursor:])
    redacted = "".join(pieces)

    return entities, redacted, bool(entities)