spanforge 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spanforge/__init__.py +815 -0
- spanforge/_ansi.py +93 -0
- spanforge/_batch_exporter.py +409 -0
- spanforge/_cli.py +2094 -0
- spanforge/_cli_audit.py +639 -0
- spanforge/_cli_compliance.py +711 -0
- spanforge/_cli_cost.py +243 -0
- spanforge/_cli_ops.py +791 -0
- spanforge/_cli_phase11.py +356 -0
- spanforge/_hooks.py +337 -0
- spanforge/_server.py +1708 -0
- spanforge/_span.py +1036 -0
- spanforge/_store.py +288 -0
- spanforge/_stream.py +664 -0
- spanforge/_trace.py +335 -0
- spanforge/_tracer.py +254 -0
- spanforge/actor.py +141 -0
- spanforge/alerts.py +469 -0
- spanforge/auto.py +464 -0
- spanforge/baseline.py +335 -0
- spanforge/cache.py +635 -0
- spanforge/compliance.py +325 -0
- spanforge/config.py +532 -0
- spanforge/consent.py +228 -0
- spanforge/consumer.py +377 -0
- spanforge/core/__init__.py +5 -0
- spanforge/core/compliance_mapping.py +1254 -0
- spanforge/cost.py +600 -0
- spanforge/debug.py +548 -0
- spanforge/deprecations.py +205 -0
- spanforge/drift.py +482 -0
- spanforge/egress.py +58 -0
- spanforge/eval.py +648 -0
- spanforge/event.py +1064 -0
- spanforge/exceptions.py +240 -0
- spanforge/explain.py +178 -0
- spanforge/export/__init__.py +69 -0
- spanforge/export/append_only.py +337 -0
- spanforge/export/cloud.py +357 -0
- spanforge/export/datadog.py +497 -0
- spanforge/export/grafana.py +320 -0
- spanforge/export/jsonl.py +195 -0
- spanforge/export/openinference.py +158 -0
- spanforge/export/otel_bridge.py +294 -0
- spanforge/export/otlp.py +811 -0
- spanforge/export/otlp_bridge.py +233 -0
- spanforge/export/redis_backend.py +282 -0
- spanforge/export/siem_schema.py +98 -0
- spanforge/export/siem_splunk.py +264 -0
- spanforge/export/siem_syslog.py +212 -0
- spanforge/export/webhook.py +299 -0
- spanforge/exporters/__init__.py +30 -0
- spanforge/exporters/console.py +271 -0
- spanforge/exporters/jsonl.py +144 -0
- spanforge/exporters/sqlite.py +142 -0
- spanforge/gate.py +1150 -0
- spanforge/governance.py +181 -0
- spanforge/hitl.py +295 -0
- spanforge/http.py +187 -0
- spanforge/inspect.py +427 -0
- spanforge/integrations/__init__.py +45 -0
- spanforge/integrations/_pricing.py +280 -0
- spanforge/integrations/anthropic.py +388 -0
- spanforge/integrations/azure_openai.py +133 -0
- spanforge/integrations/bedrock.py +292 -0
- spanforge/integrations/crewai.py +251 -0
- spanforge/integrations/gemini.py +351 -0
- spanforge/integrations/groq.py +442 -0
- spanforge/integrations/langchain.py +349 -0
- spanforge/integrations/langgraph.py +306 -0
- spanforge/integrations/llamaindex.py +373 -0
- spanforge/integrations/ollama.py +287 -0
- spanforge/integrations/openai.py +368 -0
- spanforge/integrations/together.py +483 -0
- spanforge/io.py +214 -0
- spanforge/lint.py +322 -0
- spanforge/metrics.py +417 -0
- spanforge/metrics_export.py +343 -0
- spanforge/migrate.py +402 -0
- spanforge/model_registry.py +278 -0
- spanforge/models.py +389 -0
- spanforge/namespaces/__init__.py +254 -0
- spanforge/namespaces/audit.py +256 -0
- spanforge/namespaces/cache.py +237 -0
- spanforge/namespaces/chain.py +77 -0
- spanforge/namespaces/confidence.py +72 -0
- spanforge/namespaces/consent.py +92 -0
- spanforge/namespaces/cost.py +179 -0
- spanforge/namespaces/decision.py +143 -0
- spanforge/namespaces/diff.py +157 -0
- spanforge/namespaces/drift.py +80 -0
- spanforge/namespaces/eval_.py +251 -0
- spanforge/namespaces/feedback.py +241 -0
- spanforge/namespaces/fence.py +193 -0
- spanforge/namespaces/guard.py +105 -0
- spanforge/namespaces/hitl.py +91 -0
- spanforge/namespaces/latency.py +72 -0
- spanforge/namespaces/prompt.py +190 -0
- spanforge/namespaces/redact.py +173 -0
- spanforge/namespaces/retrieval.py +379 -0
- spanforge/namespaces/runtime_governance.py +494 -0
- spanforge/namespaces/template.py +208 -0
- spanforge/namespaces/tool_call.py +77 -0
- spanforge/namespaces/trace.py +1029 -0
- spanforge/normalizer.py +171 -0
- spanforge/plugins.py +82 -0
- spanforge/presidio_backend.py +349 -0
- spanforge/processor.py +258 -0
- spanforge/prompt_registry.py +418 -0
- spanforge/py.typed +0 -0
- spanforge/redact.py +914 -0
- spanforge/regression.py +192 -0
- spanforge/runtime_policy.py +159 -0
- spanforge/sampling.py +511 -0
- spanforge/schema.py +183 -0
- spanforge/schemas/v1.0/schema.json +170 -0
- spanforge/schemas/v2.0/schema.json +536 -0
- spanforge/sdk/__init__.py +625 -0
- spanforge/sdk/_base.py +584 -0
- spanforge/sdk/_base.pyi +71 -0
- spanforge/sdk/_exceptions.py +1096 -0
- spanforge/sdk/_types.py +2184 -0
- spanforge/sdk/alert.py +1514 -0
- spanforge/sdk/alert.pyi +56 -0
- spanforge/sdk/audit.py +1196 -0
- spanforge/sdk/audit.pyi +67 -0
- spanforge/sdk/cec.py +1215 -0
- spanforge/sdk/cec.pyi +37 -0
- spanforge/sdk/config.py +641 -0
- spanforge/sdk/config.pyi +55 -0
- spanforge/sdk/enterprise.py +714 -0
- spanforge/sdk/enterprise.pyi +79 -0
- spanforge/sdk/explain.py +170 -0
- spanforge/sdk/fallback.py +432 -0
- spanforge/sdk/feedback.py +351 -0
- spanforge/sdk/gate.py +874 -0
- spanforge/sdk/gate.pyi +51 -0
- spanforge/sdk/identity.py +2114 -0
- spanforge/sdk/identity.pyi +47 -0
- spanforge/sdk/lineage.py +175 -0
- spanforge/sdk/observe.py +1065 -0
- spanforge/sdk/observe.pyi +50 -0
- spanforge/sdk/operator.py +338 -0
- spanforge/sdk/pii.py +1473 -0
- spanforge/sdk/pii.pyi +119 -0
- spanforge/sdk/pipelines.py +458 -0
- spanforge/sdk/pipelines.pyi +39 -0
- spanforge/sdk/policy.py +930 -0
- spanforge/sdk/rag.py +594 -0
- spanforge/sdk/rbac.py +280 -0
- spanforge/sdk/registry.py +430 -0
- spanforge/sdk/registry.pyi +46 -0
- spanforge/sdk/scope.py +279 -0
- spanforge/sdk/secrets.py +293 -0
- spanforge/sdk/secrets.pyi +25 -0
- spanforge/sdk/security.py +560 -0
- spanforge/sdk/security.pyi +57 -0
- spanforge/sdk/trust.py +472 -0
- spanforge/sdk/trust.pyi +41 -0
- spanforge/secrets.py +799 -0
- spanforge/signing.py +1179 -0
- spanforge/stats.py +100 -0
- spanforge/stream.py +560 -0
- spanforge/testing.py +378 -0
- spanforge/testing_mocks.py +1052 -0
- spanforge/trace.py +199 -0
- spanforge/types.py +696 -0
- spanforge/ulid.py +300 -0
- spanforge/validate.py +379 -0
- spanforge-1.0.0.dist-info/METADATA +1509 -0
- spanforge-1.0.0.dist-info/RECORD +174 -0
- spanforge-1.0.0.dist-info/WHEEL +4 -0
- spanforge-1.0.0.dist-info/entry_points.txt +5 -0
- spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
spanforge/normalizer.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""spanforge.normalizer — ProviderNormalizer Protocol and GenericNormalizer.
|
|
2
|
+
|
|
3
|
+
Defines the :class:`ProviderNormalizer` structural protocol (RFC-0001 §10.4)
|
|
4
|
+
that provider-specific integration modules must satisfy, plus a
|
|
5
|
+
:class:`GenericNormalizer` fallback that handles OpenAI-compatible,
|
|
6
|
+
Anthropic-compatible, and raw ``dict`` response shapes without requiring
|
|
7
|
+
any vendored SDK.
|
|
8
|
+
|
|
9
|
+
Usage
|
|
10
|
+
-----
|
|
11
|
+
::
|
|
12
|
+
|
|
13
|
+
from spanforge.normalizer import GenericNormalizer
|
|
14
|
+
|
|
15
|
+
normalizer = GenericNormalizer()
|
|
16
|
+
token_usage, model_info, cost = normalizer.normalize_response(raw_response)
|
|
17
|
+
|
|
18
|
+
RFC reference
|
|
19
|
+
-------------
|
|
20
|
+
RFC-0001-SPANFORGE §10.4 — Provider Normalizer interface mandate.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from typing import Any, Protocol, runtime_checkable
|
|
26
|
+
|
|
27
|
+
from spanforge.namespaces.trace import CostBreakdown, ModelInfo, TokenUsage
|
|
28
|
+
|
|
29
|
+
__all__: list[str] = ["GenericNormalizer", "ProviderNormalizer"]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Protocol
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@runtime_checkable
class ProviderNormalizer(Protocol):
    """Structural interface that every provider response normalizer satisfies.

    Conformance is purely structural: any object exposing a compatible
    ``normalize_response`` method can be plugged into the SpanForge
    instrumentation pipeline as a normalizer, without inheriting from this
    class.  Because the protocol is ``runtime_checkable``, ``isinstance``
    checks verify only that the method exists, not its signature.

    Implementors
    ------------
    * :class:`GenericNormalizer` — OpenAI-compatible + Anthropic-compatible
      shapes; zero-dependency fallback.
    * ``spanforge.integrations.openai.OpenAINormalizer`` (when available)
    * ``spanforge.integrations.anthropic.AnthropicNormalizer`` (when available)
    """

    def normalize_response(
        self, response: object
    ) -> tuple[TokenUsage, ModelInfo, CostBreakdown | None]:
        """Turn a raw LLM response into typed SpanForge value objects.

        Parameters
        ----------
        response:
            Raw response object or dict from a provider SDK call.

        Returns
        -------
        tuple[TokenUsage, ModelInfo, CostBreakdown | None]
            A 3-tuple of typed value objects.  The ``CostBreakdown`` slot is
            ``None`` when pricing data is unavailable.
        """
        ...  # pragma: no cover — Protocol method, never called directly.
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# Generic fallback implementation
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
_UNKNOWN = "_custom"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _get(obj: Any, *keys: str, default: Any = None) -> Any:
|
|
81
|
+
"""Attribute-then-dict key lookup — tolerates both objects and dicts."""
|
|
82
|
+
for key in keys:
|
|
83
|
+
if obj is None:
|
|
84
|
+
return default
|
|
85
|
+
obj = obj.get(key) if isinstance(obj, dict) else getattr(obj, key, None)
|
|
86
|
+
return obj if obj is not None else default
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class GenericNormalizer:
    """Zero-dependency fallback normalizer for common LLM response shapes.

    Supports three structural layouts without requiring any provider SDK:

    1. **OpenAI-compatible** — ``response.usage.{prompt_tokens,
       completion_tokens, total_tokens}``, ``response.model``.  Cached and
       reasoning token counts are also read from the nested
       ``usage.prompt_tokens_details.cached_tokens`` and
       ``usage.completion_tokens_details.reasoning_tokens`` fields.
    2. **Anthropic-compatible** — ``response.usage.{input_tokens,
       output_tokens}``, ``response.model``.
    3. **Raw dict** — any dict with keys from either layout above.

    When neither layout matches, zero-value defaults are used for the token
    counts and ``"unknown"`` for the model name, so the caller still receives
    a :class:`~spanforge.namespaces.trace.TokenUsage` regardless of the
    provider response shape.
    """

    def normalize_response(
        self,
        response: object,
    ) -> tuple[TokenUsage, ModelInfo, CostBreakdown | None]:
        """Normalise *response* into typed SpanForge value objects.

        Parameters
        ----------
        response:
            Raw provider response — may be a dataclass, SDK response object,
            or plain ``dict``.

        Returns
        -------
        tuple[TokenUsage, ModelInfo, CostBreakdown | None]
            Typed value objects; ``CostBreakdown`` is always ``None`` (pricing
            data requires a :class:`~spanforge.namespaces.trace.PricingTier`
            which this generic normalizer does not possess).
        """
        usage = _get(response, "usage")

        # ---------- token counts ----------
        # OpenAI layout:    prompt_tokens / completion_tokens / total_tokens
        # Anthropic layout: input_tokens / output_tokens
        # Each lookup falls through to the next on a missing *or zero* value,
        # ending at a literal 0 so ``int()`` never receives ``None``.
        input_tokens: int = int(
            _get(usage, "prompt_tokens", default=0) or _get(usage, "input_tokens", default=0) or 0
        )
        output_tokens: int = int(
            _get(usage, "completion_tokens", default=0)
            or _get(usage, "output_tokens", default=0)
            or 0
        )
        # Providers that omit a total (e.g. the Anthropic layout) get the sum.
        total_tokens: int = int(
            _get(usage, "total_tokens", default=0) or (input_tokens + output_tokens)
        )
        # Flat keys keep priority; the OpenAI nested ``*_tokens_details``
        # objects are consulted only when the flat keys are absent or zero.
        cached_tokens: int = int(
            _get(usage, "cached_tokens", default=0)
            or _get(usage, "cache_read_input_tokens", default=0)
            or _get(usage, "prompt_tokens_details", "cached_tokens", default=0)
            or 0
        )
        cache_creation_tokens: int = int(_get(usage, "cache_creation_input_tokens", default=0) or 0)
        reasoning_tokens: int = int(
            _get(usage, "reasoning_tokens", default=0)
            or _get(usage, "completion_tokens_details", "reasoning_tokens", default=0)
            or 0
        )

        token_usage = TokenUsage(
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            total_tokens=total_tokens,
            # Optional counters are passed as None rather than 0 when not reported.
            cached_tokens=cached_tokens if cached_tokens else None,
            cache_creation_tokens=cache_creation_tokens if cache_creation_tokens else None,
            reasoning_tokens=reasoning_tokens if reasoning_tokens else None,
        )

        # ---------- model info ----------
        model_name: str = str(
            _get(response, "model", default="")
            or _get(response, "model_id", default="")
            or "unknown"
        )

        model_info = ModelInfo(
            system=_UNKNOWN,
            name=model_name,
            response_model=model_name,
            custom_system_name="generic",
        )

        return token_usage, model_info, None
|
spanforge/plugins.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""spanforge.plugins — Entry-point plugin discovery.
|
|
2
|
+
|
|
3
|
+
Provides a single function :func:`discover` that loads objects registered via
|
|
4
|
+
Python packaging entry points. Handles the ``importlib.metadata`` API split
|
|
5
|
+
between Python 3.9 (returns a ``dict``) and Python 3.10+ (returns a
|
|
6
|
+
``SelectableGroups`` object with ``.select()``), so callers never need to
|
|
7
|
+
write the version-gate themselves.
|
|
8
|
+
|
|
9
|
+
Usage::
|
|
10
|
+
|
|
11
|
+
from spanforge.plugins import discover
|
|
12
|
+
|
|
13
|
+
# Load all scorers registered under the "spanforge.scorers" group
|
|
14
|
+
scorers = discover("spanforge.scorers")
|
|
15
|
+
|
|
16
|
+
# Typical pattern: build a name → instance registry
|
|
17
|
+
registry = {}
|
|
18
|
+
for obj in discover("my_tool.plugins"):
|
|
19
|
+
if callable(obj):
|
|
20
|
+
instance = obj()
|
|
21
|
+
registry[getattr(instance, "name", type(instance).__name__)] = instance
|
|
22
|
+
|
|
23
|
+
Entry-point registration example (``pyproject.toml``)::
|
|
24
|
+
|
|
25
|
+
[project.entry-points."spanforge.scorers"]
|
|
26
|
+
my_scorer = "my_package.scorers:MyScorer"
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from __future__ import annotations
|
|
30
|
+
|
|
31
|
+
import sys
|
|
32
|
+
from typing import Any
|
|
33
|
+
|
|
34
|
+
__all__ = ["discover"]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def discover(group: str) -> list[Any]:
    """Discover and load all entry points registered under *group*.

    Each registered entry point is loaded (its object is imported and
    returned).  Entry points that fail to load are skipped so that a broken
    third-party plugin cannot crash the host application; the failure is
    recorded at ``DEBUG`` level on this module's logger so it can still be
    diagnosed when needed.

    Args:
        group: The entry-point group name (e.g. ``"spanforge.scorers"``).

    Returns:
        A list of loaded objects (classes, instances, functions — whatever the
        entry point points at).  Order matches the order returned by
        ``importlib.metadata``.  An empty list is returned when the group has
        no entry points or when entry-point metadata cannot be read at all.

    Example::

        for scorer_cls in discover("spanforge.scorers"):
            print(scorer_cls.__name__)
    """
    import logging

    log = logging.getLogger(__name__)

    try:
        from importlib.metadata import entry_points

        if sys.version_info >= (3, 10):
            # Selection by keyword is supported from 3.10 onwards, so one call
            # covers both the 3.10/3.11 and the 3.12+ return types.
            eps = entry_points(group=group)
        else:
            # Python 3.9: entry_points() returns a plain dict
            all_eps = entry_points()
            eps = all_eps.get(group, []) if isinstance(all_eps, dict) else []
    except Exception:
        log.debug("Could not enumerate entry points for group %r", group, exc_info=True)
        return []

    loaded: list[Any] = []
    for ep in eps:
        try:
            loaded.append(ep.load())
        except Exception:
            # Best-effort by design: one broken plugin must not stop the rest.
            log.debug(
                "Skipping entry point %r in group %r: load failed",
                getattr(ep, "name", ep),
                group,
                exc_info=True,
            )
    return loaded
|
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
"""spanforge.presidio_backend — Optional Presidio-powered PII detection backend.
|
|
2
|
+
|
|
3
|
+
Wraps Microsoft Presidio AnalyzerEngine to provide entity recognition that
|
|
4
|
+
is more accurate than regex-only scanning. Falls back gracefully if the
|
|
5
|
+
``presidio-analyzer`` package is not installed.
|
|
6
|
+
|
|
7
|
+
Install with::
|
|
8
|
+
|
|
9
|
+
pip install "spanforge[presidio]"
|
|
10
|
+
|
|
11
|
+
Usage::
|
|
12
|
+
|
|
13
|
+
from spanforge.presidio_backend import presidio_scan_payload, is_available
|
|
14
|
+
|
|
15
|
+
if is_available():
|
|
16
|
+
result = presidio_scan_payload({"message": "My SSN is 123-45-6789"})
|
|
17
|
+
print(result.clean) # False
|
|
18
|
+
|
|
19
|
+
The result is a standard :class:`~spanforge.redact.PIIScanResult`, fully
|
|
20
|
+
compatible with the built-in regex scanner.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import os
|
|
26
|
+
import re
|
|
27
|
+
from collections.abc import Mapping
|
|
28
|
+
from typing import Any
|
|
29
|
+
|
|
30
|
+
# Prevent transformers (pulled in as a Presidio optional dep) from importing
|
|
31
|
+
# TensorFlow. TF has a protobuf registration bug on Python 3.13 that raises
|
|
32
|
+
# ValueError at import time and breaks the entire Presidio initialisation.
|
|
33
|
+
os.environ.setdefault("USE_TF", "0")
|
|
34
|
+
os.environ.setdefault("TRANSFORMERS_NO_TF", "1")
|
|
35
|
+
|
|
36
|
+
from spanforge.redact import PIIScanHit, PIIScanResult
|
|
37
|
+
|
|
38
|
+
__all__ = [
|
|
39
|
+
"PIPL_PATTERNS",
|
|
40
|
+
"is_available",
|
|
41
|
+
"presidio_scan_payload",
|
|
42
|
+
"presidio_scan_text",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
# Availability check
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
# Module-level cached AnalyzerEngine — built once on first successful call.
|
|
50
|
+
_analyzer: Any = None
|
|
51
|
+
_analyzer_available: bool | None = None # None = not yet tested
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _get_analyzer() -> Any:
    """Return a lazily-created Presidio AnalyzerEngine configured for spaCy.

    Explicitly configures ``en_core_web_lg`` so that Presidio never falls back
    to the transformers NLP engine (which would trigger a TensorFlow import and
    crash on Python 3.13 due to a protobuf double-registration bug).

    The engine is built and fully configured in a local variable and only then
    stored in the module-level cache, so a failure while registering the
    custom recognizers can never leave a half-configured engine cached for
    later callers.

    Raises:
        ImportError: If ``presidio-analyzer`` is not installed.
        OSError: If ``en_core_web_lg`` is not installed.
    """
    global _analyzer
    if _analyzer is not None:
        return _analyzer

    from presidio_analyzer import AnalyzerEngine, PatternRecognizer
    from presidio_analyzer.nlp_engine import NlpEngineProvider
    from presidio_analyzer.pattern import Pattern

    configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
    }
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()
    engine = AnalyzerEngine(nlp_engine=nlp_engine)

    # Custom high-precision pattern recognizers that supplement the built-in
    # recognizers where Presidio's default confidence is too low or where no
    # English-language recognizer exists.
    custom_recognizers = [
        # US phone formats that the built-in recognizer scores at 0.4 (below
        # the default 0.5 threshold).  Three patterns: international prefix,
        # parenthesised area code, and plain separated digits (scored lower).
        PatternRecognizer(
            supported_entity="PHONE_NUMBER",
            patterns=[
                Pattern("US_PHONE_INTL", r"\+1[-.\s]\d{3}[-.\s]\d{3}[-.\s]\d{4}\b", 0.75),
                Pattern("US_PHONE_PAREN", r"\(\d{3}\)\s*\d{3}[-.\s]\d{4}\b", 0.75),
                Pattern("US_PHONE_PLAIN", r"\b\d{3}[-.\s]\d{3}[-.\s]\d{4}\b", 0.60),
            ],
            supported_language="en",
        ),
        # Indian Aadhaar (12-digit UID in groups of 4) for English-locale corpora.
        PatternRecognizer(
            supported_entity="IN_AADHAAR",
            patterns=[Pattern("AADHAAR", r"\b\d{4}[ \-]\d{4}[ \-]\d{4}\b", 0.85)],
            supported_language="en",
        ),
        # Indian PAN (Permanent Account Number: AAAAA9999A format).
        PatternRecognizer(
            supported_entity="IN_PAN",
            patterns=[Pattern("IN_PAN", r"\b[A-Z]{5}\d{4}[A-Z]\b", 0.85)],
            supported_language="en",
        ),
        # UK National Insurance Number (e.g. AB 12 34 56 C / AB123456C).
        # Not included in Presidio's default recognizer set for English.
        PatternRecognizer(
            supported_entity="UK_NATIONAL_INSURANCE",
            patterns=[
                Pattern(
                    "UK_NI",
                    r"\b[A-Z]{2}[\s]?\d{2}[\s]?\d{2}[\s]?\d{2}[\s]?[A-D]\b",
                    0.85,
                )
            ],
            supported_language="en",
        ),
    ]
    for recognizer in custom_recognizers:
        engine.registry.add_recognizer(recognizer)

    # Publish to the module-level cache only once configuration is complete.
    _analyzer = engine
    return _analyzer
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def is_available() -> bool:
    """Report whether the Presidio analyzer can be constructed.

    The first call attempts to build the analyzer via ``_get_analyzer()`` and
    remembers the outcome in the module-level ``_analyzer_available`` flag;
    later calls return the remembered answer without retrying.
    """
    global _analyzer_available
    if _analyzer_available is None:
        try:
            _get_analyzer()
        except Exception:  # noqa: BLE001 — ImportError, OSError, ValueError, etc.
            _analyzer_available = False
        else:
            _analyzer_available = True
    return _analyzer_available
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# ---------------------------------------------------------------------------
|
|
151
|
+
# PII-024 — China PIPL sensitive personal information patterns
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
|
|
154
|
+
#: Compiled regex patterns for China PIPL sensitive personal information,
#: keyed by the label used for each pattern.
#: Matches are flagged as ``pipl_sensitive`` for cross-border transfer controls.
PIPL_PATTERNS: dict[str, re.Pattern[str]] = {
    # Chinese Resident Identity Card: 17 digits + check digit (digit or 'X')
    "cn_national_id": re.compile(r"\b\d{17}[\dXx]\b"),
    # Chinese mobile numbers: begin with 1 followed by 3-9, then 9 digits
    # (11 digits in total).
    "cn_mobile": re.compile(r"\b1[3-9]\d{9}\b"),
    # Chinese bank card numbers: 16-19 digits, each optionally followed by a
    # single space or hyphen separator.
    # NOTE(review): the original comment says these are Luhn-validated at scan
    # time; that validation is not performed by this regex — confirm where.
    "cn_bank_card": re.compile(r"\b(?:\d[ -]?){15,18}\d\b"),
}

#: Entity types that are classified as PIPL-sensitive (the keys of
#: ``PIPL_PATTERNS``).
PIPL_SENSITIVE_TYPES: frozenset[str] = frozenset(PIPL_PATTERNS.keys())

# Map Presidio entity types to SpanForge ``(pii label, sensitivity)`` pairs.
# DATE_TIME, LOCATION, NRP (nationality), and URL are intentionally excluded:
# they fire excessively on technical log strings (timestamps, cloud regions,
# registry paths) producing unacceptable false-positive rates in production.
_ENTITY_MAP: dict[str, tuple[str, str]] = {
    "CREDIT_CARD": ("credit_card", "high"),
    "CRYPTO": ("crypto_address", "medium"),
    "EMAIL_ADDRESS": ("email", "medium"),
    "IBAN_CODE": ("iban", "high"),
    "IP_ADDRESS": ("ip_address", "low"),
    "PERSON": ("person_name", "medium"),
    "PHONE_NUMBER": ("phone", "medium"),
    "US_SSN": ("ssn", "high"),
    "UK_NHS": ("uk_nhs", "high"),
    "US_DRIVER_LICENSE": ("us_driver_license", "high"),
    "US_PASSPORT": ("us_passport", "high"),
    "IN_AADHAAR": ("aadhaar", "high"),
    "IN_PAN": ("pan", "high"),
    "MEDICAL_LICENSE": ("medical_license", "medium"),
    "UK_NATIONAL_INSURANCE": ("uk_national_insurance", "high"),
}

# Explicit entity allow-list passed to every AnalyzerEngine.analyze() call.
# Keeps only high-precision recognizers; excludes noisy NER labels.
# Derived from ``_ENTITY_MAP`` so the two can never drift apart.
_SCAN_ENTITIES: list[str] = list(_ENTITY_MAP.keys())
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# ---------------------------------------------------------------------------
|
|
196
|
+
# Public API
|
|
197
|
+
# ---------------------------------------------------------------------------
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def presidio_scan_payload(
    payload: dict[str, Any],
    *,
    language: str = "en",
    score_threshold: float = 0.5,
    max_depth: int = 10,
) -> PIIScanResult:
    """Scan a payload dict for PII using Microsoft Presidio.

    The payload is traversed recursively through mappings, lists and tuples
    (no deeper than *max_depth*), and every string value encountered is run
    through the Presidio ``AnalyzerEngine``.  One hit is emitted per entity
    type per string, carrying the number of matches of that type.

    **Security**: detected values are never returned — only the entity type,
    path, count, and sensitivity level.

    Args:
        payload: The dictionary to scan.
        language: Language code for analysis (default: ``"en"``).
        score_threshold: Minimum Presidio confidence score (default: 0.5).
        max_depth: Maximum nesting depth (default: 10).

    Returns:
        A :class:`~spanforge.redact.PIIScanResult` summarising detections.

    Raises:
        ImportError: If ``presidio-analyzer`` is not installed.
    """
    engine = _get_analyzer()
    found: list[PIIScanHit] = []
    strings_seen = 0

    # Characters that, when directly adjacent to an IPv4-looking match, mean
    # the match is a slice of a longer dotted-decimal sequence (e.g. an OID).
    # A set is used (not a string) so the empty-string boundary marker is not
    # a false positive — `"" in "0123456789."` is True in Python.
    dotted_neighbours = frozenset("0123456789.")

    def _is_plausible(text: str, result: Any) -> bool:
        """Return False for detections matching known false-positive shapes."""
        span = text[result.start : result.end]
        kind = result.entity_type

        if kind == "PERSON":
            # spaCy NER fires on lowercase technical identifiers
            # (e.g. "cafebabe1234", "tenant_id", "failed_count").
            # Real person names are Title-cased; reject all-lowercase matches.
            return span != span.lower()

        if kind != "IP_ADDRESS" or ":" in span:
            # Only colon-free (IPv4-style) IP matches need further vetting.
            return True

        # A valid IPv4 has exactly 4 segments, each in [0, 255].
        octets = span.split(".")
        if len(octets) != 4:
            return False
        try:
            in_range = all(0 <= int(octet) <= 255 for octet in octets)
        except ValueError:
            return False
        if not in_range:
            return False

        # Reject matches embedded in longer dotted-decimal sequences
        # (e.g. 2.16.840.1.101.3.4.2.1) by inspecting the adjacent characters.
        left = text[result.start - 1] if result.start > 0 else ""
        right = text[result.end] if result.end < len(text) else ""
        return not (left in dotted_neighbours or right in dotted_neighbours)

    def _visit(node: Any, where: str, level: int) -> None:
        nonlocal strings_seen
        if level > max_depth:
            return

        if isinstance(node, str):
            strings_seen += 1
            detections = engine.analyze(
                text=node,
                language=language,
                score_threshold=score_threshold,
                entities=_SCAN_ENTITIES,
            )
            # Tally surviving detections per entity type, in first-seen order.
            tally: dict[str, int] = {}
            for detection in detections:
                if _is_plausible(node, detection):
                    tally[detection.entity_type] = tally.get(detection.entity_type, 0) + 1
            for entity_type, count in tally.items():
                label, sensitivity = _ENTITY_MAP.get(entity_type, (entity_type.lower(), "medium"))
                found.append(
                    PIIScanHit(
                        pii_type=label,
                        path=where,
                        match_count=count,
                        sensitivity=sensitivity,
                    )
                )
            return

        if isinstance(node, Mapping):
            for key, child in node.items():
                # Dotted path for nested keys; bare key name at the root.
                _visit(child, f"{where}.{key}" if where else str(key), level + 1)
            return

        if isinstance(node, (list, tuple)):
            for index, child in enumerate(node):
                _visit(child, f"{where}[{index}]", level + 1)

    _visit(payload, "", 0)
    return PIIScanResult(hits=found, scanned=strings_seen)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def presidio_scan_text(
    text: str,
    *,
    language: str = "en",
    score_threshold: float = 0.5,
) -> tuple[list[dict[str, Any]], str, bool]:
    """Scan a plain text string for PII using Microsoft Presidio.

    Returns a tuple of ``(entities, redacted_text, detected)`` where
    *entities* is a list of ``{"type", "start", "end", "score"}`` dicts
    (ordered by start offset) and *redacted_text* replaces each detected
    entity with ``<TYPE>``.

    Overlapping or duplicate detections (e.g. the same digits matched by two
    different recognizers) are merged into a single redacted region labelled
    with the first — and, on a tie, the widest — span, so that every detected
    character is removed and the surrounding text is never corrupted.  All
    detections are still listed individually in *entities*.

    **Security**: raw entity values are never included — only type, position,
    and confidence score.

    Args:
        text: The text to scan.
        language: Language code for analysis (default: ``"en"``).
        score_threshold: Minimum Presidio confidence score (default: 0.5).

    Returns:
        ``(entities, redacted_text, detected)`` tuple.

    Raises:
        ImportError: If ``presidio-analyzer`` is not installed.
    """
    analyzer = _get_analyzer()
    results = analyzer.analyze(
        text=text,
        language=language,
        score_threshold=score_threshold,
        entities=_SCAN_ENTITIES,
    )

    entities: list[dict[str, Any]] = [
        {
            "type": _ENTITY_MAP.get(r.entity_type, (r.entity_type.lower(), "medium"))[0],
            "start": r.start,
            "end": r.end,
            "score": round(float(r.score), 4),
        }
        for r in sorted(results, key=lambda r: r.start)
    ]

    # Build the redacted text in one left-to-right pass over the ORIGINAL
    # offsets.  ``cursor`` is the end of the region already emitted/redacted;
    # a span starting before it overlaps an earlier span and only extends the
    # redacted region instead of inserting a second placeholder.
    pieces: list[str] = []
    cursor = 0
    for ent in sorted(entities, key=lambda e: (e["start"], -e["end"])):
        start, end = ent["start"], ent["end"]
        if start >= cursor:
            pieces.append(text[cursor:start])
            pieces.append(f"<{ent['type'].upper()}>")
        cursor = max(cursor, end)
    pieces.append(text[cursor:])
    redacted = "".join(pieces)

    return entities, redacted, bool(entities)
|