spanforge 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spanforge/__init__.py +815 -0
- spanforge/_ansi.py +93 -0
- spanforge/_batch_exporter.py +409 -0
- spanforge/_cli.py +2094 -0
- spanforge/_cli_audit.py +639 -0
- spanforge/_cli_compliance.py +711 -0
- spanforge/_cli_cost.py +243 -0
- spanforge/_cli_ops.py +791 -0
- spanforge/_cli_phase11.py +356 -0
- spanforge/_hooks.py +337 -0
- spanforge/_server.py +1708 -0
- spanforge/_span.py +1036 -0
- spanforge/_store.py +288 -0
- spanforge/_stream.py +664 -0
- spanforge/_trace.py +335 -0
- spanforge/_tracer.py +254 -0
- spanforge/actor.py +141 -0
- spanforge/alerts.py +469 -0
- spanforge/auto.py +464 -0
- spanforge/baseline.py +335 -0
- spanforge/cache.py +635 -0
- spanforge/compliance.py +325 -0
- spanforge/config.py +532 -0
- spanforge/consent.py +228 -0
- spanforge/consumer.py +377 -0
- spanforge/core/__init__.py +5 -0
- spanforge/core/compliance_mapping.py +1254 -0
- spanforge/cost.py +600 -0
- spanforge/debug.py +548 -0
- spanforge/deprecations.py +205 -0
- spanforge/drift.py +482 -0
- spanforge/egress.py +58 -0
- spanforge/eval.py +648 -0
- spanforge/event.py +1064 -0
- spanforge/exceptions.py +240 -0
- spanforge/explain.py +178 -0
- spanforge/export/__init__.py +69 -0
- spanforge/export/append_only.py +337 -0
- spanforge/export/cloud.py +357 -0
- spanforge/export/datadog.py +497 -0
- spanforge/export/grafana.py +320 -0
- spanforge/export/jsonl.py +195 -0
- spanforge/export/openinference.py +158 -0
- spanforge/export/otel_bridge.py +294 -0
- spanforge/export/otlp.py +811 -0
- spanforge/export/otlp_bridge.py +233 -0
- spanforge/export/redis_backend.py +282 -0
- spanforge/export/siem_schema.py +98 -0
- spanforge/export/siem_splunk.py +264 -0
- spanforge/export/siem_syslog.py +212 -0
- spanforge/export/webhook.py +299 -0
- spanforge/exporters/__init__.py +30 -0
- spanforge/exporters/console.py +271 -0
- spanforge/exporters/jsonl.py +144 -0
- spanforge/exporters/sqlite.py +142 -0
- spanforge/gate.py +1150 -0
- spanforge/governance.py +181 -0
- spanforge/hitl.py +295 -0
- spanforge/http.py +187 -0
- spanforge/inspect.py +427 -0
- spanforge/integrations/__init__.py +45 -0
- spanforge/integrations/_pricing.py +280 -0
- spanforge/integrations/anthropic.py +388 -0
- spanforge/integrations/azure_openai.py +133 -0
- spanforge/integrations/bedrock.py +292 -0
- spanforge/integrations/crewai.py +251 -0
- spanforge/integrations/gemini.py +351 -0
- spanforge/integrations/groq.py +442 -0
- spanforge/integrations/langchain.py +349 -0
- spanforge/integrations/langgraph.py +306 -0
- spanforge/integrations/llamaindex.py +373 -0
- spanforge/integrations/ollama.py +287 -0
- spanforge/integrations/openai.py +368 -0
- spanforge/integrations/together.py +483 -0
- spanforge/io.py +214 -0
- spanforge/lint.py +322 -0
- spanforge/metrics.py +417 -0
- spanforge/metrics_export.py +343 -0
- spanforge/migrate.py +402 -0
- spanforge/model_registry.py +278 -0
- spanforge/models.py +389 -0
- spanforge/namespaces/__init__.py +254 -0
- spanforge/namespaces/audit.py +256 -0
- spanforge/namespaces/cache.py +237 -0
- spanforge/namespaces/chain.py +77 -0
- spanforge/namespaces/confidence.py +72 -0
- spanforge/namespaces/consent.py +92 -0
- spanforge/namespaces/cost.py +179 -0
- spanforge/namespaces/decision.py +143 -0
- spanforge/namespaces/diff.py +157 -0
- spanforge/namespaces/drift.py +80 -0
- spanforge/namespaces/eval_.py +251 -0
- spanforge/namespaces/feedback.py +241 -0
- spanforge/namespaces/fence.py +193 -0
- spanforge/namespaces/guard.py +105 -0
- spanforge/namespaces/hitl.py +91 -0
- spanforge/namespaces/latency.py +72 -0
- spanforge/namespaces/prompt.py +190 -0
- spanforge/namespaces/redact.py +173 -0
- spanforge/namespaces/retrieval.py +379 -0
- spanforge/namespaces/runtime_governance.py +494 -0
- spanforge/namespaces/template.py +208 -0
- spanforge/namespaces/tool_call.py +77 -0
- spanforge/namespaces/trace.py +1029 -0
- spanforge/normalizer.py +171 -0
- spanforge/plugins.py +82 -0
- spanforge/presidio_backend.py +349 -0
- spanforge/processor.py +258 -0
- spanforge/prompt_registry.py +418 -0
- spanforge/py.typed +0 -0
- spanforge/redact.py +914 -0
- spanforge/regression.py +192 -0
- spanforge/runtime_policy.py +159 -0
- spanforge/sampling.py +511 -0
- spanforge/schema.py +183 -0
- spanforge/schemas/v1.0/schema.json +170 -0
- spanforge/schemas/v2.0/schema.json +536 -0
- spanforge/sdk/__init__.py +625 -0
- spanforge/sdk/_base.py +584 -0
- spanforge/sdk/_base.pyi +71 -0
- spanforge/sdk/_exceptions.py +1096 -0
- spanforge/sdk/_types.py +2184 -0
- spanforge/sdk/alert.py +1514 -0
- spanforge/sdk/alert.pyi +56 -0
- spanforge/sdk/audit.py +1196 -0
- spanforge/sdk/audit.pyi +67 -0
- spanforge/sdk/cec.py +1215 -0
- spanforge/sdk/cec.pyi +37 -0
- spanforge/sdk/config.py +641 -0
- spanforge/sdk/config.pyi +55 -0
- spanforge/sdk/enterprise.py +714 -0
- spanforge/sdk/enterprise.pyi +79 -0
- spanforge/sdk/explain.py +170 -0
- spanforge/sdk/fallback.py +432 -0
- spanforge/sdk/feedback.py +351 -0
- spanforge/sdk/gate.py +874 -0
- spanforge/sdk/gate.pyi +51 -0
- spanforge/sdk/identity.py +2114 -0
- spanforge/sdk/identity.pyi +47 -0
- spanforge/sdk/lineage.py +175 -0
- spanforge/sdk/observe.py +1065 -0
- spanforge/sdk/observe.pyi +50 -0
- spanforge/sdk/operator.py +338 -0
- spanforge/sdk/pii.py +1473 -0
- spanforge/sdk/pii.pyi +119 -0
- spanforge/sdk/pipelines.py +458 -0
- spanforge/sdk/pipelines.pyi +39 -0
- spanforge/sdk/policy.py +930 -0
- spanforge/sdk/rag.py +594 -0
- spanforge/sdk/rbac.py +280 -0
- spanforge/sdk/registry.py +430 -0
- spanforge/sdk/registry.pyi +46 -0
- spanforge/sdk/scope.py +279 -0
- spanforge/sdk/secrets.py +293 -0
- spanforge/sdk/secrets.pyi +25 -0
- spanforge/sdk/security.py +560 -0
- spanforge/sdk/security.pyi +57 -0
- spanforge/sdk/trust.py +472 -0
- spanforge/sdk/trust.pyi +41 -0
- spanforge/secrets.py +799 -0
- spanforge/signing.py +1179 -0
- spanforge/stats.py +100 -0
- spanforge/stream.py +560 -0
- spanforge/testing.py +378 -0
- spanforge/testing_mocks.py +1052 -0
- spanforge/trace.py +199 -0
- spanforge/types.py +696 -0
- spanforge/ulid.py +300 -0
- spanforge/validate.py +379 -0
- spanforge-1.0.0.dist-info/METADATA +1509 -0
- spanforge-1.0.0.dist-info/RECORD +174 -0
- spanforge-1.0.0.dist-info/WHEEL +4 -0
- spanforge-1.0.0.dist-info/entry_points.txt +5 -0
- spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
spanforge/secrets.py
ADDED
|
@@ -0,0 +1,799 @@
|
|
|
1
|
+
"""spanforge.secrets — Secrets detection engine (sf-secrets Phase 2).
|
|
2
|
+
|
|
3
|
+
This module implements the core in-process secrets scanning logic for the
|
|
4
|
+
SpanForge sf-secrets service. It is designed to run without any network
|
|
5
|
+
calls and is safe to import directly — the :class:`SecretsScanner` class
|
|
6
|
+
wraps all pattern matching, entropy scoring, allowlist filtering, and
|
|
7
|
+
auto-block policy logic.
|
|
8
|
+
|
|
9
|
+
Detection model
|
|
10
|
+
---------------
|
|
11
|
+
Each candidate match is assigned a **confidence score** between 0 and 1:
|
|
12
|
+
|
|
13
|
+
* ``0.75`` — structural pattern match only.
|
|
14
|
+
* ``0.90`` — pattern match + Shannon entropy ≥ 3.5 bits/char on a token of
|
|
15
|
+
≥ 32 characters.
|
|
16
|
+
* ``0.97`` — pattern + entropy + a context keyword (``password``, ``token``,
|
|
17
|
+
``secret``, ``key``, ``credential``, ``api_key``, ``apikey``, ``auth``,
|
|
18
|
+
``access_key``, ``private_key``) appears within ±50 characters.
|
|
19
|
+
|
|
20
|
+
Auto-block policy
|
|
21
|
+
-----------------
|
|
22
|
+
* **Zero-tolerance types** are always blocked regardless of the confidence
|
|
23
|
+
threshold supplied by the caller: Bearer Token, AWS Access Key, GCP Service
|
|
24
|
+
Account JSON, PEM/OPENSSH Private Key, SSH Private Key, HC API key
|
|
25
|
+
(``hc_(live|test)_*``), SF API key (``sf_(live|test)_*``), GitHub PAT,
|
|
26
|
+
Stripe live key (``sk_live_*``), Generic JWT.
|
|
27
|
+
* **Confidence-gated types** are blocked only when their confidence reaches
|
|
28
|
+
≥ 0.90: Generic API Key, DB connection string.
|
|
29
|
+
|
|
30
|
+
Security requirements
|
|
31
|
+
---------------------
|
|
32
|
+
* ``SecretHit.redacted_value`` is **always** ``"[REDACTED:<SECRET_TYPE>]"`` — the
|
|
33
|
+
matched value is never included.
|
|
34
|
+
* The entropy function does work proportional to the *length* of the input
|
|
35
|
+
string and retains only per-character counts, so it is safe to call on secret material.
|
|
36
|
+
* The allowlist uses exact ``frozenset`` membership tests; no partial matching
|
|
37
|
+
is applied to allowlist entries.
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
from __future__ import annotations
|
|
41
|
+
|
|
42
|
+
import math
|
|
43
|
+
import re
|
|
44
|
+
from dataclasses import dataclass, field
|
|
45
|
+
from typing import Any
|
|
46
|
+
|
|
47
|
+
__all__ = [
|
|
48
|
+
"SecretHit",
|
|
49
|
+
"SecretsScanResult",
|
|
50
|
+
"SecretsScanner",
|
|
51
|
+
"entropy_score",
|
|
52
|
+
]
|
|
53
|
+
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
# Constants
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
# Entropy tier: a token must be at least _ENTROPY_MIN_LENGTH characters long
# and score at least _ENTROPY_THRESHOLD bits/char to count as high-entropy.
_ENTROPY_THRESHOLD: float = 3.5
_ENTROPY_MIN_LENGTH: int = 32
_CONTEXT_WINDOW: int = 50  # characters either side of a match to search

# Keywords whose presence near a match raises the confidence score
# (see the "Detection model" section of the module docstring).
_CONTEXT_KEYWORDS: frozenset[str] = frozenset(
    {
        "password",
        "token",
        "secret",
        "key",
        "credential",
        "api_key",
        "apikey",
        "auth",
        "access_key",
        "private_key",
    }
)

# ---------------------------------------------------------------------------
# Zero-tolerance secret types (always auto-blocked)
# ---------------------------------------------------------------------------

# scan() keeps hits of these types even when their confidence is below the
# caller's threshold.
_ZERO_TOLERANCE_TYPES: frozenset[str] = frozenset(
    {
        "bearer_token",
        "aws_access_key",
        "gcp_service_account",
        "pem_private_key",
        "ssh_private_key",
        "halluccheck_api_key",
        "spanforge_api_key",
        "github_pat",
        "stripe_live_key",
        "generic_jwt",
    }
)

# Confidence-gated types — block only if confidence >= 0.90
_CONFIDENCE_GATED_TYPES: frozenset[str] = frozenset(
    {
        "generic_api_key",
        "db_connection_string",
    }
)

# Minimum confidence at which a confidence-gated type is auto-blocked.
_CONFIDENCE_GATE_THRESHOLD: float = 0.90
|
|
105
|
+
|
|
106
|
+
# ---------------------------------------------------------------------------
|
|
107
|
+
# Vault hints — suggest where to store each secret type
|
|
108
|
+
# ---------------------------------------------------------------------------
|
|
109
|
+
|
|
110
|
+
# Maps a secret_type label to a human-readable remediation hint.  scan() looks
# the label up with ``.get(secret_type, "")``, so a type without an entry simply
# yields an empty hint.
_VAULT_HINTS: dict[str, str] = {
    "aws_access_key": (
        "Move to AWS Secrets Manager: "
        "aws secretsmanager create-secret --name my-aws-creds --secret-string <value>"
    ),
    "gcp_service_account": (
        "Move to Google Cloud Secret Manager: "
        "gcloud secrets create my-gcp-key --data-file=service-account.json"
    ),
    "azure_connection_string": (
        "Move to Azure Key Vault: "
        "az keyvault secret set --vault-name MyVault --name my-conn-str --value <value>"
    ),
    "pem_private_key": (
        "Move to HashiCorp Vault: vault kv put secret/tls private_key=@keyfile.pem"
    ),
    "ssh_private_key": ("Move to HashiCorp Vault: vault kv put secret/ssh private_key=@id_rsa"),
    "stripe_live_key": ("Move to HashiCorp Vault: vault kv put secret/stripe live_key=<value>"),
    "stripe_test_key": ("Move to HashiCorp Vault: vault kv put secret/stripe test_key=<value>"),
    "generic_api_key": ("Move to HashiCorp Vault: vault kv put secret/api key=<value>"),
    "github_pat": (
        "Move to GitHub Secrets or HashiCorp Vault: gh secret set MY_PAT --body <value>"
    ),
    "slack_token": (  # nosec B105
        "Move to HashiCorp Vault: vault kv put secret/slack token=<value>"
    ),
    "sendgrid_key": ("Move to HashiCorp Vault: vault kv put secret/sendgrid api_key=<value>"),
    "db_connection_string": (
        "Move to AWS Secrets Manager, Azure Key Vault, or HashiCorp Vault. "
        "Never embed credentials in connection strings in code."
    ),
    # --- Hints for the remaining registered secret types ---
    "bearer_token": (
        "Rotate immediately. Store the signing secret in HashiCorp Vault: "
        "vault kv put secret/jwt signing_key=<value>"
    ),
    "halluccheck_api_key": (
        "Move to HashiCorp Vault: vault kv put secret/halluccheck api_key=<value>"
    ),
    "spanforge_api_key": (
        "Move to HashiCorp Vault: vault kv put secret/spanforge api_key=<value>"
    ),
    "npm_token": (
        "Move to npm Automation token scoped to the package, or store in HashiCorp Vault: "
        "vault kv put secret/npm publish_token=<value>"
    ),
    "twilio_key": (
        "Move to HashiCorp Vault: vault kv put secret/twilio auth_token=<value>"
    ),
    "google_api_key": (
        "Move to Google Cloud Secret Manager: "
        "gcloud secrets create my-google-api-key --data-file=-"
    ),
    "terraform_cloud_token": (
        "Move to HashiCorp Vault or set via TF_TOKEN_app_terraform_io environment variable. "
        "Never embed tokens in .terraformrc or tfvars files."
    ),
    "vault_token": (
        "Rotate the root/service token immediately. Use a short-TTL token or AppRole auth: "
        "vault write auth/approle/login role_id=<role> secret_id=<secret>"
    ),
    "generic_jwt": (
        "Rotate the JWT signing secret. Store the signing key in HashiCorp Vault: "
        "vault kv put secret/jwt signing_secret=<value>"
    ),
}
|
|
176
|
+
|
|
177
|
+
# ---------------------------------------------------------------------------
|
|
178
|
+
# Pattern registry
|
|
179
|
+
# ---------------------------------------------------------------------------
|
|
180
|
+
# Each registry entry is a 3-tuple: (secret_type, compiled_regex, zero_tolerance).
# Order matters — more specific patterns should appear before generic ones.

# Element type of _PATTERN_REGISTRY.
_PatternEntry = tuple[str, re.Pattern[str], bool]
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _compile(pattern: str) -> re.Pattern[str]:
    """Compile *pattern* into a regex object using the default flags.

    Args:
        pattern: Regular-expression source text.

    Returns:
        The compiled pattern.
    """
    compiled = re.compile(pattern)
    return compiled
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# Structural patterns, tried in list order by the scanner.  The third element
# of each entry records whether the type is zero-tolerance.
_PATTERN_REGISTRY: list[_PatternEntry] = [
    # --- Spec-required (7) ---
    # SEC-001-A: Bearer token (JWT form)
    (
        "bearer_token",
        _compile(r"(?i)Bearer\s+eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+"),
        True,
    ),
    # SEC-001-B: AWS access key
    (
        "aws_access_key",
        _compile(r"(?<![A-Z0-9])AKIA[0-9A-Z]{16}(?![A-Z0-9])"),
        True,
    ),
    # SEC-001-C: GCP service account JSON fragment
    (
        "gcp_service_account",
        _compile(r'"type"\s*:\s*"service_account"'),
        True,
    ),
    # SEC-001-D: PEM private keys (header line only)
    (
        "pem_private_key",
        _compile(r"-----BEGIN (?:RSA |EC |DSA )?PRIVATE KEY-----"),
        True,
    ),
    # SEC-001-E: DB connection strings with embedded credentials
    (
        "db_connection_string",
        _compile(
            r"(?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|mssql|oracle)"
            r"://[^:/@\s]{1,255}:[^@\s]{1,1024}@"
        ),
        False,
    ),
    # SEC-001-F: HallucCheck / Spanforge API key variants
    (
        "halluccheck_api_key",
        _compile(r"hc_(?:live|test)_[0-9A-Za-z]{48}"),
        True,
    ),
    (
        "spanforge_api_key",
        _compile(r"sf_(?:live|test)_[0-9A-Za-z]{48}"),
        True,
    ),
    # --- Extended (13) ---
    # SEC-001-G: GitHub PAT (new and classic formats)
    (
        "github_pat",
        _compile(
            r"(?:ghp_[A-Za-z0-9]{36,255}"
            r"|gho_[A-Za-z0-9]{36,255}"
            r"|ghu_[A-Za-z0-9]{36,255}"
            r"|ghs_[A-Za-z0-9]{36,255}"
            r"|ghr_[A-Za-z0-9]{36,255}"
            r"|github_pat_[A-Za-z0-9_]{36,255})"
        ),
        True,
    ),
    # SEC-001-H: npm publish token
    (
        "npm_token",
        _compile(r"npm_[A-Za-z0-9]{36}"),
        False,
    ),
    # SEC-001-I: Slack bot/app tokens
    (
        "slack_token",
        _compile(r"xox[baprs]-[0-9A-Za-z]{8,}-[0-9A-Za-z-]{8,}"),
        False,
    ),
    # SEC-001-J: Stripe live secret key
    (
        "stripe_live_key",
        _compile(r"sk_live_[0-9A-Za-z]{24,}"),
        True,
    ),
    # SEC-001-K: Stripe test secret key
    (
        "stripe_test_key",
        _compile(r"sk_test_[0-9A-Za-z]{24,}"),
        False,
    ),
    # SEC-001-L: Twilio key — "SK" followed by 32 hexadecimal characters
    (
        "twilio_key",
        _compile(r"SK[0-9a-fA-F]{32}"),
        False,
    ),
    # SEC-001-M: SendGrid API key
    (
        "sendgrid_key",
        _compile(r"SG\.[A-Za-z0-9_-]{22}\.[A-Za-z0-9_-]{43}"),
        False,
    ),
    # SEC-001-N: Azure storage / service bus connection strings
    (
        "azure_connection_string",
        _compile(
            r"(?:DefaultEndpointsProtocol=https?;AccountName=[^;]{1,255};AccountKey=[^;]{1,1024}"
            r"|Endpoint=sb://[^;]{1,255};SharedAccessKeyName=[^;]{1,255};SharedAccessKey=[^;\s]{1,255})"
        ),
        False,
    ),
    # SEC-001-O: OPENSSH private key header
    (
        "ssh_private_key",
        _compile(r"-----BEGIN OPENSSH PRIVATE KEY-----"),
        True,
    ),
    # SEC-001-P: Google API key
    (
        "google_api_key",
        _compile(r"AIza[0-9A-Za-z\-_]{35}"),
        False,
    ),
    # SEC-001-Q: Terraform Cloud / Terraform Enterprise token
    (
        "terraform_cloud_token",
        _compile(r"[Aa]tlas[Tt]oken\s*=\s*['\"]?[A-Za-z0-9.]{8,}['\"]?"),
        False,
    ),
    # SEC-001-R: HashiCorp Vault root/service token (s. prefix)
    (
        "vault_token",
        _compile(r"(?<![A-Za-z0-9])s\.[A-Za-z0-9]{24,}(?![A-Za-z0-9])"),
        False,
    ),
    # SEC-001-S: Generic JWT (without Bearer prefix)
    (
        "generic_jwt",
        _compile(r"(?<!\w)eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}(?!\w)"),
        True,
    ),
]

# Generic API key pattern — applied separately with entropy check.
# It matches any run of 32+ key-like characters, so the scanner only reports a
# match when the token also passes the entropy threshold.
_GENERIC_API_KEY_PATTERN: re.Pattern[str] = re.compile(r"[0-9A-Za-z_\-]{32,}")
|
|
329
|
+
|
|
330
|
+
# ---------------------------------------------------------------------------
|
|
331
|
+
# Default allowlist — known test/placeholder values that should never alert
|
|
332
|
+
# ---------------------------------------------------------------------------
|
|
333
|
+
|
|
334
|
+
# scan() compares the exact matched span against this set, so an entry only
# suppresses a hit when the whole match equals it character for character.
_DEFAULT_ALLOWLIST: frozenset[str] = frozenset(
    {
        "AKIA_EXAMPLE",
        "AKIAIOSFODNN7EXAMPLE",
        "sk_test_" + "0" * 24,
        "hc_test_" + "0" * 48,
        "sf_test_" + "0" * 48,
        "AIzaSyExampleKey1234567890123456789",
        "SG.example",
        "xoxb-000000000000-000000000000-000000000000000000000000",
    }
)
|
|
346
|
+
|
|
347
|
+
# ---------------------------------------------------------------------------
|
|
348
|
+
# Data types
|
|
349
|
+
# ---------------------------------------------------------------------------
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
@dataclass(frozen=True)
class SecretHit:
    """Immutable record describing one secret found in a scanned text.

    The record carries only the location and classification of the match;
    the matched characters themselves are never stored on it.

    Attributes:
        secret_type: Category label of the detected secret, for example
            ``"aws_access_key"``.
        start: Inclusive character offset where the match begins.
        end: Exclusive character offset where the match ends.
        confidence: Detection confidence in ``[0.0, 1.0]``.
        redacted_value: Placeholder of the form
            ``"[REDACTED:<secret_type>]"`` that stands in for the match.
        auto_blocked: ``True`` if the auto-block policy fired for this hit.
        vault_hint: Optional advice on moving this kind of secret into a
            secrets vault; empty when no advice is available.
    """

    # Classification and position of the match.
    secret_type: str
    start: int
    end: int
    confidence: float
    # Safe stand-in text; never the matched value.
    redacted_value: str
    # Policy outcome and remediation advice, filled in by the scanner.
    auto_blocked: bool = False
    vault_hint: str = ""
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
@dataclass
class SecretsScanResult:
    """Result of a secrets scan operation.

    Attributes:
        detected: ``True`` when at least one hit above the confidence
            threshold was found.
        hits: All detected :class:`SecretHit` objects above the
            threshold, in order of appearance.
        auto_blocked: ``True`` when any hit triggered the auto-block
            policy (zero-tolerance or confidence-gated).
        redacted_text: Full input text with every hit replaced by its
            ``redacted_value`` marker.
        secret_types: Deduplicated list of ``secret_type`` labels from
            all hits (order of first appearance).
        confidence_scores: Parallel list of confidence scores for each hit.
    """

    detected: bool
    hits: list[SecretHit]
    auto_blocked: bool
    redacted_text: str
    secret_types: list[str] = field(default_factory=list)
    confidence_scores: list[float] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict representation safe for JSON serialisation.

        The list values are fresh copies, so mutating the returned dict does
        not alter this result object.  ``vault_hint`` is included for a hit
        only when it is non-empty.
        """
        return {
            "detected": self.detected,
            "auto_blocked": self.auto_blocked,
            "redacted_text": self.redacted_text,
            "secret_types": list(self.secret_types),
            "confidence_scores": list(self.confidence_scores),
            "hits": [
                {
                    "secret_type": h.secret_type,
                    "start": h.start,
                    "end": h.end,
                    "confidence": h.confidence,
                    "redacted_value": h.redacted_value,
                    "auto_blocked": h.auto_blocked,
                    **({"vault_hint": h.vault_hint} if h.vault_hint else {}),
                }
                for h in self.hits
            ],
        }

    def to_sarif(
        self,
        *,
        tool_name: str = "spanforge-secrets",
        version: str = "1.0.0",
    ) -> dict[str, Any]:
        """Return a minimal SARIF 2.1.0 report dict.

        Each hit becomes one SARIF result whose level is ``"error"`` when the
        hit was auto-blocked and ``"warning"`` otherwise.  Each distinct
        ``secret_type`` becomes one rule, listed in order of first appearance.

        Args:
            tool_name: The tool name to embed in the SARIF ``tool`` object.
            version: Tool version string.

        Returns:
            Dict following the SARIF 2.1.0 layout (``$schema``, ``version``,
            ``runs``).
        """
        results = [
            {
                "ruleId": hit.secret_type,
                "level": "error" if hit.auto_blocked else "warning",
                "message": {
                    "text": (
                        f"Detected secret of type '{hit.secret_type}' "
                        f"(confidence={hit.confidence:.2f}). "
                        f"{hit.vault_hint or 'Move this value to a secrets vault.'}"
                    )
                },
                "locations": [
                    {
                        "physicalLocation": {
                            "region": {
                                "charOffset": hit.start,
                                "charLength": hit.end - hit.start,
                            }
                        }
                    }
                ],
                "properties": {
                    "confidence": hit.confidence,
                    "auto_blocked": hit.auto_blocked,
                    "redacted_value": hit.redacted_value,
                },
            }
            for hit in self.hits
        ]

        # Build deduplicated rules list (SARIF 2.1 §3.52 — tool.driver.rules).
        # A rule is an "error" if ANY hit of that type was auto-blocked; taking
        # the severity from the first hit alone would make it depend on the
        # order in which mixed-confidence hits happen to appear.
        blocked_by_rule: dict[str, bool] = {}
        for hit in self.hits:
            already_blocked = blocked_by_rule.get(hit.secret_type, False)
            blocked_by_rule[hit.secret_type] = already_blocked or bool(hit.auto_blocked)

        rules: list[dict[str, Any]] = []
        for rule_id, blocked in blocked_by_rule.items():
            label = rule_id.replace("_", " ").title()
            rules.append(
                {
                    "id": rule_id,
                    "shortDescription": {
                        "text": f"Detected hard-coded {label}."
                    },
                    "helpUri": "https://docs.spanforge.dev/secrets",
                    "properties": {
                        "severity": "error" if blocked else "warning",
                    },
                }
            )

        return {
            "$schema": "https://raw.githubusercontent.com/oasis-tcs/sarif-spec/master/Schemata/sarif-schema-2.1.0.json",
            "version": "2.1.0",
            "runs": [
                {
                    "tool": {
                        "driver": {
                            "name": tool_name,
                            "version": version,
                            "informationUri": "https://docs.spanforge.dev/secrets",
                            "rules": rules,
                        }
                    },
                    "results": results,
                }
            ],
        }
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
# ---------------------------------------------------------------------------
|
|
511
|
+
# Entropy helper
|
|
512
|
+
# ---------------------------------------------------------------------------
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
def entropy_score(s: str) -> float:
    """Return Shannon entropy in bits per character for *s*.

    A value ≥ 3.5 bits/char on a token of ≥ 32 characters is a strong
    indicator that the string was generated by a CSPRNG (e.g. an API key or
    bearer token).

    Args:
        s: The string to measure.

    Returns:
        Shannon entropy in bits per character.  Returns ``0.0`` for empty
        strings and for strings made of a single repeated character.

    Example::

        >>> entropy_score("aaaaaaaaaaaaaaaaaaaaaaaaa")
        0.0
        >>> entropy_score("AKIAIOSFODNN7EXAMPLEKEY")  # doctest: +ELLIPSIS
        3.76...
    """
    if not s:
        return 0.0
    freq: dict[str, int] = {}
    for ch in s:
        freq[ch] = freq.get(ch, 0) + 1
    n = len(s)
    # Sum the already-negated terms instead of negating the total: negating a
    # total of 0.0 (single-symbol input) would return -0.0, whereas adding
    # -0.0 to sum()'s integer start value yields a clean 0.0.  For every other
    # input the result is bit-for-bit the same as negating the sum.
    return sum(-(count / n) * math.log2(count / n) for count in freq.values())
|
|
543
|
+
|
|
544
|
+
|
|
545
|
+
# ---------------------------------------------------------------------------
|
|
546
|
+
# Scanner
|
|
547
|
+
# ---------------------------------------------------------------------------
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
class SecretsScanner:
|
|
551
|
+
"""In-process secrets detection engine.
|
|
552
|
+
|
|
553
|
+
All scanning logic runs locally — no network calls are made. Matches
|
|
554
|
+
are scored with a three-tier confidence model, filtered against an
|
|
555
|
+
allowlist, and subjected to the auto-block policy.
|
|
556
|
+
|
|
557
|
+
Args:
|
|
558
|
+
confidence_threshold: Default minimum confidence required to include
|
|
559
|
+
a hit in the result (default: ``0.75``). Zero-tolerance types
|
|
560
|
+
always appear regardless of this threshold.
|
|
561
|
+
extra_allowlist: Additional literal strings that should never
|
|
562
|
+
trigger an alert (merged with the built-in allowlist).
|
|
563
|
+
auto_block_override: If ``True``, all hits above the threshold are
|
|
564
|
+
flagged ``auto_blocked``; if ``False``, the auto-block policy is
|
|
565
|
+
never applied (useful for audit-only mode). ``None`` (default)
|
|
566
|
+
uses the standard policy table.
|
|
567
|
+
|
|
568
|
+
Example::
|
|
569
|
+
|
|
570
|
+
scanner = SecretsScanner()
|
|
571
|
+
result = scanner.scan("AKIA" + "A" * 16 + " is my AWS key")
|
|
572
|
+
assert result.auto_blocked
|
|
573
|
+
"""
|
|
574
|
+
|
|
575
|
+
def __init__(
    self,
    confidence_threshold: float = 0.75,
    extra_allowlist: frozenset[str] | None = None,
    auto_block_override: bool | None = None,
) -> None:
    """Validate the threshold and store the scanner configuration.

    Raises:
        ValueError: If *confidence_threshold* lies outside ``[0, 1]``.
    """
    threshold_ok = 0.0 <= confidence_threshold <= 1.0
    if not threshold_ok:
        msg = f"confidence_threshold must be in [0, 1]; got {confidence_threshold}"
        raise ValueError(msg)

    # Start from the built-in allowlist and widen it only when the caller
    # supplied a non-empty set of extra entries.
    allowed = _DEFAULT_ALLOWLIST
    if extra_allowlist:
        allowed = _DEFAULT_ALLOWLIST | extra_allowlist

    self._threshold = confidence_threshold
    self._allowlist: frozenset[str] = allowed
    self._auto_block_override = auto_block_override
|
|
589
|
+
|
|
590
|
+
# ------------------------------------------------------------------
|
|
591
|
+
# Public interface
|
|
592
|
+
# ------------------------------------------------------------------
|
|
593
|
+
|
|
594
|
+
def scan(
    self,
    text: str,
    *,
    confidence_threshold: float | None = None,
) -> SecretsScanResult:
    """Scan *text* for secrets and return a :class:`SecretsScanResult`.

    Candidate hits are dropped when their exact matched span is on the
    allowlist, or when their confidence is below the effective threshold
    and their type is not zero-tolerance.  Surviving hits are annotated
    with the auto-block decision and a vault hint, then de-duplicated.

    Args:
        text: The text to scan.
        confidence_threshold: Per-call replacement for the instance-level
            threshold; ``None`` keeps the instance value.

    Returns:
        A :class:`SecretsScanResult` whose ``redacted_text`` is built from
        *text* and the surviving hits.

    Raises:
        TypeError: If *text* is not a ``str``.
    """
    if not isinstance(text, str):
        msg = f"scan() requires a str; got {type(text).__name__}"
        raise TypeError(msg)

    min_confidence = self._threshold if confidence_threshold is None else confidence_threshold

    kept: list[SecretHit] = []
    for candidate in self._find_all_hits(text):
        # Allowlist suppression is an exact comparison on the matched span.
        if text[candidate.start : candidate.end] in self._allowlist:
            continue

        # Zero-tolerance types bypass the confidence threshold.
        zero_tolerance = candidate.secret_type in _ZERO_TOLERANCE_TYPES
        if not zero_tolerance and candidate.confidence < min_confidence:
            continue

        blocked = self._compute_auto_block(candidate)
        kept.append(
            SecretHit(
                secret_type=candidate.secret_type,
                start=candidate.start,
                end=candidate.end,
                confidence=candidate.confidence,
                redacted_value=f"[REDACTED:{candidate.secret_type.upper()}]",
                auto_blocked=blocked,
                vault_hint=_VAULT_HINTS.get(candidate.secret_type, ""),
            )
        )

    # Collapse overlapping spans before deriving any summary fields.
    kept = _dedup_hits(kept)

    found_any = len(kept) > 0
    blocked_any = any(h.auto_blocked for h in kept)
    masked = _build_redacted_text(text, kept)

    # dict.fromkeys keeps the first occurrence of each label, in order.
    type_labels = list(dict.fromkeys(h.secret_type for h in kept))

    return SecretsScanResult(
        detected=found_any,
        hits=kept,
        auto_blocked=blocked_any,
        redacted_text=masked,
        secret_types=type_labels,
        confidence_scores=[h.confidence for h in kept],
    )
|
|
667
|
+
|
|
668
|
+
# ------------------------------------------------------------------
|
|
669
|
+
# Internal helpers
|
|
670
|
+
# ------------------------------------------------------------------
|
|
671
|
+
|
|
672
|
+
def _find_all_hits(self, text: str) -> list[SecretHit]:
    """Collect every candidate secret match in *text*.

    No allowlist or threshold filtering happens here; each regex match is
    simply scored and wrapped in a ``SecretHit``.

    Args:
        text: The string to scan.

    Returns:
        Candidate hits in discovery order: first the matches of every
        ``_PATTERN_REGISTRY`` entry (in registry order), then the
        entropy-gated generic API key matches.
    """

    def as_hit(kind: str, found: re.Match[str]) -> SecretHit:
        # Score first, then wrap the match span in a SecretHit.
        score = self._score_hit(kind, found, text)
        return SecretHit(
            secret_type=kind,
            start=found.start(),
            end=found.end(),
            confidence=score,
            redacted_value=f"[REDACTED:{kind.upper()}]",
        )

    candidates: list[SecretHit] = []

    # Third registry field is not needed while collecting candidates.
    for kind, regex, _unused_flag in _PATTERN_REGISTRY:
        candidates.extend(as_hit(kind, found) for found in regex.finditer(text))

    # Generic API keys have no distinctive shape, so a match only counts
    # when the token is long enough and its entropy clears the threshold.
    for found in _GENERIC_API_KEY_PATTERN.finditer(text):
        value = found.group()
        passes_gate = (
            len(value) >= _ENTROPY_MIN_LENGTH
            and entropy_score(value) >= _ENTROPY_THRESHOLD
        )
        if passes_gate:
            candidates.append(as_hit("generic_api_key", found))

    return candidates
|
|
705
|
+
|
|
706
|
+
def _score_hit(
    self,
    secret_type: str,
    match: re.Match[str],
    full_text: str,
) -> float:
    """Return a confidence score for one regex match.

    The score is one of three fixed values:

    * ``0.97`` when any entry of ``_CONTEXT_KEYWORDS`` occurs in the
      lower-cased text window reaching ``_CONTEXT_WINDOW`` characters either
      side of the match (the window includes the match itself). This applies
      whether or not the entropy test passed.
    * ``0.90`` when no keyword is found but the matched token is at least
      ``_ENTROPY_MIN_LENGTH`` characters long and its ``entropy_score`` is at
      least ``_ENTROPY_THRESHOLD``.
    * ``0.75`` otherwise, i.e. the structural pattern match alone.

    Args:
        secret_type: Type label of the pattern that matched. It is accepted
            for call-site symmetry but is not consulted by the scoring.
        match: The regex match being scored.
        full_text: The complete text the match was found in.

    Returns:
        ``0.75``, ``0.90`` or ``0.97``.
    """
    matched = match.group()
    is_high_entropy = (
        len(matched) >= _ENTROPY_MIN_LENGTH
        and entropy_score(matched) >= _ENTROPY_THRESHOLD
    )

    # Clamp the context window to the bounds of the text.
    window_lo = max(0, match.start() - _CONTEXT_WINDOW)
    window_hi = min(len(full_text), match.end() + _CONTEXT_WINDOW)
    surrounding = full_text[window_lo:window_hi].lower()

    # A nearby keyword is the strongest signal and wins outright.
    for keyword in _CONTEXT_KEYWORDS:
        if keyword in surrounding:
            return 0.97

    return 0.90 if is_high_entropy else 0.75
|
|
734
|
+
|
|
735
|
+
def _compute_auto_block(self, hit: SecretHit) -> bool:
    """Decide whether *hit* should be auto-blocked.

    Precedence:

    1. An explicit ``self._auto_block_override`` of ``True`` or ``False`` is
       returned as-is.
    2. Types listed in ``_ZERO_TOLERANCE_TYPES`` always block.
    3. Types listed in ``_CONFIDENCE_GATED_TYPES`` block only when the hit's
       confidence reaches ``_CONFIDENCE_GATE_THRESHOLD``.
    4. Every other type does not block.

    Args:
        hit: The hit to evaluate; only ``secret_type`` and ``confidence``
            are read.

    Returns:
        ``True`` when the hit must be blocked, ``False`` otherwise.
    """
    forced = self._auto_block_override
    # Identity checks on purpose: only the real booleans act as an override,
    # any other value falls through to the per-type policy.
    if forced is True or forced is False:
        return forced

    kind = hit.secret_type
    if kind in _ZERO_TOLERANCE_TYPES:
        return True

    return (
        kind in _CONFIDENCE_GATED_TYPES
        and hit.confidence >= _CONFIDENCE_GATE_THRESHOLD
    )
|
|
748
|
+
|
|
749
|
+
|
|
750
|
+
# ---------------------------------------------------------------------------
|
|
751
|
+
# Module-level helpers
|
|
752
|
+
# ---------------------------------------------------------------------------
|
|
753
|
+
|
|
754
|
+
|
|
755
|
+
def _dedup_hits(hits: list[SecretHit]) -> list[SecretHit]:
    """Remove overlapping hits, keeping the one with highest confidence.

    Two hits overlap when their half-open ``[start, end)`` spans share at
    least one character position. Hits are considered best-first (highest
    confidence, then lowest ``start``; input order breaks remaining ties) and
    a hit is discarded only if it overlaps a hit that has already been kept.

    Selecting best-first matters: a single left-to-right sweep that replaces
    the previous winner can drop a hit because of a neighbour that is itself
    replaced later, so a secret overlapping no surviving hit would be lost.

    Args:
        hits: Candidate hits; only ``start``, ``end`` and ``confidence`` are
            read.

    Returns:
        The surviving hits ordered by ``start``. A list with zero or one
        element is returned unchanged (the same list object).
    """
    if len(hits) <= 1:
        return hits

    # Stable sort: equal (confidence, start) pairs keep their input order.
    ranked = sorted(hits, key=lambda h: (-h.confidence, h.start))

    kept: list[SecretHit] = []
    for hit in ranked:
        overlaps_kept = any(
            hit.start < other.end and other.start < hit.end for other in kept
        )
        if not overlaps_kept:
            kept.append(hit)

    # Callers expect positional order, not confidence order.
    kept.sort(key=lambda h: h.start)
    return kept
|
|
780
|
+
|
|
781
|
+
|
|
782
|
+
def _build_redacted_text(text: str, hits: list[SecretHit]) -> str:
    """Replace every hit span in *text* with its ``redacted_value`` marker.

    Hits are processed in ``start`` order. The function also stays safe when
    hits overlap: a hit lying entirely inside an already-redacted region is
    skipped, and the read cursor never moves backwards, so characters covered
    by any hit are never copied into the output.

    Args:
        text: The original text.
        hits: Hits to redact; only ``start``, ``end`` and ``redacted_value``
            are read.

    Returns:
        *text* with each hit span replaced by its marker, or *text* itself
        when *hits* is empty.
    """
    if not hits:
        return text

    parts: list[str] = []
    cursor = 0  # index of the first character not yet emitted or redacted

    for hit in sorted(hits, key=lambda h: h.start):
        if hit.start < cursor and hit.end <= cursor:
            # Fully covered by an earlier redaction; nothing new to hide.
            continue
        if hit.start > cursor:
            parts.append(text[cursor : hit.start])
        parts.append(hit.redacted_value)
        # Never rewind: rewinding would re-emit already-redacted characters.
        cursor = max(cursor, hit.end)

    if cursor < len(text):
        parts.append(text[cursor:])

    return "".join(parts)
|