spanforge 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. spanforge/__init__.py +815 -0
  2. spanforge/_ansi.py +93 -0
  3. spanforge/_batch_exporter.py +409 -0
  4. spanforge/_cli.py +2094 -0
  5. spanforge/_cli_audit.py +639 -0
  6. spanforge/_cli_compliance.py +711 -0
  7. spanforge/_cli_cost.py +243 -0
  8. spanforge/_cli_ops.py +791 -0
  9. spanforge/_cli_phase11.py +356 -0
  10. spanforge/_hooks.py +337 -0
  11. spanforge/_server.py +1708 -0
  12. spanforge/_span.py +1036 -0
  13. spanforge/_store.py +288 -0
  14. spanforge/_stream.py +664 -0
  15. spanforge/_trace.py +335 -0
  16. spanforge/_tracer.py +254 -0
  17. spanforge/actor.py +141 -0
  18. spanforge/alerts.py +469 -0
  19. spanforge/auto.py +464 -0
  20. spanforge/baseline.py +335 -0
  21. spanforge/cache.py +635 -0
  22. spanforge/compliance.py +325 -0
  23. spanforge/config.py +532 -0
  24. spanforge/consent.py +228 -0
  25. spanforge/consumer.py +377 -0
  26. spanforge/core/__init__.py +5 -0
  27. spanforge/core/compliance_mapping.py +1254 -0
  28. spanforge/cost.py +600 -0
  29. spanforge/debug.py +548 -0
  30. spanforge/deprecations.py +205 -0
  31. spanforge/drift.py +482 -0
  32. spanforge/egress.py +58 -0
  33. spanforge/eval.py +648 -0
  34. spanforge/event.py +1064 -0
  35. spanforge/exceptions.py +240 -0
  36. spanforge/explain.py +178 -0
  37. spanforge/export/__init__.py +69 -0
  38. spanforge/export/append_only.py +337 -0
  39. spanforge/export/cloud.py +357 -0
  40. spanforge/export/datadog.py +497 -0
  41. spanforge/export/grafana.py +320 -0
  42. spanforge/export/jsonl.py +195 -0
  43. spanforge/export/openinference.py +158 -0
  44. spanforge/export/otel_bridge.py +294 -0
  45. spanforge/export/otlp.py +811 -0
  46. spanforge/export/otlp_bridge.py +233 -0
  47. spanforge/export/redis_backend.py +282 -0
  48. spanforge/export/siem_schema.py +98 -0
  49. spanforge/export/siem_splunk.py +264 -0
  50. spanforge/export/siem_syslog.py +212 -0
  51. spanforge/export/webhook.py +299 -0
  52. spanforge/exporters/__init__.py +30 -0
  53. spanforge/exporters/console.py +271 -0
  54. spanforge/exporters/jsonl.py +144 -0
  55. spanforge/exporters/sqlite.py +142 -0
  56. spanforge/gate.py +1150 -0
  57. spanforge/governance.py +181 -0
  58. spanforge/hitl.py +295 -0
  59. spanforge/http.py +187 -0
  60. spanforge/inspect.py +427 -0
  61. spanforge/integrations/__init__.py +45 -0
  62. spanforge/integrations/_pricing.py +280 -0
  63. spanforge/integrations/anthropic.py +388 -0
  64. spanforge/integrations/azure_openai.py +133 -0
  65. spanforge/integrations/bedrock.py +292 -0
  66. spanforge/integrations/crewai.py +251 -0
  67. spanforge/integrations/gemini.py +351 -0
  68. spanforge/integrations/groq.py +442 -0
  69. spanforge/integrations/langchain.py +349 -0
  70. spanforge/integrations/langgraph.py +306 -0
  71. spanforge/integrations/llamaindex.py +373 -0
  72. spanforge/integrations/ollama.py +287 -0
  73. spanforge/integrations/openai.py +368 -0
  74. spanforge/integrations/together.py +483 -0
  75. spanforge/io.py +214 -0
  76. spanforge/lint.py +322 -0
  77. spanforge/metrics.py +417 -0
  78. spanforge/metrics_export.py +343 -0
  79. spanforge/migrate.py +402 -0
  80. spanforge/model_registry.py +278 -0
  81. spanforge/models.py +389 -0
  82. spanforge/namespaces/__init__.py +254 -0
  83. spanforge/namespaces/audit.py +256 -0
  84. spanforge/namespaces/cache.py +237 -0
  85. spanforge/namespaces/chain.py +77 -0
  86. spanforge/namespaces/confidence.py +72 -0
  87. spanforge/namespaces/consent.py +92 -0
  88. spanforge/namespaces/cost.py +179 -0
  89. spanforge/namespaces/decision.py +143 -0
  90. spanforge/namespaces/diff.py +157 -0
  91. spanforge/namespaces/drift.py +80 -0
  92. spanforge/namespaces/eval_.py +251 -0
  93. spanforge/namespaces/feedback.py +241 -0
  94. spanforge/namespaces/fence.py +193 -0
  95. spanforge/namespaces/guard.py +105 -0
  96. spanforge/namespaces/hitl.py +91 -0
  97. spanforge/namespaces/latency.py +72 -0
  98. spanforge/namespaces/prompt.py +190 -0
  99. spanforge/namespaces/redact.py +173 -0
  100. spanforge/namespaces/retrieval.py +379 -0
  101. spanforge/namespaces/runtime_governance.py +494 -0
  102. spanforge/namespaces/template.py +208 -0
  103. spanforge/namespaces/tool_call.py +77 -0
  104. spanforge/namespaces/trace.py +1029 -0
  105. spanforge/normalizer.py +171 -0
  106. spanforge/plugins.py +82 -0
  107. spanforge/presidio_backend.py +349 -0
  108. spanforge/processor.py +258 -0
  109. spanforge/prompt_registry.py +418 -0
  110. spanforge/py.typed +0 -0
  111. spanforge/redact.py +914 -0
  112. spanforge/regression.py +192 -0
  113. spanforge/runtime_policy.py +159 -0
  114. spanforge/sampling.py +511 -0
  115. spanforge/schema.py +183 -0
  116. spanforge/schemas/v1.0/schema.json +170 -0
  117. spanforge/schemas/v2.0/schema.json +536 -0
  118. spanforge/sdk/__init__.py +625 -0
  119. spanforge/sdk/_base.py +584 -0
  120. spanforge/sdk/_base.pyi +71 -0
  121. spanforge/sdk/_exceptions.py +1096 -0
  122. spanforge/sdk/_types.py +2184 -0
  123. spanforge/sdk/alert.py +1514 -0
  124. spanforge/sdk/alert.pyi +56 -0
  125. spanforge/sdk/audit.py +1196 -0
  126. spanforge/sdk/audit.pyi +67 -0
  127. spanforge/sdk/cec.py +1215 -0
  128. spanforge/sdk/cec.pyi +37 -0
  129. spanforge/sdk/config.py +641 -0
  130. spanforge/sdk/config.pyi +55 -0
  131. spanforge/sdk/enterprise.py +714 -0
  132. spanforge/sdk/enterprise.pyi +79 -0
  133. spanforge/sdk/explain.py +170 -0
  134. spanforge/sdk/fallback.py +432 -0
  135. spanforge/sdk/feedback.py +351 -0
  136. spanforge/sdk/gate.py +874 -0
  137. spanforge/sdk/gate.pyi +51 -0
  138. spanforge/sdk/identity.py +2114 -0
  139. spanforge/sdk/identity.pyi +47 -0
  140. spanforge/sdk/lineage.py +175 -0
  141. spanforge/sdk/observe.py +1065 -0
  142. spanforge/sdk/observe.pyi +50 -0
  143. spanforge/sdk/operator.py +338 -0
  144. spanforge/sdk/pii.py +1473 -0
  145. spanforge/sdk/pii.pyi +119 -0
  146. spanforge/sdk/pipelines.py +458 -0
  147. spanforge/sdk/pipelines.pyi +39 -0
  148. spanforge/sdk/policy.py +930 -0
  149. spanforge/sdk/rag.py +594 -0
  150. spanforge/sdk/rbac.py +280 -0
  151. spanforge/sdk/registry.py +430 -0
  152. spanforge/sdk/registry.pyi +46 -0
  153. spanforge/sdk/scope.py +279 -0
  154. spanforge/sdk/secrets.py +293 -0
  155. spanforge/sdk/secrets.pyi +25 -0
  156. spanforge/sdk/security.py +560 -0
  157. spanforge/sdk/security.pyi +57 -0
  158. spanforge/sdk/trust.py +472 -0
  159. spanforge/sdk/trust.pyi +41 -0
  160. spanforge/secrets.py +799 -0
  161. spanforge/signing.py +1179 -0
  162. spanforge/stats.py +100 -0
  163. spanforge/stream.py +560 -0
  164. spanforge/testing.py +378 -0
  165. spanforge/testing_mocks.py +1052 -0
  166. spanforge/trace.py +199 -0
  167. spanforge/types.py +696 -0
  168. spanforge/ulid.py +300 -0
  169. spanforge/validate.py +379 -0
  170. spanforge-1.0.0.dist-info/METADATA +1509 -0
  171. spanforge-1.0.0.dist-info/RECORD +174 -0
  172. spanforge-1.0.0.dist-info/WHEEL +4 -0
  173. spanforge-1.0.0.dist-info/entry_points.txt +5 -0
  174. spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
spanforge/secrets.py ADDED
@@ -0,0 +1,799 @@
1
+ """spanforge.secrets — Secrets detection engine (sf-secrets Phase 2).
2
+
3
+ This module implements the core in-process secrets scanning logic for the
4
+ SpanForge sf-secrets service. It is designed to run without any network
5
+ calls and is safe to import directly — the :class:`SecretsScanner` class
6
+ wraps all pattern matching, entropy scoring, allowlist filtering, and
7
+ auto-block policy logic.
8
+
9
+ Detection model
10
+ ---------------
11
+ Each candidate match is assigned a **confidence score** between 0 and 1:
12
+
13
+ * ``0.75`` — structural pattern match only.
14
+ * ``0.90`` — pattern match + Shannon entropy ≥ 3.5 bits/char on a token of
15
+ ≥ 32 characters.
16
+ * ``0.97`` — pattern + entropy + a context keyword (``password``, ``token``,
17
+ ``secret``, ``key``, ``credential``, ``api_key``, ``apikey``, ``auth``,
18
+ ``access_key``, ``private_key``) appears within ±50 characters.
19
+
20
+ Auto-block policy
21
+ -----------------
22
+ * **Zero-tolerance types** are always blocked regardless of the confidence
23
+ threshold supplied by the caller: Bearer Token, AWS Access Key, GCP Service
24
+ Account JSON, PEM/OPENSSH Private Key, SSH Private Key, HC API key
25
+ (``hc_(live|test)_*``), SF API key (``sf_(live|test)_*``), GitHub PAT,
26
+ Stripe live key (``sk_live_*``), Generic JWT.
27
+ * **Confidence-gated types** are blocked only when their confidence reaches
28
+ ≥ 0.90: Generic API Key, DB connection string.
29
+
30
+ Security requirements
31
+ ---------------------
32
+ * ``SecretHit.redacted_value`` is **always** ``"[REDACTED:<SECRET_TYPE>]"`` — the
33
+ matched value is never included.
34
+ * The entropy function runs in time proportional to the *length* of the
35
+ input string and does not branch on which characters it contains, so it is safe to call on secret material.
36
+ * The allowlist uses exact ``frozenset`` membership tests; no partial matching
37
+ is applied to allowlist entries.
38
+ """
39
+
40
+ from __future__ import annotations
41
+
42
+ import math
43
+ import re
44
+ from dataclasses import dataclass, field
45
+ from typing import Any
46
+
47
+ __all__ = [
48
+ "SecretHit",
49
+ "SecretsScanResult",
50
+ "SecretsScanner",
51
+ "entropy_score",
52
+ ]
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Constants
56
+ # ---------------------------------------------------------------------------
57
+
58
# Minimum Shannon entropy, in bits per character, for a matched token to be
# treated as high-entropy (compared against ``entropy_score(token)``).
_ENTROPY_THRESHOLD: float = 3.5
# Tokens shorter than this many characters never qualify for the entropy tier.
_ENTROPY_MIN_LENGTH: int = 32
_CONTEXT_WINDOW: int = 50  # characters either side of a match to search

# Keywords searched for in the lower-cased text surrounding a match.  The
# scanner uses a plain substring test (``kw in context``), so the bare "key"
# entry already matches any context containing "api_key", "apikey",
# "access_key" or "private_key"; the longer entries are kept for readability.
_CONTEXT_KEYWORDS: frozenset[str] = frozenset(
    {
        "password",
        "token",
        "secret",
        "key",
        "credential",
        "api_key",
        "apikey",
        "auth",
        "access_key",
        "private_key",
    }
)
76
+
77
+ # ---------------------------------------------------------------------------
78
+ # Zero-tolerance secret types (always auto-blocked)
79
+ # ---------------------------------------------------------------------------
80
+
81
# Secret types that bypass the caller's confidence threshold in ``scan`` and
# are always flagged ``auto_blocked`` by the standard policy.
_ZERO_TOLERANCE_TYPES: frozenset[str] = frozenset(
    {
        "bearer_token",
        "aws_access_key",
        "gcp_service_account",
        "pem_private_key",
        "ssh_private_key",
        "halluccheck_api_key",
        "spanforge_api_key",
        "github_pat",
        "stripe_live_key",
        "generic_jwt",
    }
)

# Confidence-gated types — block only if confidence >= 0.90
_CONFIDENCE_GATED_TYPES: frozenset[str] = frozenset(
    {
        "generic_api_key",
        "db_connection_string",
    }
)

# Minimum confidence at which a confidence-gated type is auto-blocked.
_CONFIDENCE_GATE_THRESHOLD: float = 0.90
105
+
106
+ # ---------------------------------------------------------------------------
107
+ # Vault hints — suggest where to store each secret type
108
+ # ---------------------------------------------------------------------------
109
+
110
# Maps a secret-type label to a human-readable remediation suggestion.  The
# scanner looks entries up with ``.get(secret_type, "")``, so a type without
# an entry simply yields an empty hint.
_VAULT_HINTS: dict[str, str] = {
    "aws_access_key": (
        "Move to AWS Secrets Manager: "
        "aws secretsmanager create-secret --name my-aws-creds --secret-string <value>"
    ),
    "gcp_service_account": (
        "Move to Google Cloud Secret Manager: "
        "gcloud secrets create my-gcp-key --data-file=service-account.json"
    ),
    "azure_connection_string": (
        "Move to Azure Key Vault: "
        "az keyvault secret set --vault-name MyVault --name my-conn-str --value <value>"
    ),
    "pem_private_key": (
        "Move to HashiCorp Vault: vault kv put secret/tls private_key=@keyfile.pem"
    ),
    "ssh_private_key": ("Move to HashiCorp Vault: vault kv put secret/ssh private_key=@id_rsa"),
    "stripe_live_key": ("Move to HashiCorp Vault: vault kv put secret/stripe live_key=<value>"),
    "stripe_test_key": ("Move to HashiCorp Vault: vault kv put secret/stripe test_key=<value>"),
    "generic_api_key": ("Move to HashiCorp Vault: vault kv put secret/api key=<value>"),
    "github_pat": (
        "Move to GitHub Secrets or HashiCorp Vault: gh secret set MY_PAT --body <value>"
    ),
    "slack_token": (  # nosec B105
        "Move to HashiCorp Vault: vault kv put secret/slack token=<value>"
    ),
    "sendgrid_key": ("Move to HashiCorp Vault: vault kv put secret/sendgrid api_key=<value>"),
    "db_connection_string": (
        "Move to AWS Secrets Manager, Azure Key Vault, or HashiCorp Vault. "
        "Never embed credentials in connection strings in code."
    ),
    # --- Hints for the remaining types in the pattern registry ---
    "bearer_token": (
        "Rotate immediately. Store the signing secret in HashiCorp Vault: "
        "vault kv put secret/jwt signing_key=<value>"
    ),
    "halluccheck_api_key": (
        "Move to HashiCorp Vault: vault kv put secret/halluccheck api_key=<value>"
    ),
    "spanforge_api_key": (
        "Move to HashiCorp Vault: vault kv put secret/spanforge api_key=<value>"
    ),
    "npm_token": (
        "Move to npm Automation token scoped to the package, or store in HashiCorp Vault: "
        "vault kv put secret/npm publish_token=<value>"
    ),
    "twilio_key": (
        "Move to HashiCorp Vault: vault kv put secret/twilio auth_token=<value>"
    ),
    "google_api_key": (
        "Move to Google Cloud Secret Manager: "
        "gcloud secrets create my-google-api-key --data-file=-"
    ),
    "terraform_cloud_token": (
        "Move to HashiCorp Vault or set via TF_TOKEN_app_terraform_io environment variable. "
        "Never embed tokens in .terraformrc or tfvars files."
    ),
    "vault_token": (
        "Rotate the root/service token immediately. Use a short-TTL token or AppRole auth: "
        "vault write auth/approle/login role_id=<role> secret_id=<secret>"
    ),
    "generic_jwt": (
        "Rotate the JWT signing secret. Store the signing key in HashiCorp Vault: "
        "vault kv put secret/jwt signing_secret=<value>"
    ),
}
176
+
177
+ # ---------------------------------------------------------------------------
178
+ # Pattern registry
179
+ # ---------------------------------------------------------------------------
180
+ # Each entry: (secret_type, compiled_regex, zero_tolerance)
181
+ # Order matters — more specific patterns should appear before generic ones.
182
+
183
# Shape of one registry row: (secret_type label, compiled regex, zero-tolerance flag).
_PatternEntry = tuple[str, re.Pattern[str], bool]


def _compile(pattern: str) -> re.Pattern[str]:
    """Compile *pattern* with default flags; a thin wrapper over ``re.compile``."""
    return re.compile(pattern)
188
+
189
+
190
# Structural patterns tried against every scanned text.  Hits are collected in
# this order, which is also the order that decides ties between hits sharing a
# start offset and confidence during de-duplication (the sort is stable).
# The boolean third element mirrors ``_ZERO_TOLERANCE_TYPES``; the scanner
# unpacks it but does not read it — blocking decisions use the frozenset.
_PATTERN_REGISTRY: list[_PatternEntry] = [
    # --- Spec-required (7) ---
    # SEC-001-A: Bearer token (JWT form)
    (
        "bearer_token",
        _compile(r"(?i)Bearer\s+eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+"),
        True,
    ),
    # SEC-001-B: AWS access key
    (
        "aws_access_key",
        _compile(r"(?<![A-Z0-9])AKIA[0-9A-Z]{16}(?![A-Z0-9])"),
        True,
    ),
    # SEC-001-C: GCP service account JSON fragment
    (
        "gcp_service_account",
        _compile(r'"type"\s*:\s*"service_account"'),
        True,
    ),
    # SEC-001-D: PEM private keys
    (
        "pem_private_key",
        _compile(r"-----BEGIN (?:RSA |EC |DSA )?PRIVATE KEY-----"),
        True,
    ),
    # SEC-001-E: DB connection strings with embedded credentials
    (
        "db_connection_string",
        _compile(
            r"(?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|mssql|oracle)"
            r"://[^:/@\s]{1,255}:[^@\s]{1,1024}@"
        ),
        False,
    ),
    # SEC-001-F: HallucCheck / Spanforge API key variants
    (
        "halluccheck_api_key",
        _compile(r"hc_(?:live|test)_[0-9A-Za-z]{48}"),
        True,
    ),
    (
        "spanforge_api_key",
        _compile(r"sf_(?:live|test)_[0-9A-Za-z]{48}"),
        True,
    ),
    # --- Extended (13) ---
    # SEC-001-G: GitHub PAT (new and classic formats)
    (
        "github_pat",
        _compile(
            r"(?:ghp_[A-Za-z0-9]{36,255}"
            r"|gho_[A-Za-z0-9]{36,255}"
            r"|ghu_[A-Za-z0-9]{36,255}"
            r"|ghs_[A-Za-z0-9]{36,255}"
            r"|ghr_[A-Za-z0-9]{36,255}"
            r"|github_pat_[A-Za-z0-9_]{36,255})"
        ),
        True,
    ),
    # SEC-001-H: npm publish token
    (
        "npm_token",
        _compile(r"npm_[A-Za-z0-9]{36}"),
        False,
    ),
    # SEC-001-I: Slack bot/app tokens
    (
        "slack_token",
        _compile(r"xox[baprs]-[0-9A-Za-z]{8,}-[0-9A-Za-z-]{8,}"),
        False,
    ),
    # SEC-001-J: Stripe live secret key
    (
        "stripe_live_key",
        _compile(r"sk_live_[0-9A-Za-z]{24,}"),
        True,
    ),
    # SEC-001-K: Stripe test secret key
    (
        "stripe_test_key",
        _compile(r"sk_test_[0-9A-Za-z]{24,}"),
        False,
    ),
    # SEC-001-L: Twilio auth token / SID
    (
        "twilio_key",
        _compile(r"SK[0-9a-fA-F]{32}"),
        False,
    ),
    # SEC-001-M: SendGrid API key
    (
        "sendgrid_key",
        _compile(r"SG\.[A-Za-z0-9_-]{22}\.[A-Za-z0-9_-]{43}"),
        False,
    ),
    # SEC-001-N: Azure storage / service bus connection strings
    (
        "azure_connection_string",
        _compile(
            r"(?:DefaultEndpointsProtocol=https?;AccountName=[^;]{1,255};AccountKey=[^;]{1,1024}"
            r"|Endpoint=sb://[^;]{1,255};SharedAccessKeyName=[^;]{1,255};SharedAccessKey=[^;\s]{1,255})"
        ),
        False,
    ),
    # SEC-001-O: OPENSSH private key header
    (
        "ssh_private_key",
        _compile(r"-----BEGIN OPENSSH PRIVATE KEY-----"),
        True,
    ),
    # SEC-001-P: Google API key
    (
        "google_api_key",
        _compile(r"AIza[0-9A-Za-z\-_]{35}"),
        False,
    ),
    # SEC-001-Q: Terraform Cloud / Terraform Enterprise token
    (
        "terraform_cloud_token",
        _compile(r"[Aa]tlas[Tt]oken\s*=\s*['\"]?[A-Za-z0-9.]{8,}['\"]?"),
        False,
    ),
    # SEC-001-R: HashiCorp Vault root/service token (s. prefix)
    (
        "vault_token",
        _compile(r"(?<![A-Za-z0-9])s\.[A-Za-z0-9]{24,}(?![A-Za-z0-9])"),
        False,
    ),
    # SEC-001-S: Generic JWT (without Bearer prefix)
    (
        "generic_jwt",
        _compile(r"(?<!\w)eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}(?!\w)"),
        True,
    ),
]
326
+
327
# Generic API key pattern — applied separately with entropy check.
# Matches any run of 32+ letters, digits, underscores or hyphens; the scanner
# only reports such a run as "generic_api_key" when it also clears the entropy
# threshold, since the pattern alone matches ordinary long identifiers.
_GENERIC_API_KEY_PATTERN: re.Pattern[str] = re.compile(r"[0-9A-Za-z_\-]{32,}")
329
+
330
+ # ---------------------------------------------------------------------------
331
+ # Default allowlist — known test/placeholder values that should never alert
332
+ # ---------------------------------------------------------------------------
333
+
334
# Built-in allowlist.  The scanner suppresses a hit only when the matched span
# is *exactly equal* to one of these strings; no prefix or substring matching
# is applied.
# NOTE(review): "AKIA_EXAMPLE" and "SG.example" look too short to equal any
# span produced by the patterns in this module — confirm they are intentional.
_DEFAULT_ALLOWLIST: frozenset[str] = frozenset(
    {
        "AKIA_EXAMPLE",
        "AKIAIOSFODNN7EXAMPLE",
        "sk_test_" + "0" * 24,
        "hc_test_" + "0" * 48,
        "sf_test_" + "0" * 48,
        "AIzaSyExampleKey1234567890123456789",
        "SG.example",
        "xoxb-000000000000-000000000000-000000000000000000000000",
    }
)
346
+
347
+ # ---------------------------------------------------------------------------
348
+ # Data types
349
+ # ---------------------------------------------------------------------------
350
+
351
+
352
@dataclass(frozen=True)
class SecretHit:
    """A single detected secret span within a scanned text.

    Instances are immutable (``frozen=True``).

    Attributes:
        secret_type: Label identifying the category of secret detected
            (e.g. ``"aws_access_key"``).
        start: Start character offset in the original text (inclusive).
        end: End character offset in the original text (exclusive).
        confidence: Detection confidence in ``[0.0, 1.0]``.
        redacted_value: Safe placeholder.  The scanner fills this with
            ``"[REDACTED:<SECRET_TYPE>]"``, where the type label is
            upper-cased (e.g. ``"[REDACTED:AWS_ACCESS_KEY]"``).
            The actual matched text is **never** stored here.
        auto_blocked: ``True`` when this hit triggers the auto-block policy.
        vault_hint: Optional suggestion for migrating this secret to a
            secrets vault; empty string when no hint is available.
    """

    secret_type: str
    start: int  # inclusive offset into the scanned text
    end: int  # exclusive offset into the scanned text
    confidence: float
    redacted_value: str
    auto_blocked: bool = False
    vault_hint: str = ""
377
+
378
+
379
@dataclass
class SecretsScanResult:
    """Result of a secrets scan operation.

    Attributes:
        detected: ``True`` when at least one hit above the confidence
            threshold was found.
        hits: All detected :class:`SecretHit` objects above the
            threshold, in order of appearance.
        auto_blocked: ``True`` when any hit triggered the auto-block
            policy (zero-tolerance or confidence-gated).
        redacted_text: Full input text with every hit replaced by its
            ``redacted_value`` marker.
        secret_types: Deduplicated list of ``secret_type`` labels from
            all hits (order of first appearance).
        confidence_scores: Parallel list of confidence scores for each hit.
    """

    detected: bool
    hits: list[SecretHit]
    auto_blocked: bool
    redacted_text: str
    secret_types: list[str] = field(default_factory=list)
    confidence_scores: list[float] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict representation safe for JSON serialisation.

        Each hit becomes a dict of its fields; ``vault_hint`` is included
        only when it is non-empty.
        """
        serialised_hits: list[dict[str, Any]] = []
        for h in self.hits:
            entry: dict[str, Any] = {
                "secret_type": h.secret_type,
                "start": h.start,
                "end": h.end,
                "confidence": h.confidence,
                "redacted_value": h.redacted_value,
                "auto_blocked": h.auto_blocked,
            }
            if h.vault_hint:
                entry["vault_hint"] = h.vault_hint
            serialised_hits.append(entry)

        return {
            "detected": self.detected,
            "auto_blocked": self.auto_blocked,
            "redacted_text": self.redacted_text,
            "secret_types": self.secret_types,
            "confidence_scores": self.confidence_scores,
            "hits": serialised_hits,
        }

    def to_sarif(
        self,
        *,
        tool_name: str = "spanforge-secrets",
        version: str = "1.0.0",
    ) -> dict[str, Any]:
        """Return a minimal SARIF 2.1.0 report dict.

        One ``result`` is emitted per hit (level ``"error"`` when the hit is
        auto-blocked, otherwise ``"warning"``), and one ``rule`` per distinct
        secret type in order of first appearance.

        Args:
            tool_name: The tool name to embed in the SARIF ``tool`` object.
            version: Tool version string.

        Returns:
            Dict conforming to SARIF schema 2.1.0.
        """
        results = [
            {
                "ruleId": hit.secret_type,
                "level": "error" if hit.auto_blocked else "warning",
                "message": {
                    "text": (
                        f"Detected secret of type '{hit.secret_type}' "
                        f"(confidence={hit.confidence:.2f}). "
                        f"{hit.vault_hint or 'Move this value to a secrets vault.'}"
                    )
                },
                "locations": [
                    {
                        "physicalLocation": {
                            "region": {
                                "charOffset": hit.start,
                                "charLength": hit.end - hit.start,
                            }
                        }
                    }
                ],
                "properties": {
                    "confidence": hit.confidence,
                    "auto_blocked": hit.auto_blocked,
                    "redacted_value": hit.redacted_value,
                },
            }
            for hit in self.hits
        ]

        # Build deduplicated rules list (SARIF 2.1 §3.52 — tool.driver.rules).
        # A rule is "error" when ANY hit of that type is auto-blocked, so the
        # published severity does not depend on which hit happens to come first.
        blocked_types = {hit.secret_type for hit in self.hits if hit.auto_blocked}
        rules: list[dict[str, Any]] = [
            {
                "id": secret_type,
                "shortDescription": {
                    "text": f"Detected hard-coded {secret_type.replace('_', ' ').title()}."
                },
                "helpUri": "https://docs.spanforge.dev/secrets",
                "properties": {
                    "severity": "error" if secret_type in blocked_types else "warning",
                },
            }
            # dict.fromkeys keeps first-appearance order while removing repeats.
            for secret_type in dict.fromkeys(hit.secret_type for hit in self.hits)
        ]

        return {
            "$schema": "https://raw.githubusercontent.com/oasis-tcs/sarif-spec/master/Schemata/sarif-schema-2.1.0.json",
            "version": "2.1.0",
            "runs": [
                {
                    "tool": {
                        "driver": {
                            "name": tool_name,
                            "version": version,
                            "informationUri": "https://docs.spanforge.dev/secrets",
                            "rules": rules,
                        }
                    },
                    "results": results,
                }
            ],
        }
508
+
509
+
510
+ # ---------------------------------------------------------------------------
511
+ # Entropy helper
512
+ # ---------------------------------------------------------------------------
513
+
514
+
515
def entropy_score(s: str) -> float:
    """Return Shannon entropy in bits per character for *s*.

    A value ≥ 3.5 bits/char on a token of ≥ 32 characters is a strong
    indicator that the string was generated by a CSPRNG (e.g. an API key or
    bearer token).

    Args:
        s: The string to measure.

    Returns:
        Shannon entropy in bits per character.  Returns ``0.0`` for empty
        strings and for strings made of a single repeated character (never
        negative zero).

    Example::

        >>> entropy_score("aaaaaaaaaaaaaaaaaaaaaaaaa")
        0.0
        >>> entropy_score("AKIAIOSFODNN7EXAMPLEKEY")  # doctest: +ELLIPSIS
        3.7...
    """
    if not s:
        return 0.0
    freq: dict[str, int] = {}
    for ch in s:
        freq[ch] = freq.get(ch, 0) + 1
    n = len(s)
    # Each term p * log2(p) is <= 0, so the accumulated sum is <= 0.
    weighted = sum((count / n) * math.log2(count / n) for count in freq.values())
    # Negating a zero sum would yield -0.0 (which prints as "-0.0"); return a
    # plain 0.0 in that case so callers and doctests see a clean zero.
    return -weighted if weighted else 0.0
543
+
544
+
545
+ # ---------------------------------------------------------------------------
546
+ # Scanner
547
+ # ---------------------------------------------------------------------------
548
+
549
+
550
class SecretsScanner:
    """In-process secrets detection engine.

    All scanning logic runs locally — no network calls are made.  Matches
    are scored with a three-tier confidence model, filtered against an
    allowlist, and subjected to the auto-block policy.

    Args:
        confidence_threshold: Default minimum confidence required to include
            a hit in the result (default: ``0.75``).  Zero-tolerance types
            always appear regardless of this threshold.
        extra_allowlist: Additional literal strings that should never
            trigger an alert (merged with the built-in allowlist).
        auto_block_override: If ``True``, all hits above the threshold are
            flagged ``auto_blocked``; if ``False``, the auto-block policy is
            never applied (useful for audit-only mode).  ``None`` (default)
            uses the standard policy table.

    Raises:
        ValueError: If ``confidence_threshold`` is outside ``[0, 1]``.

    Example::

        scanner = SecretsScanner()
        result = scanner.scan("AKIA" + "A" * 16 + " is my AWS key")
        assert result.auto_blocked
    """

    def __init__(
        self,
        confidence_threshold: float = 0.75,
        extra_allowlist: frozenset[str] | None = None,
        auto_block_override: bool | None = None,
    ) -> None:
        self._threshold = self._validated_threshold(confidence_threshold)
        self._allowlist: frozenset[str] = (
            _DEFAULT_ALLOWLIST | extra_allowlist if extra_allowlist else _DEFAULT_ALLOWLIST
        )
        self._auto_block_override = auto_block_override

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    def scan(
        self,
        text: str,
        *,
        confidence_threshold: float | None = None,
    ) -> SecretsScanResult:
        """Scan *text* for secrets and return a :class:`SecretsScanResult`.

        Args:
            text: The text to scan.  May be any length.
            confidence_threshold: Override the instance-level threshold for
                this single call.

        Returns:
            A :class:`SecretsScanResult`.  The ``redacted_text`` field always
            contains the full input with every qualifying hit replaced by its
            redaction marker, even when ``detected`` is ``False``.

        Raises:
            TypeError: If *text* is not a ``str``.
            ValueError: If ``confidence_threshold`` is given and lies outside
                ``[0, 1]`` (the same rule the constructor enforces).
        """
        if not isinstance(text, str):
            msg = f"scan() requires a str; got {type(text).__name__}"
            raise TypeError(msg)

        # A per-call override is held to the same range as the constructor
        # argument; an out-of-range value (e.g. 90 instead of 0.90) would
        # otherwise silently hide every non-zero-tolerance hit.
        if confidence_threshold is None:
            threshold = self._threshold
        else:
            threshold = self._validated_threshold(confidence_threshold)

        qualified: list[SecretHit] = []
        for hit in self._find_all_hits(text):
            # Allowlist suppression: exact match on the matched span only.
            if text[hit.start : hit.end] in self._allowlist:
                continue

            # Zero-tolerance always included, others filtered by threshold.
            if hit.secret_type not in _ZERO_TOLERANCE_TYPES and hit.confidence < threshold:
                continue

            qualified.append(
                SecretHit(
                    secret_type=hit.secret_type,
                    start=hit.start,
                    end=hit.end,
                    confidence=hit.confidence,
                    redacted_value=f"[REDACTED:{hit.secret_type.upper()}]",
                    auto_blocked=self._compute_auto_block(hit),
                    vault_hint=_VAULT_HINTS.get(hit.secret_type, ""),
                )
            )

        # Deduplicate overlapping spans — keep highest confidence hit per span.
        qualified = _dedup_hits(qualified)

        return SecretsScanResult(
            detected=bool(qualified),
            hits=qualified,
            auto_blocked=any(h.auto_blocked for h in qualified),
            redacted_text=_build_redacted_text(text, qualified),
            # dict.fromkeys removes repeats while keeping first-appearance order.
            secret_types=list(dict.fromkeys(h.secret_type for h in qualified)),
            confidence_scores=[h.confidence for h in qualified],
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _validated_threshold(value: float) -> float:
        """Return *value* unchanged, or raise ``ValueError`` if outside ``[0, 1]``."""
        if not 0.0 <= value <= 1.0:
            msg = f"confidence_threshold must be in [0, 1]; got {value}"
            raise ValueError(msg)
        return value

    def _find_all_hits(self, text: str) -> list[SecretHit]:
        """Find all candidate hits (unsanitised, before allowlist filtering).

        Registry patterns are tried first, in registry order, followed by the
        entropy-gated generic API key pattern.  Returned hits carry default
        ``auto_blocked`` and ``vault_hint`` values; ``scan`` fills those in.
        """
        hits: list[SecretHit] = []

        for secret_type, pattern, _zero_tol in _PATTERN_REGISTRY:
            marker = f"[REDACTED:{secret_type.upper()}]"
            hits.extend(
                SecretHit(
                    secret_type=secret_type,
                    start=m.start(),
                    end=m.end(),
                    confidence=self._score_hit(secret_type, m, text),
                    redacted_value=marker,
                )
                for m in pattern.finditer(text)
            )

        # Generic API key — entropy-gated: the pattern alone matches any long
        # identifier, so only high-entropy runs are reported.
        for m in _GENERIC_API_KEY_PATTERN.finditer(text):
            token = m.group()
            if len(token) < _ENTROPY_MIN_LENGTH or entropy_score(token) < _ENTROPY_THRESHOLD:
                continue
            hits.append(
                SecretHit(
                    secret_type="generic_api_key",  # noqa: S106  # nosec B106
                    start=m.start(),
                    end=m.end(),
                    confidence=self._score_hit("generic_api_key", m, text),
                    redacted_value="[REDACTED:GENERIC_API_KEY]",
                )
            )

        return hits

    def _score_hit(
        self,
        secret_type: str,
        match: re.Match[str],
        full_text: str,
    ) -> float:
        """Compute a confidence score for a single regex match.

        Scoring tiers:
        * ``0.75`` — structural pattern match alone.
        * ``0.90`` — pattern + high entropy token (≥ 3.5 bits/char, ≥ 32 chars).
        * ``0.97`` — pattern + entropy + context keyword within ±50 chars.

        The context window spans from 50 characters before the match to 50
        characters after it and includes the matched text itself; keywords
        are found by case-insensitive substring search.
        """
        token = match.group()

        # Tier 1: the token is too short or too regular for the entropy tier.
        if len(token) < _ENTROPY_MIN_LENGTH or entropy_score(token) < _ENTROPY_THRESHOLD:
            return 0.75

        # Tier 3 builds on tier 2, as the documented model states
        # ("pattern + entropy + context keyword").
        # NOTE(review): the nesting of the keyword check under the entropy
        # check follows the documented tiers — confirm against upstream.
        window_start = max(0, match.start() - _CONTEXT_WINDOW)
        window_end = min(len(full_text), match.end() + _CONTEXT_WINDOW)
        context = full_text[window_start:window_end].lower()
        if any(kw in context for kw in _CONTEXT_KEYWORDS):
            return 0.97

        # Tier 2: high entropy, no nearby keyword.
        return 0.90

    def _compute_auto_block(self, hit: SecretHit) -> bool:
        """Apply auto-block policy to a hit.

        An explicit ``True``/``False`` override wins; otherwise zero-tolerance
        types are always blocked and confidence-gated types are blocked once
        their confidence reaches the gate threshold.
        """
        override = self._auto_block_override
        if override is True or override is False:
            return override
        # Zero tolerance — always block.
        if hit.secret_type in _ZERO_TOLERANCE_TYPES:
            return True
        # Confidence-gated.
        if hit.secret_type in _CONFIDENCE_GATED_TYPES:
            return hit.confidence >= _CONFIDENCE_GATE_THRESHOLD
        return False
748
+
749
+
750
+ # ---------------------------------------------------------------------------
751
+ # Module-level helpers
752
+ # ---------------------------------------------------------------------------
753
+
754
+
755
def _dedup_hits(hits: list[SecretHit]) -> list[SecretHit]:
    """Collapse overlapping hits so each region is reported once.

    Hits are visited in order of ``start`` (ties: higher confidence first).
    A hit that begins before the end of the most recently kept hit overlaps
    it; the newcomer replaces the kept hit only when its confidence is
    strictly higher, otherwise the newcomer is discarded.  Inputs of zero or
    one hit are returned as-is.
    """
    if len(hits) <= 1:
        return hits

    ordered = list(hits)
    ordered.sort(key=lambda item: (item.start, -item.confidence))

    survivors: list[SecretHit] = []
    covered_until = -1  # end offset of the hit currently at survivors[-1]

    for candidate in ordered:
        overlaps = candidate.start < covered_until
        if overlaps and not candidate.confidence > survivors[-1].confidence:
            # Overlapping and not stronger than the incumbent: drop it.
            continue
        if overlaps:
            survivors[-1] = candidate
        else:
            survivors.append(candidate)
        covered_until = candidate.end

    return survivors
780
+
781
+
782
def _build_redacted_text(text: str, hits: list[SecretHit]) -> str:
    """Return *text* with each hit's span swapped for its ``redacted_value``.

    Hits are applied in order of ``start``; text between and after hits is
    copied through unchanged.  With no hits the input is returned as-is.
    """
    if not hits:
        return text

    ordered = sorted(hits, key=lambda item: item.start)
    pieces: list[str] = []
    position = 0  # offset of the first character not yet emitted

    for span in ordered:
        gap = text[position : span.start] if span.start > position else ""
        pieces += [gap, span.redacted_value]
        position = span.end

    tail = text[position:] if position < len(text) else ""
    pieces.append(tail)
    return "".join(pieces)