spanforge 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spanforge/__init__.py +815 -0
- spanforge/_ansi.py +93 -0
- spanforge/_batch_exporter.py +409 -0
- spanforge/_cli.py +2094 -0
- spanforge/_cli_audit.py +639 -0
- spanforge/_cli_compliance.py +711 -0
- spanforge/_cli_cost.py +243 -0
- spanforge/_cli_ops.py +791 -0
- spanforge/_cli_phase11.py +356 -0
- spanforge/_hooks.py +337 -0
- spanforge/_server.py +1708 -0
- spanforge/_span.py +1036 -0
- spanforge/_store.py +288 -0
- spanforge/_stream.py +664 -0
- spanforge/_trace.py +335 -0
- spanforge/_tracer.py +254 -0
- spanforge/actor.py +141 -0
- spanforge/alerts.py +469 -0
- spanforge/auto.py +464 -0
- spanforge/baseline.py +335 -0
- spanforge/cache.py +635 -0
- spanforge/compliance.py +325 -0
- spanforge/config.py +532 -0
- spanforge/consent.py +228 -0
- spanforge/consumer.py +377 -0
- spanforge/core/__init__.py +5 -0
- spanforge/core/compliance_mapping.py +1254 -0
- spanforge/cost.py +600 -0
- spanforge/debug.py +548 -0
- spanforge/deprecations.py +205 -0
- spanforge/drift.py +482 -0
- spanforge/egress.py +58 -0
- spanforge/eval.py +648 -0
- spanforge/event.py +1064 -0
- spanforge/exceptions.py +240 -0
- spanforge/explain.py +178 -0
- spanforge/export/__init__.py +69 -0
- spanforge/export/append_only.py +337 -0
- spanforge/export/cloud.py +357 -0
- spanforge/export/datadog.py +497 -0
- spanforge/export/grafana.py +320 -0
- spanforge/export/jsonl.py +195 -0
- spanforge/export/openinference.py +158 -0
- spanforge/export/otel_bridge.py +294 -0
- spanforge/export/otlp.py +811 -0
- spanforge/export/otlp_bridge.py +233 -0
- spanforge/export/redis_backend.py +282 -0
- spanforge/export/siem_schema.py +98 -0
- spanforge/export/siem_splunk.py +264 -0
- spanforge/export/siem_syslog.py +212 -0
- spanforge/export/webhook.py +299 -0
- spanforge/exporters/__init__.py +30 -0
- spanforge/exporters/console.py +271 -0
- spanforge/exporters/jsonl.py +144 -0
- spanforge/exporters/sqlite.py +142 -0
- spanforge/gate.py +1150 -0
- spanforge/governance.py +181 -0
- spanforge/hitl.py +295 -0
- spanforge/http.py +187 -0
- spanforge/inspect.py +427 -0
- spanforge/integrations/__init__.py +45 -0
- spanforge/integrations/_pricing.py +280 -0
- spanforge/integrations/anthropic.py +388 -0
- spanforge/integrations/azure_openai.py +133 -0
- spanforge/integrations/bedrock.py +292 -0
- spanforge/integrations/crewai.py +251 -0
- spanforge/integrations/gemini.py +351 -0
- spanforge/integrations/groq.py +442 -0
- spanforge/integrations/langchain.py +349 -0
- spanforge/integrations/langgraph.py +306 -0
- spanforge/integrations/llamaindex.py +373 -0
- spanforge/integrations/ollama.py +287 -0
- spanforge/integrations/openai.py +368 -0
- spanforge/integrations/together.py +483 -0
- spanforge/io.py +214 -0
- spanforge/lint.py +322 -0
- spanforge/metrics.py +417 -0
- spanforge/metrics_export.py +343 -0
- spanforge/migrate.py +402 -0
- spanforge/model_registry.py +278 -0
- spanforge/models.py +389 -0
- spanforge/namespaces/__init__.py +254 -0
- spanforge/namespaces/audit.py +256 -0
- spanforge/namespaces/cache.py +237 -0
- spanforge/namespaces/chain.py +77 -0
- spanforge/namespaces/confidence.py +72 -0
- spanforge/namespaces/consent.py +92 -0
- spanforge/namespaces/cost.py +179 -0
- spanforge/namespaces/decision.py +143 -0
- spanforge/namespaces/diff.py +157 -0
- spanforge/namespaces/drift.py +80 -0
- spanforge/namespaces/eval_.py +251 -0
- spanforge/namespaces/feedback.py +241 -0
- spanforge/namespaces/fence.py +193 -0
- spanforge/namespaces/guard.py +105 -0
- spanforge/namespaces/hitl.py +91 -0
- spanforge/namespaces/latency.py +72 -0
- spanforge/namespaces/prompt.py +190 -0
- spanforge/namespaces/redact.py +173 -0
- spanforge/namespaces/retrieval.py +379 -0
- spanforge/namespaces/runtime_governance.py +494 -0
- spanforge/namespaces/template.py +208 -0
- spanforge/namespaces/tool_call.py +77 -0
- spanforge/namespaces/trace.py +1029 -0
- spanforge/normalizer.py +171 -0
- spanforge/plugins.py +82 -0
- spanforge/presidio_backend.py +349 -0
- spanforge/processor.py +258 -0
- spanforge/prompt_registry.py +418 -0
- spanforge/py.typed +0 -0
- spanforge/redact.py +914 -0
- spanforge/regression.py +192 -0
- spanforge/runtime_policy.py +159 -0
- spanforge/sampling.py +511 -0
- spanforge/schema.py +183 -0
- spanforge/schemas/v1.0/schema.json +170 -0
- spanforge/schemas/v2.0/schema.json +536 -0
- spanforge/sdk/__init__.py +625 -0
- spanforge/sdk/_base.py +584 -0
- spanforge/sdk/_base.pyi +71 -0
- spanforge/sdk/_exceptions.py +1096 -0
- spanforge/sdk/_types.py +2184 -0
- spanforge/sdk/alert.py +1514 -0
- spanforge/sdk/alert.pyi +56 -0
- spanforge/sdk/audit.py +1196 -0
- spanforge/sdk/audit.pyi +67 -0
- spanforge/sdk/cec.py +1215 -0
- spanforge/sdk/cec.pyi +37 -0
- spanforge/sdk/config.py +641 -0
- spanforge/sdk/config.pyi +55 -0
- spanforge/sdk/enterprise.py +714 -0
- spanforge/sdk/enterprise.pyi +79 -0
- spanforge/sdk/explain.py +170 -0
- spanforge/sdk/fallback.py +432 -0
- spanforge/sdk/feedback.py +351 -0
- spanforge/sdk/gate.py +874 -0
- spanforge/sdk/gate.pyi +51 -0
- spanforge/sdk/identity.py +2114 -0
- spanforge/sdk/identity.pyi +47 -0
- spanforge/sdk/lineage.py +175 -0
- spanforge/sdk/observe.py +1065 -0
- spanforge/sdk/observe.pyi +50 -0
- spanforge/sdk/operator.py +338 -0
- spanforge/sdk/pii.py +1473 -0
- spanforge/sdk/pii.pyi +119 -0
- spanforge/sdk/pipelines.py +458 -0
- spanforge/sdk/pipelines.pyi +39 -0
- spanforge/sdk/policy.py +930 -0
- spanforge/sdk/rag.py +594 -0
- spanforge/sdk/rbac.py +280 -0
- spanforge/sdk/registry.py +430 -0
- spanforge/sdk/registry.pyi +46 -0
- spanforge/sdk/scope.py +279 -0
- spanforge/sdk/secrets.py +293 -0
- spanforge/sdk/secrets.pyi +25 -0
- spanforge/sdk/security.py +560 -0
- spanforge/sdk/security.pyi +57 -0
- spanforge/sdk/trust.py +472 -0
- spanforge/sdk/trust.pyi +41 -0
- spanforge/secrets.py +799 -0
- spanforge/signing.py +1179 -0
- spanforge/stats.py +100 -0
- spanforge/stream.py +560 -0
- spanforge/testing.py +378 -0
- spanforge/testing_mocks.py +1052 -0
- spanforge/trace.py +199 -0
- spanforge/types.py +696 -0
- spanforge/ulid.py +300 -0
- spanforge/validate.py +379 -0
- spanforge-1.0.0.dist-info/METADATA +1509 -0
- spanforge-1.0.0.dist-info/RECORD +174 -0
- spanforge-1.0.0.dist-info/WHEEL +4 -0
- spanforge-1.0.0.dist-info/entry_points.txt +5 -0
- spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
spanforge/sdk/rbac.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
"""spanforge.sdk.rbac - SpanForge sf-rbac client.
|
|
2
|
+
|
|
3
|
+
Phase 1 implementation for GA runtime RBAC enforcement. The client stores
|
|
4
|
+
local role manifests per actor, evaluates requested actions against required
|
|
5
|
+
roles, and emits signed RBAC decision records via sf-audit.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import threading
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from spanforge.namespaces.runtime_governance import RBACDecisionPayload
|
|
15
|
+
from spanforge.sdk._base import SFClientConfig, SFServiceClient
|
|
16
|
+
|
|
17
|
+
# Public names of this module: the two value objects and the client class.
__all__ = ["RBACManifest", "RBACStatusInfo", "SFRBACClient"]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class RBACManifest:
    """Role assignments registered for a single actor.

    Construction fails with :exc:`ValueError` when ``actor_id`` is empty.
    """

    # Identifier of the actor the manifest belongs to; must be truthy.
    actor_id: str
    # Roles held by the actor regardless of resource.
    roles: list[str] = field(default_factory=list)
    # Additional roles keyed by resource name.
    resource_roles: dict[str, list[str]] = field(default_factory=dict)
    # Free-form data attached to the manifest.
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Reject manifests that do not name an actor."""
        if self.actor_id:
            return
        raise ValueError("RBACManifest.actor_id must be non-empty")

    def to_dict(self) -> dict[str, Any]:
        """Return the manifest as a plain dict.

        The role lists, the per-resource lists, and the metadata mapping are
        shallow-copied, so mutating the returned containers leaves the
        manifest untouched.
        """
        snapshot: dict[str, Any] = {"actor_id": self.actor_id}
        snapshot["roles"] = list(self.roles)
        per_resource: dict[str, list[str]] = {}
        for resource_name, assigned in self.resource_roles.items():
            per_resource[resource_name] = list(assigned)
        snapshot["resource_roles"] = per_resource
        snapshot["metadata"] = dict(self.metadata)
        return snapshot
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class RBACStatusInfo:
    """sf-rbac service status.

    Snapshot of the client's health label and its RBAC counters.
    """

    # Health label for the service.
    status: str
    # Number of actors that currently have a registered manifest.
    registered_actors: int
    # Number of authorization checks evaluated so far.
    total_checks: int
    # Number of those checks whose decision was not allowed.
    denied_checks: int
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class SFRBACClient(SFServiceClient):
    """SpanForge runtime RBAC authorization service client.

    Role manifests and the decisions produced by :meth:`authorize` are kept
    in process memory. ``self._lock`` guards the manifest table, the decision
    store, the per-trace index, and the check counters.
    """

    def __init__(self, config: SFClientConfig) -> None:
        """Create a client with no registered actors and zeroed counters.

        Args:
            config: Client configuration forwarded to the base class.
        """
        super().__init__(config, service_name="rbac")
        self._lock = threading.Lock()
        # actor_id -> manifest
        self._manifests: dict[str, RBACManifest] = {}
        # check_id -> decision payload
        self._records: dict[str, RBACDecisionPayload] = {}
        # trace_id -> check_ids in the order they were recorded
        self._by_trace: dict[str, list[str]] = {}
        self._total_checks = 0
        self._denied_checks = 0

    def register_actor(
        self,
        *,
        actor_id: str,
        roles: list[str] | None = None,
        resource_roles: dict[str, list[str]] | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> RBACManifest:
        """Register or replace the effective role manifest for an actor.

        Role lists are de-duplicated, sorted, and stripped of falsy entries.
        Entries of ``resource_roles`` with a falsy resource name are dropped.

        Args:
            actor_id: Actor the manifest belongs to.
            roles: Roles granted on every resource.
            resource_roles: Extra roles keyed by resource name.
            metadata: Free-form data stored with the manifest.

        Returns:
            The manifest that is now stored for ``actor_id``.

        Raises:
            ValueError: If ``actor_id`` is empty (raised by
                :class:`RBACManifest`).
        """
        normalized_resource_roles = {
            resource: self._normalize_roles(entries)
            for resource, entries in (resource_roles or {}).items()
            if resource
        }
        manifest = RBACManifest(
            actor_id=actor_id,
            roles=self._normalize_roles(roles),
            resource_roles=normalized_resource_roles,
            # Copy so that later mutation of the caller's dict cannot alter
            # the stored manifest.
            metadata=dict(metadata) if metadata else {},
        )
        with self._lock:
            self._manifests[actor_id] = manifest
        return manifest

    def get_manifest(self, actor_id: str) -> RBACManifest | None:
        """Return the registered RBAC manifest for *actor_id*, or ``None``."""
        with self._lock:
            return self._manifests.get(actor_id)

    def authorize(
        self,
        *,
        trace_id: str,
        actor_id: str,
        resource: str,
        action_name: str,
        checked_at: str,
        required_roles: list[str] | None = None,
        check_id: str | None = None,
        policy_id: str | None = None,
        policy_action: str | None = None,
    ) -> RBACDecisionPayload:
        """Evaluate a runtime action against the actor's effective roles.

        The decision is stored in memory, counted, and then written to
        sf-audit before it is returned.

        Args:
            trace_id: Trace the check belongs to.
            actor_id: Actor requesting the action.
            resource: Resource the action targets.
            action_name: Name of the requested action.
            checked_at: Timestamp recorded on the decision (passed through
                unchanged).
            required_roles: Roles the actor must all hold. Empty or ``None``
                means any registered actor is allowed.
            check_id: Identifier for the decision; a ULID is generated when
                omitted.
            policy_id: Identifier of an associated policy, if any.
            policy_action: Action requested by that policy; it selects the
                outcome label of a denied check.

        Returns:
            The recorded decision payload.
        """
        # Imported at call time, as in the rest of this class (presumably to
        # avoid import cycles — confirm).
        from spanforge.ulid import generate as _ulid

        normalized_required_roles = self._normalize_roles(required_roles)
        manifest = self.get_manifest(actor_id)
        effective_roles = self._effective_roles_for_resource(manifest, resource)
        allowed, reason = self._evaluate_roles(
            manifest=manifest,
            actor_id=actor_id,
            resource=resource,
            action_name=action_name,
            required_roles=normalized_required_roles,
            effective_roles=effective_roles,
        )

        payload = RBACDecisionPayload(
            check_id=check_id or _ulid(),
            trace_id=trace_id,
            actor_id=actor_id,
            resource=resource,
            action_name=action_name,
            allowed=allowed,
            outcome=self._resolve_outcome(allowed=allowed, policy_action=policy_action),
            reason=reason,
            checked_at=checked_at,
            required_roles=normalized_required_roles,
            effective_roles=effective_roles,
            policy_id=policy_id,
            policy_action=policy_action,
        )

        with self._lock:
            self._records[payload.check_id] = payload
            self._by_trace.setdefault(trace_id, []).append(payload.check_id)
            self._total_checks += 1
            if not payload.allowed:
                self._denied_checks += 1

        # Emitted outside the lock so the audit write cannot block other
        # callers of this client.
        self._emit_signed_record(payload)
        return payload

    def authorize_with_policy(
        self,
        *,
        environment: str,
        trace_id: str,
        actor_id: str,
        resource: str,
        action_name: str,
        checked_at: str,
        required_roles: list[str] | None = None,
        policy_client: Any | None = None,
        control: str = "role_enforcement",
    ) -> RBACDecisionPayload:
        """Authorize an RBAC action and attach the active runtime policy decision.

        The role check is evaluated first and its result is handed to the
        policy engine as ``observed_value`` (``1.0`` allowed, ``0.0``
        denied). The engine's ``policy_id`` and ``action`` are then passed to
        :meth:`authorize`, which re-evaluates the roles and records the
        decision.

        Args:
            environment: Environment name forwarded to the policy engine.
            trace_id: Trace the check belongs to.
            actor_id: Actor requesting the action.
            resource: Resource the action targets.
            action_name: Name of the requested action.
            checked_at: Timestamp used for both the policy evaluation and
                the recorded decision.
            required_roles: Roles the actor must all hold.
            policy_client: Object exposing ``evaluate(...)``; defaults to the
                SDK-level ``sf_policy`` client.
            control: Control name forwarded to the policy engine.

        Returns:
            The recorded decision payload.
        """
        normalized_required_roles = self._normalize_roles(required_roles)
        manifest = self.get_manifest(actor_id)
        effective_roles = self._effective_roles_for_resource(manifest, resource)
        allowed, _reason = self._evaluate_roles(
            manifest=manifest,
            actor_id=actor_id,
            resource=resource,
            action_name=action_name,
            required_roles=normalized_required_roles,
            effective_roles=effective_roles,
        )
        engine = policy_client or self._default_policy_client()
        decision = engine.evaluate(
            environment=environment,
            trace_id=trace_id,
            service="sf_rbac",
            control=control,
            evaluated_at=checked_at,
            observed_value=1.0 if allowed else 0.0,
            metadata={"actor_id": actor_id, "resource": resource, "action_name": action_name},
        )
        return self.authorize(
            trace_id=trace_id,
            actor_id=actor_id,
            resource=resource,
            action_name=action_name,
            checked_at=checked_at,
            required_roles=normalized_required_roles,
            policy_id=decision.policy_id,
            policy_action=decision.action,
        )

    async def authorize_async(self, **kwargs: Any) -> RBACDecisionPayload:
        """Async wrapper around :meth:`authorize`.

        Runs :meth:`authorize` in the running loop's default executor so the
        event loop is not blocked. Accepts the same keyword arguments.
        """
        import asyncio
        import functools

        # get_running_loop() is the supported way to obtain the loop from
        # inside a coroutine; get_event_loop() is deprecated for this use.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, functools.partial(self.authorize, **kwargs))

    def get(self, check_id: str) -> RBACDecisionPayload | None:
        """Return a previously emitted RBAC decision, or ``None`` if unknown."""
        with self._lock:
            return self._records.get(check_id)

    def list_for_trace(self, trace_id: str) -> list[RBACDecisionPayload]:
        """Return all RBAC decisions emitted for a trace, in recording order."""
        with self._lock:
            ids = list(self._by_trace.get(trace_id, []))
            return [self._records[item] for item in ids if item in self._records]

    def get_status(self) -> RBACStatusInfo:
        """Return service health and RBAC counters."""
        with self._lock:
            return RBACStatusInfo(
                status="ok",
                registered_actors=len(self._manifests),
                total_checks=self._total_checks,
                denied_checks=self._denied_checks,
            )

    @staticmethod
    def _normalize_roles(roles: list[str] | None) -> list[str]:
        """Return *roles* de-duplicated, sorted, and without falsy entries.

        ``None`` is treated as an empty list.
        """
        return sorted({role for role in (roles or []) if role})

    @staticmethod
    def _effective_roles_for_resource(
        manifest: RBACManifest | None,
        resource: str,
    ) -> list[str]:
        """Return the sorted union of global and resource-specific roles.

        An actor without a manifest has no effective roles.
        """
        if manifest is None:
            return []
        effective = set(manifest.roles)
        effective.update(manifest.resource_roles.get(resource, []))
        return sorted(effective)

    def _evaluate_roles(
        self,
        *,
        manifest: RBACManifest | None,
        actor_id: str,
        resource: str,
        action_name: str,
        required_roles: list[str],
        effective_roles: list[str],
    ) -> tuple[bool, str]:
        """Decide whether the actor may perform the action.

        Returns:
            ``(allowed, reason)``. An actor without a manifest is denied. An
            empty ``required_roles`` allows any registered actor. Otherwise
            every required role must be among ``effective_roles``.
        """
        if manifest is None:
            return False, f"actor '{actor_id}' has no registered RBAC manifest"
        if not required_roles:
            return True, f"actor '{actor_id}' is authorized for {resource}:{action_name}"

        held = set(effective_roles)
        missing_roles = [role for role in required_roles if role not in held]
        if missing_roles:
            return (
                False,
                f"actor '{actor_id}' is missing required roles {missing_roles} for {resource}:{action_name}",
            )
        return (
            True,
            f"actor '{actor_id}' is authorized with roles {required_roles} for {resource}:{action_name}",
        )

    @staticmethod
    def _resolve_outcome(*, allowed: bool, policy_action: str | None) -> str:
        """Map a decision to its outcome label.

        An allowed check is always ``"allow"``. A denied check takes the
        policy action when it is ``"block"``, ``"human_review"`` or
        ``"redact"``, and ``"escalate"`` otherwise.
        """
        if allowed:
            return "allow"
        if policy_action == "block":
            return "block"
        if policy_action == "human_review":
            return "human_review"
        if policy_action == "redact":
            return "redact"
        return "escalate"

    def _emit_signed_record(self, payload: RBACDecisionPayload) -> None:
        """Write the RBAC decision payload into sf-audit."""
        from spanforge.sdk import sf_audit

        sf_audit.append(payload.to_dict(), "spanforge.rbac.v1")

    @staticmethod
    def _default_policy_client() -> Any:
        """Return the SDK-level ``sf_policy`` client, imported at call time."""
        from spanforge.sdk import sf_policy

        return sf_policy
|
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
"""spanforge.sdk.registry - ServiceRegistry singleton (Phase 9, CFG-010-013).
|
|
2
|
+
|
|
3
|
+
Implements:
|
|
4
|
+
|
|
5
|
+
* CFG-010: Thread-safe :class:`ServiceRegistry` singleton holding references
|
|
6
|
+
to all 8 service clients. ``registry.get("sf_pii") -> SFPIIClient``.
|
|
7
|
+
* CFG-011: :meth:`ServiceRegistry.run_startup_check` — pings all enabled
|
|
8
|
+
services on first use. Status per service: ``up``, ``degraded``
|
|
9
|
+
(latency > 2 s), or ``down`` (unreachable). If any service is ``down``
|
|
10
|
+
and ``local_fallback.enabled=False`` → raises
|
|
11
|
+
:exc:`~spanforge.sdk._exceptions.SFStartupError`.
|
|
12
|
+
* CFG-012: :meth:`ServiceRegistry.status_response` — returns a dict
|
|
13
|
+
matching the ``GET /v1/spanforge/status`` specification (per spec §6).
|
|
14
|
+
Each service entry includes ``{status, latency_ms, last_checked_at}``.
|
|
15
|
+
* CFG-013: :meth:`ServiceRegistry.start_background_checker` — launches a
|
|
16
|
+
daemon thread that re-checks all services every 60 s. Status changes
|
|
17
|
+
are logged at ``WARNING``; recovery (down → up) logged at ``INFO``.
|
|
18
|
+
|
|
19
|
+
Security requirements
|
|
20
|
+
---------------------
|
|
21
|
+
* Credentials are never included in health-check payloads or log messages.
|
|
22
|
+
* The background thread is a daemon thread and does not prevent process exit.
|
|
23
|
+
* Thread safety is guaranteed via :class:`threading.Lock` guards on all
|
|
24
|
+
shared state.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import logging
|
|
30
|
+
import threading
|
|
31
|
+
import time
|
|
32
|
+
import urllib.error
|
|
33
|
+
import urllib.request
|
|
34
|
+
from dataclasses import dataclass
|
|
35
|
+
from datetime import datetime, timezone
|
|
36
|
+
from enum import Enum
|
|
37
|
+
from typing import Any
|
|
38
|
+
|
|
39
|
+
from spanforge.sdk._exceptions import SFStartupError
|
|
40
|
+
|
|
41
|
+
# Public names of this module.
__all__ = [
    "ServiceHealth",
    "ServiceRegistry",
    "ServiceStatus",
]

# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# The 8 canonical service names (ordered for consistent logging).
# Both the startup check and the background checker iterate in this order.
_SERVICE_NAMES: tuple[str, ...] = (
    "sf_pii",
    "sf_secrets",
    "sf_audit",
    "sf_observe",
    "sf_gate",
    "sf_cec",
    "sf_identity",
    "sf_alert",
)

# Latency threshold above which a service is reported as "degraded" (CFG-011)
_DEGRADED_LATENCY_MS: float = 2_000.0

# Default health-check path appended to the service endpoint
_HEALTH_PATH: str = "/health"

# HTTP 200 OK status code; the only response status a health check accepts.
_HTTP_OK: int = 200
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
# Value objects
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class ServiceStatus(str, Enum):
    """Status of a single service endpoint.

    Members are also ``str`` instances; their ``value`` is the text used in
    log messages and in the status payload.
    """

    # Health check answered 200 within the degraded-latency threshold, or no
    # endpoint is configured (local mode).
    UP = "up"
    # Health check answered 200 but took longer than the threshold.
    DEGRADED = "degraded"
    # Health check failed or answered with another status; also the default
    # before any check has been recorded.
    DOWN = "down"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass
class ServiceHealth:
    """Point-in-time health snapshot for one service.

    Attributes:
        status: Current service status. Defaults to ``down`` until a check
            has been recorded.
        latency_ms: Round-trip latency of the last health check in
            milliseconds. ``-1`` is the default for a service that has not
            been checked yet; a completed check records the elapsed time
            even when it finds the service ``down``.
        last_checked_at: UTC timestamp of the last health check, or
            ``None`` if no check has been performed yet.
    """

    status: ServiceStatus = ServiceStatus.DOWN
    latency_ms: float = -1.0
    last_checked_at: datetime | None = None
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ---------------------------------------------------------------------------
|
|
103
|
+
# ServiceRegistry
|
|
104
|
+
# ---------------------------------------------------------------------------
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class ServiceRegistry:
    """Thread-safe singleton registry of all 8 SpanForge service clients.

    Usage::

        registry = ServiceRegistry.get_instance()
        registry.register("sf_pii", sf_pii_client)
        client = registry.get("sf_pii")

        # Run connectivity checks (CFG-011)
        registry.run_startup_check()

        # GET /v1/spanforge/status payload (CFG-012)
        status = registry.status_response()

    Only one :class:`ServiceRegistry` instance exists per process; subsequent
    calls to :meth:`get_instance` return the same object.
    """

    _instance: ServiceRegistry | None = None
    _instance_lock: threading.Lock = threading.Lock()

    def __init__(self) -> None:
        """Create an empty registry in which every known service starts ``down``."""
        self._clients: dict[str, Any] = {}
        self._health: dict[str, ServiceHealth] = {name: ServiceHealth() for name in _SERVICE_NAMES}
        # Guards ``_health``.
        self._health_lock = threading.RLock()
        self._bg_thread: threading.Thread | None = None
        # Set to ask the background checker loop to exit.
        self._stop_event = threading.Event()

    # ------------------------------------------------------------------
    # Singleton
    # ------------------------------------------------------------------

    @classmethod
    def get_instance(cls) -> ServiceRegistry:
        """Return the process-wide singleton.

        Creates it on first call.  Thread-safe via double-checked locking.
        """
        if cls._instance is None:
            with cls._instance_lock:
                if cls._instance is None:
                    cls._instance = cls()
        return cls._instance

    @classmethod
    def _reset_for_testing(cls) -> None:
        """Reset the singleton — use only in tests.

        Signals the old instance's background checker to stop before the
        instance is discarded.
        """
        with cls._instance_lock:
            if cls._instance is not None:
                cls._instance._stop_event.set()
            cls._instance = None

    # ------------------------------------------------------------------
    # Client management (CFG-010)
    # ------------------------------------------------------------------

    def register(self, name: str, client: Any) -> None:
        """Register a service client under ``name``.

        An existing registration under the same name is replaced.

        Args:
            name: Service name, e.g. ``"sf_pii"``.
            client: The instantiated service client.
        """
        self._clients[name] = client

    def get(self, name: str) -> Any:
        """Return the registered client for ``name``, or ``None``.

        Args:
            name: Service name, e.g. ``"sf_pii"``.

        Returns:
            The client object, or ``None`` if not registered.
        """
        return self._clients.get(name)

    def register_all(self, clients: dict[str, Any]) -> None:
        """Bulk-register multiple clients.

        Args:
            clients: Mapping of ``{service_name: client}``.
        """
        self._clients.update(clients)

    # ------------------------------------------------------------------
    # Health management
    # ------------------------------------------------------------------

    def _check_service(self, name: str, endpoint: str, timeout_ms: int) -> ServiceHealth:
        """Ping one service health endpoint and return a :class:`ServiceHealth`.

        The check never raises: any failure to obtain a ``200`` response is
        reported as ``down`` and noted at ``DEBUG`` level. Only the service
        name and the exception type or HTTP status are logged, never the
        URL, so credentials embedded in an endpoint cannot leak into logs.

        Args:
            name: Service name (for logging).
            endpoint: Base URL of the service.
            timeout_ms: Request timeout in milliseconds.

        Returns:
            :class:`ServiceHealth` with status/latency/timestamp populated.
        """
        if not endpoint:
            # No endpoint configured → local mode → treat as UP
            return ServiceHealth(
                status=ServiceStatus.UP,
                latency_ms=0.0,
                last_checked_at=datetime.now(timezone.utc),
            )

        url = endpoint.rstrip("/") + _HEALTH_PATH
        # Never pass a timeout below 100 ms to urlopen.
        timeout_s = max(timeout_ms / 1000.0, 0.1)
        start = time.monotonic()
        try:
            with urllib.request.urlopen(url, timeout=timeout_s) as resp:  # noqa: S310  # nosec B310
                elapsed_ms = (time.monotonic() - start) * 1000
                http_status = resp.status
        except Exception as exc:
            # Deliberately broad: a health probe must not crash its caller
            # (startup path or background thread).
            elapsed_ms = (time.monotonic() - start) * 1000
            _log.debug("Health check for %s failed: %s", name, type(exc).__name__)
            status = ServiceStatus.DOWN
        else:
            if http_status != _HTTP_OK:
                _log.debug("Health check for %s returned HTTP %s", name, http_status)
                status = ServiceStatus.DOWN
            elif elapsed_ms > _DEGRADED_LATENCY_MS:
                status = ServiceStatus.DEGRADED
            else:
                status = ServiceStatus.UP

        return ServiceHealth(
            status=status,
            latency_ms=elapsed_ms,
            last_checked_at=datetime.now(timezone.utc),
        )

    # ------------------------------------------------------------------
    # CFG-011: Startup connectivity check
    # ------------------------------------------------------------------

    def run_startup_check(
        self,
        endpoint: str = "",
        *,
        enabled_services: set[str] | None = None,
        local_fallback_enabled: bool = True,
        timeout_ms: int = 2000,
    ) -> dict[str, ServiceHealth]:
        """Ping all enabled services and update the health registry.

        Logs a summary table at ``INFO`` level.  If any service is ``down``
        and ``local_fallback_enabled`` is ``False``, raises
        :exc:`~spanforge.sdk._exceptions.SFStartupError`.

        Args:
            endpoint: Base service URL (same endpoint for all services when
                using a single SpanForge gateway).
            enabled_services: Set of enabled service names.  ``None`` means
                all 8 services.  Names that are not canonical service names
                are ignored.
            local_fallback_enabled: If ``False``, a ``down`` service raises
                once all enabled services have been checked.
            timeout_ms: Per-service health-check timeout in milliseconds.

        Returns:
            A ``{service_name: ServiceHealth}`` dict.

        Raises:
            :exc:`~spanforge.sdk._exceptions.SFStartupError`: When any
                enabled service is ``down`` and fallback is disabled.
        """
        active = set(enabled_services) if enabled_services is not None else set(_SERVICE_NAMES)
        results: dict[str, ServiceHealth] = {}

        # Walk the canonical tuple, not ``active``, so results and log lines
        # keep a stable order.
        for name in _SERVICE_NAMES:
            if name not in active:
                continue
            health = self._check_service(name, endpoint, timeout_ms)
            results[name] = health
            with self._health_lock:
                self._health[name] = health

        # Log summary table
        _log.info("SpanForge service health check:")
        for name, h in results.items():
            _log.info("  %-14s %-8s %.0f ms", name, h.status.value, max(h.latency_ms, 0))

        # Enterprise gate — raise on unreachable service if fallback disabled
        if not local_fallback_enabled:
            down = [n for n, h in results.items() if h.status == ServiceStatus.DOWN]
            if down:
                raise SFStartupError(down)

        return results

    # ------------------------------------------------------------------
    # CFG-012: /v1/spanforge/status payload
    # ------------------------------------------------------------------

    def status_response(self) -> dict[str, dict[str, Any]]:
        """Return a JSON-serialisable dict for ``GET /v1/spanforge/status``.

        Each service entry contains::

            {
                "status": "up" | "degraded" | "down",
                "latency_ms": <float>,
                "last_checked_at": "<ISO-8601 UTC>" | null,
            }

        Returns:
            A dict keyed by service name.
        """
        # Copy under the lock, format outside it.
        with self._health_lock:
            snapshot = dict(self._health)

        return {
            name: {
                "status": h.status.value,
                "latency_ms": h.latency_ms,
                "last_checked_at": (h.last_checked_at.isoformat() if h.last_checked_at else None),
            }
            for name, h in snapshot.items()
        }

    def get_health(self, name: str) -> ServiceHealth:
        """Return the latest :class:`ServiceHealth` for one service.

        Args:
            name: Service name, e.g. ``"sf_pii"``.

        Returns:
            The most recently recorded :class:`ServiceHealth`, or a default
            (``down``, never checked) snapshot for an unknown name.
        """
        with self._health_lock:
            return self._health.get(name, ServiceHealth())

    def update_health(self, name: str, health: ServiceHealth) -> None:
        """Directly set the health for ``name`` (used by tests and fallbacks).

        Args:
            name: Service name.
            health: New :class:`ServiceHealth` value.
        """
        with self._health_lock:
            self._health[name] = health

    # ------------------------------------------------------------------
    # CFG-013: Background health re-check
    # ------------------------------------------------------------------

    def start_background_checker(
        self,
        endpoint: str = "",
        interval: float = 60.0,
        timeout_ms: int = 2000,
    ) -> None:
        """Start a daemon thread that re-checks all services every ``interval`` seconds.

        Status changes are logged at ``WARNING``.  Recovery (``down`` → ``up``)
        is logged at ``INFO``.  The thread stops automatically when the process
        exits (daemon=True) or when :meth:`stop_background_checker` is called.
        Calling this while a checker thread is alive is a no-op.

        Args:
            endpoint: Service endpoint URL passed to each health check.
            interval: Seconds between checks (default: ``60``).
            timeout_ms: Per-service HTTP timeout in milliseconds.
        """
        if self._bg_thread is not None and self._bg_thread.is_alive():
            return  # already running

        self._stop_event.clear()

        def _loop() -> None:
            # wait() returns True once the stop event is set, which ends the
            # loop; a timeout returns False and triggers the next check.
            while not self._stop_event.wait(timeout=interval):
                self._run_background_check(endpoint, timeout_ms)

        self._bg_thread = threading.Thread(target=_loop, daemon=True, name="sf-health-checker")
        self._bg_thread.start()
        _log.debug("SpanForge background health checker started (interval=%.0fs)", interval)

    def stop_background_checker(self) -> None:
        """Signal the background health-check thread to stop."""
        self._stop_event.set()

    def _run_background_check(self, endpoint: str, timeout_ms: int) -> None:
        """Run one iteration of the background health check (CFG-013).

        Every canonical service is re-checked and its stored health
        replaced. A change away from ``down`` is logged at ``INFO``; every
        other status change is logged at ``WARNING``.
        """
        for name in _SERVICE_NAMES:
            prev_status = self.get_health(name).status
            new_health = self._check_service(name, endpoint, timeout_ms)
            new_status = new_health.status

            with self._health_lock:
                self._health[name] = new_health

            if prev_status == new_status:
                continue

            recovered = prev_status == ServiceStatus.DOWN and new_status in (
                ServiceStatus.UP,
                ServiceStatus.DEGRADED,
            )
            if recovered:
                _log.info(
                    "sf-%s recovered: %s → %s",
                    name,
                    prev_status.value,
                    new_status.value,
                )
            else:
                _log.warning(
                    "sf-%s status changed: %s → %s",
                    name,
                    prev_status.value,
                    new_status.value,
                )
|