spanforge 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. spanforge/__init__.py +815 -0
  2. spanforge/_ansi.py +93 -0
  3. spanforge/_batch_exporter.py +409 -0
  4. spanforge/_cli.py +2094 -0
  5. spanforge/_cli_audit.py +639 -0
  6. spanforge/_cli_compliance.py +711 -0
  7. spanforge/_cli_cost.py +243 -0
  8. spanforge/_cli_ops.py +791 -0
  9. spanforge/_cli_phase11.py +356 -0
  10. spanforge/_hooks.py +337 -0
  11. spanforge/_server.py +1708 -0
  12. spanforge/_span.py +1036 -0
  13. spanforge/_store.py +288 -0
  14. spanforge/_stream.py +664 -0
  15. spanforge/_trace.py +335 -0
  16. spanforge/_tracer.py +254 -0
  17. spanforge/actor.py +141 -0
  18. spanforge/alerts.py +469 -0
  19. spanforge/auto.py +464 -0
  20. spanforge/baseline.py +335 -0
  21. spanforge/cache.py +635 -0
  22. spanforge/compliance.py +325 -0
  23. spanforge/config.py +532 -0
  24. spanforge/consent.py +228 -0
  25. spanforge/consumer.py +377 -0
  26. spanforge/core/__init__.py +5 -0
  27. spanforge/core/compliance_mapping.py +1254 -0
  28. spanforge/cost.py +600 -0
  29. spanforge/debug.py +548 -0
  30. spanforge/deprecations.py +205 -0
  31. spanforge/drift.py +482 -0
  32. spanforge/egress.py +58 -0
  33. spanforge/eval.py +648 -0
  34. spanforge/event.py +1064 -0
  35. spanforge/exceptions.py +240 -0
  36. spanforge/explain.py +178 -0
  37. spanforge/export/__init__.py +69 -0
  38. spanforge/export/append_only.py +337 -0
  39. spanforge/export/cloud.py +357 -0
  40. spanforge/export/datadog.py +497 -0
  41. spanforge/export/grafana.py +320 -0
  42. spanforge/export/jsonl.py +195 -0
  43. spanforge/export/openinference.py +158 -0
  44. spanforge/export/otel_bridge.py +294 -0
  45. spanforge/export/otlp.py +811 -0
  46. spanforge/export/otlp_bridge.py +233 -0
  47. spanforge/export/redis_backend.py +282 -0
  48. spanforge/export/siem_schema.py +98 -0
  49. spanforge/export/siem_splunk.py +264 -0
  50. spanforge/export/siem_syslog.py +212 -0
  51. spanforge/export/webhook.py +299 -0
  52. spanforge/exporters/__init__.py +30 -0
  53. spanforge/exporters/console.py +271 -0
  54. spanforge/exporters/jsonl.py +144 -0
  55. spanforge/exporters/sqlite.py +142 -0
  56. spanforge/gate.py +1150 -0
  57. spanforge/governance.py +181 -0
  58. spanforge/hitl.py +295 -0
  59. spanforge/http.py +187 -0
  60. spanforge/inspect.py +427 -0
  61. spanforge/integrations/__init__.py +45 -0
  62. spanforge/integrations/_pricing.py +280 -0
  63. spanforge/integrations/anthropic.py +388 -0
  64. spanforge/integrations/azure_openai.py +133 -0
  65. spanforge/integrations/bedrock.py +292 -0
  66. spanforge/integrations/crewai.py +251 -0
  67. spanforge/integrations/gemini.py +351 -0
  68. spanforge/integrations/groq.py +442 -0
  69. spanforge/integrations/langchain.py +349 -0
  70. spanforge/integrations/langgraph.py +306 -0
  71. spanforge/integrations/llamaindex.py +373 -0
  72. spanforge/integrations/ollama.py +287 -0
  73. spanforge/integrations/openai.py +368 -0
  74. spanforge/integrations/together.py +483 -0
  75. spanforge/io.py +214 -0
  76. spanforge/lint.py +322 -0
  77. spanforge/metrics.py +417 -0
  78. spanforge/metrics_export.py +343 -0
  79. spanforge/migrate.py +402 -0
  80. spanforge/model_registry.py +278 -0
  81. spanforge/models.py +389 -0
  82. spanforge/namespaces/__init__.py +254 -0
  83. spanforge/namespaces/audit.py +256 -0
  84. spanforge/namespaces/cache.py +237 -0
  85. spanforge/namespaces/chain.py +77 -0
  86. spanforge/namespaces/confidence.py +72 -0
  87. spanforge/namespaces/consent.py +92 -0
  88. spanforge/namespaces/cost.py +179 -0
  89. spanforge/namespaces/decision.py +143 -0
  90. spanforge/namespaces/diff.py +157 -0
  91. spanforge/namespaces/drift.py +80 -0
  92. spanforge/namespaces/eval_.py +251 -0
  93. spanforge/namespaces/feedback.py +241 -0
  94. spanforge/namespaces/fence.py +193 -0
  95. spanforge/namespaces/guard.py +105 -0
  96. spanforge/namespaces/hitl.py +91 -0
  97. spanforge/namespaces/latency.py +72 -0
  98. spanforge/namespaces/prompt.py +190 -0
  99. spanforge/namespaces/redact.py +173 -0
  100. spanforge/namespaces/retrieval.py +379 -0
  101. spanforge/namespaces/runtime_governance.py +494 -0
  102. spanforge/namespaces/template.py +208 -0
  103. spanforge/namespaces/tool_call.py +77 -0
  104. spanforge/namespaces/trace.py +1029 -0
  105. spanforge/normalizer.py +171 -0
  106. spanforge/plugins.py +82 -0
  107. spanforge/presidio_backend.py +349 -0
  108. spanforge/processor.py +258 -0
  109. spanforge/prompt_registry.py +418 -0
  110. spanforge/py.typed +0 -0
  111. spanforge/redact.py +914 -0
  112. spanforge/regression.py +192 -0
  113. spanforge/runtime_policy.py +159 -0
  114. spanforge/sampling.py +511 -0
  115. spanforge/schema.py +183 -0
  116. spanforge/schemas/v1.0/schema.json +170 -0
  117. spanforge/schemas/v2.0/schema.json +536 -0
  118. spanforge/sdk/__init__.py +625 -0
  119. spanforge/sdk/_base.py +584 -0
  120. spanforge/sdk/_base.pyi +71 -0
  121. spanforge/sdk/_exceptions.py +1096 -0
  122. spanforge/sdk/_types.py +2184 -0
  123. spanforge/sdk/alert.py +1514 -0
  124. spanforge/sdk/alert.pyi +56 -0
  125. spanforge/sdk/audit.py +1196 -0
  126. spanforge/sdk/audit.pyi +67 -0
  127. spanforge/sdk/cec.py +1215 -0
  128. spanforge/sdk/cec.pyi +37 -0
  129. spanforge/sdk/config.py +641 -0
  130. spanforge/sdk/config.pyi +55 -0
  131. spanforge/sdk/enterprise.py +714 -0
  132. spanforge/sdk/enterprise.pyi +79 -0
  133. spanforge/sdk/explain.py +170 -0
  134. spanforge/sdk/fallback.py +432 -0
  135. spanforge/sdk/feedback.py +351 -0
  136. spanforge/sdk/gate.py +874 -0
  137. spanforge/sdk/gate.pyi +51 -0
  138. spanforge/sdk/identity.py +2114 -0
  139. spanforge/sdk/identity.pyi +47 -0
  140. spanforge/sdk/lineage.py +175 -0
  141. spanforge/sdk/observe.py +1065 -0
  142. spanforge/sdk/observe.pyi +50 -0
  143. spanforge/sdk/operator.py +338 -0
  144. spanforge/sdk/pii.py +1473 -0
  145. spanforge/sdk/pii.pyi +119 -0
  146. spanforge/sdk/pipelines.py +458 -0
  147. spanforge/sdk/pipelines.pyi +39 -0
  148. spanforge/sdk/policy.py +930 -0
  149. spanforge/sdk/rag.py +594 -0
  150. spanforge/sdk/rbac.py +280 -0
  151. spanforge/sdk/registry.py +430 -0
  152. spanforge/sdk/registry.pyi +46 -0
  153. spanforge/sdk/scope.py +279 -0
  154. spanforge/sdk/secrets.py +293 -0
  155. spanforge/sdk/secrets.pyi +25 -0
  156. spanforge/sdk/security.py +560 -0
  157. spanforge/sdk/security.pyi +57 -0
  158. spanforge/sdk/trust.py +472 -0
  159. spanforge/sdk/trust.pyi +41 -0
  160. spanforge/secrets.py +799 -0
  161. spanforge/signing.py +1179 -0
  162. spanforge/stats.py +100 -0
  163. spanforge/stream.py +560 -0
  164. spanforge/testing.py +378 -0
  165. spanforge/testing_mocks.py +1052 -0
  166. spanforge/trace.py +199 -0
  167. spanforge/types.py +696 -0
  168. spanforge/ulid.py +300 -0
  169. spanforge/validate.py +379 -0
  170. spanforge-1.0.0.dist-info/METADATA +1509 -0
  171. spanforge-1.0.0.dist-info/RECORD +174 -0
  172. spanforge-1.0.0.dist-info/WHEEL +4 -0
  173. spanforge-1.0.0.dist-info/entry_points.txt +5 -0
  174. spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
spanforge/sdk/rbac.py ADDED
@@ -0,0 +1,280 @@
1
+ """spanforge.sdk.rbac - SpanForge sf-rbac client.
2
+
3
+ Phase 1 implementation for GA runtime RBAC enforcement. The client stores
4
+ local role manifests per actor, evaluates requested actions against required
5
+ roles, and emits signed RBAC decision records via sf-audit.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import threading
11
+ from dataclasses import dataclass, field
12
+ from typing import Any
13
+
14
+ from spanforge.namespaces.runtime_governance import RBACDecisionPayload
15
+ from spanforge.sdk._base import SFClientConfig, SFServiceClient
16
+
17
# Public API of this module: the two value objects and the service client.
__all__ = ["RBACManifest", "RBACStatusInfo", "SFRBACClient"]
18
+
19
+
20
@dataclass
class RBACManifest:
    """Role manifest registered for a single actor.

    Attributes:
        actor_id: Identifier of the actor; must be non-empty.
        roles: Role names held by the actor.
        resource_roles: Role names keyed by resource name.
        metadata: Free-form data attached to the manifest.
    """

    actor_id: str
    roles: list[str] = field(default_factory=list)
    resource_roles: dict[str, list[str]] = field(default_factory=dict)
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Reject manifests that carry a falsy ``actor_id``."""
        if self.actor_id:
            return
        raise ValueError("RBACManifest.actor_id must be non-empty")

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict snapshot of the manifest.

        The role lists, the per-resource lists and the metadata mapping are
        copied one level deep, so mutating the result does not touch the
        manifest's own containers.
        """
        roles_copy = list(self.roles)
        per_resource = {
            resource: list(granted) for resource, granted in self.resource_roles.items()
        }
        metadata_copy = dict(self.metadata)
        return {
            "actor_id": self.actor_id,
            "roles": roles_copy,
            "resource_roles": per_resource,
            "metadata": metadata_copy,
        }
40
+
41
+
42
@dataclass
class RBACStatusInfo:
    """sf-rbac service status.

    Snapshot of the client's health label and counters, as produced by
    ``SFRBACClient.get_status``.
    """

    # Health label; ``get_status`` always reports "ok".
    status: str
    # Number of actors that currently have a registered manifest.
    registered_actors: int
    # Number of authorization checks recorded so far.
    total_checks: int
    # Number of recorded checks whose decision was not allowed.
    denied_checks: int
51
+
52
class SFRBACClient(SFServiceClient):
    """SpanForge runtime RBAC authorization service client.

    Role manifests and every emitted decision are kept in memory, guarded by
    a single lock. Each decision is also forwarded to sf-audit.
    """

    def __init__(self, config: SFClientConfig) -> None:
        """Create a client with empty in-memory state for the ``rbac`` service."""
        super().__init__(config, service_name="rbac")
        # One lock guards manifests, records, the trace index and the counters.
        self._lock = threading.Lock()
        self._manifests: dict[str, RBACManifest] = {}
        self._records: dict[str, RBACDecisionPayload] = {}
        # trace_id -> check_ids in the order they were recorded.
        self._by_trace: dict[str, list[str]] = {}
        self._total_checks = 0
        self._denied_checks = 0

    def register_actor(
        self,
        *,
        actor_id: str,
        roles: list[str] | None = None,
        resource_roles: dict[str, list[str]] | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> RBACManifest:
        """Register or replace the effective role manifest for an actor.

        Falsy role names and falsy resource keys are dropped; the remaining
        roles are de-duplicated and sorted.

        Args:
            actor_id: Actor the manifest belongs to.
            roles: Roles held by the actor.
            resource_roles: Extra roles keyed by resource name.
            metadata: Free-form data stored with the manifest.

        Returns:
            The manifest that is now registered for ``actor_id``.

        Raises:
            ValueError: If ``actor_id`` is empty (raised by ``RBACManifest``).
        """
        normalized_resource_roles = {
            resource: sorted({role for role in entries if role})
            for resource, entries in (resource_roles or {}).items()
            if resource
        }
        manifest = RBACManifest(
            actor_id=actor_id,
            roles=sorted({role for role in (roles or []) if role}),
            resource_roles=normalized_resource_roles,
            # Copy so later mutation of the caller's dict cannot alter the
            # registered manifest (roles above are already fresh lists).
            metadata=dict(metadata or {}),
        )
        with self._lock:
            self._manifests[actor_id] = manifest
        return manifest

    def get_manifest(self, actor_id: str) -> RBACManifest | None:
        """Return the registered RBAC manifest for *actor_id*, or ``None``."""
        with self._lock:
            return self._manifests.get(actor_id)

    def authorize(
        self,
        *,
        trace_id: str,
        actor_id: str,
        resource: str,
        action_name: str,
        checked_at: str,
        required_roles: list[str] | None = None,
        check_id: str | None = None,
        policy_id: str | None = None,
        policy_action: str | None = None,
    ) -> RBACDecisionPayload:
        """Evaluate a runtime action against the actor's effective roles.

        The decision is stored locally, counted, and then written to
        sf-audit.

        Args:
            trace_id: Trace the check belongs to.
            actor_id: Actor requesting the action.
            resource: Resource the action targets.
            action_name: Name of the requested action.
            checked_at: Timestamp recorded on the decision (passed through
                unchanged).
            required_roles: Roles the actor must all hold; empty or ``None``
                means any registered actor is allowed.
            check_id: Identifier for the decision; a ULID is generated when
                omitted.
            policy_id: Identifier of the policy attached to the decision.
            policy_action: Policy action used to pick the outcome of a denial.

        Returns:
            The recorded decision payload.
        """
        from spanforge.ulid import generate as _ulid

        normalized_required_roles = sorted({role for role in (required_roles or []) if role})
        manifest = self.get_manifest(actor_id)
        effective_roles = self._effective_roles_for_resource(manifest, resource)
        allowed, reason = self._evaluate_roles(
            manifest=manifest,
            actor_id=actor_id,
            resource=resource,
            action_name=action_name,
            required_roles=normalized_required_roles,
            effective_roles=effective_roles,
        )

        payload = RBACDecisionPayload(
            check_id=check_id or _ulid(),
            trace_id=trace_id,
            actor_id=actor_id,
            resource=resource,
            action_name=action_name,
            allowed=allowed,
            outcome=self._resolve_outcome(allowed=allowed, policy_action=policy_action),
            reason=reason,
            checked_at=checked_at,
            required_roles=normalized_required_roles,
            effective_roles=effective_roles,
            policy_id=policy_id,
            policy_action=policy_action,
        )

        with self._lock:
            self._records[payload.check_id] = payload
            self._by_trace.setdefault(trace_id, []).append(payload.check_id)
            self._total_checks += 1
            if not payload.allowed:
                self._denied_checks += 1

        # Emitted outside the lock so the audit write cannot block other checks.
        self._emit_signed_record(payload)
        return payload

    def authorize_with_policy(
        self,
        *,
        environment: str,
        trace_id: str,
        actor_id: str,
        resource: str,
        action_name: str,
        checked_at: str,
        required_roles: list[str] | None = None,
        policy_client: Any | None = None,
        control: str = "role_enforcement",
    ) -> RBACDecisionPayload:
        """Authorize an RBAC action and attach the active runtime policy decision.

        The role check is evaluated once to feed the policy engine
        (``observed_value`` is ``1.0`` when allowed, ``0.0`` otherwise) and
        then again inside :meth:`authorize`, which records the decision.

        Args:
            environment: Environment name passed to the policy engine.
            trace_id: Trace the check belongs to.
            actor_id: Actor requesting the action.
            resource: Resource the action targets.
            action_name: Name of the requested action.
            checked_at: Timestamp recorded on the decision and passed to the
                policy engine as ``evaluated_at``.
            required_roles: Roles the actor must all hold.
            policy_client: Object exposing ``evaluate(...)``; defaults to
                ``spanforge.sdk.sf_policy``.
            control: Policy control name to evaluate.

        Returns:
            The recorded decision payload carrying the policy id and action.
        """
        normalized_required_roles = sorted({role for role in (required_roles or []) if role})
        manifest = self.get_manifest(actor_id)
        effective_roles = self._effective_roles_for_resource(manifest, resource)
        allowed, _reason = self._evaluate_roles(
            manifest=manifest,
            actor_id=actor_id,
            resource=resource,
            action_name=action_name,
            required_roles=normalized_required_roles,
            effective_roles=effective_roles,
        )
        engine = policy_client or self._default_policy_client()
        decision = engine.evaluate(
            environment=environment,
            trace_id=trace_id,
            service="sf_rbac",
            control=control,
            evaluated_at=checked_at,
            observed_value=1.0 if allowed else 0.0,
            metadata={"actor_id": actor_id, "resource": resource, "action_name": action_name},
        )
        return self.authorize(
            trace_id=trace_id,
            actor_id=actor_id,
            resource=resource,
            action_name=action_name,
            checked_at=checked_at,
            required_roles=normalized_required_roles,
            policy_id=decision.policy_id,
            policy_action=decision.action,
        )

    async def authorize_async(self, **kwargs: Any) -> RBACDecisionPayload:
        """Async wrapper around :meth:`authorize`.

        Runs the synchronous call in the running loop's default executor so
        the event loop is not blocked. Accepts the same keyword arguments as
        :meth:`authorize`.
        """
        import asyncio

        # get_running_loop() is the preferred call inside a coroutine; it
        # always returns the loop that is executing this coroutine.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, lambda: self.authorize(**kwargs))

    def get(self, check_id: str) -> RBACDecisionPayload | None:
        """Return a previously emitted RBAC decision, or ``None`` if unknown."""
        with self._lock:
            return self._records.get(check_id)

    def list_for_trace(self, trace_id: str) -> list[RBACDecisionPayload]:
        """Return all RBAC decisions emitted for a trace, in recording order."""
        with self._lock:
            ids = list(self._by_trace.get(trace_id, []))
            return [self._records[item] for item in ids if item in self._records]

    def get_status(self) -> RBACStatusInfo:
        """Return service health and RBAC counters."""
        with self._lock:
            return RBACStatusInfo(
                status="ok",
                registered_actors=len(self._manifests),
                total_checks=self._total_checks,
                denied_checks=self._denied_checks,
            )

    @staticmethod
    def _effective_roles_for_resource(
        manifest: RBACManifest | None,
        resource: str,
    ) -> list[str]:
        """Return the sorted union of the actor's roles and its roles for *resource*.

        An unregistered actor (``manifest is None``) has no roles.
        """
        if manifest is None:
            return []
        effective = set(manifest.roles)
        effective.update(manifest.resource_roles.get(resource, []))
        return sorted(effective)

    def _evaluate_roles(
        self,
        *,
        manifest: RBACManifest | None,
        actor_id: str,
        resource: str,
        action_name: str,
        required_roles: list[str],
        effective_roles: list[str],
    ) -> tuple[bool, str]:
        """Decide whether the actor may perform the action.

        Returns:
            ``(allowed, reason)``. An actor without a manifest is denied; an
            empty ``required_roles`` allows any registered actor; otherwise
            every required role must be among ``effective_roles``.
        """
        if manifest is None:
            return False, f"actor '{actor_id}' has no registered RBAC manifest"
        if not required_roles:
            return True, f"actor '{actor_id}' is authorized for {resource}:{action_name}"

        missing_roles = [role for role in required_roles if role not in effective_roles]
        if missing_roles:
            return (
                False,
                f"actor '{actor_id}' is missing required roles {missing_roles} for {resource}:{action_name}",
            )
        return (
            True,
            f"actor '{actor_id}' is authorized with roles {required_roles} for {resource}:{action_name}",
        )

    @staticmethod
    def _resolve_outcome(*, allowed: bool, policy_action: str | None) -> str:
        """Map a decision to its outcome label.

        Allowed checks are always ``"allow"``. A denial takes the policy
        action when it is ``"block"``, ``"human_review"`` or ``"redact"``,
        and falls back to ``"escalate"`` otherwise.
        """
        if allowed:
            return "allow"
        if policy_action == "block":
            return "block"
        if policy_action == "human_review":
            return "human_review"
        if policy_action == "redact":
            return "redact"
        return "escalate"

    def _emit_signed_record(self, payload: RBACDecisionPayload) -> None:
        """Write the RBAC decision payload into sf-audit."""
        # Imported lazily, as in the rest of this class's sdk lookups.
        from spanforge.sdk import sf_audit

        sf_audit.append(payload.to_dict(), "spanforge.rbac.v1")

    @staticmethod
    def _default_policy_client() -> Any:
        """Return the package-level ``sf_policy`` client."""
        from spanforge.sdk import sf_policy

        return sf_policy
spanforge/sdk/registry.py ADDED
@@ -0,0 +1,430 @@
1
+ """spanforge.sdk.registry - ServiceRegistry singleton (Phase 9, CFG-010-013).
2
+
3
+ Implements:
4
+
5
+ * CFG-010: Thread-safe :class:`ServiceRegistry` singleton holding references
6
+ to all 8 service clients. ``registry.get("sf_pii") -> SFPIIClient``.
7
+ * CFG-011: :meth:`ServiceRegistry.run_startup_check` — pings all enabled
8
+ services on first use. Status per service: ``up``, ``degraded``
9
+ (latency > 2 s), or ``down`` (unreachable). If any service is ``down``
10
+ and ``local_fallback.enabled=False`` → raises
11
+ :exc:`~spanforge.sdk._exceptions.SFStartupError`.
12
+ * CFG-012: :meth:`ServiceRegistry.status_response` — returns a dict
13
+ matching the ``GET /v1/spanforge/status`` specification (per spec §6).
14
+ Each service entry includes ``{status, latency_ms, last_checked_at}``.
15
+ * CFG-013: :meth:`ServiceRegistry.start_background_checker` — launches a
16
+ daemon thread that re-checks all services every 60 s. Status changes
17
+ are logged at ``WARNING``; recovery (down → up) logged at ``INFO``.
18
+
19
+ Security requirements
20
+ ---------------------
21
+ * Credentials are never included in health-check payloads or log messages.
22
+ * The background thread is a daemon thread and does not prevent process exit.
23
+ * Thread safety is guaranteed via :class:`threading.Lock` guards on all
24
+ shared state.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import logging
30
+ import threading
31
+ import time
32
+ import urllib.error
33
+ import urllib.request
34
+ from dataclasses import dataclass
35
+ from datetime import datetime, timezone
36
+ from enum import Enum
37
+ from typing import Any
38
+
39
+ from spanforge.sdk._exceptions import SFStartupError
40
+
41
# Public API of this module.
__all__ = [
    "ServiceHealth",
    "ServiceRegistry",
    "ServiceStatus",
]

# Module-level logger used for health summaries and status-change messages.
_log = logging.getLogger(__name__)

# The 8 canonical service names (ordered for consistent logging).
# Health checks iterate this tuple, so names outside it are never pinged.
_SERVICE_NAMES: tuple[str, ...] = (
    "sf_pii",
    "sf_secrets",
    "sf_audit",
    "sf_observe",
    "sf_gate",
    "sf_cec",
    "sf_identity",
    "sf_alert",
)

# Latency threshold above which a service is reported as "degraded" (CFG-011)
_DEGRADED_LATENCY_MS: float = 2_000.0

# Default health-check path appended to the service endpoint
_HEALTH_PATH: str = "/health"

# HTTP 200 OK status code; any other response status is reported as "down"
_HTTP_OK: int = 200
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # Value objects
73
+ # ---------------------------------------------------------------------------
74
+
75
+
76
class ServiceStatus(str, Enum):
    """Status of a single service endpoint.

    Members are also ``str`` instances, so they compare equal to their
    string values.
    """

    # Health endpoint answered 200 within the degraded-latency threshold.
    UP = "up"
    # Health endpoint answered 200 but slower than the threshold.
    DEGRADED = "degraded"
    # Health endpoint was unreachable or answered with a non-200 status.
    DOWN = "down"
82
+
83
+
84
@dataclass
class ServiceHealth:
    """Point-in-time health snapshot for one service.

    Attributes:
        status: Current service status. Defaults to ``down`` until a check
            has been recorded.
        latency_ms: Round-trip latency of the last health check in
            milliseconds. ``-1`` when the service has not been checked yet;
            checks performed by the registry record the elapsed time even
            when the result is ``down``.
        last_checked_at: UTC timestamp of the last health check, or
            ``None`` if no check has been performed yet.
    """

    status: ServiceStatus = ServiceStatus.DOWN
    latency_ms: float = -1.0
    last_checked_at: datetime | None = None
100
+
101
+
102
+ # ---------------------------------------------------------------------------
103
+ # ServiceRegistry
104
+ # ---------------------------------------------------------------------------
105
+
106
+
107
+ class ServiceRegistry:
108
+ """Thread-safe singleton registry of all 8 SpanForge service clients.
109
+
110
+ Usage::
111
+
112
+ registry = ServiceRegistry.get_instance()
113
+ registry.register("sf_pii", sf_pii_client)
114
+ client = registry.get("sf_pii")
115
+
116
+ # Run connectivity checks (CFG-011)
117
+ registry.run_startup_check()
118
+
119
+ # GET /v1/spanforge/status payload (CFG-012)
120
+ status = registry.status_response()
121
+
122
+ Only one :class:`ServiceRegistry` instance exists per process; subsequent
123
+ calls to :meth:`get_instance` return the same object.
124
+ """
125
+
126
+ _instance: ServiceRegistry | None = None
127
+ _instance_lock: threading.Lock = threading.Lock()
128
+
129
+ def __init__(self) -> None:
130
+ self._clients: dict[str, Any] = {}
131
+ self._health: dict[str, ServiceHealth] = {name: ServiceHealth() for name in _SERVICE_NAMES}
132
+ self._health_lock = threading.RLock()
133
+ self._bg_thread: threading.Thread | None = None
134
+ self._stop_event = threading.Event()
135
+
136
+ # ------------------------------------------------------------------
137
+ # Singleton
138
+ # ------------------------------------------------------------------
139
+
140
+ @classmethod
141
+ def get_instance(cls) -> ServiceRegistry:
142
+ """Return the process-wide singleton.
143
+
144
+ Creates it on first call. Thread-safe via double-checked locking.
145
+ """
146
+ if cls._instance is None:
147
+ with cls._instance_lock:
148
+ if cls._instance is None:
149
+ cls._instance = cls()
150
+ return cls._instance
151
+
152
+ @classmethod
153
+ def _reset_for_testing(cls) -> None:
154
+ """Reset the singleton — use only in tests."""
155
+ with cls._instance_lock:
156
+ if cls._instance is not None:
157
+ cls._instance._stop_event.set()
158
+ cls._instance = None
159
+
160
+ # ------------------------------------------------------------------
161
+ # Client management (CFG-010)
162
+ # ------------------------------------------------------------------
163
+
164
+ def register(self, name: str, client: Any) -> None:
165
+ """Register a service client under ``name``.
166
+
167
+ Args:
168
+ name: Service name, e.g. ``"sf_pii"``.
169
+ client: The instantiated service client.
170
+ """
171
+ self._clients[name] = client
172
+
173
+ def get(self, name: str) -> Any:
174
+ """Return the registered client for ``name``, or ``None``.
175
+
176
+ Args:
177
+ name: Service name, e.g. ``"sf_pii"``.
178
+
179
+ Returns:
180
+ The client object, or ``None`` if not registered.
181
+ """
182
+ return self._clients.get(name)
183
+
184
+ def register_all(self, clients: dict[str, Any]) -> None:
185
+ """Bulk-register multiple clients.
186
+
187
+ Args:
188
+ clients: Mapping of ``{service_name: client}``.
189
+ """
190
+ for name, client in clients.items():
191
+ self._clients[name] = client
192
+
193
+ # ------------------------------------------------------------------
194
+ # Health management
195
+ # ------------------------------------------------------------------
196
+
197
+ def _check_service(self, name: str, endpoint: str, timeout_ms: int) -> ServiceHealth:
198
+ """Ping one service health endpoint and return a :class:`ServiceHealth`.
199
+
200
+ Args:
201
+ name: Service name (for logging).
202
+ endpoint: Base URL of the service.
203
+ timeout_ms: Request timeout in milliseconds.
204
+
205
+ Returns:
206
+ :class:`ServiceHealth` with status/latency/timestamp populated.
207
+ """
208
+ if not endpoint:
209
+ # No endpoint configured → local mode → treat as UP
210
+ return ServiceHealth(
211
+ status=ServiceStatus.UP,
212
+ latency_ms=0.0,
213
+ last_checked_at=datetime.now(timezone.utc),
214
+ )
215
+
216
+ url = endpoint.rstrip("/") + _HEALTH_PATH
217
+ timeout_s = max(timeout_ms / 1000.0, 0.1)
218
+ start = time.monotonic()
219
+ try:
220
+ with urllib.request.urlopen(url, timeout=timeout_s) as resp: # noqa: S310 # nosec B310
221
+ elapsed_ms = (time.monotonic() - start) * 1000
222
+ if resp.status == _HTTP_OK:
223
+ status = (
224
+ ServiceStatus.DEGRADED
225
+ if elapsed_ms > _DEGRADED_LATENCY_MS
226
+ else ServiceStatus.UP
227
+ )
228
+ return ServiceHealth(
229
+ status=status,
230
+ latency_ms=elapsed_ms,
231
+ last_checked_at=datetime.now(timezone.utc),
232
+ )
233
+ return ServiceHealth(
234
+ status=ServiceStatus.DOWN,
235
+ latency_ms=(time.monotonic() - start) * 1000,
236
+ last_checked_at=datetime.now(timezone.utc),
237
+ )
238
+ except Exception:
239
+ elapsed_ms = (time.monotonic() - start) * 1000
240
+ return ServiceHealth(
241
+ status=ServiceStatus.DOWN,
242
+ latency_ms=elapsed_ms,
243
+ last_checked_at=datetime.now(timezone.utc),
244
+ )
245
+
246
+ # ------------------------------------------------------------------
247
+ # CFG-011: Startup connectivity check
248
+ # ------------------------------------------------------------------
249
+
250
+ def run_startup_check(
251
+ self,
252
+ endpoint: str = "",
253
+ *,
254
+ enabled_services: set[str] | None = None,
255
+ local_fallback_enabled: bool = True,
256
+ timeout_ms: int = 2000,
257
+ ) -> dict[str, ServiceHealth]:
258
+ """Ping all enabled services and update the health registry.
259
+
260
+ Logs a summary table at ``INFO`` level. If any service is ``down``
261
+ and ``local_fallback_enabled`` is ``False``, raises
262
+ :exc:`~spanforge.sdk._exceptions.SFStartupError`.
263
+
264
+ Args:
265
+ endpoint: Base service URL (same endpoint for all services when
266
+ using a single SpanForge gateway).
267
+ enabled_services: Set of enabled service names. ``None`` means
268
+ all 8 services.
269
+ local_fallback_enabled: If ``False``, a ``down`` service raises
270
+ immediately.
271
+ timeout_ms: Per-service health-check timeout in milliseconds.
272
+
273
+ Returns:
274
+ A ``{service_name: ServiceHealth}`` dict.
275
+
276
+ Raises:
277
+ :exc:`~spanforge.sdk._exceptions.SFStartupError`: When any
278
+ enabled service is ``down`` and fallback is disabled.
279
+ """
280
+ active = set(enabled_services) if enabled_services is not None else set(_SERVICE_NAMES)
281
+ results: dict[str, ServiceHealth] = {}
282
+
283
+ for name in _SERVICE_NAMES:
284
+ if name not in active:
285
+ continue
286
+ health = self._check_service(name, endpoint, timeout_ms)
287
+ results[name] = health
288
+ with self._health_lock:
289
+ self._health[name] = health
290
+
291
+ # Log summary table
292
+ _log.info("SpanForge service health check:")
293
+ for name, h in results.items():
294
+ _log.info(" %-14s %-8s %.0f ms", name, h.status.value, max(h.latency_ms, 0))
295
+
296
+ # Enterprise gate — raise on unreachable service if fallback disabled
297
+ if not local_fallback_enabled:
298
+ down = [n for n, h in results.items() if h.status == ServiceStatus.DOWN]
299
+ if down:
300
+ raise SFStartupError(down)
301
+
302
+ return results
303
+
304
+ # ------------------------------------------------------------------
305
+ # CFG-012: /v1/spanforge/status payload
306
+ # ------------------------------------------------------------------
307
+
308
+ def status_response(self) -> dict[str, dict[str, Any]]:
309
+ """Return a JSON-serialisable dict for ``GET /v1/spanforge/status``.
310
+
311
+ Each service entry contains::
312
+
313
+ {
314
+ "status": "up" | "degraded" | "down",
315
+ "latency_ms": <float>,
316
+ "last_checked_at": "<ISO-8601 UTC>" | null,
317
+ }
318
+
319
+ Returns:
320
+ A dict keyed by service name.
321
+ """
322
+ with self._health_lock:
323
+ snapshot = dict(self._health)
324
+
325
+ return {
326
+ name: {
327
+ "status": h.status.value,
328
+ "latency_ms": h.latency_ms,
329
+ "last_checked_at": (h.last_checked_at.isoformat() if h.last_checked_at else None),
330
+ }
331
+ for name, h in snapshot.items()
332
+ }
333
+
334
+ def get_health(self, name: str) -> ServiceHealth:
335
+ """Return the latest :class:`ServiceHealth` for one service.
336
+
337
+ Args:
338
+ name: Service name, e.g. ``"sf_pii"``.
339
+
340
+ Returns:
341
+ The most recently recorded :class:`ServiceHealth`.
342
+ """
343
+ with self._health_lock:
344
+ return self._health.get(name, ServiceHealth())
345
+
346
+ def update_health(self, name: str, health: ServiceHealth) -> None:
347
+ """Directly set the health for ``name`` (used by tests and fallbacks).
348
+
349
+ Args:
350
+ name: Service name.
351
+ health: New :class:`ServiceHealth` value.
352
+ """
353
+ with self._health_lock:
354
+ self._health[name] = health
355
+
356
+ # ------------------------------------------------------------------
357
+ # CFG-013: Background health re-check
358
+ # ------------------------------------------------------------------
359
+
360
+ def start_background_checker(
361
+ self,
362
+ endpoint: str = "",
363
+ interval: float = 60.0,
364
+ timeout_ms: int = 2000,
365
+ ) -> None:
366
+ """Start a daemon thread that re-checks all services every ``interval`` seconds.
367
+
368
+ Status changes are logged at ``WARNING``. Recovery (``down`` → ``up``)
369
+ is logged at ``INFO``. The thread stops automatically when the process
370
+ exits (daemon=True) or when :meth:`stop_background_checker` is called.
371
+
372
+ Args:
373
+ endpoint: Service endpoint URL passed to each health check.
374
+ interval: Seconds between checks (default: ``60``).
375
+ timeout_ms: Per-service HTTP timeout in milliseconds.
376
+ """
377
+ if self._bg_thread is not None and self._bg_thread.is_alive():
378
+ return # already running
379
+
380
+ self._stop_event.clear()
381
+
382
+ def _loop() -> None:
383
+ while not self._stop_event.wait(timeout=interval):
384
+ self._run_background_check(endpoint, timeout_ms)
385
+
386
+ self._bg_thread = threading.Thread(target=_loop, daemon=True, name="sf-health-checker")
387
+ self._bg_thread.start()
388
+ _log.debug("SpanForge background health checker started (interval=%.0fs)", interval)
389
+
390
+ def stop_background_checker(self) -> None:
391
+ """Signal the background health-check thread to stop."""
392
+ self._stop_event.set()
393
+
394
+ def _run_background_check(self, endpoint: str, timeout_ms: int) -> None:
395
+ """Run one iteration of the background health check (CFG-013)."""
396
+ for name in _SERVICE_NAMES:
397
+ prev_health = self.get_health(name)
398
+ new_health = self._check_service(name, endpoint, timeout_ms)
399
+
400
+ prev_status = prev_health.status
401
+ new_status = new_health.status
402
+
403
+ with self._health_lock:
404
+ self._health[name] = new_health
405
+
406
+ if prev_status != new_status:
407
+ if new_status == ServiceStatus.DOWN:
408
+ _log.warning(
409
+ "sf-%s status changed: %s → %s",
410
+ name,
411
+ prev_status.value,
412
+ new_status.value,
413
+ )
414
+ elif prev_status == ServiceStatus.DOWN and new_status in (
415
+ ServiceStatus.UP,
416
+ ServiceStatus.DEGRADED,
417
+ ):
418
+ _log.info(
419
+ "sf-%s recovered: %s → %s",
420
+ name,
421
+ prev_status.value,
422
+ new_status.value,
423
+ )
424
+ else:
425
+ _log.warning(
426
+ "sf-%s status changed: %s → %s",
427
+ name,
428
+ prev_status.value,
429
+ new_status.value,
430
+ )