skillpool 4.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skillpool/__init__.py +74 -0
- skillpool/__main__.py +6 -0
- skillpool/adapters/__init__.py +8 -0
- skillpool/adapters/base.py +41 -0
- skillpool/adapters/claude_adapter.py +36 -0
- skillpool/adapters/codex_adapter.py +92 -0
- skillpool/adapters/hermes_adapter.py +38 -0
- skillpool/audit/__init__.py +651 -0
- skillpool/bridge/__init__.py +16 -0
- skillpool/bridge/freeze_detector.py +134 -0
- skillpool/bridge/maintenance.py +119 -0
- skillpool/bridge/wal_manager.py +136 -0
- skillpool/clawmem_client.py +176 -0
- skillpool/cli.py +700 -0
- skillpool/combiner/__init__.py +31 -0
- skillpool/combiner/lifecycle.py +453 -0
- skillpool/combiner/models.py +99 -0
- skillpool/config.py +34 -0
- skillpool/cost/__init__.py +111 -0
- skillpool/cost/audit_hash.py +51 -0
- skillpool/cost/budget_tracker.py +66 -0
- skillpool/cost/dashboard.py +189 -0
- skillpool/cost/models.py +129 -0
- skillpool/cost/token_governor.py +264 -0
- skillpool/cost/trace_ceiling.py +38 -0
- skillpool/csdf.py +126 -0
- skillpool/evolver/__init__.py +978 -0
- skillpool/gain/__init__.py +285 -0
- skillpool/gate.py +282 -0
- skillpool/gate_policy/__init__.py +31 -0
- skillpool/gate_policy/incremental.py +157 -0
- skillpool/gate_policy/parser.py +258 -0
- skillpool/gate_policy/state_machine.py +432 -0
- skillpool/graph/__init__.py +14 -0
- skillpool/graph/ppr.py +279 -0
- skillpool/health/__init__.py +73 -0
- skillpool/health/check.py +85 -0
- skillpool/health/degradation.py +90 -0
- skillpool/health/models.py +43 -0
- skillpool/hooks/__init__.py +4 -0
- skillpool/hooks/security_scanner.py +288 -0
- skillpool/lifecycle.py +150 -0
- skillpool/materializer/__init__.py +124 -0
- skillpool/materializer/budget_cropper.py +178 -0
- skillpool/materializer/csdf_loader.py +114 -0
- skillpool/materializer/lazy_loader.py +265 -0
- skillpool/materializer/lifecycle_filter.py +93 -0
- skillpool/materializer/mapper.py +178 -0
- skillpool/materializer/models.py +66 -0
- skillpool/mcp_server.py +2005 -0
- skillpool/monitor/__init__.py +576 -0
- skillpool/monitor/bug_collector.py +392 -0
- skillpool/monitor/defect_classifier.py +218 -0
- skillpool/monitor/self_healing.py +530 -0
- skillpool/monitor/telemetry_bridge.py +197 -0
- skillpool/paradigm/__init__.py +312 -0
- skillpool/paradigm/override.py +285 -0
- skillpool/profile.py +94 -0
- skillpool/quality.py +254 -0
- skillpool/registry/__init__.py +509 -0
- skillpool/registry/models.py +98 -0
- skillpool/resolver/__init__.py +320 -0
- skillpool/resolver/cache.py +103 -0
- skillpool/resolver/circuit_breaker.py +103 -0
- skillpool/resolver/conflict_detector.py +111 -0
- skillpool/resolver/health_filter.py +38 -0
- skillpool/resolver/models.py +154 -0
- skillpool/resolver/rate_limiter.py +48 -0
- skillpool/resolver/skill_graph.py +183 -0
- skillpool/review/__init__.py +242 -0
- skillpool/review/async_queue.py +96 -0
- skillpool/review/checkpoint_runner.py +345 -0
- skillpool/review/models.py +164 -0
- skillpool/review/suspect_marker.py +39 -0
- skillpool/review/veto_evaluator.py +94 -0
- skillpool/router/__init__.py +481 -0
- skillpool/schemas.py +119 -0
- skillpool/synergy/__init__.py +240 -0
- skillpool/synergy/detector.py +5 -0
- skillpool/telemetry.py +126 -0
- skillpool/utils/__init__.py +21 -0
- skillpool/utils/changelog.py +218 -0
- skillpool/utils/logger.py +273 -0
- skillpool/utils/runtime_audit.py +163 -0
- skillpool/utils/time_utils.py +13 -0
- skillpool-4.3.0.dist-info/METADATA +21 -0
- skillpool-4.3.0.dist-info/RECORD +90 -0
- skillpool-4.3.0.dist-info/WHEEL +5 -0
- skillpool-4.3.0.dist-info/entry_points.txt +3 -0
- skillpool-4.3.0.dist-info/top_level.txt +1 -0
skillpool/graph/ppr.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
"""Personalized PageRank (PPR) — Three-layer implementation.
|
|
2
|
+
|
|
3
|
+
V1.1 Section 8.5 compliance:
|
|
4
|
+
Layer 1: Pure Python — correctness verification only, not for production
|
|
5
|
+
Layer 2: SciPy CSR sparse matrix — CPU production path for medium graphs
|
|
6
|
+
Layer 3: scikit-network PageRank — fallback for large graphs when available
|
|
7
|
+
|
|
8
|
+
Unified interface:
|
|
9
|
+
personalized_pagerank(adj, seeds, alpha=0.85, epsilon=1e-6, top_k=None, method="auto")
|
|
10
|
+
|
|
11
|
+
Performance acceptance criteria (must record hardware + params):
|
|
12
|
+
- 10K nodes × 50K edges: CSR path < 10ms
|
|
13
|
+
- 100K nodes: CSR path < 50ms (P95), requires documented benchmarks
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
__all__ = ["personalized_pagerank", "reverse_ppr"]
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
from scipy import sparse as sp
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _validate_inputs(
|
|
25
|
+
adj: sp.spmatrix,
|
|
26
|
+
seeds: list[int] | dict[int, float],
|
|
27
|
+
) -> tuple[int, np.ndarray]:
|
|
28
|
+
"""
|
|
29
|
+
Validate adjacency matrix and normalize seeds into a probability vector.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
adj: Sparse adjacency matrix (n x n)
|
|
33
|
+
seeds: Either list of node indices (uniform weight) or {node: weight} dict
|
|
34
|
+
|
|
35
|
+
Returns:
|
|
36
|
+
Tuple of (n_nodes, seed_vector)
|
|
37
|
+
|
|
38
|
+
Raises:
|
|
39
|
+
ValueError: If adj is not square or seeds contain invalid nodes
|
|
40
|
+
"""
|
|
41
|
+
n = adj.shape[0]
|
|
42
|
+
if adj.shape[1] != n:
|
|
43
|
+
raise ValueError(f"Adjacency must be square, got {adj.shape}")
|
|
44
|
+
|
|
45
|
+
seed_vec = np.zeros(n, dtype=np.float64)
|
|
46
|
+
if isinstance(seeds, dict):
|
|
47
|
+
total_w = sum(seeds.values())
|
|
48
|
+
for node, weight in seeds.items():
|
|
49
|
+
if node < 0 or node >= n:
|
|
50
|
+
raise ValueError(f"Seed node {node} out of range [0, {n})")
|
|
51
|
+
seed_vec[node] = weight / total_w
|
|
52
|
+
else:
|
|
53
|
+
for node in seeds:
|
|
54
|
+
if node < 0 or node >= n:
|
|
55
|
+
raise ValueError(f"Seed node {node} out of range [0, {n})")
|
|
56
|
+
seed_vec[list(seeds)] = 1.0 / len(seeds)
|
|
57
|
+
|
|
58
|
+
return n, seed_vec
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ── Layer 1: Pure Python (correctness baseline) ──
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _ppr_push_python(
    adj: sp.spmatrix,
    seeds_vec: np.ndarray,
    alpha: float,
    epsilon: float,
    max_iter: int = 200,
) -> np.ndarray:
    """
    Pure Python Push algorithm for PPR — small graph verification only.

    NOT for production use. Use CSR or sknetwork path.

    Each push moves ``alpha * r[u]`` from node ``u``'s residual into its
    score and spreads the remaining ``(1 - alpha) * r[u]`` over the out-edges
    of ``u`` in proportion to edge weight, or uniformly over all nodes when
    ``u`` has zero out-degree. Total mass (score + residual) is conserved.

    Args:
        adj: Sparse adjacency matrix (n x n); converted to CSR if needed
        seeds_vec: Seed probability vector of length n
        alpha: Fraction of a node's residual kept as score on each push
        epsilon: Residual tolerance; a node is pushed only while its residual
            exceeds ``max(out_degree, 1) * epsilon``
        max_iter: Maximum number of sweeps over all nodes

    Returns:
        Score vector (n,) with the leftover residual folded in
    """
    n = len(seeds_vec)
    r = seeds_vec.copy()
    x = np.zeros(n, dtype=np.float64)
    if n == 0:
        # r.max() below is undefined for an empty array.
        return x

    if not sp.isspmatrix_csr(adj):
        adj = adj.tocsr()

    indptr, indices, data = adj.indptr, adj.indices, adj.data
    out_degree = np.array(adj.sum(axis=1)).flatten()
    dangling_mask = out_degree == 0

    for _ in range(max_iter):
        if r.max() < epsilon:
            break
        for u in range(n):
            threshold = max(out_degree[u], 1) * epsilon
            if r[u] <= threshold:
                continue

            push_amount = alpha * r[u]
            x[u] += push_amount
            remain = r[u] - push_amount
            r[u] = 0

            if dangling_mask[u]:
                # No out-edges: spread the remainder over every node.
                r += remain / n
                continue

            # out_degree is the row's weight sum, so each neighbour must
            # receive its weight's share of it; an unweighted split would
            # not conserve mass on graphs with non-unit weights.
            per_unit_weight = remain / out_degree[u]
            for idx in range(indptr[u], indptr[u + 1]):
                r[indices[idx]] += per_unit_weight * data[idx]

    # Fold the unpushed residual into the scores so the result keeps all mass.
    x += r
    return x
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# ── Layer 2: SciPy CSR Sparse Matrix (production CPU path) ──
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _ppr_csr_power_iteration(
    adj: sp.spmatrix,
    seeds_vec: np.ndarray,
    alpha: float,
    epsilon: float,
    max_iter: int = 100,
) -> np.ndarray:
    """
    Power iteration using SciPy CSR sparse matrix for medium graphs.

    This is the recommended production path for graphs up to ~500K nodes.

    Iterates ``p <- alpha * seeds + (1 - alpha) * (M p + dangling / n)`` where
    ``M`` is the column-stochastic transition matrix and ``dangling`` is the
    mass currently sitting on nodes with zero out-degree.

    Args:
        adj: Sparse adjacency matrix (n x n); converted to CSR if needed
        seeds_vec: Seed probability vector of length n
        alpha: Weight of the seed vector in every iteration
        epsilon: Tolerance; iteration stops once the L1 change between
            successive iterates drops below ``epsilon * n``
        max_iter: Maximum number of iterations; with ``max_iter <= 0`` a copy
            of the seed vector is returned unchanged

    Returns:
        Score vector (n,)
    """
    n = len(seeds_vec)
    if not sp.isspmatrix_csr(adj):
        adj = adj.tocsr()

    # Build the column-stochastic transition matrix M = A^T * D^-1, so that
    # M[v, u] = A[u, v] / out_degree[u].
    out_degree_raw = np.array(adj.sum(axis=1)).flatten().astype(np.float64)
    dangling_mask = out_degree_raw == 0
    out_degree = out_degree_raw.copy()
    out_degree[dangling_mask] = 1.0  # avoid 0-division; those columns stay all-zero
    d_inv = sp.diags(1.0 / out_degree, format="csr")
    mt = adj.T.dot(d_inv)

    has_dangling = bool(dangling_mask.any())  # loop-invariant
    # Starting from the seeds means a zero-iteration run still returns a
    # defined vector instead of hitting an unbound local.
    p = seeds_vec.copy()

    for _iteration in range(max_iter):
        p_next = alpha * seeds_vec + (1.0 - alpha) * mt.dot(p)

        # Dangling-node correction: redistribute their mass uniformly.
        if has_dangling:
            dangling_mass = p[dangling_mask].sum()
            if dangling_mass > 0:
                p_next += (1.0 - alpha) * dangling_mass / n

        delta = np.abs(p_next - p).sum()
        p = p_next
        if delta < epsilon * n:
            break

    return p
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ── Layer 3: scikit-network (optional, large graphs) ──
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _ppr_sknetwork(
    adj: sp.spmatrix,
    seeds_vec: np.ndarray,
    alpha: float,
) -> np.ndarray:
    """
    Rank nodes with scikit-network's PageRank, intended for large graphs.

    The ranking is fitted on ``adj`` alone and then mixed with the seed
    vector as ``alpha * seeds_vec + (1 - alpha) * ranking`` whenever the seed
    vector carries any mass. If scikit-network cannot be imported, the CSR
    power-iteration path is used with a fixed tolerance of 1e-6.

    Args:
        adj: Sparse adjacency matrix (n x n)
        seeds_vec: Seed probability vector of length n
        alpha: Passed to PageRank as ``damping_factor`` and used as the seed
            weight in the final mix

    Returns:
        Score vector (n,)
    """
    # NOTE(review): the seeds are not passed to PageRank itself, so this looks
    # like a global ranking blended with the seeds rather than a seed-
    # personalized walk — confirm against the scikit-network API.
    # NOTE(review): the other layers use alpha as the seed/teleport weight;
    # verify that ``damping_factor`` has the same meaning and is not 1 - alpha.
    try:
        from sknetwork.ranking import PageRank

        ranker = PageRank(damping_factor=alpha, solver="piteration")
        ranking = ranker.fit_transform(adj)
        if seeds_vec.sum() > 0:
            return alpha * seeds_vec + (1 - alpha) * ranking
        return ranking
    except ImportError:
        return _ppr_csr_power_iteration(adj, seeds_vec, alpha, 1e-6)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# ── Unified Public API ──
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def personalized_pagerank(
    adj: sp.spmatrix,
    seeds: list[int] | dict[int, float],
    alpha: float = 0.85,
    epsilon: float = 1e-6,
    top_k: int | None = None,
    method: str = "auto",
) -> np.ndarray:
    """
    Compute Personalized PageRank scores for seed nodes.

    Args:
        adj: Sparse adjacency matrix (n x n), scipy.sparse format
        seeds: Seed nodes — list (uniform weight) or {node: weight} dict
        alpha: Teleport probability (weight of the seed vector), default 0.85
        epsilon: Convergence tolerance, default 1e-6
        top_k: If set, keep only the k highest scores (all others are zeroed)
            and renormalize them to sum to 1. Values of k >= n keep everything.
        method: "python" | "csr" | "sknetwork" | "auto" (default: auto-select
            by node count: < 1000 python, < 500000 csr, otherwise sknetwork)

    Returns:
        PPR score vector (n,), normalized to sum to 1 when it has any mass

    Raises:
        ValueError: For invalid inputs (bad adjacency/seeds, unknown method,
            or a non-positive top_k)
    """
    n, seeds_vec = _validate_inputs(adj, seeds)

    # Reject before doing any work: 0 would silently select every node via
    # the ``[-0:]`` slice and a negative value would select arbitrary entries.
    if top_k is not None and top_k <= 0:
        raise ValueError(f"top_k must be a positive integer, got {top_k}")

    # Auto-select method based on graph size
    if method == "auto":
        if n < 1000:
            method = "python"
        elif n < 500000:
            method = "csr"
        else:
            method = "sknetwork"

    if method == "python":
        scores = _ppr_push_python(adj, seeds_vec, alpha, epsilon)
    elif method == "csr":
        scores = _ppr_csr_power_iteration(adj, seeds_vec, alpha, epsilon)
    elif method == "sknetwork":
        scores = _ppr_sknetwork(adj, seeds_vec, alpha)
    else:
        raise ValueError(f"Unknown method: {method}")

    # Normalize
    total = scores.sum()
    if total > 0:
        scores /= total

    # argpartition rejects kth >= n, and keeping every node is a no-op, so
    # truncation only applies when it actually drops something.
    if top_k is not None and top_k < n:
        keep = np.argpartition(scores, -top_k)[-top_k:]
        truncated = np.zeros(n, dtype=np.float64)
        truncated[keep] = scores[keep]
        kept_mass = truncated.sum()
        if kept_mass > 0:  # avoid producing NaN from 0 / 0
            truncated /= kept_mass
        return truncated

    return scores
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
# ── Reverse PPR (V1.1 Section 8.5 target-centric query path) ──
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def reverse_ppr(
    adj: sp.spmatrix,
    target: int,
    alpha: float = 0.85,
    epsilon: float = 1e-6,
) -> np.ndarray:
    """
    Compute Reverse PPR for a target node.

    Runs ``personalized_pagerank`` on the transposed graph with ``target`` as
    the only seed, always through the "csr" method.
    Used for "which skills contribute most to this skill?" queries.

    Args:
        adj: Sparse adjacency matrix (n x n)
        target: Target node index
        alpha: Teleport probability
        epsilon: Convergence tolerance

    Returns:
        Reverse PPR score vector (n,)
    """
    # Prefer the ``.T`` attribute and fall back to ``transpose()`` for
    # objects that do not expose it.
    if hasattr(adj, "T"):
        reversed_graph = adj.T.tocsr()
    else:
        reversed_graph = adj.transpose().tocsr()

    return personalized_pagerank(
        reversed_graph,
        [target],
        alpha=alpha,
        epsilon=epsilon,
        method="csr",
    )
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Health module — component health checking and degradation management."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from skillpool.health.check import HealthChecker
|
|
6
|
+
from skillpool.health.degradation import DegradationManager
|
|
7
|
+
from skillpool.health.models import (
|
|
8
|
+
ComponentHealth as ComponentHealth,
|
|
9
|
+
DegradationLevel,
|
|
10
|
+
HealthCheckResponse,
|
|
11
|
+
ServingStatus,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class HealthManager:
    """Facade tying together health checks, degradation state and metrics.

    Every ``check_health`` call runs the registered checks, reports each
    component's failure or recovery to the degradation manager, stamps the
    resulting degradation level on the response and, when a monitor was
    supplied, records one gauge metric per component.

    Usage:
        hm = HealthManager()
        hm.register_component("resolver")
        response = hm.check_health()
        level = hm.get_degradation_level()
    """

    def __init__(self, critical_threshold: int = 2, monitor=None) -> None:
        self.checker = HealthChecker()
        self.degradation = DegradationManager(critical_threshold=critical_threshold)
        # Optional metrics sink; only its ``record_metric`` method is used.
        self._monitor = monitor

    def register_component(
        self,
        name: str,
        check_fn: callable = None,
        critical: bool = True,
    ) -> None:
        """Register a component with the underlying health checker."""
        self.checker.register(name, check_fn=check_fn, critical=critical)

    def check_health(self) -> HealthCheckResponse:
        """Run all checks, refresh degradation state and feed the monitor."""
        response = self.checker.check()

        self._sync_degradation(response.components)
        response.degradation_level = self.degradation.get_degradation_level()

        if self._monitor is not None:
            self._publish_metrics(response.components)
        return response

    def get_degradation_level(self) -> DegradationLevel:
        """Return the degradation manager's current level."""
        return self.degradation.get_degradation_level()

    def _sync_degradation(self, components) -> None:
        """Report a failure or a recovery for each checked component."""
        for comp in components:
            if comp.status != ServingStatus.NOT_SERVING:
                self.degradation.report_recovery(comp.component)
                continue
            # Criticality lives in the checker's registration record; an
            # unknown component is treated as critical.
            registration = self.checker._components.get(comp.component, {})
            self.degradation.report_failure(
                comp.component,
                critical=registration.get("critical", True),
            )

    def _publish_metrics(self, components) -> None:
        """Record a 1.0 (serving) / 0.0 (anything else) gauge per component."""
        # Imported lazily, only when a monitor is actually attached.
        from skillpool.monitor import MetricType

        for comp in components:
            serving = comp.status == ServingStatus.SERVING
            self._monitor.record_metric(
                name=f"health.{comp.component}",
                value=1.0 if serving else 0.0,
                metric_type=MetricType.GAUGE,
                labels={"component": comp.component},
            )
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Health check — component health assessment."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from skillpool.health.models import (
|
|
10
|
+
ComponentHealth,
|
|
11
|
+
HealthCheckResponse,
|
|
12
|
+
ServingStatus,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class HealthChecker:
    """Run registered health probes and aggregate their results.

    Usage:
        checker = HealthChecker()
        checker.register("resolver", check_fn=lambda: True)
        response = checker.check()
    """

    def __init__(self) -> None:
        # name -> {"check_fn", "critical", "last_status"}
        self._components: dict[str, dict] = {}

    def register(
        self,
        name: str,
        check_fn: Optional[callable] = None,
        critical: bool = True,
    ) -> None:
        """Add (or replace) a component; it starts out as SERVING."""
        entry = {
            "check_fn": check_fn,
            "critical": critical,
            "last_status": ServingStatus.SERVING,
        }
        self._components[name] = entry

    def check(self) -> HealthCheckResponse:
        """Probe every registered component and build the aggregate response.

        The overall status is NOT_SERVING if any critical component is down,
        DEGRADED if only non-critical components are down, else SERVING.
        """
        component_results = []
        overall = ServingStatus.SERVING

        for name, config in self._components.items():
            comp_status = self._probe(name, config.get("check_fn"))

            if comp_status == ServingStatus.NOT_SERVING:
                if config.get("critical", True):
                    overall = ServingStatus.NOT_SERVING
                elif overall == ServingStatus.SERVING:
                    # A non-critical failure never overrides NOT_SERVING.
                    overall = ServingStatus.DEGRADED

            config["last_status"] = comp_status
            component_results.append(
                ComponentHealth(component=name, status=comp_status)
            )

        return HealthCheckResponse(
            status=overall,
            components=component_results,
            timestamp=datetime.now(timezone.utc).isoformat(),
        )

    def get_component_status(self, name: str) -> ServingStatus:
        """Return the last recorded status, or NOT_SERVING if unregistered."""
        config = self._components.get(name)
        if not config:
            return ServingStatus.NOT_SERVING
        return config["last_status"]

    @staticmethod
    def _probe(name: str, check_fn) -> ServingStatus:
        """Evaluate one probe; no probe means SERVING, an exception NOT_SERVING."""
        if check_fn is None:
            return ServingStatus.SERVING
        try:
            healthy = check_fn()
            return ServingStatus.SERVING if healthy else ServingStatus.NOT_SERVING
        except Exception as e:
            logger.warning("Health check failed for component %s: %s", name, e)
            return ServingStatus.NOT_SERVING
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Degradation management — 4-level fallback mode handling.
|
|
2
|
+
|
|
3
|
+
Levels (aligned with cross-system-interfaces.yaml §5.2):
|
|
4
|
+
L0_full: All components healthy
|
|
5
|
+
L1_partial: Non-critical component(s) down, core still functional
|
|
6
|
+
L2_bm25_only: Vector search (VPLS) unavailable, BM25-only fallback
|
|
7
|
+
L3_disabled: Multiple critical failures, minimal/disabled operation
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from skillpool.health.models import DegradationLevel
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DegradationManager:
    """Track component failures and derive the service degradation level.

    Levels, from healthiest to most degraded:
    - L0_full: no failures recorded
    - L1_partial: only non-critical components have failed
    - L2_bm25_only: "vpls" failed, or some critical components failed, while
      the critical-failure count is still below the threshold
    - L3_disabled: critical-failure count reached the threshold

    Usage:
        dm = DegradationManager()
        dm.report_failure("vpls")
        level = dm.get_degradation_level()
    """

    def __init__(self, critical_threshold: int = 2) -> None:
        self.critical_threshold = critical_threshold
        # component -> number of failures reported since its last recovery
        self._failures: dict[str, int] = {}
        self._critical_failures: set[str] = set()
        self._degradation_level = DegradationLevel.L0_FULL

    def report_failure(self, component: str, critical: bool = True) -> DegradationLevel:
        """Record one more failure for ``component`` and return the new level."""
        seen_so_far = self._failures.get(component, 0)
        self._failures[component] = seen_so_far + 1
        if critical:
            self._critical_failures.add(component)
        self._update_level()
        return self._degradation_level

    def report_recovery(self, component: str) -> DegradationLevel:
        """Forget all failures of ``component`` and return the new level."""
        self._failures.pop(component, None)
        self._critical_failures.discard(component)
        self._update_level()
        return self._degradation_level

    def get_degradation_level(self) -> DegradationLevel:
        """Return the level computed by the most recent update."""
        return self._degradation_level

    def get_fallback_mode(self) -> str:
        """Map the current level to its fallback mode string."""
        mode_by_level = {
            DegradationLevel.L0_FULL: "vpls_vector",
            DegradationLevel.L1_PARTIAL: "vpls_vector",
            DegradationLevel.L2_BM25_ONLY: "bm25_keyword",
        }
        # Anything else (i.e. L3) falls through to the last-resort mode.
        return mode_by_level.get(self._degradation_level, "sqlite_fts5")

    def _update_level(self) -> None:
        """Recompute the level; the first matching rule below wins."""
        critical_count = len(self._critical_failures)

        if not self._failures:
            new_level = DegradationLevel.L0_FULL
        elif "vpls" in self._failures and critical_count < self.critical_threshold:
            # VPLS down but not too many critical failures -> BM25 fallback
            new_level = DegradationLevel.L2_BM25_ONLY
        elif critical_count >= self.critical_threshold:
            # Too many critical failures -> disabled
            new_level = DegradationLevel.L3_DISABLED
        elif critical_count == 0:
            # Failures exist (checked above) but none is critical
            new_level = DegradationLevel.L1_PARTIAL
        else:
            # Some critical failures, still below the threshold
            new_level = DegradationLevel.L2_BM25_ONLY

        self._degradation_level = new_level

    def reset(self) -> None:
        """Drop every recorded failure and go back to L0_full."""
        self._failures.clear()
        self._critical_failures.clear()
        self._degradation_level = DegradationLevel.L0_FULL
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Health models — Pydantic schemas for health check."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import StrEnum
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ServingStatus(StrEnum):
|
|
11
|
+
SERVING = "SERVING"
|
|
12
|
+
NOT_SERVING = "NOT_SERVING"
|
|
13
|
+
DEGRADED = "DEGRADED"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DegradationLevel(StrEnum):
|
|
17
|
+
L0_FULL = "L0_full" # Full functionality
|
|
18
|
+
L1_PARTIAL = "L1_partial" # Partial degradation (non-critical components down)
|
|
19
|
+
L2_BM25_ONLY = "L2_bm25_only" # BM25-only fallback (vector search down)
|
|
20
|
+
L3_DISABLED = "L3_disabled" # Minimal/disabled functionality
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ComponentHealth(BaseModel):
    """Health status of a single component."""

    # Name of the component; the only field without a default.
    component: str
    # Defaults to SERVING when not supplied.
    status: ServingStatus = ServingStatus.SERVING
    # presumably the component's P99 latency in milliseconds — TODO confirm
    latency_p99_ms: float = 0.0
    message: str = ""
    # Fresh dict per instance via default_factory.
    metadata: dict = Field(default_factory=dict)
    fallback_mode: str = Field(
        default="",
        description="Fallback mode: vpls_vector/bm25_keyword/sqlite_fts5",
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class HealthCheckResponse(BaseModel):
    """Aggregated health check response."""

    # Overall status; defaults to SERVING.
    status: ServingStatus = ServingStatus.SERVING
    # One entry per component; fresh list per instance via default_factory.
    components: list[ComponentHealth] = Field(default_factory=list)
    # Plain string; the expected format is not fixed by this model.
    timestamp: str = ""
    degradation_level: DegradationLevel = Field(
        default=DegradationLevel.L0_FULL,
        description="Current degradation level (L0-L3)",
    )
    vpls_latency_p99_ms: float = Field(
        default=0.0,
        description="VPLS P99 latency in ms",
    )
|