nthlayer-workers 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nthlayer_workers/__init__.py +5 -0
- nthlayer_workers/cli.py +234 -0
- nthlayer_workers/correlate/__init__.py +1 -0
- nthlayer_workers/correlate/cli.py +847 -0
- nthlayer_workers/correlate/config.py +111 -0
- nthlayer_workers/correlate/correlation/__init__.py +1 -0
- nthlayer_workers/correlate/correlation/changes.py +87 -0
- nthlayer_workers/correlate/correlation/dedup.py +62 -0
- nthlayer_workers/correlate/correlation/engine.py +244 -0
- nthlayer_workers/correlate/correlation/temporal.py +79 -0
- nthlayer_workers/correlate/correlation/topology.py +104 -0
- nthlayer_workers/correlate/ingestion/__init__.py +1 -0
- nthlayer_workers/correlate/ingestion/protocol.py +10 -0
- nthlayer_workers/correlate/ingestion/severity.py +18 -0
- nthlayer_workers/correlate/ingestion/webhook.py +197 -0
- nthlayer_workers/correlate/notifications.py +85 -0
- nthlayer_workers/correlate/prometheus.py +234 -0
- nthlayer_workers/correlate/reasoning.py +375 -0
- nthlayer_workers/correlate/session.py +189 -0
- nthlayer_workers/correlate/snapshot/__init__.py +1 -0
- nthlayer_workers/correlate/snapshot/generator.py +170 -0
- nthlayer_workers/correlate/snapshot/model.py +177 -0
- nthlayer_workers/correlate/snapshot/token.py +14 -0
- nthlayer_workers/correlate/state.py +88 -0
- nthlayer_workers/correlate/store/__init__.py +5 -0
- nthlayer_workers/correlate/store/protocol.py +48 -0
- nthlayer_workers/correlate/store/sqlite.py +443 -0
- nthlayer_workers/correlate/summary.py +180 -0
- nthlayer_workers/correlate/traces/__init__.py +1 -0
- nthlayer_workers/correlate/traces/protocol.py +120 -0
- nthlayer_workers/correlate/traces/tempo.py +667 -0
- nthlayer_workers/correlate/traces/topology.py +39 -0
- nthlayer_workers/correlate/types.py +77 -0
- nthlayer_workers/correlate/worker.py +630 -0
- nthlayer_workers/learn/__init__.py +5 -0
- nthlayer_workers/learn/__main__.py +5 -0
- nthlayer_workers/learn/cli.py +164 -0
- nthlayer_workers/learn/retrospective.py +381 -0
- nthlayer_workers/learn/trends.py +102 -0
- nthlayer_workers/learn/worker.py +366 -0
- nthlayer_workers/measure/__init__.py +3 -0
- nthlayer_workers/measure/__main__.py +5 -0
- nthlayer_workers/measure/_parsing.py +15 -0
- nthlayer_workers/measure/adapters/__init__.py +0 -0
- nthlayer_workers/measure/adapters/_util.py +24 -0
- nthlayer_workers/measure/adapters/devin.py +119 -0
- nthlayer_workers/measure/adapters/gastown.py +88 -0
- nthlayer_workers/measure/adapters/prometheus.py +277 -0
- nthlayer_workers/measure/adapters/protocol.py +20 -0
- nthlayer_workers/measure/adapters/webhook.py +161 -0
- nthlayer_workers/measure/api/__init__.py +0 -0
- nthlayer_workers/measure/api/normalise.py +50 -0
- nthlayer_workers/measure/api/queue.py +243 -0
- nthlayer_workers/measure/api/response.py +51 -0
- nthlayer_workers/measure/api/server.py +504 -0
- nthlayer_workers/measure/calibration/__init__.py +0 -0
- nthlayer_workers/measure/calibration/loop.py +62 -0
- nthlayer_workers/measure/calibration/slos.py +212 -0
- nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
- nthlayer_workers/measure/cli.py +753 -0
- nthlayer_workers/measure/config.py +191 -0
- nthlayer_workers/measure/detection/__init__.py +6 -0
- nthlayer_workers/measure/detection/detector.py +82 -0
- nthlayer_workers/measure/detection/protocol.py +29 -0
- nthlayer_workers/measure/governance/__init__.py +0 -0
- nthlayer_workers/measure/governance/engine.py +163 -0
- nthlayer_workers/measure/manifest.py +77 -0
- nthlayer_workers/measure/notifications.py +53 -0
- nthlayer_workers/measure/pipeline/__init__.py +0 -0
- nthlayer_workers/measure/pipeline/evaluator.py +155 -0
- nthlayer_workers/measure/pipeline/router.py +160 -0
- nthlayer_workers/measure/store/__init__.py +0 -0
- nthlayer_workers/measure/store/protocol.py +38 -0
- nthlayer_workers/measure/store/sqlite.py +276 -0
- nthlayer_workers/measure/telemetry.py +116 -0
- nthlayer_workers/measure/tiering/__init__.py +0 -0
- nthlayer_workers/measure/tiering/classifier.py +58 -0
- nthlayer_workers/measure/tiering/promotion.py +118 -0
- nthlayer_workers/measure/trends/__init__.py +0 -0
- nthlayer_workers/measure/trends/tracker.py +72 -0
- nthlayer_workers/measure/types.py +75 -0
- nthlayer_workers/measure/worker.py +439 -0
- nthlayer_workers/observe/__init__.py +25 -0
- nthlayer_workers/observe/__main__.py +5 -0
- nthlayer_workers/observe/api/__init__.py +1 -0
- nthlayer_workers/observe/assessment.py +95 -0
- nthlayer_workers/observe/cli.py +737 -0
- nthlayer_workers/observe/config.py +11 -0
- nthlayer_workers/observe/db/__init__.py +1 -0
- nthlayer_workers/observe/decision_records.py +220 -0
- nthlayer_workers/observe/dependencies/__init__.py +18 -0
- nthlayer_workers/observe/dependencies/discovery.py +294 -0
- nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
- nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
- nthlayer_workers/observe/dependencies/providers/base.py +76 -0
- nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
- nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
- nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
- nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
- nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
- nthlayer_workers/observe/deployments/__init__.py +1 -0
- nthlayer_workers/observe/discovery/__init__.py +14 -0
- nthlayer_workers/observe/discovery/classifier.py +66 -0
- nthlayer_workers/observe/discovery/client.py +189 -0
- nthlayer_workers/observe/discovery/models.py +53 -0
- nthlayer_workers/observe/drift/__init__.py +26 -0
- nthlayer_workers/observe/drift/analyzer.py +383 -0
- nthlayer_workers/observe/drift/models.py +174 -0
- nthlayer_workers/observe/drift/patterns.py +88 -0
- nthlayer_workers/observe/explanation.py +118 -0
- nthlayer_workers/observe/gate/__init__.py +39 -0
- nthlayer_workers/observe/gate/conditions.py +92 -0
- nthlayer_workers/observe/gate/correlator.py +154 -0
- nthlayer_workers/observe/gate/evaluator.py +192 -0
- nthlayer_workers/observe/gate/policies.py +226 -0
- nthlayer_workers/observe/gate_adapter.py +40 -0
- nthlayer_workers/observe/incident.py +36 -0
- nthlayer_workers/observe/portfolio/__init__.py +17 -0
- nthlayer_workers/observe/portfolio/aggregator.py +168 -0
- nthlayer_workers/observe/portfolio/scorer.py +13 -0
- nthlayer_workers/observe/slo/__init__.py +19 -0
- nthlayer_workers/observe/slo/collector.py +235 -0
- nthlayer_workers/observe/slo/spec_loader.py +40 -0
- nthlayer_workers/observe/sqlite_store.py +152 -0
- nthlayer_workers/observe/store.py +92 -0
- nthlayer_workers/observe/verification/__init__.py +22 -0
- nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
- nthlayer_workers/observe/verification/extractor.py +127 -0
- nthlayer_workers/observe/verification/models.py +101 -0
- nthlayer_workers/observe/verification/verifier.py +111 -0
- nthlayer_workers/observe/worker.py +332 -0
- nthlayer_workers/respond/__init__.py +2 -0
- nthlayer_workers/respond/__main__.py +4 -0
- nthlayer_workers/respond/agents/__init__.py +0 -0
- nthlayer_workers/respond/agents/base.py +556 -0
- nthlayer_workers/respond/agents/communication.py +115 -0
- nthlayer_workers/respond/agents/investigation.py +124 -0
- nthlayer_workers/respond/agents/remediation.py +219 -0
- nthlayer_workers/respond/agents/triage.py +132 -0
- nthlayer_workers/respond/cli.py +772 -0
- nthlayer_workers/respond/config.py +135 -0
- nthlayer_workers/respond/context_store.py +256 -0
- nthlayer_workers/respond/coordinator.py +487 -0
- nthlayer_workers/respond/metrics.py +104 -0
- nthlayer_workers/respond/notification_backends/__init__.py +1 -0
- nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
- nthlayer_workers/respond/notification_backends/protocol.py +59 -0
- nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
- nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
- nthlayer_workers/respond/notifications.py +247 -0
- nthlayer_workers/respond/oncall/__init__.py +1 -0
- nthlayer_workers/respond/oncall/escalation.py +103 -0
- nthlayer_workers/respond/oncall/runner.py +193 -0
- nthlayer_workers/respond/oncall/schedule.py +243 -0
- nthlayer_workers/respond/safe_actions/__init__.py +0 -0
- nthlayer_workers/respond/safe_actions/actions.py +139 -0
- nthlayer_workers/respond/safe_actions/registry.py +171 -0
- nthlayer_workers/respond/safe_actions/webhook.py +194 -0
- nthlayer_workers/respond/server.py +357 -0
- nthlayer_workers/respond/sre/__init__.py +1 -0
- nthlayer_workers/respond/sre/brief.py +175 -0
- nthlayer_workers/respond/sre/delegation.py +101 -0
- nthlayer_workers/respond/sre/post_incident.py +146 -0
- nthlayer_workers/respond/sre/shift_report.py +129 -0
- nthlayer_workers/respond/sre/suppression.py +91 -0
- nthlayer_workers/respond/types.py +109 -0
- nthlayer_workers/respond/verdict_submission.py +56 -0
- nthlayer_workers/respond/worker.py +533 -0
- nthlayer_workers/respond/worker_helpers.py +140 -0
- nthlayer_workers/runner.py +198 -0
- nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
- nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
- nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
- nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
- nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Pattern detection for drift analysis.
|
|
2
|
+
|
|
3
|
+
Classifies drift patterns beyond simple linear trends:
|
|
4
|
+
- Gradual decline/improvement
|
|
5
|
+
- Step changes (sudden drops or improvements)
|
|
6
|
+
- Volatile patterns
|
|
7
|
+
- Stable (no significant trend)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
from nthlayer_workers.observe.drift.models import DriftPattern
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class PatternDetector:
    """Classify a series of ``(timestamp, value)`` samples into a drift pattern.

    Checks run in a fixed order: step change first, then volatility, and
    finally the sign and size of the supplied linear trend.
    """

    def __init__(
        self,
        step_change_threshold: float = 0.05,
        volatility_variance_threshold: float = 0.01,
        volatility_r_squared_threshold: float = 0.3,
        slope_significance_threshold: float = 0.001,
    ):
        """Store the tunable thresholds.

        Args:
            step_change_threshold: Minimum jump between two consecutive
                samples that counts as a step change.
            volatility_variance_threshold: Variance above which a poorly
                fitted series is considered volatile.
            volatility_r_squared_threshold: R² below which the linear fit is
                considered poor.
            slope_significance_threshold: Minimum absolute per-week slope
                that counts as a trend.
        """
        self.step_change_threshold = step_change_threshold
        self.volatility_variance_threshold = volatility_variance_threshold
        self.volatility_r_squared_threshold = volatility_r_squared_threshold
        self.slope_significance_threshold = slope_significance_threshold

    def detect(
        self,
        data: list[tuple[datetime, float]],
        slope_per_second: float,
        r_squared: float,
    ) -> DriftPattern:
        """Return the drift pattern for ``data``.

        Args:
            data: ``(timestamp, value)`` samples.
            slope_per_second: Slope of a linear fit, in value units per second.
            r_squared: Goodness of fit of that linear fit.

        Returns:
            ``STABLE`` for fewer than two samples; otherwise the first
            matching pattern among step change, volatile, and trend.
        """
        if len(data) < 2:
            return DriftPattern.STABLE

        sample_values = np.array([point[1] for point in data])
        spread = float(np.var(sample_values))

        # A sudden jump takes precedence over every other classification.
        jump = self._detect_step_change(data)
        if jump is not None:
            return jump

        poorly_fitted = r_squared < self.volatility_r_squared_threshold
        if poorly_fitted and spread > self.volatility_variance_threshold:
            return DriftPattern.VOLATILE

        # Express the slope per week before comparing it to the threshold.
        slope_per_week = slope_per_second * (7 * 24 * 60 * 60)
        if abs(slope_per_week) < self.slope_significance_threshold:
            return DriftPattern.STABLE
        if slope_per_week < 0:
            return DriftPattern.GRADUAL_DECLINE
        return DriftPattern.GRADUAL_IMPROVEMENT

    def _detect_step_change(
        self,
        data: list[tuple[datetime, float]],
    ) -> DriftPattern | None:
        """Return the first step change between consecutive samples, if any.

        Only pairs less than 1.5 days apart are considered. Returns ``None``
        when no pair jumps by more than ``step_change_threshold``.
        """
        if len(data) < 2:
            return None

        window_seconds = 86400 * 1.5
        limit = self.step_change_threshold

        for idx in range(len(data) - 1):
            earlier, later = data[idx], data[idx + 1]
            gap_seconds = (later[0] - earlier[0]).total_seconds()
            delta = later[1] - earlier[1]

            if gap_seconds >= window_seconds:
                continue
            if delta < -limit:
                return DriftPattern.STEP_CHANGE_DOWN
            if delta > limit:
                return DriftPattern.STEP_CHANGE_UP

        return None
|
|
88
|
+
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""ExplanationEngine — build human-readable budget explanations from assessments.
|
|
2
|
+
|
|
3
|
+
Deterministic. No LLM. Pure arithmetic on assessment data.
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from nthlayer_common.explanation import BudgetExplanation
|
|
8
|
+
from nthlayer_workers.observe.assessment import Assessment
|
|
9
|
+
from nthlayer_workers.observe.store import AssessmentFilter, AssessmentStore
|
|
10
|
+
|
|
11
|
+
# Maps SLO assessment status to explanation severity.
# ERROR and NO_DATA are data-quality issues (e.g. Prometheus unreachable),
# not budget concerns. ERROR is surfaced as "warning" because it needs
# attention without indicating budget consumption; NO_DATA and UNKNOWN are
# reported as "info".
# NOTE(review): NO_DATA is also a data-quality issue but maps to "info",
# not "warning" — confirm this is intended.
_STATUS_SEVERITY = {
    "EXHAUSTED": "critical",
    "CRITICAL": "critical",
    "WARNING": "warning",
    "ERROR": "warning",
    "HEALTHY": "info",
    "NO_DATA": "info",
    "UNKNOWN": "info",
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ExplanationEngine:
    """Turn stored ``slo_status`` assessments into budget explanations."""

    def explain_service(
        self,
        service: str,
        store: AssessmentStore,
        slo_filter: str | None = None,
    ) -> list[BudgetExplanation]:
        """Explain the current budget state of every SLO of ``service``.

        Args:
            service: Service whose assessments are queried.
            store: Assessment store to read from.
            slo_filter: When truthy, only the SLO with exactly this name is
                explained.

        Returns:
            One explanation per distinct SLO name, in query order.
        """
        history = store.query(
            AssessmentFilter(service=service, kind="slo_status", limit=0)
        )

        # The first assessment seen for each SLO name wins. This relies on
        # the store returning newest-first results — TODO confirm ordering.
        newest_by_slo: dict[str, Assessment] = {}
        for item in history:
            newest_by_slo.setdefault(item.data.get("slo_name", "unknown"), item)
        selected = list(newest_by_slo.values())

        if slo_filter:
            selected = [
                item for item in selected if item.data.get("slo_name") == slo_filter
            ]

        explanations = []
        for item in selected:
            explanations.append(self._explain_slo(service, item))
        return explanations

    def _explain_slo(self, service: str, assessment: Assessment) -> BudgetExplanation:
        """Build the headline, body, causes and actions for one assessment."""
        data = assessment.data

        def number(key: str) -> float:
            # Missing keys and falsy values (None, 0) both collapse to 0.0.
            return data.get(key, 0.0) or 0.0

        slo_name = data.get("slo_name", "unknown")
        status = data.get("status", "UNKNOWN")
        pct = number("percent_consumed")
        burned = number("burned_minutes")
        total = number("total_budget_minutes")
        sli = number("current_sli")
        obj = number("objective")
        window = data.get("window", "30d")
        remaining = max(0, total - burned)
        severity = _STATUS_SEVERITY.get(status, "info")

        # Headline.
        # NOTE(review): every status other than the three below — including
        # ERROR, NO_DATA and UNKNOWN — is described as "within budget".
        descriptions = {
            "EXHAUSTED": "budget exhausted",
            "CRITICAL": "near exhaustion",
            "WARNING": "approaching threshold",
        }
        desc = descriptions.get(status, "within budget")
        headline = f"{slo_name}: {pct:.0f}% consumed — {desc} ({status})"

        # Body: three sentences joined by single spaces.
        body = " ".join(
            [
                f"Window: {window}.",
                f"Budget: {total:.0f} min total, {burned:.0f} min consumed, "
                f"{remaining:.0f} min remaining.",
                f"Current SLI: {sli:.4f} (target: {obj:.4f}).",
            ]
        )

        # Causes
        causes: list[str] = []
        if pct > 80:
            causes.append(
                "Budget consumption exceeds 80% — sustained error rate above target"
            )
        if sli < obj and obj > 0:
            gap = (obj - sli) * 100
            causes.append(
                f"Current SLI ({sli:.4f}) is {gap:.2f}pp below target ({obj:.4f})"
            )

        # Actions, keyed by status; statuses not listed get none.
        investigate = [
            "Investigate root cause of elevated error rate",
            "Consider freezing deployments until budget recovers",
        ]
        actions_by_status = {
            "EXHAUSTED": [
                "Deployment gate will block — resolve underlying issue before deploying",
                *investigate,
            ],
            "CRITICAL": investigate,
            "WARNING": [
                "Monitor trend — investigate if consumption continues to rise"
            ],
        }
        actions: list[str] = list(actions_by_status.get(status, ()))

        return BudgetExplanation(
            service=service,
            slo_name=slo_name,
            headline=headline,
            body=body,
            causes=causes,
            recommended_actions=actions,
            severity=severity,
        )
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Deployment gate evaluation."""
|
|
2
|
+
|
|
3
|
+
from nthlayer_workers.observe.gate.conditions import (
|
|
4
|
+
get_current_context,
|
|
5
|
+
is_business_hours,
|
|
6
|
+
is_freeze_period,
|
|
7
|
+
is_peak_traffic,
|
|
8
|
+
is_weekday,
|
|
9
|
+
)
|
|
10
|
+
from nthlayer_workers.observe.gate.correlator import (
|
|
11
|
+
CorrelationInput,
|
|
12
|
+
CorrelationResult,
|
|
13
|
+
correlate,
|
|
14
|
+
)
|
|
15
|
+
from nthlayer_workers.observe.gate.evaluator import (
|
|
16
|
+
GateCheckResult,
|
|
17
|
+
check_deploy,
|
|
18
|
+
)
|
|
19
|
+
from nthlayer_workers.observe.gate.policies import (
|
|
20
|
+
ConditionEvaluator,
|
|
21
|
+
EvaluationResult,
|
|
22
|
+
PolicyContext,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"ConditionEvaluator",
|
|
27
|
+
"CorrelationInput",
|
|
28
|
+
"CorrelationResult",
|
|
29
|
+
"EvaluationResult",
|
|
30
|
+
"GateCheckResult",
|
|
31
|
+
"PolicyContext",
|
|
32
|
+
"check_deploy",
|
|
33
|
+
"correlate",
|
|
34
|
+
"get_current_context",
|
|
35
|
+
"is_business_hours",
|
|
36
|
+
"is_freeze_period",
|
|
37
|
+
"is_peak_traffic",
|
|
38
|
+
"is_weekday",
|
|
39
|
+
]
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Built-in condition functions for policy evaluation.
|
|
2
|
+
|
|
3
|
+
Provides time-based, date-based, and service-based conditions.
|
|
4
|
+
All functions are pure when ``now`` is supplied; when it is omitted they read the local clock via ``datetime.now()``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from datetime import date, datetime
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_current_context(
    budget_remaining: float = 100.0,
    budget_consumed: float = 0.0,
    burn_rate: float = 1.0,
    tier: str = "standard",
    environment: str = "prod",
    downstream_count: int = 0,
    high_criticality_downstream: int = 0,
    now: datetime | None = None,
) -> dict[str, Any]:
    """Assemble the flat context dictionary used for condition evaluation.

    Args:
        budget_remaining: Stored under ``"budget_remaining"``.
        budget_consumed: Stored under ``"budget_consumed"``.
        burn_rate: Stored under ``"burn_rate"``.
        tier: Stored under ``"tier"``.
        environment: Stored under both ``"environment"`` and ``"env"``.
        downstream_count: Stored under ``"downstream_count"``.
        high_criticality_downstream: Stored under
            ``"high_criticality_downstream"``.
        now: Moment to derive the calendar fields from; defaults to
            ``datetime.now()``.

    Returns:
        Calendar fields derived from ``now`` followed by the supplied values.
    """
    moment = datetime.now() if now is None else now
    day_index = moment.weekday()

    calendar_fields = {
        "hour": moment.hour,
        "minute": moment.minute,
        "weekday": day_index < 5,  # True for Monday..Friday
        "day_of_week": day_index,
        "date": moment.date().isoformat(),
        "month": moment.month,
        "day": moment.day,
        "year": moment.year,
    }
    service_fields = {
        "budget_remaining": budget_remaining,
        "budget_consumed": budget_consumed,
        "burn_rate": burn_rate,
        "tier": tier,
        "environment": environment,
        "env": environment,  # short alias for the same value
        "downstream_count": downstream_count,
        "high_criticality_downstream": high_criticality_downstream,
    }
    return {**calendar_fields, **service_fields}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def is_business_hours(
    now: datetime | None = None,
    start_hour: int = 9,
    end_hour: int = 17,
) -> bool:
    """Return True on Monday-Friday when ``start_hour <= hour < end_hour``.

    Args:
        now: Moment to test; defaults to ``datetime.now()``.
        start_hour: First hour that counts as business hours (inclusive).
        end_hour: Hour at which business hours end (exclusive).
    """
    moment = datetime.now() if now is None else now
    on_workday = moment.weekday() < 5
    return on_workday and start_hour <= moment.hour < end_hour
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def is_weekday(now: datetime | None = None) -> bool:
    """Return True when ``now`` (default: ``datetime.now()``) is Monday-Friday."""
    moment = datetime.now() if now is None else now
    # weekday() numbers Monday as 0, so Friday is 4.
    return moment.weekday() <= 4
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def is_freeze_period(
    start_date: str,
    end_date: str,
    now: datetime | None = None,
) -> bool:
    """Return True when the date of ``now`` lies in ``[start_date, end_date]``.

    Args:
        start_date: First day of the freeze, as an ISO ``YYYY-MM-DD`` string.
        end_date: Last day of the freeze (inclusive), same format.
        now: Moment to test; defaults to ``datetime.now()``.

    Returns:
        False when outside the window, and also when either date string
        cannot be parsed (the ``ValueError`` is swallowed).
    """
    moment = now if now is not None else datetime.now()
    try:
        first_day, last_day = (
            date.fromisoformat(text) for text in (start_date, end_date)
        )
        return first_day <= moment.date() <= last_day
    except ValueError:
        # Unparseable bounds are treated as "no freeze".
        return False
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def is_peak_traffic(
    now: datetime | None = None,
    peak_hours: list[tuple[int, int]] | None = None,
) -> bool:
    """Check if the current time falls inside any peak-traffic window.

    Args:
        now: Moment to test; defaults to ``datetime.now()``.
        peak_hours: ``(start_hour, end_hour)`` pairs, each a half-open window
            ``[start, end)``. Defaults to ``[(10, 12), (14, 16)]``. A pair
            with ``start > end`` wraps past midnight, e.g. ``(22, 2)`` covers
            22:00-01:59. A pair with ``start == end`` is empty.

    Returns:
        True when the hour of ``now`` is inside at least one window.
    """
    if now is None:
        now = datetime.now()
    if peak_hours is None:
        peak_hours = [(10, 12), (14, 16)]

    hour = now.hour
    for start, end in peak_hours:
        if start <= end:
            if start <= hour < end:
                return True
        elif hour >= start or hour < end:
            # Window wraps midnight: late-evening part or early-morning part.
            return True
    return False
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""Deployment correlation engine — 5-factor weighted scoring.
|
|
2
|
+
|
|
3
|
+
Deterministic heuristic that correlates deployments with error budget burns.
|
|
4
|
+
Adapted from nthlayer.slos.correlator to work with pre-computed inputs
|
|
5
|
+
instead of SLORepository queries.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import math
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
# Confidence thresholds.
# HIGH / MEDIUM / LOW are the inclusive lower bounds that
# CorrelationResult.confidence_label compares against; anything below
# LOW_CONFIDENCE is labelled "NONE".
HIGH_CONFIDENCE = 0.7
MEDIUM_CONFIDENCE = 0.5
LOW_CONFIDENCE = 0.3
# Not referenced in the visible part of this module — presumably the
# confidence at which a correlation should block a deploy; confirm in callers.
BLOCKING_CONFIDENCE = 0.8

# Factor weights applied by correlate() to the five per-factor scores.
# The weights sum to 1.0.
WEIGHTS = {
    "burn_rate": 0.35,
    "proximity": 0.25,
    "magnitude": 0.15,
    "dependency": 0.15,
    "history": 0.10,
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class CorrelationInput:
    """Pre-computed input data for correlation scoring.

    Callers gather this data from assessments or other sources
    before calling the correlator.

    The identity, timing and burn fields are required. The relationship
    flags default to False and the history counters default to 0.
    """

    # Identity of the deployment being scored; copied into the result as-is.
    deployment_id: str
    service: str
    # Timing: the proximity factor uses the absolute gap between these two.
    deploy_time: datetime
    burn_detected_at: datetime  # when burn was first detected (for proximity scoring)
    # Burn figures: the burn-rate factor compares after vs. before.
    burn_rate_before: float  # burn rate per minute before deploy window
    burn_rate_after: float  # burn rate per minute after deploy window
    burn_minutes: float  # total burn in after window
    # Relationship flags read by the dependency factor.
    is_same_service: bool = False  # deployment targets same service as affected SLO
    is_direct_upstream: bool = False  # deployment targets direct upstream dependency
    is_transitive_upstream: bool = False  # deployment targets transitive upstream
    is_yaml_downstream: bool = False  # service listed in YAML downstream_services
    # History counters: the history factor is prior_correlations / recent_deploy_count.
    recent_deploy_count: int = 0  # total deploys for service in history window
    prior_correlations: int = 0  # prior medium+ correlations in history window
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class CorrelationResult:
    """Outcome of scoring one deployment against one burn."""

    deployment_id: str
    service: str
    burn_minutes: float
    confidence: float  # weighted sum of the per-factor scores
    method: str = "time_window_analysis"
    details: dict[str, Any] = field(default_factory=dict)  # per-factor breakdown

    @property
    def confidence_label(self) -> str:
        """Return "HIGH", "MEDIUM", "LOW" or "NONE" for ``confidence``.

        Each band's lower bound is inclusive; the first band whose bound is
        met wins.
        """
        bands = (
            (HIGH_CONFIDENCE, "HIGH"),
            (MEDIUM_CONFIDENCE, "MEDIUM"),
            (LOW_CONFIDENCE, "LOW"),
        )
        for floor, label in bands:
            if self.confidence >= floor:
                return label
        return "NONE"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def correlate(inp: CorrelationInput) -> CorrelationResult:
    """Score how likely a deployment caused an error-budget burn.

    Purely computational: every value comes from ``inp``; nothing is
    queried or awaited.

    The confidence is the weighted sum of five factor scores (weights from
    ``WEIGHTS``):
    - burn_rate (0.35): ratio of post-deploy to pre-deploy burn rate
    - proximity (0.25): exponential decay of the deploy/detection time gap
      (``exp(-minutes / 30)``)
    - magnitude (0.15): absolute burn in minutes
    - dependency (0.15): relationship between deploying and affected service
    - history (0.10): share of recent deploys with a prior correlation

    Returns:
        A ``CorrelationResult`` carrying the confidence and, in ``details``,
        the two input burn rates plus each factor score.
    """
    # Insertion order matters: the weighted terms are added in this order.
    factor_scores = {
        "burn_rate": _calculate_burn_rate_score(inp.burn_rate_before, inp.burn_rate_after),
        "proximity": _calculate_proximity_score(inp.deploy_time, inp.burn_detected_at),
        "magnitude": _calculate_magnitude_score(inp.burn_minutes),
        "dependency": _calculate_dependency_score(inp),
        "history": _calculate_history_score(inp.recent_deploy_count, inp.prior_correlations),
    }

    # Accumulate left to right with plain additions so the floating-point
    # result matches a hand-written a + b + c + d + e expression.
    weighted_terms = [WEIGHTS[name] * score for name, score in factor_scores.items()]
    confidence = weighted_terms[0]
    for term in weighted_terms[1:]:
        confidence = confidence + term

    breakdown = {
        "burn_rate_before": inp.burn_rate_before,
        "burn_rate_after": inp.burn_rate_after,
        "burn_rate_score": factor_scores["burn_rate"],
        "proximity_score": factor_scores["proximity"],
        "magnitude_score": factor_scores["magnitude"],
        "dependency_score": factor_scores["dependency"],
        "history_score": factor_scores["history"],
    }
    return CorrelationResult(
        deployment_id=inp.deployment_id,
        service=inp.service,
        burn_minutes=inp.burn_minutes,
        confidence=confidence,
        details=breakdown,
    )
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _calculate_burn_rate_score(before_rate: float, after_rate: float) -> float:
|
|
120
|
+
"""Spike ratio: 5x or more = 1.0. No baseline → absolute rate."""
|
|
121
|
+
if before_rate == 0:
|
|
122
|
+
return min(after_rate / 0.1, 1.0)
|
|
123
|
+
return min((after_rate / before_rate) / 5.0, 1.0)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _calculate_proximity_score(deployed_at: datetime, burn_detected_at: datetime) -> float:
    """Score how close in time the burn detection is to the deployment.

    Exponential decay ``exp(-elapsed_minutes / 30)``. The 30 is the decay
    time constant, not the half-life: the score is about 0.37 after 30
    minutes and halves roughly every 21 minutes (30 * ln 2).

    The gap is taken as an absolute value, so a burn detected before the
    deployment scores the same as one detected equally long after it.
    NOTE(review): confirm that scoring pre-deploy burns is intended.

    Returns:
        A value in (0.0, 1.0]; 1.0 when both timestamps are equal.
    """
    # Absolute gap between the two timestamps, converted to minutes.
    elapsed = abs((burn_detected_at - deployed_at).total_seconds()) / 60.0
    return math.exp(-elapsed / 30.0)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _calculate_magnitude_score(burn_minutes: float) -> float:
|
|
133
|
+
"""Absolute burn: 10+ minutes = 1.0."""
|
|
134
|
+
return min(burn_minutes / 10.0, 1.0)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _calculate_dependency_score(inp: CorrelationInput) -> float:
|
|
138
|
+
"""Relationship score: same=1.0, direct upstream=1.0, transitive=0.4, yaml=0.6, none=0.0."""
|
|
139
|
+
if inp.is_same_service:
|
|
140
|
+
return 1.0
|
|
141
|
+
if inp.is_direct_upstream:
|
|
142
|
+
return 1.0
|
|
143
|
+
if inp.is_transitive_upstream:
|
|
144
|
+
return 0.4
|
|
145
|
+
if inp.is_yaml_downstream:
|
|
146
|
+
return 0.6
|
|
147
|
+
return 0.0
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _calculate_history_score(recent_deploy_count: int, prior_correlations: int) -> float:
|
|
151
|
+
"""Repeat offender penalty: fraction of recent deploys with medium+ correlation."""
|
|
152
|
+
if recent_deploy_count == 0:
|
|
153
|
+
return 0.0
|
|
154
|
+
return min(prior_correlations / recent_deploy_count, 1.0)
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""Deployment gate evaluation — decides APPROVED/WARNING/BLOCKED from assessments.
|
|
2
|
+
|
|
3
|
+
Reads slo_status assessments to determine error budget status,
|
|
4
|
+
then applies tier-based thresholds to make gate decisions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from nthlayer_common.gate_models import GatePolicy, GateResult
|
|
13
|
+
from nthlayer_common.tiers import TIER_CONFIGS
|
|
14
|
+
|
|
15
|
+
from nthlayer_workers.observe.store import AssessmentFilter, AssessmentStore
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Per-tier default gate thresholds, copied from the shared tier configuration.
# Each entry holds the tier's "warning" and "blocking" error-budget percentages.
THRESHOLDS: dict[str, dict[str, float | None]] = {
    tier_name: {
        "warning": tier_cfg.error_budget_warning_pct,
        "blocking": tier_cfg.error_budget_blocking_pct,
    }
    for tier_name, tier_cfg in TIER_CONFIGS.items()
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class GateCheckResult:
    """Result of a deployment gate check."""

    # Service and tier the check was run for, echoed from the caller.
    service: str
    tier: str
    # Gate decision.
    result: GateResult
    # Remaining budget in percent: 100 minus the mean consumption across the
    # service's SLOs, floored at 0. Reported as 100.0 when no assessments exist.
    budget_remaining_pct: float
    # Thresholds the decision was evaluated against (policy override or tier
    # default). blocking_threshold may be None.
    warning_threshold: float
    blocking_threshold: float | None
    # Human-readable explanation and follow-up suggestions.
    message: str
    recommendations: list[str]
    # Number of distinct SLO names that contributed to the decision.
    slo_count: int = 0
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def check_deploy(
    service: str,
    tier: str,
    store: AssessmentStore,
    policy: GatePolicy | None = None,
) -> GateCheckResult:
    """Decide whether ``service`` may deploy, based on stored SLO assessments.

    Queries the ``slo_status`` assessments of the service, keeps the first
    one returned per SLO name, averages their ``percent_consumed`` values and
    evaluates the remaining budget against the tier (or policy) thresholds.

    Args:
        service: Service name
        tier: Service tier (critical, standard, low)
        store: Assessment store with slo_status assessments
        policy: Optional custom GatePolicy to override tier defaults

    Returns:
        GateCheckResult with APPROVED/WARNING/BLOCKED decision. When the
        store has no assessments for the service the gate is APPROVED.
    """
    # NOTE(review): no explicit limit is passed here, so the store's default
    # applies — confirm it cannot truncate away some SLOs.
    history = store.query(AssessmentFilter(service=service, kind="slo_status"))

    if not history:
        # No data at all: the gate fails open.
        return GateCheckResult(
            service=service,
            tier=tier,
            result=GateResult.APPROVED,
            budget_remaining_pct=100.0,
            warning_threshold=_get_warning_threshold(tier, policy),
            blocking_threshold=_get_blocking_threshold(tier, policy),
            message="No SLO assessments found — gate approved by default",
            recommendations=[],
            slo_count=0,
        )

    # First assessment per SLO name wins (assumes newest-first results —
    # TODO confirm against the store).
    newest_by_slo: dict[str, dict[str, Any]] = {}
    for record in history:
        newest_by_slo.setdefault(record.data.get("slo_name", "unknown"), record.data)

    # Collect consumption figures, ignoring SLOs that report none.
    consumed = []
    for payload in newest_by_slo.values():
        value = payload.get("percent_consumed")
        if value is not None:
            consumed.append(value)

    # NOTE(review): a plain mean lets healthy SLOs offset an exhausted one.
    mean_consumed = sum(consumed) / len(consumed) if consumed else 0.0
    remaining_pct = max(0.0, 100.0 - mean_consumed)

    warning_threshold = _get_warning_threshold(tier, policy)
    blocking_threshold = _get_blocking_threshold(tier, policy)

    verdict, message, recommendations = _evaluate_thresholds(
        remaining_pct, warning_threshold, blocking_threshold, tier, policy
    )

    return GateCheckResult(
        service=service,
        tier=tier,
        result=verdict,
        budget_remaining_pct=remaining_pct,
        warning_threshold=warning_threshold,
        blocking_threshold=blocking_threshold,
        message=message,
        recommendations=recommendations,
        slo_count=len(newest_by_slo),
    )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _get_warning_threshold(tier: str, policy: GatePolicy | None) -> float:
    """Resolve the warning threshold used by the gate check.

    Resolution order:

    1. ``policy.warning`` when a policy is given and the value is not ``None``.
    2. The ``"warning"`` entry of ``THRESHOLDS[tier]``; tiers missing from
       ``THRESHOLDS`` use the ``"standard"`` entry instead.
    3. ``20.0`` when that entry is missing or ``None``.

    Args:
        tier: Tier name used to select the default thresholds.
        policy: Optional gate policy whose ``warning`` overrides the defaults.

    Returns:
        The threshold that the remaining budget percentage is compared against.
    """
    if policy and policy.warning is not None:
        return policy.warning

    defaults = THRESHOLDS.get(tier, THRESHOLDS["standard"])
    tier_warning = defaults.get("warning")
    # Test against None instead of truthiness: a configured threshold of 0 is a
    # legitimate value and must not be silently replaced by the 20.0 fallback.
    # This mirrors the `is not None` check applied to the policy override above.
    return 20.0 if tier_warning is None else tier_warning
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _get_blocking_threshold(tier: str, policy: GatePolicy | None) -> float | None:
    """Resolve the blocking threshold used by the gate check.

    A non-``None`` ``policy.blocking`` wins. Otherwise the ``"blocking"`` entry
    of the tier's defaults in ``THRESHOLDS`` is returned, with tiers missing
    from ``THRESHOLDS`` using the ``"standard"`` entry.

    Args:
        tier: Tier name used to select the default thresholds.
        policy: Optional gate policy whose ``blocking`` overrides the defaults.

    Returns:
        The blocking threshold, or ``None`` when neither the policy nor the
        tier defaults provide one.
    """
    override = policy.blocking if policy else None
    if override is not None:
        return override

    # Look up the fallback first so a missing "standard" entry still raises,
    # even when `tier` itself is present.
    standard_defaults = THRESHOLDS["standard"]
    tier_defaults = THRESHOLDS.get(tier, standard_defaults)
    return tier_defaults.get("blocking")
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _evaluate_thresholds(
    budget_remaining_pct: float,
    warning_threshold: float,
    blocking_threshold: float | None,
    tier: str,
    policy: GatePolicy | None,
) -> tuple[GateResult, str, list[str]]:
    """Turn the remaining error budget into a gate decision.

    Checks run in this order and the first match wins:

    1. Budget exhausted (``<= 0``) and the policy lists an exhaustion action:
       ``"freeze_deploys"`` blocks, ``"require_approval"`` warns. Any other
       action falls through to the threshold checks below.
    2. A blocking threshold is set and the budget is at or below it: blocked.
    3. The budget is at or below the warning threshold: warning.
    4. Otherwise: approved.

    Args:
        budget_remaining_pct: Remaining error budget, as a percentage.
        warning_threshold: Remaining percentage at or below which to warn.
        blocking_threshold: Remaining percentage at or below which to block;
            ``None`` disables the blocking check.
        tier: Accepted for interface compatibility; not consulted here.
        policy: Optional gate policy; only ``on_exhausted`` is read.

    Returns:
        A ``(result, message, recommendations)`` tuple.
    """
    # 1. Policy-driven handling of a fully exhausted budget.
    if budget_remaining_pct <= 0 and policy and policy.on_exhausted:
        exhausted_actions = policy.on_exhausted
        if "freeze_deploys" in exhausted_actions:
            return (
                GateResult.BLOCKED,
                "Error budget exhausted (0% remaining). Deployment frozen per policy.",
                ["Wait for error budget to recover before deploying"],
            )
        if "require_approval" in exhausted_actions:
            return (
                GateResult.WARNING,
                "Error budget exhausted. Manual approval required per policy.",
                ["Get explicit approval before proceeding"],
            )

    # 2. Blocking threshold (skipped entirely when none is configured).
    blocking_enabled = blocking_threshold is not None
    if blocking_enabled and budget_remaining_pct <= blocking_threshold:
        blocked_message = (
            f"Error budget critical: {budget_remaining_pct:.1f}% remaining "
            f"(blocking threshold: {blocking_threshold}%)"
        )
        blocked_advice = [
            f"Budget below blocking threshold ({blocking_threshold}%)",
            "Investigate ongoing issues before deploying",
            "Consider waiting for budget recovery",
        ]
        return GateResult.BLOCKED, blocked_message, blocked_advice

    # 3. Warning threshold.
    if budget_remaining_pct <= warning_threshold:
        warning_message = (
            f"Error budget low: {budget_remaining_pct:.1f}% remaining "
            f"(warning threshold: {warning_threshold}%)"
        )
        warning_advice = [
            f"Budget below warning threshold ({warning_threshold}%)",
            "Monitor closely after deployment",
        ]
        return GateResult.WARNING, warning_message, warning_advice

    # 4. Nothing tripped: the deployment is approved.
    healthy_message = f"Error budget healthy: {budget_remaining_pct:.1f}% remaining"
    return GateResult.APPROVED, healthy_message, []
|