nthlayer-workers 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. nthlayer_workers/__init__.py +5 -0
  2. nthlayer_workers/cli.py +234 -0
  3. nthlayer_workers/correlate/__init__.py +1 -0
  4. nthlayer_workers/correlate/cli.py +847 -0
  5. nthlayer_workers/correlate/config.py +111 -0
  6. nthlayer_workers/correlate/correlation/__init__.py +1 -0
  7. nthlayer_workers/correlate/correlation/changes.py +87 -0
  8. nthlayer_workers/correlate/correlation/dedup.py +62 -0
  9. nthlayer_workers/correlate/correlation/engine.py +244 -0
  10. nthlayer_workers/correlate/correlation/temporal.py +79 -0
  11. nthlayer_workers/correlate/correlation/topology.py +104 -0
  12. nthlayer_workers/correlate/ingestion/__init__.py +1 -0
  13. nthlayer_workers/correlate/ingestion/protocol.py +10 -0
  14. nthlayer_workers/correlate/ingestion/severity.py +18 -0
  15. nthlayer_workers/correlate/ingestion/webhook.py +197 -0
  16. nthlayer_workers/correlate/notifications.py +85 -0
  17. nthlayer_workers/correlate/prometheus.py +234 -0
  18. nthlayer_workers/correlate/reasoning.py +375 -0
  19. nthlayer_workers/correlate/session.py +189 -0
  20. nthlayer_workers/correlate/snapshot/__init__.py +1 -0
  21. nthlayer_workers/correlate/snapshot/generator.py +170 -0
  22. nthlayer_workers/correlate/snapshot/model.py +177 -0
  23. nthlayer_workers/correlate/snapshot/token.py +14 -0
  24. nthlayer_workers/correlate/state.py +88 -0
  25. nthlayer_workers/correlate/store/__init__.py +5 -0
  26. nthlayer_workers/correlate/store/protocol.py +48 -0
  27. nthlayer_workers/correlate/store/sqlite.py +443 -0
  28. nthlayer_workers/correlate/summary.py +180 -0
  29. nthlayer_workers/correlate/traces/__init__.py +1 -0
  30. nthlayer_workers/correlate/traces/protocol.py +120 -0
  31. nthlayer_workers/correlate/traces/tempo.py +667 -0
  32. nthlayer_workers/correlate/traces/topology.py +39 -0
  33. nthlayer_workers/correlate/types.py +77 -0
  34. nthlayer_workers/correlate/worker.py +630 -0
  35. nthlayer_workers/learn/__init__.py +5 -0
  36. nthlayer_workers/learn/__main__.py +5 -0
  37. nthlayer_workers/learn/cli.py +164 -0
  38. nthlayer_workers/learn/retrospective.py +381 -0
  39. nthlayer_workers/learn/trends.py +102 -0
  40. nthlayer_workers/learn/worker.py +366 -0
  41. nthlayer_workers/measure/__init__.py +3 -0
  42. nthlayer_workers/measure/__main__.py +5 -0
  43. nthlayer_workers/measure/_parsing.py +15 -0
  44. nthlayer_workers/measure/adapters/__init__.py +0 -0
  45. nthlayer_workers/measure/adapters/_util.py +24 -0
  46. nthlayer_workers/measure/adapters/devin.py +119 -0
  47. nthlayer_workers/measure/adapters/gastown.py +88 -0
  48. nthlayer_workers/measure/adapters/prometheus.py +277 -0
  49. nthlayer_workers/measure/adapters/protocol.py +20 -0
  50. nthlayer_workers/measure/adapters/webhook.py +161 -0
  51. nthlayer_workers/measure/api/__init__.py +0 -0
  52. nthlayer_workers/measure/api/normalise.py +50 -0
  53. nthlayer_workers/measure/api/queue.py +243 -0
  54. nthlayer_workers/measure/api/response.py +51 -0
  55. nthlayer_workers/measure/api/server.py +504 -0
  56. nthlayer_workers/measure/calibration/__init__.py +0 -0
  57. nthlayer_workers/measure/calibration/loop.py +62 -0
  58. nthlayer_workers/measure/calibration/slos.py +212 -0
  59. nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
  60. nthlayer_workers/measure/cli.py +753 -0
  61. nthlayer_workers/measure/config.py +191 -0
  62. nthlayer_workers/measure/detection/__init__.py +6 -0
  63. nthlayer_workers/measure/detection/detector.py +82 -0
  64. nthlayer_workers/measure/detection/protocol.py +29 -0
  65. nthlayer_workers/measure/governance/__init__.py +0 -0
  66. nthlayer_workers/measure/governance/engine.py +163 -0
  67. nthlayer_workers/measure/manifest.py +77 -0
  68. nthlayer_workers/measure/notifications.py +53 -0
  69. nthlayer_workers/measure/pipeline/__init__.py +0 -0
  70. nthlayer_workers/measure/pipeline/evaluator.py +155 -0
  71. nthlayer_workers/measure/pipeline/router.py +160 -0
  72. nthlayer_workers/measure/store/__init__.py +0 -0
  73. nthlayer_workers/measure/store/protocol.py +38 -0
  74. nthlayer_workers/measure/store/sqlite.py +276 -0
  75. nthlayer_workers/measure/telemetry.py +116 -0
  76. nthlayer_workers/measure/tiering/__init__.py +0 -0
  77. nthlayer_workers/measure/tiering/classifier.py +58 -0
  78. nthlayer_workers/measure/tiering/promotion.py +118 -0
  79. nthlayer_workers/measure/trends/__init__.py +0 -0
  80. nthlayer_workers/measure/trends/tracker.py +72 -0
  81. nthlayer_workers/measure/types.py +75 -0
  82. nthlayer_workers/measure/worker.py +439 -0
  83. nthlayer_workers/observe/__init__.py +25 -0
  84. nthlayer_workers/observe/__main__.py +5 -0
  85. nthlayer_workers/observe/api/__init__.py +1 -0
  86. nthlayer_workers/observe/assessment.py +95 -0
  87. nthlayer_workers/observe/cli.py +737 -0
  88. nthlayer_workers/observe/config.py +11 -0
  89. nthlayer_workers/observe/db/__init__.py +1 -0
  90. nthlayer_workers/observe/decision_records.py +220 -0
  91. nthlayer_workers/observe/dependencies/__init__.py +18 -0
  92. nthlayer_workers/observe/dependencies/discovery.py +294 -0
  93. nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
  94. nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
  95. nthlayer_workers/observe/dependencies/providers/base.py +76 -0
  96. nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
  97. nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
  98. nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
  99. nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
  100. nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
  101. nthlayer_workers/observe/deployments/__init__.py +1 -0
  102. nthlayer_workers/observe/discovery/__init__.py +14 -0
  103. nthlayer_workers/observe/discovery/classifier.py +66 -0
  104. nthlayer_workers/observe/discovery/client.py +189 -0
  105. nthlayer_workers/observe/discovery/models.py +53 -0
  106. nthlayer_workers/observe/drift/__init__.py +26 -0
  107. nthlayer_workers/observe/drift/analyzer.py +383 -0
  108. nthlayer_workers/observe/drift/models.py +174 -0
  109. nthlayer_workers/observe/drift/patterns.py +88 -0
  110. nthlayer_workers/observe/explanation.py +118 -0
  111. nthlayer_workers/observe/gate/__init__.py +39 -0
  112. nthlayer_workers/observe/gate/conditions.py +92 -0
  113. nthlayer_workers/observe/gate/correlator.py +154 -0
  114. nthlayer_workers/observe/gate/evaluator.py +192 -0
  115. nthlayer_workers/observe/gate/policies.py +226 -0
  116. nthlayer_workers/observe/gate_adapter.py +40 -0
  117. nthlayer_workers/observe/incident.py +36 -0
  118. nthlayer_workers/observe/portfolio/__init__.py +17 -0
  119. nthlayer_workers/observe/portfolio/aggregator.py +168 -0
  120. nthlayer_workers/observe/portfolio/scorer.py +13 -0
  121. nthlayer_workers/observe/slo/__init__.py +19 -0
  122. nthlayer_workers/observe/slo/collector.py +235 -0
  123. nthlayer_workers/observe/slo/spec_loader.py +40 -0
  124. nthlayer_workers/observe/sqlite_store.py +152 -0
  125. nthlayer_workers/observe/store.py +92 -0
  126. nthlayer_workers/observe/verification/__init__.py +22 -0
  127. nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
  128. nthlayer_workers/observe/verification/extractor.py +127 -0
  129. nthlayer_workers/observe/verification/models.py +101 -0
  130. nthlayer_workers/observe/verification/verifier.py +111 -0
  131. nthlayer_workers/observe/worker.py +332 -0
  132. nthlayer_workers/respond/__init__.py +2 -0
  133. nthlayer_workers/respond/__main__.py +4 -0
  134. nthlayer_workers/respond/agents/__init__.py +0 -0
  135. nthlayer_workers/respond/agents/base.py +556 -0
  136. nthlayer_workers/respond/agents/communication.py +115 -0
  137. nthlayer_workers/respond/agents/investigation.py +124 -0
  138. nthlayer_workers/respond/agents/remediation.py +219 -0
  139. nthlayer_workers/respond/agents/triage.py +132 -0
  140. nthlayer_workers/respond/cli.py +772 -0
  141. nthlayer_workers/respond/config.py +135 -0
  142. nthlayer_workers/respond/context_store.py +256 -0
  143. nthlayer_workers/respond/coordinator.py +487 -0
  144. nthlayer_workers/respond/metrics.py +104 -0
  145. nthlayer_workers/respond/notification_backends/__init__.py +1 -0
  146. nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
  147. nthlayer_workers/respond/notification_backends/protocol.py +59 -0
  148. nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
  149. nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
  150. nthlayer_workers/respond/notifications.py +247 -0
  151. nthlayer_workers/respond/oncall/__init__.py +1 -0
  152. nthlayer_workers/respond/oncall/escalation.py +103 -0
  153. nthlayer_workers/respond/oncall/runner.py +193 -0
  154. nthlayer_workers/respond/oncall/schedule.py +243 -0
  155. nthlayer_workers/respond/safe_actions/__init__.py +0 -0
  156. nthlayer_workers/respond/safe_actions/actions.py +139 -0
  157. nthlayer_workers/respond/safe_actions/registry.py +171 -0
  158. nthlayer_workers/respond/safe_actions/webhook.py +194 -0
  159. nthlayer_workers/respond/server.py +357 -0
  160. nthlayer_workers/respond/sre/__init__.py +1 -0
  161. nthlayer_workers/respond/sre/brief.py +175 -0
  162. nthlayer_workers/respond/sre/delegation.py +101 -0
  163. nthlayer_workers/respond/sre/post_incident.py +146 -0
  164. nthlayer_workers/respond/sre/shift_report.py +129 -0
  165. nthlayer_workers/respond/sre/suppression.py +91 -0
  166. nthlayer_workers/respond/types.py +109 -0
  167. nthlayer_workers/respond/verdict_submission.py +56 -0
  168. nthlayer_workers/respond/worker.py +533 -0
  169. nthlayer_workers/respond/worker_helpers.py +140 -0
  170. nthlayer_workers/runner.py +198 -0
  171. nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
  172. nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
  173. nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
  174. nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
  175. nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,88 @@
1
+ """Pattern detection for drift analysis.
2
+
3
+ Classifies drift patterns beyond simple linear trends:
4
+ - Gradual decline/improvement
5
+ - Step changes (sudden drops or improvements)
6
+ - Volatile patterns
7
+ - Stable (no significant trend)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from datetime import datetime
13
+
14
+ import numpy as np
15
+
16
+ from nthlayer_workers.observe.drift.models import DriftPattern
17
+
18
+
19
class PatternDetector:
    """Classify a time series of measurements into a ``DriftPattern``.

    The classification is rule based and uses four tunable thresholds:

    * ``step_change_threshold`` - minimum jump between two consecutive
      samples that counts as a step change.
    * ``volatility_variance_threshold`` / ``volatility_r_squared_threshold`` -
      a series is volatile when its variance is above the first and the
      caller-supplied R² is below the second.
    * ``slope_significance_threshold`` - minimum absolute per-week slope
      that counts as a gradual trend.
    """

    def __init__(
        self,
        step_change_threshold: float = 0.05,
        volatility_variance_threshold: float = 0.01,
        volatility_r_squared_threshold: float = 0.3,
        slope_significance_threshold: float = 0.001,
    ):
        self.step_change_threshold = step_change_threshold
        self.volatility_variance_threshold = volatility_variance_threshold
        self.volatility_r_squared_threshold = volatility_r_squared_threshold
        self.slope_significance_threshold = slope_significance_threshold

    def detect(
        self,
        data: list[tuple[datetime, float]],
        slope_per_second: float,
        r_squared: float,
    ) -> DriftPattern:
        """Return the drift pattern for ``data``.

        Rules are applied in priority order: step change, then volatility,
        then the sign and size of the weekly slope.

        Args:
            data: ``(timestamp, value)`` samples. Consecutive entries are
                compared, so they are presumably in chronological order.
            slope_per_second: Pre-computed trend slope, in value units per second.
            r_squared: Pre-computed goodness of fit of that trend.

        Returns:
            ``STABLE`` when fewer than two samples are supplied, otherwise the
            first matching pattern.
        """
        if len(data) < 2:
            return DriftPattern.STABLE

        samples = np.array([point[1] for point in data])
        spread = float(np.var(samples))

        # A sudden jump takes precedence over every other classification.
        jump = self._detect_step_change(data)
        if jump is not None:
            return jump

        # Poor linear fit combined with high variance means noise, not trend.
        if (
            r_squared < self.volatility_r_squared_threshold
            and spread > self.volatility_variance_threshold
        ):
            return DriftPattern.VOLATILE

        # Express the slope per week so the threshold is human-scale.
        per_week = slope_per_second * (7 * 24 * 60 * 60)

        if abs(per_week) < self.slope_significance_threshold:
            return DriftPattern.STABLE
        if per_week < 0:
            return DriftPattern.GRADUAL_DECLINE
        return DriftPattern.GRADUAL_IMPROVEMENT

    def _detect_step_change(
        self,
        data: list[tuple[datetime, float]],
    ) -> DriftPattern | None:
        """Return the first step change found between adjacent samples.

        Two neighbouring samples form a step when they are less than 1.5 days
        apart and their values differ by more than ``step_change_threshold``.
        Returns ``None`` when no such pair exists or there are fewer than two
        samples.
        """
        if len(data) < 2:
            return None

        window_seconds = 86400 * 1.5

        for earlier, later in zip(data, data[1:]):
            gap_seconds = (later[0] - earlier[0]).total_seconds()
            delta = later[1] - earlier[1]

            if gap_seconds >= window_seconds:
                continue
            if delta < -self.step_change_threshold:
                return DriftPattern.STEP_CHANGE_DOWN
            if delta > self.step_change_threshold:
                return DriftPattern.STEP_CHANGE_UP

        return None
88
+
@@ -0,0 +1,118 @@
1
+ """ExplanationEngine — build human-readable budget explanations from assessments.
2
+
3
+ Deterministic. No LLM. Pure arithmetic on assessment data.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from nthlayer_common.explanation import BudgetExplanation
8
+ from nthlayer_workers.observe.assessment import Assessment
9
+ from nthlayer_workers.observe.store import AssessmentFilter, AssessmentStore
10
+
11
+ # Maps SLO assessment status to explanation severity.
12
+ # ERROR/NO_DATA are data-quality issues (e.g. Prometheus unreachable),
13
+ # not budget concerns — "warning" severity since they need attention
14
+ # but don't indicate budget consumption.
15
+ _STATUS_SEVERITY = {
16
+ "EXHAUSTED": "critical",
17
+ "CRITICAL": "critical",
18
+ "WARNING": "warning",
19
+ "ERROR": "warning",
20
+ "HEALTHY": "info",
21
+ "NO_DATA": "info",
22
+ "UNKNOWN": "info",
23
+ }
24
+
25
+
26
class ExplanationEngine:
    """Build budget explanations from the assessment store.

    All text is derived arithmetically from the ``data`` payload of
    ``slo_status`` assessments; nothing here calls out to external services.
    """

    def explain_service(
        self,
        service: str,
        store: AssessmentStore,
        slo_filter: str | None = None,
    ) -> list[BudgetExplanation]:
        """Build one explanation per SLO of ``service``.

        Args:
            service: Service whose ``slo_status`` assessments are queried.
            store: Assessment store to query.
            slo_filter: When given (and non-empty), only the SLO with exactly
                this name is explained.

        Returns:
            One ``BudgetExplanation`` per distinct SLO name, in query order.
        """
        assessments = store.query(
            AssessmentFilter(service=service, kind="slo_status", limit=0)
        )

        # Keep the first assessment seen for each SLO name. This is the latest
        # one as long as the store returns rows newest-first (assumed here;
        # verify against the store implementation).
        seen: set[str] = set()
        latest: list[Assessment] = []
        for a in assessments:
            slo_name = a.data.get("slo_name", "unknown")
            if slo_name not in seen:
                seen.add(slo_name)
                latest.append(a)

        if slo_filter:
            latest = [a for a in latest if a.data.get("slo_name") == slo_filter]

        return [self._explain_slo(service, a) for a in latest]

    def _explain_slo(self, service: str, assessment: Assessment) -> BudgetExplanation:
        """Turn a single ``slo_status`` assessment into a ``BudgetExplanation``.

        Missing or ``None`` numeric fields are treated as 0.0 for display.
        The "SLI below target" cause is only emitted when the assessment
        actually carries a ``current_sli`` value, so an absent measurement is
        not reported as an SLI of zero.
        """
        data = assessment.data
        slo_name = data.get("slo_name", "unknown")
        status = data.get("status", "UNKNOWN")
        pct = data.get("percent_consumed", 0.0) or 0.0
        burned = data.get("burned_minutes", 0.0) or 0.0
        total = data.get("total_budget_minutes", 0.0) or 0.0
        # Remember whether an SLI was reported before coercing it for display.
        sli_reported = data.get("current_sli") is not None
        sli = data.get("current_sli", 0.0) or 0.0
        obj = data.get("objective", 0.0) or 0.0
        window = data.get("window", "30d")
        # Never show a negative remainder when the budget is overspent.
        remaining = max(0, total - burned)
        severity = _STATUS_SEVERITY.get(status, "info")

        # Headline
        status_desc = {
            "EXHAUSTED": "budget exhausted",
            "CRITICAL": "near exhaustion",
            "WARNING": "approaching threshold",
        }
        desc = status_desc.get(status, "within budget")
        headline = f"{slo_name}: {pct:.0f}% consumed — {desc} ({status})"

        # Body
        body = (
            f"Window: {window}. "
            f"Budget: {total:.0f} min total, {burned:.0f} min consumed, "
            f"{remaining:.0f} min remaining. "
            f"Current SLI: {sli:.4f} (target: {obj:.4f})."
        )

        # Causes
        causes: list[str] = []
        if pct > 80:
            causes.append(
                "Budget consumption exceeds 80% — sustained error rate above target"
            )
        if sli_reported and obj > 0 and sli < obj:
            gap = (obj - sli) * 100
            causes.append(
                f"Current SLI ({sli:.4f}) is {gap:.2f}pp below target ({obj:.4f})"
            )

        # Actions
        actions: list[str] = []
        if status == "EXHAUSTED":
            actions.append(
                "Deployment gate will block — resolve underlying issue before deploying"
            )
        if status in ("CRITICAL", "EXHAUSTED"):
            actions.append("Investigate root cause of elevated error rate")
            actions.append("Consider freezing deployments until budget recovers")
        elif status == "WARNING":
            actions.append(
                "Monitor trend — investigate if consumption continues to rise"
            )

        return BudgetExplanation(
            service=service,
            slo_name=slo_name,
            headline=headline,
            body=body,
            causes=causes,
            recommended_actions=actions,
            severity=severity,
        )
@@ -0,0 +1,39 @@
1
+ """Deployment gate evaluation."""
2
+
3
+ from nthlayer_workers.observe.gate.conditions import (
4
+ get_current_context,
5
+ is_business_hours,
6
+ is_freeze_period,
7
+ is_peak_traffic,
8
+ is_weekday,
9
+ )
10
+ from nthlayer_workers.observe.gate.correlator import (
11
+ CorrelationInput,
12
+ CorrelationResult,
13
+ correlate,
14
+ )
15
+ from nthlayer_workers.observe.gate.evaluator import (
16
+ GateCheckResult,
17
+ check_deploy,
18
+ )
19
+ from nthlayer_workers.observe.gate.policies import (
20
+ ConditionEvaluator,
21
+ EvaluationResult,
22
+ PolicyContext,
23
+ )
24
+
25
+ __all__ = [
26
+ "ConditionEvaluator",
27
+ "CorrelationInput",
28
+ "CorrelationResult",
29
+ "EvaluationResult",
30
+ "GateCheckResult",
31
+ "PolicyContext",
32
+ "check_deploy",
33
+ "correlate",
34
+ "get_current_context",
35
+ "is_business_hours",
36
+ "is_freeze_period",
37
+ "is_peak_traffic",
38
+ "is_weekday",
39
+ ]
@@ -0,0 +1,92 @@
1
+ """Built-in condition functions for policy evaluation.
2
+
3
+ Provides time-based, date-based, and service-based conditions.
4
+ All functions are pure — no external data access.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from datetime import date, datetime
10
+ from typing import Any
11
+
12
+
13
def get_current_context(
    budget_remaining: float = 100.0,
    budget_consumed: float = 0.0,
    burn_rate: float = 1.0,
    tier: str = "standard",
    environment: str = "prod",
    downstream_count: int = 0,
    high_criticality_downstream: int = 0,
    now: datetime | None = None,
) -> dict[str, Any]:
    """Assemble the variable dictionary used when evaluating policy conditions.

    The result combines calendar/clock fields derived from ``now`` (which
    defaults to ``datetime.now()``) with the budget and service values passed
    in. ``environment`` is exposed under both ``"environment"`` and ``"env"``.
    """
    moment = now if now is not None else datetime.now()
    day_index = moment.weekday()

    clock_fields: dict[str, Any] = {
        "hour": moment.hour,
        "minute": moment.minute,
        "weekday": day_index < 5,  # True for Monday through Friday
        "day_of_week": day_index,
        "date": moment.date().isoformat(),
        "month": moment.month,
        "day": moment.day,
        "year": moment.year,
    }
    service_fields: dict[str, Any] = {
        "budget_remaining": budget_remaining,
        "budget_consumed": budget_consumed,
        "burn_rate": burn_rate,
        "tier": tier,
        "environment": environment,
        "env": environment,
        "downstream_count": downstream_count,
        "high_criticality_downstream": high_criticality_downstream,
    }
    return {**clock_fields, **service_fields}
45
+
46
+
47
def is_business_hours(
    now: datetime | None = None,
    start_hour: int = 9,
    end_hour: int = 17,
) -> bool:
    """Return True when ``now`` is Monday-Friday and within the given hours.

    The hour range is half-open: ``start_hour`` is included, ``end_hour`` is
    not. ``now`` defaults to ``datetime.now()``.
    """
    moment = datetime.now() if now is None else now
    is_workday = moment.weekday() < 5
    return is_workday and start_hour <= moment.hour < end_hour
58
+
59
+
60
def is_weekday(now: datetime | None = None) -> bool:
    """Return True when ``now`` (default: ``datetime.now()``) is Monday-Friday."""
    moment = datetime.now() if now is None else now
    # weekday() numbers Monday as 0, so Saturday and Sunday are 5 and 6.
    return moment.weekday() < 5
65
+
66
+
67
def is_freeze_period(
    start_date: str,
    end_date: str,
    now: datetime | None = None,
) -> bool:
    """Return True when the date of ``now`` lies in ``[start_date, end_date]``.

    Both bounds are ISO ``YYYY-MM-DD`` strings and are inclusive. A bound that
    cannot be parsed makes the function return False instead of raising.
    ``now`` defaults to ``datetime.now()``.
    """
    moment = datetime.now() if now is None else now
    try:
        first_day = date.fromisoformat(start_date)
        last_day = date.fromisoformat(end_date)
    except ValueError:
        # An unparseable bound means no freeze can be established.
        return False
    return first_day <= moment.date() <= last_day
81
+
82
+
83
def is_peak_traffic(
    now: datetime | None = None,
    peak_hours: list[tuple[int, int]] | None = None,
) -> bool:
    """Return True when the hour of ``now`` falls inside any peak window.

    Each window is a half-open ``(start_hour, end_hour)`` pair. When
    ``peak_hours`` is None the defaults 10-12 and 14-16 are used; an empty
    list means there are no peak windows. ``now`` defaults to
    ``datetime.now()``.
    """
    moment = datetime.now() if now is None else now
    windows = [(10, 12), (14, 16)] if peak_hours is None else peak_hours

    for opens, closes in windows:
        if opens <= moment.hour < closes:
            return True
    return False
@@ -0,0 +1,154 @@
1
+ """Deployment correlation engine — 5-factor weighted scoring.
2
+
3
+ Deterministic heuristic that correlates deployments with error budget burns.
4
+ Adapted from nthlayer.slos.correlator to work with pre-computed inputs
5
+ instead of SLORepository queries.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import math
11
+ from dataclasses import dataclass, field
12
+ from datetime import datetime
13
+ from typing import Any
14
+
15
# Minimum confidence for each label reported by
# CorrelationResult.confidence_label (HIGH / MEDIUM / LOW).
HIGH_CONFIDENCE = 0.7
MEDIUM_CONFIDENCE = 0.5
LOW_CONFIDENCE = 0.3
# Not referenced in this module; presumably the confidence at which callers
# treat a correlation as grounds to block.
BLOCKING_CONFIDENCE = 0.8

# Relative contribution of each scoring factor to the overall confidence.
WEIGHTS = dict(
    burn_rate=0.35,
    proximity=0.25,
    magnitude=0.15,
    dependency=0.15,
    history=0.10,
)
29
+
30
+
31
@dataclass
class CorrelationInput:
    """Everything the correlator needs, gathered ahead of time by the caller.

    The correlator performs no lookups of its own; callers collect these
    values from assessments or other sources and pass them in.
    """

    deployment_id: str
    service: str
    deploy_time: datetime
    # When the burn was first detected (drives the proximity factor).
    burn_detected_at: datetime
    # Burn rate per minute before / after the deploy window.
    burn_rate_before: float
    burn_rate_after: float
    # Total burn observed in the "after" window.
    burn_minutes: float
    # Relationship between the deployed service and the affected SLO's service.
    is_same_service: bool = False  # deployment targets the affected service itself
    is_direct_upstream: bool = False  # deployment targets a direct upstream dependency
    is_transitive_upstream: bool = False  # deployment targets a transitive upstream
    is_yaml_downstream: bool = False  # service is listed in YAML downstream_services
    # History-window counters.
    recent_deploy_count: int = 0  # total deploys for the service
    prior_correlations: int = 0  # prior medium-or-better correlations
52
+
53
+
54
@dataclass
class CorrelationResult:
    """Outcome of scoring one deployment against an error budget burn."""

    deployment_id: str
    service: str
    burn_minutes: float
    confidence: float
    method: str = "time_window_analysis"
    details: dict[str, Any] = field(default_factory=dict)

    @property
    def confidence_label(self) -> str:
        """Bucket ``confidence`` into "HIGH", "MEDIUM", "LOW" or "NONE"."""
        # Ordered from strictest to loosest; the first floor met wins.
        buckets = (
            (HIGH_CONFIDENCE, "HIGH"),
            (MEDIUM_CONFIDENCE, "MEDIUM"),
            (LOW_CONFIDENCE, "LOW"),
        )
        for floor, label in buckets:
            if self.confidence >= floor:
                return label
        return "NONE"
74
+
75
+
76
def correlate(inp: CorrelationInput) -> CorrelationResult:
    """Score how likely a deployment caused an error budget burn.

    Five factor scores are computed from the pre-gathered ``inp`` and
    combined as a weighted sum using ``WEIGHTS``:

    - burn_rate (0.35): spike in burn rate after the deploy
    - proximity (0.25): closeness in time of deploy and burn detection
    - magnitude (0.15): absolute amount of budget burned
    - dependency (0.15): relationship between deployed and affected service
    - history (0.10): share of recent deploys that previously correlated

    Returns:
        A ``CorrelationResult`` whose ``details`` holds the two input burn
        rates and each individual factor score.
    """
    scores = {
        "burn_rate": _calculate_burn_rate_score(inp.burn_rate_before, inp.burn_rate_after),
        "proximity": _calculate_proximity_score(inp.deploy_time, inp.burn_detected_at),
        "magnitude": _calculate_magnitude_score(inp.burn_minutes),
        "dependency": _calculate_dependency_score(inp),
        "history": _calculate_history_score(inp.recent_deploy_count, inp.prior_correlations),
    }

    # Written out term by term so the floating-point summation order is fixed.
    weighted_total = (
        WEIGHTS["burn_rate"] * scores["burn_rate"]
        + WEIGHTS["proximity"] * scores["proximity"]
        + WEIGHTS["magnitude"] * scores["magnitude"]
        + WEIGHTS["dependency"] * scores["dependency"]
        + WEIGHTS["history"] * scores["history"]
    )

    breakdown: dict[str, Any] = {
        "burn_rate_before": inp.burn_rate_before,
        "burn_rate_after": inp.burn_rate_after,
    }
    for factor, value in scores.items():
        breakdown[f"{factor}_score"] = value

    return CorrelationResult(
        deployment_id=inp.deployment_id,
        service=inp.service,
        burn_minutes=inp.burn_minutes,
        confidence=weighted_total,
        details=breakdown,
    )
117
+
118
+
119
+ def _calculate_burn_rate_score(before_rate: float, after_rate: float) -> float:
120
+ """Spike ratio: 5x or more = 1.0. No baseline → absolute rate."""
121
+ if before_rate == 0:
122
+ return min(after_rate / 0.1, 1.0)
123
+ return min((after_rate / before_rate) / 5.0, 1.0)
124
+
125
+
126
+ def _calculate_proximity_score(deployed_at: datetime, burn_detected_at: datetime) -> float:
127
+ """Exponential decay: half-life ~30 minutes."""
128
+ elapsed = abs((burn_detected_at - deployed_at).total_seconds()) / 60.0
129
+ return math.exp(-elapsed / 30.0)
130
+
131
+
132
+ def _calculate_magnitude_score(burn_minutes: float) -> float:
133
+ """Absolute burn: 10+ minutes = 1.0."""
134
+ return min(burn_minutes / 10.0, 1.0)
135
+
136
+
137
+ def _calculate_dependency_score(inp: CorrelationInput) -> float:
138
+ """Relationship score: same=1.0, direct upstream=1.0, transitive=0.4, yaml=0.6, none=0.0."""
139
+ if inp.is_same_service:
140
+ return 1.0
141
+ if inp.is_direct_upstream:
142
+ return 1.0
143
+ if inp.is_transitive_upstream:
144
+ return 0.4
145
+ if inp.is_yaml_downstream:
146
+ return 0.6
147
+ return 0.0
148
+
149
+
150
+ def _calculate_history_score(recent_deploy_count: int, prior_correlations: int) -> float:
151
+ """Repeat offender penalty: fraction of recent deploys with medium+ correlation."""
152
+ if recent_deploy_count == 0:
153
+ return 0.0
154
+ return min(prior_correlations / recent_deploy_count, 1.0)
@@ -0,0 +1,192 @@
1
+ """Deployment gate evaluation — decides APPROVED/WARNING/BLOCKED from assessments.
2
+
3
+ Reads slo_status assessments to determine error budget status,
4
+ then applies tier-based thresholds to make gate decisions.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+ from typing import Any
11
+
12
+ from nthlayer_common.gate_models import GatePolicy, GateResult
13
+ from nthlayer_common.tiers import TIER_CONFIGS
14
+
15
+ from nthlayer_workers.observe.store import AssessmentFilter, AssessmentStore
16
+
17
+
18
# Per-tier default gate thresholds, copied from the shared tier configuration.
# Each entry holds the tier's "warning" and "blocking" error-budget percentages.
THRESHOLDS: dict[str, dict[str, float | None]] = dict(
    (
        tier,
        dict(
            warning=config.error_budget_warning_pct,
            blocking=config.error_budget_blocking_pct,
        ),
    )
    for tier, config in TIER_CONFIGS.items()
)
26
+
27
+
28
@dataclass
class GateCheckResult:
    """Decision produced by ``check_deploy`` together with its supporting data."""

    service: str
    tier: str
    # Gate decision (APPROVED / WARNING / BLOCKED).
    result: GateResult
    # Remaining error budget, as a percentage.
    budget_remaining_pct: float
    # Thresholds the decision was evaluated against; blocking may be absent.
    warning_threshold: float
    blocking_threshold: float | None
    # Human-readable summary and follow-up suggestions.
    message: str
    recommendations: list[str]
    # Number of distinct SLOs that contributed to the decision.
    slo_count: int = 0
41
+
42
+
43
def check_deploy(
    service: str,
    tier: str,
    store: AssessmentStore,
    policy: GatePolicy | None = None,
) -> GateCheckResult:
    """Decide whether a deployment of ``service`` should go ahead.

    The service's ``slo_status`` assessments are read from ``store``, the
    first assessment per SLO name is kept, and the ``percent_consumed`` values
    of those are averaged. The resulting remaining budget is compared against
    the warning/blocking thresholds of ``policy`` or, failing that, of ``tier``.

    Args:
        service: Service name.
        tier: Service tier (critical, standard, low).
        store: Assessment store holding ``slo_status`` assessments.
        policy: Optional custom GatePolicy overriding the tier defaults.

    Returns:
        GateCheckResult carrying an APPROVED/WARNING/BLOCKED decision. When
        the store has no assessments for the service the gate is approved
        with 100% budget remaining.
    """
    history = store.query(AssessmentFilter(service=service, kind="slo_status"))

    if not history:
        # Nothing is known about this service: approve by default.
        return GateCheckResult(
            service=service,
            tier=tier,
            result=GateResult.APPROVED,
            budget_remaining_pct=100.0,
            warning_threshold=_get_warning_threshold(tier, policy),
            blocking_threshold=_get_blocking_threshold(tier, policy),
            message="No SLO assessments found — gate approved by default",
            recommendations=[],
            slo_count=0,
        )

    # First assessment per SLO wins; presumably the store returns newest-first,
    # making this the latest one. Verify against the store implementation.
    latest_by_slo: dict[str, dict[str, Any]] = {}
    for record in history:
        latest_by_slo.setdefault(record.data.get("slo_name", "unknown"), record.data)

    # SLOs without a consumption figure do not take part in the average.
    readings = (payload.get("percent_consumed") for payload in latest_by_slo.values())
    consumed_values = [pct for pct in readings if pct is not None]
    avg_consumed = sum(consumed_values) / len(consumed_values) if consumed_values else 0.0

    # Over-consumption is clamped so the remaining budget never goes negative.
    budget_remaining_pct = max(0.0, 100.0 - avg_consumed)

    warning_threshold = _get_warning_threshold(tier, policy)
    blocking_threshold = _get_blocking_threshold(tier, policy)

    decision, summary, advice = _evaluate_thresholds(
        budget_remaining_pct, warning_threshold, blocking_threshold, tier, policy
    )

    return GateCheckResult(
        service=service,
        tier=tier,
        result=decision,
        budget_remaining_pct=budget_remaining_pct,
        warning_threshold=warning_threshold,
        blocking_threshold=blocking_threshold,
        message=summary,
        recommendations=advice,
        slo_count=len(latest_by_slo),
    )
121
+
122
+
123
+ def _get_warning_threshold(tier: str, policy: GatePolicy | None) -> float:
124
+ """Get warning threshold from policy or tier defaults."""
125
+ if policy and policy.warning is not None:
126
+ return policy.warning
127
+ defaults = THRESHOLDS.get(tier, THRESHOLDS["standard"])
128
+ return defaults.get("warning") or 20.0
129
+
130
+
131
+ def _get_blocking_threshold(tier: str, policy: GatePolicy | None) -> float | None:
132
+ """Get blocking threshold from policy or tier defaults."""
133
+ if policy and policy.blocking is not None:
134
+ return policy.blocking
135
+ defaults = THRESHOLDS.get(tier, THRESHOLDS["standard"])
136
+ return defaults.get("blocking")
137
+
138
+
139
def _evaluate_thresholds(
    budget_remaining_pct: float,
    warning_threshold: float,
    blocking_threshold: float | None,
    tier: str,
    policy: GatePolicy | None,
) -> tuple[GateResult, str, list[str]]:
    """Turn the remaining budget into a gate decision.

    Checks run from most to least severe and the first match is returned:

    1. Budget exhausted and the policy defines ``on_exhausted`` actions:
       "freeze_deploys" blocks, "require_approval" warns.
    2. Remaining budget at or below the blocking threshold (if any): blocked.
    3. Remaining budget at or below the warning threshold: warning.
    4. Otherwise: approved.

    ``tier`` is accepted for interface compatibility and is not used.

    Returns:
        ``(result, message, recommendations)``.
    """
    # 1. Policy-driven handling of a fully exhausted budget.
    if budget_remaining_pct <= 0 and policy and policy.on_exhausted:
        exhaustion_actions = policy.on_exhausted
        if "freeze_deploys" in exhaustion_actions:
            return (
                GateResult.BLOCKED,
                "Error budget exhausted (0% remaining). Deployment frozen per policy.",
                ["Wait for error budget to recover before deploying"],
            )
        if "require_approval" in exhaustion_actions:
            return (
                GateResult.WARNING,
                "Error budget exhausted. Manual approval required per policy.",
                ["Get explicit approval before proceeding"],
            )

    # 2. Blocking threshold; tiers without one can never be blocked here.
    breaches_blocking = (
        blocking_threshold is not None and budget_remaining_pct <= blocking_threshold
    )
    if breaches_blocking:
        return (
            GateResult.BLOCKED,
            f"Error budget critical: {budget_remaining_pct:.1f}% remaining (blocking threshold: {blocking_threshold}%)",
            [
                f"Budget below blocking threshold ({blocking_threshold}%)",
                "Investigate ongoing issues before deploying",
                "Consider waiting for budget recovery",
            ],
        )

    # 3. Warning threshold.
    if budget_remaining_pct <= warning_threshold:
        return (
            GateResult.WARNING,
            f"Error budget low: {budget_remaining_pct:.1f}% remaining (warning threshold: {warning_threshold}%)",
            [
                f"Budget below warning threshold ({warning_threshold}%)",
                "Monitor closely after deployment",
            ],
        )

    # 4. Healthy budget.
    return (
        GateResult.APPROVED,
        f"Error budget healthy: {budget_remaining_pct:.1f}% remaining",
        [],
    )