agentsonar 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentsonar/__init__.py +52 -0
- agentsonar/_core/__init__.py +6 -0
- agentsonar/_core/constants.py +16 -0
- agentsonar/_core/detectors/__init__.py +22 -0
- agentsonar/_core/detectors/alert_state.py +202 -0
- agentsonar/_core/detectors/cycle_detector.py +81 -0
- agentsonar/_core/detectors/rate_limiter.py +297 -0
- agentsonar/_core/detectors/repetitive_detector.py +171 -0
- agentsonar/_core/detectors/scc_analyzer.py +83 -0
- agentsonar/_core/engine.py +960 -0
- agentsonar/_core/graph.py +47 -0
- agentsonar/_core/models.py +35 -0
- agentsonar/_core/noop_engine.py +145 -0
- agentsonar/_core/schema.py +565 -0
- agentsonar/_integrations/__init__.py +8 -0
- agentsonar/_integrations/_safe_engine.py +228 -0
- agentsonar/_integrations/crewai_listener.py +217 -0
- agentsonar/_integrations/langgraph_callback.py +431 -0
- agentsonar/_output/__init__.py +16 -0
- agentsonar/_output/_slug.py +127 -0
- agentsonar/_output/html_report.py +919 -0
- agentsonar/_output/json_export.py +161 -0
- agentsonar/_output/terminal.py +1052 -0
- agentsonar-0.1.2.dist-info/METADATA +463 -0
- agentsonar-0.1.2.dist-info/RECORD +26 -0
- agentsonar-0.1.2.dist-info/WHEEL +4 -0
agentsonar/__init__.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AgentSonar — detect coordination failures in multi-agent AI systems.
|
|
3
|
+
|
|
4
|
+
Two lines to integrate:
|
|
5
|
+
|
|
6
|
+
# CrewAI
|
|
7
|
+
from agentsonar import AgentSonarListener
|
|
8
|
+
sonar = AgentSonarListener()
|
|
9
|
+
# ...run your crew normally.
|
|
10
|
+
|
|
11
|
+
# LangGraph / LangChain — option 1: pass the callback yourself
|
|
12
|
+
from agentsonar import AgentSonarCallback
|
|
13
|
+
result = graph.invoke(input, config={"callbacks": [AgentSonarCallback()]})
|
|
14
|
+
|
|
15
|
+
# LangGraph / LangChain — option 2: wrap the graph (auto-merges callbacks)
|
|
16
|
+
from agentsonar import monitor
|
|
17
|
+
graph = monitor(graph)
|
|
18
|
+
result = graph.invoke(input) # auto-monitored
|
|
19
|
+
result = graph.invoke(input, config={"callbacks": [my_cb]}) # my_cb preserved
|
|
20
|
+
|
|
21
|
+
Detection and terminal output happen automatically. No config, no API
|
|
22
|
+
keys, no accounts. Alerts stream to stderr and to timestamped log files
|
|
23
|
+
(`agentsonar_*.log`) in the working directory.
|
|
24
|
+
|
|
25
|
+
Public API:
|
|
26
|
+
AgentSonarListener — CrewAI integration (requires `agentsonar[crewai]`)
|
|
27
|
+
AgentSonarCallback — LangGraph/LangChain integration (requires `agentsonar[langgraph]`)
|
|
28
|
+
monitor — LangGraph wrapper that auto-merges AgentSonar
|
|
29
|
+
into the callback list without overriding existing
|
|
30
|
+
user callbacks. Recommended when you already have
|
|
31
|
+
your own callbacks configured.
|
|
32
|
+
"""
|
|
33
|
+
from __future__ import annotations
|
|
34
|
+
|
|
35
|
+
# Package version reported at runtime. Kept in sync with the distribution
# version of this wheel (agentsonar-0.1.2.dist-info); it previously lagged
# behind at "0.1.0".
__version__ = "0.1.2"
|
|
36
|
+
|
|
37
|
+
# Everything below is the ONLY public surface. Framework integrations use
|
|
38
|
+
# conditional imports for their underlying framework — they remain
|
|
39
|
+
# importable even when the optional extra is not installed, and raise a
|
|
40
|
+
# clear RuntimeError at instantiation time instead.
|
|
41
|
+
from agentsonar._integrations.crewai_listener import AgentSonarListener
|
|
42
|
+
from agentsonar._integrations.langgraph_callback import (
|
|
43
|
+
AgentSonarCallback,
|
|
44
|
+
monitor,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
# Names exported by `from agentsonar import *`. Per the comment above the
# integration imports, these are the package's only public surface.
__all__ = [
    "AgentSonarListener",
    "AgentSonarCallback",
    "monitor",
    "__version__",
]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AgentSonar schema and detection constants.
|
|
3
|
+
|
|
4
|
+
Token tracking, cost estimation, and velocity metrics are out of scope
|
|
5
|
+
for v0.1.0. The schema intentionally omits these fields.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
# Version string of the AgentSonar schema described by this module.
SCHEMA_VERSION = "0.1.0"

# Default thresholds — may be overridden via DetectionEngine config dict
# NOTE(review): units below are inferred from the constant names
# (cycle rotations, repetition counts, seconds) — confirm against the engine.
DEFAULT_CYCLE_WARNING_ROTATIONS = 3
DEFAULT_CYCLE_CRITICAL_ROTATIONS = 10
DEFAULT_REPETITIVE_WARNING_COUNT = 5
DEFAULT_REPETITIVE_CRITICAL_COUNT = 15
DEFAULT_STALL_THRESHOLD_SECONDS = 300
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Detection layer implementations. Internal — not part of the public API.
|
|
3
|
+
|
|
4
|
+
Layer 1: SlidingWindowLimiter — rate-limit circuit breaker
|
|
5
|
+
Layer 2: ExponentialDecayDetector — decay-weighted edge anomaly
|
|
6
|
+
Layer 3: CycleDetector — incremental cycle detection
|
|
7
|
+
Layer 4: AlertStateManager — alert state machine (WARNING → CRITICAL → RESOLVED)
|
|
8
|
+
Layer 5: PeriodicSCCAnalyzer — backup SCC sweep
|
|
9
|
+
"""
|
|
10
|
+
from agentsonar._core.detectors.alert_state import AlertStateManager
|
|
11
|
+
from agentsonar._core.detectors.cycle_detector import CycleDetector
|
|
12
|
+
from agentsonar._core.detectors.rate_limiter import SlidingWindowLimiter
|
|
13
|
+
from agentsonar._core.detectors.repetitive_detector import ExponentialDecayDetector
|
|
14
|
+
from agentsonar._core.detectors.scc_analyzer import PeriodicSCCAnalyzer
|
|
15
|
+
|
|
16
|
+
# Detector classes re-exported from this subpackage, one per detection
# layer listed in the module docstring. Internal — not public API.
__all__ = [
    "AlertStateManager",
    "CycleDetector",
    "ExponentialDecayDetector",
    "PeriodicSCCAnalyzer",
    "SlidingWindowLimiter",
]
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Layer 4: AlertStateManager — O(1) alert dedup / state machine.
|
|
3
|
+
|
|
4
|
+
Progressive alert lifecycle with hysteresis recovery. Separated from the
|
|
5
|
+
PeriodicSCCAnalyzer (Layer 5) because they have nothing to do with each
|
|
6
|
+
other — one manages alert state, the other runs backup SCC sweeps.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
from agentsonar._core.models import Alert
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class _TrackedAlert:
    """Mutable bookkeeping record for one tracked anomaly fingerprint."""

    # Identity of the anomaly; also the key under which it is tracked.
    fingerprint: tuple
    # Lifecycle state: "ACTIVE", "ESCALATED" or "RESOLVED".
    state: str
    # Detection pattern name, e.g. "cycle", "edge_anomaly", "scc_cycle".
    pattern: str
    # Most recent rotation count accepted for this fingerprint.
    current_rotations: int = 0
    # Rotation count at the moment the last alert was emitted.
    last_alert_rotations: int = 0
    # Timestamp of the last emitted alert.
    last_alert_time: float = 0.0
    # Timestamp of the last accepted activity; drives the quiet-period check.
    last_activity_time: float = 0.0
    # Timestamp at which the record was created.
    created_at: float = 0.0
    # Number of alerts emitted so far for this fingerprint.
    alert_count: int = 0
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class AlertStateManager:
    """Alert de-duplication state machine keyed by anomaly fingerprint.

    Lifecycle of a single fingerprint:

        (untracked) -> ACTIVE      WARNING once ``warning_threshold`` is reached
        ACTIVE      -> ESCALATED   CRITICAL once ``critical_threshold`` is reached
        ESCALATED   -> ESCALATED   CRITICAL again every ``re_alert_interval``
                                   additional rotations
        ACTIVE / ESCALATED -> RESOLVED
                                   INFO after ``resolve_after_seconds`` of quiet
        RESOLVED    -> ACTIVE      WARNING flagged ``regression`` when the
                                   rotation count grows again

    Args:
        warning_threshold: Rotations/count to trigger first WARNING (default 5).
        critical_threshold: Rotations/count to trigger CRITICAL (default 15).
        re_alert_interval: After CRITICAL, re-alert every N additional
            rotations (default 30).
        resolve_after_seconds: Quiet period before marking RESOLVED
            (default 60).
    """

    def __init__(
        self,
        warning_threshold: int = 5,
        critical_threshold: int = 15,
        re_alert_interval: int = 30,
        resolve_after_seconds: float = 60.0,
    ):
        self.warning_threshold = warning_threshold
        self.critical_threshold = critical_threshold
        self.re_alert_interval = re_alert_interval
        self.resolve_after_seconds = resolve_after_seconds
        # fingerprint -> tracking record; entries are only removed by reset().
        self._tracked: dict[tuple, _TrackedAlert] = {}

    @staticmethod
    def _mark_alerted(entry, rotations: int, now: float) -> None:
        """Record on ``entry`` that an alert is being emitted right now."""
        entry.current_rotations = rotations
        entry.last_activity_time = now
        entry.last_alert_rotations = rotations
        entry.last_alert_time = now
        entry.alert_count += 1

    @staticmethod
    def _ratchet(entry, rotations: int, now: float) -> None:
        """Advance the rotation count and activity time, never backwards.

        A lower count than the stored one is ignored entirely, so a stray
        stale event can neither shrink the count nor refresh the activity
        timestamp used by the recovery check.
        """
        if rotations >= entry.current_rotations:
            entry.current_rotations = rotations
            entry.last_activity_time = now

    def process(
        self,
        fingerprint: tuple,
        rotations: int,
        pattern: str,
        now: float,
        extra_details: dict | None = None,
    ) -> Alert | None:
        """Feed one detection result into the state machine.

        Args:
            fingerprint: Key identifying the anomaly.
            rotations: Current rotation/occurrence count for the anomaly.
            pattern: Pattern name placed on any emitted alert.
            now: Timestamp of the observation.
            extra_details: Optional extra entries copied into the alert
                details; ``fingerprint`` and ``rotations`` keys are always
                overwritten with the arguments above.

        Returns:
            The Alert to emit, or None when nothing should be emitted.
        """
        payload = dict(extra_details or {})
        payload["fingerprint"] = fingerprint
        payload["rotations"] = rotations

        entry = self._tracked.get(fingerprint)

        # Unknown fingerprint: start tracking only at the warning threshold.
        if entry is None:
            if rotations < self.warning_threshold:
                return None
            self._tracked[fingerprint] = _TrackedAlert(
                fingerprint=fingerprint,
                state="ACTIVE",
                pattern=pattern,
                current_rotations=rotations,
                last_alert_rotations=rotations,
                last_alert_time=now,
                last_activity_time=now,
                created_at=now,
                alert_count=1,
            )
            return Alert(
                pattern=pattern,
                severity="WARNING",
                details=payload,
                timestamp=now,
            )

        # Known fingerprint. State is mutated only when an alert fires or
        # when the count makes forward progress (see _ratchet); a call that
        # returns None for a stale count leaves the record untouched.
        state = entry.state

        if state == "ACTIVE":
            if rotations >= self.critical_threshold:
                entry.state = "ESCALATED"
                self._mark_alerted(entry, rotations, now)
                return Alert(
                    pattern=pattern,
                    severity="CRITICAL",
                    details=payload,
                    timestamp=now,
                )
            self._ratchet(entry, rotations, now)
            return None

        if state == "ESCALATED":
            next_re_alert = entry.last_alert_rotations + self.re_alert_interval
            if rotations >= next_re_alert:
                self._mark_alerted(entry, rotations, now)
                return Alert(
                    pattern=pattern,
                    severity="CRITICAL",
                    details=payload,
                    timestamp=now,
                )
            self._ratchet(entry, rotations, now)
            return None

        if state == "RESOLVED":
            # Regression when the count exceeds either the stored count or
            # the count at the last alert, i.e. exceeds the smaller of them.
            if rotations > min(entry.current_rotations, entry.last_alert_rotations):
                entry.state = "ACTIVE"
                # Stamping last_alert_rotations here makes a later ESCALATED
                # re-alert count its interval from this regression point.
                self._mark_alerted(entry, rotations, now)
                payload["regression"] = True
                return Alert(
                    pattern=pattern,
                    severity="WARNING",
                    details=payload,
                    timestamp=now,
                )
            return None

        # Unrecognised state value: emit nothing.
        return None

    def check_recoveries(self, now: float) -> list[Alert]:
        """Resolve every live alert whose quiet period has elapsed.

        Returns:
            One INFO Alert per anomaly that transitioned to RESOLVED during
            this call (empty list when none did).
        """
        recovered: list[Alert] = []
        for entry in self._tracked.values():
            if entry.state not in ("ACTIVE", "ESCALATED"):
                continue
            quiet_for = now - entry.last_activity_time
            if quiet_for >= self.resolve_after_seconds:
                entry.state = "RESOLVED"
                info = Alert(
                    pattern=entry.pattern,
                    severity="INFO",
                    details={
                        "fingerprint": entry.fingerprint,
                        "rotations": entry.current_rotations,
                        "message": "resolved after quiet period",
                    },
                    timestamp=now,
                )
                recovered.append(info)
        return recovered

    def reset(self) -> None:
        """Forget every tracked alert."""
        self._tracked.clear()
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Layer 3: CycleDetector — O(V+E) cycle detection via has_path.
|
|
3
|
+
|
|
4
|
+
Detects when a delegation creates or extends a graph cycle. Uses
|
|
5
|
+
has_path + shortest_path rather than simple_cycles for efficiency.
|
|
6
|
+
Reports CycleInfo; does NOT decide severity (that's AlertStateManager).
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import networkx as nx
|
|
11
|
+
|
|
12
|
+
from agentsonar._core.models import CycleInfo, InteractionEvent
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class CycleDetector:
    """Stateless cycle detector.

    On each event (src -> dst), looks for an existing path from dst back
    to src in the graph. If one exists, the edge src -> dst closes a cycle;
    the detector extracts the cycle path and computes the rotation count
    (the minimum edge ``count`` attribute around the loop).

    Reports CycleInfo only; it does not decide severity.
    """

    def __init__(self) -> None:
        pass  # stateless — thresholds live in AlertStateManager

    def check(self, graph: nx.DiGraph, event: InteractionEvent) -> CycleInfo | None:
        """Check if the latest event creates or extends a cycle.

        Args:
            graph: Directed interaction graph; edges may carry a ``count``
                attribute (missing attribute or missing edge counts as 0).
            event: The interaction whose ``source_agent`` -> ``target_agent``
                edge is being examined.

        Returns:
            CycleInfo if a cycle exists through this edge, else None.
        """
        src = event.source_agent
        dst = event.target_agent

        # Self-loop is not a meaningful coordination cycle
        if src == dst:
            return None

        # If dst can reach src, then src->dst closes a cycle. A single
        # shortest_path search answers both "is there a path?" and "which
        # one?" — nx.has_path is itself implemented as a shortest_path call,
        # so checking has_path first would traverse the graph twice.
        try:
            if not graph.has_node(dst) or not graph.has_node(src):
                return None
            path = nx.shortest_path(graph, dst, src)
        except (nx.NetworkXNoPath, nx.NodeNotFound, nx.NetworkXError):
            return None

        # path = [dst, ..., src]; the cycle starts at src, follows the new
        # edge to dst, and walks the path back. Dropping the trailing src
        # leaves the loop-closing step implicit.
        cycle_path = [src] + path[:-1]

        # Build edges around the cycle: (A,B), (B,C), (C,A)
        cycle_edges = list(zip(cycle_path, cycle_path[1:] + cycle_path[:1]))
        edge_counts: dict[tuple[str, str], int] = {
            (u, v): graph.edges[u, v].get("count", 0) if graph.has_edge(u, v) else 0
            for u, v in cycle_edges
        }

        # The loop has completed only as many rotations as its
        # least-travelled edge.
        rotations = min(edge_counts.values()) if edge_counts else 0
        # Order-independent identity: the sorted set of participating nodes.
        fingerprint = tuple(sorted(set(cycle_path)))

        return CycleInfo(
            fingerprint=fingerprint,
            cycle_path=cycle_path,
            rotations=rotations,
            edge_counts=edge_counts,
        )

    def reset(self) -> None:
        """Nothing to reset (stateless)."""
        pass
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Layer 1: SlidingWindowLimiter — O(1) circuit breaker.
|
|
3
|
+
|
|
4
|
+
Dumb circuit breaker. If any single edge or the whole system exceeds a
|
|
5
|
+
hard delegation rate, fire immediate CRITICAL. No scoring, no analysis.
|
|
6
|
+
|
|
7
|
+
Includes anti-flap hysteresis (half-open circuit breaker pattern): once
|
|
8
|
+
an edge fires, further fires are suppressed until the estimated rate
|
|
9
|
+
cools below `trip_limit * reset_ratio` (default 0.7). Without this,
|
|
10
|
+
every single event over the threshold produced a fresh CRITICAL alert —
|
|
11
|
+
the "machine-gun" anti-pattern called out in Prometheus and Datadog's
|
|
12
|
+
alert-fatigue guidance.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
|
|
18
|
+
from agentsonar._core.models import Alert
|
|
19
|
+
|
|
20
|
+
# Default reset ratio for the half-open circuit-breaker pattern.
# After the limiter fires at `limit`, it re-arms for the next fire
# only when the estimated rate drops below `limit * RESET_RATIO`.
# Value of 0.7 picked as a balance: low enough that brief spikes
# don't re-arm immediately, high enough that real cooldown does.
# Configurable via the `reset_ratio` constructor arg.
_DEFAULT_RESET_RATIO = 0.7


@dataclass
class _EdgeWindow:
    """Two-counter sliding-window state for one edge or the global window."""

    # Events counted in the window that began at `window_start`.
    current_count: int = 0
    # Events counted in the window immediately before the current one.
    previous_count: int = 0
    # Timestamp at which the current window began.
    window_start: float = 0.0

    # Anti-flap state — set when the limiter fires on this window.
    # Cleared when the estimated rate drops below the reset threshold
    # or when the window rolls over. Fires while it is True are suppressed.
    tripped: bool = False

    # True once the window has been anchored to its first event. Kept as
    # an explicit flag because `window_start == 0.0` alone cannot tell
    # "never used" apart from "first event arrived at time 0.0".
    started: bool = False


class SlidingWindowLimiter:
    """Hard rate-limit circuit breaker with hysteresis.

    Uses a two-counter sliding window approximation per edge plus one
    global counter. When the estimated rate in the current window first
    crosses the limit, an immediate CRITICAL alert is returned and the
    window is latched into the "tripped" state. Subsequent events on
    the same window DO NOT re-fire until the estimated rate has cooled
    below `trip_limit * reset_ratio` or the window rolls over.

    Args:
        window_size: Window duration in seconds (default 60).
        per_edge_limit: Max delegations per single edge per window (default 50).
        global_limit: Max total delegations across ALL edges per window (default 200).
        reset_ratio: Hysteresis threshold as a fraction of the trip limit.
            A tripped window re-arms once the estimated rate drops below
            `limit * reset_ratio`. Must satisfy 0 < ratio < 1.

    Raises:
        ValueError: If any argument is outside its valid range.
    """

    def __init__(
        self,
        window_size: float = 60.0,
        per_edge_limit: int = 50,
        global_limit: int = 200,
        reset_ratio: float = _DEFAULT_RESET_RATIO,
    ):
        # Local import: this module's header does not import math.
        import math

        # Reject out-of-range knobs up front; outside these ranges the
        # rate estimate is meaningless (e.g. negative window sizes).
        if not 0.0 < reset_ratio < 1.0:
            raise ValueError(
                f"reset_ratio must be between 0 and 1 exclusive, got {reset_ratio}"
            )
        if not isinstance(window_size, (int, float)) or window_size <= 0:
            raise ValueError(
                f"window_size must be a positive number, got {window_size}"
            )
        if not math.isfinite(float(window_size)):
            raise ValueError(
                f"window_size must be finite (not NaN or Infinity), got {window_size}"
            )
        if not isinstance(per_edge_limit, int) or per_edge_limit < 1:
            raise ValueError(
                f"per_edge_limit must be a positive integer, got {per_edge_limit}"
            )
        if not isinstance(global_limit, int) or global_limit < 1:
            raise ValueError(
                f"global_limit must be a positive integer, got {global_limit}"
            )
        self.window_size = float(window_size)
        self.per_edge_limit = per_edge_limit
        self.global_limit = global_limit
        self.reset_ratio = reset_ratio
        self._edges: dict[tuple[str, str], _EdgeWindow] = {}
        self._global = _EdgeWindow()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _advance_window(self, window: _EdgeWindow, now: float) -> float:
        """Anchor or roll `window` for time `now`.

        Returns the seconds elapsed inside the (possibly fresh) current
        window. Shared by `_peek_estimate` and `_roll_and_increment` so
        both paths do identical time bookkeeping.
        """
        if not window.started:
            # First use: anchor the window at this event. A non-zero
            # `window_start` supplied by whoever built the window is kept.
            if window.window_start == 0.0:
                window.window_start = now
            window.started = True

        elapsed = now - window.window_start
        if elapsed < self.window_size:
            return elapsed

        # Rollover. The old current window becomes "previous" only if it is
        # adjacent to the new one. After a gap of two or more windows the
        # old counts are entirely outside the sliding window and must not
        # contribute — otherwise one event after a long idle period could
        # trip the limit on the strength of a stale burst.
        if elapsed >= 2.0 * self.window_size:
            window.previous_count = 0
        else:
            window.previous_count = window.current_count
        window.current_count = 0
        window.window_start = now
        # Fresh window — re-arm the circuit breaker. Any latched trip
        # state from the previous window is stale.
        window.tripped = False
        return 0.0

    def _estimate(self, window: _EdgeWindow, elapsed: float) -> float:
        """Weighted event count: current window plus the not-yet-expired
        share of the previous window."""
        if self.window_size > 0:
            # Clamp at 0 so a clock that steps backwards cannot weight the
            # previous window above 100%.
            fraction_elapsed = max(elapsed / self.window_size, 0.0)
        else:
            fraction_elapsed = 1.0
        return window.current_count + window.previous_count * (1.0 - fraction_elapsed)

    def _peek_estimate(self, window: _EdgeWindow, now: float) -> float:
        """Return the current estimated rate for `window` WITHOUT
        incrementing its counter.

        Used for events the per-edge gate suppressed, so they do not
        inflate the global counter. The window still rolls over if it is
        overdue — that is pure time bookkeeping.
        """
        elapsed = self._advance_window(window, now)
        return self._estimate(window, elapsed)

    def _roll_and_increment(self, window: _EdgeWindow, now: float) -> float:
        """Roll over the window if needed, count this event, and return
        the estimated rate.

        Rollover also clears the `tripped` latch, so the next breach in a
        fresh window fires a new alert.
        """
        elapsed = self._advance_window(window, now)
        window.current_count += 1
        return self._estimate(window, elapsed)

    def _try_fire(
        self,
        window: _EdgeWindow,
        estimated: float,
        trip_limit: int,
    ) -> bool:
        """Half-open circuit-breaker logic.

        Returns True if the caller should FIRE an alert for this event;
        False if the alert should be suppressed (already firing or
        below the trip threshold).

        State transitions:
          * Not tripped + estimated > limit        → trip + fire (True)
          * Not tripped + estimated <= limit       → quiet (False)
          * Tripped + estimated < limit * ratio    → reset, quiet (False)
          * Tripped + estimated >= limit * ratio   → suppressed (False)
        """
        if window.tripped:
            # Already reported. Release the latch only once the rate has
            # genuinely cooled; never fire a second alert from this state.
            if estimated < trip_limit * self.reset_ratio:
                window.tripped = False
            return False

        if estimated > trip_limit:
            window.tripped = True
            return True

        return False

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def check(self, src: str, dst: str, now: float) -> Alert | None:
        """Check rate limits for a delegation event.

        Args:
            src: Source agent of the delegation.
            dst: Target agent of the delegation.
            now: Timestamp of the event, in the same unit as `window_size`.

        Returns:
            A CRITICAL Alert if the per-edge or global limit is newly
            exceeded, or None if no limit is exceeded OR the relevant
            window is already in the tripped state.

        Global counter policy: an event counts toward the global window
        only when the per-edge gate let it through (edge not tripped, or
        this very event tripped it). Events suppressed by an already
        tripped edge are skipped, so one runaway edge cannot push the
        global counter over its limit and have the resulting
        `global_rate_exceeded` alert blamed on an unrelated edge.
        """
        key = (src, dst)
        edge_window = self._edges.get(key)
        if edge_window is None:
            edge_window = self._edges[key] = _EdgeWindow()
        edge_est = self._roll_and_increment(edge_window, now)

        # Evaluate the per-edge gate before touching the global counter so
        # we know whether this event should count globally.
        edge_fires = self._try_fire(edge_window, edge_est, self.per_edge_limit)

        # Count globally if this event fired the edge alert, or the edge
        # is not latched. Skip only suppressed events on a tripped edge.
        event_counts_globally = edge_fires or not edge_window.tripped

        if event_counts_globally:
            global_est = self._roll_and_increment(self._global, now)
        else:
            # Keep the global window's rollover current without counting.
            global_est = self._peek_estimate(self._global, now)

        if edge_fires:
            return Alert(
                pattern="rate_limit_exceeded",
                severity="CRITICAL",
                details={
                    "source": src,
                    "target": dst,
                    "estimated_rate": round(edge_est, 1),
                    "limit": self.per_edge_limit,
                    "window_size": self.window_size,
                    "reset_ratio": self.reset_ratio,
                },
                timestamp=now,
            )

        # Global check — only reached if the per-edge check did NOT fire
        if self._try_fire(self._global, global_est, self.global_limit):
            return Alert(
                pattern="global_rate_exceeded",
                severity="CRITICAL",
                details={
                    "estimated_rate": round(global_est, 1),
                    "limit": self.global_limit,
                    "window_size": self.window_size,
                    "reset_ratio": self.reset_ratio,
                },
                timestamp=now,
            )

        return None

    def reset(self) -> None:
        """Clear all rate-limit state (including hysteresis latches)."""
        self._edges.clear()
        self._global = _EdgeWindow()
|