agentsonar 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agentsonar/__init__.py ADDED
@@ -0,0 +1,52 @@
1
+ """
2
+ AgentSonar — detect coordination failures in multi-agent AI systems.
3
+
4
+ Two lines to integrate:
5
+
6
+ # CrewAI
7
+ from agentsonar import AgentSonarListener
8
+ sonar = AgentSonarListener()
9
+ # ...run your crew normally.
10
+
11
+ # LangGraph / LangChain — option 1: pass the callback yourself
12
+ from agentsonar import AgentSonarCallback
13
+ result = graph.invoke(input, config={"callbacks": [AgentSonarCallback()]})
14
+
15
+ # LangGraph / LangChain — option 2: wrap the graph (auto-merges callbacks)
16
+ from agentsonar import monitor
17
+ graph = monitor(graph)
18
+ result = graph.invoke(input) # auto-monitored
19
+ result = graph.invoke(input, config={"callbacks": [my_cb]}) # my_cb preserved
20
+
21
+ Detection and terminal output happen automatically. No config, no API
22
+ keys, no accounts. Alerts stream to stderr and to timestamped log files
23
+ (`agentsonar_*.log`) in the working directory.
24
+
25
+ Public API:
26
+ AgentSonarListener — CrewAI integration (requires `agentsonar[crewai]`)
27
+ AgentSonarCallback — LangGraph/LangChain integration (requires `agentsonar[langgraph]`)
28
+ monitor — LangGraph wrapper that auto-merges AgentSonar
29
+ into the callback list without overriding existing
30
+ user callbacks. Recommended when you already have
31
+ your own callbacks configured.
32
+ """
33
+ from __future__ import annotations
34
+
35
# Keep in sync with the distribution version published to the registry
# (this file ships in the 0.1.2 wheel; the previous value "0.1.0" was stale).
__version__ = "0.1.2"
36
+
37
+ # Everything below is the ONLY public surface. Framework integrations use
38
+ # conditional imports for their underlying framework — they remain
39
+ # importable even when the optional extra is not installed, and raise a
40
+ # clear RuntimeError at instantiation time instead.
41
+ from agentsonar._integrations.crewai_listener import AgentSonarListener
42
+ from agentsonar._integrations.langgraph_callback import (
43
+ AgentSonarCallback,
44
+ monitor,
45
+ )
46
+
47
+ __all__ = [
48
+ "AgentSonarListener",
49
+ "AgentSonarCallback",
50
+ "monitor",
51
+ "__version__",
52
+ ]
@@ -0,0 +1,6 @@
1
+ """
2
+ AgentSonar core — internal detection engine.
3
+
4
+ Not part of the public API. Consumers should import from `agentsonar`
5
+ directly (AgentSonarListener, AgentSonarCallback, monitor).
6
+ """
@@ -0,0 +1,16 @@
1
+ """
2
+ AgentSonar schema and detection constants.
3
+
4
+ Token tracking, cost estimation, and velocity metrics are out of scope
5
+ for v0.1.0. The schema intentionally omits these fields.
6
+ """
7
+ from __future__ import annotations
8
+
9
# Version tag of the AgentSonar schema described in this module.
# NOTE(review): this is declared separately from the package `__version__`;
# confirm whether the two are meant to move together.
SCHEMA_VERSION = "0.1.0"

# Default thresholds — may be overridden via DetectionEngine config dict

# Cycle pattern: rotation counts at which a cycle is reported as WARNING
# and as CRITICAL.
DEFAULT_CYCLE_WARNING_ROTATIONS = 3
DEFAULT_CYCLE_CRITICAL_ROTATIONS = 10

# Repetitive-edge pattern: repetition counts for WARNING and CRITICAL.
# NOTE(review): AlertStateManager's own constructor defaults are also 5 / 15;
# presumably the engine passes these constants through — verify at the caller.
DEFAULT_REPETITIVE_WARNING_COUNT = 5
DEFAULT_REPETITIVE_CRITICAL_COUNT = 15

# Stall threshold, in seconds (per the name). The consumer of this value is
# not visible from this module.
DEFAULT_STALL_THRESHOLD_SECONDS = 300
@@ -0,0 +1,22 @@
1
+ """
2
+ Detection layer implementations. Internal — not part of the public API.
3
+
4
+ Layer 1: SlidingWindowLimiter — rate-limit circuit breaker
5
+ Layer 2: ExponentialDecayDetector — decay-weighted edge anomaly
6
+ Layer 3: CycleDetector — incremental cycle detection
7
+ Layer 4: AlertStateManager — alert state machine (WARNING → CRITICAL → RESOLVED)
8
+ Layer 5: PeriodicSCCAnalyzer — backup SCC sweep
9
+ """
10
+ from agentsonar._core.detectors.alert_state import AlertStateManager
11
+ from agentsonar._core.detectors.cycle_detector import CycleDetector
12
+ from agentsonar._core.detectors.rate_limiter import SlidingWindowLimiter
13
+ from agentsonar._core.detectors.repetitive_detector import ExponentialDecayDetector
14
+ from agentsonar._core.detectors.scc_analyzer import PeriodicSCCAnalyzer
15
+
16
+ __all__ = [
17
+ "AlertStateManager",
18
+ "CycleDetector",
19
+ "ExponentialDecayDetector",
20
+ "PeriodicSCCAnalyzer",
21
+ "SlidingWindowLimiter",
22
+ ]
@@ -0,0 +1,202 @@
1
+ """
2
+ Layer 4: AlertStateManager — O(1) alert dedup / state machine.
3
+
4
+ Progressive alert lifecycle with hysteresis recovery. Separated from the
5
+ PeriodicSCCAnalyzer (Layer 5) because they have nothing to do with each
6
+ other — one manages alert state, the other runs backup SCC sweeps.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+
12
+ from agentsonar._core.models import Alert
13
+
14
+
15
@dataclass
class _TrackedAlert:
    """Mutable per-fingerprint bookkeeping record used by AlertStateManager."""

    # Identity of the anomaly; also the key under which the record is stored.
    fingerprint: tuple
    state: str  # "ACTIVE", "ESCALATED", "RESOLVED"
    pattern: str  # "cycle", "edge_anomaly", "scc_cycle"
    # Rotation count most recently recorded by AlertStateManager.process().
    current_rotations: int = 0
    # Rotation count at the moment the latest alert was emitted; the
    # ESCALATED re-alert threshold is measured from this value.
    last_alert_rotations: int = 0
    # Timestamp (`now` as supplied by the caller) of the latest emitted alert.
    last_alert_time: float = 0.0
    # Timestamp of the last recorded progress; check_recoveries() compares it
    # against the quiet period to decide when the anomaly is RESOLVED.
    last_activity_time: float = 0.0
    # Timestamp at which this fingerprint was first tracked.
    created_at: float = 0.0
    # Number of WARNING/CRITICAL alerts emitted for this fingerprint
    # (the INFO "resolved" alert does not increment it).
    alert_count: int = 0
26
+
27
+
28
class AlertStateManager:
    """Fingerprint-keyed alert lifecycle with escalation and quiet-period recovery.

    Each anomaly is identified by its fingerprint and walks through:
        (unseen) → ACTIVE (WARNING) → ESCALATED (CRITICAL) → RESOLVED (INFO)
    While ESCALATED, another CRITICAL is emitted every time the rotation
    count has grown by ``re_alert_interval`` since the last alert. A
    RESOLVED fingerprint whose count grows again goes back to ACTIVE and
    emits a WARNING flagged as a regression.

    Args:
        warning_threshold: Rotations/count to trigger first WARNING (default 5).
        critical_threshold: Rotations/count to trigger CRITICAL (default 15).
        re_alert_interval: After CRITICAL, re-alert every N additional rotations (default 30).
        resolve_after_seconds: Quiet period before marking RESOLVED (default 60).
    """

    def __init__(
        self,
        warning_threshold: int = 5,
        critical_threshold: int = 15,
        re_alert_interval: int = 30,
        resolve_after_seconds: float = 60.0,
    ):
        self.warning_threshold = warning_threshold
        self.critical_threshold = critical_threshold
        self.re_alert_interval = re_alert_interval
        self.resolve_after_seconds = resolve_after_seconds
        # One bookkeeping record per fingerprint ever alerted on.
        self._tracked: dict[tuple, _TrackedAlert] = {}

    @staticmethod
    def _record_alert(
        entry: _TrackedAlert,
        rotations: int,
        now: float,
        pattern: str,
        severity: str,
        details: dict,
    ) -> Alert:
        """Stamp `entry` with the alert being emitted and build that Alert."""
        entry.current_rotations = rotations
        entry.last_activity_time = now
        entry.last_alert_rotations = rotations
        entry.last_alert_time = now
        entry.alert_count += 1
        return Alert(
            pattern=pattern,
            severity=severity,
            details=details,
            timestamp=now,
        )

    @staticmethod
    def _record_progress(entry: _TrackedAlert, rotations: int, now: float) -> None:
        """Ratchet the rotation count forward; never let it move backwards.

        A lower count than the one already recorded is ignored entirely, so
        it neither shrinks the count nor refreshes the activity timestamp
        that recovery checks rely on.
        """
        if rotations >= entry.current_rotations:
            entry.current_rotations = rotations
            entry.last_activity_time = now

    def process(
        self,
        fingerprint: tuple,
        rotations: int,
        pattern: str,
        now: float,
        extra_details: dict | None = None,
    ) -> Alert | None:
        """Feed one detection result into the state machine.

        Args:
            fingerprint: Identity of the anomaly being reported.
            rotations: Current rotation/repetition count for it.
            pattern: Pattern label copied onto any emitted Alert.
            now: Caller-supplied timestamp for this observation.
            extra_details: Optional extra entries for the Alert details;
                copied, never mutated.

        Returns:
            The Alert to emit, or None when this observation is silent.
        """
        payload = dict(extra_details or {})
        payload.update(fingerprint=fingerprint, rotations=rotations)

        entry = self._tracked.get(fingerprint)

        # First sighting: start tracking only once the warning bar is met.
        if entry is None:
            if rotations >= self.warning_threshold:
                self._tracked[fingerprint] = _TrackedAlert(
                    fingerprint=fingerprint,
                    state="ACTIVE",
                    pattern=pattern,
                    current_rotations=rotations,
                    last_alert_rotations=rotations,
                    last_alert_time=now,
                    last_activity_time=now,
                    created_at=now,
                    alert_count=1,
                )
                return Alert(
                    pattern=pattern,
                    severity="WARNING",
                    details=payload,
                    timestamp=now,
                )
            return None

        # Known fingerprint. State is only written on paths that either emit
        # an alert or represent genuine forward progress; a silent call with
        # a stale count must leave the record untouched.
        if entry.state == "ACTIVE":
            if rotations >= self.critical_threshold:
                entry.state = "ESCALATED"
                return self._record_alert(
                    entry, rotations, now, pattern, "CRITICAL", payload
                )
            self._record_progress(entry, rotations, now)
            return None

        if entry.state == "ESCALATED":
            next_alert_at = entry.last_alert_rotations + self.re_alert_interval
            if rotations >= next_alert_at:
                return self._record_alert(
                    entry, rotations, now, pattern, "CRITICAL", payload
                )
            self._record_progress(entry, rotations, now)
            return None

        if entry.state == "RESOLVED":
            grew = (
                rotations > entry.current_rotations
                or rotations > entry.last_alert_rotations
            )
            if grew:
                # Regression: re-open as ACTIVE. `last_alert_rotations` is
                # reset by the alert bookkeeping so that a later ESCALATED
                # re-alert is measured from this count, not the old one.
                entry.state = "ACTIVE"
                payload["regression"] = True
                return self._record_alert(
                    entry, rotations, now, pattern, "WARNING", payload
                )
            return None

        return None

    def check_recoveries(self, now: float) -> list[Alert]:
        """Resolve every open anomaly whose quiet period has elapsed.

        Returns:
            One INFO Alert per anomaly that moved to RESOLVED on this call.
        """
        newly_resolved: list[Alert] = []
        for entry in self._tracked.values():
            if entry.state not in ("ACTIVE", "ESCALATED"):
                continue
            quiet_for = now - entry.last_activity_time
            if quiet_for >= self.resolve_after_seconds:
                entry.state = "RESOLVED"
                info = Alert(
                    pattern=entry.pattern,
                    severity="INFO",
                    details={
                        "fingerprint": entry.fingerprint,
                        "rotations": entry.current_rotations,
                        "message": "resolved after quiet period",
                    },
                    timestamp=now,
                )
                newly_resolved.append(info)
        return newly_resolved

    def reset(self) -> None:
        """Forget every tracked alert."""
        self._tracked.clear()
@@ -0,0 +1,81 @@
1
+ """
2
+ Layer 3: CycleDetector — O(V+E) cycle detection via has_path.
3
+
4
+ Detects when a delegation creates or extends a graph cycle. Uses
5
+ has_path + shortest_path rather than simple_cycles for efficiency.
6
+ Reports CycleInfo; does NOT decide severity (that's AlertStateManager).
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import networkx as nx
11
+
12
+ from agentsonar._core.models import CycleInfo, InteractionEvent
13
+
14
+
15
class CycleDetector:
    """Stateless cycle detector.

    On each event (src -> dst), looks for an existing path from dst back to
    src in the graph. If one exists, the event closes a cycle: the cycle
    path is extracted and its rotation count is computed as the minimum
    edge ``count`` attribute around the loop.
    """

    def __init__(self) -> None:
        """Nothing to initialise — thresholds live in AlertStateManager."""

    def check(self, graph: nx.DiGraph, event: InteractionEvent) -> CycleInfo | None:
        """Check if the latest event creates or extends a cycle.

        Args:
            graph: Directed interaction graph; edges may carry a ``count``
                attribute (missing edges/attributes are treated as 0).
            event: The delegation just observed; only ``source_agent`` and
                ``target_agent`` are read.

        Returns:
            CycleInfo if a cycle exists through this edge, else None.
        """
        src = event.source_agent
        dst = event.target_agent

        # Self-loop is not a meaningful coordination cycle
        if src == dst:
            return None

        # If dst can reach src, then src->dst closes a cycle. A single
        # shortest_path call answers both "is there a path?" and "which
        # one?" — nx.has_path is itself implemented as a shortest_path call,
        # so running it first only traversed the graph twice.
        try:
            if not graph.has_node(dst) or not graph.has_node(src):
                return None
            path = nx.shortest_path(graph, dst, src)
        except (nx.NetworkXError, nx.NetworkXNoPath, nx.NodeNotFound):
            return None

        # path = [dst, ..., src]; the cycle starts at src, walks the path,
        # and loops back to src implicitly, so the trailing src is dropped.
        cycle_path = [src, *path[:-1]]

        # Edges around the cycle, including the closing one: (A,B), (B,C), (C,A)
        cycle_edges = zip(cycle_path, cycle_path[1:] + cycle_path[:1])
        edge_counts: dict[tuple[str, str], int] = {
            (u, v): graph.edges[u, v].get("count", 0) if graph.has_edge(u, v) else 0
            for u, v in cycle_edges
        }

        # A full rotation needs every edge traversed, so the weakest edge
        # bounds the rotation count. Nodes on a shortest path are distinct,
        # hence every cycle edge is a distinct key in `edge_counts`.
        rotations = min(edge_counts.values(), default=0)
        # Order-independent identity: the same node set yields the same
        # fingerprint regardless of which edge closed the cycle.
        fingerprint = tuple(sorted(set(cycle_path)))

        return CycleInfo(
            fingerprint=fingerprint,
            cycle_path=cycle_path,
            rotations=rotations,
            edge_counts=edge_counts,
        )

    def reset(self) -> None:
        """Nothing to reset (stateless)."""
@@ -0,0 +1,297 @@
1
+ """
2
+ Layer 1: SlidingWindowLimiter — O(1) circuit breaker.
3
+
4
+ Dumb circuit breaker. If any single edge or the whole system exceeds a
5
+ hard delegation rate, fire immediate CRITICAL. No scoring, no analysis.
6
+
7
+ Includes anti-flap hysteresis (half-open circuit breaker pattern): once
8
+ an edge fires, further fires are suppressed until the estimated rate
9
+ cools below `trip_limit * reset_ratio` (default 0.7). Without this,
10
+ every single event over the threshold produced a fresh CRITICAL alert —
11
+ the "machine-gun" anti-pattern called out in Prometheus and Datadog's
12
+ alert-fatigue guidance.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ from dataclasses import dataclass
17
+
18
+ from agentsonar._core.models import Alert
19
+
20
# Default reset ratio for the half-open circuit-breaker pattern.
# After the limiter fires at `limit`, it re-arms for the next fire
# only when the estimated rate drops below `limit * RESET_RATIO`.
# Value of 0.7 picked as a balance: low enough that a brief dip
# doesn't re-arm immediately, high enough that a real cooldown does.
# Configurable via the `reset_ratio` constructor arg.
_DEFAULT_RESET_RATIO = 0.7


@dataclass
class _EdgeWindow:
    """Two-counter sliding-window state for one edge (or the global window)."""

    # Events counted in the window that began at `window_start`.
    current_count: int = 0
    # Events counted in the window immediately preceding the current one.
    previous_count: int = 0
    # Start of the current window. 0.0 doubles as the "not started yet"
    # sentinel, so a caller clock that legitimately reads exactly 0.0 is
    # treated as unstarted until the next event.
    window_start: float = 0.0

    # Anti-flap state — set when the limiter fires on this window.
    # Cleared when the estimated rate drops below the reset threshold.
    # Multiple fires while `tripped` stays True are suppressed.
    tripped: bool = False


class SlidingWindowLimiter:
    """Hard rate-limit circuit breaker with hysteresis.

    Uses a two-counter sliding window approximation per edge plus one
    global counter. When the estimated rate in the current window first
    crosses the limit, an immediate CRITICAL alert is returned and the
    window is latched into the "tripped" state. Subsequent events on
    the same window DO NOT re-fire until the estimated rate has cooled
    below `trip_limit * reset_ratio` — the classic circuit-breaker
    half-open pattern.

    Args:
        window_size: Window duration in seconds (default 60).
        per_edge_limit: Max delegations per single edge per window (default 50).
        global_limit: Max total delegations across ALL edges per window (default 200).
        reset_ratio: Hysteresis threshold as a fraction of the trip limit.
            After a fire, the limiter re-arms for the next fire only when
            the estimated rate drops below `limit * reset_ratio`. Default
            0.7 — higher values (0.9) re-arm more eagerly; lower values
            (0.5) require deeper cooldown. Must satisfy 0 < ratio < 1.

    Raises:
        ValueError: If any argument is outside its valid range.
    """

    def __init__(
        self,
        window_size: float = 60.0,
        per_edge_limit: int = 50,
        global_limit: int = 200,
        reset_ratio: float = _DEFAULT_RESET_RATIO,
    ):
        import math

        # Defensive validation — outside these ranges the rate math yields
        # nonsense (e.g. negative estimates) instead of failing, so reject
        # bad knobs at construction time with a clear message.
        if not 0.0 < reset_ratio < 1.0:
            raise ValueError(
                f"reset_ratio must be between 0 and 1 exclusive, got {reset_ratio}"
            )
        if not isinstance(window_size, (int, float)) or window_size <= 0:
            raise ValueError(
                f"window_size must be a positive number, got {window_size}"
            )
        if not math.isfinite(float(window_size)):
            raise ValueError(
                f"window_size must be finite (not NaN or Infinity), got {window_size}"
            )
        if not isinstance(per_edge_limit, int) or per_edge_limit < 1:
            raise ValueError(
                f"per_edge_limit must be a positive integer, got {per_edge_limit}"
            )
        if not isinstance(global_limit, int) or global_limit < 1:
            raise ValueError(
                f"global_limit must be a positive integer, got {global_limit}"
            )
        self.window_size = float(window_size)
        self.per_edge_limit = per_edge_limit
        self.global_limit = global_limit
        self.reset_ratio = reset_ratio
        self._edges: dict[tuple[str, str], _EdgeWindow] = {}
        self._global = _EdgeWindow()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _roll(self, window: _EdgeWindow, now: float) -> float:
        """Advance `window` to the window containing `now`.

        Returns the time elapsed inside the (possibly new) current window.

        Rollover is aligned to window boundaries rather than restarting at
        `now`, and the carried-over count is dropped when two or more whole
        windows have passed. Otherwise a burst followed by a long idle gap
        would be carried over at full weight and the first event after the
        gap would be reported as a rate breach.

        Rolling also clears the `tripped` latch — a fresh window is a
        natural re-arm point.
        """
        if window.window_start == 0.0:
            window.window_start = now

        elapsed = now - window.window_start
        if elapsed >= self.window_size:
            windows_passed, elapsed = divmod(elapsed, self.window_size)
            # Only the window immediately before the new current one may
            # contribute; anything older is outside the sliding window.
            window.previous_count = window.current_count if windows_passed < 2 else 0
            window.current_count = 0
            window.window_start = now - elapsed
            window.tripped = False
        return elapsed

    def _estimate(self, window: _EdgeWindow, elapsed: float) -> float:
        """Sliding-window estimate: current count plus the share of the
        previous window that still overlaps the trailing `window_size`."""
        # window_size is validated as strictly positive in __init__.
        fraction_elapsed = elapsed / self.window_size
        return window.current_count + window.previous_count * (1.0 - fraction_elapsed)

    def _peek_estimate(self, window: _EdgeWindow, now: float) -> float:
        """Return the current estimated rate for `window` WITHOUT
        incrementing its counter.

        Used on the suppressed-per-edge-event path so the global
        window's counter isn't polluted by events the per-edge gate
        already rejected. The window is still rolled if it is overdue —
        that is time bookkeeping and must happen either way.
        """
        elapsed = self._roll(window, now)
        return self._estimate(window, elapsed)

    def _roll_and_increment(self, window: _EdgeWindow, now: float) -> float:
        """Roll over the window if needed, count this event, and return
        the estimated rate including it."""
        elapsed = self._roll(window, now)
        window.current_count += 1
        return self._estimate(window, elapsed)

    def _try_fire(
        self,
        window: _EdgeWindow,
        estimated: float,
        trip_limit: int,
    ) -> bool:
        """Half-open circuit-breaker logic.

        Returns True if the caller should FIRE an alert for this event;
        False if the alert should be suppressed (already firing or
        below the trip threshold).

        State transitions:
          * Not tripped + estimated > limit            → trip + fire (True)
          * Not tripped + estimated <= limit           → quiet (False)
          * Tripped     + estimated < limit * ratio    → reset, quiet (False)
          * Tripped     + estimated >= limit * ratio   → suppressed (False)
        """
        if window.tripped:
            # Already firing on this window. Only clear the latch when
            # the rate has genuinely cooled; never fire a second alert
            # from the tripped state.
            if estimated < trip_limit * self.reset_ratio:
                window.tripped = False
            return False

        if estimated > trip_limit:
            window.tripped = True
            return True

        return False

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def check(self, src: str, dst: str, now: float) -> Alert | None:
        """Check rate limits for a delegation event.

        Returns a CRITICAL Alert if the per-edge or global limit is
        newly exceeded (first breach of the current window). Returns
        None if the limit is not exceeded OR if the limiter is already
        in the tripped state for this window.

        Global counter policy: an event counts toward the global window
        only when the per-edge gate let it through. Otherwise a single
        runaway edge keeps feeding the global counter long after its own
        alert fired, eventually tripping `global_rate_exceeded` on
        whichever benign edge happens to push the counter over — a
        misleading attribution. "Global rate" therefore means "events the
        per-edge gate let through".

        Args:
            src: Delegating agent.
            dst: Agent delegated to.
            now: Caller-supplied timestamp of the event, in the same unit
                as `window_size`.
        """
        key = (src, dst)
        edge_window = self._edges.get(key)
        if edge_window is None:
            edge_window = self._edges[key] = _EdgeWindow()
        edge_est = self._roll_and_increment(edge_window, now)

        # Per-edge first — a single runaway edge is usually more
        # actionable than global saturation, and the outcome decides
        # whether this event counts globally.
        edge_fires = self._try_fire(edge_window, edge_est, self.per_edge_limit)

        # Count globally when the edge is untripped (normal flow) or when
        # this very event tripped it (the breaching event is legitimate
        # throughput). Skip when the edge is latched and this event is
        # being suppressed.
        if edge_fires or not edge_window.tripped:
            global_est = self._roll_and_increment(self._global, now)
        else:
            # Roll the global window for time tracking, but do not count.
            global_est = self._peek_estimate(self._global, now)

        if edge_fires:
            return Alert(
                pattern="rate_limit_exceeded",
                severity="CRITICAL",
                details={
                    "source": src,
                    "target": dst,
                    "estimated_rate": round(edge_est, 1),
                    "limit": self.per_edge_limit,
                    "window_size": self.window_size,
                    "reset_ratio": self.reset_ratio,
                },
                timestamp=now,
            )

        # Global check — only reached if the per-edge check did NOT fire
        if self._try_fire(self._global, global_est, self.global_limit):
            return Alert(
                pattern="global_rate_exceeded",
                severity="CRITICAL",
                details={
                    "estimated_rate": round(global_est, 1),
                    "limit": self.global_limit,
                    "window_size": self.window_size,
                    "reset_ratio": self.reset_ratio,
                },
                timestamp=now,
            )

        return None

    def reset(self) -> None:
        """Clear all rate-limit state (including hysteresis latches)."""
        self._edges.clear()
        self._global = _EdgeWindow()