swarm-test 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
swarm_test/__init__.py ADDED
@@ -0,0 +1,43 @@
1
+ """
2
+ swarm-test: The first reliability testing framework for multi-agent AI systems.
3
+
4
+ Quick start::
5
+
6
+ from swarm_test import SwarmProbe
7
+
8
+ probe = SwarmProbe(crew)
9
+ report = probe.run_all()
10
+ report.print_summary()
11
+ """
12
+
13
+ from swarm_test.core.models import (
14
+ AgentNode,
15
+ EventType,
16
+ Finding,
17
+ InteractionEvent,
18
+ Severity,
19
+ SwarmReport,
20
+ TestResult,
21
+ TestStatus,
22
+ )
23
+ from swarm_test.core.graph import SwarmGraph
24
+ from swarm_test.core.probe import SwarmProbe
25
+
26
# Package metadata.
__version__ = "0.1.0"
__author__ = "swarm-test contributors"
__license__ = "MIT"

# Public API of the package: the names exported by ``from swarm_test import *``.
# Every entry is imported at the top of this module.
__all__ = [
    # Main API
    "SwarmProbe",
    "SwarmGraph",
    # Models
    "AgentNode",
    "InteractionEvent",
    "Finding",
    "TestResult",
    "SwarmReport",
    "Severity",
    "EventType",
    "TestStatus",
]
@@ -0,0 +1 @@
1
+ """Attack modules for swarm-test chaos testing."""
@@ -0,0 +1,38 @@
1
+ """Base class for all swarm chaos attacks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import TYPE_CHECKING
7
+
8
+ from swarm_test.core.models import TestResult
9
+
10
+ if TYPE_CHECKING:
11
+ from swarm_test.core.graph import SwarmGraph
12
+
13
+
14
class BaseAttack(ABC):
    """
    Common interface shared by every swarm chaos attack.

    Concrete attacks are expected to provide their own ``name`` and
    ``description`` class attributes and to implement ``run()``.
    """

    name: str = "base_attack"
    description: str = "Base chaos attack."

    @abstractmethod
    def run(self, graph: SwarmGraph) -> TestResult:
        """
        Run this attack on the given swarm graph.

        Args:
            graph: The ``SwarmGraph`` (agent nodes and events) to analyse.

        Returns:
            The ``TestResult`` (findings, metrics and status) of the attack.
        """

    def __repr__(self) -> str:
        # Use the runtime class so subclasses show up under their own name.
        return "{}(name={!r})".format(type(self).__name__, self.name)
@@ -0,0 +1,201 @@
1
+ """Blast Radius Attack — quantify impact of targeted agent failures."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any, Dict, List, Set
7
+
8
+ import networkx as nx
9
+
10
+ from swarm_test.attacks.base import BaseAttack
11
+ from swarm_test.core.models import Finding, Severity, TestResult, TestStatus
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class BlastRadiusAttack(BaseAttack):
    """
    Performs a systematic blast radius analysis across the entire swarm:
    - Identifies agents whose failure would most impact the system
    - Detects single points of failure (articulation points)
    - Quantifies the critical path and its length
    - Checks for adequate redundancy
    - Reports agents that have no connections at all

    This is a quantitative complement to CascadeFailureAttack,
    focusing on topology-level metrics rather than simulation.
    """

    name = "blast_radius"
    description = (
        "Topological blast radius analysis: identifies critical agents, "
        "single points of failure, and lack of redundancy in the swarm graph."
    )

    def run(self, graph: Any) -> TestResult:
        """
        Analyse the topology of ``graph.graph`` and report structural weaknesses.

        Args:
            graph: Object exposing a directed networkx graph as ``graph.graph``
                together with ``find_single_points_of_failure()``,
                ``get_blast_radius(node)`` and ``get_critical_path()``.

        Returns:
            A ``TestResult`` carrying the findings and the collected metrics.
            The status is always ``TestStatus.PASSED`` here. With fewer than
            two agents the result has no findings and ``metrics`` only holds
            an explanatory ``note``.
        """
        g = graph.graph
        agent_count = g.number_of_nodes()

        if agent_count < 2:
            return TestResult(
                test_name=self.name,
                status=TestStatus.PASSED,
                findings=[],
                metrics={"note": "Need ≥2 agents for blast radius analysis"},
            )

        def name_of(node_id: Any) -> Any:
            # Fall back to the raw id when the node carries no "name" attribute.
            return g.nodes[node_id].get("name", node_id)

        findings: List[Finding] = []
        metrics: Dict[str, Any] = {
            "total_agents": agent_count,
            "total_edges": g.number_of_edges(),
            "single_points_of_failure": [],
            "critical_path": [],
            "critical_path_length": 0,
            "top_blast_agents": [],
            "redundancy_score": 0.0,
            "graph_density": round(nx.density(g), 4),
        }

        # The blast radius of every node is needed for the ranking below and for
        # each single point of failure, so compute it once per node and reuse it.
        blast_by_node = {node: graph.get_blast_radius(node) for node in g.nodes()}

        # 1. Single Points of Failure (articulation points)
        spofs = [s for s in graph.find_single_points_of_failure() if s in g]
        metrics["single_points_of_failure"] = [name_of(s) for s in spofs]
        for spof_id in spofs:
            findings.append(
                self._spof_finding(spof_id, name_of(spof_id), blast_by_node[spof_id])
            )

        # 2. Critical path analysis
        critical_path = graph.get_critical_path()
        path_names = [name_of(node) for node in critical_path if node in g]
        metrics["critical_path"] = path_names
        metrics["critical_path_length"] = len(critical_path)
        if len(critical_path) >= 4:
            findings.append(self._critical_path_finding(critical_path, path_names))

        # 3. Top blast radius agents (five highest impact percentages)
        blast_scores = [
            {
                "agent_id": node,
                "agent_name": name_of(node),
                "impact_pct": blast["impact_percentage"],
                "downstream_count": len(blast["downstream_agents"]),
            }
            for node, blast in blast_by_node.items()
        ]
        blast_scores.sort(key=lambda score: score["impact_pct"], reverse=True)
        metrics["top_blast_agents"] = blast_scores[:5]

        # 4. Redundancy score: edges beyond a spanning tree, relative to the
        #    spanning tree size. Higher = more redundant paths. Only computed
        #    when the undirected projection is connected; otherwise it stays 0.0.
        undirected = g.to_undirected()
        if undirected.number_of_edges() > 0 and nx.is_connected(undirected):
            tree_edges = undirected.number_of_nodes() - 1
            actual_edges = undirected.number_of_edges()
            redundancy = (actual_edges - tree_edges) / max(tree_edges, 1)
            metrics["redundancy_score"] = round(redundancy, 3)

            if redundancy < 0.1 and agent_count > 3:
                findings.append(
                    self._low_redundancy_finding(
                        redundancy, actual_edges, list(g.nodes())
                    )
                )

        # 5. Isolated agents (zero in-degree AND zero out-degree)
        isolated = [
            node
            for node in g.nodes()
            if g.in_degree(node) == 0 and g.out_degree(node) == 0
        ]
        if isolated:
            findings.append(
                self._isolated_agents_finding(
                    isolated, [name_of(node) for node in isolated]
                )
            )

        return TestResult(
            test_name=self.name,
            status=TestStatus.PASSED,
            findings=findings,
            metrics=metrics,
        )

    def _spof_finding(self, spof_id: Any, spof_name: Any, blast: Dict[str, Any]) -> Finding:
        """Build the CRITICAL finding for one articulation point."""
        downstream = blast["downstream_agents"]
        return Finding(
            test_name=self.name,
            severity=Severity.CRITICAL,
            title=f"Single Point of Failure: {spof_name}",
            description=(
                f"Agent '{spof_name}' is an articulation point — removing it "
                f"would disconnect the agent communication graph. "
                f"Blast radius: {blast['impact_percentage']:.1f}% of agents affected."
            ),
            affected_agents=[spof_id] + downstream,
            evidence={
                "agent_id": spof_id,
                "impact_percentage": blast["impact_percentage"],
                "downstream_count": len(downstream),
            },
            remediation=(
                "Introduce redundant agents or fallback paths. "
                "Consider load balancing across multiple agents for this role. "
                "Implement circuit breakers on connections to this agent."
            ),
        )

    def _critical_path_finding(self, critical_path: Any, path_names: List[Any]) -> Finding:
        """Build the HIGH finding for a critical path of four or more agents."""
        # Copies keep the finding independent of the lists stored in the metrics.
        return Finding(
            test_name=self.name,
            severity=Severity.HIGH,
            title=f"Long critical path: {len(critical_path)} agents",
            description=(
                f"The critical path spans {len(critical_path)} agents: "
                f"{' → '.join(path_names)}. "
                "Failure anywhere on this path creates a service outage."
            ),
            affected_agents=list(critical_path),
            evidence={"path": list(critical_path), "path_names": list(path_names)},
            remediation=(
                "Shorten the critical path by parallelizing independent agents. "
                "Add checkpointing and recovery mechanisms along this path."
            ),
        )

    def _low_redundancy_finding(
        self, redundancy: float, edge_count: int, agents: List[Any]
    ) -> Finding:
        """Build the MEDIUM finding for a nearly tree-shaped (low redundancy) graph."""
        return Finding(
            test_name=self.name,
            severity=Severity.MEDIUM,
            title=f"Low redundancy score: {redundancy:.2f}",
            description=(
                f"The swarm graph has a redundancy score of {redundancy:.2f} "
                "(close to a tree structure with no alternative paths). "
                "A single edge failure could create an unreachable agent."
            ),
            affected_agents=agents,
            evidence={"redundancy_score": redundancy, "edge_count": edge_count},
            remediation=(
                "Add fallback communication paths between agents. "
                "Implement retry logic with alternative agent routing."
            ),
        )

    def _isolated_agents_finding(
        self, isolated: List[Any], isolated_names: List[Any]
    ) -> Finding:
        """Build the LOW finding listing agents without any incoming or outgoing edge."""
        return Finding(
            test_name=self.name,
            severity=Severity.LOW,
            title=f"{len(isolated)} isolated agent(s) detected",
            description=(
                f"Agents {isolated_names} have no connections to any other agent. "
                "They will never be tested under load and may represent dead code."
            ),
            affected_agents=isolated,
            evidence={"isolated_agents": isolated_names},
            remediation=(
                "Remove unused agents or integrate them into the swarm workflow."
            ),
        )
@@ -0,0 +1,120 @@
1
+ """Cascade Failure Attack — simulate agent failure propagation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any, Dict, List
7
+
8
+ from swarm_test.attacks.base import BaseAttack
9
+ from swarm_test.core.models import Finding, Severity, TestResult, TestStatus
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class CascadeFailureAttack(BaseAttack):
    """
    Simulates a cascade failure by treating each agent in turn as failed and
    measuring, via ``graph.get_blast_radius()``, how many downstream agents
    would be impacted.

    Findings are raised when a single agent failure affects:
    - at least 50% of the swarm (CRITICAL)
    - at least 25% (HIGH)
    - at least 10% (MEDIUM)

    Only the highest matching severity is reported per agent.
    """

    name = "cascade_failure"
    description = (
        "Simulates agent failures and measures downstream propagation "
        "to detect dangerous cascade paths."
    )

    # (minimum impact percentage, severity, title label)
    THRESHOLDS = [
        (50.0, Severity.CRITICAL, "Catastrophic cascade potential"),
        (25.0, Severity.HIGH, "High cascade risk"),
        (10.0, Severity.MEDIUM, "Moderate cascade risk"),
    ]

    def run(self, graph: Any) -> TestResult:
        """
        Evaluate the blast radius of every agent and flag dangerous cascades.

        Args:
            graph: Object exposing a networkx graph as ``graph.graph`` and a
                ``get_blast_radius(agent_id)`` method returning a mapping with
                ``impact_percentage``, ``downstream_agents``, ``agent_name``
                and ``total_agents``.

        Returns:
            A ``TestResult`` with one finding per agent whose impact reaches a
            threshold, plus summary metrics (ten most impactful cascade paths,
            worst impact and the agent causing it). With fewer than two agents
            the result has no findings and ``metrics`` only holds a ``note``.
        """
        nodes = list(graph.graph.nodes())

        if len(nodes) < 2:
            return TestResult(
                test_name=self.name,
                status=TestStatus.PASSED,
                findings=[],
                metrics={"note": "Need ≥2 agents for cascade analysis"},
            )

        # The first matching threshold wins, so evaluate them from the highest
        # to the lowest regardless of how THRESHOLDS happens to be declared.
        thresholds = sorted(
            self.THRESHOLDS, key=lambda entry: entry[0], reverse=True
        )

        findings: List[Finding] = []
        cascade_paths: List[Dict[str, Any]] = []
        worst_impact = 0.0
        worst_agent = None

        for agent_id in nodes:
            blast = graph.get_blast_radius(agent_id)
            impact_pct = blast["impact_percentage"]

            if impact_pct > worst_impact:
                worst_impact = impact_pct
                worst_agent = agent_id

            downstream = blast["downstream_agents"]
            if downstream:
                cascade_paths.append(
                    {
                        "agent": blast["agent_name"],
                        "downstream_count": len(downstream),
                        "impact_pct": impact_pct,
                    }
                )

            matched = next(
                (entry for entry in thresholds if impact_pct >= entry[0]), None
            )
            if matched is None:
                continue

            _, severity, label = matched
            agent_name = blast["agent_name"]
            findings.append(
                Finding(
                    test_name=self.name,
                    severity=severity,
                    title=f"{label}: {agent_name} failure cascades to {len(downstream)} agents",
                    description=(
                        f"Agent '{agent_name}' (id={agent_id}) has a blast radius of "
                        f"{impact_pct:.1f}% — failure would directly or indirectly "
                        f"impact {len(downstream)} of {blast['total_agents']} agents."
                    ),
                    affected_agents=[agent_id] + downstream,
                    evidence=blast,
                    remediation=(
                        "Introduce circuit breakers, health checks, and fallback agents "
                        "to isolate failures. Consider replicating this agent."
                    ),
                )
            )

        # Compare against None explicitly: a falsy agent id (e.g. 0 or "") is
        # still a valid agent and must not be reported as "no critical agent".
        if worst_agent is not None and worst_agent in graph.graph:
            most_critical = graph.graph.nodes[worst_agent].get("name", worst_agent)
        else:
            most_critical = None

        # Keep only the ten most impactful cascade paths, highest impact first.
        cascade_paths.sort(key=lambda path: path["impact_pct"], reverse=True)

        metrics: Dict[str, Any] = {
            "agents_tested": len(nodes),
            "max_impact_pct": round(worst_impact, 2),
            "most_critical_agent": most_critical,
            "cascade_paths": cascade_paths[:10],
        }

        return TestResult(
            test_name=self.name,
            status=TestStatus.PASSED,  # overridden by probe based on findings
            findings=findings,
            metrics=metrics,
        )
@@ -0,0 +1,247 @@
1
+ """Collusion Detection Attack — identify coordinated agent misbehaviour."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from collections import defaultdict
7
+ from typing import Any, Dict, List, Set, Tuple
8
+
9
+ import networkx as nx
10
+
11
+ from swarm_test.attacks.base import BaseAttack
12
+ from swarm_test.core.models import EventType, Finding, Severity, TestResult, TestStatus
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class CollusionDetectionAttack(BaseAttack):
    """
    Detects potential collusion between agents by analyzing:
    1. Dense communication cliques (on the undirected projection of the graph)
    2. Echo chambers — strongly connected groups mostly talking to each other
    3. Cyclic dependency groups that bypass orchestrator oversight
    4. Possible error suppression (failing edges feeding an agent that keeps
       reporting success)
    """

    name = "collusion_detection"
    description = (
        "Identifies clusters of agents with unusually dense or cyclic communication "
        "patterns that could indicate coordinated misbehaviour or oversight bypass."
    )

    # A clique of this size or larger is flagged
    MIN_CLIQUE_SIZE = 3
    # Minimum internal edge density for a group to be considered an echo chamber
    ECHO_CHAMBER_THRESHOLD = 0.8
    # Strongly connected components smaller than this are never echo chambers
    ECHO_CHAMBER_MIN_SIZE = 3
    # An echo chamber must exceed this isolation ratio
    ECHO_CHAMBER_ISOLATION_THRESHOLD = 0.6

    def run(self, graph: Any) -> TestResult:
        """
        Run all four collusion heuristics and merge their findings.

        Args:
            graph: Object exposing a directed networkx graph as ``graph.graph``
                and the recorded interaction events as ``graph.events``.

        Returns:
            A ``TestResult`` with the combined findings and one counter per
            heuristic in ``metrics``. With fewer than three agents the result
            has no findings and ``metrics`` only holds a ``note``.
        """
        g = graph.graph
        if g.number_of_nodes() < 3:
            return TestResult(
                test_name=self.name,
                status=TestStatus.PASSED,
                findings=[],
                metrics={"note": "Need ≥3 agents for collusion analysis"},
            )

        findings: List[Finding] = []
        metrics: Dict[str, Any] = {
            "cliques_found": 0,
            "echo_chambers": 0,
            "cyclic_groups": 0,
            "suppressed_error_pairs": 0,
        }

        # 1. Dense clique detection on undirected projection
        undirected = g.to_undirected(as_view=True)
        cliques = [
            clique
            for clique in nx.find_cliques(undirected)
            if len(clique) >= self.MIN_CLIQUE_SIZE
        ]
        metrics["cliques_found"] = len(cliques)

        for clique in cliques:
            agent_names = [g.nodes[member].get("name", member) for member in clique]
            findings.append(
                Finding(
                    test_name=self.name,
                    severity=Severity.HIGH,
                    title=f"Dense communication clique detected ({len(clique)} agents)",
                    description=(
                        f"Agents {agent_names} form a fully-connected communication clique. "
                        f"This dense subgraph may indicate coordinated behaviour "
                        f"or information sharing outside the orchestrator's oversight."
                    ),
                    affected_agents=clique,
                    # Copy so the evidence does not alias ``affected_agents``.
                    evidence={"clique": list(clique), "agent_names": agent_names},
                    remediation=(
                        "Audit communication logs for this agent cluster. "
                        "Enforce hub-and-spoke topology via an orchestrator agent. "
                        "Add rate limiting on peer-to-peer agent communication."
                    ),
                )
            )

        # 2. Echo chamber detection — groups mostly talking among themselves
        echo_findings = self._detect_echo_chambers(graph)
        metrics["echo_chambers"] = len(echo_findings)
        findings.extend(echo_findings)

        # 3. Cyclic dependency groups that loop without orchestrator
        cycle_findings = self._detect_collusion_cycles(graph)
        metrics["cyclic_groups"] = len(cycle_findings)
        findings.extend(cycle_findings)

        # 4. Possible error suppression
        suppression_findings = self._detect_error_suppression(graph)
        metrics["suppressed_error_pairs"] = len(suppression_findings)
        findings.extend(suppression_findings)

        return TestResult(
            test_name=self.name,
            status=TestStatus.PASSED,
            findings=findings,
            metrics=metrics,
        )

    def _detect_echo_chambers(self, graph: Any) -> List[Finding]:
        """
        Detect strongly connected components that form isolated echo chambers.

        A component is reported when it has at least ``ECHO_CHAMBER_MIN_SIZE``
        members, its internal edge density reaches ``ECHO_CHAMBER_THRESHOLD``
        and its isolation ratio exceeds ``ECHO_CHAMBER_ISOLATION_THRESHOLD``.
        The isolation ratio is one minus the share of all graph edges that
        cross the component boundary.
        """
        findings: List[Finding] = []
        g = graph.graph
        total_edges = g.number_of_edges()

        for scc in nx.strongly_connected_components(g):
            size = len(scc)
            if size < self.ECHO_CHAMBER_MIN_SIZE:
                continue

            # Self-loops are excluded: the denominator only counts ordered pairs
            # of distinct agents, so counting them could push density above 100%.
            internal_edges = sum(
                1 for src, dst in g.subgraph(scc).edges() if src != dst
            )
            possible_edges = size * (size - 1)  # directed
            density = internal_edges / possible_edges if possible_edges > 0 else 0

            # Edges with exactly one endpoint inside the component.
            external_edges = sum(
                1 for src, dst in g.edges() if (src in scc) != (dst in scc)
            )
            isolation_ratio = 1 - (external_edges / max(total_edges, 1))

            if (
                density < self.ECHO_CHAMBER_THRESHOLD
                or isolation_ratio <= self.ECHO_CHAMBER_ISOLATION_THRESHOLD
            ):
                continue

            agent_names = [g.nodes[member].get("name", member) for member in scc]
            findings.append(
                Finding(
                    test_name=self.name,
                    severity=Severity.MEDIUM,
                    title=f"Echo chamber: {size} agents with {density:.0%} internal density",
                    description=(
                        f"Agents {agent_names} form an echo chamber: "
                        f"{density:.0%} internal communication density, "
                        f"{isolation_ratio:.0%} isolation from the rest of the swarm."
                    ),
                    affected_agents=list(scc),
                    evidence={
                        "density": round(density, 3),
                        "isolation_ratio": round(isolation_ratio, 3),
                        "internal_edges": internal_edges,
                        "external_edges": external_edges,
                    },
                    remediation=(
                        "Break up isolated agent clusters. Require all agent "
                        "sub-groups to report through a central orchestrator."
                    ),
                )
            )
        return findings

    @staticmethod
    def _detect_collusion_cycles(graph: Any) -> List[Finding]:
        """
        Flag cycles that bypass every orchestrator/manager node.

        Orchestrators are the nodes whose ``role`` attribute contains one of
        "manager", "orchestrator", "coordinator" or "planner" (case-insensitive).
        When no such node exists, nothing is reported because a bypass cannot
        be determined.
        """
        findings: List[Finding] = []
        g = graph.graph

        # Identify orchestrator nodes by the keywords found in their role.
        role_keywords = ("manager", "orchestrator", "coordinator", "planner")
        orchestrators: Set[str] = set()
        for node, attrs in g.nodes(data=True):
            # A missing, None or non-string role must not crash the scan.
            role = str(attrs.get("role") or "").lower()
            if any(keyword in role for keyword in role_keywords):
                orchestrators.add(node)

        if not orchestrators:
            return []  # Can't determine bypass without known orchestrators

        # Stable ordering keeps the reported evidence reproducible between runs.
        orchestrator_ids = sorted(orchestrators, key=str)

        # Consume the cycle generator lazily instead of materialising every
        # cycle up front; the number of simple cycles can be very large.
        for cycle in nx.simple_cycles(g):
            if not orchestrators.isdisjoint(cycle):
                continue  # at least one orchestrator takes part in this cycle

            agent_names = [g.nodes[member].get("name", member) for member in cycle]
            findings.append(
                Finding(
                    test_name=CollusionDetectionAttack.name,
                    severity=Severity.HIGH,
                    title=f"Orchestrator-bypass cycle: {' → '.join(agent_names)}",
                    description=(
                        f"Agents {agent_names} form a cyclic dependency that "
                        "completely bypasses any orchestrator or manager agent. "
                        "This allows coordinated actions without oversight."
                    ),
                    affected_agents=cycle,
                    evidence={
                        "cycle": list(cycle),
                        "orchestrators": list(orchestrator_ids),
                    },
                    remediation=(
                        "Redesign the agent graph so all cycles pass through "
                        "an orchestrator node that can audit decisions."
                    ),
                )
            )
        return findings

    @staticmethod
    def _detect_error_suppression(graph: Any) -> List[Finding]:
        """
        Detect edges whose events mostly fail while the receiving agent keeps
        emitting successful events (possible error suppression).

        An edge ``src → dst`` is reported when it carries at least three events,
        more than half of them failed, and ``dst`` is the source of at least as
        many successful events as there were failures on the edge. Event
        ordering is not taken into account.
        """
        findings: List[Finding] = []
        nodes = graph.graph.nodes

        def label(agent_id: Any) -> Any:
            # Display name of an agent; falls back to the id when the agent is
            # unknown to the graph or carries no "name" attribute.
            return nodes.get(agent_id, {}).get("name", agent_id)

        # Single pass: group events by (src, dst) edge and count the successful
        # events per emitting agent, so the per-edge check below does not have
        # to rescan the whole event list.
        edge_events: Dict[Tuple[str, str], List[Any]] = defaultdict(list)
        successes_by_source: Dict[str, int] = defaultdict(int)
        for event in graph.events:
            edge_events[(event.source_agent_id, event.target_agent_id)].append(event)
            if event.success:
                successes_by_source[event.source_agent_id] += 1

        for (src, dst), evts in edge_events.items():
            total = len(evts)
            if total < 3:
                continue

            failed = sum(1 for e in evts if not e.success)
            failure_rate = failed / total
            if failure_rate <= 0.5:
                continue

            # ``failed`` is at least 1 here, so this also guarantees that the
            # downstream agent produced at least one successful event.
            if successes_by_source.get(dst, 0) < failed:
                continue

            src_name = label(src)
            dst_name = label(dst)
            findings.append(
                Finding(
                    test_name=CollusionDetectionAttack.name,
                    severity=Severity.MEDIUM,
                    title=f"Possible error suppression: {src_name} → {dst_name}",
                    description=(
                        f"Edge {src_name}→{dst_name} has a {failure_rate:.0%} failure rate "
                        f"({failed}/{total} events failed), yet downstream agent "
                        f"'{dst_name}' continues reporting success. "
                        "This may indicate coordinated error suppression."
                    ),
                    affected_agents=[src, dst],
                    evidence={
                        "failure_rate": round(failure_rate, 3),
                        "failed_events": failed,
                        "total_events": total,
                    },
                    remediation=(
                        "Implement independent health monitoring for each agent. "
                        "Require agents to propagate failure signals up the chain."
                    ),
                )
            )
        return findings