agentarmour-toolkit 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. agentarmour_toolkit-0.1.1/.github/workflows/ci.yml +34 -0
  2. agentarmour_toolkit-0.1.1/.gitignore +1 -0
  3. agentarmour_toolkit-0.1.1/.python-version +1 -0
  4. agentarmour_toolkit-0.1.1/LICENSE +9 -0
  5. agentarmour_toolkit-0.1.1/PKG-INFO +326 -0
  6. agentarmour_toolkit-0.1.1/README.md +276 -0
  7. agentarmour_toolkit-0.1.1/agentarmour/__init__.py +3 -0
  8. agentarmour_toolkit-0.1.1/agentarmour/cascadebreaker/__init__.py +34 -0
  9. agentarmour_toolkit-0.1.1/agentarmour/cascadebreaker/breaker.py +310 -0
  10. agentarmour_toolkit-0.1.1/agentarmour/cascadebreaker/cli.py +183 -0
  11. agentarmour_toolkit-0.1.1/agentarmour/cascadebreaker/config.py +43 -0
  12. agentarmour_toolkit-0.1.1/agentarmour/cascadebreaker/dashboard/__init__.py +0 -0
  13. agentarmour_toolkit-0.1.1/agentarmour/cascadebreaker/dashboard/app.py +148 -0
  14. agentarmour_toolkit-0.1.1/agentarmour/cascadebreaker/guard.py +205 -0
  15. agentarmour_toolkit-0.1.1/agentarmour/cascadebreaker/registry.py +62 -0
  16. agentarmour_toolkit-0.1.1/agentarmour/cascadebreaker/states.py +243 -0
  17. agentarmour_toolkit-0.1.1/agentarmour/cascadebreaker/storage/__init__.py +0 -0
  18. agentarmour_toolkit-0.1.1/agentarmour/cascadebreaker/storage/base.py +37 -0
  19. agentarmour_toolkit-0.1.1/agentarmour/cascadebreaker/storage/sqlite_ledger.py +148 -0
  20. agentarmour_toolkit-0.1.1/agentarmour/cascadebreaker/strategies.py +243 -0
  21. agentarmour_toolkit-0.1.1/benchmark_latency.py +63 -0
  22. agentarmour_toolkit-0.1.1/cascadebreaker.db +0 -0
  23. agentarmour_toolkit-0.1.1/examples/basic_usage.py +103 -0
  24. agentarmour_toolkit-0.1.1/examples/cascadebreaker_basic.py +101 -0
  25. agentarmour_toolkit-0.1.1/examples/langgraph_example.py +131 -0
  26. agentarmour_toolkit-0.1.1/pyproject.toml +86 -0
  27. agentarmour_toolkit-0.1.1/test.ipynb +400 -0
  28. agentarmour_toolkit-0.1.1/test_audit.db +0 -0
  29. agentarmour_toolkit-0.1.1/test_storage.py +60 -0
  30. agentarmour_toolkit-0.1.1/test_traceback.py +31 -0
  31. agentarmour_toolkit-0.1.1/tests/__init__.py +0 -0
  32. agentarmour_toolkit-0.1.1/tests/unit/__init__.py +0 -0
  33. agentarmour_toolkit-0.1.1/tests/unit/test_breaker.py +207 -0
  34. agentarmour_toolkit-0.1.1/tests/unit/test_states.py +172 -0
  35. agentarmour_toolkit-0.1.1/tests/unit/test_storage.py +71 -0
  36. agentarmour_toolkit-0.1.1/uv.lock +2494 -0
@@ -0,0 +1,34 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ name: Tests (Python ${{ matrix.python-version }})
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ fail-fast: false
15
+ matrix:
16
+ python-version: ["3.10", "3.11", "3.12"]
17
+
18
+ steps:
19
+ - name: Check out the repo
20
+ uses: actions/checkout@v4
21
+
22
+ - name: Install uv
23
+ uses: astral-sh/setup-uv@v5
24
+ with:
25
+ python-version: ${{ matrix.python-version }}
26
+
27
+ - name: Install dependencies
28
+ run: uv sync --all-extras
29
+
30
+ - name: Run linter (ruff)
31
+ run: uv run ruff check agentarmour/
32
+
33
+ - name: Run tests
34
+ run: uv run pytest tests/ -v
@@ -0,0 +1 @@
1
+ notes.txt
@@ -0,0 +1 @@
1
+ 3.10
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Saravanan S
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,326 @@
1
+ Metadata-Version: 2.4
2
+ Name: agentarmour-toolkit
3
+ Version: 0.1.1
4
+ Summary: Production reliability suite for LangChain/LangGraph multi-agent systems
5
+ Project-URL: Homepage, https://github.com/Saravanan-SD/agentarmour
6
+ Project-URL: Repository, https://github.com/Saravanan-SD/agentarmour
7
+ Project-URL: Issues, https://github.com/Saravanan-SD/agentarmour/issues
8
+ Author-email: Saravanan S <saravanansd634@email.com>
9
+ License: MIT License
10
+
11
+ Copyright (c) 2026 Saravanan S
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
14
+
15
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
18
+ License-File: LICENSE
19
+ Keywords: agentarmour,circuit-breaker,fault-tolerance,langchain,langgraph,llm,multi-agent,self-healing
20
+ Classifier: Development Status :: 3 - Alpha
21
+ Classifier: Intended Audience :: Developers
22
+ Classifier: License :: OSI Approved :: MIT License
23
+ Classifier: Programming Language :: Python :: 3
24
+ Classifier: Programming Language :: Python :: 3.10
25
+ Classifier: Programming Language :: Python :: 3.11
26
+ Classifier: Programming Language :: Python :: 3.12
27
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
28
+ Requires-Python: >=3.10
29
+ Requires-Dist: pydantic>=2.0.0
30
+ Requires-Dist: structlog>=24.0.0
31
+ Provides-Extra: all
32
+ Requires-Dist: agentarmour-toolkit[cascadebreaker,dashboard,dev,storage]; extra == 'all'
33
+ Provides-Extra: cascadebreaker
34
+ Requires-Dist: langchain-core>=0.2.0; extra == 'cascadebreaker'
35
+ Requires-Dist: langgraph>=0.2.0; extra == 'cascadebreaker'
36
+ Provides-Extra: dashboard
37
+ Requires-Dist: pandas>=2.0.0; extra == 'dashboard'
38
+ Requires-Dist: plotly>=5.0.0; extra == 'dashboard'
39
+ Requires-Dist: streamlit>=1.35.0; extra == 'dashboard'
40
+ Provides-Extra: dev
41
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
42
+ Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
43
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
44
+ Requires-Dist: ruff>=0.4.0; extra == 'dev'
45
+ Provides-Extra: langgraph
46
+ Requires-Dist: langchain-core>=1.3.2; extra == 'langgraph'
47
+ Requires-Dist: langgraph>=1.1.10; extra == 'langgraph'
48
+ Provides-Extra: storage
49
+ Description-Content-Type: text/markdown
50
+
51
+ # CascadeBreaker
52
+
53
+ **Circuit breaker and self-healing layer for LangGraph multi-agent systems.**
54
+
55
+ Part of the [AgentArmour](https://github.com/Saravanan-SD/agentarmour) reliability suite.
56
+
57
+ ![CI](https://github.com/Saravanan-SD/agentarmour/actions/workflows/ci.yml/badge.svg)
58
+ ![Python](https://img.shields.io/badge/python-3.10%2B-blue)
59
+ ![License](https://img.shields.io/badge/license-MIT-green)
60
+ ![PyPI](https://img.shields.io/pypi/v/agentarmour-toolkit)
61
+
62
+ ## Why This Exists
63
+
64
+ Existing circuit breaker tools for LLMs (`llm-circuit`, `aeneassoft`, `llm-cascade`) only protect against **LLM API provider outages**. OpenAI down, Anthropic rate-limited.
65
+
66
+ They do nothing about what actually breaks production multi-agent systems: an agent stuck in a reasoning loop, a hallucinated value silently poisoning shared state, one agent's failure cascading through every downstream node.
67
+
68
+ A March 2025 paper, *["Why Do Multi-Agent LLM Systems Fail?"](https://arxiv.org/abs/2503.13657)* (Cemri, Pan, Yang, Agrawal, Chopra, Tiwari, Keutzer, Parameswaran, Klein, Ramchandran, Zaharia, Gonzalez, and Stoica), analysed over 1,600 execution traces across seven multi-agent frameworks and identified 14 distinct failure modes. None of them involve an API going down.
69
+
70
+ CascadeBreaker operates one level below the API, at the individual LangGraph node.
71
+
72
+ ## Install
73
+
74
+ ```bash
75
+ pip install agentarmour-toolkit
76
+ ```
77
+
78
+ Core install pulls in only two dependencies: `pydantic` and `structlog`. Everything else is optional, installed only when you need it:
79
+
80
+ ```bash
81
+ pip install agentarmour-toolkit[langgraph] # LangGraph/LangChain integration
82
+ pip install agentarmour-toolkit[dashboard] # Streamlit dashboard
83
+ pip install agentarmour-toolkit[dev] # pytest, ruff, dev tools
84
+ pip install agentarmour-toolkit[all] # everything
85
+ ```
86
+
87
+ Verified: a clean install of the base package brings in exactly 7 packages total (the library, `pydantic`, `structlog`, and their own small dependencies), nothing else.
88
+
89
+ ## Quick Start
90
+
91
+ ```python
92
+ from agentarmour.cascadebreaker import CircuitBreaker, BreakerConfig
93
+ from agentarmour.cascadebreaker.strategies import CacheStrategy
94
+
95
+ breaker = CircuitBreaker(
96
+ name="research_agent",
97
+ config=BreakerConfig(failure_threshold=3, recovery_timeout=30),
98
+ fallback_strategy=CacheStrategy(max_age_seconds=300),
99
+ )
100
+
101
+ @breaker.protect
102
+ async def research_node(state: dict) -> dict:
103
+ result = await llm_chain.ainvoke(state["query"])
104
+ return {**state, "research": result}
105
+ ```
106
+
107
+ One decorator. The breaker cycles through CLOSED → OPEN → HALF_OPEN automatically based on real failures, no manual intervention needed.
108
+
109
+ ## The Four Fallback Strategies
110
+
111
+ When the breaker is OPEN, something still has to respond. Pick the strategy that fits each node.
112
+
113
+ ### CacheStrategy — return the last good response
114
+
115
+ ```python
116
+ from agentarmour.cascadebreaker.strategies import CacheStrategy
117
+
118
+ breaker = CircuitBreaker(
119
+ name="summary_agent",
120
+ config=BreakerConfig(failure_threshold=3),
121
+ fallback_strategy=CacheStrategy(max_age_seconds=300),
122
+ )
123
+ ```
124
+
125
+ Good when output doesn't shift drastically minute to minute, and "slightly stale but correct" beats nothing. The cache is populated automatically every time the real agent succeeds.
126
+
127
+ ### DegradeStrategy — fall back to a simpler agent
128
+
129
+ ```python
130
+ from agentarmour.cascadebreaker.strategies import DegradeStrategy
131
+
132
+ async def cheap_backup_agent(state: dict) -> dict:
133
+ result = await gpt35_chain.ainvoke(state["query"])
134
+ return {**state, "research": result}
135
+
136
+ breaker = CircuitBreaker(
137
+ name="research_agent",
138
+ config=BreakerConfig(failure_threshold=2),
139
+ fallback_strategy=DegradeStrategy(backup_fn=cheap_backup_agent, confidence_override=0.6),
140
+ )
141
+ ```
142
+
143
+ Good when you have a cheaper, more reliable backup model available.
144
+
145
+ ### EscalateStrategy — alert a human
146
+
147
+ ```python
148
+ from agentarmour.cascadebreaker.strategies import EscalateStrategy
149
+
150
+ async def notify_oncall(breaker_name: str, state: dict, context: dict) -> dict | None:
151
+ await slack_client.post(channel="#incidents", text=f"Circuit '{breaker_name}' OPEN")
152
+ return None # don't block the pipeline waiting for a human
153
+
154
+ breaker = CircuitBreaker(
155
+ name="payment_validation_agent",
156
+ config=BreakerConfig(failure_threshold=1),
157
+ fallback_strategy=EscalateStrategy(escalation_fn=notify_oncall, notification_only=True),
158
+ )
159
+ ```
160
+
161
+ Good for anything where a wrong answer is worse than a delayed one.
162
+
163
+ ### DecomposeStrategy — break the task into smaller pieces
164
+
165
+ ```python
166
+ from agentarmour.cascadebreaker.strategies import DecomposeStrategy
167
+
168
+ async def split_into_chunks(state: dict) -> list[dict]:
169
+ return [{**state, "chunk": c} for c in state["documents"]]
170
+
171
+ async def process_chunk(sub_state: dict) -> dict:
172
+ return {"result": await llm.ainvoke(sub_state["chunk"])}
173
+
174
+ breaker = CircuitBreaker(
175
+ name="batch_summary_agent",
176
+ config=BreakerConfig(failure_threshold=2),
177
+ fallback_strategy=DecomposeStrategy(decompose_fn=split_into_chunks, execute_fn=process_chunk),
178
+ )
179
+ ```
180
+
181
+ Good when the failure mode is the task being too large or complex for one agent call.
182
+
183
+ ## Cross-Agent Contamination Guard
184
+
185
+ A circuit breaker catches loud failures. It does not catch an agent that "succeeds" while quietly writing corrupted data into shared state, which the next agent then trusts and builds on. `CascadeGuard` closes that gap.
186
+
187
+ ```python
188
+ from agentarmour.cascadebreaker import CascadeGuard
189
+
190
+ guard = CascadeGuard(quarantine_ttl_seconds=300)
191
+
192
+ @guard.protect_node(
193
+ "extract_agent",
194
+ quarantine_on_failure=["extracted_entities"],
195
+ reads_from=["raw_document"],
196
+ )
197
+ async def extract_node(state: dict) -> dict:
198
+ state["extracted_entities"] = await extract_llm.ainvoke(state["raw_document"])
199
+ return state
200
+
201
+ @guard.protect_node(
202
+ "analyse_agent",
203
+ reads_from=["extracted_entities"],
204
+ )
205
+ async def analyse_node(state: dict) -> dict:
206
+ entities = state.get("extracted_entities")
207
+ if entities is None:
208
+ return {**state, "analysis": "Entities unavailable, upstream agent degraded."}
209
+ return {**state, "analysis": await analyse_llm.ainvoke(entities)}
210
+ ```
211
+
212
+ If `extract_agent` fails, `extracted_entities` gets quarantined for 5 minutes. `analyse_node` receives `None` for that field instead of inheriting garbage, and handles it explicitly.
213
+
214
+ ## Debugging Without Crashing the Pipeline
215
+
216
+ When an agent fails, the breaker swallows the exception so your pipeline keeps running, but the full original stack trace is preserved and attached to the returned state:
217
+
218
+ ```python
219
+ result = await protected_node(state)
220
+
221
+ if "__cascadebreaker_traceback__" in result:
222
+ print("Something failed upstream:")
223
+ print(result["__cascadebreaker_traceback__"])
224
+ ```
225
+
226
+ The trace includes the exact file, line, and function where the original exception occurred, even though nothing was ever raised to the caller.
227
+
228
+ ## Audit Ledger
229
+
230
+ Every failure and state transition is logged to a local SQLite file, zero extra dependencies (built on Python's standard `sqlite3` + `asyncio.to_thread`, so it works even in the base install).
231
+
232
+ ```python
233
+ from agentarmour.cascadebreaker.storage.sqlite_ledger import SQLiteLedger
234
+
235
+ breaker = CircuitBreaker(
236
+ name="research_agent",
237
+ config=BreakerConfig(),
238
+ fallback_strategy=CacheStrategy(),
239
+ ledger=SQLiteLedger(), # writes to cascadebreaker.db by default
240
+ )
241
+ ```
242
+
243
+ Inspect it from the terminal:
244
+
245
+ ```bash
246
+ agentarmour ledger summary
247
+ agentarmour ledger failures --breaker research_agent --limit 10
248
+ agentarmour ledger transitions
249
+ ```
250
+
251
+ Or visually, with the dashboard (requires `pip install agentarmour-toolkit[dashboard]`):
252
+
253
+ ```bash
254
+ streamlit run agentarmour/cascadebreaker/dashboard/app.py
255
+ ```
256
+
257
+ Shows live metrics, current state per breaker, a failure timeline chart, and recent failure/transition tables, all reading from the same SQLite file.
258
+
259
+ ## Performance
260
+
261
+ Benchmarked across 5,000 calls: wrapping a node with `@breaker.protect` adds roughly **4 microseconds** of overhead per call. Against a typical LLM call (200ms to 3000ms), that's well under 0.01% of total latency. The wrapper will never be the bottleneck in a real pipeline.
262
+
263
+ ## Known Limitations
264
+
265
+ Stated plainly, not hidden:
266
+
267
+ - **Single-process only.** The breaker's state machine uses `asyncio.Lock`, which coordinates concurrent tasks within one Python process. Running multiple replicas (separate containers, separate pods) means each one tracks its own independent circuit state. They do not share state across processes.
268
+ - **Postgres ledger not yet built.** `SQLiteLedger` is fully implemented and tested. A `PostgresLedger` for centralized, multi-instance audit logging is planned but not built, since it has not yet been tested against a real Postgres instance.
269
+
270
+ ## Architecture
271
+ agentarmour/cascadebreaker/
272
+
273
+ ├── config.py # Pydantic configuration (BreakerConfig, StorageConfig)
274
+
275
+ ├── states.py # BreakerStateMachine — CLOSED/OPEN/HALF_OPEN logic
276
+
277
+ ├── breaker.py # CircuitBreaker — decorator + core execution
278
+
279
+ ├── strategies.py # CACHE / DEGRADE / ESCALATE / DECOMPOSE
280
+
281
+ ├── guard.py # CascadeGuard — cross-agent contamination protection
282
+
283
+ ├── registry.py # BreakerRegistry — process-wide discovery
284
+
285
+ ├── cli.py # Terminal inspection of the audit ledger
286
+
287
+ ├── storage/ # SQLite audit ledger (stdlib only, zero dependencies)
288
+
289
+ └── dashboard/ # Streamlit live dashboard
290
+
291
+ ## Running the Examples
292
+
293
+ ```bash
294
+ # Zero dependencies needed
295
+ python examples/basic_usage.py
296
+
297
+ # Requires pip install agentarmour-toolkit[langgraph]
298
+ python examples/langgraph_example.py
299
+ ```
300
+
301
+ ## Running Tests
302
+
303
+ ```bash
304
+ pip install agentarmour-toolkit[dev]
305
+ pytest tests/ -v
306
+ ```
307
+
308
+ 24 tests, covering the state machine, fallback paths, timeout handling, traceback capture, and the audit ledger. CI runs this automatically across Python 3.10, 3.11, and 3.12 on every push and pull request targeting `main`.
309
+
310
+ ## Roadmap
311
+
312
+ CascadeBreaker is the first module in the AgentArmour suite. Planned next, in order:
313
+
314
+ - **AgentBudget** — cost and rate-limit control
315
+ - **ToolGuard** — protection against hallucinated tool calls
316
+ - **AgentMock** — reliable testing for non-deterministic agents
317
+
318
+ Each module ships completely before the next one starts.
319
+
320
+ ## Credit
321
+
322
+ The failure taxonomy this project is built around comes from Mert Cemri, Melissa Z. Pan, Shuyi Yang, Lakshya A. Agrawal, Bhavya Chopra, Rishabh Tiwari, Kurt Keutzer, Aditya Parameswaran, Dan Klein, Kannan Ramchandran, Matei Zaharia, Joseph E. Gonzalez, and Ion Stoica. *["Why Do Multi-Agent LLM Systems Fail?"](https://arxiv.org/abs/2503.13657)*, March 2025.
323
+
324
+ ## License
325
+
326
+ MIT
@@ -0,0 +1,276 @@
1
+ # CascadeBreaker
2
+
3
+ **Circuit breaker and self-healing layer for LangGraph multi-agent systems.**
4
+
5
+ Part of the [AgentArmour](https://github.com/Saravanan-SD/agentarmour) reliability suite.
6
+
7
+ ![CI](https://github.com/Saravanan-SD/agentarmour/actions/workflows/ci.yml/badge.svg)
8
+ ![Python](https://img.shields.io/badge/python-3.10%2B-blue)
9
+ ![License](https://img.shields.io/badge/license-MIT-green)
10
+ ![PyPI](https://img.shields.io/pypi/v/agentarmour-toolkit)
11
+
12
+ ## Why This Exists
13
+
14
+ Existing circuit breaker tools for LLMs (`llm-circuit`, `aeneassoft`, `llm-cascade`) only protect against **LLM API provider outages**. OpenAI down, Anthropic rate-limited.
15
+
16
+ They do nothing about what actually breaks production multi-agent systems: an agent stuck in a reasoning loop, a hallucinated value silently poisoning shared state, one agent's failure cascading through every downstream node.
17
+
18
+ A March 2025 paper, *["Why Do Multi-Agent LLM Systems Fail?"](https://arxiv.org/abs/2503.13657)* (Cemri, Pan, Yang, Agrawal, Chopra, Tiwari, Keutzer, Parameswaran, Klein, Ramchandran, Zaharia, Gonzalez, and Stoica), analysed over 1,600 execution traces across seven multi-agent frameworks and identified 14 distinct failure modes. None of them involve an API going down.
19
+
20
+ CascadeBreaker operates one level below the API, at the individual LangGraph node.
21
+
22
+ ## Install
23
+
24
+ ```bash
25
+ pip install agentarmour-toolkit
26
+ ```
27
+
28
+ Core install pulls in only two dependencies: `pydantic` and `structlog`. Everything else is optional, installed only when you need it:
29
+
30
+ ```bash
31
+ pip install agentarmour-toolkit[langgraph] # LangGraph/LangChain integration
32
+ pip install agentarmour-toolkit[dashboard] # Streamlit dashboard
33
+ pip install agentarmour-toolkit[dev] # pytest, ruff, dev tools
34
+ pip install agentarmour-toolkit[all] # everything
35
+ ```
36
+
37
+ Verified: a clean install of the base package brings in exactly 7 packages total (the library, `pydantic`, `structlog`, and their own small dependencies), nothing else.
38
+
39
+ ## Quick Start
40
+
41
+ ```python
42
+ from agentarmour.cascadebreaker import CircuitBreaker, BreakerConfig
43
+ from agentarmour.cascadebreaker.strategies import CacheStrategy
44
+
45
+ breaker = CircuitBreaker(
46
+ name="research_agent",
47
+ config=BreakerConfig(failure_threshold=3, recovery_timeout=30),
48
+ fallback_strategy=CacheStrategy(max_age_seconds=300),
49
+ )
50
+
51
+ @breaker.protect
52
+ async def research_node(state: dict) -> dict:
53
+ result = await llm_chain.ainvoke(state["query"])
54
+ return {**state, "research": result}
55
+ ```
56
+
57
+ One decorator. The breaker cycles through CLOSED → OPEN → HALF_OPEN automatically based on real failures, no manual intervention needed.
58
+
59
+ ## The Four Fallback Strategies
60
+
61
+ When the breaker is OPEN, something still has to respond. Pick the strategy that fits each node.
62
+
63
+ ### CacheStrategy — return the last good response
64
+
65
+ ```python
66
+ from agentarmour.cascadebreaker.strategies import CacheStrategy
67
+
68
+ breaker = CircuitBreaker(
69
+ name="summary_agent",
70
+ config=BreakerConfig(failure_threshold=3),
71
+ fallback_strategy=CacheStrategy(max_age_seconds=300),
72
+ )
73
+ ```
74
+
75
+ Good when output doesn't shift drastically minute to minute, and "slightly stale but correct" beats nothing. The cache is populated automatically every time the real agent succeeds.
76
+
77
+ ### DegradeStrategy — fall back to a simpler agent
78
+
79
+ ```python
80
+ from agentarmour.cascadebreaker.strategies import DegradeStrategy
81
+
82
+ async def cheap_backup_agent(state: dict) -> dict:
83
+ result = await gpt35_chain.ainvoke(state["query"])
84
+ return {**state, "research": result}
85
+
86
+ breaker = CircuitBreaker(
87
+ name="research_agent",
88
+ config=BreakerConfig(failure_threshold=2),
89
+ fallback_strategy=DegradeStrategy(backup_fn=cheap_backup_agent, confidence_override=0.6),
90
+ )
91
+ ```
92
+
93
+ Good when you have a cheaper, more reliable backup model available.
94
+
95
+ ### EscalateStrategy — alert a human
96
+
97
+ ```python
98
+ from agentarmour.cascadebreaker.strategies import EscalateStrategy
99
+
100
+ async def notify_oncall(breaker_name: str, state: dict, context: dict) -> dict | None:
101
+ await slack_client.post(channel="#incidents", text=f"Circuit '{breaker_name}' OPEN")
102
+ return None # don't block the pipeline waiting for a human
103
+
104
+ breaker = CircuitBreaker(
105
+ name="payment_validation_agent",
106
+ config=BreakerConfig(failure_threshold=1),
107
+ fallback_strategy=EscalateStrategy(escalation_fn=notify_oncall, notification_only=True),
108
+ )
109
+ ```
110
+
111
+ Good for anything where a wrong answer is worse than a delayed one.
112
+
113
+ ### DecomposeStrategy — break the task into smaller pieces
114
+
115
+ ```python
116
+ from agentarmour.cascadebreaker.strategies import DecomposeStrategy
117
+
118
+ async def split_into_chunks(state: dict) -> list[dict]:
119
+ return [{**state, "chunk": c} for c in state["documents"]]
120
+
121
+ async def process_chunk(sub_state: dict) -> dict:
122
+ return {"result": await llm.ainvoke(sub_state["chunk"])}
123
+
124
+ breaker = CircuitBreaker(
125
+ name="batch_summary_agent",
126
+ config=BreakerConfig(failure_threshold=2),
127
+ fallback_strategy=DecomposeStrategy(decompose_fn=split_into_chunks, execute_fn=process_chunk),
128
+ )
129
+ ```
130
+
131
+ Good when the failure mode is the task being too large or complex for one agent call.
132
+
133
+ ## Cross-Agent Contamination Guard
134
+
135
+ A circuit breaker catches loud failures. It does not catch an agent that "succeeds" while quietly writing corrupted data into shared state, which the next agent then trusts and builds on. `CascadeGuard` closes that gap.
136
+
137
+ ```python
138
+ from agentarmour.cascadebreaker import CascadeGuard
139
+
140
+ guard = CascadeGuard(quarantine_ttl_seconds=300)
141
+
142
+ @guard.protect_node(
143
+ "extract_agent",
144
+ quarantine_on_failure=["extracted_entities"],
145
+ reads_from=["raw_document"],
146
+ )
147
+ async def extract_node(state: dict) -> dict:
148
+ state["extracted_entities"] = await extract_llm.ainvoke(state["raw_document"])
149
+ return state
150
+
151
+ @guard.protect_node(
152
+ "analyse_agent",
153
+ reads_from=["extracted_entities"],
154
+ )
155
+ async def analyse_node(state: dict) -> dict:
156
+ entities = state.get("extracted_entities")
157
+ if entities is None:
158
+ return {**state, "analysis": "Entities unavailable, upstream agent degraded."}
159
+ return {**state, "analysis": await analyse_llm.ainvoke(entities)}
160
+ ```
161
+
162
+ If `extract_agent` fails, `extracted_entities` gets quarantined for 5 minutes. `analyse_node` receives `None` for that field instead of inheriting garbage, and handles it explicitly.
163
+
164
+ ## Debugging Without Crashing the Pipeline
165
+
166
+ When an agent fails, the breaker swallows the exception so your pipeline keeps running, but the full original stack trace is preserved and attached to the returned state:
167
+
168
+ ```python
169
+ result = await protected_node(state)
170
+
171
+ if "__cascadebreaker_traceback__" in result:
172
+ print("Something failed upstream:")
173
+ print(result["__cascadebreaker_traceback__"])
174
+ ```
175
+
176
+ The trace includes the exact file, line, and function where the original exception occurred, even though nothing was ever raised to the caller.
177
+
178
+ ## Audit Ledger
179
+
180
+ Every failure and state transition is logged to a local SQLite file, zero extra dependencies (built on Python's standard `sqlite3` + `asyncio.to_thread`, so it works even in the base install).
181
+
182
+ ```python
183
+ from agentarmour.cascadebreaker.storage.sqlite_ledger import SQLiteLedger
184
+
185
+ breaker = CircuitBreaker(
186
+ name="research_agent",
187
+ config=BreakerConfig(),
188
+ fallback_strategy=CacheStrategy(),
189
+ ledger=SQLiteLedger(), # writes to cascadebreaker.db by default
190
+ )
191
+ ```
192
+
193
+ Inspect it from the terminal:
194
+
195
+ ```bash
196
+ agentarmour ledger summary
197
+ agentarmour ledger failures --breaker research_agent --limit 10
198
+ agentarmour ledger transitions
199
+ ```
200
+
201
+ Or visually, with the dashboard (requires `pip install agentarmour-toolkit[dashboard]`):
202
+
203
+ ```bash
204
+ streamlit run agentarmour/cascadebreaker/dashboard/app.py
205
+ ```
206
+
207
+ Shows live metrics, current state per breaker, a failure timeline chart, and recent failure/transition tables, all reading from the same SQLite file.
208
+
209
+ ## Performance
210
+
211
+ Benchmarked across 5,000 calls: wrapping a node with `@breaker.protect` adds roughly **4 microseconds** of overhead per call. Against a typical LLM call (200ms to 3000ms), that's well under 0.01% of total latency. The wrapper will never be the bottleneck in a real pipeline.
212
+
213
+ ## Known Limitations
214
+
215
+ Stated plainly, not hidden:
216
+
217
+ - **Single-process only.** The breaker's state machine uses `asyncio.Lock`, which coordinates concurrent tasks within one Python process. Running multiple replicas (separate containers, separate pods) means each one tracks its own independent circuit state. They do not share state across processes.
218
+ - **Postgres ledger not yet built.** `SQLiteLedger` is fully implemented and tested. A `PostgresLedger` for centralized, multi-instance audit logging is planned but not built, since it has not yet been tested against a real Postgres instance.
219
+
220
+ ## Architecture
221
+ agentarmour/cascadebreaker/
222
+
223
+ ├── config.py # Pydantic configuration (BreakerConfig, StorageConfig)
224
+
225
+ ├── states.py # BreakerStateMachine — CLOSED/OPEN/HALF_OPEN logic
226
+
227
+ ├── breaker.py # CircuitBreaker — decorator + core execution
228
+
229
+ ├── strategies.py # CACHE / DEGRADE / ESCALATE / DECOMPOSE
230
+
231
+ ├── guard.py # CascadeGuard — cross-agent contamination protection
232
+
233
+ ├── registry.py # BreakerRegistry — process-wide discovery
234
+
235
+ ├── cli.py # Terminal inspection of the audit ledger
236
+
237
+ ├── storage/ # SQLite audit ledger (stdlib only, zero dependencies)
238
+
239
+ └── dashboard/ # Streamlit live dashboard
240
+
241
+ ## Running the Examples
242
+
243
+ ```bash
244
+ # Zero dependencies needed
245
+ python examples/basic_usage.py
246
+
247
+ # Requires pip install agentarmour-toolkit[langgraph]
248
+ python examples/langgraph_example.py
249
+ ```
250
+
251
+ ## Running Tests
252
+
253
+ ```bash
254
+ pip install agentarmour-toolkit[dev]
255
+ pytest tests/ -v
256
+ ```
257
+
258
+ 24 tests, covering the state machine, fallback paths, timeout handling, traceback capture, and the audit ledger. CI runs this automatically across Python 3.10, 3.11, and 3.12 on every push and pull request targeting `main`.
259
+
260
+ ## Roadmap
261
+
262
+ CascadeBreaker is the first module in the AgentArmour suite. Planned next, in order:
263
+
264
+ - **AgentBudget** — cost and rate-limit control
265
+ - **ToolGuard** — protection against hallucinated tool calls
266
+ - **AgentMock** — reliable testing for non-deterministic agents
267
+
268
+ Each module ships completely before the next one starts.
269
+
270
+ ## Credit
271
+
272
+ The failure taxonomy this project is built around comes from Mert Cemri, Melissa Z. Pan, Shuyi Yang, Lakshya A. Agrawal, Bhavya Chopra, Rishabh Tiwari, Kurt Keutzer, Aditya Parameswaran, Dan Klein, Kannan Ramchandran, Matei Zaharia, Joseph E. Gonzalez, and Ion Stoica. *["Why Do Multi-Agent LLM Systems Fail?"](https://arxiv.org/abs/2503.13657)*, March 2025.
273
+
274
+ ## License
275
+
276
+ MIT
@@ -0,0 +1,3 @@
1
+ """AgentArmour — production reliability suite for LangChain/LangGraph multi-agent systems."""
2
+
3
+ __version__ = "0.1.1"