agentchaos-sdk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. agentchaos_sdk-0.1.0/.env.example +3 -0
  2. agentchaos_sdk-0.1.0/.github/ISSUE_TEMPLATE/bug_report.yml +98 -0
  3. agentchaos_sdk-0.1.0/.github/ISSUE_TEMPLATE/config.yml +8 -0
  4. agentchaos_sdk-0.1.0/.github/ISSUE_TEMPLATE/feature_request.yml +66 -0
  5. agentchaos_sdk-0.1.0/.github/secret_scanning.yml +4 -0
  6. agentchaos_sdk-0.1.0/.github/workflows/ci.yml +59 -0
  7. agentchaos_sdk-0.1.0/.github/workflows/publish.yml +39 -0
  8. agentchaos_sdk-0.1.0/.gitignore +38 -0
  9. agentchaos_sdk-0.1.0/.pre-commit-config.yaml +16 -0
  10. agentchaos_sdk-0.1.0/.python-version +1 -0
  11. agentchaos_sdk-0.1.0/LICENSE +21 -0
  12. agentchaos_sdk-0.1.0/PKG-INFO +256 -0
  13. agentchaos_sdk-0.1.0/README.md +223 -0
  14. agentchaos_sdk-0.1.0/agentchaos/__init__.py +272 -0
  15. agentchaos_sdk-0.1.0/agentchaos/__main__.py +65 -0
  16. agentchaos_sdk-0.1.0/agentchaos/fault_config.py +367 -0
  17. agentchaos_sdk-0.1.0/agentchaos/fault_diagnose.py +65 -0
  18. agentchaos_sdk-0.1.0/agentchaos/fault_engine.py +442 -0
  19. agentchaos_sdk-0.1.0/docs/faults.md +191 -0
  20. agentchaos_sdk-0.1.0/docs/fig_fault_type_impact.png +0 -0
  21. agentchaos_sdk-0.1.0/docs/fig_strategy_impact.png +0 -0
  22. agentchaos_sdk-0.1.0/docs/index.html +1568 -0
  23. agentchaos_sdk-0.1.0/docs/overview.png +0 -0
  24. agentchaos_sdk-0.1.0/examples/agent_adk.py +84 -0
  25. agentchaos_sdk-0.1.0/examples/agent_langchain.py +83 -0
  26. agentchaos_sdk-0.1.0/examples/agent_openai.py +96 -0
  27. agentchaos_sdk-0.1.0/examples/eval_batch.py +127 -0
  28. agentchaos_sdk-0.1.0/examples/list_faults.py +31 -0
  29. agentchaos_sdk-0.1.0/pyproject.toml +80 -0
  30. agentchaos_sdk-0.1.0/scripts/gen_figures.py +199 -0
  31. agentchaos_sdk-0.1.0/tests/__init__.py +0 -0
  32. agentchaos_sdk-0.1.0/tests/test_core.py +71 -0
  33. agentchaos_sdk-0.1.0/tests/test_examples.py +297 -0
@@ -0,0 +1,3 @@
1
+ OPENAI_MODEL=""
2
+ OPENAI_BASE_URL=""
3
+ OPENAI_API_KEY=""
@@ -0,0 +1,98 @@
1
+ name: Bug Report
2
+ description: Report a bug or unexpected behavior in AgentChaos
3
+ title: "[Bug] "
4
+ labels: ["bug"]
5
+ assignees: []
6
+
7
+ body:
8
+ - type: markdown
9
+ attributes:
10
+ value: |
11
+ Thanks for taking the time to report a bug! Please fill out the form below.
12
+
13
+ - type: checkboxes
14
+ id: duplicate-check
15
+ attributes:
16
+ label: Pre-submission Checklist
17
+ options:
18
+ - label: I have searched existing issues to ensure this is not a duplicate
19
+ required: true
20
+
21
+ - type: textarea
22
+ id: description
23
+ attributes:
24
+ label: Describe the Bug
25
+ description: A clear and concise description of what the bug is.
26
+ placeholder: When I run agentchaos.inject(...), it fails with...
27
+ validations:
28
+ required: true
29
+
30
+ - type: textarea
31
+ id: steps-to-reproduce
32
+ attributes:
33
+ label: Steps to Reproduce
34
+ description: Provide detailed steps to reproduce the bug.
35
+ placeholder: |
36
+ 1. Install agentchaos
37
+ 2. Run the following code: ...
38
+ 3. Observe the error...
39
+ validations:
40
+ required: true
41
+
42
+ - type: textarea
43
+ id: expected-behavior
44
+ attributes:
45
+ label: Expected Behavior
46
+ description: What did you expect to happen?
47
+ validations:
48
+ required: true
49
+
50
+ - type: textarea
51
+ id: actual-behavior
52
+ attributes:
53
+ label: Actual Behavior
54
+ description: What actually happened?
55
+ validations:
56
+ required: true
57
+
58
+ - type: input
59
+ id: version
60
+ attributes:
61
+ label: AgentChaos Version
62
+ placeholder: "e.g., 0.1.0"
63
+ validations:
64
+ required: true
65
+
66
+ - type: dropdown
67
+ id: os
68
+ attributes:
69
+ label: Operating System
70
+ options:
71
+ - Linux
72
+ - macOS
73
+ - Windows
74
+ validations:
75
+ required: true
76
+
77
+ - type: input
78
+ id: python-version
79
+ attributes:
80
+ label: Python Version
81
+ placeholder: "e.g., 3.12.0"
82
+ validations:
83
+ required: false
84
+
85
+ - type: textarea
86
+ id: logs
87
+ attributes:
88
+ label: Relevant Log Output
89
+ render: shell
90
+ validations:
91
+ required: false
92
+
93
+ - type: textarea
94
+ id: additional-context
95
+ attributes:
96
+ label: Additional Context
97
+ validations:
98
+ required: false
@@ -0,0 +1,8 @@
1
+ blank_issues_enabled: false
2
+ contact_links:
3
+ - name: General Question
4
+ url: https://github.com/floritange/AgentChaos/discussions
5
+ about: Ask general questions in GitHub Discussions
6
+ - name: Fault Reference
7
+ url: https://github.com/floritange/AgentChaos/blob/main/docs/faults.md
8
+ about: Check the fault reference for all 65 configurations
@@ -0,0 +1,66 @@
1
+ name: Feature Request
2
+ description: Suggest a new feature or enhancement for AgentChaos
3
+ title: "[Feature] "
4
+ labels: ["enhancement"]
5
+ assignees: []
6
+
7
+ body:
8
+ - type: markdown
9
+ attributes:
10
+ value: |
11
+ Thanks for helping improve AgentChaos! Please describe your feature request below.
12
+
13
+ - type: checkboxes
14
+ id: duplicate-check
15
+ attributes:
16
+ label: Pre-submission Checklist
17
+ options:
18
+ - label: I have searched existing issues to ensure this is not a duplicate
19
+ required: true
20
+
21
+ - type: textarea
22
+ id: problem-statement
23
+ attributes:
24
+ label: Problem Statement
25
+ description: What problem does this feature solve?
26
+ placeholder: |
27
+ When evaluating agent robustness, I need to...
28
+ validations:
29
+ required: true
30
+
31
+ - type: textarea
32
+ id: proposed-solution
33
+ attributes:
34
+ label: Proposed Solution
35
+ description: Describe your proposed solution.
36
+ validations:
37
+ required: true
38
+
39
+ - type: dropdown
40
+ id: category
41
+ attributes:
42
+ label: Feature Category
43
+ options:
44
+ - New Fault Type
45
+ - Evaluation Enhancement
46
+ - Trace Format
47
+ - Framework Support
48
+ - CLI Improvement
49
+ - Documentation
50
+ - Other
51
+ validations:
52
+ required: true
53
+
54
+ - type: textarea
55
+ id: alternatives
56
+ attributes:
57
+ label: Alternatives Considered
58
+ validations:
59
+ required: false
60
+
61
+ - type: textarea
62
+ id: additional-context
63
+ attributes:
64
+ label: Additional Context
65
+ validations:
66
+ required: false
@@ -0,0 +1,4 @@
1
+ paths-ignore:
2
+ - "tests/**"
3
+ - "examples/**"
4
+ - "docs/**"
@@ -0,0 +1,59 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main, master]
6
+ pull_request:
7
+ branches: [main, master]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ['3.10', '3.11', '3.12', '3.13']
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+ allow-prereleases: true
24
+
25
+ - name: Install uv
26
+ uses: astral-sh/setup-uv@v4
27
+
28
+ - name: Install dependencies
29
+ run: uv sync --python ${{ matrix.python-version }}
30
+
31
+ - name: Run tests
32
+ run: uv run pytest tests/ -v --cov=agentchaos --cov-report=xml
33
+
34
+ - name: Upload coverage to Codecov
35
+ uses: codecov/codecov-action@v4
36
+ if: matrix.python-version == '3.12'
37
+ with:
38
+ token: ${{ secrets.CODECOV_TOKEN }}
39
+ files: ./coverage.xml
40
+ fail_ci_if_error: false
41
+
42
+ lint:
43
+ runs-on: ubuntu-latest
44
+ steps:
45
+ - uses: actions/checkout@v4
46
+
47
+ - name: Set up Python
48
+ uses: actions/setup-python@v5
49
+ with:
50
+ python-version: '3.12'
51
+
52
+ - name: Install uv
53
+ uses: astral-sh/setup-uv@v4
54
+
55
+ - name: Install dependencies
56
+ run: uv sync
57
+
58
+ - name: Run ruff
59
+ run: uv run ruff check agentchaos/ --output-format=github
@@ -0,0 +1,39 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ publish:
10
+ runs-on: ubuntu-latest
11
+ environment: pypi
12
+ permissions:
13
+ id-token: write
14
+ contents: write
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: '3.12'
23
+
24
+ - name: Install uv
25
+ uses: astral-sh/setup-uv@v4
26
+
27
+ - name: Build package
28
+ run: uv build
29
+
30
+ - name: Publish to PyPI
31
+ uses: pypa/gh-action-pypi-publish@release/v1
32
+ with:
33
+ skip-existing: true
34
+
35
+ - name: Upload release assets
36
+ if: github.event_name == 'release'
37
+ uses: softprops/action-gh-release@v1
38
+ with:
39
+ files: dist/*
@@ -0,0 +1,38 @@
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.egg-info/
6
+ .venv/
7
+ build/
8
+ dist/
9
+ uv.lock
10
+
11
+ # Environment
12
+ .env
13
+ **/.env
14
+ config.yaml
15
+
16
+ # IDE & AI tools
17
+ .vscode/
18
+ .idea/
19
+ .claude/
20
+ .omc/
21
+ CLAUDE.md
22
+
23
+ # OS
24
+ .DS_Store
25
+ Thumbs.db
26
+
27
+ # Testing
28
+ .pytest_cache/
29
+ coverage.xml
30
+ htmlcov/
31
+ .deepeval/
32
+
33
+ # Runtime output
34
+ examples/traces/
35
+
36
+ # Project-specific: not published
37
+ refs/
38
+ paper/
@@ -0,0 +1,16 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ # Ruff version.
4
+ rev: v0.11.12
5
+ hooks:
6
+ # Run the linter.
7
+ - id: ruff-check
8
+ types_or: [ python, pyi ]
9
+ args: [ --fix ]
10
+ # Run the formatter.
11
+ - id: ruff-format
12
+ types_or: [ python, pyi ]
13
+ - repo: https://github.com/gitleaks/gitleaks
14
+ rev: v8.24.2
15
+ hooks:
16
+ - id: gitleaks
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025-2026 AgentChaos Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,256 @@
1
+ Metadata-Version: 2.4
2
+ Name: agentchaos-sdk
3
+ Version: 0.1.0
4
+ Summary: Evaluate agent system robustness through controlled, runtime, non-intrusive LLM API fault injection.
5
+ Project-URL: Homepage, https://github.com/floritange/AgentChaos
6
+ Project-URL: Documentation, https://floritange.github.io/AgentChaos/
7
+ Project-URL: Repository, https://github.com/floritange/AgentChaos
8
+ Project-URL: Issues, https://github.com/floritange/AgentChaos/issues
9
+ Author: AgentChaos Contributors
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: agent,chaos,evaluation,fault-injection,llm,robust,testing
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Software Development :: Testing
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: google-adk[extensions]<2.0,>=1.0
26
+ Requires-Dist: httpx<1.0,>=0.24
27
+ Requires-Dist: langchain-openai<2.0,>=0.3
28
+ Requires-Dist: langchain<2.0,>=0.3
29
+ Requires-Dist: loguru<1.0,>=0.7
30
+ Requires-Dist: openai<3.0,>=2.0
31
+ Requires-Dist: python-dotenv<2.0,>=1.0
32
+ Description-Content-Type: text/markdown
33
+
34
+ # AgentChaos
35
+ **Evaluate agent system robustness through controlled, runtime, non-intrusive LLM API fault injection.**
36
+
37
+ [![PyPI version](https://img.shields.io/pypi/v/agentchaos-sdk?color=blue)](https://pypi.org/project/agentchaos-sdk/)
38
+ [![Python](https://img.shields.io/pypi/pyversions/agentchaos-sdk.svg)](https://pypi.org/project/agentchaos-sdk/)
39
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
40
+ [![CI](https://github.com/floritange/AgentChaos/actions/workflows/ci.yml/badge.svg)](https://github.com/floritange/AgentChaos/actions/workflows/ci.yml)
41
+ [![codecov](https://codecov.io/gh/floritange/AgentChaos/graph/badge.svg?branch=main)](https://codecov.io/gh/floritange/AgentChaos?branch=main)
42
+ [![Tests](https://img.shields.io/badge/tests-passing-brightgreen)](https://github.com/floritange/AgentChaos/actions/workflows/ci.yml)
43
+ [![Docs](https://img.shields.io/badge/docs-github.io-blue)](https://floritange.github.io/AgentChaos/)
44
+
45
+ ---
46
+
47
+ ## Overview
48
+
49
+ LLM-based agent systems issue multiple API calls per task, and each call can fail (HTTP 5xx, truncation, empty response, encoding corruption, schema violation). Once a faulty response occurs, it propagates through downstream agents and causes task failure. **AgentChaos** injects controlled faults at the HTTP transport layer — without modifying any agent source code — to evaluate robustness before these failures happen in production.
50
+
51
+ ---
52
+
53
+ ## Quick Start
54
+
55
+ ```bash
56
+ pip install agentchaos-sdk
57
+ ```
58
+
59
+ ```python
60
+ import agentchaos
61
+
62
+ # Inject fault (your agent code needs ZERO changes)
63
+ agentchaos.inject("llm_error_single")
64
+ result = await my_agent(query) # agent runs normally, unaware
65
+ agentchaos.disable() # stop
66
+ agentchaos.save_trace("trace.json") # save full LLM call trace
67
+ ```
68
+
69
+ ```bash
70
+ # examples
71
+ git clone https://github.com/floritange/AgentChaos.git
72
+ cd AgentChaos
73
+ uv sync
74
+ uv run python examples/list_faults.py # list all 65 faults
75
+ uv run python examples/agent_openai.py # OpenAI agent: normal vs faulted
76
+ uv run python examples/agent_langchain.py # LangChain agent
77
+ uv run python examples/agent_adk.py # Google ADK agent
78
+ uv run python examples/eval_batch.py # batch evaluation
79
+ ```
80
+
81
+ ---
82
+
83
+ ## How It Works
84
+
85
+ <img src="docs/overview.png"/>
86
+
87
+ An HTTP-layer injection mechanism patches the HTTP client at runtime to intercept and modify LLM API responses according to the fault configuration, requiring no changes to any agent system.
88
+
89
+ **Properties:**
90
+ - Works with **any** framework using OpenAI-compatible APIs (OpenAI, LangChain, ADK, AutoGen, CrewAI, LiteLLM)
91
+ - **Zero code changes** — just `inject()` / `disable()` around your existing code
92
+ - Records full **execution trace** (raw input/output, token usage, timing) for every LLM call
93
+ - **65 pre-built fault configurations** covering all real-world failure modes
94
+
95
+ ---
96
+
97
+ ## API
98
+
99
+ | Function | Description |
100
+ |---|---|
101
+ | `agentchaos.inject(fault)` | Start fault injection + trace (`None` = trace only) |
102
+ | `agentchaos.disable()` | Stop injection and trace |
103
+ | `agentchaos.save_trace(path)` | Save trace to JSON |
104
+ | `agentchaos.eval(agent_fn, query, faults)` | Batch robustness evaluation |
105
+ | `agentchaos.diagnose(text)` | Detect fault type from output |
106
+ | `agentchaos.list_faults()` | List all 65 fault configurations |
107
+
108
+ ```python
109
+ import agentchaos
110
+
111
+ # Trace only (no fault)
112
+ agentchaos.inject(None)
113
+ result = await my_agent(query)
114
+ agentchaos.disable()
115
+ agentchaos.save_trace("trace_normal.json")
116
+
117
+ # Inject fault + trace
118
+ agentchaos.inject("llm_error_single")
119
+ result = await my_agent(query)
120
+ agentchaos.disable()
121
+ agentchaos.save_trace("trace_faulted.json")
122
+
123
+ # Batch evaluation
124
+ report = await agentchaos.eval(my_agent, query, faults="all")
125
+ print(report.summary())
126
+ ```
127
+
128
+ ---
129
+
130
+ ## Trace Format
131
+
132
+ ```json
133
+ {
134
+ "call_index": 0,
135
+ "raw_input": {"model": "gpt-5.5", "messages": [...], "tools": [...]},
136
+ "raw_output": {
137
+ "content": "The answer is 42.",
138
+ "tool_calls": [],
139
+ "finish_reason": "stop",
140
+ "usage": {"prompt_tokens": 306, "completion_tokens": 54, "total_tokens": 360},
141
+ "http_status": 200
142
+ },
143
+ "injected_output": {
144
+ "content": "[API ERROR] HTTP 500: Internal Server Error.",
145
+ "tool_calls": []
146
+ },
147
+ "timing": {"llm_latency_ms": 1523.4, "total_ms": 1524.1},
148
+ "fault_applied": true
149
+ }
150
+ ```
151
+
152
+ > `raw_output` = the LLM's original response. `injected_output` = what the agent actually receives (only present when `fault_applied: true`).
153
+
154
+ ---
155
+
156
+ ## Fault Taxonomy
157
+
158
+ We define a fault taxonomy by adapting the classical fault classification from distributed systems (Avizienis et al., 2004) to LLM API responses. The taxonomy covers crash, omission, and value faults on both content and tool call fields.
159
+
160
+ | Category | Fault Type | Content | Tool Call | Real-world Scenario |
161
+ |---|---|:---:|:---:|---|
162
+ | **Crash** | Error | yes | yes | Server overload, HTTP 5xx, rate limiting |
163
+ | **Crash** | Timeout | yes | yes | Network congestion, backend delay, API latency |
164
+ | **Omission** | Empty | yes | yes | Safety filter, content policy rejection |
165
+ | **Omission** | Truncate | yes | yes | Token limit, TCP interruption, incomplete completion |
166
+ | **Value** | Corrupt | yes | yes | Encoding error, garbled characters |
167
+ | **Value** | Schema | yes | yes | Parsing error, schema mismatch |
168
+
169
+ From **Crash** to **Value**, faults become progressively harder to detect. Crash faults produce obvious error signals and are typically retried. Value faults look like valid output and propagate silently — making them the most dangerous in practice.
170
+
171
+ **65 = (6 fault types x 2 targets x 4 strategies) + 8 compound + 9 positional**
172
+
173
+ Detailed documentation: **[docs/faults.md](docs/faults.md)**
174
+
175
+ ---
176
+
177
+ ## Evaluation Results
178
+
179
+ ### Experimental Setup
180
+
181
+ <table>
182
+ <tr><th>Agent System</th><th>Architecture</th><th>Benchmarks</th></tr>
183
+ <tr><td><a href="https://openreview.net/forum?id=BAakY1hNKS">AutoGen</a></td><td>Iterative (coder + executor)</td><td rowspan="4"><a href="https://arxiv.org/abs/2107.03374">HumanEval</a>, <a href="https://openreview.net/forum?id=1qvx610Cu7">HumanEval+</a>, <a href="https://arxiv.org/abs/2108.07732">MBPP</a>, <a href="https://openreview.net/forum?id=1qvx610Cu7">MBPP+</a>, <a href="https://openreview.net/forum?id=US2eyuYlvS">MMLU-Pro</a>, <a href="https://aclanthology.org/2024.acl-long.410">MATH-500</a></td></tr>
184
+ <tr><td><a href="https://aclanthology.org/2024.acl-long.72">MAD</a></td><td>Debate (proposer + critic)</td></tr>
185
+ <tr><td><a href="https://aclanthology.org/2024.acl-long.269">MapCoder</a></td><td>Pipeline (planner + coder + debugger)</td></tr>
186
+ <tr><td><a href="https://openreview.net/forum?id=jd0RewGP4w">EvoMAC</a></td><td>Iterative (multi-agent collaboration)</td></tr>
187
+ <tr><td><a href="https://arxiv.org/abs/2510.22775">Mini-SE</a></td><td>Iterative (SWE agent)</td><td><a href="https://arxiv.org/abs/2501.14975">SWE-bench Pro</a></td></tr>
188
+ </table>
189
+
190
+ **Backbone LLMs**: Claude-Sonnet-4.5, GPT-5.2, DeepSeek-V3.2, Seed-1.8
191
+
192
+ **Metric**: Δpass@1 = pass@1 (w/o fault) − pass@1 (w/ fault). Higher = more vulnerable.
193
+
194
+ ### RQ1: Overall Robustness Degradation (Claude-Sonnet-4.5)
195
+
196
+ | System | HumanEval | HumanEval+ | MBPP | MBPP+ | MMLU-Pro | MATH-500 |
197
+ |---|---|---|---|---|---|---|
198
+ | AutoGen | 19.44 | 21.13 | 17.31 | 11.61 | 7.05 | 8.38 |
199
+ | MAD | 24.20 | 24.84 | 24.49 | 15.08 | 20.64 | 20.70 |
200
+ | **MapCoder** | **48.61** | **49.30** | **41.07** | **40.85** | **38.25** | **34.27** |
201
+ | EvoMAC | 18.48 | 18.18 | 16.67 | 14.73 | 13.63 | 15.85 |
202
+ | Mini-SE | — | — | — | — | — | — |
203
+
204
+ > Mini-SE is evaluated only on SWE-bench Pro (Δpass@1 = 0.87%).
205
+
206
+ ### RQ2: Impact of Fault Configurations
207
+
208
+ <img src="docs/fig_fault_type_impact.png"/>
209
+
210
+ <img src="docs/fig_strategy_impact.png"/>
211
+
212
+ - Content faults cause higher Δpass@1 than tool call faults; only **corrupt** stays below 7%
213
+ - **Persistent** injection causes the highest Δpass@1 — up to **62.39%** (MapCoder)
214
+ - **Pipeline** systems are the most position-sensitive — a single early fault drops pass@1 by up to **83.87%**
215
+ - **Compound** content faults amplify degradation — up to **86.36%** (MapCoder)
216
+
217
+ ### RQ3: Fault Diagnosis
218
+
219
+ Existing methods achieve below **53%** accuracy on fault type and below **56%** on fault step. Truncation — the most harmful fault — is identified with only **4.3%** accuracy.
220
+
221
+ ### Key Findings
222
+
223
+ | # | Finding |
224
+ |---|---|
225
+ | 1 | All systems degrade under fault injection (Δpass@1 up to 50 pp) |
226
+ | 2 | The most severe faults are NOT the most harmful — truncation and empty responses propagate silently |
227
+ | 3 | The most harmful faults are the hardest to diagnose (truncation: 4.3% accuracy) |
228
+ | 4 | Architecture determines robustness — ranking consistent across all LLMs |
229
+ | 5 | Persistent injection overrides architectural advantages (up to 62.39%) |
230
+ | 6 | Compound content faults amplify degradation (up to 86.36%) |
231
+
232
+ ---
233
+
234
+ ## Documentation
235
+
236
+ - **[Fault Reference](docs/faults.md)** — Complete reference for all 65 fault configurations
237
+ - **[Examples](examples/)** — Runnable demos for OpenAI, LangChain, ADK
238
+
239
+ ---
240
+
241
+ ## Citation
242
+
243
+ If you use AgentChaos in your research, please cite:
244
+
245
+ ```bibtex
246
+ @article{agentchaos2026,
247
+ title={AgentChaos: Chaos Engineering for Robust Agent Evaluation via LLM API Fault Injection},
248
+ year={2026}
249
+ }
250
+ ```
251
+
252
+ ---
253
+
254
+ ## License
255
+
256
+ MIT -- see [LICENSE](LICENSE).