agentchaos-sdk 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentchaos_sdk-0.1.0/.env.example +3 -0
- agentchaos_sdk-0.1.0/.github/ISSUE_TEMPLATE/bug_report.yml +98 -0
- agentchaos_sdk-0.1.0/.github/ISSUE_TEMPLATE/config.yml +8 -0
- agentchaos_sdk-0.1.0/.github/ISSUE_TEMPLATE/feature_request.yml +66 -0
- agentchaos_sdk-0.1.0/.github/secret_scanning.yml +4 -0
- agentchaos_sdk-0.1.0/.github/workflows/ci.yml +59 -0
- agentchaos_sdk-0.1.0/.github/workflows/publish.yml +39 -0
- agentchaos_sdk-0.1.0/.gitignore +38 -0
- agentchaos_sdk-0.1.0/.pre-commit-config.yaml +16 -0
- agentchaos_sdk-0.1.0/.python-version +1 -0
- agentchaos_sdk-0.1.0/LICENSE +21 -0
- agentchaos_sdk-0.1.0/PKG-INFO +256 -0
- agentchaos_sdk-0.1.0/README.md +223 -0
- agentchaos_sdk-0.1.0/agentchaos/__init__.py +272 -0
- agentchaos_sdk-0.1.0/agentchaos/__main__.py +65 -0
- agentchaos_sdk-0.1.0/agentchaos/fault_config.py +367 -0
- agentchaos_sdk-0.1.0/agentchaos/fault_diagnose.py +65 -0
- agentchaos_sdk-0.1.0/agentchaos/fault_engine.py +442 -0
- agentchaos_sdk-0.1.0/docs/faults.md +191 -0
- agentchaos_sdk-0.1.0/docs/fig_fault_type_impact.png +0 -0
- agentchaos_sdk-0.1.0/docs/fig_strategy_impact.png +0 -0
- agentchaos_sdk-0.1.0/docs/index.html +1568 -0
- agentchaos_sdk-0.1.0/docs/overview.png +0 -0
- agentchaos_sdk-0.1.0/examples/agent_adk.py +84 -0
- agentchaos_sdk-0.1.0/examples/agent_langchain.py +83 -0
- agentchaos_sdk-0.1.0/examples/agent_openai.py +96 -0
- agentchaos_sdk-0.1.0/examples/eval_batch.py +127 -0
- agentchaos_sdk-0.1.0/examples/list_faults.py +31 -0
- agentchaos_sdk-0.1.0/pyproject.toml +80 -0
- agentchaos_sdk-0.1.0/scripts/gen_figures.py +199 -0
- agentchaos_sdk-0.1.0/tests/__init__.py +0 -0
- agentchaos_sdk-0.1.0/tests/test_core.py +71 -0
- agentchaos_sdk-0.1.0/tests/test_examples.py +297 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
name: Bug Report
|
|
2
|
+
description: Report a bug or unexpected behavior in AgentChaos
|
|
3
|
+
title: "[Bug] "
|
|
4
|
+
labels: ["bug"]
|
|
5
|
+
assignees: []
|
|
6
|
+
|
|
7
|
+
body:
|
|
8
|
+
- type: markdown
|
|
9
|
+
attributes:
|
|
10
|
+
value: |
|
|
11
|
+
Thanks for taking the time to report a bug! Please fill out the form below.
|
|
12
|
+
|
|
13
|
+
- type: checkboxes
|
|
14
|
+
id: duplicate-check
|
|
15
|
+
attributes:
|
|
16
|
+
label: Pre-submission Checklist
|
|
17
|
+
options:
|
|
18
|
+
- label: I have searched existing issues to ensure this is not a duplicate
|
|
19
|
+
required: true
|
|
20
|
+
|
|
21
|
+
- type: textarea
|
|
22
|
+
id: description
|
|
23
|
+
attributes:
|
|
24
|
+
label: Describe the Bug
|
|
25
|
+
description: A clear and concise description of what the bug is.
|
|
26
|
+
placeholder: When I run agentchaos.inject(...), it fails with...
|
|
27
|
+
validations:
|
|
28
|
+
required: true
|
|
29
|
+
|
|
30
|
+
- type: textarea
|
|
31
|
+
id: steps-to-reproduce
|
|
32
|
+
attributes:
|
|
33
|
+
label: Steps to Reproduce
|
|
34
|
+
description: Provide detailed steps to reproduce the bug.
|
|
35
|
+
placeholder: |
|
|
36
|
+
1. Install agentchaos
|
|
37
|
+
2. Run the following code: ...
|
|
38
|
+
3. Observe the error...
|
|
39
|
+
validations:
|
|
40
|
+
required: true
|
|
41
|
+
|
|
42
|
+
- type: textarea
|
|
43
|
+
id: expected-behavior
|
|
44
|
+
attributes:
|
|
45
|
+
label: Expected Behavior
|
|
46
|
+
description: What did you expect to happen?
|
|
47
|
+
validations:
|
|
48
|
+
required: true
|
|
49
|
+
|
|
50
|
+
- type: textarea
|
|
51
|
+
id: actual-behavior
|
|
52
|
+
attributes:
|
|
53
|
+
label: Actual Behavior
|
|
54
|
+
description: What actually happened?
|
|
55
|
+
validations:
|
|
56
|
+
required: true
|
|
57
|
+
|
|
58
|
+
- type: input
|
|
59
|
+
id: version
|
|
60
|
+
attributes:
|
|
61
|
+
label: AgentChaos Version
|
|
62
|
+
placeholder: "e.g., 0.1.0"
|
|
63
|
+
validations:
|
|
64
|
+
required: true
|
|
65
|
+
|
|
66
|
+
- type: dropdown
|
|
67
|
+
id: os
|
|
68
|
+
attributes:
|
|
69
|
+
label: Operating System
|
|
70
|
+
options:
|
|
71
|
+
- Linux
|
|
72
|
+
- macOS
|
|
73
|
+
- Windows
|
|
74
|
+
validations:
|
|
75
|
+
required: true
|
|
76
|
+
|
|
77
|
+
- type: input
|
|
78
|
+
id: python-version
|
|
79
|
+
attributes:
|
|
80
|
+
label: Python Version
|
|
81
|
+
placeholder: "e.g., 3.12.0"
|
|
82
|
+
validations:
|
|
83
|
+
required: false
|
|
84
|
+
|
|
85
|
+
- type: textarea
|
|
86
|
+
id: logs
|
|
87
|
+
attributes:
|
|
88
|
+
label: Relevant Log Output
|
|
89
|
+
render: shell
|
|
90
|
+
validations:
|
|
91
|
+
required: false
|
|
92
|
+
|
|
93
|
+
- type: textarea
|
|
94
|
+
id: additional-context
|
|
95
|
+
attributes:
|
|
96
|
+
label: Additional Context
|
|
97
|
+
validations:
|
|
98
|
+
required: false
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
blank_issues_enabled: false
|
|
2
|
+
contact_links:
|
|
3
|
+
- name: General Question
|
|
4
|
+
url: https://github.com/floritange/AgentChaos/discussions
|
|
5
|
+
about: Ask general questions in GitHub Discussions
|
|
6
|
+
- name: Fault Reference
|
|
7
|
+
url: https://github.com/floritange/AgentChaos/blob/main/docs/faults.md
|
|
8
|
+
about: Check the fault reference for all 65 configurations
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
name: Feature Request
|
|
2
|
+
description: Suggest a new feature or enhancement for AgentChaos
|
|
3
|
+
title: "[Feature] "
|
|
4
|
+
labels: ["enhancement"]
|
|
5
|
+
assignees: []
|
|
6
|
+
|
|
7
|
+
body:
|
|
8
|
+
- type: markdown
|
|
9
|
+
attributes:
|
|
10
|
+
value: |
|
|
11
|
+
Thanks for helping improve AgentChaos! Please describe your feature request below.
|
|
12
|
+
|
|
13
|
+
- type: checkboxes
|
|
14
|
+
id: duplicate-check
|
|
15
|
+
attributes:
|
|
16
|
+
label: Pre-submission Checklist
|
|
17
|
+
options:
|
|
18
|
+
- label: I have searched existing issues to ensure this is not a duplicate
|
|
19
|
+
required: true
|
|
20
|
+
|
|
21
|
+
- type: textarea
|
|
22
|
+
id: problem-statement
|
|
23
|
+
attributes:
|
|
24
|
+
label: Problem Statement
|
|
25
|
+
description: What problem does this feature solve?
|
|
26
|
+
placeholder: |
|
|
27
|
+
When evaluating agent robustness, I need to...
|
|
28
|
+
validations:
|
|
29
|
+
required: true
|
|
30
|
+
|
|
31
|
+
- type: textarea
|
|
32
|
+
id: proposed-solution
|
|
33
|
+
attributes:
|
|
34
|
+
label: Proposed Solution
|
|
35
|
+
description: Describe your proposed solution.
|
|
36
|
+
validations:
|
|
37
|
+
required: true
|
|
38
|
+
|
|
39
|
+
- type: dropdown
|
|
40
|
+
id: category
|
|
41
|
+
attributes:
|
|
42
|
+
label: Feature Category
|
|
43
|
+
options:
|
|
44
|
+
- New Fault Type
|
|
45
|
+
- Evaluation Enhancement
|
|
46
|
+
- Trace Format
|
|
47
|
+
- Framework Support
|
|
48
|
+
- CLI Improvement
|
|
49
|
+
- Documentation
|
|
50
|
+
- Other
|
|
51
|
+
validations:
|
|
52
|
+
required: true
|
|
53
|
+
|
|
54
|
+
- type: textarea
|
|
55
|
+
id: alternatives
|
|
56
|
+
attributes:
|
|
57
|
+
label: Alternatives Considered
|
|
58
|
+
validations:
|
|
59
|
+
required: false
|
|
60
|
+
|
|
61
|
+
- type: textarea
|
|
62
|
+
id: additional-context
|
|
63
|
+
attributes:
|
|
64
|
+
label: Additional Context
|
|
65
|
+
validations:
|
|
66
|
+
required: false
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main, master]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main, master]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ['3.10', '3.11', '3.12', '3.13']
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
allow-prereleases: true
|
|
24
|
+
|
|
25
|
+
- name: Install uv
|
|
26
|
+
uses: astral-sh/setup-uv@v4
|
|
27
|
+
|
|
28
|
+
- name: Install dependencies
|
|
29
|
+
run: uv sync --python ${{ matrix.python-version }}
|
|
30
|
+
|
|
31
|
+
- name: Run tests
|
|
32
|
+
run: uv run pytest tests/ -v --cov=agentchaos --cov-report=xml
|
|
33
|
+
|
|
34
|
+
- name: Upload coverage to Codecov
|
|
35
|
+
uses: codecov/codecov-action@v4
|
|
36
|
+
if: matrix.python-version == '3.12'
|
|
37
|
+
with:
|
|
38
|
+
token: ${{ secrets.CODECOV_TOKEN }}
|
|
39
|
+
files: ./coverage.xml
|
|
40
|
+
fail_ci_if_error: false
|
|
41
|
+
|
|
42
|
+
lint:
|
|
43
|
+
runs-on: ubuntu-latest
|
|
44
|
+
steps:
|
|
45
|
+
- uses: actions/checkout@v4
|
|
46
|
+
|
|
47
|
+
- name: Set up Python
|
|
48
|
+
uses: actions/setup-python@v5
|
|
49
|
+
with:
|
|
50
|
+
python-version: '3.12'
|
|
51
|
+
|
|
52
|
+
- name: Install uv
|
|
53
|
+
uses: astral-sh/setup-uv@v4
|
|
54
|
+
|
|
55
|
+
- name: Install dependencies
|
|
56
|
+
run: uv sync
|
|
57
|
+
|
|
58
|
+
- name: Run ruff
|
|
59
|
+
run: uv run ruff check agentchaos/ --output-format=github
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
environment: pypi
|
|
12
|
+
permissions:
|
|
13
|
+
id-token: write
|
|
14
|
+
contents: write
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: '3.12'
|
|
23
|
+
|
|
24
|
+
- name: Install uv
|
|
25
|
+
uses: astral-sh/setup-uv@v4
|
|
26
|
+
|
|
27
|
+
- name: Build package
|
|
28
|
+
run: uv build
|
|
29
|
+
|
|
30
|
+
- name: Publish to PyPI
|
|
31
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
32
|
+
with:
|
|
33
|
+
skip-existing: true
|
|
34
|
+
|
|
35
|
+
- name: Upload release assets
|
|
36
|
+
if: github.event_name == 'release'
|
|
37
|
+
uses: softprops/action-gh-release@v1
|
|
38
|
+
with:
|
|
39
|
+
files: dist/*
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.pyc
|
|
4
|
+
*.pyo
|
|
5
|
+
*.egg-info/
|
|
6
|
+
.venv/
|
|
7
|
+
build/
|
|
8
|
+
dist/
|
|
9
|
+
uv.lock
|
|
10
|
+
|
|
11
|
+
# Environment
|
|
12
|
+
.env
|
|
13
|
+
**/.env
|
|
14
|
+
config.yaml
|
|
15
|
+
|
|
16
|
+
# IDE & AI tools
|
|
17
|
+
.vscode/
|
|
18
|
+
.idea/
|
|
19
|
+
.claude/
|
|
20
|
+
.omc/
|
|
21
|
+
CLAUDE.md
|
|
22
|
+
|
|
23
|
+
# OS
|
|
24
|
+
.DS_Store
|
|
25
|
+
Thumbs.db
|
|
26
|
+
|
|
27
|
+
# Testing
|
|
28
|
+
.pytest_cache/
|
|
29
|
+
coverage.xml
|
|
30
|
+
htmlcov/
|
|
31
|
+
.deepeval/
|
|
32
|
+
|
|
33
|
+
# Runtime output
|
|
34
|
+
examples/traces/
|
|
35
|
+
|
|
36
|
+
# Project-specific: not published
|
|
37
|
+
refs/
|
|
38
|
+
paper/
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
# Ruff version.
|
|
4
|
+
rev: v0.11.12
|
|
5
|
+
hooks:
|
|
6
|
+
# Run the linter.
|
|
7
|
+
- id: ruff-check
|
|
8
|
+
types_or: [ python, pyi ]
|
|
9
|
+
args: [ --fix ]
|
|
10
|
+
# Run the formatter.
|
|
11
|
+
- id: ruff-format
|
|
12
|
+
types_or: [ python, pyi ]
|
|
13
|
+
- repo: https://github.com/gitleaks/gitleaks
|
|
14
|
+
rev: v8.24.2
|
|
15
|
+
hooks:
|
|
16
|
+
- id: gitleaks
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025-2026 AgentChaos Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agentchaos-sdk
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Evaluate agent system robustness through controlled, runtime, non-intrusive LLM API fault injection.
|
|
5
|
+
Project-URL: Homepage, https://github.com/floritange/AgentChaos
|
|
6
|
+
Project-URL: Documentation, https://floritange.github.io/AgentChaos/
|
|
7
|
+
Project-URL: Repository, https://github.com/floritange/AgentChaos
|
|
8
|
+
Project-URL: Issues, https://github.com/floritange/AgentChaos/issues
|
|
9
|
+
Author: AgentChaos Contributors
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: agent,chaos,evaluation,fault-injection,llm,robust,testing
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Classifier: Topic :: Software Development :: Testing
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Requires-Dist: google-adk[extensions]<2.0,>=1.0
|
|
26
|
+
Requires-Dist: httpx<1.0,>=0.24
|
|
27
|
+
Requires-Dist: langchain-openai<2.0,>=0.3
|
|
28
|
+
Requires-Dist: langchain<2.0,>=0.3
|
|
29
|
+
Requires-Dist: loguru<1.0,>=0.7
|
|
30
|
+
Requires-Dist: openai<3.0,>=2.0
|
|
31
|
+
Requires-Dist: python-dotenv<2.0,>=1.0
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# AgentChaos
|
|
35
|
+
**Evaluate agent system robustness through controlled, runtime, non-intrusive LLM API fault injection.**
|
|
36
|
+
|
|
37
|
+
[](https://pypi.org/project/agentchaos-sdk/)
|
|
38
|
+
[](https://pypi.org/project/agentchaos-sdk/)
|
|
39
|
+
[](https://opensource.org/licenses/MIT)
|
|
40
|
+
[](https://github.com/floritange/AgentChaos/actions/workflows/ci.yml)
|
|
41
|
+
[](https://codecov.io/gh/floritange/AgentChaos?branch=main)
|
|
42
|
+
[]()
|
|
43
|
+
[](https://floritange.github.io/AgentChaos/)
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Overview
|
|
48
|
+
|
|
49
|
+
LLM-based agent systems issue multiple API calls per task, and each call can fail (HTTP 5xx, truncation, empty response, encoding corruption, schema violation). Once a faulty response occurs, it can propagate through downstream agents and cause task failure. **AgentChaos** injects controlled faults at the HTTP transport layer — without modifying any agent source code — to evaluate robustness before these failures happen in production.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Quick Start
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install agentchaos-sdk
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
import agentchaos
|
|
61
|
+
|
|
62
|
+
# Inject fault (your agent code needs ZERO changes)
|
|
63
|
+
agentchaos.inject("llm_error_single")
|
|
64
|
+
result = await my_agent(query) # agent runs normally, unaware
|
|
65
|
+
agentchaos.disable() # stop
|
|
66
|
+
agentchaos.save_trace("trace.json") # save full LLM call trace
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# examples
|
|
71
|
+
git clone https://github.com/floritange/AgentChaos.git
|
|
72
|
+
cd AgentChaos
|
|
73
|
+
uv sync
|
|
74
|
+
uv run python examples/list_faults.py # list all 65 faults
|
|
75
|
+
uv run python examples/agent_openai.py # OpenAI agent: normal vs faulted
|
|
76
|
+
uv run python examples/agent_langchain.py # LangChain agent
|
|
77
|
+
uv run python examples/agent_adk.py # Google ADK agent
|
|
78
|
+
uv run python examples/eval_batch.py # batch evaluation
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## How It Works
|
|
84
|
+
|
|
85
|
+
<img src="docs/overview.png"/>
|
|
86
|
+
|
|
87
|
+
An HTTP-layer injection mechanism patches the HTTP client at runtime to intercept and modify LLM API responses according to the fault configuration, requiring no changes to any agent system.
|
|
88
|
+
|
|
89
|
+
**Properties:**
|
|
90
|
+
- Works with **any** framework using OpenAI-compatible APIs (OpenAI, LangChain, ADK, AutoGen, CrewAI, LiteLLM)
|
|
91
|
+
- **Zero code changes** — just `inject()` / `disable()` around your existing code
|
|
92
|
+
- Records full **execution trace** (raw input/output, token usage, timing) for every LLM call
|
|
93
|
+
- **65 pre-built fault configurations** covering all real-world failure modes
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## API
|
|
98
|
+
|
|
99
|
+
| Function | Description |
|
|
100
|
+
|---|---|
|
|
101
|
+
| `agentchaos.inject(fault)` | Start fault injection + trace (`None` = trace only) |
|
|
102
|
+
| `agentchaos.disable()` | Stop injection and trace |
|
|
103
|
+
| `agentchaos.save_trace(path)` | Save trace to JSON |
|
|
104
|
+
| `agentchaos.eval(agent_fn, query, faults)` | Batch robustness evaluation |
|
|
105
|
+
| `agentchaos.diagnose(text)` | Detect fault type from output |
|
|
106
|
+
| `agentchaos.list_faults()` | List all 65 fault configurations |
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
import agentchaos
|
|
110
|
+
|
|
111
|
+
# Trace only (no fault)
|
|
112
|
+
agentchaos.inject(None)
|
|
113
|
+
result = await my_agent(query)
|
|
114
|
+
agentchaos.disable()
|
|
115
|
+
agentchaos.save_trace("trace_normal.json")
|
|
116
|
+
|
|
117
|
+
# Inject fault + trace
|
|
118
|
+
agentchaos.inject("llm_error_single")
|
|
119
|
+
result = await my_agent(query)
|
|
120
|
+
agentchaos.disable()
|
|
121
|
+
agentchaos.save_trace("trace_faulted.json")
|
|
122
|
+
|
|
123
|
+
# Batch evaluation
|
|
124
|
+
report = await agentchaos.eval(my_agent, query, faults="all")
|
|
125
|
+
print(report.summary())
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Trace Format
|
|
131
|
+
|
|
132
|
+
```json
|
|
133
|
+
{
|
|
134
|
+
"call_index": 0,
|
|
135
|
+
"raw_input": {"model": "gpt-5.5", "messages": [...], "tools": [...]},
|
|
136
|
+
"raw_output": {
|
|
137
|
+
"content": "The answer is 42.",
|
|
138
|
+
"tool_calls": [],
|
|
139
|
+
"finish_reason": "stop",
|
|
140
|
+
"usage": {"prompt_tokens": 306, "completion_tokens": 54, "total_tokens": 360},
|
|
141
|
+
"http_status": 200
|
|
142
|
+
},
|
|
143
|
+
"injected_output": {
|
|
144
|
+
"content": "[API ERROR] HTTP 500: Internal Server Error.",
|
|
145
|
+
"tool_calls": []
|
|
146
|
+
},
|
|
147
|
+
"timing": {"llm_latency_ms": 1523.4, "total_ms": 1524.1},
|
|
148
|
+
"fault_applied": true
|
|
149
|
+
}
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
> `raw_output` = the LLM's original response. `injected_output` = what the agent actually receives (only present when `fault_applied: true`).
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Fault Taxonomy
|
|
157
|
+
|
|
158
|
+
We define a fault taxonomy by adapting the classical fault classification from distributed systems (Avizienis et al., 2004) to LLM API responses. The taxonomy covers crash, omission, and value faults on both content and tool call fields.
|
|
159
|
+
|
|
160
|
+
| Category | Fault Type | Content | Tool Call | Real-world Scenario |
|
|
161
|
+
|---|---|:---:|:---:|---|
|
|
162
|
+
| **Crash** | Error | yes | yes | Server overload, HTTP 5xx, rate limiting |
|
|
163
|
+
| **Crash** | Timeout | yes | yes | Network congestion, backend delay, API latency |
|
|
164
|
+
| **Omission** | Empty | yes | yes | Safety filter, content policy rejection |
|
|
165
|
+
| **Omission** | Truncate | yes | yes | Token limit, TCP interruption, incomplete completion |
|
|
166
|
+
| **Value** | Corrupt | yes | yes | Encoding error, garbled characters |
|
|
167
|
+
| **Value** | Schema | yes | yes | Parsing error, schema mismatch |
|
|
168
|
+
|
|
169
|
+
From **Crash** to **Value**, faults become progressively harder to detect. Crash faults produce obvious error signals and are typically retried. Value faults look like valid output and propagate silently — making them the most dangerous in practice.
|
|
170
|
+
|
|
171
|
+
**65 = (6 fault types × 2 targets × 4 strategies) + 8 compound + 9 positional**
|
|
172
|
+
|
|
173
|
+
Detailed documentation: **[docs/faults.md](docs/faults.md)**
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Evaluation Results
|
|
178
|
+
|
|
179
|
+
### Experimental Setup
|
|
180
|
+
|
|
181
|
+
<table>
|
|
182
|
+
<tr><th>Agent System</th><th>Architecture</th><th>Benchmarks</th></tr>
|
|
183
|
+
<tr><td><a href="https://openreview.net/forum?id=BAakY1hNKS">AutoGen</a></td><td>Iterative (coder + executor)</td><td rowspan="4"><a href="https://arxiv.org/abs/2107.03374">HumanEval</a>, <a href="https://openreview.net/forum?id=1qvx610Cu7">HumanEval+</a>, <a href="https://arxiv.org/abs/2108.07732">MBPP</a>, <a href="https://openreview.net/forum?id=1qvx610Cu7">MBPP+</a>, <a href="https://openreview.net/forum?id=US2eyuYlvS">MMLU-Pro</a>, <a href="https://aclanthology.org/2024.acl-long.410">MATH-500</a></td></tr>
|
|
184
|
+
<tr><td><a href="https://aclanthology.org/2024.acl-long.72">MAD</a></td><td>Debate (proposer + critic)</td></tr>
|
|
185
|
+
<tr><td><a href="https://aclanthology.org/2024.acl-long.269">MapCoder</a></td><td>Pipeline (planner + coder + debugger)</td></tr>
|
|
186
|
+
<tr><td><a href="https://openreview.net/forum?id=jd0RewGP4w">EvoMAC</a></td><td>Iterative (multi-agent collaboration)</td></tr>
|
|
187
|
+
<tr><td><a href="https://arxiv.org/abs/2510.22775">Mini-SE</a></td><td>Iterative (SWE agent)</td><td><a href="https://arxiv.org/abs/2501.14975">SWE-bench Pro</a></td></tr>
|
|
188
|
+
</table>
|
|
189
|
+
|
|
190
|
+
**Backbone LLMs**: Claude-Sonnet-4.5, GPT-5.2, DeepSeek-V3.2, Seed-1.8
|
|
191
|
+
|
|
192
|
+
**Metric**: Δpass@1 = pass@1 (w/o fault) − pass@1 (w/ fault). Higher = more vulnerable.
|
|
193
|
+
|
|
194
|
+
### RQ1: Overall Robustness Degradation (Claude-Sonnet-4.5)
|
|
195
|
+
|
|
196
|
+
| System | HumanEval | HumanEval+ | MBPP | MBPP+ | MMLU-Pro | MATH-500 |
|
|
197
|
+
|---|---|---|---|---|---|---|
|
|
198
|
+
| AutoGen | 19.44 | 21.13 | 17.31 | 11.61 | 7.05 | 8.38 |
|
|
199
|
+
| MAD | 24.20 | 24.84 | 24.49 | 15.08 | 20.64 | 20.70 |
|
|
200
|
+
| **MapCoder** | **48.61** | **49.30** | **41.07** | **40.85** | **38.25** | **34.27** |
|
|
201
|
+
| EvoMAC | 18.48 | 18.18 | 16.67 | 14.73 | 13.63 | 15.85 |
|
|
202
|
+
| Mini-SE | — | — | — | — | — | — |
|
|
203
|
+
|
|
204
|
+
> Mini-SE is evaluated only on SWE-bench Pro (Δpass@1 = 0.87%).
|
|
205
|
+
|
|
206
|
+
### RQ2: Impact of Fault Configurations
|
|
207
|
+
|
|
208
|
+
<img src="docs/fig_fault_type_impact.png"/>
|
|
209
|
+
|
|
210
|
+
<img src="docs/fig_strategy_impact.png"/>
|
|
211
|
+
|
|
212
|
+
- Content faults cause higher Δpass@1 than tool call faults; only **corrupt** stays below 7%
|
|
213
|
+
- **Persistent** injection causes the highest Δpass@1 — up to **62.39%** (MapCoder)
|
|
214
|
+
- **Pipeline** systems are most position-sensitive — a single early fault drops pass@1 by up to **83.87%**
|
|
215
|
+
- **Compound** content faults amplify degradation — up to **86.36%** (MapCoder)
|
|
216
|
+
|
|
217
|
+
### RQ3: Fault Diagnosis
|
|
218
|
+
|
|
219
|
+
Existing methods achieve below **53%** accuracy on fault type and below **56%** on fault step. Truncation — the most harmful fault — is identified with only **4.3%** accuracy.
|
|
220
|
+
|
|
221
|
+
### Key Findings
|
|
222
|
+
|
|
223
|
+
| # | Finding |
|
|
224
|
+
|---|---|
|
|
225
|
+
| 1 | All systems degrade under fault injection (Δpass@1 up to 50 pp) |
|
|
226
|
+
| 2 | The most severe faults are NOT the most harmful — truncation/empty propagate silently |
|
|
227
|
+
| 3 | Most harmful faults are hardest to diagnose (truncation: 4.3% accuracy) |
|
|
228
|
+
| 4 | Architecture determines robustness — ranking consistent across all LLMs |
|
|
229
|
+
| 5 | Persistent injection overrides architectural advantages (up to 62.39%) |
|
|
230
|
+
| 6 | Compound content faults amplify degradation (up to 86.36%) |
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
## Documentation
|
|
235
|
+
|
|
236
|
+
- **[Fault Reference](docs/faults.md)** — Complete reference for all 65 fault configurations
|
|
237
|
+
- **[Examples](examples/)** — Runnable demos for OpenAI, LangChain, ADK
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
## Citation
|
|
242
|
+
|
|
243
|
+
If you use AgentChaos in your research, please cite:
|
|
244
|
+
|
|
245
|
+
```bibtex
|
|
246
|
+
@article{agentchaos2026,
|
|
247
|
+
title={AgentChaos: Chaos Engineering for Robust Agent Evaluation via LLM API Fault Injection},
|
|
248
|
+
year={2026}
|
|
249
|
+
}
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
## License
|
|
255
|
+
|
|
256
|
+
MIT — see [LICENSE](LICENSE).
|