agent-fuzzer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_fuzzer-0.1.0/.github/workflows/release.yml +38 -0
- agent_fuzzer-0.1.0/.gitignore +15 -0
- agent_fuzzer-0.1.0/LICENSE +21 -0
- agent_fuzzer-0.1.0/PKG-INFO +75 -0
- agent_fuzzer-0.1.0/README.md +62 -0
- agent_fuzzer-0.1.0/__init__.py +0 -0
- agent_fuzzer-0.1.0/pyproject.toml +17 -0
- agent_fuzzer-0.1.0/src/agent_fuzzer/__init__.py +3 -0
- agent_fuzzer-0.1.0/src/agent_fuzzer/cli.py +90 -0
- agent_fuzzer-0.1.0/src/agent_fuzzer/generator.py +233 -0
- agent_fuzzer-0.1.0/src/agent_fuzzer/reporter.py +120 -0
- agent_fuzzer-0.1.0/src/agent_fuzzer/runner.py +145 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
name: Release to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- 'v*'
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
id-token: write
|
|
10
|
+
packages: write
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
build-and-publish:
|
|
14
|
+
name: Build and publish to PyPI
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
environment:
|
|
17
|
+
name: pypi
|
|
18
|
+
url: https://pypi.org/p/agent-fuzzer
|
|
19
|
+
permissions:
|
|
20
|
+
id-token: write
|
|
21
|
+
|
|
22
|
+
steps:
|
|
23
|
+
- name: Checkout code
|
|
24
|
+
uses: actions/checkout@v4
|
|
25
|
+
|
|
26
|
+
- name: Set up Python
|
|
27
|
+
uses: actions/setup-python@v5
|
|
28
|
+
with:
|
|
29
|
+
python-version: '3.12'
|
|
30
|
+
|
|
31
|
+
- name: Install build dependencies
|
|
32
|
+
run: python -m pip install --upgrade build
|
|
33
|
+
|
|
34
|
+
- name: Build package
|
|
35
|
+
run: python -m build
|
|
36
|
+
|
|
37
|
+
- name: Publish package to PyPI
|
|
38
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 FableForge Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agent-fuzzer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Adversarial scenario generation and testing for coding agents
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: click>=8.0
|
|
8
|
+
Requires-Dist: jinja2>=3.1
|
|
9
|
+
Requires-Dist: pydantic>=2.0
|
|
10
|
+
Requires-Dist: pyyaml>=6.0
|
|
11
|
+
Requires-Dist: rich>=13.0
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# AgentFuzzer
|
|
15
|
+
|
|
16
|
+
[License: MIT](LICENSE) · [Python 3.10+](https://www.python.org/downloads/) · [Tests](tests/)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
Adversarial scenario generation and testing for coding agents.
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
### Install
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install agent-fuzzer
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# Run fuzzing tests
|
|
32
|
+
agentfuzzer fuzz --model gpt-4 --count 20 --difficulty hard
|
|
33
|
+
|
|
34
|
+
# View report
|
|
35
|
+
agentfuzzer report -r fuzz_report.json
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Scenario Categories
|
|
39
|
+
|
|
40
|
+
- **broken_code**: Code with bugs the agent must fix
|
|
41
|
+
- **failing_tests**: Tests that fail and need fixing
|
|
42
|
+
- **missing_deps**: Missing or conflicting dependencies
|
|
43
|
+
- **network_errors**: Network-related failures
|
|
44
|
+
|
|
45
|
+
## License
|
|
46
|
+
|
|
47
|
+
MIT
|
|
48
|
+
|
|
49
|
+
## Ecosystem
|
|
50
|
+
|
|
51
|
+
Part of the [FableForge](../) ecosystem — 21 open-source projects built from 210K real agent traces:
|
|
52
|
+
|
|
53
|
+
| Project | Description |
|
|
54
|
+
| --- | --- |
|
|
55
|
+
| **[Anvil](../anvil)** | Self-verified coding agent |
|
|
56
|
+
| **[VerifyLoop](../verifyloop)** | Plan→Execute→Verify→Recover framework |
|
|
57
|
+
| **[ErrorRecovery](../error-recovery)** | Self-healing middleware (3,725 error patterns) |
|
|
58
|
+
| **[FableForge-14B](../fableforge-14b)** | The fine-tuned 14B model (4-stage training) |
|
|
59
|
+
| **[ShellWhisperer](../shell-whisperer)** | 1.5B edge agent (phone/RPi, 50ms) |
|
|
60
|
+
| **[ReasonCritic](../reason-critic)** | Verification model (130 benchmark tasks) |
|
|
61
|
+
| **[TraceCompiler](../trace-compiler)** | Compile traces → LoRA skills |
|
|
62
|
+
| **[AgentRuntime](../agent-runtime)** | Persistent agent daemon (systemd for AI) |
|
|
63
|
+
| **[AgentSwarm](../agent-swarm)** | Multi-agent from real trace transitions |
|
|
64
|
+
| **[AgentTelemetry](../agent-telemetry)** | Datadog for agents (token tracking, costs) |
|
|
65
|
+
| **[BenchAgent](../bench-agent)** | HumanEval for tool-use (107 tasks) |
|
|
66
|
+
| **[AgentDev](../agent-dev)** | VSCode extension with verification |
|
|
67
|
+
| **[TraceViz](../trace-viz)** | Trace replay visualizer (Next.js) |
|
|
68
|
+
| **[AgentSkills](../agent-skills)** | npm for agent behaviors |
|
|
69
|
+
| **[AgentCurriculum](../agent-curriculum)** | 5-stage progressive training |
|
|
70
|
+
| **[AgentFuzzer](../agent-fuzzer)** | Adversarial testing for agents |
|
|
71
|
+
| **[AgentConstitution](../agent-constitution)** | Safety guardrails from traces |
|
|
72
|
+
| **[CostOptimizer](../cost-optimizer)** | Token cost reduction (50-80%) |
|
|
73
|
+
| **[AgentProfiler](../agent-profiler)** | Behavioral fingerprinting |
|
|
74
|
+
| **[TrajectoryDistiller](../trajectory-distiller)** | Trace→training data pipeline |
|
|
75
|
+
| **[Fable5-Dataset](../fable5-dataset)** | HuggingFace dataset release |
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# AgentFuzzer
|
|
2
|
+
|
|
3
|
+
[License: MIT](LICENSE) · [Python 3.10+](https://www.python.org/downloads/) · [Tests](tests/)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
Adversarial scenario generation and testing for coding agents.
|
|
7
|
+
|
|
8
|
+
## Quick Start
|
|
9
|
+
|
|
10
|
+
### Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install agent-fuzzer
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
# Run fuzzing tests
|
|
19
|
+
agentfuzzer fuzz --model gpt-4 --count 20 --difficulty hard
|
|
20
|
+
|
|
21
|
+
# View report
|
|
22
|
+
agentfuzzer report -r fuzz_report.json
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Scenario Categories
|
|
26
|
+
|
|
27
|
+
- **broken_code**: Code with bugs the agent must fix
|
|
28
|
+
- **failing_tests**: Tests that fail and need fixing
|
|
29
|
+
- **missing_deps**: Missing or conflicting dependencies
|
|
30
|
+
- **network_errors**: Network-related failures
|
|
31
|
+
|
|
32
|
+
## License
|
|
33
|
+
|
|
34
|
+
MIT
|
|
35
|
+
|
|
36
|
+
## Ecosystem
|
|
37
|
+
|
|
38
|
+
Part of the [FableForge](../) ecosystem — 21 open-source projects built from 210K real agent traces:
|
|
39
|
+
|
|
40
|
+
| Project | Description |
|
|
41
|
+
| --- | --- |
|
|
42
|
+
| **[Anvil](../anvil)** | Self-verified coding agent |
|
|
43
|
+
| **[VerifyLoop](../verifyloop)** | Plan→Execute→Verify→Recover framework |
|
|
44
|
+
| **[ErrorRecovery](../error-recovery)** | Self-healing middleware (3,725 error patterns) |
|
|
45
|
+
| **[FableForge-14B](../fableforge-14b)** | The fine-tuned 14B model (4-stage training) |
|
|
46
|
+
| **[ShellWhisperer](../shell-whisperer)** | 1.5B edge agent (phone/RPi, 50ms) |
|
|
47
|
+
| **[ReasonCritic](../reason-critic)** | Verification model (130 benchmark tasks) |
|
|
48
|
+
| **[TraceCompiler](../trace-compiler)** | Compile traces → LoRA skills |
|
|
49
|
+
| **[AgentRuntime](../agent-runtime)** | Persistent agent daemon (systemd for AI) |
|
|
50
|
+
| **[AgentSwarm](../agent-swarm)** | Multi-agent from real trace transitions |
|
|
51
|
+
| **[AgentTelemetry](../agent-telemetry)** | Datadog for agents (token tracking, costs) |
|
|
52
|
+
| **[BenchAgent](../bench-agent)** | HumanEval for tool-use (107 tasks) |
|
|
53
|
+
| **[AgentDev](../agent-dev)** | VSCode extension with verification |
|
|
54
|
+
| **[TraceViz](../trace-viz)** | Trace replay visualizer (Next.js) |
|
|
55
|
+
| **[AgentSkills](../agent-skills)** | npm for agent behaviors |
|
|
56
|
+
| **[AgentCurriculum](../agent-curriculum)** | 5-stage progressive training |
|
|
57
|
+
| **[AgentFuzzer](../agent-fuzzer)** | Adversarial testing for agents |
|
|
58
|
+
| **[AgentConstitution](../agent-constitution)** | Safety guardrails from traces |
|
|
59
|
+
| **[CostOptimizer](../cost-optimizer)** | Token cost reduction (50-80%) |
|
|
60
|
+
| **[AgentProfiler](../agent-profiler)** | Behavioral fingerprinting |
|
|
61
|
+
| **[TrajectoryDistiller](../trajectory-distiller)** | Trace→training data pipeline |
|
|
62
|
+
| **[Fable5-Dataset](../fable5-dataset)** | HuggingFace dataset release |
|
|
File without changes
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "agent-fuzzer"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Adversarial scenario generation and testing for coding agents"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
dependencies = ["pydantic>=2.0", "pyyaml>=6.0", "click>=8.0", "rich>=13.0", "jinja2>=3.1"]
|
|
12
|
+
|
|
13
|
+
[project.scripts]
|
|
14
|
+
agentfuzzer = "agent_fuzzer.cli:cli"
|
|
15
|
+
|
|
16
|
+
[tool.hatch.build.targets.wheel]
|
|
17
|
+
packages = ["src/agent_fuzzer"]
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""AgentFuzzer CLI."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
|
|
9
|
+
from agent_fuzzer.generator import ScenarioGenerator
|
|
10
|
+
from agent_fuzzer.runner import FuzzRunner
|
|
11
|
+
from agent_fuzzer.reporter import FuzzReporter
|
|
12
|
+
|
|
13
|
+
console = Console()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@click.group()
@click.version_option(version="0.1.0")
def cli() -> None:
    """AgentFuzzer — Adversarial scenario testing for coding agents."""
    # Root command group: it does no work itself. Subcommands register on it
    # through the ``@cli.command()`` decorator. The docstring above doubles as
    # the ``--help`` text, so it is kept verbatim.
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@cli.command()
@click.option("--model", default="gpt-4", help="Model to test")
@click.option(
    "--category",
    "-c",
    type=click.Choice(["broken_code", "failing_tests", "missing_deps", "network_errors"]),
    help="Scenario category",
)
@click.option("--count", "-n", default=10, help="Number of scenarios to generate")
@click.option("--difficulty", "-d", type=click.Choice(["easy", "medium", "hard"]), help="Difficulty filter")
# --output is handed to FuzzReporter.save_report(), which writes a single JSON
# report file. It is therefore a file path, not a scenario directory, and an
# existing directory is rejected up front instead of failing inside open().
@click.option("--output", "-o", type=click.Path(dir_okay=False), help="Save the JSON report to this file")
def fuzz(model: str, category: str | None, count: int, difficulty: str | None, output: str | None) -> None:
    """Run fuzzing scenarios against an agent."""
    # Args (all supplied by click):
    #   model:      name passed to FuzzRunner and echoed in the banner.
    #   category:   restrict generation to one scenario category, or None for all.
    #   count:      number of scenarios to generate.
    #   difficulty: force one difficulty, or None to let the generator pick.
    #   output:     optional path of the JSON report file to write.
    generator = ScenarioGenerator()
    scenarios = generator.generate(category=category, count=count, difficulty=difficulty)

    console.print(f"\n[bold]Running {len(scenarios)} scenarios against {model}[/bold]\n")

    runner = FuzzRunner(model=model)
    results = runner.run_suite(scenarios)

    reporter = FuzzReporter(results)
    report = reporter.generate_report()

    # Only scalar metrics fit the two-column table; the nested per-category and
    # per-difficulty breakdowns are left out here.
    table = Table(title="Fuzzing Results")
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="green")
    for key, value in report.items():
        if key not in ("by_category", "by_difficulty"):
            table.add_row(key, str(value))
    console.print(table)

    if output:
        reporter.save_report(output)
        console.print(f"\n[green]Report saved to {output}[/green]")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@cli.command()
@click.option("--report", "-r", type=click.Path(exists=True), help="Report JSON file")
def report(report: str | None) -> None:
    """Display a saved fuzzing report."""
    # Args:
    #   report: path of a JSON report file (click verifies that it exists), or
    #           None when the option was omitted.
    # Raises:
    #   click.ClickException: if the file is not valid JSON or its top-level
    #           value is not an object.
    import json
    from pathlib import Path

    if not report:
        console.print("[yellow]No report file specified. Run 'agentfuzzer fuzz' first.[/yellow]")
        return

    path = Path(report)
    try:
        # Explicit encoding so the result does not depend on the platform locale.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as exc:
        raise click.ClickException(f"{path} is not valid JSON: {exc}") from exc
    if not isinstance(data, dict):
        raise click.ClickException(f"{path} does not contain a JSON object")

    # Top-level scalar metrics; nested breakdowns are rendered separately below.
    table = Table(title="Fuzzing Report")
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="green")
    for key, value in data.items():
        if isinstance(value, dict):
            continue
        table.add_row(key, str(value))
    console.print(table)

    if "by_category" in data:
        cat_table = Table(title="Results by Category")
        cat_table.add_column("Category", style="cyan")
        cat_table.add_column("Total")
        cat_table.add_column("Passed")
        cat_table.add_column("Pass Rate", style="green")
        for cat, val in data["by_category"].items():
            cat_table.add_row(cat, str(val["total"]), str(val["passed"]), str(val["pass_rate"]))
        console.print(cat_table)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
if __name__ == "__main__":
|
|
90
|
+
cli()
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
"""Generate adversarial scenarios: broken_code, failing_tests, missing_deps, network_errors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import random
|
|
7
|
+
import string
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import yaml
|
|
13
|
+
|
|
14
|
+
BROKEN_CODE_TEMPLATES: list[dict[str, str]] = [
|
|
15
|
+
{"name": "off_by_one_loop", "description": "Loop iterates one time too many or too few", "language": "python",
|
|
16
|
+
"code": "def find_index(items, target):\n for i in range(len(items)):\n if items[i] == target:\n return i\n return -1\n\n# Bug: range(len(items)) should handle edge case where target is last item",
|
|
17
|
+
"expected_fix": "Handle edge cases in loop boundaries"},
|
|
18
|
+
{"name": "null_deref", "description": "Null/None reference without checking", "language": "python",
|
|
19
|
+
"code": "def get_name(user):\n return user.name.upper()\n\n# Bug: user could be None",
|
|
20
|
+
"expected_fix": "Add null check before accessing attributes"},
|
|
21
|
+
{"name": "type_confusion", "description": "Function receives wrong type", "language": "python",
|
|
22
|
+
"code": "def calculate_total(prices):\n return sum(prices)\n\n# Bug: prices might contain strings",
|
|
23
|
+
"expected_fix": "Validate and convert input types"},
|
|
24
|
+
{"name": "infinite_recursion", "description": "Recursive function without proper base case", "language": "python",
|
|
25
|
+
"code": "def fibonacci(n):\n return fibonacci(n - 1) + fibonacci(n - 2)\n\n# Bug: No base case",
|
|
26
|
+
"expected_fix": "Add base case for n <= 1"},
|
|
27
|
+
{"name": "race_condition", "description": "Shared mutable state without locking", "language": "python",
|
|
28
|
+
"code": "class Counter:\n def __init__(self):\n self.count = 0\n \n def increment(self):\n self.count += 1\n\n# Bug: Not thread-safe",
|
|
29
|
+
"expected_fix": "Add thread-safe locking"},
|
|
30
|
+
{"name": "incorrect_comparison", "description": "Using = instead of == in comparison", "language": "python",
|
|
31
|
+
"code": "def check_admin(user):\n if user.role = 'admin':\n return True\n return False\n\n# Bug: Assignment instead of comparison",
|
|
32
|
+
"expected_fix": "Use == for comparison, not ="},
|
|
33
|
+
{"name": "missing_import", "description": "Code uses module without importing it", "language": "python",
|
|
34
|
+
"code": "def get_current_time():\n return datetime.now()\n\n# Bug: datetime module not imported",
|
|
35
|
+
"expected_fix": "Add 'from datetime import datetime'"},
|
|
36
|
+
{"name": "swallowed_exception", "description": "Bare except that hides errors", "language": "python",
|
|
37
|
+
"code": "def read_config(path):\n try:\n with open(path) as f:\n return json.load(f)\n except:\n return {}\n\n# Bug: Catches ALL exceptions silently",
|
|
38
|
+
"expected_fix": "Catch specific exceptions and log errors"},
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
FAILING_TEST_TEMPLATES: list[dict[str, str]] = [
|
|
42
|
+
{"name": "flaky_test", "description": "Test that fails intermittently due to timing or ordering", "language": "python",
|
|
43
|
+
"code": "def test_api_response():\n response = api.get('/users')\n assert response.status_code == 200\n assert len(response.json()) > 0\n\n# Bug: Fails when API is slow or returns empty DB",
|
|
44
|
+
"expected_fix": "Add retries, mock API, or make assertions more robust"},
|
|
45
|
+
{"name": "hardcoded_path", "description": "Test uses hardcoded file paths", "language": "python",
|
|
46
|
+
"code": "def test_read_file():\n content = read_file('/Users/dev/data.txt')\n assert content is not None\n\n# Bug: Path doesn't exist on other machines",
|
|
47
|
+
"expected_fix": "Use tmp_path fixture or relative paths"},
|
|
48
|
+
{"name": "missing_mock", "description": "Test calls real external service", "language": "python",
|
|
49
|
+
"code": "def test_send_email():\n result = send_email('test@example.com', 'Subject', 'Body')\n assert result.success\n\n# Bug: Sends real emails in tests",
|
|
50
|
+
"expected_fix": "Mock the email sending function"},
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
MISSING_DEPS_TEMPLATES: list[dict[str, str]] = [
|
|
54
|
+
{"name": "missing_package", "description": "Import fails because package isn't installed", "language": "python",
|
|
55
|
+
"code": "import pandas as pd\nimport numpy as np\n\ndef process_data(data):\n return pd.DataFrame(data).describe()\n\n# Bug: pandas and numpy not in requirements.txt",
|
|
56
|
+
"expected_fix": "Add pandas and numpy to requirements.txt"},
|
|
57
|
+
{"name": "version_conflict", "description": "Two packages require different versions", "language": "python",
|
|
58
|
+
"code": "# requirements.txt:\n# fastapi==0.100.0\n# pydantic==1.10.0\n# Bug: fastapi 0.100 requires pydantic v2",
|
|
59
|
+
"expected_fix": "Upgrade pydantic to v2 or downgrade fastapi"},
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
NETWORK_ERROR_TEMPLATES: list[dict[str, str]] = [
|
|
63
|
+
{"name": "connection_timeout", "description": "API call times out without retry", "language": "python",
|
|
64
|
+
"code": "import requests\n\ndef fetch_data(url):\n return requests.get(url).json()\n\n# Bug: No timeout, no retry, no error handling",
|
|
65
|
+
"expected_fix": "Add timeout, retry logic, and proper error handling"},
|
|
66
|
+
{"name": "dns_failure", "description": "DNS resolution fails silently", "language": "python",
|
|
67
|
+
"code": "def get_service_url(service_name):\n return f'http://{service_name}:8080'\n\n# Bug: No DNS resolution check",
|
|
68
|
+
"expected_fix": "Add DNS resolution check and fallback URLs"},
|
|
69
|
+
{"name": "ssl_error", "description": "SSL certificate verification fails", "language": "python",
|
|
70
|
+
"code": "import requests\n\ndef fetch_secure(url):\n return requests.get(url, verify=False).json()\n\n# Bug: Disabling SSL verification is insecure",
|
|
71
|
+
"expected_fix": "Fix SSL certificates instead of disabling verification"},
|
|
72
|
+
]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class Scenario:
    """A single adversarial test scenario."""

    # Template name; also used as the file stem when scenarios are saved.
    name: str
    # Scenario category key (e.g. "broken_code").
    category: str
    # Human-readable summary of the problem.
    description: str
    # Language of ``code``.
    language: str
    # The problematic code presented to the agent.
    code: str
    # Short description of the fix the agent is expected to make.
    expected_fix: str
    # One of "easy", "medium", "hard".
    difficulty: str = "medium"
    # Free-form labels attached by the generator.
    tags: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the scenario as a plain dictionary.

        The ``tags`` value is a copy, so mutating the returned dictionary
        never changes this scenario.
        """
        return {
            "name": self.name,
            "category": self.category,
            "description": self.description,
            "language": self.language,
            "code": self.code,
            "expected_fix": self.expected_fix,
            "difficulty": self.difficulty,
            "tags": list(self.tags),
        }

    def to_yaml(self) -> str:
        """Return the scenario serialized as block-style YAML."""
        return yaml.dump(self.to_dict(), default_flow_style=False)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class ScenarioGenerator:
    """Generate adversarial scenarios for testing coding agents.

    Scenarios are built from the module-level template lists in four
    categories:
    - broken_code: Code with bugs the agent must fix
    - failing_tests: Tests that fail and need fixing
    - missing_deps: Missing or conflicting dependencies
    - network_errors: Network-related failures

    Every random choice goes through ``self.rng``, so passing a fixed
    ``seed`` yields a reproducible sequence.
    """

    def __init__(self, seed: int | None = None):
        """Create a generator.

        Args:
            seed: Seed for the private random number generator; ``None``
                lets ``random.Random`` seed itself.
        """
        self.rng = random.Random(seed)
        self.categories = {
            "broken_code": BROKEN_CODE_TEMPLATES,
            "failing_tests": FAILING_TEST_TEMPLATES,
            "missing_deps": MISSING_DEPS_TEMPLATES,
            "network_errors": NETWORK_ERROR_TEMPLATES,
        }

    def generate(self, category: str | None = None, count: int = 10, difficulty: str | None = None) -> list[Scenario]:
        """Generate adversarial scenarios.

        Args:
            category: Optional category filter.
            count: Number of scenarios to generate.
            difficulty: Optional difficulty filter (easy, medium, hard).

        Returns:
            List of Scenario objects. Templates are sampled with
            replacement, so several scenarios may share a name.

        Raises:
            KeyError: If ``category`` is given but is not a known category.
        """
        difficulties = ["easy", "medium", "hard"]
        templates = {category: self.categories[category]} if category else self.categories
        # Built once; the candidate categories do not change between iterations.
        category_names = list(templates)

        scenarios: list[Scenario] = []
        for _ in range(count):
            # Order of rng calls (category, template, difficulty, variation)
            # is significant for seeded reproducibility.
            cat = category if category else self.rng.choice(category_names)
            template = self.rng.choice(templates[cat])
            diff = difficulty or self.rng.choice(difficulties)
            language = template.get("language", "python")

            scenarios.append(
                Scenario(
                    name=template["name"],
                    category=cat,
                    description=template["description"],
                    language=language,
                    code=self._add_variation(template["code"], diff),
                    expected_fix=template["expected_fix"],
                    difficulty=diff,
                    tags=[cat, diff, language],
                )
            )

        return scenarios

    @staticmethod
    def _noise_indent(lines: list[str], pos: int) -> str:
        """Pick the indentation for a noise line inserted at ``lines[pos]``.

        Best-effort heuristic that keeps Python snippets parseable: continue
        the block of the previous code line, or, when that line opens a block
        (ends with ``:``) or there is none, match the next code line.
        Blank lines and comment lines are ignored.
        """

        def is_code(line: str) -> bool:
            stripped = line.strip()
            return bool(stripped) and not stripped.startswith("#")

        def indent_of(line: str) -> str:
            return line[: len(line) - len(line.lstrip())]

        previous = next((ln for ln in reversed(lines[:pos]) if is_code(ln)), None)
        if previous is not None and not previous.rstrip().endswith(":"):
            return indent_of(previous)
        following = next((ln for ln in lines[pos:] if is_code(ln)), None)
        return indent_of(following) if following is not None else ""

    def _add_variation(self, code: str, difficulty: str) -> str:
        """Add difficulty-based variation to scenario code.

        - "easy": the code is returned unchanged.
        - "medium": one misleading comment is inserted at a random interior
          position (only when the code has more than two lines).
        - anything else ("hard"): two randomly chosen noise lines are
          inserted at random positions, indented to fit the surrounding code
          so that the noise does not introduce syntax errors of its own.
        """
        if difficulty == "easy":
            return code

        lines = code.split("\n")
        if difficulty == "medium":
            # Add misleading comments
            if len(lines) > 2:
                insert_pos = self.rng.randint(1, len(lines) - 2)
                lines.insert(insert_pos, " # This line looks suspicious but is correct")
            return "\n".join(lines)

        # Add red herrings and noise
        noise_lines = [
            "import logging",
            "logging.basicConfig(level=logging.DEBUG)",
            "# TODO: refactor this later",
            "# NOTE: performance optimization needed",
        ]
        for _ in range(2):
            pos = self.rng.randint(0, len(lines))
            noise = self.rng.choice(noise_lines)
            lines.insert(pos, self._noise_indent(lines, pos) + noise)
        return "\n".join(lines)

    def generate_all(self) -> list[Scenario]:
        """Generate the complete scenario suite (50+ scenarios).

        Every template is emitted once per difficulty (named
        ``<template>_<difficulty>``), followed by ten random scenarios.

        Returns:
            List of all scenarios across all categories.
        """
        scenarios = []
        # Static scenarios from templates
        for cat, templates in self.categories.items():
            for template in templates:
                for diff in ["easy", "medium", "hard"]:
                    scenarios.append(Scenario(
                        name=f"{template['name']}_{diff}",
                        category=cat,
                        description=template["description"],
                        language=template.get("language", "python"),
                        code=self._add_variation(template["code"], diff),
                        expected_fix=template["expected_fix"],
                        difficulty=diff,
                        tags=[cat, diff],
                    ))
        # Additional random scenarios
        scenarios.extend(self.generate(count=10))
        return scenarios

    def save_scenarios(self, scenarios: list[Scenario], output_dir: str | Path) -> list[Path]:
        """Save scenarios as YAML files organized by category.

        Each scenario is written to ``<output_dir>/<category>/<name>.yaml``.
        When several scenarios in the same call map to the same file, later
        ones get a numeric suffix (``<name>_2.yaml``, ``<name>_3.yaml``, ...)
        instead of overwriting the earlier file.

        Args:
            scenarios: List of Scenario objects.
            output_dir: Base directory for output.

        Returns:
            List of paths to saved files, one per scenario, in input order.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        paths: list[Path] = []
        written: set[Path] = set()

        for scenario in scenarios:
            cat_dir = output_dir / scenario.category
            cat_dir.mkdir(parents=True, exist_ok=True)
            path = cat_dir / f"{scenario.name}.yaml"
            suffix = 2
            while path in written:
                path = cat_dir / f"{scenario.name}_{suffix}.yaml"
                suffix += 1
            written.add(path)
            path.write_text(scenario.to_yaml(), encoding="utf-8")
            paths.append(path)

        return paths
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""Generate fuzzing reports with success/failure rates."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from agent_fuzzer.runner import FuzzResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class CategorySummary:
    """Aggregated outcome counts and averages for one scenario category."""

    category: str
    # Outcome counts.
    total: int = 0
    passed: int = 0
    partial: int = 0
    failed: int = 0
    # Averages over the category's results.
    avg_score: float = 0.0
    avg_tokens: float = 0.0
    avg_duration: float = 0.0
    # Fraction in [0, 1] as computed by the caller; rendered as a percentage.
    pass_rate: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Return a display-ready dictionary.

        Counts are copied as-is; the pass rate becomes a percentage string,
        the duration a seconds string, and the other averages are rounded.
        """
        raw_fields = ("category", "total", "passed", "partial", "failed")
        summary: dict[str, Any] = {key: getattr(self, key) for key in raw_fields}
        summary["pass_rate"] = f"{self.pass_rate:.1%}"
        summary["avg_score"] = round(self.avg_score, 3)
        summary["avg_tokens"] = round(self.avg_tokens, 1)
        summary["avg_duration"] = f"{self.avg_duration:.1f}s"
        return summary
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class FuzzReporter:
    """Summarize a list of fuzz results as overall and grouped metrics."""

    def __init__(self, results: list[FuzzResult] | None = None):
        """Store the results to report on; a falsy value becomes an empty list."""
        self.results = results if results else []

    def generate_report(self) -> dict[str, Any]:
        """Build the report dictionary.

        Returns:
            ``{"status": "no results", "total": 0}`` when there are no
            results; otherwise overall counts and averages plus
            ``by_category`` and ``by_difficulty`` breakdowns.
        """
        rows = self.results
        if not rows:
            return {"status": "no results", "total": 0}

        def tally(group):
            # A result counts as partial only when it did not pass outright.
            n_passed = 0
            n_partial = 0
            for item in group:
                if item.passed:
                    n_passed += 1
                elif item.partial:
                    n_partial += 1
            return n_passed, n_partial

        def mean(group, attr):
            size = len(group)
            return sum(getattr(item, attr) for item in group) / size if size else 0

        total = len(rows)
        passed, partial = tally(rows)

        # Group results by category, preserving first-seen order.
        grouped: dict[str, list[FuzzResult]] = {}
        for item in rows:
            bucket = grouped.get(item.category)
            if bucket is None:
                bucket = grouped[item.category] = []
            bucket.append(item)

        by_category = {}
        for name, members in grouped.items():
            size = len(members)
            cat_passed, cat_partial = tally(members)
            by_category[name] = CategorySummary(
                category=name,
                total=size,
                passed=cat_passed,
                partial=cat_partial,
                failed=size - cat_passed - cat_partial,
                avg_score=mean(members, "score"),
                avg_tokens=mean(members, "tokens_used"),
                avg_duration=mean(members, "duration_seconds"),
                pass_rate=cat_passed / size if size else 0,
            ).to_dict()

        # Difficulty breakdown; levels without any result are omitted.
        by_difficulty = {}
        for level in ("easy", "medium", "hard"):
            members = [item for item in rows if item.difficulty == level]
            if not members:
                continue
            level_passed = sum(1 for item in members if item.passed)
            by_difficulty[level] = {
                "total": len(members),
                "passed": level_passed,
                "pass_rate": f"{level_passed/len(members):.1%}",
            }

        overall_avg_duration = mean(rows, "duration_seconds")
        return {
            "status": "complete",
            "total": total,
            "passed": passed,
            "partial": partial,
            "failed": total - passed - partial,
            "pass_rate": f"{passed/total:.1%}" if total else "N/A",
            "avg_score": round(mean(rows, "score"), 3),
            "avg_tokens": round(mean(rows, "tokens_used"), 1),
            "avg_duration": f"{overall_avg_duration:.1f}s",
            "by_category": by_category,
            "by_difficulty": by_difficulty,
        }

    def save_report(self, path: str | Path, format: str = "json") -> None:
        """Write the report to ``path`` as indented JSON.

        Parent directories are created as needed. ``format`` is accepted
        but not consulted; the output is always JSON.
        """
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        payload = self.generate_report()
        with open(target, "w") as handle:
            json.dump(payload, handle, indent=2)
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Run agent against adversarial scenarios and collect metrics."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from agent_fuzzer.generator import Scenario, ScenarioGenerator
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class FuzzResult:
    """Outcome and metrics recorded for one fuzz scenario run."""

    # Identification of the scenario that was run.
    scenario_name: str
    category: str
    difficulty: str
    # Outcome flags and score.
    passed: bool = False
    partial: bool = False
    score: float = 0.0
    # Cost metrics.
    tokens_used: int = 0
    duration_seconds: float = 0.0
    # Error message, if any.
    error: str | None = None
    agent_output: str = ""
    expected_fix: str = ""
    details: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dictionary.

        ``agent_output`` and ``details`` are not part of the exported
        dictionary.
        """
        exported = (
            "scenario_name",
            "category",
            "difficulty",
            "passed",
            "partial",
            "score",
            "tokens_used",
            "duration_seconds",
            "error",
            "expected_fix",
        )
        return {key: getattr(self, key) for key in exported}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class FuzzRunner:
    """Run an agent against adversarial scenarios and collect metrics.

    The runner currently *simulates* an agent: each scenario's outcome is
    drawn from a pseudo-random generator seeded from the scenario name, with
    a pass probability that depends on the scenario's difficulty.

    Attributes:
        model: Model identifier supplied by the caller.
        max_retries: Retry budget supplied by the caller.
        timeout: Timeout supplied by the caller.
        results: Results of the most recent ``run_suite`` call.

    ``model``, ``max_retries`` and ``timeout`` are stored but are not
    consulted by the simulation.
    """

    # Simulated probability of a full pass, keyed by scenario difficulty.
    _PASS_RATES = {"easy": 0.9, "medium": 0.65, "hard": 0.35}
    # Pass probability for any difficulty not listed above.
    _DEFAULT_PASS_RATE = 0.5
    # Keys a scenario YAML document must provide.
    _REQUIRED_FIELDS = ("name", "category", "description", "code", "expected_fix")

    def __init__(self, model: str = "gpt-4", max_retries: int = 3, timeout: int = 60):
        self.model = model
        self.max_retries = max_retries
        self.timeout = timeout
        self.results: list[FuzzResult] = []

    def _simulate_outcome(self, scenario: Scenario) -> dict[str, Any]:
        """Draw the simulated metrics for one scenario.

        Args:
            scenario: Object exposing ``name`` and ``difficulty``.

        Returns:
            Keyword arguments for ``FuzzResult``: ``passed``, ``partial``,
            ``score``, ``tokens_used`` and ``duration_seconds``.
        """
        import random

        # A private generator seeded with the name *string* gives the same
        # outcome in every process. Seeding with hash(name) does not, because
        # str hashes are salted per interpreter run. Using a private
        # generator also leaves the caller's global random state untouched.
        rng = random.Random(str(scenario.name))

        base_rate = self._PASS_RATES.get(scenario.difficulty, self._DEFAULT_PASS_RATE)
        passed = rng.random() < base_rate
        # A partial result is only possible when the scenario did not pass.
        partial = not passed and rng.random() < 0.4
        if passed:
            score = 1.0
        elif partial:
            score = 0.5
        else:
            score = rng.uniform(0.0, 0.3)

        return {
            "passed": passed,
            "partial": partial,
            "score": score,
            "tokens_used": rng.randint(200, 3000),
            "duration_seconds": rng.uniform(2.0, 30.0),
        }

    def run_scenario(self, scenario: Scenario) -> FuzzResult:
        """Run a single adversarial scenario.

        In production, this would call the actual agent API.
        Here we simulate results based on difficulty; the simulation is
        deterministic for a given scenario name.

        Args:
            scenario: The Scenario to run.

        Returns:
            FuzzResult with outcome metrics.
        """
        return FuzzResult(
            scenario_name=scenario.name,
            category=scenario.category,
            difficulty=scenario.difficulty,
            expected_fix=scenario.expected_fix,
            **self._simulate_outcome(scenario),
        )

    def run_suite(self, scenarios: list[Scenario] | None = None, categories: list[str] | None = None) -> list[FuzzResult]:
        """Run a suite of adversarial scenarios.

        Replaces ``self.results`` with the results of this run.

        Args:
            scenarios: Optional list of scenarios. If None, generates all.
            categories: Optional category filter; an empty or None value
                disables filtering.

        Returns:
            List of FuzzResult objects (the same list as ``self.results``).
        """
        if scenarios is None:
            scenarios = ScenarioGenerator().generate_all()

        if categories:
            scenarios = [s for s in scenarios if s.category in categories]

        self.results = []
        for scenario in scenarios:
            self.results.append(self.run_scenario(scenario))

        return self.results

    def run_from_directory(self, scenarios_dir: str | Path) -> list[FuzzResult]:
        """Load scenarios from YAML files and run them.

        Files matching ``*.yaml`` are searched recursively and processed in
        sorted path order so repeated runs visit scenarios in the same order.

        Args:
            scenarios_dir: Directory containing scenario YAML files.

        Returns:
            List of FuzzResult objects.

        Raises:
            TypeError: If a file's top-level YAML value is not a mapping
                (for example, an empty file).
            KeyError: If a file lacks one of the required fields.
        """
        import yaml

        scenarios_dir = Path(scenarios_dir)
        scenarios: list[Scenario] = []

        for yaml_file in sorted(scenarios_dir.rglob("*.yaml")):
            with open(yaml_file, encoding="utf-8") as f:
                data = yaml.safe_load(f)

            # Fail with the offending path instead of an opaque
            # "'NoneType' object is not subscriptable".
            if not isinstance(data, dict):
                raise TypeError(
                    f"{yaml_file}: expected a mapping at the top level, "
                    f"got {type(data).__name__}"
                )
            missing = [key for key in self._REQUIRED_FIELDS if key not in data]
            if missing:
                raise KeyError(
                    f"{yaml_file}: missing required field(s): {', '.join(missing)}"
                )

            scenarios.append(Scenario(
                name=data["name"],
                category=data["category"],
                description=data["description"],
                language=data.get("language", "python"),
                code=data["code"],
                expected_fix=data["expected_fix"],
                difficulty=data.get("difficulty", "medium"),
                # "tags:" with no value loads as None; normalise to a list.
                tags=data.get("tags") or [],
            ))

        return self.run_suite(scenarios)
|