agent-fuzzer 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ name: Release to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*'
7
+
8
+ permissions:
9
+ id-token: write
10
+ packages: write
11
+
12
+ jobs:
13
+ build-and-publish:
14
+ name: Build and publish to PyPI
15
+ runs-on: ubuntu-latest
16
+ environment:
17
+ name: pypi
18
+ url: https://pypi.org/p/agent-fuzzer
19
+ permissions:
20
+ id-token: write
21
+
22
+ steps:
23
+ - name: Checkout code
24
+ uses: actions/checkout@v4
25
+
26
+ - name: Set up Python
27
+ uses: actions/setup-python@v5
28
+ with:
29
+ python-version: '3.12'
30
+
31
+ - name: Install build dependencies
32
+ run: python -m pip install --upgrade build
33
+
34
+ - name: Build package
35
+ run: python -m build
36
+
37
+ - name: Publish package to PyPI
38
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,15 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .eggs/
8
+ *.egg
9
+ .pytest_cache/
10
+ .mypy_cache/
11
+ .ruff_cache/
12
+ .venv/
13
+ venv/
14
+ *.so
15
+ .env
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 FableForge Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,75 @@
1
+ Metadata-Version: 2.4
2
+ Name: agent-fuzzer
3
+ Version: 0.1.0
4
+ Summary: Adversarial scenario generation and testing for coding agents
5
+ License-File: LICENSE
6
+ Requires-Python: >=3.10
7
+ Requires-Dist: click>=8.0
8
+ Requires-Dist: jinja2>=3.1
9
+ Requires-Dist: pydantic>=2.0
10
+ Requires-Dist: pyyaml>=6.0
11
+ Requires-Dist: rich>=13.0
12
+ Description-Content-Type: text/markdown
13
+
14
+ # AgentFuzzer
15
+
16
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/) [![Tests](https://img.shields.io/badge/tests-0-yellow.svg)](tests/)
17
+
18
+
19
+ Adversarial scenario generation and testing for coding agents.
20
+
21
+ ## Quick Start
22
+
23
+ ### Install
24
+
25
+ ```bash
26
+ pip install agent-fuzzer
27
+ ```
28
+
29
+
30
+ ```bash
31
+ # Run fuzzing tests
32
+ agentfuzzer fuzz --model gpt-4 --count 20 --difficulty hard --output fuzz_report.json
33
+
34
+ # View report
35
+ agentfuzzer report -r fuzz_report.json
36
+ ```
37
+
38
+ ## Scenario Categories
39
+
40
+ - **broken_code**: Code with bugs the agent must fix
41
+ - **failing_tests**: Tests that fail and need fixing
42
+ - **missing_deps**: Missing or conflicting dependencies
43
+ - **network_errors**: Network-related failures
44
+
45
+ ## License
46
+
47
+ MIT
48
+
49
+ ## Ecosystem
50
+
51
+ Part of the [FableForge](../) ecosystem — 21 open-source projects built from 210K real agent traces:
52
+
53
+ | Project | Description |
54
+ | --- | --- |
55
+ | **[Anvil](../anvil)** | Self-verified coding agent |
56
+ | **[VerifyLoop](../verifyloop)** | Plan→Execute→Verify→Recover framework |
57
+ | **[ErrorRecovery](../error-recovery)** | Self-healing middleware (3,725 error patterns) |
58
+ | **[FableForge-14B](../fableforge-14b)** | The fine-tuned 14B model (4-stage training) |
59
+ | **[ShellWhisperer](../shell-whisperer)** | 1.5B edge agent (phone/RPi, 50ms) |
60
+ | **[ReasonCritic](../reason-critic)** | Verification model (130 benchmark tasks) |
61
+ | **[TraceCompiler](../trace-compiler)** | Compile traces → LoRA skills |
62
+ | **[AgentRuntime](../agent-runtime)** | Persistent agent daemon (systemd for AI) |
63
+ | **[AgentSwarm](../agent-swarm)** | Multi-agent from real trace transitions |
64
+ | **[AgentTelemetry](../agent-telemetry)** | Datadog for agents (token tracking, costs) |
65
+ | **[BenchAgent](../bench-agent)** | HumanEval for tool-use (107 tasks) |
66
+ | **[AgentDev](../agent-dev)** | VSCode extension with verification |
67
+ | **[TraceViz](../trace-viz)** | Trace replay visualizer (Next.js) |
68
+ | **[AgentSkills](../agent-skills)** | npm for agent behaviors |
69
+ | **[AgentCurriculum](../agent-curriculum)** | 5-stage progressive training |
70
+ | **[AgentFuzzer](../agent-fuzzer)** | Adversarial testing for agents |
71
+ | **[AgentConstitution](../agent-constitution)** | Safety guardrails from traces |
72
+ | **[CostOptimizer](../cost-optimizer)** | Token cost reduction (50-80%) |
73
+ | **[AgentProfiler](../agent-profiler)** | Behavioral fingerprinting |
74
+ | **[TrajectoryDistiller](../trajectory-distiller)** | Trace→training data pipeline |
75
+ | **[Fable5-Dataset](../fable5-dataset)** | HuggingFace dataset release |
@@ -0,0 +1,62 @@
1
+ # AgentFuzzer
2
+
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/) [![Tests](https://img.shields.io/badge/tests-0-yellow.svg)](tests/)
4
+
5
+
6
+ Adversarial scenario generation and testing for coding agents.
7
+
8
+ ## Quick Start
9
+
10
+ ### Install
11
+
12
+ ```bash
13
+ pip install agent-fuzzer
14
+ ```
15
+
16
+
17
+ ```bash
18
+ # Run fuzzing tests
19
+ agentfuzzer fuzz --model gpt-4 --count 20 --difficulty hard --output fuzz_report.json
20
+
21
+ # View report
22
+ agentfuzzer report -r fuzz_report.json
23
+ ```
24
+
25
+ ## Scenario Categories
26
+
27
+ - **broken_code**: Code with bugs the agent must fix
28
+ - **failing_tests**: Tests that fail and need fixing
29
+ - **missing_deps**: Missing or conflicting dependencies
30
+ - **network_errors**: Network-related failures
31
+
32
+ ## License
33
+
34
+ MIT
35
+
36
+ ## Ecosystem
37
+
38
+ Part of the [FableForge](../) ecosystem — 21 open-source projects built from 210K real agent traces:
39
+
40
+ | Project | Description |
41
+ | --- | --- |
42
+ | **[Anvil](../anvil)** | Self-verified coding agent |
43
+ | **[VerifyLoop](../verifyloop)** | Plan→Execute→Verify→Recover framework |
44
+ | **[ErrorRecovery](../error-recovery)** | Self-healing middleware (3,725 error patterns) |
45
+ | **[FableForge-14B](../fableforge-14b)** | The fine-tuned 14B model (4-stage training) |
46
+ | **[ShellWhisperer](../shell-whisperer)** | 1.5B edge agent (phone/RPi, 50ms) |
47
+ | **[ReasonCritic](../reason-critic)** | Verification model (130 benchmark tasks) |
48
+ | **[TraceCompiler](../trace-compiler)** | Compile traces → LoRA skills |
49
+ | **[AgentRuntime](../agent-runtime)** | Persistent agent daemon (systemd for AI) |
50
+ | **[AgentSwarm](../agent-swarm)** | Multi-agent from real trace transitions |
51
+ | **[AgentTelemetry](../agent-telemetry)** | Datadog for agents (token tracking, costs) |
52
+ | **[BenchAgent](../bench-agent)** | HumanEval for tool-use (107 tasks) |
53
+ | **[AgentDev](../agent-dev)** | VSCode extension with verification |
54
+ | **[TraceViz](../trace-viz)** | Trace replay visualizer (Next.js) |
55
+ | **[AgentSkills](../agent-skills)** | npm for agent behaviors |
56
+ | **[AgentCurriculum](../agent-curriculum)** | 5-stage progressive training |
57
+ | **[AgentFuzzer](../agent-fuzzer)** | Adversarial testing for agents |
58
+ | **[AgentConstitution](../agent-constitution)** | Safety guardrails from traces |
59
+ | **[CostOptimizer](../cost-optimizer)** | Token cost reduction (50-80%) |
60
+ | **[AgentProfiler](../agent-profiler)** | Behavioral fingerprinting |
61
+ | **[TrajectoryDistiller](../trajectory-distiller)** | Trace→training data pipeline |
62
+ | **[Fable5-Dataset](../fable5-dataset)** | HuggingFace dataset release |
File without changes
@@ -0,0 +1,17 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "agent-fuzzer"
7
+ version = "0.1.0"
8
+ description = "Adversarial scenario generation and testing for coding agents"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ dependencies = ["pydantic>=2.0", "pyyaml>=6.0", "click>=8.0", "rich>=13.0", "jinja2>=3.1"]
12
+
13
+ [project.scripts]
14
+ agentfuzzer = "agent_fuzzer.cli:cli"
15
+
16
+ [tool.hatch.build.targets.wheel]
17
+ packages = ["src/agent_fuzzer"]
@@ -0,0 +1,3 @@
1
+ """AgentFuzzer — Adversarial scenario generation and testing for coding agents."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,90 @@
1
+ """AgentFuzzer CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import click
6
+ from rich.console import Console
7
+ from rich.table import Table
8
+
9
+ from agent_fuzzer.generator import ScenarioGenerator
10
+ from agent_fuzzer.runner import FuzzRunner
11
+ from agent_fuzzer.reporter import FuzzReporter
12
+
13
+ console = Console()
14
+
15
+
16
# Root command group for the ``agentfuzzer`` console script; the subcommands
# below register themselves on it via ``@cli.command()``.
# NOTE(review): the version literal duplicates ``agent_fuzzer.__version__``
# and the version in pyproject.toml -- keep the three in sync.
@click.group()
@click.version_option("0.1.0")
def cli() -> None:
    """AgentFuzzer — Adversarial scenario testing for coding agents."""
20
+
21
+
22
@cli.command()
@click.option("--model", default="gpt-4", help="Model to test")
@click.option(
    "--category",
    "-c",
    type=click.Choice(["broken_code", "failing_tests", "missing_deps", "network_errors"]),
    help="Scenario category",
)
@click.option("--count", "-n", default=10, help="Number of scenarios to generate")
@click.option(
    "--difficulty",
    "-d",
    type=click.Choice(["easy", "medium", "hard"]),
    help="Difficulty filter",
)
@click.option(
    "--output",
    "-o",
    # The value is handed to FuzzReporter.save_report(), which writes a single
    # JSON file, so a directory is rejected up front instead of failing later
    # inside open().
    type=click.Path(dir_okay=False),
    help="Save the JSON report to this file",
)
def fuzz(model: str, category: str | None, count: int, difficulty: str | None, output: str | None) -> None:
    """Run fuzzing scenarios against an agent."""
    # The docstring above is the user-visible --help text, so details live here:
    #   1. generate `count` scenarios (optionally pinned to one category and/or
    #      one difficulty),
    #   2. run them through FuzzRunner for the chosen model,
    #   3. print the top-level metrics as a table,
    #   4. if --output was given, write the full report (including the nested
    #      per-category and per-difficulty sections) as JSON.
    scenarios = ScenarioGenerator().generate(category=category, count=count, difficulty=difficulty)

    console.print(f"\n[bold]Running {len(scenarios)} scenarios against {model}[/bold]\n")

    results = FuzzRunner(model=model).run_suite(scenarios)
    reporter = FuzzReporter(results)
    summary = reporter.generate_report()

    table = Table(title="Fuzzing Results")
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="green")
    for key, value in summary.items():
        # Nested breakdowns do not fit a two-column table; they are only
        # available in the saved report.
        if key in ("by_category", "by_difficulty"):
            continue
        table.add_row(key, str(value))
    console.print(table)

    if output:
        reporter.save_report(output)
        console.print(f"\n[green]Report saved to {output}[/green]")
52
+
53
+
54
@cli.command()
@click.option(
    "--report",
    "-r",
    type=click.Path(exists=True, dir_okay=False),
    help="Report JSON file",
)
def report(report: str | None) -> None:
    """Display a saved fuzzing report."""
    # The docstring above is the user-visible --help text, so details live here.
    # Scalar top-level entries are shown in one table; the nested
    # "by_category" and "by_difficulty" sections each get their own table.
    import json
    from pathlib import Path

    if not report:
        console.print("[yellow]No report file specified. Run 'agentfuzzer fuzz' first.[/yellow]")
        return

    try:
        with Path(report).open(encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as exc:
        # Surface a one-line CLI error instead of a traceback.
        raise click.ClickException(f"{report} is not valid JSON: {exc}") from exc
    if not isinstance(data, dict):
        raise click.ClickException(f"{report} does not contain a JSON object")

    table = Table(title="Fuzzing Report")
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="green")
    for key, value in data.items():
        if isinstance(value, dict):
            continue
        table.add_row(key, str(value))
    console.print(table)

    if "by_category" in data:
        _print_breakdown("Results by Category", "Category", data["by_category"])
    if "by_difficulty" in data:
        _print_breakdown("Results by Difficulty", "Difficulty", data["by_difficulty"])


def _print_breakdown(title: str, label: str, groups: object) -> None:
    """Print one total/passed/pass-rate table for a nested report section."""
    if not isinstance(groups, dict):
        return
    breakdown = Table(title=title)
    breakdown.add_column(label, style="cyan")
    breakdown.add_column("Total")
    breakdown.add_column("Passed")
    breakdown.add_column("Pass Rate", style="green")
    for group_name, stats in groups.items():
        if not isinstance(stats, dict):
            continue
        # .get() keeps a hand-edited or truncated report displayable instead
        # of aborting on the first missing key.
        breakdown.add_row(
            str(group_name),
            str(stats.get("total", "")),
            str(stats.get("passed", "")),
            str(stats.get("pass_rate", "")),
        )
    console.print(breakdown)
87
+
88
+
89
+ if __name__ == "__main__":
90
+ cli()
@@ -0,0 +1,233 @@
1
+ """Generate adversarial scenarios: broken_code, failing_tests, missing_deps, network_errors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import random
7
+ import string
8
+ from dataclasses import dataclass, field
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import yaml
13
+
14
+ BROKEN_CODE_TEMPLATES: list[dict[str, str]] = [
15
+ {"name": "off_by_one_loop", "description": "Loop iterates one time too many or too few", "language": "python",
16
+ "code": "def find_index(items, target):\n for i in range(len(items)):\n if items[i] == target:\n return i\n return -1\n\n# Bug: range(len(items)) should handle edge case where target is last item",
17
+ "expected_fix": "Handle edge cases in loop boundaries"},
18
+ {"name": "null_deref", "description": "Null/None reference without checking", "language": "python",
19
+ "code": "def get_name(user):\n return user.name.upper()\n\n# Bug: user could be None",
20
+ "expected_fix": "Add null check before accessing attributes"},
21
+ {"name": "type_confusion", "description": "Function receives wrong type", "language": "python",
22
+ "code": "def calculate_total(prices):\n return sum(prices)\n\n# Bug: prices might contain strings",
23
+ "expected_fix": "Validate and convert input types"},
24
+ {"name": "infinite_recursion", "description": "Recursive function without proper base case", "language": "python",
25
+ "code": "def fibonacci(n):\n return fibonacci(n - 1) + fibonacci(n - 2)\n\n# Bug: No base case",
26
+ "expected_fix": "Add base case for n <= 1"},
27
+ {"name": "race_condition", "description": "Shared mutable state without locking", "language": "python",
28
+ "code": "class Counter:\n def __init__(self):\n self.count = 0\n \n def increment(self):\n self.count += 1\n\n# Bug: Not thread-safe",
29
+ "expected_fix": "Add thread-safe locking"},
30
+ {"name": "incorrect_comparison", "description": "Using = instead of == in comparison", "language": "python",
31
+ "code": "def check_admin(user):\n if user.role = 'admin':\n return True\n return False\n\n# Bug: Assignment instead of comparison",
32
+ "expected_fix": "Use == for comparison, not ="},
33
+ {"name": "missing_import", "description": "Code uses module without importing it", "language": "python",
34
+ "code": "def get_current_time():\n return datetime.now()\n\n# Bug: datetime module not imported",
35
+ "expected_fix": "Add 'from datetime import datetime'"},
36
+ {"name": "swallowed_exception", "description": "Bare except that hides errors", "language": "python",
37
+ "code": "def read_config(path):\n try:\n with open(path) as f:\n return json.load(f)\n except:\n return {}\n\n# Bug: Catches ALL exceptions silently",
38
+ "expected_fix": "Catch specific exceptions and log errors"},
39
+ ]
40
+
41
+ FAILING_TEST_TEMPLATES: list[dict[str, str]] = [
42
+ {"name": "flaky_test", "description": "Test that fails intermittently due to timing or ordering", "language": "python",
43
+ "code": "def test_api_response():\n response = api.get('/users')\n assert response.status_code == 200\n assert len(response.json()) > 0\n\n# Bug: Fails when API is slow or returns empty DB",
44
+ "expected_fix": "Add retries, mock API, or make assertions more robust"},
45
+ {"name": "hardcoded_path", "description": "Test uses hardcoded file paths", "language": "python",
46
+ "code": "def test_read_file():\n content = read_file('/Users/dev/data.txt')\n assert content is not None\n\n# Bug: Path doesn't exist on other machines",
47
+ "expected_fix": "Use tmp_path fixture or relative paths"},
48
+ {"name": "missing_mock", "description": "Test calls real external service", "language": "python",
49
+ "code": "def test_send_email():\n result = send_email('test@example.com', 'Subject', 'Body')\n assert result.success\n\n# Bug: Sends real emails in tests",
50
+ "expected_fix": "Mock the email sending function"},
51
+ ]
52
+
53
+ MISSING_DEPS_TEMPLATES: list[dict[str, str]] = [
54
+ {"name": "missing_package", "description": "Import fails because package isn't installed", "language": "python",
55
+ "code": "import pandas as pd\nimport numpy as np\n\ndef process_data(data):\n return pd.DataFrame(data).describe()\n\n# Bug: pandas and numpy not in requirements.txt",
56
+ "expected_fix": "Add pandas and numpy to requirements.txt"},
57
+ {"name": "version_conflict", "description": "Two packages require different versions", "language": "python",
58
+ "code": "# requirements.txt:\n# fastapi==0.100.0\n# pydantic==1.10.0\n# Bug: fastapi 0.100 requires pydantic v2",
59
+ "expected_fix": "Upgrade pydantic to v2 or downgrade fastapi"},
60
+ ]
61
+
62
+ NETWORK_ERROR_TEMPLATES: list[dict[str, str]] = [
63
+ {"name": "connection_timeout", "description": "API call times out without retry", "language": "python",
64
+ "code": "import requests\n\ndef fetch_data(url):\n return requests.get(url).json()\n\n# Bug: No timeout, no retry, no error handling",
65
+ "expected_fix": "Add timeout, retry logic, and proper error handling"},
66
+ {"name": "dns_failure", "description": "DNS resolution fails silently", "language": "python",
67
+ "code": "def get_service_url(service_name):\n return f'http://{service_name}:8080'\n\n# Bug: No DNS resolution check",
68
+ "expected_fix": "Add DNS resolution check and fallback URLs"},
69
+ {"name": "ssl_error", "description": "SSL certificate verification fails", "language": "python",
70
+ "code": "import requests\n\ndef fetch_secure(url):\n return requests.get(url, verify=False).json()\n\n# Bug: Disabling SSL verification is insecure",
71
+ "expected_fix": "Fix SSL certificates instead of disabling verification"},
72
+ ]
73
+
74
+
75
@dataclass
class Scenario:
    """One adversarial test case: a code snippet plus the fix expected for it."""

    name: str
    category: str
    description: str
    language: str
    code: str
    expected_fix: str
    difficulty: str = "medium"
    tags: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return every field as a plain dict, keyed in declaration order."""
        keys = (
            "name",
            "category",
            "description",
            "language",
            "code",
            "expected_fix",
            "difficulty",
            "tags",
        )
        return {key: getattr(self, key) for key in keys}

    def to_yaml(self) -> str:
        """Serialise ``to_dict()`` as block-style YAML text."""
        payload = self.to_dict()
        return yaml.dump(payload, default_flow_style=False)
102
+
103
+
104
class ScenarioGenerator:
    """Generate adversarial scenarios for testing coding agents.

    Creates scenarios in four categories:
    - broken_code: Code with bugs the agent must fix
    - failing_tests: Tests that fail and need fixing
    - missing_deps: Missing or conflicting dependencies
    - network_errors: Network-related failures

    All randomness goes through ``self.rng``, so two generators built with the
    same ``seed`` and called the same way produce the same scenarios.
    """

    def __init__(self, seed: int | None = None):
        """Create a generator.

        Args:
            seed: Seed for the private random number generator; ``None`` lets
                ``random.Random`` pick its own seed.
        """
        self.rng = random.Random(seed)
        self.categories = {
            "broken_code": BROKEN_CODE_TEMPLATES,
            "failing_tests": FAILING_TEST_TEMPLATES,
            "missing_deps": MISSING_DEPS_TEMPLATES,
            "network_errors": NETWORK_ERROR_TEMPLATES,
        }

    def generate(self, category: str | None = None, count: int = 10, difficulty: str | None = None) -> list[Scenario]:
        """Generate adversarial scenarios.

        Templates are drawn with replacement, so several returned scenarios may
        share the same ``name``.

        Args:
            category: Optional category filter.
            count: Number of scenarios to generate.
            difficulty: Optional difficulty filter (easy, medium, hard).

        Returns:
            List of Scenario objects.

        Raises:
            KeyError: If ``category`` is given but is not a known category.
        """
        if category and category not in self.categories:
            known = ", ".join(sorted(self.categories))
            raise KeyError(f"Unknown category {category!r}; expected one of: {known}")

        difficulties = ["easy", "medium", "hard"]
        templates = {category: self.categories[category]} if category else self.categories
        category_names = list(templates)

        scenarios: list[Scenario] = []
        for _ in range(count):
            # Draw order (category, template, difficulty, variation) is kept
            # fixed so seeded output stays stable.
            cat = category if category else self.rng.choice(category_names)
            template = self.rng.choice(templates[cat])
            diff = difficulty or self.rng.choice(difficulties)
            language = template.get("language", "python")

            scenarios.append(
                Scenario(
                    name=template["name"],
                    category=cat,
                    description=template["description"],
                    language=language,
                    code=self._add_variation(template["code"], diff),
                    expected_fix=template["expected_fix"],
                    difficulty=diff,
                    tags=[cat, diff, language],
                )
            )

        return scenarios

    def _add_variation(self, code: str, difficulty: str) -> str:
        """Add difficulty-based variation to scenario code.

        ``easy`` returns the code unchanged. ``medium`` inserts one misleading
        comment at an interior line (only when the code has more than two
        lines). Any other value is treated as ``hard`` and inserts two noise
        lines at random positions.
        """
        if difficulty == "easy":
            return code

        lines = code.split("\n")
        if difficulty == "medium":
            if len(lines) > 2:
                insert_pos = self.rng.randint(1, len(lines) - 2)
                lines.insert(insert_pos, "    # This line looks suspicious but is correct")
            return "\n".join(lines)

        # Red herrings. They are inserted unindented at arbitrary positions, so
        # the resulting snippet is not guaranteed to remain valid Python.
        noise_lines = [
            "import logging",
            "logging.basicConfig(level=logging.DEBUG)",
            "# TODO: refactor this later",
            "# NOTE: performance optimization needed",
        ]
        for _ in range(2):
            pos = self.rng.randint(0, len(lines))
            lines.insert(pos, self.rng.choice(noise_lines))
        return "\n".join(lines)

    def generate_all(self) -> list[Scenario]:
        """Generate the complete scenario suite.

        Every template is emitted once per difficulty (named
        ``<template>_<difficulty>``), followed by ten extra random scenarios
        from ``generate()``.

        Returns:
            List of all scenarios across all categories.
        """
        scenarios = [
            Scenario(
                name=f"{template['name']}_{diff}",
                category=cat,
                description=template["description"],
                language=template.get("language", "python"),
                code=self._add_variation(template["code"], diff),
                expected_fix=template["expected_fix"],
                difficulty=diff,
                tags=[cat, diff],
            )
            for cat, templates in self.categories.items()
            for template in templates
            for diff in ["easy", "medium", "hard"]
        ]
        # Additional random scenarios
        scenarios.extend(self.generate(count=10))
        return scenarios

    def save_scenarios(self, scenarios: list[Scenario], output_dir: str | Path) -> list[Path]:
        """Save scenarios as YAML files organized by category.

        Each scenario is written to ``<output_dir>/<category>/<name>.yaml``.
        When several scenarios in the same call map to the same file, later
        ones get a numeric suffix (``<name>_2.yaml``, ``<name>_3.yaml``, ...)
        so none of them is overwritten.

        Args:
            scenarios: List of Scenario objects.
            output_dir: Base directory for output.

        Returns:
            List of paths to saved files, one per scenario, in input order.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        paths: list[Path] = []
        used: set[Path] = set()

        for scenario in scenarios:
            cat_dir = output_dir / scenario.category
            cat_dir.mkdir(parents=True, exist_ok=True)

            path = cat_dir / f"{scenario.name}.yaml"
            suffix = 2
            while path in used:
                path = cat_dir / f"{scenario.name}_{suffix}.yaml"
                suffix += 1
            used.add(path)

            with open(path, "w", encoding="utf-8") as f:
                f.write(scenario.to_yaml())
            paths.append(path)

        return paths
@@ -0,0 +1,120 @@
1
+ """Generate fuzzing reports with success/failure rates."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass, field
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from agent_fuzzer.runner import FuzzResult
11
+
12
+
13
@dataclass
class CategorySummary:
    """Aggregated outcome figures for the results of one scenario category."""

    category: str
    total: int = 0
    passed: int = 0
    partial: int = 0
    failed: int = 0
    avg_score: float = 0.0
    avg_tokens: float = 0.0
    avg_duration: float = 0.0
    pass_rate: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Return the summary as a report-ready dict.

        Counts are copied as-is; ``pass_rate`` becomes a percentage string with
        one decimal, ``avg_duration`` a seconds string with one decimal, and the
        remaining averages are rounded.
        """
        pass_rate_text = f"{self.pass_rate:.1%}"
        duration_text = f"{self.avg_duration:.1f}s"

        summary: dict[str, Any] = {}
        summary["category"] = self.category
        summary["total"] = self.total
        summary["passed"] = self.passed
        summary["partial"] = self.partial
        summary["failed"] = self.failed
        summary["pass_rate"] = pass_rate_text
        summary["avg_score"] = round(self.avg_score, 3)
        summary["avg_tokens"] = round(self.avg_tokens, 1)
        summary["avg_duration"] = duration_text
        return summary
39
+
40
+
41
class FuzzReporter:
    """Generate reports from fuzzing results."""

    # Difficulty levels listed first (in this order) in the breakdown.
    _KNOWN_DIFFICULTIES = ("easy", "medium", "hard")

    def __init__(self, results: list[FuzzResult] | None = None):
        """Store the results to report on; ``None`` means no results."""
        self.results = results or []

    @staticmethod
    def _outcome_counts(results: list[FuzzResult]) -> tuple[int, int, int]:
        """Return ``(passed, partial, failed)`` counts for ``results``.

        A result flagged both ``passed`` and ``partial`` counts as passed only.
        """
        passed = sum(1 for r in results if r.passed)
        partial = sum(1 for r in results if r.partial and not r.passed)
        return passed, partial, len(results) - passed - partial

    def generate_report(self) -> dict[str, Any]:
        """Generate a comprehensive fuzzing report.

        Returns:
            Dictionary with overall and per-category metrics. With no results
            the dictionary is just ``{"status": "no results", "total": 0}``.
        """
        if not self.results:
            return {"status": "no results", "total": 0}

        total = len(self.results)
        passed, partial, failed = self._outcome_counts(self.results)
        avg_score = sum(r.score for r in self.results) / total
        avg_tokens = sum(r.tokens_used for r in self.results) / total
        avg_duration = sum(r.duration_seconds for r in self.results) / total

        # Group once per dimension; every group holds at least one result, so
        # the divisions below cannot hit zero.
        by_category: dict[str, list[FuzzResult]] = {}
        by_difficulty: dict[str, list[FuzzResult]] = {}
        for r in self.results:
            by_category.setdefault(r.category, []).append(r)
            by_difficulty.setdefault(r.difficulty, []).append(r)

        category_summaries = {}
        for cat, cat_results in by_category.items():
            cat_total = len(cat_results)
            cat_passed, cat_partial, cat_failed = self._outcome_counts(cat_results)
            category_summaries[cat] = CategorySummary(
                category=cat,
                total=cat_total,
                passed=cat_passed,
                partial=cat_partial,
                failed=cat_failed,
                avg_score=sum(r.score for r in cat_results) / cat_total,
                avg_tokens=sum(r.tokens_used for r in cat_results) / cat_total,
                avg_duration=sum(r.duration_seconds for r in cat_results) / cat_total,
                pass_rate=cat_passed / cat_total,
            ).to_dict()

        # Known levels first, then any other difficulty value present in the
        # results (first-seen order), so the breakdown totals add up to `total`.
        ordered_difficulties = [d for d in self._KNOWN_DIFFICULTIES if d in by_difficulty]
        ordered_difficulties += [d for d in by_difficulty if d not in self._KNOWN_DIFFICULTIES]

        difficulty_summary = {}
        for diff in ordered_difficulties:
            diff_results = by_difficulty[diff]
            diff_total = len(diff_results)
            diff_passed = sum(1 for r in diff_results if r.passed)
            difficulty_summary[diff] = {
                "total": diff_total,
                "passed": diff_passed,
                "pass_rate": f"{diff_passed/diff_total:.1%}",
            }

        return {
            "status": "complete",
            "total": total,
            "passed": passed,
            "partial": partial,
            "failed": failed,
            "pass_rate": f"{passed/total:.1%}",
            "avg_score": round(avg_score, 3),
            "avg_tokens": round(avg_tokens, 1),
            "avg_duration": f"{avg_duration:.1f}s",
            "by_category": category_summaries,
            "by_difficulty": difficulty_summary,
        }

    def save_report(self, path: str | Path, format: str = "json") -> None:
        """Save the report to a file.

        Args:
            path: Destination file; missing parent directories are created.
            format: Output format. Only ``"json"`` is implemented.

        Raises:
            ValueError: If ``format`` is anything other than ``"json"``.
        """
        # Reject unsupported formats before touching the filesystem rather
        # than silently writing JSON under another name.
        if format != "json":
            raise ValueError(f"Unsupported report format {format!r}; only 'json' is available")

        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        report = self.generate_report()
        with open(path, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
@@ -0,0 +1,145 @@
1
+ """Run agent against adversarial scenarios and collect metrics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import time
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from agent_fuzzer.generator import Scenario, ScenarioGenerator
12
+
13
+
14
@dataclass
class FuzzResult:
    """Outcome and metrics recorded for one executed scenario."""

    scenario_name: str
    category: str
    difficulty: str
    passed: bool = False
    partial: bool = False
    score: float = 0.0
    tokens_used: int = 0
    duration_seconds: float = 0.0
    error: str | None = None
    agent_output: str = ""
    expected_fix: str = ""
    details: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dict.

        ``agent_output`` and ``details`` are not part of the returned dict.
        """
        exported = (
            "scenario_name",
            "category",
            "difficulty",
            "passed",
            "partial",
            "score",
            "tokens_used",
            "duration_seconds",
            "error",
            "expected_fix",
        )
        return {key: getattr(self, key) for key in exported}
44
+
45
+
46
class FuzzRunner:
    """Run an agent against adversarial scenarios and collect metrics.

    The runner simulates an agent attempting to solve each scenario
    and tracks success rate, token usage, and timing.
    """

    def __init__(self, model: str = "gpt-4", max_retries: int = 3, timeout: int = 60):
        """Create a runner.

        Args:
            model: Name of the model under test.
            max_retries: Stored on the instance; not used by the simulation.
            timeout: Stored on the instance; not used by the simulation.
        """
        self.model = model
        self.max_retries = max_retries
        self.timeout = timeout
        self.results: list[FuzzResult] = []

    def run_scenario(self, scenario: Scenario) -> FuzzResult:
        """Run a single adversarial scenario.

        In production, this would call the actual agent API.
        Here we simulate results based on difficulty. The simulation is
        deterministic per scenario name: the same name always yields the same
        random draws, in every process.

        Args:
            scenario: The Scenario to run.

        Returns:
            FuzzResult with outcome metrics.
        """
        import random
        import zlib

        # A private generator seeded from a stable checksum of the name.
        # The built-in hash() of a str is salted per interpreter run, and
        # seeding the module-level generator would disturb every other user of
        # `random` in the process.
        rng = random.Random(zlib.crc32(scenario.name.encode("utf-8")))

        # Difficulty affects pass rate
        pass_rates = {"easy": 0.9, "medium": 0.65, "hard": 0.35}
        base_rate = pass_rates.get(scenario.difficulty, 0.5)

        passed = rng.random() < base_rate
        partial = not passed and rng.random() < 0.4
        score = 1.0 if passed else (0.5 if partial else rng.uniform(0.0, 0.3))

        return FuzzResult(
            scenario_name=scenario.name,
            category=scenario.category,
            difficulty=scenario.difficulty,
            passed=passed,
            partial=partial,
            score=score,
            tokens_used=rng.randint(200, 3000),
            duration_seconds=rng.uniform(2.0, 30.0),
            expected_fix=scenario.expected_fix,
        )

    def run_suite(self, scenarios: list[Scenario] | None = None, categories: list[str] | None = None) -> list[FuzzResult]:
        """Run a suite of adversarial scenarios.

        Replaces ``self.results`` with the results of this run.

        Args:
            scenarios: Optional list of scenarios. If None, generates all.
            categories: Optional category filter.

        Returns:
            List of FuzzResult objects.
        """
        if scenarios is None:
            scenarios = ScenarioGenerator().generate_all()

        if categories:
            scenarios = [s for s in scenarios if s.category in categories]

        self.results = []
        for scenario in scenarios:
            self.results.append(self.run_scenario(scenario))

        return self.results

    def run_from_directory(self, scenarios_dir: str | Path) -> list[FuzzResult]:
        """Load scenarios from YAML files and run them.

        Files are searched recursively and processed in sorted path order so
        repeated runs see the scenarios in the same sequence.

        Args:
            scenarios_dir: Directory containing scenario YAML files.

        Returns:
            List of FuzzResult objects.

        Raises:
            ValueError: If a YAML file does not contain a mapping.
            KeyError: If a mapping lacks a required scenario field.
        """
        import yaml

        scenarios_dir = Path(scenarios_dir)
        scenarios: list[Scenario] = []

        for yaml_file in sorted(scenarios_dir.rglob("*.yaml")):
            with open(yaml_file, encoding="utf-8") as f:
                data = yaml.safe_load(f)
            # An empty file loads as None and a list/scalar document is not a
            # scenario either; name the offending file instead of failing on
            # the first subscript.
            if not isinstance(data, dict):
                raise ValueError(f"{yaml_file} does not contain a scenario mapping")
            scenarios.append(Scenario(
                name=data["name"],
                category=data["category"],
                description=data["description"],
                language=data.get("language", "python"),
                code=data["code"],
                expected_fix=data["expected_fix"],
                difficulty=data.get("difficulty", "medium"),
                tags=data.get("tags") or [],
            ))

        return self.run_suite(scenarios)