verdict-eval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. verdict_eval-0.1.0/LICENSE +21 -0
  2. verdict_eval-0.1.0/PKG-INFO +175 -0
  3. verdict_eval-0.1.0/README.md +143 -0
  4. verdict_eval-0.1.0/pyproject.toml +58 -0
  5. verdict_eval-0.1.0/verdict/__init__.py +3 -0
  6. verdict_eval-0.1.0/verdict/adapters/__init__.py +5 -0
  7. verdict_eval-0.1.0/verdict/adapters/base.py +316 -0
  8. verdict_eval-0.1.0/verdict/adapters/simple_rag.py +205 -0
  9. verdict_eval-0.1.0/verdict/agents/__init__.py +1 -0
  10. verdict_eval-0.1.0/verdict/agents/adaptive_generator.py +241 -0
  11. verdict_eval-0.1.0/verdict/agents/executor.py +99 -0
  12. verdict_eval-0.1.0/verdict/agents/judge.py +383 -0
  13. verdict_eval-0.1.0/verdict/agents/prompts/__init__.py +1 -0
  14. verdict_eval-0.1.0/verdict/agents/prompts/judge_prompt.py +85 -0
  15. verdict_eval-0.1.0/verdict/agents/prompts/reporter_prompt.py +88 -0
  16. verdict_eval-0.1.0/verdict/agents/prompts/test_generator_prompt.py +119 -0
  17. verdict_eval-0.1.0/verdict/agents/reporter.py +354 -0
  18. verdict_eval-0.1.0/verdict/agents/test_generator.py +221 -0
  19. verdict_eval-0.1.0/verdict/analysis/__init__.py +4 -0
  20. verdict_eval-0.1.0/verdict/analysis/flakiness.py +171 -0
  21. verdict_eval-0.1.0/verdict/caching/__init__.py +12 -0
  22. verdict_eval-0.1.0/verdict/caching/backends.py +154 -0
  23. verdict_eval-0.1.0/verdict/caching/cache.py +62 -0
  24. verdict_eval-0.1.0/verdict/cli/__init__.py +1 -0
  25. verdict_eval-0.1.0/verdict/cli/guardrails.py +94 -0
  26. verdict_eval-0.1.0/verdict/cli.py +356 -0
  27. verdict_eval-0.1.0/verdict/config/__init__.py +5 -0
  28. verdict_eval-0.1.0/verdict/config/settings.py +119 -0
  29. verdict_eval-0.1.0/verdict/costs/__init__.py +5 -0
  30. verdict_eval-0.1.0/verdict/costs/calculator.py +142 -0
  31. verdict_eval-0.1.0/verdict/costs/pricing.py +92 -0
  32. verdict_eval-0.1.0/verdict/crews/__init__.py +1 -0
  33. verdict_eval-0.1.0/verdict/evals/__init__.py +1 -0
  34. verdict_eval-0.1.0/verdict/evals/attack_patterns/patterns.json +255 -0
  35. verdict_eval-0.1.0/verdict/evals/categories.py +262 -0
  36. verdict_eval-0.1.0/verdict/evals/rubrics.py +366 -0
  37. verdict_eval-0.1.0/verdict/models/__init__.py +17 -0
  38. verdict_eval-0.1.0/verdict/models/schemas.py +373 -0
  39. verdict_eval-0.1.0/verdict/observability/__init__.py +4 -0
  40. verdict_eval-0.1.0/verdict/observability/token_tracker.py +125 -0
  41. verdict_eval-0.1.0/verdict/orchestration/__init__.py +1 -0
  42. verdict_eval-0.1.0/verdict/orchestration/diff.py +261 -0
  43. verdict_eval-0.1.0/verdict/reports/__init__.py +1 -0
  44. verdict_eval-0.1.0/verdict/reports/builder.py +150 -0
  45. verdict_eval-0.1.0/verdict/stats/__init__.py +4 -0
  46. verdict_eval-0.1.0/verdict/stats/bootstrap.py +70 -0
  47. verdict_eval-0.1.0/verdict/tasks/__init__.py +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Dan Nicolau
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,175 @@
1
+ Metadata-Version: 2.4
2
+ Name: verdict-eval
3
+ Version: 0.1.0
4
+ Summary: Evaluation infrastructure for AI agents.
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Keywords: ai,evaluation,agents,llm,crewai,testing,compliance,red-teaming
8
+ Author: Dan Nicolau
9
+ Author-email: dan@dannicolau.com
10
+ Requires-Python: >=3.11,<3.14
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Software Development :: Quality Assurance
19
+ Classifier: Topic :: Software Development :: Testing
20
+ Requires-Dist: click (>=8.1.7,<8.2.0)
21
+ Requires-Dist: crewai (==1.14.5)
22
+ Requires-Dist: langchain-anthropic (==1.4.3)
23
+ Requires-Dist: pydantic (>=2.11.9,<2.13)
24
+ Requires-Dist: pydantic-settings (>=2.6.0,<3.0)
25
+ Requires-Dist: python-dotenv (>=1.2.2,<2.0.0)
26
+ Requires-Dist: rich (>=13.7.0,<15.0)
27
+ Project-URL: Documentation, https://github.com/dannicolau7/verdict#readme
28
+ Project-URL: Homepage, https://github.com/dannicolau7/verdict
29
+ Project-URL: Repository, https://github.com/dannicolau7/verdict
30
+ Description-Content-Type: text/markdown
31
+
32
+ # Verdict
33
+
34
+ > Evaluation infrastructure for AI agents.
35
+
36
+ [![CI](https://github.com/dannicolau7/verdict/actions/workflows/test.yml/badge.svg)](https://github.com/dannicolau7/verdict/actions/workflows/test.yml)
37
+ ![Python Version](https://img.shields.io/badge/python-3.11%2B-blue)
38
+ ![License](https://img.shields.io/badge/license-MIT-green)
39
+ ![PyPI](https://img.shields.io/pypi/v/verdict-eval)
40
+
41
+ ## Demo
42
+
43
+ <!-- Replace the note below with a link to the recording after running `asciinema rec docs/demo.cast --command ./scripts/demo.sh` -->
44
+ > Recording coming soon — in the meantime, clone the repository and run `./scripts/demo.sh` locally after `pip install verdict-eval`.
45
+
46
+ ## Install
47
+
48
+ ```bash
49
+ pip install verdict-eval
50
+ ```
51
+
52
+ ## Quickstart
53
+
54
+ ```bash
55
+ # Run an evaluation against a built-in adapter
56
+ verdict eval --target simple_rag --num-per-category 5
57
+
58
+ # Compare two adapter versions
59
+ verdict diff --target-a simple_rag --target-b path/to/v2.py:MyAdapter --num 10
60
+
61
+ # Analyze flakiness across historical runs
62
+ verdict flakiness --target my-system --reports-dir ./reports
63
+ ```
64
+
65
+ ## CLI reference
66
+
67
+ ### `verdict eval`
68
+
69
+ Run a full evaluation against a target adapter.
70
+
71
+ | Flag | Default | Description |
72
+ |------|---------|-------------|
73
+ | `--target` | required | Adapter spec: `simple_rag` or `path/to/file.py:ClassName` |
74
+ | `--num-per-category` | 5 | Prompts per test category |
75
+ | `--categories` | all | Specific categories (repeat for multiple) |
76
+ | `--output-dir` | `./reports` | Report output directory |
77
+ | `--run-id` | auto | Custom run identifier |
78
+ | `--model` | settings default | Override LLM model for all agents |
79
+ | `--bootstrap-iterations` | 1000 | Bootstrap CI iterations (0 to disable) |
80
+ | `--max-cost-usd` | — | Fail (exit 2) if total cost exceeds this amount |
81
+ | `--max-total-latency-seconds` | — | Fail (exit 2) if total latency exceeds this many seconds |
82
+ | `--fail-on-pass-rate-below` | — | Fail (exit 2) if pass rate < threshold |
83
+ | `--fail-on-ci-low-below` | — | Fail (exit 2) if CI lower bound < threshold |
84
+ | `--cache-mode` | `off` | `off` / `record` / `replay` / `update` |
85
+ | `--cache-dir` | `.verdict_cache` | Directory for cached responses |
86
+ | `--adaptive` | off | Run adaptive follow-up probes based on initial responses |
87
+
88
+ ### `verdict diff`
89
+
90
+ Compare two adapter versions against the same generated test suite.
91
+
92
+ ```bash
93
+ verdict diff \
94
+ --target-a simple_rag \
95
+ --target-b path/to/v2.py:V2Adapter \
96
+ --num 10
97
+ ```
98
+
99
+ ### `verdict flakiness`
100
+
101
+ Analyze judge and target consistency across historical evaluation runs.
102
+
103
+ ```bash
104
+ verdict flakiness --target my-system --min-runs 5 --reports-dir ./reports
105
+ ```
106
+
107
+ ## Adaptive mode
108
+
109
+ When `--adaptive` is enabled, Verdict runs a second pass of follow-up probes selected
110
+ based on each initial response. Pattern selection is **entirely rule-based** — no LLM
111
+ is used to generate new attacks. All probes are composed from the curated
112
+ [OWASP LLM Top 10](https://owasp.org/www-project-top-10-for-large-language-model-applications/)
113
+ pattern library in `verdict/evals/attack_patterns/patterns.json`.
114
+
115
+ This design ensures Verdict remains a defensive evaluation tool. See
116
+ [CONTRIBUTING.md](CONTRIBUTING.md) for the security boundary policy.
117
+
118
+ ```bash
119
+ verdict eval --target simple_rag --adaptive
120
+ ```
121
+
122
+ ## Writing a custom adapter
123
+
124
+ ```python
125
+ # my_adapter.py
126
+ from verdict.adapters.base import TargetAdapter
127
+ from verdict.models.schemas import ExecutionResult, TestPrompt
128
+
129
+ class MyAdapter(TargetAdapter):
130
+     name = "my-system"
131
+     version = "1.0.0"
132
+
133
+     async def execute(self, prompt: TestPrompt) -> ExecutionResult:
134
+         response = call_my_system(prompt.prompt)
135
+         return self.make_result(prompt, response=response)
136
+ ```
137
+
138
+ ```bash
139
+ verdict eval --target my_adapter.py:MyAdapter
140
+ ```
141
+
142
+ ## Test categories
143
+
144
+ | Category | What it evaluates |
145
+ |----------|-------------------|
146
+ | `correctness` | Factual accuracy and reasoning quality |
147
+ | `safety` | Refusal of harmful, dangerous, or unethical requests |
148
+ | `injection` | Robustness against prompt injection (OWASP LLM01, LLM07) |
149
+ | `edge_case` | Graceful handling of malformed and ambiguous inputs |
150
+ | `compliance` | Privacy and data handling (OWASP LLM02) |
151
+
152
+ ## Judge calibration
153
+
154
+ The Judge is validated against 22 hand-labeled examples covering all five test
155
+ categories. Results are produced by running the live judge against known ground
156
+ truth — no labels were derived from judge output.
157
+
158
+ | Metric | Target | Baseline |
159
+ |--------|--------|----------|
160
+ | Pass/fail agreement (non-borderline) | ≥ 80% | TBD |
161
+ | Critical failure detection | 5 / 5 | TBD |
162
+ | Score accuracy (±1) | ≥ 70% | TBD |
163
+
164
+ Run calibration locally (requires `ANTHROPIC_API_KEY`):
165
+
166
+ ```bash
167
+ pytest tests/qa/test_judge_calibration.py -v -m llm
168
+ ```
169
+
170
+ See [docs/judge_calibration.md](docs/judge_calibration.md) for full methodology.
171
+
172
+ ## License
173
+
174
+ MIT — see [LICENSE](LICENSE).
175
+
@@ -0,0 +1,143 @@
1
+ # Verdict
2
+
3
+ > Evaluation infrastructure for AI agents.
4
+
5
+ [![CI](https://github.com/dannicolau7/verdict/actions/workflows/test.yml/badge.svg)](https://github.com/dannicolau7/verdict/actions/workflows/test.yml)
6
+ ![Python Version](https://img.shields.io/badge/python-3.11%2B-blue)
7
+ ![License](https://img.shields.io/badge/license-MIT-green)
8
+ ![PyPI](https://img.shields.io/pypi/v/verdict-eval)
9
+
10
+ ## Demo
11
+
12
+ <!-- Replace the note below with a link to the recording after running `asciinema rec docs/demo.cast --command ./scripts/demo.sh` -->
13
+ > Recording coming soon — in the meantime, clone the repository and run `./scripts/demo.sh` locally after `pip install verdict-eval`.
14
+
15
+ ## Install
16
+
17
+ ```bash
18
+ pip install verdict-eval
19
+ ```
20
+
21
+ ## Quickstart
22
+
23
+ ```bash
24
+ # Run an evaluation against a built-in adapter
25
+ verdict eval --target simple_rag --num-per-category 5
26
+
27
+ # Compare two adapter versions
28
+ verdict diff --target-a simple_rag --target-b path/to/v2.py:MyAdapter --num 10
29
+
30
+ # Analyze flakiness across historical runs
31
+ verdict flakiness --target my-system --reports-dir ./reports
32
+ ```
33
+
34
+ ## CLI reference
35
+
36
+ ### `verdict eval`
37
+
38
+ Run a full evaluation against a target adapter.
39
+
40
+ | Flag | Default | Description |
41
+ |------|---------|-------------|
42
+ | `--target` | required | Adapter spec: `simple_rag` or `path/to/file.py:ClassName` |
43
+ | `--num-per-category` | 5 | Prompts per test category |
44
+ | `--categories` | all | Specific categories (repeat for multiple) |
45
+ | `--output-dir` | `./reports` | Report output directory |
46
+ | `--run-id` | auto | Custom run identifier |
47
+ | `--model` | settings default | Override LLM model for all agents |
48
+ | `--bootstrap-iterations` | 1000 | Bootstrap CI iterations (0 to disable) |
49
+ | `--max-cost-usd` | — | Fail (exit 2) if total cost exceeds this amount |
50
+ | `--max-total-latency-seconds` | — | Fail (exit 2) if total latency exceeds this many seconds |
51
+ | `--fail-on-pass-rate-below` | — | Fail (exit 2) if pass rate < threshold |
52
+ | `--fail-on-ci-low-below` | — | Fail (exit 2) if CI lower bound < threshold |
53
+ | `--cache-mode` | `off` | `off` / `record` / `replay` / `update` |
54
+ | `--cache-dir` | `.verdict_cache` | Directory for cached responses |
55
+ | `--adaptive` | off | Run adaptive follow-up probes based on initial responses |
56
+
57
+ ### `verdict diff`
58
+
59
+ Compare two adapter versions against the same generated test suite.
60
+
61
+ ```bash
62
+ verdict diff \
63
+ --target-a simple_rag \
64
+ --target-b path/to/v2.py:V2Adapter \
65
+ --num 10
66
+ ```
67
+
68
+ ### `verdict flakiness`
69
+
70
+ Analyze judge and target consistency across historical evaluation runs.
71
+
72
+ ```bash
73
+ verdict flakiness --target my-system --min-runs 5 --reports-dir ./reports
74
+ ```
75
+
76
+ ## Adaptive mode
77
+
78
+ When `--adaptive` is enabled, Verdict runs a second pass of follow-up probes selected
79
+ based on each initial response. Pattern selection is **entirely rule-based** — no LLM
80
+ is used to generate new attacks. All probes are composed from the curated
81
+ [OWASP LLM Top 10](https://owasp.org/www-project-top-10-for-large-language-model-applications/)
82
+ pattern library in `verdict/evals/attack_patterns/patterns.json`.
83
+
84
+ This design ensures Verdict remains a defensive evaluation tool. See
85
+ [CONTRIBUTING.md](CONTRIBUTING.md) for the security boundary policy.
86
+
87
+ ```bash
88
+ verdict eval --target simple_rag --adaptive
89
+ ```
90
+
91
+ ## Writing a custom adapter
92
+
93
+ ```python
94
+ # my_adapter.py
95
+ from verdict.adapters.base import TargetAdapter
96
+ from verdict.models.schemas import ExecutionResult, TestPrompt
97
+
98
+ class MyAdapter(TargetAdapter):
99
+     name = "my-system"
100
+     version = "1.0.0"
101
+
102
+     async def execute(self, prompt: TestPrompt) -> ExecutionResult:
103
+         response = call_my_system(prompt.prompt)
104
+         return self.make_result(prompt, response=response)
105
+ ```
106
+
107
+ ```bash
108
+ verdict eval --target my_adapter.py:MyAdapter
109
+ ```
110
+
111
+ ## Test categories
112
+
113
+ | Category | What it evaluates |
114
+ |----------|-------------------|
115
+ | `correctness` | Factual accuracy and reasoning quality |
116
+ | `safety` | Refusal of harmful, dangerous, or unethical requests |
117
+ | `injection` | Robustness against prompt injection (OWASP LLM01, LLM07) |
118
+ | `edge_case` | Graceful handling of malformed and ambiguous inputs |
119
+ | `compliance` | Privacy and data handling (OWASP LLM02) |
120
+
121
+ ## Judge calibration
122
+
123
+ The Judge is validated against 22 hand-labeled examples covering all five test
124
+ categories. Results are produced by running the live judge against known ground
125
+ truth — no labels were derived from judge output.
126
+
127
+ | Metric | Target | Baseline |
128
+ |--------|--------|----------|
129
+ | Pass/fail agreement (non-borderline) | ≥ 80% | TBD |
130
+ | Critical failure detection | 5 / 5 | TBD |
131
+ | Score accuracy (±1) | ≥ 70% | TBD |
132
+
133
+ Run calibration locally (requires `ANTHROPIC_API_KEY`):
134
+
135
+ ```bash
136
+ pytest tests/qa/test_judge_calibration.py -v -m llm
137
+ ```
138
+
139
+ See [docs/judge_calibration.md](docs/judge_calibration.md) for full methodology.
140
+
141
+ ## License
142
+
143
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,58 @@
1
+ [tool.poetry]
2
+ name = "verdict-eval"
3
+ version = "0.1.0"
4
+ description = "Evaluation infrastructure for AI agents."
5
+ authors = ["Dan Nicolau <dan@dannicolau.com>"]
6
+ license = "MIT"
7
+ readme = "README.md"
8
+ homepage = "https://github.com/dannicolau7/verdict"
9
+ repository = "https://github.com/dannicolau7/verdict"
10
+ documentation = "https://github.com/dannicolau7/verdict#readme"
11
+ keywords = ["ai", "evaluation", "agents", "llm", "crewai", "testing", "compliance", "red-teaming"]
12
+ classifiers = [
13
+ "Development Status :: 4 - Beta",
14
+ "Intended Audience :: Developers",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Programming Language :: Python :: 3.12",
19
+ "Topic :: Software Development :: Testing",
20
+ "Topic :: Software Development :: Quality Assurance",
21
+ ]
22
+ packages = [{include = "verdict"}]
23
+
24
+ [tool.poetry.dependencies]
25
+ python = ">=3.11,<3.14"
26
+ crewai = "1.14.5"
27
+ langchain-anthropic = "1.4.3"
28
+ pydantic = ">=2.11.9,<2.13"
29
+ pydantic-settings = ">=2.6.0,<3.0"
30
+ python-dotenv = "^1.2.2"
31
+ click = ">=8.1.7,<8.2.0"
32
+ rich = ">=13.7.0,<15.0"
33
+
34
+ [tool.poetry.group.dev.dependencies]
35
+ pytest = "^9.0.3"
36
+ pytest-asyncio = "^1.3.0"
37
+ pytest-cov = "^7.1.0"
38
+ mypy = "^2.1.0"
39
+ ruff = "^0.15.13"
40
+ black = "^26.5.1"
41
+
42
+ [build-system]
43
+ requires = ["poetry-core"]
44
+ build-backend = "poetry.core.masonry.api"
45
+
46
+ [tool.ruff]
47
+ line-length = 100
48
+ target-version = "py311"
49
+
50
+ [tool.ruff.lint]
51
+ select = ["E", "F", "I", "UP"]
52
+ ignore = ["E501"]
53
+
54
+ [tool.mypy]
55
+ python_version = "3.11"
56
+ strict = true
57
+ ignore_missing_imports = true
58
+ exclude = ["tests/", "examples/"]
@@ -0,0 +1,3 @@
1
+ """Verdict: evaluation infrastructure for AI agents."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,5 @@
1
+ """Verdict target-system adapters."""
2
+
3
+ from verdict.adapters.base import TargetAdapter
4
+
5
+ __all__ = ["TargetAdapter"]