PyPI - verdict-eval - Versions diffs - 0.1.0__tar.gz - Mend

verdict-eval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

verdict_eval-0.1.0/LICENSE +21 -0
verdict_eval-0.1.0/PKG-INFO +175 -0
verdict_eval-0.1.0/README.md +143 -0
verdict_eval-0.1.0/pyproject.toml +58 -0
verdict_eval-0.1.0/verdict/__init__.py +3 -0
verdict_eval-0.1.0/verdict/adapters/__init__.py +5 -0
verdict_eval-0.1.0/verdict/adapters/base.py +316 -0
verdict_eval-0.1.0/verdict/adapters/simple_rag.py +205 -0
verdict_eval-0.1.0/verdict/agents/__init__.py +1 -0
verdict_eval-0.1.0/verdict/agents/adaptive_generator.py +241 -0
verdict_eval-0.1.0/verdict/agents/executor.py +99 -0
verdict_eval-0.1.0/verdict/agents/judge.py +383 -0
verdict_eval-0.1.0/verdict/agents/prompts/__init__.py +1 -0
verdict_eval-0.1.0/verdict/agents/prompts/judge_prompt.py +85 -0
verdict_eval-0.1.0/verdict/agents/prompts/reporter_prompt.py +88 -0
verdict_eval-0.1.0/verdict/agents/prompts/test_generator_prompt.py +119 -0
verdict_eval-0.1.0/verdict/agents/reporter.py +354 -0
verdict_eval-0.1.0/verdict/agents/test_generator.py +221 -0
verdict_eval-0.1.0/verdict/analysis/__init__.py +4 -0
verdict_eval-0.1.0/verdict/analysis/flakiness.py +171 -0
verdict_eval-0.1.0/verdict/caching/__init__.py +12 -0
verdict_eval-0.1.0/verdict/caching/backends.py +154 -0
verdict_eval-0.1.0/verdict/caching/cache.py +62 -0
verdict_eval-0.1.0/verdict/cli/__init__.py +1 -0
verdict_eval-0.1.0/verdict/cli/guardrails.py +94 -0
verdict_eval-0.1.0/verdict/cli.py +356 -0
verdict_eval-0.1.0/verdict/config/__init__.py +5 -0
verdict_eval-0.1.0/verdict/config/settings.py +119 -0
verdict_eval-0.1.0/verdict/costs/__init__.py +5 -0
verdict_eval-0.1.0/verdict/costs/calculator.py +142 -0
verdict_eval-0.1.0/verdict/costs/pricing.py +92 -0
verdict_eval-0.1.0/verdict/crews/__init__.py +1 -0
verdict_eval-0.1.0/verdict/evals/__init__.py +1 -0
verdict_eval-0.1.0/verdict/evals/attack_patterns/patterns.json +255 -0
verdict_eval-0.1.0/verdict/evals/categories.py +262 -0
verdict_eval-0.1.0/verdict/evals/rubrics.py +366 -0
verdict_eval-0.1.0/verdict/models/__init__.py +17 -0
verdict_eval-0.1.0/verdict/models/schemas.py +373 -0
verdict_eval-0.1.0/verdict/observability/__init__.py +4 -0
verdict_eval-0.1.0/verdict/observability/token_tracker.py +125 -0
verdict_eval-0.1.0/verdict/orchestration/__init__.py +1 -0
verdict_eval-0.1.0/verdict/orchestration/diff.py +261 -0
verdict_eval-0.1.0/verdict/reports/__init__.py +1 -0
verdict_eval-0.1.0/verdict/reports/builder.py +150 -0
verdict_eval-0.1.0/verdict/stats/__init__.py +4 -0
verdict_eval-0.1.0/verdict/stats/bootstrap.py +70 -0
verdict_eval-0.1.0/verdict/tasks/__init__.py +1 -0

verdict_eval-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Dan Nicolau
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

verdict_eval-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,175 @@
+Metadata-Version: 2.4
+Name: verdict-eval
+Version: 0.1.0
+Summary: Evaluation infrastructure for AI agents.
+License: MIT
+License-File: LICENSE
+Keywords: ai,evaluation,agents,llm,crewai,testing,compliance,red-teaming
+Author: Dan Nicolau
+Author-email: dan@dannicolau.com
+Requires-Python: >=3.11,<3.14
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Software Development :: Quality Assurance
+Classifier: Topic :: Software Development :: Testing
+Requires-Dist: click (>=8.1.7,<8.2.0)
+Requires-Dist: crewai (==1.14.5)
+Requires-Dist: langchain-anthropic (==1.4.3)
+Requires-Dist: pydantic (>=2.11.9,<2.13)
+Requires-Dist: pydantic-settings (>=2.6.0,<3.0)
+Requires-Dist: python-dotenv (>=1.2.2,<2.0.0)
+Requires-Dist: rich (>=13.7.0,<15.0)
+Project-URL: Documentation, https://github.com/dannicolau7/verdict#readme
+Project-URL: Homepage, https://github.com/dannicolau7/verdict
+Project-URL: Repository, https://github.com/dannicolau7/verdict
+Description-Content-Type: text/markdown
+# Verdict
+> Evaluation infrastructure for AI agents.
+[![CI](https://github.com/dannicolau7/verdict/actions/workflows/test.yml/badge.svg)](https://github.com/dannicolau7/verdict/actions/workflows/test.yml)
+![Python Version](https://img.shields.io/badge/python-3.11%2B-blue)
+![License](https://img.shields.io/badge/license-MIT-green)
+![PyPI](https://img.shields.io/pypi/v/verdict-eval)
+## Demo
+<!-- Replace the link below after recording with `asciinema rec docs/demo.cast --command ./scripts/demo.sh` -->
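+> Recording coming soon — in the meantime, clone the repository, run `pip install verdict-eval`, and then run `./scripts/demo.sh` from the repository root (the script is not included in the PyPI package).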
+## Install
+```bash
+pip install verdict-eval
+```
+## Quickstart
+```bash
+# Run an evaluation against a built-in adapter
+verdict eval --target simple_rag --num-per-category 5
+# Compare two adapter versions
+verdict diff --target-a simple_rag --target-b path/to/v2.py:MyAdapter --num 10
+# Analyze flakiness across historical runs
+verdict flakiness --target my-system --reports-dir ./reports
+```
+## CLI reference
+### `verdict eval`
+Run a full evaluation against a target adapter.
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--target` | required | Adapter spec: `simple_rag` or `path/to/file.py:ClassName` |
+| `--num-per-category` | 5 | Prompts per test category |
+| `--categories` | all | Specific categories (repeat for multiple) |
+| `--output-dir` | `./reports` | Report output directory |
+| `--run-id` | auto | Custom run identifier |
+| `--model` | settings default | Override LLM model for all agents |
+| `--bootstrap-iterations` | 1000 | Bootstrap CI iterations (0 to disable) |
+| `--max-cost-usd` | — | Fail (exit 2) if total cost exceeds this amount |
+| `--max-total-latency-seconds` | — | Fail (exit 2) if total latency exceeds this many seconds |
+| `--fail-on-pass-rate-below` | — | Fail (exit 2) if pass rate < threshold |
+| `--fail-on-ci-low-below` | — | Fail (exit 2) if CI lower bound < threshold |
+| `--cache-mode` | `off` | `off` / `record` / `replay` / `update` |
+| `--cache-dir` | `.verdict_cache` | Directory for cached responses |
+| `--adaptive` | off | Run adaptive follow-up probes based on initial responses |
+### `verdict diff`
+Compare two adapter versions against the same generated test suite.
+```bash
+verdict diff \
+  --target-a simple_rag \
+  --target-b path/to/v2.py:V2Adapter \
+  --num 10
+```
+### `verdict flakiness`
+Analyze judge and target consistency across historical evaluation runs.
+```bash
+verdict flakiness --target my-system --min-runs 5 --reports-dir ./reports
+```
+## Adaptive mode
+When `--adaptive` is enabled, Verdict runs a second pass of follow-up probes selected
+based on each initial response. Pattern selection is **entirely rule-based** — no LLM
+is used to generate new attacks. All probes are composed from the curated
+[OWASP LLM Top 10](https://owasp.org/www-project-top-10-for-large-language-model-applications/)
+pattern library in `verdict/evals/attack_patterns/patterns.json`.
+This design ensures Verdict remains a defensive evaluation tool. See
+[CONTRIBUTING.md](CONTRIBUTING.md) for the security boundary policy.
+```bash
+verdict eval --target simple_rag --adaptive
+```
+## Writing a custom adapter
+```python
+# my_adapter.py
+from verdict.adapters.base import TargetAdapter
+from verdict.models.schemas import ExecutionResult, TestPrompt
+class MyAdapter(TargetAdapter):
+    name = "my-system"
+    version = "1.0.0"
+    async def execute(self, prompt: TestPrompt) -> ExecutionResult:
+        response = call_my_system(prompt.prompt)
+        return self.make_result(prompt, response=response)
+```
+```bash
+verdict eval --target my_adapter.py:MyAdapter
+```
+## Test categories
+| Category | What it evaluates |
+|----------|-------------------|
+| `correctness` | Factual accuracy and reasoning quality |
+| `safety` | Refusal of harmful, dangerous, or unethical requests |
+| `injection` | Robustness against prompt injection (OWASP LLM01, LLM07) |
+| `edge_case` | Graceful handling of malformed and ambiguous inputs |
+| `compliance` | Privacy and data handling (OWASP LLM02) |
+## Judge calibration
+The Judge is validated against 22 hand-labeled examples covering all five test
+categories. Results are produced by running the live judge against known ground
+truth — no labels were derived from judge output.
+| Metric | Target | Baseline |
+|--------|--------|----------|
+| Pass/fail agreement (non-borderline) | ≥ 80% | TBD |
+| Critical failure detection | 5 / 5 | TBD |
+| Score accuracy (±1) | ≥ 70% | TBD |
+Run calibration locally from a checkout of the repository (the tests are not included in the PyPI package; requires `ANTHROPIC_API_KEY`):
+```bash
+pytest tests/qa/test_judge_calibration.py -v -m llm
+```
+See [docs/judge_calibration.md](docs/judge_calibration.md) for full methodology.
+## License
+MIT — see [LICENSE](LICENSE).

verdict_eval-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,143 @@
+# Verdict
+> Evaluation infrastructure for AI agents.
+[![CI](https://github.com/dannicolau7/verdict/actions/workflows/test.yml/badge.svg)](https://github.com/dannicolau7/verdict/actions/workflows/test.yml)
+![Python Version](https://img.shields.io/badge/python-3.11%2B-blue)
+![License](https://img.shields.io/badge/license-MIT-green)
+![PyPI](https://img.shields.io/pypi/v/verdict-eval)
+## Demo
+<!-- Replace the link below after recording with `asciinema rec docs/demo.cast --command ./scripts/demo.sh` -->
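+> Recording coming soon — in the meantime, clone the repository, run `pip install verdict-eval`, and then run `./scripts/demo.sh` from the repository root (the script is not included in the PyPI package).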
+## Install
+```bash
+pip install verdict-eval
+```
+## Quickstart
+```bash
+# Run an evaluation against a built-in adapter
+verdict eval --target simple_rag --num-per-category 5
+# Compare two adapter versions
+verdict diff --target-a simple_rag --target-b path/to/v2.py:MyAdapter --num 10
+# Analyze flakiness across historical runs
+verdict flakiness --target my-system --reports-dir ./reports
+```
+## CLI reference
+### `verdict eval`
+Run a full evaluation against a target adapter.
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--target` | required | Adapter spec: `simple_rag` or `path/to/file.py:ClassName` |
+| `--num-per-category` | 5 | Prompts per test category |
+| `--categories` | all | Specific categories (repeat for multiple) |
+| `--output-dir` | `./reports` | Report output directory |
+| `--run-id` | auto | Custom run identifier |
+| `--model` | settings default | Override LLM model for all agents |
+| `--bootstrap-iterations` | 1000 | Bootstrap CI iterations (0 to disable) |
+| `--max-cost-usd` | — | Fail (exit 2) if total cost exceeds this amount |
+| `--max-total-latency-seconds` | — | Fail (exit 2) if total latency exceeds this many seconds |
+| `--fail-on-pass-rate-below` | — | Fail (exit 2) if pass rate < threshold |
+| `--fail-on-ci-low-below` | — | Fail (exit 2) if CI lower bound < threshold |
+| `--cache-mode` | `off` | `off` / `record` / `replay` / `update` |
+| `--cache-dir` | `.verdict_cache` | Directory for cached responses |
+| `--adaptive` | off | Run adaptive follow-up probes based on initial responses |
+### `verdict diff`
+Compare two adapter versions against the same generated test suite.
+```bash
+verdict diff \
+  --target-a simple_rag \
+  --target-b path/to/v2.py:V2Adapter \
+  --num 10
+```
+### `verdict flakiness`
+Analyze judge and target consistency across historical evaluation runs.
+```bash
+verdict flakiness --target my-system --min-runs 5 --reports-dir ./reports
+```
+## Adaptive mode
+When `--adaptive` is enabled, Verdict runs a second pass of follow-up probes selected
+based on each initial response. Pattern selection is **entirely rule-based** — no LLM
+is used to generate new attacks. All probes are composed from the curated
+[OWASP LLM Top 10](https://owasp.org/www-project-top-10-for-large-language-model-applications/)
+pattern library in `verdict/evals/attack_patterns/patterns.json`.
+This design ensures Verdict remains a defensive evaluation tool. See
+[CONTRIBUTING.md](CONTRIBUTING.md) for the security boundary policy.
+```bash
+verdict eval --target simple_rag --adaptive
+```
+## Writing a custom adapter
+```python
+# my_adapter.py
+from verdict.adapters.base import TargetAdapter
+from verdict.models.schemas import ExecutionResult, TestPrompt
+class MyAdapter(TargetAdapter):
+    name = "my-system"
+    version = "1.0.0"
+    async def execute(self, prompt: TestPrompt) -> ExecutionResult:
+        response = call_my_system(prompt.prompt)
+        return self.make_result(prompt, response=response)
+```
+```bash
+verdict eval --target my_adapter.py:MyAdapter
+```
+## Test categories
+| Category | What it evaluates |
+|----------|-------------------|
+| `correctness` | Factual accuracy and reasoning quality |
+| `safety` | Refusal of harmful, dangerous, or unethical requests |
+| `injection` | Robustness against prompt injection (OWASP LLM01, LLM07) |
+| `edge_case` | Graceful handling of malformed and ambiguous inputs |
+| `compliance` | Privacy and data handling (OWASP LLM02) |
+## Judge calibration
+The Judge is validated against 22 hand-labeled examples covering all five test
+categories. Results are produced by running the live judge against known ground
+truth — no labels were derived from judge output.
+| Metric | Target | Baseline |
+|--------|--------|----------|
+| Pass/fail agreement (non-borderline) | ≥ 80% | TBD |
+| Critical failure detection | 5 / 5 | TBD |
+| Score accuracy (±1) | ≥ 70% | TBD |
+Run calibration locally from a checkout of the repository (the tests are not included in the PyPI package; requires `ANTHROPIC_API_KEY`):
+```bash
+pytest tests/qa/test_judge_calibration.py -v -m llm
+```
+See [docs/judge_calibration.md](docs/judge_calibration.md) for full methodology.
+## License
+MIT — see [LICENSE](LICENSE).

verdict_eval-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,58 @@
+[tool.poetry]
+name = "verdict-eval"
+version = "0.1.0"
+description = "Evaluation infrastructure for AI agents."
+authors = ["Dan Nicolau <dan@dannicolau.com>"]
+license = "MIT"
+readme = "README.md"
+homepage = "https://github.com/dannicolau7/verdict"
+repository = "https://github.com/dannicolau7/verdict"
+documentation = "https://github.com/dannicolau7/verdict#readme"
+keywords = ["ai", "evaluation", "agents", "llm", "crewai", "testing", "compliance", "red-teaming"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Software Development :: Testing",
+    "Topic :: Software Development :: Quality Assurance",
+]
+packages = [{include = "verdict"}]
+[tool.poetry.dependencies]
+python = ">=3.11,<3.14"
+crewai = "1.14.5"
+langchain-anthropic = "1.4.3"
+pydantic = ">=2.11.9,<2.13"
+pydantic-settings = ">=2.6.0,<3.0"
+python-dotenv = "^1.2.2"
+click = ">=8.1.7,<8.2.0"
+rich = ">=13.7.0,<15.0"
+[tool.poetry.group.dev.dependencies]
+pytest = "^9.0.3"
+pytest-asyncio = "^1.3.0"
+pytest-cov = "^7.1.0"
+mypy = "^2.1.0"
+ruff = "^0.15.13"
+black = "^26.5.1"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+[tool.ruff]
+line-length = 100
+target-version = "py311"
+[tool.ruff.lint]
+select = ["E", "F", "I", "UP"]
+ignore = ["E501"]
+[tool.mypy]
+python_version = "3.11"
+strict = true
+ignore_missing_imports = true
+exclude = ["tests/", "examples/"]

verdict_eval-0.1.0/verdict/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""Verdict: evaluation infrastructure for AI agents."""
+__version__ = "0.1.0"

verdict_eval-0.1.0/verdict/adapters/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Verdict target-system adapters."""
+from verdict.adapters.base import TargetAdapter
+__all__ = ["TargetAdapter"]