agentstax-eval 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentstax_eval-0.1.0/PKG-INFO +496 -0
- agentstax_eval-0.1.0/README.md +467 -0
- agentstax_eval-0.1.0/pyproject.toml +57 -0
- agentstax_eval-0.1.0/src/agentstax_eval/__init__.py +52 -0
- agentstax_eval-0.1.0/src/agentstax_eval/agent_graph.py +227 -0
- agentstax_eval-0.1.0/src/agentstax_eval/agent_metadata.py +151 -0
- agentstax_eval-0.1.0/src/agentstax_eval/cache/__init__.py +5 -0
- agentstax_eval-0.1.0/src/agentstax_eval/cache/_base.py +36 -0
- agentstax_eval-0.1.0/src/agentstax_eval/cache/disk.py +83 -0
- agentstax_eval-0.1.0/src/agentstax_eval/cache/memory.py +48 -0
- agentstax_eval-0.1.0/src/agentstax_eval/dataset.py +66 -0
- agentstax_eval-0.1.0/src/agentstax_eval/evaluation.py +196 -0
- agentstax_eval-0.1.0/src/agentstax_eval/evaluation_results.py +282 -0
- agentstax_eval-0.1.0/src/agentstax_eval/exceptions.py +72 -0
- agentstax_eval-0.1.0/src/agentstax_eval/extractors/__init__.py +27 -0
- agentstax_eval-0.1.0/src/agentstax_eval/extractors/_children.py +56 -0
- agentstax_eval-0.1.0/src/agentstax_eval/extractors/_providers.py +72 -0
- agentstax_eval-0.1.0/src/agentstax_eval/extractors/_validation.py +38 -0
- agentstax_eval-0.1.0/src/agentstax_eval/extractors/crewai.py +42 -0
- agentstax_eval-0.1.0/src/agentstax_eval/extractors/google_adk.py +31 -0
- agentstax_eval-0.1.0/src/agentstax_eval/extractors/langgraph.py +166 -0
- agentstax_eval-0.1.0/src/agentstax_eval/extractors/llamaindex.py +53 -0
- agentstax_eval-0.1.0/src/agentstax_eval/extractors/msaf.py +38 -0
- agentstax_eval-0.1.0/src/agentstax_eval/extractors/openai_agents.py +53 -0
- agentstax_eval-0.1.0/src/agentstax_eval/metrics/__init__.py +21 -0
- agentstax_eval-0.1.0/src/agentstax_eval/metrics/_helpers.py +192 -0
- agentstax_eval-0.1.0/src/agentstax_eval/metrics/_result.py +16 -0
- agentstax_eval-0.1.0/src/agentstax_eval/metrics/contains_answer.py +19 -0
- agentstax_eval-0.1.0/src/agentstax_eval/metrics/exact_match.py +17 -0
- agentstax_eval-0.1.0/src/agentstax_eval/metrics/json_valid.py +22 -0
- agentstax_eval-0.1.0/src/agentstax_eval/metrics/llm_completeness.py +69 -0
- agentstax_eval-0.1.0/src/agentstax_eval/metrics/llm_correctness.py +67 -0
- agentstax_eval-0.1.0/src/agentstax_eval/metrics/llm_faithfulness.py +66 -0
- agentstax_eval-0.1.0/src/agentstax_eval/metrics/llm_relevance.py +66 -0
- agentstax_eval-0.1.0/src/agentstax_eval/metrics/llm_rubric.py +76 -0
- agentstax_eval-0.1.0/src/agentstax_eval/pipeline.py +105 -0
- agentstax_eval-0.1.0/src/agentstax_eval/providers/__init__.py +9 -0
- agentstax_eval-0.1.0/src/agentstax_eval/providers/anthropic.py +78 -0
- agentstax_eval-0.1.0/src/agentstax_eval/providers/google.py +86 -0
- agentstax_eval-0.1.0/src/agentstax_eval/providers/openai.py +72 -0
- agentstax_eval-0.1.0/src/agentstax_eval/task.py +156 -0
|
@@ -0,0 +1,496 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: agentstax-eval
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight Python library for evaluating AI agents and RAG pipelines.
|
|
5
|
+
Author: Brandon Cate
|
|
6
|
+
Author-email: Brandon Cate <brandoncate95@gmail.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Software Development :: Testing
|
|
18
|
+
Requires-Dist: anthropic>=0.80.0 ; extra == 'anthropic'
|
|
19
|
+
Requires-Dist: google-genai>=1.0.0 ; extra == 'google'
|
|
20
|
+
Requires-Dist: openai>=1.0.0 ; extra == 'openai'
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Project-URL: Homepage, https://github.com/agentstax/eval-python-sdk
|
|
23
|
+
Project-URL: Repository, https://github.com/agentstax/eval-python-sdk
|
|
24
|
+
Project-URL: Issues, https://github.com/agentstax/eval-python-sdk/issues
|
|
25
|
+
Provides-Extra: anthropic
|
|
26
|
+
Provides-Extra: google
|
|
27
|
+
Provides-Extra: openai
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# agentstax-eval
|
|
31
|
+
|
|
32
|
+
A lightweight Python library for evaluating AI agents and RAG pipelines. No magic, no vendor lock-in — just explicit, readable evaluation logic.
|
|
33
|
+
|
|
34
|
+
[![PyPI](https://img.shields.io/pypi/v/agentstax-eval)](https://pypi.org/project/agentstax-eval/)
|
|
35
|
+
[![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-blue)](https://python.org)
|
|
36
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-green)](LICENSE)
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Table of Contents
|
|
41
|
+
|
|
42
|
+
- [Key Features](#key-features)
|
|
43
|
+
- [Installation](#installation)
|
|
44
|
+
- [Why agentstax-eval](#why-agentstax-eval)
|
|
45
|
+
- [Quickstart](#quickstart)
|
|
46
|
+
- [Metrics](#metrics)
|
|
47
|
+
- [Providers](#providers)
|
|
48
|
+
- [Monitoring Dashboard](#monitoring-dashboard)
|
|
49
|
+
- [Core API](#core-api)
|
|
50
|
+
- [Framework Auto-Extraction](#framework-auto-extraction)
|
|
51
|
+
- [Caching](#caching)
|
|
52
|
+
- [Multi-Agent Evaluation](#multi-agent-evaluation)
|
|
53
|
+
- [Async Support](#async-support)
|
|
54
|
+
- [CI / Regression Testing](#ci--regression-testing)
|
|
55
|
+
- [Further Reading](#further-reading)
|
|
56
|
+
- [Contributing](#contributing)
|
|
57
|
+
- [License](#license)
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Key Features
|
|
62
|
+
|
|
63
|
+
- **Four objects, one job each** — `Dataset`, `Task`, `Evaluation`, `Pipeline`.
|
|
64
|
+
- **Zero required dependencies** — the core library installs with no third-party packages.
|
|
65
|
+
- **Bring your own LLM** — pass any `fn(prompt: str) -> str` as a judge. No default provider.
|
|
66
|
+
- **Metrics are functions** — `fn(dataset_row: dict) -> float`. No base classes, no decorators.
|
|
67
|
+
- **Built-in LLM-as-judge** — correctness, relevance, faithfulness, completeness, rubric.
|
|
68
|
+
- **Framework auto-extraction** — pass a LangGraph, Google ADK, OpenAI Agents, CrewAI, LlamaIndex, or MSAF agent to `Pipeline` and get topology, model, tools, and system prompt extracted automatically.
|
|
69
|
+
- **Agent fingerprinting** — topology changes are hashed, so caches auto-invalidate and the monitoring dashboard detects architecture drift.
|
|
70
|
+
- **LLM response caching** — `DiskCache` and `MemoryCache` keyed on judge + prompt + agent fingerprint.
|
|
71
|
+
- **Real-time monitoring** — pair with [agentstax-eval-monitor](#monitoring-dashboard) for a live dashboard with regression detection, metric trends, and agent network visualization.
|
|
72
|
+
- **CI-ready** — `assert_passing()`, `failures()`, and JSON save/load for pytest regression tests.
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Installation
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install agentstax-eval
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
**Requires:** Python 3.9+. Zero core dependencies.
|
|
83
|
+
|
|
84
|
+
**Optional extras** for built-in provider functions:
|
|
85
|
+
|
|
86
|
+
| Extra | Install command | Adds |
|
|
87
|
+
|-------|----------------|------|
|
|
88
|
+
| `openai` | `pip install openai` | `openai` SDK |
|
|
89
|
+
| `anthropic` | `pip install anthropic` | `anthropic` SDK |
|
|
90
|
+
| `google` | `pip install google-genai` | `google-genai` SDK |
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Why agentstax-eval
|
|
95
|
+
|
|
96
|
+
All the eval frameworks I have used feel more complicated than they should. They are too heavy, too complicated, or locked to a vendor.
|
|
97
|
+
|
|
98
|
+
I often find myself creating a simple script to test what I want, which works great in the beginning but shows its weakness later on.
|
|
99
|
+
|
|
100
|
+
I built agentstax-eval to fix that. It's simple when you need it, but flexible enough to grow with your agent architecture.
|
|
101
|
+
|
|
102
|
+
It works with whatever you're already using — you can pass your LangGraph, ADK, OpenAI Agents, CrewAI, LlamaIndex, or MSAF agent directly to Pipeline. It will walk the hierarchy, extract the topology, and fingerprint it. When your architecture changes, the cache invalidates and the [monitor](#monitoring-dashboard) flags the drift.
|
|
103
|
+
|
|
104
|
+
Pair it with [agentstax-eval-monitor](#monitoring-dashboard) and you get a live dashboard that shows regression detection, metric trends, and agent network visualization.
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Quickstart
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
from openai import OpenAI
|
|
112
|
+
from agentstax_eval import Pipeline, Dataset, Task, Evaluation
|
|
113
|
+
from agentstax_eval.metrics import llm_correctness
|
|
114
|
+
from agentstax_eval.providers import openai_provider
|
|
115
|
+
|
|
116
|
+
client = OpenAI()
|
|
117
|
+
|
|
118
|
+
dataset = Dataset([
|
|
119
|
+
{"question": "What is the capital of France?", "expected_answer": "Paris"},
|
|
120
|
+
{"question": "Who wrote Hamlet?", "expected_answer": "Shakespeare"},
|
|
121
|
+
{"question": "What is 2 + 2?", "expected_answer": "4"},
|
|
122
|
+
])
|
|
123
|
+
|
|
124
|
+
def get_answer(dataset_row: dict) -> dict:
|
|
125
|
+
response = client.chat.completions.create(
|
|
126
|
+
model="gpt-4o-mini",
|
|
127
|
+
messages=[{"role": "user", "content": dataset_row["question"]}],
|
|
128
|
+
)
|
|
129
|
+
return {"answer": response.choices[0].message.content}
|
|
130
|
+
|
|
131
|
+
pipeline = Pipeline(
|
|
132
|
+
dataset=dataset,
|
|
133
|
+
tasks=[Task(get_answer)],
|
|
134
|
+
evaluation=Evaluation(metrics=[llm_correctness(llm=openai_provider())]),
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
results = pipeline.run()
|
|
138
|
+
results.save(directory="results", base_filename="eval")
|
|
139
|
+
print(results)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### With agent auto-extraction
|
|
143
|
+
|
|
144
|
+
Pass your agent object directly to unlock automatic metadata extraction, topology mapping, and smart caching:
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
from agentstax_eval import Pipeline, Dataset, Task, Evaluation, DiskCache
|
|
148
|
+
from agentstax_eval.metrics import llm_correctness
|
|
149
|
+
from agentstax_eval.providers import openai_provider
|
|
150
|
+
|
|
151
|
+
pipeline = Pipeline(
|
|
152
|
+
dataset=dataset,
|
|
153
|
+
tasks=[Task(get_answer)],
|
|
154
|
+
evaluation=Evaluation(metrics=[llm_correctness(llm=openai_provider())]),
|
|
155
|
+
agent=my_langgraph_agent, # auto-extracts topology, model, tools
|
|
156
|
+
cache=DiskCache(".cache"), # caches judge responses, invalidates on topology change
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
results = pipeline.run()
|
|
160
|
+
results.save(directory="results", base_filename="my_agent_eval")
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
The saved JSON now includes a `topology` field with the full agent graph, fingerprint, and per-node metadata — which the monitoring dashboard uses for network visualization and change detection.
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## Metrics
|
|
168
|
+
|
|
169
|
+
### Deterministic
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
from agentstax_eval.metrics import exact_match, contains_answer, json_valid
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
| Metric | What it checks | Source | LLM required |
|
|
176
|
+
|--------|---------------|--------|--------------|
|
|
177
|
+
| [`exact_match`](src/agentstax_eval/metrics/exact_match.py#L13) | `answer.strip().lower() == expected_answer.strip().lower()` | [source](src/agentstax_eval/metrics/exact_match.py) | No |
|
|
178
|
+
| [`contains_answer`](src/agentstax_eval/metrics/contains_answer.py#L15) | `expected_answer` appears in `answer` | [source](src/agentstax_eval/metrics/contains_answer.py) | No |
|
|
179
|
+
| [`json_valid`](src/agentstax_eval/metrics/json_valid.py#L16) | `answer` parses as valid JSON | [source](src/agentstax_eval/metrics/json_valid.py) | No |
|
|
180
|
+
|
|
181
|
+
### LLM-as-Judge
|
|
182
|
+
|
|
183
|
+
Factory functions that take an `llm` callable and return a metric. The `llm` can be synchronous (`fn(prompt: str) -> str`) or asynchronous (`async def`).
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from agentstax_eval.metrics import llm_correctness, llm_relevance, llm_faithfulness, llm_completeness, llm_rubric
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
| Metric | Reads | What it scores | Prompt |
|
|
190
|
+
|--------|-------|---------------|--------|
|
|
191
|
+
| [`llm_correctness(llm=)`](src/agentstax_eval/metrics/llm_correctness.py#L49) | `question`, `expected_answer`, `answer` | Factual correctness against reference | [prompt](src/agentstax_eval/metrics/llm_correctness.py#L17) |
|
|
192
|
+
| [`llm_relevance(llm=)`](src/agentstax_eval/metrics/llm_relevance.py#L48) | `question`, `answer` | Whether answer addresses the question (reference-free) | [prompt](src/agentstax_eval/metrics/llm_relevance.py#L17) |
|
|
193
|
+
| [`llm_faithfulness(llm=)`](src/agentstax_eval/metrics/llm_faithfulness.py#L48) | `answer`, `context` | Whether claims are supported by context (RAG) | [prompt](src/agentstax_eval/metrics/llm_faithfulness.py#L17) |
|
|
194
|
+
| [`llm_completeness(llm=)`](src/agentstax_eval/metrics/llm_completeness.py#L51) | `question`, `expected_answer`, `answer` | Whether all key points are covered | [prompt](src/agentstax_eval/metrics/llm_completeness.py#L17) |
|
|
195
|
+
| [`llm_rubric(llm=, criteria=)`](src/agentstax_eval/metrics/llm_rubric.py#L51) | `question`, `answer` | User-defined criteria in plain English | [prompt](src/agentstax_eval/metrics/llm_rubric.py#L16) |
|
|
196
|
+
|
|
197
|
+
All use a six-point scale (`1.0`, `0.8`, `0.6`, `0.4`, `0.2`, `0.0`) with chain-of-thought reasoning.
|
|
198
|
+
|
|
199
|
+
### Custom Metrics
|
|
200
|
+
|
|
201
|
+
Any function with the signature `fn(dataset_row: dict) -> float` works as a metric. The function receives the full row dict — including `question`, `expected_answer`, `answer`, and any extra fields your tasks added (like `context`). Return a float between `0.0` and `1.0`. The function's `__name__` is used as the column name in results.
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
# Deterministic metric — no LLM needed
|
|
205
|
+
def answer_is_concise(dataset_row: dict) -> float:
|
|
206
|
+
return 1.0 if len(dataset_row["answer"].split()) <= 20 else 0.0
|
|
207
|
+
|
|
208
|
+
# Custom LLM metric — you control the prompt and parsing
|
|
209
|
+
def my_judge_metric(dataset_row: dict) -> float:
|
|
210
|
+
prompt = f"Is this answer polite? Answer 1 or 0.\nAnswer: {dataset_row['answer']}"
|
|
211
|
+
response = client.chat.completions.create(
|
|
212
|
+
model="gpt-4o",
|
|
213
|
+
messages=[{"role": "user", "content": prompt}],
|
|
214
|
+
)
|
|
215
|
+
return float(response.choices[0].message.content.strip())
|
|
216
|
+
|
|
217
|
+
evaluation = Evaluation(metrics=[exact_match, answer_is_concise, my_judge_metric])
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
When using factory functions or lambdas, set `__name__` to control the result column name:
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
concise = llm_rubric(llm=judge, criteria="Two sentences or fewer.")
|
|
224
|
+
concise.__name__ = "rubric_concise"
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
---
|
|
228
|
+
|
|
229
|
+
## Providers
|
|
230
|
+
|
|
231
|
+
Built-in `fn(prompt: str) -> str` wrappers that handle client setup and set `judge_model` metadata automatically:
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
from agentstax_eval.providers import openai_provider, anthropic_provider, google_provider
|
|
235
|
+
|
|
236
|
+
judge = openai_provider() # default: gpt-4.1, reads OPENAI_API_KEY
|
|
237
|
+
judge = anthropic_provider(model="claude-opus-4-6") # reads ANTHROPIC_API_KEY
|
|
238
|
+
judge = google_provider(model="gemini-2.5-pro") # reads GEMINI_API_KEY
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
### Custom Providers
|
|
242
|
+
|
|
243
|
+
A provider is any `fn(prompt: str) -> str`. To get automatic `judge_model` tracking in result metadata, set the attribute on the function:
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
from openai import OpenAI
|
|
247
|
+
|
|
248
|
+
client = OpenAI()
|
|
249
|
+
|
|
250
|
+
def my_judge(prompt: str) -> str:
|
|
251
|
+
response = client.chat.completions.create(
|
|
252
|
+
model="gpt-4o",
|
|
253
|
+
messages=[{"role": "user", "content": prompt}],
|
|
254
|
+
)
|
|
255
|
+
return response.choices[0].message.content
|
|
256
|
+
|
|
257
|
+
my_judge.judge_model = "openai/gpt-4o"
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
The `judge_model` string flows into `metadata.scoring.<metric_name>.judge_model` in saved results. If omitted, scoring metadata will simply not include it.
|
|
261
|
+
|
|
262
|
+
---
|
|
263
|
+
|
|
264
|
+
## Monitoring Dashboard
|
|
265
|
+
|
|
266
|
+
**[agentstax-eval-monitor](https://github.com/agentstax/agentstax-eval-monitor)** is a companion real-time dashboard that watches a directory of agentstax-eval result files and provides:
|
|
267
|
+
|
|
268
|
+
- **Regression detection** — agents categorized as regressed, improved, or healthy based on metric deltas
|
|
269
|
+
- **Metric trend sparklines** — per-metric performance history at a glance
|
|
270
|
+
- **Per-agent deep dives** — zoomable line charts with threshold lines and metadata change markers
|
|
271
|
+
- **Agent network visualization** — interactive DAG of your multi-agent topology, with added/removed agents highlighted
|
|
272
|
+
- **Architecture drift detection** — visual timeline of when topology or metric config changed
|
|
273
|
+
- **Live updates** — WebSocket-powered, updates as new result files land
|
|
274
|
+
|
|
275
|
+
### Setup
|
|
276
|
+
|
|
277
|
+
```bash
|
|
278
|
+
# Install
|
|
279
|
+
cd agentstax-eval-monitor
|
|
280
|
+
bun install
|
|
281
|
+
|
|
282
|
+
# Point at your results directory
|
|
283
|
+
bun run index.ts ./results
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
Opens a dashboard at `http://localhost:3000`.
|
|
287
|
+
|
|
288
|
+
---
|
|
289
|
+
|
|
290
|
+
## Core API
|
|
291
|
+
|
|
292
|
+
### Dataset
|
|
293
|
+
|
|
294
|
+
Wraps a list of dicts. Each row must have a `question`. The `expected_answer` field is optional at the dataset level — metrics that need it raise `MissingFieldError` at scoring time.
|
|
295
|
+
|
|
296
|
+
```python
|
|
297
|
+
dataset = Dataset([
|
|
298
|
+
{"question": "What year did WWII end?", "expected_answer": "1945"},
|
|
299
|
+
])
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
### Task
|
|
303
|
+
|
|
304
|
+
Wraps a callable `fn(dataset_row: dict) -> dict`. The returned dict is merged into each row.
|
|
305
|
+
|
|
306
|
+
```python
|
|
307
|
+
task = Task(get_answer)
|
|
308
|
+
answered_dataset = task.run(dataset) # returns new Dataset, original not mutated
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
### Evaluation
|
|
312
|
+
|
|
313
|
+
Holds a list of metrics and scores a dataset. Metrics are `fn(dataset_row: dict) -> float`.
|
|
314
|
+
|
|
315
|
+
```python
|
|
316
|
+
evaluation = Evaluation(metrics=[exact_match, llm_correctness(llm=my_judge)])
|
|
317
|
+
results = evaluation.run(answered_dataset, metadata={"model": "gpt-4o"})
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
### Pipeline
|
|
321
|
+
|
|
322
|
+
Convenience wrapper: runs tasks sequentially, then scores. Accepts `agent` for auto-extraction and `cache` for LLM judge caching.
|
|
323
|
+
|
|
324
|
+
```python
|
|
325
|
+
pipeline = Pipeline(
|
|
326
|
+
dataset=dataset,
|
|
327
|
+
tasks=[Task(get_answer)],
|
|
328
|
+
evaluation=Evaluation(metrics=[llm_correctness(llm=my_judge)]),
|
|
329
|
+
agent=my_agent, # optional: auto-extract topology
|
|
330
|
+
cache=DiskCache(".cache"), # optional: cache judge responses (requires agent)
|
|
331
|
+
metadata={"experiment": "v2"}, # optional: merged into results (overrides extracted values)
|
|
332
|
+
)
|
|
333
|
+
results = pipeline.run()
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
| Parameter | Type | Description |
|
|
337
|
+
|---|---|---|
|
|
338
|
+
| `dataset` | `Dataset` | Input rows to process |
|
|
339
|
+
| `tasks` | `list[Task]` | Ordered task list, run sequentially |
|
|
340
|
+
| `evaluation` | `Evaluation` | Scores completed rows |
|
|
341
|
+
| `metadata` | `dict \| None` | Merged into result metadata; overrides extracted values |
|
|
342
|
+
| `agent` | `object \| None` | Agent object for [auto-extraction](#framework-auto-extraction) |
|
|
343
|
+
| `cache` | `Cache \| None` | LLM judge [cache](#caching); requires `agent` |
|
|
344
|
+
|
|
345
|
+
### EvaluationResults
|
|
346
|
+
|
|
347
|
+
```python
|
|
348
|
+
results.metadata # dict with timestamp_utc, topology, scoring, etc.
|
|
349
|
+
results.rows # list[ResultRow] — each has .data (dict) and .scores (dict)
|
|
350
|
+
results.assert_passing() # True if all thresholded metrics pass on every row
|
|
351
|
+
results.failures() # list[Failure] — each row/metric pair below threshold
|
|
352
|
+
|
|
353
|
+
# Save and load
|
|
354
|
+
path = results.save(directory="results", base_filename="eval")
|
|
355
|
+
latest = EvaluationResults.load(directory="results", base_filename="eval")
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
---
|
|
359
|
+
|
|
360
|
+
## Framework Auto-Extraction
|
|
361
|
+
|
|
362
|
+
Pass an agent object from a supported framework and Pipeline automatically extracts its metadata and topology — no manual `metadata` dict needed.
|
|
363
|
+
|
|
364
|
+
Supported frameworks:
|
|
365
|
+
|
|
366
|
+
| Framework | Install |
|
|
367
|
+
|---|---|
|
|
368
|
+
| LangGraph | `pip install langgraph` |
|
|
369
|
+
| Google ADK | `pip install google-adk` |
|
|
370
|
+
| OpenAI Agents SDK | `pip install openai-agents` |
|
|
371
|
+
| CrewAI | `pip install crewai` |
|
|
372
|
+
| LlamaIndex | `pip install llama-index` |
|
|
373
|
+
| Microsoft Agent Framework | `pip install msaf` |
|
|
374
|
+
|
|
375
|
+
```python
|
|
376
|
+
pipeline = Pipeline(
|
|
377
|
+
dataset=dataset,
|
|
378
|
+
tasks=[Task(get_answer)],
|
|
379
|
+
evaluation=Evaluation(metrics=[llm_correctness(llm=judge)]),
|
|
380
|
+
agent=my_agent, # just pass your agent
|
|
381
|
+
)
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
---
|
|
385
|
+
|
|
386
|
+
## Caching
|
|
387
|
+
|
|
388
|
+
Cache LLM judge responses to avoid redundant API calls. Cache keys include judge model, prompt, and agent fingerprint — the cache auto-invalidates when agent topology changes.
|
|
389
|
+
|
|
390
|
+
```python
|
|
391
|
+
from agentstax_eval import DiskCache, MemoryCache
|
|
392
|
+
|
|
393
|
+
# Persistent (JSON file, atomic writes, thread-safe)
|
|
394
|
+
cache = DiskCache(directory=".cache")
|
|
395
|
+
|
|
396
|
+
# In-process (lost when process exits, thread-safe)
|
|
397
|
+
cache = MemoryCache()
|
|
398
|
+
```
|
|
399
|
+
|
|
400
|
+
Pass to Pipeline with `agent` for automatic fingerprint extraction:
|
|
401
|
+
|
|
402
|
+
```python
|
|
403
|
+
pipeline = Pipeline(
|
|
404
|
+
dataset=dataset,
|
|
405
|
+
tasks=[Task(get_answer)],
|
|
406
|
+
evaluation=Evaluation(metrics=[llm_correctness(llm=judge)]),
|
|
407
|
+
agent=my_agent,
|
|
408
|
+
cache=DiskCache(".cache"),
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
results = pipeline.run()
|
|
412
|
+
cache.stats() # {"hits": 0, "misses": 3, "size": 3}
|
|
413
|
+
|
|
414
|
+
# Second run with same agent topology: all hits
|
|
415
|
+
results = pipeline.run()
|
|
416
|
+
cache.stats() # {"hits": 3, "misses": 3, "size": 3}
|
|
417
|
+
```
|
|
418
|
+
|
|
419
|
+
---
|
|
420
|
+
|
|
421
|
+
## Async Support
|
|
422
|
+
|
|
423
|
+
All three core objects support `run_async(concurrency=N)` for concurrent row processing:
|
|
424
|
+
|
|
425
|
+
```python
|
|
426
|
+
import asyncio
|
|
427
|
+
from agentstax_eval import Pipeline, Task, Evaluation
|
|
428
|
+
from agentstax_eval.metrics import exact_match
|
|
429
|
+
|
|
430
|
+
async def call_agent(dataset_row: dict) -> dict:
|
|
431
|
+
response = await async_client.chat.completions.create(
|
|
432
|
+
model="gpt-4o-mini",
|
|
433
|
+
messages=[{"role": "user", "content": dataset_row["question"]}],
|
|
434
|
+
)
|
|
435
|
+
return {"answer": response.choices[0].message.content}
|
|
436
|
+
|
|
437
|
+
pipeline = Pipeline(
|
|
438
|
+
dataset=dataset,
|
|
439
|
+
tasks=[Task(call_agent)],
|
|
440
|
+
evaluation=Evaluation(metrics=[exact_match]),
|
|
441
|
+
agent=my_agent,
|
|
442
|
+
)
|
|
443
|
+
|
|
444
|
+
results = asyncio.run(pipeline.run_async(concurrency=5))
|
|
445
|
+
```
|
|
446
|
+
|
|
447
|
+
`Task.run_async()` and `Evaluation.run_async()` are also available for step-by-step use. Default concurrency is `10`.
|
|
448
|
+
|
|
449
|
+
---
|
|
450
|
+
|
|
451
|
+
## CI / Regression Testing
|
|
452
|
+
|
|
453
|
+
```python
|
|
454
|
+
# test_agent_quality.py
|
|
455
|
+
from agentstax_eval import Pipeline, Dataset, Task, Evaluation
|
|
456
|
+
from agentstax_eval.metrics import llm_correctness
|
|
457
|
+
from agentstax_eval.providers import openai_provider
|
|
458
|
+
|
|
459
|
+
def test_agent_passes_threshold():
|
|
460
|
+
results = Pipeline(
|
|
461
|
+
dataset=Dataset([
|
|
462
|
+
{"question": "Capital of France?", "expected_answer": "Paris"},
|
|
463
|
+
{"question": "What is 2 + 2?", "expected_answer": "4"},
|
|
464
|
+
]),
|
|
465
|
+
tasks=[Task(call_my_agent)],
|
|
466
|
+
evaluation=Evaluation(metrics=[llm_correctness(llm=openai_provider())]),
|
|
467
|
+
agent=my_agent,
|
|
468
|
+
).run()
|
|
469
|
+
|
|
470
|
+
assert results.assert_passing(), "\n".join(str(f) for f in results.failures())
|
|
471
|
+
```
|
|
472
|
+
|
|
473
|
+
Save results for trend tracking — the monitoring dashboard can watch the same directory for live regression visibility.
|
|
474
|
+
|
|
475
|
+
---
|
|
476
|
+
|
|
477
|
+
## Further Reading
|
|
478
|
+
|
|
479
|
+
- [LLM Judge Prompt Writing Guide](docs/llm-judge-prompt-guide.md) — rubric design, scoring scales, bias mitigation, and prompt examples.
|
|
480
|
+
|
|
481
|
+
---
|
|
482
|
+
|
|
483
|
+
## Contributing
|
|
484
|
+
|
|
485
|
+
Contributions are welcome. Please open an issue to discuss before submitting a PR.
|
|
486
|
+
|
|
487
|
+
```bash
|
|
488
|
+
git clone https://github.com/agentstax/eval-python-sdk.git && cd eval-python-sdk
|
|
489
|
+
uv sync && uv run pytest --ignore=tests/e2e -q
|
|
490
|
+
```
|
|
491
|
+
|
|
492
|
+
---
|
|
493
|
+
|
|
494
|
+
## License
|
|
495
|
+
|
|
496
|
+
MIT — see [LICENSE](LICENSE) for details.
|