aixtools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aixtools might be problematic.
- aixtools/_version.py +2 -2
- aixtools/evals/{evals.py → __main__.py} +4 -4
- aixtools/evals/dataset.py +87 -0
- aixtools/evals/discovery.py +52 -40
- aixtools/evals/run_evals.py +31 -43
- {aixtools-0.2.3.dist-info → aixtools-0.2.5.dist-info}/METADATA +50 -15
- {aixtools-0.2.3.dist-info → aixtools-0.2.5.dist-info}/RECORD +10 -9
- {aixtools-0.2.3.dist-info → aixtools-0.2.5.dist-info}/WHEEL +0 -0
- {aixtools-0.2.3.dist-info → aixtools-0.2.5.dist-info}/entry_points.txt +0 -0
- {aixtools-0.2.3.dist-info → aixtools-0.2.5.dist-info}/top_level.txt +0 -0
aixtools/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.2.3'
-__version_tuple__ = version_tuple = (0, 2, 3)
+__version__ = version = '0.2.5'
+__version_tuple__ = version_tuple = (0, 2, 5)
 
 __commit_id__ = commit_id = None
aixtools/evals/{evals.py → __main__.py}
CHANGED

@@ -11,8 +11,8 @@ import asyncio
 import sys
 from pathlib import Path
 
-from .discovery import discover_all_datasets, find_eval_files
-from .run_evals import run_all_evaluations_and_print_results
+from aixtools.evals.discovery import discover_all_datasets, find_eval_files  # pylint: disable=E0401
+from aixtools.evals.run_evals import run_all_evaluations_and_print_results  # pylint: disable=E0401
 
 
 async def main():
@@ -24,8 +24,8 @@ async def main():
     parser.add_argument(
         "--filter", type=str, help="Filter to run only matching evaluations (matches module, file, or dataset names)"
     )
-    parser.add_argument("--include-input", action="store_true", help="Include input in report output")
-    parser.add_argument("--include-output", action="store_true", help="Include output in report output")
+    parser.add_argument("--include-input", action="store_true", default=True, help="Include input in report output")
+    parser.add_argument("--include-output", action="store_true", default=True, help="Include output in report output")
     parser.add_argument(
         "--include-evaluator-failures", action="store_true", help="Include evaluator failures in report output"
     )
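The `--include-input` / `--include-output` change above combines `action="store_true"` with `default=True`. A minimal standalone sketch, assuming only standard-library `argparse` behavior (the parser and flag below are illustrative, not taken from aixtools), of what that combination means in practice: the option ends up `True` whether or not it is passed.

```python
# Sketch of argparse's behavior for a store_true flag with default=True.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--include-input", action="store_true", default=True)

print(parser.parse_args([]).include_input)                   # True -> default applies
print(parser.parse_args(["--include-input"]).include_input)  # True -> flag also stores True
```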
aixtools/evals/dataset.py
ADDED

@@ -0,0 +1,87 @@
+"""Custom dataset and evaluation utilities for AixTools.
+
+This module provides wrapper classes and decorators for building and running
+evaluations using the pydantic-evals framework. It includes a custom Dataset
+class, decorators for marking target functions, scorers, and evaluators, and
+a default scoring function based on assertion averages.
+"""
+
+from typing import Awaitable, Callable, Generic
+
+from pydantic import BaseModel
+from pydantic_evals.dataset import Case, Dataset, InputsT, MetadataT, OutputT
+from pydantic_evals.evaluators import Evaluator
+from pydantic_evals.reporting import EvaluationReport
+
+TargetT = Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT]
+ScorerT = Callable[[EvaluationReport, "AixDataset", float, bool], bool]
+
+
+class AixDataset(BaseModel, Generic[InputsT, OutputT, MetadataT]):
+    """Custom Dataset class for AixTools evaluations."""
+
+    dataset: Dataset[InputsT, OutputT]
+    name: str
+    target_func: TargetT
+    scorers: list[ScorerT]
+
+    def __init__(  # pylint: disable=R0913,R0917
+        self,
+        cases: list[Case[InputsT, OutputT]],
+        target_func: TargetT,
+        evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] | None = None,
+        name: str | None = None,
+        scoring_funcs: list[ScorerT] | None = None,
+    ):
+        super().__init__(
+            dataset=Dataset(cases=cases, evaluators=evaluators or []),
+            target_func=target_func,
+            name=name or "dataset",
+            scorers=scoring_funcs or [average_assertions],
+        )
+
+    @property
+    def cases(self) -> list[Case[InputsT, OutputT]]:
+        """Return the list of cases in the dataset."""
+        return self.dataset.cases
+
+    @property
+    def evaluators(self) -> list[Evaluator[InputsT, OutputT, MetadataT]]:
+        """Return the list of evaluators in the dataset."""
+        return self.dataset.evaluators
+
+    async def evaluate(
+        self,
+    ) -> EvaluationReport:
+        """Run the evaluation using the target function and return an EvaluationReport."""
+        return await self.dataset.evaluate(self.target_func)
+
+
+# Decorators removed - using name-based discovery only for simplicity and async compatibility
+# Functions should be named with prefixes: target_, scorer_, evaluator_
+
+
+def average_assertions(
+    report: EvaluationReport, dataset: "AixDataset", min_score: float = 1.0, verbose: bool = False
+) -> bool:
+    """Scoring function that checks if the average assertions meet a minimum threshold."""
+    averages = report.averages()
+    if averages and averages.assertions is not None:
+        success = averages.assertions >= min_score
+        if verbose:
+            print(f"\nAssertions Summary for {dataset.name}:")
+            print(f"  Assertions Average: {averages.assertions:.3f}")
+            print(f"  Minimum Required: {min_score:.3f}")
+            print(f"  Status: {'PASSED' if success else 'FAILED'}")
+        else:
+            print(f"{'PASSED' if success else 'FAILED'} ({averages.assertions:.3f})")
+    else:
+        success = False
+        if verbose:
+            print(f"\nAssertions Summary for {dataset.name}:")
+            print("  No assertions found or evaluation failed")
+            print(f"  Minimum Required: {min_score:.3f}")
+            print("  Status: FAILED")
+        else:
+            print("FAILED (no assertions)")
+    return success
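Based on the `AixDataset` constructor added above, an eval file might build a dataset roughly like the following sketch. The names `target_echo` and `dataset_echo` are hypothetical, chosen only to match the `target_*` / `dataset_*` discovery prefixes; they are not code from the package.

```python
# Hypothetical eval_*.py sketch using the new AixDataset wrapper; all names here
# (target_echo, dataset_echo) are illustrative only.
from pydantic_evals.dataset import Case

from aixtools.evals.dataset import AixDataset


async def target_echo(text: str) -> str:
    # Stand-in for a real agent call.
    return text


dataset_echo = AixDataset(
    cases=[Case(name="hello", inputs="hello", expected_output="hello")],
    target_func=target_echo,
    name="eval_example.dataset_echo",
)
```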
aixtools/evals/discovery.py
CHANGED
@@ -9,10 +9,14 @@ import inspect
 import sys
 import traceback
 from pathlib import Path
-from typing import Any
+from typing import Any, TypeVar
 
 from pydantic_evals.dataset import Dataset
 
+from aixtools.evals.dataset import AixDataset  # pylint: disable=E0401
+
+SpecialFuncT = TypeVar("SpecialFuncT")
+
 
 def find_eval_files(evals_dir: Path) -> list[Path]:
     """Find all eval_*.py files in the evals directory."""
@@ -33,7 +37,7 @@ def find_datasets_in_module(module: Any) -> list[tuple[str, Dataset]]:
     datasets = []
 
     for name, obj in inspect.getmembers(module):
-        if name.startswith("dataset_") and isinstance(obj, Dataset):
+        if name.startswith("dataset_") and isinstance(obj, (Dataset, AixDataset)):
             datasets.append((name, obj))
 
     return datasets
@@ -66,66 +70,74 @@ def matches_filter(module_name: str, file_name: str, dataset_name: str, name_fil
     )
 
 
-def 
-    """Find 
+def find_prefixed_functions(module: Any, prefix: str) -> list[Any]:
+    """Find all functions with a specific prefix (name-based discovery only)."""
+    funcs = []
     for name, obj in inspect.getmembers(module):
-        if inspect.
-
-            return None
+        if name.startswith(prefix) and (inspect.isfunction(obj) or inspect.iscoroutinefunction(obj)):
+            funcs.append(obj)  # Return function directly, no decorator wrapping
 
+    return funcs
 
-
-
-
-
-
-        if inspect.iscoroutinefunction(obj) and not name.startswith("_")
-    ]
+
+def print_v(message: str, verbose: bool) -> None:
+    """Print message if verbose is enabled."""
+    if verbose:
+        print(message)
 
 
 def process_datasets_from_module(
     module: Any, eval_file: Path, name_filter: str | None, verbose: bool
-) -> list[
+) -> list[AixDataset]:
     """Process all datasets from a single module and return valid dataset tuples."""
     datasets = find_datasets_in_module(module)
-
-
+
+    print_v(f"  Found {len(datasets)} datasets: {[name for name, _ in datasets]}", verbose)
 
     valid_datasets = []
 
+    targets = find_prefixed_functions(module, "target_")
+    scorers = find_prefixed_functions(module, "scorer_")
+    evaluators = find_prefixed_functions(module, "evaluator_")
+
+    print_v(f"  Found target functions: {[f.__name__ for f in targets]}", verbose)
+    print_v(f"  Found scoring functions: {[f.__name__ for f in scorers]}", verbose)
+    print_v(f"  Found evaluator functions: {[f.__name__ for f in evaluators]}", verbose)
+
     for dataset_name, dataset in datasets:
         full_name = f"{eval_file.stem}.{dataset_name}"
 
         if not matches_filter(module.__name__, eval_file.stem, dataset_name, name_filter):
-
-            print(f"  ✗ Skipping dataset: {dataset_name} (doesn't match filter: {name_filter})")
+            print_v(f"  ✗ Skipping dataset: {dataset_name} (doesn't match filter: {name_filter})", verbose)
            continue
 
-
-        print(f"  ✓ Including dataset: {dataset_name}")
+        print_v(f"  ✓ Including dataset: {dataset_name}", verbose)
 
-
-
-        async_functions = get_async_function_names(module)
+        if isinstance(dataset, Dataset):
+            # Wrap in AixDataset if not already
 
-
-
-
-
+            if len(targets) != 1:
+                print_v(
+                    f"  ✗ Skipping dataset: {dataset_name} (has {len(targets)} target functions, expected exactly 1)",
+                    verbose,
+                )
 
-
-
-
-
+                continue
+
+            dataset = AixDataset(  # noqa: PLW2901
+                cases=dataset.cases,
+                evaluators=dataset.evaluators,  # evaluators are plain functions now
+                name=full_name,
+                target_func=targets[0],  # target function is used directly
+                scoring_funcs=scorers,  # scorers are plain functions now
+            )
 
-
+        valid_datasets.append(dataset)
 
     return valid_datasets
 
 
-def discover_all_datasets(
-    eval_files: list[Path], name_filter: str | None, verbose: bool
-) -> list[tuple[str, Dataset, Any]]:
+def discover_all_datasets(eval_files: list[Path], name_filter: str | None, verbose: bool) -> list[AixDataset]:
     """Discover all datasets from eval files."""
     all_datasets = []
 
@@ -141,7 +153,7 @@ def discover_all_datasets(
             datasets = process_datasets_from_module(module, eval_file, name_filter, verbose)
             all_datasets.extend(datasets)
 
-        except Exception as e:  # pylint: disable=
+        except Exception as e:  # pylint: disable=broad-exception-caught
            if verbose:
                print(f"Error loading {eval_file}: {e}")
                print(f"  Traceback: {traceback.format_exc()}")
@@ -162,9 +174,9 @@ def discover_all_datasets(
     print(f"\n{'=' * 60}")
     print("Datasets to Evaluate:")
     print(f"{'=' * 60}")
-    for i, (
-        print(f"{i}. {
-        print(f"  Target function: {
+    for i, (dataset) in enumerate(all_datasets, 1):
+        print(f"{i}. {dataset.name}")
+        print(f"  Target function: {dataset.target_func.__name__}")
        print(f"  Cases: {len(dataset.cases)}")
        print(f"  Evaluators: {len(dataset.evaluators)}")
    print(f"{'=' * 60}")
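The new `find_prefixed_functions` helper replaces the old decorator-based lookup with purely name-based discovery. A small sketch of how it could be exercised against an already-imported eval module; the module name `eval_math` is hypothetical.

```python
# Illustrative use of the prefix-based discovery added in this release.
import importlib

from aixtools.evals.discovery import find_prefixed_functions

module = importlib.import_module("eval_math")  # hypothetical eval module

targets = find_prefixed_functions(module, "target_")        # expected: exactly one function
scorers = find_prefixed_functions(module, "scorer_")        # empty list -> average_assertions is used
evaluators = find_prefixed_functions(module, "evaluator_")  # optional custom evaluators
```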
aixtools/evals/run_evals.py
CHANGED
@@ -5,30 +5,29 @@ This module handles running evaluations and printing results.
 """
 
 import sys
-from typing import Any
 
-from pydantic_evals.
+from pydantic_evals.reporting import EvaluationReport
 
+from aixtools.evals.dataset import AixDataset  # pylint: disable=E0401
 
-
-
-    dataset:
-    target_function: Any,
+
+async def run_dataset_evaluation(
+    dataset: AixDataset,
     print_options: dict[str, bool],
     min_assertions: float,
     verbose: bool = False,
-) -> tuple[str, bool]:
-    """Run evaluation for a single dataset and return (name, success)."""
+) -> tuple[str, bool, EvaluationReport | None]:
+    """Run evaluation for a single dataset and return (name, success, report)."""
     if verbose:
         print(f"\n{'=' * 60}")
-        print(f"Running evaluation: {
+        print(f"Running evaluation: {dataset.name}")
         print(f"{'=' * 60}")
     else:
-        print(f"Running {
+        print(f"Running {dataset.name}...", end=" ")
 
     try:
         # Execute the evaluation
-        report = await dataset.evaluate(
+        report = await dataset.evaluate()
 
         # Print the results
         report.print(
@@ -38,60 +37,49 @@ async def run_dataset_evaluation(  # noqa: PLR0913, pylint: disable=too-many-arg
             include_reasons=print_options["include_reasons"],
         )
 
-
-        averages = report.averages()
-        if averages and averages.assertions is not None:
-            success = averages.assertions >= min_assertions
-            if verbose:
-                print(f"\nEvaluation Summary for {dataset_name}:")
-                print(f"  Assertions Average: {averages.assertions:.3f}")
-                print(f"  Minimum Required: {min_assertions:.3f}")
-                print(f"  Status: {'PASSED' if success else 'FAILED'}")
-            else:
-                print(f"{'PASSED' if success else 'FAILED'} ({averages.assertions:.3f})")
-        else:
-            success = False
-            if verbose:
-                print(f"\nEvaluation Summary for {dataset_name}:")
-                print("  No assertions found or evaluation failed")
-                print(f"  Minimum Required: {min_assertions:.3f}")
-                print("  Status: FAILED")
-            else:
-                print("FAILED (no assertions)")
+        success = all(scorer(report, dataset, min_assertions, verbose) for scorer in dataset.scorers)
 
-        return
+        return dataset.name, success, report
 
     except Exception as e:  # pylint: disable=broad-exception-caught
         if verbose:
-            print(f"Error running evaluation {
+            print(f"Error running evaluation {dataset.name}: {e}")
         else:
            print(f"ERROR ({e})")
-        return
+        return dataset.name, False, None
 
 
 async def run_all_evaluations_and_print_results(
-    datasets: list[
+    datasets: list[AixDataset], print_options: dict[str, bool], min_assertions: float, verbose: bool
 ) -> None:
     """Run all evaluations and print results with summary."""
     # Run all evaluations
     results = []
-    for 
-        result = await run_dataset_evaluation(
-            dataset_name, dataset, target_function, print_options, min_assertions, verbose
-        )
+    for dataset in datasets:
+        result = await run_dataset_evaluation(dataset, print_options, min_assertions, verbose)
         results.append(result)
 
+    # Print reports
+    for _, _, report in results:
+        if report:
+            report.print(
+                include_input=print_options["include_input"],
+                include_output=print_options["include_output"],
+                include_evaluator_failures=print_options["include_evaluator_failures"],
+                include_reasons=print_options["include_reasons"],
+            )
+
     # Print summary
-    passed = sum(1 for _, success in results if success)
+    passed = sum(1 for _, success, _ in results if success)
     total = len(results)
-    failed_results = [(name, success) for name, success in results if not success]
+    failed_results = [(name, success, _) for name, success, _ in results if not success]
 
     if verbose:
         print(f"\n{'=' * 60}")
         print("EVALUATION SUMMARY")
         print(f"{'=' * 60}")
 
-        for name, success in results:
+        for name, success, _ in results:
             status = "PASSED" if success else "FAILED"
             print(f"  {name}: {status}")
 
@@ -99,7 +87,7 @@ async def run_all_evaluations_and_print_results(
     # Only show failed evaluations when not verbose
     elif failed_results:
         print("\nFailed evaluations:")
-        for name, _ in failed_results:
+        for name, _, _ in failed_results:
             print(f"  {name}: FAILED")
 
     # Exit with non-zero code if any evaluations failed
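The runner now takes `AixDataset` objects plus a `print_options` dict. A minimal driver sketch, assuming the four keys referenced inside `run_dataset_evaluation` above; the empty `datasets` list is a placeholder for what `discover_all_datasets` would normally return.

```python
# Minimal driver sketch for the updated run_evals API.
import asyncio

from aixtools.evals.run_evals import run_all_evaluations_and_print_results

datasets = []  # placeholder; normally discover_all_datasets(eval_files, None, False)
print_options = {
    "include_input": True,
    "include_output": True,
    "include_evaluator_failures": False,
    "include_reasons": False,
}

asyncio.run(run_all_evaluations_and_print_results(datasets, print_options, 1.0, False))
```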
{aixtools-0.2.3.dist-info → aixtools-0.2.5.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aixtools
-Version: 0.2.3
+Version: 0.2.5
 Summary: Tools for AI exploration and debugging
 Requires-Python: >=3.11.2
 Description-Content-Type: text/markdown
@@ -416,30 +416,30 @@ By default, the "FaultyMCP" includes several tools you can use in your tests:
 
 ### Evals
 
-Run comprehensive Agent/LLM evaluations using the built-in evaluation discovery based on Pydantic-AI framework.
+Run comprehensive Agent/LLM evaluations using the built-in evaluation discovery based on Pydantic-AI framework with AIXtools enhancements.
 
 ```bash
 # Run all evaluations
-evals
+python -m aixtools.evals
 
 # Run evaluations with filtering
-evals --filter "specific_test"
+python -m aixtools.evals --filter "specific_test"
 
 # Run with verbose output and detailed reporting
-evals --verbose --include-input --include-output --include-reasons
+python -m aixtools.evals --verbose --include-input --include-output --include-reasons
 
 # Specify custom evaluations directory
-evals --evals-dir /path/to/evals
+python -m aixtools.evals --evals-dir /path/to/evals
 
 # Set minimum assertions threshold
-evals --min-assertions 0.8
+python -m aixtools.evals --min-assertions 0.8
 ```
 
 **Command Line Options:**
 - `--evals-dir` - Directory containing eval_*.py files (default: evals)
 - `--filter` - Filter to run only matching evaluations
-- `--include-input` - Include input in report output
-- `--include-output` - Include output in report output
+- `--include-input` - Include input in report output (default: True)
+- `--include-output` - Include output in report output (default: True)
 - `--include-evaluator-failures` - Include evaluator failures in report
 - `--include-reasons` - Include reasons in report output
 - `--min-assertions` - Minimum assertions average required for success (default: 1.0)
@@ -447,14 +447,16 @@ evals --min-assertions 0.8
 
 The evaluation system discovers and runs all Dataset objects from eval_*.py files in the specified directory, similar to test runners but specifically designed for LLM evaluations using pydantic_evals.
 
-**Discovery Mechanism
+**Discovery Mechanism**
 
-The evaluation framework uses an automatic discovery system
+The evaluation framework uses an automatic discovery system:
 
 1. **File Discovery**: Scans the specified directory for files matching the pattern `eval_*.py`
 2. **Dataset Discovery**: Within each file, looks for variables named `dataset_*` that are instances of `pydantic_evals.Dataset`
-3. **Target Function Discovery**:
-4. **
+3. **Target Function Discovery**: Within the same file looks for function or async function named `target_*`. There must be 1 target function per file.
+4. **Function Discovery**: Looks for functions with specific prefixes:
+   - Functions prefixed with `scorer_*`, `evaluator_*` for custom scorer and evaluator functions that will be used for each dataset in that file
+5. **Filtering**: Supports filtering by module name, file name, dataset name, or fully qualified name
 
 **Example Evaluation File Structure:**
 ```python
@@ -472,11 +474,16 @@ dataset_addition = Dataset(
 )
 
 # This function will be used as the evaluation target
-async def 
-    # Your agent
+async def target_math_agent(input_text: str) -> str:
+    # Your agent run logic here
     agent = get_agent(system_prompt="You are a math assistant.")
     result, _ = await run_agent(agent, input_text)
     return result
+
+# This function will be used as evaluator for all datasets (optional)
+def evaluator_check_output(ctx: EvaluatorContext) -> bool:
+    # Your result evaluation logic here
+    return ctx.output == ctx.expected_output
 ```
@@ -485,6 +492,34 @@ The discovery system will:
 - Use `evaluate_math_agent` as the target function for evaluation
 - Run each case through the target function and evaluate results
 
+#### Name-Based Discovery
+
+The evaluation system uses name-based discovery for all components:
+
+**Target Functions** (exactly one required per eval file):
+- **Purpose**: The main function being evaluated - processes inputs and returns outputs
+- **Naming**: Functions named `target_*` (e.g., `target_my_function`)
+- **Signature**: `def target_name(inputs: InputType) -> OutputType` or `async def target_name(inputs: InputType) -> OutputType`
+- **Example**: `async def target_math_agent(input_text: str) -> str`
+
+**Scoring Functions** (optional):
+- **Purpose**: Determine if evaluation results meet success criteria
+- **Naming**: Functions named `scorer_*` (e.g., `scorer_custom`)
+- **Signature**: `def scorer_name(report: EvaluationReport, dataset: AixDataset, min_score: float = 1.0, verbose: bool = False) -> bool`
+- **Example**: `def scorer_accuracy_threshold(report, dataset, min_score=0.8, verbose=False) -> bool`
+
+**Evaluator Functions** (optional):
+- **Purpose**: Custom evaluation logic for comparing outputs with expected results
+- **Naming**: Functions named `evaluator_*` (e.g., `evaluator_check_output`)
+- **Signature**: `def evaluator_name(ctx: EvaluatorContext) -> EvaluatorOutput` or `async def evaluator_name(ctx: EvaluatorContext) -> EvaluatorOutput`
+- **Example**: `def evaluator_exact_match(ctx) -> EvaluatorOutput`
+
+This name-based approach works seamlessly with both synchronous and asynchronous functions.
+
+#### Scoring System
+
+The framework includes a custom scoring system with [`average_assertions`](aixtools/evals/dataset.py:67) as the default scorer. This scorer checks if the average assertion score meets a minimum threshold and provides detailed pass/fail reporting.
+
 ## Testing & Tools
 
 AIXtools provides comprehensive testing utilities and diagnostic tools for AI agent development and debugging.
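The README section above documents the `scorer_*` signature. A hedged example of a custom scorer that follows it, mirroring the threshold logic of the default `average_assertions` scorer; the name `scorer_half_assertions` is illustrative and not part of the package.

```python
# Hypothetical custom scorer matching the documented scorer_* signature.
from pydantic_evals.reporting import EvaluationReport

from aixtools.evals.dataset import AixDataset


def scorer_half_assertions(
    report: EvaluationReport, dataset: AixDataset, min_score: float = 0.5, verbose: bool = False
) -> bool:
    # Pass if the assertions average reaches the (lower) threshold.
    averages = report.averages()
    score = averages.assertions if averages and averages.assertions is not None else 0.0
    if verbose:
        print(f"{dataset.name}: assertions average {score:.3f} (minimum {min_score:.3f})")
    return score >= min_score
```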
{aixtools-0.2.3.dist-info → aixtools-0.2.5.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
 aixtools/__init__.py,sha256=9NGHm7LjsQmsvjTZvw6QFJexSvAU4bCoN_KBk9SCa00,260
-aixtools/_version.py,sha256=
+aixtools/_version.py,sha256=9wrJ_4Dlc0arUzKiaIqvTY85rMJma3eb1nNlF3uHAxU,704
 aixtools/app.py,sha256=JzQ0nrv_bjDQokllIlGHOV0HEb-V8N6k_nGQH-TEsVU,5227
 aixtools/chainlit.md,sha256=yC37Ly57vjKyiIvK4oUvf4DYxZCwH7iocTlx7bLeGLU,761
 aixtools/context.py,sha256=I_MD40ZnvRm5WPKAKqBUAdXIf8YaurkYUUHSVVy-QvU,598
@@ -38,9 +38,10 @@ aixtools/db/__init__.py,sha256=b8vRhme3egV-aUZbAntnOaDkSXB8UT0Xy5oqQhU_z0Q,399
 aixtools/db/database.py,sha256=caWe95GlxZYlxn2ubDmR-_cQUW0ulkpR3BHunKIaOsw,3369
 aixtools/db/vector_db.py,sha256=be4JGyXj3o8VEfy9L6SO1aAoDET_zazMJkYfjlYHTYQ,4133
 aixtools/evals/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-aixtools/evals/
-aixtools/evals/
-aixtools/evals/
+aixtools/evals/__main__.py,sha256=f6_X5jHFHIR36r_jerj3ol8SLhZB8nVk8vffoRVtxLs,2844
+aixtools/evals/dataset.py,sha256=qsLrx9hgvZWY1FzuwYtm3aJscNth1EkeLWTgIdici5Q,3374
+aixtools/evals/discovery.py,sha256=gHKfutzdtjZPkjxXnD_WYHqL2WMem8kkJpk2HDHIgKg,6488
+aixtools/evals/run_evals.py,sha256=J5sfdfC_2NwQcRP4mGV4FYSpzawRe4pmkOTjUm1IAWU,3278
 aixtools/google/client.py,sha256=8yuv_zEZKlmUTI-zRxAb3vjLUrfiwrBhcpNe0hYsO0g,1078
 aixtools/log_view/__init__.py,sha256=0fWLCq9BMo8GoH3Z5WDgvf0-J2TP0XWqtef0f28SHBA,405
 aixtools/log_view/app.py,sha256=DZp3PUM_iS3DpMHqHfFXVACvbZ9PItbOCNMkDjIOfTc,6595
@@ -88,8 +89,8 @@ aixtools/utils/chainlit/cl_agent_show.py,sha256=vaRuowp4BRvhxEr5hw0zHEJ7iaSF_5bo
 aixtools/utils/chainlit/cl_utils.py,sha256=fxaxdkcZg6uHdM8uztxdPowg3a2f7VR7B26VPY4t-3c,5738
 aixtools/vault/__init__.py,sha256=fsr_NuX3GZ9WZ7dGfe0gp_5-z3URxAfwVRXw7Xyc0dU,141
 aixtools/vault/vault.py,sha256=9dZLWdZQk9qN_Q9Djkofw9LUKnJqnrX5H0fGusVLBhA,6037
-aixtools-0.2.
-aixtools-0.2.
-aixtools-0.2.
-aixtools-0.2.
-aixtools-0.2.
+aixtools-0.2.5.dist-info/METADATA,sha256=BHPUgnHXs7ET3BvwAkPxYRkkXnxLdptFwbYNDkoBMbw,27229
+aixtools-0.2.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+aixtools-0.2.5.dist-info/entry_points.txt,sha256=q8412TG4T0S8K0SKeWp2vkVPIDYQs0jNoHqcQ7qxOiA,155
+aixtools-0.2.5.dist-info/top_level.txt,sha256=wBn-rw9bCtxrR4AYEYgjilNCUVmKY0LWby9Zan2PRJM,9
+aixtools-0.2.5.dist-info/RECORD,,
{aixtools-0.2.3.dist-info → aixtools-0.2.5.dist-info}/WHEEL
File without changes

{aixtools-0.2.3.dist-info → aixtools-0.2.5.dist-info}/entry_points.txt
File without changes

{aixtools-0.2.3.dist-info → aixtools-0.2.5.dist-info}/top_level.txt
File without changes