aixtools 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff shows changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of aixtools might be problematic.

aixtools/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.2.2'
-__version_tuple__ = version_tuple = (0, 2, 2)
+__version__ = version = '0.2.4'
+__version_tuple__ = version_tuple = (0, 2, 4)
 
 __commit_id__ = commit_id = None
aixtools/auth/__init__.py ADDED
File without changes
aixtools/auth/auth.py ADDED
@@ -0,0 +1,70 @@
+"""
+Module that manages OAuth2 functions for authentication
+"""
+
+import logging
+
+import jwt
+from jwt import ExpiredSignatureError, InvalidAudienceError, InvalidIssuerError, PyJWKClient
+
+from aixtools.utils import config
+
+logger = logging.getLogger(__name__)
+
+
+class AuthTokenError(Exception):
+    """Exception raised for authentication token errors."""
+
+
+# pylint: disable=too-few-public-methods
+class AccessTokenVerifier:
+    """
+    Verifies Microsoft SSO JWT token against the configured Tenant ID, Audience, API ID and Issuer URL.
+    """
+
+    def __init__(self):
+        tenant_id = config.APP_TENANT_ID
+        self.api_id = config.APP_API_ID
+        self.issuer_url = config.APP_ISSUER_URL
+        # Azure AD endpoints
+        jwks_url = f"https://login.microsoftonline.com/{tenant_id}/discovery/v2.0/keys"
+        self.jwks_client = PyJWKClient(
+            uri=jwks_url,
+            # cache keys url response to reduce SSO server network calls,
+            # as public keys are not expected to change frequently
+            cache_jwk_set=True,
+            # cache resolved public keys
+            cache_keys=True,
+            # cache url response for 10 hours
+            lifespan=36000,
+        )
+
+        logger.info("Using JWKS: %s", jwks_url)
+
+    def verify(self, token: str) -> dict:
+        """
+        Verifies The JWT access token and returns decoded claims as a dictionary if the token is
+        valid, otherwise raises an AuthTokenError
+        """
+        try:
+            signing_key = self.jwks_client.get_signing_key_from_jwt(token)
+
+            claims = jwt.decode(
+                token,
+                signing_key.key,
+                algorithms=["RS256"],
+                audience=self.api_id,
+                issuer=self.issuer_url,
+                # ensure audience verification is carried out
+                options={"verify_aud": True},
+            )
+            return claims
+
+        except ExpiredSignatureError as e:
+            raise AuthTokenError("Token expired") from e
+        except InvalidAudienceError as e:
+            raise AuthTokenError(f"Token not for expected audience: {e}") from e
+        except InvalidIssuerError as e:
+            raise AuthTokenError(f"Token not for expected issuer: {e}") from e
+        except jwt.exceptions.PyJWTError as e:
+            raise AuthTokenError(f"Invalid token: {e}") from e
aixtools/evals/evals.py → aixtools/evals/__main__.py RENAMED
@@ -11,8 +11,8 @@ import asyncio
 import sys
 from pathlib import Path
 
-from .discovery import discover_all_datasets, find_eval_files
-from .run_evals import run_all_evaluations_and_print_results
+from aixtools.evals.discovery import discover_all_datasets, find_eval_files  # pylint: disable=E0401
+from aixtools.evals.run_evals import run_all_evaluations_and_print_results  # pylint: disable=E0401
 
 
 async def main():
@@ -24,8 +24,8 @@ async def main():
     parser.add_argument(
         "--filter", type=str, help="Filter to run only matching evaluations (matches module, file, or dataset names)"
     )
-    parser.add_argument("--include-input", action="store_true", help="Include input in report output")
-    parser.add_argument("--include-output", action="store_true", help="Include output in report output")
+    parser.add_argument("--include-input", action="store_true", default=True, help="Include input in report output")
+    parser.add_argument("--include-output", action="store_true", default=True, help="Include output in report output")
     parser.add_argument(
         "--include-evaluator-failures", action="store_true", help="Include evaluator failures in report output"
     )
aixtools/evals/dataset.py ADDED
@@ -0,0 +1,87 @@
+"""Custom dataset and evaluation utilities for AixTools.
+
+This module provides wrapper classes and decorators for building and running
+evaluations using the pydantic-evals framework. It includes a custom Dataset
+class, decorators for marking target functions, scorers, and evaluators, and
+a default scoring function based on assertion averages.
+"""
+
+from typing import Awaitable, Callable, Generic
+
+from pydantic import BaseModel
+from pydantic_evals.dataset import Case, Dataset, InputsT, MetadataT, OutputT
+from pydantic_evals.evaluators import Evaluator
+from pydantic_evals.reporting import EvaluationReport
+
+TargetT = Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT]
+ScorerT = Callable[[EvaluationReport, "AixDataset", float, bool], bool]
+
+
+class AixDataset(BaseModel, Generic[InputsT, OutputT, MetadataT]):
+    """Custom Dataset class for AixTools evaluations."""
+
+    dataset: Dataset[InputsT, OutputT]
+    name: str
+    target_func: TargetT
+    scorers: list[ScorerT]
+
+    def __init__(  # pylint: disable=R0913,R0917
+        self,
+        cases: list[Case[InputsT, OutputT]],
+        target_func: TargetT,
+        evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] | None = None,
+        name: str | None = None,
+        scoring_funcs: list[ScorerT] | None = None,
+    ):
+        super().__init__(
+            dataset=Dataset(cases=cases, evaluators=evaluators or []),
+            target_func=target_func,
+            name=name or "dataset",
+            scorers=scoring_funcs or [average_assertions],
+        )
+
+    @property
+    def cases(self) -> list[Case[InputsT, OutputT]]:
+        """Return the list of cases in the dataset."""
+        return self.dataset.cases
+
+    @property
+    def evaluators(self) -> list[Evaluator[InputsT, OutputT, MetadataT]]:
+        """Return the list of evaluators in the dataset."""
+        return self.dataset.evaluators
+
+    async def evaluate(
+        self,
+    ) -> EvaluationReport:
+        """Run the evaluation using the target function and return an EvaluationReport."""
+        return await self.dataset.evaluate(self.target_func)
+
+
+# Decorators removed - using name-based discovery only for simplicity and async compatibility
+# Functions should be named with prefixes: target_, scorer_, evaluator_
+
+
+def average_assertions(
+    report: EvaluationReport, dataset: "AixDataset", min_score: float = 1.0, verbose: bool = False
+) -> bool:
+    """Scoring function that checks if the average assertions meet a minimum threshold."""
+    averages = report.averages()
+    if averages and averages.assertions is not None:
+        success = averages.assertions >= min_score
+        if verbose:
+            print(f"\nAssertions Summary for {dataset.name}:")
+            print(f" Assertions Average: {averages.assertions:.3f}")
+            print(f" Minimum Required: {min_score:.3f}")
+            print(f" Status: {'PASSED' if success else 'FAILED'}")
+        else:
+            print(f"{'PASSED' if success else 'FAILED'} ({averages.assertions:.3f})")
+    else:
+        success = False
+        if verbose:
+            print(f"\nAssertions Summary for {dataset.name}:")
+            print(" No assertions found or evaluation failed")
+            print(f" Minimum Required: {min_score:.3f}")
+            print(" Status: FAILED")
+        else:
+            print("FAILED (no assertions)")
+    return success
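A hedged sketch of constructing the new `AixDataset` directly (the case and function names here are invented for illustration):

```python
# Illustration only: builds an AixDataset as defined in aixtools/evals/dataset.py above.
from pydantic_evals.dataset import Case

from aixtools.evals.dataset import AixDataset, average_assertions


async def target_echo(text: str) -> str:
    return text  # trivial target function


dataset_echo = AixDataset(
    cases=[Case(name="roundtrip", inputs="hello", expected_output="hello")],
    target_func=target_echo,
    name="eval_demo.dataset_echo",
    scoring_funcs=[average_assertions],  # same as the default
)

# report = await dataset_echo.evaluate()  # yields a pydantic-evals EvaluationReport
```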
aixtools/evals/discovery.py CHANGED
@@ -9,10 +9,14 @@ import inspect
 import sys
 import traceback
 from pathlib import Path
-from typing import Any
+from typing import Any, TypeVar
 
 from pydantic_evals.dataset import Dataset
 
+from aixtools.evals.dataset import AixDataset  # pylint: disable=E0401
+
+SpecialFuncT = TypeVar("SpecialFuncT")
+
 
 def find_eval_files(evals_dir: Path) -> list[Path]:
     """Find all eval_*.py files in the evals directory."""
@@ -33,7 +37,7 @@ def find_datasets_in_module(module: Any) -> list[tuple[str, Dataset]]:
     datasets = []
 
     for name, obj in inspect.getmembers(module):
-        if name.startswith("dataset_") and isinstance(obj, Dataset):
+        if name.startswith("dataset_") and isinstance(obj, (Dataset, AixDataset)):
             datasets.append((name, obj))
 
     return datasets
@@ -66,66 +70,74 @@ def matches_filter(module_name: str, file_name: str, dataset_name: str, name_fil
     )
 
 
-def find_target_function(module: Any) -> Any | None:
-    """Find the first async function in a module that doesn't start with underscore."""
+def find_prefixed_functions(module: Any, prefix: str) -> list[Any]:
+    """Find all functions with a specific prefix (name-based discovery only)."""
+    funcs = []
     for name, obj in inspect.getmembers(module):
-        if inspect.iscoroutinefunction(obj) and not name.startswith("_"):
-            return obj
-    return None
+        if name.startswith(prefix) and (inspect.isfunction(obj) or inspect.iscoroutinefunction(obj)):
+            funcs.append(obj)  # Return function directly, no decorator wrapping
 
+    return funcs
 
-def get_async_function_names(module: Any) -> list[str]:
-    """Get names of all async functions in a module that don't start with underscore."""
-    return [
-        name
-        for name, obj in inspect.getmembers(module)
-        if inspect.iscoroutinefunction(obj) and not name.startswith("_")
-    ]
+
+def print_v(message: str, verbose: bool) -> None:
+    """Print message if verbose is enabled."""
+    if verbose:
+        print(message)
 
 
 def process_datasets_from_module(
     module: Any, eval_file: Path, name_filter: str | None, verbose: bool
-) -> list[tuple[str, Dataset, Any]]:
+) -> list[AixDataset]:
     """Process all datasets from a single module and return valid dataset tuples."""
     datasets = find_datasets_in_module(module)
-    if verbose:
-        print(f" Found {len(datasets)} datasets: {[name for name, _ in datasets]}")
+
+    print_v(f" Found {len(datasets)} datasets: {[name for name, _ in datasets]}", verbose)
 
     valid_datasets = []
 
+    targets = find_prefixed_functions(module, "target_")
+    scorers = find_prefixed_functions(module, "scorer_")
+    evaluators = find_prefixed_functions(module, "evaluator_")
+
+    print_v(f" Found target functions: {[f.__name__ for f in targets]}", verbose)
+    print_v(f" Found scoring functions: {[f.__name__ for f in scorers]}", verbose)
+    print_v(f" Found evaluator functions: {[f.__name__ for f in evaluators]}", verbose)
+
    for dataset_name, dataset in datasets:
        full_name = f"{eval_file.stem}.{dataset_name}"
 
        if not matches_filter(module.__name__, eval_file.stem, dataset_name, name_filter):
-            if verbose:
-                print(f" ✗ Skipping dataset: {dataset_name} (doesn't match filter: {name_filter})")
+            print_v(f" ✗ Skipping dataset: {dataset_name} (doesn't match filter: {name_filter})", verbose)
            continue
 
-        if verbose:
-            print(f" ✓ Including dataset: {dataset_name}")
+        print_v(f" ✓ Including dataset: {dataset_name}", verbose)
 
-        # Find the target function
-        target_function = find_target_function(module)
-        async_functions = get_async_function_names(module)
+        if isinstance(dataset, Dataset):
+            # Wrap in AixDataset if not already
 
-        if verbose:
-            print(f" Found async functions: {async_functions}")
-            if target_function:
-                print(f" Using target function: {target_function.__name__}")
+            if len(targets) != 1:
+                print_v(
+                    f" ✗ Skipping dataset: {dataset_name} (has {len(targets)} target functions, expected exactly 1)",
+                    verbose,
+                )
 
-        if target_function is None:
-            if verbose:
-                print(f"Warning: No async function found in {eval_file.name} for dataset {dataset_name}")
-            continue
+                continue
+
+            dataset = AixDataset(  # noqa: PLW2901
+                cases=dataset.cases,
+                evaluators=dataset.evaluators,  # evaluators are plain functions now
+                name=full_name,
+                target_func=targets[0],  # target function is used directly
+                scoring_funcs=scorers,  # scorers are plain functions now
+            )
 
-        valid_datasets.append((full_name, dataset, target_function))
+        valid_datasets.append(dataset)
 
    return valid_datasets
 
 
-def discover_all_datasets(
-    eval_files: list[Path], name_filter: str | None, verbose: bool
-) -> list[tuple[str, Dataset, Any]]:
+def discover_all_datasets(eval_files: list[Path], name_filter: str | None, verbose: bool) -> list[AixDataset]:
     """Discover all datasets from eval files."""
     all_datasets = []
 
@@ -141,7 +153,7 @@ def discover_all_datasets(
             datasets = process_datasets_from_module(module, eval_file, name_filter, verbose)
             all_datasets.extend(datasets)
 
-        except Exception as e:  # pylint: disable=W0718
+        except Exception as e:  # pylint: disable=broad-exception-caught
             if verbose:
                 print(f"Error loading {eval_file}: {e}")
                 print(f" Traceback: {traceback.format_exc()}")
@@ -162,9 +174,9 @@ def discover_all_datasets(
     print(f"\n{'=' * 60}")
     print("Datasets to Evaluate:")
     print(f"{'=' * 60}")
-    for i, (dataset_name, dataset, target_function) in enumerate(all_datasets, 1):
-        print(f"{i}. {dataset_name}")
-        print(f" Target function: {target_function.__name__}")
+    for i, (dataset) in enumerate(all_datasets, 1):
+        print(f"{i}. {dataset.name}")
+        print(f" Target function: {dataset.target_func.__name__}")
         print(f" Cases: {len(dataset.cases)}")
         print(f" Evaluators: {len(dataset.evaluators)}")
     print(f"{'=' * 60}")
aixtools/evals/run_evals.py CHANGED
@@ -5,30 +5,29 @@ This module handles running evaluations and printing results.
 """
 
 import sys
-from typing import Any
 
-from pydantic_evals.dataset import Dataset
+from pydantic_evals.reporting import EvaluationReport
 
+from aixtools.evals.dataset import AixDataset  # pylint: disable=E0401
 
-async def run_dataset_evaluation(  # noqa: PLR0913, pylint: disable=too-many-arguments,too-many-positional-arguments
-    dataset_name: str,
-    dataset: Dataset,
-    target_function: Any,
+
+async def run_dataset_evaluation(
+    dataset: AixDataset,
     print_options: dict[str, bool],
     min_assertions: float,
     verbose: bool = False,
-) -> tuple[str, bool]:
-    """Run evaluation for a single dataset and return (name, success)."""
+) -> tuple[str, bool, EvaluationReport | None]:
+    """Run evaluation for a single dataset and return (name, success, report)."""
     if verbose:
         print(f"\n{'=' * 60}")
-        print(f"Running evaluation: {dataset_name}")
+        print(f"Running evaluation: {dataset.name}")
         print(f"{'=' * 60}")
     else:
-        print(f"Running {dataset_name}...", end=" ")
+        print(f"Running {dataset.name}...", end=" ")
 
     try:
         # Execute the evaluation
-        report = await dataset.evaluate(target_function)
+        report = await dataset.evaluate()
 
         # Print the results
         report.print(
@@ -38,60 +37,49 @@ async def run_dataset_evaluation(  # noqa: PLR0913, pylint: disable=too-many-arg
             include_reasons=print_options["include_reasons"],
         )
 
-        # Check if evaluation passed based on assertions average
-        averages = report.averages()
-        if averages and averages.assertions is not None:
-            success = averages.assertions >= min_assertions
-            if verbose:
-                print(f"\nEvaluation Summary for {dataset_name}:")
-                print(f" Assertions Average: {averages.assertions:.3f}")
-                print(f" Minimum Required: {min_assertions:.3f}")
-                print(f" Status: {'PASSED' if success else 'FAILED'}")
-            else:
-                print(f"{'PASSED' if success else 'FAILED'} ({averages.assertions:.3f})")
-        else:
-            success = False
-            if verbose:
-                print(f"\nEvaluation Summary for {dataset_name}:")
-                print(" No assertions found or evaluation failed")
-                print(f" Minimum Required: {min_assertions:.3f}")
-                print(" Status: FAILED")
-            else:
-                print("FAILED (no assertions)")
+        success = all(scorer(report, dataset, min_assertions, verbose) for scorer in dataset.scorers)
 
-        return dataset_name, success
+        return dataset.name, success, report
 
     except Exception as e:  # pylint: disable=broad-exception-caught
         if verbose:
-            print(f"Error running evaluation {dataset_name}: {e}")
+            print(f"Error running evaluation {dataset.name}: {e}")
         else:
             print(f"ERROR ({e})")
-        return dataset_name, False
+        return dataset.name, False, None
 
 
 async def run_all_evaluations_and_print_results(
-    datasets: list[tuple[str, Dataset, Any]], print_options: dict[str, bool], min_assertions: float, verbose: bool
+    datasets: list[AixDataset], print_options: dict[str, bool], min_assertions: float, verbose: bool
 ) -> None:
     """Run all evaluations and print results with summary."""
     # Run all evaluations
     results = []
-    for dataset_name, dataset, target_function in datasets:
-        result = await run_dataset_evaluation(
-            dataset_name, dataset, target_function, print_options, min_assertions, verbose
-        )
+    for dataset in datasets:
+        result = await run_dataset_evaluation(dataset, print_options, min_assertions, verbose)
         results.append(result)
 
+    # Print reports
+    for _, _, report in results:
+        if report:
+            report.print(
+                include_input=print_options["include_input"],
+                include_output=print_options["include_output"],
+                include_evaluator_failures=print_options["include_evaluator_failures"],
+                include_reasons=print_options["include_reasons"],
+            )
+
     # Print summary
-    passed = sum(1 for _, success in results if success)
+    passed = sum(1 for _, success, _ in results if success)
     total = len(results)
-    failed_results = [(name, success) for name, success in results if not success]
+    failed_results = [(name, success, _) for name, success, _ in results if not success]
 
     if verbose:
         print(f"\n{'=' * 60}")
         print("EVALUATION SUMMARY")
         print(f"{'=' * 60}")
 
-        for name, success in results:
+        for name, success, _ in results:
             status = "PASSED" if success else "FAILED"
             print(f" {name}: {status}")
 
@@ -99,7 +87,7 @@ async def run_all_evaluations_and_print_results(
     # Only show failed evaluations when not verbose
     elif failed_results:
         print("\nFailed evaluations:")
-        for name, _ in failed_results:
+        for name, _, _ in failed_results:
             print(f" {name}: FAILED")
 
     # Exit with non-zero code if any evaluations failed
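The new result shape is a three-tuple of `(dataset.name, success, report-or-None)`. A hedged sketch of driving the refactored discovery and run APIs end to end (the directory name and option values are illustrative):

```python
# Hypothetical driver for the refactored discovery + run_evals API shown above.
import asyncio
from pathlib import Path

from aixtools.evals.discovery import discover_all_datasets, find_eval_files
from aixtools.evals.run_evals import run_all_evaluations_and_print_results


async def main() -> None:
    datasets = discover_all_datasets(find_eval_files(Path("evals")), None, verbose=False)
    await run_all_evaluations_and_print_results(
        datasets,
        print_options={
            "include_input": True,
            "include_output": True,
            "include_evaluator_failures": False,
            "include_reasons": False,
        },
        min_assertions=1.0,  # matches the CLI default
        verbose=False,
    )


asyncio.run(main())  # exits non-zero if any evaluation failed
```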
aixtools/utils/config.py CHANGED
@@ -56,7 +56,6 @@ else:
     logging.error("No '.env' file found in any of the search paths, or their parents: %s", env_dirs)
     sys.exit(1)
 
-
 # ---
 # Directories
 # ---
@@ -124,7 +123,17 @@ GOOGLE_CLOUD_LOCATION = get_variable_env("GOOGLE_CLOUD_LOCATION", True)
 
 # vault parameters.
 VAULT_ADDRESS = get_variable_env("VAULT_ADDRESS", default="http://localhost:8200")
-VAULT_TOKEN = get_variable_env("VAULT_TOKEN", default="vault-token")
-VAULT_ENV = get_variable_env("ENV", default="dev")
-VAULT_MOUNT_POINT = get_variable_env("VAULT_MOUNT_POINT", default="secret")
-VAULT_PATH_PREFIX = get_variable_env("VAULT_PATH_PREFIX", default="path")
+VAULT_TOKEN = get_variable_env("VAULT_TOKEN", allow_empty=True)
+VAULT_ENV = get_variable_env("ENV", allow_empty=True)
+VAULT_MOUNT_POINT = get_variable_env("VAULT_MOUNT_POINT", allow_empty=True)
+VAULT_PATH_PREFIX = get_variable_env("VAULT_PATH_PREFIX", allow_empty=True)
+
+# OAuth parameters
+APP_SECRET_ID = get_variable_env("APP_SECRET_ID")
+APP_CLIENT_ID = get_variable_env("APP_CLIENT_ID")
+
+# used for token audience check
+APP_API_ID = get_variable_env("APP_API_ID")
+APP_TENANT_ID = get_variable_env("APP_TENANT_ID")
+# used for token issuer check
+APP_ISSUER_URL = get_variable_env("APP_ISSUER_URL")
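Behavior note on the Vault settings: the placeholder defaults are gone, and with `allow_empty=True` these values can presumably come back empty when unset, so callers should guard before using them. A hedged sketch of that assumption (the helper name is hypothetical, and a `.env` file is assumed to be present since config exits without one):

```python
# Illustration only: VAULT_* settings no longer fall back to dummy defaults,
# so code using them should handle the unset/empty case explicitly
# (assumed semantics of allow_empty=True in get_variable_env).
from aixtools.utils import config

if config.VAULT_TOKEN:
    connect_to_vault(config.VAULT_ADDRESS, config.VAULT_TOKEN)  # hypothetical helper
else:
    print("Vault not configured; skipping secret lookup")
```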
aixtools-0.2.2.dist-info/METADATA → aixtools-0.2.4.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aixtools
-Version: 0.2.2
+Version: 0.2.4
 Summary: Tools for AI exploration and debugging
 Requires-Python: >=3.11.2
 Description-Content-Type: text/markdown
@@ -20,6 +20,7 @@ Requires-Dist: mypy>=1.18.2
 Requires-Dist: pandas>=2.2.3
 Requires-Dist: pydantic-evals>=0.4.10
 Requires-Dist: pydantic-ai>=1.0.9
+Requires-Dist: pyjwt>=2.10.1
 Requires-Dist: pylint>=3.3.7
 Requires-Dist: rich>=14.0.0
 Requires-Dist: ruff>=0.11.6
@@ -415,30 +416,30 @@ By default, the "FaultyMCP" includes several tools you can use in your tests:
 
 ### Evals
 
-Run comprehensive Agent/LLM evaluations using the built-in evaluation discovery based on Pydantic-AI framework.
+Run comprehensive Agent/LLM evaluations using the built-in evaluation discovery based on Pydantic-AI framework with AIXtools enhancements.
 
 ```bash
 # Run all evaluations
-evals
+python -m aixtools.evals
 
 # Run evaluations with filtering
-evals --filter "specific_test"
+python -m aixtools.evals --filter "specific_test"
 
 # Run with verbose output and detailed reporting
-evals --verbose --include-input --include-output --include-reasons
+python -m aixtools.evals --verbose --include-input --include-output --include-reasons
 
 # Specify custom evaluations directory
-evals --evals-dir /path/to/evals
+python -m aixtools.evals --evals-dir /path/to/evals
 
 # Set minimum assertions threshold
-evals --min-assertions 0.8
+python -m aixtools.evals --min-assertions 0.8
 ```
 
 **Command Line Options:**
 - `--evals-dir` - Directory containing eval_*.py files (default: evals)
 - `--filter` - Filter to run only matching evaluations
-- `--include-input` - Include input in report output
-- `--include-output` - Include output in report output
+- `--include-input` - Include input in report output (default: True)
+- `--include-output` - Include output in report output (default: True)
 - `--include-evaluator-failures` - Include evaluator failures in report
 - `--include-reasons` - Include reasons in report output
 - `--min-assertions` - Minimum assertions average required for success (default: 1.0)
@@ -446,14 +447,16 @@ evals --min-assertions 0.8
 
 The evaluation system discovers and runs all Dataset objects from eval_*.py files in the specified directory, similar to test runners but specifically designed for LLM evaluations using pydantic_evals.
 
-**Discovery Mechanism:**
+**Discovery Mechanism**
 
-The evaluation framework uses an automatic discovery system that:
+The evaluation framework uses an automatic discovery system:
 
 1. **File Discovery**: Scans the specified directory for files matching the pattern `eval_*.py`
 2. **Dataset Discovery**: Within each file, looks for variables named `dataset_*` that are instances of `pydantic_evals.Dataset`
-3. **Target Function Discovery**: Automatically finds the first async function in each module that doesn't start with an underscore (`_`) to use as the evaluation target
-4. **Filtering**: Supports filtering by module name, file name, dataset name, or fully qualified name
+3. **Target Function Discovery**: Within the same file, looks for a function or async function named `target_*`. There must be exactly one target function per file.
+4. **Function Discovery**: Looks for functions with specific prefixes:
+   - Functions prefixed with `scorer_*` or `evaluator_*` supply custom scorer and evaluator functions that will be used for each dataset in that file
+5. **Filtering**: Supports filtering by module name, file name, dataset name, or fully qualified name
 
 **Example Evaluation File Structure:**
 ```python
@@ -471,11 +474,16 @@ dataset_addition = Dataset(
 )
 
 # This function will be used as the evaluation target
-async def evaluate_math_agent(input_text: str) -> str:
-    # Your agent evaluation logic here
+async def target_math_agent(input_text: str) -> str:
+    # Your agent run logic here
     agent = get_agent(system_prompt="You are a math assistant.")
     result, _ = await run_agent(agent, input_text)
     return result
+
+# This function will be used as evaluator for all datasets (optional)
+def evaluator_check_output(ctx: EvaluatorContext) -> bool:
+    # Your result evaluation logic here
+    return ctx.output == ctx.expected_output
 ```
 
 
@@ -484,6 +492,34 @@ The discovery system will:
 - Use `evaluate_math_agent` as the target function for evaluation
 - Run each case through the target function and evaluate results
 
+#### Name-Based Discovery
+
+The evaluation system uses name-based discovery for all components:
+
+**Target Functions** (exactly one required per eval file):
+- **Purpose**: The main function being evaluated - processes inputs and returns outputs
+- **Naming**: Functions named `target_*` (e.g., `target_my_function`)
+- **Signature**: `def target_name(inputs: InputType) -> OutputType` or `async def target_name(inputs: InputType) -> OutputType`
+- **Example**: `async def target_math_agent(input_text: str) -> str`
+
+**Scoring Functions** (optional):
+- **Purpose**: Determine if evaluation results meet success criteria
+- **Naming**: Functions named `scorer_*` (e.g., `scorer_custom`)
+- **Signature**: `def scorer_name(report: EvaluationReport, dataset: AixDataset, min_score: float = 1.0, verbose: bool = False) -> bool`
+- **Example**: `def scorer_accuracy_threshold(report, dataset, min_score=0.8, verbose=False) -> bool`
+
+**Evaluator Functions** (optional):
+- **Purpose**: Custom evaluation logic for comparing outputs with expected results
+- **Naming**: Functions named `evaluator_*` (e.g., `evaluator_check_output`)
+- **Signature**: `def evaluator_name(ctx: EvaluatorContext) -> EvaluatorOutput` or `async def evaluator_name(ctx: EvaluatorContext) -> EvaluatorOutput`
+- **Example**: `def evaluator_exact_match(ctx) -> EvaluatorOutput`
+
+This name-based approach works seamlessly with both synchronous and asynchronous functions.
+
+#### Scoring System
+
+The framework includes a custom scoring system with [`average_assertions`](aixtools/evals/dataset.py:67) as the default scorer. This scorer checks if the average assertion score meets a minimum threshold and provides detailed pass/fail reporting.
+
 ## Testing & Tools
 
 AIXtools provides comprehensive testing utilities and diagnostic tools for AI agent development and debugging.
aixtools-0.2.2.dist-info/RECORD → aixtools-0.2.4.dist-info/RECORD RENAMED
@@ -1,5 +1,5 @@
 aixtools/__init__.py,sha256=9NGHm7LjsQmsvjTZvw6QFJexSvAU4bCoN_KBk9SCa00,260
-aixtools/_version.py,sha256=o3ZTescp-19Z9cvBGq9dQnbppljgzdUYUf98Nov0spY,704
+aixtools/_version.py,sha256=NRw4Jle4n9v_DD2wtplRqflGCvX8OU5eAjycYY0vY3Y,704
 aixtools/app.py,sha256=JzQ0nrv_bjDQokllIlGHOV0HEb-V8N6k_nGQH-TEsVU,5227
 aixtools/chainlit.md,sha256=yC37Ly57vjKyiIvK4oUvf4DYxZCwH7iocTlx7bLeGLU,761
 aixtools/context.py,sha256=I_MD40ZnvRm5WPKAKqBUAdXIf8YaurkYUUHSVVy-QvU,598
@@ -30,15 +30,18 @@ aixtools/agents/agent.py,sha256=tceQByn-RGBIhW8BOjKoP0yhNzZLwAa6CxwhPhRe3PU,7270
 aixtools/agents/agent_batch.py,sha256=0Zu9yNCRPAQZPjXQ-dIUAmP1uGTVbxVt7xvnMpoJMjU,2251
 aixtools/agents/print_nodes.py,sha256=wVTngNfqM0As845WTRz6G3Rei_Gr3HuBlvu-G_eXuig,1665
 aixtools/agents/prompt.py,sha256=p9OYnyJ4-MyGXwHPrQeJBhZ2a3RV2HqhtdUUCrTMsAQ,3361
+aixtools/auth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+aixtools/auth/auth.py,sha256=aKYCKJRjSNrVZmIWN2h2p1zYqkhMLLBXBfk_Qy5NKik,2365
 aixtools/compliance/__init__.py,sha256=vnw0zEdySIJWvDAJ8DCRRaWmY_agEOz1qlpAdhmtiuo,191
 aixtools/compliance/private_data.py,sha256=OOM9mIp3_w0fNgj3VAEWBl7-jrPc19_Ls1pC5dfF5UY,5323
 aixtools/db/__init__.py,sha256=b8vRhme3egV-aUZbAntnOaDkSXB8UT0Xy5oqQhU_z0Q,399
 aixtools/db/database.py,sha256=caWe95GlxZYlxn2ubDmR-_cQUW0ulkpR3BHunKIaOsw,3369
 aixtools/db/vector_db.py,sha256=be4JGyXj3o8VEfy9L6SO1aAoDET_zazMJkYfjlYHTYQ,4133
 aixtools/evals/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-aixtools/evals/discovery.py,sha256=nKBMHuM3Q87GFY4U0QXvU-zXmJjR-bnmlwf5cfp5E9s,5907
-aixtools/evals/evals.py,sha256=3oJ6_HjojLOzG8XxdfdTYFk-gxep41nk_viHTsUwFNo,2738
-aixtools/evals/run_evals.py,sha256=oJpGIPF5avq1r275Yd_RJyxUiUgOd00LItdKXfGlAbA,3910
+aixtools/evals/__main__.py,sha256=f6_X5jHFHIR36r_jerj3ol8SLhZB8nVk8vffoRVtxLs,2844
+aixtools/evals/dataset.py,sha256=qsLrx9hgvZWY1FzuwYtm3aJscNth1EkeLWTgIdici5Q,3374
+aixtools/evals/discovery.py,sha256=gHKfutzdtjZPkjxXnD_WYHqL2WMem8kkJpk2HDHIgKg,6488
+aixtools/evals/run_evals.py,sha256=J5sfdfC_2NwQcRP4mGV4FYSpzawRe4pmkOTjUm1IAWU,3278
 aixtools/google/client.py,sha256=8yuv_zEZKlmUTI-zRxAb3vjLUrfiwrBhcpNe0hYsO0g,1078
 aixtools/log_view/__init__.py,sha256=0fWLCq9BMo8GoH3Z5WDgvf0-J2TP0XWqtef0f28SHBA,405
 aixtools/log_view/app.py,sha256=DZp3PUM_iS3DpMHqHfFXVACvbZ9PItbOCNMkDjIOfTc,6595
@@ -76,7 +79,7 @@ aixtools/tools/doctor/mcp_tool_doctor.py,sha256=sX2q5GfNkmUYxnXrqMpeGIwGfeL1LpYJ
 aixtools/tools/doctor/tool_doctor.py,sha256=EY1pshjLGLD0j6cc1ZFtbc0G19I5IbOZwHFDqypE49Q,2661
 aixtools/tools/doctor/tool_recommendation.py,sha256=LYyVOSXdAorWiY4P-ucSA1vLlV5BTEfX4GzBXNE_X0M,1569
 aixtools/utils/__init__.py,sha256=xT6almZBQYMfj4h7Hq9QXDHyVXbOOTxqLsmJsxYYnSw,757
-aixtools/utils/config.py,sha256=JeUbGls1womGZWIp6gPBT0IoAfrljpscKEoKx2eBXjw,4819
+aixtools/utils/config.py,sha256=t32731F53Cv1YYoX95wksoreE0Zn0B8UKyEiKWne4ec,5147
 aixtools/utils/config_util.py,sha256=3Ya4Qqhj1RJ1qtTTykQ6iayf5uxlpigPXgEJlTi1wn4,2229
 aixtools/utils/enum_with_description.py,sha256=zjSzWxG74eR4x7dpmb74pLTYCWNSMvauHd7_9LpDYIw,1088
 aixtools/utils/files.py,sha256=8JnxwHJRJcjWCdFpjzWmo0po2fRg8esj4H7sOxElYXU,517
@@ -86,8 +89,8 @@ aixtools/utils/chainlit/cl_agent_show.py,sha256=vaRuowp4BRvhxEr5hw0zHEJ7iaSF_5bo
 aixtools/utils/chainlit/cl_utils.py,sha256=fxaxdkcZg6uHdM8uztxdPowg3a2f7VR7B26VPY4t-3c,5738
 aixtools/vault/__init__.py,sha256=fsr_NuX3GZ9WZ7dGfe0gp_5-z3URxAfwVRXw7Xyc0dU,141
 aixtools/vault/vault.py,sha256=9dZLWdZQk9qN_Q9Djkofw9LUKnJqnrX5H0fGusVLBhA,6037
-aixtools-0.2.2.dist-info/METADATA,sha256=uF-hTQvikYFOiybcQY5Dj1Vc20ubJndbWKB8aytBo6c,24951
-aixtools-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-aixtools-0.2.2.dist-info/entry_points.txt,sha256=q8412TG4T0S8K0SKeWp2vkVPIDYQs0jNoHqcQ7qxOiA,155
-aixtools-0.2.2.dist-info/top_level.txt,sha256=wBn-rw9bCtxrR4AYEYgjilNCUVmKY0LWby9Zan2PRJM,9
-aixtools-0.2.2.dist-info/RECORD,,
+aixtools-0.2.4.dist-info/METADATA,sha256=EzZB-SOZLdj5QZDkk3YPu0PCipJOiAKT08xJNWKENfg,27229
+aixtools-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+aixtools-0.2.4.dist-info/entry_points.txt,sha256=q8412TG4T0S8K0SKeWp2vkVPIDYQs0jNoHqcQ7qxOiA,155
+aixtools-0.2.4.dist-info/top_level.txt,sha256=wBn-rw9bCtxrR4AYEYgjilNCUVmKY0LWby9Zan2PRJM,9
+aixtools-0.2.4.dist-info/RECORD,,