aixtools-0.2.3-py3-none-any.whl → aixtools-0.2.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


aixtools/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID
 
- __version__ = version = '0.2.3'
- __version_tuple__ = version_tuple = (0, 2, 3)
+ __version__ = version = '0.2.5'
+ __version_tuple__ = version_tuple = (0, 2, 5)
 
  __commit_id__ = commit_id = None
@@ -11,8 +11,8 @@ import asyncio
  import sys
  from pathlib import Path
 
- from .discovery import discover_all_datasets, find_eval_files
- from .run_evals import run_all_evaluations_and_print_results
+ from aixtools.evals.discovery import discover_all_datasets, find_eval_files # pylint: disable=E0401
+ from aixtools.evals.run_evals import run_all_evaluations_and_print_results # pylint: disable=E0401
 
 
  async def main():
@@ -24,8 +24,8 @@ async def main():
  parser.add_argument(
  "--filter", type=str, help="Filter to run only matching evaluations (matches module, file, or dataset names)"
  )
- parser.add_argument("--include-input", action="store_true", help="Include input in report output")
- parser.add_argument("--include-output", action="store_true", help="Include output in report output")
+ parser.add_argument("--include-input", action="store_true", default=True, help="Include input in report output")
+ parser.add_argument("--include-output", action="store_true", default=True, help="Include output in report output")
  parser.add_argument(
  "--include-evaluator-failures", action="store_true", help="Include evaluator failures in report output"
  )
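A side note on the `default=True` additions in this hunk: with `argparse`, `action="store_true"` already defaults the destination to `False`, and no `--no-...` counterpart is generated, so combining it with `default=True` leaves the option `True` whether or not the flag is passed. A minimal standalone sketch (not part of the package) illustrating that behaviour:

```python
# Minimal sketch: argparse behaviour when action="store_true" is combined
# with default=True. The destination is True in both cases, so the flag
# can no longer be switched off from the command line.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--include-input", action="store_true", default=True)

print(parser.parse_args([]).include_input)                   # True (flag omitted)
print(parser.parse_args(["--include-input"]).include_input)  # True (flag given)
```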
@@ -0,0 +1,87 @@
+ """Custom dataset and evaluation utilities for AixTools.
+
+ This module provides wrapper classes and decorators for building and running
+ evaluations using the pydantic-evals framework. It includes a custom Dataset
+ class, decorators for marking target functions, scorers, and evaluators, and
+ a default scoring function based on assertion averages.
+ """
+
+ from typing import Awaitable, Callable, Generic
+
+ from pydantic import BaseModel
+ from pydantic_evals.dataset import Case, Dataset, InputsT, MetadataT, OutputT
+ from pydantic_evals.evaluators import Evaluator
+ from pydantic_evals.reporting import EvaluationReport
+
+ TargetT = Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT]
+ ScorerT = Callable[[EvaluationReport, "AixDataset", float, bool], bool]
+
+
+ class AixDataset(BaseModel, Generic[InputsT, OutputT, MetadataT]):
+ """Custom Dataset class for AixTools evaluations."""
+
+ dataset: Dataset[InputsT, OutputT]
+ name: str
+ target_func: TargetT
+ scorers: list[ScorerT]
+
+ def __init__( # pylint: disable=R0913,R0917
+ self,
+ cases: list[Case[InputsT, OutputT]],
+ target_func: TargetT,
+ evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] | None = None,
+ name: str | None = None,
+ scoring_funcs: list[ScorerT] | None = None,
+ ):
+ super().__init__(
+ dataset=Dataset(cases=cases, evaluators=evaluators or []),
+ target_func=target_func,
+ name=name or "dataset",
+ scorers=scoring_funcs or [average_assertions],
+ )
+
+ @property
+ def cases(self) -> list[Case[InputsT, OutputT]]:
+ """Return the list of cases in the dataset."""
+ return self.dataset.cases
+
+ @property
+ def evaluators(self) -> list[Evaluator[InputsT, OutputT, MetadataT]]:
+ """Return the list of evaluators in the dataset."""
+ return self.dataset.evaluators
+
+ async def evaluate(
+ self,
+ ) -> EvaluationReport:
+ """Run the evaluation using the target function and return an EvaluationReport."""
+ return await self.dataset.evaluate(self.target_func)
+
+
+ # Decorators removed - using name-based discovery only for simplicity and async compatibility
+ # Functions should be named with prefixes: target_, scorer_, evaluator_
+
+
+ def average_assertions(
+ report: EvaluationReport, dataset: "AixDataset", min_score: float = 1.0, verbose: bool = False
+ ) -> bool:
+ """Scoring function that checks if the average assertions meet a minimum threshold."""
+ averages = report.averages()
+ if averages and averages.assertions is not None:
+ success = averages.assertions >= min_score
+ if verbose:
+ print(f"\nAssertions Summary for {dataset.name}:")
+ print(f" Assertions Average: {averages.assertions:.3f}")
+ print(f" Minimum Required: {min_score:.3f}")
+ print(f" Status: {'PASSED' if success else 'FAILED'}")
+ else:
+ print(f"{'PASSED' if success else 'FAILED'} ({averages.assertions:.3f})")
+ else:
+ success = False
+ if verbose:
+ print(f"\nAssertions Summary for {dataset.name}:")
+ print(" No assertions found or evaluation failed")
+ print(f" Minimum Required: {min_score:.3f}")
+ print(" Status: FAILED")
+ else:
+ print("FAILED (no assertions)")
+ return success
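For orientation, here is a minimal usage sketch of the `AixDataset` wrapper added in this hunk. The `target_echo` function and the example case are hypothetical, and the `pydantic_evals` calls follow the module as shown in the diff:

```python
# Hypothetical usage sketch of the AixDataset wrapper shown above;
# target_echo and the example case are illustrative, not part of the package.
import asyncio

from pydantic_evals.dataset import Case

from aixtools.evals.dataset import AixDataset, average_assertions


async def target_echo(text: str) -> str:
    # Stand-in target; a real eval would call an agent here.
    return text


dataset_echo = AixDataset(
    cases=[Case(inputs="hello", expected_output="hello")],
    target_func=target_echo,
    name="dataset_echo",
    scoring_funcs=[average_assertions],  # the default scorer shown above
)

if __name__ == "__main__":
    # evaluate() forwards target_func to the wrapped pydantic_evals Dataset.
    report = asyncio.run(dataset_echo.evaluate())
    print(average_assertions(report, dataset_echo, min_score=1.0, verbose=True))
```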
@@ -9,10 +9,14 @@ import inspect
  import sys
  import traceback
  from pathlib import Path
- from typing import Any
+ from typing import Any, TypeVar
 
  from pydantic_evals.dataset import Dataset
 
+ from aixtools.evals.dataset import AixDataset # pylint: disable=E0401
+
+ SpecialFuncT = TypeVar("SpecialFuncT")
+
 
  def find_eval_files(evals_dir: Path) -> list[Path]:
  """Find all eval_*.py files in the evals directory."""
@@ -33,7 +37,7 @@ def find_datasets_in_module(module: Any) -> list[tuple[str, Dataset]]:
  datasets = []
 
  for name, obj in inspect.getmembers(module):
- if name.startswith("dataset_") and isinstance(obj, Dataset):
+ if name.startswith("dataset_") and isinstance(obj, (Dataset, AixDataset)):
  datasets.append((name, obj))
 
  return datasets
@@ -66,66 +70,74 @@ def matches_filter(module_name: str, file_name: str, dataset_name: str, name_fil
  )
 
 
- def find_target_function(module: Any) -> Any | None:
- """Find the first async function in a module that doesn't start with underscore."""
+ def find_prefixed_functions(module: Any, prefix: str) -> list[Any]:
+ """Find all functions with a specific prefix (name-based discovery only)."""
+ funcs = []
  for name, obj in inspect.getmembers(module):
- if inspect.iscoroutinefunction(obj) and not name.startswith("_"):
- return obj
- return None
+ if name.startswith(prefix) and (inspect.isfunction(obj) or inspect.iscoroutinefunction(obj)):
+ funcs.append(obj) # Return function directly, no decorator wrapping
 
+ return funcs
 
- def get_async_function_names(module: Any) -> list[str]:
- """Get names of all async functions in a module that don't start with underscore."""
- return [
- name
- for name, obj in inspect.getmembers(module)
- if inspect.iscoroutinefunction(obj) and not name.startswith("_")
- ]
+
+ def print_v(message: str, verbose: bool) -> None:
+ """Print message if verbose is enabled."""
+ if verbose:
+ print(message)
 
 
  def process_datasets_from_module(
  module: Any, eval_file: Path, name_filter: str | None, verbose: bool
- ) -> list[tuple[str, Dataset, Any]]:
+ ) -> list[AixDataset]:
  """Process all datasets from a single module and return valid dataset tuples."""
  datasets = find_datasets_in_module(module)
- if verbose:
- print(f" Found {len(datasets)} datasets: {[name for name, _ in datasets]}")
+
+ print_v(f" Found {len(datasets)} datasets: {[name for name, _ in datasets]}", verbose)
 
  valid_datasets = []
 
+ targets = find_prefixed_functions(module, "target_")
+ scorers = find_prefixed_functions(module, "scorer_")
+ evaluators = find_prefixed_functions(module, "evaluator_")
+
+ print_v(f" Found target functions: {[f.__name__ for f in targets]}", verbose)
+ print_v(f" Found scoring functions: {[f.__name__ for f in scorers]}", verbose)
+ print_v(f" Found evaluator functions: {[f.__name__ for f in evaluators]}", verbose)
+
  for dataset_name, dataset in datasets:
  full_name = f"{eval_file.stem}.{dataset_name}"
 
  if not matches_filter(module.__name__, eval_file.stem, dataset_name, name_filter):
- if verbose:
- print(f" ✗ Skipping dataset: {dataset_name} (doesn't match filter: {name_filter})")
+ print_v(f" ✗ Skipping dataset: {dataset_name} (doesn't match filter: {name_filter})", verbose)
  continue
 
- if verbose:
- print(f" ✓ Including dataset: {dataset_name}")
+ print_v(f" ✓ Including dataset: {dataset_name}", verbose)
 
- # Find the target function
- target_function = find_target_function(module)
- async_functions = get_async_function_names(module)
+ if isinstance(dataset, Dataset):
+ # Wrap in AixDataset if not already
 
- if verbose:
- print(f" Found async functions: {async_functions}")
- if target_function:
- print(f" Using target function: {target_function.__name__}")
+ if len(targets) != 1:
+ print_v(
+ f" ✗ Skipping dataset: {dataset_name} (has {len(targets)} target functions, expected exactly 1)",
+ verbose,
+ )
 
- if target_function is None:
- if verbose:
- print(f"Warning: No async function found in {eval_file.name} for dataset {dataset_name}")
- continue
+ continue
+
+ dataset = AixDataset( # noqa: PLW2901
+ cases=dataset.cases,
+ evaluators=dataset.evaluators, # evaluators are plain functions now
+ name=full_name,
+ target_func=targets[0], # target function is used directly
+ scoring_funcs=scorers, # scorers are plain functions now
+ )
 
- valid_datasets.append((full_name, dataset, target_function))
+ valid_datasets.append(dataset)
 
  return valid_datasets
 
 
- def discover_all_datasets(
- eval_files: list[Path], name_filter: str | None, verbose: bool
- ) -> list[tuple[str, Dataset, Any]]:
+ def discover_all_datasets(eval_files: list[Path], name_filter: str | None, verbose: bool) -> list[AixDataset]:
  """Discover all datasets from eval files."""
  all_datasets = []
 
@@ -141,7 +153,7 @@ def discover_all_datasets(
  datasets = process_datasets_from_module(module, eval_file, name_filter, verbose)
  all_datasets.extend(datasets)
 
- except Exception as e: # pylint: disable=W0718
+ except Exception as e: # pylint: disable=broad-exception-caught
  if verbose:
  print(f"Error loading {eval_file}: {e}")
  print(f" Traceback: {traceback.format_exc()}")
@@ -162,9 +174,9 @@ def discover_all_datasets(
  print(f"\n{'=' * 60}")
  print("Datasets to Evaluate:")
  print(f"{'=' * 60}")
- for i, (dataset_name, dataset, target_function) in enumerate(all_datasets, 1):
- print(f"{i}. {dataset_name}")
- print(f" Target function: {target_function.__name__}")
+ for i, (dataset) in enumerate(all_datasets, 1):
+ print(f"{i}. {dataset.name}")
+ print(f" Target function: {dataset.target_func.__name__}")
  print(f" Cases: {len(dataset.cases)}")
  print(f" Evaluators: {len(dataset.evaluators)}")
  print(f"{'=' * 60}")
@@ -5,30 +5,29 @@ This module handles running evaluations and printing results.
  """
 
  import sys
- from typing import Any
 
- from pydantic_evals.dataset import Dataset
+ from pydantic_evals.reporting import EvaluationReport
 
+ from aixtools.evals.dataset import AixDataset # pylint: disable=E0401
 
- async def run_dataset_evaluation( # noqa: PLR0913, pylint: disable=too-many-arguments,too-many-positional-arguments
- dataset_name: str,
- dataset: Dataset,
- target_function: Any,
+
+ async def run_dataset_evaluation(
+ dataset: AixDataset,
  print_options: dict[str, bool],
  min_assertions: float,
  verbose: bool = False,
- ) -> tuple[str, bool]:
- """Run evaluation for a single dataset and return (name, success)."""
+ ) -> tuple[str, bool, EvaluationReport | None]:
+ """Run evaluation for a single dataset and return (name, success, report)."""
  if verbose:
  print(f"\n{'=' * 60}")
- print(f"Running evaluation: {dataset_name}")
+ print(f"Running evaluation: {dataset.name}")
  print(f"{'=' * 60}")
  else:
- print(f"Running {dataset_name}...", end=" ")
+ print(f"Running {dataset.name}...", end=" ")
 
  try:
  # Execute the evaluation
- report = await dataset.evaluate(target_function)
+ report = await dataset.evaluate()
 
  # Print the results
  report.print(
@@ -38,60 +37,49 @@ async def run_dataset_evaluation( # noqa: PLR0913, pylint: disable=too-many-arg
  include_reasons=print_options["include_reasons"],
  )
 
- # Check if evaluation passed based on assertions average
- averages = report.averages()
- if averages and averages.assertions is not None:
- success = averages.assertions >= min_assertions
- if verbose:
- print(f"\nEvaluation Summary for {dataset_name}:")
- print(f" Assertions Average: {averages.assertions:.3f}")
- print(f" Minimum Required: {min_assertions:.3f}")
- print(f" Status: {'PASSED' if success else 'FAILED'}")
- else:
- print(f"{'PASSED' if success else 'FAILED'} ({averages.assertions:.3f})")
- else:
- success = False
- if verbose:
- print(f"\nEvaluation Summary for {dataset_name}:")
- print(" No assertions found or evaluation failed")
- print(f" Minimum Required: {min_assertions:.3f}")
- print(" Status: FAILED")
- else:
- print("FAILED (no assertions)")
+ success = all(scorer(report, dataset, min_assertions, verbose) for scorer in dataset.scorers)
 
- return dataset_name, success
+ return dataset.name, success, report
 
  except Exception as e: # pylint: disable=broad-exception-caught
  if verbose:
- print(f"Error running evaluation {dataset_name}: {e}")
+ print(f"Error running evaluation {dataset.name}: {e}")
  else:
  print(f"ERROR ({e})")
- return dataset_name, False
+ return dataset.name, False, None
 
 
  async def run_all_evaluations_and_print_results(
- datasets: list[tuple[str, Dataset, Any]], print_options: dict[str, bool], min_assertions: float, verbose: bool
+ datasets: list[AixDataset], print_options: dict[str, bool], min_assertions: float, verbose: bool
  ) -> None:
  """Run all evaluations and print results with summary."""
  # Run all evaluations
  results = []
- for dataset_name, dataset, target_function in datasets:
- result = await run_dataset_evaluation(
- dataset_name, dataset, target_function, print_options, min_assertions, verbose
- )
+ for dataset in datasets:
+ result = await run_dataset_evaluation(dataset, print_options, min_assertions, verbose)
  results.append(result)
 
+ # Print reports
+ for _, _, report in results:
+ if report:
+ report.print(
+ include_input=print_options["include_input"],
+ include_output=print_options["include_output"],
+ include_evaluator_failures=print_options["include_evaluator_failures"],
+ include_reasons=print_options["include_reasons"],
+ )
+
  # Print summary
- passed = sum(1 for _, success in results if success)
+ passed = sum(1 for _, success, _ in results if success)
  total = len(results)
- failed_results = [(name, success) for name, success in results if not success]
+ failed_results = [(name, success, _) for name, success, _ in results if not success]
 
  if verbose:
  print(f"\n{'=' * 60}")
  print("EVALUATION SUMMARY")
  print(f"{'=' * 60}")
 
- for name, success in results:
+ for name, success, _ in results:
  status = "PASSED" if success else "FAILED"
  print(f" {name}: {status}")
 
@@ -99,7 +87,7 @@ async def run_all_evaluations_and_print_results(
  # Only show failed evaluations when not verbose
  elif failed_results:
  print("\nFailed evaluations:")
- for name, _ in failed_results:
+ for name, _, _ in failed_results:
  print(f" {name}: FAILED")
 
  # Exit with non-zero code if any evaluations failed
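For context on the `dataset.scorers` call in the hunk above: each scorer receives the `EvaluationReport`, the owning `AixDataset`, the `--min-assertions` threshold, and the verbose flag, and every scorer must return `True` for the dataset to count as passed. A hedged sketch of a custom scorer following that contract (the name and the exact pass criterion are illustrative, not from the package):

```python
# Hypothetical custom scorer matching the ScorerT signature from
# aixtools/evals/dataset.py; the pass criterion chosen here is illustrative.
from pydantic_evals.reporting import EvaluationReport


def scorer_assertions_threshold(
    report: EvaluationReport, dataset, min_score: float = 1.0, verbose: bool = False
) -> bool:
    """Pass only when the assertions average reaches min_score."""
    averages = report.averages()
    ok = bool(averages and averages.assertions is not None and averages.assertions >= min_score)
    if verbose:
        print(f"{dataset.name}: assertions average {'met' if ok else 'below'} {min_score:.2f}")
    return ok
```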
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: aixtools
- Version: 0.2.3
+ Version: 0.2.5
  Summary: Tools for AI exploration and debugging
  Requires-Python: >=3.11.2
  Description-Content-Type: text/markdown
@@ -416,30 +416,30 @@ By default, the "FaultyMCP" includes several tools you can use in your tests:
 
  ### Evals
 
- Run comprehensive Agent/LLM evaluations using the built-in evaluation discovery based on Pydantic-AI framework.
+ Run comprehensive Agent/LLM evaluations using the built-in evaluation discovery based on Pydantic-AI framework with AIXtools enhancements.
 
  ```bash
  # Run all evaluations
- evals
+ python -m aixtools.evals
 
  # Run evaluations with filtering
- evals --filter "specific_test"
+ python -m aixtools.evals --filter "specific_test"
 
  # Run with verbose output and detailed reporting
- evals --verbose --include-input --include-output --include-reasons
+ python -m aixtools.evals --verbose --include-input --include-output --include-reasons
 
  # Specify custom evaluations directory
- evals --evals-dir /path/to/evals
+ python -m aixtools.evals --evals-dir /path/to/evals
 
  # Set minimum assertions threshold
- evals --min-assertions 0.8
+ python -m aixtools.evals --min-assertions 0.8
  ```
 
  **Command Line Options:**
  - `--evals-dir` - Directory containing eval_*.py files (default: evals)
  - `--filter` - Filter to run only matching evaluations
- - `--include-input` - Include input in report output
- - `--include-output` - Include output in report output
+ - `--include-input` - Include input in report output (default: True)
+ - `--include-output` - Include output in report output (default: True)
  - `--include-evaluator-failures` - Include evaluator failures in report
  - `--include-reasons` - Include reasons in report output
  - `--min-assertions` - Minimum assertions average required for success (default: 1.0)
@@ -447,14 +447,16 @@ evals --min-assertions 0.8
 
  The evaluation system discovers and runs all Dataset objects from eval_*.py files in the specified directory, similar to test runners but specifically designed for LLM evaluations using pydantic_evals.
 
- **Discovery Mechanism:**
+ **Discovery Mechanism**
 
- The evaluation framework uses an automatic discovery system that:
+ The evaluation framework uses an automatic discovery system:
 
  1. **File Discovery**: Scans the specified directory for files matching the pattern `eval_*.py`
  2. **Dataset Discovery**: Within each file, looks for variables named `dataset_*` that are instances of `pydantic_evals.Dataset`
- 3. **Target Function Discovery**: Automatically finds the first async function in each module that doesn't start with an underscore (`_`) to use as the evaluation target
- 4. **Filtering**: Supports filtering by module name, file name, dataset name, or fully qualified name
+ 3. **Target Function Discovery**: Within the same file looks for function or async function named `target_*`. There must be 1 target function per file.
+ 4. **Function Discovery**: Looks for functions with specific prefixes:
+ - Functions prefixed with `scorer_*`, `evaluator_*` for custom scorer and evaluator functions that will be used for each dataset in that file
+ 5. **Filtering**: Supports filtering by module name, file name, dataset name, or fully qualified name
 
  **Example Evaluation File Structure:**
  ```python
@@ -472,11 +474,16 @@ dataset_addition = Dataset(
  )
 
  # This function will be used as the evaluation target
- async def evaluate_math_agent(input_text: str) -> str:
- # Your agent evaluation logic here
+ async def target_math_agent(input_text: str) -> str:
+ # Your agent run logic here
  agent = get_agent(system_prompt="You are a math assistant.")
  result, _ = await run_agent(agent, input_text)
  return result
+
+ # This function will be used as evaluator for all datasets (optional)
+ def evaluator_check_output(ctx: EvaluatorContext) -> bool:
+ # Your result evaluation logic here
+ return ctx.output == ctx.expected_output
  ```
 
  The discovery system will:
@@ -485,6 +492,34 @@ The discovery system will:
  - Use `evaluate_math_agent` as the target function for evaluation
  - Run each case through the target function and evaluate results
 
+ #### Name-Based Discovery
+
+ The evaluation system uses name-based discovery for all components:
+
+ **Target Functions** (exactly one required per eval file):
+ - **Purpose**: The main function being evaluated - processes inputs and returns outputs
+ - **Naming**: Functions named `target_*` (e.g., `target_my_function`)
+ - **Signature**: `def target_name(inputs: InputType) -> OutputType` or `async def target_name(inputs: InputType) -> OutputType`
+ - **Example**: `async def target_math_agent(input_text: str) -> str`
+
+ **Scoring Functions** (optional):
+ - **Purpose**: Determine if evaluation results meet success criteria
+ - **Naming**: Functions named `scorer_*` (e.g., `scorer_custom`)
+ - **Signature**: `def scorer_name(report: EvaluationReport, dataset: AixDataset, min_score: float = 1.0, verbose: bool = False) -> bool`
+ - **Example**: `def scorer_accuracy_threshold(report, dataset, min_score=0.8, verbose=False) -> bool`
+
+ **Evaluator Functions** (optional):
+ - **Purpose**: Custom evaluation logic for comparing outputs with expected results
+ - **Naming**: Functions named `evaluator_*` (e.g., `evaluator_check_output`)
+ - **Signature**: `def evaluator_name(ctx: EvaluatorContext) -> EvaluatorOutput` or `async def evaluator_name(ctx: EvaluatorContext) -> EvaluatorOutput`
+ - **Example**: `def evaluator_exact_match(ctx) -> EvaluatorOutput`
+
+ This name-based approach works seamlessly with both synchronous and asynchronous functions.
+
+ #### Scoring System
+
+ The framework includes a custom scoring system with [`average_assertions`](aixtools/evals/dataset.py:67) as the default scorer. This scorer checks if the average assertion score meets a minimum threshold and provides detailed pass/fail reporting.
+
  ## Testing & Tools
 
  AIXtools provides comprehensive testing utilities and diagnostic tools for AI agent development and debugging.
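Pulling the README additions above together, here is a hedged end-to-end sketch of an eval file that the name-based discovery should pick up. The file name `eval_uppercase.py`, the cases, and all function bodies are illustrative, and the `EvaluatorContext` import path is assumed from pydantic-evals:

```python
# evals/eval_uppercase.py - hypothetical eval file following the naming
# conventions described in the README section above.
from pydantic_evals.dataset import Case, Dataset
from pydantic_evals.evaluators import EvaluatorContext


# Discovered because the variable name starts with "dataset_".
dataset_uppercase = Dataset(
    cases=[
        Case(inputs="hello", expected_output="HELLO"),
        Case(inputs="world", expected_output="WORLD"),
    ],
)


# Discovered as the single target function (name starts with "target_").
async def target_uppercase(text: str) -> str:
    return text.upper()


# Optional custom evaluator (name starts with "evaluator_").
def evaluator_exact_match(ctx: EvaluatorContext) -> bool:
    return ctx.output == ctx.expected_output
```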
@@ -1,5 +1,5 @@
  aixtools/__init__.py,sha256=9NGHm7LjsQmsvjTZvw6QFJexSvAU4bCoN_KBk9SCa00,260
- aixtools/_version.py,sha256=kBRz0P2plw1eVdIpt70W6m1LMbEIhLY3RyOfVGdubaI,704
+ aixtools/_version.py,sha256=9wrJ_4Dlc0arUzKiaIqvTY85rMJma3eb1nNlF3uHAxU,704
  aixtools/app.py,sha256=JzQ0nrv_bjDQokllIlGHOV0HEb-V8N6k_nGQH-TEsVU,5227
  aixtools/chainlit.md,sha256=yC37Ly57vjKyiIvK4oUvf4DYxZCwH7iocTlx7bLeGLU,761
  aixtools/context.py,sha256=I_MD40ZnvRm5WPKAKqBUAdXIf8YaurkYUUHSVVy-QvU,598
@@ -38,9 +38,10 @@ aixtools/db/__init__.py,sha256=b8vRhme3egV-aUZbAntnOaDkSXB8UT0Xy5oqQhU_z0Q,399
  aixtools/db/database.py,sha256=caWe95GlxZYlxn2ubDmR-_cQUW0ulkpR3BHunKIaOsw,3369
  aixtools/db/vector_db.py,sha256=be4JGyXj3o8VEfy9L6SO1aAoDET_zazMJkYfjlYHTYQ,4133
  aixtools/evals/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- aixtools/evals/discovery.py,sha256=nKBMHuM3Q87GFY4U0QXvU-zXmJjR-bnmlwf5cfp5E9s,5907
- aixtools/evals/evals.py,sha256=3oJ6_HjojLOzG8XxdfdTYFk-gxep41nk_viHTsUwFNo,2738
- aixtools/evals/run_evals.py,sha256=oJpGIPF5avq1r275Yd_RJyxUiUgOd00LItdKXfGlAbA,3910
+ aixtools/evals/__main__.py,sha256=f6_X5jHFHIR36r_jerj3ol8SLhZB8nVk8vffoRVtxLs,2844
+ aixtools/evals/dataset.py,sha256=qsLrx9hgvZWY1FzuwYtm3aJscNth1EkeLWTgIdici5Q,3374
+ aixtools/evals/discovery.py,sha256=gHKfutzdtjZPkjxXnD_WYHqL2WMem8kkJpk2HDHIgKg,6488
+ aixtools/evals/run_evals.py,sha256=J5sfdfC_2NwQcRP4mGV4FYSpzawRe4pmkOTjUm1IAWU,3278
  aixtools/google/client.py,sha256=8yuv_zEZKlmUTI-zRxAb3vjLUrfiwrBhcpNe0hYsO0g,1078
  aixtools/log_view/__init__.py,sha256=0fWLCq9BMo8GoH3Z5WDgvf0-J2TP0XWqtef0f28SHBA,405
  aixtools/log_view/app.py,sha256=DZp3PUM_iS3DpMHqHfFXVACvbZ9PItbOCNMkDjIOfTc,6595
@@ -88,8 +89,8 @@ aixtools/utils/chainlit/cl_agent_show.py,sha256=vaRuowp4BRvhxEr5hw0zHEJ7iaSF_5bo
  aixtools/utils/chainlit/cl_utils.py,sha256=fxaxdkcZg6uHdM8uztxdPowg3a2f7VR7B26VPY4t-3c,5738
  aixtools/vault/__init__.py,sha256=fsr_NuX3GZ9WZ7dGfe0gp_5-z3URxAfwVRXw7Xyc0dU,141
  aixtools/vault/vault.py,sha256=9dZLWdZQk9qN_Q9Djkofw9LUKnJqnrX5H0fGusVLBhA,6037
- aixtools-0.2.3.dist-info/METADATA,sha256=QwClHDH-4L6s6WlM4051WFHEhIcdvFui9ely0GVNwJY,24980
- aixtools-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- aixtools-0.2.3.dist-info/entry_points.txt,sha256=q8412TG4T0S8K0SKeWp2vkVPIDYQs0jNoHqcQ7qxOiA,155
- aixtools-0.2.3.dist-info/top_level.txt,sha256=wBn-rw9bCtxrR4AYEYgjilNCUVmKY0LWby9Zan2PRJM,9
- aixtools-0.2.3.dist-info/RECORD,,
+ aixtools-0.2.5.dist-info/METADATA,sha256=BHPUgnHXs7ET3BvwAkPxYRkkXnxLdptFwbYNDkoBMbw,27229
+ aixtools-0.2.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ aixtools-0.2.5.dist-info/entry_points.txt,sha256=q8412TG4T0S8K0SKeWp2vkVPIDYQs0jNoHqcQ7qxOiA,155
+ aixtools-0.2.5.dist-info/top_level.txt,sha256=wBn-rw9bCtxrR4AYEYgjilNCUVmKY0LWby9Zan2PRJM,9
+ aixtools-0.2.5.dist-info/RECORD,,