aixtools 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff shows changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of aixtools might be problematic.

aixtools/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.2.2'
-__version_tuple__ = version_tuple = (0, 2, 2)
+__version__ = version = '0.2.4'
+__version_tuple__ = version_tuple = (0, 2, 4)
 
 __commit_id__ = commit_id = None
aixtools/auth/__init__.py ADDED
File without changes
aixtools/auth/auth.py ADDED
@@ -0,0 +1,70 @@
+"""
+Module that manages OAuth2 functions for authentication
+"""
+
+import logging
+
+import jwt
+from jwt import ExpiredSignatureError, InvalidAudienceError, InvalidIssuerError, PyJWKClient
+
+from aixtools.utils import config
+
+logger = logging.getLogger(__name__)
+
+
+class AuthTokenError(Exception):
+    """Exception raised for authentication token errors."""
+
+
+# pylint: disable=too-few-public-methods
+class AccessTokenVerifier:
+    """
+    Verifies Microsoft SSO JWT token against the configured Tenant ID, Audience, API ID and Issuer URL.
+    """
+
+    def __init__(self):
+        tenant_id = config.APP_TENANT_ID
+        self.api_id = config.APP_API_ID
+        self.issuer_url = config.APP_ISSUER_URL
+        # Azure AD endpoints
+        jwks_url = f"https://login.microsoftonline.com/{tenant_id}/discovery/v2.0/keys"
+        self.jwks_client = PyJWKClient(
+            uri=jwks_url,
+            # cache keys url response to reduce SSO server network calls,
+            # as public keys are not expected to change frequently
+            cache_jwk_set=True,
+            # cache resolved public keys
+            cache_keys=True,
+            # cache url response for 10 hours
+            lifespan=36000,
+        )
+
+        logger.info("Using JWKS: %s", jwks_url)
+
+    def verify(self, token: str) -> dict:
+        """
+        Verifies The JWT access token and returns decoded claims as a dictionary if the token is
+        valid, otherwise raises an AuthTokenError
+        """
+        try:
+            signing_key = self.jwks_client.get_signing_key_from_jwt(token)
+
+            claims = jwt.decode(
+                token,
+                signing_key.key,
+                algorithms=["RS256"],
+                audience=self.api_id,
+                issuer=self.issuer_url,
+                # ensure audience verification is carried out
+                options={"verify_aud": True},
+            )
+            return claims
+
+        except ExpiredSignatureError as e:
+            raise AuthTokenError("Token expired") from e
+        except InvalidAudienceError as e:
+            raise AuthTokenError(f"Token not for expected audience: {e}") from e
+        except InvalidIssuerError as e:
+            raise AuthTokenError(f"Token not for expected issuer: {e}") from e
+        except jwt.exceptions.PyJWTError as e:
+            raise AuthTokenError(f"Invalid token: {e}") from e
aixtools/evals/evals.py → aixtools/evals/__main__.py RENAMED
@@ -11,8 +11,8 @@ import asyncio
 import sys
 from pathlib import Path
 
-from .discovery import discover_all_datasets, find_eval_files
-from .run_evals import run_all_evaluations_and_print_results
+from aixtools.evals.discovery import discover_all_datasets, find_eval_files  # pylint: disable=E0401
+from aixtools.evals.run_evals import run_all_evaluations_and_print_results  # pylint: disable=E0401
 
 
 async def main():
@@ -24,8 +24,8 @@ async def main():
     parser.add_argument(
         "--filter", type=str, help="Filter to run only matching evaluations (matches module, file, or dataset names)"
     )
-    parser.add_argument("--include-input", action="store_true", help="Include input in report output")
-    parser.add_argument("--include-output", action="store_true", help="Include output in report output")
+    parser.add_argument("--include-input", action="store_true", default=True, help="Include input in report output")
+    parser.add_argument("--include-output", action="store_true", default=True, help="Include output in report output")
     parser.add_argument(
         "--include-evaluator-failures", action="store_true", help="Include evaluator failures in report output"
     )
aixtools/evals/dataset.py ADDED
@@ -0,0 +1,87 @@
+"""Custom dataset and evaluation utilities for AixTools.
+
+This module provides wrapper classes and decorators for building and running
+evaluations using the pydantic-evals framework. It includes a custom Dataset
+class, decorators for marking target functions, scorers, and evaluators, and
+a default scoring function based on assertion averages.
+"""
+
+from typing import Awaitable, Callable, Generic
+
+from pydantic import BaseModel
+from pydantic_evals.dataset import Case, Dataset, InputsT, MetadataT, OutputT
+from pydantic_evals.evaluators import Evaluator
+from pydantic_evals.reporting import EvaluationReport
+
+TargetT = Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT]
+ScorerT = Callable[[EvaluationReport, "AixDataset", float, bool], bool]
+
+
+class AixDataset(BaseModel, Generic[InputsT, OutputT, MetadataT]):
+    """Custom Dataset class for AixTools evaluations."""
+
+    dataset: Dataset[InputsT, OutputT]
+    name: str
+    target_func: TargetT
+    scorers: list[ScorerT]
+
+    def __init__(  # pylint: disable=R0913,R0917
+        self,
+        cases: list[Case[InputsT, OutputT]],
+        target_func: TargetT,
+        evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] | None = None,
+        name: str | None = None,
+        scoring_funcs: list[ScorerT] | None = None,
+    ):
+        super().__init__(
+            dataset=Dataset(cases=cases, evaluators=evaluators or []),
+            target_func=target_func,
+            name=name or "dataset",
+            scorers=scoring_funcs or [average_assertions],
+        )
+
+    @property
+    def cases(self) -> list[Case[InputsT, OutputT]]:
+        """Return the list of cases in the dataset."""
+        return self.dataset.cases
+
+    @property
+    def evaluators(self) -> list[Evaluator[InputsT, OutputT, MetadataT]]:
+        """Return the list of evaluators in the dataset."""
+        return self.dataset.evaluators
+
+    async def evaluate(
+        self,
+    ) -> EvaluationReport:
+        """Run the evaluation using the target function and return an EvaluationReport."""
+        return await self.dataset.evaluate(self.target_func)
+
+
+# Decorators removed - using name-based discovery only for simplicity and async compatibility
+# Functions should be named with prefixes: target_, scorer_, evaluator_
+
+
+def average_assertions(
+    report: EvaluationReport, dataset: "AixDataset", min_score: float = 1.0, verbose: bool = False
+) -> bool:
+    """Scoring function that checks if the average assertions meet a minimum threshold."""
+    averages = report.averages()
+    if averages and averages.assertions is not None:
+        success = averages.assertions >= min_score
+        if verbose:
+            print(f"\nAssertions Summary for {dataset.name}:")
+            print(f" Assertions Average: {averages.assertions:.3f}")
+            print(f" Minimum Required: {min_score:.3f}")
+            print(f" Status: {'PASSED' if success else 'FAILED'}")
+        else:
+            print(f"{'PASSED' if success else 'FAILED'} ({averages.assertions:.3f})")
+    else:
+        success = False
+        if verbose:
+            print(f"\nAssertions Summary for {dataset.name}:")
+            print(" No assertions found or evaluation failed")
+            print(f" Minimum Required: {min_score:.3f}")
+            print(" Status: FAILED")
+        else:
+            print("FAILED (no assertions)")
+    return success
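A hedged sketch of constructing the new `AixDataset` directly (the case and function names here are invented for illustration):

```python
# Illustration only: builds an AixDataset as defined in aixtools/evals/dataset.py above.
from pydantic_evals.dataset import Case

from aixtools.evals.dataset import AixDataset, average_assertions


async def target_echo(text: str) -> str:
    return text  # trivial target function


dataset_echo = AixDataset(
    cases=[Case(name="roundtrip", inputs="hello", expected_output="hello")],
    target_func=target_echo,
    name="eval_demo.dataset_echo",
    scoring_funcs=[average_assertions],  # same as the default
)

# report = await dataset_echo.evaluate()  # yields a pydantic-evals EvaluationReport
```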
aixtools/evals/discovery.py CHANGED
@@ -9,10 +9,14 @@ import inspect
 import sys
 import traceback
 from pathlib import Path
-from typing import Any
+from typing import Any, TypeVar
 
 from pydantic_evals.dataset import Dataset
 
+from aixtools.evals.dataset import AixDataset  # pylint: disable=E0401
+
+SpecialFuncT = TypeVar("SpecialFuncT")
+
 
 def find_eval_files(evals_dir: Path) -> list[Path]:
     """Find all eval_*.py files in the evals directory."""
@@ -33,7 +37,7 @@ def find_datasets_in_module(module: Any) -> list[tuple[str, Dataset]]:
     datasets = []
 
     for name, obj in inspect.getmembers(module):
-        if name.startswith("dataset_") and isinstance(obj, Dataset):
+        if name.startswith("dataset_") and isinstance(obj, (Dataset, AixDataset)):
             datasets.append((name, obj))
 
     return datasets
@@ -66,66 +70,74 @@ def matches_filter(module_name: str, file_name: str, dataset_name: str, name_fil
     )
 
 
-def find_target_function(module: Any) -> Any | None:
-    """Find the first async function in a module that doesn't start with underscore."""
+def find_prefixed_functions(module: Any, prefix: str) -> list[Any]:
+    """Find all functions with a specific prefix (name-based discovery only)."""
+    funcs = []
     for name, obj in inspect.getmembers(module):
-        if inspect.iscoroutinefunction(obj) and not name.startswith("_"):
-            return obj
-    return None
+        if name.startswith(prefix) and (inspect.isfunction(obj) or inspect.iscoroutinefunction(obj)):
+            funcs.append(obj)  # Return function directly, no decorator wrapping
 
+    return funcs
 
-def get_async_function_names(module: Any) -> list[str]:
-    """Get names of all async functions in a module that don't start with underscore."""
-    return [
-        name
-        for name, obj in inspect.getmembers(module)
-        if inspect.iscoroutinefunction(obj) and not name.startswith("_")
-    ]
+
+def print_v(message: str, verbose: bool) -> None:
+    """Print message if verbose is enabled."""
+    if verbose:
+        print(message)
 
 
 def process_datasets_from_module(
     module: Any, eval_file: Path, name_filter: str | None, verbose: bool
-) -> list[tuple[str, Dataset, Any]]:
+) -> list[AixDataset]:
     """Process all datasets from a single module and return valid dataset tuples."""
     datasets = find_datasets_in_module(module)
-    if verbose:
-        print(f" Found {len(datasets)} datasets: {[name for name, _ in datasets]}")
+
+    print_v(f" Found {len(datasets)} datasets: {[name for name, _ in datasets]}", verbose)
 
     valid_datasets = []
 
+    targets = find_prefixed_functions(module, "target_")
+    scorers = find_prefixed_functions(module, "scorer_")
+    evaluators = find_prefixed_functions(module, "evaluator_")
+
+    print_v(f" Found target functions: {[f.__name__ for f in targets]}", verbose)
+    print_v(f" Found scoring functions: {[f.__name__ for f in scorers]}", verbose)
+    print_v(f" Found evaluator functions: {[f.__name__ for f in evaluators]}", verbose)
+
    for dataset_name, dataset in datasets:
        full_name = f"{eval_file.stem}.{dataset_name}"
 
        if not matches_filter(module.__name__, eval_file.stem, dataset_name, name_filter):
-            if verbose:
-                print(f" ✗ Skipping dataset: {dataset_name} (doesn't match filter: {name_filter})")
+            print_v(f" ✗ Skipping dataset: {dataset_name} (doesn't match filter: {name_filter})", verbose)
            continue
 
-        if verbose:
-            print(f" ✓ Including dataset: {dataset_name}")
+        print_v(f" ✓ Including dataset: {dataset_name}", verbose)
 
-        # Find the target function
-        target_function = find_target_function(module)
-        async_functions = get_async_function_names(module)
+        if isinstance(dataset, Dataset):
+            # Wrap in AixDataset if not already
 
-        if verbose:
-            print(f" Found async functions: {async_functions}")
-            if target_function:
-                print(f" Using target function: {target_function.__name__}")
+            if len(targets) != 1:
+                print_v(
+                    f" ✗ Skipping dataset: {dataset_name} (has {len(targets)} target functions, expected exactly 1)",
+                    verbose,
+                )
 
-        if target_function is None:
-            if verbose:
-                print(f"Warning: No async function found in {eval_file.name} for dataset {dataset_name}")
-            continue
+                continue
+
+            dataset = AixDataset(  # noqa: PLW2901
+                cases=dataset.cases,
+                evaluators=dataset.evaluators,  # evaluators are plain functions now
+                name=full_name,
+                target_func=targets[0],  # target function is used directly
+                scoring_funcs=scorers,  # scorers are plain functions now
+            )
 
-        valid_datasets.append((full_name, dataset, target_function))
+        valid_datasets.append(dataset)
 
    return valid_datasets
 
 
-def discover_all_datasets(
-    eval_files: list[Path], name_filter: str | None, verbose: bool
-) -> list[tuple[str, Dataset, Any]]:
+def discover_all_datasets(eval_files: list[Path], name_filter: str | None, verbose: bool) -> list[AixDataset]:
     """Discover all datasets from eval files."""
     all_datasets = []
 
@@ -141,7 +153,7 @@ def discover_all_datasets(
             datasets = process_datasets_from_module(module, eval_file, name_filter, verbose)
             all_datasets.extend(datasets)
 
-        except Exception as e:  # pylint: disable=W0718
+        except Exception as e:  # pylint: disable=broad-exception-caught
             if verbose:
                 print(f"Error loading {eval_file}: {e}")
                 print(f" Traceback: {traceback.format_exc()}")
@@ -162,9 +174,9 @@ def discover_all_datasets(
     print(f"\n{'=' * 60}")
     print("Datasets to Evaluate:")
     print(f"{'=' * 60}")
-    for i, (dataset_name, dataset, target_function) in enumerate(all_datasets, 1):
-        print(f"{i}. {dataset_name}")
-        print(f" Target function: {target_function.__name__}")
+    for i, (dataset) in enumerate(all_datasets, 1):
+        print(f"{i}. {dataset.name}")
+        print(f" Target function: {dataset.target_func.__name__}")
         print(f" Cases: {len(dataset.cases)}")
         print(f" Evaluators: {len(dataset.evaluators)}")
     print(f"{'=' * 60}")
aixtools/evals/run_evals.py CHANGED
@@ -5,30 +5,29 @@ This module handles running evaluations and printing results.
 """
 
 import sys
-from typing import Any
 
-from pydantic_evals.dataset import Dataset
+from pydantic_evals.reporting import EvaluationReport
 
+from aixtools.evals.dataset import AixDataset  # pylint: disable=E0401
 
-async def run_dataset_evaluation(  # noqa: PLR0913, pylint: disable=too-many-arguments,too-many-positional-arguments
-    dataset_name: str,
-    dataset: Dataset,
-    target_function: Any,
+
+async def run_dataset_evaluation(
+    dataset: AixDataset,
     print_options: dict[str, bool],
     min_assertions: float,
     verbose: bool = False,
-) -> tuple[str, bool]:
-    """Run evaluation for a single dataset and return (name, success)."""
+) -> tuple[str, bool, EvaluationReport | None]:
+    """Run evaluation for a single dataset and return (name, success, report)."""
     if verbose:
         print(f"\n{'=' * 60}")
-        print(f"Running evaluation: {dataset_name}")
+        print(f"Running evaluation: {dataset.name}")
         print(f"{'=' * 60}")
     else:
-        print(f"Running {dataset_name}...", end=" ")
+        print(f"Running {dataset.name}...", end=" ")
 
     try:
         # Execute the evaluation
-        report = await dataset.evaluate(target_function)
+        report = await dataset.evaluate()
 
         # Print the results
         report.print(
@@ -38,60 +37,49 @@ async def run_dataset_evaluation(  # noqa: PLR0913, pylint: disable=too-many-arg
             include_reasons=print_options["include_reasons"],
         )
 
-        # Check if evaluation passed based on assertions average
-        averages = report.averages()
-        if averages and averages.assertions is not None:
-            success = averages.assertions >= min_assertions
-            if verbose:
-                print(f"\nEvaluation Summary for {dataset_name}:")
-                print(f" Assertions Average: {averages.assertions:.3f}")
-                print(f" Minimum Required: {min_assertions:.3f}")
-                print(f" Status: {'PASSED' if success else 'FAILED'}")
-            else:
-                print(f"{'PASSED' if success else 'FAILED'} ({averages.assertions:.3f})")
-        else:
-            success = False
-            if verbose:
-                print(f"\nEvaluation Summary for {dataset_name}:")
-                print(" No assertions found or evaluation failed")
-                print(f" Minimum Required: {min_assertions:.3f}")
-                print(" Status: FAILED")
-            else:
-                print("FAILED (no assertions)")
+        success = all(scorer(report, dataset, min_assertions, verbose) for scorer in dataset.scorers)
 
-        return dataset_name, success
+        return dataset.name, success, report
 
     except Exception as e:  # pylint: disable=broad-exception-caught
         if verbose:
-            print(f"Error running evaluation {dataset_name}: {e}")
+            print(f"Error running evaluation {dataset.name}: {e}")
         else:
             print(f"ERROR ({e})")
-        return dataset_name, False
+        return dataset.name, False, None
 
 
 async def run_all_evaluations_and_print_results(
-    datasets: list[tuple[str, Dataset, Any]], print_options: dict[str, bool], min_assertions: float, verbose: bool
+    datasets: list[AixDataset], print_options: dict[str, bool], min_assertions: float, verbose: bool
 ) -> None:
     """Run all evaluations and print results with summary."""
     # Run all evaluations
     results = []
-    for dataset_name, dataset, target_function in datasets:
-        result = await run_dataset_evaluation(
-            dataset_name, dataset, target_function, print_options, min_assertions, verbose
-        )
+    for dataset in datasets:
+        result = await run_dataset_evaluation(dataset, print_options, min_assertions, verbose)
         results.append(result)
 
+    # Print reports
+    for _, _, report in results:
+        if report:
+            report.print(
+                include_input=print_options["include_input"],
+                include_output=print_options["include_output"],
+                include_evaluator_failures=print_options["include_evaluator_failures"],
+                include_reasons=print_options["include_reasons"],
+            )
+
     # Print summary
-    passed = sum(1 for _, success in results if success)
+    passed = sum(1 for _, success, _ in results if success)
     total = len(results)
-    failed_results = [(name, success) for name, success in results if not success]
+    failed_results = [(name, success, _) for name, success, _ in results if not success]
 
     if verbose:
         print(f"\n{'=' * 60}")
         print("EVALUATION SUMMARY")
         print(f"{'=' * 60}")
 
-        for name, success in results:
+        for name, success, _ in results:
             status = "PASSED" if success else "FAILED"
             print(f" {name}: {status}")
 
@@ -99,7 +87,7 @@ async def run_all_evaluations_and_print_results(
     # Only show failed evaluations when not verbose
     elif failed_results:
         print("\nFailed evaluations:")
-        for name, _ in failed_results:
+        for name, _, _ in failed_results:
             print(f" {name}: FAILED")
 
     # Exit with non-zero code if any evaluations failed
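The new result shape is a three-tuple of `(dataset.name, success, report-or-None)`. A hedged sketch of driving the refactored discovery and run APIs end to end (the directory name and option values are illustrative):

```python
# Hypothetical driver for the refactored discovery + run_evals API shown above.
import asyncio
from pathlib import Path

from aixtools.evals.discovery import discover_all_datasets, find_eval_files
from aixtools.evals.run_evals import run_all_evaluations_and_print_results


async def main() -> None:
    datasets = discover_all_datasets(find_eval_files(Path("evals")), None, verbose=False)
    await run_all_evaluations_and_print_results(
        datasets,
        print_options={
            "include_input": True,
            "include_output": True,
            "include_evaluator_failures": False,
            "include_reasons": False,
        },
        min_assertions=1.0,  # matches the CLI default
        verbose=False,
    )


asyncio.run(main())  # exits non-zero if any evaluation failed
```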
aixtools/utils/config.py CHANGED
@@ -56,7 +56,6 @@ else:
     logging.error("No '.env' file found in any of the search paths, or their parents: %s", env_dirs)
     sys.exit(1)
 
-
 # ---
 # Directories
 # ---
@@ -124,7 +123,17 @@ GOOGLE_CLOUD_LOCATION = get_variable_env("GOOGLE_CLOUD_LOCATION", True)
 
 # vault parameters.
 VAULT_ADDRESS = get_variable_env("VAULT_ADDRESS", default="http://localhost:8200")
-VAULT_TOKEN = get_variable_env("VAULT_TOKEN", default="vault-token")
-VAULT_ENV = get_variable_env("ENV", default="dev")
-VAULT_MOUNT_POINT = get_variable_env("VAULT_MOUNT_POINT", default="secret")
-VAULT_PATH_PREFIX = get_variable_env("VAULT_PATH_PREFIX", default="path")
+VAULT_TOKEN = get_variable_env("VAULT_TOKEN", allow_empty=True)
+VAULT_ENV = get_variable_env("ENV", allow_empty=True)
+VAULT_MOUNT_POINT = get_variable_env("VAULT_MOUNT_POINT", allow_empty=True)
+VAULT_PATH_PREFIX = get_variable_env("VAULT_PATH_PREFIX", allow_empty=True)
+
+# OAuth parameters
+APP_SECRET_ID = get_variable_env("APP_SECRET_ID")
+APP_CLIENT_ID = get_variable_env("APP_CLIENT_ID")
+
+# used for token audience check
+APP_API_ID = get_variable_env("APP_API_ID")
+APP_TENANT_ID = get_variable_env("APP_TENANT_ID")
+# used for token issuer check
+APP_ISSUER_URL = get_variable_env("APP_ISSUER_URL")
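Behavior note on the Vault settings: the placeholder defaults are gone, and with `allow_empty=True` these values can presumably come back empty when unset, so callers should guard before using them. A hedged sketch of that assumption (the helper name is hypothetical, and a `.env` file is assumed to be present since config exits without one):

```python
# Illustration only: VAULT_* settings no longer fall back to dummy defaults,
# so code using them should handle the unset/empty case explicitly
# (assumed semantics of allow_empty=True in get_variable_env).
from aixtools.utils import config

if config.VAULT_TOKEN:
    connect_to_vault(config.VAULT_ADDRESS, config.VAULT_TOKEN)  # hypothetical helper
else:
    print("Vault not configured; skipping secret lookup")
```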
aixtools-0.2.2.dist-info/METADATA → aixtools-0.2.4.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aixtools
-Version: 0.2.2
+Version: 0.2.4
 Summary: Tools for AI exploration and debugging
 Requires-Python: >=3.11.2
 Description-Content-Type: text/markdown
@@ -20,6 +20,7 @@ Requires-Dist: mypy>=1.18.2
 Requires-Dist: pandas>=2.2.3
 Requires-Dist: pydantic-evals>=0.4.10
 Requires-Dist: pydantic-ai>=1.0.9
+Requires-Dist: pyjwt>=2.10.1
 Requires-Dist: pylint>=3.3.7
 Requires-Dist: rich>=14.0.0
 Requires-Dist: ruff>=0.11.6
@@ -415,30 +416,30 @@ By default, the "FaultyMCP" includes several tools you can use in your tests:
 
 ### Evals
 
-Run comprehensive Agent/LLM evaluations using the built-in evaluation discovery based on Pydantic-AI framework.
+Run comprehensive Agent/LLM evaluations using the built-in evaluation discovery based on Pydantic-AI framework with AIXtools enhancements.
 
 ```bash
 # Run all evaluations
-evals
+python -m aixtools.evals
 
 # Run evaluations with filtering
-evals --filter "specific_test"
+python -m aixtools.evals --filter "specific_test"
 
 # Run with verbose output and detailed reporting
-evals --verbose --include-input --include-output --include-reasons
+python -m aixtools.evals --verbose --include-input --include-output --include-reasons
 
 # Specify custom evaluations directory
-evals --evals-dir /path/to/evals
+python -m aixtools.evals --evals-dir /path/to/evals
 
 # Set minimum assertions threshold
-evals --min-assertions 0.8
+python -m aixtools.evals --min-assertions 0.8
 ```
 
 **Command Line Options:**
 - `--evals-dir` - Directory containing eval_*.py files (default: evals)
 - `--filter` - Filter to run only matching evaluations
-- `--include-input` - Include input in report output
-- `--include-output` - Include output in report output
+- `--include-input` - Include input in report output (default: True)
+- `--include-output` - Include output in report output (default: True)
 - `--include-evaluator-failures` - Include evaluator failures in report
 - `--include-reasons` - Include reasons in report output
 - `--min-assertions` - Minimum assertions average required for success (default: 1.0)
@@ -446,14 +447,16 @@ evals --min-assertions 0.8
 
 The evaluation system discovers and runs all Dataset objects from eval_*.py files in the specified directory, similar to test runners but specifically designed for LLM evaluations using pydantic_evals.
 
-**Discovery Mechanism:**
+**Discovery Mechanism**
 
-The evaluation framework uses an automatic discovery system that:
+The evaluation framework uses an automatic discovery system:
 
 1. **File Discovery**: Scans the specified directory for files matching the pattern `eval_*.py`
 2. **Dataset Discovery**: Within each file, looks for variables named `dataset_*` that are instances of `pydantic_evals.Dataset`
-3. **Target Function Discovery**: Automatically finds the first async function in each module that doesn't start with an underscore (`_`) to use as the evaluation target
-4. **Filtering**: Supports filtering by module name, file name, dataset name, or fully qualified name
+3. **Target Function Discovery**: Within the same file, looks for a function or async function named `target_*`. There must be exactly one target function per file.
+4. **Function Discovery**: Looks for functions with specific prefixes:
+   - Functions prefixed with `scorer_*` or `evaluator_*` supply custom scorer and evaluator functions that will be used for each dataset in that file
+5. **Filtering**: Supports filtering by module name, file name, dataset name, or fully qualified name
 
 **Example Evaluation File Structure:**
 ```python
@@ -471,11 +474,16 @@ dataset_addition = Dataset(
 )
 
 # This function will be used as the evaluation target
-async def evaluate_math_agent(input_text: str) -> str:
-    # Your agent evaluation logic here
+async def target_math_agent(input_text: str) -> str:
+    # Your agent run logic here
     agent = get_agent(system_prompt="You are a math assistant.")
     result, _ = await run_agent(agent, input_text)
     return result
+
+# This function will be used as evaluator for all datasets (optional)
+def evaluator_check_output(ctx: EvaluatorContext) -> bool:
+    # Your result evaluation logic here
+    return ctx.output == ctx.expected_output
 ```
 
 
@@ -484,6 +492,34 @@ The discovery system will:
 - Use `evaluate_math_agent` as the target function for evaluation
 - Run each case through the target function and evaluate results
 
+#### Name-Based Discovery
+
+The evaluation system uses name-based discovery for all components:
+
+**Target Functions** (exactly one required per eval file):
+- **Purpose**: The main function being evaluated - processes inputs and returns outputs
+- **Naming**: Functions named `target_*` (e.g., `target_my_function`)
+- **Signature**: `def target_name(inputs: InputType) -> OutputType` or `async def target_name(inputs: InputType) -> OutputType`
+- **Example**: `async def target_math_agent(input_text: str) -> str`
+
+**Scoring Functions** (optional):
+- **Purpose**: Determine if evaluation results meet success criteria
+- **Naming**: Functions named `scorer_*` (e.g., `scorer_custom`)
+- **Signature**: `def scorer_name(report: EvaluationReport, dataset: AixDataset, min_score: float = 1.0, verbose: bool = False) -> bool`
+- **Example**: `def scorer_accuracy_threshold(report, dataset, min_score=0.8, verbose=False) -> bool`
+
+**Evaluator Functions** (optional):
+- **Purpose**: Custom evaluation logic for comparing outputs with expected results
+- **Naming**: Functions named `evaluator_*` (e.g., `evaluator_check_output`)
+- **Signature**: `def evaluator_name(ctx: EvaluatorContext) -> EvaluatorOutput` or `async def evaluator_name(ctx: EvaluatorContext) -> EvaluatorOutput`
+- **Example**: `def evaluator_exact_match(ctx) -> EvaluatorOutput`
+
+This name-based approach works seamlessly with both synchronous and asynchronous functions.
+
+#### Scoring System
+
+The framework includes a custom scoring system with [`average_assertions`](aixtools/evals/dataset.py:67) as the default scorer. This scorer checks if the average assertion score meets a minimum threshold and provides detailed pass/fail reporting.
+
 ## Testing & Tools
 
 AIXtools provides comprehensive testing utilities and diagnostic tools for AI agent development and debugging.
aixtools-0.2.2.dist-info/RECORD → aixtools-0.2.4.dist-info/RECORD RENAMED
@@ -1,5 +1,5 @@
 aixtools/__init__.py,sha256=9NGHm7LjsQmsvjTZvw6QFJexSvAU4bCoN_KBk9SCa00,260
-aixtools/_version.py,sha256=o3ZTescp-19Z9cvBGq9dQnbppljgzdUYUf98Nov0spY,704
+aixtools/_version.py,sha256=NRw4Jle4n9v_DD2wtplRqflGCvX8OU5eAjycYY0vY3Y,704
 aixtools/app.py,sha256=JzQ0nrv_bjDQokllIlGHOV0HEb-V8N6k_nGQH-TEsVU,5227
 aixtools/chainlit.md,sha256=yC37Ly57vjKyiIvK4oUvf4DYxZCwH7iocTlx7bLeGLU,761
 aixtools/context.py,sha256=I_MD40ZnvRm5WPKAKqBUAdXIf8YaurkYUUHSVVy-QvU,598
@@ -30,15 +30,18 @@ aixtools/agents/agent.py,sha256=tceQByn-RGBIhW8BOjKoP0yhNzZLwAa6CxwhPhRe3PU,7270
 aixtools/agents/agent_batch.py,sha256=0Zu9yNCRPAQZPjXQ-dIUAmP1uGTVbxVt7xvnMpoJMjU,2251
 aixtools/agents/print_nodes.py,sha256=wVTngNfqM0As845WTRz6G3Rei_Gr3HuBlvu-G_eXuig,1665
 aixtools/agents/prompt.py,sha256=p9OYnyJ4-MyGXwHPrQeJBhZ2a3RV2HqhtdUUCrTMsAQ,3361
+aixtools/auth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+aixtools/auth/auth.py,sha256=aKYCKJRjSNrVZmIWN2h2p1zYqkhMLLBXBfk_Qy5NKik,2365
 aixtools/compliance/__init__.py,sha256=vnw0zEdySIJWvDAJ8DCRRaWmY_agEOz1qlpAdhmtiuo,191
 aixtools/compliance/private_data.py,sha256=OOM9mIp3_w0fNgj3VAEWBl7-jrPc19_Ls1pC5dfF5UY,5323
 aixtools/db/__init__.py,sha256=b8vRhme3egV-aUZbAntnOaDkSXB8UT0Xy5oqQhU_z0Q,399
 aixtools/db/database.py,sha256=caWe95GlxZYlxn2ubDmR-_cQUW0ulkpR3BHunKIaOsw,3369
 aixtools/db/vector_db.py,sha256=be4JGyXj3o8VEfy9L6SO1aAoDET_zazMJkYfjlYHTYQ,4133
 aixtools/evals/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-aixtools/evals/discovery.py,sha256=nKBMHuM3Q87GFY4U0QXvU-zXmJjR-bnmlwf5cfp5E9s,5907
-aixtools/evals/evals.py,sha256=3oJ6_HjojLOzG8XxdfdTYFk-gxep41nk_viHTsUwFNo,2738
-aixtools/evals/run_evals.py,sha256=oJpGIPF5avq1r275Yd_RJyxUiUgOd00LItdKXfGlAbA,3910
+aixtools/evals/__main__.py,sha256=f6_X5jHFHIR36r_jerj3ol8SLhZB8nVk8vffoRVtxLs,2844
+aixtools/evals/dataset.py,sha256=qsLrx9hgvZWY1FzuwYtm3aJscNth1EkeLWTgIdici5Q,3374
+aixtools/evals/discovery.py,sha256=gHKfutzdtjZPkjxXnD_WYHqL2WMem8kkJpk2HDHIgKg,6488
+aixtools/evals/run_evals.py,sha256=J5sfdfC_2NwQcRP4mGV4FYSpzawRe4pmkOTjUm1IAWU,3278
 aixtools/google/client.py,sha256=8yuv_zEZKlmUTI-zRxAb3vjLUrfiwrBhcpNe0hYsO0g,1078
 aixtools/log_view/__init__.py,sha256=0fWLCq9BMo8GoH3Z5WDgvf0-J2TP0XWqtef0f28SHBA,405
 aixtools/log_view/app.py,sha256=DZp3PUM_iS3DpMHqHfFXVACvbZ9PItbOCNMkDjIOfTc,6595
@@ -76,7 +79,7 @@ aixtools/tools/doctor/mcp_tool_doctor.py,sha256=sX2q5GfNkmUYxnXrqMpeGIwGfeL1LpYJ
 aixtools/tools/doctor/tool_doctor.py,sha256=EY1pshjLGLD0j6cc1ZFtbc0G19I5IbOZwHFDqypE49Q,2661
 aixtools/tools/doctor/tool_recommendation.py,sha256=LYyVOSXdAorWiY4P-ucSA1vLlV5BTEfX4GzBXNE_X0M,1569
 aixtools/utils/__init__.py,sha256=xT6almZBQYMfj4h7Hq9QXDHyVXbOOTxqLsmJsxYYnSw,757
-aixtools/utils/config.py,sha256=JeUbGls1womGZWIp6gPBT0IoAfrljpscKEoKx2eBXjw,4819
+aixtools/utils/config.py,sha256=t32731F53Cv1YYoX95wksoreE0Zn0B8UKyEiKWne4ec,5147
 aixtools/utils/config_util.py,sha256=3Ya4Qqhj1RJ1qtTTykQ6iayf5uxlpigPXgEJlTi1wn4,2229
 aixtools/utils/enum_with_description.py,sha256=zjSzWxG74eR4x7dpmb74pLTYCWNSMvauHd7_9LpDYIw,1088
 aixtools/utils/files.py,sha256=8JnxwHJRJcjWCdFpjzWmo0po2fRg8esj4H7sOxElYXU,517
@@ -86,8 +89,8 @@ aixtools/utils/chainlit/cl_agent_show.py,sha256=vaRuowp4BRvhxEr5hw0zHEJ7iaSF_5bo
 aixtools/utils/chainlit/cl_utils.py,sha256=fxaxdkcZg6uHdM8uztxdPowg3a2f7VR7B26VPY4t-3c,5738
 aixtools/vault/__init__.py,sha256=fsr_NuX3GZ9WZ7dGfe0gp_5-z3URxAfwVRXw7Xyc0dU,141
 aixtools/vault/vault.py,sha256=9dZLWdZQk9qN_Q9Djkofw9LUKnJqnrX5H0fGusVLBhA,6037
-aixtools-0.2.2.dist-info/METADATA,sha256=uF-hTQvikYFOiybcQY5Dj1Vc20ubJndbWKB8aytBo6c,24951
-aixtools-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-aixtools-0.2.2.dist-info/entry_points.txt,sha256=q8412TG4T0S8K0SKeWp2vkVPIDYQs0jNoHqcQ7qxOiA,155
-aixtools-0.2.2.dist-info/top_level.txt,sha256=wBn-rw9bCtxrR4AYEYgjilNCUVmKY0LWby9Zan2PRJM,9
-aixtools-0.2.2.dist-info/RECORD,,
+aixtools-0.2.4.dist-info/METADATA,sha256=EzZB-SOZLdj5QZDkk3YPu0PCipJOiAKT08xJNWKENfg,27229
+aixtools-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+aixtools-0.2.4.dist-info/entry_points.txt,sha256=q8412TG4T0S8K0SKeWp2vkVPIDYQs0jNoHqcQ7qxOiA,155
+aixtools-0.2.4.dist-info/top_level.txt,sha256=wBn-rw9bCtxrR4AYEYgjilNCUVmKY0LWby9Zan2PRJM,9
+aixtools-0.2.4.dist-info/RECORD,,