aixtools 0.1.11__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of aixtools might be problematic.
- aixtools/_version.py +2 -2
- aixtools/agents/agent.py +26 -7
- aixtools/agents/print_nodes.py +54 -0
- aixtools/agents/prompt.py +2 -2
- aixtools/compliance/private_data.py +1 -1
- aixtools/evals/__init__.py +0 -0
- aixtools/evals/discovery.py +174 -0
- aixtools/evals/evals.py +74 -0
- aixtools/evals/run_evals.py +110 -0
- aixtools/logging/log_objects.py +24 -23
- aixtools/mcp/client.py +46 -1
- aixtools/server/__init__.py +0 -6
- aixtools/server/path.py +88 -31
- aixtools/testing/aix_test_model.py +7 -1
- aixtools/tools/doctor/mcp_tool_doctor.py +79 -0
- aixtools/tools/doctor/tool_doctor.py +4 -0
- aixtools/tools/doctor/tool_recommendation.py +5 -0
- aixtools/utils/config.py +0 -1
- {aixtools-0.1.11.dist-info → aixtools-0.2.1.dist-info}/METADATA +185 -30
- {aixtools-0.1.11.dist-info → aixtools-0.2.1.dist-info}/RECORD +23 -18
- aixtools-0.2.1.dist-info/entry_points.txt +4 -0
- aixtools/server/workspace_privacy.py +0 -65
- aixtools-0.1.11.dist-info/entry_points.txt +0 -2
- {aixtools-0.1.11.dist-info → aixtools-0.2.1.dist-info}/WHEEL +0 -0
- {aixtools-0.1.11.dist-info → aixtools-0.2.1.dist-info}/top_level.txt +0 -0
aixtools/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.11'
-__version_tuple__ = version_tuple = (0, 1, 11)
+__version__ = version = '0.2.1'
+__version_tuple__ = version_tuple = (0, 2, 1)
 
 __commit_id__ = commit_id = None
aixtools/agents/agent.py
CHANGED
@@ -5,10 +5,11 @@ Core agent implementation providing model selection and configuration for AI age
 from types import NoneType
 from typing import Any
 
+from fastmcp import Context
 from openai import AsyncAzureOpenAI
 from pydantic_ai import Agent
 from pydantic_ai.models.bedrock import BedrockConverseModel
-from pydantic_ai.models.openai import
+from pydantic_ai.models.openai import OpenAIChatModel
 from pydantic_ai.providers.bedrock import BedrockProvider
 from pydantic_ai.providers.openai import OpenAIProvider
 from pydantic_ai.settings import ModelSettings
@@ -54,14 +55,14 @@ def _get_model_ollama(model_name=OLLAMA_MODEL_NAME, ollama_url=OLLAMA_URL):
     assert ollama_url, "OLLAMA_URL is not set"
     assert model_name, "Model name is not set"
     provider = OpenAIProvider(base_url=ollama_url)
-    return
+    return OpenAIChatModel(model_name=model_name, provider=provider)
 
 
 def _get_model_openai(model_name=OPENAI_MODEL_NAME, openai_api_key=OPENAI_API_KEY):
     assert openai_api_key, "OPENAI_API_KEY is not set"
     assert model_name, "Model name is not set"
     provider = OpenAIProvider(api_key=openai_api_key)
-    return
+    return OpenAIChatModel(model_name=model_name, provider=provider)
 
 
 def _get_model_openai_azure(
@@ -77,7 +78,7 @@ def _get_model_openai_azure(
     client = AsyncAzureOpenAI(
         azure_endpoint=azure_openai_endpoint, api_version=azure_openai_api_version, api_key=azure_openai_api_key
     )
-    return
+    return OpenAIChatModel(model_name=model_name, provider=OpenAIProvider(openai_client=client))
 
 
 def _get_model_open_router(
@@ -87,7 +88,7 @@ def _get_model_open_router(
     assert openrouter_api_key, "OPENROUTER_API_KEY is not set"
     assert model_name, "Model name is not set, missing 'OPENROUTER_MODEL_NAME' environment variable?"
     provider = OpenAIProvider(base_url=openrouter_api_url, api_key=openrouter_api_key)
-    return
+    return OpenAIChatModel(model_name, provider=provider)
 
 
 def get_model(model_family=MODEL_FAMILY, model_name=None, **kwargs):
@@ -146,8 +147,22 @@ async def run_agent( # noqa: PLR0913, pylint: disable=too-many-arguments,too-ma
     debug: bool = False,
     log_model_requests: bool = False,
     parent_logger: ObjectLogger | None = None,
+    ctx: Context | None = None,
 ):
-    """
+    """
+    Run the agent with the given prompt and log the execution details.
+    Args:
+        agent (Agent): The PydanticAI agent to run.
+        prompt (str | list[str]): The input prompt(s) for the agent.
+        usage_limits (UsageLimits | None): Optional usage limits for the agent.
+        verbose (bool): If True, enables verbose logging.
+        debug (bool): If True, enables debug logging.
+        log_model_requests (bool): If True, logs model requests and responses.
+        parent_logger (ObjectLogger | None): Optional parent logger for hierarchical logging.
+        ctx (Context | None): Optional FastMCP context for logging messages to the MCP client.
+    Returns:
+        tuple[final_output, nodes]: A tuple containing the agent's final output and a list of all logged nodes.
+    """
     # Results
     nodes, result = [], None
     async with agent.iter(prompt, usage_limits=usage_limits) as agent_run:
@@ -158,7 +173,11 @@ async def run_agent( # noqa: PLR0913, pylint: disable=too-many-arguments,too-ma
         agent.model = model_patch_logging(agent.model, agent_logger)
         # Run the agent
         async for node in agent_run:
-            agent_logger.log(node)
+            await agent_logger.log(node)  # Log each node
+            if ctx:
+                # If we are executing in an MCP server, send info messages to the client for better debugging
+                server_name = ctx.fastmcp.name
+                await ctx.info(f"MCP server {server_name}: {node}")
             nodes.append(node)
         result = agent_run.result
     return result.output if result else None, nodes
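The new ctx parameter lets run_agent forward per-node progress to an MCP client via ctx.info(). As a rough usage sketch (not part of the diff; the tool name is hypothetical, the import path follows the file layout above, and run_agent's positional arguments follow its docstring):

import asyncio

from fastmcp import Context, FastMCP
from pydantic_ai import Agent

from aixtools.agents.agent import get_model, run_agent

mcp = FastMCP("demo-server")


@mcp.tool()
async def summarize(text: str, ctx: Context) -> str:
    """Hypothetical MCP tool that runs an agent and streams node info back to the client."""
    agent = Agent(get_model(), system_prompt="Summarize the input text in one sentence.")
    output, nodes = await run_agent(agent, text, ctx=ctx)  # each node is also logged through the agent's logger
    return output or ""


if __name__ == "__main__":
    mcp.run()

Each node the agent processes then shows up both in the local object log and as an info message on the connected MCP client.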
aixtools/agents/print_nodes.py
ADDED
@@ -0,0 +1,54 @@
+"""Utility functions to print nodes and their parts in a readable format."""
+
+from pydantic_ai import CallToolsNode, ModelRequestNode, UserPromptNode
+from pydantic_ai.messages import TextPart, ToolCallPart
+from pydantic_graph.nodes import End
+
+
+def tab(s, prefix: str = "\t|") -> str:
+    """ "Tab a string with a given prefix (default is tab + pipe)."""
+    return prefix + str(s).replace("\n", "\n" + prefix)
+
+
+def part2str(p, prefix: str = "\t"):
+    """Convert a Part to a string representation."""
+    match p:
+        case ToolCallPart():
+            return f"{prefix}Tool: {p.tool_name}, args: {p.args}"
+        case TextPart():
+            return f"{prefix}Text: {tab(p.content)}"
+        case _:
+            return f"{prefix}Part {type(p)}: {p}"
+
+
+def print_parts(parts, prefix: str = ""):
+    """Print a list of Parts with a given prefix."""
+    if len(parts) == 0:
+        print(f"{prefix}No parts")
+        return
+    if len(parts) == 1:
+        print(part2str(parts[0], prefix=prefix))
+        return
+    for p in parts:
+        print(f"{part2str(p, prefix=prefix)}")
+
+
+def print_node(n):
+    """Print a node in a readable format."""
+    match n:
+        case UserPromptNode():
+            print(f"Prompt:\n{tab(n.user_prompt)}")
+        case CallToolsNode():
+            print_parts(n.model_response.parts)
+        case ModelRequestNode():
+            print(f"Model request: ~ {len(str(n))} chars")
+        case End():
+            pass  # print(f"End:\n{tab(n.data.output)}")
+        case _:
+            print(f"{type(n)}: {n}")
+
+
+def print_nodes(nodes):
+    """Print a list of nodes in a readable format."""
+    for n in nodes:
+        print_node(n)
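A short sketch (assuming the import paths above; not part of the diff) of how the nodes returned by run_agent can be rendered with these helpers:

import asyncio

from pydantic_ai import Agent

from aixtools.agents.agent import get_model, run_agent
from aixtools.agents.print_nodes import print_nodes


async def demo() -> None:
    agent = Agent(get_model(), system_prompt="Answer briefly.")
    output, nodes = await run_agent(agent, "What is 2 + 2?")
    print_nodes(nodes)  # prompt, tool calls, text parts and model requests in readable form
    print(f"Final output: {output}")


asyncio.run(demo())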
aixtools/agents/prompt.py
CHANGED
@@ -1,7 +1,7 @@
 """Prompt building utilities for Pydantic AI agent, including file handling and context management."""
 
 import mimetypes
-from pathlib import Path
+from pathlib import Path
 
 from pydantic_ai import BinaryContent
 
@@ -78,7 +78,7 @@ def build_user_input(
     binary_attachments = []
 
     for workspace_path in file_paths:
-        host_path = container_to_host_path(
+        host_path = container_to_host_path(workspace_path, ctx=session_tuple)
         file_size = host_path.stat().st_size
         mime_type, _ = mimetypes.guess_type(host_path)
         mime_type = mime_type or "application/octet-stream"
aixtools/compliance/private_data.py
CHANGED
@@ -88,7 +88,7 @@ class PrivateData:
 
     def _get_private_data_path(self) -> Path:
        """Get the path to the private data file in the workspace."""
-        return get_workspace_path(
+        return get_workspace_path(ctx=self.ctx) / PRIVATE_DATA_FILE
 
    def _has_private_data_file(self) -> bool:
        """Check if the private data file exists in the workspace."""
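The context lines above show the attachment pattern build_user_input follows: resolve the workspace path to a host path, then guess a MIME type with a generic fallback. A rough standalone sketch of that pattern (hypothetical path; the exact BinaryContent construction in build_user_input is not shown in this hunk):

import mimetypes
from pathlib import Path

from pydantic_ai import BinaryContent

host_path = Path("/tmp/workspace/report.pdf")         # hypothetical resolved host path
mime_type, _ = mimetypes.guess_type(host_path)        # e.g. "application/pdf"
mime_type = mime_type or "application/octet-stream"   # generic fallback for unknown types
attachment = BinaryContent(data=host_path.read_bytes(), media_type=mime_type)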
aixtools/evals/__init__.py
ADDED
File without changes
aixtools/evals/discovery.py
ADDED
@@ -0,0 +1,174 @@
+"""
+Dataset discovery functionality for LLM evaluations.
+
+This module handles discovering and loading Dataset objects from eval_*.py files.
+"""
+
+import importlib.util
+import inspect
+import sys
+import traceback
+from pathlib import Path
+from typing import Any
+
+from pydantic_evals.dataset import Dataset
+
+
+def find_eval_files(evals_dir: Path) -> list[Path]:
+    """Find all eval_*.py files in the evals directory."""
+    if not evals_dir.exists():
+        print(f"Error: Evals directory '{evals_dir}' does not exist")
+        sys.exit(1)
+
+    eval_files = list(evals_dir.glob("eval_*.py"))
+    if not eval_files:
+        print(f"No eval_*.py files found in '{evals_dir}'")
+        sys.exit(1)
+
+    return eval_files
+
+
+def find_datasets_in_module(module: Any) -> list[tuple[str, Dataset]]:
+    """Find all Dataset objects with names matching dataset_* in a module."""
+    datasets = []
+
+    for name, obj in inspect.getmembers(module):
+        if name.startswith("dataset_") and isinstance(obj, Dataset):
+            datasets.append((name, obj))
+
+    return datasets
+
+
+def load_module_from_file(file_path: Path) -> Any:
+    """Load a Python module from a file path."""
+    module_name = file_path.stem
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Could not load module from {file_path}")
+
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def matches_filter(module_name: str, file_name: str, dataset_name: str, name_filter: str | None) -> bool:
+    """Check if the dataset matches the name filter."""
+    if name_filter is None:
+        return True
+
+    # Check if filter matches any of: module name, file name, dataset name, or full qualified name
+    full_name = f"{module_name}.{dataset_name}"
+    return (
+        name_filter in module_name
+        or name_filter in file_name
+        or name_filter in dataset_name
+        or name_filter in full_name
+    )
+
+
+def find_target_function(module: Any) -> Any | None:
+    """Find the first async function in a module that doesn't start with underscore."""
+    for name, obj in inspect.getmembers(module):
+        if inspect.iscoroutinefunction(obj) and not name.startswith("_"):
+            return obj
+    return None
+
+
+def get_async_function_names(module: Any) -> list[str]:
+    """Get names of all async functions in a module that don't start with underscore."""
+    return [
+        name
+        for name, obj in inspect.getmembers(module)
+        if inspect.iscoroutinefunction(obj) and not name.startswith("_")
+    ]
+
+
+def process_datasets_from_module(
+    module: Any, eval_file: Path, name_filter: str | None, verbose: bool
+) -> list[tuple[str, Dataset, Any]]:
+    """Process all datasets from a single module and return valid dataset tuples."""
+    datasets = find_datasets_in_module(module)
+    if verbose:
+        print(f" Found {len(datasets)} datasets: {[name for name, _ in datasets]}")
+
+    valid_datasets = []
+
+    for dataset_name, dataset in datasets:
+        full_name = f"{eval_file.stem}.{dataset_name}"
+
+        if not matches_filter(module.__name__, eval_file.stem, dataset_name, name_filter):
+            if verbose:
+                print(f" ✗ Skipping dataset: {dataset_name} (doesn't match filter: {name_filter})")
+            continue
+
+        if verbose:
+            print(f" ✓ Including dataset: {dataset_name}")
+
+        # Find the target function
+        target_function = find_target_function(module)
+        async_functions = get_async_function_names(module)
+
+        if verbose:
+            print(f" Found async functions: {async_functions}")
+            if target_function:
+                print(f" Using target function: {target_function.__name__}")
+
+        if target_function is None:
+            if verbose:
+                print(f"Warning: No async function found in {eval_file.name} for dataset {dataset_name}")
+            continue
+
+        valid_datasets.append((full_name, dataset, target_function))
+
+    return valid_datasets
+
+
+def discover_all_datasets(
+    eval_files: list[Path], name_filter: str | None, verbose: bool
+) -> list[tuple[str, Dataset, Any]]:
+    """Discover all datasets from eval files."""
+    all_datasets = []
+
+    for eval_file in eval_files:
+        if verbose:
+            print(f"\nProcessing file: {eval_file}")
+
+        try:
+            module = load_module_from_file(eval_file)
+            if verbose:
+                print(f" Loaded module: {module.__name__}")
+
+            datasets = process_datasets_from_module(module, eval_file, name_filter, verbose)
+            all_datasets.extend(datasets)
+
+        except Exception as e:  # pylint: disable=W0718
+            if verbose:
+                print(f"Error loading {eval_file}: {e}")
+                print(f" Traceback: {traceback.format_exc()}")
+            continue
+
+    # Check if any datasets were found
+    if not all_datasets:
+        print("No datasets found to evaluate")
+        if verbose:
+            print("This could be because:")
+            print(" - No eval_*.py files contain dataset_* objects")
+            print(" - The filter excluded all datasets")
+            print(" - There were errors loading the modules")
+        sys.exit(1)
+
+    # Print summary of discovered datasets
+    if verbose:
+        print(f"\n{'=' * 60}")
+        print("Datasets to Evaluate:")
+        print(f"{'=' * 60}")
+        for i, (dataset_name, dataset, target_function) in enumerate(all_datasets, 1):
+            print(f"{i}. {dataset_name}")
+            print(f" Target function: {target_function.__name__}")
+            print(f" Cases: {len(dataset.cases)}")
+            print(f" Evaluators: {len(dataset.evaluators)}")
+        print(f"{'=' * 60}")
+    else:
+        print(f"Found {len(all_datasets)} datasets to evaluate")
+
+    return all_datasets
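The discovery logic collects module-level dataset_* objects and uses the first public async function in the file as the evaluation target. Below is a hypothetical evals/eval_math.py (not part of the package) that discover_all_datasets would pick up; the file name, dataset name, and evaluator choice are illustrative only:

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EqualsExpected


async def answer_question(question: str) -> str:
    """Target function: normally this would call an agent; kept trivial for the sketch."""
    return "4" if question == "What is 2 + 2?" else "unknown"


dataset_math = Dataset(
    cases=[Case(name="addition", inputs="What is 2 + 2?", expected_output="4")],
    evaluators=[EqualsExpected()],
)

Because the dataset name starts with dataset_ and answer_question is the only public coroutine, the pair would be returned as ("eval_math.dataset_math", dataset_math, answer_question).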
aixtools/evals/evals.py
ADDED
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+"""
+Script to run all LLM evaluations.
+
+This script discovers and runs all Dataset objects from eval_*.py files in the evals directory.
+Similar to test runners but for LLM evaluations using pydantic_evals.
+"""
+
+import argparse
+import asyncio
+import sys
+from pathlib import Path
+
+from .discovery import discover_all_datasets, find_eval_files
+from .run_evals import run_all_evaluations_and_print_results
+
+
+async def main():
+    """Main function to discover and run all evaluations."""
+    parser = argparse.ArgumentParser(description="Run LLM evaluations")
+    parser.add_argument(
+        "--evals-dir", type=Path, default=Path("evals"), help="Directory containing eval_*.py files (default: evals)"
+    )
+    parser.add_argument(
+        "--filter", type=str, help="Filter to run only matching evaluations (matches module, file, or dataset names)"
+    )
+    parser.add_argument("--include-input", action="store_true", help="Include input in report output")
+    parser.add_argument("--include-output", action="store_true", help="Include output in report output")
+    parser.add_argument(
+        "--include-evaluator-failures", action="store_true", help="Include evaluator failures in report output"
+    )
+    parser.add_argument("--include-reasons", action="store_true", help="Include reasons in report output")
+    parser.add_argument(
+        "--min-assertions",
+        type=float,
+        default=1.0,
+        help="Minimum assertions average required for success (default: 1.0)",
+    )
+    parser.add_argument(
+        "--verbose", action="store_true", help="Print detailed information about discovery and processing"
+    )
+
+    args = parser.parse_args()
+
+    # Prepare print options
+    print_options = {
+        "include_input": args.include_input,
+        "include_output": args.include_output,
+        "include_evaluator_failures": args.include_evaluator_failures,
+        "include_reasons": args.include_reasons,
+    }
+
+    # Find all eval files
+    eval_files = find_eval_files(args.evals_dir)
+    if args.verbose:
+        print(f"Scanning directory: {args.evals_dir}")
+        print(f"Found {len(eval_files)} eval files:")
+        for f in eval_files:
+            print(f" - {f}")
+
+    # Discover all datasets
+    all_datasets = discover_all_datasets(eval_files, args.filter, args.verbose)
+
+    if args.filter and not args.verbose:
+        print(f"Filter applied: {args.filter}")
+
+    # Run all evaluations and print results
+    await run_all_evaluations_and_print_results(all_datasets, print_options, args.min_assertions, args.verbose)
+
+
+if __name__ == "__main__":
+    # Add the current directory to Python path so we can import modules
+    sys.path.insert(0, str(Path.cwd()))
+    asyncio.run(main())
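The same pipeline can also be driven programmatically instead of through the argparse entry point. A minimal sketch (not part of the diff) using the functions defined above, with import paths assumed from the package layout:

import asyncio
from pathlib import Path

from aixtools.evals.discovery import discover_all_datasets, find_eval_files
from aixtools.evals.run_evals import run_all_evaluations_and_print_results


async def run_my_evals() -> None:
    eval_files = find_eval_files(Path("evals"))               # all evals/eval_*.py files
    datasets = discover_all_datasets(eval_files, None, True)  # no name filter, verbose output
    await run_all_evaluations_and_print_results(
        datasets,
        {
            "include_input": False,
            "include_output": False,
            "include_evaluator_failures": False,
            "include_reasons": True,
        },
        min_assertions=1.0,  # require a perfect assertions average
        verbose=True,
    )


asyncio.run(run_my_evals())

Note that find_eval_files, discover_all_datasets, and the results printer all call sys.exit on failure, so this is best run as a standalone script; the new entry_points.txt presumably wires up a console command for the same flow.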
aixtools/evals/run_evals.py
ADDED
@@ -0,0 +1,110 @@
+"""
+Evaluation execution functionality for LLM evaluations.
+
+This module handles running evaluations and printing results.
+"""
+
+import sys
+from typing import Any
+
+from pydantic_evals.dataset import Dataset
+
+
+async def run_dataset_evaluation( # noqa: PLR0913, pylint: disable=too-many-arguments,too-many-positional-arguments
+    dataset_name: str,
+    dataset: Dataset,
+    target_function: Any,
+    print_options: dict[str, bool],
+    min_assertions: float,
+    verbose: bool = False,
+) -> tuple[str, bool]:
+    """Run evaluation for a single dataset and return (name, success)."""
+    if verbose:
+        print(f"\n{'=' * 60}")
+        print(f"Running evaluation: {dataset_name}")
+        print(f"{'=' * 60}")
+    else:
+        print(f"Running {dataset_name}...", end=" ")
+
+    try:
+        # Execute the evaluation
+        report = await dataset.evaluate(target_function)
+
+        # Print the results
+        report.print(
+            include_input=print_options["include_input"],
+            include_output=print_options["include_output"],
+            include_evaluator_failures=print_options["include_evaluator_failures"],
+            include_reasons=print_options["include_reasons"],
+        )
+
+        # Check if evaluation passed based on assertions average
+        averages = report.averages()
+        if averages and averages.assertions is not None:
+            success = averages.assertions >= min_assertions
+            if verbose:
+                print(f"\nEvaluation Summary for {dataset_name}:")
+                print(f" Assertions Average: {averages.assertions:.3f}")
+                print(f" Minimum Required: {min_assertions:.3f}")
+                print(f" Status: {'PASSED' if success else 'FAILED'}")
+            else:
+                print(f"{'PASSED' if success else 'FAILED'} ({averages.assertions:.3f})")
+        else:
+            success = False
+            if verbose:
+                print(f"\nEvaluation Summary for {dataset_name}:")
+                print(" No assertions found or evaluation failed")
+                print(f" Minimum Required: {min_assertions:.3f}")
+                print(" Status: FAILED")
+            else:
+                print("FAILED (no assertions)")
+
+        return dataset_name, success
+
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        if verbose:
+            print(f"Error running evaluation {dataset_name}: {e}")
+        else:
+            print(f"ERROR ({e})")
+        return dataset_name, False
+
+
+async def run_all_evaluations_and_print_results(
+    datasets: list[tuple[str, Dataset, Any]], print_options: dict[str, bool], min_assertions: float, verbose: bool
+) -> None:
+    """Run all evaluations and print results with summary."""
+    # Run all evaluations
+    results = []
+    for dataset_name, dataset, target_function in datasets:
+        result = await run_dataset_evaluation(
+            dataset_name, dataset, target_function, print_options, min_assertions, verbose
+        )
+        results.append(result)
+
+    # Print summary
+    passed = sum(1 for _, success in results if success)
+    total = len(results)
+    failed_results = [(name, success) for name, success in results if not success]
+
+    if verbose:
+        print(f"\n{'=' * 60}")
+        print("EVALUATION SUMMARY")
+        print(f"{'=' * 60}")
+
+        for name, success in results:
+            status = "PASSED" if success else "FAILED"
+            print(f" {name}: {status}")
+
+        print(f"\nTotal: {passed}/{total} evaluations passed")
+    # Only show failed evaluations when not verbose
+    elif failed_results:
+        print("\nFailed evaluations:")
+        for name, _ in failed_results:
+            print(f" {name}: FAILED")
+
+    # Exit with non-zero code if any evaluations failed
+    if passed < total:
+        print(f"\n{total - passed} evaluation(s) failed")
+        sys.exit(1)
+    else:
+        print("\nAll evaluations passed!")
aixtools/logging/log_objects.py
CHANGED
@@ -114,7 +114,26 @@ def save_objects_to_logfile(objects: list, log_dir=LOGS_DIR):
         object_logger.log(obj)
 
 
-class ObjectLogger:
+class BaseLogger:
+    """
+    Base class for loggers.
+    A context manager for logging objects.
+    """
+
+    def __init__(self, **kwargs):
+        pass
+
+    def __enter__(self):
+        pass
+
+    async def log(self, obj):
+        """Log an object to the configured destination."""
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+
+class ObjectLogger(BaseLogger):
     """
     A context manager for logging objects to a file.
     It uses pickle to save the objects and handles exceptions during the save process.
@@ -161,14 +180,14 @@ class ObjectLogger:
         self.file = open(self.log_file, "ab")  # append in binary mode
         return self
 
-    def log(self, obj):
+    async def log(self, obj):
        """
        Log an object to the file.
        It uses safe_deepcopy to ensure the object is pickleable.
        """
        if self.has_parent():
            # Delegate to the parent logger
-            self.parent_logger.log(obj)
+            await self.parent_logger.log(obj)
        else:
            try:
                if self.debug:
@@ -190,25 +209,7 @@ class ObjectLogger:
             self.file.close()
 
 
-class
-    """
-    A null logger that does nothing.
-    """
-
-    def __init__(self, **kwargs):
-        pass
-
-    def __enter__(self):
-        pass
-
-    def log(self, obj):
-        """Log an object to the configured destination."""
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        pass
-
-
-class PrintObjectLogger:
+class PrintObjectLogger(BaseLogger):
     """
     Print to stdout
     """
@@ -219,7 +220,7 @@ class PrintObjectLogger:
     def __enter__(self):
         pass
 
-    def log(self, obj):
+    async def log(self, obj):
         """Log an object using rich print for formatted output."""
         rich.print(obj, flush=True)
 
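With log() now a coroutine on every logger, call sites such as run_agent must await it. A hypothetical BaseLogger subclass illustrating the new contract (not part of the package; the import path is assumed from the file layout above):

import json

from aixtools.logging.log_objects import BaseLogger


class JsonLineLogger(BaseLogger):
    """Hypothetical logger that appends one JSON line per object instead of pickling."""

    def __init__(self, path: str, **kwargs):
        super().__init__(**kwargs)
        self.path = path

    async def log(self, obj):
        """Write a minimal JSON description of the object; repr() keeps it serializable."""
        with open(self.path, "a", encoding="utf-8") as fh:
            fh.write(json.dumps({"type": type(obj).__name__, "repr": repr(obj)}) + "\n")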