aixtools 0.2.2.tar.gz → 0.2.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of aixtools might be problematic.
Files changed (97)
  1. {aixtools-0.2.2 → aixtools-0.2.4}/PKG-INFO +51 -15
  2. {aixtools-0.2.2 → aixtools-0.2.4}/README.md +49 -14
  3. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/_version.py +3 -3
  4. aixtools-0.2.4/aixtools/auth/auth.py +70 -0
  5. aixtools-0.2.4/aixtools/evals/__init__.py +0 -0
  6. aixtools-0.2.2/aixtools/evals/evals.py → aixtools-0.2.4/aixtools/evals/__main__.py +4 -4
  7. aixtools-0.2.4/aixtools/evals/dataset.py +87 -0
  8. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/evals/discovery.py +52 -40
  9. aixtools-0.2.4/aixtools/evals/run_evals.py +98 -0
  10. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/utils/config.py +14 -5
  11. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools.egg-info/SOURCES.txt +4 -1
  12. {aixtools-0.2.2 → aixtools-0.2.4}/pyproject.toml +2 -1
  13. aixtools-0.2.2/aixtools/evals/run_evals.py +0 -110
  14. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/config.toml +0 -0
  15. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/translations/bn.json +0 -0
  16. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/translations/en-US.json +0 -0
  17. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/translations/gu.json +0 -0
  18. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/translations/he-IL.json +0 -0
  19. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/translations/hi.json +0 -0
  20. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/translations/ja.json +0 -0
  21. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/translations/kn.json +0 -0
  22. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/translations/ml.json +0 -0
  23. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/translations/mr.json +0 -0
  24. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/translations/nl.json +0 -0
  25. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/translations/ta.json +0 -0
  26. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/translations/te.json +0 -0
  27. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/.chainlit/translations/zh-CN.json +0 -0
  28. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/__init__.py +0 -0
  29. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/a2a/app.py +0 -0
  30. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/a2a/google_sdk/__init__.py +0 -0
  31. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/a2a/google_sdk/card.py +0 -0
  32. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/a2a/google_sdk/pydantic_ai_adapter/agent_executor.py +0 -0
  33. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/a2a/google_sdk/pydantic_ai_adapter/storage.py +0 -0
  34. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/a2a/google_sdk/remote_agent_connection.py +0 -0
  35. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/a2a/google_sdk/utils.py +0 -0
  36. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/a2a/utils.py +0 -0
  37. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/agents/__init__.py +0 -0
  38. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/agents/agent.py +0 -0
  39. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/agents/agent_batch.py +0 -0
  40. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/agents/print_nodes.py +0 -0
  41. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/agents/prompt.py +0 -0
  42. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/app.py +0 -0
  43. {aixtools-0.2.2/aixtools/evals → aixtools-0.2.4/aixtools/auth}/__init__.py +0 -0
  44. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/chainlit.md +0 -0
  45. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/compliance/__init__.py +0 -0
  46. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/compliance/private_data.py +0 -0
  47. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/context.py +0 -0
  48. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/db/__init__.py +0 -0
  49. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/db/database.py +0 -0
  50. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/db/vector_db.py +0 -0
  51. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/google/client.py +0 -0
  52. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/log_view/__init__.py +0 -0
  53. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/log_view/app.py +0 -0
  54. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/log_view/display.py +0 -0
  55. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/log_view/export.py +0 -0
  56. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/log_view/filters.py +0 -0
  57. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/log_view/log_utils.py +0 -0
  58. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/log_view/node_summary.py +0 -0
  59. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/logfilters/__init__.py +0 -0
  60. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/logfilters/context_filter.py +0 -0
  61. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/logging/__init__.py +0 -0
  62. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/logging/log_objects.py +0 -0
  63. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/logging/logging_config.py +0 -0
  64. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/logging/mcp_log_models.py +0 -0
  65. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/logging/mcp_logger.py +0 -0
  66. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/logging/model_patch_logging.py +0 -0
  67. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/logging/open_telemetry.py +0 -0
  68. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/mcp/__init__.py +0 -0
  69. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/mcp/client.py +0 -0
  70. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/mcp/example_client.py +0 -0
  71. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/mcp/example_server.py +0 -0
  72. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/mcp/fast_mcp_log.py +0 -0
  73. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/mcp/faulty_mcp.py +0 -0
  74. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/model_patch/model_patch.py +0 -0
  75. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/server/__init__.py +0 -0
  76. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/server/app_mounter.py +0 -0
  77. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/server/path.py +0 -0
  78. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/server/utils.py +0 -0
  79. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/testing/__init__.py +0 -0
  80. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/testing/aix_test_model.py +0 -0
  81. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/testing/mock_tool.py +0 -0
  82. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/testing/model_patch_cache.py +0 -0
  83. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/tools/doctor/__init__.py +0 -0
  84. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/tools/doctor/mcp_tool_doctor.py +0 -0
  85. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/tools/doctor/tool_doctor.py +0 -0
  86. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/tools/doctor/tool_recommendation.py +0 -0
  87. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/utils/__init__.py +0 -0
  88. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/utils/chainlit/cl_agent_show.py +0 -0
  89. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/utils/chainlit/cl_utils.py +0 -0
  90. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/utils/config_util.py +0 -0
  91. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/utils/enum_with_description.py +0 -0
  92. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/utils/files.py +0 -0
  93. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/utils/persisted_dict.py +0 -0
  94. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/utils/utils.py +0 -0
  95. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/vault/__init__.py +0 -0
  96. {aixtools-0.2.2 → aixtools-0.2.4}/aixtools/vault/vault.py +0 -0
  97. {aixtools-0.2.2 → aixtools-0.2.4}/setup.cfg +0 -0
{aixtools-0.2.2 → aixtools-0.2.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aixtools
-Version: 0.2.2
+Version: 0.2.4
 Summary: Tools for AI exploration and debugging
 Requires-Python: >=3.11.2
 Description-Content-Type: text/markdown
@@ -20,6 +20,7 @@ Requires-Dist: mypy>=1.18.2
 Requires-Dist: pandas>=2.2.3
 Requires-Dist: pydantic-evals>=0.4.10
 Requires-Dist: pydantic-ai>=1.0.9
+Requires-Dist: pyjwt>=2.10.1
 Requires-Dist: pylint>=3.3.7
 Requires-Dist: rich>=14.0.0
 Requires-Dist: ruff>=0.11.6
@@ -415,30 +416,30 @@ By default, the "FaultyMCP" includes several tools you can use in your tests:

 ### Evals

-Run comprehensive Agent/LLM evaluations using the built-in evaluation discovery based on Pydantic-AI framework.
+Run comprehensive Agent/LLM evaluations using the built-in evaluation discovery based on Pydantic-AI framework with AIXtools enhancements.

 ```bash
 # Run all evaluations
-evals
+python -m aixtools.evals

 # Run evaluations with filtering
-evals --filter "specific_test"
+python -m aixtools.evals --filter "specific_test"

 # Run with verbose output and detailed reporting
-evals --verbose --include-input --include-output --include-reasons
+python -m aixtools.evals --verbose --include-input --include-output --include-reasons

 # Specify custom evaluations directory
-evals --evals-dir /path/to/evals
+python -m aixtools.evals --evals-dir /path/to/evals

 # Set minimum assertions threshold
-evals --min-assertions 0.8
+python -m aixtools.evals --min-assertions 0.8
 ```

 **Command Line Options:**
 - `--evals-dir` - Directory containing eval_*.py files (default: evals)
 - `--filter` - Filter to run only matching evaluations
-- `--include-input` - Include input in report output
-- `--include-output` - Include output in report output
+- `--include-input` - Include input in report output (default: True)
+- `--include-output` - Include output in report output (default: True)
 - `--include-evaluator-failures` - Include evaluator failures in report
 - `--include-reasons` - Include reasons in report output
 - `--min-assertions` - Minimum assertions average required for success (default: 1.0)
@@ -446,14 +447,16 @@ evals --min-assertions 0.8

 The evaluation system discovers and runs all Dataset objects from eval_*.py files in the specified directory, similar to test runners but specifically designed for LLM evaluations using pydantic_evals.

-**Discovery Mechanism:**
+**Discovery Mechanism**

-The evaluation framework uses an automatic discovery system that:
+The evaluation framework uses an automatic discovery system:

 1. **File Discovery**: Scans the specified directory for files matching the pattern `eval_*.py`
 2. **Dataset Discovery**: Within each file, looks for variables named `dataset_*` that are instances of `pydantic_evals.Dataset`
-3. **Target Function Discovery**: Automatically finds the first async function in each module that doesn't start with an underscore (`_`) to use as the evaluation target
-4. **Filtering**: Supports filtering by module name, file name, dataset name, or fully qualified name
+3. **Target Function Discovery**: Within the same file looks for function or async function named `target_*`. There must be 1 target function per file.
+4. **Function Discovery**: Looks for functions with specific prefixes:
+   - Functions prefixed with `scorer_*`, `evaluator_*` for custom scorer and evaluator functions that will be used for each dataset in that file
+5. **Filtering**: Supports filtering by module name, file name, dataset name, or fully qualified name

 **Example Evaluation File Structure:**
 ```python
@@ -471,11 +474,16 @@ dataset_addition = Dataset(
 )

 # This function will be used as the evaluation target
-async def evaluate_math_agent(input_text: str) -> str:
-    # Your agent evaluation logic here
+async def target_math_agent(input_text: str) -> str:
+    # Your agent run logic here
     agent = get_agent(system_prompt="You are a math assistant.")
     result, _ = await run_agent(agent, input_text)
     return result
+
+# This function will be used as evaluator for all datasets (optional)
+def evaluator_check_output(ctx: EvaluatorContext) -> bool:
+    # Your result evaluation logic here
+    return ctx.output == ctx.expected_output
 ```

 The discovery system will:
@@ -484,6 +492,34 @@ The discovery system will:
 - Use `evaluate_math_agent` as the target function for evaluation
 - Run each case through the target function and evaluate results

+#### Name-Based Discovery
+
+The evaluation system uses name-based discovery for all components:
+
+**Target Functions** (exactly one required per eval file):
+- **Purpose**: The main function being evaluated - processes inputs and returns outputs
+- **Naming**: Functions named `target_*` (e.g., `target_my_function`)
+- **Signature**: `def target_name(inputs: InputType) -> OutputType` or `async def target_name(inputs: InputType) -> OutputType`
+- **Example**: `async def target_math_agent(input_text: str) -> str`
+
+**Scoring Functions** (optional):
+- **Purpose**: Determine if evaluation results meet success criteria
+- **Naming**: Functions named `scorer_*` (e.g., `scorer_custom`)
+- **Signature**: `def scorer_name(report: EvaluationReport, dataset: AixDataset, min_score: float = 1.0, verbose: bool = False) -> bool`
+- **Example**: `def scorer_accuracy_threshold(report, dataset, min_score=0.8, verbose=False) -> bool`
+
+**Evaluator Functions** (optional):
+- **Purpose**: Custom evaluation logic for comparing outputs with expected results
+- **Naming**: Functions named `evaluator_*` (e.g., `evaluator_check_output`)
+- **Signature**: `def evaluator_name(ctx: EvaluatorContext) -> EvaluatorOutput` or `async def evaluator_name(ctx: EvaluatorContext) -> EvaluatorOutput`
+- **Example**: `def evaluator_exact_match(ctx) -> EvaluatorOutput`
+
+This name-based approach works seamlessly with both synchronous and asynchronous functions.
+
+#### Scoring System
+
+The framework includes a custom scoring system with [`average_assertions`](aixtools/evals/dataset.py:67) as the default scorer. This scorer checks if the average assertion score meets a minimum threshold and provides detailed pass/fail reporting.
+
 ## Testing & Tools

 AIXtools provides comprehensive testing utilities and diagnostic tools for AI agent development and debugging.
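
As a reading aid (not part of the diff): the documentation added above describes name-based discovery via `dataset_*`, `target_*`, and `evaluator_*` prefixes. A minimal sketch of an eval file under those conventions; the file name `eval_addition.py`, the case values, and the echo logic are illustrative, not taken from the package:

```python
# eval_addition.py -- hypothetical eval file; names follow the documented
# dataset_* / target_* / evaluator_* prefixes so discovery can find them.
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EvaluatorContext

# Discovered because the variable name starts with "dataset_".
dataset_addition = Dataset(
    cases=[
        Case(name="two_plus_two", inputs="What is 2 + 2?", expected_output="4"),
    ],
)

# Discovered as the single required target function (prefix "target_").
async def target_addition(inputs: str) -> str:
    # A real eval would call an agent here; this stub keeps the sketch runnable.
    return "4" if "2 + 2" in inputs else ""

# Optional evaluator (prefix "evaluator_") applied to datasets in this file.
def evaluator_exact_match(ctx: EvaluatorContext) -> bool:
    return ctx.output == ctx.expected_output
```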
{aixtools-0.2.2 → aixtools-0.2.4}/README.md

@@ -383,30 +383,30 @@ By default, the "FaultyMCP" includes several tools you can use in your tests:

 ### Evals

-Run comprehensive Agent/LLM evaluations using the built-in evaluation discovery based on Pydantic-AI framework.
+Run comprehensive Agent/LLM evaluations using the built-in evaluation discovery based on Pydantic-AI framework with AIXtools enhancements.

 ```bash
 # Run all evaluations
-evals
+python -m aixtools.evals

 # Run evaluations with filtering
-evals --filter "specific_test"
+python -m aixtools.evals --filter "specific_test"

 # Run with verbose output and detailed reporting
-evals --verbose --include-input --include-output --include-reasons
+python -m aixtools.evals --verbose --include-input --include-output --include-reasons

 # Specify custom evaluations directory
-evals --evals-dir /path/to/evals
+python -m aixtools.evals --evals-dir /path/to/evals

 # Set minimum assertions threshold
-evals --min-assertions 0.8
+python -m aixtools.evals --min-assertions 0.8
 ```

 **Command Line Options:**
 - `--evals-dir` - Directory containing eval_*.py files (default: evals)
 - `--filter` - Filter to run only matching evaluations
-- `--include-input` - Include input in report output
-- `--include-output` - Include output in report output
+- `--include-input` - Include input in report output (default: True)
+- `--include-output` - Include output in report output (default: True)
 - `--include-evaluator-failures` - Include evaluator failures in report
 - `--include-reasons` - Include reasons in report output
 - `--min-assertions` - Minimum assertions average required for success (default: 1.0)
@@ -414,14 +414,16 @@ evals --min-assertions 0.8

 The evaluation system discovers and runs all Dataset objects from eval_*.py files in the specified directory, similar to test runners but specifically designed for LLM evaluations using pydantic_evals.

-**Discovery Mechanism:**
+**Discovery Mechanism**

-The evaluation framework uses an automatic discovery system that:
+The evaluation framework uses an automatic discovery system:

 1. **File Discovery**: Scans the specified directory for files matching the pattern `eval_*.py`
 2. **Dataset Discovery**: Within each file, looks for variables named `dataset_*` that are instances of `pydantic_evals.Dataset`
-3. **Target Function Discovery**: Automatically finds the first async function in each module that doesn't start with an underscore (`_`) to use as the evaluation target
-4. **Filtering**: Supports filtering by module name, file name, dataset name, or fully qualified name
+3. **Target Function Discovery**: Within the same file looks for function or async function named `target_*`. There must be 1 target function per file.
+4. **Function Discovery**: Looks for functions with specific prefixes:
+   - Functions prefixed with `scorer_*`, `evaluator_*` for custom scorer and evaluator functions that will be used for each dataset in that file
+5. **Filtering**: Supports filtering by module name, file name, dataset name, or fully qualified name

 **Example Evaluation File Structure:**
 ```python
@@ -439,11 +441,16 @@ dataset_addition = Dataset(
 )

 # This function will be used as the evaluation target
-async def evaluate_math_agent(input_text: str) -> str:
-    # Your agent evaluation logic here
+async def target_math_agent(input_text: str) -> str:
+    # Your agent run logic here
     agent = get_agent(system_prompt="You are a math assistant.")
     result, _ = await run_agent(agent, input_text)
     return result
+
+# This function will be used as evaluator for all datasets (optional)
+def evaluator_check_output(ctx: EvaluatorContext) -> bool:
+    # Your result evaluation logic here
+    return ctx.output == ctx.expected_output
 ```

 The discovery system will:
@@ -452,6 +459,34 @@ The discovery system will:
 - Use `evaluate_math_agent` as the target function for evaluation
 - Run each case through the target function and evaluate results

+#### Name-Based Discovery
+
+The evaluation system uses name-based discovery for all components:
+
+**Target Functions** (exactly one required per eval file):
+- **Purpose**: The main function being evaluated - processes inputs and returns outputs
+- **Naming**: Functions named `target_*` (e.g., `target_my_function`)
+- **Signature**: `def target_name(inputs: InputType) -> OutputType` or `async def target_name(inputs: InputType) -> OutputType`
+- **Example**: `async def target_math_agent(input_text: str) -> str`
+
+**Scoring Functions** (optional):
+- **Purpose**: Determine if evaluation results meet success criteria
+- **Naming**: Functions named `scorer_*` (e.g., `scorer_custom`)
+- **Signature**: `def scorer_name(report: EvaluationReport, dataset: AixDataset, min_score: float = 1.0, verbose: bool = False) -> bool`
+- **Example**: `def scorer_accuracy_threshold(report, dataset, min_score=0.8, verbose=False) -> bool`
+
+**Evaluator Functions** (optional):
+- **Purpose**: Custom evaluation logic for comparing outputs with expected results
+- **Naming**: Functions named `evaluator_*` (e.g., `evaluator_check_output`)
+- **Signature**: `def evaluator_name(ctx: EvaluatorContext) -> EvaluatorOutput` or `async def evaluator_name(ctx: EvaluatorContext) -> EvaluatorOutput`
+- **Example**: `def evaluator_exact_match(ctx) -> EvaluatorOutput`
+
+This name-based approach works seamlessly with both synchronous and asynchronous functions.
+
+#### Scoring System
+
+The framework includes a custom scoring system with [`average_assertions`](aixtools/evals/dataset.py:67) as the default scorer. This scorer checks if the average assertion score meets a minimum threshold and provides detailed pass/fail reporting.
+
 ## Testing & Tools

 AIXtools provides comprehensive testing utilities and diagnostic tools for AI agent development and debugging.
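
Both README copies document the scorer contract `scorer_name(report, dataset, min_score, verbose) -> bool`. A hedged sketch of a custom scorer under that contract, using only the `report.averages().assertions` accessor that also appears in the new `dataset.py` below; the function name and threshold logic are illustrative:

```python
from aixtools.evals.dataset import AixDataset
from pydantic_evals.reporting import EvaluationReport

# Hypothetical custom scorer; discovered via the "scorer_" prefix and called by
# the runner as scorer(report, dataset, min_assertions, verbose).
def scorer_assertion_floor(
    report: EvaluationReport, dataset: AixDataset, min_score: float = 1.0, verbose: bool = False
) -> bool:
    averages = report.averages()
    passed = bool(averages and averages.assertions is not None and averages.assertions >= min_score)
    if verbose:
        print(f"{dataset.name}: assertions vs threshold {min_score} -> {'PASSED' if passed else 'FAILED'}")
    return passed
```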
{aixtools-0.2.2 → aixtools-0.2.4}/aixtools/_version.py

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID

-__version__ = version = '0.2.2'
-__version_tuple__ = version_tuple = (0, 2, 2)
+__version__ = version = '0.2.4'
+__version_tuple__ = version_tuple = (0, 2, 4)

-__commit_id__ = commit_id = 'g407a0cb96'
+__commit_id__ = commit_id = 'g3a46ce295'
aixtools-0.2.4/aixtools/auth/auth.py

@@ -0,0 +1,70 @@
+"""
+Module that manages OAuth2 functions for authentication
+"""
+
+import logging
+
+import jwt
+from jwt import ExpiredSignatureError, InvalidAudienceError, InvalidIssuerError, PyJWKClient
+
+from aixtools.utils import config
+
+logger = logging.getLogger(__name__)
+
+
+class AuthTokenError(Exception):
+    """Exception raised for authentication token errors."""
+
+
+# pylint: disable=too-few-public-methods
+class AccessTokenVerifier:
+    """
+    Verifies Microsoft SSO JWT token against the configured Tenant ID, Audience, API ID and Issuer URL.
+    """
+
+    def __init__(self):
+        tenant_id = config.APP_TENANT_ID
+        self.api_id = config.APP_API_ID
+        self.issuer_url = config.APP_ISSUER_URL
+        # Azure AD endpoints
+        jwks_url = f"https://login.microsoftonline.com/{tenant_id}/discovery/v2.0/keys"
+        self.jwks_client = PyJWKClient(
+            uri=jwks_url,
+            # cache keys url response to reduce SSO server network calls,
+            # as public keys are not expected to change frequently
+            cache_jwk_set=True,
+            # cache resolved public keys
+            cache_keys=True,
+            # cache url response for 10 hours
+            lifespan=36000,
+        )
+
+        logger.info("Using JWKS: %s", jwks_url)
+
+    def verify(self, token: str) -> dict:
+        """
+        Verifies The JWT access token and returns decoded claims as a dictionary if the token is
+        valid, otherwise raises an AuthTokenError
+        """
+        try:
+            signing_key = self.jwks_client.get_signing_key_from_jwt(token)
+
+            claims = jwt.decode(
+                token,
+                signing_key.key,
+                algorithms=["RS256"],
+                audience=self.api_id,
+                issuer=self.issuer_url,
+                # ensure audience verification is carried out
+                options={"verify_aud": True},
+            )
+            return claims
+
+        except ExpiredSignatureError as e:
+            raise AuthTokenError("Token expired") from e
+        except InvalidAudienceError as e:
+            raise AuthTokenError(f"Token not for expected audience: {e}") from e
+        except InvalidIssuerError as e:
+            raise AuthTokenError(f"Token not for expected issuer: {e}") from e
+        except jwt.exceptions.PyJWTError as e:
+            raise AuthTokenError(f"Invalid token: {e}") from e
File without changes
aixtools-0.2.2/aixtools/evals/evals.py → aixtools-0.2.4/aixtools/evals/__main__.py

@@ -11,8 +11,8 @@ import asyncio
 import sys
 from pathlib import Path

-from .discovery import discover_all_datasets, find_eval_files
-from .run_evals import run_all_evaluations_and_print_results
+from aixtools.evals.discovery import discover_all_datasets, find_eval_files  # pylint: disable=E0401
+from aixtools.evals.run_evals import run_all_evaluations_and_print_results  # pylint: disable=E0401


 async def main():
@@ -24,8 +24,8 @@ async def main():
     parser.add_argument(
         "--filter", type=str, help="Filter to run only matching evaluations (matches module, file, or dataset names)"
     )
-    parser.add_argument("--include-input", action="store_true", help="Include input in report output")
-    parser.add_argument("--include-output", action="store_true", help="Include output in report output")
+    parser.add_argument("--include-input", action="store_true", default=True, help="Include input in report output")
+    parser.add_argument("--include-output", action="store_true", default=True, help="Include output in report output")
     parser.add_argument(
         "--include-evaluator-failures", action="store_true", help="Include evaluator failures in report output"
     )
aixtools-0.2.4/aixtools/evals/dataset.py

@@ -0,0 +1,87 @@
+"""Custom dataset and evaluation utilities for AixTools.
+
+This module provides wrapper classes and decorators for building and running
+evaluations using the pydantic-evals framework. It includes a custom Dataset
+class, decorators for marking target functions, scorers, and evaluators, and
+a default scoring function based on assertion averages.
+"""
+
+from typing import Awaitable, Callable, Generic
+
+from pydantic import BaseModel
+from pydantic_evals.dataset import Case, Dataset, InputsT, MetadataT, OutputT
+from pydantic_evals.evaluators import Evaluator
+from pydantic_evals.reporting import EvaluationReport
+
+TargetT = Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT]
+ScorerT = Callable[[EvaluationReport, "AixDataset", float, bool], bool]
+
+
+class AixDataset(BaseModel, Generic[InputsT, OutputT, MetadataT]):
+    """Custom Dataset class for AixTools evaluations."""
+
+    dataset: Dataset[InputsT, OutputT]
+    name: str
+    target_func: TargetT
+    scorers: list[ScorerT]
+
+    def __init__(  # pylint: disable=R0913,R0917
+        self,
+        cases: list[Case[InputsT, OutputT]],
+        target_func: TargetT,
+        evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] | None = None,
+        name: str | None = None,
+        scoring_funcs: list[ScorerT] | None = None,
+    ):
+        super().__init__(
+            dataset=Dataset(cases=cases, evaluators=evaluators or []),
+            target_func=target_func,
+            name=name or "dataset",
+            scorers=scoring_funcs or [average_assertions],
+        )
+
+    @property
+    def cases(self) -> list[Case[InputsT, OutputT]]:
+        """Return the list of cases in the dataset."""
+        return self.dataset.cases
+
+    @property
+    def evaluators(self) -> list[Evaluator[InputsT, OutputT, MetadataT]]:
+        """Return the list of evaluators in the dataset."""
+        return self.dataset.evaluators
+
+    async def evaluate(
+        self,
+    ) -> EvaluationReport:
+        """Run the evaluation using the target function and return an EvaluationReport."""
+        return await self.dataset.evaluate(self.target_func)
+
+
+# Decorators removed - using name-based discovery only for simplicity and async compatibility
+# Functions should be named with prefixes: target_, scorer_, evaluator_
+
+
+def average_assertions(
+    report: EvaluationReport, dataset: "AixDataset", min_score: float = 1.0, verbose: bool = False
+) -> bool:
+    """Scoring function that checks if the average assertions meet a minimum threshold."""
+    averages = report.averages()
+    if averages and averages.assertions is not None:
+        success = averages.assertions >= min_score
+        if verbose:
+            print(f"\nAssertions Summary for {dataset.name}:")
+            print(f" Assertions Average: {averages.assertions:.3f}")
+            print(f" Minimum Required: {min_score:.3f}")
+            print(f" Status: {'PASSED' if success else 'FAILED'}")
+        else:
+            print(f"{'PASSED' if success else 'FAILED'} ({averages.assertions:.3f})")
+    else:
+        success = False
+        if verbose:
+            print(f"\nAssertions Summary for {dataset.name}:")
+            print(" No assertions found or evaluation failed")
+            print(f" Minimum Required: {min_score:.3f}")
+            print(" Status: FAILED")
+        else:
+            print("FAILED (no assertions)")
+    return success
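
A hedged usage sketch of the new `AixDataset` wrapper (the case content and target function are illustrative); when `scoring_funcs` is omitted, `average_assertions` above is installed as the default scorer:

```python
import asyncio

from pydantic_evals import Case

from aixtools.evals.dataset import AixDataset

async def target_echo(inputs: str) -> str:
    # Stand-in target; a real eval would invoke an agent here.
    return inputs.upper()

dataset_demo = AixDataset(
    cases=[Case(name="shout", inputs="hello", expected_output="HELLO")],
    target_func=target_echo,
    name="demo",  # used by the default scorer when printing verbose output
    # scoring_funcs omitted -> defaults to [average_assertions]
)

report = asyncio.run(dataset_demo.evaluate())  # pydantic-evals EvaluationReport
report.print()
```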
{aixtools-0.2.2 → aixtools-0.2.4}/aixtools/evals/discovery.py

@@ -9,10 +9,14 @@ import inspect
 import sys
 import traceback
 from pathlib import Path
-from typing import Any
+from typing import Any, TypeVar

 from pydantic_evals.dataset import Dataset

+from aixtools.evals.dataset import AixDataset  # pylint: disable=E0401
+
+SpecialFuncT = TypeVar("SpecialFuncT")
+

 def find_eval_files(evals_dir: Path) -> list[Path]:
     """Find all eval_*.py files in the evals directory."""
@@ -33,7 +37,7 @@ def find_datasets_in_module(module: Any) -> list[tuple[str, Dataset]]:
     datasets = []

     for name, obj in inspect.getmembers(module):
-        if name.startswith("dataset_") and isinstance(obj, Dataset):
+        if name.startswith("dataset_") and isinstance(obj, (Dataset, AixDataset)):
             datasets.append((name, obj))

     return datasets
@@ -66,66 +70,74 @@ def matches_filter(module_name: str, file_name: str, dataset_name: str, name_fil
     )


-def find_target_function(module: Any) -> Any | None:
-    """Find the first async function in a module that doesn't start with underscore."""
+def find_prefixed_functions(module: Any, prefix: str) -> list[Any]:
+    """Find all functions with a specific prefix (name-based discovery only)."""
+    funcs = []
     for name, obj in inspect.getmembers(module):
-        if inspect.iscoroutinefunction(obj) and not name.startswith("_"):
-            return obj
-    return None
+        if name.startswith(prefix) and (inspect.isfunction(obj) or inspect.iscoroutinefunction(obj)):
+            funcs.append(obj)  # Return function directly, no decorator wrapping

+    return funcs

-def get_async_function_names(module: Any) -> list[str]:
-    """Get names of all async functions in a module that don't start with underscore."""
-    return [
-        name
-        for name, obj in inspect.getmembers(module)
-        if inspect.iscoroutinefunction(obj) and not name.startswith("_")
-    ]
+
+def print_v(message: str, verbose: bool) -> None:
+    """Print message if verbose is enabled."""
+    if verbose:
+        print(message)


 def process_datasets_from_module(
     module: Any, eval_file: Path, name_filter: str | None, verbose: bool
-) -> list[tuple[str, Dataset, Any]]:
+) -> list[AixDataset]:
     """Process all datasets from a single module and return valid dataset tuples."""
     datasets = find_datasets_in_module(module)
-    if verbose:
-        print(f" Found {len(datasets)} datasets: {[name for name, _ in datasets]}")
+
+    print_v(f" Found {len(datasets)} datasets: {[name for name, _ in datasets]}", verbose)

     valid_datasets = []

+    targets = find_prefixed_functions(module, "target_")
+    scorers = find_prefixed_functions(module, "scorer_")
+    evaluators = find_prefixed_functions(module, "evaluator_")
+
+    print_v(f" Found target functions: {[f.__name__ for f in targets]}", verbose)
+    print_v(f" Found scoring functions: {[f.__name__ for f in scorers]}", verbose)
+    print_v(f" Found evaluator functions: {[f.__name__ for f in evaluators]}", verbose)
+
     for dataset_name, dataset in datasets:
         full_name = f"{eval_file.stem}.{dataset_name}"

         if not matches_filter(module.__name__, eval_file.stem, dataset_name, name_filter):
-            if verbose:
-                print(f" ✗ Skipping dataset: {dataset_name} (doesn't match filter: {name_filter})")
+            print_v(f" ✗ Skipping dataset: {dataset_name} (doesn't match filter: {name_filter})", verbose)
             continue

-        if verbose:
-            print(f" ✓ Including dataset: {dataset_name}")
+        print_v(f" ✓ Including dataset: {dataset_name}", verbose)

-        # Find the target function
-        target_function = find_target_function(module)
-        async_functions = get_async_function_names(module)
+        if isinstance(dataset, Dataset):
+            # Wrap in AixDataset if not already

-        if verbose:
-            print(f" Found async functions: {async_functions}")
-            if target_function:
-                print(f" Using target function: {target_function.__name__}")
+            if len(targets) != 1:
+                print_v(
+                    f" ✗ Skipping dataset: {dataset_name} (has {len(targets)} target functions, expected exactly 1)",
+                    verbose,
+                )

-        if target_function is None:
-            if verbose:
-                print(f"Warning: No async function found in {eval_file.name} for dataset {dataset_name}")
-            continue
+                continue
+
+            dataset = AixDataset(  # noqa: PLW2901
+                cases=dataset.cases,
+                evaluators=dataset.evaluators,  # evaluators are plain functions now
+                name=full_name,
+                target_func=targets[0],  # target function is used directly
+                scoring_funcs=scorers,  # scorers are plain functions now
+            )

-        valid_datasets.append((full_name, dataset, target_function))
+        valid_datasets.append(dataset)

     return valid_datasets


-def discover_all_datasets(
-    eval_files: list[Path], name_filter: str | None, verbose: bool
-) -> list[tuple[str, Dataset, Any]]:
+def discover_all_datasets(eval_files: list[Path], name_filter: str | None, verbose: bool) -> list[AixDataset]:
     """Discover all datasets from eval files."""
     all_datasets = []

@@ -141,7 +153,7 @@ def discover_all_datasets(
             datasets = process_datasets_from_module(module, eval_file, name_filter, verbose)
             all_datasets.extend(datasets)

-        except Exception as e:  # pylint: disable=W0718
+        except Exception as e:  # pylint: disable=broad-exception-caught
             if verbose:
                 print(f"Error loading {eval_file}: {e}")
                 print(f" Traceback: {traceback.format_exc()}")
@@ -162,9 +174,9 @@ def discover_all_datasets(
     print(f"\n{'=' * 60}")
     print("Datasets to Evaluate:")
     print(f"{'=' * 60}")
-    for i, (dataset_name, dataset, target_function) in enumerate(all_datasets, 1):
-        print(f"{i}. {dataset_name}")
-        print(f" Target function: {target_function.__name__}")
+    for i, (dataset) in enumerate(all_datasets, 1):
+        print(f"{i}. {dataset.name}")
+        print(f" Target function: {dataset.target_func.__name__}")
         print(f" Cases: {len(dataset.cases)}")
         print(f" Evaluators: {len(dataset.evaluators)}")
     print(f"{'=' * 60}")
aixtools-0.2.4/aixtools/evals/run_evals.py

@@ -0,0 +1,98 @@
+"""
+Evaluation execution functionality for LLM evaluations.
+
+This module handles running evaluations and printing results.
+"""
+
+import sys
+
+from pydantic_evals.reporting import EvaluationReport
+
+from aixtools.evals.dataset import AixDataset  # pylint: disable=E0401
+
+
+async def run_dataset_evaluation(
+    dataset: AixDataset,
+    print_options: dict[str, bool],
+    min_assertions: float,
+    verbose: bool = False,
+) -> tuple[str, bool, EvaluationReport | None]:
+    """Run evaluation for a single dataset and return (name, success, report)."""
+    if verbose:
+        print(f"\n{'=' * 60}")
+        print(f"Running evaluation: {dataset.name}")
+        print(f"{'=' * 60}")
+    else:
+        print(f"Running {dataset.name}...", end=" ")
+
+    try:
+        # Execute the evaluation
+        report = await dataset.evaluate()
+
+        # Print the results
+        report.print(
+            include_input=print_options["include_input"],
+            include_output=print_options["include_output"],
+            include_evaluator_failures=print_options["include_evaluator_failures"],
+            include_reasons=print_options["include_reasons"],
+        )
+
+        success = all(scorer(report, dataset, min_assertions, verbose) for scorer in dataset.scorers)
+
+        return dataset.name, success, report
+
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        if verbose:
+            print(f"Error running evaluation {dataset.name}: {e}")
+        else:
+            print(f"ERROR ({e})")
+        return dataset.name, False, None
+
+
+async def run_all_evaluations_and_print_results(
+    datasets: list[AixDataset], print_options: dict[str, bool], min_assertions: float, verbose: bool
+) -> None:
+    """Run all evaluations and print results with summary."""
+    # Run all evaluations
+    results = []
+    for dataset in datasets:
+        result = await run_dataset_evaluation(dataset, print_options, min_assertions, verbose)
+        results.append(result)
+
+    # Print reports
+    for _, _, report in results:
+        if report:
+            report.print(
+                include_input=print_options["include_input"],
+                include_output=print_options["include_output"],
+                include_evaluator_failures=print_options["include_evaluator_failures"],
+                include_reasons=print_options["include_reasons"],
+            )
+
+    # Print summary
+    passed = sum(1 for _, success, _ in results if success)
+    total = len(results)
+    failed_results = [(name, success, _) for name, success, _ in results if not success]
+
+    if verbose:
+        print(f"\n{'=' * 60}")
+        print("EVALUATION SUMMARY")
+        print(f"{'=' * 60}")
+
+        for name, success, _ in results:
+            status = "PASSED" if success else "FAILED"
+            print(f" {name}: {status}")
+
+        print(f"\nTotal: {passed}/{total} evaluations passed")
+    # Only show failed evaluations when not verbose
+    elif failed_results:
+        print("\nFailed evaluations:")
+        for name, _, _ in failed_results:
+            print(f" {name}: FAILED")
+
+    # Exit with non-zero code if any evaluations failed
+    if passed < total:
+        print(f"\n{total - passed} evaluation(s) failed")
+        sys.exit(1)
+    else:
+        print("\nAll evaluations passed!")
{aixtools-0.2.2 → aixtools-0.2.4}/aixtools/utils/config.py

@@ -56,7 +56,6 @@ else:
     logging.error("No '.env' file found in any of the search paths, or their parents: %s", env_dirs)
     sys.exit(1)

-
 # ---
 # Directories
 # ---
@@ -124,7 +123,17 @@ GOOGLE_CLOUD_LOCATION = get_variable_env("GOOGLE_CLOUD_LOCATION", True)

 # vault parameters.
 VAULT_ADDRESS = get_variable_env("VAULT_ADDRESS", default="http://localhost:8200")
-VAULT_TOKEN = get_variable_env("VAULT_TOKEN", default="vault-token")
-VAULT_ENV = get_variable_env("ENV", default="dev")
-VAULT_MOUNT_POINT = get_variable_env("VAULT_MOUNT_POINT", default="secret")
-VAULT_PATH_PREFIX = get_variable_env("VAULT_PATH_PREFIX", default="path")
+VAULT_TOKEN = get_variable_env("VAULT_TOKEN", allow_empty=True)
+VAULT_ENV = get_variable_env("ENV", allow_empty=True)
+VAULT_MOUNT_POINT = get_variable_env("VAULT_MOUNT_POINT", allow_empty=True)
+VAULT_PATH_PREFIX = get_variable_env("VAULT_PATH_PREFIX", allow_empty=True)
+
+# OAuth parameters
+APP_SECRET_ID = get_variable_env("APP_SECRET_ID")
+APP_CLIENT_ID = get_variable_env("APP_CLIENT_ID")
+
+# used for token audience check
+APP_API_ID = get_variable_env("APP_API_ID")
+APP_TENANT_ID = get_variable_env("APP_TENANT_ID")
+# used for token issuer check
+APP_ISSUER_URL = get_variable_env("APP_ISSUER_URL")
{aixtools-0.2.2 → aixtools-0.2.4}/aixtools.egg-info/SOURCES.txt

@@ -32,14 +32,17 @@ aixtools/agents/agent.py
 aixtools/agents/agent_batch.py
 aixtools/agents/print_nodes.py
 aixtools/agents/prompt.py
+aixtools/auth/__init__.py
+aixtools/auth/auth.py
 aixtools/compliance/__init__.py
 aixtools/compliance/private_data.py
 aixtools/db/__init__.py
 aixtools/db/database.py
 aixtools/db/vector_db.py
 aixtools/evals/__init__.py
+aixtools/evals/__main__.py
+aixtools/evals/dataset.py
 aixtools/evals/discovery.py
-aixtools/evals/evals.py
 aixtools/evals/run_evals.py
 aixtools/google/client.py
 aixtools/log_view/__init__.py
{aixtools-0.2.2 → aixtools-0.2.4}/pyproject.toml

@@ -28,11 +28,12 @@ dependencies = [
     "pandas>=2.2.3",
     "pydantic-evals>=0.4.10",
     "pydantic-ai>=1.0.9",
+    "pyjwt>=2.10.1",
     "pylint>=3.3.7",
     "rich>=14.0.0",
     "ruff>=0.11.6",
     "streamlit>=1.44.1",
-    "watchdog>=6.0.0",
+    "watchdog>=6.0.0"
 ]

 [project.scripts]
aixtools-0.2.2/aixtools/evals/run_evals.py

@@ -1,110 +0,0 @@
-"""
-Evaluation execution functionality for LLM evaluations.
-
-This module handles running evaluations and printing results.
-"""
-
-import sys
-from typing import Any
-
-from pydantic_evals.dataset import Dataset
-
-
-async def run_dataset_evaluation(  # noqa: PLR0913, pylint: disable=too-many-arguments,too-many-positional-arguments
-    dataset_name: str,
-    dataset: Dataset,
-    target_function: Any,
-    print_options: dict[str, bool],
-    min_assertions: float,
-    verbose: bool = False,
-) -> tuple[str, bool]:
-    """Run evaluation for a single dataset and return (name, success)."""
-    if verbose:
-        print(f"\n{'=' * 60}")
-        print(f"Running evaluation: {dataset_name}")
-        print(f"{'=' * 60}")
-    else:
-        print(f"Running {dataset_name}...", end=" ")
-
-    try:
-        # Execute the evaluation
-        report = await dataset.evaluate(target_function)
-
-        # Print the results
-        report.print(
-            include_input=print_options["include_input"],
-            include_output=print_options["include_output"],
-            include_evaluator_failures=print_options["include_evaluator_failures"],
-            include_reasons=print_options["include_reasons"],
-        )
-
-        # Check if evaluation passed based on assertions average
-        averages = report.averages()
-        if averages and averages.assertions is not None:
-            success = averages.assertions >= min_assertions
-            if verbose:
-                print(f"\nEvaluation Summary for {dataset_name}:")
-                print(f" Assertions Average: {averages.assertions:.3f}")
-                print(f" Minimum Required: {min_assertions:.3f}")
-                print(f" Status: {'PASSED' if success else 'FAILED'}")
-            else:
-                print(f"{'PASSED' if success else 'FAILED'} ({averages.assertions:.3f})")
-        else:
-            success = False
-            if verbose:
-                print(f"\nEvaluation Summary for {dataset_name}:")
-                print(" No assertions found or evaluation failed")
-                print(f" Minimum Required: {min_assertions:.3f}")
-                print(" Status: FAILED")
-            else:
-                print("FAILED (no assertions)")
-
-        return dataset_name, success
-
-    except Exception as e:  # pylint: disable=broad-exception-caught
-        if verbose:
-            print(f"Error running evaluation {dataset_name}: {e}")
-        else:
-            print(f"ERROR ({e})")
-        return dataset_name, False
-
-
-async def run_all_evaluations_and_print_results(
-    datasets: list[tuple[str, Dataset, Any]], print_options: dict[str, bool], min_assertions: float, verbose: bool
-) -> None:
-    """Run all evaluations and print results with summary."""
-    # Run all evaluations
-    results = []
-    for dataset_name, dataset, target_function in datasets:
-        result = await run_dataset_evaluation(
-            dataset_name, dataset, target_function, print_options, min_assertions, verbose
-        )
-        results.append(result)
-
-    # Print summary
-    passed = sum(1 for _, success in results if success)
-    total = len(results)
-    failed_results = [(name, success) for name, success in results if not success]
-
-    if verbose:
-        print(f"\n{'=' * 60}")
-        print("EVALUATION SUMMARY")
-        print(f"{'=' * 60}")
-
-        for name, success in results:
-            status = "PASSED" if success else "FAILED"
-            print(f" {name}: {status}")
-
-        print(f"\nTotal: {passed}/{total} evaluations passed")
-    # Only show failed evaluations when not verbose
-    elif failed_results:
-        print("\nFailed evaluations:")
-        for name, _ in failed_results:
-            print(f" {name}: FAILED")
-
-    # Exit with non-zero code if any evaluations failed
-    if passed < total:
-        print(f"\n{total - passed} evaluation(s) failed")
-        sys.exit(1)
-    else:
-        print("\nAll evaluations passed!")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes