pydantic-ai 0.0.49__tar.gz → 0.0.51__tar.gz

This diff shows the changes between package versions as they were published to their public registries. It is provided for informational purposes only.

Potentially problematic release: this version of pydantic-ai might be problematic (see the registry page for details).

Files changed (142)
  1. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/Makefile +1 -1
  2. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/PKG-INFO +3 -3
  3. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_dataset.py +85 -38
  4. pydantic_ai-0.0.51/tests/evals/test_reporting.py +437 -0
  5. pydantic_ai-0.0.51/tests/evals/utils.py +14 -0
  6. pydantic_ai-0.0.51/tests/models/cassettes/test_gemini/test_gemini_drop_exclusive_maximum.yaml +326 -0
  7. pydantic_ai-0.0.51/tests/models/cassettes/test_gemini/test_gemini_exclusive_minimum_and_maximum.yaml +158 -0
  8. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_gemini.py +17 -0
  9. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_cli.py +3 -3
  10. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_examples.py +1 -0
  11. pydantic_ai-0.0.49/tests/evals/test_reporting.py +0 -380
  12. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/.gitignore +0 -0
  13. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/LICENSE +0 -0
  14. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/README.md +0 -0
  15. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/pyproject.toml +0 -0
  16. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/__init__.py +0 -0
  17. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/assets/dummy.pdf +0 -0
  18. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/assets/kiwi.png +0 -0
  19. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/assets/marcelo.mp3 +0 -0
  20. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/cassettes/test_mcp/test_agent_with_stdio_server.yaml +0 -0
  21. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/conftest.py +0 -0
  22. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/__init__.py +0 -0
  23. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_evaluator_base.py +0 -0
  24. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_evaluator_common.py +0 -0
  25. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_evaluator_context.py +0 -0
  26. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_evaluator_spec.py +0 -0
  27. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_evaluators.py +0 -0
  28. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_llm_as_a_judge.py +0 -0
  29. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_otel.py +0 -0
  30. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_render_numbers.py +0 -0
  31. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_reports.py +0 -0
  32. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_utils.py +0 -0
  33. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/example_modules/README.md +0 -0
  34. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/example_modules/bank_database.py +0 -0
  35. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/example_modules/fake_database.py +0 -0
  36. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/example_modules/weather_service.py +0 -0
  37. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/__init__.py +0 -0
  38. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/test_file_persistence.py +0 -0
  39. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/test_graph.py +0 -0
  40. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/test_mermaid.py +0 -0
  41. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/test_persistence.py +0 -0
  42. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/test_state.py +0 -0
  43. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/test_utils.py +0 -0
  44. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/import_examples.py +0 -0
  45. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/json_body_serializer.py +0 -0
  46. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/mcp_server.py +0 -0
  47. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/__init__.py +0 -0
  48. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_anthropic/test_document_binary_content_input.yaml +0 -0
  49. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_anthropic/test_document_url_input.yaml +0 -0
  50. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_anthropic/test_image_url_input.yaml +0 -0
  51. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_anthropic/test_image_url_input_invalid_mime_type.yaml +0 -0
  52. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_anthropic/test_multiple_parallel_tool_calls.yaml +0 -0
  53. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_anthropic/test_text_document_url_input.yaml +0 -0
  54. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model.yaml +0 -0
  55. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_anthropic_model_without_tools.yaml +0 -0
  56. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_iter_stream.yaml +0 -0
  57. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_max_tokens.yaml +0 -0
  58. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_retry.yaml +0 -0
  59. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_stream.yaml +0 -0
  60. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_structured_response.yaml +0 -0
  61. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_top_p.yaml +0 -0
  62. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_document_url_input.yaml +0 -0
  63. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_image_as_binary_content_input.yaml +0 -0
  64. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_image_url_input.yaml +0 -0
  65. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_text_as_binary_content_input.yaml +0 -0
  66. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_text_document_url_input.yaml +0 -0
  67. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_cohere/test_request_simple_success_with_vcr.yaml +0 -0
  68. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_gemini/test_document_url_input.yaml +0 -0
  69. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_gemini/test_image_as_binary_content_input.yaml +0 -0
  70. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_gemini/test_image_url_input.yaml +0 -0
  71. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_groq/test_image_as_binary_content_input.yaml +0 -0
  72. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_groq/test_image_url_input.yaml +0 -0
  73. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_audio_as_binary_content_input.yaml +0 -0
  74. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_document_url_input.yaml +0 -0
  75. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_image_as_binary_content_input.yaml +0 -0
  76. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_max_completion_tokens[gpt-4.5-preview].yaml +0 -0
  77. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_max_completion_tokens[gpt-4o-mini].yaml +0 -0
  78. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_max_completion_tokens[o3-mini].yaml +0 -0
  79. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_multiple_agent_tool_calls.yaml +0 -0
  80. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_openai_o1_mini_system_role[developer].yaml +0 -0
  81. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_openai_o1_mini_system_role[system].yaml +0 -0
  82. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_user_id.yaml +0 -0
  83. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_audio_as_binary_content_input.yaml +0 -0
  84. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_image_as_binary_content_input.yaml +0 -0
  85. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_document_as_binary_content_input.yaml +0 -0
  86. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_document_url_input.yaml +0 -0
  87. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_image_url_input.yaml +0 -0
  88. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_builtin_tools.yaml +0 -0
  89. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_http_error.yaml +0 -0
  90. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_retry.yaml +0 -0
  91. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_simple_response.yaml +0 -0
  92. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_simple_response_with_tool_call.yaml +0 -0
  93. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_reasoning_effort.yaml +0 -0
  94. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_reasoning_generate_summary.yaml +0 -0
  95. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_result_type.yaml +0 -0
  96. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_stream.yaml +0 -0
  97. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_system_prompt.yaml +0 -0
  98. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_text_document_url_input.yaml +0 -0
  99. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/mock_async_stream.py +0 -0
  100. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_anthropic.py +0 -0
  101. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_bedrock.py +0 -0
  102. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_cohere.py +0 -0
  103. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_fallback.py +0 -0
  104. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_groq.py +0 -0
  105. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_instrumented.py +0 -0
  106. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_mistral.py +0 -0
  107. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_model.py +0 -0
  108. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_model_function.py +0 -0
  109. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_model_names.py +0 -0
  110. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_model_test.py +0 -0
  111. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_openai.py +0 -0
  112. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_openai_responses.py +0 -0
  113. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/__init__.py +0 -0
  114. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/cassettes/test_azure/test_azure_provider_call.yaml +0 -0
  115. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/cassettes/test_google_vertex/test_vertexai_provider.yaml +0 -0
  116. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_anthropic.py +0 -0
  117. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_azure.py +0 -0
  118. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_bedrock.py +0 -0
  119. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_cohere.py +0 -0
  120. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_deepseek.py +0 -0
  121. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_google_gla.py +0 -0
  122. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_google_vertex.py +0 -0
  123. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_groq.py +0 -0
  124. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_mistral.py +0 -0
  125. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_openai.py +0 -0
  126. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_provider_names.py +0 -0
  127. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_agent.py +0 -0
  128. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_deps.py +0 -0
  129. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_format_as_xml.py +0 -0
  130. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_json_body_serializer.py +0 -0
  131. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_live.py +0 -0
  132. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_logfire.py +0 -0
  133. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_mcp.py +0 -0
  134. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_messages.py +0 -0
  135. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_parts_manager.py +0 -0
  136. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_settings.py +0 -0
  137. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_streaming.py +0 -0
  138. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_tools.py +0 -0
  139. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_usage_limits.py +0 -0
  140. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_utils.py +0 -0
  141. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/typed_agent.py +0 -0
  142. {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/typed_graph.py +0 -0

Makefile
@@ -37,7 +37,7 @@ lint: ## Lint the code
 
 .PHONY: lint-js
 lint-js: ## Lint JS and TS code
-	cd mcp-run-python && npm run lint
+	cd mcp-run-python && deno task lint-format
 
 .PHONY: typecheck-pyright
 typecheck-pyright:

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-ai
-Version: 0.0.49
+Version: 0.0.51
 Summary: Agent Framework / shim to use Pydantic with LLMs
 Project-URL: Homepage, https://ai.pydantic.dev
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -28,9 +28,9 @@ Classifier: Topic :: Internet
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.9
-Requires-Dist: pydantic-ai-slim[anthropic,bedrock,cli,cohere,evals,groq,mcp,mistral,openai,vertexai]==0.0.49
+Requires-Dist: pydantic-ai-slim[anthropic,bedrock,cli,cohere,evals,groq,mcp,mistral,openai,vertexai]==0.0.51
 Provides-Extra: examples
-Requires-Dist: pydantic-ai-examples==0.0.49; extra == 'examples'
+Requires-Dist: pydantic-ai-examples==0.0.51; extra == 'examples'
 Provides-Extra: logfire
 Requires-Dist: logfire>=3.11.0; extra == 'logfire'
 Description-Content-Type: text/markdown

tests/evals/test_dataset.py
@@ -13,6 +13,7 @@ from inline_snapshot import snapshot
 from pydantic import BaseModel
 
 from ..conftest import try_import
+from .utils import render_table
 
 with try_import() as imports_successful:
     from pydantic_evals import Case, Dataset
@@ -342,40 +343,42 @@ async def test_increment_eval_metric(example_dataset: Dataset[TaskInput, TaskOut
         return TaskOutput(answer=f'answer to {inputs.query}')
 
     report = await example_dataset.evaluate(my_task)
-    assert report.cases == [
-        ReportCase(
-            name='case1',
-            inputs={'query': 'What is 2+2?'},
-            metadata=TaskMetadata(difficulty='easy', category='general'),
-            expected_output=TaskOutput(answer='4', confidence=1.0),
-            output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
-            metrics={'chars': 12},
-            attributes={'is_about_france': False},
-            scores={},
-            labels={},
-            assertions={},
-            task_duration=1.0,
-            total_duration=3.0,
-            trace_id='00000000000000000000000000000001',
-            span_id='0000000000000003',
-        ),
-        ReportCase(
-            name='case2',
-            inputs={'query': 'What is the capital of France?'},
-            metadata=TaskMetadata(difficulty='medium', category='geography'),
-            expected_output=TaskOutput(answer='Paris', confidence=1.0),
-            output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
-            metrics={'chars': 30},
-            attributes={'is_about_france': True},
-            scores={},
-            labels={},
-            assertions={},
-            task_duration=1.0,
-            total_duration=3.0,
-            trace_id='00000000000000000000000000000001',
-            span_id='0000000000000007',
-        ),
-    ]
+    assert report.cases == snapshot(
+        [
+            ReportCase(
+                name='case1',
+                inputs=TaskInput(query='What is 2+2?'),
+                metadata=TaskMetadata(difficulty='easy', category='general'),
+                expected_output=TaskOutput(answer='4', confidence=1.0),
+                output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
+                metrics={'chars': 12},
+                attributes={'is_about_france': False},
+                scores={},
+                labels={},
+                assertions={},
+                task_duration=1.0,
+                total_duration=3.0,
+                trace_id='00000000000000000000000000000001',
+                span_id='0000000000000003',
+            ),
+            ReportCase(
+                name='case2',
+                inputs=TaskInput(query='What is the capital of France?'),
+                metadata=TaskMetadata(difficulty='medium', category='geography'),
+                expected_output=TaskOutput(answer='Paris', confidence=1.0),
+                output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
+                metrics={'chars': 30},
+                attributes={'is_about_france': True},
+                scores={},
+                labels={},
+                assertions={},
+                task_duration=1.0,
+                total_duration=3.0,
+                trace_id='00000000000000000000000000000001',
+                span_id='0000000000000007',
+            ),
+        ]
+    )
 
 
 async def test_repeated_name_outputs(example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata]):
@@ -393,7 +396,7 @@ async def test_repeated_name_outputs(example_dataset: Dataset[TaskInput, TaskOut
         [
             ReportCase(
                 name='case1',
-                inputs={'query': 'What is 2+2?'},
+                inputs=TaskInput(query='What is 2+2?'),
                 metadata=TaskMetadata(difficulty='easy', category='general'),
                 expected_output=TaskOutput(answer='4', confidence=1.0),
                 output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
@@ -419,7 +422,7 @@ async def test_repeated_name_outputs(example_dataset: Dataset[TaskInput, TaskOut
             ),
             ReportCase(
                 name='case2',
-                inputs={'query': 'What is the capital of France?'},
+                inputs=TaskInput(query='What is the capital of France?'),
                 metadata=TaskMetadata(difficulty='medium', category='geography'),
                 expected_output=TaskOutput(answer='Paris', confidence=1.0),
                 output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
@@ -467,7 +470,7 @@ async def test_genai_attribute_collection(example_dataset: Dataset[TaskInput, Ta
         [
             ReportCase(
                 name='case1',
-                inputs={'query': 'What is 2+2?'},
+                inputs=TaskInput(query='What is 2+2?'),
                 metadata=TaskMetadata(difficulty='easy', category='general'),
                 expected_output=TaskOutput(answer='4', confidence=1.0),
                 output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
@@ -483,7 +486,7 @@ async def test_genai_attribute_collection(example_dataset: Dataset[TaskInput, Ta
             ),
             ReportCase(
                 name='case2',
-                inputs={'query': 'What is the capital of France?'},
+                inputs=TaskInput(query='What is the capital of France?'),
                 metadata=TaskMetadata(difficulty='medium', category='geography'),
                 expected_output=TaskOutput(answer='Paris', confidence=1.0),
                 output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
@@ -988,3 +991,47 @@ def test_import_generate_dataset():
     from pydantic_evals.generation import generate_dataset
 
     assert generate_dataset
+
+
+def test_evaluate_non_serializable_inputs():
+    @dataclass
+    class MyInputs:
+        result_type: type[str] | type[int]
+
+    my_dataset = Dataset[MyInputs, Any, Any](
+        cases=[
+            Case(
+                name='str',
+                inputs=MyInputs(result_type=str),
+                expected_output='abc',
+            ),
+            Case(
+                name='int',
+                inputs=MyInputs(result_type=int),
+                expected_output=123,
+            ),
+        ],
+    )
+
+    async def my_task(my_inputs: MyInputs) -> int | str:
+        if issubclass(my_inputs.result_type, str):
+            return my_inputs.result_type('abc')
+        else:
+            return my_inputs.result_type(123)
+
+    report = my_dataset.evaluate_sync(task=my_task)
+    assert [c.inputs for c in report.cases] == snapshot([MyInputs(result_type=str), MyInputs(result_type=int)])
+
+    table = report.console_table(include_input=True)
+    assert render_table(table) == snapshot("""\
+Evaluation Summary: my_task
+┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
+┃ Case ID ┃ Inputs ┃ Duration ┃
+┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
+│ str │ test_evaluate_non_serializable_inputs.<locals>.MyInputs(result_type=<class 'str'>) │ 1.0s │
+├──────────┼────────────────────────────────────────────────────────────────────────────────────┼──────────┤
+│ int │ test_evaluate_non_serializable_inputs.<locals>.MyInputs(result_type=<class 'int'>) │ 1.0s │
+├──────────┼────────────────────────────────────────────────────────────────────────────────────┼──────────┤
+│ Averages │ │ 1.0s │
+└──────────┴────────────────────────────────────────────────────────────────────────────────────┴──────────┘
+""")

tests/evals/test_reporting.py (new file)
@@ -0,0 +1,437 @@
+from __future__ import annotations as _annotations
+
+from dataclasses import dataclass
+
+import pytest
+from inline_snapshot import snapshot
+from pydantic import BaseModel
+
+from ..conftest import try_import
+from .utils import render_table
+
+with try_import() as imports_successful:
+    from pydantic_evals.evaluators import EvaluationResult, Evaluator, EvaluatorContext
+    from pydantic_evals.reporting import (
+        EvaluationRenderer,
+        EvaluationReport,
+        ReportCase,
+        ReportCaseAggregate,
+    )
+
+pytestmark = [pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'), pytest.mark.anyio]
+
+
+class TaskInput(BaseModel):
+    query: str
+
+
+class TaskOutput(BaseModel):
+    answer: str
+
+
+class TaskMetadata(BaseModel):
+    difficulty: str
+
+
+@pytest.fixture
+def mock_evaluator() -> Evaluator[TaskInput, TaskOutput, TaskMetadata]:
+    class MockEvaluator(Evaluator[TaskInput, TaskOutput, TaskMetadata]):
+        def evaluate(self, ctx: EvaluatorContext[TaskInput, TaskOutput, TaskMetadata]) -> bool:
+            raise NotImplementedError
+
+    return MockEvaluator()
+
+
+@pytest.fixture
+def sample_assertion(mock_evaluator: Evaluator[TaskInput, TaskOutput, TaskMetadata]) -> EvaluationResult[bool]:
+    return EvaluationResult(
+        name='MockEvaluator',
+        value=True,
+        reason=None,
+        source=mock_evaluator,
+    )
+
+
+@pytest.fixture
+def sample_score(mock_evaluator: Evaluator[TaskInput, TaskOutput, TaskMetadata]) -> EvaluationResult[float]:
+    return EvaluationResult(
+        name='MockEvaluator',
+        value=2.5,
+        reason=None,
+        source=mock_evaluator,
+    )
+
+
+@pytest.fixture
+def sample_label(mock_evaluator: Evaluator[TaskInput, TaskOutput, TaskMetadata]) -> EvaluationResult[str]:
+    return EvaluationResult(
+        name='MockEvaluator',
+        value='hello',
+        reason=None,
+        source=mock_evaluator,
+    )
+
+
+@pytest.fixture
+def sample_report_case(
+    sample_assertion: EvaluationResult[bool], sample_score: EvaluationResult[float], sample_label: EvaluationResult[str]
+) -> ReportCase:
+    return ReportCase(
+        name='test_case',
+        inputs={'query': 'What is 2+2?'},
+        output={'answer': '4'},
+        expected_output={'answer': '4'},
+        metadata={'difficulty': 'easy'},
+        metrics={'accuracy': 0.95},
+        attributes={},
+        scores={'score1': sample_score},
+        labels={'label1': sample_label},
+        assertions={sample_assertion.name: sample_assertion},
+        task_duration=0.1,
+        total_duration=0.2,
+        trace_id='test-trace-id',
+        span_id='test-span-id',
+    )
+
+
+@pytest.fixture
+def sample_report(sample_report_case: ReportCase) -> EvaluationReport:
+    return EvaluationReport(
+        cases=[sample_report_case],
+        name='test_report',
+    )
+
+
+async def test_evaluation_renderer_basic(sample_report: EvaluationReport):
+    """Test basic functionality of EvaluationRenderer."""
+    renderer = EvaluationRenderer(
+        include_input=True,
+        include_output=True,
+        include_metadata=True,
+        include_expected_output=True,
+        include_durations=True,
+        include_total_duration=True,
+        include_removed_cases=False,
+        include_averages=True,
+        input_config={},
+        metadata_config={},
+        output_config={},
+        score_configs={},
+        label_configs={},
+        metric_configs={},
+        duration_config={},
+    )
+
+    table = renderer.build_table(sample_report)
+    assert render_table(table) == snapshot("""\
+Evaluation Summary: test_report
+┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
+┃ Case ID ┃ Inputs ┃ Metadata ┃ Expected Output ┃ Outputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Durations ┃
+┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩
+│ test_case │ {'query': 'What is 2+2?'} │ {'difficulty': 'easy'} │ {'answer': '4'} │ {'answer': '4'} │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ task: 0.100 │
+│ │ │ │ │ │ │ │ │ │ total: 0.200 │
+├───────────┼───────────────────────────┼────────────────────────┼─────────────────┼─────────────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼──────────────┤
+│ Averages │ │ │ │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ task: 0.100 │
+│ │ │ │ │ │ │ │ │ │ total: 0.200 │
+└───────────┴───────────────────────────┴────────────────────────┴─────────────────┴─────────────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴──────────────┘
+""")
+
+
+async def test_evaluation_renderer_with_baseline(sample_report: EvaluationReport):
+    """Test EvaluationRenderer with baseline comparison."""
+    baseline_report = EvaluationReport(
+        cases=[
+            ReportCase(
+                name='test_case',
+                inputs={'query': 'What is 2+2?'},
+                output={'answer': '4'},
+                expected_output={'answer': '4'},
+                metadata={'difficulty': 'easy'},
+                metrics={'accuracy': 0.90},
+                attributes={},
+                scores={
+                    'score1': EvaluationResult(
+                        name='MockEvaluator',
+                        value=2.5,
+                        reason=None,
+                        source=sample_report.cases[0].scores['score1'].source,
+                    )
+                },
+                labels={
+                    'label1': EvaluationResult(
+                        name='MockEvaluator',
+                        value='hello',
+                        reason=None,
+                        source=sample_report.cases[0].labels['label1'].source,
+                    )
+                },
+                assertions={},
+                task_duration=0.15,
+                total_duration=0.25,
+                trace_id='test-trace-id',
+                span_id='test-span-id',
+            )
+        ],
+        name='baseline_report',
+    )
+
+    renderer = EvaluationRenderer(
+        include_input=True,
+        include_metadata=True,
+        include_expected_output=True,
+        include_output=True,
+        include_durations=True,
+        include_total_duration=True,
+        include_removed_cases=False,
+        include_averages=True,
+        input_config={},
+        metadata_config={},
+        output_config={},
+        score_configs={},
+        label_configs={},
+        metric_configs={},
+        duration_config={},
+    )
+
+    table = renderer.build_diff_table(sample_report, baseline_report)
+    assert render_table(table) == snapshot("""\
+Evaluation Diff: baseline_report → test_report
+┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Case ID ┃ Inputs ┃ Metadata ┃ Expected Output ┃ Outputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Durations ┃
+┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+│ test_case │ {'query': 'What is 2+2?'} │ {'difficulty': 'easy'} │ {'answer': '4'} │ {'answer': '4'} │ score1: 2.50 │ label1: EvaluationResult(name='MockEvaluator', value='hello', reason=None, │ accuracy: 0.900 → 0.950 (+0.05 / +5.6%) │ → ✔ │ task: 0.150 → 0.100 (-0.05 / -33.3%) │
+│ │ │ │ │ │ │ source=mock_evaluator.<locals>.MockEvaluator()) │ │ │ total: 0.250 → 0.200 (-0.05 / -20.0%) │
+├───────────┼───────────────────────────┼────────────────────────┼─────────────────┼─────────────────┼──────────────┼─────────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────────────────┼──────────────┼───────────────────────────────────────┤
+│ Averages │ │ │ │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.900 → 0.950 (+0.05 / +5.6%) │ - → 100.0% ✔ │ task: 0.150 → 0.100 (-0.05 / -33.3%) │
+│ │ │ │ │ │ │ │ │ │ total: 0.250 → 0.200 (-0.05 / -20.0%) │
+└───────────┴───────────────────────────┴────────────────────────┴─────────────────┴─────────────────┴──────────────┴─────────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────┴──────────────┴───────────────────────────────────────┘
+""")
+
+
+async def test_evaluation_renderer_with_removed_cases(sample_report: EvaluationReport):
+    """Test EvaluationRenderer with removed cases."""
+    baseline_report = EvaluationReport(
+        cases=[
+            ReportCase(
+                name='removed_case',
+                inputs={'query': 'What is 3+3?'},
+                output={'answer': '6'},
+                expected_output={'answer': '6'},
+                metadata={'difficulty': 'medium'},
+                metrics={'accuracy': 0.85},
+                attributes={},
+                scores={},
+                labels={},
+                assertions={},
+                task_duration=0.1,
+                total_duration=0.15,
+                trace_id='test-trace-id-2',
+                span_id='test-span-id-2',
+            )
+        ],
+        name='baseline_report',
+    )
+
+    renderer = EvaluationRenderer(
+        include_input=True,
+        include_metadata=True,
+        include_expected_output=True,
+        include_output=True,
+        include_durations=True,
+        include_total_duration=True,
+        include_removed_cases=True,
+        include_averages=True,
+        input_config={},
+        metadata_config={},
+        output_config={},
+        score_configs={},
+        label_configs={},
+        metric_configs={},
+        duration_config={},
+    )
+
+    table = renderer.build_diff_table(sample_report, baseline_report)
+    assert render_table(table) == snapshot("""\
+Evaluation Diff: baseline_report → test_report
+┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Case ID ┃ Inputs ┃ Metadata ┃ Expected Output ┃ Outputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Durations ┃
+┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+│ + Added Case │ {'query': 'What is 2+2?'} │ {'difficulty': 'easy'} │ {'answer': '4'} │ {'answer': '4'} │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ task: 0.100 │
+│ test_case │ │ │ │ │ │ │ │ │ total: 0.200 │
+├────────────────┼───────────────────────────┼──────────────────────────┼─────────────────┼─────────────────┼──────────────────────────┼────────────────────────────────────┼─────────────────────────────────────────┼──────────────┼───────────────────────────────────────┤
+│ - Removed Case │ {'query': 'What is 3+3?'} │ {'difficulty': 'medium'} │ {'answer': '6'} │ {'answer': '6'} │ - │ - │ accuracy: 0.850 │ - │ task: 0.100 │
+│ removed_case │ │ │ │ │ │ │ │ │ total: 0.150 │
+├────────────────┼───────────────────────────┼──────────────────────────┼─────────────────┼─────────────────┼──────────────────────────┼────────────────────────────────────┼─────────────────────────────────────────┼──────────────┼───────────────────────────────────────┤
+│ Averages │ │ │ │ │ score1: <missing> → 2.50 │ label1: <missing> → {'hello': 1.0} │ accuracy: 0.850 → 0.950 (+0.1 / +11.8%) │ - → 100.0% ✔ │ task: 0.100 │
+│ │ │ │ │ │ │ │ │ │ total: 0.150 → 0.200 (+0.05 / +33.3%) │
+└────────────────┴───────────────────────────┴──────────────────────────┴─────────────────┴─────────────────┴──────────────────────────┴────────────────────────────────────┴─────────────────────────────────────────┴──────────────┴───────────────────────────────────────┘
+""")
+
+
+async def test_evaluation_renderer_with_custom_configs(sample_report: EvaluationReport):
+    """Test EvaluationRenderer with custom render configurations."""
+    renderer = EvaluationRenderer(
+        include_input=True,
+        include_metadata=True,
+        include_expected_output=True,
+        include_output=True,
+        include_durations=True,
+        include_total_duration=True,
+        include_removed_cases=False,
+        include_averages=True,
+        input_config={'value_formatter': lambda x: str(x)},
+        metadata_config={'value_formatter': lambda x: str(x)},
+        output_config={'value_formatter': lambda x: str(x)},
+        score_configs={
+            'score1': {
+                'value_formatter': '{:.2f}',
+                'diff_formatter': '{:+.2f}',
+                'diff_atol': 0.01,
+                'diff_rtol': 0.05,
+                'diff_increase_style': 'bold green',
+                'diff_decrease_style': 'bold red',
+            }
+        },
+        label_configs={'label1': {'value_formatter': lambda x: str(x)}},
+        metric_configs={
+            'accuracy': {
+                'value_formatter': '{:.1%}',
+                'diff_formatter': '{:+.1%}',
+                'diff_atol': 0.01,
+                'diff_rtol': 0.05,
+                'diff_increase_style': 'bold green',
+                'diff_decrease_style': 'bold red',
+            }
+        },
+        duration_config={
+            'value_formatter': '{:.3f}s',
+            'diff_formatter': '{:+.3f}s',
+            'diff_atol': 0.001,
+            'diff_rtol': 0.05,
+            'diff_increase_style': 'bold red',
+            'diff_decrease_style': 'bold green',
+        },
+    )
+
+    table = renderer.build_table(sample_report)
+    assert render_table(table) == snapshot("""\
+Evaluation Summary: test_report
+┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
+┃ Case ID ┃ Inputs ┃ Metadata ┃ Expected Output ┃ Outputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Durations ┃
+┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
+│ test_case │ {'query': 'What is 2+2?'} │ {'difficulty': 'easy'} │ {'answer': '4'} │ {'answer': '4'} │ score1: 2.50 │ label1: hello │ accuracy: 95.0% │ ✔ │ task: 0.100s │
+│ │ │ │ │ │ │ │ │ │ total: 0.200s │
+├───────────┼───────────────────────────┼────────────────────────┼─────────────────┼─────────────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼───────────────┤
+│ Averages │ │ │ │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 95.0% │ 100.0% ✔ │ task: 0.100s │
+│ │ │ │ │ │ │ │ │ │ total: 0.200s │
+└───────────┴───────────────────────────┴────────────────────────┴─────────────────┴─────────────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴───────────────┘
+""")
+
+
+async def test_report_case_aggregate_average():
+    """Test ReportCaseAggregate.average() method."""
+
+    @dataclass
+    class MockEvaluator(Evaluator[TaskInput, TaskOutput, TaskMetadata]):
+        def evaluate(self, ctx: EvaluatorContext[TaskInput, TaskOutput, TaskMetadata]) -> float:
+            raise NotImplementedError
+
+    cases = [
+        ReportCase(
+            name='case1',
+            inputs={'query': 'What is 2+2?'},
+            output={'answer': '4'},
+            expected_output={'answer': '4'},
+            metadata={'difficulty': 'easy'},
+            metrics={'accuracy': 0.95},
+            attributes={},
+            scores={
+                'score1': EvaluationResult(
+                    name='MockEvaluator',
+                    value=0.8,
+                    reason=None,
+                    source=MockEvaluator(),
+                )
+            },
+            labels={
+                'label1': EvaluationResult(
+                    name='MockEvaluator',
+                    value='good',
+                    reason=None,
+                    source=MockEvaluator(),
+                )
+            },
+            assertions={
+                'assert1': EvaluationResult(
+                    name='MockEvaluator',
+                    value=True,
+                    reason=None,
+                    source=MockEvaluator(),
+                )
+            },
+            task_duration=0.1,
+            total_duration=0.2,
+            trace_id='test-trace-id-1',
+            span_id='test-span-id-1',
+        ),
+        ReportCase(
+            name='case2',
+            inputs={'query': 'What is 3+3?'},
+            output={'answer': '6'},
+            expected_output={'answer': '6'},
+            metadata={'difficulty': 'medium'},
+            metrics={'accuracy': 0.85},
+            attributes={},
+            scores={
+                'score1': EvaluationResult(
+                    name='MockEvaluator',
+                    value=0.7,
+                    reason=None,
+                    source=MockEvaluator(),
+                )
+            },
+            labels={
+                'label1': EvaluationResult(
+                    name='MockEvaluator',
+                    value='good',
+                    reason=None,
+                    source=MockEvaluator(),
+                )
+            },
+            assertions={
+                'assert1': EvaluationResult(
+                    name='MockEvaluator',
+                    value=False,
+                    reason=None,
+                    source=MockEvaluator(),
+                )
+            },
+            task_duration=0.15,
+            total_duration=0.25,
+            trace_id='test-trace-id-2',
+            span_id='test-span-id-2',
+        ),
+    ]
+
+    aggregate = ReportCaseAggregate.average(cases)
+
+    assert aggregate.name == 'Averages'
+    assert aggregate.scores['score1'] == 0.75  # (0.8 + 0.7) / 2
+    assert aggregate.labels['label1']['good'] == 1.0  # Both cases have 'good' label
+    assert abs(aggregate.metrics['accuracy'] - 0.90) < 1e-10  # floating-point error  # (0.95 + 0.85) / 2
+    assert aggregate.assertions == 0.5  # 1 passing out of 2 assertions
+    assert aggregate.task_duration == 0.125  # (0.1 + 0.15) / 2
+    assert aggregate.total_duration == 0.225  # (0.2 + 0.25) / 2
+
+
+async def test_report_case_aggregate_empty():
+    """Test ReportCaseAggregate.average() with empty cases list."""
+    assert ReportCaseAggregate.average([]).model_dump() == {
+        'assertions': None,
+        'labels': {},
+        'metrics': {},
+        'name': 'Averages',
+        'scores': {},
+        'task_duration': 0.0,
+        'total_duration': 0.0,
+    }
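
The last two tests pin down the semantics of ReportCaseAggregate.average(): scores, metrics, and durations are arithmetic means, labels become value-frequency distributions, and assertions collapse to a pass rate. Restated as plain arithmetic for the two cases above, purely as an illustration of the asserted behaviour (not the library's implementation):

scores = [0.8, 0.7]
metrics = [0.95, 0.85]
assertions = [True, False]
labels = ['good', 'good']

assert sum(scores) / len(scores) == 0.75  # score1 average
assert abs(sum(metrics) / len(metrics) - 0.90) < 1e-10  # accuracy average, allowing for float error
assert sum(assertions) / len(assertions) == 0.5  # 1 passing assertion out of 2 -> pass rate 0.5
assert labels.count('good') / len(labels) == 1.0  # label distribution {'good': 1.0}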