pydantic-ai 0.0.48__tar.gz → 0.0.50__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pydantic-ai might be problematic. Click here for more details.

Files changed (145)
  1. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/PKG-INFO +3 -3
  2. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_dataset.py +85 -42
  3. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_otel.py +40 -18
  4. pydantic_ai-0.0.50/tests/evals/test_reporting.py +437 -0
  5. pydantic_ai-0.0.50/tests/evals/utils.py +14 -0
  6. pydantic_ai-0.0.50/tests/models/cassettes/test_anthropic/test_document_url_input.yaml +63 -0
  7. pydantic_ai-0.0.50/tests/models/cassettes/test_anthropic/test_image_url_input.yaml +62 -0
  8. pydantic_ai-0.0.50/tests/models/cassettes/test_gemini/test_gemini_drop_exclusive_maximum.yaml +326 -0
  9. pydantic_ai-0.0.50/tests/models/cassettes/test_gemini/test_gemini_exclusive_minimum_and_maximum.yaml +158 -0
  10. pydantic_ai-0.0.50/tests/models/cassettes/test_openai_responses/test_openai_responses_model_builtin_tools.yaml +120 -0
  11. pydantic_ai-0.0.50/tests/models/cassettes/test_openai_responses/test_openai_responses_reasoning_generate_summary.yaml +105 -0
  12. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_anthropic.py +5 -7
  13. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_gemini.py +17 -0
  14. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_openai_responses.py +61 -1
  15. pydantic_ai-0.0.50/tests/test_cli.py +191 -0
  16. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_examples.py +1 -0
  17. pydantic_ai-0.0.48/tests/evals/test_reporting.py +0 -380
  18. pydantic_ai-0.0.48/tests/models/cassettes/test_anthropic/test_document_url_input.yaml +0 -340
  19. pydantic_ai-0.0.48/tests/models/cassettes/test_anthropic/test_image_url_input.yaml +0 -662
  20. pydantic_ai-0.0.48/tests/test_cli.py +0 -78
  21. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/.gitignore +0 -0
  22. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/LICENSE +0 -0
  23. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/Makefile +0 -0
  24. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/README.md +0 -0
  25. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/pyproject.toml +0 -0
  26. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/__init__.py +0 -0
  27. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/assets/dummy.pdf +0 -0
  28. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/assets/kiwi.png +0 -0
  29. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/assets/marcelo.mp3 +0 -0
  30. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/cassettes/test_mcp/test_agent_with_stdio_server.yaml +0 -0
  31. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/conftest.py +0 -0
  32. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/__init__.py +0 -0
  33. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_evaluator_base.py +0 -0
  34. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_evaluator_common.py +0 -0
  35. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_evaluator_context.py +0 -0
  36. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_evaluator_spec.py +0 -0
  37. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_evaluators.py +0 -0
  38. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_llm_as_a_judge.py +0 -0
  39. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_render_numbers.py +0 -0
  40. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_reports.py +0 -0
  41. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_utils.py +0 -0
  42. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/example_modules/README.md +0 -0
  43. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/example_modules/bank_database.py +0 -0
  44. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/example_modules/fake_database.py +0 -0
  45. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/example_modules/weather_service.py +0 -0
  46. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/__init__.py +0 -0
  47. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/test_file_persistence.py +0 -0
  48. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/test_graph.py +0 -0
  49. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/test_mermaid.py +0 -0
  50. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/test_persistence.py +0 -0
  51. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/test_state.py +0 -0
  52. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/test_utils.py +0 -0
  53. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/import_examples.py +0 -0
  54. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/json_body_serializer.py +0 -0
  55. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/mcp_server.py +0 -0
  56. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/__init__.py +0 -0
  57. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_anthropic/test_document_binary_content_input.yaml +0 -0
  58. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_anthropic/test_image_url_input_invalid_mime_type.yaml +0 -0
  59. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_anthropic/test_multiple_parallel_tool_calls.yaml +0 -0
  60. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_anthropic/test_text_document_url_input.yaml +0 -0
  61. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model.yaml +0 -0
  62. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_anthropic_model_without_tools.yaml +0 -0
  63. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_iter_stream.yaml +0 -0
  64. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_max_tokens.yaml +0 -0
  65. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_retry.yaml +0 -0
  66. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_stream.yaml +0 -0
  67. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_structured_response.yaml +0 -0
  68. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_top_p.yaml +0 -0
  69. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_document_url_input.yaml +0 -0
  70. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_image_as_binary_content_input.yaml +0 -0
  71. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_image_url_input.yaml +0 -0
  72. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_text_as_binary_content_input.yaml +0 -0
  73. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_text_document_url_input.yaml +0 -0
  74. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_cohere/test_request_simple_success_with_vcr.yaml +0 -0
  75. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_gemini/test_document_url_input.yaml +0 -0
  76. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_gemini/test_image_as_binary_content_input.yaml +0 -0
  77. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_gemini/test_image_url_input.yaml +0 -0
  78. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_groq/test_image_as_binary_content_input.yaml +0 -0
  79. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_groq/test_image_url_input.yaml +0 -0
  80. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_audio_as_binary_content_input.yaml +0 -0
  81. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_document_url_input.yaml +0 -0
  82. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_image_as_binary_content_input.yaml +0 -0
  83. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_max_completion_tokens[gpt-4.5-preview].yaml +0 -0
  84. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_max_completion_tokens[gpt-4o-mini].yaml +0 -0
  85. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_max_completion_tokens[o3-mini].yaml +0 -0
  86. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_multiple_agent_tool_calls.yaml +0 -0
  87. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_openai_o1_mini_system_role[developer].yaml +0 -0
  88. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_openai_o1_mini_system_role[system].yaml +0 -0
  89. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_user_id.yaml +0 -0
  90. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_audio_as_binary_content_input.yaml +0 -0
  91. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_image_as_binary_content_input.yaml +0 -0
  92. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_document_as_binary_content_input.yaml +0 -0
  93. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_document_url_input.yaml +0 -0
  94. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_image_url_input.yaml +0 -0
  95. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_http_error.yaml +0 -0
  96. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_retry.yaml +0 -0
  97. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_simple_response.yaml +0 -0
  98. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_simple_response_with_tool_call.yaml +0 -0
  99. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_reasoning_effort.yaml +0 -0
  100. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_result_type.yaml +0 -0
  101. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_stream.yaml +0 -0
  102. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_system_prompt.yaml +0 -0
  103. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_text_document_url_input.yaml +0 -0
  104. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/mock_async_stream.py +0 -0
  105. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_bedrock.py +0 -0
  106. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_cohere.py +0 -0
  107. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_fallback.py +0 -0
  108. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_groq.py +0 -0
  109. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_instrumented.py +0 -0
  110. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_mistral.py +0 -0
  111. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_model.py +0 -0
  112. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_model_function.py +0 -0
  113. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_model_names.py +0 -0
  114. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_model_test.py +0 -0
  115. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_openai.py +0 -0
  116. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/__init__.py +0 -0
  117. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/cassettes/test_azure/test_azure_provider_call.yaml +0 -0
  118. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/cassettes/test_google_vertex/test_vertexai_provider.yaml +0 -0
  119. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_anthropic.py +0 -0
  120. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_azure.py +0 -0
  121. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_bedrock.py +0 -0
  122. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_cohere.py +0 -0
  123. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_deepseek.py +0 -0
  124. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_google_gla.py +0 -0
  125. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_google_vertex.py +0 -0
  126. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_groq.py +0 -0
  127. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_mistral.py +0 -0
  128. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_openai.py +0 -0
  129. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_provider_names.py +0 -0
  130. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_agent.py +0 -0
  131. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_deps.py +0 -0
  132. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_format_as_xml.py +0 -0
  133. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_json_body_serializer.py +0 -0
  134. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_live.py +0 -0
  135. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_logfire.py +0 -0
  136. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_mcp.py +0 -0
  137. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_messages.py +0 -0
  138. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_parts_manager.py +0 -0
  139. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_settings.py +0 -0
  140. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_streaming.py +0 -0
  141. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_tools.py +0 -0
  142. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_usage_limits.py +0 -0
  143. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_utils.py +0 -0
  144. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/typed_agent.py +0 -0
  145. {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/typed_graph.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pydantic-ai
3
- Version: 0.0.48
3
+ Version: 0.0.50
4
4
  Summary: Agent Framework / shim to use Pydantic with LLMs
5
5
  Project-URL: Homepage, https://ai.pydantic.dev
6
6
  Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -28,9 +28,9 @@ Classifier: Topic :: Internet
28
28
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
29
29
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
30
30
  Requires-Python: >=3.9
31
- Requires-Dist: pydantic-ai-slim[anthropic,bedrock,cli,cohere,evals,groq,mcp,mistral,openai,vertexai]==0.0.48
31
+ Requires-Dist: pydantic-ai-slim[anthropic,bedrock,cli,cohere,evals,groq,mcp,mistral,openai,vertexai]==0.0.50
32
32
  Provides-Extra: examples
33
- Requires-Dist: pydantic-ai-examples==0.0.48; extra == 'examples'
33
+ Requires-Dist: pydantic-ai-examples==0.0.50; extra == 'examples'
34
34
  Provides-Extra: logfire
35
35
  Requires-Dist: logfire>=3.11.0; extra == 'logfire'
36
36
  Description-Content-Type: text/markdown
@@ -13,6 +13,7 @@ from inline_snapshot import snapshot
13
13
  from pydantic import BaseModel
14
14
 
15
15
  from ..conftest import try_import
16
+ from .utils import render_table
16
17
 
17
18
  with try_import() as imports_successful:
18
19
  from pydantic_evals import Case, Dataset
@@ -164,16 +165,12 @@ async def test_add_evaluator(
164
165
  'cases': [
165
166
  {
166
167
  'evaluators': [{'Python': 'ctx.output == 2'}],
167
- 'expected_output': None,
168
168
  'inputs': {'query': 'What is 1+1?'},
169
- 'metadata': None,
170
169
  'name': 'My Case 1',
171
170
  },
172
171
  {
173
172
  'evaluators': [{'Python': 'ctx.output == 4'}],
174
- 'expected_output': None,
175
173
  'inputs': {'query': 'What is 2+2?'},
176
- 'metadata': None,
177
174
  'name': 'My Case 2',
178
175
  },
179
176
  ],
@@ -346,40 +343,42 @@ async def test_increment_eval_metric(example_dataset: Dataset[TaskInput, TaskOut
346
343
  return TaskOutput(answer=f'answer to {inputs.query}')
347
344
 
348
345
  report = await example_dataset.evaluate(my_task)
349
- assert report.cases == [
350
- ReportCase(
351
- name='case1',
352
- inputs={'query': 'What is 2+2?'},
353
- metadata=TaskMetadata(difficulty='easy', category='general'),
354
- expected_output=TaskOutput(answer='4', confidence=1.0),
355
- output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
356
- metrics={'chars': 12},
357
- attributes={'is_about_france': False},
358
- scores={},
359
- labels={},
360
- assertions={},
361
- task_duration=1.0,
362
- total_duration=3.0,
363
- trace_id='00000000000000000000000000000001',
364
- span_id='0000000000000003',
365
- ),
366
- ReportCase(
367
- name='case2',
368
- inputs={'query': 'What is the capital of France?'},
369
- metadata=TaskMetadata(difficulty='medium', category='geography'),
370
- expected_output=TaskOutput(answer='Paris', confidence=1.0),
371
- output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
372
- metrics={'chars': 30},
373
- attributes={'is_about_france': True},
374
- scores={},
375
- labels={},
376
- assertions={},
377
- task_duration=1.0,
378
- total_duration=3.0,
379
- trace_id='00000000000000000000000000000001',
380
- span_id='0000000000000007',
381
- ),
382
- ]
346
+ assert report.cases == snapshot(
347
+ [
348
+ ReportCase(
349
+ name='case1',
350
+ inputs=TaskInput(query='What is 2+2?'),
351
+ metadata=TaskMetadata(difficulty='easy', category='general'),
352
+ expected_output=TaskOutput(answer='4', confidence=1.0),
353
+ output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
354
+ metrics={'chars': 12},
355
+ attributes={'is_about_france': False},
356
+ scores={},
357
+ labels={},
358
+ assertions={},
359
+ task_duration=1.0,
360
+ total_duration=3.0,
361
+ trace_id='00000000000000000000000000000001',
362
+ span_id='0000000000000003',
363
+ ),
364
+ ReportCase(
365
+ name='case2',
366
+ inputs=TaskInput(query='What is the capital of France?'),
367
+ metadata=TaskMetadata(difficulty='medium', category='geography'),
368
+ expected_output=TaskOutput(answer='Paris', confidence=1.0),
369
+ output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
370
+ metrics={'chars': 30},
371
+ attributes={'is_about_france': True},
372
+ scores={},
373
+ labels={},
374
+ assertions={},
375
+ task_duration=1.0,
376
+ total_duration=3.0,
377
+ trace_id='00000000000000000000000000000001',
378
+ span_id='0000000000000007',
379
+ ),
380
+ ]
381
+ )
383
382
 
384
383
 
385
384
  async def test_repeated_name_outputs(example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata]):
@@ -397,7 +396,7 @@ async def test_repeated_name_outputs(example_dataset: Dataset[TaskInput, TaskOut
397
396
  [
398
397
  ReportCase(
399
398
  name='case1',
400
- inputs={'query': 'What is 2+2?'},
399
+ inputs=TaskInput(query='What is 2+2?'),
401
400
  metadata=TaskMetadata(difficulty='easy', category='general'),
402
401
  expected_output=TaskOutput(answer='4', confidence=1.0),
403
402
  output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
@@ -423,7 +422,7 @@ async def test_repeated_name_outputs(example_dataset: Dataset[TaskInput, TaskOut
423
422
  ),
424
423
  ReportCase(
425
424
  name='case2',
426
- inputs={'query': 'What is the capital of France?'},
425
+ inputs=TaskInput(query='What is the capital of France?'),
427
426
  metadata=TaskMetadata(difficulty='medium', category='geography'),
428
427
  expected_output=TaskOutput(answer='Paris', confidence=1.0),
429
428
  output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
@@ -471,7 +470,7 @@ async def test_genai_attribute_collection(example_dataset: Dataset[TaskInput, Ta
471
470
  [
472
471
  ReportCase(
473
472
  name='case1',
474
- inputs={'query': 'What is 2+2?'},
473
+ inputs=TaskInput(query='What is 2+2?'),
475
474
  metadata=TaskMetadata(difficulty='easy', category='general'),
476
475
  expected_output=TaskOutput(answer='4', confidence=1.0),
477
476
  output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
@@ -487,7 +486,7 @@ async def test_genai_attribute_collection(example_dataset: Dataset[TaskInput, Ta
487
486
  ),
488
487
  ReportCase(
489
488
  name='case2',
490
- inputs={'query': 'What is the capital of France?'},
489
+ inputs=TaskInput(query='What is the capital of France?'),
491
490
  metadata=TaskMetadata(difficulty='medium', category='geography'),
492
491
  expected_output=TaskOutput(answer='Paris', confidence=1.0),
493
492
  output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
@@ -992,3 +991,47 @@ def test_import_generate_dataset():
992
991
  from pydantic_evals.generation import generate_dataset
993
992
 
994
993
  assert generate_dataset
994
+
995
+
996
+ def test_evaluate_non_serializable_inputs():
997
+ @dataclass
998
+ class MyInputs:
999
+ result_type: type[str] | type[int]
1000
+
1001
+ my_dataset = Dataset[MyInputs, Any, Any](
1002
+ cases=[
1003
+ Case(
1004
+ name='str',
1005
+ inputs=MyInputs(result_type=str),
1006
+ expected_output='abc',
1007
+ ),
1008
+ Case(
1009
+ name='int',
1010
+ inputs=MyInputs(result_type=int),
1011
+ expected_output=123,
1012
+ ),
1013
+ ],
1014
+ )
1015
+
1016
+ async def my_task(my_inputs: MyInputs) -> int | str:
1017
+ if issubclass(my_inputs.result_type, str):
1018
+ return my_inputs.result_type('abc')
1019
+ else:
1020
+ return my_inputs.result_type(123)
1021
+
1022
+ report = my_dataset.evaluate_sync(task=my_task)
1023
+ assert [c.inputs for c in report.cases] == snapshot([MyInputs(result_type=str), MyInputs(result_type=int)])
1024
+
1025
+ table = report.console_table(include_input=True)
1026
+ assert render_table(table) == snapshot("""\
1027
+ Evaluation Summary: my_task
1028
+ ┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
1029
+ ┃ Case ID ┃ Inputs ┃ Duration ┃
1030
+ ┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
1031
+ │ str │ test_evaluate_non_serializable_inputs.<locals>.MyInputs(result_type=<class 'str'>) │ 1.0s │
1032
+ ├──────────┼────────────────────────────────────────────────────────────────────────────────────┼──────────┤
1033
+ │ int │ test_evaluate_non_serializable_inputs.<locals>.MyInputs(result_type=<class 'int'>) │ 1.0s │
1034
+ ├──────────┼────────────────────────────────────────────────────────────────────────────────────┼──────────┤
1035
+ │ Averages │ │ 1.0s │
1036
+ └──────────┴────────────────────────────────────────────────────────────────────────────────────┴──────────┘
1037
+ """)
@@ -245,26 +245,26 @@ async def test_span_tree_repr(span_tree: SpanTree):
245
245
  """)
246
246
  assert span_tree.repr_xml(include_span_id=True) == snapshot("""\
247
247
  <SpanTree>
248
- <SpanNode name='root' span_id=0000000000000001 >
249
- <SpanNode name='child1' span_id=0000000000000003 >
250
- <SpanNode name='grandchild1' span_id=0000000000000005 />
251
- <SpanNode name='grandchild2' span_id=0000000000000007 />
248
+ <SpanNode name='root' span_id='0000000000000001' >
249
+ <SpanNode name='child1' span_id='0000000000000003' >
250
+ <SpanNode name='grandchild1' span_id='0000000000000005' />
251
+ <SpanNode name='grandchild2' span_id='0000000000000007' />
252
252
  </SpanNode>
253
- <SpanNode name='child2' span_id=0000000000000009 >
254
- <SpanNode name='grandchild3' span_id=000000000000000b />
253
+ <SpanNode name='child2' span_id='0000000000000009' >
254
+ <SpanNode name='grandchild3' span_id='000000000000000b' />
255
255
  </SpanNode>
256
256
  </SpanNode>
257
257
  </SpanTree>\
258
258
  """)
259
259
  assert span_tree.repr_xml(include_trace_id=True) == snapshot("""\
260
260
  <SpanTree>
261
- <SpanNode name='root' trace_id=00000000000000000000000000000001 >
262
- <SpanNode name='child1' trace_id=00000000000000000000000000000001 >
263
- <SpanNode name='grandchild1' trace_id=00000000000000000000000000000001 />
264
- <SpanNode name='grandchild2' trace_id=00000000000000000000000000000001 />
261
+ <SpanNode name='root' trace_id='00000000000000000000000000000001' >
262
+ <SpanNode name='child1' trace_id='00000000000000000000000000000001' >
263
+ <SpanNode name='grandchild1' trace_id='00000000000000000000000000000001' />
264
+ <SpanNode name='grandchild2' trace_id='00000000000000000000000000000001' />
265
265
  </SpanNode>
266
- <SpanNode name='child2' trace_id=00000000000000000000000000000001 >
267
- <SpanNode name='grandchild3' trace_id=00000000000000000000000000000001 />
266
+ <SpanNode name='child2' trace_id='00000000000000000000000000000001' >
267
+ <SpanNode name='grandchild3' trace_id='00000000000000000000000000000001' />
268
268
  </SpanNode>
269
269
  </SpanNode>
270
270
  </SpanTree>\
@@ -302,9 +302,9 @@ async def test_span_node_repr(span_tree: SpanTree):
302
302
  assert node is not None
303
303
 
304
304
  leaf_node = span_tree.first({'name_equals': 'grandchild1'})
305
- assert str(leaf_node) == snapshot("<SpanNode name='grandchild1' span_id=0000000000000005 />")
305
+ assert str(leaf_node) == snapshot("<SpanNode name='grandchild1' span_id='0000000000000005' />")
306
306
 
307
- assert str(node) == snapshot("<SpanNode name='child2' span_id=0000000000000009>...</SpanNode>")
307
+ assert str(node) == snapshot("<SpanNode name='child2' span_id='0000000000000009'>...</SpanNode>")
308
308
  assert repr(node) == snapshot("""\
309
309
  <SpanNode name='child2' >
310
310
  <SpanNode name='grandchild3' />
@@ -312,13 +312,13 @@ async def test_span_node_repr(span_tree: SpanTree):
312
312
  """)
313
313
  assert node.repr_xml(include_children=False) == snapshot("<SpanNode name='child2' children=... />")
314
314
  assert node.repr_xml(include_span_id=True) == snapshot("""\
315
- <SpanNode name='child2' span_id=0000000000000009 >
316
- <SpanNode name='grandchild3' span_id=000000000000000b />
315
+ <SpanNode name='child2' span_id='0000000000000009' >
316
+ <SpanNode name='grandchild3' span_id='000000000000000b' />
317
317
  </SpanNode>\
318
318
  """)
319
319
  assert node.repr_xml(include_trace_id=True) == snapshot("""\
320
- <SpanNode name='child2' trace_id=00000000000000000000000000000001 >
321
- <SpanNode name='grandchild3' trace_id=00000000000000000000000000000001 />
320
+ <SpanNode name='child2' trace_id='00000000000000000000000000000001' >
321
+ <SpanNode name='grandchild3' trace_id='00000000000000000000000000000001' />
322
322
  </SpanNode>\
323
323
  """)
324
324
  assert node.repr_xml(include_start_timestamp=True) == snapshot("""\
@@ -383,6 +383,17 @@ async def test_span_tree_ancestors_methods():
383
383
  assert not leaf_node.matches({'no_ancestor_has': {'name_matches_regex': 'root'}})
384
384
  assert leaf_node.matches({'no_ancestor_has': {'name_matches_regex': 'abc'}})
385
385
 
386
+ # Test stop_recursing_when:
387
+ assert not leaf_node.matches(
388
+ {'some_ancestor_has': {'name_equals': 'level1'}, 'stop_recursing_when': {'name_equals': 'level2'}}
389
+ )
390
+ assert leaf_node.matches(
391
+ {'all_ancestors_have': {'name_matches_regex': 'level'}, 'stop_recursing_when': {'name_equals': 'level1'}}
392
+ )
393
+ assert leaf_node.matches(
394
+ {'no_ancestor_has': {'name_matches_regex': 'root'}, 'stop_recursing_when': {'name_equals': 'level1'}}
395
+ )
396
+
386
397
 
387
398
  async def test_span_tree_descendants_methods():
388
399
  """Test the descendant traversal methods in SpanNode."""
@@ -462,6 +473,17 @@ async def test_span_tree_descendants_methods():
462
473
  assert leaf_node.matches(negated_descendant_query)
463
474
  assert leaf_node.matches({'no_descendant_has': {'has_attributes': {'depth': 4}}})
464
475
 
476
+ # Test stop_recursing_when:
477
+ assert not root_node.matches(
478
+ {'some_descendant_has': {'name_equals': 'leaf'}, 'stop_recursing_when': {'name_equals': 'level2'}}
479
+ )
480
+ assert root_node.matches(
481
+ {'all_descendants_have': {'has_attribute_keys': ['depth']}, 'stop_recursing_when': {'name_equals': 'level2'}}
482
+ )
483
+ assert root_node.matches(
484
+ {'no_descendant_has': {'name_equals': 'leaf'}, 'stop_recursing_when': {'name_equals': 'level3'}}
485
+ )
486
+
465
487
 
466
488
  async def test_log_levels_and_exceptions():
467
489
  """Test recording different log levels and exceptions in spans."""