pydantic-ai 0.0.48__tar.gz → 0.0.50__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pydantic-ai might be problematic. Click here for more details.
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/PKG-INFO +3 -3
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_dataset.py +85 -42
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_otel.py +40 -18
- pydantic_ai-0.0.50/tests/evals/test_reporting.py +437 -0
- pydantic_ai-0.0.50/tests/evals/utils.py +14 -0
- pydantic_ai-0.0.50/tests/models/cassettes/test_anthropic/test_document_url_input.yaml +63 -0
- pydantic_ai-0.0.50/tests/models/cassettes/test_anthropic/test_image_url_input.yaml +62 -0
- pydantic_ai-0.0.50/tests/models/cassettes/test_gemini/test_gemini_drop_exclusive_maximum.yaml +326 -0
- pydantic_ai-0.0.50/tests/models/cassettes/test_gemini/test_gemini_exclusive_minimum_and_maximum.yaml +158 -0
- pydantic_ai-0.0.50/tests/models/cassettes/test_openai_responses/test_openai_responses_model_builtin_tools.yaml +120 -0
- pydantic_ai-0.0.50/tests/models/cassettes/test_openai_responses/test_openai_responses_reasoning_generate_summary.yaml +105 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_anthropic.py +5 -7
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_gemini.py +17 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_openai_responses.py +61 -1
- pydantic_ai-0.0.50/tests/test_cli.py +191 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_examples.py +1 -0
- pydantic_ai-0.0.48/tests/evals/test_reporting.py +0 -380
- pydantic_ai-0.0.48/tests/models/cassettes/test_anthropic/test_document_url_input.yaml +0 -340
- pydantic_ai-0.0.48/tests/models/cassettes/test_anthropic/test_image_url_input.yaml +0 -662
- pydantic_ai-0.0.48/tests/test_cli.py +0 -78
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/.gitignore +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/LICENSE +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/Makefile +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/README.md +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/pyproject.toml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/__init__.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/assets/dummy.pdf +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/assets/kiwi.png +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/assets/marcelo.mp3 +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/cassettes/test_mcp/test_agent_with_stdio_server.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/conftest.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/__init__.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_evaluator_base.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_evaluator_common.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_evaluator_context.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_evaluator_spec.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_evaluators.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_llm_as_a_judge.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_render_numbers.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_reports.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/evals/test_utils.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/example_modules/README.md +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/example_modules/bank_database.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/example_modules/fake_database.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/example_modules/weather_service.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/__init__.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/test_file_persistence.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/test_graph.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/test_mermaid.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/test_persistence.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/test_state.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/graph/test_utils.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/import_examples.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/json_body_serializer.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/mcp_server.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/__init__.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_anthropic/test_document_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_anthropic/test_image_url_input_invalid_mime_type.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_anthropic/test_multiple_parallel_tool_calls.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_anthropic/test_text_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_anthropic_model_without_tools.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_iter_stream.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_max_tokens.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_retry.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_stream.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_structured_response.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_bedrock_model_top_p.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_image_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_image_url_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_text_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_bedrock/test_text_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_cohere/test_request_simple_success_with_vcr.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_gemini/test_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_gemini/test_image_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_gemini/test_image_url_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_groq/test_image_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_groq/test_image_url_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_audio_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_image_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_max_completion_tokens[gpt-4.5-preview].yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_max_completion_tokens[gpt-4o-mini].yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_max_completion_tokens[o3-mini].yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_multiple_agent_tool_calls.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_openai_o1_mini_system_role[developer].yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_openai_o1_mini_system_role[system].yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai/test_user_id.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_audio_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_image_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_document_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_image_url_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_http_error.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_retry.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_simple_response.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_simple_response_with_tool_call.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_reasoning_effort.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_result_type.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_stream.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_system_prompt.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/cassettes/test_openai_responses/test_openai_responses_text_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/mock_async_stream.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_bedrock.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_cohere.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_fallback.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_groq.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_instrumented.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_mistral.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_model.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_model_function.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_model_names.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_model_test.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/models/test_openai.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/__init__.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/cassettes/test_azure/test_azure_provider_call.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/cassettes/test_google_vertex/test_vertexai_provider.yaml +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_anthropic.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_azure.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_bedrock.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_cohere.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_deepseek.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_google_gla.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_google_vertex.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_groq.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_mistral.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_openai.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/providers/test_provider_names.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_agent.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_deps.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_format_as_xml.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_json_body_serializer.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_live.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_logfire.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_mcp.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_messages.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_parts_manager.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_settings.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_streaming.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_tools.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_usage_limits.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/test_utils.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/typed_agent.py +0 -0
- {pydantic_ai-0.0.48 → pydantic_ai-0.0.50}/tests/typed_graph.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pydantic-ai
|
|
3
|
-
Version: 0.0.48
|
|
3
|
+
Version: 0.0.50
|
|
4
4
|
Summary: Agent Framework / shim to use Pydantic with LLMs
|
|
5
5
|
Project-URL: Homepage, https://ai.pydantic.dev
|
|
6
6
|
Project-URL: Source, https://github.com/pydantic/pydantic-ai
|
|
@@ -28,9 +28,9 @@ Classifier: Topic :: Internet
|
|
|
28
28
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
29
29
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
30
30
|
Requires-Python: >=3.9
|
|
31
|
-
Requires-Dist: pydantic-ai-slim[anthropic,bedrock,cli,cohere,evals,groq,mcp,mistral,openai,vertexai]==0.0.48
|
|
31
|
+
Requires-Dist: pydantic-ai-slim[anthropic,bedrock,cli,cohere,evals,groq,mcp,mistral,openai,vertexai]==0.0.50
|
|
32
32
|
Provides-Extra: examples
|
|
33
|
-
Requires-Dist: pydantic-ai-examples==0.0.48; extra == 'examples'
|
|
33
|
+
Requires-Dist: pydantic-ai-examples==0.0.50; extra == 'examples'
|
|
34
34
|
Provides-Extra: logfire
|
|
35
35
|
Requires-Dist: logfire>=3.11.0; extra == 'logfire'
|
|
36
36
|
Description-Content-Type: text/markdown
|
|
@@ -13,6 +13,7 @@ from inline_snapshot import snapshot
|
|
|
13
13
|
from pydantic import BaseModel
|
|
14
14
|
|
|
15
15
|
from ..conftest import try_import
|
|
16
|
+
from .utils import render_table
|
|
16
17
|
|
|
17
18
|
with try_import() as imports_successful:
|
|
18
19
|
from pydantic_evals import Case, Dataset
|
|
@@ -164,16 +165,12 @@ async def test_add_evaluator(
|
|
|
164
165
|
'cases': [
|
|
165
166
|
{
|
|
166
167
|
'evaluators': [{'Python': 'ctx.output == 2'}],
|
|
167
|
-
'expected_output': None,
|
|
168
168
|
'inputs': {'query': 'What is 1+1?'},
|
|
169
|
-
'metadata': None,
|
|
170
169
|
'name': 'My Case 1',
|
|
171
170
|
},
|
|
172
171
|
{
|
|
173
172
|
'evaluators': [{'Python': 'ctx.output == 4'}],
|
|
174
|
-
'expected_output': None,
|
|
175
173
|
'inputs': {'query': 'What is 2+2?'},
|
|
176
|
-
'metadata': None,
|
|
177
174
|
'name': 'My Case 2',
|
|
178
175
|
},
|
|
179
176
|
],
|
|
@@ -346,40 +343,42 @@ async def test_increment_eval_metric(example_dataset: Dataset[TaskInput, TaskOut
|
|
|
346
343
|
return TaskOutput(answer=f'answer to {inputs.query}')
|
|
347
344
|
|
|
348
345
|
report = await example_dataset.evaluate(my_task)
|
|
349
|
-
assert report.cases ==
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
346
|
+
assert report.cases == snapshot(
|
|
347
|
+
[
|
|
348
|
+
ReportCase(
|
|
349
|
+
name='case1',
|
|
350
|
+
inputs=TaskInput(query='What is 2+2?'),
|
|
351
|
+
metadata=TaskMetadata(difficulty='easy', category='general'),
|
|
352
|
+
expected_output=TaskOutput(answer='4', confidence=1.0),
|
|
353
|
+
output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
|
|
354
|
+
metrics={'chars': 12},
|
|
355
|
+
attributes={'is_about_france': False},
|
|
356
|
+
scores={},
|
|
357
|
+
labels={},
|
|
358
|
+
assertions={},
|
|
359
|
+
task_duration=1.0,
|
|
360
|
+
total_duration=3.0,
|
|
361
|
+
trace_id='00000000000000000000000000000001',
|
|
362
|
+
span_id='0000000000000003',
|
|
363
|
+
),
|
|
364
|
+
ReportCase(
|
|
365
|
+
name='case2',
|
|
366
|
+
inputs=TaskInput(query='What is the capital of France?'),
|
|
367
|
+
metadata=TaskMetadata(difficulty='medium', category='geography'),
|
|
368
|
+
expected_output=TaskOutput(answer='Paris', confidence=1.0),
|
|
369
|
+
output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
|
|
370
|
+
metrics={'chars': 30},
|
|
371
|
+
attributes={'is_about_france': True},
|
|
372
|
+
scores={},
|
|
373
|
+
labels={},
|
|
374
|
+
assertions={},
|
|
375
|
+
task_duration=1.0,
|
|
376
|
+
total_duration=3.0,
|
|
377
|
+
trace_id='00000000000000000000000000000001',
|
|
378
|
+
span_id='0000000000000007',
|
|
379
|
+
),
|
|
380
|
+
]
|
|
381
|
+
)
|
|
383
382
|
|
|
384
383
|
|
|
385
384
|
async def test_repeated_name_outputs(example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata]):
|
|
@@ -397,7 +396,7 @@ async def test_repeated_name_outputs(example_dataset: Dataset[TaskInput, TaskOut
|
|
|
397
396
|
[
|
|
398
397
|
ReportCase(
|
|
399
398
|
name='case1',
|
|
400
|
-
inputs=
|
|
399
|
+
inputs=TaskInput(query='What is 2+2?'),
|
|
401
400
|
metadata=TaskMetadata(difficulty='easy', category='general'),
|
|
402
401
|
expected_output=TaskOutput(answer='4', confidence=1.0),
|
|
403
402
|
output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
|
|
@@ -423,7 +422,7 @@ async def test_repeated_name_outputs(example_dataset: Dataset[TaskInput, TaskOut
|
|
|
423
422
|
),
|
|
424
423
|
ReportCase(
|
|
425
424
|
name='case2',
|
|
426
|
-
inputs=
|
|
425
|
+
inputs=TaskInput(query='What is the capital of France?'),
|
|
427
426
|
metadata=TaskMetadata(difficulty='medium', category='geography'),
|
|
428
427
|
expected_output=TaskOutput(answer='Paris', confidence=1.0),
|
|
429
428
|
output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
|
|
@@ -471,7 +470,7 @@ async def test_genai_attribute_collection(example_dataset: Dataset[TaskInput, Ta
|
|
|
471
470
|
[
|
|
472
471
|
ReportCase(
|
|
473
472
|
name='case1',
|
|
474
|
-
inputs=
|
|
473
|
+
inputs=TaskInput(query='What is 2+2?'),
|
|
475
474
|
metadata=TaskMetadata(difficulty='easy', category='general'),
|
|
476
475
|
expected_output=TaskOutput(answer='4', confidence=1.0),
|
|
477
476
|
output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
|
|
@@ -487,7 +486,7 @@ async def test_genai_attribute_collection(example_dataset: Dataset[TaskInput, Ta
|
|
|
487
486
|
),
|
|
488
487
|
ReportCase(
|
|
489
488
|
name='case2',
|
|
490
|
-
inputs=
|
|
489
|
+
inputs=TaskInput(query='What is the capital of France?'),
|
|
491
490
|
metadata=TaskMetadata(difficulty='medium', category='geography'),
|
|
492
491
|
expected_output=TaskOutput(answer='Paris', confidence=1.0),
|
|
493
492
|
output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
|
|
@@ -992,3 +991,47 @@ def test_import_generate_dataset():
|
|
|
992
991
|
from pydantic_evals.generation import generate_dataset
|
|
993
992
|
|
|
994
993
|
assert generate_dataset
|
|
994
|
+
|
|
995
|
+
|
|
996
|
+
def test_evaluate_non_serializable_inputs():
|
|
997
|
+
@dataclass
|
|
998
|
+
class MyInputs:
|
|
999
|
+
result_type: type[str] | type[int]
|
|
1000
|
+
|
|
1001
|
+
my_dataset = Dataset[MyInputs, Any, Any](
|
|
1002
|
+
cases=[
|
|
1003
|
+
Case(
|
|
1004
|
+
name='str',
|
|
1005
|
+
inputs=MyInputs(result_type=str),
|
|
1006
|
+
expected_output='abc',
|
|
1007
|
+
),
|
|
1008
|
+
Case(
|
|
1009
|
+
name='int',
|
|
1010
|
+
inputs=MyInputs(result_type=int),
|
|
1011
|
+
expected_output=123,
|
|
1012
|
+
),
|
|
1013
|
+
],
|
|
1014
|
+
)
|
|
1015
|
+
|
|
1016
|
+
async def my_task(my_inputs: MyInputs) -> int | str:
|
|
1017
|
+
if issubclass(my_inputs.result_type, str):
|
|
1018
|
+
return my_inputs.result_type('abc')
|
|
1019
|
+
else:
|
|
1020
|
+
return my_inputs.result_type(123)
|
|
1021
|
+
|
|
1022
|
+
report = my_dataset.evaluate_sync(task=my_task)
|
|
1023
|
+
assert [c.inputs for c in report.cases] == snapshot([MyInputs(result_type=str), MyInputs(result_type=int)])
|
|
1024
|
+
|
|
1025
|
+
table = report.console_table(include_input=True)
|
|
1026
|
+
assert render_table(table) == snapshot("""\
|
|
1027
|
+
Evaluation Summary: my_task
|
|
1028
|
+
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
|
|
1029
|
+
┃ Case ID ┃ Inputs ┃ Duration ┃
|
|
1030
|
+
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
|
|
1031
|
+
│ str │ test_evaluate_non_serializable_inputs.<locals>.MyInputs(result_type=<class 'str'>) │ 1.0s │
|
|
1032
|
+
├──────────┼────────────────────────────────────────────────────────────────────────────────────┼──────────┤
|
|
1033
|
+
│ int │ test_evaluate_non_serializable_inputs.<locals>.MyInputs(result_type=<class 'int'>) │ 1.0s │
|
|
1034
|
+
├──────────┼────────────────────────────────────────────────────────────────────────────────────┼──────────┤
|
|
1035
|
+
│ Averages │ │ 1.0s │
|
|
1036
|
+
└──────────┴────────────────────────────────────────────────────────────────────────────────────┴──────────┘
|
|
1037
|
+
""")
|
|
@@ -245,26 +245,26 @@ async def test_span_tree_repr(span_tree: SpanTree):
|
|
|
245
245
|
""")
|
|
246
246
|
assert span_tree.repr_xml(include_span_id=True) == snapshot("""\
|
|
247
247
|
<SpanTree>
|
|
248
|
-
<SpanNode name='root' span_id=0000000000000001 >
|
|
249
|
-
<SpanNode name='child1' span_id=0000000000000003 >
|
|
250
|
-
<SpanNode name='grandchild1' span_id=0000000000000005 />
|
|
251
|
-
<SpanNode name='grandchild2' span_id=0000000000000007 />
|
|
248
|
+
<SpanNode name='root' span_id='0000000000000001' >
|
|
249
|
+
<SpanNode name='child1' span_id='0000000000000003' >
|
|
250
|
+
<SpanNode name='grandchild1' span_id='0000000000000005' />
|
|
251
|
+
<SpanNode name='grandchild2' span_id='0000000000000007' />
|
|
252
252
|
</SpanNode>
|
|
253
|
-
<SpanNode name='child2' span_id=0000000000000009 >
|
|
254
|
-
<SpanNode name='grandchild3' span_id=000000000000000b />
|
|
253
|
+
<SpanNode name='child2' span_id='0000000000000009' >
|
|
254
|
+
<SpanNode name='grandchild3' span_id='000000000000000b' />
|
|
255
255
|
</SpanNode>
|
|
256
256
|
</SpanNode>
|
|
257
257
|
</SpanTree>\
|
|
258
258
|
""")
|
|
259
259
|
assert span_tree.repr_xml(include_trace_id=True) == snapshot("""\
|
|
260
260
|
<SpanTree>
|
|
261
|
-
<SpanNode name='root' trace_id=00000000000000000000000000000001 >
|
|
262
|
-
<SpanNode name='child1' trace_id=00000000000000000000000000000001 >
|
|
263
|
-
<SpanNode name='grandchild1' trace_id=00000000000000000000000000000001 />
|
|
264
|
-
<SpanNode name='grandchild2' trace_id=00000000000000000000000000000001 />
|
|
261
|
+
<SpanNode name='root' trace_id='00000000000000000000000000000001' >
|
|
262
|
+
<SpanNode name='child1' trace_id='00000000000000000000000000000001' >
|
|
263
|
+
<SpanNode name='grandchild1' trace_id='00000000000000000000000000000001' />
|
|
264
|
+
<SpanNode name='grandchild2' trace_id='00000000000000000000000000000001' />
|
|
265
265
|
</SpanNode>
|
|
266
|
-
<SpanNode name='child2' trace_id=00000000000000000000000000000001 >
|
|
267
|
-
<SpanNode name='grandchild3' trace_id=00000000000000000000000000000001 />
|
|
266
|
+
<SpanNode name='child2' trace_id='00000000000000000000000000000001' >
|
|
267
|
+
<SpanNode name='grandchild3' trace_id='00000000000000000000000000000001' />
|
|
268
268
|
</SpanNode>
|
|
269
269
|
</SpanNode>
|
|
270
270
|
</SpanTree>\
|
|
@@ -302,9 +302,9 @@ async def test_span_node_repr(span_tree: SpanTree):
|
|
|
302
302
|
assert node is not None
|
|
303
303
|
|
|
304
304
|
leaf_node = span_tree.first({'name_equals': 'grandchild1'})
|
|
305
|
-
assert str(leaf_node) == snapshot("<SpanNode name='grandchild1' span_id=0000000000000005 />")
|
|
305
|
+
assert str(leaf_node) == snapshot("<SpanNode name='grandchild1' span_id='0000000000000005' />")
|
|
306
306
|
|
|
307
|
-
assert str(node) == snapshot("<SpanNode name='child2' span_id=0000000000000009>...</SpanNode>")
|
|
307
|
+
assert str(node) == snapshot("<SpanNode name='child2' span_id='0000000000000009'>...</SpanNode>")
|
|
308
308
|
assert repr(node) == snapshot("""\
|
|
309
309
|
<SpanNode name='child2' >
|
|
310
310
|
<SpanNode name='grandchild3' />
|
|
@@ -312,13 +312,13 @@ async def test_span_node_repr(span_tree: SpanTree):
|
|
|
312
312
|
""")
|
|
313
313
|
assert node.repr_xml(include_children=False) == snapshot("<SpanNode name='child2' children=... />")
|
|
314
314
|
assert node.repr_xml(include_span_id=True) == snapshot("""\
|
|
315
|
-
<SpanNode name='child2' span_id=0000000000000009 >
|
|
316
|
-
<SpanNode name='grandchild3' span_id=000000000000000b />
|
|
315
|
+
<SpanNode name='child2' span_id='0000000000000009' >
|
|
316
|
+
<SpanNode name='grandchild3' span_id='000000000000000b' />
|
|
317
317
|
</SpanNode>\
|
|
318
318
|
""")
|
|
319
319
|
assert node.repr_xml(include_trace_id=True) == snapshot("""\
|
|
320
|
-
<SpanNode name='child2' trace_id=00000000000000000000000000000001 >
|
|
321
|
-
<SpanNode name='grandchild3' trace_id=00000000000000000000000000000001 />
|
|
320
|
+
<SpanNode name='child2' trace_id='00000000000000000000000000000001' >
|
|
321
|
+
<SpanNode name='grandchild3' trace_id='00000000000000000000000000000001' />
|
|
322
322
|
</SpanNode>\
|
|
323
323
|
""")
|
|
324
324
|
assert node.repr_xml(include_start_timestamp=True) == snapshot("""\
|
|
@@ -383,6 +383,17 @@ async def test_span_tree_ancestors_methods():
|
|
|
383
383
|
assert not leaf_node.matches({'no_ancestor_has': {'name_matches_regex': 'root'}})
|
|
384
384
|
assert leaf_node.matches({'no_ancestor_has': {'name_matches_regex': 'abc'}})
|
|
385
385
|
|
|
386
|
+
# Test stop_recursing_when:
|
|
387
|
+
assert not leaf_node.matches(
|
|
388
|
+
{'some_ancestor_has': {'name_equals': 'level1'}, 'stop_recursing_when': {'name_equals': 'level2'}}
|
|
389
|
+
)
|
|
390
|
+
assert leaf_node.matches(
|
|
391
|
+
{'all_ancestors_have': {'name_matches_regex': 'level'}, 'stop_recursing_when': {'name_equals': 'level1'}}
|
|
392
|
+
)
|
|
393
|
+
assert leaf_node.matches(
|
|
394
|
+
{'no_ancestor_has': {'name_matches_regex': 'root'}, 'stop_recursing_when': {'name_equals': 'level1'}}
|
|
395
|
+
)
|
|
396
|
+
|
|
386
397
|
|
|
387
398
|
async def test_span_tree_descendants_methods():
|
|
388
399
|
"""Test the descendant traversal methods in SpanNode."""
|
|
@@ -462,6 +473,17 @@ async def test_span_tree_descendants_methods():
|
|
|
462
473
|
assert leaf_node.matches(negated_descendant_query)
|
|
463
474
|
assert leaf_node.matches({'no_descendant_has': {'has_attributes': {'depth': 4}}})
|
|
464
475
|
|
|
476
|
+
# Test stop_recursing_when:
|
|
477
|
+
assert not root_node.matches(
|
|
478
|
+
{'some_descendant_has': {'name_equals': 'leaf'}, 'stop_recursing_when': {'name_equals': 'level2'}}
|
|
479
|
+
)
|
|
480
|
+
assert root_node.matches(
|
|
481
|
+
{'all_descendants_have': {'has_attribute_keys': ['depth']}, 'stop_recursing_when': {'name_equals': 'level2'}}
|
|
482
|
+
)
|
|
483
|
+
assert root_node.matches(
|
|
484
|
+
{'no_descendant_has': {'name_equals': 'leaf'}, 'stop_recursing_when': {'name_equals': 'level3'}}
|
|
485
|
+
)
|
|
486
|
+
|
|
465
487
|
|
|
466
488
|
async def test_log_levels_and_exceptions():
|
|
467
489
|
"""Test recording different log levels and exceptions in spans."""
|