google-adk 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- google/adk/__init__.py +20 -0
- google/adk/agents/__init__.py +32 -0
- google/adk/agents/active_streaming_tool.py +38 -0
- google/adk/agents/base_agent.py +345 -0
- google/adk/agents/callback_context.py +112 -0
- google/adk/agents/invocation_context.py +181 -0
- google/adk/agents/langgraph_agent.py +140 -0
- google/adk/agents/live_request_queue.py +64 -0
- google/adk/agents/llm_agent.py +376 -0
- google/adk/agents/loop_agent.py +62 -0
- google/adk/agents/parallel_agent.py +96 -0
- google/adk/agents/readonly_context.py +46 -0
- google/adk/agents/remote_agent.py +50 -0
- google/adk/agents/run_config.py +87 -0
- google/adk/agents/sequential_agent.py +45 -0
- google/adk/agents/transcription_entry.py +34 -0
- google/adk/artifacts/__init__.py +23 -0
- google/adk/artifacts/base_artifact_service.py +128 -0
- google/adk/artifacts/gcs_artifact_service.py +195 -0
- google/adk/artifacts/in_memory_artifact_service.py +133 -0
- google/adk/auth/__init__.py +22 -0
- google/adk/auth/auth_credential.py +220 -0
- google/adk/auth/auth_handler.py +268 -0
- google/adk/auth/auth_preprocessor.py +116 -0
- google/adk/auth/auth_schemes.py +67 -0
- google/adk/auth/auth_tool.py +55 -0
- google/adk/cli/__init__.py +15 -0
- google/adk/cli/__main__.py +18 -0
- google/adk/cli/agent_graph.py +122 -0
- google/adk/cli/browser/adk_favicon.svg +17 -0
- google/adk/cli/browser/assets/audio-processor.js +51 -0
- google/adk/cli/browser/assets/config/runtime-config.json +3 -0
- google/adk/cli/browser/index.html +33 -0
- google/adk/cli/browser/main-XUU6OGCC.js +75 -0
- google/adk/cli/browser/polyfills-FFHMD2TL.js +18 -0
- google/adk/cli/browser/styles-4VDSPQ37.css +17 -0
- google/adk/cli/cli.py +181 -0
- google/adk/cli/cli_deploy.py +181 -0
- google/adk/cli/cli_eval.py +282 -0
- google/adk/cli/cli_tools_click.py +479 -0
- google/adk/cli/fast_api.py +774 -0
- google/adk/cli/media_streamer/__init__.py +19 -0
- google/adk/cli/media_streamer/index.html +228 -0
- google/adk/cli/utils/__init__.py +49 -0
- google/adk/cli/utils/envs.py +57 -0
- google/adk/cli/utils/evals.py +93 -0
- google/adk/cli/utils/logs.py +72 -0
- google/adk/code_executors/__init__.py +49 -0
- google/adk/code_executors/base_code_executor.py +97 -0
- google/adk/code_executors/code_execution_utils.py +256 -0
- google/adk/code_executors/code_executor_context.py +202 -0
- google/adk/code_executors/container_code_executor.py +196 -0
- google/adk/code_executors/unsafe_local_code_executor.py +71 -0
- google/adk/code_executors/vertex_ai_code_executor.py +234 -0
- google/adk/evaluation/__init__.py +31 -0
- google/adk/evaluation/agent_evaluator.py +329 -0
- google/adk/evaluation/evaluation_constants.py +24 -0
- google/adk/evaluation/evaluation_generator.py +270 -0
- google/adk/evaluation/response_evaluator.py +135 -0
- google/adk/evaluation/trajectory_evaluator.py +184 -0
- google/adk/events/__init__.py +21 -0
- google/adk/events/event.py +130 -0
- google/adk/events/event_actions.py +55 -0
- google/adk/examples/__init__.py +28 -0
- google/adk/examples/base_example_provider.py +35 -0
- google/adk/examples/example.py +27 -0
- google/adk/examples/example_util.py +123 -0
- google/adk/examples/vertex_ai_example_store.py +104 -0
- google/adk/flows/__init__.py +14 -0
- google/adk/flows/llm_flows/__init__.py +20 -0
- google/adk/flows/llm_flows/_base_llm_processor.py +52 -0
- google/adk/flows/llm_flows/_code_execution.py +458 -0
- google/adk/flows/llm_flows/_nl_planning.py +129 -0
- google/adk/flows/llm_flows/agent_transfer.py +132 -0
- google/adk/flows/llm_flows/audio_transcriber.py +109 -0
- google/adk/flows/llm_flows/auto_flow.py +49 -0
- google/adk/flows/llm_flows/base_llm_flow.py +559 -0
- google/adk/flows/llm_flows/basic.py +72 -0
- google/adk/flows/llm_flows/contents.py +370 -0
- google/adk/flows/llm_flows/functions.py +486 -0
- google/adk/flows/llm_flows/identity.py +47 -0
- google/adk/flows/llm_flows/instructions.py +137 -0
- google/adk/flows/llm_flows/single_flow.py +57 -0
- google/adk/memory/__init__.py +35 -0
- google/adk/memory/base_memory_service.py +74 -0
- google/adk/memory/in_memory_memory_service.py +62 -0
- google/adk/memory/vertex_ai_rag_memory_service.py +177 -0
- google/adk/models/__init__.py +31 -0
- google/adk/models/anthropic_llm.py +243 -0
- google/adk/models/base_llm.py +87 -0
- google/adk/models/base_llm_connection.py +76 -0
- google/adk/models/gemini_llm_connection.py +200 -0
- google/adk/models/google_llm.py +331 -0
- google/adk/models/lite_llm.py +673 -0
- google/adk/models/llm_request.py +98 -0
- google/adk/models/llm_response.py +111 -0
- google/adk/models/registry.py +102 -0
- google/adk/planners/__init__.py +23 -0
- google/adk/planners/base_planner.py +66 -0
- google/adk/planners/built_in_planner.py +75 -0
- google/adk/planners/plan_re_act_planner.py +208 -0
- google/adk/runners.py +456 -0
- google/adk/sessions/__init__.py +41 -0
- google/adk/sessions/base_session_service.py +133 -0
- google/adk/sessions/database_session_service.py +522 -0
- google/adk/sessions/in_memory_session_service.py +206 -0
- google/adk/sessions/session.py +54 -0
- google/adk/sessions/state.py +71 -0
- google/adk/sessions/vertex_ai_session_service.py +356 -0
- google/adk/telemetry.py +189 -0
- google/adk/tests/__init__.py +14 -0
- google/adk/tests/integration/.env.example +10 -0
- google/adk/tests/integration/__init__.py +18 -0
- google/adk/tests/integration/conftest.py +119 -0
- google/adk/tests/integration/fixture/__init__.py +14 -0
- google/adk/tests/integration/fixture/agent_with_config/__init__.py +15 -0
- google/adk/tests/integration/fixture/agent_with_config/agent.py +88 -0
- google/adk/tests/integration/fixture/callback_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/callback_agent/agent.py +105 -0
- google/adk/tests/integration/fixture/context_update_test/OWNERS +1 -0
- google/adk/tests/integration/fixture/context_update_test/__init__.py +15 -0
- google/adk/tests/integration/fixture/context_update_test/agent.py +43 -0
- google/adk/tests/integration/fixture/context_update_test/successful_test.session.json +582 -0
- google/adk/tests/integration/fixture/context_variable_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/context_variable_agent/agent.py +115 -0
- google/adk/tests/integration/fixture/customer_support_ma/__init__.py +15 -0
- google/adk/tests/integration/fixture/customer_support_ma/agent.py +172 -0
- google/adk/tests/integration/fixture/ecommerce_customer_service_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/ecommerce_customer_service_agent/agent.py +338 -0
- google/adk/tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json +69 -0
- google/adk/tests/integration/fixture/ecommerce_customer_service_agent/test_config.json +6 -0
- google/adk/tests/integration/fixture/flow_complex_spark/__init__.py +15 -0
- google/adk/tests/integration/fixture/flow_complex_spark/agent.py +182 -0
- google/adk/tests/integration/fixture/flow_complex_spark/sample.debug.log +243 -0
- google/adk/tests/integration/fixture/flow_complex_spark/sample.session.json +190 -0
- google/adk/tests/integration/fixture/hello_world_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/hello_world_agent/agent.py +95 -0
- google/adk/tests/integration/fixture/hello_world_agent/roll_die.test.json +24 -0
- google/adk/tests/integration/fixture/hello_world_agent/test_config.json +6 -0
- google/adk/tests/integration/fixture/home_automation_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/home_automation_agent/agent.py +304 -0
- google/adk/tests/integration/fixture/home_automation_agent/simple_test.test.json +5 -0
- google/adk/tests/integration/fixture/home_automation_agent/simple_test2.test.json +5 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_config.json +5 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json +18 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json +17 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/test_config.json +6 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json +18 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json +17 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json +5 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/test_config.json +5 -0
- google/adk/tests/integration/fixture/tool_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/tool_agent/agent.py +218 -0
- google/adk/tests/integration/fixture/tool_agent/files/Agent_test_plan.pdf +0 -0
- google/adk/tests/integration/fixture/trip_planner_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/trip_planner_agent/agent.py +110 -0
- google/adk/tests/integration/fixture/trip_planner_agent/initial.session.json +13 -0
- google/adk/tests/integration/fixture/trip_planner_agent/test_config.json +5 -0
- google/adk/tests/integration/fixture/trip_planner_agent/test_files/initial.session.json +13 -0
- google/adk/tests/integration/fixture/trip_planner_agent/test_files/test_config.json +5 -0
- google/adk/tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json +7 -0
- google/adk/tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json +19 -0
- google/adk/tests/integration/models/__init__.py +14 -0
- google/adk/tests/integration/models/test_google_llm.py +65 -0
- google/adk/tests/integration/test_callback.py +70 -0
- google/adk/tests/integration/test_context_variable.py +67 -0
- google/adk/tests/integration/test_evalute_agent_in_fixture.py +76 -0
- google/adk/tests/integration/test_multi_agent.py +28 -0
- google/adk/tests/integration/test_multi_turn.py +42 -0
- google/adk/tests/integration/test_single_agent.py +23 -0
- google/adk/tests/integration/test_sub_agent.py +26 -0
- google/adk/tests/integration/test_system_instruction.py +177 -0
- google/adk/tests/integration/test_tools.py +287 -0
- google/adk/tests/integration/test_with_test_file.py +34 -0
- google/adk/tests/integration/tools/__init__.py +14 -0
- google/adk/tests/integration/utils/__init__.py +16 -0
- google/adk/tests/integration/utils/asserts.py +75 -0
- google/adk/tests/integration/utils/test_runner.py +97 -0
- google/adk/tests/unittests/__init__.py +14 -0
- google/adk/tests/unittests/agents/__init__.py +14 -0
- google/adk/tests/unittests/agents/test_base_agent.py +407 -0
- google/adk/tests/unittests/agents/test_langgraph_agent.py +191 -0
- google/adk/tests/unittests/agents/test_llm_agent_callbacks.py +138 -0
- google/adk/tests/unittests/agents/test_llm_agent_fields.py +231 -0
- google/adk/tests/unittests/agents/test_loop_agent.py +136 -0
- google/adk/tests/unittests/agents/test_parallel_agent.py +92 -0
- google/adk/tests/unittests/agents/test_sequential_agent.py +114 -0
- google/adk/tests/unittests/artifacts/__init__.py +14 -0
- google/adk/tests/unittests/artifacts/test_artifact_service.py +276 -0
- google/adk/tests/unittests/auth/test_auth_handler.py +575 -0
- google/adk/tests/unittests/conftest.py +73 -0
- google/adk/tests/unittests/fast_api/__init__.py +14 -0
- google/adk/tests/unittests/fast_api/test_fast_api.py +269 -0
- google/adk/tests/unittests/flows/__init__.py +14 -0
- google/adk/tests/unittests/flows/llm_flows/__init__.py +14 -0
- google/adk/tests/unittests/flows/llm_flows/_test_examples.py +142 -0
- google/adk/tests/unittests/flows/llm_flows/test_agent_transfer.py +311 -0
- google/adk/tests/unittests/flows/llm_flows/test_functions_long_running.py +244 -0
- google/adk/tests/unittests/flows/llm_flows/test_functions_request_euc.py +346 -0
- google/adk/tests/unittests/flows/llm_flows/test_functions_sequential.py +93 -0
- google/adk/tests/unittests/flows/llm_flows/test_functions_simple.py +258 -0
- google/adk/tests/unittests/flows/llm_flows/test_identity.py +66 -0
- google/adk/tests/unittests/flows/llm_flows/test_instructions.py +164 -0
- google/adk/tests/unittests/flows/llm_flows/test_model_callbacks.py +142 -0
- google/adk/tests/unittests/flows/llm_flows/test_other_configs.py +46 -0
- google/adk/tests/unittests/flows/llm_flows/test_tool_callbacks.py +269 -0
- google/adk/tests/unittests/models/__init__.py +14 -0
- google/adk/tests/unittests/models/test_google_llm.py +224 -0
- google/adk/tests/unittests/models/test_litellm.py +804 -0
- google/adk/tests/unittests/models/test_models.py +60 -0
- google/adk/tests/unittests/sessions/__init__.py +14 -0
- google/adk/tests/unittests/sessions/test_session_service.py +227 -0
- google/adk/tests/unittests/sessions/test_vertex_ai_session_service.py +246 -0
- google/adk/tests/unittests/streaming/__init__.py +14 -0
- google/adk/tests/unittests/streaming/test_streaming.py +50 -0
- google/adk/tests/unittests/tools/__init__.py +14 -0
- google/adk/tests/unittests/tools/apihub_tool/clients/test_apihub_client.py +499 -0
- google/adk/tests/unittests/tools/apihub_tool/test_apihub_toolset.py +204 -0
- google/adk/tests/unittests/tools/application_integration_tool/clients/test_connections_client.py +600 -0
- google/adk/tests/unittests/tools/application_integration_tool/clients/test_integration_client.py +630 -0
- google/adk/tests/unittests/tools/application_integration_tool/test_application_integration_toolset.py +345 -0
- google/adk/tests/unittests/tools/google_api_tool/__init__.py +13 -0
- google/adk/tests/unittests/tools/google_api_tool/test_googleapi_to_openapi_converter.py +657 -0
- google/adk/tests/unittests/tools/openapi_tool/auth/credential_exchangers/test_auto_auth_credential_exchanger.py +145 -0
- google/adk/tests/unittests/tools/openapi_tool/auth/credential_exchangers/test_base_auth_credential_exchanger.py +68 -0
- google/adk/tests/unittests/tools/openapi_tool/auth/credential_exchangers/test_oauth2_exchanger.py +153 -0
- google/adk/tests/unittests/tools/openapi_tool/auth/credential_exchangers/test_service_account_exchanger.py +196 -0
- google/adk/tests/unittests/tools/openapi_tool/auth/test_auth_helper.py +573 -0
- google/adk/tests/unittests/tools/openapi_tool/common/test_common.py +436 -0
- google/adk/tests/unittests/tools/openapi_tool/openapi_spec_parser/test.yaml +1367 -0
- google/adk/tests/unittests/tools/openapi_tool/openapi_spec_parser/test_openapi_spec_parser.py +628 -0
- google/adk/tests/unittests/tools/openapi_tool/openapi_spec_parser/test_openapi_toolset.py +139 -0
- google/adk/tests/unittests/tools/openapi_tool/openapi_spec_parser/test_operation_parser.py +406 -0
- google/adk/tests/unittests/tools/openapi_tool/openapi_spec_parser/test_rest_api_tool.py +966 -0
- google/adk/tests/unittests/tools/openapi_tool/openapi_spec_parser/test_tool_auth_handler.py +201 -0
- google/adk/tests/unittests/tools/retrieval/__init__.py +14 -0
- google/adk/tests/unittests/tools/retrieval/test_vertex_ai_rag_retrieval.py +147 -0
- google/adk/tests/unittests/tools/test_agent_tool.py +167 -0
- google/adk/tests/unittests/tools/test_base_tool.py +141 -0
- google/adk/tests/unittests/tools/test_build_function_declaration.py +277 -0
- google/adk/tests/unittests/utils.py +304 -0
- google/adk/tools/__init__.py +51 -0
- google/adk/tools/_automatic_function_calling_util.py +346 -0
- google/adk/tools/agent_tool.py +176 -0
- google/adk/tools/apihub_tool/__init__.py +19 -0
- google/adk/tools/apihub_tool/apihub_toolset.py +209 -0
- google/adk/tools/apihub_tool/clients/__init__.py +13 -0
- google/adk/tools/apihub_tool/clients/apihub_client.py +332 -0
- google/adk/tools/apihub_tool/clients/secret_client.py +115 -0
- google/adk/tools/application_integration_tool/__init__.py +19 -0
- google/adk/tools/application_integration_tool/application_integration_toolset.py +230 -0
- google/adk/tools/application_integration_tool/clients/connections_client.py +903 -0
- google/adk/tools/application_integration_tool/clients/integration_client.py +253 -0
- google/adk/tools/base_tool.py +144 -0
- google/adk/tools/built_in_code_execution_tool.py +59 -0
- google/adk/tools/crewai_tool.py +72 -0
- google/adk/tools/example_tool.py +62 -0
- google/adk/tools/exit_loop_tool.py +23 -0
- google/adk/tools/function_parameter_parse_util.py +307 -0
- google/adk/tools/function_tool.py +87 -0
- google/adk/tools/get_user_choice_tool.py +28 -0
- google/adk/tools/google_api_tool/__init__.py +14 -0
- google/adk/tools/google_api_tool/google_api_tool.py +59 -0
- google/adk/tools/google_api_tool/google_api_tool_set.py +107 -0
- google/adk/tools/google_api_tool/google_api_tool_sets.py +55 -0
- google/adk/tools/google_api_tool/googleapi_to_openapi_converter.py +521 -0
- google/adk/tools/google_search_tool.py +68 -0
- google/adk/tools/langchain_tool.py +86 -0
- google/adk/tools/load_artifacts_tool.py +113 -0
- google/adk/tools/load_memory_tool.py +58 -0
- google/adk/tools/load_web_page.py +41 -0
- google/adk/tools/long_running_tool.py +39 -0
- google/adk/tools/mcp_tool/__init__.py +42 -0
- google/adk/tools/mcp_tool/conversion_utils.py +161 -0
- google/adk/tools/mcp_tool/mcp_tool.py +113 -0
- google/adk/tools/mcp_tool/mcp_toolset.py +272 -0
- google/adk/tools/openapi_tool/__init__.py +21 -0
- google/adk/tools/openapi_tool/auth/__init__.py +19 -0
- google/adk/tools/openapi_tool/auth/auth_helpers.py +498 -0
- google/adk/tools/openapi_tool/auth/credential_exchangers/__init__.py +25 -0
- google/adk/tools/openapi_tool/auth/credential_exchangers/auto_auth_credential_exchanger.py +105 -0
- google/adk/tools/openapi_tool/auth/credential_exchangers/base_credential_exchanger.py +55 -0
- google/adk/tools/openapi_tool/auth/credential_exchangers/oauth2_exchanger.py +117 -0
- google/adk/tools/openapi_tool/auth/credential_exchangers/service_account_exchanger.py +97 -0
- google/adk/tools/openapi_tool/common/__init__.py +19 -0
- google/adk/tools/openapi_tool/common/common.py +300 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/__init__.py +32 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/openapi_spec_parser.py +231 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/openapi_toolset.py +144 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/operation_parser.py +260 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/rest_api_tool.py +496 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/tool_auth_handler.py +268 -0
- google/adk/tools/preload_memory_tool.py +72 -0
- google/adk/tools/retrieval/__init__.py +36 -0
- google/adk/tools/retrieval/base_retrieval_tool.py +37 -0
- google/adk/tools/retrieval/files_retrieval.py +33 -0
- google/adk/tools/retrieval/llama_index_retrieval.py +41 -0
- google/adk/tools/retrieval/vertex_ai_rag_retrieval.py +107 -0
- google/adk/tools/tool_context.py +90 -0
- google/adk/tools/toolbox_tool.py +46 -0
- google/adk/tools/transfer_to_agent_tool.py +21 -0
- google/adk/tools/vertex_ai_search_tool.py +96 -0
- google/adk/version.py +16 -0
- google_adk-0.0.1.dist-info/LICENSE.txt → google_adk-0.0.2.dist-info/LICENSE +32 -0
- google_adk-0.0.2.dist-info/METADATA +73 -0
- google_adk-0.0.2.dist-info/RECORD +308 -0
- {google_adk-0.0.1.dist-info → google_adk-0.0.2.dist-info}/WHEEL +1 -2
- google_adk-0.0.2.dist-info/entry_points.txt +3 -0
- agent_kit/__init__.py +0 -0
- google_adk-0.0.1.dist-info/METADATA +0 -15
- google_adk-0.0.1.dist-info/RECORD +0 -6
- google_adk-0.0.1.dist-info/top_level.txt +0 -1
@@ -0,0 +1,71 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from contextlib import redirect_stdout
|
16
|
+
import io
|
17
|
+
|
18
|
+
from pydantic import Field
|
19
|
+
from typing_extensions import override
|
20
|
+
|
21
|
+
from ..agents.invocation_context import InvocationContext
|
22
|
+
from .base_code_executor import BaseCodeExecutor
|
23
|
+
from .code_execution_utils import CodeExecutionInput
|
24
|
+
from .code_execution_utils import CodeExecutionResult
|
25
|
+
|
26
|
+
|
27
|
+
class UnsafeLocalCodeExecutor(BaseCodeExecutor):
  """A code executor that unsafely executes code in the current local context.

  The submitted code is run with `exec` inside the current Python process and
  is not sandboxed in any way, so it must only be used with trusted code.
  """

  # Overrides the BaseCodeExecutor attribute: this executor cannot be stateful.
  stateful: bool = Field(default=False, frozen=True, exclude=True)

  # Overrides the BaseCodeExecutor attribute: this executor cannot
  # optimize_data_file.
  optimize_data_file: bool = Field(default=False, frozen=True, exclude=True)

  def __init__(self, **data):
    """Initializes the UnsafeLocalCodeExecutor.

    Args:
      **data: Keyword arguments forwarded to the base class.

    Raises:
      ValueError: If `stateful` or `optimize_data_file` is passed with a
        truthy value.
    """
    if data.get('stateful'):
      raise ValueError('Cannot set `stateful=True` in UnsafeLocalCodeExecutor.')
    if data.get('optimize_data_file'):
      raise ValueError(
          'Cannot set `optimize_data_file=True` in UnsafeLocalCodeExecutor.'
      )
    super().__init__(**data)

  @override
  def execute_code(
      self,
      invocation_context: InvocationContext,
      code_execution_input: CodeExecutionInput,
  ) -> CodeExecutionResult:
    """Runs the given code with `exec` and captures what it prints.

    Args:
      invocation_context: The invocation context; not used by this executor.
      code_execution_input: Carries the source to run in its `code` attribute.

    Returns:
      A CodeExecutionResult whose `stdout` holds everything the code printed
      (including output produced before a failure), whose `stderr` holds the
      message of the raised exception (or the exception type name when the
      message is empty) and is empty on success, and whose `output_files` is
      always empty.
    """
    stdout = io.StringIO()
    error = ''
    # One dict is used as both globals and locals. With two separate dicts
    # `exec` runs the code as if it were a class body, so functions defined by
    # the code cannot see its own top-level imports and variables.
    namespace = {}
    try:
      with redirect_stdout(stdout):
        # SECURITY: arbitrary code runs in-process with full privileges; only
        # trusted code may be passed to this executor.
        exec(code_execution_input.code, namespace)
    except Exception as e:
      # Fall back to the type name so that a message-less exception is not
      # reported as an empty (i.e. successful-looking) stderr.
      error = str(e) or type(e).__name__

    # Collect the final result. stdout is read after the try block so that
    # output printed before an exception is preserved.
    return CodeExecutionResult(
        stdout=stdout.getvalue(),
        stderr=error,
        output_files=[],
    )
|
@@ -0,0 +1,234 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
import datetime
|
16
|
+
import mimetypes
|
17
|
+
import os
|
18
|
+
from typing import Any, Optional
|
19
|
+
|
20
|
+
from typing_extensions import override
|
21
|
+
from vertexai.preview.extensions import Extension
|
22
|
+
|
23
|
+
from ..agents.invocation_context import InvocationContext
|
24
|
+
from .base_code_executor import BaseCodeExecutor
|
25
|
+
from .code_execution_utils import CodeExecutionInput
|
26
|
+
from .code_execution_utils import CodeExecutionResult
|
27
|
+
from .code_execution_utils import File
|
28
|
+
|
29
|
+
# File extensions of output files that are returned as images.
_SUPPORTED_IMAGE_TYPES = ['png', 'jpg', 'jpeg']
# File extensions of output files that are returned as text data files.
_SUPPORTED_DATA_FILE_TYPES = ['csv']

# Python source that is prepended to every code snippet before it is sent to
# the code interpreter: common imports plus the `crop` and `explore_df`
# helpers.
_IMPORTED_LIBRARIES = '''
import io
import math
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy

def crop(s: str, max_chars: int = 64) -> str:
  """Crops a string to max_chars characters."""
  return s[: max_chars - 3] + '...' if len(s) > max_chars else s


def explore_df(df: pd.DataFrame) -> None:
  """Prints some information about a pandas DataFrame."""

  with pd.option_context(
      'display.max_columns', None, 'display.expand_frame_repr', False
  ):
    # Print the column names to never encounter KeyError when selecting one.
    df_dtypes = df.dtypes

    # Obtain information about data types and missing values.
    df_nulls = (len(df) - df.isnull().sum()).apply(
        lambda x: f'{x} / {df.shape[0]} non-null'
    )

    # Explore unique total values in columns using `.unique()`.
    df_unique_count = df.apply(lambda x: len(x.unique()))

    # Explore unique values in columns using `.unique()`.
    df_unique = df.apply(lambda x: crop(str(list(x.unique()))))

    df_info = pd.concat(
        (
            df_dtypes.rename('Dtype'),
            df_nulls.rename('Non-Null Count'),
            df_unique_count.rename('Unique Values Count'),
            df_unique.rename('Unique Values'),
        ),
        axis=1,
    )
    df_info.index.name = 'Columns'
    print(f"""Total rows: {df.shape[0]}
Total columns: {df.shape[1]}

{df_info}""")
'''
|
82
|
+
|
83
|
+
|
84
|
+
def _get_code_interpreter_extension(resource_name: Optional[str] = None):
  """Loads an existing code interpreter extension or creates a new one.

  Args:
    resource_name: Resource name of an existing extension. When empty, the
      `CODE_INTERPRETER_EXTENSION_NAME` environment variable is used instead.

  Returns:
    The `Extension` for the resolved resource name. When no name is available
    a new extension is created from the `code_interpreter` hub entry and its
    resource name is stored in `CODE_INTERPRETER_EXTENSION_NAME` so later
    calls in this process reuse it.
  """
  if not resource_name:
    resource_name = os.environ.get('CODE_INTERPRETER_EXTENSION_NAME')
  if resource_name:
    return Extension(resource_name)

  # The message names the environment variable that is actually consulted.
  print(
      'No CODE_INTERPRETER_EXTENSION_NAME found in the environment. Create a'
      ' new one.'
  )
  new_code_interpreter = Extension.from_hub('code_interpreter')
  os.environ['CODE_INTERPRETER_EXTENSION_NAME'] = (
      new_code_interpreter.gca_resource.name
  )
  return new_code_interpreter
|
97
|
+
|
98
|
+
|
99
|
+
class VertexAiCodeExecutor(BaseCodeExecutor):
  """A code executor that uses Vertex Code Interpreter Extension to execute code.

  Attributes:
    resource_name: If set, load the existing resource name of the code
      interpreter extension instead of creating a new one. Format:
      projects/123/locations/us-central1/extensions/456
  """

  resource_name: Optional[str] = None
  """
  If set, load the existing resource name of the code interpreter extension
  instead of creating a new one.
  Format: projects/123/locations/us-central1/extensions/456
  """

  _code_interpreter_extension: Extension

  def __init__(
      self,
      resource_name: Optional[str] = None,
      **data,
  ):
    """Initializes the VertexAiCodeExecutor.

    Args:
      resource_name: If set, load the existing resource name of the code
        interpreter extension instead of creating a new one. Format:
        projects/123/locations/us-central1/extensions/456
      **data: Additional keyword arguments to be passed to the base class.
    """
    super().__init__(**data)
    self.resource_name = resource_name
    self._code_interpreter_extension = _get_code_interpreter_extension(
        self.resource_name
    )

  @override
  def execute_code(
      self,
      invocation_context: InvocationContext,
      code_execution_input: CodeExecutionInput,
  ) -> CodeExecutionResult:
    """Executes code through the extension and collects its output files.

    Args:
      invocation_context: The invocation context; not used by this method.
      code_execution_input: Provides the `code`, `input_files` and
        `execution_id` passed on to the extension.

    Returns:
      A CodeExecutionResult with the extension's `execution_result` as stdout,
      its `execution_error` as stderr, and every returned output file renamed
      to `<prefix><timestamp>_<index>.<extension>`, where the prefix is
      `plot_` for images, `data_` for data files and empty otherwise.
    """
    # Execute the code.
    code_execution_result = self._execute_code_interpreter(
        self._get_code_with_imports(code_execution_input.code),
        code_execution_input.input_files,
        code_execution_input.execution_id,
    )

    # Save output file as artifacts.
    current_timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    saved_files = []
    # A missing or empty `output_files` entry is treated as "no files", the
    # same way the stdout/stderr fields below tolerate missing keys.
    output_files = code_execution_result.get('output_files') or []
    # The positional index goes into every name, so two files of the same
    # (possibly unsupported) type can never end up with identical names.
    for file_index, output_file in enumerate(output_files):
      file_type = output_file['name'].split('.')[-1]
      file_name = f'{current_timestamp}_{file_index}.{file_type}'
      if file_type in _SUPPORTED_IMAGE_TYPES:
        file_name = 'plot_' + file_name
        # `image/jpg` is not a registered media type; `.jpg` is `image/jpeg`.
        image_subtype = 'jpeg' if file_type == 'jpg' else file_type
        mime_type = f'image/{image_subtype}'
      elif file_type in _SUPPORTED_DATA_FILE_TYPES:
        file_name = 'data_' + file_name
        mime_type = f'text/{file_type}'
      else:
        # May be None when the extension is unknown to `mimetypes`.
        mime_type, _ = mimetypes.guess_type(file_name)
      saved_files.append(
          File(
              name=file_name,
              content=output_file['contents'],
              mime_type=mime_type,
          )
      )

    # Collect the final result.
    return CodeExecutionResult(
        stdout=code_execution_result.get('execution_result', ''),
        stderr=code_execution_result.get('execution_error', ''),
        output_files=saved_files,
    )

  def _execute_code_interpreter(
      self,
      code: str,
      input_files: Optional[list[File]] = None,
      session_id: Optional[str] = None,
  ) -> dict[str, Any]:
    """Executes the code interpreter extension.

    Args:
      code: The code to execute.
      input_files: The input files to execute the code with.
      session_id: The session ID to execute the code with.

    Returns:
      The response from the code interpreter extension.
    """
    operation_params = {'code': code}
    # Optional parameters are only sent when they carry a value.
    if input_files:
      operation_params['files'] = [
          {'name': f.name, 'contents': f.content} for f in input_files
      ]
    if session_id:
      operation_params['session_id'] = session_id
    response = self._code_interpreter_extension.execute(
        operation_id='execute',
        operation_params=operation_params,
    )
    return response

  def _get_code_with_imports(self, code: str) -> str:
    """Builds the code string with built-in imports.

    Args:
      code: The code to execute.

    Returns:
      The code string with built-in imports.
    """
    return f"""
{_IMPORTED_LIBRARIES}

{code}
"""
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
import logging
|
16
|
+
|
17
|
+
logger = logging.getLogger(__name__)

# Public names exported by this package. Entries are appended below only when
# the corresponding optional import succeeds.
__all__ = []

try:
  from .agent_evaluator import AgentEvaluator

  __all__.append('AgentEvaluator')
except ImportError:
  # The evaluator is optional: a failed import is logged at debug level and
  # the package simply exports nothing.
  logger.debug(
      'The Vertex[eval] sdk is not installed. If you want to use the Vertex'
      ' Evaluation with agents, please install it (pip install'
      ' "google-cloud-aiplatform[evaluation]"). If not, you can ignore this'
      ' warning.'
  )
|
@@ -0,0 +1,329 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
import json
|
16
|
+
import os
|
17
|
+
from os import path
|
18
|
+
from typing import Dict
|
19
|
+
from typing import List
|
20
|
+
from typing import Union
|
21
|
+
|
22
|
+
from .evaluation_generator import EvaluationGenerator
|
23
|
+
from .response_evaluator import ResponseEvaluator
|
24
|
+
from .trajectory_evaluator import TrajectoryEvaluator
|
25
|
+
|
26
|
+
# Constants for default runs and evaluation criteria.
# NUM_RUNS is the default for the `num_runs` argument of
# `AgentEvaluator.evaluate`.
NUM_RUNS = 2
TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
# This evaluation is not very stable.
# This is always optional unless explicitly specified (it is deliberately
# absent from DEFAULT_CRITERIA below).
RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
RESPONSE_MATCH_SCORE_KEY = "response_match_score"

# The only keys accepted in a criteria dictionary; any other key is rejected
# by `AgentEvaluator._validate_input`.
ALLOWED_CRITERIA = [
    TOOL_TRAJECTORY_SCORE_KEY,
    RESPONSE_EVALUATION_SCORE_KEY,
    RESPONSE_MATCH_SCORE_KEY,
]


# Keys looked up in the entries of an evaluation dataset.
QUERY_COLUMN = "query"
REFERENCE_COLUMN = "reference"
EXPECTED_TOOL_USE_COLUMN = "expected_tool_use"


# Criteria (key -> minimum score) used when no test_config.json is found next
# to a test file.
DEFAULT_CRITERIA = {
    TOOL_TRAJECTORY_SCORE_KEY: 1.0,  # 1-point scale; 1.0 is perfect.
    RESPONSE_MATCH_SCORE_KEY: 0.8,  # Rouge-1 text match; 0.8 is default.
}
|
50
|
+
|
51
|
+
|
52
|
+
def load_json(file_path: str) -> Union[Dict, List]:
  """Loads and parses a JSON file.

  Args:
    file_path: Path to the JSON file.

  Returns:
    The parsed JSON content.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file does not contain valid JSON.
  """
  # JSON text is UTF-8; pin the encoding instead of relying on the
  # platform-dependent default of `open`.
  with open(file_path, "r", encoding="utf-8") as f:
    return json.load(f)
|
55
|
+
|
56
|
+
|
57
|
+
class AgentEvaluator:
  """An evaluator for Agents, mainly intended for helping with test cases."""

  @staticmethod
  def find_config_for_test_file(test_file: str):
    """Returns the evaluation criteria that apply to a test file.

    Looks for a `test_config.json` file in the same folder as `test_file`.

    Args:
      test_file: Path to the test file.

    Returns:
      The `criteria` dictionary of `test_config.json` when that file exists,
      otherwise a copy of `DEFAULT_CRITERIA`.

    Raises:
      ValueError: If `test_config.json` exists but is not a JSON object holding
        a `criteria` dictionary.
    """
    config_path = os.path.join(os.path.dirname(test_file), "test_config.json")
    if not os.path.exists(config_path):
      # Hand out a copy so callers cannot mutate the module-level defaults.
      return dict(DEFAULT_CRITERIA)

    config_data = load_json(config_path)
    if isinstance(config_data, dict) and isinstance(
        config_data.get("criteria"), dict
    ):
      return config_data["criteria"]
    raise ValueError(
        f"Invalid format for test_config.json at {config_path}. Expected a"
        " 'criteria' dictionary."
    )

  @staticmethod
  def _find_test_files(directory: str) -> List[str]:
    """Returns the sorted paths of all `.test.json` files under `directory`."""
    # Sorted so that runs are reproducible; `os.walk` order is unspecified.
    return sorted(
        os.path.join(root, file_name)
        for root, _, file_names in os.walk(directory)
        for file_name in file_names
        if file_name.endswith(".test.json")
    )

  @staticmethod
  def evaluate(
      agent_module,
      eval_dataset_file_path_or_dir,
      num_runs=NUM_RUNS,
      agent_name=None,
      initial_session_file=None,
  ):
    """Evaluates an Agent given eval data.

    Each test file is evaluated on its own, using the criteria returned by
    `find_config_for_test_file` for that file.

    Args:
      agent_module: The path to python module that contains the definition of
        the agent. There is convention in place here, where the code is going to
        look for 'root_agent' in the loaded module.
      eval_dataset_file_path_or_dir: The eval data set. This can be either a
        string representing full path to the file containing eval dataset, or a
        directory that is recursively explored for all files that have a
        `.test.json` suffix.
      num_runs: Number of times all entries in the eval dataset should be
        assessed.
      agent_name: The name of the agent.
      initial_session_file: File that contains initial session state that is
        needed by all the evals in the eval dataset. Its JSON content must have
        a top-level `state` key.

    Raises:
      AssertionError: If a computed score is below its configured threshold.
      ValueError: If a dataset or its criteria are invalid.
    """
    if isinstance(eval_dataset_file_path_or_dir, str) and os.path.isdir(
        eval_dataset_file_path_or_dir
    ):
      test_files = AgentEvaluator._find_test_files(
          eval_dataset_file_path_or_dir
      )
    else:
      test_files = [eval_dataset_file_path_or_dir]

    initial_session_state = {}
    if initial_session_file:
      initial_session_state = load_json(initial_session_file)["state"]

    for test_file in test_files:
      dataset = AgentEvaluator._load_dataset(test_file)[0]
      criteria = AgentEvaluator.find_config_for_test_file(test_file)

      AgentEvaluator._validate_input([dataset], criteria)

      evaluation_response = AgentEvaluator._generate_responses(
          agent_module,
          [dataset],
          num_runs,
          agent_name=agent_name,
          initial_session={"state": initial_session_state},
      )

      if AgentEvaluator._response_evaluation_required(criteria, [dataset]):
        AgentEvaluator._evaluate_response_scores(
            agent_module, evaluation_response, criteria
        )

      if AgentEvaluator._trajectory_evaluation_required(criteria, [dataset]):
        AgentEvaluator._evaluate_tool_trajectory(
            agent_module, evaluation_response, criteria
        )

  @staticmethod
  def _load_dataset(
      input_data: Union[str, List[str], List[Dict], List[List[Dict]]],
  ) -> List[List[Dict]]:
    """Loads one dataset (a list of dictionaries) per test file.

    Args:
      input_data: A path to a test file, a path to a directory that is searched
        recursively for `.test.json` files, or a list of paths to test files.
        NOTE(review): the annotation also mentions in-memory datasets, but
        lists that are not made of existing file paths are rejected.

    Returns:
      A list with one dataset per loaded file.

    Raises:
      ValueError: If a path is neither a file nor a directory, or if a file
        does not contain a list of dictionaries.
      TypeError: If `input_data` is neither a string nor a list of existing
        file paths.
    """

    def load_json_file(file_path: str) -> List[Dict]:
      data = load_json(file_path)
      if not isinstance(data, list) or not all(
          isinstance(d, dict) for d in data
      ):
        raise ValueError(f"{file_path} must contain a list of dictionaries.")
      return data

    if isinstance(input_data, str):
      if os.path.isdir(input_data):
        return [
            load_json_file(f)
            for f in AgentEvaluator._find_test_files(input_data)
        ]
      if os.path.isfile(input_data):
        return [load_json_file(input_data)]
      raise ValueError(f"Input path {input_data} is invalid.")

    if isinstance(input_data, list):
      if all(isinstance(i, str) and os.path.isfile(i) for i in input_data):
        return [load_json_file(i) for i in input_data]
      raise TypeError("Input list must contain valid file paths.")

    raise TypeError("Invalid input type for dataset loading.")

  @staticmethod
  def _validate_input(eval_dataset, criteria):
    """Validates that the evaluation criteria align with the provided dataset.

    For efficiency, we only use first row to validate input.

    Args:
      eval_dataset: A list of samples, each being a list of dictionaries.
      criteria: A dictionary keyed by the criteria to evaluate.

    Raises:
      ValueError: If the dataset is empty or malformed, if a criteria key is
        not in `ALLOWED_CRITERIA`, or if the first row lacks a key required by
        one of the criteria.
    """
    if not eval_dataset:
      raise ValueError("The evaluation dataset is None or empty.")

    for key in criteria:
      if key not in ALLOWED_CRITERIA:
        raise ValueError(
            f"Invalid criteria key: {key}. Expected one of {ALLOWED_CRITERIA}."
        )

    sample = eval_dataset[0]
    # Check the shape before indexing so that a malformed or empty sample is
    # reported as a ValueError instead of an IndexError/KeyError.
    if (
        not isinstance(sample, list)
        or not sample
        or not isinstance(sample[0], dict)
    ):
      raise ValueError(
          "Each evaluation dataset sample must be list of dictionary. But it's"
          f" {eval_dataset}"
      )
    first_query = sample[0]

    if TOOL_TRAJECTORY_SCORE_KEY in criteria:
      if (
          QUERY_COLUMN not in first_query
          or EXPECTED_TOOL_USE_COLUMN not in first_query
      ):
        raise ValueError(
            f"Samples for {TOOL_TRAJECTORY_SCORE_KEY} must include"
            f" '{QUERY_COLUMN}' and '{EXPECTED_TOOL_USE_COLUMN}' keys. The"
            f" sample is {sample}."
        )

    if RESPONSE_EVALUATION_SCORE_KEY in criteria:
      if QUERY_COLUMN not in first_query:
        raise ValueError(
            f"Samples for {RESPONSE_EVALUATION_SCORE_KEY} must include"
            f" '{QUERY_COLUMN}' key. The sample is {sample}."
        )

    if RESPONSE_MATCH_SCORE_KEY in criteria:
      if QUERY_COLUMN not in first_query or REFERENCE_COLUMN not in first_query:
        raise ValueError(
            f"Samples for {RESPONSE_MATCH_SCORE_KEY} must include"
            f" '{QUERY_COLUMN}' and '{REFERENCE_COLUMN}' keys. The sample is"
            f" {sample}."
        )

  @staticmethod
  def _get_infer_criteria(eval_dataset):
    """Infers evaluation criteria based on the provided dataset.

    Only the first row of the first sample is inspected.

    Args:
      eval_dataset (list): A list of evaluation samples.

    Returns:
      dict: Inferred evaluation criteria based on dataset fields, with the
      thresholds taken from `DEFAULT_CRITERIA`.
    """
    inferred_criteria = {}
    sample = eval_dataset[0][0]

    if QUERY_COLUMN in sample and EXPECTED_TOOL_USE_COLUMN in sample:
      inferred_criteria[TOOL_TRAJECTORY_SCORE_KEY] = DEFAULT_CRITERIA[
          TOOL_TRAJECTORY_SCORE_KEY
      ]

    if QUERY_COLUMN in sample and REFERENCE_COLUMN in sample:
      inferred_criteria[RESPONSE_MATCH_SCORE_KEY] = DEFAULT_CRITERIA[
          RESPONSE_MATCH_SCORE_KEY
      ]

    return inferred_criteria

  @staticmethod
  def _generate_responses(
      agent_module, eval_dataset, num_runs, agent_name=None, initial_session=None
  ):
    """Generates evaluation responses by running the agent module multiple times.

    Args:
      agent_module: The agent module forwarded to `EvaluationGenerator`.
      eval_dataset: The dataset to generate responses for.
      num_runs: Forwarded as `repeat_num`.
      agent_name: The name of the agent.
      initial_session: The initial session; an empty dictionary when omitted.

    Returns:
      Whatever `EvaluationGenerator.generate_responses` returns.
    """
    # `None` replaces the former mutable `{}` default, which was shared across
    # calls.
    if initial_session is None:
      initial_session = {}
    return EvaluationGenerator.generate_responses(
        eval_dataset,
        agent_module,
        repeat_num=num_runs,
        agent_name=agent_name,
        initial_session=initial_session,
    )

  @staticmethod
  def _generate_responses_from_session(eval_dataset, session_path):
    """Generates evaluation responses from the session stored at `session_path`."""
    return EvaluationGenerator.generate_responses_from_session(
        session_path, eval_dataset
    )

  @staticmethod
  def _response_evaluation_required(criteria, eval_dataset):
    """Checks if response evaluation is needed.

    It is needed when the first row of the dataset has a reference and at least
    one of the response criteria is configured.
    """
    return REFERENCE_COLUMN in eval_dataset[0][0] and any(
        key in criteria
        for key in [RESPONSE_EVALUATION_SCORE_KEY, RESPONSE_MATCH_SCORE_KEY]
    )

  @staticmethod
  def _trajectory_evaluation_required(evaluation_criteria, eval_dataset):
    """Checks if tool trajectory evaluation is needed.

    It is needed when the first row of the dataset has an expected tool use and
    the tool trajectory criterion is configured.
    """
    return (
        EXPECTED_TOOL_USE_COLUMN in eval_dataset[0][0]
        and TOOL_TRAJECTORY_SCORE_KEY in evaluation_criteria
    )

  @staticmethod
  def _evaluate_response_scores(agent_module, evaluation_response, criteria):
    """Evaluates response scores and raises an assertion error if they don't meet the criteria."""
    metrics = ResponseEvaluator.evaluate(
        evaluation_response, criteria, print_detailed_results=True
    )

    # (criteria key, metric key in `metrics`, description used in the error)
    checks = (
        (
            RESPONSE_EVALUATION_SCORE_KEY,
            "coherence/mean",
            "Average response evaluation score",
        ),
        (
            RESPONSE_MATCH_SCORE_KEY,
            "rouge_1/mean",
            "Average response match score",
        ),
    )
    for criteria_key, metric_key, description in checks:
      threshold = criteria.get(criteria_key)
      if threshold is None:
        # No threshold configured for this criterion: nothing to compare
        # against (comparing a score with None would raise a TypeError).
        continue
      AgentEvaluator._assert_score(
          metrics, metric_key, threshold, description, agent_module
      )

  @staticmethod
  def _evaluate_tool_trajectory(agent_module, evaluation_response, criteria):
    """Evaluates tool trajectory scores and raises an assertion error if they don't meet the criteria."""
    score = TrajectoryEvaluator.evaluate(
        evaluation_response, print_detailed_results=True
    )
    AgentEvaluator._assert_score(
        {TOOL_TRAJECTORY_SCORE_KEY: score},
        TOOL_TRAJECTORY_SCORE_KEY,
        criteria[TOOL_TRAJECTORY_SCORE_KEY],
        "Average tool trajectory evaluation score",
        agent_module,
    )

  @staticmethod
  def _assert_score(metrics, metric_key, threshold, description, agent_module):
    """Asserts that a metric meets the specified threshold.

    Nothing is checked when `metric_key` is absent from `metrics`.

    Raises:
      AssertionError: If the metric is not greater than or equal to
        `threshold`.
    """
    if metric_key not in metrics:
      return
    actual_score = metrics[metric_key]
    # Raised explicitly rather than through an `assert` statement, which is
    # removed when Python runs with -O and would let every evaluation pass.
    if not actual_score >= threshold:
      raise AssertionError(
          f"{description} for {agent_module} is lower than expected. "
          f"Expected >= {threshold}, but got {actual_score}."
      )
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
class EvalConstants:
  """Holds the string constants that name the fields of evaluation files."""

  # Fields of an evaluation entry.
  QUERY = "query"
  EXPECTED_TOOL_USE = "expected_tool_use"
  RESPONSE = "response"
  REFERENCE = "reference"
  # Fields describing a single tool use (presumably the items listed under
  # EXPECTED_TOOL_USE -- TODO confirm against the evaluators).
  TOOL_NAME = "tool_name"
  TOOL_INPUT = "tool_input"
  MOCK_TOOL_OUTPUT = "mock_tool_output"
|