google-adk 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- google/adk/._version.py +0 -0
- google/adk/__init__.py +20 -0
- google/adk/agents/__init__.py +32 -0
- google/adk/agents/active_streaming_tool.py +38 -0
- google/adk/agents/base_agent.py +345 -0
- google/adk/agents/callback_context.py +112 -0
- google/adk/agents/invocation_context.py +181 -0
- google/adk/agents/langgraph_agent.py +140 -0
- google/adk/agents/live_request_queue.py +64 -0
- google/adk/agents/llm_agent.py +376 -0
- google/adk/agents/loop_agent.py +62 -0
- google/adk/agents/parallel_agent.py +96 -0
- google/adk/agents/readonly_context.py +46 -0
- google/adk/agents/remote_agent.py +50 -0
- google/adk/agents/run_config.py +87 -0
- google/adk/agents/sequential_agent.py +45 -0
- google/adk/agents/transcription_entry.py +34 -0
- google/adk/artifacts/__init__.py +23 -0
- google/adk/artifacts/base_artifact_service.py +128 -0
- google/adk/artifacts/gcs_artifact_service.py +195 -0
- google/adk/artifacts/in_memory_artifact_service.py +133 -0
- google/adk/auth/__init__.py +22 -0
- google/adk/auth/auth_credential.py +220 -0
- google/adk/auth/auth_handler.py +268 -0
- google/adk/auth/auth_preprocessor.py +116 -0
- google/adk/auth/auth_schemes.py +67 -0
- google/adk/auth/auth_tool.py +55 -0
- google/adk/cli/__init__.py +15 -0
- google/adk/cli/__main__.py +18 -0
- google/adk/cli/agent_graph.py +148 -0
- google/adk/cli/browser/adk_favicon.svg +17 -0
- google/adk/cli/browser/assets/audio-processor.js +51 -0
- google/adk/cli/browser/assets/config/runtime-config.json +3 -0
- google/adk/cli/browser/index.html +33 -0
- google/adk/cli/browser/main-SY2WYYGV.js +75 -0
- google/adk/cli/browser/polyfills-FFHMD2TL.js +18 -0
- google/adk/cli/browser/styles-4VDSPQ37.css +17 -0
- google/adk/cli/cli.py +181 -0
- google/adk/cli/cli_deploy.py +181 -0
- google/adk/cli/cli_eval.py +282 -0
- google/adk/cli/cli_tools_click.py +524 -0
- google/adk/cli/fast_api.py +784 -0
- google/adk/cli/utils/__init__.py +49 -0
- google/adk/cli/utils/envs.py +57 -0
- google/adk/cli/utils/evals.py +93 -0
- google/adk/cli/utils/logs.py +72 -0
- google/adk/code_executors/__init__.py +49 -0
- google/adk/code_executors/base_code_executor.py +97 -0
- google/adk/code_executors/code_execution_utils.py +256 -0
- google/adk/code_executors/code_executor_context.py +202 -0
- google/adk/code_executors/container_code_executor.py +196 -0
- google/adk/code_executors/unsafe_local_code_executor.py +71 -0
- google/adk/code_executors/vertex_ai_code_executor.py +234 -0
- google/adk/docs/Makefile +20 -0
- google/adk/docs/build/doctrees/google-adk.doctree +0 -0
- google/adk/docs/build/html/_sources/google-adk.rst.txt +98 -0
- google/adk/docs/build/html/_sources/index.rst.txt +7 -0
- google/adk/docs/build/html/_static/autodoc_pydantic.css +27 -0
- google/adk/docs/build/html/_static/basic.css +925 -0
- google/adk/docs/build/html/_static/debug.css +85 -0
- google/adk/docs/build/html/_static/doctools.js +156 -0
- google/adk/docs/build/html/_static/documentation_options.js +29 -0
- google/adk/docs/build/html/_static/file.png +0 -0
- google/adk/docs/build/html/_static/language_data.js +199 -0
- google/adk/docs/build/html/_static/minus.png +0 -0
- google/adk/docs/build/html/_static/plus.png +0 -0
- google/adk/docs/build/html/_static/pygments.css +274 -0
- google/adk/docs/build/html/_static/scripts/furo-extensions.js +16 -0
- google/adk/docs/build/html/_static/scripts/furo.js +19 -0
- google/adk/docs/build/html/_static/scripts/furo.js.LICENSE.txt +7 -0
- google/adk/docs/build/html/_static/scripts/furo.js.map +1 -0
- google/adk/docs/build/html/_static/searchtools.js +620 -0
- google/adk/docs/build/html/_static/skeleton.css +312 -0
- google/adk/docs/build/html/_static/sphinx_highlight.js +170 -0
- google/adk/docs/build/html/_static/styles/furo-extensions.css +18 -0
- google/adk/docs/build/html/_static/styles/furo-extensions.css.map +1 -0
- google/adk/docs/build/html/_static/styles/furo.css +18 -0
- google/adk/docs/build/html/_static/styles/furo.css.map +1 -0
- google/adk/docs/build/html/genindex.html +861 -0
- google/adk/docs/build/html/google-adk.html +5461 -0
- google/adk/docs/build/html/index.html +567 -0
- google/adk/docs/build/html/objects.inv +0 -0
- google/adk/docs/build/html/py-modindex.html +373 -0
- google/adk/docs/build/html/search.html +333 -0
- google/adk/docs/build/html/searchindex.js +17 -0
- google/adk/docs/source/conf.py +133 -0
- google/adk/docs/source/google-adk.rst +98 -0
- google/adk/docs/source/index.rst +7 -0
- google/adk/evaluation/__init__.py +31 -0
- google/adk/evaluation/agent_evaluator.py +329 -0
- google/adk/evaluation/evaluation_constants.py +24 -0
- google/adk/evaluation/evaluation_generator.py +270 -0
- google/adk/evaluation/response_evaluator.py +135 -0
- google/adk/evaluation/trajectory_evaluator.py +184 -0
- google/adk/events/__init__.py +21 -0
- google/adk/events/event.py +130 -0
- google/adk/events/event_actions.py +55 -0
- google/adk/examples/__init__.py +28 -0
- google/adk/examples/base_example_provider.py +35 -0
- google/adk/examples/example.py +27 -0
- google/adk/examples/example_util.py +123 -0
- google/adk/examples/vertex_ai_example_store.py +104 -0
- google/adk/flows/__init__.py +14 -0
- google/adk/flows/llm_flows/__init__.py +20 -0
- google/adk/flows/llm_flows/_base_llm_processor.py +52 -0
- google/adk/flows/llm_flows/_code_execution.py +458 -0
- google/adk/flows/llm_flows/_nl_planning.py +129 -0
- google/adk/flows/llm_flows/agent_transfer.py +132 -0
- google/adk/flows/llm_flows/audio_transcriber.py +109 -0
- google/adk/flows/llm_flows/auto_flow.py +49 -0
- google/adk/flows/llm_flows/base_llm_flow.py +559 -0
- google/adk/flows/llm_flows/basic.py +72 -0
- google/adk/flows/llm_flows/contents.py +370 -0
- google/adk/flows/llm_flows/functions.py +486 -0
- google/adk/flows/llm_flows/identity.py +47 -0
- google/adk/flows/llm_flows/instructions.py +137 -0
- google/adk/flows/llm_flows/single_flow.py +57 -0
- google/adk/memory/__init__.py +35 -0
- google/adk/memory/base_memory_service.py +74 -0
- google/adk/memory/in_memory_memory_service.py +62 -0
- google/adk/memory/vertex_ai_rag_memory_service.py +177 -0
- google/adk/models/__init__.py +31 -0
- google/adk/models/anthropic_llm.py +243 -0
- google/adk/models/base_llm.py +87 -0
- google/adk/models/base_llm_connection.py +76 -0
- google/adk/models/gemini_llm_connection.py +200 -0
- google/adk/models/google_llm.py +331 -0
- google/adk/models/lite_llm.py +673 -0
- google/adk/models/llm_request.py +98 -0
- google/adk/models/llm_response.py +111 -0
- google/adk/models/registry.py +102 -0
- google/adk/planners/__init__.py +23 -0
- google/adk/planners/base_planner.py +66 -0
- google/adk/planners/built_in_planner.py +75 -0
- google/adk/planners/plan_re_act_planner.py +208 -0
- google/adk/runners.py +456 -0
- google/adk/sessions/__init__.py +41 -0
- google/adk/sessions/base_session_service.py +133 -0
- google/adk/sessions/database_session_service.py +522 -0
- google/adk/sessions/in_memory_session_service.py +206 -0
- google/adk/sessions/session.py +54 -0
- google/adk/sessions/state.py +71 -0
- google/adk/sessions/vertex_ai_session_service.py +356 -0
- google/adk/telemetry.py +189 -0
- google/adk/tests/__init__.py +14 -0
- google/adk/tests/integration/.env.example +10 -0
- google/adk/tests/integration/__init__.py +18 -0
- google/adk/tests/integration/conftest.py +119 -0
- google/adk/tests/integration/fixture/__init__.py +14 -0
- google/adk/tests/integration/fixture/agent_with_config/__init__.py +15 -0
- google/adk/tests/integration/fixture/agent_with_config/agent.py +88 -0
- google/adk/tests/integration/fixture/callback_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/callback_agent/agent.py +105 -0
- google/adk/tests/integration/fixture/context_update_test/OWNERS +1 -0
- google/adk/tests/integration/fixture/context_update_test/__init__.py +15 -0
- google/adk/tests/integration/fixture/context_update_test/agent.py +43 -0
- google/adk/tests/integration/fixture/context_update_test/successful_test.session.json +582 -0
- google/adk/tests/integration/fixture/context_variable_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/context_variable_agent/agent.py +115 -0
- google/adk/tests/integration/fixture/customer_support_ma/__init__.py +15 -0
- google/adk/tests/integration/fixture/customer_support_ma/agent.py +172 -0
- google/adk/tests/integration/fixture/ecommerce_customer_service_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/ecommerce_customer_service_agent/agent.py +338 -0
- google/adk/tests/integration/fixture/ecommerce_customer_service_agent/order_query.test.json +69 -0
- google/adk/tests/integration/fixture/ecommerce_customer_service_agent/test_config.json +6 -0
- google/adk/tests/integration/fixture/flow_complex_spark/__init__.py +15 -0
- google/adk/tests/integration/fixture/flow_complex_spark/agent.py +182 -0
- google/adk/tests/integration/fixture/flow_complex_spark/sample.session.json +190 -0
- google/adk/tests/integration/fixture/hello_world_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/hello_world_agent/agent.py +95 -0
- google/adk/tests/integration/fixture/hello_world_agent/roll_die.test.json +24 -0
- google/adk/tests/integration/fixture/hello_world_agent/test_config.json +6 -0
- google/adk/tests/integration/fixture/home_automation_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/home_automation_agent/agent.py +304 -0
- google/adk/tests/integration/fixture/home_automation_agent/simple_test.test.json +5 -0
- google/adk/tests/integration/fixture/home_automation_agent/simple_test2.test.json +5 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_config.json +5 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/dependent_tool_calls.test.json +18 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/eval_data.test.json +17 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/memorizing_past_events/test_config.json +6 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/simple_multi_turn_conversation.test.json +18 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/simple_test.test.json +17 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/simple_test2.test.json +5 -0
- google/adk/tests/integration/fixture/home_automation_agent/test_files/test_config.json +5 -0
- google/adk/tests/integration/fixture/tool_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/tool_agent/agent.py +218 -0
- google/adk/tests/integration/fixture/tool_agent/files/Agent_test_plan.pdf +0 -0
- google/adk/tests/integration/fixture/trip_planner_agent/__init__.py +15 -0
- google/adk/tests/integration/fixture/trip_planner_agent/agent.py +110 -0
- google/adk/tests/integration/fixture/trip_planner_agent/initial.session.json +13 -0
- google/adk/tests/integration/fixture/trip_planner_agent/test_config.json +5 -0
- google/adk/tests/integration/fixture/trip_planner_agent/test_files/initial.session.json +13 -0
- google/adk/tests/integration/fixture/trip_planner_agent/test_files/test_config.json +5 -0
- google/adk/tests/integration/fixture/trip_planner_agent/test_files/trip_inquiry_sub_agent.test.json +7 -0
- google/adk/tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json +19 -0
- google/adk/tests/integration/models/__init__.py +14 -0
- google/adk/tests/integration/models/test_google_llm.py +65 -0
- google/adk/tests/integration/test_callback.py +70 -0
- google/adk/tests/integration/test_context_variable.py +67 -0
- google/adk/tests/integration/test_evalute_agent_in_fixture.py +76 -0
- google/adk/tests/integration/test_multi_agent.py +28 -0
- google/adk/tests/integration/test_multi_turn.py +42 -0
- google/adk/tests/integration/test_single_agent.py +23 -0
- google/adk/tests/integration/test_sub_agent.py +26 -0
- google/adk/tests/integration/test_system_instruction.py +177 -0
- google/adk/tests/integration/test_tools.py +287 -0
- google/adk/tests/integration/test_with_test_file.py +34 -0
- google/adk/tests/integration/tools/__init__.py +14 -0
- google/adk/tests/integration/utils/__init__.py +16 -0
- google/adk/tests/integration/utils/asserts.py +75 -0
- google/adk/tests/integration/utils/test_runner.py +97 -0
- google/adk/tests/unittests/__init__.py +14 -0
- google/adk/tests/unittests/agents/__init__.py +14 -0
- google/adk/tests/unittests/agents/test_base_agent.py +407 -0
- google/adk/tests/unittests/agents/test_langgraph_agent.py +191 -0
- google/adk/tests/unittests/agents/test_llm_agent_callbacks.py +138 -0
- google/adk/tests/unittests/agents/test_llm_agent_fields.py +231 -0
- google/adk/tests/unittests/agents/test_loop_agent.py +136 -0
- google/adk/tests/unittests/agents/test_parallel_agent.py +92 -0
- google/adk/tests/unittests/agents/test_sequential_agent.py +114 -0
- google/adk/tests/unittests/artifacts/__init__.py +14 -0
- google/adk/tests/unittests/artifacts/test_artifact_service.py +276 -0
- google/adk/tests/unittests/auth/test_auth_handler.py +575 -0
- google/adk/tests/unittests/conftest.py +73 -0
- google/adk/tests/unittests/fast_api/__init__.py +14 -0
- google/adk/tests/unittests/fast_api/test_fast_api.py +269 -0
- google/adk/tests/unittests/flows/__init__.py +14 -0
- google/adk/tests/unittests/flows/llm_flows/__init__.py +14 -0
- google/adk/tests/unittests/flows/llm_flows/_test_examples.py +142 -0
- google/adk/tests/unittests/flows/llm_flows/test_agent_transfer.py +311 -0
- google/adk/tests/unittests/flows/llm_flows/test_functions_long_running.py +244 -0
- google/adk/tests/unittests/flows/llm_flows/test_functions_request_euc.py +346 -0
- google/adk/tests/unittests/flows/llm_flows/test_functions_sequential.py +93 -0
- google/adk/tests/unittests/flows/llm_flows/test_functions_simple.py +258 -0
- google/adk/tests/unittests/flows/llm_flows/test_identity.py +66 -0
- google/adk/tests/unittests/flows/llm_flows/test_instructions.py +164 -0
- google/adk/tests/unittests/flows/llm_flows/test_model_callbacks.py +142 -0
- google/adk/tests/unittests/flows/llm_flows/test_other_configs.py +46 -0
- google/adk/tests/unittests/flows/llm_flows/test_tool_callbacks.py +269 -0
- google/adk/tests/unittests/models/__init__.py +14 -0
- google/adk/tests/unittests/models/test_google_llm.py +224 -0
- google/adk/tests/unittests/models/test_litellm.py +804 -0
- google/adk/tests/unittests/models/test_models.py +60 -0
- google/adk/tests/unittests/sessions/__init__.py +14 -0
- google/adk/tests/unittests/sessions/test_session_service.py +227 -0
- google/adk/tests/unittests/sessions/test_vertex_ai_session_service.py +246 -0
- google/adk/tests/unittests/streaming/__init__.py +14 -0
- google/adk/tests/unittests/streaming/test_streaming.py +50 -0
- google/adk/tests/unittests/tools/__init__.py +14 -0
- google/adk/tests/unittests/tools/apihub_tool/clients/test_apihub_client.py +499 -0
- google/adk/tests/unittests/tools/apihub_tool/test_apihub_toolset.py +204 -0
- google/adk/tests/unittests/tools/application_integration_tool/clients/test_connections_client.py +600 -0
- google/adk/tests/unittests/tools/application_integration_tool/clients/test_integration_client.py +630 -0
- google/adk/tests/unittests/tools/application_integration_tool/test_application_integration_toolset.py +345 -0
- google/adk/tests/unittests/tools/google_api_tool/__init__.py +13 -0
- google/adk/tests/unittests/tools/google_api_tool/test_googleapi_to_openapi_converter.py +657 -0
- google/adk/tests/unittests/tools/openapi_tool/auth/credential_exchangers/test_auto_auth_credential_exchanger.py +145 -0
- google/adk/tests/unittests/tools/openapi_tool/auth/credential_exchangers/test_base_auth_credential_exchanger.py +68 -0
- google/adk/tests/unittests/tools/openapi_tool/auth/credential_exchangers/test_oauth2_exchanger.py +153 -0
- google/adk/tests/unittests/tools/openapi_tool/auth/credential_exchangers/test_service_account_exchanger.py +196 -0
- google/adk/tests/unittests/tools/openapi_tool/auth/test_auth_helper.py +573 -0
- google/adk/tests/unittests/tools/openapi_tool/common/test_common.py +436 -0
- google/adk/tests/unittests/tools/openapi_tool/openapi_spec_parser/test.yaml +1367 -0
- google/adk/tests/unittests/tools/openapi_tool/openapi_spec_parser/test_openapi_spec_parser.py +628 -0
- google/adk/tests/unittests/tools/openapi_tool/openapi_spec_parser/test_openapi_toolset.py +139 -0
- google/adk/tests/unittests/tools/openapi_tool/openapi_spec_parser/test_operation_parser.py +406 -0
- google/adk/tests/unittests/tools/openapi_tool/openapi_spec_parser/test_rest_api_tool.py +966 -0
- google/adk/tests/unittests/tools/openapi_tool/openapi_spec_parser/test_tool_auth_handler.py +201 -0
- google/adk/tests/unittests/tools/retrieval/__init__.py +14 -0
- google/adk/tests/unittests/tools/retrieval/test_vertex_ai_rag_retrieval.py +147 -0
- google/adk/tests/unittests/tools/test_agent_tool.py +167 -0
- google/adk/tests/unittests/tools/test_base_tool.py +141 -0
- google/adk/tests/unittests/tools/test_build_function_declaration.py +277 -0
- google/adk/tests/unittests/utils.py +304 -0
- google/adk/tools/__init__.py +51 -0
- google/adk/tools/_automatic_function_calling_util.py +346 -0
- google/adk/tools/agent_tool.py +176 -0
- google/adk/tools/apihub_tool/__init__.py +19 -0
- google/adk/tools/apihub_tool/apihub_toolset.py +209 -0
- google/adk/tools/apihub_tool/clients/__init__.py +13 -0
- google/adk/tools/apihub_tool/clients/apihub_client.py +332 -0
- google/adk/tools/apihub_tool/clients/secret_client.py +115 -0
- google/adk/tools/application_integration_tool/__init__.py +19 -0
- google/adk/tools/application_integration_tool/application_integration_toolset.py +230 -0
- google/adk/tools/application_integration_tool/clients/connections_client.py +903 -0
- google/adk/tools/application_integration_tool/clients/integration_client.py +253 -0
- google/adk/tools/base_tool.py +144 -0
- google/adk/tools/built_in_code_execution_tool.py +59 -0
- google/adk/tools/crewai_tool.py +72 -0
- google/adk/tools/example_tool.py +62 -0
- google/adk/tools/exit_loop_tool.py +23 -0
- google/adk/tools/function_parameter_parse_util.py +307 -0
- google/adk/tools/function_tool.py +87 -0
- google/adk/tools/get_user_choice_tool.py +28 -0
- google/adk/tools/google_api_tool/__init__.py +14 -0
- google/adk/tools/google_api_tool/google_api_tool.py +59 -0
- google/adk/tools/google_api_tool/google_api_tool_set.py +107 -0
- google/adk/tools/google_api_tool/google_api_tool_sets.py +55 -0
- google/adk/tools/google_api_tool/googleapi_to_openapi_converter.py +521 -0
- google/adk/tools/google_search_tool.py +68 -0
- google/adk/tools/langchain_tool.py +86 -0
- google/adk/tools/load_artifacts_tool.py +113 -0
- google/adk/tools/load_memory_tool.py +58 -0
- google/adk/tools/load_web_page.py +41 -0
- google/adk/tools/long_running_tool.py +39 -0
- google/adk/tools/mcp_tool/__init__.py +42 -0
- google/adk/tools/mcp_tool/conversion_utils.py +161 -0
- google/adk/tools/mcp_tool/mcp_tool.py +113 -0
- google/adk/tools/mcp_tool/mcp_toolset.py +272 -0
- google/adk/tools/openapi_tool/__init__.py +21 -0
- google/adk/tools/openapi_tool/auth/__init__.py +19 -0
- google/adk/tools/openapi_tool/auth/auth_helpers.py +498 -0
- google/adk/tools/openapi_tool/auth/credential_exchangers/__init__.py +25 -0
- google/adk/tools/openapi_tool/auth/credential_exchangers/auto_auth_credential_exchanger.py +105 -0
- google/adk/tools/openapi_tool/auth/credential_exchangers/base_credential_exchanger.py +55 -0
- google/adk/tools/openapi_tool/auth/credential_exchangers/oauth2_exchanger.py +117 -0
- google/adk/tools/openapi_tool/auth/credential_exchangers/service_account_exchanger.py +97 -0
- google/adk/tools/openapi_tool/common/__init__.py +19 -0
- google/adk/tools/openapi_tool/common/common.py +300 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/__init__.py +32 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/openapi_spec_parser.py +231 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/openapi_toolset.py +144 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/operation_parser.py +260 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/rest_api_tool.py +496 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/tool_auth_handler.py +268 -0
- google/adk/tools/preload_memory_tool.py +72 -0
- google/adk/tools/retrieval/__init__.py +36 -0
- google/adk/tools/retrieval/base_retrieval_tool.py +37 -0
- google/adk/tools/retrieval/files_retrieval.py +33 -0
- google/adk/tools/retrieval/llama_index_retrieval.py +41 -0
- google/adk/tools/retrieval/vertex_ai_rag_retrieval.py +107 -0
- google/adk/tools/tool_context.py +90 -0
- google/adk/tools/toolbox_tool.py +46 -0
- google/adk/tools/transfer_to_agent_tool.py +21 -0
- google/adk/tools/vertex_ai_search_tool.py +96 -0
- google/adk/version.py +16 -0
- google_adk-0.0.3.dist-info/METADATA +73 -0
- google_adk-0.0.3.dist-info/RECORD +340 -0
- {google_adk-0.0.1.dist-info → google_adk-0.0.3.dist-info}/WHEEL +1 -2
- google_adk-0.0.3.dist-info/entry_points.txt +3 -0
- agent_kit/__init__.py +0 -0
- google_adk-0.0.1.dist-info/LICENSE.txt +0 -170
- google_adk-0.0.1.dist-info/METADATA +0 -15
- google_adk-0.0.1.dist-info/RECORD +0 -6
- google_adk-0.0.1.dist-info/top_level.txt +0 -1
@@ -0,0 +1,270 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
import importlib
|
16
|
+
import uuid
|
17
|
+
|
18
|
+
from google.genai import types
|
19
|
+
|
20
|
+
from ..agents.base_agent import BaseAgent
|
21
|
+
from ..agents.llm_agent import Agent
|
22
|
+
from ..agents.llm_agent import BeforeToolCallback
|
23
|
+
from ..agents.llm_agent import LlmAgent
|
24
|
+
from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
|
25
|
+
from ..runners import Runner
|
26
|
+
from ..sessions.in_memory_session_service import InMemorySessionService
|
27
|
+
from ..sessions.session import Session
|
28
|
+
from .evaluation_constants import EvalConstants
|
29
|
+
|
30
|
+
|
31
|
+
class EvaluationGenerator:
  """Generates evaluation responses for agents."""

  @staticmethod
  def generate_responses(
      eval_dataset,
      agent_module_path,
      repeat_num=3,
      agent_name=None,
      initial_session=None,
  ):
    """Returns evaluation responses for the given dataset and agent.

    Args:
      eval_dataset: The dataset that needs to be scraped for responses.
      agent_module_path: Path to the module that contains the root agent.
      repeat_num: Number of times the eval dataset should be repeated. This is
        usually done to remove uncertainty that a single run may bring.
      agent_name: The name of the agent that should be evaluated. This is
        usually the sub-agent.
      initial_session: Initial session for the eval data; defaults to an empty
        dict.
    """
    # A `None` default avoids the shared-mutable-default-argument pitfall
    # (`{}` as a default is created once and shared across all calls).
    if initial_session is None:
      initial_session = {}

    results = []
    for _ in range(repeat_num):
      for data in eval_dataset:
        results.append(
            EvaluationGenerator._process_query(
                data, agent_module_path, agent_name, initial_session
            )
        )

    return results

  @staticmethod
  def generate_responses_from_session(session_path, eval_dataset):
    """Returns evaluation responses by combining session data with eval data.

    Args:
      session_path: Path to a json file that contains session data.
      eval_dataset: The eval data set that should be combined with the session
        data.
    """
    results = []

    with open(session_path, "r") as f:
      session_data = Session.model_validate_json(f.read())
      print("loaded session", session_path)

    for data in eval_dataset:
      # Replay each eval entry against the pre-recorded session events
      # instead of invoking the runner.
      results.append(
          EvaluationGenerator._process_query_with_session(
              session_data,
              data,
          )
      )

    return results

  @staticmethod
  def _process_query(data, module_name, agent_name=None, initial_session=None):
    """Process a query using the agent and evaluation dataset.

    Args:
      data: A list of eval entries (dicts) for one conversation.
      module_name: Full import path of the module containing the root agent.
      agent_name: Optional sub-agent name to evaluate instead of the root.
      initial_session: Optional initial session dict; defaults to empty.
    """
    if initial_session is None:
      initial_session = {}

    # `module_name` is already a complete import path; the original
    # `f"{module_name}"` wrapper was a no-op and has been dropped.
    agent_module = importlib.import_module(module_name)
    root_agent = agent_module.agent.root_agent

    # Optional hook an agent module may expose to reset mutable agent state
    # between eval runs.
    reset_func = getattr(agent_module.agent, "reset_data", None)

    agent_to_evaluate = root_agent
    if agent_name:
      agent_to_evaluate = root_agent.find_agent(agent_name)
      assert agent_to_evaluate, f"Sub-Agent `{agent_name}` not found."

    return EvaluationGenerator._process_query_with_root_agent(
        data, agent_to_evaluate, reset_func, initial_session
    )

  @staticmethod
  def _process_query_with_root_agent(
      data,
      root_agent,
      reset_func,
      initial_session=None,
      session_id=None,
      session_service=None,
      artifact_service=None,
  ):
    """Process a query using the agent and evaluation dataset.

    Args:
      data: A list of eval entries (dicts) for one conversation.
      root_agent: The agent (or sub-agent) to run the queries against.
      reset_func: Optional callable used to reset agent state before the run.
      initial_session: Optional dict providing `app_name`, `user_id`, `state`.
      session_id: Session id to use; a random uuid is generated if omitted.
      session_service: Session service; defaults to an in-memory one.
      artifact_service: Artifact service; defaults to an in-memory one.

    Returns:
      A copy of `data` where each entry additionally carries the agent's
      `response` and `actual_tool_use`.
    """
    if initial_session is None:
      initial_session = {}

    # We don't know which tools belong to which agent, so we just apply the
    # mock callback to any agent that has one of the mocked tool outputs.
    all_mock_tools = set()
    for eval_entry in data:
      expected_tool_use = eval_entry.get(EvalConstants.EXPECTED_TOOL_USE, [])
      for expected in expected_tool_use:
        if EvalConstants.MOCK_TOOL_OUTPUT in expected:
          all_mock_tools.add(expected[EvalConstants.TOOL_NAME])

    eval_data_copy = data.copy()
    EvaluationGenerator.apply_before_tool_callback(
        root_agent,
        lambda *args: EvaluationGenerator.before_tool_callback(
            *args, eval_dataset=eval_data_copy
        ),
        all_mock_tools,
    )

    if not session_service:
      session_service = InMemorySessionService()

    app_name = initial_session.get("app_name", "EvaluationGenerator")
    user_id = initial_session.get("user_id", "test_user_id")
    session_id = session_id if session_id else str(uuid.uuid4())

    _ = session_service.create_session(
        app_name=app_name,
        user_id=user_id,
        state=initial_session.get("state", {}),
        session_id=session_id,
    )

    if not artifact_service:
      artifact_service = InMemoryArtifactService()
    runner = Runner(
        app_name=app_name,
        agent=root_agent,
        artifact_service=artifact_service,
        session_service=session_service,
    )

    # Reset agent state for each query.
    if callable(reset_func):
      reset_func()

    responses = data.copy()

    for index, eval_entry in enumerate(responses):
      response = None
      query = eval_entry["query"]
      content = types.Content(role="user", parts=[types.Part(text=query)])
      turn_actual_tool_uses = []

      for event in runner.run(
          user_id=user_id, session_id=session_id, new_message=content
      ):
        if event.is_final_response() and event.content and event.content.parts:
          response = event.content.parts[0].text
        elif event.get_function_calls():
          for call in event.get_function_calls():
            turn_actual_tool_uses.append({
                EvalConstants.TOOL_NAME: call.name,
                EvalConstants.TOOL_INPUT: call.args,
            })

      responses[index]["actual_tool_use"] = turn_actual_tool_uses
      responses[index]["response"] = response

    return responses

  @staticmethod
  def _process_query_with_session(session_data, data):
    """Process the queries using the existing session data without invoking the runner."""
    responses = data.copy()

    # Iterate through the provided queries and align them with the session
    # events.
    for index, eval_entry in enumerate(responses):
      query = eval_entry["query"]
      actual_tool_uses = []
      response = None

      # Search for the corresponding session events.
      for event in session_data.events:
        # Match the query to a user event.
        if (
            event.author == "user"
            and event.content
            and event.content.parts
            and event.content.parts[0].text == query
        ):
          # Look for subsequent tool usage or model responses within the
          # same invocation.
          for subsequent_event in session_data.events:
            if subsequent_event.invocation_id != event.invocation_id:
              continue
            # Robustness fix: skip events with no content parts instead of
            # raising AttributeError on malformed/partial session data.
            if (
                not subsequent_event.content
                or not subsequent_event.content.parts
            ):
              continue
            # Extract tool usage.
            if subsequent_event.content.parts[0].function_call:
              call = subsequent_event.content.parts[0].function_call
              actual_tool_uses.append(
                  {"tool_name": call.name, "tool_input": call.args}
              )
            # Extract final response.
            elif subsequent_event.author != "user":
              response = subsequent_event.content.parts[0].text

      # Update the results for the current query.
      responses[index]["actual_tool_use"] = actual_tool_uses
      responses[index]["response"] = response
    return responses

  @staticmethod
  def before_tool_callback(tool, args, tool_context, eval_dataset):
    """Intercept specific tool calls and return predefined outputs

    from eval_dataset.
    """
    for index, eval_entry in enumerate(eval_dataset):
      # Consistency fix: use the EvalConstants key here, matching how the
      # same entries are read in _process_query_with_root_agent (which
      # builds the mocked-tool set from EvalConstants.EXPECTED_TOOL_USE).
      expected_tool_use = eval_entry.get(EvalConstants.EXPECTED_TOOL_USE, [])
      for expected in expected_tool_use:
        if (
            EvalConstants.MOCK_TOOL_OUTPUT in expected
            and tool.name == expected[EvalConstants.TOOL_NAME]
            and args == expected.get(EvalConstants.TOOL_INPUT, {})
        ):
          # Pop the matched entry so we don't rematch it again; returning
          # immediately keeps the pop-during-enumerate safe.
          eval_dataset.pop(index)
          return {"result": expected[EvalConstants.MOCK_TOOL_OUTPUT]}

    return None

  @staticmethod
  def apply_before_tool_callback(
      agent: BaseAgent,
      callback: BeforeToolCallback,
      all_mock_tools: set[str],
  ):
    """Recursively apply the before_tool_callback to the root agent and all its subagents."""
    # Only LLM agents carry tools.
    # NOTE(review): returning here also skips the sub-agents of non-LLM
    # agents (e.g. workflow agents) — confirm that is intended.
    if not isinstance(agent, (Agent, LlmAgent)):
      return

    # Check whether the agent has tools that are defined by the evalset.
    # We use the function name to check if tools match.
    for tool in agent.canonical_tools:
      if tool.name in all_mock_tools:
        agent.before_tool_callback = callback

    # Apply recursively to subagents if they exist.
    for sub_agent in agent.sub_agents:
      EvaluationGenerator.apply_before_tool_callback(
          sub_agent, callback, all_mock_tools
      )
|
@@ -0,0 +1,135 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from typing import Any
|
16
|
+
|
17
|
+
import pandas as pd
|
18
|
+
from tabulate import tabulate
|
19
|
+
from vertexai.preview.evaluation import EvalTask
|
20
|
+
from vertexai.preview.evaluation import MetricPromptTemplateExamples
|
21
|
+
|
22
|
+
|
23
|
+
class ResponseEvaluator:
  """Runs response evaluation for agents."""

  @staticmethod
  def evaluate(
      raw_eval_dataset: list[list[dict[str, Any]]],
      evaluation_criteria: list[str],
      *,
      print_detailed_results: bool = False
  ):
    r"""Returns the value of requested evaluation metrics.

    Args:
      raw_eval_dataset: The dataset that will be evaluated.
      evaluation_criteria: The evaluation criteria to be used. This method
        supports two criteria, `response_evaluation_score` and
        `response_match_score`.
      print_detailed_results: Prints detailed results on the console. This is
        usually helpful during debugging.

    A note on evaluation_criteria:
      `response_match_score`: This metric compares the agent's final natural
      language response with the expected final response, stored in the
      "reference" field in test/eval files. We use the Rouge metric to
      compare the two responses.

      Value Range: [0, 1]. A score closer to 0 means poor similarity between
      response and reference. A score closer to 1 means strong similarity
      between response and reference.

      `response_evaluation_score`: Uses LLM to evaluate coherence of the
      response, including tool use. This is a pointwise metric.

      Value range: [0, 5], where 0 means that the agent's response is not
      coherent, while 5 means it is. High values are good.

    A note on raw_eval_dataset:
      The dataset should be a list of sessions, where each session is
      represented as a list of interactions that need evaluation. Each
      evaluation is represented as a dictionary that is expected to have
      values for the following keys:

        1) query
        2) response
        3) actual_tool_use
        4) expected_tool_use
        5) reference

      Here is a sample eval_dataset value with one entry:
      [
        [
          {
            "query": "roll a die for me",
            "response": "I rolled a 16 sided die and got 13.\n",
            "expected_tool_use": [
              {
                "tool_name": "roll_die",
                "tool_input": {
                  "sides": 16
                }
              }
            ],
            "actual_tool_use": [
              {
                "tool_name": "roll_die",
                "tool_input": {
                  "sides": 16
                }
              }
            ],
            "reference": "I rolled a 16 sided die and got 13.\n"
          }
        ]
      ]

    Raises:
      ValueError: If the eval dataset is empty, or if none of the requested
        criteria apply to the fields present in the dataset.
    """
    if not raw_eval_dataset:
      raise ValueError("The evaluation dataset is empty.")

    metrics = ResponseEvaluator._get_metrics(
        raw_eval_dataset, evaluation_criteria
    )
    if not metrics:
      # Fail fast with a clear message instead of handing an empty metric
      # list to EvalTask, which would fail downstream less legibly.
      raise ValueError(
          "No applicable metrics; check evaluation_criteria against the"
          " fields present in the dataset."
      )
    # Sessions are flattened: each interaction becomes one DataFrame row.
    flattened_queries = [
        item for sublist in raw_eval_dataset for item in sublist
    ]
    # Rename to the column names EvalTask expects.
    eval_dataset = pd.DataFrame(flattened_queries).rename(
        columns={"query": "prompt", "expected_tool_use": "reference_trajectory"}
    )
    eval_task = EvalTask(dataset=eval_dataset, metrics=metrics)

    eval_result = eval_task.evaluate()
    if print_detailed_results:
      ResponseEvaluator._print_results(eval_result)
    return eval_result.summary_metrics

  @staticmethod
  def _get_metrics(raw_eval_dataset, criteria):
    """Maps requested criteria to concrete metrics.

    A criterion is only honored when the first dataset row carries the
    fields that metric needs; otherwise it is silently skipped.
    """
    metrics = []
    if (
        "response_evaluation_score" in criteria
        and "query" in raw_eval_dataset[0][0]
        and "expected_tool_use" in raw_eval_dataset[0][0]
    ):
      metrics.append(MetricPromptTemplateExamples.Pointwise.COHERENCE)
    if (
        "response_match_score" in criteria
        and "reference" in raw_eval_dataset[0][0]
    ):
      metrics.append("rouge_1")
    return metrics

  @staticmethod
  def _print_results(eval_result):
    """Prints summary metrics and the per-row metrics table."""
    print("Evaluation Summary Metrics:", eval_result.summary_metrics)
    print(tabulate(eval_result.metrics_table, headers="keys", tablefmt="grid"))
|
@@ -0,0 +1,184 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from typing import Any
|
16
|
+
|
17
|
+
import pandas as pd
|
18
|
+
from tabulate import tabulate
|
19
|
+
|
20
|
+
from .evaluation_constants import EvalConstants
|
21
|
+
|
22
|
+
|
23
|
+
class TrajectoryEvaluator:
  """Evaluates tool use trajectories for accuracy."""

  @staticmethod
  def evaluate(
      eval_dataset: list[list[dict[str, Any]]],
      *,
      print_detailed_results: bool = False,
  ):
    r"""Returns the mean tool use accuracy of the eval dataset.

    Tool use accuracy is calculated by comparing the expected and actual
    tool use trajectories. An exact match scores a 1, 0 otherwise. The
    final number is an average of these individual scores.

    Value range: [0, 1], where 0 means none of the tool use entries
    aligned, and 1 would mean all of them aligned. Higher value is good.

    Args:
      eval_dataset: The dataset that will be evaluated.
      print_detailed_results: Prints detailed results on the console. This is
        usually helpful during debugging.

    A note on eval_dataset:
      The dataset should be a list of sessions, where each session is
      represented as a list of interactions that need evaluation. Each
      evaluation is represented as a dictionary that is expected to have
      values for the following keys:
        1) query
        2) response
        3) actual_tool_use
        4) expected_tool_use

      Here is a sample eval_dataset value with one entry:

      [
        [
          {
            "query": "Roll a 16 sided dice for me",
            "response": "I rolled a 16 sided die and got 13.\n",
            "expected_tool_use": [
              {
                "tool_name": "roll_die",
                "tool_input": {
                  "sides": 16
                }
              }
            ],
            "actual_tool_use": [
              {
                "tool_name": "roll_die",
                "tool_input": {
                  "sides": 16
                }
              }
            ]
          }
        ]
      ]

    Raises:
      ValueError: If the eval dataset is empty.
    """
    if not eval_dataset:
      raise ValueError("The evaluation dataset is empty.")

    result_rows = []
    failures = []

    for conversation in eval_dataset:
      for index, row in enumerate(conversation):
        new_row, failure = TrajectoryEvaluator._evaluate_row(row)
        result_rows.append(new_row)
        if failure:
          failure["turn"] = index + 1
          failures.append(failure)

    # Build the frame once from the accumulated rows instead of calling
    # pd.concat per row, which copies the whole frame each iteration (O(n^2)).
    results_df = pd.DataFrame(
        result_rows,
        columns=[
            "query",
            "response",
            "actual_tool_use",
            "expected_tool_use",
            "tool_use_accuracy",
        ],
    )

    TrajectoryEvaluator._report_failures(failures)

    if print_detailed_results:
      TrajectoryEvaluator._print_results(results_df)

    return results_df["tool_use_accuracy"].mean()

  @staticmethod
  def _evaluate_row(row):
    """Scores one interaction; returns (results row, failure record or None)."""
    # We don't evaluate the mock tool outputs.
    expected = TrajectoryEvaluator._remove_tool_outputs(
        row["expected_tool_use"]
    )
    actual = row["actual_tool_use"]
    tool_use_accuracy = (
        1.0 if TrajectoryEvaluator.are_tools_equal(actual, expected) else 0.0
    )

    new_row = {
        "query": row["query"],
        "response": row["response"],
        "actual_tool_use": actual,
        "expected_tool_use": expected,
        "tool_use_accuracy": tool_use_accuracy,
    }
    failure = (
        None
        if tool_use_accuracy == 1.0
        else {"query": row["query"], "actual": actual, "expected": expected}
    )
    return new_row, failure

  @staticmethod
  def are_tools_equal(list_a_original, list_b_original):
    """Compares tool-use lists on tool_name and tool_input only."""
    # Strip every other key (e.g. mock outputs) before comparing.
    list_a = [
        {"tool_name": tool["tool_name"], "tool_input": tool["tool_input"]}
        for tool in list_a_original
    ]

    list_b = [
        {"tool_name": tool["tool_name"], "tool_input": tool["tool_input"]}
        for tool in list_b_original
    ]

    return list_a == list_b

  @staticmethod
  def _remove_tool_outputs(tool_use_list):
    """Removes 'mock_tool_output' from each dictionary in the list."""
    result = []
    for tool_use in tool_use_list:
      # Copy to avoid mutating the caller's eval data.
      new_tool_use = tool_use.copy()
      new_tool_use.pop(EvalConstants.MOCK_TOOL_OUTPUT, None)
      result.append(new_tool_use)
    return result

  @staticmethod
  def _report_failures(failures):
    """Prints each failed turn: query plus actual vs expected trajectory."""
    if failures:
      print("Failures:")
      for failure in failures:
        print(f"""{{
  "turn": {failure["turn"]},
  "query": '{failure["query"]}',
  "actual": {failure["actual"]},
  "expected_tool_use": {failure["expected"]},
}}
""")

  @staticmethod
  def _print_results(results_df):
    """Prints the full per-row results table."""
    print(tabulate(results_df, headers="keys", tablefmt="grid"))
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from .event import Event
|
16
|
+
from .event_actions import EventActions
|
17
|
+
|
18
|
+
# Public API of the events package: the names re-exported on
# `from google.adk.events import *`.
__all__ = [
    'Event',
    'EventActions',
]
|