eval-protocol 0.2.35__tar.gz → 0.2.35.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.35/eval_protocol.egg-info → eval_protocol-0.2.35.dev2}/PKG-INFO +8 -16
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/__init__.py +9 -18
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/adapters/huggingface.py +2 -10
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/cli.py +42 -7
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -4
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/cli_commands/run_eval_cmd.py +2 -1
- eval_protocol-0.2.35.dev2/eval_protocol/cli_commands/upload.py +522 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/execution/pipeline.py +8 -1
- eval_protocol-0.2.35.dev2/eval_protocol/logging/elasticsearch_client.py +286 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/logging/elasticsearch_direct_http_handler.py +58 -20
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/logging/elasticsearch_index_manager.py +47 -66
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/execution/manager.py +7 -3
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/elasticsearch_setup.py +8 -8
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/remote_rollout_processor.py +9 -3
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/utils.py +12 -16
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/quickstart/utils.py +2 -7
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/types/remote_rollout_processor.py +9 -2
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2/eval_protocol.egg-info}/PKG-INFO +8 -16
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol.egg-info/SOURCES.txt +2 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol.egg-info/requires.txt +7 -16
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/pyproject.toml +7 -18
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_evaluation_postprocess.py +27 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/LICENSE +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/README.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/development/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/setup.cfg +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/setup.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_config.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_format.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_length.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_math.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_models.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_server.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/versioneer.py +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vite-app/dist/assets/index-C8woq7EO.js +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vite-app/dist/assets/index-C8woq7EO.js.map +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vite-app/dist/assets/index-CSKGq1w7.css +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.35
|
|
3
|
+
Version: 0.2.35.dev2
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -20,30 +20,25 @@ Requires-Dist: aiosqlite
|
|
|
20
20
|
Requires-Dist: aiohttp
|
|
21
21
|
Requires-Dist: mcp>=1.9.2
|
|
22
22
|
Requires-Dist: PyYAML>=5.0
|
|
23
|
-
Requires-Dist: datasets>=3.0.0
|
|
24
|
-
Requires-Dist: fsspec
|
|
25
23
|
Requires-Dist: hydra-core>=1.3.2
|
|
26
24
|
Requires-Dist: omegaconf>=2.3.0
|
|
27
|
-
Requires-Dist: gymnasium>=0.29.0
|
|
28
25
|
Requires-Dist: httpx>=0.24.0
|
|
29
26
|
Requires-Dist: anthropic>=0.59.0
|
|
30
|
-
Requires-Dist:
|
|
31
|
-
Requires-Dist:
|
|
27
|
+
Requires-Dist: litellm<1.75.0
|
|
28
|
+
Requires-Dist: pytest>=6.0.0
|
|
29
|
+
Requires-Dist: pytest-asyncio>=0.21.0
|
|
30
|
+
Requires-Dist: peewee>=3.18.2
|
|
31
|
+
Requires-Dist: backoff>=2.2.0
|
|
32
|
+
Requires-Dist: questionary>=2.0.0
|
|
32
33
|
Requires-Dist: toml>=0.10.0
|
|
33
34
|
Requires-Dist: loguru>=0.6.0
|
|
34
35
|
Requires-Dist: docstring-parser>=0.15
|
|
35
36
|
Requires-Dist: rich>=12.0.0
|
|
36
37
|
Requires-Dist: psutil>=5.8.0
|
|
37
|
-
Requires-Dist: litellm>=1.0.0
|
|
38
38
|
Requires-Dist: addict>=2.4.0
|
|
39
39
|
Requires-Dist: deepdiff>=6.0.0
|
|
40
|
-
Requires-Dist: pandas>=1.5.0
|
|
41
40
|
Requires-Dist: websockets>=15.0.1
|
|
42
41
|
Requires-Dist: fastapi>=0.116.1
|
|
43
|
-
Requires-Dist: pytest>=6.0.0
|
|
44
|
-
Requires-Dist: pytest-asyncio>=0.21.0
|
|
45
|
-
Requires-Dist: peewee>=3.18.2
|
|
46
|
-
Requires-Dist: backoff>=2.2.0
|
|
47
42
|
Provides-Extra: dev
|
|
48
43
|
Requires-Dist: build; extra == "dev"
|
|
49
44
|
Requires-Dist: twine; extra == "dev"
|
|
@@ -51,6 +46,7 @@ Requires-Dist: pytest-httpserver; extra == "dev"
|
|
|
51
46
|
Requires-Dist: werkzeug>=2.0.0; extra == "dev"
|
|
52
47
|
Requires-Dist: ruff>=0.5.0; extra == "dev"
|
|
53
48
|
Requires-Dist: transformers>=4.0.0; extra == "dev"
|
|
49
|
+
Requires-Dist: pandas>=1.5.0; extra == "dev"
|
|
54
50
|
Requires-Dist: types-setuptools; extra == "dev"
|
|
55
51
|
Requires-Dist: types-requests; extra == "dev"
|
|
56
52
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -86,10 +82,6 @@ Requires-Dist: langfuse>=2.0.0; extra == "langfuse"
|
|
|
86
82
|
Provides-Extra: huggingface
|
|
87
83
|
Requires-Dist: datasets>=3.0.0; extra == "huggingface"
|
|
88
84
|
Requires-Dist: transformers>=4.0.0; extra == "huggingface"
|
|
89
|
-
Provides-Extra: adapters
|
|
90
|
-
Requires-Dist: langfuse>=2.0.0; extra == "adapters"
|
|
91
|
-
Requires-Dist: datasets>=3.0.0; extra == "adapters"
|
|
92
|
-
Requires-Dist: transformers>=4.0.0; extra == "adapters"
|
|
93
85
|
Provides-Extra: langsmith
|
|
94
86
|
Requires-Dist: langsmith>=0.1.86; extra == "langsmith"
|
|
95
87
|
Provides-Extra: bigquery
|
|
@@ -23,17 +23,8 @@ from .mcp_env import (
|
|
|
23
23
|
test_mcp,
|
|
24
24
|
)
|
|
25
25
|
from .data_loader import DynamicDataLoader, InlineDataLoader
|
|
26
|
-
|
|
27
|
-
# Try to import FireworksPolicy if available
|
|
28
|
-
try:
|
|
29
|
-
from .mcp_env import FireworksPolicy
|
|
30
|
-
|
|
31
|
-
_FIREWORKS_AVAILABLE = True
|
|
32
|
-
except (ImportError, AttributeError):
|
|
33
|
-
_FIREWORKS_AVAILABLE = False
|
|
34
|
-
# Import submodules to make them available via eval_protocol.rewards, etc.
|
|
35
26
|
from . import mcp, rewards
|
|
36
|
-
from .models import EvaluateResult, Message, MetricResult, EvaluationRow, InputMetadata
|
|
27
|
+
from .models import EvaluateResult, Message, MetricResult, EvaluationRow, InputMetadata, Status
|
|
37
28
|
from .playback_policy import PlaybackPolicyBase
|
|
38
29
|
from .resources import create_llm_resource
|
|
39
30
|
from .reward_function import RewardFunction
|
|
@@ -42,6 +33,13 @@ from .quickstart import aha_judge, multi_turn_assistant_to_ground_truth, assista
|
|
|
42
33
|
from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor
|
|
43
34
|
from .pytest.parameterize import DefaultParameterIdGenerator
|
|
44
35
|
|
|
36
|
+
from .types.remote_rollout_processor import (
|
|
37
|
+
InitRequest,
|
|
38
|
+
RolloutMetadata,
|
|
39
|
+
StatusResponse,
|
|
40
|
+
create_langfuse_config_tags,
|
|
41
|
+
)
|
|
42
|
+
|
|
45
43
|
try:
|
|
46
44
|
from .adapters import OpenAIResponsesAdapter
|
|
47
45
|
except ImportError:
|
|
@@ -62,17 +60,10 @@ try:
|
|
|
62
60
|
except ImportError:
|
|
63
61
|
LangSmithAdapter = None
|
|
64
62
|
|
|
65
|
-
# Remote server types
|
|
66
|
-
from .types.remote_rollout_processor import (
|
|
67
|
-
InitRequest,
|
|
68
|
-
RolloutMetadata,
|
|
69
|
-
StatusResponse,
|
|
70
|
-
create_langfuse_config_tags,
|
|
71
|
-
)
|
|
72
|
-
|
|
73
63
|
warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
|
|
74
64
|
|
|
75
65
|
__all__ = [
|
|
66
|
+
"Status",
|
|
76
67
|
"RemoteRolloutProcessor",
|
|
77
68
|
"InputMetadata",
|
|
78
69
|
"EvaluationRow",
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-10-
|
|
11
|
+
"date": "2025-10-02T12:04:07-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.35"
|
|
14
|
+
"full-revisionid": "52178b3b90bb27a7f53fcbbba0bfbb50e7ebb416",
|
|
15
|
+
"version": "0.2.35-dev2"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -13,12 +13,9 @@ from .base import BaseAdapter
|
|
|
13
13
|
logger = logging.getLogger(__name__)
|
|
14
14
|
|
|
15
15
|
try:
|
|
16
|
-
from datasets import Dataset, DatasetDict, load_dataset
|
|
17
|
-
|
|
18
|
-
DATASETS_AVAILABLE = True
|
|
16
|
+
from datasets import Dataset, DatasetDict, load_dataset # pyright: ignore[reportAttributeAccessIssue]
|
|
19
17
|
except ImportError:
|
|
20
|
-
|
|
21
|
-
logger.warning("HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'")
|
|
18
|
+
raise ImportError("HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'")
|
|
22
19
|
|
|
23
20
|
# Type alias for transformation function
|
|
24
21
|
TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
|
|
@@ -80,11 +77,6 @@ class HuggingFaceAdapter(BaseAdapter):
|
|
|
80
77
|
revision: Optional dataset revision/commit hash
|
|
81
78
|
**load_dataset_kwargs: Additional arguments to pass to load_dataset
|
|
82
79
|
"""
|
|
83
|
-
if not DATASETS_AVAILABLE:
|
|
84
|
-
raise ImportError(
|
|
85
|
-
"HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'"
|
|
86
|
-
)
|
|
87
|
-
|
|
88
80
|
self.dataset_id = dataset_id
|
|
89
81
|
self.transform_fn = transform_fn
|
|
90
82
|
self.config_name = config_name
|
|
@@ -15,19 +15,14 @@ from pathlib import Path
|
|
|
15
15
|
logger = logging.getLogger(__name__)
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
from eval_protocol.evaluation import create_evaluation, preview_evaluation
|
|
19
|
-
|
|
20
18
|
from .cli_commands.agent_eval_cmd import agent_eval_command
|
|
21
|
-
from .cli_commands.common import
|
|
22
|
-
check_agent_environment,
|
|
23
|
-
check_environment,
|
|
24
|
-
setup_logging,
|
|
25
|
-
)
|
|
19
|
+
from .cli_commands.common import setup_logging
|
|
26
20
|
from .cli_commands.deploy import deploy_command
|
|
27
21
|
from .cli_commands.deploy_mcp import deploy_mcp_command
|
|
28
22
|
from .cli_commands.logs import logs_command
|
|
29
23
|
from .cli_commands.preview import preview_command
|
|
30
24
|
from .cli_commands.run_eval_cmd import hydra_cli_entry_point
|
|
25
|
+
from .cli_commands.upload import upload_command
|
|
31
26
|
|
|
32
27
|
|
|
33
28
|
def parse_args(args=None):
|
|
@@ -291,6 +286,44 @@ def parse_args(args=None):
|
|
|
291
286
|
logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
|
|
292
287
|
logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
|
|
293
288
|
|
|
289
|
+
# Upload command
|
|
290
|
+
upload_parser = subparsers.add_parser(
|
|
291
|
+
"upload",
|
|
292
|
+
help="Scan for evaluation tests, select, and upload as Fireworks evaluators",
|
|
293
|
+
)
|
|
294
|
+
upload_parser.add_argument(
|
|
295
|
+
"--path",
|
|
296
|
+
default=".",
|
|
297
|
+
help="Path to search for evaluation tests (default: current directory)",
|
|
298
|
+
)
|
|
299
|
+
upload_parser.add_argument(
|
|
300
|
+
"--entry",
|
|
301
|
+
help="Entrypoint of evaluation test to upload (module:function or path::function). For multiple, separate by commas.",
|
|
302
|
+
)
|
|
303
|
+
upload_parser.add_argument(
|
|
304
|
+
"--id",
|
|
305
|
+
help="Evaluator ID to use (if multiple selections, a numeric suffix is appended)",
|
|
306
|
+
)
|
|
307
|
+
upload_parser.add_argument(
|
|
308
|
+
"--display-name",
|
|
309
|
+
help="Display name for evaluator (defaults to ID)",
|
|
310
|
+
)
|
|
311
|
+
upload_parser.add_argument(
|
|
312
|
+
"--description",
|
|
313
|
+
help="Description for evaluator",
|
|
314
|
+
)
|
|
315
|
+
upload_parser.add_argument(
|
|
316
|
+
"--force",
|
|
317
|
+
action="store_true",
|
|
318
|
+
help="Overwrite existing evaluator with the same ID",
|
|
319
|
+
)
|
|
320
|
+
upload_parser.add_argument(
|
|
321
|
+
"--yes",
|
|
322
|
+
"-y",
|
|
323
|
+
action="store_true",
|
|
324
|
+
help="Non-interactive: upload all discovered evaluation tests",
|
|
325
|
+
)
|
|
326
|
+
|
|
294
327
|
# Run command (for Hydra-based evaluations)
|
|
295
328
|
# This subparser intentionally defines no arguments itself.
|
|
296
329
|
# All arguments after 'run' will be passed to Hydra by parse_known_args.
|
|
@@ -346,6 +379,8 @@ def main():
|
|
|
346
379
|
return agent_eval_command(args)
|
|
347
380
|
elif args.command == "logs":
|
|
348
381
|
return logs_command(args)
|
|
382
|
+
elif args.command == "upload":
|
|
383
|
+
return upload_command(args)
|
|
349
384
|
elif args.command == "run":
|
|
350
385
|
# For the 'run' command, Hydra takes over argument parsing.
|
|
351
386
|
|
{eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/cli_commands/agent_eval_cmd.py
RENAMED
|
@@ -27,11 +27,7 @@ import logging # For logger instance
|
|
|
27
27
|
import os # For environment variables
|
|
28
28
|
from pathlib import Path
|
|
29
29
|
|
|
30
|
-
from pydantic import ValidationError
|
|
31
|
-
|
|
32
|
-
from eval_protocol.agent import Orchestrator
|
|
33
30
|
from eval_protocol.agent.task_manager import TaskManager
|
|
34
|
-
from eval_protocol.models import TaskDefinitionModel # Import the new Pydantic model
|
|
35
31
|
|
|
36
32
|
# setup_logging is already called in cli.py's main, but good for standalone use if any
|
|
37
33
|
# from .common import setup_logging
|
{eval_protocol-0.2.35 → eval_protocol-0.2.35.dev2}/eval_protocol/cli_commands/run_eval_cmd.py
RENAMED
|
@@ -17,7 +17,6 @@ from omegaconf import ( # Ensure MISSING is imported if used in configs
|
|
|
17
17
|
OmegaConf,
|
|
18
18
|
)
|
|
19
19
|
|
|
20
|
-
from eval_protocol.execution.pipeline import EvaluationPipeline
|
|
21
20
|
|
|
22
21
|
logger = logging.getLogger(__name__)
|
|
23
22
|
|
|
@@ -26,6 +25,8 @@ def run_evaluation_command_logic(cfg: DictConfig) -> None:
|
|
|
26
25
|
"""
|
|
27
26
|
Main logic for the 'run-evaluation' command.
|
|
28
27
|
"""
|
|
28
|
+
from eval_protocol.execution.pipeline import EvaluationPipeline
|
|
29
|
+
|
|
29
30
|
logger.info("Starting 'run-evaluation' command with resolved Hydra config.")
|
|
30
31
|
|
|
31
32
|
# Make Hydra's runtime output directory available to the pipeline if needed
|