eval-protocol 0.2.32__tar.gz → 0.2.34__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.32/eval_protocol.egg-info → eval_protocol-0.2.34}/PKG-INFO +1 -1
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/evaluation_test.py +10 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/handle_persist_flow.py +9 -2
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/plugin.py +36 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/remote_rollout_processor.py +3 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/utils.py +41 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/LICENSE +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/README.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/development/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol.egg-info/SOURCES.txt +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/pyproject.toml +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/setup.cfg +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/setup.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_config.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_format.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_length.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_math.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_models.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_server.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/versioneer.py +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vite-app/dist/assets/index-C8woq7EO.js +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vite-app/dist/assets/index-C8woq7EO.js.map +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vite-app/dist/assets/index-CSKGq1w7.css +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.32 → eval_protocol-0.2.34}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.32
|
|
3
|
+
Version: 0.2.34
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-09-
|
|
11
|
+
"date": "2025-09-30T15:39:15-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.32"
|
|
14
|
+
"full-revisionid": "c09755b30386c03c95bd79d7b142ed614419c7c4",
|
|
15
|
+
"version": "0.2.34"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -52,10 +52,12 @@ from eval_protocol.pytest.utils import (
|
|
|
52
52
|
add_cost_metrics,
|
|
53
53
|
log_eval_status_and_rows,
|
|
54
54
|
parse_ep_completion_params,
|
|
55
|
+
parse_ep_completion_params_overwrite,
|
|
55
56
|
parse_ep_max_concurrent_rollouts,
|
|
56
57
|
parse_ep_max_rows,
|
|
57
58
|
parse_ep_num_runs,
|
|
58
59
|
parse_ep_passed_threshold,
|
|
60
|
+
parse_ep_dataloaders,
|
|
59
61
|
rollout_processor_with_retry,
|
|
60
62
|
run_tasks_with_eval_progress,
|
|
61
63
|
run_tasks_with_run_progress,
|
|
@@ -189,10 +191,18 @@ def evaluation_test(
|
|
|
189
191
|
max_concurrent_rollouts = parse_ep_max_concurrent_rollouts(max_concurrent_rollouts)
|
|
190
192
|
max_dataset_rows = parse_ep_max_rows(max_dataset_rows)
|
|
191
193
|
completion_params = parse_ep_completion_params(completion_params)
|
|
194
|
+
completion_params = parse_ep_completion_params_overwrite(completion_params)
|
|
192
195
|
original_completion_params = completion_params
|
|
193
196
|
passed_threshold = parse_ep_passed_threshold(passed_threshold)
|
|
197
|
+
data_loaders = parse_ep_dataloaders(data_loaders)
|
|
194
198
|
custom_invocation_id = os.environ.get("EP_INVOCATION_ID", None)
|
|
195
199
|
|
|
200
|
+
# ignore other data input params when dataloader is provided
|
|
201
|
+
if data_loaders:
|
|
202
|
+
input_dataset = None
|
|
203
|
+
input_messages = None
|
|
204
|
+
input_rows = None
|
|
205
|
+
|
|
196
206
|
def decorator(
|
|
197
207
|
test_func: TestFunction,
|
|
198
208
|
) -> TestFunction:
|
|
@@ -16,9 +16,10 @@ import requests
|
|
|
16
16
|
def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name: str):
|
|
17
17
|
try:
|
|
18
18
|
# Default is to save and upload experiment JSONL files, unless explicitly disabled
|
|
19
|
-
|
|
19
|
+
custom_output_dir = os.getenv("EP_OUTPUT_DIR")
|
|
20
|
+
should_save = os.getenv("EP_NO_UPLOAD") != "1" or custom_output_dir is not None
|
|
20
21
|
|
|
21
|
-
if
|
|
22
|
+
if should_save:
|
|
22
23
|
current_run_rows = [item for sublist in all_results for item in sublist]
|
|
23
24
|
if current_run_rows:
|
|
24
25
|
experiments: dict[str, list[EvaluationRow]] = defaultdict(list)
|
|
@@ -27,6 +28,8 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
|
|
|
27
28
|
experiments[row.execution_metadata.experiment_id].append(row)
|
|
28
29
|
|
|
29
30
|
eval_protocol_dir = find_eval_protocol_dir()
|
|
31
|
+
if custom_output_dir:
|
|
32
|
+
eval_protocol_dir = custom_output_dir
|
|
30
33
|
exp_dir = pathlib.Path(eval_protocol_dir) / "experiment_results"
|
|
31
34
|
exp_dir.mkdir(parents=True, exist_ok=True)
|
|
32
35
|
|
|
@@ -81,6 +84,10 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
|
|
|
81
84
|
json.dump(row_data, f, ensure_ascii=False)
|
|
82
85
|
f.write("\n")
|
|
83
86
|
|
|
87
|
+
should_upload = os.getenv("EP_NO_UPLOAD") != "1"
|
|
88
|
+
if not should_upload:
|
|
89
|
+
continue
|
|
90
|
+
|
|
84
91
|
def get_auth_value(key: str) -> str | None:
|
|
85
92
|
"""Get auth value from config file or environment."""
|
|
86
93
|
try:
|
|
@@ -19,6 +19,7 @@ import json
|
|
|
19
19
|
import pathlib
|
|
20
20
|
import sys
|
|
21
21
|
from pytest import StashKey
|
|
22
|
+
import pytest
|
|
22
23
|
|
|
23
24
|
|
|
24
25
|
def pytest_addoption(parser) -> None:
|
|
@@ -56,6 +57,7 @@ def pytest_addoption(parser) -> None:
|
|
|
56
57
|
default=None,
|
|
57
58
|
help=("Write a JSON summary artifact at the given path (e.g., ./outputs/aime_low.json)."),
|
|
58
59
|
)
|
|
60
|
+
# deprecate this later
|
|
59
61
|
group.addoption(
|
|
60
62
|
"--ep-input-param",
|
|
61
63
|
action="append",
|
|
@@ -115,6 +117,27 @@ def pytest_addoption(parser) -> None:
|
|
|
115
117
|
"Default: false (experiment JSONs are saved and uploaded by default)."
|
|
116
118
|
),
|
|
117
119
|
)
|
|
120
|
+
group.addoption(
|
|
121
|
+
"--ep-jsonl-path",
|
|
122
|
+
default=None,
|
|
123
|
+
help=("Load input from a jsonl file that is already in EvaluationRow or openai CHAT format"),
|
|
124
|
+
)
|
|
125
|
+
group.addoption(
|
|
126
|
+
"--ep-completion-params",
|
|
127
|
+
default=[],
|
|
128
|
+
action="append",
|
|
129
|
+
help=("Overwrite completion params with json. Can be used multiple times. "),
|
|
130
|
+
)
|
|
131
|
+
group.addoption(
|
|
132
|
+
"--ep-remote-rollout-processor-base-url",
|
|
133
|
+
default=None,
|
|
134
|
+
help=("If set, use this base URL for remote rollout processing. Example: http://localhost:8000"),
|
|
135
|
+
)
|
|
136
|
+
group.addoption(
|
|
137
|
+
"--ep-output-dir",
|
|
138
|
+
default=None,
|
|
139
|
+
help=("If set, save evaluation results to this directory in jsonl format."),
|
|
140
|
+
)
|
|
118
141
|
|
|
119
142
|
|
|
120
143
|
def _normalize_max_rows(val: Optional[str]) -> Optional[str]:
|
|
@@ -240,9 +263,22 @@ def pytest_configure(config) -> None:
|
|
|
240
263
|
if threshold_env is not None:
|
|
241
264
|
os.environ["EP_PASSED_THRESHOLD"] = threshold_env
|
|
242
265
|
|
|
266
|
+
if config.getoption("--ep-output-dir"):
|
|
267
|
+
# set this to save eval results to the target dir in jsonl format
|
|
268
|
+
os.environ["EP_OUTPUT_DIR"] = config.getoption("--ep-output-dir")
|
|
269
|
+
|
|
243
270
|
if config.getoption("--ep-no-upload"):
|
|
244
271
|
os.environ["EP_NO_UPLOAD"] = "1"
|
|
245
272
|
|
|
273
|
+
if config.getoption("--ep-jsonl-path"):
|
|
274
|
+
os.environ["EP_JSONL_PATH"] = config.getoption("--ep-jsonl-path")
|
|
275
|
+
|
|
276
|
+
if config.getoption("--ep-completion-params"):
|
|
277
|
+
# redump to json to make sure they are legit
|
|
278
|
+
os.environ["EP_COMPLETION_PARAMS"] = json.dumps(
|
|
279
|
+
[json.loads(s) for s in config.getoption("--ep-completion-params") or []]
|
|
280
|
+
)
|
|
281
|
+
|
|
246
282
|
# Allow ad-hoc overrides of input params via CLI flags
|
|
247
283
|
try:
|
|
248
284
|
merged: dict = {}
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/remote_rollout_processor.py
RENAMED
|
@@ -9,6 +9,7 @@ from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
|
|
|
9
9
|
from eval_protocol.types.remote_rollout_processor import InitRequest, RolloutMetadata
|
|
10
10
|
from .rollout_processor import RolloutProcessor
|
|
11
11
|
from .types import RolloutProcessorConfig
|
|
12
|
+
import os
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
class RemoteRolloutProcessor(RolloutProcessor):
|
|
@@ -31,6 +32,8 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
31
32
|
# config.kwargs at call time for backward compatibility.
|
|
32
33
|
self._remote_base_url = remote_base_url
|
|
33
34
|
self._model_base_url = model_base_url
|
|
35
|
+
if os.getenv("EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL"):
|
|
36
|
+
self._remote_base_url = os.getenv("EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL")
|
|
34
37
|
self._poll_interval = poll_interval
|
|
35
38
|
self._timeout_seconds = timeout_seconds
|
|
36
39
|
self._output_data_loader = output_data_loader
|
|
@@ -19,6 +19,8 @@ from eval_protocol.models import (
|
|
|
19
19
|
EvaluationThresholdDict,
|
|
20
20
|
Status,
|
|
21
21
|
)
|
|
22
|
+
from eval_protocol.data_loader import DynamicDataLoader
|
|
23
|
+
from eval_protocol.data_loader.models import EvaluationDataLoader
|
|
22
24
|
from eval_protocol.pytest.rollout_processor import RolloutProcessor
|
|
23
25
|
from eval_protocol.pytest.types import (
|
|
24
26
|
RolloutProcessorConfig,
|
|
@@ -239,6 +241,45 @@ def parse_ep_completion_params(
|
|
|
239
241
|
return completion_params
|
|
240
242
|
|
|
241
243
|
|
|
244
|
+
def parse_ep_completion_params_overwrite(
|
|
245
|
+
completion_params: Sequence[CompletionParams | None] | None,
|
|
246
|
+
) -> Sequence[CompletionParams | None]:
|
|
247
|
+
new_completion_params = os.getenv("EP_COMPLETION_PARAMS")
|
|
248
|
+
if new_completion_params:
|
|
249
|
+
try:
|
|
250
|
+
new_completion_params_list = json.loads(new_completion_params)
|
|
251
|
+
if isinstance(new_completion_params_list, list):
|
|
252
|
+
return new_completion_params_list
|
|
253
|
+
except Exception:
|
|
254
|
+
pass
|
|
255
|
+
return completion_params or []
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _rows_from_jsonl(path: str) -> list[EvaluationRow]:
|
|
259
|
+
rows = []
|
|
260
|
+
try:
|
|
261
|
+
with open(path, "r", encoding="utf-8") as f:
|
|
262
|
+
for line in f:
|
|
263
|
+
rows.append(EvaluationRow(**json.loads(line)))
|
|
264
|
+
except Exception as e:
|
|
265
|
+
print(f"❌ Failed to load rows from JSONL at {path}: {e}")
|
|
266
|
+
return []
|
|
267
|
+
|
|
268
|
+
return rows
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def parse_ep_dataloaders(
|
|
272
|
+
dataloaders: Sequence[EvaluationDataLoader] | EvaluationDataLoader | None,
|
|
273
|
+
) -> Sequence[EvaluationDataLoader] | EvaluationDataLoader | None:
|
|
274
|
+
try:
|
|
275
|
+
load_from_jsonl_path = os.getenv("EP_JSONL_PATH")
|
|
276
|
+
if load_from_jsonl_path:
|
|
277
|
+
return DynamicDataLoader(generators=[lambda path=load_from_jsonl_path: _rows_from_jsonl(path)])
|
|
278
|
+
except Exception:
|
|
279
|
+
pass
|
|
280
|
+
return dataloaders or None
|
|
281
|
+
|
|
282
|
+
|
|
242
283
|
def parse_ep_passed_threshold(
|
|
243
284
|
default_value: float | EvaluationThresholdDict | EvaluationThreshold | None,
|
|
244
285
|
) -> EvaluationThreshold | None:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.32
|
|
3
|
+
Version: 0.2.34
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/bfcl_envs/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/bfcl_envs/math_api.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/bfcl_envs/posting_api.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/bfcl_sim_api_resource.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/docker_resource.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/filesystem_resource.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/agent/resources/python_state_resource.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/benchmarks/data/airline_dataset.jsonl
RENAMED
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/benchmarks/data/retail_dataset.jsonl
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/benchmarks/test_tau_bench_airline.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/benchmarks/test_tau_bench_retail.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/data_loader/dynamic_data_loader.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/data_loader/factory_data_loader.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/data_loader/inline_data_loader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/dataset_logger/dataset_logger.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/event_bus/sqlite_event_bus_database.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_agent/orchestration/__init__.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_agent/orchestration/base_client.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/default_dataset_adapter.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.32 → eval_protocol-0.2.34}/eval_protocol/pytest/evaluation_test_postprocess.py
RENAMED
|
File without changes
|