eval-protocol 0.2.34__tar.gz → 0.2.35__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.34/eval_protocol.egg-info → eval_protocol-0.2.35}/PKG-INFO +1 -1
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/_version.py +3 -3
- eval_protocol-0.2.35/eval_protocol/logging/elasticsearch_direct_http_handler.py +91 -0
- eval_protocol-0.2.35/eval_protocol/logging/elasticsearch_index_manager.py +187 -0
- eval_protocol-0.2.35/eval_protocol/pytest/elasticsearch_setup.py +167 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/evaluation_test.py +2 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/remote_rollout_processor.py +40 -2
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/rollout_processor.py +4 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/types/remote_rollout_processor.py +11 -0
- eval_protocol-0.2.35/eval_protocol/utils/subprocess_utils.py +118 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol.egg-info/SOURCES.txt +4 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/LICENSE +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/README.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/development/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/pyproject.toml +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/setup.cfg +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/setup.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_config.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_format.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_length.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_math.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_models.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_server.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/versioneer.py +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vite-app/dist/assets/index-C8woq7EO.js +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vite-app/dist/assets/index-C8woq7EO.js.map +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vite-app/dist/assets/index-CSKGq1w7.css +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.34
|
|
3
|
+
Version: 0.2.35
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-
|
|
11
|
+
"date": "2025-10-01T13:28:59-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "43ea8eaa8329931e6a9e61aa23a7aeca359f1d1c",
|
|
15
|
+
"version": "0.2.35"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import asyncio
|
|
4
|
+
import threading
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
6
|
+
from typing import Optional, Tuple, Any, Dict
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from urllib.parse import urlparse
|
|
9
|
+
import requests
|
|
10
|
+
|
|
11
|
+
from eval_protocol.types.remote_rollout_processor import ElasticSearchConfig
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ElasticsearchDirectHttpHandler(logging.Handler):
    """Logging handler that ships every record to Elasticsearch over HTTP.

    Each record is converted into a small JSON document and POSTed to
    ``{url}/{index_name}/_doc``. The request runs on a lazily created
    single-worker thread pool so that ``emit`` does not wait for the network.
    """

    # Upper bound, in seconds, for one indexing request. Without a timeout a
    # stalled connection would block the single worker thread forever and make
    # ``close()`` (which waits for the pool) hang as well.
    request_timeout: float = 10.0

    def __init__(self, elasticsearch_config: ElasticSearchConfig) -> None:
        """Store connection settings taken from ``elasticsearch_config``.

        Args:
            elasticsearch_config: Object providing ``url``, ``index_name`` and
                ``api_key`` for the target Elasticsearch instance.
        """
        super().__init__()
        self.base_url: str = elasticsearch_config.url.rstrip("/")
        self.index_name: str = elasticsearch_config.index_name
        self.api_key: str = elasticsearch_config.api_key
        self.url: str = f"{self.base_url}/{self.index_name}/_doc"
        self.formatter: logging.Formatter = logging.Formatter()
        # Created on first use so that handlers which never emit start no thread.
        self._executor = None

        # Certificates are only verified when the endpoint is HTTPS.
        parsed_url = urlparse(elasticsearch_config.url)
        self.verify_ssl = parsed_url.scheme == "https"

    def emit(self, record: logging.LogRecord) -> None:
        """Build the document for ``record`` and queue it for transmission.

        Any failure while preparing or queueing the document is reported via
        ``handleError`` and printed; it is never raised to the logging caller.
        """
        try:
            # Local import: the module only imports ``datetime`` itself.
            from datetime import timezone

            # The trailing "Z" denotes UTC, so the epoch value must be converted
            # in UTC rather than in the machine's local time zone.
            timestamp = datetime.fromtimestamp(record.created, tz=timezone.utc).strftime(
                "%Y-%m-%dT%H:%M:%S.%fZ"
            )

            data: Dict[str, Any] = {
                "@timestamp": timestamp,
                "level": record.levelname,
                "message": record.getMessage(),
                "logger_name": record.name,
                # Add other relevant record attributes if needed
            }

            # Hand the HTTP request over to the background worker.
            self._schedule_async_send(data, record)
        except Exception as e:
            self.handleError(record)
            print(f"Error preparing log for Elasticsearch: {e}")

    def _schedule_async_send(self, data: Dict[str, Any], record: logging.LogRecord) -> None:
        """Submit ``data`` to the worker pool, creating the pool on first use."""
        if self._executor is None:
            self._executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="elasticsearch-logger")

        future = self._executor.submit(self._send_to_elasticsearch, data, record)

        # Failures inside the worker surface through the future; report them there.
        future.add_done_callback(lambda f: self._handle_async_result(f, record))

    def _send_to_elasticsearch(self, data: Dict[str, Any], record: logging.LogRecord) -> None:
        """POST ``data`` to Elasticsearch (runs in the thread pool).

        Raises:
            Exception: Whatever ``requests`` raises, including an HTTP error
                status; it is picked up by ``_handle_async_result``.
        """
        response: requests.Response = requests.post(
            self.url,
            headers={"Content-Type": "application/json", "Authorization": f"ApiKey {self.api_key}"},
            data=json.dumps(data),
            verify=self.verify_ssl,  # If using HTTPS, verify SSL certificate
            timeout=self.request_timeout,
        )
        response.raise_for_status()  # Raise an exception for HTTP errors

    def _handle_async_result(self, future, record: logging.LogRecord) -> None:
        """Report a failed send; successful sends need no further action."""
        try:
            future.result()  # Re-raises any exception from the worker
        except Exception as e:
            self.handleError(record)
            # Printed instead of logged so a failing handler cannot trigger a
            # logging loop through itself.
            print(f"Error sending log to Elasticsearch: {e}")
            response = getattr(e, "response", None)
            if response is not None:
                print(f"Response content: {response.text}")

    def close(self) -> None:
        """Close the handler and wait for queued requests to finish."""
        super().close()
        if self._executor:
            self._executor.shutdown(wait=True)
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
from typing import Dict, Any, Optional
|
|
3
|
+
from urllib.parse import urlparse
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ElasticsearchIndexManager:
    """Manages Elasticsearch index creation and mapping configuration."""

    # Seconds to wait for any single HTTP call. Without a timeout an
    # unresponsive server would block the caller indefinitely.
    request_timeout: float = 30.0

    def __init__(self, base_url: str, index_name: str, api_key: str) -> None:
        """Initialize the Elasticsearch index manager.

        Args:
            base_url: Elasticsearch base URL (e.g., "https://localhost:9200")
            index_name: Name of the index to manage
            api_key: API key for authentication
        """
        self.base_url: str = base_url.rstrip("/")
        self.index_name: str = index_name
        self.api_key: str = api_key
        self.index_url: str = f"{self.base_url}/{self.index_name}"
        # Remembers a successful create/verify so repeated calls skip the network.
        self._mapping_created: bool = False

        # Certificates are only verified when the endpoint is HTTPS.
        parsed_url = urlparse(base_url)
        self.verify_ssl = parsed_url.scheme == "https"

    def _auth_headers(self) -> Dict[str, str]:
        """Return the API-key authorization header used by every request."""
        return {"Authorization": f"ApiKey {self.api_key}"}

    def create_logging_index_mapping(self) -> bool:
        """Create index with proper mapping for logging data.

        An existing index whose ``@timestamp`` mapping cannot be confirmed as a
        date field is deleted and recreated.

        Returns:
            bool: True if mapping was created successfully, False otherwise.
        """
        if self._mapping_created:
            return True

        try:
            # Nothing to do when the index is already set up correctly.
            if self._index_exists_with_correct_mapping():
                self._mapping_created = True
                return True

            # NOTE(review): a failed mapping lookup is indistinguishable from a
            # wrong mapping here, so a transient error on the lookup also leads
            # to the index being deleted below.
            if self.index_exists():
                print(f"Warning: Index {self.index_name} exists with incorrect mapping. Deleting and recreating...")
                if not self.delete_index():
                    print(f"Warning: Failed to delete existing index {self.index_name}")
                    return False

            # Create index with proper mapping
            response = requests.put(
                self.index_url,
                headers={"Content-Type": "application/json", **self._auth_headers()},
                json=self._get_logging_mapping(),
                verify=self.verify_ssl,
                timeout=self.request_timeout,
            )

            if response.status_code in [200, 201]:
                self._mapping_created = True
                return True

            print(f"Warning: Failed to create index mapping: {response.status_code} - {response.text}")
            return False

        except Exception as e:
            print(f"Warning: Failed to create index mapping: {e}")
            return False

    def _index_exists_with_correct_mapping(self) -> bool:
        """Check if index exists and has the correct @timestamp mapping.

        Returns:
            bool: True if index exists with correct mapping, False otherwise
            (including when either request fails).
        """
        try:
            # Check if index exists
            response = requests.head(
                self.index_url,
                headers=self._auth_headers(),
                verify=self.verify_ssl,
                timeout=self.request_timeout,
            )
            if response.status_code != 200:
                return False

            # Check if mapping is correct
            mapping_response = requests.get(
                f"{self.index_url}/_mapping",
                headers=self._auth_headers(),
                verify=self.verify_ssl,
                timeout=self.request_timeout,
            )
            if mapping_response.status_code != 200:
                return False

            return self._has_correct_timestamp_mapping(mapping_response.json())

        except Exception:
            return False

    def _has_correct_timestamp_mapping(self, mapping_data: Dict[str, Any]) -> bool:
        """Check if the mapping has @timestamp as a date field.

        Args:
            mapping_data: Elasticsearch mapping response data

        Returns:
            bool: True if @timestamp is correctly mapped as date field
        """
        try:
            timestamp_field = mapping_data[self.index_name]["mappings"]["properties"]["@timestamp"]
            return timestamp_field.get("type") == "date"
        except (KeyError, TypeError, AttributeError):
            # Any missing level or unexpected shape means "not correctly mapped".
            return False

    def _get_logging_mapping(self) -> Dict[str, Any]:
        """Get the standard mapping for logging data.

        Returns:
            Dict containing the index mapping configuration
        """
        return {
            "mappings": {
                "properties": {
                    "@timestamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"},
                    "level": {"type": "keyword"},
                    "message": {"type": "text"},
                    "logger_name": {"type": "keyword"},
                }
            }
        }

    def delete_index(self) -> bool:
        """Delete the managed index.

        Returns:
            bool: True if index was deleted successfully, False otherwise.
        """
        try:
            response = requests.delete(
                self.index_url,
                headers=self._auth_headers(),
                verify=self.verify_ssl,
                timeout=self.request_timeout,
            )
            if response.status_code in [200, 404]:  # 404 means index doesn't exist, which is fine
                self._mapping_created = False
                return True

            print(f"Warning: Failed to delete index: {response.status_code} - {response.text}")
            return False
        except Exception as e:
            print(f"Warning: Failed to delete index: {e}")
            return False

    def index_exists(self) -> bool:
        """Check if the index exists.

        Returns:
            bool: True if index exists, False otherwise (including on request errors).
        """
        try:
            response = requests.head(
                self.index_url,
                headers=self._auth_headers(),
                verify=self.verify_ssl,
                timeout=self.request_timeout,
            )
            return response.status_code == 200
        except Exception:
            return False

    def get_index_stats(self) -> Optional[Dict[str, Any]]:
        """Get statistics about the index.

        Returns:
            Dict containing index statistics, or None if failed
        """
        try:
            response = requests.get(
                f"{self.index_url}/_stats",
                headers=self._auth_headers(),
                verify=self.verify_ssl,
                timeout=self.request_timeout,
            )
            if response.status_code == 200:
                return response.json()
            return None
        except Exception:
            return None
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import subprocess
|
|
3
|
+
import tempfile
|
|
4
|
+
import logging
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from dotenv import load_dotenv
|
|
8
|
+
from eval_protocol.directory_utils import find_eval_protocol_dir
|
|
9
|
+
from eval_protocol.types.remote_rollout_processor import ElasticSearchConfig
|
|
10
|
+
from eval_protocol.logging.elasticsearch_index_manager import ElasticsearchIndexManager
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ElasticsearchSetupError(Exception):
    """Signals that Elasticsearch could not be started or its settings could not be read."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ElasticsearchSetup:
    """Handles Elasticsearch setup with retry logic for existing containers."""

    def __init__(self):
        """Locate the eval-protocol directory that holds the ``elastic-start-local`` files."""
        self.eval_protocol_dir = find_eval_protocol_dir()

    def setup_elasticsearch(self, index_name: str = "default-logs") -> ElasticSearchConfig:
        """
        Set up Elasticsearch, handling both local and remote scenarios.

        Args:
            index_name: Name of the Elasticsearch index to use for logging

        Returns:
            ElasticSearchConfig for the running instance with the specified index name.

        Raises:
            ElasticsearchSetupError: If Elasticsearch cannot be started or its
                ``.env`` file cannot be read.
        """
        elastic_start_local_dir = os.path.join(self.eval_protocol_dir, "elastic-start-local")
        env_file_path = os.path.join(elastic_start_local_dir, ".env")

        if os.path.exists(elastic_start_local_dir):
            # A previous run already created the directory: reuse its start script.
            config = self._setup_existing_docker_elasticsearch(elastic_start_local_dir, env_file_path)
        else:
            # Otherwise, initialize Docker setup from scratch
            config = self._setup_initialized_docker_elasticsearch(env_file_path)

        # Index creation is best effort, but a failure should not go unnoticed.
        if not self.create_logging_index(index_name):
            logger.warning("Could not create or verify Elasticsearch index %s", index_name)

        # Return config with the specified index name
        return ElasticSearchConfig(url=config.url, api_key=config.api_key, index_name=index_name)

    def _setup_existing_docker_elasticsearch(
        self, elastic_start_local_dir: str, env_file_path: str
    ) -> ElasticSearchConfig:
        """Set up Elasticsearch using existing Docker start.sh script."""
        from eval_protocol.utils.subprocess_utils import run_script_and_wait

        run_script_and_wait(
            script_name="start.sh",
            working_directory=elastic_start_local_dir,
            inherit_stdout=True,
        )
        return self._parse_elastic_env_file(env_file_path)

    def _setup_initialized_docker_elasticsearch(self, env_file_path: str) -> ElasticSearchConfig:
        """Set up Elasticsearch by initializing Docker setup from scratch with retry logic.

        Raises:
            ElasticsearchSetupError: If the installer fails for a reason other
                than an already running container, or all attempts are used up.
        """
        import shlex
        import shutil

        # `set -o pipefail` is not accepted by every /bin/sh, and a shell that
        # rejects it aborts before the installer even runs. Prefer bash, which
        # supports the option, and fall back to sh only if bash is not installed.
        shell = shutil.which("bash") or "sh"

        max_retries = 2
        for attempt in range(max_retries):
            # The temp file receives a copy of the output (via tee) so that it can
            # be inspected afterwards while still being shown on the parent stdout.
            with tempfile.NamedTemporaryFile(mode="w+", delete=False) as temp_file:
                temp_file_path = temp_file.name

            try:
                # pipefail makes the pipeline report a failing curl/installer
                # instead of the always-successful tee at the end.
                command = (
                    "set -o pipefail; "
                    "curl -fsSL https://elastic.co/start-local | sh -s -- --esonly | "
                    f"tee {shlex.quote(temp_file_path)}"
                )
                process = subprocess.Popen([shell, "-c", command], cwd=self.eval_protocol_dir)
                returncode = process.wait()

                # Read the captured output
                with open(temp_file_path, "r") as f:
                    stdout = f.read()

                if returncode == 0:
                    return self._parse_elastic_env_file(env_file_path)

                # Check if container is already running and handle it
                if self._handle_existing_elasticsearch_container(stdout):
                    logger.info("Retrying Elasticsearch setup (attempt %s/%s)", attempt + 1, max_retries)
                    continue

                # If we get here, it's a different error
                raise ElasticsearchSetupError(
                    f"Failed to start Elasticsearch (attempt {attempt + 1}/{max_retries}): {stdout}"
                )

            finally:
                # Clean up the temporary file
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    pass

        raise ElasticsearchSetupError(f"Failed to start Elasticsearch after {max_retries} attempts")

    def _handle_existing_elasticsearch_container(self, output: str) -> bool:
        """
        Check if the curl command output indicates that the Elasticsearch container is already running.
        If so, stop the existing container and return True to indicate a retry is needed.
        """
        if "docker stop es-local-dev" not in output:
            return False

        logger.info("Elasticsearch container 'es-local-dev' is already running. Stopping it...")
        try:
            subprocess.run(["docker", "stop", "es-local-dev"], check=True, capture_output=True, text=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a docker binary that cannot be executed at all.
            logger.warning("Failed to stop existing container: %s", e)
            return False

        logger.info("Successfully stopped existing Elasticsearch container")
        return True  # Indicate retry is needed

    def _parse_elastic_env_file(self, env_file_path: str) -> ElasticSearchConfig:
        """Parse ES_LOCAL_API_KEY and ES_LOCAL_URL from .env file.

        Raises:
            ElasticsearchSetupError: If the file cannot be loaded or either value is missing.
        """
        loaded = load_dotenv(env_file_path)
        if not loaded:
            raise ElasticsearchSetupError("Failed to load .env file")

        api_key = os.getenv("ES_LOCAL_API_KEY")
        url = os.getenv("ES_LOCAL_URL")

        if not url or not api_key:
            raise ElasticsearchSetupError("Failed to parse ES_LOCAL_API_KEY and ES_LOCAL_URL from .env file")

        return ElasticSearchConfig(url=url, api_key=api_key, index_name="default-logs")

    def create_logging_index(self, index_name: str) -> bool:
        """Create an Elasticsearch index with proper mapping for logging data.

        Args:
            index_name: Name of the index to create

        Returns:
            bool: True if index was created successfully, False otherwise.
        """
        try:
            # Get the config from the .env file
            config = self._parse_elastic_env_file(self._get_env_file_path())

            # Create index manager and set up mapping
            index_manager = ElasticsearchIndexManager(config.url, index_name, config.api_key)
            return index_manager.create_logging_index_mapping()

        except Exception as e:
            logger.error("Failed to create logging index %s: %s", index_name, e)
            return False

    def _get_env_file_path(self) -> str:
        """Get the path to the .env file."""
        elastic_start_local_dir = os.path.join(self.eval_protocol_dir, "elastic-start-local")
        return os.path.join(elastic_start_local_dir, ".env")
|
{eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/remote_rollout_processor.py
RENAMED
|
@@ -6,11 +6,16 @@ import requests
|
|
|
6
6
|
|
|
7
7
|
from eval_protocol.models import EvaluationRow, Status
|
|
8
8
|
from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
|
|
9
|
-
from eval_protocol.types.remote_rollout_processor import InitRequest, RolloutMetadata
|
|
9
|
+
from eval_protocol.types.remote_rollout_processor import ElasticSearchConfig, InitRequest, RolloutMetadata
|
|
10
10
|
from .rollout_processor import RolloutProcessor
|
|
11
11
|
from .types import RolloutProcessorConfig
|
|
12
|
+
from .elasticsearch_setup import ElasticsearchSetup
|
|
13
|
+
import logging
|
|
14
|
+
|
|
12
15
|
import os
|
|
13
16
|
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
14
19
|
|
|
15
20
|
class RemoteRolloutProcessor(RolloutProcessor):
|
|
16
21
|
"""
|
|
@@ -27,6 +32,8 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
27
32
|
poll_interval: float = 1.0,
|
|
28
33
|
timeout_seconds: float = 120.0,
|
|
29
34
|
output_data_loader: Callable[[str], DynamicDataLoader],
|
|
35
|
+
disable_elastic_search: bool = False,
|
|
36
|
+
elastic_search_config: Optional[ElasticSearchConfig] = None,
|
|
30
37
|
):
|
|
31
38
|
# Prefer constructor-provided configuration. These can be overridden via
|
|
32
39
|
# config.kwargs at call time for backward compatibility.
|
|
@@ -37,6 +44,21 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
37
44
|
self._poll_interval = poll_interval
|
|
38
45
|
self._timeout_seconds = timeout_seconds
|
|
39
46
|
self._output_data_loader = output_data_loader
|
|
47
|
+
self._disable_elastic_search = disable_elastic_search
|
|
48
|
+
self._elastic_search_config = elastic_search_config
|
|
49
|
+
|
|
50
|
+
def setup(self) -> None:
    """Bring up Elasticsearch for this processor unless it has been disabled.

    When enabled, the configuration returned by ``_setup_elastic_search`` is
    stored on the instance.
    """
    # NOTE(review): this replaces any elastic_search_config passed to the
    # constructor whenever Elasticsearch is not disabled - confirm that is intended.
    if not self._disable_elastic_search:
        logger.info("Setting up Elasticsearch")
        self._elastic_search_config = self._setup_elastic_search()
        logger.info("Elasticsearch setup complete")
    else:
        logger.info("Elasticsearch is disabled, skipping setup")
|
|
57
|
+
|
|
58
|
+
def _setup_elastic_search(self) -> ElasticSearchConfig:
    """Delegate to ``ElasticsearchSetup`` and return the configuration it produces."""
    return ElasticsearchSetup().setup_elasticsearch()
|
|
40
62
|
|
|
41
63
|
def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]:
|
|
42
64
|
tasks: List[asyncio.Task[EvaluationRow]] = []
|
|
@@ -113,12 +135,23 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
113
135
|
if row.execution_metadata.rollout_id is None:
|
|
114
136
|
raise ValueError("Rollout ID is required in RemoteRolloutProcessor")
|
|
115
137
|
|
|
138
|
+
final_model_base_url = model_base_url
|
|
139
|
+
if model_base_url and model_base_url.startswith("https://tracing.fireworks.ai/project_id/"):
|
|
140
|
+
final_model_base_url = (
|
|
141
|
+
f"{model_base_url}/rollout_id/{meta.rollout_id}"
|
|
142
|
+
f"/invocation_id/{meta.invocation_id}"
|
|
143
|
+
f"/experiment_id/{meta.experiment_id}"
|
|
144
|
+
f"/run_id/{meta.run_id}"
|
|
145
|
+
f"/row_id/{meta.row_id}"
|
|
146
|
+
)
|
|
147
|
+
|
|
116
148
|
init_payload: InitRequest = InitRequest(
|
|
117
149
|
model=model,
|
|
118
150
|
messages=clean_messages,
|
|
119
151
|
tools=row.tools,
|
|
120
152
|
metadata=meta,
|
|
121
|
-
model_base_url=
|
|
153
|
+
model_base_url=final_model_base_url,
|
|
154
|
+
elastic_search_config=self._elastic_search_config,
|
|
122
155
|
)
|
|
123
156
|
|
|
124
157
|
# Fire-and-poll
|
|
@@ -197,6 +230,11 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
197
230
|
langfuse_row.input_metadata.dataset_info = row.input_metadata.dataset_info
|
|
198
231
|
langfuse_row.eval_metadata = row.eval_metadata
|
|
199
232
|
langfuse_row.ground_truth = row.ground_truth
|
|
233
|
+
|
|
234
|
+
# this is useful to detect stopped evaluations so we can update
|
|
235
|
+
# the status in the logs server
|
|
236
|
+
langfuse_row.pid = row.pid
|
|
237
|
+
|
|
200
238
|
return langfuse_row
|
|
201
239
|
else:
|
|
202
240
|
raise ValueError("RemoteRolloutProcessor's output_data_loader should return exactly one row.")
|
|
@@ -10,6 +10,10 @@ class RolloutProcessor(ABC):
|
|
|
10
10
|
Abstract base class for all rollout processor strategies.
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
|
+
def setup(self) -> None:
|
|
14
|
+
"""Setup resources. Override in subclasses if setup is needed. Executed once per invocation."""
|
|
15
|
+
pass
|
|
16
|
+
|
|
13
17
|
@abstractmethod
|
|
14
18
|
def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) -> list[asyncio.Task[EvaluationRow]]:
|
|
15
19
|
"""Process evaluation rows and return async tasks. Must be implemented by subclasses."""
|
{eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/types/remote_rollout_processor.py
RENAMED
|
@@ -7,6 +7,16 @@ from pydantic import BaseModel, Field
|
|
|
7
7
|
from eval_protocol.models import Message, Status
|
|
8
8
|
|
|
9
9
|
|
|
10
|
+
class ElasticSearchConfig(BaseModel):
    """
    Configuration for Elasticsearch.

    Holds the connection settings needed to reach an Elasticsearch instance
    and the index that log documents are written to.
    """

    # Base URL of the Elasticsearch instance
    url: str
    # API key used to authenticate requests
    api_key: str
    # Name of the index used for log documents
    index_name: str
|
|
18
|
+
|
|
19
|
+
|
|
10
20
|
class RolloutMetadata(BaseModel):
|
|
11
21
|
"""Metadata for rollout execution."""
|
|
12
22
|
|
|
@@ -21,6 +31,7 @@ class InitRequest(BaseModel):
|
|
|
21
31
|
"""Request model for POST /init endpoint."""
|
|
22
32
|
|
|
23
33
|
model: str
|
|
34
|
+
elastic_search_config: Optional[ElasticSearchConfig] = None
|
|
24
35
|
messages: Optional[List[Message]] = None
|
|
25
36
|
tools: Optional[List[Dict[str, Any]]] = None
|
|
26
37
|
|