eval-protocol 0.2.43__tar.gz → 0.2.45__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.43/eval_protocol.egg-info → eval_protocol-0.2.45}/PKG-INFO +1 -1
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/__init__.py +2 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/fireworks_tracing.py +29 -50
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli.py +1 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/logs.py +2 -1
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +8 -1
- eval_protocol-0.2.45/eval_protocol/event_bus/sqlite_event_bus.py +126 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/event_bus/sqlite_event_bus_database.py +6 -8
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/log_utils/elasticsearch_client.py +19 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +3 -3
- eval_protocol-0.2.45/eval_protocol/log_utils/util.py +22 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/remote_rollout_processor.py +1 -1
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/logs_server.py +126 -22
- {eval_protocol-0.2.43 → eval_protocol-0.2.45/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol.egg-info/SOURCES.txt +2 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_event_bus.py +74 -38
- eval_protocol-0.2.45/tests/test_event_bus_helper.py +74 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_logs_server.py +2 -2
- eval_protocol-0.2.43/eval_protocol/event_bus/sqlite_event_bus.py +0 -109
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/LICENSE +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/README.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/development/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/pyproject.toml +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/setup.cfg +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/setup.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_config.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_format.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_length.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_math.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_models.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/versioneer.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vite-app/dist/assets/index-C81y9r9l.js +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vite-app/dist/assets/index-C81y9r9l.js.map +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vite-app/dist/assets/index-DpYZaoAr.css +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.43
|
|
3
|
+
Version: 0.2.45
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -34,6 +34,7 @@ from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutPr
|
|
|
34
34
|
from .pytest.parameterize import DefaultParameterIdGenerator
|
|
35
35
|
from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
|
|
36
36
|
from .log_utils.rollout_id_filter import RolloutIdFilter
|
|
37
|
+
from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
|
|
37
38
|
|
|
38
39
|
from .types.remote_rollout_processor import (
|
|
39
40
|
InitRequest,
|
|
@@ -68,6 +69,7 @@ warnings.filterwarnings("default", category=DeprecationWarning, module="eval_pro
|
|
|
68
69
|
__all__ = [
|
|
69
70
|
"ElasticsearchDirectHttpHandler",
|
|
70
71
|
"RolloutIdFilter",
|
|
72
|
+
"setup_rollout_logging_for_elasticsearch_handler",
|
|
71
73
|
"DataLoaderConfig",
|
|
72
74
|
"Status",
|
|
73
75
|
"RemoteRolloutProcessor",
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-10-
|
|
11
|
+
"date": "2025-10-08T14:59:37-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.43"
|
|
14
|
+
"full-revisionid": "b120611112b84df8476cefcc02660c542e61b2a9",
|
|
15
|
+
"version": "0.2.45"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -281,9 +281,8 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
281
281
|
from_timestamp: Optional[datetime] = None,
|
|
282
282
|
to_timestamp: Optional[datetime] = None,
|
|
283
283
|
include_tool_calls: bool = True,
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
proxy_max_retries: int = 3,
|
|
284
|
+
sleep_between_gets: float = 0.1,
|
|
285
|
+
max_retries: int = 3,
|
|
287
286
|
span_name: Optional[str] = None,
|
|
288
287
|
converter: Optional[TraceDictConverter] = None,
|
|
289
288
|
) -> List[EvaluationRow]:
|
|
@@ -305,10 +304,8 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
305
304
|
from_timestamp: Explicit start time (ISO format)
|
|
306
305
|
to_timestamp: Explicit end time (ISO format)
|
|
307
306
|
include_tool_calls: Whether to include tool calling traces
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
proxy_max_retries: Maximum retries when proxy returns 404 (client-side retries with exponential backoff)
|
|
311
|
-
span_name: If provided, extract messages from generations within this named span
|
|
307
|
+
sleep_between_gets: Sleep time between polling attempts (default: 2.5s)
|
|
308
|
+
max_retries: Max retry attempts used by proxy (default: 3)
|
|
312
309
|
converter: Optional custom converter implementing TraceDictConverter protocol.
|
|
313
310
|
If provided, this will be used instead of the default conversion logic.
|
|
314
311
|
|
|
@@ -318,9 +315,9 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
318
315
|
Raises:
|
|
319
316
|
ValueError: If tags list is empty
|
|
320
317
|
"""
|
|
321
|
-
# Validate that tags are provided
|
|
318
|
+
# Validate that tags are provided
|
|
322
319
|
if not tags or len(tags) == 0:
|
|
323
|
-
raise ValueError("At least one tag is required to fetch traces
|
|
320
|
+
raise ValueError("At least one tag is required to fetch traces")
|
|
324
321
|
|
|
325
322
|
eval_rows = []
|
|
326
323
|
|
|
@@ -339,58 +336,40 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
339
336
|
"hours_back": hours_back,
|
|
340
337
|
"from_timestamp": from_timestamp.isoformat() if from_timestamp else None,
|
|
341
338
|
"to_timestamp": to_timestamp.isoformat() if to_timestamp else None,
|
|
342
|
-
"sleep_between_gets":
|
|
343
|
-
"max_retries":
|
|
339
|
+
"sleep_between_gets": sleep_between_gets,
|
|
340
|
+
"max_retries": max_retries,
|
|
344
341
|
}
|
|
345
342
|
|
|
346
343
|
# Remove None values
|
|
347
344
|
params = {k: v for k, v in params.items() if v is not None}
|
|
348
345
|
|
|
349
|
-
# Make request to proxy
|
|
346
|
+
# Make request to proxy
|
|
350
347
|
if self.project_id:
|
|
351
348
|
url = f"{self.base_url}/v1/project_id/{self.project_id}/traces"
|
|
352
349
|
else:
|
|
353
350
|
url = f"{self.base_url}/v1/traces"
|
|
354
351
|
|
|
355
|
-
# Retry loop for handling backend indexing delays (proxy returns 404)
|
|
356
352
|
result = None
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
error_msg = e.response.text
|
|
378
|
-
|
|
379
|
-
if should_retry and attempt < proxy_max_retries - 1:
|
|
380
|
-
sleep_time = 2 ** (attempt + 1)
|
|
381
|
-
logger.warning(error_msg)
|
|
382
|
-
time.sleep(sleep_time)
|
|
383
|
-
else:
|
|
384
|
-
# Final retry or non-retryable error
|
|
385
|
-
logger.error("Failed to fetch traces from proxy: %s", error_msg)
|
|
386
|
-
return eval_rows
|
|
387
|
-
except requests.exceptions.RequestException as e:
|
|
388
|
-
# Non-HTTP errors (network issues, timeouts, etc.)
|
|
389
|
-
logger.error("Failed to fetch traces from proxy: %s", str(e))
|
|
390
|
-
return eval_rows
|
|
391
|
-
|
|
392
|
-
if result is None:
|
|
393
|
-
logger.error("Failed to fetch traces after %d retries", proxy_max_retries)
|
|
353
|
+
try:
|
|
354
|
+
response = requests.get(url, params=params, timeout=self.timeout)
|
|
355
|
+
response.raise_for_status()
|
|
356
|
+
result = response.json()
|
|
357
|
+
except requests.exceptions.HTTPError as e:
|
|
358
|
+
error_msg = str(e)
|
|
359
|
+
|
|
360
|
+
# Try to extract detail message from response
|
|
361
|
+
if e.response is not None:
|
|
362
|
+
try:
|
|
363
|
+
error_detail = e.response.json().get("detail", {})
|
|
364
|
+
error_msg = error_detail or e.response.text
|
|
365
|
+
except Exception: # In case e.response.json() fails
|
|
366
|
+
error_msg = f"Proxy error: {e.response.text}"
|
|
367
|
+
|
|
368
|
+
logger.error("Failed to fetch traces from proxy: %s", error_msg)
|
|
369
|
+
return eval_rows
|
|
370
|
+
except requests.exceptions.RequestException as e:
|
|
371
|
+
# Non-HTTP errors (network issues, timeouts, etc.)
|
|
372
|
+
logger.error("Failed to fetch traces from proxy: %s", str(e))
|
|
394
373
|
return eval_rows
|
|
395
374
|
|
|
396
375
|
# Extract traces from response
|
|
@@ -300,6 +300,7 @@ def parse_args(args=None):
|
|
|
300
300
|
# Logs command
|
|
301
301
|
logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
|
|
302
302
|
logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
|
|
303
|
+
logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode")
|
|
303
304
|
|
|
304
305
|
# Upload command
|
|
305
306
|
upload_parser = subparsers.add_parser(
|
|
@@ -16,6 +16,7 @@ def logs_command(args):
|
|
|
16
16
|
print(f"🌐 URL: http://localhost:{port}")
|
|
17
17
|
print(f"🔌 WebSocket: ws://localhost:{port}/ws")
|
|
18
18
|
print(f"👀 Watching paths: {['current directory']}")
|
|
19
|
+
print(f"🔍 Debug mode: {args.debug}")
|
|
19
20
|
print("Press Ctrl+C to stop the server")
|
|
20
21
|
print("-" * 50)
|
|
21
22
|
|
|
@@ -25,7 +26,7 @@ def logs_command(args):
|
|
|
25
26
|
elasticsearch_config = ElasticsearchSetup().setup_elasticsearch()
|
|
26
27
|
|
|
27
28
|
try:
|
|
28
|
-
serve_logs(port=args.port, elasticsearch_config=elasticsearch_config)
|
|
29
|
+
serve_logs(port=args.port, elasticsearch_config=elasticsearch_config, debug=args.debug)
|
|
29
30
|
return 0
|
|
30
31
|
except KeyboardInterrupt:
|
|
31
32
|
print("\n🛑 Server stopped by user")
|
|
@@ -23,12 +23,19 @@ class SqliteDatasetLoggerAdapter(DatasetLogger):
|
|
|
23
23
|
|
|
24
24
|
def log(self, row: "EvaluationRow") -> None:
|
|
25
25
|
data = row.model_dump(exclude_none=True, mode="json")
|
|
26
|
+
rollout_id = data.get("execution_metadata", {}).get("rollout_id", "unknown")
|
|
27
|
+
logger.debug(f"[EVENT_BUS_EMIT] Starting to log row with rollout_id: {rollout_id}")
|
|
28
|
+
|
|
26
29
|
self._store.upsert_row(data=data)
|
|
30
|
+
logger.debug(f"[EVENT_BUS_EMIT] Successfully stored row in database for rollout_id: {rollout_id}")
|
|
31
|
+
|
|
27
32
|
try:
|
|
33
|
+
logger.debug(f"[EVENT_BUS_EMIT] Emitting event '{LOG_EVENT_TYPE}' for rollout_id: {rollout_id}")
|
|
28
34
|
event_bus.emit(LOG_EVENT_TYPE, EvaluationRow(**data))
|
|
35
|
+
logger.debug(f"[EVENT_BUS_EMIT] Successfully emitted event for rollout_id: {rollout_id}")
|
|
29
36
|
except Exception as e:
|
|
30
37
|
# Avoid breaking storage due to event emission issues
|
|
31
|
-
logger.error(f"Failed to emit row_upserted event: {e}")
|
|
38
|
+
logger.error(f"[EVENT_BUS_EMIT] Failed to emit row_upserted event for rollout_id {rollout_id}: {e}")
|
|
32
39
|
pass
|
|
33
40
|
|
|
34
41
|
def read(self, rollout_id: Optional[str] = None) -> List["EvaluationRow"]:
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import os
|
|
3
|
+
import threading
|
|
4
|
+
import time
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
from uuid import uuid4
|
|
7
|
+
|
|
8
|
+
from eval_protocol.event_bus.event_bus import EventBus
|
|
9
|
+
from eval_protocol.event_bus.logger import logger
|
|
10
|
+
from eval_protocol.event_bus.sqlite_event_bus_database import SqliteEventBusDatabase
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SqliteEventBus(EventBus):
|
|
14
|
+
"""SQLite-based event bus implementation that supports cross-process communication."""
|
|
15
|
+
|
|
16
|
+
def __init__(self, db_path: Optional[str] = None):
|
|
17
|
+
super().__init__()
|
|
18
|
+
|
|
19
|
+
# Use the same database as the evaluation row store
|
|
20
|
+
if db_path is None:
|
|
21
|
+
from eval_protocol.directory_utils import find_eval_protocol_dir
|
|
22
|
+
|
|
23
|
+
eval_protocol_dir = find_eval_protocol_dir()
|
|
24
|
+
db_path = os.path.join(eval_protocol_dir, "logs.db")
|
|
25
|
+
|
|
26
|
+
self._db: SqliteEventBusDatabase = SqliteEventBusDatabase(db_path)
|
|
27
|
+
self._running = False
|
|
28
|
+
self._process_id = str(os.getpid())
|
|
29
|
+
|
|
30
|
+
def emit(self, event_type: str, data: Any) -> None:
|
|
31
|
+
"""Emit an event to all subscribers.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
event_type: Type of event (e.g., "log")
|
|
35
|
+
data: Event data
|
|
36
|
+
"""
|
|
37
|
+
logger.debug(f"[CROSS_PROCESS_EMIT] Emitting event type: {event_type}")
|
|
38
|
+
|
|
39
|
+
# Call local listeners immediately
|
|
40
|
+
logger.debug(f"[CROSS_PROCESS_EMIT] Calling {len(self._listeners)} local listeners")
|
|
41
|
+
super().emit(event_type, data)
|
|
42
|
+
logger.debug("[CROSS_PROCESS_EMIT] Completed local listener calls")
|
|
43
|
+
|
|
44
|
+
# Publish to cross-process subscribers
|
|
45
|
+
logger.debug("[CROSS_PROCESS_EMIT] Publishing to cross-process subscribers")
|
|
46
|
+
self._publish_cross_process(event_type, data)
|
|
47
|
+
logger.debug("[CROSS_PROCESS_EMIT] Completed cross-process publish")
|
|
48
|
+
|
|
49
|
+
def _publish_cross_process(self, event_type: str, data: Any) -> None:
|
|
50
|
+
"""Publish event to cross-process subscribers via database."""
|
|
51
|
+
logger.debug(f"[CROSS_PROCESS_PUBLISH] Publishing event {event_type} to database")
|
|
52
|
+
try:
|
|
53
|
+
self._db.publish_event(event_type, data, self._process_id)
|
|
54
|
+
logger.debug(f"[CROSS_PROCESS_PUBLISH] Successfully published event {event_type} to database")
|
|
55
|
+
except Exception as e:
|
|
56
|
+
logger.error(f"[CROSS_PROCESS_PUBLISH] Failed to publish event {event_type} to database: {e}")
|
|
57
|
+
|
|
58
|
+
def start_listening(self) -> None:
|
|
59
|
+
"""Start listening for cross-process events."""
|
|
60
|
+
if self._running:
|
|
61
|
+
logger.debug("[CROSS_PROCESS_LISTEN] Already listening, skipping start")
|
|
62
|
+
return
|
|
63
|
+
|
|
64
|
+
logger.debug("[CROSS_PROCESS_LISTEN] Starting cross-process event listening")
|
|
65
|
+
self._running = True
|
|
66
|
+
loop = asyncio.get_running_loop()
|
|
67
|
+
loop.create_task(self._database_listener_task())
|
|
68
|
+
logger.debug("[CROSS_PROCESS_LISTEN] Started async database listener task")
|
|
69
|
+
|
|
70
|
+
def stop_listening(self) -> None:
|
|
71
|
+
"""Stop listening for cross-process events."""
|
|
72
|
+
logger.debug("[CROSS_PROCESS_LISTEN] Stopping cross-process event listening")
|
|
73
|
+
self._running = False
|
|
74
|
+
|
|
75
|
+
async def _database_listener_task(self) -> None:
|
|
76
|
+
"""Single database listener task that processes events and recreates itself."""
|
|
77
|
+
if not self._running:
|
|
78
|
+
# this should end the task loop
|
|
79
|
+
logger.debug("[CROSS_PROCESS_LISTENER] Stopping database listener task")
|
|
80
|
+
return
|
|
81
|
+
|
|
82
|
+
# Get unprocessed events from other processes
|
|
83
|
+
events = self._db.get_unprocessed_events(str(self._process_id))
|
|
84
|
+
if events:
|
|
85
|
+
logger.debug(f"[CROSS_PROCESS_LISTENER] Found {len(events)} unprocessed events")
|
|
86
|
+
else:
|
|
87
|
+
logger.debug(f"[CROSS_PROCESS_LISTENER] No unprocessed events found for process {self._process_id}")
|
|
88
|
+
|
|
89
|
+
for event in events:
|
|
90
|
+
logger.debug(
|
|
91
|
+
f"[CROSS_PROCESS_LISTENER] Processing event {event['event_id']} of type {event['event_type']}"
|
|
92
|
+
)
|
|
93
|
+
# Handle the event
|
|
94
|
+
self._handle_cross_process_event(event["event_type"], event["data"])
|
|
95
|
+
logger.debug(f"[CROSS_PROCESS_LISTENER] Successfully processed event {event['event_id']}")
|
|
96
|
+
|
|
97
|
+
# Mark as processed
|
|
98
|
+
self._db.mark_event_processed(event["event_id"])
|
|
99
|
+
logger.debug(f"[CROSS_PROCESS_LISTENER] Marked event {event['event_id']} as processed")
|
|
100
|
+
|
|
101
|
+
# Clean up old events every hour
|
|
102
|
+
current_time = time.time()
|
|
103
|
+
if not hasattr(self, "_last_cleanup"):
|
|
104
|
+
self._last_cleanup = current_time
|
|
105
|
+
elif current_time - self._last_cleanup >= 3600:
|
|
106
|
+
logger.debug("[CROSS_PROCESS_LISTENER] Cleaning up old events")
|
|
107
|
+
self._db.cleanup_old_events()
|
|
108
|
+
self._last_cleanup = current_time
|
|
109
|
+
|
|
110
|
+
# Schedule the next task if still running
|
|
111
|
+
await asyncio.sleep(1.0)
|
|
112
|
+
loop = asyncio.get_running_loop()
|
|
113
|
+
loop.create_task(self._database_listener_task())
|
|
114
|
+
|
|
115
|
+
def _handle_cross_process_event(self, event_type: str, data: Any) -> None:
|
|
116
|
+
"""Handle events received from other processes."""
|
|
117
|
+
logger.debug(f"[CROSS_PROCESS_HANDLE] Handling cross-process event type: {event_type}")
|
|
118
|
+
logger.debug(f"[CROSS_PROCESS_HANDLE] Calling {len(self._listeners)} listeners")
|
|
119
|
+
|
|
120
|
+
for i, listener in enumerate(self._listeners):
|
|
121
|
+
try:
|
|
122
|
+
logger.debug(f"[CROSS_PROCESS_HANDLE] Calling listener {i}")
|
|
123
|
+
listener(event_type, data)
|
|
124
|
+
logger.debug(f"[CROSS_PROCESS_HANDLE] Successfully called listener {i}")
|
|
125
|
+
except Exception as e:
|
|
126
|
+
logger.debug(f"[CROSS_PROCESS_HANDLE] Cross-process event listener {i} failed for {event_type}: {e}")
|
{eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/event_bus/sqlite_event_bus_database.py
RENAMED
|
@@ -2,7 +2,7 @@ import time
|
|
|
2
2
|
from typing import Any, List
|
|
3
3
|
from uuid import uuid4
|
|
4
4
|
|
|
5
|
-
from peewee import CharField, DateTimeField, Model, SqliteDatabase
|
|
5
|
+
from peewee import BooleanField, CharField, DateTimeField, Model, SqliteDatabase
|
|
6
6
|
from playhouse.sqlite_ext import JSONField
|
|
7
7
|
|
|
8
8
|
from eval_protocol.event_bus.logger import logger
|
|
@@ -25,7 +25,7 @@ class SqliteEventBusDatabase:
|
|
|
25
25
|
data = JSONField()
|
|
26
26
|
timestamp = DateTimeField()
|
|
27
27
|
process_id = CharField()
|
|
28
|
-
processed =
|
|
28
|
+
processed = BooleanField(default=False) # Track if event has been processed
|
|
29
29
|
|
|
30
30
|
self._Event = Event
|
|
31
31
|
self._db.connect()
|
|
@@ -46,7 +46,7 @@ class SqliteEventBusDatabase:
|
|
|
46
46
|
data=serialized_data,
|
|
47
47
|
timestamp=time.time(),
|
|
48
48
|
process_id=process_id,
|
|
49
|
-
processed=
|
|
49
|
+
processed=False,
|
|
50
50
|
)
|
|
51
51
|
except Exception as e:
|
|
52
52
|
logger.warning(f"Failed to publish event to database: {e}")
|
|
@@ -56,7 +56,7 @@ class SqliteEventBusDatabase:
|
|
|
56
56
|
try:
|
|
57
57
|
query = (
|
|
58
58
|
self._Event.select()
|
|
59
|
-
.where((self._Event.process_id != process_id) & (self._Event.processed
|
|
59
|
+
.where((self._Event.process_id != process_id) & (~self._Event.processed))
|
|
60
60
|
.order_by(self._Event.timestamp)
|
|
61
61
|
)
|
|
62
62
|
|
|
@@ -80,7 +80,7 @@ class SqliteEventBusDatabase:
|
|
|
80
80
|
def mark_event_processed(self, event_id: str) -> None:
|
|
81
81
|
"""Mark an event as processed."""
|
|
82
82
|
try:
|
|
83
|
-
self._Event.update(processed=
|
|
83
|
+
self._Event.update(processed=True).where(self._Event.event_id == event_id).execute()
|
|
84
84
|
except Exception as e:
|
|
85
85
|
logger.debug(f"Failed to mark event as processed: {e}")
|
|
86
86
|
|
|
@@ -88,8 +88,6 @@ class SqliteEventBusDatabase:
|
|
|
88
88
|
"""Clean up old processed events."""
|
|
89
89
|
try:
|
|
90
90
|
cutoff_time = time.time() - (max_age_hours * 3600)
|
|
91
|
-
self._Event.delete().where(
|
|
92
|
-
(self._Event.processed == "true") & (self._Event.timestamp < cutoff_time)
|
|
93
|
-
).execute()
|
|
91
|
+
self._Event.delete().where((self._Event.processed) & (self._Event.timestamp < cutoff_time)).execute()
|
|
94
92
|
except Exception as e:
|
|
95
93
|
logger.debug(f"Failed to cleanup old events: {e}")
|
{eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/log_utils/elasticsearch_client.py
RENAMED
|
@@ -100,6 +100,25 @@ class ElasticsearchClient:
|
|
|
100
100
|
except Exception:
|
|
101
101
|
return False
|
|
102
102
|
|
|
103
|
+
def clear_index(self) -> bool:
|
|
104
|
+
"""Clear all documents from the index.
|
|
105
|
+
|
|
106
|
+
Returns:
|
|
107
|
+
bool: True if successful, False otherwise
|
|
108
|
+
"""
|
|
109
|
+
try:
|
|
110
|
+
# Delete all documents by query
|
|
111
|
+
response = self._make_request(
|
|
112
|
+
"POST", f"{self.index_url}/_delete_by_query", json_data={"query": {"match_all": {}}}
|
|
113
|
+
)
|
|
114
|
+
if response.status_code == 200:
|
|
115
|
+
# Refresh the index to ensure changes are visible
|
|
116
|
+
refresh_response = self._make_request("POST", f"{self.index_url}/_refresh")
|
|
117
|
+
return refresh_response.status_code == 200
|
|
118
|
+
return False
|
|
119
|
+
except Exception:
|
|
120
|
+
return False
|
|
121
|
+
|
|
103
122
|
def get_mapping(self) -> Optional[Dict[str, Any]]:
|
|
104
123
|
"""Get the index mapping.
|
|
105
124
|
|
|
@@ -2,7 +2,7 @@ import logging
|
|
|
2
2
|
import os
|
|
3
3
|
from concurrent.futures import ThreadPoolExecutor
|
|
4
4
|
from typing import Optional, Any, Dict
|
|
5
|
-
from datetime import datetime
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
6
|
|
|
7
7
|
from eval_protocol.types.remote_rollout_processor import ElasticsearchConfig
|
|
8
8
|
from .elasticsearch_client import ElasticsearchClient
|
|
@@ -36,8 +36,8 @@ class ElasticsearchDirectHttpHandler(logging.Handler):
|
|
|
36
36
|
def emit(self, record: logging.LogRecord) -> None:
|
|
37
37
|
"""Emit a log record by scheduling it for async transmission."""
|
|
38
38
|
try:
|
|
39
|
-
# Create proper ISO 8601 timestamp
|
|
40
|
-
timestamp = datetime.fromtimestamp(record.created).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
|
39
|
+
# Create proper ISO 8601 timestamp in UTC
|
|
40
|
+
timestamp = datetime.fromtimestamp(record.created, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
|
41
41
|
|
|
42
42
|
rollout_id = self._get_rollout_id(record)
|
|
43
43
|
logger.debug(f"Emitting log record: {record.getMessage()} with rollout_id: {rollout_id}")
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from eval_protocol.types.remote_rollout_processor import ElasticsearchConfig
|
|
3
|
+
from .elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def setup_rollout_logging_for_elasticsearch_handler(
|
|
7
|
+
handler: ElasticsearchDirectHttpHandler, rollout_id: str, elastic_search_config: ElasticsearchConfig
|
|
8
|
+
) -> None:
|
|
9
|
+
"""
|
|
10
|
+
Whenever a new subprocess is created, we need to setup the rollout context
|
|
11
|
+
for the subprocess. This is useful when implementing your own remote server
|
|
12
|
+
for rollout processing.
|
|
13
|
+
|
|
14
|
+
1. Set the EP_ROLLOUT_ID environment variable
|
|
15
|
+
2. Configure the Elasticsearch handler with the Elasticsearch config
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
# this should only affect this subprocess so logs from this subprocess can
|
|
19
|
+
# be correlated to the rollout
|
|
20
|
+
os.environ["EP_ROLLOUT_ID"] = rollout_id
|
|
21
|
+
|
|
22
|
+
handler.configure(elasticsearch_config=elastic_search_config)
|
{eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/remote_rollout_processor.py
RENAMED
|
@@ -70,7 +70,7 @@ def _default_output_data_loader(config: DataLoaderConfig) -> DynamicDataLoader:
|
|
|
70
70
|
def fetch_traces() -> List[EvaluationRow]:
|
|
71
71
|
base_url = config.model_base_url or "https://tracing.fireworks.ai"
|
|
72
72
|
adapter = FireworksTracingAdapter(base_url=base_url)
|
|
73
|
-
return adapter.get_evaluation_rows(tags=[f"rollout_id:{config.rollout_id}"],
|
|
73
|
+
return adapter.get_evaluation_rows(tags=[f"rollout_id:{config.rollout_id}"], max_retries=5)
|
|
74
74
|
|
|
75
75
|
return DynamicDataLoader(generators=[fetch_traces], preprocess_fn=filter_longest_conversation)
|
|
76
76
|
|