eval-protocol 0.2.43__tar.gz → 0.2.44__tar.gz
This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.43/eval_protocol.egg-info → eval_protocol-0.2.44}/PKG-INFO +1 -1
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli.py +1 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/logs.py +2 -1
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +8 -1
- eval_protocol-0.2.44/eval_protocol/event_bus/sqlite_event_bus.py +126 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/event_bus/sqlite_event_bus_database.py +6 -8
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/log_utils/elasticsearch_client.py +19 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +3 -3
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/logs_server.py +126 -22
- {eval_protocol-0.2.43 → eval_protocol-0.2.44/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol.egg-info/SOURCES.txt +1 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_event_bus.py +74 -38
- eval_protocol-0.2.44/tests/test_event_bus_helper.py +74 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_logs_server.py +2 -2
- eval_protocol-0.2.43/eval_protocol/event_bus/sqlite_event_bus.py +0 -109
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/LICENSE +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/README.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/development/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/pyproject.toml +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/setup.cfg +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/setup.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_config.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_format.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_length.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_math.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_models.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/versioneer.py +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vite-app/dist/assets/index-C81y9r9l.js +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vite-app/dist/assets/index-C81y9r9l.js.map +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vite-app/dist/assets/index-DpYZaoAr.css +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.43
|
|
3
|
+
Version: 0.2.44
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-10-
|
|
11
|
+
"date": "2025-10-08T11:55:20-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.43"
|
|
14
|
+
"full-revisionid": "e5883aeb569de1af057de3eae81aaf7790f468f1",
|
|
15
|
+
"version": "0.2.44"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -300,6 +300,7 @@ def parse_args(args=None):
|
|
|
300
300
|
# Logs command
|
|
301
301
|
logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
|
|
302
302
|
logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
|
|
303
|
+
logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode")
|
|
303
304
|
|
|
304
305
|
# Upload command
|
|
305
306
|
upload_parser = subparsers.add_parser(
|
|
@@ -16,6 +16,7 @@ def logs_command(args):
|
|
|
16
16
|
print(f"🌐 URL: http://localhost:{port}")
|
|
17
17
|
print(f"🔌 WebSocket: ws://localhost:{port}/ws")
|
|
18
18
|
print(f"👀 Watching paths: {['current directory']}")
|
|
19
|
+
print(f"🔍 Debug mode: {args.debug}")
|
|
19
20
|
print("Press Ctrl+C to stop the server")
|
|
20
21
|
print("-" * 50)
|
|
21
22
|
|
|
@@ -25,7 +26,7 @@ def logs_command(args):
|
|
|
25
26
|
elasticsearch_config = ElasticsearchSetup().setup_elasticsearch()
|
|
26
27
|
|
|
27
28
|
try:
|
|
28
|
-
serve_logs(port=args.port, elasticsearch_config=elasticsearch_config)
|
|
29
|
+
serve_logs(port=args.port, elasticsearch_config=elasticsearch_config, debug=args.debug)
|
|
29
30
|
return 0
|
|
30
31
|
except KeyboardInterrupt:
|
|
31
32
|
print("\n🛑 Server stopped by user")
|
|
@@ -23,12 +23,19 @@ class SqliteDatasetLoggerAdapter(DatasetLogger):
|
|
|
23
23
|
|
|
24
24
|
def log(self, row: "EvaluationRow") -> None:
|
|
25
25
|
data = row.model_dump(exclude_none=True, mode="json")
|
|
26
|
+
rollout_id = data.get("execution_metadata", {}).get("rollout_id", "unknown")
|
|
27
|
+
logger.debug(f"[EVENT_BUS_EMIT] Starting to log row with rollout_id: {rollout_id}")
|
|
28
|
+
|
|
26
29
|
self._store.upsert_row(data=data)
|
|
30
|
+
logger.debug(f"[EVENT_BUS_EMIT] Successfully stored row in database for rollout_id: {rollout_id}")
|
|
31
|
+
|
|
27
32
|
try:
|
|
33
|
+
logger.debug(f"[EVENT_BUS_EMIT] Emitting event '{LOG_EVENT_TYPE}' for rollout_id: {rollout_id}")
|
|
28
34
|
event_bus.emit(LOG_EVENT_TYPE, EvaluationRow(**data))
|
|
35
|
+
logger.debug(f"[EVENT_BUS_EMIT] Successfully emitted event for rollout_id: {rollout_id}")
|
|
29
36
|
except Exception as e:
|
|
30
37
|
# Avoid breaking storage due to event emission issues
|
|
31
|
-
logger.error(f"Failed to emit row_upserted event: {e}")
|
|
38
|
+
logger.error(f"[EVENT_BUS_EMIT] Failed to emit row_upserted event for rollout_id {rollout_id}: {e}")
|
|
32
39
|
pass
|
|
33
40
|
|
|
34
41
|
def read(self, rollout_id: Optional[str] = None) -> List["EvaluationRow"]:
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import os
|
|
3
|
+
import threading
|
|
4
|
+
import time
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
from uuid import uuid4
|
|
7
|
+
|
|
8
|
+
from eval_protocol.event_bus.event_bus import EventBus
|
|
9
|
+
from eval_protocol.event_bus.logger import logger
|
|
10
|
+
from eval_protocol.event_bus.sqlite_event_bus_database import SqliteEventBusDatabase
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SqliteEventBus(EventBus):
|
|
14
|
+
"""SQLite-based event bus implementation that supports cross-process communication."""
|
|
15
|
+
|
|
16
|
+
def __init__(self, db_path: Optional[str] = None):
|
|
17
|
+
super().__init__()
|
|
18
|
+
|
|
19
|
+
# Use the same database as the evaluation row store
|
|
20
|
+
if db_path is None:
|
|
21
|
+
from eval_protocol.directory_utils import find_eval_protocol_dir
|
|
22
|
+
|
|
23
|
+
eval_protocol_dir = find_eval_protocol_dir()
|
|
24
|
+
db_path = os.path.join(eval_protocol_dir, "logs.db")
|
|
25
|
+
|
|
26
|
+
self._db: SqliteEventBusDatabase = SqliteEventBusDatabase(db_path)
|
|
27
|
+
self._running = False
|
|
28
|
+
self._process_id = str(os.getpid())
|
|
29
|
+
|
|
30
|
+
def emit(self, event_type: str, data: Any) -> None:
|
|
31
|
+
"""Emit an event to all subscribers.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
event_type: Type of event (e.g., "log")
|
|
35
|
+
data: Event data
|
|
36
|
+
"""
|
|
37
|
+
logger.debug(f"[CROSS_PROCESS_EMIT] Emitting event type: {event_type}")
|
|
38
|
+
|
|
39
|
+
# Call local listeners immediately
|
|
40
|
+
logger.debug(f"[CROSS_PROCESS_EMIT] Calling {len(self._listeners)} local listeners")
|
|
41
|
+
super().emit(event_type, data)
|
|
42
|
+
logger.debug("[CROSS_PROCESS_EMIT] Completed local listener calls")
|
|
43
|
+
|
|
44
|
+
# Publish to cross-process subscribers
|
|
45
|
+
logger.debug("[CROSS_PROCESS_EMIT] Publishing to cross-process subscribers")
|
|
46
|
+
self._publish_cross_process(event_type, data)
|
|
47
|
+
logger.debug("[CROSS_PROCESS_EMIT] Completed cross-process publish")
|
|
48
|
+
|
|
49
|
+
def _publish_cross_process(self, event_type: str, data: Any) -> None:
|
|
50
|
+
"""Publish event to cross-process subscribers via database."""
|
|
51
|
+
logger.debug(f"[CROSS_PROCESS_PUBLISH] Publishing event {event_type} to database")
|
|
52
|
+
try:
|
|
53
|
+
self._db.publish_event(event_type, data, self._process_id)
|
|
54
|
+
logger.debug(f"[CROSS_PROCESS_PUBLISH] Successfully published event {event_type} to database")
|
|
55
|
+
except Exception as e:
|
|
56
|
+
logger.error(f"[CROSS_PROCESS_PUBLISH] Failed to publish event {event_type} to database: {e}")
|
|
57
|
+
|
|
58
|
+
def start_listening(self) -> None:
|
|
59
|
+
"""Start listening for cross-process events."""
|
|
60
|
+
if self._running:
|
|
61
|
+
logger.debug("[CROSS_PROCESS_LISTEN] Already listening, skipping start")
|
|
62
|
+
return
|
|
63
|
+
|
|
64
|
+
logger.debug("[CROSS_PROCESS_LISTEN] Starting cross-process event listening")
|
|
65
|
+
self._running = True
|
|
66
|
+
loop = asyncio.get_running_loop()
|
|
67
|
+
loop.create_task(self._database_listener_task())
|
|
68
|
+
logger.debug("[CROSS_PROCESS_LISTEN] Started async database listener task")
|
|
69
|
+
|
|
70
|
+
def stop_listening(self) -> None:
|
|
71
|
+
"""Stop listening for cross-process events."""
|
|
72
|
+
logger.debug("[CROSS_PROCESS_LISTEN] Stopping cross-process event listening")
|
|
73
|
+
self._running = False
|
|
74
|
+
|
|
75
|
+
async def _database_listener_task(self) -> None:
|
|
76
|
+
"""Single database listener task that processes events and recreates itself."""
|
|
77
|
+
if not self._running:
|
|
78
|
+
# this should end the task loop
|
|
79
|
+
logger.debug("[CROSS_PROCESS_LISTENER] Stopping database listener task")
|
|
80
|
+
return
|
|
81
|
+
|
|
82
|
+
# Get unprocessed events from other processes
|
|
83
|
+
events = self._db.get_unprocessed_events(str(self._process_id))
|
|
84
|
+
if events:
|
|
85
|
+
logger.debug(f"[CROSS_PROCESS_LISTENER] Found {len(events)} unprocessed events")
|
|
86
|
+
else:
|
|
87
|
+
logger.debug(f"[CROSS_PROCESS_LISTENER] No unprocessed events found for process {self._process_id}")
|
|
88
|
+
|
|
89
|
+
for event in events:
|
|
90
|
+
logger.debug(
|
|
91
|
+
f"[CROSS_PROCESS_LISTENER] Processing event {event['event_id']} of type {event['event_type']}"
|
|
92
|
+
)
|
|
93
|
+
# Handle the event
|
|
94
|
+
self._handle_cross_process_event(event["event_type"], event["data"])
|
|
95
|
+
logger.debug(f"[CROSS_PROCESS_LISTENER] Successfully processed event {event['event_id']}")
|
|
96
|
+
|
|
97
|
+
# Mark as processed
|
|
98
|
+
self._db.mark_event_processed(event["event_id"])
|
|
99
|
+
logger.debug(f"[CROSS_PROCESS_LISTENER] Marked event {event['event_id']} as processed")
|
|
100
|
+
|
|
101
|
+
# Clean up old events every hour
|
|
102
|
+
current_time = time.time()
|
|
103
|
+
if not hasattr(self, "_last_cleanup"):
|
|
104
|
+
self._last_cleanup = current_time
|
|
105
|
+
elif current_time - self._last_cleanup >= 3600:
|
|
106
|
+
logger.debug("[CROSS_PROCESS_LISTENER] Cleaning up old events")
|
|
107
|
+
self._db.cleanup_old_events()
|
|
108
|
+
self._last_cleanup = current_time
|
|
109
|
+
|
|
110
|
+
# Schedule the next task if still running
|
|
111
|
+
await asyncio.sleep(1.0)
|
|
112
|
+
loop = asyncio.get_running_loop()
|
|
113
|
+
loop.create_task(self._database_listener_task())
|
|
114
|
+
|
|
115
|
+
def _handle_cross_process_event(self, event_type: str, data: Any) -> None:
|
|
116
|
+
"""Handle events received from other processes."""
|
|
117
|
+
logger.debug(f"[CROSS_PROCESS_HANDLE] Handling cross-process event type: {event_type}")
|
|
118
|
+
logger.debug(f"[CROSS_PROCESS_HANDLE] Calling {len(self._listeners)} listeners")
|
|
119
|
+
|
|
120
|
+
for i, listener in enumerate(self._listeners):
|
|
121
|
+
try:
|
|
122
|
+
logger.debug(f"[CROSS_PROCESS_HANDLE] Calling listener {i}")
|
|
123
|
+
listener(event_type, data)
|
|
124
|
+
logger.debug(f"[CROSS_PROCESS_HANDLE] Successfully called listener {i}")
|
|
125
|
+
except Exception as e:
|
|
126
|
+
logger.debug(f"[CROSS_PROCESS_HANDLE] Cross-process event listener {i} failed for {event_type}: {e}")
|
{eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/event_bus/sqlite_event_bus_database.py
RENAMED
|
@@ -2,7 +2,7 @@ import time
|
|
|
2
2
|
from typing import Any, List
|
|
3
3
|
from uuid import uuid4
|
|
4
4
|
|
|
5
|
-
from peewee import CharField, DateTimeField, Model, SqliteDatabase
|
|
5
|
+
from peewee import BooleanField, CharField, DateTimeField, Model, SqliteDatabase
|
|
6
6
|
from playhouse.sqlite_ext import JSONField
|
|
7
7
|
|
|
8
8
|
from eval_protocol.event_bus.logger import logger
|
|
@@ -25,7 +25,7 @@ class SqliteEventBusDatabase:
|
|
|
25
25
|
data = JSONField()
|
|
26
26
|
timestamp = DateTimeField()
|
|
27
27
|
process_id = CharField()
|
|
28
|
-
processed =
|
|
28
|
+
processed = BooleanField(default=False) # Track if event has been processed
|
|
29
29
|
|
|
30
30
|
self._Event = Event
|
|
31
31
|
self._db.connect()
|
|
@@ -46,7 +46,7 @@ class SqliteEventBusDatabase:
|
|
|
46
46
|
data=serialized_data,
|
|
47
47
|
timestamp=time.time(),
|
|
48
48
|
process_id=process_id,
|
|
49
|
-
processed=
|
|
49
|
+
processed=False,
|
|
50
50
|
)
|
|
51
51
|
except Exception as e:
|
|
52
52
|
logger.warning(f"Failed to publish event to database: {e}")
|
|
@@ -56,7 +56,7 @@ class SqliteEventBusDatabase:
|
|
|
56
56
|
try:
|
|
57
57
|
query = (
|
|
58
58
|
self._Event.select()
|
|
59
|
-
.where((self._Event.process_id != process_id) & (self._Event.processed
|
|
59
|
+
.where((self._Event.process_id != process_id) & (~self._Event.processed))
|
|
60
60
|
.order_by(self._Event.timestamp)
|
|
61
61
|
)
|
|
62
62
|
|
|
@@ -80,7 +80,7 @@ class SqliteEventBusDatabase:
|
|
|
80
80
|
def mark_event_processed(self, event_id: str) -> None:
|
|
81
81
|
"""Mark an event as processed."""
|
|
82
82
|
try:
|
|
83
|
-
self._Event.update(processed=
|
|
83
|
+
self._Event.update(processed=True).where(self._Event.event_id == event_id).execute()
|
|
84
84
|
except Exception as e:
|
|
85
85
|
logger.debug(f"Failed to mark event as processed: {e}")
|
|
86
86
|
|
|
@@ -88,8 +88,6 @@ class SqliteEventBusDatabase:
|
|
|
88
88
|
"""Clean up old processed events."""
|
|
89
89
|
try:
|
|
90
90
|
cutoff_time = time.time() - (max_age_hours * 3600)
|
|
91
|
-
self._Event.delete().where(
|
|
92
|
-
(self._Event.processed == "true") & (self._Event.timestamp < cutoff_time)
|
|
93
|
-
).execute()
|
|
91
|
+
self._Event.delete().where((self._Event.processed) & (self._Event.timestamp < cutoff_time)).execute()
|
|
94
92
|
except Exception as e:
|
|
95
93
|
logger.debug(f"Failed to cleanup old events: {e}")
|
{eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/log_utils/elasticsearch_client.py
RENAMED
|
@@ -100,6 +100,25 @@ class ElasticsearchClient:
|
|
|
100
100
|
except Exception:
|
|
101
101
|
return False
|
|
102
102
|
|
|
103
|
+
def clear_index(self) -> bool:
|
|
104
|
+
"""Clear all documents from the index.
|
|
105
|
+
|
|
106
|
+
Returns:
|
|
107
|
+
bool: True if successful, False otherwise
|
|
108
|
+
"""
|
|
109
|
+
try:
|
|
110
|
+
# Delete all documents by query
|
|
111
|
+
response = self._make_request(
|
|
112
|
+
"POST", f"{self.index_url}/_delete_by_query", json_data={"query": {"match_all": {}}}
|
|
113
|
+
)
|
|
114
|
+
if response.status_code == 200:
|
|
115
|
+
# Refresh the index to ensure changes are visible
|
|
116
|
+
refresh_response = self._make_request("POST", f"{self.index_url}/_refresh")
|
|
117
|
+
return refresh_response.status_code == 200
|
|
118
|
+
return False
|
|
119
|
+
except Exception:
|
|
120
|
+
return False
|
|
121
|
+
|
|
103
122
|
def get_mapping(self) -> Optional[Dict[str, Any]]:
|
|
104
123
|
"""Get the index mapping.
|
|
105
124
|
|
|
@@ -2,7 +2,7 @@ import logging
|
|
|
2
2
|
import os
|
|
3
3
|
from concurrent.futures import ThreadPoolExecutor
|
|
4
4
|
from typing import Optional, Any, Dict
|
|
5
|
-
from datetime import datetime
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
6
|
|
|
7
7
|
from eval_protocol.types.remote_rollout_processor import ElasticsearchConfig
|
|
8
8
|
from .elasticsearch_client import ElasticsearchClient
|
|
@@ -36,8 +36,8 @@ class ElasticsearchDirectHttpHandler(logging.Handler):
|
|
|
36
36
|
def emit(self, record: logging.LogRecord) -> None:
|
|
37
37
|
"""Emit a log record by scheduling it for async transmission."""
|
|
38
38
|
try:
|
|
39
|
-
# Create proper ISO 8601 timestamp
|
|
40
|
-
timestamp = datetime.fromtimestamp(record.created).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
|
39
|
+
# Create proper ISO 8601 timestamp in UTC
|
|
40
|
+
timestamp = datetime.fromtimestamp(record.created, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
|
41
41
|
|
|
42
42
|
rollout_id = self._get_rollout_id(record)
|
|
43
43
|
logger.debug(f"Emitting log record: {record.getMessage()} with rollout_id: {rollout_id}")
|
|
@@ -30,6 +30,19 @@ if TYPE_CHECKING:
|
|
|
30
30
|
logger = logging.getLogger(__name__)
|
|
31
31
|
|
|
32
32
|
|
|
33
|
+
def enable_debug_mode():
|
|
34
|
+
"""Enable debug mode for all relevant loggers in the logs server system."""
|
|
35
|
+
# Set debug level for all relevant loggers
|
|
36
|
+
logger.setLevel(logging.DEBUG)
|
|
37
|
+
|
|
38
|
+
# Set debug level for event bus logger
|
|
39
|
+
from eval_protocol.event_bus.logger import logger as event_bus_logger
|
|
40
|
+
|
|
41
|
+
event_bus_logger.setLevel(logging.DEBUG)
|
|
42
|
+
|
|
43
|
+
print("Debug mode enabled for all relevant loggers")
|
|
44
|
+
|
|
45
|
+
|
|
33
46
|
class WebSocketManager:
|
|
34
47
|
"""Manages WebSocket connections and broadcasts messages."""
|
|
35
48
|
|
|
@@ -40,100 +53,152 @@ class WebSocketManager:
|
|
|
40
53
|
self._lock = threading.Lock()
|
|
41
54
|
|
|
42
55
|
async def connect(self, websocket: WebSocket):
|
|
56
|
+
logger.debug("[WEBSOCKET_CONNECT] New websocket connection attempt")
|
|
43
57
|
await websocket.accept()
|
|
44
58
|
with self._lock:
|
|
45
59
|
self.active_connections.append(websocket)
|
|
46
60
|
connection_count = len(self.active_connections)
|
|
47
|
-
logger.info(f"WebSocket connected. Total connections: {connection_count}")
|
|
61
|
+
logger.info(f"[WEBSOCKET_CONNECT] WebSocket connected. Total connections: {connection_count}")
|
|
62
|
+
|
|
63
|
+
logger.debug("[WEBSOCKET_CONNECT] Reading logs for initialization")
|
|
48
64
|
logs = default_logger.read()
|
|
65
|
+
logger.debug(f"[WEBSOCKET_CONNECT] Found {len(logs)} logs to send")
|
|
66
|
+
|
|
49
67
|
data = {
|
|
50
68
|
"type": "initialize_logs",
|
|
51
69
|
"logs": [log.model_dump(exclude_none=True, mode="json") for log in logs],
|
|
52
70
|
}
|
|
71
|
+
logger.debug("[WEBSOCKET_CONNECT] Sending initialization data")
|
|
53
72
|
await websocket.send_text(json.dumps(data))
|
|
73
|
+
logger.debug("[WEBSOCKET_CONNECT] Successfully sent initialization data")
|
|
54
74
|
|
|
55
75
|
def disconnect(self, websocket: WebSocket):
|
|
76
|
+
logger.debug("[WEBSOCKET_DISCONNECT] WebSocket disconnection")
|
|
56
77
|
with self._lock:
|
|
57
78
|
if websocket in self.active_connections:
|
|
58
79
|
self.active_connections.remove(websocket)
|
|
80
|
+
logger.debug("[WEBSOCKET_DISCONNECT] Removed websocket from active connections")
|
|
81
|
+
else:
|
|
82
|
+
logger.debug("[WEBSOCKET_DISCONNECT] Websocket was not in active connections")
|
|
59
83
|
connection_count = len(self.active_connections)
|
|
60
|
-
logger.info(f"WebSocket disconnected. Total connections: {connection_count}")
|
|
84
|
+
logger.info(f"[WEBSOCKET_DISCONNECT] WebSocket disconnected. Total connections: {connection_count}")
|
|
61
85
|
|
|
62
86
|
def broadcast_row_upserted(self, row: "EvaluationRow"):
|
|
63
87
|
"""Broadcast a row-upsert event to all connected clients.
|
|
64
88
|
|
|
65
89
|
Safe no-op if server loop is not running or there are no connections.
|
|
66
90
|
"""
|
|
91
|
+
rollout_id = row.execution_metadata.rollout_id if row.execution_metadata else "unknown"
|
|
92
|
+
logger.debug(f"[WEBSOCKET_BROADCAST] Starting broadcast for rollout_id: {rollout_id}")
|
|
93
|
+
|
|
94
|
+
with self._lock:
|
|
95
|
+
active_connections_count = len(self.active_connections)
|
|
96
|
+
logger.debug(f"[WEBSOCKET_BROADCAST] Active connections: {active_connections_count}")
|
|
97
|
+
|
|
67
98
|
try:
|
|
68
99
|
# Serialize pydantic model
|
|
100
|
+
logger.debug(f"[WEBSOCKET_BROADCAST] Serializing row for rollout_id: {rollout_id}")
|
|
69
101
|
json_message = json.dumps({"type": "log", "row": row.model_dump(exclude_none=True, mode="json")})
|
|
102
|
+
logger.debug(
|
|
103
|
+
f"[WEBSOCKET_BROADCAST] Successfully serialized message (length: {len(json_message)}) for rollout_id: {rollout_id}"
|
|
104
|
+
)
|
|
105
|
+
|
|
70
106
|
# Queue the message for broadcasting in the main event loop
|
|
107
|
+
logger.debug(f"[WEBSOCKET_BROADCAST] Queuing message for broadcast for rollout_id: {rollout_id}")
|
|
71
108
|
self._broadcast_queue.put(json_message)
|
|
109
|
+
logger.debug(f"[WEBSOCKET_BROADCAST] Successfully queued message for rollout_id: {rollout_id}")
|
|
72
110
|
except Exception as e:
|
|
73
|
-
logger.error(
|
|
111
|
+
logger.error(
|
|
112
|
+
f"[WEBSOCKET_BROADCAST] Failed to serialize row for broadcast for rollout_id {rollout_id}: {e}"
|
|
113
|
+
)
|
|
74
114
|
|
|
75
115
|
async def _start_broadcast_loop(self):
|
|
76
116
|
"""Start the broadcast loop that processes queued messages."""
|
|
117
|
+
logger.debug("[WEBSOCKET_BROADCAST_LOOP] Starting broadcast loop")
|
|
77
118
|
while True:
|
|
78
119
|
try:
|
|
79
120
|
# Wait for a message to be queued
|
|
121
|
+
logger.debug("[WEBSOCKET_BROADCAST_LOOP] Waiting for message from queue")
|
|
80
122
|
message_data = await asyncio.get_event_loop().run_in_executor(None, self._broadcast_queue.get)
|
|
123
|
+
logger.debug(
|
|
124
|
+
f"[WEBSOCKET_BROADCAST_LOOP] Retrieved message from queue (length: {len(str(message_data))})"
|
|
125
|
+
)
|
|
81
126
|
|
|
82
127
|
# Regular string message for all connections
|
|
128
|
+
logger.debug("[WEBSOCKET_BROADCAST_LOOP] Sending message to all connections")
|
|
83
129
|
await self._send_text_to_all_connections(str(message_data))
|
|
130
|
+
logger.debug("[WEBSOCKET_BROADCAST_LOOP] Successfully sent message to all connections")
|
|
84
131
|
|
|
85
132
|
except Exception as e:
|
|
86
|
-
logger.error(f"Error in broadcast loop: {e}")
|
|
133
|
+
logger.error(f"[WEBSOCKET_BROADCAST_LOOP] Error in broadcast loop: {e}")
|
|
87
134
|
await asyncio.sleep(0.1)
|
|
88
135
|
except asyncio.CancelledError:
|
|
89
|
-
logger.info("Broadcast loop cancelled")
|
|
136
|
+
logger.info("[WEBSOCKET_BROADCAST_LOOP] Broadcast loop cancelled")
|
|
90
137
|
break
|
|
91
138
|
|
|
92
139
|
async def _send_text_to_all_connections(self, text: str):
|
|
93
140
|
with self._lock:
|
|
94
141
|
connections = list(self.active_connections)
|
|
95
142
|
|
|
143
|
+
logger.debug(f"[WEBSOCKET_SEND] Attempting to send to {len(connections)} connections")
|
|
144
|
+
|
|
96
145
|
if not connections:
|
|
146
|
+
logger.debug("[WEBSOCKET_SEND] No connections available, skipping send")
|
|
97
147
|
return
|
|
98
148
|
|
|
99
149
|
tasks = []
|
|
100
150
|
failed_connections = []
|
|
101
151
|
|
|
102
|
-
for connection in connections:
|
|
152
|
+
for i, connection in enumerate(connections):
|
|
103
153
|
try:
|
|
154
|
+
logger.debug(f"[WEBSOCKET_SEND] Preparing to send to connection {i}")
|
|
104
155
|
tasks.append(connection.send_text(text))
|
|
105
156
|
except Exception as e:
|
|
106
|
-
logger.error(f"Failed to send
|
|
157
|
+
logger.error(f"[WEBSOCKET_SEND] Failed to prepare send to WebSocket {i}: {e}")
|
|
107
158
|
failed_connections.append(connection)
|
|
108
159
|
|
|
109
160
|
# Execute all sends in parallel
|
|
110
161
|
if tasks:
|
|
162
|
+
logger.debug(f"[WEBSOCKET_SEND] Executing {len(tasks)} parallel sends")
|
|
111
163
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
164
|
+
logger.debug("[WEBSOCKET_SEND] Completed parallel sends")
|
|
112
165
|
|
|
113
166
|
# Check for any exceptions that occurred during execution
|
|
114
167
|
for i, result in enumerate(results):
|
|
115
168
|
if isinstance(result, Exception):
|
|
116
|
-
logger.error(f"Failed to send text to WebSocket: {result}")
|
|
169
|
+
logger.error(f"[WEBSOCKET_SEND] Failed to send text to WebSocket {i}: {result}")
|
|
117
170
|
failed_connections.append(connections[i])
|
|
171
|
+
else:
|
|
172
|
+
logger.debug(f"[WEBSOCKET_SEND] Successfully sent to connection {i}")
|
|
118
173
|
|
|
119
174
|
# Remove all failed connections
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
175
|
+
if failed_connections:
|
|
176
|
+
logger.debug(f"[WEBSOCKET_SEND] Removing {len(failed_connections)} failed connections")
|
|
177
|
+
with self._lock:
|
|
178
|
+
for connection in failed_connections:
|
|
179
|
+
try:
|
|
180
|
+
self.active_connections.remove(connection)
|
|
181
|
+
except ValueError:
|
|
182
|
+
pass
|
|
126
183
|
|
|
127
184
|
def start_broadcast_loop(self):
|
|
128
185
|
"""Start the broadcast loop in the current event loop."""
|
|
129
186
|
if self._broadcast_task is None or self._broadcast_task.done():
|
|
187
|
+
logger.debug("[WEBSOCKET_BROADCAST_LOOP] Creating new broadcast task")
|
|
130
188
|
self._broadcast_task = asyncio.create_task(self._start_broadcast_loop())
|
|
189
|
+
logger.debug("[WEBSOCKET_BROADCAST_LOOP] Broadcast task created")
|
|
190
|
+
else:
|
|
191
|
+
logger.debug("[WEBSOCKET_BROADCAST_LOOP] Broadcast task already running")
|
|
131
192
|
|
|
132
193
|
def stop_broadcast_loop(self):
|
|
133
194
|
"""Stop the broadcast loop."""
|
|
134
195
|
if self._broadcast_task and not self._broadcast_task.done():
|
|
196
|
+
logger.debug("[WEBSOCKET_BROADCAST_LOOP] Cancelling broadcast task")
|
|
135
197
|
self._broadcast_task.cancel()
|
|
136
198
|
self._broadcast_task = None
|
|
199
|
+
logger.debug("[WEBSOCKET_BROADCAST_LOOP] Broadcast task cancelled")
|
|
200
|
+
else:
|
|
201
|
+
logger.debug("[WEBSOCKET_BROADCAST_LOOP] No active broadcast task to stop")
|
|
137
202
|
|
|
138
203
|
|
|
139
204
|
class EvaluationWatcher:
|
|
@@ -260,7 +325,12 @@ class LogsServer(ViteServer):
|
|
|
260
325
|
port: Optional[int] = 8000,
|
|
261
326
|
index_file: str = "index.html",
|
|
262
327
|
elasticsearch_config: Optional[ElasticsearchConfig] = None,
|
|
328
|
+
debug: bool = False,
|
|
263
329
|
):
|
|
330
|
+
# Enable debug mode if requested
|
|
331
|
+
if debug:
|
|
332
|
+
enable_debug_mode()
|
|
333
|
+
|
|
264
334
|
# Initialize WebSocket manager
|
|
265
335
|
self.websocket_manager = WebSocketManager()
|
|
266
336
|
|
|
@@ -304,9 +374,11 @@ class LogsServer(ViteServer):
|
|
|
304
374
|
logger.info(f" {methods} {path}")
|
|
305
375
|
|
|
306
376
|
# Subscribe to events and start listening for cross-process events
|
|
377
|
+
logger.debug("[LOGS_SERVER_INIT] Subscribing to event bus")
|
|
307
378
|
event_bus.subscribe(self._handle_event)
|
|
379
|
+
logger.debug("[LOGS_SERVER_INIT] Successfully subscribed to event bus")
|
|
308
380
|
|
|
309
|
-
logger.info(f"LogsServer initialized on {host}:{port}")
|
|
381
|
+
logger.info(f"[LOGS_SERVER_INIT] LogsServer initialized on {host}:{port}")
|
|
310
382
|
|
|
311
383
|
def _setup_websocket_routes(self):
|
|
312
384
|
"""Set up WebSocket routes for real-time communication."""
|
|
@@ -418,17 +490,34 @@ class LogsServer(ViteServer):
|
|
|
418
490
|
|
|
419
491
|
def _handle_event(self, event_type: str, data: Any) -> None:
|
|
420
492
|
"""Handle events from the event bus."""
|
|
493
|
+
logger.debug(f"[EVENT_BUS_RECEIVE] Received event type: {event_type}")
|
|
494
|
+
|
|
421
495
|
if event_type in [LOG_EVENT_TYPE]:
|
|
422
496
|
from eval_protocol.models import EvaluationRow
|
|
423
497
|
|
|
424
|
-
|
|
425
|
-
|
|
498
|
+
try:
|
|
499
|
+
logger.debug("[EVENT_BUS_RECEIVE] Processing LOG_EVENT_TYPE event")
|
|
500
|
+
data = EvaluationRow(**data)
|
|
501
|
+
rollout_id = data.execution_metadata.rollout_id if data.execution_metadata else "unknown"
|
|
502
|
+
logger.debug(f"[EVENT_BUS_RECEIVE] Successfully parsed EvaluationRow for rollout_id: {rollout_id}")
|
|
503
|
+
|
|
504
|
+
logger.debug("[EVENT_BUS_RECEIVE] Broadcasting row_upserted to websocket manager")
|
|
505
|
+
self.websocket_manager.broadcast_row_upserted(data)
|
|
506
|
+
logger.debug(f"[EVENT_BUS_RECEIVE] Successfully queued broadcast for rollout_id: {rollout_id}")
|
|
507
|
+
except Exception as e:
|
|
508
|
+
logger.error(f"[EVENT_BUS_RECEIVE] Failed to process LOG_EVENT_TYPE event: {e}")
|
|
509
|
+
else:
|
|
510
|
+
logger.debug(f"[EVENT_BUS_RECEIVE] Ignoring event type: {event_type} (not LOG_EVENT_TYPE)")
|
|
426
511
|
|
|
427
512
|
def start_loops(self):
|
|
428
513
|
"""Start the broadcast loop and evaluation watcher."""
|
|
514
|
+
logger.debug("[LOGS_SERVER_LOOPS] Starting all loops")
|
|
429
515
|
self.websocket_manager.start_broadcast_loop()
|
|
516
|
+
logger.debug("[LOGS_SERVER_LOOPS] Started websocket broadcast loop")
|
|
430
517
|
self.evaluation_watcher.start()
|
|
518
|
+
logger.debug("[LOGS_SERVER_LOOPS] Started evaluation watcher")
|
|
431
519
|
event_bus.start_listening()
|
|
520
|
+
logger.debug("[LOGS_SERVER_LOOPS] Started event bus listening")
|
|
432
521
|
|
|
433
522
|
async def run_async(self):
|
|
434
523
|
"""
|
|
@@ -477,6 +566,7 @@ def create_app(
|
|
|
477
566
|
port: int = 8000,
|
|
478
567
|
build_dir: Optional[str] = None,
|
|
479
568
|
elasticsearch_config: Optional[ElasticsearchConfig] = None,
|
|
569
|
+
debug: bool = False,
|
|
480
570
|
) -> FastAPI:
|
|
481
571
|
"""
|
|
482
572
|
Factory function to create a FastAPI app instance and start the server with async loops.
|
|
@@ -498,17 +588,21 @@ def create_app(
|
|
|
498
588
|
os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "vite-app", "dist")
|
|
499
589
|
)
|
|
500
590
|
|
|
501
|
-
server = LogsServer(host=host, port=port, build_dir=build_dir, elasticsearch_config=elasticsearch_config)
|
|
591
|
+
server = LogsServer(
|
|
592
|
+
host=host, port=port, build_dir=build_dir, elasticsearch_config=elasticsearch_config, debug=debug
|
|
593
|
+
)
|
|
502
594
|
server.start_loops()
|
|
503
595
|
return server.app
|
|
504
596
|
|
|
505
597
|
|
|
506
598
|
# For backward compatibility and direct usage
|
|
507
|
-
def serve_logs(port: Optional[int] = None, elasticsearch_config: Optional[ElasticsearchConfig] = None):
|
|
599
|
+
def serve_logs(
|
|
600
|
+
port: Optional[int] = None, elasticsearch_config: Optional[ElasticsearchConfig] = None, debug: bool = False
|
|
601
|
+
):
|
|
508
602
|
"""
|
|
509
603
|
Convenience function to create and run a LogsServer.
|
|
510
604
|
"""
|
|
511
|
-
server = LogsServer(port=port, elasticsearch_config=elasticsearch_config)
|
|
605
|
+
server = LogsServer(port=port, elasticsearch_config=elasticsearch_config, debug=debug)
|
|
512
606
|
server.run()
|
|
513
607
|
|
|
514
608
|
|
|
@@ -519,17 +613,27 @@ if __name__ == "__main__":
|
|
|
519
613
|
parser.add_argument("--host", default="localhost", help="Host to bind to (default: localhost)")
|
|
520
614
|
parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
|
|
521
615
|
parser.add_argument("--build-dir", help="Path to Vite build directory")
|
|
616
|
+
parser.add_argument("--debug", help="Set logger level to DEBUG")
|
|
522
617
|
|
|
523
618
|
args = parser.parse_args()
|
|
524
619
|
|
|
620
|
+
if args.debug:
|
|
621
|
+
enable_debug_mode()
|
|
622
|
+
|
|
525
623
|
elasticsearch_config = ElasticsearchSetup().setup_elasticsearch()
|
|
526
624
|
|
|
527
625
|
# Create server with command line arguments
|
|
528
626
|
if args.build_dir:
|
|
529
627
|
server = LogsServer(
|
|
530
|
-
host=args.host, port=args.port, build_dir=args.build_dir, elasticsearch_config=elasticsearch_config
|
|
628
|
+
host=args.host,
|
|
629
|
+
port=args.port,
|
|
630
|
+
build_dir=args.build_dir,
|
|
631
|
+
elasticsearch_config=elasticsearch_config,
|
|
632
|
+
debug=bool(args.debug),
|
|
531
633
|
)
|
|
532
634
|
else:
|
|
533
|
-
server = LogsServer(host=args.host, port=args.port, elasticsearch_config=elasticsearch_config)
|
|
635
|
+
server = LogsServer(
|
|
636
|
+
host=args.host, port=args.port, elasticsearch_config=elasticsearch_config, debug=bool(args.debug)
|
|
637
|
+
)
|
|
534
638
|
|
|
535
639
|
server.run()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.43
|
|
3
|
+
Version: 0.2.44
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -265,6 +265,7 @@ tests/test_evaluation_integration.py
|
|
|
265
265
|
tests/test_evaluation_postprocess.py
|
|
266
266
|
tests/test_evaluation_preview_integration.py
|
|
267
267
|
tests/test_event_bus.py
|
|
268
|
+
tests/test_event_bus_helper.py
|
|
268
269
|
tests/test_examples_end_to_end.py
|
|
269
270
|
tests/test_fireworks_api.py
|
|
270
271
|
tests/test_format.py
|