eval-protocol 0.2.57__tar.gz → 0.2.57.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.57/eval_protocol.egg-info → eval_protocol-0.2.57.dev2}/PKG-INFO +1 -1
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/__init__.py +0 -2
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/fireworks_tracing.py +49 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/cli.py +10 -0
- eval_protocol-0.2.57.dev2/eval_protocol/cli_commands/logs.py +57 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/upload.py +15 -19
- eval_protocol-0.2.57.dev2/eval_protocol/event_bus/__init__.py +25 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +11 -0
- eval_protocol-0.2.57.dev2/eval_protocol/log_utils/fireworks_tracing_http_handler.py +138 -0
- eval_protocol-0.2.57.dev2/eval_protocol/log_utils/init.py +69 -0
- eval_protocol-0.2.57.dev2/eval_protocol/log_utils/rollout_context.py +84 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/redis_utils.py +11 -2
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/evaluation_test.py +48 -14
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/remote_rollout_processor.py +37 -65
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/tracing_utils.py +0 -2
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/logs_server.py +78 -5
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/SOURCES.txt +2 -0
- eval_protocol-0.2.57/eval_protocol/cli_commands/logs.py +0 -76
- eval_protocol-0.2.57/eval_protocol/event_bus/__init__.py +0 -5
- eval_protocol-0.2.57/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -63
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/LICENSE +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/README.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/development/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/pyproject.toml +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/setup.cfg +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/setup.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_config.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_format.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_length.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_math.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_models.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_server.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/versioneer.py +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vite-app/dist/assets/index-BnDJont9.css +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vite-app/dist/assets/index-Cu9t0G5i.js +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vite-app/dist/assets/index-Cu9t0G5i.js.map +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.57
|
|
3
|
+
Version: 0.2.57.dev2
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -36,7 +36,6 @@ from .utils.evaluation_row_utils import (
|
|
|
36
36
|
filter_longest_conversation,
|
|
37
37
|
)
|
|
38
38
|
from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor, GithubActionRolloutProcessor
|
|
39
|
-
from .pytest.remote_rollout_processor import create_elasticsearch_config_from_env
|
|
40
39
|
from .pytest.parameterize import DefaultParameterIdGenerator
|
|
41
40
|
from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
|
|
42
41
|
from .log_utils.rollout_id_filter import RolloutIdFilter
|
|
@@ -90,7 +89,6 @@ except ImportError:
|
|
|
90
89
|
warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
|
|
91
90
|
|
|
92
91
|
__all__ = [
|
|
93
|
-
"create_elasticsearch_config_from_env",
|
|
94
92
|
"ElasticsearchConfig",
|
|
95
93
|
"ElasticsearchDirectHttpHandler",
|
|
96
94
|
"RolloutIdFilter",
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-10-
|
|
11
|
+
"date": "2025-10-21T14:44:45-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.57"
|
|
14
|
+
"full-revisionid": "5a0eb89e557f1362bc17acd8a02c25a072dc3092",
|
|
15
|
+
"version": "0.2.57-dev2"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
{eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/fireworks_tracing.py
RENAMED
|
@@ -265,6 +265,55 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
265
265
|
self.base_url = base_url.rstrip("/")
|
|
266
266
|
self.timeout = timeout
|
|
267
267
|
|
|
268
|
+
def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -> List[Dict[str, Any]]:
|
|
269
|
+
"""Fetch logs from Fireworks tracing gateway /logs endpoint.
|
|
270
|
+
|
|
271
|
+
Returns entries with keys: timestamp, message, severity, tags.
|
|
272
|
+
"""
|
|
273
|
+
if not tags:
|
|
274
|
+
raise ValueError("At least one tag is required to fetch logs")
|
|
275
|
+
|
|
276
|
+
headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
|
|
277
|
+
params: Dict[str, Any] = {"tags": tags, "limit": limit, "hours_back": hours_back, "program": "eval_protocol"}
|
|
278
|
+
|
|
279
|
+
# Try /logs first, fall back to /v1/logs if not found
|
|
280
|
+
urls_to_try = [f"{self.base_url}/logs", f"{self.base_url}/v1/logs"]
|
|
281
|
+
data: Dict[str, Any] = {}
|
|
282
|
+
last_error: Optional[str] = None
|
|
283
|
+
for url in urls_to_try:
|
|
284
|
+
try:
|
|
285
|
+
response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
|
|
286
|
+
if response.status_code == 404:
|
|
287
|
+
# Try next variant
|
|
288
|
+
last_error = f"404 for {url}"
|
|
289
|
+
continue
|
|
290
|
+
response.raise_for_status()
|
|
291
|
+
data = response.json() or {}
|
|
292
|
+
break
|
|
293
|
+
except requests.exceptions.RequestException as e:
|
|
294
|
+
last_error = str(e)
|
|
295
|
+
continue
|
|
296
|
+
else:
|
|
297
|
+
# All attempts failed
|
|
298
|
+
if last_error:
|
|
299
|
+
logger.error("Failed to fetch logs from Fireworks (tried %s): %s", urls_to_try, last_error)
|
|
300
|
+
return []
|
|
301
|
+
|
|
302
|
+
entries: List[Dict[str, Any]] = data.get("entries", []) or []
|
|
303
|
+
# Normalize minimal shape
|
|
304
|
+
results: List[Dict[str, Any]] = []
|
|
305
|
+
for e in entries:
|
|
306
|
+
results.append(
|
|
307
|
+
{
|
|
308
|
+
"timestamp": e.get("timestamp"),
|
|
309
|
+
"message": e.get("message"),
|
|
310
|
+
"severity": e.get("severity", "INFO"),
|
|
311
|
+
"tags": e.get("tags", []),
|
|
312
|
+
"status": e.get("status"),
|
|
313
|
+
}
|
|
314
|
+
)
|
|
315
|
+
return results
|
|
316
|
+
|
|
268
317
|
def get_evaluation_rows(
|
|
269
318
|
self,
|
|
270
319
|
tags: List[str],
|
|
@@ -307,6 +307,16 @@ def parse_args(args=None):
|
|
|
307
307
|
action="store_true",
|
|
308
308
|
help="Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME)",
|
|
309
309
|
)
|
|
310
|
+
logs_parser.add_argument(
|
|
311
|
+
"--use-fireworks",
|
|
312
|
+
action="store_true",
|
|
313
|
+
help="Force Fireworks tracing backend for logs UI (overrides env auto-detection)",
|
|
314
|
+
)
|
|
315
|
+
logs_parser.add_argument(
|
|
316
|
+
"--use-elasticsearch",
|
|
317
|
+
action="store_true",
|
|
318
|
+
help="Force Elasticsearch backend for logs UI (overrides env auto-detection)",
|
|
319
|
+
)
|
|
310
320
|
|
|
311
321
|
# Upload command
|
|
312
322
|
upload_parser = subparsers.add_parser(
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI command for serving logs with file watching and real-time updates.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from ..utils.logs_server import serve_logs
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def logs_command(args):
|
|
13
|
+
"""Serve logs with file watching and real-time updates"""
|
|
14
|
+
|
|
15
|
+
port = args.port
|
|
16
|
+
print("🚀 Starting Eval Protocol Logs Server")
|
|
17
|
+
print(f"🌐 URL: http://localhost:{port}")
|
|
18
|
+
print(f"🔌 WebSocket: ws://localhost:{port}/ws")
|
|
19
|
+
print(f"👀 Watching paths: {['current directory']}")
|
|
20
|
+
print(f"🔍 Debug mode: {args.debug}")
|
|
21
|
+
print("Press Ctrl+C to stop the server")
|
|
22
|
+
print("-" * 50)
|
|
23
|
+
|
|
24
|
+
# Backend selection: Fireworks first when API key present, unless overridden
|
|
25
|
+
use_fireworks = False
|
|
26
|
+
if getattr(args, "use_fireworks", False):
|
|
27
|
+
use_fireworks = True
|
|
28
|
+
elif getattr(args, "use_elasticsearch", False):
|
|
29
|
+
use_fireworks = False
|
|
30
|
+
else:
|
|
31
|
+
use_fireworks = bool(os.environ.get("FIREWORKS_API_KEY"))
|
|
32
|
+
|
|
33
|
+
# Setup backend configs
|
|
34
|
+
elasticsearch_config = None
|
|
35
|
+
# Prefer explicit FW_TRACING_GATEWAY_BASE_URL, then GATEWAY_URL from env (remote validation),
|
|
36
|
+
# finally default to public tracing.fireworks.ai
|
|
37
|
+
fireworks_base_url = (
|
|
38
|
+
os.environ.get("FW_TRACING_GATEWAY_BASE_URL")
|
|
39
|
+
or os.environ.get("GATEWAY_URL")
|
|
40
|
+
or "https://tracing.fireworks.ai"
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
try:
|
|
44
|
+
serve_logs(
|
|
45
|
+
port=args.port,
|
|
46
|
+
elasticsearch_config=elasticsearch_config,
|
|
47
|
+
debug=args.debug,
|
|
48
|
+
backend="fireworks" if use_fireworks else "elasticsearch",
|
|
49
|
+
fireworks_base_url=fireworks_base_url if use_fireworks else None,
|
|
50
|
+
)
|
|
51
|
+
return 0
|
|
52
|
+
except KeyboardInterrupt:
|
|
53
|
+
print("\n🛑 Server stopped by user")
|
|
54
|
+
return 0
|
|
55
|
+
except Exception as e:
|
|
56
|
+
print(f"❌ Error starting server: {e}")
|
|
57
|
+
return 1
|
|
@@ -267,32 +267,29 @@ def _parse_entry(entry: str, cwd: str) -> tuple[str, str]:
|
|
|
267
267
|
def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]:
|
|
268
268
|
target, func = _parse_entry(entry, cwd)
|
|
269
269
|
|
|
270
|
-
#
|
|
270
|
+
# Determine the file path to load
|
|
271
271
|
if "/" in target or "\\" in target or os.path.exists(target):
|
|
272
|
-
# It's a file path - convert to absolute
|
|
272
|
+
# It's a file path - convert to absolute
|
|
273
273
|
if not os.path.isabs(target):
|
|
274
274
|
target = os.path.abspath(os.path.join(cwd, target))
|
|
275
|
-
|
|
276
275
|
if not target.endswith(".py"):
|
|
277
276
|
target = target + ".py"
|
|
278
|
-
|
|
279
277
|
if not os.path.isfile(target):
|
|
280
278
|
raise ValueError(f"File not found: {target}")
|
|
281
|
-
|
|
282
|
-
# Import module from file path
|
|
283
|
-
spec = importlib.util.spec_from_file_location(Path(target).stem, target)
|
|
284
|
-
if not spec or not spec.loader:
|
|
285
|
-
raise ValueError(f"Unable to load module from path: {target}")
|
|
286
|
-
module = importlib.util.module_from_spec(spec)
|
|
287
|
-
sys.modules[spec.name] = module
|
|
288
|
-
spec.loader.exec_module(module) # type: ignore[attr-defined]
|
|
289
|
-
module_name = spec.name
|
|
290
279
|
source_file_path = target
|
|
291
280
|
else:
|
|
292
|
-
# Treat as
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
281
|
+
# Treat dotted name as a file path
|
|
282
|
+
dotted_as_path = target.replace(".", "/") + ".py"
|
|
283
|
+
source_file_path = os.path.join(cwd, dotted_as_path)
|
|
284
|
+
|
|
285
|
+
# Load the module from the file path
|
|
286
|
+
spec = importlib.util.spec_from_file_location(Path(source_file_path).stem, source_file_path)
|
|
287
|
+
if not spec or not spec.loader:
|
|
288
|
+
raise ValueError(f"Unable to load module from path: {source_file_path}")
|
|
289
|
+
module = importlib.util.module_from_spec(spec)
|
|
290
|
+
sys.modules[spec.name] = module
|
|
291
|
+
spec.loader.exec_module(module) # type: ignore[attr-defined]
|
|
292
|
+
module_name = spec.name
|
|
296
293
|
|
|
297
294
|
if not hasattr(module, func):
|
|
298
295
|
raise ValueError(f"Function '{func}' not found in module '{module_name}'")
|
|
@@ -591,8 +588,7 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
591
588
|
|
|
592
589
|
print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
|
|
593
590
|
try:
|
|
594
|
-
|
|
595
|
-
test_dir = os.path.dirname(source_file_path) if source_file_path else root
|
|
591
|
+
test_dir = root
|
|
596
592
|
metric_name = os.path.basename(test_dir) or "metric"
|
|
597
593
|
result = create_evaluation(
|
|
598
594
|
evaluator_id=evaluator_id,
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Global event bus instance - uses SqliteEventBus for cross-process functionality
|
|
2
|
+
from eval_protocol.event_bus.event_bus import EventBus
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def _get_default_event_bus():
|
|
6
|
+
from eval_protocol.event_bus.sqlite_event_bus import SqliteEventBus
|
|
7
|
+
|
|
8
|
+
return SqliteEventBus()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Lazy property that creates the event bus only when accessed
|
|
12
|
+
class _LazyEventBus(EventBus):
|
|
13
|
+
def __init__(self):
|
|
14
|
+
self._event_bus: EventBus | None = None
|
|
15
|
+
|
|
16
|
+
def _get_event_bus(self):
|
|
17
|
+
if self._event_bus is None:
|
|
18
|
+
self._event_bus = _get_default_event_bus()
|
|
19
|
+
return self._event_bus
|
|
20
|
+
|
|
21
|
+
def __getattr__(self, name):
|
|
22
|
+
return getattr(self._get_event_bus(), name)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
event_bus: EventBus = _LazyEventBus()
|
|
@@ -60,6 +60,17 @@ class ElasticsearchDirectHttpHandler(logging.Handler):
|
|
|
60
60
|
if status_info:
|
|
61
61
|
data.update(status_info)
|
|
62
62
|
|
|
63
|
+
# Optional correlation enrichment
|
|
64
|
+
experiment_id = getattr(record, "experiment_id", None)
|
|
65
|
+
if experiment_id is not None:
|
|
66
|
+
data["experiment_id"] = experiment_id
|
|
67
|
+
run_id = getattr(record, "run_id", None)
|
|
68
|
+
if run_id is not None:
|
|
69
|
+
data["run_id"] = run_id
|
|
70
|
+
rollout_ids = getattr(record, "rollout_ids", None)
|
|
71
|
+
if rollout_ids is not None:
|
|
72
|
+
data["rollout_ids"] = rollout_ids
|
|
73
|
+
|
|
63
74
|
# Schedule the HTTP request to run asynchronously
|
|
64
75
|
self._schedule_async_send(data, record)
|
|
65
76
|
except Exception as e:
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import threading
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from typing import Optional, Any, Dict, List, cast
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FireworksTracingHttpHandler(logging.Handler):
|
|
11
|
+
"""Logging handler that posts structured logs to tracing.fireworks gateway /logs endpoint."""
|
|
12
|
+
|
|
13
|
+
def __init__(self, gateway_base_url: Optional[str] = None, rollout_id_env: str = "EP_ROLLOUT_ID") -> None:
|
|
14
|
+
super().__init__()
|
|
15
|
+
self.gateway_base_url = (
|
|
16
|
+
gateway_base_url or os.getenv("FW_TRACING_GATEWAY_BASE_URL") or "https://tracing.fireworks.ai"
|
|
17
|
+
)
|
|
18
|
+
self.rollout_id_env = rollout_id_env
|
|
19
|
+
self._session = requests.Session()
|
|
20
|
+
self._lock = threading.Lock()
|
|
21
|
+
# Include Authorization header if FIREWORKS_API_KEY is available
|
|
22
|
+
api_key = os.environ.get("FIREWORKS_API_KEY")
|
|
23
|
+
if api_key:
|
|
24
|
+
try:
|
|
25
|
+
self._session.headers.update({"Authorization": f"Bearer {api_key}"})
|
|
26
|
+
except Exception:
|
|
27
|
+
pass
|
|
28
|
+
|
|
29
|
+
def emit(self, record: logging.LogRecord) -> None:
|
|
30
|
+
try:
|
|
31
|
+
if not self.gateway_base_url:
|
|
32
|
+
return
|
|
33
|
+
rollout_id = self._get_rollout_id(record)
|
|
34
|
+
if not rollout_id:
|
|
35
|
+
return
|
|
36
|
+
payload = self._build_payload(record, rollout_id)
|
|
37
|
+
base = self.gateway_base_url.rstrip("/")
|
|
38
|
+
url = f"{base}/logs"
|
|
39
|
+
# Optional debug prints to aid local diagnostics
|
|
40
|
+
if os.environ.get("EP_DEBUG") == "true":
|
|
41
|
+
try:
|
|
42
|
+
tags_val = payload.get("tags")
|
|
43
|
+
tags_len = len(tags_val) if isinstance(tags_val, list) else 0
|
|
44
|
+
msg_val = payload.get("message")
|
|
45
|
+
msg_preview = msg_val[:80] if isinstance(msg_val, str) else msg_val
|
|
46
|
+
print(f"[FW_LOG] POST {url} rollout_id={rollout_id} tags={tags_len} msg={msg_preview}")
|
|
47
|
+
except Exception:
|
|
48
|
+
pass
|
|
49
|
+
with self._lock:
|
|
50
|
+
resp = self._session.post(url, json=payload, timeout=5)
|
|
51
|
+
if os.environ.get("EP_DEBUG") == "true":
|
|
52
|
+
try:
|
|
53
|
+
print(f"[FW_LOG] resp={resp.status_code}")
|
|
54
|
+
except Exception:
|
|
55
|
+
pass
|
|
56
|
+
# Fallback to /v1/logs if /logs is not found
|
|
57
|
+
if resp is not None and getattr(resp, "status_code", None) == 404:
|
|
58
|
+
alt = f"{base}/v1/logs"
|
|
59
|
+
if os.environ.get("EP_DEBUG") == "true":
|
|
60
|
+
try:
|
|
61
|
+
tags_val = payload.get("tags")
|
|
62
|
+
tags_len = len(tags_val) if isinstance(tags_val, list) else 0
|
|
63
|
+
print(f"[FW_LOG] RETRY POST {alt} rollout_id={rollout_id} tags={tags_len}")
|
|
64
|
+
except Exception:
|
|
65
|
+
pass
|
|
66
|
+
with self._lock:
|
|
67
|
+
resp2 = self._session.post(alt, json=payload, timeout=5)
|
|
68
|
+
if os.environ.get("EP_DEBUG") == "true":
|
|
69
|
+
try:
|
|
70
|
+
print(f"[FW_LOG] retry resp={resp2.status_code}")
|
|
71
|
+
except Exception:
|
|
72
|
+
pass
|
|
73
|
+
except Exception:
|
|
74
|
+
# Avoid raising exceptions from logging
|
|
75
|
+
self.handleError(record)
|
|
76
|
+
|
|
77
|
+
def _get_rollout_id(self, record: logging.LogRecord) -> Optional[str]:
|
|
78
|
+
if hasattr(record, "rollout_id") and cast(Any, getattr(record, "rollout_id")) is not None:
|
|
79
|
+
return str(cast(Any, getattr(record, "rollout_id")))
|
|
80
|
+
return os.getenv(self.rollout_id_env)
|
|
81
|
+
|
|
82
|
+
def _get_status_info(self, record: logging.LogRecord) -> Optional[Dict[str, Any]]:
|
|
83
|
+
"""Extract status information from the log record's extra data."""
|
|
84
|
+
# Check if 'status' is in the extra data (passed via extra parameter)
|
|
85
|
+
if hasattr(record, "status") and record.status is not None: # type: ignore
|
|
86
|
+
status = record.status # type: ignore
|
|
87
|
+
|
|
88
|
+
# Handle Status class instances (Pydantic BaseModel)
|
|
89
|
+
if hasattr(status, "code") and hasattr(status, "message"):
|
|
90
|
+
# Status object - extract code and message
|
|
91
|
+
status_code = status.code
|
|
92
|
+
# Handle both enum values and direct integer values
|
|
93
|
+
if hasattr(status_code, "value"):
|
|
94
|
+
status_code = status_code.value
|
|
95
|
+
|
|
96
|
+
return {
|
|
97
|
+
"code": status_code,
|
|
98
|
+
"message": status.message,
|
|
99
|
+
"details": getattr(status, "details", []),
|
|
100
|
+
}
|
|
101
|
+
elif isinstance(status, dict):
|
|
102
|
+
# Dictionary representation of status
|
|
103
|
+
return {
|
|
104
|
+
"code": status.get("code"),
|
|
105
|
+
"message": status.get("message"),
|
|
106
|
+
"details": status.get("details", []),
|
|
107
|
+
}
|
|
108
|
+
return None
|
|
109
|
+
|
|
110
|
+
def _build_payload(self, record: logging.LogRecord, rollout_id: str) -> Dict[str, Any]:
|
|
111
|
+
timestamp = datetime.fromtimestamp(record.created, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
|
112
|
+
message = record.getMessage()
|
|
113
|
+
tags: List[str] = [f"rollout_id:{rollout_id}"]
|
|
114
|
+
# Optional additional tags
|
|
115
|
+
if hasattr(record, "experiment_id") and cast(Any, getattr(record, "experiment_id")):
|
|
116
|
+
tags.append(f"experiment_id:{cast(Any, getattr(record, 'experiment_id'))}")
|
|
117
|
+
if hasattr(record, "run_id") and cast(Any, getattr(record, "run_id")):
|
|
118
|
+
tags.append(f"run_id:{cast(Any, getattr(record, 'run_id'))}")
|
|
119
|
+
# Groupwise list of rollout_ids
|
|
120
|
+
if hasattr(record, "rollout_ids") and cast(Any, getattr(record, "rollout_ids")):
|
|
121
|
+
try:
|
|
122
|
+
for rid in cast(List[str], getattr(record, "rollout_ids")):
|
|
123
|
+
tags.append(f"rollout_id:{rid}")
|
|
124
|
+
except Exception:
|
|
125
|
+
pass
|
|
126
|
+
program = cast(Optional[str], getattr(record, "program", None)) or "eval_protocol"
|
|
127
|
+
|
|
128
|
+
return {
|
|
129
|
+
"program": program,
|
|
130
|
+
"status": self._get_status_info(record),
|
|
131
|
+
"message": message,
|
|
132
|
+
"tags": tags,
|
|
133
|
+
"extras": {
|
|
134
|
+
"logger_name": record.name,
|
|
135
|
+
"level": record.levelname,
|
|
136
|
+
"timestamp": timestamp,
|
|
137
|
+
},
|
|
138
|
+
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from eval_protocol.log_utils.fireworks_tracing_http_handler import (
|
|
6
|
+
FireworksTracingHttpHandler,
|
|
7
|
+
)
|
|
8
|
+
from eval_protocol.log_utils.elasticsearch_direct_http_handler import (
|
|
9
|
+
ElasticsearchDirectHttpHandler,
|
|
10
|
+
)
|
|
11
|
+
from eval_protocol.log_utils.rollout_context import ContextRolloutIdFilter
|
|
12
|
+
from eval_protocol.types.remote_rollout_processor import ElasticsearchConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
_INITIALIZED = False
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _get_env(name: str) -> Optional[str]:
|
|
19
|
+
val = os.getenv(name)
|
|
20
|
+
return val if val and val.strip() else None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def init_external_logging_from_env() -> None:
|
|
24
|
+
"""
|
|
25
|
+
Initialize external logging sinks (Fireworks tracing, optional Elasticsearch) from env vars.
|
|
26
|
+
|
|
27
|
+
Idempotent: safe to call multiple times.
|
|
28
|
+
|
|
29
|
+
Environment variables:
|
|
30
|
+
- FW_TRACING_GATEWAY_BASE_URL: enable Fireworks tracing handler when set
|
|
31
|
+
- EP_ELASTICSEARCH_URL, EP_ELASTICSEARCH_API_KEY, EP_ELASTICSEARCH_INDEX: enable ES when all set
|
|
32
|
+
"""
|
|
33
|
+
global _INITIALIZED
|
|
34
|
+
if _INITIALIZED:
|
|
35
|
+
return
|
|
36
|
+
|
|
37
|
+
root_logger = logging.getLogger()
|
|
38
|
+
|
|
39
|
+
# Ensure we do not add duplicate handlers if already present
|
|
40
|
+
existing_handler_types = {type(h).__name__ for h in root_logger.handlers}
|
|
41
|
+
|
|
42
|
+
# Fireworks tracing: prefer if FIREWORKS_API_KEY is present; default base URL if not provided
|
|
43
|
+
fw_key = _get_env("FIREWORKS_API_KEY")
|
|
44
|
+
# Allow remote validation gateway to act as tracing base when provided
|
|
45
|
+
fw_url = _get_env("FW_TRACING_GATEWAY_BASE_URL") or _get_env("GATEWAY_URL") or "https://tracing.fireworks.ai"
|
|
46
|
+
if fw_key and "FireworksTracingHttpHandler" not in existing_handler_types:
|
|
47
|
+
fw_handler = FireworksTracingHttpHandler(gateway_base_url=fw_url)
|
|
48
|
+
fw_handler.setLevel(logging.INFO)
|
|
49
|
+
fw_handler.addFilter(ContextRolloutIdFilter())
|
|
50
|
+
root_logger.addHandler(fw_handler)
|
|
51
|
+
|
|
52
|
+
# Elasticsearch
|
|
53
|
+
es_url = _get_env("EP_ELASTICSEARCH_URL")
|
|
54
|
+
es_api_key = _get_env("EP_ELASTICSEARCH_API_KEY")
|
|
55
|
+
es_index = _get_env("EP_ELASTICSEARCH_INDEX")
|
|
56
|
+
if (
|
|
57
|
+
not fw_key
|
|
58
|
+
and es_url
|
|
59
|
+
and es_api_key
|
|
60
|
+
and es_index
|
|
61
|
+
and "ElasticsearchDirectHttpHandler" not in existing_handler_types
|
|
62
|
+
):
|
|
63
|
+
es_config = ElasticsearchConfig(url=es_url, api_key=es_api_key, index_name=es_index)
|
|
64
|
+
es_handler = ElasticsearchDirectHttpHandler(elasticsearch_config=es_config)
|
|
65
|
+
es_handler.setLevel(logging.INFO)
|
|
66
|
+
es_handler.addFilter(ContextRolloutIdFilter())
|
|
67
|
+
root_logger.addHandler(es_handler)
|
|
68
|
+
|
|
69
|
+
_INITIALIZED = True
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from contextlib import asynccontextmanager
|
|
4
|
+
from typing import List, Optional
|
|
5
|
+
|
|
6
|
+
import contextvars
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Correlation state for log records. ContextVars are used so that each
# concurrently running rollout can carry its own identifiers.

# Primary rollout identifier; None when no rollout context is active.
current_rollout_id: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar(
    "ep_rollout_id",
    default=None,
)

# Optional list of related rollout identifiers (e.g., groupwise mode).
current_rollout_ids: contextvars.ContextVar[Optional[List[str]]] = contextvars.ContextVar(
    "ep_rollout_ids",
    default=None,
)

# Optional experiment identifier used for tagging.
current_experiment_id: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar(
    "ep_experiment_id",
    default=None,
)

# Optional run identifier used for tagging.
current_run_id: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar(
    "ep_run_id",
    default=None,
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ContextRolloutIdFilter(logging.Filter):
    """
    Attach rollout correlation fields to a LogRecord, or reject the record.

    Meant to be installed ONLY on handlers that ship logs to an external sink
    (e.g., Fireworks or Elasticsearch). Records for which no rollout ID can be
    resolved are dropped for that handler so uncorrelated logs are not shipped.
    """

    def filter(self, record: logging.LogRecord) -> bool:  # type: ignore[override]
        """
        Resolve the rollout ID and copy correlation fields onto ``record``.

        The rollout ID is taken from the first truthy source, in order: the
        ``current_rollout_id`` ContextVar, an existing ``rollout_id`` attribute
        on the record, then the ``EP_ROLLOUT_ID`` environment variable.

        Returns:
            True when a rollout ID was found (the record is annotated),
            False otherwise (the record is dropped for this handler).
        """
        resolved_id = (
            current_rollout_id.get()
            or getattr(record, "rollout_id", None)
            or os.getenv("EP_ROLLOUT_ID")
        )
        if not resolved_id:
            # No correlation context → do not emit to external sink
            return False

        # The primary correlation field is always written.
        setattr(record, "rollout_id", resolved_id)

        # Secondary fields are copied only when their ContextVar holds a truthy value.
        secondary_fields = (
            ("rollout_ids", current_rollout_ids),
            ("experiment_id", current_experiment_id),
            ("run_id", current_run_id),
        )
        for attr_name, ctx_var in secondary_fields:
            ctx_value = ctx_var.get()
            if ctx_value:
                setattr(record, attr_name, ctx_value)

        return True
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@asynccontextmanager
async def rollout_logging_context(
    rollout_id: str,
    *,
    experiment_id: Optional[str] = None,
    run_id: Optional[str] = None,
    rollout_ids: Optional[List[str]] = None,
):
    """
    Bind correlation ContextVars for the duration of an ``async with`` block.

    ``rollout_id`` is always set. Each optional value is set only when it is
    not None; an omitted value leaves the corresponding ContextVar untouched.
    On exit, every variable that was set here is reset via its token.

    Args:
        rollout_id: Primary rollout identifier for correlation.
        experiment_id: Optional experiment ID for tagging.
        run_id: Optional run ID for tagging.
        rollout_ids: Optional list of related rollout IDs (e.g., groupwise mode).
    """
    # (variable, token) pairs, recorded in the order the variables were set.
    applied = [(current_rollout_id, current_rollout_id.set(rollout_id))]

    optional_bindings = (
        (current_rollout_ids, rollout_ids),
        (current_experiment_id, experiment_id),
        (current_run_id, run_id),
    )
    for ctx_var, value in optional_bindings:
        if value is not None:
            applied.append((ctx_var, ctx_var.set(value)))

    try:
        yield
    finally:
        # Reset only what was set above, in the same order.
        for ctx_var, token in applied:
            ctx_var.reset(token)
|
{eval_protocol-0.2.57 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/redis_utils.py
RENAMED
|
@@ -3,7 +3,7 @@ Redis utilities for tracking chat completions via insertion IDs.
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Set
|
|
6
|
+
from typing import Set, cast
|
|
7
7
|
import redis
|
|
8
8
|
|
|
9
9
|
logger = logging.getLogger(__name__)
|
|
@@ -40,7 +40,16 @@ def get_insertion_ids(redis_client: redis.Redis, rollout_id: str) -> Set[str]:
|
|
|
40
40
|
Set of insertion_id strings, empty set if none found or on error
|
|
41
41
|
"""
|
|
42
42
|
try:
|
|
43
|
-
|
|
43
|
+
raw = redis_client.smembers(rollout_id)
|
|
44
|
+
# Typing in redis stubs may be Awaitable[Set[Any]] | Set[Any]; at runtime this is a Set[bytes]
|
|
45
|
+
raw_ids = cast(Set[object], raw)
|
|
46
|
+
# Normalize to set[str]
|
|
47
|
+
insertion_ids: Set[str] = set()
|
|
48
|
+
for b in raw_ids:
|
|
49
|
+
try:
|
|
50
|
+
insertion_ids.add(b.decode("utf-8") if isinstance(b, (bytes, bytearray)) else cast(str, b))
|
|
51
|
+
except Exception:
|
|
52
|
+
continue
|
|
44
53
|
logger.debug(f"Found {len(insertion_ids)} expected insertion_ids for rollout {rollout_id}")
|
|
45
54
|
return insertion_ids
|
|
46
55
|
except Exception as e:
|