eval-protocol 0.2.85__tar.gz → 0.2.87__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.85/eval_protocol.egg-info → eval_protocol-0.2.87}/PKG-INFO +18 -94
- eval_protocol-0.2.87/README.md +39 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli.py +26 -9
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/create_rft.py +45 -18
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/local_test.py +25 -12
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/fireworks_rft.py +9 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87/eval_protocol.egg-info}/PKG-INFO +18 -94
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_cli_create_rft_infer.py +346 -18
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_cli_local_test.py +115 -4
- eval_protocol-0.2.85/README.md +0 -115
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/LICENSE +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/development/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol.egg-info/SOURCES.txt +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/pyproject.toml +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/setup.cfg +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/setup.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_config.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_format.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_length.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_math.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_models.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_server.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/versioneer.py +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.85
|
|
3
|
+
Version: 0.2.87
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP). EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -113,113 +113,37 @@ Requires-Dist: langfuse>=2.0.0; extra == "proxy"
|
|
|
113
113
|
Requires-Dist: uuid6>=2025.0.0; extra == "proxy"
|
|
114
114
|
Dynamic: license-file
|
|
115
115
|
|
|
116
|
-
# Eval Protocol
|
|
116
|
+
# Eval Protocol
|
|
117
117
|
|
|
118
118
|
[](https://pypi.org/project/eval-protocol/)
|
|
119
119
|
[](https://deepwiki.com/eval-protocol/python-sdk)
|
|
120
120
|
|
|
121
|
-
**
|
|
121
|
+
**Eval Protocol (EP) is an open solution for doing reinforcement learning fine-tuning on existing agents — across any language, container, or framework.**
|
|
122
122
|
|
|
123
|
-
|
|
123
|
+

|
|
124
124
|
|
|
125
|
-
|
|
125
|
+
Most teams already have complex agents running in production — often across remote services with heavy dependencies, Docker containers, or TypeScript backends deployed on Vercel. When they try to train or fine-tune these agents with reinforcement learning, connecting them to a trainer quickly becomes painful.
|
|
126
126
|
|
|
127
|
-
|
|
128
|
-
- **Robust rollouts**: Handles flaky LLM APIs and parallel execution
|
|
129
|
-
- **Integrations**: Works with Langfuse, LangSmith, Braintrust, Responses API
|
|
130
|
-
- **Agent support**: LangGraph and Pydantic AI
|
|
131
|
-
- **MCP RL envs**: Build reinforcement learning environments with MCP
|
|
132
|
-
- **Built-in benchmarks**: AIME, tau-bench
|
|
133
|
-
- **LLM judge**: Stack-rank models using pairwise Arena-Hard-Auto
|
|
134
|
-
- **Local UI**: Pivot/table views for real-time analysis
|
|
127
|
+
Eval Protocol makes this possible in two ways:
|
|
135
128
|
|
|
136
|
-
|
|
129
|
+
1. **Expose your agent through a simple API**
|
|
130
|
+
Wrap your existing agent (Python, TypeScript, Docker, etc.) in a simple HTTP service using EP’s rollout interface. EP handles the rollout orchestration, metadata passing, and trace storage automatically.
|
|
131
|
+
2. **Connect with any trainer**
|
|
132
|
+
Once your agent speaks the EP standard, it can be fine-tuned or evaluated with any supported trainer — Fireworks RFT, TRL, Unsloth, or your own — with no environment rewrites.
|
|
137
133
|
|
|
138
|
-
|
|
134
|
+
The result: RL that works out-of-the-box for existing production agents.
|
|
139
135
|
|
|
140
|
-
|
|
141
|
-
pip install 'eval-protocol[langfuse]'
|
|
136
|
+
## Who This Is For
|
|
142
137
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
export GEMINI_API_KEY=...
|
|
138
|
+
- **Applied AI teams** adding RL to existing production agents.
|
|
139
|
+
- **Research engineers** experimenting with fine-tuning complex, multi-turn or tool-using agents.
|
|
140
|
+
- **MLOps teams** building reproducible, language-agnostic rollout pipelines.
|
|
147
141
|
|
|
148
|
-
|
|
149
|
-
export LANGFUSE_PUBLIC_KEY=...
|
|
150
|
-
export LANGFUSE_SECRET_KEY=...
|
|
151
|
-
export LANGFUSE_HOST=https://your-deployment.com # optional
|
|
152
|
-
```
|
|
142
|
+
## Quickstart
|
|
153
143
|
|
|
154
|
-
|
|
144
|
+
- See the Quickstart repository: [eval-protocol/quickstart](https://github.com/eval-protocol/quickstart/tree/main)
|
|
155
145
|
|
|
156
|
-
|
|
157
|
-
from datetime import datetime
|
|
158
|
-
import pytest
|
|
159
|
-
|
|
160
|
-
from eval_protocol import (
|
|
161
|
-
evaluation_test,
|
|
162
|
-
aha_judge,
|
|
163
|
-
EvaluationRow,
|
|
164
|
-
SingleTurnRolloutProcessor,
|
|
165
|
-
DynamicDataLoader,
|
|
166
|
-
create_langfuse_adapter,
|
|
167
|
-
)
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
def langfuse_data_generator() -> list[EvaluationRow]:
|
|
171
|
-
adapter = create_langfuse_adapter()
|
|
172
|
-
return adapter.get_evaluation_rows(
|
|
173
|
-
to_timestamp=datetime.utcnow(),
|
|
174
|
-
limit=20,
|
|
175
|
-
sample_size=5,
|
|
176
|
-
)
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
@pytest.mark.parametrize(
|
|
180
|
-
"completion_params",
|
|
181
|
-
[
|
|
182
|
-
{"model": "openai/gpt-4.1"},
|
|
183
|
-
{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"},
|
|
184
|
-
],
|
|
185
|
-
)
|
|
186
|
-
@evaluation_test(
|
|
187
|
-
data_loaders=DynamicDataLoader(generators=[langfuse_data_generator]),
|
|
188
|
-
rollout_processor=SingleTurnRolloutProcessor(),
|
|
189
|
-
)
|
|
190
|
-
async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
|
|
191
|
-
return await aha_judge(row)
|
|
192
|
-
```
|
|
193
|
-
|
|
194
|
-
Run it:
|
|
195
|
-
|
|
196
|
-
```bash
|
|
197
|
-
pytest -q -s
|
|
198
|
-
```
|
|
199
|
-
|
|
200
|
-
The pytest output includes local links for a leaderboard and row-level traces (pivot/table) at `http://localhost:8000`.
|
|
201
|
-
|
|
202
|
-
## Installation
|
|
203
|
-
|
|
204
|
-
This library requires Python >= 3.10.
|
|
205
|
-
|
|
206
|
-
### pip
|
|
207
|
-
|
|
208
|
-
```bash
|
|
209
|
-
pip install eval-protocol
|
|
210
|
-
```
|
|
211
|
-
|
|
212
|
-
### uv (recommended)
|
|
213
|
-
|
|
214
|
-
```bash
|
|
215
|
-
# Install uv (if needed)
|
|
216
|
-
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
217
|
-
|
|
218
|
-
# Add to your project
|
|
219
|
-
uv add eval-protocol
|
|
220
|
-
```
|
|
221
|
-
|
|
222
|
-
## 📚 Resources
|
|
146
|
+
## Resources
|
|
223
147
|
|
|
224
148
|
- **[Documentation](https://evalprotocol.io)** – Guides and API reference
|
|
225
149
|
- **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Eval Protocol
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/eval-protocol/)
|
|
4
|
+
[](https://deepwiki.com/eval-protocol/python-sdk)
|
|
5
|
+
|
|
6
|
+
**Eval Protocol (EP) is an open solution for doing reinforcement learning fine-tuning on existing agents — across any language, container, or framework.**
|
|
7
|
+
|
|
8
|
+

|
|
9
|
+
|
|
10
|
+
Most teams already have complex agents running in production — often across remote services with heavy dependencies, Docker containers, or TypeScript backends deployed on Vercel. When they try to train or fine-tune these agents with reinforcement learning, connecting them to a trainer quickly becomes painful.
|
|
11
|
+
|
|
12
|
+
Eval Protocol makes this possible in two ways:
|
|
13
|
+
|
|
14
|
+
1. **Expose your agent through a simple API**
|
|
15
|
+
Wrap your existing agent (Python, TypeScript, Docker, etc.) in a simple HTTP service using EP’s rollout interface. EP handles the rollout orchestration, metadata passing, and trace storage automatically.
|
|
16
|
+
2. **Connect with any trainer**
|
|
17
|
+
Once your agent speaks the EP standard, it can be fine-tuned or evaluated with any supported trainer — Fireworks RFT, TRL, Unsloth, or your own — with no environment rewrites.
|
|
18
|
+
|
|
19
|
+
The result: RL that works out-of-the-box for existing production agents.
|
|
20
|
+
|
|
21
|
+
## Who This Is For
|
|
22
|
+
|
|
23
|
+
- **Applied AI teams** adding RL to existing production agents.
|
|
24
|
+
- **Research engineers** experimenting with fine-tuning complex, multi-turn or tool-using agents.
|
|
25
|
+
- **MLOps teams** building reproducible, language-agnostic rollout pipelines.
|
|
26
|
+
|
|
27
|
+
## Quickstart
|
|
28
|
+
|
|
29
|
+
- See the Quickstart repository: [eval-protocol/quickstart](https://github.com/eval-protocol/quickstart/tree/main)
|
|
30
|
+
|
|
31
|
+
## Resources
|
|
32
|
+
|
|
33
|
+
- **[Documentation](https://evalprotocol.io)** – Guides and API reference
|
|
34
|
+
- **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
|
|
35
|
+
- **[GitHub](https://github.com/eval-protocol/python-sdk)** – Source and examples
|
|
36
|
+
|
|
37
|
+
## License
|
|
38
|
+
|
|
39
|
+
[MIT](LICENSE)
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-11-
|
|
11
|
+
"date": "2025-11-12T15:43:06-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "8ab1c920bb77880deb87f2320c6cf6ea8780458e",
|
|
15
|
+
"version": "0.2.87"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -371,13 +371,13 @@ def parse_args(args=None):
|
|
|
371
371
|
help="Create a Reinforcement Fine-tuning Job on Fireworks",
|
|
372
372
|
)
|
|
373
373
|
rft_parser.add_argument(
|
|
374
|
-
"--evaluator
|
|
375
|
-
help="Evaluator ID
|
|
374
|
+
"--evaluator",
|
|
375
|
+
help="Evaluator ID or fully-qualified resource (accounts/{acct}/evaluators/{id}); if omitted, derive from local tests",
|
|
376
376
|
)
|
|
377
377
|
# Dataset options
|
|
378
378
|
rft_parser.add_argument(
|
|
379
|
-
"--dataset
|
|
380
|
-
help="Use existing
|
|
379
|
+
"--dataset",
|
|
380
|
+
help="Use existing dataset (ID or resource 'accounts/{acct}/datasets/{id}') to skip local materialization",
|
|
381
381
|
)
|
|
382
382
|
rft_parser.add_argument(
|
|
383
383
|
"--dataset-jsonl",
|
|
@@ -400,6 +400,8 @@ def parse_args(args=None):
|
|
|
400
400
|
rft_parser.add_argument("--learning-rate", type=float, default=3e-5)
|
|
401
401
|
rft_parser.add_argument("--max-context-length", type=int, default=65536)
|
|
402
402
|
rft_parser.add_argument("--lora-rank", type=int, default=16)
|
|
403
|
+
rft_parser.add_argument("--gradient-accumulation-steps", type=int, help="Number of gradient accumulation steps")
|
|
404
|
+
rft_parser.add_argument("--learning-rate-warmup-steps", type=int, help="Number of LR warmup steps")
|
|
403
405
|
rft_parser.add_argument("--accelerator-count", type=int, default=1)
|
|
404
406
|
rft_parser.add_argument("--region", help="Fireworks region enum value")
|
|
405
407
|
rft_parser.add_argument("--display-name", help="RFT job display name")
|
|
@@ -407,14 +409,19 @@ def parse_args(args=None):
|
|
|
407
409
|
rft_parser.add_argument("--eval-auto-carveout", dest="eval_auto_carveout", action="store_true", default=True)
|
|
408
410
|
rft_parser.add_argument("--no-eval-auto-carveout", dest="eval_auto_carveout", action="store_false")
|
|
409
411
|
# Rollout chunking
|
|
410
|
-
rft_parser.add_argument("--chunk-size", type=int, default=
|
|
412
|
+
rft_parser.add_argument("--chunk-size", type=int, default=100, help="Data chunk size for rollout batching")
|
|
411
413
|
# Inference params
|
|
412
414
|
rft_parser.add_argument("--temperature", type=float)
|
|
413
415
|
rft_parser.add_argument("--top-p", type=float)
|
|
414
416
|
rft_parser.add_argument("--top-k", type=int)
|
|
415
|
-
rft_parser.add_argument("--max-tokens", type=int, default=32768)
|
|
416
|
-
rft_parser.add_argument("--
|
|
417
|
-
rft_parser.add_argument("--
|
|
417
|
+
rft_parser.add_argument("--max-output-tokens", type=int, default=32768)
|
|
418
|
+
rft_parser.add_argument("--response-candidates-count", type=int, default=8)
|
|
419
|
+
rft_parser.add_argument("--extra-body", help="JSON string for extra inference params")
|
|
420
|
+
# MCP server (optional)
|
|
421
|
+
rft_parser.add_argument(
|
|
422
|
+
"--mcp-server",
|
|
423
|
+
help="The MCP server resource name to use for the reinforcement fine-tuning job.",
|
|
424
|
+
)
|
|
418
425
|
# Wandb
|
|
419
426
|
rft_parser.add_argument("--wandb-enabled", action="store_true")
|
|
420
427
|
rft_parser.add_argument("--wandb-project")
|
|
@@ -422,7 +429,7 @@ def parse_args(args=None):
|
|
|
422
429
|
rft_parser.add_argument("--wandb-run-id")
|
|
423
430
|
rft_parser.add_argument("--wandb-api-key")
|
|
424
431
|
# Misc
|
|
425
|
-
rft_parser.add_argument("--
|
|
432
|
+
rft_parser.add_argument("--job-id", help="Specify an explicit RFT job id")
|
|
426
433
|
rft_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode")
|
|
427
434
|
rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending")
|
|
428
435
|
rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
|
|
@@ -447,6 +454,16 @@ def parse_args(args=None):
|
|
|
447
454
|
action="store_true",
|
|
448
455
|
help="Non-interactive: if multiple tests exist and no --entry, fails with guidance",
|
|
449
456
|
)
|
|
457
|
+
local_test_parser.add_argument(
|
|
458
|
+
"--docker-build-extra",
|
|
459
|
+
default="",
|
|
460
|
+
help="Extra flags to pass to 'docker build' (quoted string, e.g. \"--no-cache --pull --progress=plain\")",
|
|
461
|
+
)
|
|
462
|
+
local_test_parser.add_argument(
|
|
463
|
+
"--docker-run-extra",
|
|
464
|
+
default="",
|
|
465
|
+
help="Extra flags to pass to 'docker run' (quoted string, e.g. \"--env-file .env --memory=8g\")",
|
|
466
|
+
)
|
|
450
467
|
|
|
451
468
|
# Run command (for Hydra-based evaluations)
|
|
452
469
|
# This subparser intentionally defines no arguments itself.
|
|
@@ -344,7 +344,7 @@ def _poll_evaluator_status(
|
|
|
344
344
|
|
|
345
345
|
|
|
346
346
|
def create_rft_command(args) -> int:
|
|
347
|
-
evaluator_id: Optional[str] = getattr(args, "
|
|
347
|
+
evaluator_id: Optional[str] = getattr(args, "evaluator", None)
|
|
348
348
|
non_interactive: bool = bool(getattr(args, "yes", False))
|
|
349
349
|
dry_run: bool = bool(getattr(args, "dry_run", False))
|
|
350
350
|
force: bool = bool(getattr(args, "force", False))
|
|
@@ -373,11 +373,11 @@ def create_rft_command(args) -> int:
|
|
|
373
373
|
print("No evaluation tests found.")
|
|
374
374
|
print("\nHint: Make sure your tests use the @evaluation_test decorator.")
|
|
375
375
|
return 1
|
|
376
|
-
# Always interactive selection here
|
|
376
|
+
# Always interactive selection here
|
|
377
377
|
try:
|
|
378
378
|
selected_tests = _prompt_select(tests, non_interactive=non_interactive)
|
|
379
379
|
except Exception:
|
|
380
|
-
print("Error: Failed to open selector UI. Please pass --evaluator
|
|
380
|
+
print("Error: Failed to open selector UI. Please pass --evaluator or --entry explicitly.")
|
|
381
381
|
return 1
|
|
382
382
|
if not selected_tests:
|
|
383
383
|
print("No tests selected.")
|
|
@@ -385,7 +385,7 @@ def create_rft_command(args) -> int:
|
|
|
385
385
|
if len(selected_tests) != 1:
|
|
386
386
|
if non_interactive and len(selected_tests) > 1:
|
|
387
387
|
print("Error: Multiple evaluation tests found in --yes (non-interactive) mode.")
|
|
388
|
-
print(" Please pass --evaluator
|
|
388
|
+
print(" Please pass --evaluator or --entry to disambiguate.")
|
|
389
389
|
try:
|
|
390
390
|
# Offer candidate evaluator ids for convenience
|
|
391
391
|
tests = _discover_tests(project_root)
|
|
@@ -410,8 +410,13 @@ def create_rft_command(args) -> int:
|
|
|
410
410
|
selected_test_file_path, selected_test_func_name = _resolve_selected_test(
|
|
411
411
|
project_root, evaluator_id, selected_tests=selected_tests
|
|
412
412
|
)
|
|
413
|
-
# Resolve evaluator resource name to fully-qualified format required by API
|
|
414
|
-
|
|
413
|
+
# Resolve evaluator resource name to fully-qualified format required by API.
|
|
414
|
+
# Allow users to pass either short id or fully-qualified resource.
|
|
415
|
+
if evaluator_id and evaluator_id.startswith("accounts/"):
|
|
416
|
+
evaluator_resource_name = evaluator_id
|
|
417
|
+
evaluator_id = _extract_terminal_segment(evaluator_id)
|
|
418
|
+
else:
|
|
419
|
+
evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"
|
|
415
420
|
|
|
416
421
|
# Optional short-circuit: if evaluator already exists and not forcing, skip upload path
|
|
417
422
|
skip_upload = False
|
|
@@ -470,10 +475,10 @@ def create_rft_command(args) -> int:
|
|
|
470
475
|
# If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators
|
|
471
476
|
if selected_entry is None and len(tests) > 1:
|
|
472
477
|
print(
|
|
473
|
-
f"Error: Multiple evaluation tests found, and the selected
|
|
474
|
-
" Please re-run specifying the evaluator
|
|
478
|
+
f"Error: Multiple evaluation tests found, and the selected evaluator {evaluator_id} does not match any discovered test.\n"
|
|
479
|
+
" Please re-run specifying the evaluator.\n"
|
|
475
480
|
" Hints:\n"
|
|
476
|
-
" - eval-protocol create rft --evaluator
|
|
481
|
+
" - eval-protocol create rft --evaluator <existing-evaluator-id>\n"
|
|
477
482
|
)
|
|
478
483
|
return 1
|
|
479
484
|
|
|
@@ -523,10 +528,15 @@ def create_rft_command(args) -> int:
|
|
|
523
528
|
print(f"Warning: Failed to upload evaluator automatically: {e}")
|
|
524
529
|
|
|
525
530
|
# Determine dataset id and materialization path
|
|
526
|
-
dataset_id = getattr(args, "
|
|
531
|
+
dataset_id = getattr(args, "dataset", None)
|
|
527
532
|
dataset_jsonl = getattr(args, "dataset_jsonl", None)
|
|
528
533
|
dataset_display_name = getattr(args, "dataset_display_name", None)
|
|
529
534
|
dataset_builder = getattr(args, "dataset_builder", None) # accepted but unused in simplified flow
|
|
535
|
+
dataset_resource_override: Optional[str] = None
|
|
536
|
+
if isinstance(dataset_id, str) and dataset_id.startswith("accounts/"):
|
|
537
|
+
# Caller passed a fully-qualified dataset; capture it for body and keep only terminal id for printing
|
|
538
|
+
dataset_resource_override = dataset_id
|
|
539
|
+
dataset_id = _extract_terminal_segment(dataset_id)
|
|
530
540
|
|
|
531
541
|
if not dataset_id:
|
|
532
542
|
# Prefer explicit --dataset-jsonl, else attempt to extract from the selected test's data loader or input_dataset.
|
|
@@ -573,7 +583,7 @@ def create_rft_command(args) -> int:
|
|
|
573
583
|
print(f"Warning: dataset builder failed: {e}")
|
|
574
584
|
if not dataset_jsonl:
|
|
575
585
|
print(
|
|
576
|
-
"Error: Could not determine dataset. Provide --dataset
|
|
586
|
+
"Error: Could not determine dataset. Provide --dataset or --dataset-jsonl, or ensure a JSONL-based data loader or input_dataset is used in your single discovered test."
|
|
577
587
|
)
|
|
578
588
|
return 1
|
|
579
589
|
|
|
@@ -628,6 +638,8 @@ def create_rft_command(args) -> int:
|
|
|
628
638
|
("learningRate", "learning_rate"),
|
|
629
639
|
("maxContextLength", "max_context_length"),
|
|
630
640
|
("loraRank", "lora_rank"),
|
|
641
|
+
("gradientAccumulationSteps", "gradient_accumulation_steps"),
|
|
642
|
+
("learningRateWarmupSteps", "learning_rate_warmup_steps"),
|
|
631
643
|
("acceleratorCount", "accelerator_count"),
|
|
632
644
|
("region", "region"),
|
|
633
645
|
]:
|
|
@@ -640,14 +652,25 @@ def create_rft_command(args) -> int:
|
|
|
640
652
|
("temperature", "temperature"),
|
|
641
653
|
("topP", "top_p"),
|
|
642
654
|
("topK", "top_k"),
|
|
643
|
-
("maxTokens", "
|
|
644
|
-
("n", "
|
|
655
|
+
("maxTokens", "max_output_tokens"),
|
|
656
|
+
("n", "response_candidates_count"),
|
|
645
657
|
]:
|
|
646
658
|
val = getattr(args, arg_name, None)
|
|
647
659
|
if val is not None:
|
|
648
660
|
inference_params[key] = val
|
|
649
|
-
if getattr(args, "
|
|
650
|
-
|
|
661
|
+
if getattr(args, "extra_body", None):
|
|
662
|
+
extra = getattr(args, "extra_body")
|
|
663
|
+
if isinstance(extra, (dict, list)):
|
|
664
|
+
try:
|
|
665
|
+
inference_params["extraBody"] = json.dumps(extra, ensure_ascii=False)
|
|
666
|
+
except (TypeError, ValueError) as e:
|
|
667
|
+
print(f"Error: --extra-body dict/list must be JSON-serializable: {e}")
|
|
668
|
+
return 1
|
|
669
|
+
elif isinstance(extra, str):
|
|
670
|
+
inference_params["extraBody"] = extra
|
|
671
|
+
else:
|
|
672
|
+
print("Error: --extra-body must be a JSON string or a JSON-serializable dict/list.")
|
|
673
|
+
return 1
|
|
651
674
|
|
|
652
675
|
wandb_config: Optional[Dict[str, Any]] = None
|
|
653
676
|
if getattr(args, "wandb_enabled", False):
|
|
@@ -659,9 +682,12 @@ def create_rft_command(args) -> int:
|
|
|
659
682
|
"runId": getattr(args, "wandb_run_id", None),
|
|
660
683
|
}
|
|
661
684
|
|
|
685
|
+
# Build dataset resource (prefer override when provided)
|
|
686
|
+
dataset_resource = dataset_resource_override or f"accounts/{account_id}/datasets/{dataset_id}"
|
|
687
|
+
|
|
662
688
|
body: Dict[str, Any] = {
|
|
663
|
-
|
|
664
|
-
"dataset":
|
|
689
|
+
"displayName": getattr(args, "display_name", None),
|
|
690
|
+
"dataset": dataset_resource,
|
|
665
691
|
"evaluator": evaluator_resource_name,
|
|
666
692
|
"evalAutoCarveout": bool(getattr(args, "eval_auto_carveout", True)),
|
|
667
693
|
"trainingConfig": training_config,
|
|
@@ -670,7 +696,8 @@ def create_rft_command(args) -> int:
|
|
|
670
696
|
"chunkSize": getattr(args, "chunk_size", None),
|
|
671
697
|
"outputStats": None,
|
|
672
698
|
"outputMetrics": None,
|
|
673
|
-
"mcpServer": None,
|
|
699
|
+
"mcpServer": getattr(args, "mcp_server", None),
|
|
700
|
+
"jobId": getattr(args, "job_id", None),
|
|
674
701
|
}
|
|
675
702
|
# Debug: print minimal summary
|
|
676
703
|
print(f"Prepared RFT job for evaluator '{evaluator_id}' using dataset '{dataset_id}'")
|
|
@@ -2,6 +2,7 @@ import argparse
|
|
|
2
2
|
import os
|
|
3
3
|
import subprocess
|
|
4
4
|
import sys
|
|
5
|
+
import shlex
|
|
5
6
|
from typing import List
|
|
6
7
|
|
|
7
8
|
from .upload import _discover_tests, _prompt_select
|
|
@@ -24,16 +25,15 @@ def _run_pytest_host(pytest_target: str) -> int:
|
|
|
24
25
|
return proc.returncode
|
|
25
26
|
|
|
26
27
|
|
|
27
|
-
def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
|
|
28
|
+
def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List[str] | None = None) -> bool:
|
|
28
29
|
context_dir = os.path.dirname(dockerfile_path)
|
|
29
30
|
print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
|
|
30
31
|
try:
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
)
|
|
32
|
+
base_cmd = ["docker", "build"]
|
|
33
|
+
if build_extras:
|
|
34
|
+
base_cmd += build_extras
|
|
35
|
+
base_cmd += ["-t", image_tag, "-f", dockerfile_path, context_dir]
|
|
36
|
+
proc = subprocess.run(base_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
|
|
37
37
|
print(proc.stdout)
|
|
38
38
|
return proc.returncode == 0
|
|
39
39
|
except FileNotFoundError:
|
|
@@ -41,7 +41,9 @@ def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
|
|
|
41
41
|
return False
|
|
42
42
|
|
|
43
43
|
|
|
44
|
-
def _run_pytest_in_docker(
|
|
44
|
+
def _run_pytest_in_docker(
|
|
45
|
+
project_root: str, image_tag: str, pytest_target: str, run_extras: List[str] | None = None
|
|
46
|
+
) -> int:
|
|
45
47
|
workdir = "/workspace"
|
|
46
48
|
# Host HOME logs directory to map into container
|
|
47
49
|
host_home = os.path.expanduser("~")
|
|
@@ -73,6 +75,8 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str)
|
|
|
73
75
|
cmd += ["--user", f"{uid}:{gid}"]
|
|
74
76
|
except Exception:
|
|
75
77
|
pass
|
|
78
|
+
if run_extras:
|
|
79
|
+
cmd += run_extras
|
|
76
80
|
cmd += [image_tag, "pytest", pytest_target, "-vs"]
|
|
77
81
|
print("Running in Docker:", " ".join(cmd))
|
|
78
82
|
try:
|
|
@@ -91,11 +95,16 @@ def local_test_command(args: argparse.Namespace) -> int:
|
|
|
91
95
|
entry = getattr(args, "entry", None)
|
|
92
96
|
if entry:
|
|
93
97
|
if "::" in entry:
|
|
94
|
-
file_part = entry.split("::", 1)
|
|
98
|
+
file_part, func_part = entry.split("::", 1)
|
|
95
99
|
file_path = (
|
|
96
100
|
file_part if os.path.isabs(file_part) else os.path.abspath(os.path.join(project_root, file_part))
|
|
97
101
|
)
|
|
98
|
-
|
|
102
|
+
# Convert to project-relative like the non-:: path
|
|
103
|
+
try:
|
|
104
|
+
rel = os.path.relpath(file_path, project_root)
|
|
105
|
+
except Exception:
|
|
106
|
+
rel = file_path
|
|
107
|
+
pytest_target = f"{rel}::{func_part}"
|
|
99
108
|
else:
|
|
100
109
|
file_path = entry if os.path.isabs(entry) else os.path.abspath(os.path.join(project_root, entry))
|
|
101
110
|
# Use path relative to project_root when possible
|
|
@@ -126,6 +135,10 @@ def local_test_command(args: argparse.Namespace) -> int:
|
|
|
126
135
|
pytest_target = rel
|
|
127
136
|
|
|
128
137
|
ignore_docker = bool(getattr(args, "ignore_docker", False))
|
|
138
|
+
build_extras_str = getattr(args, "docker_build_extra", "") or ""
|
|
139
|
+
run_extras_str = getattr(args, "docker_run_extra", "") or ""
|
|
140
|
+
build_extras = shlex.split(build_extras_str) if build_extras_str else []
|
|
141
|
+
run_extras = shlex.split(run_extras_str) if run_extras_str else []
|
|
129
142
|
if ignore_docker:
|
|
130
143
|
if not pytest_target:
|
|
131
144
|
print("Error: Failed to resolve a pytest target to run.")
|
|
@@ -146,14 +159,14 @@ def local_test_command(args: argparse.Namespace) -> int:
|
|
|
146
159
|
except Exception:
|
|
147
160
|
pass
|
|
148
161
|
image_tag = "ep-evaluator:local"
|
|
149
|
-
ok = _build_docker_image(dockerfiles[0], image_tag)
|
|
162
|
+
ok = _build_docker_image(dockerfiles[0], image_tag, build_extras=build_extras)
|
|
150
163
|
if not ok:
|
|
151
164
|
print("Docker build failed. See logs above.")
|
|
152
165
|
return 1
|
|
153
166
|
if not pytest_target:
|
|
154
167
|
print("Error: Failed to resolve a pytest target to run.")
|
|
155
168
|
return 1
|
|
156
|
-
return _run_pytest_in_docker(project_root, image_tag, pytest_target)
|
|
169
|
+
return _run_pytest_in_docker(project_root, image_tag, pytest_target, run_extras=run_extras)
|
|
157
170
|
|
|
158
171
|
# No Dockerfile: run on host
|
|
159
172
|
if not pytest_target:
|
|
@@ -8,6 +8,7 @@ import time
|
|
|
8
8
|
import uuid
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import Any, Callable, Dict, Iterable, Optional, Tuple
|
|
11
|
+
from urllib.parse import urlencode
|
|
11
12
|
|
|
12
13
|
import requests
|
|
13
14
|
|
|
@@ -186,6 +187,14 @@ def create_reinforcement_fine_tuning_job(
|
|
|
186
187
|
body: Dict[str, Any],
|
|
187
188
|
) -> Dict[str, Any]:
|
|
188
189
|
url = f"{api_base.rstrip('/')}/v1/accounts/{account_id}/reinforcementFineTuningJobs"
|
|
190
|
+
# Move optional jobId from body to query parameter if provided
|
|
191
|
+
job_id = body.get("jobId")
|
|
192
|
+
if isinstance(job_id, str):
|
|
193
|
+
job_id = job_id.strip()
|
|
194
|
+
if job_id:
|
|
195
|
+
# Remove from body and append as query param
|
|
196
|
+
body.pop("jobId", None)
|
|
197
|
+
url = f"{url}?{urlencode({'reinforcementFineTuningJobId': job_id})}"
|
|
189
198
|
headers = {
|
|
190
199
|
"Authorization": f"Bearer {api_key}",
|
|
191
200
|
"Content-Type": "application/json",
|