eval-protocol 0.2.86__tar.gz → 0.2.87__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.86/eval_protocol.egg-info → eval_protocol-0.2.87}/PKG-INFO +18 -92
- eval_protocol-0.2.87/README.md +39 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/cli.py +15 -8
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/cli_commands/create_rft.py +45 -18
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/fireworks_rft.py +9 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87/eval_protocol.egg-info}/PKG-INFO +18 -92
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_cli_create_rft_infer.py +346 -18
- eval_protocol-0.2.86/README.md +0 -113
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/LICENSE +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/development/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/cli_commands/local_test.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol.egg-info/SOURCES.txt +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/pyproject.toml +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/setup.cfg +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/setup.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_cli_local_test.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_config.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_format.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_length.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_math.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_models.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_server.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/versioneer.py +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.86 → eval_protocol-0.2.87}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.87
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -113,111 +113,37 @@ Requires-Dist: langfuse>=2.0.0; extra == "proxy"
|
|
|
113
113
|
Requires-Dist: uuid6>=2025.0.0; extra == "proxy"
|
|
114
114
|
Dynamic: license-file
|
|
115
115
|
|
|
116
|
-
# Eval Protocol
|
|
116
|
+
# Eval Protocol
|
|
117
117
|
|
|
118
118
|
[](https://pypi.org/project/eval-protocol/)
|
|
119
119
|
[](https://deepwiki.com/eval-protocol/python-sdk)
|
|
120
120
|
|
|
121
|
-
**
|
|
121
|
+
**Eval Protocol (EP) is an open solution for doing reinforcement learning fine-tuning on existing agents — across any language, container, or framework.**
|
|
122
122
|
|
|
123
|
-
|
|
123
|
+

|
|
124
124
|
|
|
125
|
-
-
|
|
126
|
-
- **Robust rollouts**: Handles flaky LLM APIs and parallel execution
|
|
127
|
-
- **Integrations**: Works with Langfuse, LangSmith, Braintrust, Responses API
|
|
128
|
-
- **Agent support**: LangGraph and Pydantic AI
|
|
129
|
-
- **MCP RL envs**: Build reinforcement learning environments with MCP
|
|
130
|
-
- **Built-in benchmarks**: AIME, tau-bench
|
|
131
|
-
- **LLM judge**: Stack-rank models using pairwise Arena-Hard-Auto
|
|
132
|
-
- **Local UI**: Pivot/table views for real-time analysis
|
|
125
|
+
Most teams already have complex agents running in production — often across remote services with heavy dependencies, Docker containers, or TypeScript backends deployed on Vercel. When they try to train or fine-tune these agents with reinforcement learning, connecting them to a trainer quickly becomes painful.
|
|
133
126
|
|
|
134
|
-
|
|
127
|
+
Eval Protocol makes this possible in two ways:
|
|
135
128
|
|
|
136
|
-
|
|
129
|
+
1. **Expose your agent through a simple API**
|
|
130
|
+
Wrap your existing agent (Python, TypeScript, Docker, etc.) in a simple HTTP service using EP’s rollout interface. EP handles the rollout orchestration, metadata passing, and trace storage automatically.
|
|
131
|
+
2. **Connect with any trainer**
|
|
132
|
+
Once your agent speaks the EP standard, it can be fine-tuned or evaluated with any supported trainer — Fireworks RFT, TRL, Unsloth, or your own — with no environment rewrites.
|
|
137
133
|
|
|
138
|
-
|
|
139
|
-
pip install 'eval-protocol[langfuse]'
|
|
134
|
+
The result: RL that works out-of-the-box for existing production agents.
|
|
140
135
|
|
|
141
|
-
|
|
142
|
-
export OPENAI_API_KEY=...
|
|
143
|
-
export FIREWORKS_API_KEY=...
|
|
144
|
-
export GEMINI_API_KEY=...
|
|
136
|
+
## Who This Is For
|
|
145
137
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
export LANGFUSE_HOST=https://your-deployment.com # optional
|
|
150
|
-
```
|
|
138
|
+
- **Applied AI teams** adding RL to existing production agents.
|
|
139
|
+
- **Research engineers** experimenting with fine-tuning complex, multi-turn or tool-using agents.
|
|
140
|
+
- **MLOps teams** building reproducible, language-agnostic rollout pipelines.
|
|
151
141
|
|
|
152
|
-
|
|
142
|
+
## Quickstart
|
|
153
143
|
|
|
154
|
-
|
|
155
|
-
from datetime import datetime
|
|
156
|
-
import pytest
|
|
144
|
+
- See the Quickstart repository: [eval-protocol/quickstart](https://github.com/eval-protocol/quickstart/tree/main)
|
|
157
145
|
|
|
158
|
-
|
|
159
|
-
evaluation_test,
|
|
160
|
-
aha_judge,
|
|
161
|
-
EvaluationRow,
|
|
162
|
-
SingleTurnRolloutProcessor,
|
|
163
|
-
DynamicDataLoader,
|
|
164
|
-
create_langfuse_adapter,
|
|
165
|
-
)
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
def langfuse_data_generator() -> list[EvaluationRow]:
|
|
169
|
-
adapter = create_langfuse_adapter()
|
|
170
|
-
return adapter.get_evaluation_rows(
|
|
171
|
-
to_timestamp=datetime.utcnow(),
|
|
172
|
-
limit=20,
|
|
173
|
-
sample_size=5,
|
|
174
|
-
)
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
@pytest.mark.parametrize(
|
|
178
|
-
"completion_params",
|
|
179
|
-
[
|
|
180
|
-
{"model": "openai/gpt-4.1"},
|
|
181
|
-
{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"},
|
|
182
|
-
],
|
|
183
|
-
)
|
|
184
|
-
@evaluation_test(
|
|
185
|
-
data_loaders=DynamicDataLoader(generators=[langfuse_data_generator]),
|
|
186
|
-
rollout_processor=SingleTurnRolloutProcessor(),
|
|
187
|
-
)
|
|
188
|
-
async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
|
|
189
|
-
return await aha_judge(row)
|
|
190
|
-
```
|
|
191
|
-
|
|
192
|
-
Run it:
|
|
193
|
-
|
|
194
|
-
```bash
|
|
195
|
-
pytest -q -s
|
|
196
|
-
```
|
|
197
|
-
|
|
198
|
-
The pytest output includes local links for a leaderboard and row-level traces (pivot/table) at `http://localhost:8000`.
|
|
199
|
-
|
|
200
|
-
## Installation
|
|
201
|
-
|
|
202
|
-
This library requires Python >= 3.10.
|
|
203
|
-
|
|
204
|
-
### pip
|
|
205
|
-
|
|
206
|
-
```bash
|
|
207
|
-
pip install eval-protocol
|
|
208
|
-
```
|
|
209
|
-
|
|
210
|
-
### uv (recommended)
|
|
211
|
-
|
|
212
|
-
```bash
|
|
213
|
-
# Install uv (if needed)
|
|
214
|
-
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
215
|
-
|
|
216
|
-
# Add to your project
|
|
217
|
-
uv add eval-protocol
|
|
218
|
-
```
|
|
219
|
-
|
|
220
|
-
## 📚 Resources
|
|
146
|
+
## Resources
|
|
221
147
|
|
|
222
148
|
- **[Documentation](https://evalprotocol.io)** – Guides and API reference
|
|
223
149
|
- **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Eval Protocol
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/eval-protocol/)
|
|
4
|
+
[](https://deepwiki.com/eval-protocol/python-sdk)
|
|
5
|
+
|
|
6
|
+
**Eval Protocol (EP) is an open solution for doing reinforcement learning fine-tuning on existing agents — across any language, container, or framework.**
|
|
7
|
+
|
|
8
|
+

|
|
9
|
+
|
|
10
|
+
Most teams already have complex agents running in production — often across remote services with heavy dependencies, Docker containers, or TypeScript backends deployed on Vercel. When they try to train or fine-tune these agents with reinforcement learning, connecting them to a trainer quickly becomes painful.
|
|
11
|
+
|
|
12
|
+
Eval Protocol makes this possible in two ways:
|
|
13
|
+
|
|
14
|
+
1. **Expose your agent through a simple API**
|
|
15
|
+
Wrap your existing agent (Python, TypeScript, Docker, etc.) in a simple HTTP service using EP’s rollout interface. EP handles the rollout orchestration, metadata passing, and trace storage automatically.
|
|
16
|
+
2. **Connect with any trainer**
|
|
17
|
+
Once your agent speaks the EP standard, it can be fine-tuned or evaluated with any supported trainer — Fireworks RFT, TRL, Unsloth, or your own — with no environment rewrites.
|
|
18
|
+
|
|
19
|
+
The result: RL that works out-of-the-box for existing production agents.
|
|
20
|
+
|
|
21
|
+
## Who This Is For
|
|
22
|
+
|
|
23
|
+
- **Applied AI teams** adding RL to existing production agents.
|
|
24
|
+
- **Research engineers** experimenting with fine-tuning complex, multi-turn or tool-using agents.
|
|
25
|
+
- **MLOps teams** building reproducible, language-agnostic rollout pipelines.
|
|
26
|
+
|
|
27
|
+
## Quickstart
|
|
28
|
+
|
|
29
|
+
- See the Quickstart repository: [eval-protocol/quickstart](https://github.com/eval-protocol/quickstart/tree/main)
|
|
30
|
+
|
|
31
|
+
## Resources
|
|
32
|
+
|
|
33
|
+
- **[Documentation](https://evalprotocol.io)** – Guides and API reference
|
|
34
|
+
- **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
|
|
35
|
+
- **[GitHub](https://github.com/eval-protocol/python-sdk)** – Source and examples
|
|
36
|
+
|
|
37
|
+
## License
|
|
38
|
+
|
|
39
|
+
[MIT](LICENSE)
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-11-
|
|
11
|
+
"date": "2025-11-12T15:43:06-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "8ab1c920bb77880deb87f2320c6cf6ea8780458e",
|
|
15
|
+
"version": "0.2.87"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -371,13 +371,13 @@ def parse_args(args=None):
|
|
|
371
371
|
help="Create a Reinforcement Fine-tuning Job on Fireworks",
|
|
372
372
|
)
|
|
373
373
|
rft_parser.add_argument(
|
|
374
|
-
"--evaluator
|
|
375
|
-
help="Evaluator ID
|
|
374
|
+
"--evaluator",
|
|
375
|
+
help="Evaluator ID or fully-qualified resource (accounts/{acct}/evaluators/{id}); if omitted, derive from local tests",
|
|
376
376
|
)
|
|
377
377
|
# Dataset options
|
|
378
378
|
rft_parser.add_argument(
|
|
379
|
-
"--dataset
|
|
380
|
-
help="Use existing
|
|
379
|
+
"--dataset",
|
|
380
|
+
help="Use existing dataset (ID or resource 'accounts/{acct}/datasets/{id}') to skip local materialization",
|
|
381
381
|
)
|
|
382
382
|
rft_parser.add_argument(
|
|
383
383
|
"--dataset-jsonl",
|
|
@@ -400,6 +400,8 @@ def parse_args(args=None):
|
|
|
400
400
|
rft_parser.add_argument("--learning-rate", type=float, default=3e-5)
|
|
401
401
|
rft_parser.add_argument("--max-context-length", type=int, default=65536)
|
|
402
402
|
rft_parser.add_argument("--lora-rank", type=int, default=16)
|
|
403
|
+
rft_parser.add_argument("--gradient-accumulation-steps", type=int, help="Number of gradient accumulation steps")
|
|
404
|
+
rft_parser.add_argument("--learning-rate-warmup-steps", type=int, help="Number of LR warmup steps")
|
|
403
405
|
rft_parser.add_argument("--accelerator-count", type=int, default=1)
|
|
404
406
|
rft_parser.add_argument("--region", help="Fireworks region enum value")
|
|
405
407
|
rft_parser.add_argument("--display-name", help="RFT job display name")
|
|
@@ -412,9 +414,14 @@ def parse_args(args=None):
|
|
|
412
414
|
rft_parser.add_argument("--temperature", type=float)
|
|
413
415
|
rft_parser.add_argument("--top-p", type=float)
|
|
414
416
|
rft_parser.add_argument("--top-k", type=int)
|
|
415
|
-
rft_parser.add_argument("--max-tokens", type=int, default=32768)
|
|
416
|
-
rft_parser.add_argument("--
|
|
417
|
-
rft_parser.add_argument("--
|
|
417
|
+
rft_parser.add_argument("--max-output-tokens", type=int, default=32768)
|
|
418
|
+
rft_parser.add_argument("--response-candidates-count", type=int, default=8)
|
|
419
|
+
rft_parser.add_argument("--extra-body", help="JSON string for extra inference params")
|
|
420
|
+
# MCP server (optional)
|
|
421
|
+
rft_parser.add_argument(
|
|
422
|
+
"--mcp-server",
|
|
423
|
+
help="The MCP server resource name to use for the reinforcement fine-tuning job.",
|
|
424
|
+
)
|
|
418
425
|
# Wandb
|
|
419
426
|
rft_parser.add_argument("--wandb-enabled", action="store_true")
|
|
420
427
|
rft_parser.add_argument("--wandb-project")
|
|
@@ -422,7 +429,7 @@ def parse_args(args=None):
|
|
|
422
429
|
rft_parser.add_argument("--wandb-run-id")
|
|
423
430
|
rft_parser.add_argument("--wandb-api-key")
|
|
424
431
|
# Misc
|
|
425
|
-
rft_parser.add_argument("--
|
|
432
|
+
rft_parser.add_argument("--job-id", help="Specify an explicit RFT job id")
|
|
426
433
|
rft_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode")
|
|
427
434
|
rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending")
|
|
428
435
|
rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
|
|
@@ -344,7 +344,7 @@ def _poll_evaluator_status(
|
|
|
344
344
|
|
|
345
345
|
|
|
346
346
|
def create_rft_command(args) -> int:
|
|
347
|
-
evaluator_id: Optional[str] = getattr(args, "
|
|
347
|
+
evaluator_id: Optional[str] = getattr(args, "evaluator", None)
|
|
348
348
|
non_interactive: bool = bool(getattr(args, "yes", False))
|
|
349
349
|
dry_run: bool = bool(getattr(args, "dry_run", False))
|
|
350
350
|
force: bool = bool(getattr(args, "force", False))
|
|
@@ -373,11 +373,11 @@ def create_rft_command(args) -> int:
|
|
|
373
373
|
print("No evaluation tests found.")
|
|
374
374
|
print("\nHint: Make sure your tests use the @evaluation_test decorator.")
|
|
375
375
|
return 1
|
|
376
|
-
# Always interactive selection here
|
|
376
|
+
# Always interactive selection here
|
|
377
377
|
try:
|
|
378
378
|
selected_tests = _prompt_select(tests, non_interactive=non_interactive)
|
|
379
379
|
except Exception:
|
|
380
|
-
print("Error: Failed to open selector UI. Please pass --evaluator
|
|
380
|
+
print("Error: Failed to open selector UI. Please pass --evaluator or --entry explicitly.")
|
|
381
381
|
return 1
|
|
382
382
|
if not selected_tests:
|
|
383
383
|
print("No tests selected.")
|
|
@@ -385,7 +385,7 @@ def create_rft_command(args) -> int:
|
|
|
385
385
|
if len(selected_tests) != 1:
|
|
386
386
|
if non_interactive and len(selected_tests) > 1:
|
|
387
387
|
print("Error: Multiple evaluation tests found in --yes (non-interactive) mode.")
|
|
388
|
-
print(" Please pass --evaluator
|
|
388
|
+
print(" Please pass --evaluator or --entry to disambiguate.")
|
|
389
389
|
try:
|
|
390
390
|
# Offer candidate evaluator ids for convenience
|
|
391
391
|
tests = _discover_tests(project_root)
|
|
@@ -410,8 +410,13 @@ def create_rft_command(args) -> int:
|
|
|
410
410
|
selected_test_file_path, selected_test_func_name = _resolve_selected_test(
|
|
411
411
|
project_root, evaluator_id, selected_tests=selected_tests
|
|
412
412
|
)
|
|
413
|
-
# Resolve evaluator resource name to fully-qualified format required by API
|
|
414
|
-
|
|
413
|
+
# Resolve evaluator resource name to fully-qualified format required by API.
|
|
414
|
+
# Allow users to pass either short id or fully-qualified resource.
|
|
415
|
+
if evaluator_id and evaluator_id.startswith("accounts/"):
|
|
416
|
+
evaluator_resource_name = evaluator_id
|
|
417
|
+
evaluator_id = _extract_terminal_segment(evaluator_id)
|
|
418
|
+
else:
|
|
419
|
+
evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"
|
|
415
420
|
|
|
416
421
|
# Optional short-circuit: if evaluator already exists and not forcing, skip upload path
|
|
417
422
|
skip_upload = False
|
|
@@ -470,10 +475,10 @@ def create_rft_command(args) -> int:
|
|
|
470
475
|
# If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators
|
|
471
476
|
if selected_entry is None and len(tests) > 1:
|
|
472
477
|
print(
|
|
473
|
-
f"Error: Multiple evaluation tests found, and the selected
|
|
474
|
-
" Please re-run specifying the evaluator
|
|
478
|
+
f"Error: Multiple evaluation tests found, and the selected evaluator {evaluator_id} does not match any discovered test.\n"
|
|
479
|
+
" Please re-run specifying the evaluator.\n"
|
|
475
480
|
" Hints:\n"
|
|
476
|
-
" - eval-protocol create rft --evaluator
|
|
481
|
+
" - eval-protocol create rft --evaluator <existing-evaluator-id>\n"
|
|
477
482
|
)
|
|
478
483
|
return 1
|
|
479
484
|
|
|
@@ -523,10 +528,15 @@ def create_rft_command(args) -> int:
|
|
|
523
528
|
print(f"Warning: Failed to upload evaluator automatically: {e}")
|
|
524
529
|
|
|
525
530
|
# Determine dataset id and materialization path
|
|
526
|
-
dataset_id = getattr(args, "
|
|
531
|
+
dataset_id = getattr(args, "dataset", None)
|
|
527
532
|
dataset_jsonl = getattr(args, "dataset_jsonl", None)
|
|
528
533
|
dataset_display_name = getattr(args, "dataset_display_name", None)
|
|
529
534
|
dataset_builder = getattr(args, "dataset_builder", None) # accepted but unused in simplified flow
|
|
535
|
+
dataset_resource_override: Optional[str] = None
|
|
536
|
+
if isinstance(dataset_id, str) and dataset_id.startswith("accounts/"):
|
|
537
|
+
# Caller passed a fully-qualified dataset; capture it for body and keep only terminal id for printing
|
|
538
|
+
dataset_resource_override = dataset_id
|
|
539
|
+
dataset_id = _extract_terminal_segment(dataset_id)
|
|
530
540
|
|
|
531
541
|
if not dataset_id:
|
|
532
542
|
# Prefer explicit --dataset-jsonl, else attempt to extract from the selected test's data loader or input_dataset.
|
|
@@ -573,7 +583,7 @@ def create_rft_command(args) -> int:
|
|
|
573
583
|
print(f"Warning: dataset builder failed: {e}")
|
|
574
584
|
if not dataset_jsonl:
|
|
575
585
|
print(
|
|
576
|
-
"Error: Could not determine dataset. Provide --dataset
|
|
586
|
+
"Error: Could not determine dataset. Provide --dataset or --dataset-jsonl, or ensure a JSONL-based data loader or input_dataset is used in your single discovered test."
|
|
577
587
|
)
|
|
578
588
|
return 1
|
|
579
589
|
|
|
@@ -628,6 +638,8 @@ def create_rft_command(args) -> int:
|
|
|
628
638
|
("learningRate", "learning_rate"),
|
|
629
639
|
("maxContextLength", "max_context_length"),
|
|
630
640
|
("loraRank", "lora_rank"),
|
|
641
|
+
("gradientAccumulationSteps", "gradient_accumulation_steps"),
|
|
642
|
+
("learningRateWarmupSteps", "learning_rate_warmup_steps"),
|
|
631
643
|
("acceleratorCount", "accelerator_count"),
|
|
632
644
|
("region", "region"),
|
|
633
645
|
]:
|
|
@@ -640,14 +652,25 @@ def create_rft_command(args) -> int:
|
|
|
640
652
|
("temperature", "temperature"),
|
|
641
653
|
("topP", "top_p"),
|
|
642
654
|
("topK", "top_k"),
|
|
643
|
-
("maxTokens", "
|
|
644
|
-
("n", "
|
|
655
|
+
("maxTokens", "max_output_tokens"),
|
|
656
|
+
("n", "response_candidates_count"),
|
|
645
657
|
]:
|
|
646
658
|
val = getattr(args, arg_name, None)
|
|
647
659
|
if val is not None:
|
|
648
660
|
inference_params[key] = val
|
|
649
|
-
if getattr(args, "
|
|
650
|
-
|
|
661
|
+
if getattr(args, "extra_body", None):
|
|
662
|
+
extra = getattr(args, "extra_body")
|
|
663
|
+
if isinstance(extra, (dict, list)):
|
|
664
|
+
try:
|
|
665
|
+
inference_params["extraBody"] = json.dumps(extra, ensure_ascii=False)
|
|
666
|
+
except (TypeError, ValueError) as e:
|
|
667
|
+
print(f"Error: --extra-body dict/list must be JSON-serializable: {e}")
|
|
668
|
+
return 1
|
|
669
|
+
elif isinstance(extra, str):
|
|
670
|
+
inference_params["extraBody"] = extra
|
|
671
|
+
else:
|
|
672
|
+
print("Error: --extra-body must be a JSON string or a JSON-serializable dict/list.")
|
|
673
|
+
return 1
|
|
651
674
|
|
|
652
675
|
wandb_config: Optional[Dict[str, Any]] = None
|
|
653
676
|
if getattr(args, "wandb_enabled", False):
|
|
@@ -659,9 +682,12 @@ def create_rft_command(args) -> int:
|
|
|
659
682
|
"runId": getattr(args, "wandb_run_id", None),
|
|
660
683
|
}
|
|
661
684
|
|
|
685
|
+
# Build dataset resource (prefer override when provided)
|
|
686
|
+
dataset_resource = dataset_resource_override or f"accounts/{account_id}/datasets/{dataset_id}"
|
|
687
|
+
|
|
662
688
|
body: Dict[str, Any] = {
|
|
663
|
-
|
|
664
|
-
"dataset":
|
|
689
|
+
"displayName": getattr(args, "display_name", None),
|
|
690
|
+
"dataset": dataset_resource,
|
|
665
691
|
"evaluator": evaluator_resource_name,
|
|
666
692
|
"evalAutoCarveout": bool(getattr(args, "eval_auto_carveout", True)),
|
|
667
693
|
"trainingConfig": training_config,
|
|
@@ -670,7 +696,8 @@ def create_rft_command(args) -> int:
|
|
|
670
696
|
"chunkSize": getattr(args, "chunk_size", None),
|
|
671
697
|
"outputStats": None,
|
|
672
698
|
"outputMetrics": None,
|
|
673
|
-
"mcpServer": None,
|
|
699
|
+
"mcpServer": getattr(args, "mcp_server", None),
|
|
700
|
+
"jobId": getattr(args, "job_id", None),
|
|
674
701
|
}
|
|
675
702
|
# Debug: print minimal summary
|
|
676
703
|
print(f"Prepared RFT job for evaluator '{evaluator_id}' using dataset '{dataset_id}'")
|
|
@@ -8,6 +8,7 @@ import time
|
|
|
8
8
|
import uuid
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import Any, Callable, Dict, Iterable, Optional, Tuple
|
|
11
|
+
from urllib.parse import urlencode
|
|
11
12
|
|
|
12
13
|
import requests
|
|
13
14
|
|
|
@@ -186,6 +187,14 @@ def create_reinforcement_fine_tuning_job(
|
|
|
186
187
|
body: Dict[str, Any],
|
|
187
188
|
) -> Dict[str, Any]:
|
|
188
189
|
url = f"{api_base.rstrip('/')}/v1/accounts/{account_id}/reinforcementFineTuningJobs"
|
|
190
|
+
# Move optional jobId from body to query parameter if provided
|
|
191
|
+
job_id = body.get("jobId")
|
|
192
|
+
if isinstance(job_id, str):
|
|
193
|
+
job_id = job_id.strip()
|
|
194
|
+
if job_id:
|
|
195
|
+
# Remove from body and append as query param
|
|
196
|
+
body.pop("jobId", None)
|
|
197
|
+
url = f"{url}?{urlencode({'reinforcementFineTuningJobId': job_id})}"
|
|
189
198
|
headers = {
|
|
190
199
|
"Authorization": f"Bearer {api_key}",
|
|
191
200
|
"Content-Type": "application/json",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.87
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -113,111 +113,37 @@ Requires-Dist: langfuse>=2.0.0; extra == "proxy"
|
|
|
113
113
|
Requires-Dist: uuid6>=2025.0.0; extra == "proxy"
|
|
114
114
|
Dynamic: license-file
|
|
115
115
|
|
|
116
|
-
# Eval Protocol
|
|
116
|
+
# Eval Protocol
|
|
117
117
|
|
|
118
118
|
[](https://pypi.org/project/eval-protocol/)
|
|
119
119
|
[](https://deepwiki.com/eval-protocol/python-sdk)
|
|
120
120
|
|
|
121
|
-
**
|
|
121
|
+
**Eval Protocol (EP) is an open solution for doing reinforcement learning fine-tuning on existing agents — across any language, container, or framework.**
|
|
122
122
|
|
|
123
|
-
|
|
123
|
+

|
|
124
124
|
|
|
125
|
-
-
|
|
126
|
-
- **Robust rollouts**: Handles flaky LLM APIs and parallel execution
|
|
127
|
-
- **Integrations**: Works with Langfuse, LangSmith, Braintrust, Responses API
|
|
128
|
-
- **Agent support**: LangGraph and Pydantic AI
|
|
129
|
-
- **MCP RL envs**: Build reinforcement learning environments with MCP
|
|
130
|
-
- **Built-in benchmarks**: AIME, tau-bench
|
|
131
|
-
- **LLM judge**: Stack-rank models using pairwise Arena-Hard-Auto
|
|
132
|
-
- **Local UI**: Pivot/table views for real-time analysis
|
|
125
|
+
Most teams already have complex agents running in production — often across remote services with heavy dependencies, Docker containers, or TypeScript backends deployed on Vercel. When they try to train or fine-tune these agents with reinforcement learning, connecting them to a trainer quickly becomes painful.
|
|
133
126
|
|
|
134
|
-
|
|
127
|
+
Eval Protocol makes this possible in two ways:
|
|
135
128
|
|
|
136
|
-
|
|
129
|
+
1. **Expose your agent through a simple API**
|
|
130
|
+
Wrap your existing agent (Python, TypeScript, Docker, etc.) in a simple HTTP service using EP’s rollout interface. EP handles the rollout orchestration, metadata passing, and trace storage automatically.
|
|
131
|
+
2. **Connect with any trainer**
|
|
132
|
+
Once your agent speaks the EP standard, it can be fine-tuned or evaluated with any supported trainer — Fireworks RFT, TRL, Unsloth, or your own — with no environment rewrites.
|
|
137
133
|
|
|
138
|
-
|
|
139
|
-
pip install 'eval-protocol[langfuse]'
|
|
134
|
+
The result: RL that works out-of-the-box for existing production agents.
|
|
140
135
|
|
|
141
|
-
|
|
142
|
-
export OPENAI_API_KEY=...
|
|
143
|
-
export FIREWORKS_API_KEY=...
|
|
144
|
-
export GEMINI_API_KEY=...
|
|
136
|
+
## Who This Is For
|
|
145
137
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
export LANGFUSE_HOST=https://your-deployment.com # optional
|
|
150
|
-
```
|
|
138
|
+
- **Applied AI teams** adding RL to existing production agents.
|
|
139
|
+
- **Research engineers** experimenting with fine-tuning complex multi-turn or tool-using agents.
|
|
140
|
+
- **MLOps teams** building reproducible, language-agnostic rollout pipelines.
|
|
151
141
|
|
|
152
|
-
|
|
142
|
+
## Quickstart
|
|
153
143
|
|
|
154
|
-
|
|
155
|
-
from datetime import datetime
|
|
156
|
-
import pytest
|
|
144
|
+
- See the Quickstart repository: [eval-protocol/quickstart](https://github.com/eval-protocol/quickstart/tree/main)
|
|
157
145
|
|
|
158
|
-
|
|
159
|
-
evaluation_test,
|
|
160
|
-
aha_judge,
|
|
161
|
-
EvaluationRow,
|
|
162
|
-
SingleTurnRolloutProcessor,
|
|
163
|
-
DynamicDataLoader,
|
|
164
|
-
create_langfuse_adapter,
|
|
165
|
-
)
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
def langfuse_data_generator() -> list[EvaluationRow]:
|
|
169
|
-
adapter = create_langfuse_adapter()
|
|
170
|
-
return adapter.get_evaluation_rows(
|
|
171
|
-
to_timestamp=datetime.utcnow(),
|
|
172
|
-
limit=20,
|
|
173
|
-
sample_size=5,
|
|
174
|
-
)
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
@pytest.mark.parametrize(
|
|
178
|
-
"completion_params",
|
|
179
|
-
[
|
|
180
|
-
{"model": "openai/gpt-4.1"},
|
|
181
|
-
{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"},
|
|
182
|
-
],
|
|
183
|
-
)
|
|
184
|
-
@evaluation_test(
|
|
185
|
-
data_loaders=DynamicDataLoader(generators=[langfuse_data_generator]),
|
|
186
|
-
rollout_processor=SingleTurnRolloutProcessor(),
|
|
187
|
-
)
|
|
188
|
-
async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
|
|
189
|
-
return await aha_judge(row)
|
|
190
|
-
```
|
|
191
|
-
|
|
192
|
-
Run it:
|
|
193
|
-
|
|
194
|
-
```bash
|
|
195
|
-
pytest -q -s
|
|
196
|
-
```
|
|
197
|
-
|
|
198
|
-
The pytest output includes local links for a leaderboard and row-level traces (pivot/table) at `http://localhost:8000`.
|
|
199
|
-
|
|
200
|
-
## Installation
|
|
201
|
-
|
|
202
|
-
This library requires Python >= 3.10.
|
|
203
|
-
|
|
204
|
-
### pip
|
|
205
|
-
|
|
206
|
-
```bash
|
|
207
|
-
pip install eval-protocol
|
|
208
|
-
```
|
|
209
|
-
|
|
210
|
-
### uv (recommended)
|
|
211
|
-
|
|
212
|
-
```bash
|
|
213
|
-
# Install uv (if needed)
|
|
214
|
-
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
215
|
-
|
|
216
|
-
# Add to your project
|
|
217
|
-
uv add eval-protocol
|
|
218
|
-
```
|
|
219
|
-
|
|
220
|
-
## 📚 Resources
|
|
146
|
+
## Resources
|
|
221
147
|
|
|
222
148
|
- **[Documentation](https://evalprotocol.io)** – Guides and API reference
|
|
223
149
|
- **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
|