eval-protocol 0.3.10.dev1__tar.gz → 0.3.11__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.3.10.dev1/eval_protocol.egg-info → eval_protocol-0.3.11}/PKG-INFO +2 -2
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/auth.py +1 -29
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli.py +6 -8
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/create_rft.py +100 -66
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/upload.py +3 -3
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/evaluation.py +32 -53
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/platform_api.py +27 -17
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_agent_rollout_processor.py +5 -1
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +27 -21
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +11 -7
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_single_turn_rollout_process.py +12 -11
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/evaluation_test.py +0 -3
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/evaluation_test_utils.py +0 -19
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/github_action_rollout_processor.py +7 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/openenv_rollout_processor.py +10 -6
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/remote_rollout_processor.py +7 -0
- eval_protocol-0.3.11/eval_protocol/pytest/utils.py +24 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11/eval_protocol.egg-info}/PKG-INFO +2 -2
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol.egg-info/SOURCES.txt +1 -2
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol.egg-info/requires.txt +1 -1
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/pyproject.toml +1 -1
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_cli_create_rft.py +61 -17
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_ep_upload_e2e.py +140 -51
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_evaluation.py +7 -22
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_upload_entrypoint.py +12 -10
- eval_protocol-0.3.10.dev1/eval_protocol/fireworks_client.py +0 -132
- eval_protocol-0.3.10.dev1/tests/test_fireworks_client.py +0 -143
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/LICENSE +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/README.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/development/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/development/utils/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/dataframe.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/export_docs.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/local_test.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/config.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/fireworks_rft.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/openai_rft.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/tinker_cookbook.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/models.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/buffer.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/priority_scheduler.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/server.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/training/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/training/gepa_trainer.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/training/gepa_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/training/trainer.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/training/utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/setup.cfg +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/setup.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_auth.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_cli_local_test.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_config.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_exception_config.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_format.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_human_id.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_integration.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_length.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_litellm_policy_provider_fields.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_math.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_minimal.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_models.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_openai_rft_integration.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_packaging.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_priority_scheduler.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_readiness.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_repetition.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_rollout_logprobs.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_server.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_sqlite_hardening.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_status_model.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_training_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/versioneer.py +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vite-app/dist/assets/index-10cZ11iB.js +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vite-app/dist/assets/index-10cZ11iB.js.map +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vite-app/dist/assets/index-DOD73Wyg.css +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.11
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -29,7 +29,7 @@ Requires-Dist: pytest>=6.0.0
|
|
|
29
29
|
Requires-Dist: pytest-asyncio>=0.21.0
|
|
30
30
|
Requires-Dist: peewee>=3.18.2
|
|
31
31
|
Requires-Dist: backoff>=2.2.0
|
|
32
|
-
Requires-Dist: fireworks-ai==1.0.
|
|
32
|
+
Requires-Dist: fireworks-ai==1.0.0a20
|
|
33
33
|
Requires-Dist: questionary>=2.0.0
|
|
34
34
|
Requires-Dist: toml>=0.10.0
|
|
35
35
|
Requires-Dist: loguru>=0.6.0
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2026-01-
|
|
11
|
+
"date": "2026-01-13T17:18:11-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.3.
|
|
14
|
+
"full-revisionid": "6702c557e88f2d256fd820770e6ab6b32db72701",
|
|
15
|
+
"version": "0.3.11"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -3,30 +3,9 @@ import os
|
|
|
3
3
|
from typing import Optional
|
|
4
4
|
|
|
5
5
|
import requests
|
|
6
|
-
from dotenv import find_dotenv, load_dotenv
|
|
7
6
|
|
|
8
7
|
logger = logging.getLogger(__name__)
|
|
9
8
|
|
|
10
|
-
# --- Load .env files ---
|
|
11
|
-
# Attempt to load .env.dev first, then .env as a fallback.
|
|
12
|
-
# This happens when the module is imported.
|
|
13
|
-
# We use override=False (default) so that existing environment variables
|
|
14
|
-
# (e.g., set in the shell) are NOT overridden by .env files.
|
|
15
|
-
_ENV_DEV_PATH = find_dotenv(filename=".env.dev", raise_error_if_not_found=False, usecwd=True)
|
|
16
|
-
if _ENV_DEV_PATH:
|
|
17
|
-
load_dotenv(dotenv_path=_ENV_DEV_PATH, override=False)
|
|
18
|
-
logger.debug(f"eval_protocol.auth: Loaded environment variables from: {_ENV_DEV_PATH}")
|
|
19
|
-
else:
|
|
20
|
-
_ENV_PATH = find_dotenv(filename=".env", raise_error_if_not_found=False, usecwd=True)
|
|
21
|
-
if _ENV_PATH:
|
|
22
|
-
load_dotenv(dotenv_path=_ENV_PATH, override=False)
|
|
23
|
-
logger.debug(f"eval_protocol.auth: Loaded environment variables from: {_ENV_PATH}")
|
|
24
|
-
else:
|
|
25
|
-
logger.debug(
|
|
26
|
-
"eval_protocol.auth: No .env.dev or .env file found. Relying on shell/existing environment variables."
|
|
27
|
-
)
|
|
28
|
-
# --- End .env loading ---
|
|
29
|
-
|
|
30
9
|
|
|
31
10
|
def get_fireworks_api_key() -> Optional[str]:
|
|
32
11
|
"""
|
|
@@ -94,8 +73,6 @@ def verify_api_key_and_get_account_id(
|
|
|
94
73
|
Args:
|
|
95
74
|
api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
|
|
96
75
|
api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
|
|
97
|
-
If api_base is api.fireworks.ai, it is used directly. Otherwise, defaults to
|
|
98
|
-
dev.api.fireworks.ai for the verification call.
|
|
99
76
|
|
|
100
77
|
Returns:
|
|
101
78
|
The resolved account id if verification succeeds and the header is present; otherwise None.
|
|
@@ -104,12 +81,7 @@ def verify_api_key_and_get_account_id(
|
|
|
104
81
|
resolved_key = api_key or get_fireworks_api_key()
|
|
105
82
|
if not resolved_key:
|
|
106
83
|
return None
|
|
107
|
-
|
|
108
|
-
# Use api.fireworks.ai if explicitly provided, otherwise fall back to dev
|
|
109
|
-
if "api.fireworks.ai" in provided_base:
|
|
110
|
-
resolved_base = provided_base
|
|
111
|
-
else:
|
|
112
|
-
resolved_base = "https://dev.api.fireworks.ai"
|
|
84
|
+
resolved_base = api_base or get_fireworks_api_base()
|
|
113
85
|
|
|
114
86
|
from .common_utils import get_user_agent
|
|
115
87
|
|
|
@@ -81,12 +81,13 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
|
|
|
81
81
|
"--env-file",
|
|
82
82
|
help="Path to .env file containing secrets to upload (default: .env in current directory)",
|
|
83
83
|
)
|
|
84
|
+
upload_parser.add_argument(
|
|
85
|
+
"--force",
|
|
86
|
+
action="store_true",
|
|
87
|
+
help="Overwrite existing evaluator with the same ID",
|
|
88
|
+
)
|
|
84
89
|
|
|
85
90
|
# Auto-generate flags from SDK Fireworks().evaluators.create() signature
|
|
86
|
-
# Note: We use Fireworks() directly here instead of create_fireworks_client()
|
|
87
|
-
# because we only need the method signature for introspection, not a fully
|
|
88
|
-
# authenticated client. create_fireworks_client() would trigger an HTTP request
|
|
89
|
-
# to verify the API key, causing delays even for --help invocations.
|
|
90
91
|
create_evaluator_fn = Fireworks().evaluators.create
|
|
91
92
|
|
|
92
93
|
upload_skip_fields = {
|
|
@@ -136,6 +137,7 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
|
|
|
136
137
|
|
|
137
138
|
rft_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode")
|
|
138
139
|
rft_parser.add_argument("--dry-run", action="store_true", help="Print planned SDK call without sending")
|
|
140
|
+
rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
|
|
139
141
|
rft_parser.add_argument("--skip-validation", action="store_true", help="Skip local dataset/evaluator validation")
|
|
140
142
|
rft_parser.add_argument(
|
|
141
143
|
"--ignore-docker",
|
|
@@ -196,10 +198,6 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
|
|
|
196
198
|
"loss_config.method": "RL loss method for underlying trainers. One of {grpo,dapo}.",
|
|
197
199
|
}
|
|
198
200
|
|
|
199
|
-
# Note: We use Fireworks() directly here instead of create_fireworks_client()
|
|
200
|
-
# because we only need the method signature for introspection, not a fully
|
|
201
|
-
# authenticated client. create_fireworks_client() would trigger an HTTP request
|
|
202
|
-
# to verify the API key, causing delays even for --help invocations.
|
|
203
201
|
create_rft_job_fn = Fireworks().reinforcement_fine_tuning_jobs.create
|
|
204
202
|
|
|
205
203
|
add_args_from_callable_signature(
|
|
@@ -7,18 +7,19 @@ import sys
|
|
|
7
7
|
import time
|
|
8
8
|
from typing import Any, Callable, Dict, Optional
|
|
9
9
|
import inspect
|
|
10
|
+
import requests
|
|
10
11
|
import tempfile
|
|
11
12
|
from pydantic import ValidationError
|
|
12
13
|
|
|
13
14
|
from ..auth import get_fireworks_api_base, get_fireworks_api_key
|
|
14
|
-
from ..
|
|
15
|
-
from ..common_utils import load_jsonl
|
|
15
|
+
from ..common_utils import get_user_agent, load_jsonl
|
|
16
16
|
from ..fireworks_rft import (
|
|
17
17
|
create_dataset_from_jsonl,
|
|
18
18
|
detect_dataset_builder,
|
|
19
19
|
materialize_dataset_via_builder,
|
|
20
20
|
)
|
|
21
21
|
from ..models import EvaluationRow
|
|
22
|
+
from .upload import upload_command
|
|
22
23
|
from .utils import (
|
|
23
24
|
_build_entry_point,
|
|
24
25
|
_build_trimmed_dataset_id,
|
|
@@ -34,6 +35,8 @@ from .utils import (
|
|
|
34
35
|
)
|
|
35
36
|
from .local_test import run_evaluator_test
|
|
36
37
|
|
|
38
|
+
from fireworks import Fireworks
|
|
39
|
+
|
|
37
40
|
|
|
38
41
|
def _extract_dataset_adapter(
|
|
39
42
|
test_file_path: str, test_func_name: str
|
|
@@ -220,68 +223,64 @@ def _extract_jsonl_from_input_dataset(test_file_path: str, test_func_name: str)
|
|
|
220
223
|
return None
|
|
221
224
|
|
|
222
225
|
|
|
223
|
-
def
|
|
224
|
-
|
|
225
|
-
version_id: str,
|
|
226
|
-
api_key: str,
|
|
227
|
-
api_base: str,
|
|
228
|
-
timeout_minutes: int = 10,
|
|
226
|
+
def _poll_evaluator_status(
|
|
227
|
+
evaluator_resource_name: str, api_key: str, api_base: str, timeout_minutes: int = 10
|
|
229
228
|
) -> bool:
|
|
230
229
|
"""
|
|
231
|
-
Poll
|
|
232
|
-
|
|
233
|
-
Uses the Fireworks SDK to get the specified version of the evaluator and checks
|
|
234
|
-
its build state.
|
|
230
|
+
Poll evaluator status until it becomes ACTIVE or times out.
|
|
235
231
|
|
|
236
232
|
Args:
|
|
237
|
-
|
|
238
|
-
version_id: The specific version ID to poll
|
|
233
|
+
evaluator_resource_name: Full evaluator resource name (e.g., accounts/xxx/evaluators/yyy)
|
|
239
234
|
api_key: Fireworks API key
|
|
240
235
|
api_base: Fireworks API base URL
|
|
241
236
|
timeout_minutes: Maximum time to wait in minutes
|
|
242
237
|
|
|
243
238
|
Returns:
|
|
244
|
-
True if evaluator
|
|
239
|
+
True if evaluator becomes ACTIVE, False if timeout or BUILD_FAILED
|
|
245
240
|
"""
|
|
241
|
+
headers = {
|
|
242
|
+
"Authorization": f"Bearer {api_key}",
|
|
243
|
+
"Content-Type": "application/json",
|
|
244
|
+
"User-Agent": get_user_agent(),
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
check_url = f"{api_base}/v1/{evaluator_resource_name}"
|
|
246
248
|
timeout_seconds = timeout_minutes * 60
|
|
247
249
|
poll_interval = 10 # seconds
|
|
248
250
|
start_time = time.time()
|
|
249
251
|
|
|
250
|
-
print(
|
|
251
|
-
f"Polling evaluator version '{version_id}' status (timeout: {timeout_minutes}m, interval: {poll_interval}s)..."
|
|
252
|
-
)
|
|
253
|
-
|
|
254
|
-
client = create_fireworks_client(api_key=api_key, base_url=api_base)
|
|
252
|
+
print(f"Polling evaluator status (timeout: {timeout_minutes}m, interval: {poll_interval}s)...")
|
|
255
253
|
|
|
256
254
|
while time.time() - start_time < timeout_seconds:
|
|
257
255
|
try:
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
256
|
+
response = requests.get(check_url, headers=headers, timeout=30)
|
|
257
|
+
response.raise_for_status()
|
|
258
|
+
|
|
259
|
+
evaluator_data = response.json()
|
|
260
|
+
state = evaluator_data.get("state", "STATE_UNSPECIFIED")
|
|
261
|
+
status = evaluator_data.get("status", "")
|
|
263
262
|
|
|
264
263
|
if state == "ACTIVE":
|
|
265
|
-
print("✅ Evaluator
|
|
264
|
+
print("✅ Evaluator is ACTIVE and ready!")
|
|
266
265
|
return True
|
|
267
266
|
elif state == "BUILD_FAILED":
|
|
268
|
-
print(f"❌ Evaluator
|
|
267
|
+
print(f"❌ Evaluator build failed. Status: {status}")
|
|
269
268
|
return False
|
|
270
269
|
elif state == "BUILDING":
|
|
271
270
|
elapsed_minutes = (time.time() - start_time) / 60
|
|
272
|
-
print(f"⏳ Evaluator
|
|
271
|
+
print(f"⏳ Evaluator is still building... ({elapsed_minutes:.1f}m elapsed)")
|
|
273
272
|
else:
|
|
274
|
-
print(f"⏳ Evaluator
|
|
273
|
+
print(f"⏳ Evaluator state: {state}, status: {status}")
|
|
275
274
|
|
|
276
|
-
except
|
|
277
|
-
print(f"Warning: Failed to check evaluator
|
|
275
|
+
except requests.exceptions.RequestException as e:
|
|
276
|
+
print(f"Warning: Failed to check evaluator status: {e}")
|
|
278
277
|
|
|
279
278
|
# Wait before next poll
|
|
280
279
|
time.sleep(poll_interval)
|
|
281
280
|
|
|
282
281
|
# Timeout reached
|
|
283
282
|
elapsed_minutes = (time.time() - start_time) / 60
|
|
284
|
-
print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator
|
|
283
|
+
print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator is not yet ACTIVE")
|
|
285
284
|
return False
|
|
286
285
|
|
|
287
286
|
|
|
@@ -566,16 +565,42 @@ def _upload_dataset(
|
|
|
566
565
|
def _upload_and_ensure_evaluator(
|
|
567
566
|
project_root: str,
|
|
568
567
|
evaluator_id: str,
|
|
568
|
+
evaluator_resource_name: str,
|
|
569
569
|
api_key: str,
|
|
570
570
|
api_base: str,
|
|
571
|
+
force: bool,
|
|
571
572
|
) -> bool:
|
|
572
|
-
"""
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
573
|
+
"""Ensure the evaluator exists and is ACTIVE, uploading it if needed."""
|
|
574
|
+
# Optional short-circuit: if evaluator already exists and not forcing, skip upload path
|
|
575
|
+
if not force:
|
|
576
|
+
try:
|
|
577
|
+
headers = {
|
|
578
|
+
"Authorization": f"Bearer {api_key}",
|
|
579
|
+
"Content-Type": "application/json",
|
|
580
|
+
"User-Agent": get_user_agent(),
|
|
581
|
+
}
|
|
582
|
+
resp = requests.get(f"{api_base}/v1/{evaluator_resource_name}", headers=headers, timeout=10)
|
|
583
|
+
if resp.ok:
|
|
584
|
+
state = resp.json().get("state", "STATE_UNSPECIFIED")
|
|
585
|
+
print(f"✓ Evaluator exists (state: {state}). Skipping upload (use --force to overwrite).")
|
|
586
|
+
# Poll for ACTIVE before proceeding
|
|
587
|
+
print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
|
|
588
|
+
if not _poll_evaluator_status(
|
|
589
|
+
evaluator_resource_name=evaluator_resource_name,
|
|
590
|
+
api_key=api_key,
|
|
591
|
+
api_base=api_base,
|
|
592
|
+
timeout_minutes=10,
|
|
593
|
+
):
|
|
594
|
+
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
|
|
595
|
+
print("\n❌ Evaluator is not ready within the timeout period.")
|
|
596
|
+
print(f"📊 Please check the evaluator status at: {dashboard_url}")
|
|
597
|
+
print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
|
|
598
|
+
return False
|
|
599
|
+
return True
|
|
600
|
+
except requests.exceptions.RequestException:
|
|
601
|
+
pass
|
|
578
602
|
|
|
603
|
+
# Ensure evaluator exists by invoking the upload flow programmatically
|
|
579
604
|
try:
|
|
580
605
|
tests = _discover_tests(project_root)
|
|
581
606
|
selected_entry: Optional[str] = None
|
|
@@ -592,37 +617,43 @@ def _upload_and_ensure_evaluator(
|
|
|
592
617
|
)
|
|
593
618
|
return False
|
|
594
619
|
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
620
|
+
upload_args = argparse.Namespace(
|
|
621
|
+
path=project_root,
|
|
622
|
+
entry=selected_entry,
|
|
623
|
+
id=evaluator_id,
|
|
624
|
+
display_name=None,
|
|
625
|
+
description=None,
|
|
626
|
+
force=force, # Pass through the --force flag
|
|
627
|
+
yes=True,
|
|
628
|
+
env_file=None, # Add the new env_file parameter
|
|
601
629
|
)
|
|
602
630
|
|
|
603
|
-
if
|
|
604
|
-
print("
|
|
605
|
-
return False
|
|
631
|
+
if force:
|
|
632
|
+
print(f"🔄 Force flag enabled - will overwrite existing evaluator '{evaluator_id}'")
|
|
606
633
|
|
|
607
|
-
|
|
634
|
+
rc = upload_command(upload_args)
|
|
635
|
+
if rc == 0:
|
|
636
|
+
print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
|
|
608
637
|
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
)
|
|
638
|
+
# Poll for evaluator status
|
|
639
|
+
print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
|
|
640
|
+
is_active = _poll_evaluator_status(
|
|
641
|
+
evaluator_resource_name=evaluator_resource_name,
|
|
642
|
+
api_key=api_key,
|
|
643
|
+
api_base=api_base,
|
|
644
|
+
timeout_minutes=10,
|
|
645
|
+
)
|
|
618
646
|
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
647
|
+
if not is_active:
|
|
648
|
+
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
|
|
649
|
+
print("\n❌ Evaluator is not ready within the timeout period.")
|
|
650
|
+
print(f"📊 Please check the evaluator status at: {dashboard_url}")
|
|
651
|
+
print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
|
|
652
|
+
return False
|
|
653
|
+
return True
|
|
654
|
+
else:
|
|
655
|
+
print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
|
|
624
656
|
return False
|
|
625
|
-
return True
|
|
626
657
|
except Exception as e:
|
|
627
658
|
print(f"Warning: Failed to upload evaluator automatically: {e}")
|
|
628
659
|
return False
|
|
@@ -641,7 +672,7 @@ def _create_rft_job(
|
|
|
641
672
|
) -> int:
|
|
642
673
|
"""Build and submit the RFT job request (via Fireworks SDK)."""
|
|
643
674
|
|
|
644
|
-
signature = inspect.signature(
|
|
675
|
+
signature = inspect.signature(Fireworks().reinforcement_fine_tuning_jobs.create)
|
|
645
676
|
|
|
646
677
|
# Build top-level SDK kwargs
|
|
647
678
|
sdk_kwargs: Dict[str, Any] = {
|
|
@@ -680,7 +711,7 @@ def _create_rft_job(
|
|
|
680
711
|
return 0
|
|
681
712
|
|
|
682
713
|
try:
|
|
683
|
-
fw: Fireworks =
|
|
714
|
+
fw: Fireworks = Fireworks(api_key=api_key, base_url=api_base)
|
|
684
715
|
job: ReinforcementFineTuningJob = fw.reinforcement_fine_tuning_jobs.create(account_id=account_id, **sdk_kwargs)
|
|
685
716
|
job_name = job.name
|
|
686
717
|
print(f"\n✅ Created Reinforcement Fine-tuning Job: {job_name}")
|
|
@@ -708,6 +739,7 @@ def create_rft_command(args) -> int:
|
|
|
708
739
|
evaluator_arg: Optional[str] = getattr(args, "evaluator", None)
|
|
709
740
|
non_interactive: bool = bool(getattr(args, "yes", False))
|
|
710
741
|
dry_run: bool = bool(getattr(args, "dry_run", False))
|
|
742
|
+
force: bool = bool(getattr(args, "force", False))
|
|
711
743
|
skip_validation: bool = bool(getattr(args, "skip_validation", False))
|
|
712
744
|
ignore_docker: bool = bool(getattr(args, "ignore_docker", False))
|
|
713
745
|
docker_build_extra: str = getattr(args, "docker_build_extra", "") or ""
|
|
@@ -778,12 +810,14 @@ def create_rft_command(args) -> int:
|
|
|
778
810
|
if not dataset_id or not dataset_resource:
|
|
779
811
|
return 1
|
|
780
812
|
|
|
781
|
-
# 5) Ensure evaluator exists and
|
|
813
|
+
# 5) Ensure evaluator exists and is ACTIVE (upload + poll if needed)
|
|
782
814
|
if not _upload_and_ensure_evaluator(
|
|
783
815
|
project_root=project_root,
|
|
784
816
|
evaluator_id=evaluator_id,
|
|
817
|
+
evaluator_resource_name=evaluator_resource_name,
|
|
785
818
|
api_key=api_key,
|
|
786
819
|
api_base=api_base,
|
|
820
|
+
force=force,
|
|
787
821
|
):
|
|
788
822
|
return 1
|
|
789
823
|
|
|
@@ -289,6 +289,7 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
289
289
|
base_id = getattr(args, "id", None)
|
|
290
290
|
display_name = getattr(args, "display_name", None)
|
|
291
291
|
description = getattr(args, "description", None)
|
|
292
|
+
force = bool(getattr(args, "force", False))
|
|
292
293
|
env_file = getattr(args, "env_file", None)
|
|
293
294
|
|
|
294
295
|
# Load secrets from .env file and ensure they're available on Fireworks
|
|
@@ -377,18 +378,17 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
377
378
|
|
|
378
379
|
print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
|
|
379
380
|
try:
|
|
380
|
-
result
|
|
381
|
+
result = create_evaluation(
|
|
381
382
|
evaluator_id=evaluator_id,
|
|
382
383
|
display_name=display_name or evaluator_id,
|
|
383
384
|
description=description or f"Evaluator for {qualname}",
|
|
385
|
+
force=force,
|
|
384
386
|
entry_point=entry_point,
|
|
385
387
|
)
|
|
386
388
|
name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id
|
|
387
389
|
|
|
388
390
|
# Print success message with Fireworks dashboard link
|
|
389
391
|
print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
|
|
390
|
-
if version_id:
|
|
391
|
-
print(f" Version: {version_id}")
|
|
392
392
|
print("📊 View in Fireworks Dashboard:")
|
|
393
393
|
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
|
|
394
394
|
print(f" {dashboard_url}\n")
|
|
@@ -4,15 +4,14 @@ import time
|
|
|
4
4
|
from typing import List, Optional
|
|
5
5
|
|
|
6
6
|
import fireworks
|
|
7
|
-
from fireworks.types import EvaluatorVersionParam
|
|
8
7
|
import requests
|
|
8
|
+
from fireworks import Fireworks
|
|
9
9
|
|
|
10
10
|
from eval_protocol.auth import (
|
|
11
11
|
get_fireworks_account_id,
|
|
12
12
|
get_fireworks_api_key,
|
|
13
13
|
verify_api_key_and_get_account_id,
|
|
14
14
|
)
|
|
15
|
-
from eval_protocol.fireworks_client import create_fireworks_client
|
|
16
15
|
from eval_protocol.get_pep440_version import get_pep440_version
|
|
17
16
|
|
|
18
17
|
logger = logging.getLogger(__name__)
|
|
@@ -154,7 +153,7 @@ class Evaluator:
|
|
|
154
153
|
logger.info(f"Created {output_path} ({size_bytes:,} bytes)")
|
|
155
154
|
return size_bytes
|
|
156
155
|
|
|
157
|
-
def create(self, evaluator_id, display_name=None, description=None):
|
|
156
|
+
def create(self, evaluator_id, display_name=None, description=None, force=False):
|
|
158
157
|
auth_token = self.api_key or get_fireworks_api_key()
|
|
159
158
|
account_id = self.account_id or get_fireworks_account_id()
|
|
160
159
|
if not account_id and auth_token:
|
|
@@ -164,11 +163,7 @@ class Evaluator:
|
|
|
164
163
|
logger.error("Authentication error: API credentials appear to be invalid or incomplete.")
|
|
165
164
|
raise ValueError("Invalid or missing API credentials.")
|
|
166
165
|
|
|
167
|
-
client =
|
|
168
|
-
api_key=auth_token,
|
|
169
|
-
base_url=self.api_base,
|
|
170
|
-
account_id=account_id,
|
|
171
|
-
)
|
|
166
|
+
client = Fireworks(api_key=auth_token, base_url=self.api_base, account_id=account_id)
|
|
172
167
|
|
|
173
168
|
self.display_name = display_name or evaluator_id
|
|
174
169
|
self.description = description or f"Evaluator created from {evaluator_id}"
|
|
@@ -202,20 +197,28 @@ class Evaluator:
|
|
|
202
197
|
logger.info(f"Creating evaluator '{evaluator_id}' for account '{account_id}'...")
|
|
203
198
|
|
|
204
199
|
try:
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
evaluator_id=evaluator_id
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
200
|
+
if force:
|
|
201
|
+
try:
|
|
202
|
+
logger.info("Checking if evaluator exists")
|
|
203
|
+
existing_evaluator = client.evaluators.get(evaluator_id=evaluator_id)
|
|
204
|
+
if existing_evaluator:
|
|
205
|
+
logger.info(f"Evaluator '{evaluator_id}' already exists, deleting and recreating...")
|
|
206
|
+
try:
|
|
207
|
+
client.evaluators.delete(evaluator_id=evaluator_id)
|
|
208
|
+
logger.info(f"Successfully deleted evaluator '{evaluator_id}'")
|
|
209
|
+
except fireworks.NotFoundError:
|
|
210
|
+
logger.info(f"Evaluator '{evaluator_id}' not found, creating...")
|
|
211
|
+
except fireworks.APIError as e:
|
|
212
|
+
logger.warning(f"Error deleting evaluator: {str(e)}")
|
|
213
|
+
except fireworks.NotFoundError:
|
|
214
|
+
logger.info(f"Evaluator '{evaluator_id}' does not exist, creating...")
|
|
215
|
+
|
|
216
|
+
# Create evaluator using SDK
|
|
217
|
+
result = client.evaluators.create(
|
|
218
|
+
evaluator_id=evaluator_id,
|
|
219
|
+
evaluator=evaluator_params,
|
|
220
|
+
)
|
|
221
|
+
logger.info(f"Successfully created evaluator '{evaluator_id}'")
|
|
219
222
|
|
|
220
223
|
# Upload code as tar.gz to GCS
|
|
221
224
|
evaluator_name = result.name # e.g., "accounts/pyroworks/evaluators/test-123"
|
|
@@ -226,25 +229,6 @@ class Evaluator:
|
|
|
226
229
|
f"Cannot proceed with code upload. Response: {result}"
|
|
227
230
|
)
|
|
228
231
|
|
|
229
|
-
evaluator_version_param: EvaluatorVersionParam = {}
|
|
230
|
-
if "commit_hash" in evaluator_params:
|
|
231
|
-
evaluator_version_param["commit_hash"] = evaluator_params["commit_hash"]
|
|
232
|
-
if "entry_point" in evaluator_params:
|
|
233
|
-
evaluator_version_param["entry_point"] = evaluator_params["entry_point"]
|
|
234
|
-
if "requirements" in evaluator_params:
|
|
235
|
-
evaluator_version_param["requirements"] = evaluator_params["requirements"]
|
|
236
|
-
|
|
237
|
-
evaluator_version = client.evaluator_versions.create(
|
|
238
|
-
evaluator_id=evaluator_id,
|
|
239
|
-
evaluator_version=evaluator_version_param,
|
|
240
|
-
)
|
|
241
|
-
evaluator_version_id = evaluator_version.name.split("/")[-1] if evaluator_version.name else None
|
|
242
|
-
if not evaluator_version_id:
|
|
243
|
-
raise ValueError(
|
|
244
|
-
"Create evaluator version response missing 'name' field. "
|
|
245
|
-
f"Cannot proceed with code upload. Response: {evaluator_version}"
|
|
246
|
-
)
|
|
247
|
-
|
|
248
232
|
try:
|
|
249
233
|
# Create tar.gz of current directory
|
|
250
234
|
cwd = os.getcwd()
|
|
@@ -256,8 +240,7 @@ class Evaluator:
|
|
|
256
240
|
|
|
257
241
|
# Call GetEvaluatorUploadEndpoint using SDK
|
|
258
242
|
logger.info(f"Requesting upload endpoint for {tar_filename}")
|
|
259
|
-
upload_response = client.
|
|
260
|
-
version_id=evaluator_version_id,
|
|
243
|
+
upload_response = client.evaluators.get_upload_endpoint(
|
|
261
244
|
evaluator_id=evaluator_id,
|
|
262
245
|
filename_to_size={tar_filename: str(tar_size)},
|
|
263
246
|
)
|
|
@@ -338,9 +321,9 @@ class Evaluator:
|
|
|
338
321
|
raise
|
|
339
322
|
|
|
340
323
|
# Step 3: Validate upload using SDK
|
|
341
|
-
client.
|
|
342
|
-
version_id=evaluator_version_id,
|
|
324
|
+
client.evaluators.validate_upload(
|
|
343
325
|
evaluator_id=evaluator_id,
|
|
326
|
+
body={},
|
|
344
327
|
)
|
|
345
328
|
logger.info("Upload validated successfully")
|
|
346
329
|
|
|
@@ -351,10 +334,8 @@ class Evaluator:
|
|
|
351
334
|
except Exception as upload_error:
|
|
352
335
|
logger.warning(f"Code upload failed (evaluator created but code not uploaded): {upload_error}")
|
|
353
336
|
# Don't fail - evaluator is created, just code upload failed
|
|
354
|
-
# Return None for version_id since upload failed
|
|
355
|
-
return result, None
|
|
356
337
|
|
|
357
|
-
return result
|
|
338
|
+
return result # Return after attempting upload
|
|
358
339
|
except fireworks.APIStatusError as e:
|
|
359
340
|
logger.error(f"Error creating evaluator: {str(e)}")
|
|
360
341
|
logger.error(f"Status code: {e.status_code}, Response: {e.response.text}")
|
|
@@ -380,6 +361,7 @@ def create_evaluation(
|
|
|
380
361
|
evaluator_id: str,
|
|
381
362
|
display_name: Optional[str] = None,
|
|
382
363
|
description: Optional[str] = None,
|
|
364
|
+
force: bool = False,
|
|
383
365
|
account_id: Optional[str] = None,
|
|
384
366
|
api_key: Optional[str] = None,
|
|
385
367
|
entry_point: Optional[str] = None,
|
|
@@ -391,13 +373,10 @@ def create_evaluation(
|
|
|
391
373
|
evaluator_id: Unique identifier for the evaluator
|
|
392
374
|
display_name: Display name for the evaluator
|
|
393
375
|
description: Description for the evaluator
|
|
376
|
+
force: If True, delete and recreate if evaluator exists
|
|
394
377
|
account_id: Optional Fireworks account ID
|
|
395
378
|
api_key: Optional Fireworks API key
|
|
396
379
|
entry_point: Optional entry point (module::function or path::function)
|
|
397
|
-
|
|
398
|
-
Returns:
|
|
399
|
-
A tuple of (evaluator_result, version_id) where version_id is the ID of the
|
|
400
|
-
created evaluator version, or None if upload failed.
|
|
401
380
|
"""
|
|
402
381
|
evaluator = Evaluator(
|
|
403
382
|
account_id=account_id,
|
|
@@ -405,4 +384,4 @@ def create_evaluation(
|
|
|
405
384
|
entry_point=entry_point,
|
|
406
385
|
)
|
|
407
386
|
|
|
408
|
-
return evaluator.create(evaluator_id, display_name, description)
|
|
387
|
+
return evaluator.create(evaluator_id, display_name, description, force)
|