eval-protocol 0.3.9.dev3__tar.gz → 0.3.10.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.3.9.dev3/eval_protocol.egg-info → eval_protocol-0.3.10.dev1}/PKG-INFO +2 -2
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/fireworks_tracing.py +9 -2
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/auth.py +29 -1
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli.py +8 -6
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli_commands/create_rft.py +66 -100
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli_commands/upload.py +3 -3
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/evaluation.py +53 -32
- eval_protocol-0.3.10.dev1/eval_protocol/fireworks_client.py +132 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/platform_api.py +17 -27
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/evaluation_test.py +27 -24
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/evaluation_test_utils.py +19 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/tracing_utils.py +6 -2
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1/eval_protocol.egg-info}/PKG-INFO +2 -2
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol.egg-info/SOURCES.txt +2 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol.egg-info/requires.txt +1 -1
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/pyproject.toml +1 -1
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_cli_create_rft.py +17 -61
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_ep_upload_e2e.py +51 -140
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_evaluation.py +22 -7
- eval_protocol-0.3.10.dev1/tests/test_fireworks_client.py +143 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_upload_entrypoint.py +10 -12
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/LICENSE +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/README.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/development/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/development/utils/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/dataframe.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli_commands/export_docs.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli_commands/local_test.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli_commands/utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/config.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/fireworks_rft.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/integrations/openai_rft.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/integrations/tinker_cookbook.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/models.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/buffer.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/priority_scheduler.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/server.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/training/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/training/gepa_trainer.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/training/gepa_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/training/trainer.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/training/utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/setup.cfg +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/setup.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_auth.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_cli_local_test.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_config.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_exception_config.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_format.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_human_id.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_integration.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_length.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_litellm_policy_provider_fields.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_math.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_minimal.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_models.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_openai_rft_integration.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_packaging.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_priority_scheduler.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_readiness.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_repetition.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_rollout_logprobs.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_server.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_sqlite_hardening.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_status_model.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_training_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/versioneer.py +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vite-app/dist/assets/index-10cZ11iB.js +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vite-app/dist/assets/index-10cZ11iB.js.map +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vite-app/dist/assets/index-DOD73Wyg.css +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.10.dev1
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -29,7 +29,7 @@ Requires-Dist: pytest>=6.0.0
|
|
|
29
29
|
Requires-Dist: pytest-asyncio>=0.21.0
|
|
30
30
|
Requires-Dist: peewee>=3.18.2
|
|
31
31
|
Requires-Dist: backoff>=2.2.0
|
|
32
|
-
Requires-Dist: fireworks-ai==1.0.
|
|
32
|
+
Requires-Dist: fireworks-ai==1.0.0a22
|
|
33
33
|
Requires-Dist: questionary>=2.0.0
|
|
34
34
|
Requires-Dist: toml>=0.10.0
|
|
35
35
|
Requires-Dist: loguru>=0.6.0
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2026-01-
|
|
11
|
+
"date": "2026-01-13T15:54:22-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.3.
|
|
14
|
+
"full-revisionid": "3314becfcdf35f771c41988a24f38dcb91593203",
|
|
15
|
+
"version": "0.3.10.dev.1"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
{eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/fireworks_tracing.py
RENAMED
|
@@ -253,6 +253,7 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
253
253
|
project_id: Optional[str] = None,
|
|
254
254
|
base_url: str = "https://tracing.fireworks.ai",
|
|
255
255
|
timeout: int = 300,
|
|
256
|
+
api_key: Optional[str] = None,
|
|
256
257
|
):
|
|
257
258
|
"""Initialize the Fireworks Tracing adapter.
|
|
258
259
|
|
|
@@ -260,10 +261,16 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
260
261
|
project_id: Optional project ID. If not provided, uses the default project configured on the server.
|
|
261
262
|
base_url: The base URL of the tracing proxy (default: https://tracing.fireworks.ai)
|
|
262
263
|
timeout: Request timeout in seconds (default: 300)
|
|
264
|
+
api_key: Optional API key. If not provided, falls back to FIREWORKS_API_KEY environment variable.
|
|
263
265
|
"""
|
|
264
266
|
self.project_id = project_id
|
|
265
267
|
self.base_url = base_url.rstrip("/")
|
|
266
268
|
self.timeout = timeout
|
|
269
|
+
self._api_key = api_key
|
|
270
|
+
|
|
271
|
+
def _get_api_key(self) -> Optional[str]:
|
|
272
|
+
"""Get the API key, preferring instance-level key over environment variable."""
|
|
273
|
+
return self._api_key or os.environ.get("FIREWORKS_API_KEY")
|
|
267
274
|
|
|
268
275
|
def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -> List[Dict[str, Any]]:
|
|
269
276
|
"""Fetch logs from Fireworks tracing gateway /logs endpoint.
|
|
@@ -276,7 +283,7 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
276
283
|
from ..common_utils import get_user_agent
|
|
277
284
|
|
|
278
285
|
headers = {
|
|
279
|
-
"Authorization": f"Bearer {
|
|
286
|
+
"Authorization": f"Bearer {self._get_api_key()}",
|
|
280
287
|
"User-Agent": get_user_agent(),
|
|
281
288
|
}
|
|
282
289
|
params: Dict[str, Any] = {"tags": tags, "limit": limit, "hours_back": hours_back, "program": "eval_protocol"}
|
|
@@ -407,7 +414,7 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
407
414
|
from ..common_utils import get_user_agent
|
|
408
415
|
|
|
409
416
|
headers = {
|
|
410
|
-
"Authorization": f"Bearer {
|
|
417
|
+
"Authorization": f"Bearer {self._get_api_key()}",
|
|
411
418
|
"User-Agent": get_user_agent(),
|
|
412
419
|
}
|
|
413
420
|
|
|
@@ -3,9 +3,30 @@ import os
|
|
|
3
3
|
from typing import Optional
|
|
4
4
|
|
|
5
5
|
import requests
|
|
6
|
+
from dotenv import find_dotenv, load_dotenv
|
|
6
7
|
|
|
7
8
|
logger = logging.getLogger(__name__)
|
|
8
9
|
|
|
10
|
+
# --- Load .env files ---
|
|
11
|
+
# Attempt to load .env.dev first, then .env as a fallback.
|
|
12
|
+
# This happens when the module is imported.
|
|
13
|
+
# We use override=False (default) so that existing environment variables
|
|
14
|
+
# (e.g., set in the shell) are NOT overridden by .env files.
|
|
15
|
+
_ENV_DEV_PATH = find_dotenv(filename=".env.dev", raise_error_if_not_found=False, usecwd=True)
|
|
16
|
+
if _ENV_DEV_PATH:
|
|
17
|
+
load_dotenv(dotenv_path=_ENV_DEV_PATH, override=False)
|
|
18
|
+
logger.debug(f"eval_protocol.auth: Loaded environment variables from: {_ENV_DEV_PATH}")
|
|
19
|
+
else:
|
|
20
|
+
_ENV_PATH = find_dotenv(filename=".env", raise_error_if_not_found=False, usecwd=True)
|
|
21
|
+
if _ENV_PATH:
|
|
22
|
+
load_dotenv(dotenv_path=_ENV_PATH, override=False)
|
|
23
|
+
logger.debug(f"eval_protocol.auth: Loaded environment variables from: {_ENV_PATH}")
|
|
24
|
+
else:
|
|
25
|
+
logger.debug(
|
|
26
|
+
"eval_protocol.auth: No .env.dev or .env file found. Relying on shell/existing environment variables."
|
|
27
|
+
)
|
|
28
|
+
# --- End .env loading ---
|
|
29
|
+
|
|
9
30
|
|
|
10
31
|
def get_fireworks_api_key() -> Optional[str]:
|
|
11
32
|
"""
|
|
@@ -73,6 +94,8 @@ def verify_api_key_and_get_account_id(
|
|
|
73
94
|
Args:
|
|
74
95
|
api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
|
|
75
96
|
api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
|
|
97
|
+
If api_base is api.fireworks.ai, it is used directly. Otherwise, defaults to
|
|
98
|
+
dev.api.fireworks.ai for the verification call.
|
|
76
99
|
|
|
77
100
|
Returns:
|
|
78
101
|
The resolved account id if verification succeeds and the header is present; otherwise None.
|
|
@@ -81,7 +104,12 @@ def verify_api_key_and_get_account_id(
|
|
|
81
104
|
resolved_key = api_key or get_fireworks_api_key()
|
|
82
105
|
if not resolved_key:
|
|
83
106
|
return None
|
|
84
|
-
|
|
107
|
+
provided_base = api_base or get_fireworks_api_base()
|
|
108
|
+
# Use api.fireworks.ai if explicitly provided, otherwise fall back to dev
|
|
109
|
+
if "api.fireworks.ai" in provided_base:
|
|
110
|
+
resolved_base = provided_base
|
|
111
|
+
else:
|
|
112
|
+
resolved_base = "https://dev.api.fireworks.ai"
|
|
85
113
|
|
|
86
114
|
from .common_utils import get_user_agent
|
|
87
115
|
|
|
@@ -81,13 +81,12 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
|
|
|
81
81
|
"--env-file",
|
|
82
82
|
help="Path to .env file containing secrets to upload (default: .env in current directory)",
|
|
83
83
|
)
|
|
84
|
-
upload_parser.add_argument(
|
|
85
|
-
"--force",
|
|
86
|
-
action="store_true",
|
|
87
|
-
help="Overwrite existing evaluator with the same ID",
|
|
88
|
-
)
|
|
89
84
|
|
|
90
85
|
# Auto-generate flags from SDK Fireworks().evaluators.create() signature
|
|
86
|
+
# Note: We use Fireworks() directly here instead of create_fireworks_client()
|
|
87
|
+
# because we only need the method signature for introspection, not a fully
|
|
88
|
+
# authenticated client. create_fireworks_client() would trigger an HTTP request
|
|
89
|
+
# to verify the API key, causing delays even for --help invocations.
|
|
91
90
|
create_evaluator_fn = Fireworks().evaluators.create
|
|
92
91
|
|
|
93
92
|
upload_skip_fields = {
|
|
@@ -137,7 +136,6 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
|
|
|
137
136
|
|
|
138
137
|
rft_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode")
|
|
139
138
|
rft_parser.add_argument("--dry-run", action="store_true", help="Print planned SDK call without sending")
|
|
140
|
-
rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
|
|
141
139
|
rft_parser.add_argument("--skip-validation", action="store_true", help="Skip local dataset/evaluator validation")
|
|
142
140
|
rft_parser.add_argument(
|
|
143
141
|
"--ignore-docker",
|
|
@@ -198,6 +196,10 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
|
|
|
198
196
|
"loss_config.method": "RL loss method for underlying trainers. One of {grpo,dapo}.",
|
|
199
197
|
}
|
|
200
198
|
|
|
199
|
+
# Note: We use Fireworks() directly here instead of create_fireworks_client()
|
|
200
|
+
# because we only need the method signature for introspection, not a fully
|
|
201
|
+
# authenticated client. create_fireworks_client() would trigger an HTTP request
|
|
202
|
+
# to verify the API key, causing delays even for --help invocations.
|
|
201
203
|
create_rft_job_fn = Fireworks().reinforcement_fine_tuning_jobs.create
|
|
202
204
|
|
|
203
205
|
add_args_from_callable_signature(
|
{eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli_commands/create_rft.py
RENAMED
|
@@ -7,19 +7,18 @@ import sys
|
|
|
7
7
|
import time
|
|
8
8
|
from typing import Any, Callable, Dict, Optional
|
|
9
9
|
import inspect
|
|
10
|
-
import requests
|
|
11
10
|
import tempfile
|
|
12
11
|
from pydantic import ValidationError
|
|
13
12
|
|
|
14
13
|
from ..auth import get_fireworks_api_base, get_fireworks_api_key
|
|
15
|
-
from ..
|
|
14
|
+
from ..fireworks_client import create_fireworks_client
|
|
15
|
+
from ..common_utils import load_jsonl
|
|
16
16
|
from ..fireworks_rft import (
|
|
17
17
|
create_dataset_from_jsonl,
|
|
18
18
|
detect_dataset_builder,
|
|
19
19
|
materialize_dataset_via_builder,
|
|
20
20
|
)
|
|
21
21
|
from ..models import EvaluationRow
|
|
22
|
-
from .upload import upload_command
|
|
23
22
|
from .utils import (
|
|
24
23
|
_build_entry_point,
|
|
25
24
|
_build_trimmed_dataset_id,
|
|
@@ -35,8 +34,6 @@ from .utils import (
|
|
|
35
34
|
)
|
|
36
35
|
from .local_test import run_evaluator_test
|
|
37
36
|
|
|
38
|
-
from fireworks import Fireworks
|
|
39
|
-
|
|
40
37
|
|
|
41
38
|
def _extract_dataset_adapter(
|
|
42
39
|
test_file_path: str, test_func_name: str
|
|
@@ -223,64 +220,68 @@ def _extract_jsonl_from_input_dataset(test_file_path: str, test_func_name: str)
|
|
|
223
220
|
return None
|
|
224
221
|
|
|
225
222
|
|
|
226
|
-
def
|
|
227
|
-
|
|
223
|
+
def _poll_evaluator_version_status(
|
|
224
|
+
evaluator_id: str,
|
|
225
|
+
version_id: str,
|
|
226
|
+
api_key: str,
|
|
227
|
+
api_base: str,
|
|
228
|
+
timeout_minutes: int = 10,
|
|
228
229
|
) -> bool:
|
|
229
230
|
"""
|
|
230
|
-
Poll evaluator status until it becomes ACTIVE or times out.
|
|
231
|
+
Poll a specific evaluator version status until it becomes ACTIVE or times out.
|
|
232
|
+
|
|
233
|
+
Uses the Fireworks SDK to get the specified version of the evaluator and checks
|
|
234
|
+
its build state.
|
|
231
235
|
|
|
232
236
|
Args:
|
|
233
|
-
|
|
237
|
+
evaluator_id: The evaluator ID (not full resource name)
|
|
238
|
+
version_id: The specific version ID to poll
|
|
234
239
|
api_key: Fireworks API key
|
|
235
240
|
api_base: Fireworks API base URL
|
|
236
241
|
timeout_minutes: Maximum time to wait in minutes
|
|
237
242
|
|
|
238
243
|
Returns:
|
|
239
|
-
True if evaluator becomes ACTIVE, False if timeout or BUILD_FAILED
|
|
244
|
+
True if evaluator version becomes ACTIVE, False if timeout or BUILD_FAILED
|
|
240
245
|
"""
|
|
241
|
-
headers = {
|
|
242
|
-
"Authorization": f"Bearer {api_key}",
|
|
243
|
-
"Content-Type": "application/json",
|
|
244
|
-
"User-Agent": get_user_agent(),
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
check_url = f"{api_base}/v1/{evaluator_resource_name}"
|
|
248
246
|
timeout_seconds = timeout_minutes * 60
|
|
249
247
|
poll_interval = 10 # seconds
|
|
250
248
|
start_time = time.time()
|
|
251
249
|
|
|
252
|
-
print(
|
|
250
|
+
print(
|
|
251
|
+
f"Polling evaluator version '{version_id}' status (timeout: {timeout_minutes}m, interval: {poll_interval}s)..."
|
|
252
|
+
)
|
|
253
|
+
|
|
254
|
+
client = create_fireworks_client(api_key=api_key, base_url=api_base)
|
|
253
255
|
|
|
254
256
|
while time.time() - start_time < timeout_seconds:
|
|
255
257
|
try:
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
status = evaluator_data.get("status", "")
|
|
258
|
+
version = client.evaluator_versions.get(version_id, evaluator_id=evaluator_id)
|
|
259
|
+
state = version.state or "STATE_UNSPECIFIED"
|
|
260
|
+
status_msg = ""
|
|
261
|
+
if version.status and version.status.message:
|
|
262
|
+
status_msg = version.status.message
|
|
262
263
|
|
|
263
264
|
if state == "ACTIVE":
|
|
264
|
-
print("✅ Evaluator is ACTIVE and ready!")
|
|
265
|
+
print("✅ Evaluator version is ACTIVE and ready!")
|
|
265
266
|
return True
|
|
266
267
|
elif state == "BUILD_FAILED":
|
|
267
|
-
print(f"❌ Evaluator build failed. Status: {
|
|
268
|
+
print(f"❌ Evaluator version build failed. Status: {status_msg}")
|
|
268
269
|
return False
|
|
269
270
|
elif state == "BUILDING":
|
|
270
271
|
elapsed_minutes = (time.time() - start_time) / 60
|
|
271
|
-
print(f"⏳ Evaluator is still building... ({elapsed_minutes:.1f}m elapsed)")
|
|
272
|
+
print(f"⏳ Evaluator version is still building... ({elapsed_minutes:.1f}m elapsed)")
|
|
272
273
|
else:
|
|
273
|
-
print(f"⏳ Evaluator state: {state}, status: {
|
|
274
|
+
print(f"⏳ Evaluator version state: {state}, status: {status_msg}")
|
|
274
275
|
|
|
275
|
-
except
|
|
276
|
-
print(f"Warning: Failed to check evaluator status: {e}")
|
|
276
|
+
except Exception as e:
|
|
277
|
+
print(f"Warning: Failed to check evaluator version status: {e}")
|
|
277
278
|
|
|
278
279
|
# Wait before next poll
|
|
279
280
|
time.sleep(poll_interval)
|
|
280
281
|
|
|
281
282
|
# Timeout reached
|
|
282
283
|
elapsed_minutes = (time.time() - start_time) / 60
|
|
283
|
-
print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator is not yet ACTIVE")
|
|
284
|
+
print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator version is not yet ACTIVE")
|
|
284
285
|
return False
|
|
285
286
|
|
|
286
287
|
|
|
@@ -565,42 +566,16 @@ def _upload_dataset(
|
|
|
565
566
|
def _upload_and_ensure_evaluator(
|
|
566
567
|
project_root: str,
|
|
567
568
|
evaluator_id: str,
|
|
568
|
-
evaluator_resource_name: str,
|
|
569
569
|
api_key: str,
|
|
570
570
|
api_base: str,
|
|
571
|
-
force: bool,
|
|
572
571
|
) -> bool:
|
|
573
|
-
"""
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
"Content-Type": "application/json",
|
|
580
|
-
"User-Agent": get_user_agent(),
|
|
581
|
-
}
|
|
582
|
-
resp = requests.get(f"{api_base}/v1/{evaluator_resource_name}", headers=headers, timeout=10)
|
|
583
|
-
if resp.ok:
|
|
584
|
-
state = resp.json().get("state", "STATE_UNSPECIFIED")
|
|
585
|
-
print(f"✓ Evaluator exists (state: {state}). Skipping upload (use --force to overwrite).")
|
|
586
|
-
# Poll for ACTIVE before proceeding
|
|
587
|
-
print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
|
|
588
|
-
if not _poll_evaluator_status(
|
|
589
|
-
evaluator_resource_name=evaluator_resource_name,
|
|
590
|
-
api_key=api_key,
|
|
591
|
-
api_base=api_base,
|
|
592
|
-
timeout_minutes=10,
|
|
593
|
-
):
|
|
594
|
-
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
|
|
595
|
-
print("\n❌ Evaluator is not ready within the timeout period.")
|
|
596
|
-
print(f"📊 Please check the evaluator status at: {dashboard_url}")
|
|
597
|
-
print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
|
|
598
|
-
return False
|
|
599
|
-
return True
|
|
600
|
-
except requests.exceptions.RequestException:
|
|
601
|
-
pass
|
|
572
|
+
"""Upload evaluator and ensure its version becomes ACTIVE.
|
|
573
|
+
|
|
574
|
+
Creates/updates the evaluator and uploads the code, then polls the specific
|
|
575
|
+
version until it becomes ACTIVE.
|
|
576
|
+
"""
|
|
577
|
+
from eval_protocol.evaluation import create_evaluation
|
|
602
578
|
|
|
603
|
-
# Ensure evaluator exists by invoking the upload flow programmatically
|
|
604
579
|
try:
|
|
605
580
|
tests = _discover_tests(project_root)
|
|
606
581
|
selected_entry: Optional[str] = None
|
|
@@ -617,43 +592,37 @@ def _upload_and_ensure_evaluator(
|
|
|
617
592
|
)
|
|
618
593
|
return False
|
|
619
594
|
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
force=force, # Pass through the --force flag
|
|
627
|
-
yes=True,
|
|
628
|
-
env_file=None, # Add the new env_file parameter
|
|
595
|
+
print(f"\nUploading evaluator '{evaluator_id}'...")
|
|
596
|
+
result, version_id = create_evaluation(
|
|
597
|
+
evaluator_id=evaluator_id,
|
|
598
|
+
display_name=evaluator_id,
|
|
599
|
+
description=f"Evaluator for {evaluator_id}",
|
|
600
|
+
entry_point=selected_entry,
|
|
629
601
|
)
|
|
630
602
|
|
|
631
|
-
if
|
|
632
|
-
print(
|
|
603
|
+
if not version_id:
|
|
604
|
+
print("Warning: Evaluator created but version upload failed.")
|
|
605
|
+
return False
|
|
633
606
|
|
|
634
|
-
|
|
635
|
-
if rc == 0:
|
|
636
|
-
print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
|
|
607
|
+
print(f"✓ Uploaded evaluator: {evaluator_id} (version: {version_id})")
|
|
637
608
|
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
609
|
+
# Poll for the specific evaluator version status
|
|
610
|
+
print(f"Waiting for evaluator '{evaluator_id}' version '{version_id}' to become ACTIVE...")
|
|
611
|
+
is_active = _poll_evaluator_version_status(
|
|
612
|
+
evaluator_id=evaluator_id,
|
|
613
|
+
version_id=version_id,
|
|
614
|
+
api_key=api_key,
|
|
615
|
+
api_base=api_base,
|
|
616
|
+
timeout_minutes=10,
|
|
617
|
+
)
|
|
646
618
|
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
return False
|
|
653
|
-
return True
|
|
654
|
-
else:
|
|
655
|
-
print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
|
|
619
|
+
if not is_active:
|
|
620
|
+
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
|
|
621
|
+
print("\n❌ Evaluator version is not ready within the timeout period.")
|
|
622
|
+
print(f"📊 Please check the evaluator status at: {dashboard_url}")
|
|
623
|
+
print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
|
|
656
624
|
return False
|
|
625
|
+
return True
|
|
657
626
|
except Exception as e:
|
|
658
627
|
print(f"Warning: Failed to upload evaluator automatically: {e}")
|
|
659
628
|
return False
|
|
@@ -672,7 +641,7 @@ def _create_rft_job(
|
|
|
672
641
|
) -> int:
|
|
673
642
|
"""Build and submit the RFT job request (via Fireworks SDK)."""
|
|
674
643
|
|
|
675
|
-
signature = inspect.signature(
|
|
644
|
+
signature = inspect.signature(create_fireworks_client().reinforcement_fine_tuning_jobs.create)
|
|
676
645
|
|
|
677
646
|
# Build top-level SDK kwargs
|
|
678
647
|
sdk_kwargs: Dict[str, Any] = {
|
|
@@ -711,7 +680,7 @@ def _create_rft_job(
|
|
|
711
680
|
return 0
|
|
712
681
|
|
|
713
682
|
try:
|
|
714
|
-
fw: Fireworks =
|
|
683
|
+
fw: Fireworks = create_fireworks_client(api_key=api_key, base_url=api_base)
|
|
715
684
|
job: ReinforcementFineTuningJob = fw.reinforcement_fine_tuning_jobs.create(account_id=account_id, **sdk_kwargs)
|
|
716
685
|
job_name = job.name
|
|
717
686
|
print(f"\n✅ Created Reinforcement Fine-tuning Job: {job_name}")
|
|
@@ -739,7 +708,6 @@ def create_rft_command(args) -> int:
|
|
|
739
708
|
evaluator_arg: Optional[str] = getattr(args, "evaluator", None)
|
|
740
709
|
non_interactive: bool = bool(getattr(args, "yes", False))
|
|
741
710
|
dry_run: bool = bool(getattr(args, "dry_run", False))
|
|
742
|
-
force: bool = bool(getattr(args, "force", False))
|
|
743
711
|
skip_validation: bool = bool(getattr(args, "skip_validation", False))
|
|
744
712
|
ignore_docker: bool = bool(getattr(args, "ignore_docker", False))
|
|
745
713
|
docker_build_extra: str = getattr(args, "docker_build_extra", "") or ""
|
|
@@ -810,14 +778,12 @@ def create_rft_command(args) -> int:
|
|
|
810
778
|
if not dataset_id or not dataset_resource:
|
|
811
779
|
return 1
|
|
812
780
|
|
|
813
|
-
# 5) Ensure evaluator exists and is ACTIVE (upload + poll if needed)
|
|
781
|
+
# 5) Ensure evaluator exists and its latest version is ACTIVE (upload + poll if needed)
|
|
814
782
|
if not _upload_and_ensure_evaluator(
|
|
815
783
|
project_root=project_root,
|
|
816
784
|
evaluator_id=evaluator_id,
|
|
817
|
-
evaluator_resource_name=evaluator_resource_name,
|
|
818
785
|
api_key=api_key,
|
|
819
786
|
api_base=api_base,
|
|
820
|
-
force=force,
|
|
821
787
|
):
|
|
822
788
|
return 1
|
|
823
789
|
|
|
@@ -289,7 +289,6 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
289
289
|
base_id = getattr(args, "id", None)
|
|
290
290
|
display_name = getattr(args, "display_name", None)
|
|
291
291
|
description = getattr(args, "description", None)
|
|
292
|
-
force = bool(getattr(args, "force", False))
|
|
293
292
|
env_file = getattr(args, "env_file", None)
|
|
294
293
|
|
|
295
294
|
# Load secrets from .env file and ensure they're available on Fireworks
|
|
@@ -378,17 +377,18 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
378
377
|
|
|
379
378
|
print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
|
|
380
379
|
try:
|
|
381
|
-
result = create_evaluation(
|
|
380
|
+
result, version_id = create_evaluation(
|
|
382
381
|
evaluator_id=evaluator_id,
|
|
383
382
|
display_name=display_name or evaluator_id,
|
|
384
383
|
description=description or f"Evaluator for {qualname}",
|
|
385
|
-
force=force,
|
|
386
384
|
entry_point=entry_point,
|
|
387
385
|
)
|
|
388
386
|
name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id
|
|
389
387
|
|
|
390
388
|
# Print success message with Fireworks dashboard link
|
|
391
389
|
print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
|
|
390
|
+
if version_id:
|
|
391
|
+
print(f" Version: {version_id}")
|
|
392
392
|
print("📊 View in Fireworks Dashboard:")
|
|
393
393
|
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
|
|
394
394
|
print(f" {dashboard_url}\n")
|
|
@@ -4,14 +4,15 @@ import time
|
|
|
4
4
|
from typing import List, Optional
|
|
5
5
|
|
|
6
6
|
import fireworks
|
|
7
|
+
from fireworks.types import EvaluatorVersionParam
|
|
7
8
|
import requests
|
|
8
|
-
from fireworks import Fireworks
|
|
9
9
|
|
|
10
10
|
from eval_protocol.auth import (
|
|
11
11
|
get_fireworks_account_id,
|
|
12
12
|
get_fireworks_api_key,
|
|
13
13
|
verify_api_key_and_get_account_id,
|
|
14
14
|
)
|
|
15
|
+
from eval_protocol.fireworks_client import create_fireworks_client
|
|
15
16
|
from eval_protocol.get_pep440_version import get_pep440_version
|
|
16
17
|
|
|
17
18
|
logger = logging.getLogger(__name__)
|
|
@@ -153,7 +154,7 @@ class Evaluator:
|
|
|
153
154
|
logger.info(f"Created {output_path} ({size_bytes:,} bytes)")
|
|
154
155
|
return size_bytes
|
|
155
156
|
|
|
156
|
-
def create(self, evaluator_id, display_name=None, description=None
|
|
157
|
+
def create(self, evaluator_id, display_name=None, description=None):
|
|
157
158
|
auth_token = self.api_key or get_fireworks_api_key()
|
|
158
159
|
account_id = self.account_id or get_fireworks_account_id()
|
|
159
160
|
if not account_id and auth_token:
|
|
@@ -163,7 +164,11 @@ class Evaluator:
|
|
|
163
164
|
logger.error("Authentication error: API credentials appear to be invalid or incomplete.")
|
|
164
165
|
raise ValueError("Invalid or missing API credentials.")
|
|
165
166
|
|
|
166
|
-
client =
|
|
167
|
+
client = create_fireworks_client(
|
|
168
|
+
api_key=auth_token,
|
|
169
|
+
base_url=self.api_base,
|
|
170
|
+
account_id=account_id,
|
|
171
|
+
)
|
|
167
172
|
|
|
168
173
|
self.display_name = display_name or evaluator_id
|
|
169
174
|
self.description = description or f"Evaluator created from {evaluator_id}"
|
|
@@ -197,28 +202,20 @@ class Evaluator:
|
|
|
197
202
|
logger.info(f"Creating evaluator '{evaluator_id}' for account '{account_id}'...")
|
|
198
203
|
|
|
199
204
|
try:
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
logger.info(f"Evaluator '{evaluator_id}' does not exist, creating...")
|
|
215
|
-
|
|
216
|
-
# Create evaluator using SDK
|
|
217
|
-
result = client.evaluators.create(
|
|
218
|
-
evaluator_id=evaluator_id,
|
|
219
|
-
evaluator=evaluator_params,
|
|
220
|
-
)
|
|
221
|
-
logger.info(f"Successfully created evaluator '{evaluator_id}'")
|
|
205
|
+
# Try to create evaluator using SDK
|
|
206
|
+
try:
|
|
207
|
+
result = client.evaluators.create(
|
|
208
|
+
evaluator_id=evaluator_id,
|
|
209
|
+
evaluator=evaluator_params,
|
|
210
|
+
)
|
|
211
|
+
logger.info(f"Successfully created evaluator '{evaluator_id}'")
|
|
212
|
+
except fireworks.APIStatusError as create_error:
|
|
213
|
+
if create_error.status_code == 409:
|
|
214
|
+
# Evaluator already exists, get the existing one and proceed to create a new version
|
|
215
|
+
logger.info(f"Evaluator '{evaluator_id}' already exists, creating new version...")
|
|
216
|
+
result = client.evaluators.get(evaluator_id=evaluator_id)
|
|
217
|
+
else:
|
|
218
|
+
raise
|
|
222
219
|
|
|
223
220
|
# Upload code as tar.gz to GCS
|
|
224
221
|
evaluator_name = result.name # e.g., "accounts/pyroworks/evaluators/test-123"
|
|
@@ -229,6 +226,25 @@ class Evaluator:
|
|
|
229
226
|
f"Cannot proceed with code upload. Response: {result}"
|
|
230
227
|
)
|
|
231
228
|
|
|
229
|
+
evaluator_version_param: EvaluatorVersionParam = {}
|
|
230
|
+
if "commit_hash" in evaluator_params:
|
|
231
|
+
evaluator_version_param["commit_hash"] = evaluator_params["commit_hash"]
|
|
232
|
+
if "entry_point" in evaluator_params:
|
|
233
|
+
evaluator_version_param["entry_point"] = evaluator_params["entry_point"]
|
|
234
|
+
if "requirements" in evaluator_params:
|
|
235
|
+
evaluator_version_param["requirements"] = evaluator_params["requirements"]
|
|
236
|
+
|
|
237
|
+
evaluator_version = client.evaluator_versions.create(
|
|
238
|
+
evaluator_id=evaluator_id,
|
|
239
|
+
evaluator_version=evaluator_version_param,
|
|
240
|
+
)
|
|
241
|
+
evaluator_version_id = evaluator_version.name.split("/")[-1] if evaluator_version.name else None
|
|
242
|
+
if not evaluator_version_id:
|
|
243
|
+
raise ValueError(
|
|
244
|
+
"Create evaluator version response missing 'name' field. "
|
|
245
|
+
f"Cannot proceed with code upload. Response: {evaluator_version}"
|
|
246
|
+
)
|
|
247
|
+
|
|
232
248
|
try:
|
|
233
249
|
# Create tar.gz of current directory
|
|
234
250
|
cwd = os.getcwd()
|
|
@@ -240,7 +256,8 @@ class Evaluator:
|
|
|
240
256
|
|
|
241
257
|
# Call GetEvaluatorUploadEndpoint using SDK
|
|
242
258
|
logger.info(f"Requesting upload endpoint for {tar_filename}")
|
|
243
|
-
upload_response = client.
|
|
259
|
+
upload_response = client.evaluator_versions.get_upload_endpoint(
|
|
260
|
+
version_id=evaluator_version_id,
|
|
244
261
|
evaluator_id=evaluator_id,
|
|
245
262
|
filename_to_size={tar_filename: str(tar_size)},
|
|
246
263
|
)
|
|
@@ -321,9 +338,9 @@ class Evaluator:
|
|
|
321
338
|
raise
|
|
322
339
|
|
|
323
340
|
# Step 3: Validate upload using SDK
|
|
324
|
-
client.
|
|
341
|
+
client.evaluator_versions.validate_upload(
|
|
342
|
+
version_id=evaluator_version_id,
|
|
325
343
|
evaluator_id=evaluator_id,
|
|
326
|
-
body={},
|
|
327
344
|
)
|
|
328
345
|
logger.info("Upload validated successfully")
|
|
329
346
|
|
|
@@ -334,8 +351,10 @@ class Evaluator:
|
|
|
334
351
|
except Exception as upload_error:
|
|
335
352
|
logger.warning(f"Code upload failed (evaluator created but code not uploaded): {upload_error}")
|
|
336
353
|
# Don't fail - evaluator is created, just code upload failed
|
|
354
|
+
# Return None for version_id since upload failed
|
|
355
|
+
return result, None
|
|
337
356
|
|
|
338
|
-
return result # Return
|
|
357
|
+
return result, evaluator_version_id # Return evaluator result and version ID
|
|
339
358
|
except fireworks.APIStatusError as e:
|
|
340
359
|
logger.error(f"Error creating evaluator: {str(e)}")
|
|
341
360
|
logger.error(f"Status code: {e.status_code}, Response: {e.response.text}")
|
|
@@ -361,7 +380,6 @@ def create_evaluation(
|
|
|
361
380
|
evaluator_id: str,
|
|
362
381
|
display_name: Optional[str] = None,
|
|
363
382
|
description: Optional[str] = None,
|
|
364
|
-
force: bool = False,
|
|
365
383
|
account_id: Optional[str] = None,
|
|
366
384
|
api_key: Optional[str] = None,
|
|
367
385
|
entry_point: Optional[str] = None,
|
|
@@ -373,10 +391,13 @@ def create_evaluation(
|
|
|
373
391
|
evaluator_id: Unique identifier for the evaluator
|
|
374
392
|
display_name: Display name for the evaluator
|
|
375
393
|
description: Description for the evaluator
|
|
376
|
-
force: If True, delete and recreate if evaluator exists
|
|
377
394
|
account_id: Optional Fireworks account ID
|
|
378
395
|
api_key: Optional Fireworks API key
|
|
379
396
|
entry_point: Optional entry point (module::function or path::function)
|
|
397
|
+
|
|
398
|
+
Returns:
|
|
399
|
+
A tuple of (evaluator_result, version_id) where version_id is the ID of the
|
|
400
|
+
created evaluator version, or None if upload failed.
|
|
380
401
|
"""
|
|
381
402
|
evaluator = Evaluator(
|
|
382
403
|
account_id=account_id,
|
|
@@ -384,4 +405,4 @@ def create_evaluation(
|
|
|
384
405
|
entry_point=entry_point,
|
|
385
406
|
)
|
|
386
407
|
|
|
387
|
-
return evaluator.create(evaluator_id, display_name, description
|
|
408
|
+
return evaluator.create(evaluator_id, display_name, description)
|