eval-protocol 0.2.83__tar.gz → 0.2.84__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.83/eval_protocol.egg-info → eval_protocol-0.2.84}/PKG-INFO +1 -1
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/create_rft.py +96 -29
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/upload.py +22 -67
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/fireworks_rft.py +0 -21
- {eval_protocol-0.2.83 → eval_protocol-0.2.84/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_cli_create_rft_infer.py +322 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/LICENSE +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/README.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/development/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol.egg-info/SOURCES.txt +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/pyproject.toml +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/setup.cfg +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/setup.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_config.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_format.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_length.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_math.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_models.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_server.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/versioneer.py +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.83
|
|
3
|
+
Version: 0.2.84
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-11-
|
|
11
|
+
"date": "2025-11-10T00:30:58-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.83"
|
|
14
|
+
"full-revisionid": "2d75acf5944468856d9f1bea787fce63dcabc16f",
|
|
15
|
+
"version": "0.2.84"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -20,6 +20,7 @@ from ..fireworks_rft import (
|
|
|
20
20
|
create_dataset_from_jsonl,
|
|
21
21
|
create_reinforcement_fine_tuning_job,
|
|
22
22
|
)
|
|
23
|
+
from ..fireworks_rft import detect_dataset_builder, materialize_dataset_via_builder
|
|
23
24
|
from .upload import _discover_tests, _normalize_evaluator_id, _prompt_select
|
|
24
25
|
|
|
25
26
|
|
|
@@ -250,6 +251,37 @@ def _build_trimmed_dataset_id(evaluator_id: str) -> str:
|
|
|
250
251
|
return f"{base}{suffix}"
|
|
251
252
|
|
|
252
253
|
|
|
254
|
+
def _resolve_selected_test(
|
|
255
|
+
project_root: str,
|
|
256
|
+
evaluator_id: Optional[str],
|
|
257
|
+
selected_tests: Optional[list] = None,
|
|
258
|
+
) -> tuple[Optional[str], Optional[str]]:
|
|
259
|
+
"""
|
|
260
|
+
Resolve a single test's source file path and function name to use downstream.
|
|
261
|
+
Priority:
|
|
262
|
+
1) If selected_tests provided and length == 1, use it.
|
|
263
|
+
2) Else discover tests; if exactly one test, use it.
|
|
264
|
+
3) Else, if evaluator_id provided, match by normalized '<file-stem>-<func-name>'.
|
|
265
|
+
Returns: (file_path, func_name) or (None, None) if unresolved.
|
|
266
|
+
"""
|
|
267
|
+
try:
|
|
268
|
+
tests = selected_tests if selected_tests is not None else _discover_tests(project_root)
|
|
269
|
+
if not tests:
|
|
270
|
+
return None, None
|
|
271
|
+
if len(tests) == 1:
|
|
272
|
+
return tests[0].file_path, tests[0].qualname.split(".")[-1]
|
|
273
|
+
if evaluator_id:
|
|
274
|
+
for t in tests:
|
|
275
|
+
func_name = t.qualname.split(".")[-1]
|
|
276
|
+
source_file_name = os.path.splitext(os.path.basename(t.file_path))[0]
|
|
277
|
+
candidate = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
|
|
278
|
+
if candidate == evaluator_id:
|
|
279
|
+
return t.file_path, func_name
|
|
280
|
+
return None, None
|
|
281
|
+
except Exception:
|
|
282
|
+
return None, None
|
|
283
|
+
|
|
284
|
+
|
|
253
285
|
def _poll_evaluator_status(
|
|
254
286
|
evaluator_resource_name: str, api_key: str, api_base: str, timeout_minutes: int = 10
|
|
255
287
|
) -> bool:
|
|
@@ -316,6 +348,9 @@ def create_rft_command(args) -> int:
|
|
|
316
348
|
non_interactive: bool = bool(getattr(args, "yes", False))
|
|
317
349
|
dry_run: bool = bool(getattr(args, "dry_run", False))
|
|
318
350
|
force: bool = bool(getattr(args, "force", False))
|
|
351
|
+
# Track the specifically chosen test (if any) to aid dataset inference later
|
|
352
|
+
selected_test_file_path: Optional[str] = None
|
|
353
|
+
selected_test_func_name: Optional[str] = None
|
|
319
354
|
|
|
320
355
|
api_key = get_fireworks_api_key()
|
|
321
356
|
if not api_key:
|
|
@@ -348,12 +383,33 @@ def create_rft_command(args) -> int:
|
|
|
348
383
|
print("No tests selected.")
|
|
349
384
|
return 1
|
|
350
385
|
if len(selected_tests) != 1:
|
|
351
|
-
|
|
386
|
+
if non_interactive and len(selected_tests) > 1:
|
|
387
|
+
print("Error: Multiple evaluation tests found in --yes (non-interactive) mode.")
|
|
388
|
+
print(" Please pass --evaluator-id or --entry to disambiguate.")
|
|
389
|
+
try:
|
|
390
|
+
# Offer candidate evaluator ids for convenience
|
|
391
|
+
tests = _discover_tests(project_root)
|
|
392
|
+
if tests:
|
|
393
|
+
print(" Candidate evaluator ids:")
|
|
394
|
+
for t in tests:
|
|
395
|
+
func = t.qualname.split(".")[-1]
|
|
396
|
+
stem = os.path.splitext(os.path.basename(t.file_path))[0]
|
|
397
|
+
cand = _normalize_evaluator_id(f"{stem}-{func}")
|
|
398
|
+
print(f" - {cand}")
|
|
399
|
+
except Exception:
|
|
400
|
+
pass
|
|
401
|
+
else:
|
|
402
|
+
print("Error: Please select exactly one evaluation test for 'create rft'.")
|
|
352
403
|
return 1
|
|
404
|
+
# Derive evaluator_id from user's single selection
|
|
353
405
|
chosen = selected_tests[0]
|
|
354
406
|
func_name = chosen.qualname.split(".")[-1]
|
|
355
407
|
source_file_name = os.path.splitext(os.path.basename(chosen.file_path))[0]
|
|
356
408
|
evaluator_id = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
|
|
409
|
+
# Resolve selected test once for downstream
|
|
410
|
+
selected_test_file_path, selected_test_func_name = _resolve_selected_test(
|
|
411
|
+
project_root, evaluator_id, selected_tests=selected_tests
|
|
412
|
+
)
|
|
357
413
|
# Resolve evaluator resource name to fully-qualified format required by API
|
|
358
414
|
evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"
|
|
359
415
|
|
|
@@ -386,6 +442,11 @@ def create_rft_command(args) -> int:
|
|
|
386
442
|
print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
|
|
387
443
|
return 1
|
|
388
444
|
skip_upload = True
|
|
445
|
+
# Populate selected test info for dataset inference later
|
|
446
|
+
st_path, st_func = _resolve_selected_test(project_root, evaluator_id)
|
|
447
|
+
if st_path and st_func:
|
|
448
|
+
selected_test_file_path = st_path
|
|
449
|
+
selected_test_func_name = st_func
|
|
389
450
|
except requests.exceptions.RequestException:
|
|
390
451
|
pass
|
|
391
452
|
|
|
@@ -396,28 +457,16 @@ def create_rft_command(args) -> int:
|
|
|
396
457
|
|
|
397
458
|
tests = _discover_tests(project_root)
|
|
398
459
|
selected_entry: Optional[str] = None
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
abs_path = os.path.abspath(
|
|
460
|
+
st_path, st_func = _resolve_selected_test(project_root, evaluator_id, selected_tests=tests)
|
|
461
|
+
if st_path and st_func:
|
|
462
|
+
abs_path = os.path.abspath(st_path)
|
|
402
463
|
try:
|
|
403
464
|
rel = os.path.relpath(abs_path, project_root)
|
|
404
465
|
except Exception:
|
|
405
466
|
rel = abs_path
|
|
406
|
-
selected_entry = f"{rel}::{
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
for t in tests:
|
|
410
|
-
func_name = t.qualname.split(".")[-1]
|
|
411
|
-
source_file_name = os.path.splitext(os.path.basename(t.file_path))[0]
|
|
412
|
-
candidate = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
|
|
413
|
-
if candidate == evaluator_id:
|
|
414
|
-
abs_path = os.path.abspath(t.file_path)
|
|
415
|
-
try:
|
|
416
|
-
rel = os.path.relpath(abs_path, project_root)
|
|
417
|
-
except Exception:
|
|
418
|
-
rel = abs_path
|
|
419
|
-
selected_entry = f"{rel}::{func_name}"
|
|
420
|
-
break
|
|
467
|
+
selected_entry = f"{rel}::{st_func}"
|
|
468
|
+
selected_test_file_path = st_path
|
|
469
|
+
selected_test_func_name = st_func
|
|
421
470
|
# If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators
|
|
422
471
|
if selected_entry is None and len(tests) > 1:
|
|
423
472
|
print(
|
|
@@ -480,30 +529,48 @@ def create_rft_command(args) -> int:
|
|
|
480
529
|
dataset_builder = getattr(args, "dataset_builder", None) # accepted but unused in simplified flow
|
|
481
530
|
|
|
482
531
|
if not dataset_id:
|
|
483
|
-
# Prefer explicit --dataset-jsonl, else attempt to extract from data loader or input_dataset
|
|
532
|
+
# Prefer explicit --dataset-jsonl, else attempt to extract from the selected test's data loader or input_dataset.
|
|
484
533
|
if not dataset_jsonl:
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
534
|
+
# Use specifically selected test if available; else only infer when exactly one test exists
|
|
535
|
+
test_file_for_infer = None
|
|
536
|
+
func_for_infer = None
|
|
537
|
+
if selected_test_file_path and selected_test_func_name:
|
|
538
|
+
test_file_for_infer = selected_test_file_path
|
|
539
|
+
func_for_infer = selected_test_func_name
|
|
540
|
+
else:
|
|
541
|
+
tests = _discover_tests(project_root)
|
|
542
|
+
if len(tests) == 1:
|
|
543
|
+
test_file_for_infer = tests[0].file_path
|
|
544
|
+
func_for_infer = tests[0].qualname.split(".")[-1]
|
|
545
|
+
if test_file_for_infer and func_for_infer:
|
|
546
|
+
# Try data_loaders first
|
|
547
|
+
dataset_jsonl = _extract_jsonl_from_dataloader(test_file_for_infer, func_for_infer)
|
|
490
548
|
if dataset_jsonl:
|
|
491
|
-
# Display relative path for readability
|
|
492
549
|
try:
|
|
493
550
|
rel = os.path.relpath(dataset_jsonl, project_root)
|
|
494
551
|
except Exception:
|
|
495
552
|
rel = dataset_jsonl
|
|
496
553
|
print(f"✓ Using JSONL from data loader: {rel}")
|
|
497
|
-
|
|
554
|
+
if not dataset_jsonl:
|
|
498
555
|
# Fall back to input_dataset (dataset_path)
|
|
499
|
-
dataset_jsonl = _extract_jsonl_from_input_dataset(
|
|
556
|
+
dataset_jsonl = _extract_jsonl_from_input_dataset(test_file_for_infer, func_for_infer)
|
|
500
557
|
if dataset_jsonl:
|
|
501
|
-
# Display relative path for readability
|
|
502
558
|
try:
|
|
503
559
|
rel = os.path.relpath(dataset_jsonl, project_root)
|
|
504
560
|
except Exception:
|
|
505
561
|
rel = dataset_jsonl
|
|
506
562
|
print(f"✓ Using JSONL from input_dataset: {rel}")
|
|
563
|
+
if not dataset_jsonl:
|
|
564
|
+
# Last resort: attempt to detect and run a dataset builder in the test's directory
|
|
565
|
+
metric_dir = os.path.dirname(test_file_for_infer)
|
|
566
|
+
builder_spec = detect_dataset_builder(metric_dir)
|
|
567
|
+
if builder_spec:
|
|
568
|
+
try:
|
|
569
|
+
tmp_jsonl, count = materialize_dataset_via_builder(builder_spec)
|
|
570
|
+
dataset_jsonl = tmp_jsonl
|
|
571
|
+
print(f"✓ Materialized {count} rows via dataset builder: {builder_spec}")
|
|
572
|
+
except Exception as e:
|
|
573
|
+
print(f"Warning: dataset builder failed: {e}")
|
|
507
574
|
if not dataset_jsonl:
|
|
508
575
|
print(
|
|
509
576
|
"Error: Could not determine dataset. Provide --dataset-id or --dataset-jsonl, or ensure a JSONL-based data loader or input_dataset is used in your single discovered test."
|
|
@@ -21,7 +21,6 @@ from eval_protocol.auth import (
|
|
|
21
21
|
from eval_protocol.platform_api import create_or_update_fireworks_secret
|
|
22
22
|
|
|
23
23
|
from eval_protocol.evaluation import create_evaluation
|
|
24
|
-
from eval_protocol.fireworks_rft import save_evaluator_trace, detect_dataset_builder
|
|
25
24
|
|
|
26
25
|
|
|
27
26
|
@dataclass
|
|
@@ -444,49 +443,25 @@ def _prompt_select_interactive(tests: list[DiscoveredTest]) -> list[DiscoveredTe
|
|
|
444
443
|
else:
|
|
445
444
|
return []
|
|
446
445
|
|
|
447
|
-
#
|
|
448
|
-
remaining_indices = list(range(len(tests)))
|
|
449
|
-
selected_indices: list[int] = []
|
|
450
|
-
|
|
446
|
+
# Single-select UX
|
|
451
447
|
print("\n")
|
|
452
|
-
print("Tip: Use ↑/↓ arrows to navigate and press ENTER to select
|
|
453
|
-
print(" After selecting one, you can choose to add more.\n")
|
|
454
|
-
|
|
455
|
-
while remaining_indices:
|
|
456
|
-
# Build choices from remaining
|
|
457
|
-
choices = []
|
|
458
|
-
for idx, test_idx in enumerate(remaining_indices, 1):
|
|
459
|
-
t = tests[test_idx]
|
|
460
|
-
choice_text = _format_test_choice(t, idx)
|
|
461
|
-
choices.append({"name": choice_text, "value": test_idx})
|
|
462
|
-
|
|
463
|
-
selected = questionary.select(
|
|
464
|
-
"Select an evaluation test to upload:", choices=choices, style=custom_style
|
|
465
|
-
).ask()
|
|
466
|
-
|
|
467
|
-
if selected is None: # Ctrl+C
|
|
468
|
-
print("\nUpload cancelled.")
|
|
469
|
-
return []
|
|
448
|
+
print("Tip: Use ↑/↓ arrows to navigate and press ENTER to select.\n")
|
|
470
449
|
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
remaining_indices.remove(selected)
|
|
450
|
+
choices = []
|
|
451
|
+
for idx, t in enumerate(tests, 1):
|
|
452
|
+
choice_text = _format_test_choice(t, idx)
|
|
453
|
+
choices.append({"name": choice_text, "value": idx - 1})
|
|
476
454
|
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
break
|
|
481
|
-
else:
|
|
482
|
-
break
|
|
455
|
+
selected = questionary.select(
|
|
456
|
+
"Select an evaluation test to upload:", choices=choices, style=custom_style
|
|
457
|
+
).ask()
|
|
483
458
|
|
|
484
|
-
if
|
|
485
|
-
print("\
|
|
459
|
+
if selected is None: # Ctrl+C
|
|
460
|
+
print("\nUpload cancelled.")
|
|
486
461
|
return []
|
|
487
462
|
|
|
488
|
-
print(
|
|
489
|
-
return [tests[
|
|
463
|
+
print("\n✓ Selected 1 test")
|
|
464
|
+
return [tests[selected]]
|
|
490
465
|
|
|
491
466
|
except ImportError:
|
|
492
467
|
# Fallback to simpler implementation
|
|
@@ -525,22 +500,19 @@ def _prompt_select_fallback(tests: list[DiscoveredTest]) -> list[DiscoveredTest]
|
|
|
525
500
|
|
|
526
501
|
print("=" * 80)
|
|
527
502
|
try:
|
|
528
|
-
choice = input("Enter
|
|
503
|
+
choice = input("Enter the number to upload: ").strip()
|
|
529
504
|
except KeyboardInterrupt:
|
|
530
505
|
print("\n\nUpload cancelled.")
|
|
531
506
|
return []
|
|
532
507
|
|
|
533
|
-
if choice.
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
indices.append(n - 1)
|
|
542
|
-
indices = sorted(set(indices))
|
|
543
|
-
return [tests[i] for i in indices]
|
|
508
|
+
if not choice.isdigit():
|
|
509
|
+
print("\n⚠️ Invalid selection.")
|
|
510
|
+
return []
|
|
511
|
+
n = int(choice)
|
|
512
|
+
if not (1 <= n <= len(tests)):
|
|
513
|
+
print("\n⚠️ Selection out of range.")
|
|
514
|
+
return []
|
|
515
|
+
return [tests[n - 1]]
|
|
544
516
|
|
|
545
517
|
|
|
546
518
|
def _prompt_select(tests: list[DiscoveredTest], non_interactive: bool) -> list[DiscoveredTest]:
|
|
@@ -718,23 +690,6 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
718
690
|
)
|
|
719
691
|
name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id
|
|
720
692
|
|
|
721
|
-
# Persist local evaluator trace for later `create rft`
|
|
722
|
-
try:
|
|
723
|
-
metric_dir = os.path.dirname(source_file_path) if source_file_path else root
|
|
724
|
-
builder_spec = detect_dataset_builder(metric_dir) or None
|
|
725
|
-
trace_payload = {
|
|
726
|
-
"evaluator_id": evaluator_id,
|
|
727
|
-
"evaluator_resource_name": name,
|
|
728
|
-
"entry_point": entry_point,
|
|
729
|
-
"metric_dir": metric_dir,
|
|
730
|
-
"project_root": root,
|
|
731
|
-
"dataset_builder": builder_spec,
|
|
732
|
-
}
|
|
733
|
-
save_evaluator_trace(project_root=root, evaluator_id=evaluator_id, trace=trace_payload)
|
|
734
|
-
except Exception:
|
|
735
|
-
# Non-fatal; continue
|
|
736
|
-
pass
|
|
737
|
-
|
|
738
693
|
# Print success message with Fireworks dashboard link
|
|
739
694
|
print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
|
|
740
695
|
print("📊 View in Fireworks Dashboard:")
|
|
@@ -37,25 +37,6 @@ def _map_api_host_to_app_host(api_base: str) -> str:
|
|
|
37
37
|
return "https://app.fireworks.ai"
|
|
38
38
|
|
|
39
39
|
|
|
40
|
-
def load_evaluator_trace(project_root: str, evaluator_id: str) -> Optional[Dict[str, Any]]:
|
|
41
|
-
trace_path = Path(project_root) / ".eval_protocol" / "evaluators" / f"{evaluator_id}.json"
|
|
42
|
-
if not trace_path.exists():
|
|
43
|
-
return None
|
|
44
|
-
try:
|
|
45
|
-
with open(trace_path, "r", encoding="utf-8") as f:
|
|
46
|
-
return json.load(f)
|
|
47
|
-
except Exception:
|
|
48
|
-
return None
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def save_evaluator_trace(project_root: str, evaluator_id: str, trace: Dict[str, Any]) -> None:
|
|
52
|
-
base_dir = Path(project_root) / ".eval_protocol" / "evaluators"
|
|
53
|
-
base_dir.mkdir(parents=True, exist_ok=True)
|
|
54
|
-
trace_path = base_dir / f"{evaluator_id}.json"
|
|
55
|
-
with open(trace_path, "w", encoding="utf-8") as f:
|
|
56
|
-
json.dump(trace, f, indent=2, ensure_ascii=False)
|
|
57
|
-
|
|
58
|
-
|
|
59
40
|
def detect_dataset_builder(metric_dir: str) -> Optional[str]:
|
|
60
41
|
"""
|
|
61
42
|
Best-effort scan for a dataset builder callable inside the metric directory.
|
|
@@ -228,8 +209,6 @@ def build_default_output_model(evaluator_id: str) -> str:
|
|
|
228
209
|
|
|
229
210
|
|
|
230
211
|
__all__ = [
|
|
231
|
-
"load_evaluator_trace",
|
|
232
|
-
"save_evaluator_trace",
|
|
233
212
|
"detect_dataset_builder",
|
|
234
213
|
"materialize_dataset_via_builder",
|
|
235
214
|
"create_dataset_from_jsonl",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.83
|
|
3
|
+
Version: 0.2.84
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|