eval-protocol 0.2.72.tar.gz → 0.2.73.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.72/eval_protocol.egg-info → eval_protocol-0.2.73}/PKG-INFO +1 -1
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli.py +2 -0
- eval_protocol-0.2.73/eval_protocol/cli_commands/create_rft.py +492 -0
- eval_protocol-0.2.73/eval_protocol/data_loader/__init__.py +5 -0
- eval_protocol-0.2.73/eval_protocol/data_loader/jsonl_data_loader.py +42 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/evaluation.py +41 -1
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/fireworks_rft.py +12 -4
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/evaluation_test.py +41 -12
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/evaluation_test_postprocess.py +2 -1
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/exception_config.py +1 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/remote_rollout_processor.py +1 -1
- eval_protocol-0.2.73/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +223 -0
- eval_protocol-0.2.73/eval_protocol/quickstart/svg_agent/evaluator/utils.py +523 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol.egg-info/SOURCES.txt +3 -0
- eval_protocol-0.2.72/eval_protocol/cli_commands/create_rft.py +0 -254
- eval_protocol-0.2.72/eval_protocol/data_loader/__init__.py +0 -4
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/LICENSE +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/README.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/development/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/pyproject.toml +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/setup.cfg +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/setup.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_config.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_format.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_length.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_math.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_models.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_server.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/versioneer.py +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.72
|
|
3
|
+
Version: 0.2.73
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-
|
|
11
|
+
"date": "2025-11-01T13:56:18-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "d8477be9df5508ec4c7ef53cb7a5e8cb758cec3d",
|
|
15
|
+
"version": "0.2.73"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -402,6 +402,8 @@ def parse_args(args=None):
|
|
|
402
402
|
rft_parser.add_argument("--evaluation-dataset", help="Optional separate eval dataset id")
|
|
403
403
|
rft_parser.add_argument("--eval-auto-carveout", dest="eval_auto_carveout", action="store_true", default=True)
|
|
404
404
|
rft_parser.add_argument("--no-eval-auto-carveout", dest="eval_auto_carveout", action="store_false")
|
|
405
|
+
# Rollout chunking
|
|
406
|
+
rft_parser.add_argument("--chunk-size", type=int, help="Data chunk size for rollout batching")
|
|
405
407
|
# Inference params
|
|
406
408
|
rft_parser.add_argument("--temperature", type=float)
|
|
407
409
|
rft_parser.add_argument("--top-p", type=float)
|
|
@@ -0,0 +1,492 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
import time
|
|
5
|
+
import argparse
|
|
6
|
+
from typing import Any, Dict, Optional
|
|
7
|
+
|
|
8
|
+
from ..auth import (
|
|
9
|
+
get_fireworks_account_id,
|
|
10
|
+
get_fireworks_api_base,
|
|
11
|
+
get_fireworks_api_key,
|
|
12
|
+
verify_api_key_and_get_account_id,
|
|
13
|
+
)
|
|
14
|
+
from ..fireworks_rft import (
|
|
15
|
+
_map_api_host_to_app_host,
|
|
16
|
+
create_dataset_from_jsonl,
|
|
17
|
+
create_reinforcement_fine_tuning_job,
|
|
18
|
+
)
|
|
19
|
+
from .upload import _discover_tests, _normalize_evaluator_id, _resolve_entry_to_qual_and_source
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _ensure_account_id() -> Optional[str]:
    """Return the Fireworks account id, resolving it from the API key if needed.

    The configured account id is returned as-is when present. Otherwise, if an
    API key is available, the id is looked up via
    ``verify_api_key_and_get_account_id``; a successful lookup is also written
    to the ``FIREWORKS_ACCOUNT_ID`` environment variable.

    Returns:
        The account id, or the (falsy) configured value when nothing could be
        resolved.
    """
    configured_id = get_fireworks_account_id()
    key = get_fireworks_api_key()

    # Nothing to resolve, or nothing to resolve it with.
    if configured_id or not key:
        return configured_id

    looked_up = verify_api_key_and_get_account_id(api_key=key, api_base=get_fireworks_api_base())
    if not looked_up:
        return configured_id

    os.environ["FIREWORKS_ACCOUNT_ID"] = looked_up
    return looked_up
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _extract_terminal_segment(resource_name: str) -> str:
    """Return the final ``/``-separated component of a resource name.

    Leading and trailing slashes are ignored, so ``accounts/a/evaluators/x``
    and ``/accounts/a/evaluators/x/`` both yield ``x``. A name without any
    slash is returned unchanged. If the value cannot be processed as a string
    it is handed back untouched.
    """
    try:
        trimmed = resource_name.strip("/")
        _head, _sep, tail = trimmed.rpartition("/")
        return tail
    except Exception:
        return resource_name
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _print_links(evaluator_id: str, dataset_id: str, job_name: Optional[str]) -> None:
    """Print dashboard URLs for the evaluator, the dataset and the RFT job.

    Every identifier is reduced to its last path segment with
    ``_extract_terminal_segment`` before being placed in a URL, so both short
    ids and fully-qualified resource names (``accounts/<a>/datasets/<id>``)
    produce a working link.

    Args:
        evaluator_id: Evaluator id or fully-qualified evaluator resource name.
        dataset_id: Dataset id or resource name; the line is skipped when falsy.
        job_name: RFT job resource name; the line is skipped when falsy.
    """
    app_base = _map_api_host_to_app_host(get_fireworks_api_base())
    print("\n📊 Dashboard Links:")

    evaluator_slug = _extract_terminal_segment(evaluator_id)
    print(f" Evaluator: {app_base}/dashboard/evaluators/{evaluator_slug}")

    if dataset_id:
        dataset_slug = _extract_terminal_segment(dataset_id)
        print(f" Dataset: {app_base}/dashboard/datasets/{dataset_slug}")

    if job_name:
        # job_name likely like accounts/{account}/reinforcementFineTuningJobs/{id}
        # str() keeps this best-effort for unexpected non-string values without
        # hiding real errors behind a blanket except.
        job_id = _extract_terminal_segment(str(job_name).strip())
        if job_id:
            print(f" RFT Job: {app_base}/dashboard/fine-tuning/reinforcement/{job_id}")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _auto_find_jsonl(cwd: str) -> Optional[str]:
    """Find a reasonable JSONL dataset file in the current project.

    Priority order:
    - dataset.jsonl in cwd
    - data/dataset.jsonl
    - first *.jsonl under cwd (top-down walk, skipping common vendor/venv/build
      dirs and any directory whose name starts with a dot)

    Directories and files are visited in sorted order so the same tree always
    yields the same file, independent of filesystem listing order.

    Returns a RELATIVE path from cwd if possible, or None when nothing is found.
    """

    def _relative(path: str) -> str:
        try:
            return os.path.relpath(path, cwd)
        except ValueError:
            # relpath cannot express some pairs (e.g. different Windows drives).
            return path

    # Direct candidates
    for candidate in (
        os.path.join(cwd, "dataset.jsonl"),
        os.path.join(cwd, "data", "dataset.jsonl"),
    ):
        if os.path.isfile(candidate):
            return _relative(candidate)

    # Walk and find any .jsonl
    skip_dirs = {".venv", "venv", "node_modules", "dist", "build", "__pycache__", ".git", "vendor"}
    for dirpath, dirnames, filenames in os.walk(cwd):
        # Prune in place (os.walk honours this when walking top-down) and sort
        # so the traversal order is deterministic.
        dirnames[:] = sorted(d for d in dirnames if d not in skip_dirs and not d.startswith("."))
        for name in sorted(filenames):
            if name.endswith(".jsonl"):
                return _relative(os.path.join(dirpath, name))
    return None
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _extract_jsonl_from_dataloader(test_file_path: str, test_func_name: str) -> Optional[str]:
|
|
95
|
+
"""Import the test module and extract a JSONL path from data_loaders param if present.
|
|
96
|
+
|
|
97
|
+
Looks for a pytest.mark.parametrize with argnames containing 'data_loaders' and attempts to
|
|
98
|
+
find an object with attribute 'jsonl_path'. If a relative path is found, it is resolved
|
|
99
|
+
relative to the directory of the test file.
|
|
100
|
+
"""
|
|
101
|
+
try:
|
|
102
|
+
import importlib.util
|
|
103
|
+
from pathlib import Path
|
|
104
|
+
|
|
105
|
+
spec = importlib.util.spec_from_file_location(Path(test_file_path).stem, test_file_path)
|
|
106
|
+
if not spec or not spec.loader:
|
|
107
|
+
return None
|
|
108
|
+
module = importlib.util.module_from_spec(spec)
|
|
109
|
+
sys.modules[spec.name] = module
|
|
110
|
+
spec.loader.exec_module(module) # type: ignore[attr-defined]
|
|
111
|
+
if not hasattr(module, test_func_name):
|
|
112
|
+
return None
|
|
113
|
+
wrapper = getattr(module, test_func_name)
|
|
114
|
+
marks = getattr(wrapper, "pytestmark", [])
|
|
115
|
+
for m in marks:
|
|
116
|
+
if getattr(m, "name", "") == "parametrize":
|
|
117
|
+
kwargs = getattr(m, "kwargs", {})
|
|
118
|
+
argnames = kwargs.get("argnames", (m.args[0] if m.args else []))
|
|
119
|
+
argvalues = kwargs.get("argvalues", (m.args[1] if len(m.args) > 1 else []))
|
|
120
|
+
# Normalize argnames to list
|
|
121
|
+
if isinstance(argnames, str):
|
|
122
|
+
names_list = [n.strip() for n in argnames.split(",") if n.strip()]
|
|
123
|
+
else:
|
|
124
|
+
names_list = list(argnames)
|
|
125
|
+
if "data_loaders" not in names_list:
|
|
126
|
+
continue
|
|
127
|
+
idx = names_list.index("data_loaders")
|
|
128
|
+
# argvalues is a list of tuples/values aligned with argnames
|
|
129
|
+
for val in argvalues:
|
|
130
|
+
# Normalize to tuple
|
|
131
|
+
if not isinstance(val, (tuple, list)):
|
|
132
|
+
params = (val,)
|
|
133
|
+
else:
|
|
134
|
+
params = tuple(val)
|
|
135
|
+
if idx >= len(params):
|
|
136
|
+
continue
|
|
137
|
+
dataloaders_obj = params[idx]
|
|
138
|
+
# May be a list or single loader
|
|
139
|
+
candidates = (
|
|
140
|
+
list(dataloaders_obj) if isinstance(dataloaders_obj, (list, tuple)) else [dataloaders_obj]
|
|
141
|
+
)
|
|
142
|
+
for dl in candidates:
|
|
143
|
+
jsonl_path = getattr(dl, "jsonl_path", None)
|
|
144
|
+
if isinstance(jsonl_path, str) and jsonl_path:
|
|
145
|
+
if os.path.isabs(jsonl_path):
|
|
146
|
+
return jsonl_path
|
|
147
|
+
base_dir = os.path.dirname(os.path.abspath(test_file_path))
|
|
148
|
+
return os.path.abspath(os.path.join(base_dir, jsonl_path))
|
|
149
|
+
return None
|
|
150
|
+
except Exception:
|
|
151
|
+
return None
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _extract_jsonl_from_input_dataset(test_file_path: str, test_func_name: str) -> Optional[str]:
|
|
155
|
+
"""Import the test module and extract a JSONL path from input_dataset (dataset_path) param if present.
|
|
156
|
+
|
|
157
|
+
Looks for a pytest.mark.parametrize with argnames containing 'dataset_path' and extracts the
|
|
158
|
+
first dataset path value. If a relative path is found, it is resolved relative to the directory
|
|
159
|
+
of the test file.
|
|
160
|
+
"""
|
|
161
|
+
try:
|
|
162
|
+
import importlib.util
|
|
163
|
+
from pathlib import Path
|
|
164
|
+
|
|
165
|
+
spec = importlib.util.spec_from_file_location(Path(test_file_path).stem, test_file_path)
|
|
166
|
+
if not spec or not spec.loader:
|
|
167
|
+
return None
|
|
168
|
+
module = importlib.util.module_from_spec(spec)
|
|
169
|
+
sys.modules[spec.name] = module
|
|
170
|
+
spec.loader.exec_module(module) # type: ignore[attr-defined]
|
|
171
|
+
if not hasattr(module, test_func_name):
|
|
172
|
+
return None
|
|
173
|
+
wrapper = getattr(module, test_func_name)
|
|
174
|
+
marks = getattr(wrapper, "pytestmark", [])
|
|
175
|
+
for m in marks:
|
|
176
|
+
if getattr(m, "name", "") == "parametrize":
|
|
177
|
+
kwargs = getattr(m, "kwargs", {})
|
|
178
|
+
argnames = kwargs.get("argnames", (m.args[0] if m.args else []))
|
|
179
|
+
argvalues = kwargs.get("argvalues", (m.args[1] if len(m.args) > 1 else []))
|
|
180
|
+
# Normalize argnames to list
|
|
181
|
+
if isinstance(argnames, str):
|
|
182
|
+
names_list = [n.strip() for n in argnames.split(",") if n.strip()]
|
|
183
|
+
else:
|
|
184
|
+
names_list = list(argnames)
|
|
185
|
+
if "dataset_path" not in names_list:
|
|
186
|
+
continue
|
|
187
|
+
idx = names_list.index("dataset_path")
|
|
188
|
+
# argvalues is a list of tuples/values aligned with argnames
|
|
189
|
+
# Get the first value (first test case)
|
|
190
|
+
if argvalues:
|
|
191
|
+
val = argvalues[0]
|
|
192
|
+
# Normalize to tuple
|
|
193
|
+
if not isinstance(val, (tuple, list)):
|
|
194
|
+
params = (val,)
|
|
195
|
+
else:
|
|
196
|
+
params = tuple(val)
|
|
197
|
+
if idx < len(params):
|
|
198
|
+
dataset_path = params[idx]
|
|
199
|
+
# dataset_path is typically a string, but could be a list if combine_datasets=True
|
|
200
|
+
if isinstance(dataset_path, (list, tuple)) and len(dataset_path) > 0:
|
|
201
|
+
dataset_path = dataset_path[0]
|
|
202
|
+
if isinstance(dataset_path, str) and dataset_path:
|
|
203
|
+
if os.path.isabs(dataset_path):
|
|
204
|
+
return dataset_path
|
|
205
|
+
base_dir = os.path.dirname(os.path.abspath(test_file_path))
|
|
206
|
+
resolved = os.path.abspath(os.path.join(base_dir, dataset_path))
|
|
207
|
+
if os.path.isfile(resolved):
|
|
208
|
+
return resolved
|
|
209
|
+
# Try resolving from project root if relative to test file doesn't work
|
|
210
|
+
if not os.path.isabs(dataset_path):
|
|
211
|
+
# Try resolving from current working directory
|
|
212
|
+
cwd_path = os.path.abspath(os.path.join(os.getcwd(), dataset_path))
|
|
213
|
+
if os.path.isfile(cwd_path):
|
|
214
|
+
return cwd_path
|
|
215
|
+
return None
|
|
216
|
+
except Exception:
|
|
217
|
+
return None
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _build_trimmed_dataset_id(evaluator_id: str) -> str:
    """Derive a timestamped dataset id from an evaluator id.

    The result has the form ``<base>-dataset-YYYYMMDDHHMMSS``. ``<base>`` is
    the normalized evaluator id, shortened so that the whole id is at most 63
    characters, never empty, and starting with a letter (an ``eval-`` prefix
    is added when it would not).
    """
    from .upload import _normalize_evaluator_id  # local import to avoid cycle at module import time

    stem = _normalize_evaluator_id(evaluator_id)
    suffix = f"-dataset-{time.strftime('%Y%m%d%H%M%S')}"

    # Characters left for the stem once the suffix is accounted for.
    budget = max(63 - len(suffix), 1)

    if len(stem) > budget:
        stem = stem[:budget].rstrip("-")
    stem = stem or "dataset"

    # Ensure first char is a letter
    if not stem[0].isalpha():
        stem = f"eval-{stem}"
        if len(stem) > budget:
            clipped = stem[:budget]
            stem = clipped.rstrip("-") or "dataset"
    return stem + suffix
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _auto_select_evaluator_id(cwd: str) -> Optional[str]:
    """Infer an evaluator id for the project rooted at ``cwd``.

    Two sources are consulted in order, each used only when it is unambiguous:

    1. ``<cwd>/.eval_protocol/evaluators``: if it holds exactly one ``*.json``
       file, its name without the extension is the id.
    2. Test discovery: if exactly one test is discovered, the id is the
       normalized ``<source file stem>-<function name>``.

    Returns None when neither source yields a single candidate.
    """
    # Try local traces
    traces_dir = os.path.join(cwd, ".eval_protocol", "evaluators")
    if os.path.isdir(traces_dir):
        extension = ".json"
        trace_ids = [entry[: -len(extension)] for entry in os.listdir(traces_dir) if entry.endswith(extension)]
        if len(trace_ids) == 1:
            return trace_ids[0]

    # Fall back to discovering a single evaluation_test
    discovered = _discover_tests(cwd)
    if len(discovered) != 1:
        return None

    qualname, source_file_path = discovered[0].qualname, discovered[0].file_path
    function_name = qualname.split(".")[-1]
    file_stem, _ext = os.path.splitext(os.path.basename(source_file_path))
    return _normalize_evaluator_id(f"{file_stem}-{function_name}")
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def _path_for_display(path: str, root: str) -> str:
    """Return ``path`` relative to ``root``, or ``path`` itself when no relative form exists."""
    try:
        return os.path.relpath(path, root)
    except ValueError:
        # relpath cannot express some pairs (e.g. different Windows drives).
        return path


def _entry_for_test(test: Any, project_root: str) -> str:
    """Format a discovered test as the ``<relative path>::<function name>`` entry string."""
    func_name = test.qualname.split(".")[-1]
    rel = _path_for_display(os.path.abspath(test.file_path), project_root)
    return f"{rel}::{func_name}"


def _collect_set_options(args: Any, fields) -> Dict[str, Any]:
    """Map ``(api_key, arg_name)`` pairs to a dict, keeping only attributes of ``args`` that are not None."""
    collected: Dict[str, Any] = {}
    for key, arg_name in fields:
        val = getattr(args, arg_name, None)
        if val is not None:
            collected[key] = val
    return collected


def create_rft_command(args) -> int:
    """Create a Reinforcement Fine-tuning job from parsed CLI arguments.

    Steps, in order:

    1. Require an API key and an account id.
    2. Resolve the evaluator id (explicit, or inferred from the project).
    3. Require ``--base-model``. This is checked before anything is uploaded so
       that a missing flag cannot leave an uploaded evaluator/dataset behind.
    4. Upload/ensure the evaluator (skipped under ``--dry-run``). A failure
       here is reported as a warning and does not abort the command.
    5. Use ``--dataset-id`` if given; otherwise locate a JSONL file (explicit
       ``--dataset-jsonl``, or extracted from the single discovered test) and
       create a dataset from it (only announced under ``--dry-run``).
    6. Build the job body and either print it (``--dry-run``) or submit it.

    Args:
        args: Namespace-like object; every option is read with ``getattr`` and
            may be absent.

    Returns:
        0 on success (including a completed dry run), 1 on any reported error.
    """
    evaluator_id: Optional[str] = getattr(args, "evaluator_id", None)
    dry_run: bool = bool(getattr(args, "dry_run", False))

    api_key = get_fireworks_api_key()
    if not api_key:
        print("Error: FIREWORKS_API_KEY not set.")
        return 1

    account_id = _ensure_account_id()
    if not account_id:
        print("Error: FIREWORKS_ACCOUNT_ID not set and could not be resolved.")
        return 1

    api_base = get_fireworks_api_base()

    # Resolve evaluator id if omitted
    project_root = os.getcwd()
    if not evaluator_id:
        evaluator_id = _auto_select_evaluator_id(project_root)
        if not evaluator_id:
            print("Error: Could not infer evaluator id. Provide --evaluator-id or run 'eval-protocol upload' first.")
            return 1

    # Ensure base model is explicitly provided for clarity. Validate it before
    # any upload so a missing flag has no remote side effects.
    if not getattr(args, "base_model", None):
        print(
            "Error: --base-model is required. Please specify the base model resource id (e.g., accounts/{account}/models/<model_id>)."
        )
        return 1

    # Resolve evaluator resource name to fully-qualified format required by API
    evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"

    # Ensure evaluator exists by invoking the upload flow programmatically
    if dry_run:
        print("--dry-run: would upload/ensure evaluator")
    else:
        try:
            from .upload import upload_command

            tests = _discover_tests(project_root)
            selected_entry: Optional[str] = None
            if len(tests) == 1:
                selected_entry = _entry_for_test(tests[0], project_root)
            else:
                # Try to match evaluator_id to a discovered test's normalized ID
                for t in tests:
                    func_name = t.qualname.split(".")[-1]
                    source_file_name = os.path.splitext(os.path.basename(t.file_path))[0]
                    candidate = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
                    if candidate == evaluator_id:
                        selected_entry = _entry_for_test(t, project_root)
                        break

            upload_args = argparse.Namespace(
                path=project_root,
                entry=selected_entry,
                id=evaluator_id,
                display_name=None,
                description=None,
                force=False,
                yes=True,
            )
            rc = upload_command(upload_args)
            if rc == 0:
                print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
            else:
                print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
        except Exception as e:
            # Best-effort: the evaluator may already exist remotely.
            print(f"Warning: Failed to upload evaluator automatically: {e}")

    # Determine dataset id and materialization path
    dataset_id = getattr(args, "dataset_id", None)
    dataset_jsonl = getattr(args, "dataset_jsonl", None)
    dataset_display_name = getattr(args, "dataset_display_name", None)

    if not dataset_id:
        # Prefer explicit --dataset-jsonl, else attempt to extract from data loader or input_dataset of the single discovered test
        if not dataset_jsonl:
            tests = _discover_tests(project_root)
            if len(tests) == 1:
                func_name = tests[0].qualname.split(".")[-1]
                # Try data_loaders first (existing behavior)
                dataset_jsonl = _extract_jsonl_from_dataloader(tests[0].file_path, func_name)
                if dataset_jsonl:
                    # Display relative path for readability
                    print(f"✓ Using JSONL from data loader: {_path_for_display(dataset_jsonl, project_root)}")
                else:
                    # Fall back to input_dataset (dataset_path)
                    dataset_jsonl = _extract_jsonl_from_input_dataset(tests[0].file_path, func_name)
                    if dataset_jsonl:
                        # Display relative path for readability
                        print(f"✓ Using JSONL from input_dataset: {_path_for_display(dataset_jsonl, project_root)}")
        if not dataset_jsonl:
            print(
                "Error: Could not determine dataset. Provide --dataset-id or --dataset-jsonl, or ensure a JSONL-based data loader or input_dataset is used in your single discovered test."
            )
            return 1

        inferred_dataset_id = _build_trimmed_dataset_id(evaluator_id)
        if dry_run:
            print("--dry-run: would create dataset and upload JSONL")
            dataset_id = inferred_dataset_id
        else:
            try:
                # Resolve dataset_jsonl path relative to CWD if needed
                jsonl_path_for_upload = (
                    dataset_jsonl
                    if os.path.isabs(dataset_jsonl)
                    else os.path.abspath(os.path.join(project_root, dataset_jsonl))
                )
                dataset_id, _ = create_dataset_from_jsonl(
                    account_id=account_id,
                    api_key=api_key,
                    api_base=api_base,
                    dataset_id=inferred_dataset_id,
                    display_name=dataset_display_name or inferred_dataset_id,
                    jsonl_path=jsonl_path_for_upload,
                )
                print(f"✓ Created and uploaded dataset: {dataset_id}")
            except Exception as e:
                print(f"Error creating/uploading dataset: {e}")
                return 1

    # Build training config/body
    training_config: Dict[str, Any] = {"baseModel": args.base_model}
    if getattr(args, "warm_start_from", None):
        training_config["warmStartFrom"] = args.warm_start_from

    # Optional hyperparameters
    training_config.update(
        _collect_set_options(
            args,
            [
                ("epochs", "epochs"),
                ("batchSize", "batch_size"),
                ("learningRate", "learning_rate"),
                ("maxContextLength", "max_context_length"),
                ("loraRank", "lora_rank"),
                ("acceleratorCount", "accelerator_count"),
                ("region", "region"),
            ],
        )
    )

    inference_params: Dict[str, Any] = _collect_set_options(
        args,
        [
            ("temperature", "temperature"),
            ("topP", "top_p"),
            ("topK", "top_k"),
            ("maxTokens", "max_tokens"),
            ("n", "n"),
        ],
    )
    if getattr(args, "inference_extra_body", None):
        inference_params["extraBody"] = args.inference_extra_body

    wandb_config: Optional[Dict[str, Any]] = None
    if getattr(args, "wandb_enabled", False):
        wandb_config = {
            "enabled": True,
            "apiKey": getattr(args, "wandb_api_key", None),
            "project": getattr(args, "wandb_project", None),
            "entity": getattr(args, "wandb_entity", None),
            "runId": getattr(args, "wandb_run_id", None),
        }

    body: Dict[str, Any] = {
        "dataset": f"accounts/{account_id}/datasets/{dataset_id}",
        "evaluator": evaluator_resource_name,
        "evalAutoCarveout": bool(getattr(args, "eval_auto_carveout", True)),
        "trainingConfig": training_config,
        "inferenceParameters": inference_params or None,
        "wandbConfig": wandb_config,
        "chunkSize": getattr(args, "chunk_size", None),
        "outputStats": None,
        "outputMetrics": None,
        "mcpServer": None,
    }
    # Debug: print minimal summary
    print(f"Prepared RFT job for evaluator '{evaluator_id}' using dataset '{dataset_id}'")
    if getattr(args, "evaluation_dataset", None):
        body["evaluationDataset"] = args.evaluation_dataset
    if getattr(args, "output_model", None):
        # training_config is the same dict object stored under body["trainingConfig"].
        training_config["outputModel"] = f"accounts/{account_id}/models/{args.output_model}"

    # Clean None fields to avoid noisy payloads
    body = {k: v for k, v in body.items() if v is not None}

    if dry_run:
        print("--dry-run: would create RFT job with body:")
        print(json.dumps(body, indent=2))
        _print_links(evaluator_id, dataset_id, None)
        return 0

    try:
        result = create_reinforcement_fine_tuning_job(
            account_id=account_id, api_key=api_key, api_base=api_base, body=body
        )
        job_name = result.get("name") if isinstance(result, dict) else None
        print("\n✅ Created Reinforcement Fine-tuning Job")
        if job_name:
            print(f" name: {job_name}")
        _print_links(evaluator_id, dataset_id, job_name)
        return 0
    except Exception as e:
        print(f"Error creating RFT job: {e}")
        return 1
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from collections.abc import Sequence
|
|
6
|
+
|
|
7
|
+
from eval_protocol.common_utils import load_jsonl
|
|
8
|
+
from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
|
|
9
|
+
from eval_protocol.data_loader.models import (
|
|
10
|
+
DataLoaderResult,
|
|
11
|
+
DataLoaderVariant,
|
|
12
|
+
EvaluationDataLoader,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(kw_only=True)
class EvaluationRowJsonlDataLoader(EvaluationDataLoader):
    """Load EvaluationRows from a JSONL file on disk.

    Every line of the file is expected to hold one serialized EvaluationRow
    dict. The parsed dicts are passed to the default dataset adapter, which
    builds the EvaluationRow objects.
    """

    # Path of the JSONL file; a relative path is made absolute against the
    # current working directory when the variant is executed.
    jsonl_path: str
    # Reported as ``variant_id`` on the produced DataLoaderResult.
    id: str = "jsonl"
    # Reported as ``variant_description`` on the produced DataLoaderResult.
    description: str | None = None

    def variants(self) -> Sequence[DataLoaderVariant]:
        """Return a one-element list containing the deferred loader callable.

        Nothing is read from disk until the returned callable is invoked.
        """

        def _load() -> DataLoaderResult:
            raw_path = self.jsonl_path
            # Only relative paths are rewritten; absolute ones pass through
            # untouched (no normalization is applied to them).
            resolved_path = raw_path if os.path.isabs(raw_path) else os.path.abspath(raw_path)
            records = load_jsonl(resolved_path)
            return DataLoaderResult(
                rows=default_dataset_adapter(records),
                type=type(self).__name__,
                variant_id=self.id,
                variant_description=self.description,
            )

        return [_load]
|
|
@@ -602,7 +602,47 @@ class Evaluator:
|
|
|
602
602
|
from pathlib import Path
|
|
603
603
|
import fnmatch
|
|
604
604
|
|
|
605
|
-
default_ignores = [
|
|
605
|
+
default_ignores = [
|
|
606
|
+
".git",
|
|
607
|
+
".github",
|
|
608
|
+
"__pycache__",
|
|
609
|
+
"*.pyc",
|
|
610
|
+
"*.pyo",
|
|
611
|
+
"*.pyd",
|
|
612
|
+
".venv",
|
|
613
|
+
"venv",
|
|
614
|
+
".tox",
|
|
615
|
+
".pytest_cache",
|
|
616
|
+
".mypy_cache",
|
|
617
|
+
".ruff_cache",
|
|
618
|
+
".ipynb_checkpoints",
|
|
619
|
+
".idea",
|
|
620
|
+
".vscode",
|
|
621
|
+
".cache",
|
|
622
|
+
"node_modules",
|
|
623
|
+
"vendor",
|
|
624
|
+
"dist",
|
|
625
|
+
"build",
|
|
626
|
+
"*.egg-info",
|
|
627
|
+
"*.egg",
|
|
628
|
+
"*.whl",
|
|
629
|
+
"*.tar.gz",
|
|
630
|
+
"*.zip",
|
|
631
|
+
"*.log",
|
|
632
|
+
"*.tmp",
|
|
633
|
+
"*.swp",
|
|
634
|
+
".DS_Store",
|
|
635
|
+
"coverage",
|
|
636
|
+
"htmlcov",
|
|
637
|
+
".coverage",
|
|
638
|
+
"coverage.xml",
|
|
639
|
+
".env",
|
|
640
|
+
".env.*",
|
|
641
|
+
"*.so",
|
|
642
|
+
"*.dylib",
|
|
643
|
+
".pytest_cache/",
|
|
644
|
+
"env/",
|
|
645
|
+
]
|
|
606
646
|
all_patterns = default_ignores + ignore_patterns
|
|
607
647
|
|
|
608
648
|
path_obj = Path(path)
|
|
@@ -18,12 +18,20 @@ def _map_api_host_to_app_host(api_base: str) -> str:
|
|
|
18
18
|
from urllib.parse import urlparse
|
|
19
19
|
|
|
20
20
|
parsed = urlparse(api_base)
|
|
21
|
-
host = parsed.netloc or parsed.path
|
|
21
|
+
host = (parsed.netloc or parsed.path).lower()
|
|
22
|
+
scheme = parsed.scheme or "https"
|
|
23
|
+
|
|
24
|
+
# Explicit mappings first
|
|
22
25
|
if host.startswith("dev.api.fireworks.ai"):
|
|
23
|
-
return f"{
|
|
26
|
+
return f"{scheme}://dev.fireworks.ai"
|
|
27
|
+
if host == "staging.api.fireworks.ai" or host == "api.fireworks.ai":
|
|
28
|
+
return f"{scheme}://app.fireworks.ai"
|
|
29
|
+
|
|
30
|
+
# Generic mapping: api.<...> → app.<...>
|
|
24
31
|
if host.startswith("api."):
|
|
25
|
-
return f"{
|
|
26
|
-
|
|
32
|
+
return f"{scheme}://{host.replace('api.', 'app.', 1)}"
|
|
33
|
+
|
|
34
|
+
return f"{scheme}://{host}"
|
|
27
35
|
except Exception:
|
|
28
36
|
return "https://app.fireworks.ai"
|
|
29
37
|
|