eval-protocol 0.3.4__tar.gz → 0.3.5.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.3.4/eval_protocol.egg-info → eval_protocol-0.3.5.dev1}/PKG-INFO +1 -1
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/fireworks_tracing.py +2 -1
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/create_rft.py +95 -25
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/upload.py +3 -8
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/utils.py +28 -2
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +7 -5
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_cli_create_rft.py +92 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/LICENSE +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/README.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/development/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/development/utils/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/dataframe.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/export_docs.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/local_test.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/config.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/fireworks_rft.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/openai_rft.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/tinker_cookbook.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/models.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/buffer.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/priority_scheduler.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/server.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/training/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/training/gepa_trainer.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/training/gepa_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/training/trainer.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/training/utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol.egg-info/SOURCES.txt +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/pyproject.toml +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/setup.cfg +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/setup.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_auth.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_cli_local_test.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_config.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_exception_config.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_format.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_human_id.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_integration.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_length.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_litellm_policy_provider_fields.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_math.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_minimal.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_models.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_openai_rft_integration.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_packaging.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_priority_scheduler.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_readiness.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_repetition.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_rollout_logprobs.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_server.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_sqlite_hardening.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_status_model.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_training_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/versioneer.py +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vite-app/dist/assets/index-CuQbfdPD.js +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vite-app/dist/assets/index-CuQbfdPD.js.map +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vite-app/dist/assets/index-iZp_HgyW.css +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.5.dev1
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2026-01-
|
|
11
|
+
"date": "2026-01-06T11:14:00-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.3.
|
|
14
|
+
"full-revisionid": "38c47583e50493262d74915850e7a4a7e594baf3",
|
|
15
|
+
"version": "0.3.5.dev.1"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/fireworks_tracing.py
RENAMED
|
@@ -268,7 +268,7 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
268
268
|
def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -> List[Dict[str, Any]]:
|
|
269
269
|
"""Fetch logs from Fireworks tracing gateway /logs endpoint.
|
|
270
270
|
|
|
271
|
-
Returns entries with keys: timestamp, message, severity, tags.
|
|
271
|
+
Returns entries with keys: timestamp, message, severity, tags, status, extras.
|
|
272
272
|
"""
|
|
273
273
|
if not tags:
|
|
274
274
|
raise ValueError("At least one tag is required to fetch logs")
|
|
@@ -315,6 +315,7 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
315
315
|
"severity": e.get("severity", "INFO"),
|
|
316
316
|
"tags": e.get("tags", []),
|
|
317
317
|
"status": e.get("status"),
|
|
318
|
+
"extras": e.get("extras"),
|
|
318
319
|
}
|
|
319
320
|
)
|
|
320
321
|
return results
|
|
@@ -5,15 +5,15 @@ import json
|
|
|
5
5
|
import os
|
|
6
6
|
import sys
|
|
7
7
|
import time
|
|
8
|
-
from typing import Any, Dict, Optional
|
|
8
|
+
from typing import Any, Callable, Dict, Optional
|
|
9
9
|
import inspect
|
|
10
10
|
import requests
|
|
11
|
+
import tempfile
|
|
11
12
|
from pydantic import ValidationError
|
|
12
13
|
|
|
13
14
|
from ..auth import get_fireworks_api_base, get_fireworks_api_key
|
|
14
|
-
from ..common_utils import get_user_agent
|
|
15
|
+
from ..common_utils import get_user_agent, load_jsonl
|
|
15
16
|
from ..fireworks_rft import (
|
|
16
|
-
build_default_output_model,
|
|
17
17
|
create_dataset_from_jsonl,
|
|
18
18
|
detect_dataset_builder,
|
|
19
19
|
materialize_dataset_via_builder,
|
|
@@ -31,12 +31,88 @@ from .utils import (
|
|
|
31
31
|
_normalize_evaluator_id,
|
|
32
32
|
_print_links,
|
|
33
33
|
_resolve_selected_test,
|
|
34
|
+
load_module_from_file_path,
|
|
34
35
|
)
|
|
35
36
|
from .local_test import run_evaluator_test
|
|
36
37
|
|
|
37
38
|
from fireworks import Fireworks
|
|
38
39
|
|
|
39
40
|
|
|
41
|
+
def _extract_dataset_adapter(
|
|
42
|
+
test_file_path: str, test_func_name: str
|
|
43
|
+
) -> Optional[Callable[[list[dict[str, Any]]], Any]]:
|
|
44
|
+
"""Extract dataset_adapter from an @evaluation_test wrapper via __ep_params__."""
|
|
45
|
+
try:
|
|
46
|
+
module = load_module_from_file_path(test_file_path)
|
|
47
|
+
wrapper = getattr(module, test_func_name, None)
|
|
48
|
+
if wrapper is None:
|
|
49
|
+
return None
|
|
50
|
+
ep_params = getattr(wrapper, "__ep_params__", None)
|
|
51
|
+
if ep_params is None:
|
|
52
|
+
return None
|
|
53
|
+
adapter = getattr(ep_params, "dataset_adapter", None)
|
|
54
|
+
if callable(adapter):
|
|
55
|
+
return adapter
|
|
56
|
+
return None
|
|
57
|
+
except Exception:
|
|
58
|
+
return None
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _maybe_transform_dataset_jsonl_via_adapter(
|
|
62
|
+
project_root: str,
|
|
63
|
+
dataset_jsonl: str,
|
|
64
|
+
test_file_path: Optional[str],
|
|
65
|
+
test_func_name: Optional[str],
|
|
66
|
+
) -> str:
|
|
67
|
+
"""Transform dataset_jsonl via the test's dataset_adapter (when available).
|
|
68
|
+
|
|
69
|
+
For RFT dataset uploads, we want the uploaded dataset to match what evaluation-time
|
|
70
|
+
would run on. If the selected evaluation test provides a dataset_adapter, that
|
|
71
|
+
adapter is treated as the source of truth for constructing EvaluationRows.
|
|
72
|
+
"""
|
|
73
|
+
if not dataset_jsonl:
|
|
74
|
+
return dataset_jsonl
|
|
75
|
+
|
|
76
|
+
if not test_file_path or not test_func_name:
|
|
77
|
+
return dataset_jsonl
|
|
78
|
+
|
|
79
|
+
adapter = _extract_dataset_adapter(test_file_path, test_func_name)
|
|
80
|
+
if not adapter:
|
|
81
|
+
return dataset_jsonl
|
|
82
|
+
|
|
83
|
+
raw_rows: list[dict[str, Any]] = load_jsonl(dataset_jsonl) # type: ignore[assignment]
|
|
84
|
+
adapted = adapter(raw_rows)
|
|
85
|
+
if not isinstance(adapted, list):
|
|
86
|
+
raise ValueError("dataset_adapter must return a list of EvaluationRow (or dicts parseable as EvaluationRow).")
|
|
87
|
+
|
|
88
|
+
eval_rows: list[EvaluationRow] = []
|
|
89
|
+
for item in adapted:
|
|
90
|
+
if isinstance(item, EvaluationRow):
|
|
91
|
+
eval_rows.append(item)
|
|
92
|
+
else:
|
|
93
|
+
eval_rows.append(EvaluationRow.model_validate(item))
|
|
94
|
+
|
|
95
|
+
output_dir = os.path.join(project_root, ".ep_tmp")
|
|
96
|
+
os.makedirs(output_dir, exist_ok=True)
|
|
97
|
+
with tempfile.NamedTemporaryFile(
|
|
98
|
+
mode="w",
|
|
99
|
+
encoding="utf-8",
|
|
100
|
+
delete=False,
|
|
101
|
+
suffix=".jsonl",
|
|
102
|
+
prefix="ep_rft_dataset_",
|
|
103
|
+
dir=output_dir,
|
|
104
|
+
) as f:
|
|
105
|
+
for row in eval_rows:
|
|
106
|
+
f.write(json.dumps(row.model_dump(mode="json"), ensure_ascii=False) + "\n")
|
|
107
|
+
out_path = os.path.abspath(f.name)
|
|
108
|
+
try:
|
|
109
|
+
rel = os.path.relpath(out_path, project_root)
|
|
110
|
+
except Exception:
|
|
111
|
+
rel = out_path
|
|
112
|
+
print(f"✓ Transformed dataset via dataset_adapter into EvaluationRow JSONL: {rel} ({len(eval_rows)} rows)")
|
|
113
|
+
return out_path
|
|
114
|
+
|
|
115
|
+
|
|
40
116
|
def _extract_jsonl_from_dataloader(test_file_path: str, test_func_name: str) -> Optional[str]:
|
|
41
117
|
"""Import the test module and extract a JSONL path from data_loaders param if present.
|
|
42
118
|
|
|
@@ -45,18 +121,10 @@ def _extract_jsonl_from_dataloader(test_file_path: str, test_func_name: str) ->
|
|
|
45
121
|
relative to the directory of the test file.
|
|
46
122
|
"""
|
|
47
123
|
try:
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
spec = importlib.util.spec_from_file_location(Path(test_file_path).stem, test_file_path)
|
|
52
|
-
if not spec or not spec.loader:
|
|
124
|
+
module = load_module_from_file_path(test_file_path)
|
|
125
|
+
wrapper = getattr(module, test_func_name, None)
|
|
126
|
+
if wrapper is None:
|
|
53
127
|
return None
|
|
54
|
-
module = importlib.util.module_from_spec(spec)
|
|
55
|
-
sys.modules[spec.name] = module
|
|
56
|
-
spec.loader.exec_module(module) # type: ignore[attr-defined]
|
|
57
|
-
if not hasattr(module, test_func_name):
|
|
58
|
-
return None
|
|
59
|
-
wrapper = getattr(module, test_func_name)
|
|
60
128
|
marks = getattr(wrapper, "pytestmark", [])
|
|
61
129
|
for m in marks:
|
|
62
130
|
if getattr(m, "name", "") == "parametrize":
|
|
@@ -105,18 +173,10 @@ def _extract_jsonl_from_input_dataset(test_file_path: str, test_func_name: str)
|
|
|
105
173
|
of the test file.
|
|
106
174
|
"""
|
|
107
175
|
try:
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
spec = importlib.util.spec_from_file_location(Path(test_file_path).stem, test_file_path)
|
|
112
|
-
if not spec or not spec.loader:
|
|
113
|
-
return None
|
|
114
|
-
module = importlib.util.module_from_spec(spec)
|
|
115
|
-
sys.modules[spec.name] = module
|
|
116
|
-
spec.loader.exec_module(module) # type: ignore[attr-defined]
|
|
117
|
-
if not hasattr(module, test_func_name):
|
|
176
|
+
module = load_module_from_file_path(test_file_path)
|
|
177
|
+
wrapper = getattr(module, test_func_name, None)
|
|
178
|
+
if wrapper is None:
|
|
118
179
|
return None
|
|
119
|
-
wrapper = getattr(module, test_func_name)
|
|
120
180
|
marks = getattr(wrapper, "pytestmark", [])
|
|
121
181
|
for m in marks:
|
|
122
182
|
if getattr(m, "name", "") == "parametrize":
|
|
@@ -719,6 +779,16 @@ def create_rft_command(args) -> int:
|
|
|
719
779
|
if dataset_jsonl is None and not dataset_id:
|
|
720
780
|
return 1
|
|
721
781
|
|
|
782
|
+
# 2.5) If the selected evaluation test provides a dataset_adapter, always use it to
|
|
783
|
+
# construct the EvaluationRow dataset that we upload for RFT.
|
|
784
|
+
if dataset_jsonl is not None:
|
|
785
|
+
dataset_jsonl = _maybe_transform_dataset_jsonl_via_adapter(
|
|
786
|
+
project_root=project_root,
|
|
787
|
+
dataset_jsonl=dataset_jsonl,
|
|
788
|
+
test_file_path=selected_test_file_path,
|
|
789
|
+
test_func_name=selected_test_func_name,
|
|
790
|
+
)
|
|
791
|
+
|
|
722
792
|
# 3) Optional local validation
|
|
723
793
|
if not skip_validation:
|
|
724
794
|
# Dataset validation (JSONL must be EvaluationRow-compatible when present)
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
from eval_protocol.cli_commands.utils import DiscoveredTest
|
|
3
|
-
import importlib.util
|
|
4
3
|
import os
|
|
5
4
|
import re
|
|
6
5
|
import sys
|
|
@@ -18,6 +17,7 @@ from .utils import (
|
|
|
18
17
|
_discover_tests,
|
|
19
18
|
_ensure_account_id,
|
|
20
19
|
_get_questionary_style,
|
|
20
|
+
load_module_from_file_path,
|
|
21
21
|
_normalize_evaluator_id,
|
|
22
22
|
_prompt_select,
|
|
23
23
|
)
|
|
@@ -120,13 +120,8 @@ def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]:
|
|
|
120
120
|
source_file_path = os.path.join(cwd, dotted_as_path)
|
|
121
121
|
|
|
122
122
|
# Load the module from the file path
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
raise ValueError(f"Unable to load module from path: {source_file_path}")
|
|
126
|
-
module = importlib.util.module_from_spec(spec)
|
|
127
|
-
sys.modules[spec.name] = module
|
|
128
|
-
spec.loader.exec_module(module) # type: ignore[attr-defined]
|
|
129
|
-
module_name = spec.name
|
|
123
|
+
module = load_module_from_file_path(source_file_path)
|
|
124
|
+
module_name = getattr(module, "__name__", Path(source_file_path).stem)
|
|
130
125
|
|
|
131
126
|
if not hasattr(module, func):
|
|
132
127
|
raise ValueError(f"Function '{func}' not found in module '{module_name}'")
|
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
from types import ModuleType
|
|
2
|
+
|
|
3
|
+
|
|
1
4
|
import os
|
|
2
5
|
import ast
|
|
3
6
|
import sys
|
|
@@ -6,16 +9,16 @@ import inspect
|
|
|
6
9
|
import argparse
|
|
7
10
|
import typing
|
|
8
11
|
import types
|
|
12
|
+
import importlib.util
|
|
9
13
|
from dataclasses import dataclass
|
|
10
14
|
from pathlib import Path
|
|
11
|
-
from typing import Any, List, Optional
|
|
15
|
+
from typing import Any, List, Optional
|
|
12
16
|
import typing_extensions
|
|
13
17
|
import inspect
|
|
14
18
|
from collections.abc import Callable
|
|
15
19
|
import pytest
|
|
16
20
|
|
|
17
21
|
from ..auth import (
|
|
18
|
-
get_fireworks_account_id,
|
|
19
22
|
get_fireworks_api_base,
|
|
20
23
|
get_fireworks_api_key,
|
|
21
24
|
verify_api_key_and_get_account_id,
|
|
@@ -23,6 +26,29 @@ from ..auth import (
|
|
|
23
26
|
from ..fireworks_rft import _map_api_host_to_app_host
|
|
24
27
|
|
|
25
28
|
|
|
29
|
+
def load_module_from_file_path(source_file_path: str) -> ModuleType:
|
|
30
|
+
"""Load a Python module from an absolute/relative filesystem path.
|
|
31
|
+
|
|
32
|
+
This mirrors the CLI behavior used by `upload.py` and `create_rft.py`:
|
|
33
|
+
- module name is derived from the file stem (e.g. /a/b/foo.py -> foo)
|
|
34
|
+
- the module is inserted into sys.modules under that name before exec
|
|
35
|
+
"""
|
|
36
|
+
abs_path = os.path.abspath(source_file_path)
|
|
37
|
+
if not os.path.isfile(abs_path):
|
|
38
|
+
raise ValueError(f"File not found: {abs_path}")
|
|
39
|
+
if not abs_path.endswith(".py"):
|
|
40
|
+
raise ValueError(f"Expected a .py file path, got: {abs_path}")
|
|
41
|
+
|
|
42
|
+
module_name = Path(abs_path).stem
|
|
43
|
+
spec = importlib.util.spec_from_file_location(module_name, abs_path)
|
|
44
|
+
if not spec or not spec.loader:
|
|
45
|
+
raise ValueError(f"Unable to load module from path: {abs_path}")
|
|
46
|
+
module = importlib.util.module_from_spec(spec)
|
|
47
|
+
sys.modules[spec.name] = module
|
|
48
|
+
spec.loader.exec_module(module) # type: ignore[attr-defined]
|
|
49
|
+
return module
|
|
50
|
+
|
|
51
|
+
|
|
26
52
|
def _get_questionary_style():
|
|
27
53
|
"""Get the shared questionary style for CLI prompts - minimal and clean."""
|
|
28
54
|
try:
|
|
@@ -125,14 +125,16 @@ class FireworksTracingHttpHandler(logging.Handler):
|
|
|
125
125
|
pass
|
|
126
126
|
program = cast(Optional[str], getattr(record, "program", None)) or "eval_protocol"
|
|
127
127
|
|
|
128
|
+
extras_input = getattr(record, "extras", None)
|
|
129
|
+
extras: Dict[str, Any] = dict(extras_input) if isinstance(extras_input, dict) else {}
|
|
130
|
+
extras["logger_name"] = record.name
|
|
131
|
+
extras["level"] = record.levelname
|
|
132
|
+
extras["timestamp"] = timestamp
|
|
133
|
+
|
|
128
134
|
return {
|
|
129
135
|
"program": program,
|
|
130
136
|
"status": self._get_status_info(record),
|
|
131
137
|
"message": message,
|
|
132
138
|
"tags": tags,
|
|
133
|
-
"extras":
|
|
134
|
-
"logger_name": record.name,
|
|
135
|
-
"level": record.levelname,
|
|
136
|
-
"timestamp": timestamp,
|
|
137
|
-
},
|
|
139
|
+
"extras": extras,
|
|
138
140
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.5.dev1
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -1206,3 +1206,95 @@ def test_create_rft_prefers_explicit_dataset_jsonl_over_input_dataset(rft_test_h
|
|
|
1206
1206
|
assert captured["jsonl_path"] != str(inferred_jsonl)
|
|
1207
1207
|
# And because --dataset-jsonl was provided, we should never call the input_dataset extractor
|
|
1208
1208
|
assert calls["input_dataset"] == 0
|
|
1209
|
+
|
|
1210
|
+
|
|
1211
|
+
def test_create_rft_transforms_raw_input_dataset_via_dataset_adapter_before_upload(rft_test_harness, monkeypatch):
|
|
1212
|
+
project = rft_test_harness
|
|
1213
|
+
|
|
1214
|
+
# Create a real @evaluation_test-decorated module so create_rft can extract __ep_params__.dataset_adapter
|
|
1215
|
+
metric_dir = project / "metric"
|
|
1216
|
+
metric_dir.mkdir(parents=True, exist_ok=True)
|
|
1217
|
+
|
|
1218
|
+
raw_jsonl = metric_dir / "raw.jsonl"
|
|
1219
|
+
raw_jsonl.write_text('{"q":"hi","a":"ok"}\n{"q":"yo","a":"ok2"}\n', encoding="utf-8")
|
|
1220
|
+
|
|
1221
|
+
test_file = metric_dir / "test_adapt.py"
|
|
1222
|
+
test_file.write_text(
|
|
1223
|
+
"""
|
|
1224
|
+
from typing import Any
|
|
1225
|
+
from eval_protocol.models import EvaluationRow, Message
|
|
1226
|
+
from eval_protocol.pytest import evaluation_test
|
|
1227
|
+
|
|
1228
|
+
def my_adapter(rows: list[dict[str, Any]]) -> list[EvaluationRow]:
|
|
1229
|
+
return [
|
|
1230
|
+
EvaluationRow(messages=[Message(role="user", content=r["q"])], ground_truth=r.get("a"))
|
|
1231
|
+
for r in rows
|
|
1232
|
+
]
|
|
1233
|
+
|
|
1234
|
+
@evaluation_test(
|
|
1235
|
+
input_dataset=["raw.jsonl"],
|
|
1236
|
+
dataset_adapter=my_adapter,
|
|
1237
|
+
num_runs=1,
|
|
1238
|
+
max_dataset_rows=2,
|
|
1239
|
+
mode="pointwise",
|
|
1240
|
+
)
|
|
1241
|
+
def test_adapt(row: EvaluationRow) -> EvaluationRow:
|
|
1242
|
+
return row
|
|
1243
|
+
""".lstrip(),
|
|
1244
|
+
encoding="utf-8",
|
|
1245
|
+
)
|
|
1246
|
+
|
|
1247
|
+
# Discovery: exactly one test, and resolve_selected_test points to our module/function
|
|
1248
|
+
single_disc = SimpleNamespace(qualname="metric.test_adapt.test_adapt", file_path=str(test_file))
|
|
1249
|
+
monkeypatch.setattr(cr, "_discover_and_select_tests", lambda cwd, non_interactive=False: [single_disc])
|
|
1250
|
+
monkeypatch.setattr(
|
|
1251
|
+
cr,
|
|
1252
|
+
"_resolve_selected_test",
|
|
1253
|
+
lambda project_root, evaluator_id, selected_tests=None: (str(test_file), "test_adapt"),
|
|
1254
|
+
)
|
|
1255
|
+
|
|
1256
|
+
captured = {"jsonl_path": None}
|
|
1257
|
+
|
|
1258
|
+
def _fake_create_dataset_from_jsonl(account_id, api_key, api_base, dataset_id, display_name, jsonl_path):
|
|
1259
|
+
captured["jsonl_path"] = jsonl_path
|
|
1260
|
+
return dataset_id, {"name": f"accounts/{account_id}/datasets/{dataset_id}", "state": "UPLOADING"}
|
|
1261
|
+
|
|
1262
|
+
monkeypatch.setattr(cr, "create_dataset_from_jsonl", _fake_create_dataset_from_jsonl)
|
|
1263
|
+
|
|
1264
|
+
# Ensure upload path doesn't touch the network; job creation via stub_fireworks fixture
|
|
1265
|
+
args = argparse.Namespace(
|
|
1266
|
+
evaluator=None,
|
|
1267
|
+
yes=True,
|
|
1268
|
+
dry_run=False,
|
|
1269
|
+
force=False,
|
|
1270
|
+
env_file=None,
|
|
1271
|
+
dataset=None,
|
|
1272
|
+
dataset_jsonl=None,
|
|
1273
|
+
dataset_display_name=None,
|
|
1274
|
+
dataset_builder=None,
|
|
1275
|
+
base_model=None,
|
|
1276
|
+
warm_start_from="accounts/acct123/models/ft-abc123",
|
|
1277
|
+
output_model=None,
|
|
1278
|
+
n=None,
|
|
1279
|
+
max_tokens=None,
|
|
1280
|
+
learning_rate=None,
|
|
1281
|
+
batch_size=None,
|
|
1282
|
+
epochs=None,
|
|
1283
|
+
lora_rank=None,
|
|
1284
|
+
max_context_length=None,
|
|
1285
|
+
chunk_size=None,
|
|
1286
|
+
eval_auto_carveout=None,
|
|
1287
|
+
skip_validation=True,
|
|
1288
|
+
ignore_docker=False,
|
|
1289
|
+
docker_build_extra="",
|
|
1290
|
+
docker_run_extra="",
|
|
1291
|
+
)
|
|
1292
|
+
|
|
1293
|
+
rc = cr.create_rft_command(args)
|
|
1294
|
+
assert rc == 0
|
|
1295
|
+
assert captured["jsonl_path"] is not None
|
|
1296
|
+
# Raw JSONL should NOT be uploaded; transformed EvaluationRow JSONL should be.
|
|
1297
|
+
assert os.path.abspath(captured["jsonl_path"]) != os.path.abspath(str(raw_jsonl))
|
|
1298
|
+
assert os.path.basename(captured["jsonl_path"]).endswith(".jsonl")
|
|
1299
|
+
# The transformed file should validate as EvaluationRow JSONL
|
|
1300
|
+
assert cr._validate_dataset_jsonl(captured["jsonl_path"])
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/bfcl_envs/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/bfcl_envs/math_api.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/docker_resource.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/sql_resource.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/data/airline_dataset.jsonl
RENAMED
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/data/retail_dataset.jsonl
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_frozen_lake.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_tau_bench_airline.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_tau_bench_retail.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/agent_eval_cmd.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/dynamic_data_loader.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/factory_data_loader.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/inline_data_loader.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/jsonl_data_loader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/dataset_logger/dataset_logger.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/event_bus/sqlite_event_bus.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|