eval-protocol 0.3.29__tar.gz → 0.3.30__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.3.29/eval_protocol.egg-info → eval_protocol-0.3.30}/PKG-INFO +2 -1
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/fireworks_tracing.py +78 -1
- eval_protocol-0.3.30/eval_protocol/adapters/lp_deserializer.py +109 -0
- eval_protocol-0.3.30/eval_protocol/adapters/r3_deserializer.py +187 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/remote_rollout_processor.py +44 -37
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/tracing_utils.py +65 -6
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/types/remote_rollout_processor.py +1 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30/eval_protocol.egg-info}/PKG-INFO +2 -1
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol.egg-info/SOURCES.txt +2 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol.egg-info/requires.txt +1 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/pyproject.toml +1 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/LICENSE +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/README.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/development/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/development/utils/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/dataframe.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/create_rft.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/export_docs.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/local_test.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/config.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/fireworks_rft.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/fireworks_v1_completions_client.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/openai_rft.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/tinker_cookbook.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/models.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/buffer.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/priority_scheduler.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/server.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/training/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/training/gepa_trainer.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/training/gepa_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/training/trainer.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/training/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/setup.cfg +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/setup.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_auth.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_cli_create_rft.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_cli_local_test.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_cli_startup_benchmark.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_config.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_exception_config.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_fireworks_v1_completions_client.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_format.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_human_id.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_integration.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_length.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_litellm_policy_provider_fields.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_math.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_minimal.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_models.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_no_implicit_dotenv.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_openai_rft_integration.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_packaging.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_priority_scheduler.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_readiness.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_repetition.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_rollout_logprobs.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_server.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_sqlite_hardening.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_status_model.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_training_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/versioneer.py +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vite-app/dist/assets/index-DFeF7AG_.js +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vite-app/dist/assets/index-DFeF7AG_.js.map +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vite-app/dist/assets/index-DvKW7FQL.css +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.3.29
|
|
3
|
+
Version: 0.3.30
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -40,6 +40,7 @@ Requires-Dist: addict>=2.4.0
|
|
|
40
40
|
Requires-Dist: deepdiff>=6.0.0
|
|
41
41
|
Requires-Dist: websockets>=15.0.1
|
|
42
42
|
Requires-Dist: fastapi>=0.116.1
|
|
43
|
+
Requires-Dist: zstandard>=0.19.0
|
|
43
44
|
Provides-Extra: dev
|
|
44
45
|
Requires-Dist: build; extra == "dev"
|
|
45
46
|
Requires-Dist: twine; extra == "dev"
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2026-
|
|
11
|
+
"date": "2026-05-29T16:09:24-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.3.29"
|
|
14
|
+
"full-revisionid": "1bd5447a3afbca3b71e0f0d205ed7cff6c3afe5d",
|
|
15
|
+
"version": "0.3.30"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -16,6 +16,8 @@ import os
|
|
|
16
16
|
|
|
17
17
|
from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
|
|
18
18
|
from .base import BaseAdapter
|
|
19
|
+
from .lp_deserializer import decompress_and_parse_lp
|
|
20
|
+
from .r3_deserializer import decompress_and_parse_r3
|
|
19
21
|
from .utils import extract_messages_from_data
|
|
20
22
|
from ..common_utils import get_user_agent
|
|
21
23
|
|
|
@@ -100,13 +102,53 @@ def convert_trace_dict_to_evaluation_row(
|
|
|
100
102
|
):
|
|
101
103
|
break # Break early if we've found all the metadata we need
|
|
102
104
|
|
|
105
|
+
# Extract router replay payloads when present
|
|
106
|
+
payloads = trace.get("payloads")
|
|
107
|
+
if isinstance(payloads, dict):
|
|
108
|
+
router_replay = payloads.get("router_replay")
|
|
109
|
+
if isinstance(router_replay, dict) and router_replay.get("data"):
|
|
110
|
+
try:
|
|
111
|
+
matrices, r3_meta = decompress_and_parse_r3(router_replay["data"])
|
|
112
|
+
if execution_metadata.extra is None:
|
|
113
|
+
execution_metadata.extra = {}
|
|
114
|
+
execution_metadata.extra["routing_matrices"] = matrices
|
|
115
|
+
execution_metadata.extra["routing_metadata"] = r3_meta
|
|
116
|
+
except Exception as e:
|
|
117
|
+
logger.warning("Failed to decompress R3 payload for trace %s: %s", trace.get("id"), e)
|
|
118
|
+
|
|
119
|
+
logprobs_payload = payloads.get("logprobs")
|
|
120
|
+
if isinstance(logprobs_payload, dict) and logprobs_payload.get("data"):
|
|
121
|
+
try:
|
|
122
|
+
logprobs, token_ids, lp_meta = decompress_and_parse_lp(logprobs_payload["data"])
|
|
123
|
+
if execution_metadata.extra is None:
|
|
124
|
+
execution_metadata.extra = {}
|
|
125
|
+
execution_metadata.extra["completion_logprobs"] = logprobs
|
|
126
|
+
if token_ids is not None:
|
|
127
|
+
execution_metadata.extra["completion_token_ids"] = token_ids
|
|
128
|
+
execution_metadata.extra["logprobs_metadata"] = lp_meta
|
|
129
|
+
|
|
130
|
+
for i in range(len(messages) - 1, -1, -1):
|
|
131
|
+
if messages[i].role == "assistant":
|
|
132
|
+
content_entries = [{"logprob": lp} for lp in logprobs]
|
|
133
|
+
if token_ids is not None:
|
|
134
|
+
for entry, tid in zip(content_entries, token_ids):
|
|
135
|
+
entry["token_id"] = tid
|
|
136
|
+
messages[i].logprobs = {"content": content_entries}
|
|
137
|
+
break
|
|
138
|
+
except Exception as e:
|
|
139
|
+
logger.warning(
|
|
140
|
+
"Failed to decompress logprobs payload for trace %s: %s",
|
|
141
|
+
trace.get("id"),
|
|
142
|
+
e,
|
|
143
|
+
)
|
|
144
|
+
|
|
103
145
|
return EvaluationRow(
|
|
104
146
|
messages=messages,
|
|
105
147
|
tools=tools,
|
|
106
148
|
input_metadata=InputMetadata(
|
|
107
149
|
row_id=row_id,
|
|
108
150
|
session_data={
|
|
109
|
-
"langfuse_trace_id": trace.get("id"),
|
|
151
|
+
"langfuse_trace_id": trace.get("id"),
|
|
110
152
|
},
|
|
111
153
|
),
|
|
112
154
|
execution_metadata=execution_metadata,
|
|
@@ -375,6 +417,37 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
375
417
|
)
|
|
376
418
|
return results
|
|
377
419
|
|
|
420
|
+
async def async_get_status(self, session: aiohttp.ClientSession, rollout_id: str) -> Optional[Dict[str, Any]]:
    """Look up a rollout's status via the lightweight status endpoint.

    ``{base_url}/v1/status`` is tried first, then ``{base_url}/status``. A 404,
    a client/HTTP error, a timeout, or an undecodable JSON body on one URL
    moves on to the next candidate.

    Args:
        session: aiohttp session used to issue the GET requests.
        rollout_id: Rollout identifier, sent as the ``rollout_id`` query parameter.

    Returns:
        The parsed JSON body of the first successful response (``{}`` when the
        body is falsy), or ``None`` when every candidate URL failed.
        Response shape: {"rollout_id": "...", "status": {"code": ...} | null, "extras": {...} | null}
    """
    request_headers = {
        "Authorization": f"Bearer {self._get_api_key()}",
        "User-Agent": get_user_agent(),
    }
    query: Dict[str, Any] = {"rollout_id": rollout_id}
    request_timeout = aiohttp.ClientTimeout(total=self.timeout)

    urls_to_try = [f"{self.base_url}/v1/status", f"{self.base_url}/status"]
    failure: Optional[str] = None
    for endpoint in urls_to_try:
        try:
            async with session.get(
                endpoint, params=query, headers=request_headers, timeout=request_timeout
            ) as response:
                if response.status != 404:
                    response.raise_for_status()
                    # content_type=None: accept the body regardless of its Content-Type header.
                    body = await response.json(content_type=None)
                    return body or {}
                # A 404 is remembered and the next candidate URL is tried.
                failure = f"404 for {endpoint}"
        except (aiohttp.ClientError, asyncio.TimeoutError, json.JSONDecodeError) as exc:
            failure = str(exc)

    if failure:
        logger.error("Failed to fetch status from Fireworks (tried %s): %s", urls_to_try, failure)
    return None
|
|
450
|
+
|
|
378
451
|
def get_evaluation_rows(
|
|
379
452
|
self,
|
|
380
453
|
tags: List[str],
|
|
@@ -395,6 +468,7 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
395
468
|
max_retries: int = 3,
|
|
396
469
|
span_name: Optional[str] = None,
|
|
397
470
|
converter: Optional[TraceDictConverter] = None,
|
|
471
|
+
include_payloads: bool = False,
|
|
398
472
|
) -> List[EvaluationRow]:
|
|
399
473
|
"""Pull traces from Langfuse via proxy and convert to EvaluationRow format.
|
|
400
474
|
|
|
@@ -418,6 +492,8 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
418
492
|
max_retries: Max retry attempts used by proxy (default: 3)
|
|
419
493
|
converter: Optional custom converter implementing TraceDictConverter protocol.
|
|
420
494
|
If provided, this will be used instead of the default conversion logic.
|
|
495
|
+
include_payloads: If True, request payload data (e.g., router replay)
|
|
496
|
+
from the gateway and decompress it into the returned EvaluationRows.
|
|
421
497
|
|
|
422
498
|
Returns:
|
|
423
499
|
List[EvaluationRow]: Converted evaluation rows
|
|
@@ -448,6 +524,7 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
448
524
|
"to_timestamp": to_timestamp.isoformat() if to_timestamp else None,
|
|
449
525
|
"sleep_between_gets": sleep_between_gets,
|
|
450
526
|
"max_retries": max_retries,
|
|
527
|
+
"include_payloads": include_payloads if include_payloads else None,
|
|
451
528
|
}
|
|
452
529
|
|
|
453
530
|
# Remove None values
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""LP/v1 binary deserializer for per-token logprobs payloads.
|
|
2
|
+
|
|
3
|
+
Implements the inverse of the tracing gateway's ``logprobs_serializer.serialize_logprobs``.
|
|
4
|
+
See that module for the full header specification.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import base64
|
|
10
|
+
import struct
|
|
11
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
12
|
+
|
|
13
|
+
import zstandard as zstd
|
|
14
|
+
|
|
15
|
+
MAGIC = b"LP01"
|
|
16
|
+
HEADER_VERSION = 1
|
|
17
|
+
MISSING_TOKEN_ID = -1
|
|
18
|
+
ENTRY_FORMAT = "<if"
|
|
19
|
+
ENTRY_SIZE = struct.calcsize(ENTRY_FORMAT) # 8 bytes
|
|
20
|
+
HEADER_FORMAT = "<4sBBHIIQ"
|
|
21
|
+
HEADER_SIZE = struct.calcsize(HEADER_FORMAT) # 24 bytes
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _parse_header(raw: bytes) -> Dict[str, Any]:
    """Decode and validate the fixed-size LP/v1 header at the start of ``raw``.

    Args:
        raw: Uncompressed LP/v1 payload; only the first ``HEADER_SIZE`` bytes are read.

    Returns:
        Dict with the header fields ``flags``, ``reserved_u16``, ``token_count``,
        ``body_byte_length`` and ``reserved_u64``.

    Raises:
        ValueError: If ``raw`` is shorter than the header, the magic does not
            equal ``MAGIC``, or the version does not equal ``HEADER_VERSION``.
    """
    available = len(raw)
    if available < HEADER_SIZE:
        raise ValueError(f"Payload too short for lp/v1 header: {available} < {HEADER_SIZE}")

    fields = struct.unpack(HEADER_FORMAT, raw[:HEADER_SIZE])
    magic, version = fields[0], fields[1]

    if magic != MAGIC:
        raise ValueError(f"Bad LP/v1 magic: {magic!r}")
    if version != HEADER_VERSION:
        raise ValueError(f"Unsupported lp/v1 header version: {version}")

    # Remaining fields, in the order they appear in HEADER_FORMAT after magic/version.
    field_names = ("flags", "reserved_u16", "token_count", "body_byte_length", "reserved_u64")
    return dict(zip(field_names, fields[2:]))
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def parse_logprobs(raw: bytes) -> Tuple[List[float], Optional[List[int]], Dict[str, Any]]:
    """Parse uncompressed LP/v1 bytes into logprobs, optional token ids, and metadata.

    The payload is a header (see ``_parse_header``) followed by ``token_count``
    entries packed as ``ENTRY_FORMAT`` (token id, then logprob).

    Args:
        raw: Complete uncompressed LP/v1 payload (header plus body).

    Returns:
        ``(logprobs, token_ids, metadata)``. ``token_ids`` is ``None`` when any
        entry carries ``MISSING_TOKEN_ID``. ``metadata`` is the parsed header
        dict extended with ``scope``, ``completion_token_count`` and
        ``all_token_ids_valid``.

    Raises:
        ValueError: If the header is invalid, ``token_count`` is zero,
            ``body_byte_length`` disagrees with ``token_count``, or the payload
            length does not equal header size plus body length.
    """
    header = _parse_header(raw)
    token_count = header["token_count"]
    body_byte_length = header["body_byte_length"]

    if token_count == 0:
        raise ValueError("LP/v1 token_count must be > 0")
    if body_byte_length != token_count * ENTRY_SIZE:
        raise ValueError(
            f"body_byte_length ({body_byte_length}) != token_count * {ENTRY_SIZE} "
            f"({token_count * ENTRY_SIZE})"
        )

    expected_len = HEADER_SIZE + body_byte_length
    if len(raw) != expected_len:
        raise ValueError(f"LP/v1 payload length mismatch: {len(raw)} != {expected_len}")

    logprobs: List[float] = []
    token_ids: List[int] = []
    # The length checks above guarantee the body is an exact multiple of
    # ENTRY_SIZE, which iter_unpack requires; the memoryview avoids copying it.
    for wire_id, logprob in struct.iter_unpack(ENTRY_FORMAT, memoryview(raw)[HEADER_SIZE:]):
        token_ids.append(wire_id)
        logprobs.append(logprob)

    # A single missing id invalidates the whole id list for callers.
    all_token_ids_valid = MISSING_TOKEN_ID not in token_ids

    metadata: Dict[str, Any] = {
        "scope": "completion_only",
        "completion_token_count": token_count,
        "all_token_ids_valid": all_token_ids_valid,
    }
    header.update(metadata)
    ids_out: Optional[List[int]] = token_ids if all_token_ids_valid else None
    return logprobs, ids_out, header
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def decompress_and_parse_lp(data_b64: str) -> Tuple[List[float], Optional[List[int]], Dict[str, Any]]:
    """Turn a base64 + zstd LP/v1 blob into completion logprobs and token ids.

    Args:
        data_b64: Base64-encoded zstd-compressed LP binary blob from
            ``payloads.logprobs.data``.

    Returns:
        ``(logprobs, token_ids, metadata)`` exactly as produced by
        ``parse_logprobs``: ``logprobs`` holds one scalar per completion token,
        ``token_ids`` is ``None`` if any wire id was ``MISSING_TOKEN_ID``, and
        ``metadata`` includes ``all_token_ids_valid`` and ``completion_token_count``.
    """
    zstd_frame = base64.b64decode(data_b64)
    return parse_logprobs(zstd.ZstdDecompressor().decompress(zstd_frame))
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""R3/v1 binary deserializer for router-replay payloads.
|
|
2
|
+
|
|
3
|
+
Implements the inverse of the packed binary format produced by the tracing
|
|
4
|
+
gateway's ``r3_serializer.serialize_r3``. See that module for the full
|
|
5
|
+
header specification.
|
|
6
|
+
|
|
7
|
+
The main entry point is :func:`decompress_and_parse_r3`, which accepts the
|
|
8
|
+
base64-encoded compressed blob returned by the gateway's
|
|
9
|
+
``/v1/traces/pointwise?include_payloads=true`` endpoint and produces
|
|
10
|
+
per-token routing matrices in the same ``List[Optional[str]]`` format used
|
|
11
|
+
by the direct inference path (``DeploymentSampler.sample_with_tokens()``).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import base64
|
|
17
|
+
import struct
|
|
18
|
+
from enum import IntEnum
|
|
19
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
20
|
+
|
|
21
|
+
import zstandard as zstd
|
|
22
|
+
|
|
23
|
+
MAGIC = b"R3V1"
|
|
24
|
+
HEADER_FORMAT = "<4sBBBBIIIIQ"
|
|
25
|
+
HEADER_SIZE = struct.calcsize(HEADER_FORMAT) # 32 bytes
|
|
26
|
+
BITS_PER_BYTE = 8
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class _SelectorMode(IntEnum):
    """Values of the R3/v1 header ``selector_mode`` byte.

    The mode determines how ``decompress_and_parse_r3`` derives the token
    positions that carry a routing matrix.
    """

    # Every position in range(total_token_count) is replayed.
    ALL = 0
    # A contiguous run of replayed_token_count positions starting at replay_start_token.
    SUFFIX = 1
    # Positions are the set bits of the selector section (see _read_bitmap_positions).
    BITMAP = 2
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class _RoutingDtype(IntEnum):
    """Values of the R3/v1 header ``routing_dtype`` byte.

    Only the lower-cased member name is surfaced (in the returned metadata);
    this module never interprets the matrix bytes themselves.
    """

    # presumably unsigned 8-bit routing-matrix elements — TODO confirm against the gateway serializer
    UINT8 = 1
    # presumably unsigned 16-bit routing-matrix elements — TODO confirm against the gateway serializer
    UINT16 = 2
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
_SELECTOR_MODE_NAMES = {v: v.name.lower() for v in _SelectorMode}
|
|
41
|
+
_ROUTING_DTYPE_NAMES = {v: v.name.lower() for v in _RoutingDtype}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _parse_header(raw: bytes) -> Dict[str, Any]:
    """Decode and validate the fixed-size R3/v1 header at the start of ``raw``.

    Args:
        raw: Uncompressed R3/v1 payload; only the first ``HEADER_SIZE`` bytes are read.

    Returns:
        Dict with the header fields ``selector_mode``, ``routing_dtype``,
        ``flags``, ``total_token_count``, ``replayed_token_count``,
        ``replay_start_token``, ``selector_byte_length`` and ``matrix_byte_length``.

    Raises:
        ValueError: If ``raw`` is shorter than the header, the magic does not
            equal ``MAGIC``, or the version is not 1.
    """
    available = len(raw)
    if available < HEADER_SIZE:
        raise ValueError(
            f"Payload too short for r3/v1 header: {available} < {HEADER_SIZE}"
        )

    magic, version, *remaining = struct.unpack(HEADER_FORMAT, raw[:HEADER_SIZE])

    if magic != MAGIC:
        raise ValueError(f"Bad R3 magic: {magic!r}")
    if version != 1:
        raise ValueError(f"Unsupported R3 header version: {version}")

    # Remaining fields, in the order they appear in HEADER_FORMAT after magic/version.
    field_names = (
        "selector_mode",
        "routing_dtype",
        "flags",
        "total_token_count",
        "replayed_token_count",
        "replay_start_token",
        "selector_byte_length",
        "matrix_byte_length",
    )
    return dict(zip(field_names, remaining))
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _read_bitmap_positions(
    selector_bytes: bytes, total_token_count: int
) -> List[int]:
    """Collect, in ascending order, the token indices whose bitmap bit is 1.

    Token ``i`` maps to bit ``i % BITS_PER_BYTE`` (counted from the least
    significant bit) of byte ``i // BITS_PER_BYTE``. Indices that fall past the
    end of ``selector_bytes`` are treated as unset.
    """
    available_bytes = len(selector_bytes)
    set_positions: List[int] = []
    for token_index in range(total_token_count):
        byte_index, bit_offset = divmod(token_index, BITS_PER_BYTE)
        if byte_index >= available_bytes:
            continue
        if selector_bytes[byte_index] & (1 << bit_offset):
            set_positions.append(token_index)
    return set_positions
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def decompress_and_parse_r3(
    data_b64: str,
) -> Tuple[List[Optional[str]], Dict[str, Any]]:
    """Decompress and unpack an R3/v1 payload into per-token routing matrices.

    Args:
        data_b64: Base64-encoded zstd-compressed R3 binary blob, as returned
            by the tracing gateway in ``payloads.router_replay.data``.

    Returns:
        A tuple of ``(routing_matrices, metadata)`` where:

        - ``routing_matrices`` is a ``List[Optional[str]]`` of length
          ``total_token_count``. Each replayed position contains a
          base64-encoded routing matrix; all other positions are ``None``.
        - ``metadata`` is a dict with keys ``routing_dtype``,
          ``selector_mode`` (lower-cased enum names, or the stringified raw
          value when unknown), ``total_token_count``,
          ``replayed_token_count`` and ``replay_start_token``.

    Raises:
        ValueError: If the header is invalid, the matrix section cannot be
            split evenly across the replayed tokens, the body is shorter than
            the declared sections, the selector mode is unknown, a suffix
            selector extends past ``total_token_count``, or the selector
            yields a different number of positions than the header declares.
    """
    compressed = base64.b64decode(data_b64)

    # ZstdCompressor.compress() embeds the uncompressed size in the frame
    # header by default, so the library can auto-allocate the output buffer.
    decompressor = zstd.ZstdDecompressor()
    raw = decompressor.decompress(compressed)

    header = _parse_header(raw)

    selector_mode = header["selector_mode"]
    routing_dtype = header["routing_dtype"]
    total_token_count = header["total_token_count"]
    replayed_token_count = header["replayed_token_count"]
    replay_start_token = header["replay_start_token"]
    selector_byte_length = header["selector_byte_length"]
    matrix_byte_length = header["matrix_byte_length"]

    metadata: Dict[str, Any] = {
        "routing_dtype": _ROUTING_DTYPE_NAMES.get(routing_dtype, str(routing_dtype)),
        "selector_mode": _SELECTOR_MODE_NAMES.get(selector_mode, str(selector_mode)),
        "total_token_count": total_token_count,
        "replayed_token_count": replayed_token_count,
        "replay_start_token": replay_start_token,
    }

    # Nothing was replayed: every position is absent and no sections need parsing.
    if replayed_token_count == 0:
        return [None] * total_token_count, metadata

    # Per-token matrix byte size is implicit in the payload: all replayed
    # tokens share the same matrix length, so we can recover it from the
    # matrix section total length divided by the replayed-token count.
    if matrix_byte_length % replayed_token_count != 0:
        raise ValueError(
            f"matrix_byte_length ({matrix_byte_length}) is not a multiple of "
            f"replayed_token_count ({replayed_token_count}); cannot split "
            "into per-token matrices"
        )
    matrix_elem_size = matrix_byte_length // replayed_token_count

    body = raw[HEADER_SIZE:]
    expected_body_length = selector_byte_length + matrix_byte_length
    if len(body) < expected_body_length:
        raise ValueError(
            f"Payload body too short for selector and matrix sections: "
            f"{len(body)} < {expected_body_length}"
        )

    selector_bytes = body[:selector_byte_length]
    matrix_bytes = body[selector_byte_length : selector_byte_length + matrix_byte_length]

    if selector_mode == _SelectorMode.ALL:
        replayed_positions = list(range(total_token_count))
    elif selector_mode == _SelectorMode.SUFFIX:
        replay_end_token = replay_start_token + replayed_token_count
        # Reject out-of-range suffixes here; otherwise the assignment loop
        # below would fail with a bare IndexError instead of a clear message.
        if replay_end_token > total_token_count:
            raise ValueError(
                f"Suffix selector range [{replay_start_token}, {replay_end_token}) "
                f"exceeds total_token_count ({total_token_count})"
            )
        replayed_positions = list(range(replay_start_token, replay_end_token))
    elif selector_mode == _SelectorMode.BITMAP:
        replayed_positions = _read_bitmap_positions(selector_bytes, total_token_count)
    else:
        raise ValueError(f"Unknown selector_mode: {selector_mode}")

    if len(replayed_positions) != replayed_token_count:
        raise ValueError(
            f"Selector produced {len(replayed_positions)} replayed positions, "
            f"but header replayed_token_count is {replayed_token_count}"
        )

    # Split matrix bytes into per-token chunks and base64-encode each one
    matrices: List[Optional[str]] = [None] * total_token_count
    for idx, pos in enumerate(replayed_positions):
        start = idx * matrix_elem_size
        end = start + matrix_elem_size
        matrices[pos] = base64.b64encode(matrix_bytes[start:end]).decode("ascii")

    return matrices, metadata
|
{eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/remote_rollout_processor.py
RENAMED
|
@@ -35,11 +35,13 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
35
35
|
model_base_url: str = "https://tracing.fireworks.ai",
|
|
36
36
|
poll_interval: float = 1.0,
|
|
37
37
|
timeout_seconds: float = 120.0,
|
|
38
|
+
include_payloads: bool = False,
|
|
38
39
|
):
|
|
39
40
|
# Prefer constructor-provided configuration. These can be overridden via
|
|
40
41
|
# config.kwargs at call time for backward compatibility.
|
|
41
42
|
self._remote_base_url = remote_base_url
|
|
42
43
|
self._model_base_url = model_base_url
|
|
44
|
+
self._include_payloads = include_payloads
|
|
43
45
|
if os.getenv("EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL"):
|
|
44
46
|
self._remote_base_url = os.getenv("EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL")
|
|
45
47
|
_ep_model_base_url = os.getenv("EP_MODEL_BASE_URL")
|
|
@@ -122,45 +124,46 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
122
124
|
|
|
123
125
|
while time.time() < deadline:
|
|
124
126
|
session = self._get_or_create_session()
|
|
125
|
-
|
|
126
|
-
session,
|
|
127
|
+
status_result = await self._tracing_adapter.async_get_status(
|
|
128
|
+
session,
|
|
129
|
+
rollout_id=row.execution_metadata.rollout_id,
|
|
127
130
|
)
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
if status_logs:
|
|
136
|
-
if len(status_logs) > 1:
|
|
137
|
-
logger.warning(
|
|
138
|
-
"Found %s status logs for rollout %s; expected at most 1. Using the first one: %s",
|
|
139
|
-
len(status_logs),
|
|
140
|
-
row.execution_metadata.rollout_id,
|
|
141
|
-
status_logs[0],
|
|
142
|
-
)
|
|
143
|
-
# Use the first log with status information
|
|
144
|
-
status_log = status_logs[0]
|
|
145
|
-
status_dict = status_log.get("status")
|
|
146
|
-
raw_extras = status_log.get("extras") or {}
|
|
147
|
-
status_extras = {
|
|
148
|
-
k: v for k, v in raw_extras.items() if k not in ("logger_name", "level", "timestamp")
|
|
149
|
-
}
|
|
131
|
+
status = (status_result or {}).get("status")
|
|
132
|
+
if isinstance(status, dict) and "code" in status:
|
|
133
|
+
status_code = status["code"]
|
|
134
|
+
if status_code == Status.Code.RUNNING:
|
|
135
|
+
await asyncio.sleep(poll_interval)
|
|
136
|
+
continue
|
|
150
137
|
|
|
151
138
|
logger.info(
|
|
152
|
-
|
|
139
|
+
"Found status for rollout %s with code %s",
|
|
140
|
+
row.execution_metadata.rollout_id,
|
|
141
|
+
status_code,
|
|
153
142
|
)
|
|
154
143
|
|
|
155
|
-
|
|
156
|
-
status_message =
|
|
157
|
-
status_details =
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
f"
|
|
144
|
+
# /status only returns the code; backfill message/details/extras from Logs once.
|
|
145
|
+
status_message: str = ""
|
|
146
|
+
status_details: list = []
|
|
147
|
+
status_extras: dict = {}
|
|
148
|
+
completed_logs = await self._tracing_adapter.async_search_logs(
|
|
149
|
+
session, tags=[f"rollout_id:{row.execution_metadata.rollout_id}"]
|
|
161
150
|
)
|
|
151
|
+
# Pick the log row whose status code matches the terminal
|
|
152
|
+
# code from /status, so intermediate RUNNING checkpoints
|
|
153
|
+
# don't poison the backfill.
|
|
154
|
+
for log in completed_logs:
|
|
155
|
+
sd = log.get("status")
|
|
156
|
+
if isinstance(sd, dict) and sd.get("code") == status_code:
|
|
157
|
+
status_message = sd.get("message", "") or ""
|
|
158
|
+
status_details = sd.get("details", []) or []
|
|
159
|
+
raw_extras = log.get("extras") or {}
|
|
160
|
+
status_extras = {
|
|
161
|
+
k: v
|
|
162
|
+
for k, v in raw_extras.items()
|
|
163
|
+
if k not in ("logger_name", "level", "timestamp")
|
|
164
|
+
}
|
|
165
|
+
break
|
|
162
166
|
|
|
163
|
-
# Create and raise exception if appropriate, preserving original message
|
|
164
167
|
exception = exception_for_status_code(status_code, status_message)
|
|
165
168
|
if exception is not None:
|
|
166
169
|
raise exception
|
|
@@ -171,10 +174,11 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
171
174
|
details=status_details,
|
|
172
175
|
)
|
|
173
176
|
|
|
174
|
-
if
|
|
175
|
-
row.execution_metadata.extra
|
|
176
|
-
|
|
177
|
-
|
|
177
|
+
if status_extras:
|
|
178
|
+
if row.execution_metadata.extra:
|
|
179
|
+
row.execution_metadata.extra.update(status_extras)
|
|
180
|
+
else:
|
|
181
|
+
row.execution_metadata.extra = status_extras
|
|
178
182
|
|
|
179
183
|
logger.info("Stopping polling for rollout %s", row.execution_metadata.rollout_id)
|
|
180
184
|
break
|
|
@@ -192,7 +196,10 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
192
196
|
row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
|
|
193
197
|
|
|
194
198
|
def _update_with_trace() -> None:
|
|
195
|
-
return update_row_with_remote_trace(
|
|
199
|
+
return update_row_with_remote_trace(
|
|
200
|
+
row, default_fireworks_output_data_loader, model_base_url,
|
|
201
|
+
include_payloads=self._include_payloads,
|
|
202
|
+
)
|
|
196
203
|
|
|
197
204
|
await asyncio.to_thread(_update_with_trace) # Update row with remote trace in-place
|
|
198
205
|
return row
|
|
@@ -22,9 +22,61 @@ def default_fireworks_output_data_loader(config: DataLoaderConfig) -> DynamicDat
|
|
|
22
22
|
# Use EP_REMOTE_API_KEY for fetching remote traces, falling back to FIREWORKS_API_KEY
|
|
23
23
|
api_key = os.environ.get("EP_REMOTE_API_KEY") or os.environ.get("FIREWORKS_API_KEY")
|
|
24
24
|
adapter = FireworksTracingAdapter(base_url=base_url, api_key=api_key)
|
|
25
|
-
return adapter.get_evaluation_rows(
|
|
25
|
+
return adapter.get_evaluation_rows(
|
|
26
|
+
tags=[f"rollout_id:{config.rollout_id}"],
|
|
27
|
+
max_retries=5,
|
|
28
|
+
include_payloads=config.include_payloads,
|
|
29
|
+
)
|
|
26
30
|
|
|
27
|
-
|
|
31
|
+
def preprocess_traces(rows: List[EvaluationRow]) -> List[EvaluationRow]:
|
|
32
|
+
filtered_rows = filter_longest_conversation(rows)
|
|
33
|
+
if config.include_payloads and filtered_rows:
|
|
34
|
+
_merge_payloads_into_longest_row(filtered_rows[0], rows)
|
|
35
|
+
return filtered_rows
|
|
36
|
+
|
|
37
|
+
return DynamicDataLoader(generators=[fetch_traces], preprocess_fn=preprocess_traces)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _merge_payloads_into_longest_row(longest_row: EvaluationRow, rows: List[EvaluationRow]) -> None:
|
|
41
|
+
"""
|
|
42
|
+
Preserve per-turn payload-derived metadata after selecting the longest trace row.
|
|
43
|
+
|
|
44
|
+
Each trace row carries payloads for its final assistant turn. The longest row
|
|
45
|
+
keeps the full conversation, while its top-level execution metadata remains
|
|
46
|
+
the payload metadata for the final completion for backward compatibility.
|
|
47
|
+
"""
|
|
48
|
+
target_assistants = longest_row.get_assistant_messages()
|
|
49
|
+
assistant_turn_payloads = []
|
|
50
|
+
|
|
51
|
+
for row in sorted(rows, key=lambda item: len(item.messages)):
|
|
52
|
+
source = row.last_assistant_message()
|
|
53
|
+
source_turn_index = len(row.get_assistant_messages()) - 1
|
|
54
|
+
if source_turn_index < 0 or source_turn_index >= len(target_assistants):
|
|
55
|
+
continue
|
|
56
|
+
|
|
57
|
+
if source and source.logprobs and not target_assistants[source_turn_index].logprobs:
|
|
58
|
+
target_assistants[source_turn_index].logprobs = source.logprobs
|
|
59
|
+
|
|
60
|
+
extra = row.execution_metadata.extra or {}
|
|
61
|
+
turn_payload = {
|
|
62
|
+
key: extra[key]
|
|
63
|
+
for key in (
|
|
64
|
+
"completion_logprobs",
|
|
65
|
+
"completion_token_ids",
|
|
66
|
+
"logprobs_metadata",
|
|
67
|
+
"routing_matrices",
|
|
68
|
+
"routing_metadata",
|
|
69
|
+
)
|
|
70
|
+
if key in extra
|
|
71
|
+
}
|
|
72
|
+
if turn_payload:
|
|
73
|
+
turn_payload["assistant_turn_index"] = source_turn_index
|
|
74
|
+
assistant_turn_payloads.append(turn_payload)
|
|
75
|
+
|
|
76
|
+
if assistant_turn_payloads:
|
|
77
|
+
if longest_row.execution_metadata.extra is None:
|
|
78
|
+
longest_row.execution_metadata.extra = {}
|
|
79
|
+
longest_row.execution_metadata.extra["assistant_turn_payloads"] = assistant_turn_payloads
|
|
28
80
|
|
|
29
81
|
|
|
30
82
|
def build_fireworks_tracing_url(
|
|
@@ -99,7 +151,7 @@ def build_init_request(
|
|
|
99
151
|
if not completion_params_dict.get("model"):
|
|
100
152
|
raise ValueError("Model must be provided in completion_params")
|
|
101
153
|
|
|
102
|
-
# Extract base_url from completion_params
|
|
154
|
+
# Extract base_url from completion_params for tracing-gateway URL encoding
|
|
103
155
|
completion_params_base_url: Optional[str] = completion_params_dict.get("base_url")
|
|
104
156
|
|
|
105
157
|
# Strip non-OpenAI fields from messages
|
|
@@ -129,7 +181,7 @@ def build_init_request(
|
|
|
129
181
|
|
|
130
182
|
# Build final model base URL with tracing metadata
|
|
131
183
|
final_model_base_url = model_base_url
|
|
132
|
-
if model_base_url and ("tracing.fireworks.ai" in model_base_url or model_base_url.startswith("http://localhost")):
|
|
184
|
+
if model_base_url and ("tracing.fireworks.ai" in model_base_url or model_base_url.startswith("http://localhost") or "litellm-gateway" in model_base_url):
|
|
133
185
|
final_model_base_url = build_fireworks_tracing_url(model_base_url, meta, completion_params_base_url)
|
|
134
186
|
|
|
135
187
|
# Extract API key from environment or completion_params
|
|
@@ -148,13 +200,20 @@ def build_init_request(
|
|
|
148
200
|
|
|
149
201
|
|
|
150
202
|
def update_row_with_remote_trace(
|
|
151
|
-
row: EvaluationRow,
|
|
203
|
+
row: EvaluationRow,
|
|
204
|
+
output_data_loader: Callable[[DataLoaderConfig], DynamicDataLoader],
|
|
205
|
+
model_base_url: str,
|
|
206
|
+
include_payloads: bool = False,
|
|
152
207
|
) -> None:
|
|
153
208
|
"""Update row with remote trace data using output_data_loader (shared logic)."""
|
|
154
209
|
if not row.execution_metadata.rollout_id:
|
|
155
210
|
return None
|
|
156
211
|
|
|
157
|
-
loader_config = DataLoaderConfig(
|
|
212
|
+
loader_config = DataLoaderConfig(
|
|
213
|
+
rollout_id=row.execution_metadata.rollout_id,
|
|
214
|
+
model_base_url=model_base_url,
|
|
215
|
+
include_payloads=include_payloads,
|
|
216
|
+
)
|
|
158
217
|
data_loader = output_data_loader(loader_config)
|
|
159
218
|
results = data_loader.load()
|
|
160
219
|
output_rows: List[EvaluationRow] = [r for result in results for r in result.rows]
|